├── bin └── .keep ├── obj └── .keep ├── .gitignore ├── images ├── compare.jpg └── details.jpg ├── libs ├── lib │ └── OpenCL.lib └── include │ ├── CL │ ├── opencl.h │ ├── cl_gl_ext.h │ ├── cl_d3d10_ext.h │ ├── cl_d3d11_ext.h │ ├── cl_d3d9_ext.h │ └── cl_gl.h │ └── json │ └── gason.h ├── test ├── data │ ├── color_grid.png │ ├── color_grid2.jpg │ ├── config_non_parseable.json │ ├── color_grid2_luma_swapped.png │ ├── config.json │ ├── config_invalid_val.json │ └── test_cases.json ├── specs │ ├── SubtractFromAllTest.cpp │ ├── LayerDeltasTest_script.py │ ├── SumTest.cpp │ ├── TestSpecsDeclarations.hpp │ ├── ExtractLumaTest.cpp │ ├── LumaTests_script.py │ ├── SquaredErrorTest.cpp │ ├── LastLayerDeltaTest.cpp │ ├── SwapLumaTest.cpp │ ├── BackpropagationTest_script.py │ ├── UpdateParametersTest.cpp │ ├── ConfigTest.cpp │ ├── LayerTest_script.R │ ├── LayerTest.cpp │ ├── BackpropagationTest.cpp │ └── LayerDeltasTest.cpp ├── TestCase.hpp ├── TestRunner.cpp └── TestCase.cpp ├── src ├── kernel │ ├── subtract_from_all.cl │ ├── greyscale.cl │ ├── extract_luma.cl │ ├── update_parameters.cl │ ├── last_layer_delta.cl │ ├── sum.cl │ ├── swap_luma.cl │ ├── squared_error.cl │ ├── layer_uber_kernel.cl │ ├── backpropagate.cl │ └── layer_deltas.cl ├── Config.hpp ├── LayerData.hpp ├── LayerData.cpp ├── opencl │ ├── UtilsOpenCL.hpp │ ├── Kernel.hpp │ ├── Kernel.cpp │ └── UtilsOpenCL.cpp ├── pch.hpp ├── ConfigBasedDataPipeline.hpp ├── Config.cpp └── DataPipeline.hpp ├── example_config.json ├── LICENSE ├── profile.py ├── makefile ├── generate_training_samples.py ├── schedule_training.py ├── weights_visualize.py └── README.md /bin/.keep: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /obj/.keep: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /.gitignore: 
-------------------------------------------------------------------------------- 1 | /bin 2 | /obj/*o 3 | /data 4 | /logs 5 | -------------------------------------------------------------------------------- /images/compare.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Scthe/cnn-Super-Resolution/HEAD/images/compare.jpg -------------------------------------------------------------------------------- /images/details.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Scthe/cnn-Super-Resolution/HEAD/images/details.jpg -------------------------------------------------------------------------------- /libs/lib/OpenCL.lib: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Scthe/cnn-Super-Resolution/HEAD/libs/lib/OpenCL.lib -------------------------------------------------------------------------------- /test/data/color_grid.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Scthe/cnn-Super-Resolution/HEAD/test/data/color_grid.png -------------------------------------------------------------------------------- /test/data/color_grid2.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Scthe/cnn-Super-Resolution/HEAD/test/data/color_grid2.jpg -------------------------------------------------------------------------------- /test/data/config_non_parseable.json: -------------------------------------------------------------------------------- 1 | { 2 | "n1": 64, 3 | "n2": 32, 4 | "f1": 5 | "f2": 1, 6 | "f3": 5 7 | } 8 | -------------------------------------------------------------------------------- /test/data/color_grid2_luma_swapped.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/Scthe/cnn-Super-Resolution/HEAD/test/data/color_grid2_luma_swapped.png -------------------------------------------------------------------------------- /src/kernel/subtract_from_all.cl: -------------------------------------------------------------------------------- 1 | __kernel void sub_from_all(__global float* data, // 2 | __const float value, // 3 | __const uint len) { 4 | const int global_index = get_global_id(0); 5 | if (global_index < len) { 6 | data[global_index] = data[global_index] - value; 7 | } 8 | } 9 | -------------------------------------------------------------------------------- /test/data/config.json: -------------------------------------------------------------------------------- 1 | { 2 | "n1": 32, 3 | "n2": 16, 4 | "f1": 9, 5 | "f2": 1, 6 | "f3": 5, 7 | "momentum": 123.5, 8 | "weight_decay_parameter": 0.1, 9 | "learning_rates": [12, 34, 56], 10 | "parameters_file": "cnn-parameters-a.json", 11 | "parameters_distribution_1": { 12 | "mean_w": 0.9, 13 | "mean_b": 0.9, 14 | "std_deviation_w": 0.9, 15 | "std_deviation_b": 0.9 16 | }, 17 | "parameters_distribution_2": { 18 | "mean_w": 2.001, 19 | "mean_b": 2.001, 20 | "std_deviation_w": 2.001, 21 | "std_deviation_b": 2.001 22 | }, 23 | "parameters_distribution_3": { 24 | "mean_w": 0.001, 25 | "mean_b": 0.001, 26 | "std_deviation_w": 0.001, 27 | "std_deviation_b": 0.001 28 | } 29 | } 30 | -------------------------------------------------------------------------------- /example_config.json: -------------------------------------------------------------------------------- 1 | { 2 | "n1": 32, 3 | "n2": 16, 4 | "f1": 9, 5 | "f2": 1, 6 | "f3": 5, 7 | 8 | "momentum": 0.9, 9 | "weight_decay_parameter": 0.001, 10 | "learning_rates": [0.0001, 0.0001, 0.00001], 11 | "parameters_file": "data/parameters.json", 12 | 13 | "parameters_distribution_1": { 14 | "mean_w": 0.0, 15 | "mean_b": 0.0, 16 | "std_deviation_w": 0.005, 17 | "std_deviation_b": 0.0 18 | }, 19 | 
"parameters_distribution_2": { 20 | "mean_w": 0.0, 21 | "mean_b": 0.0, 22 | "std_deviation_w": 0.005, 23 | "std_deviation_b": 0.0 24 | }, 25 | "parameters_distribution_3": { 26 | "mean_w": 0.0, 27 | "mean_b": 0.0, 28 | "std_deviation_w": 0.005, 29 | "std_deviation_b": 0.0 30 | } 31 | } 32 | -------------------------------------------------------------------------------- /test/data/config_invalid_val.json: -------------------------------------------------------------------------------- 1 | { 2 | "n1": 32, 3 | "n2": 16, 4 | "f1": 9, 5 | "f2": 1, 6 | "f3": 5, 7 | "momentum": 123.5, 8 | "weight_decay_parameter": 0.1, 9 | "learning_rates": [12, 34, 56], 10 | "parameters_file": "cnn-parameters-a.json", 11 | "parameters_distribution_1": { 12 | "mean_w": 0.9, 13 | "mean_b": 0.9, 14 | "std_deviation_w": 0.9, 15 | "std_deviation_b": 0.9 16 | }, 17 | "parameters_distribution_2": { 18 | "mean_w": 2.001, 19 | "mean_b": 2.001, 20 | "std_deviation_w": 2.001, 21 | "std_deviation_b": 2.001 22 | }, 23 | "parameters_distribution_3": { 24 | "mean_w": 9999, 25 | "mean_b": 0.001, 26 | "std_deviation_w": 0.001, 27 | "std_deviation_b": 0.001 28 | } 29 | } 30 | -------------------------------------------------------------------------------- /src/kernel/greyscale.cl: -------------------------------------------------------------------------------- 1 | __constant sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE | 2 | CLK_ADDRESS_CLAMP_TO_EDGE | 3 | CLK_FILTER_NEAREST; 4 | 5 | __kernel 6 | void main(__read_only image2d_t image, 7 | __global uchar* target, 8 | int w, int h){ 9 | 10 | // const uint w = get_global_size(0); 11 | // const uint h = get_global_size(1); 12 | const int2 pos = {get_global_id(0), get_global_id(1)}; 13 | float2 normCoor = convert_float2(pos) / (float2)( w, h ); 14 | 15 | if(pos.x >= 0 && pos.x < w && pos.y >= 0 && pos.y < h){ 16 | int idx = pos.y * w + pos.x; 17 | 18 | uint4 pixel_col = read_imageui(image, sampler, pos); 19 | target[idx] = (uchar)pixel_col.x; 20 | } 21 | 
22 | } 23 | -------------------------------------------------------------------------------- /src/kernel/extract_luma.cl: -------------------------------------------------------------------------------- 1 | __constant sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE | // 2 | CLK_ADDRESS_CLAMP_TO_EDGE | // 3 | CLK_FILTER_NEAREST; // 4 | 5 | __constant float4 rgb2y = {0.299f, 0.587f, 0.114f, 0.0f}; 6 | 7 | __kernel void extract_luma(__read_only image2d_t image, // 8 | __global float* target, // 9 | int w, int h) { 10 | const int2 pos = {get_global_id(0), get_global_id(1)}; 11 | 12 | if (pos.x >= 0 && pos.x < w && // 13 | pos.y >= 0 && pos.y < h) { 14 | int idx = pos.y * w + pos.x; 15 | uint4 pixel_col = read_imageui(image, sampler, pos); 16 | float4 pixel_col_f = convert_float4(pixel_col); 17 | #ifdef NORMALIZE 18 | target[idx] = dot(pixel_col_f, rgb2y) / 255.0f; 19 | #else 20 | target[idx] = dot(pixel_col_f, rgb2y); 21 | #endif // NORMALIZE 22 | } 23 | } 24 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | The MIT License (MIT) 2 | 3 | Copyright (c) 2015 Marcin Matuszczyk 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 
14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | 23 | -------------------------------------------------------------------------------- /src/Config.hpp: -------------------------------------------------------------------------------- 1 | #ifndef CONFIG_H 2 | #define CONFIG_H 3 | 4 | #include "pch.hpp" 5 | #include // for std::ostream& operator<<(..) 6 | 7 | namespace cnn_sr { 8 | 9 | struct ParametersDistribution { 10 | ParametersDistribution() {} 11 | ParametersDistribution(float, float, float, float); 12 | 13 | float mean_w = 0.01f, sd_w = 0.01f; 14 | float mean_b = 0.0f, sd_b = 0.0f; 15 | }; 16 | 17 | struct Config { 18 | Config(size_t, size_t, // 19 | size_t, size_t, size_t, // 20 | float, float, float*, // 21 | ParametersDistribution, ParametersDistribution, ParametersDistribution, 22 | const char* const = nullptr); 23 | 24 | static void validate(Config&); 25 | 26 | size_t total_padding() const; 27 | 28 | // core parameters 29 | const size_t n1, n2; 30 | const size_t f1, f2, f3; 31 | const float momentum, weight_decay_parameter; 32 | float learning_rate[3]; 33 | std::string parameters_file = ""; 34 | 35 | // random parameters(weights/biases) 36 | ParametersDistribution params_distr_1; 37 | ParametersDistribution params_distr_2; 38 | ParametersDistribution params_distr_3; 39 | }; 40 | 41 | class ConfigReader { 42 | public: 43 | Config read(const char* const); 44 | }; 45 | } 46 | 47 | std::ostream& operator<<(std::ostream&, const cnn_sr::Config&); 48 | 49 | #endif /* CONFIG_H */ 50 | 
-------------------------------------------------------------------------------- /test/specs/SubtractFromAllTest.cpp: -------------------------------------------------------------------------------- 1 | #include "TestSpecsDeclarations.hpp" 2 | 3 | #include "../../src/DataPipeline.hpp" 4 | 5 | namespace test { 6 | namespace specs { 7 | 8 | /// 9 | /// PIMPL 10 | /// 11 | struct SubtractFromAllTestImpl {}; 12 | 13 | /// 14 | /// SubtractFromAllTest 15 | /// 16 | 17 | TEST_SPEC_PIMPL(SubtractFromAllTest) 18 | 19 | void SubtractFromAllTest::init() {} 20 | 21 | std::string SubtractFromAllTest::name(size_t) { 22 | return "Subtract from all test"; 23 | } 24 | 25 | size_t SubtractFromAllTest::data_set_count() { return 1; } 26 | 27 | bool SubtractFromAllTest::operator()(size_t, 28 | cnn_sr::DataPipeline *const pipeline) { 29 | assert_not_null(pipeline); 30 | auto _context = pipeline->context(); 31 | 32 | const size_t data_len = 900; 33 | const float to_subtract = 450.0f; 34 | std::vector cpu_data(data_len); 35 | std::vector expected_buf(data_len); 36 | for (size_t i = 0; i < data_len; i++) { 37 | cpu_data[i] = i; 38 | expected_buf[i] = cpu_data[i] - to_subtract; 39 | } 40 | 41 | // gpu allocate 42 | auto gpu_buf_data = 43 | _context->allocate(CL_MEM_READ_ONLY, sizeof(cl_float) * data_len); 44 | _context->write_buffer(gpu_buf_data, (void *)&cpu_data[0], true); 45 | 46 | pipeline->subtract_from_all(gpu_buf_data, to_subtract); 47 | assert_equals(pipeline, expected_buf, gpu_buf_data); 48 | 49 | return true; 50 | } 51 | 52 | // 53 | // 54 | } // namespace specs 55 | } // namespace test 56 | -------------------------------------------------------------------------------- /src/kernel/update_parameters.cl: -------------------------------------------------------------------------------- 1 | __kernel void update_params( 2 | __read_only __global float* weights, // 3 | __read_only __global float* bias, // 4 | __read_only __global float* grad_weights, // 5 | __read_only __global float* 
grad_bias, // 6 | __read_only __global float* previous_delta_weights, // 7 | __read_only __global float* previous_delta_bias, // 8 | __const float momentum, // 9 | __const float weight_decay_parameter, // 10 | __const float learning_rate, // 11 | __const uint batch_size, // 12 | __const uint weights_size, // 13 | __const uint bias_size) { 14 | const size_t idx = get_global_id(0); 15 | 16 | // update weights 17 | if (idx < weights_size) { 18 | float weight_value = weights[idx]; 19 | float delta_w = momentum * previous_delta_weights[idx] + 20 | learning_rate * grad_weights[idx] + 21 | weight_decay_parameter * weight_value; 22 | weights[idx] = weight_value - delta_w / batch_size; 23 | previous_delta_weights[idx] = delta_w; 24 | } 25 | 26 | // update bias 27 | if (idx < bias_size) { 28 | float delta_b = momentum * previous_delta_bias[idx] + // 29 | learning_rate * grad_bias[idx]; 30 | bias[idx] -= delta_b / batch_size; 31 | previous_delta_bias[idx] = delta_b; 32 | } 33 | } 34 | -------------------------------------------------------------------------------- /test/specs/LayerDeltasTest_script.py: -------------------------------------------------------------------------------- 1 | # helper script to generate expected delta values 2 | # for LayerDeltasTest 3 | 4 | deltas=[0.122, 0.083, 0.064, 5 | 0.057, 0.075, 0.055, 6 | 0.025, 0.058, 0.138, 7 | 0.170, 0.068, 0.144, 8 | 0.121, 0.013, 0.176, 9 | 0.065, 0.169, 0.049, 10 | 0.003, 0.181, 0.051, 11 | 0.021, 0.136, 0.062, 12 | 0.066, 0.165, 0.176] 13 | f=3 14 | n_curr = 3 15 | n_prev = 2 16 | 17 | # 00000 - 18 | # 0___0 0 19 | # 0___0 1 20 | # 0___0 2 21 | # 00000 - 22 | 23 | d=[None]*25 24 | d[0] = [(0,0)] 25 | d[1] = [(0,0),(0,1)] 26 | d[2] = [(0,0),(0,1),(0,2)] 27 | d[3] = [(0,1),(0,2)] 28 | d[4] = [(0,2)] 29 | 30 | d[5] = [(0,0),(1,0)] 31 | d[6] = [(0,0),(0,1),(1,0),(1,1)] 32 | d[7] = [(0,0),(0,1),(0,2), (1,0),(1,1),(1,2)] 33 | d[8] = [(0,1),(0,2), (1,1),(1,2)] 34 | d[9] = [(0,2), (1,2)] 35 | 36 | d[10] = 
[(0,0),(1,0),(2,0)] 37 | d[11] = [(0,0),(0,1),(1,0),(1,1),(2,0),(2,1)] 38 | d[12] = [(0,0),(0,1),(0,2), (1,0),(1,1),(1,2), (2,0),(2,1),(2,2)] 39 | d[13] = [(0,1),(0,2), (1,1),(1,2), (2,1),(2,2)] 40 | d[14] = [(0,2), (1,2), (2,2)] 41 | 42 | d[15] = [(2,0),(1,0)] 43 | d[16] = [(2,0),(2,1),(1,0),(1,1)] 44 | d[17] = [(2,0),(2,1),(2,2), (1,0),(1,1),(1,2)] 45 | d[18] = [(2,1),(2,2), (1,1),(1,2)] 46 | d[19] = [(2,2), (1,2)] 47 | 48 | d[20] = [(2,0)] 49 | d[21] = [(2,0),(2,1)] 50 | d[22] = [(2,0),(2,1),(2,2)] 51 | d[23] = [(2,1),(2,2)] 52 | d[24] = [(2,2)] 53 | 54 | for xs in d: 55 | summ = 0 56 | for row,col in xs: 57 | idx = row*f*f + col*f 58 | for k in range(n_curr): 59 | summ += deltas[idx+k] 60 | print(summ) 61 | -------------------------------------------------------------------------------- /test/specs/SumTest.cpp: -------------------------------------------------------------------------------- 1 | #include "TestSpecsDeclarations.hpp" 2 | #include // snprintf 3 | #include "../../src/DataPipeline.hpp" 4 | 5 | namespace test { 6 | namespace specs { 7 | 8 | /// 9 | /// PIMPL 10 | /// 11 | struct SumTestImpl {}; 12 | 13 | /// 14 | /// SumTest 15 | /// 16 | 17 | TEST_SPEC_PIMPL(SumTest) 18 | 19 | void SumTest::init() {} 20 | 21 | std::string SumTest::name(size_t sq) { 22 | return sq == 1 ? "Sum all test - squared" : "Sum all test"; 23 | } 24 | 25 | size_t SumTest::data_set_count() { return 2; } 26 | 27 | bool SumTest::operator()(size_t sq, cnn_sr::DataPipeline *const pipeline) { 28 | assert_not_null(pipeline); 29 | auto _context = pipeline->context(); 30 | 31 | bool squared = sq == 1; 32 | const size_t data_len = 900; 33 | long long expected = 0; 34 | float cpu_data[data_len]; 35 | for (size_t i = 0; i < data_len; i++) { 36 | cpu_data[i] = i; 37 | expected += squared ? 
i * i : i; 38 | } 39 | // std::cout << sq << "->" << squared << " exp: " << expected << std::endl; 40 | auto gpu_buf_data = 41 | _context->allocate(CL_MEM_READ_ONLY, sizeof(cl_float) * data_len); 42 | _context->write_buffer(gpu_buf_data, (void *)cpu_data, true); 43 | 44 | cl_ulong result = pipeline->sum(gpu_buf_data, squared); 45 | 46 | // ok, we do not expect 100% correct result 47 | long long margin = 20; 48 | long long err = expected - result; 49 | err = err < 0 ? -err : err; 50 | if (err > margin) { 51 | char msg_buffer[128]; 52 | snprintf(msg_buffer, sizeof(msg_buffer), // 53 | "Expected %lld to be %lld", result, expected); 54 | throw TestException(msg_buffer); 55 | } 56 | 57 | return true; 58 | } 59 | 60 | // 61 | // 62 | } // namespace specs 63 | } // namespace test 64 | -------------------------------------------------------------------------------- /profile.py: -------------------------------------------------------------------------------- 1 | import re 2 | import time 3 | import subprocess 4 | 5 | epochs = 100 6 | seconds_per_epoch = 0.236 7 | cmd = 'bin\\cnn.exe train dry -c data\config.json --epochs {0:} -i data\\train_samples36'.format(epochs) 8 | 9 | kernel_profile_regex = "Kernel '.*/(.*?]).*?([\-e.\d]+)ns.*?([\-e.\d]+)s" 10 | def get_kernel_profiling_info(out): 11 | out = out.decode('UTF-8') 12 | rr = re.findall(kernel_profile_regex, out) 13 | l = [(x[0], int(x[1]), float(x[2])) for x in rr] 14 | l = sorted(l, key=lambda x: x[2]) 15 | ts = 0.0 16 | for _,_,t in l: 17 | ts += t 18 | return l, ts 19 | 20 | 21 | if __name__ == '__main__': 22 | import sys 23 | 24 | kernel_mode = 'kernel' in sys.argv 25 | 26 | cmd_ = cmd.split(' ') 27 | if kernel_mode: 28 | cmd_.append('profile') 29 | print('Command to execute:') 30 | print('\'' + (' '.join(cmd_)) + '\'') 31 | 32 | est_time = epochs * seconds_per_epoch 33 | print('Will do {0:} epochs'.format( epochs)) 34 | print('Estimated required time: {:.3f}s = {:.3f} min'.format(est_time, est_time//60)) 35 | 36 | 
start = time.time() 37 | proc = subprocess.Popen(cmd_, stdout=subprocess.PIPE, stderr=subprocess.PIPE) 38 | outs, errs = proc.communicate() 39 | if proc.returncode is not 0: 40 | print('---- FAIL ----') 41 | exit() 42 | end = time.time() 43 | dt = end - start 44 | 45 | print("Execution time: {:.3f}s = {:.2f}min ({:.5f} s/epoch)".format(dt, dt/60, dt/epochs)) 46 | 47 | if kernel_mode: 48 | kps, kernel_time = get_kernel_profiling_info(outs) 49 | for name,ns,s in kps: 50 | name = name.replace('-D ', '').replace('\'', '').replace('[--]','') 51 | print("{0:7.4f}s ({1:5.2f}%)- {2:.65}".format(s, s*100/kernel_time, name)) 52 | print( "Time spend in kernel: {:f}s".format(kernel_time)) 53 | print("Percent of time spend in kernel: {:.4f}%".format(kernel_time*100/dt)) 54 | -------------------------------------------------------------------------------- /test/specs/TestSpecsDeclarations.hpp: -------------------------------------------------------------------------------- 1 | #ifndef TEST_SPECS_DECL_H 2 | #define TEST_SPECS_DECL_H 3 | 4 | #include "../TestCase.hpp" 5 | #include "../../src/opencl/Context.hpp" 6 | 7 | #define DECLARE_TEST_SPEC(X, ...) 
\ 8 | struct CONCATENATE(X, Impl); \ 9 | class X : public TestCase { \ 10 | public: \ 11 | X(); \ 12 | ~X(); \ 13 | void init(__VA_ARGS__); \ 14 | std::string name(size_t data_set_id) override; \ 15 | bool operator()(size_t data_set_id, cnn_sr::DataPipeline *const) override; \ 16 | size_t data_set_count() override; \ 17 | \ 18 | private: \ 19 | CONCATENATE(X, Impl) *const _impl = nullptr; \ 20 | }; 21 | 22 | #define TEST_SPEC_PIMPL(X) \ 23 | X::X() : _impl(new CONCATENATE(X, Impl)()) {} \ 24 | X::~X() { delete _impl; } 25 | 26 | namespace test { 27 | namespace specs { 28 | 29 | DECLARE_TEST_SPEC(ExtractLumaTest) 30 | DECLARE_TEST_SPEC(SwapLumaTest) 31 | DECLARE_TEST_SPEC(SquaredErrorTest) 32 | DECLARE_TEST_SPEC(SubtractFromAllTest) 33 | DECLARE_TEST_SPEC(SumTest) 34 | DECLARE_TEST_SPEC(LayerDeltasTest) 35 | DECLARE_TEST_SPEC(BackpropagationTest) 36 | DECLARE_TEST_SPEC(LayerTest) 37 | DECLARE_TEST_SPEC(LastLayerDeltaTest) 38 | DECLARE_TEST_SPEC(UpdateParametersTest) 39 | DECLARE_TEST_SPEC(ConfigTest) 40 | 41 | } 42 | } 43 | 44 | #endif /* TEST_SPECS_DECL_H */ 45 | -------------------------------------------------------------------------------- /test/TestCase.hpp: -------------------------------------------------------------------------------- 1 | #ifndef TEST_CASE_H 2 | #define TEST_CASE_H 3 | 4 | #include "../src/pch.hpp" 5 | #include 6 | #include 7 | 8 | namespace test { 9 | 10 | /// 11 | /// utils functions 12 | /// 13 | 14 | float activation_function(float); 15 | float activation_function_derivative(float); 16 | 17 | /// 18 | /// TestException 19 | /// 20 | class TestException : public std::runtime_error { 21 | public: 22 | TestException(); 23 | TestException(const char *); 24 | TestException(const TestException &); 25 | 26 | virtual const char *what() const throw(); 27 | 28 | private: 29 | std::ostringstream cnvt; 30 | }; 31 | 32 | /// 33 | /// TestCase etc. 
34 | /// 35 | struct DataSet { 36 | DataSet(std::string name) : name(name) {} 37 | DataSet() {} 38 | std::string name; 39 | }; 40 | 41 | class TestCase { 42 | public: 43 | virtual ~TestCase() {} /* virtual: deleting a derived spec through a TestCase pointer must run the derived destructor */ 44 | 45 | virtual std::string name(size_t data_set_id) = 0; 46 | virtual bool operator()(size_t data_set_id, cnn_sr::DataPipeline *const) = 0; 47 | virtual size_t data_set_count() { return 1; } 48 | 49 | protected: 50 | void assert_equals(int expected, int result); 51 | void assert_equals(float expected, float result); 52 | void assert_equals(const std::vector &expected, 53 | const std::vector &result, bool print = false); 54 | void assert_equals(cnn_sr::DataPipeline *const, 55 | const std::vector &expected, opencl::MemoryHandle, 56 | bool print = false); 57 | void assert_true(bool v, const char *msg); 58 | void assert_data_set_ok(size_t); 59 | 60 | template 61 | void assert_not_null(T *, const char *msg = nullptr); 62 | }; 63 | 64 | /// 65 | /// template implementations 66 | /// 67 | 68 | template 69 | void TestCase::assert_not_null(T *ptr, const char *msg) { 70 | if (!msg) msg = "Null pointer"; 71 | assert_true(ptr != nullptr, msg); 72 | } 73 | } 74 | 75 | #endif /* TEST_CASE_H */ 76 | -------------------------------------------------------------------------------- /libs/include/CL/opencl.h: -------------------------------------------------------------------------------- 1 | /******************************************************************************* 2 | * Copyright (c) 2008-2010 The Khronos Group Inc. 
3 | * 4 | * Permission is hereby granted, free of charge, to any person obtaining a 5 | * copy of this software and/or associated documentation files (the 6 | * "Materials"), to deal in the Materials without restriction, including 7 | * without limitation the rights to use, copy, modify, merge, publish, 8 | * distribute, sublicense, and/or sell copies of the Materials, and to 9 | * permit persons to whom the Materials are furnished to do so, subject to 10 | * the following conditions: 11 | * 12 | * The above copyright notice and this permission notice shall be included 13 | * in all copies or substantial portions of the Materials. 14 | * 15 | * THE MATERIALS ARE PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 16 | * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 17 | * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 18 | * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY 19 | * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, 20 | * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE 21 | * MATERIALS OR THE USE OR OTHER DEALINGS IN THE MATERIALS. 
22 | ******************************************************************************/ 23 | 24 | /* $Revision: 11708 $ on $Date: 2010-06-13 23:36:24 -0700 (Sun, 13 Jun 2010) $ */ 25 | 26 | #ifndef __OPENCL_H 27 | #define __OPENCL_H 28 | 29 | #ifdef __cplusplus 30 | extern "C" { 31 | #endif 32 | 33 | #ifdef __APPLE__ 34 | 35 | #include 36 | #include 37 | #include 38 | #include 39 | 40 | #else 41 | 42 | #include "CL/cl.h" 43 | #include "CL/cl_gl.h" 44 | #include "CL/cl_gl_ext.h" 45 | #include "CL/cl_ext.h" 46 | 47 | #endif 48 | 49 | #ifdef __cplusplus 50 | } 51 | #endif 52 | 53 | #endif /* __OPENCL_H */ 54 | 55 | -------------------------------------------------------------------------------- /src/LayerData.hpp: -------------------------------------------------------------------------------- 1 | #ifndef LAYER_DATA_H 2 | #define LAYER_DATA_H 3 | 4 | #include 5 | #include // for size_t 6 | #include // for operator<< 7 | 8 | namespace cnn_sr { 9 | 10 | /* clang-format off */ 11 | /** 12 | * 13 | * Test data schema description (values for each layer provided after '/'): 14 | * 15 | * n_prev_filter_cnt := INT, filter count for previous layer, values: 1/n1/n2 16 | * current_filter_count := INT, filter count for this layer, values: n1/n2/1 17 | * f_spatial_size := INT, spatial size, values: f1/f2/f3 18 | * weights := VECTOR[FLOAT], min size: weight_size 19 | * Each column for different filter(from 1 to current_filter_count) 20 | * Each row for different point in range 0..f_spatial_size^2 21 | * Each paragraph is 1 row of points (f_spatial_size points) 22 | * bias := VECTOR[FLOAT], min size: bias_size 23 | * 24 | * calcutated values: 25 | * input_size := input_w * input_h * n_prev_filter_cnt * current_filter_count 26 | * out_w := input_w - f_spatial_size + 1 27 | * out_h := input_h - f_spatial_size + 1 28 | * weight_count := f_spatial_size^2 * n_prev_filter_cnt 29 | * bias_count := current_filter_count 30 | */ 31 | struct LayerData { 32 | /* clang-format on */ 33 | 34 | 
LayerData(size_t n_prev_filter_cnt, size_t current_filter_count, 35 | size_t f_spatial_size); 36 | 37 | static void validate(const LayerData&); 38 | 39 | // setters 40 | void set_weights(float*); 41 | void set_bias(float*); 42 | // getters 43 | size_t input_size(size_t w, size_t h) const; 44 | void get_output_dimensions(size_t*, size_t w, size_t h) const; 45 | size_t weight_size() const; 46 | size_t bias_size() const; 47 | inline const float* weights_ptr() const { return &weights[0]; } 48 | inline const float* bias_ptr() const { return &bias[0]; } 49 | 50 | public: 51 | const size_t n_prev_filter_cnt; 52 | const size_t current_filter_count; 53 | const size_t f_spatial_size; 54 | 55 | /** stale */ 56 | std::vector weights; 57 | /** stale */ 58 | std::vector bias; 59 | }; 60 | } 61 | 62 | std::ostream& operator<<(std::ostream&, const cnn_sr::LayerData&); 63 | 64 | #endif /* LAYER_DATA_H */ 65 | -------------------------------------------------------------------------------- /src/kernel/last_layer_delta.cl: -------------------------------------------------------------------------------- 1 | /* clang-format off */ 2 | /** 3 | * [main description] 4 | * @param float* ground_truth_image [description] 5 | * @param float* algo_result [description] 6 | * @param float* target [description] 7 | * @param float weight_decay regularization term to bring the weights down 8 | * @param uint ground_truth_w [description] 9 | * @param uint algo_result_w 10 | * @param uint algo_result_h 11 | * @return {[type]} [description] 12 | */ 13 | /* clang-format on */ 14 | __kernel void last_layer_delta(__read_only __global float* ground_truth_image, 15 | __read_only __global float* algo_result, 16 | __global float* target, // 17 | __const uint ground_truth_w, // 18 | __const uint ground_truth_h, // 19 | __const uint algo_result_w, // 20 | __const uint algo_result_h) { 21 | const int2 pos = {get_global_id(0), get_global_id(1)}; // x=col=i, y=row=j 22 | const uint sample_id = get_global_id(2); 23 
| const int2 out_size = {algo_result_w, algo_result_h}; 24 | const int idx = (pos.y * algo_result_w) + pos.x; 25 | const size_t padding = (ground_truth_w - algo_result_w) / 2; 26 | 27 | #define IMAGE_OFFSET_GT sample_id* ground_truth_w* ground_truth_h 28 | #define IMAGE_OFFSET_ALGO sample_id* algo_result_w* algo_result_h 29 | 30 | // size of ground_truth != algo res (padding) 31 | // The offset is not const, since it depends on the row we are in 32 | // algo for ground_truth_idx: 33 | // (row + padding_on_top_of_image) * width + padding_left + col 34 | const size_t ground_truth_idx = 35 | (pos.y + padding) * ground_truth_w + padding + pos.x; 36 | 37 | if (pos.x >= 0 && pos.x < out_size.x && // 38 | pos.y >= 0 && pos.y < out_size.y) { 39 | // usuall square error derivative calculations 40 | float t = ground_truth_image[IMAGE_OFFSET_GT + ground_truth_idx]; 41 | float y = algo_result[IMAGE_OFFSET_ALGO + idx]; 42 | float d = y - t; 43 | 44 | // relu 45 | float relu_deriv = y > 0.0f ? 1.0f : 0.0f; 46 | 47 | // write result 48 | target[IMAGE_OFFSET_ALGO + idx] = d * relu_deriv; 49 | } 50 | } 51 | -------------------------------------------------------------------------------- /makefile: -------------------------------------------------------------------------------- 1 | # $@ - left side of ':' 2 | # $^ - right side of ':' 3 | # $< - first of dependencies 4 | 5 | CC = clang++ 6 | VPATH = src/opencl src test test/specs libs/cpp 7 | IDIR = libs/include 8 | ODIR = obj 9 | BINDIR = bin 10 | LIBS = -lm -L libs/lib -l OpenCL 11 | EXECNAME = cnn.exe 12 | 13 | CFLAGS = -std=c++11 \ 14 | -c \ 15 | -g \ 16 | -Wall \ 17 | -Wextra \ 18 | -stdlib=libstdc++ \ 19 | -isystem "C:\programs\install\MinGW\include" \ 20 | -isystem "C:\programs\install\MinGW\lib\gcc\mingw32\4.7.2\include\c++" \ 21 | -isystem "C:\programs\install\MinGW\lib\gcc\mingw32\4.7.2\include\c++\mingw32" \ 22 | -I$(IDIR) 23 | 24 | LFLAGS = -std=c++11 \ 25 | -l "stdc++" \ 26 | -I$(IDIR) 27 | 28 | __OBJ = Config.o \ 29 | 
LayerData.o \ 30 | DataPipeline.o \ 31 | ConfigBasedDataPipeline.o \ 32 | pch.o \ 33 | Context.o \ 34 | UtilsOpenCL.o \ 35 | Kernel.o \ 36 | gason.o 37 | 38 | _OBJ = Main_cl.o $(__OBJ) 39 | OBJ = $(patsubst %,$(ODIR)/%,$(_OBJ)) # append ODIR to each entry 40 | 41 | # _TEST_OBJ = TestRunner.o $(__OBJ) TestDataProvider.o LayerDeltasTest.o BackpropagationTest.o 42 | _TEST_OBJ = TestRunner.o $(__OBJ) \ 43 | TestCase.o \ 44 | ExtractLumaTest.o \ 45 | SwapLumaTest.o \ 46 | SquaredErrorTest.o \ 47 | SubtractFromAllTest.o \ 48 | SumTest.o \ 49 | LayerDeltasTest.o \ 50 | BackpropagationTest.o \ 51 | LayerTest.o \ 52 | LastLayerDeltaTest.o \ 53 | UpdateParametersTest.o \ 54 | ConfigTest.o 55 | TEST_OBJ = $(patsubst %,$(ODIR)/%,$(_TEST_OBJ)) 56 | 57 | 58 | # If the first argument is "run"... 59 | ifeq (run,$(firstword $(MAKECMDGOALS))) 60 | # use the rest as arguments for "run" 61 | RUN_ARGS := $(wordlist 2,$(words $(MAKECMDGOALS)),$(MAKECMDGOALS)) 62 | # ...and turn them into do-nothing targets 63 | $(eval $(RUN_ARGS):;@:) 64 | endif 65 | # these goals are commands, not files; 'test' would otherwise be matched against the test/ directory 66 | .PHONY: build compile run test clean 67 | # default target 68 | build: $(EXECNAME) 69 | 70 | compile: $(OBJ) 71 | 72 | # if You pass arguments do it like this: 73 | # 'make run -- ARGS_HERE' 74 | run: $(EXECNAME) 75 | @echo ----------------------- 76 | @$(BINDIR)/$< $(RUN_ARGS) 77 | 78 | test: $(TEST_OBJ) 79 | @echo Linking tests.. 80 | g++ -o $(BINDIR)/test.exe $^ $(LFLAGS) $(LIBS) 81 | @echo ----------------------- 82 | @$(BINDIR)/test.exe 83 | 84 | 85 | clean: 86 | rm -f $(ODIR)/*.o 87 | rm -f $(BINDIR)/* 88 | 89 | 90 | 91 | $(EXECNAME): $(OBJ) 92 | @echo Linking.. 
/* clang-format off */
/**
 * Atomically adds `operand` to the float stored at `source`.
 *
 * The float is viewed through a union as a 32-bit unsigned integer and
 * updated with an atomic_cmpxchg retry loop: the exchange only succeeds when
 * the value did not change between our read and our write, otherwise the
 * addition is recomputed from the fresh value.
 *
 * @see http://suhorukov.blogspot.com/2011/12/opencl-11-atomic-operations-on-floating.html
 * @see http://simpleopencl.blogspot.com/2013/05/atomic-operations-and-floats-in-opencl.html
 *
 * @param source   global accumulator, updated in place
 * @param operand  value added to *source
 */
inline void atomic_add_global(volatile __global float* source, const float operand) {
  /* clang-format on */
  union {
    unsigned int intVal;
    float floatVal;
  } newVal;

  union {
    unsigned int intVal;
    float floatVal;
  } prevVal;

  // NOTE: atomic_cmpxchg(volatile __global unsigned int *p,
  //                      unsigned int cmp, unsigned int val)
  do {
    prevVal.floatVal = *source;
    newVal.floatVal = prevVal.floatVal + operand;
  } while (atomic_cmpxchg((volatile __global unsigned int*)source,
                          prevVal.intVal,  //
                          newVal.intVal) != prevVal.intVal);
}

/**
 * Adds the sum of the first `len` elements of `data` to `*target`
 * (sum of squares when compiled with -D SUM_SQUARED).
 *
 * Every work-group reduces its values in `scratch` and then performs a single
 * atomic add of the partial result. The kernel only ever adds to `*target`,
 * it never resets it.
 *
 * Code partially inspired by:
 * http://developer.amd.com/resources/documentation-articles/articles-whitepapers/opencl-optimization-case-study-simple-reductions/
 *
 * @param data     input values; work-items with id >= len contribute 0
 * @param target   single float accumulator shared by all work-groups
 * @param scratch  local buffer, indexed by local id, so it needs
 *                 get_local_size(0) floats
 * @param len      number of valid elements in `data`
 */
__kernel void sum(const __global float* data,       //
                  volatile __global float* target,  //
                  __local float* scratch,           //
                  __const uint len) {
  // ids are unsigned (size_t), which also keeps the comparison with `len`
  // free of signed/unsigned mixing
  const size_t global_index = get_global_id(0);
  const size_t local_index = get_local_id(0);

  // each work-item computes its value and stores it in the local scratch
  // buffer
  float val = global_index < len ? data[global_index] : 0.0f;
#ifdef SUM_SQUARED
  val = val * val;
#endif
  scratch[local_index] = val;

  // wait until every work-item of the group has written its value
  barrier(CLK_LOCAL_MEM_FENCE);

  // Tree reduction over the `active` leading entries of scratch.
  // The stride is rounded up, so an odd number of active entries keeps its
  // middle element instead of dropping it; this makes the reduction correct
  // for any work-group size. For power-of-two sizes the pairs that get added
  // are exactly the ones a plain "halve the offset" loop would use.
  for (size_t active = get_local_size(0); active > 1;) {
    const size_t stride = (active + 1) / 2;
    if (local_index + stride < active) {
      scratch[local_index] += scratch[local_index + stride];
    }
    // all work-items reach this barrier: the loop bounds are uniform
    barrier(CLK_LOCAL_MEM_FENCE);
    active = stride;
  }

  // a single work-item publishes the group's partial sum
  if (local_index == 0) {
    atomic_add_global(target, scratch[0]);
  }
}
| TEST_SPEC_PIMPL(ExtractLumaTest) 40 | 41 | void ExtractLumaTest::init() {} 42 | 43 | size_t ExtractLumaTest::data_set_count() { return 2; } 44 | 45 | std::string ExtractLumaTest::name(size_t data_set_id) { 46 | assert_data_set_ok(data_set_id); 47 | return "Extract luma test - " + _impl->data_sets[data_set_id].name; 48 | } 49 | 50 | bool ExtractLumaTest::operator()(size_t data_set_id, 51 | cnn_sr::DataPipeline *const pipeline) { 52 | assert_not_null(pipeline); 53 | assert_data_set_ok(data_set_id); 54 | bool normalize = _impl->data_sets[data_set_id].normalize; 55 | 56 | opencl::utils::ImageData data; 57 | load_image(test_image, data); 58 | this->assert_true( 59 | _impl->data_size[0] * _impl->data_size[1] == (size_t)(data.w * data.h), 60 | "Vector of 1st layer's input values should be at least as big as test" 61 | " image"); 62 | 63 | opencl::MemoryHandle gpu_buf_raw_img = gpu_nullptr, 64 | gpu_buf_luma = gpu_nullptr; 65 | pipeline->extract_luma(data, gpu_buf_raw_img, gpu_buf_luma, normalize); 66 | 67 | std::vector expected = _impl->output; 68 | for (int i = 0; (!normalize) && (i < data.w * data.h); i++) { 69 | expected[i] *= 255; 70 | } 71 | assert_equals(pipeline, expected, gpu_buf_luma); 72 | 73 | return true; 74 | } 75 | 76 | // 77 | // 78 | } // namespace specs 79 | } // namespace test 80 | -------------------------------------------------------------------------------- /test/specs/LumaTests_script.py: -------------------------------------------------------------------------------- 1 | from PIL import Image 2 | 3 | rgb2y = [ 0.299, 0.587, 0.114, 0.0] 4 | rgb2Cb = [-0.1687, -0.3312, 0.5, 0.0] 5 | rgb2Cr = [ 0.5, -0.4186, -0.0813, 0.0] 6 | YCbCr2r = [1.0, 0.0, 1.4, 0.0] 7 | YCbCr2g = [1.0, -0.343, -0.711, 0.0] 8 | YCbCr2b = [1.0, 1.765, 0.0, 0.0] 9 | 10 | def dot(a,b): 11 | return a[0]*b[0] + a[1]*b[1] + a[2]*b[2] 12 | 13 | 14 | def extract_luma(img): 15 | luma_channel = [] 16 | width, height = img.size 17 | pixels = img.load() # this is not a list, nor is it 
list()'able 18 | for y in range(height): 19 | for x in range(width): 20 | cpixel = pixels[x, y] 21 | luma_val = dot(cpixel, rgb2y) 22 | luma_channel.append(luma_val/255) 23 | l = int(len(luma_channel)**0.5) 24 | for i in range(l): 25 | xs = luma_channel[i*l : (i+1)*l] 26 | print(', '.join(["{:>6.3}".format(x) for x in xs])) 27 | 28 | def swap_luma(img, padding, out_path): 29 | "To verify this works run it on f.e. 256*256 picture, but it may take some time (like 3 min or so)" 30 | print("Deprecated, see SwapLumaTest.cpp") 31 | raise Exception('SwapLuma does not produce acceptable result file') 32 | 33 | img_w, img_h = img.size 34 | 35 | # generate luma to swap into 36 | total_padding = padding * 2 37 | luma_w,luma_h = img_w - total_padding, img_h - total_padding 38 | new_luma_size = luma_w * luma_w 39 | new_luma = [(i/new_luma_size) for i in range(new_luma_size)] 40 | # print(new_luma) 41 | 42 | pixels = img.load() # this is not a list, nor is it list()'able 43 | for y in range(img_w): 44 | for x in range(img_h): 45 | pos_luma = x - padding, y - padding 46 | idx_luma = pos_luma[1] * luma_w + pos_luma[0] 47 | # idx = y * img_w + x 48 | cpixel = pixels[x, y] # 0..255 49 | 50 | if pos_luma[0] >= 0 and pos_luma[0] < luma_w and \ 51 | pos_luma[1] >= 0 and pos_luma[1] < luma_h: 52 | raw_luma = new_luma[idx_luma] 53 | YCbCr = (raw_luma * 255, # 0..255 54 | dot(rgb2Cb, cpixel), 55 | dot(rgb2Cr, cpixel)) 56 | clamp = lambda x: int(min(255, max(0, x))) 57 | new_color = (clamp(dot(YCbCr2r, YCbCr)), \ 58 | clamp(dot(YCbCr2g, YCbCr)), \ 59 | clamp(dot(YCbCr2b, YCbCr))) 60 | else: 61 | new_color = cpixel 62 | # print(new_color) 63 | pixels[x, y] = new_color 64 | img.save(out_path, "JPEG") 65 | 66 | 67 | 68 | if __name__ == '__main__': 69 | extract_luma_img = Image.open("../data/color_grid.png") 70 | extract_luma(extract_luma_img) 71 | 72 | # swap_luma_img = Image.open( "../data/color_grid2.jpg") 73 | # swap_luma_img = Image.open( "../data/color_grid3.png") 74 | # 
swap_luma(swap_luma_img, 10, "../data/color_grid2_luma_swapped.png") 75 | -------------------------------------------------------------------------------- /libs/include/CL/cl_gl_ext.h: -------------------------------------------------------------------------------- 1 | /********************************************************************************** 2 | * Copyright (c) 2008-2010 The Khronos Group Inc. 3 | * 4 | * Permission is hereby granted, free of charge, to any person obtaining a 5 | * copy of this software and/or associated documentation files (the 6 | * "Materials"), to deal in the Materials without restriction, including 7 | * without limitation the rights to use, copy, modify, merge, publish, 8 | * distribute, sublicense, and/or sell copies of the Materials, and to 9 | * permit persons to whom the Materials are furnished to do so, subject to 10 | * the following conditions: 11 | * 12 | * The above copyright notice and this permission notice shall be included 13 | * in all copies or substantial portions of the Materials. 14 | * 15 | * THE MATERIALS ARE PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 16 | * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 17 | * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 18 | * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY 19 | * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, 20 | * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE 21 | * MATERIALS OR THE USE OR OTHER DEALINGS IN THE MATERIALS. 22 | **********************************************************************************/ 23 | 24 | /* $Revision: 11708 $ on $Date: 2010-06-13 23:36:24 -0700 (Sun, 13 Jun 2010) $ */ 25 | 26 | /* cl_gl_ext.h contains vendor (non-KHR) OpenCL extensions which have */ 27 | /* OpenGL dependencies. 
"""Generate (large, small) training sample pairs from a directory of images.

For every input image a random ``out_size`` x ``out_size`` crop is stored as
the "large" sample; the "small" sample is the same crop scaled down by the
degrade factor and scaled back up, i.e. a blurred version of the crop.
"""
import os
from os.path import isfile, join
from random import randint

import argparse
from PIL import Image

# Image.ANTIALIAS was removed in Pillow 10; LANCZOS is the current name of the
# same filter. Fall back to the old name on very old installations.
RESAMPLE_FILTER = Image.LANCZOS if hasattr(Image, 'LANCZOS') else Image.ANTIALIAS

# Modes that the JPEG writer accepts directly.
JPEG_MODES = ('RGB', 'L', 'CMYK')

img_id = 0          # id used in the name of the next sample pair
created_files = []  # (large_path, small_path) for every pair written


def list_files(dir_path):
    """Return the names of all regular files directly inside `dir_path`."""
    return [f for f in os.listdir(dir_path) if isfile(join(dir_path, f))]


def process_img(in_dir, out_dir, file_name, out_size, small_scale):
    """Create one large/small sample pair from `in_dir/file_name`.

    Raises Exception when the image is smaller than `out_size`, ValueError
    when `small_scale` is not positive, and IOError/OSError when the image
    cannot be read or written. A sample id is only consumed when both files
    were written.
    """
    global img_id
    if small_scale <= 0:
        raise ValueError('degrade factor must be positive, got {0:}'.format(small_scale))

    in_path = join(in_dir, file_name)
    large_path = join(out_dir, 'sample_{0:}_{1:}.jpg'.format(img_id, "large"))
    small_path = join(out_dir, 'sample_{0:}_{1:}.jpg'.format(img_id, "small"))

    # the context manager releases the file handle even when we bail out early
    with Image.open(in_path) as im:
        if im.width < out_size or im.height < out_size:
            raise Exception('Image \'{0:}\' is smaller then requested out-size'.format(file_name))

        left = randint(0, im.width - out_size)
        top = randint(0, im.height - out_size)
        large = im.crop((left, top, left + out_size, top + out_size))
        # palette / alpha images (typical for PNG input) cannot be stored as JPEG
        if large.mode not in JPEG_MODES:
            large = large.convert('RGB')
        large.save(large_path, "JPEG")

        # never scale down to 0 px, resize() rejects empty sizes
        small_size = max(1, int(out_size / small_scale))
        shrunk = large.resize((small_size, small_size), RESAMPLE_FILTER)
        small = shrunk.resize((out_size, out_size), RESAMPLE_FILTER)
        small.save(small_path, "JPEG")

    img_id += 1
    created_files.append((large_path, small_path))


if __name__ == '__main__':
    help_text = 'Mass resize images. Usage: ' + \
        '"generate_training_samples.py -i data\\train_samples_raw -o data\\train_samples -s 128 -d 3"'

    parser = argparse.ArgumentParser(description=help_text)
    parser.add_argument('--in-dir', '-i', required=True, help='input directory')
    parser.add_argument('--out-dir', '-o', required=True, help='output directory')
    parser.add_argument('--out-size', '-s', required=True, help='size of output images', type=int)
    parser.add_argument('--degrade-factor', '-d', help='scale factor when producing smaller image', type=float, default=2)
    args = parser.parse_args()

    in_files = list_files(args.in_dir)

    os.makedirs(args.out_dir, exist_ok=True)
    for f in in_files:
        try:
            process_img(args.in_dir, args.out_dir, f, args.out_size, args.degrade_factor)
        except IOError:
            print("cannot create train samples for '{0:}'".format(f))
        except Exception as e:
            print(str(e))

    if not created_files:
        print('No files were created')
    else:
        print('created {0:} files'.format(len(created_files)))
", 28 | data.f_spatial_size, data.f_spatial_size, data.n_prev_filter_cnt, 29 | data.current_filter_count, data.weight_size(), 30 | data.weights.size()); 31 | throw std::runtime_error(buf); 32 | } 33 | 34 | if (data.bias.size() < data.bias_size()) { 35 | char buf[255]; 36 | snprintf(buf, 255, 37 | "Bias array(size=%d) should have equal size to " 38 | "current_filter_count(%d).", 39 | data.bias.size(), data.bias_size()); 40 | throw std::runtime_error(buf); 41 | } 42 | } 43 | 44 | /// 45 | /// get&set 46 | /// 47 | 48 | void LayerData::set_weights(float* x) { 49 | if (x) std::copy(x, x + this->weight_size(), back_inserter(this->weights)); 50 | } 51 | 52 | void LayerData::set_bias(float* x) { 53 | if (x) std::copy(x, x + this->bias_size(), back_inserter(this->bias)); 54 | } 55 | 56 | void LayerData::get_output_dimensions(size_t* dim_arr, size_t input_w, 57 | size_t input_h) const { 58 | dim_arr[0] = input_w - f_spatial_size + 1; 59 | dim_arr[1] = input_h - f_spatial_size + 1; 60 | } 61 | 62 | size_t LayerData::weight_size() const { 63 | return f_spatial_size * f_spatial_size * n_prev_filter_cnt * 64 | current_filter_count; 65 | } 66 | 67 | size_t LayerData::bias_size() const { return current_filter_count; } 68 | 69 | size_t LayerData::input_size(size_t input_w, size_t input_h) const { 70 | return input_w * input_h * n_prev_filter_cnt; 71 | } 72 | 73 | // namespace cnn_sr 74 | } 75 | 76 | std::ostream& operator<<(std::ostream& os, const cnn_sr::LayerData& data) { 77 | os << "Layer {" 78 | << " previous filters: " << data.n_prev_filter_cnt 79 | << ", current filters: " << data.current_filter_count 80 | << ", f_spatial_size: " << data.f_spatial_size 81 | << ", weighs.size: " << data.weights.size() 82 | << ", bias.size: " << data.bias.size() << "}"; 83 | return os; 84 | } 85 | -------------------------------------------------------------------------------- /test/specs/SquaredErrorTest.cpp: -------------------------------------------------------------------------------- 1 | 
#include "TestSpecsDeclarations.hpp" 2 | 3 | #include // for std::mt19937 4 | #include // for random seed 5 | 6 | #include "../../src/DataPipeline.hpp" 7 | 8 | namespace test { 9 | namespace specs { 10 | 11 | /// 12 | /// PIMPL 13 | /// 14 | struct SquaredErrorTestImpl { 15 | const size_t algo_w = 1000, algo_h = 2000; 16 | const size_t padding = 4; 17 | }; 18 | 19 | /// 20 | /// SquaredErrorTest 21 | /// 22 | 23 | TEST_SPEC_PIMPL(SquaredErrorTest) 24 | 25 | void SquaredErrorTest::init() {} 26 | 27 | std::string SquaredErrorTest::name(size_t) { return "Squared error test"; } 28 | 29 | size_t SquaredErrorTest::data_set_count() { return 1; } 30 | 31 | bool SquaredErrorTest::operator()(size_t, 32 | cnn_sr::DataPipeline *const pipeline) { 33 | assert_not_null(pipeline); 34 | auto _context = pipeline->context(); 35 | 36 | // total padding (from both sides) = padding*2 37 | const size_t total_padding = _impl->padding * 2, 38 | ground_truth_w = _impl->algo_w + total_padding, 39 | ground_truth_h = _impl->algo_h + total_padding, 40 | algo_size = _impl->algo_w * _impl->algo_h, 41 | ground_truth_size = ground_truth_w * ground_truth_h; 42 | 43 | std::vector cpu_algo_res(algo_size); 44 | std::vector cpu_expected(algo_size); 45 | std::vector cpu_ground_truth(ground_truth_size); 46 | for (size_t i = 0; i < ground_truth_size; i++) { 47 | cpu_ground_truth[i] = 99999.0f; 48 | } 49 | 50 | float sum = 0.0f; 51 | unsigned seed1 = std::chrono::system_clock::now().time_since_epoch().count(); 52 | std::mt19937 generator(seed1); 53 | for (size_t i = 0; i < algo_size; i++) { 54 | size_t row = i / _impl->algo_w, col = i % _impl->algo_w, 55 | g_t_idx = 56 | (row + _impl->padding) * ground_truth_w + _impl->padding + col; 57 | cpu_ground_truth[g_t_idx] = generator() % 256; 58 | cpu_algo_res[i] = (generator() % 2560) / 10.0f; 59 | // fill expected buffer 60 | double d = cpu_ground_truth[g_t_idx] - cpu_algo_res[i]; 61 | sum += d * d; 62 | } 63 | 64 | /* clang-format off */ 65 | auto 
gpu_buf_ground_truth = _context->allocate(CL_MEM_READ_ONLY, sizeof(cl_float) * ground_truth_size); 66 | _context->write_buffer(gpu_buf_ground_truth, (void *)&cpu_ground_truth[0], true); 67 | auto gpu_buf_algo_res = _context->allocate(CL_MEM_READ_ONLY, sizeof(cl_float) * algo_size); 68 | _context->write_buffer(gpu_buf_algo_res, (void *)&cpu_algo_res[0], true); 69 | /* clang-format on */ 70 | 71 | // exec 72 | opencl::MemoryHandle tmp_buffer = gpu_nullptr; 73 | float target = 0.0f; 74 | pipeline->squared_error(gpu_buf_ground_truth, // 75 | ground_truth_w, ground_truth_h, // 76 | gpu_buf_algo_res, // 77 | tmp_buffer, target, total_padding); 78 | _context->block(); 79 | assert_equals(sum, target); 80 | 81 | return true; 82 | } 83 | 84 | // 85 | // 86 | } // namespace specs 87 | } // namespace test 88 | -------------------------------------------------------------------------------- /test/specs/LastLayerDeltaTest.cpp: -------------------------------------------------------------------------------- 1 | #include "TestSpecsDeclarations.hpp" 2 | 3 | #include // for std::mt19937 4 | #include // for random seed 5 | 6 | #include "../../src/DataPipeline.hpp" 7 | 8 | using namespace cnn_sr; 9 | 10 | namespace test { 11 | namespace specs { 12 | 13 | /// 14 | /// PIMPL 15 | /// 16 | struct LastLayerDeltaTestImpl { 17 | // const size_t algo_w = 1000, algo_h = 2000; 18 | const size_t algo_w = 6, algo_h = 6; 19 | const size_t padding = 4; 20 | }; 21 | 22 | /// 23 | /// LastLayerDeltaTest 24 | /// 25 | 26 | TEST_SPEC_PIMPL(LastLayerDeltaTest) 27 | 28 | void LastLayerDeltaTest::init() {} 29 | 30 | std::string LastLayerDeltaTest::name(size_t) { return "Last layer delta test"; } 31 | 32 | size_t LastLayerDeltaTest::data_set_count() { return 1; } 33 | 34 | bool LastLayerDeltaTest::operator()(size_t, 35 | cnn_sr::DataPipeline *const pipeline) { 36 | assert_not_null(pipeline); 37 | auto _context = pipeline->context(); 38 | 39 | // total padding (from both sides) = padding*2 40 | const 
size_t total_padding = _impl->padding * 2, 41 | ground_truth_w = _impl->algo_w + total_padding, 42 | ground_truth_h = _impl->algo_h + total_padding, 43 | algo_size = _impl->algo_w * _impl->algo_h, 44 | ground_truth_size = ground_truth_w * ground_truth_h; 45 | 46 | std::vector cpu_algo_res(algo_size); 47 | std::vector cpu_expected(algo_size); 48 | std::vector cpu_ground_truth(ground_truth_size); 49 | for (size_t i = 0; i < ground_truth_size; i++) { 50 | cpu_ground_truth[i] = 99999.0f; 51 | } 52 | 53 | unsigned seed1 = std::chrono::system_clock::now().time_since_epoch().count(); 54 | std::mt19937 generator(seed1); 55 | for (size_t i = 0; i < algo_size; i++) { 56 | size_t row = i / _impl->algo_w, col = i % _impl->algo_w, 57 | g_t_idx = 58 | (row + _impl->padding) * ground_truth_w + _impl->padding + col; 59 | float t = (generator() % 256) / 100.0f; 60 | // activation_function etc 61 | float x = (generator() % 2560) / 1000.0f; 62 | float y = activation_function(x); 63 | // fill expected buffer 64 | cpu_expected[i] = (y - t) * activation_function_derivative(x); 65 | cpu_ground_truth[g_t_idx] = t; 66 | cpu_algo_res[i] = y; 67 | } 68 | 69 | /* clang-format off */ 70 | auto gpu_buf_ground_truth = _context->allocate(CL_MEM_READ_ONLY, sizeof(cl_float) * ground_truth_size); 71 | _context->write_buffer(gpu_buf_ground_truth, (void *)&cpu_ground_truth[0], true); 72 | auto gpu_buf_algo_res = _context->allocate(CL_MEM_READ_ONLY, sizeof(cl_float) * algo_size); 73 | _context->write_buffer(gpu_buf_algo_res, (void *)&cpu_algo_res[0], true); 74 | /* clang-format on */ 75 | opencl::MemoryHandle gpu_buf_out = gpu_nullptr; 76 | 77 | // exec 78 | pipeline->last_layer_delta(gpu_buf_ground_truth, // 79 | ground_truth_w, ground_truth_h, // 80 | gpu_buf_algo_res, gpu_buf_out, total_padding); 81 | assert_equals(pipeline, cpu_expected, gpu_buf_out); 82 | return true; 83 | } 84 | 85 | // 86 | // 87 | } // namespace specs 88 | } // namespace test 89 | 
#ifndef UTILS_OPENCL_H
#define UTILS_OPENCL_H

#include "CL/opencl.h"

namespace opencl {

class Kernel;

namespace utils {

/**
 * Raw pixel data of an image together with its dimensions.
 *
 * From stb_image documentation:
 *
 * The return value from an image loader is an 'unsigned char *' which points
 * to the pixel data, or NULL on an allocation failure or if the image is
 * corrupt or invalid. The pixel data consists of *y scanlines of *x pixels,
 * with each pixel consisting of N interleaved 8-bit components; the first
 * pixel pointed to is top-left-most in the image. There is no padding between
 * image scanlines or between pixels, regardless of format. The number of
 * components N is 'req_comp' if req_comp is non-zero, or *comp otherwise.
 * If req_comp is non-zero, *comp has the number of components that _would_
 * have been output otherwise. E.g. if you set req_comp to 4, you will always
 * get RGBA output, but you can check *comp to see if it's trivially opaque
 * because e.g. there were only 3 channels in the source image.
 *
 * NOTE(review): the struct declares a destructor and holds a raw pointer but
 * does not disable copying (see TODO below). Presumably the destructor
 * releases `data`, in which case a copy would free it twice - confirm against
 * UtilsOpenCL.cpp and pass instances by reference.
 */
struct ImageData {
  ImageData();
  ImageData(int, int, int, unsigned char*);
  ~ImageData();
  // TODO do not allow copy !!!

  int w, h;             // width and height
  int bpp;              // bytes per pixel
  unsigned char* data;  // pixel data

 private:
  // presumably tells the destructor whether `data` came from the image
  // loader - TODO confirm in UtilsOpenCL.cpp
  bool read_from_file = true;
};

/**
 * cl_device_type is a number so we will change it to string
 */
extern char const* device_type_str[9];

/**
 * Loads a Program file and prepends the cPreamble to the code.
 * @param cFilename     program filename
 * @param cPreamble     code that is prepended to the loaded file, typically
 *                      a set of #defines or a header
 * @param szFinalLength returned length of the code string
 * @return              the source string if succeeded, 0 otherwise
 */
char* load_file(const char* cFilename, const char* cPreamble,
                size_t* szFinalLength);

/** Loads the image from the given path into the provided ImageData. */
void load_image(const char*, ImageData&);

/** Writes the provided ImageData to the given path. */
int write_image(const char*, ImageData&);

/**
 * Writes an image from an array of floats.
 * NOTE(review): the two size_t arguments are presumably width and height -
 * confirm against the definition.
 */
void write_image(const char* const, float*, size_t, size_t);

/**
 * Due to the different possible resolutions we may have to recalculate this
 * each time.
 *
 * NOTE: this solution tries to maximize work items per group, as most of
 * kernels have some __local related optimizations
 *
 * @param kernel            kernel to execute
 * @param dims              work dimensions: 1 for linear, 2 for 2D, 3 for 3D
 * @param global_work_size  to be filled, size: dims
 * @param local_work_size   to be filled, size: dims
 * @param work              real work size e.g. array length, image
 *                          dimensions etc., size: dims
 * @param print             presumably enables printing of the chosen sizes -
 *                          TODO confirm
 */
void work_sizes(const opencl::Kernel&, size_t dims, size_t* global_work_size,
                size_t* local_work_size, size_t* work, bool print = false);

/**
 * convert error code to string
 *
 * @param  cl_int :cl_int, error code
 * @return        :string
 */
const char* get_opencl_error_str(cl_int);
}
}

#endif /* UTILS_OPENCL_H */
| return; 33 | 34 | const uint4 pixel_col = read_imageui(original_image, sampler, pos); 35 | const float4 pixel_col_f = convert_float4(pixel_col); 36 | uint3 new_color; 37 | if (pos_luma.x < 0 || pos_luma.x >= luma_w || // 38 | pos_luma.y < 0 || pos_luma.y >= luma_h) { 39 | // sample original image 40 | new_color.x = pixel_col.x; // 0..255 41 | new_color.y = pixel_col.y; // 0..255 42 | new_color.z = pixel_col.z; // 0..255 43 | } else { 44 | // combine new luma with chroma from original image 45 | // to do this we first have to remove old luma 46 | // NOTE: during conversion we skip +128 / -128 steps as they cancel 47 | // themselves out 48 | // TODO after writing tests use matrix version of this code 49 | float raw_luma = new_luma[idx_luma]; // 0..1 50 | float4 YCbCr = {raw_luma * 255.0f, // 0..255 51 | dot(pixel_col_f, rgb2Cb), // 0..255 52 | dot(pixel_col_f, rgb2Cr), // 0..255 53 | 0.0f}; 54 | float3 rgb = {dot(YCbCr, YCbCr2r), // 55 | dot(YCbCr, YCbCr2g), // 56 | dot(YCbCr, YCbCr2b)}; 57 | rgb = clamp(rgb, 0.0f, 255.0f); 58 | // TODO mix luma values in edges of new luma area, to make the transition 59 | // less jarring 60 | new_color.x = convert_uint(rgb.x); 61 | new_color.y = convert_uint(rgb.y); 62 | new_color.z = convert_uint(rgb.z); 63 | } 64 | 65 | // write 66 | target[idx * 3 + 0] = convert_uchar(new_color.x); 67 | target[idx * 3 + 1] = convert_uchar(new_color.y); 68 | target[idx * 3 + 2] = convert_uchar(new_color.z); 69 | } 70 | -------------------------------------------------------------------------------- /test/specs/SwapLumaTest.cpp: -------------------------------------------------------------------------------- 1 | #include "TestSpecsDeclarations.hpp" 2 | 3 | #include "../../src/opencl/UtilsOpenCL.hpp" 4 | #include "../../src/DataPipeline.hpp" 5 | 6 | /* 7 | * Just run the kernel and see if the output is ~what You would expect. 8 | * This is quite easy if luma that You swap into has distinctive pattern. 
9 | * You could also try generating expected output through python 10 | * (see LumaTests_script.py), but it uses some weird sampling method, 11 | * which means that result image is slighly blurred. 12 | */ 13 | 14 | namespace test { 15 | namespace specs { 16 | 17 | /// 18 | /// PIMPL 19 | /// 20 | struct SwapLumaTestImpl { 21 | const size_t padding = 10; 22 | // const char * const test_image = "test/data/color_grid.png"; 23 | const char *const input_img = "test/data/color_grid2.jpg"; 24 | const char *const expected_img = "test/data/color_grid2_luma_swapped.png"; 25 | }; 26 | 27 | /// 28 | /// SwapLumaTest 29 | /// 30 | 31 | TEST_SPEC_PIMPL(SwapLumaTest) 32 | 33 | void SwapLumaTest::init() {} 34 | 35 | size_t SwapLumaTest::data_set_count() { return 1; } 36 | 37 | std::string SwapLumaTest::name(size_t) { return "Swap luma test"; } 38 | 39 | bool SwapLumaTest::operator()(size_t, cnn_sr::DataPipeline *const pipeline) { 40 | assert_not_null(pipeline); 41 | auto context = pipeline->context(); 42 | 43 | opencl::utils::ImageData img; 44 | load_image(_impl->input_img, img); 45 | 46 | // generate luma to swap into 47 | size_t luma_w = img.w - 2 * _impl->padding, 48 | luma_h = img.h - 2 * _impl->padding, new_luma_size = luma_w * luma_w; 49 | std::vector new_luma(new_luma_size); 50 | for (size_t i = 0; i < new_luma_size; i++) { 51 | new_luma[i] = i * 1.0f / new_luma_size; 52 | } 53 | 54 | opencl::MemoryHandle gpu_buf_raw_img = gpu_nullptr, 55 | gpu_buf_target = gpu_nullptr; 56 | auto gpu_buf_luma = 57 | context->allocate(CL_MEM_READ_ONLY, sizeof(cl_float) * new_luma.size()); 58 | context->write_buffer(gpu_buf_luma, (void *)&new_luma[0], true); 59 | 60 | // run 61 | pipeline->swap_luma(img, gpu_buf_raw_img, gpu_buf_luma, gpu_buf_target, 62 | luma_w, luma_h); 63 | // check 64 | opencl::utils::ImageData expected_img; 65 | load_image(_impl->expected_img, expected_img); 66 | size_t result_w = expected_img.w, result_h = expected_img.h, 67 | result_size = result_w * result_h * 3; // 
3 channels 68 | std::vector result(result_size); 69 | context->read_buffer(gpu_buf_target, (void *)&result[0], true); 70 | 71 | // dump image - only for debug !!! 72 | // opencl::utils::ImageData res_img(result_w, result_h, 3, &result[0]); 73 | // opencl::utils::write_image("dbg.png", res_img); 74 | 75 | for (size_t y = 0; y < result_h; y++) { 76 | for (size_t x = 0; x < result_w; x++) { 77 | for (size_t ch = 0; ch < 3; ch++) { 78 | // NOTE: expected_img has 4 channels ! 79 | size_t idx1 = y * result_w + x; 80 | int r = static_cast(result[idx1 * 3 + ch]); 81 | int e = static_cast(expected_img.data[idx1 * 4 + ch]); 82 | // std::cout << "[" << idx1 << "] expected >\t" << e << "\tgot> " << r 83 | // << std::endl; 84 | assert_equals(e, r); 85 | } 86 | } 87 | } 88 | 89 | return true; 90 | } 91 | 92 | // 93 | // 94 | } // namespace specs 95 | } // namespace test 96 | -------------------------------------------------------------------------------- /test/TestRunner.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | 3 | #include "TestCase.hpp" 4 | #include "../src/opencl/Context.hpp" 5 | #include "../src/DataPipeline.hpp" 6 | #include "specs/TestSpecsDeclarations.hpp" 7 | 8 | /// 9 | /// Test runner main function 10 | /// 11 | 12 | #define ADD_TEST(test_name, ...) 
\ 13 | test_name CONCATENATE(__test, __LINE__){}; \ 14 | CONCATENATE(__test, __LINE__).init(__VA_ARGS__); \ 15 | cases.push_back(&CONCATENATE(__test, __LINE__)); 16 | 17 | int main(int, char **) { 18 | std::cout << "STARTING TESTS" << std::endl; 19 | 20 | using namespace test; 21 | using namespace test::specs; 22 | 23 | std::vector cases; 24 | std::vector results; 25 | 26 | opencl::Context context; 27 | context.init(); 28 | cnn_sr::DataPipeline pipeline(&context); 29 | pipeline.init(cnn_sr::DataPipeline::LOAD_KERNEL_MISC); 30 | // TODO test opt 31 | // pipeline.init(true, cnn_sr::DataPipeline::LOAD_KERNEL_MISC); 32 | 33 | // 34 | // 35 | // 36 | 37 | ADD_TEST(LayerTest); 38 | ADD_TEST(ExtractLumaTest); 39 | ADD_TEST(SwapLumaTest); 40 | ADD_TEST(SquaredErrorTest); 41 | ADD_TEST(SubtractFromAllTest); 42 | ADD_TEST(SumTest); 43 | ADD_TEST(LayerDeltasTest); 44 | ADD_TEST(BackpropagationTest); 45 | ADD_TEST(LastLayerDeltaTest); 46 | ADD_TEST(UpdateParametersTest); 47 | ADD_TEST(ConfigTest); 48 | 49 | // 50 | // 51 | // 52 | // 53 | 54 | int failures = 0; 55 | for (auto i = begin(cases); i != end(cases); ++i) { 56 | TestCase *test = *i; 57 | size_t data_set_cnt = test->data_set_count(); 58 | if (data_set_cnt == 0) { 59 | data_set_cnt = 1; 60 | } 61 | 62 | // run test case with all data sets 63 | for (size_t ds = 0; ds < data_set_cnt; ds++) { 64 | auto test_name = test->name(ds); 65 | bool passed = false; 66 | 67 | std::cout << std::endl 68 | << test_name << ":" << std::endl; 69 | try { 70 | passed = (*test)(ds, &pipeline); 71 | } catch (const std::exception &ex) { 72 | std::cout << "[ERROR] " << ex.what() << std::endl; 73 | } catch (...) { 74 | std::cout << "[ERROR] Undefined exception" << std::endl; 75 | } 76 | results.push_back(passed ? 
1 : 0); 77 | } 78 | } 79 | 80 | // print results 81 | std::cout << std::endl 82 | << "RESULTS:" << std::endl; 83 | size_t test_case_it = 0; 84 | for (size_t i = 0; i < cases.size(); i++) { 85 | TestCase *test = cases[i]; 86 | size_t data_set_cnt = test->data_set_count(); 87 | if (data_set_cnt == 0) { 88 | data_set_cnt = 1; 89 | } 90 | for (size_t ds = 0; ds < data_set_cnt; ds++) { 91 | auto test_name = test->name(ds); 92 | bool passed = results[test_case_it] != 0; 93 | ++test_case_it; 94 | if (passed) { 95 | std::cout << "\t " << test_name << std::endl; 96 | } else { 97 | std::cout << "\t~ " << test_name << std::endl; 98 | ++failures; 99 | } 100 | } 101 | } 102 | 103 | if (failures == 0) { 104 | std::cout << results.size() << " tests completed" << std::endl; 105 | exit(EXIT_SUCCESS); 106 | } else { 107 | std::cout << failures << " of " << results.size() << " failed" << std::endl; 108 | exit(EXIT_FAILURE); 109 | } 110 | } 111 | -------------------------------------------------------------------------------- /test/data/test_cases.json: -------------------------------------------------------------------------------- 1 | { 2 | "k=1, n=3, f=3, input:5*5": { 3 | "n_prev_filter_cnt": 1, 4 | "current_filter_count": 3, 5 | "f_spatial_size": 3, 6 | 7 | "input_w": 5, 8 | "input_h": 5, 9 | "input": [ 10 | -0.6024, 0.3976, 0.2096, 0.2506, -0.1654, 11 | -0.4324, 0.0986, -0.1894, 0.2836, 0.1846, 12 | -0.1724, -0.3034, -0.0154, -0.4884, 0.1046, 13 | 0.0676, 0.1426, 0.2506, 0.1426, -0.3034, 14 | 0.2076, -0.0144, 0.2566, -0.0094, 0.0996], 15 | 16 | "output": [ 17 | 0.000, 0.000, 0.399, 18 | 0.000, 0.776, 0.111, 19 | 0.517, 0.000, 0.584, 20 | 0.000, 0.253, 0.000, 21 | 0.752, 0.000, 0.285, 22 | 0.000, 0.715, 0.000, 23 | 0.519, 0.200, 0.443, 24 | 0.000, 0.726, 0.551, 25 | 0.688, 0.000, 0.443], 26 | 27 | "weights": [ 28 | 1.0, 0.0, 0.0, 29 | 0.0, 1.0, 0.0, 30 | 1.0, 0.0, 0.0, 31 | 32 | 0.0, 1.0, 0.0, 33 | 1.0, 0.0, 1.0, 34 | 0.0, 1.0, 0.0, 35 | 36 | 1.0, 0.0, 0.0, 37 | 0.0, 1.0, 0.0, 38 
| 1.0, 0.0, 0.0 39 | ], 40 | 41 | "bias": [0.1, 0.2, 0.3] 42 | }, 43 | 44 | 45 | 46 | "k=3, n=2, f=3, input:3*3": { 47 | "n_prev_filter_cnt": 3, 48 | "current_filter_count": 2, 49 | "f_spatial_size": 3, 50 | 51 | "input_w": 3, 52 | "input_h": 3, 53 | "input": [ 54 | 0.406, 0.419, 0.598, 55 | 0.442, 0.685, 0.528, 56 | 0.627, 0.489, 0.642, 57 | 0.376, 0.563, 0.499, 58 | 0.680, 0.371, 0.571, 59 | 0.390, 0.672, 0.453, 60 | 0.626, 0.550, 0.609, 61 | 0.386, 0.674, 0.634, 62 | 0.666, 0.413, 0.609], 63 | 64 | "output": [0.169, 0.0], 65 | 66 | "weights": [ 67 | -0.369, 0.025, 0.213, 0.058, 0.410, -0.068, 68 | 0.236, 0.071, -0.429, -0.104, 0.161, 0.087, 69 | 0.361, -0.055, 0.273, 0.071, 0.431, -0.095, 70 | 71 | 0.229, 0.378, -0.178, 0.343, 0.114, -0.409, 72 | -0.220, -0.364, 0.711, 0.281, 0.851, -1.001, 73 | -0.411, 0.661, -0.831, -0.091, 0.281, -0.341, 74 | 75 | -0.931, 0.511, 0.141, -0.591, 0.491, -0.921, 76 | 0.291, -0.211, 0.151, 0.491, -0.431, -0.321, 77 | -0.631, 0.301, -0.001, -0.761, -0.021, 0.501], 78 | 79 | "bias": [0.1, 0.2] 80 | }, 81 | 82 | 83 | 84 | "k=3, n=3, f=1, input:3*3": { 85 | "n_prev_filter_cnt": 3, 86 | "current_filter_count": 3, 87 | "f_spatial_size": 1, 88 | 89 | "input_w": 3, 90 | "input_h": 3, 91 | "input": [ 92 | 0.406, 0.419, 0.598, 93 | 0.442, 0.685, 0.528, 94 | 0.627, 0.489, 0.642, 95 | 0.376, 0.563, 0.499, 96 | 0.680, 0.371, 0.571, 97 | 0.390, 0.672, 0.453, 98 | 0.626, 0.550, 0.609, 99 | 0.386, 0.674, 0.634, 100 | 0.666, 0.413, 0.609], 101 | 102 | "output": [ 103 | 0.369, 0.025, 0.229, 104 | 0.213, 0.058, 0.378, 105 | 0.410, 0.000, 0.178, 106 | 0.236, 0.071, 0.343, 107 | 0.429, 0.000, 0.114, 108 | 0.161, 0.087, 0.409, 109 | 0.361, 0.000, 0.220, 110 | 0.273, 0.071, 0.364, 111 | 0.431, 0.000, 0.132], 112 | 113 | "weights": [ 114 | 0.20, -0.45, -0.35, -0.45, 0.16, 0.54, 0.63, -0.10, -0.26], 115 | 116 | "bias": [0.1, 0.2, 0.3] 117 | } 118 | 119 | } 120 | -------------------------------------------------------------------------------- 
/test/specs/BackpropagationTest_script.py: -------------------------------------------------------------------------------- 1 | #helper script to generate expected delta values 2 | #for BackpropagationTest 3 | 4 | inputs = [[ 5 | -0.083, 0.075, -0.058, -0.068, -0.013, 6 | 0.169, 0.181, 0.136, -0.165, 0.159, 7 | -0.112, 0.003, -0.123, -0.102, 0.242, 8 | 0.406, -0.442, -0.627, 0.376, 0.680, 9 | 0.121, -0.103, 0.106, -0.036, 0.052], 10 | [-0.064, -0.055, -0.138, -0.144, 0.176, 11 | 0.049, -0.051, -0.062, -0.176, -0.060, 12 | 0.228, -0.138, -0.027, -0.061, -0.069, 13 | 0.419, 0.685, -0.489, 0.563, -0.371, 14 | -0.075, 0.031, 0.033, -0.052, -0.035]] 15 | 16 | deltas = [0.122, 0.083, 0.064, # row 1, col 1 17 | 0.057, 0.075, 0.055, # row 1, col 2 18 | 0.025, 0.058, 0.138, # row 1, col 3 19 | 20 | 0.170, 0.068, 0.144, # row 2, col 1 21 | 0.121, 0.013, 0.176, # row 2, col 2 22 | 0.065, 0.169, 0.049, # row 2, col 3 23 | 24 | 0.003, 0.181, 0.051, # row 3, col 1 25 | 0.021, 0.136, 0.062, # row 3, col 2 26 | 0.066, 0.165, 0.176] # row 3, col 3 27 | 28 | f=3 29 | out = 3,3 30 | inn = 5,5 31 | n_curr = 3 32 | n_prev = 2 33 | 34 | w = [1.5] * 54 # we will add algo result to this 35 | 36 | def kernel(x,y): 37 | for n in range(n_curr): 38 | delta_idx = ((y * out[0]) + x) * n_curr + n 39 | delta = deltas[delta_idx] 40 | for a in range(f): 41 | for b in range(f): 42 | for k in range(n_prev): 43 | p = x+b, y+a 44 | val = inputs[k][p[1] * inn[0] + p[0]] 45 | idx = ((a * f) + b) *n_curr*n_prev + k * n_curr + n 46 | w[idx] += val * delta 47 | # w[idx] += val 48 | # w[idx] += delta 49 | 50 | for y in range(out[1]): 51 | for x in range(out[0]): 52 | kernel(x,y) 53 | # print('\n'.join(["[{}]\t{:>6.3}".format(i,x) for i,x in enumerate(w)])) 54 | # print('\n'.join(["[{}]\t{:>6.3}".format(i,x) for i,x in enumerate(w) if i%3==0])) 55 | # print('\n'.join(["[{}]\t{:>6}".format(i,x) for i,x in enumerate(w)])) 56 | for i in range(9): 57 | xs = w[i*6:(i+1)*6] 58 | print(', '.join(["{:>7.5}".format(x) 
for i,x in enumerate(xs)])) 59 | 60 | 61 | print('\n\nbias:') 62 | bias_res = [ 63 | sum([x for i,x in enumerate(deltas) if i%3==0]), 64 | sum([x for i,x in enumerate(deltas) if i%3==1]), 65 | sum([x for i,x in enumerate(deltas) if i%3==2])] 66 | print(', '.join(["{:>6.3}".format(x) for i,x in enumerate(bias_res)])) 67 | 68 | 69 | ''' 70 | ONLY INPUT: 71 | 72 | PY: 73 | [0] 0.188 74 | [3] -0.258 75 | [6] -0.121 76 | [9] -0.852 77 | [12] 0.008 78 | [15] -0.561 79 | [18] -0.409 80 | [21] 0.614 81 | [24] -0.763 82 | [27] 0.244 83 | [30] 0.576 84 | [33] -0.752 85 | [36] -0.771 86 | [39] 0.667 87 | [42] -0.948 88 | [45] 0.545 89 | [48] 0.568 90 | [51] -0.508 91 | 92 | GPU: 93 | b[0] 0.188 94 | b[3] -0.258 95 | b[6] -0.121 96 | b[9] -0.852 97 | b[14] 0.008 98 | b[17] -0.561 99 | 100 | b[18] -0.409 101 | b[23] 0.614 102 | b[24] -0.763 103 | b[29] 0.244 104 | b[32] 0.576 105 | b[35] -0.752 106 | 107 | b[38] -0.771 108 | b[41] 0.667 109 | b[44] -0.948 110 | b[47] 0.545 111 | b[50] 0.568 112 | b[53] -0.508 113 | ''' 114 | -------------------------------------------------------------------------------- /schedule_training.py: -------------------------------------------------------------------------------- 1 | import os 2 | import time 3 | import shutil 4 | import argparse 5 | import subprocess 6 | from time import gmtime, strftime 7 | 8 | ''' 9 | typical .bat file: 10 | 11 | make build 12 | if %errorlevel%==0 ( 13 | bin\cnn.exe train -c data\config.json --epochs 100 -i data\train_samples -o data\parameters.json 14 | ) 15 | ''' 16 | 17 | epochs_per_iteration = 500 18 | pars_file = 'data\\parameters.json' 19 | 20 | seconds_per_epoch = 0.7 21 | #seconds_per_epoch = 0.236 22 | 23 | cmd = 'bin\\cnn.exe train -c data\config.json --epochs {} -i data\\train_samples'.format(epochs_per_iteration) 24 | 25 | 26 | def get_dst_file_path(): 27 | #strftime("%Y-%m-%d %H:%M:%S") 28 | tt = strftime("%Y-%m-%d--%H-%M-%S") 29 | log_folder = lambda s: os.path.join('logs', s) 30 | return 
log_folder('log_{}.txt'.format(tt)), \ 31 | log_folder('parameters_{}.json'.format(tt)), \ 32 | tt 33 | 34 | 35 | seconds_per_unit = {"s": 1, "m": 60, "h": 3600, "d": 86400, "w": 604800} 36 | 37 | def convert_to_seconds(s): 38 | return int(s[:-1]) * seconds_per_unit[s[-1]] 39 | 40 | if __name__ == '__main__': 41 | help_text = 'Start training with either duration or #epochs' 42 | parser = argparse.ArgumentParser(description=help_text) 43 | action = parser.add_mutually_exclusive_group(required=True) 44 | action.add_argument('--duration', '-d', help='Duration, provided as: X[s|m|h|d|w] (s=seconds, m=minutes, h=hours, d=days, w=week)') 45 | action.add_argument('--epochs', '-e', type=int, help='Number of epochs') 46 | parser.add_argument('--dry', action='store_true', required=False, help='Do not output any files') 47 | 48 | args = parser.parse_args() 49 | if args.duration: 50 | time_in_s = convert_to_seconds(args.duration) 51 | total_epochs = int(time_in_s / seconds_per_epoch) 52 | else: 53 | total_epochs = args.epochs 54 | total_epochs = max(total_epochs, epochs_per_iteration) 55 | 56 | cmd_ = cmd.split(' ') 57 | if args.dry: 58 | cmd_.append('dry') 59 | else: 60 | cmd_.append('-o') 61 | cmd_.append(pars_file) 62 | print('Command to execute:') 63 | print('\'' + (' '.join(cmd_)) + '\'') 64 | 65 | start = time.time() 66 | iters = total_epochs // epochs_per_iteration 67 | total_epochs = iters * epochs_per_iteration # last iter have same #epochs as others 68 | print('Will do {0:} iterations, {1:} epochs per iteration = {2:} total'.format( \ 69 | iters, epochs_per_iteration, iters * epochs_per_iteration)) 70 | est_time = total_epochs * seconds_per_epoch 71 | print('Estimated required time: {:.3f}s = {:.3f} min'.format(est_time, est_time//60)) 72 | 73 | for i in range(iters): 74 | log_path, tmp_params_path, stamp = get_dst_file_path() 75 | total_epochs_left = (iters - i) * epochs_per_iteration 76 | print('\n---- {0:} - {1:} (time left: {2:d}min)----'.format(i+1, stamp, 
int(total_epochs_left*seconds_per_epoch)//60)) 77 | 78 | # execute training 79 | with open(log_path, "w") as tmp_log: 80 | ret_code = subprocess.call(cmd_, stdout=tmp_log, stderr=subprocess.STDOUT) 81 | print('return code: '+str(ret_code)) 82 | if ret_code is not 0: 83 | print('---- FAIL ----') 84 | exit() 85 | 86 | # backup results 87 | if not args.dry: 88 | print('saving sub results to: \'' + tmp_params_path + '\'') 89 | shutil.copy2(pars_file, tmp_params_path) 90 | 91 | end = time.time() 92 | dt = end - start 93 | print("Execution time: {:.3f}s = {:.2f}min ({:.5f} s/epoch)".format(dt, dt/60, dt/total_epochs)) 94 | -------------------------------------------------------------------------------- /src/pch.hpp: -------------------------------------------------------------------------------- 1 | #ifndef PCH_H 2 | #define PCH_H 3 | 4 | #include 5 | #include 6 | // #include // for size_t 7 | 8 | // TODO use during compilation 9 | 10 | /// 11 | /// forward declarations 12 | /// 13 | /* clang-format off */ 14 | namespace cnn_sr { 15 | struct ParametersDistribution; 16 | struct Config; 17 | class ConfigReader; 18 | class DataPipeline; 19 | class ConfigBasedDataPipeline; 20 | struct LayerData; 21 | struct CnnLayerGpuAllocationPool; 22 | } 23 | 24 | namespace opencl { 25 | class Kernel; 26 | typedef size_t MemoryHandle; 27 | class Context; 28 | 29 | namespace utils { 30 | struct ImageData; 31 | } 32 | } 33 | /* clang-format on */ 34 | 35 | typedef struct _cl_event* cl_event; 36 | 37 | union JsonValue; 38 | struct JsonNode; 39 | class JsonAllocator; 40 | 41 | /// 42 | /// Utils 43 | /// 44 | namespace cnn_sr { 45 | 46 | extern bool warn_about_blocking_operation; 47 | 48 | namespace utils { 49 | 50 | void require(bool, const char*); 51 | 52 | void dump_vector(std::ostream&, std::vector&, 53 | const char* line_prefix = nullptr, size_t per_line = 0, 54 | bool add_line_numbers = false); 55 | 56 | template 57 | inline bool is_odd(T x) { 58 | return (x & 1) != 0; 59 | } 60 | 61 | 
template 62 | inline bool is_even(T x) { 63 | return !is_odd(x); 64 | } 65 | 66 | size_t closest_power_of_2(int); 67 | 68 | /// 69 | /// Utils - macros 70 | /// 71 | #define STRINGIFY2(s) #s 72 | #define STRINGIFY(s) STRINGIFY2(s) 73 | 74 | #define CONCATENATE_DETAIL(x, y) x##y 75 | #define CONCATENATE(x, y) CONCATENATE_DETAIL(x, y) 76 | 77 | /// 78 | /// File system 79 | /// 80 | #define IOException std::ios_base::failure 81 | 82 | void get_file_content(const char* const, std::stringstream&); 83 | 84 | void list_files(const char* const, std::vector&); 85 | 86 | /// 87 | /// Json utils 88 | /// 89 | /** NOTE: we need to hold file content in some persistent place, since the 90 | * string argument*/ 91 | void read_json_file(const char* const, JsonValue&, JsonAllocator&, std::string&, 92 | int root_type); 93 | 94 | bool try_read_float(JsonNode&, float&, const char*); 95 | // (unsigned int)node->value.toNumber(); 96 | bool try_read_uint(JsonNode&, unsigned int&, const char*); 97 | bool try_read_vector(JsonNode&, std::vector&, const char*); 98 | bool try_read_string(JsonNode&, std::string&, const char*); 99 | 100 | /// 101 | /// Cmd line args parsing 102 | /// 103 | struct ArgOption { 104 | bool _required = false; 105 | std::string _name = ""; 106 | std::string _help = ""; 107 | std::vector _mnemonics; 108 | 109 | ArgOption& help(const char*); 110 | ArgOption& required(); 111 | }; 112 | 113 | class Argparse { 114 | typedef std::pair ArgValue; 115 | 116 | public: 117 | Argparse(const char*, const char*); 118 | 119 | ArgOption& add_argument(const char*); 120 | ArgOption& add_argument(const char*, const char*); 121 | bool parse(size_t, char**); 122 | void print_help(); 123 | 124 | bool has_arg(const char*); 125 | const char* value(const char*); 126 | void value(const char*, size_t&); 127 | 128 | private: 129 | ArgOption& add_argument(size_t, const char**); 130 | ArgValue* get_value(const char*); 131 | 132 | const std::string _general_help, _exec_name; 133 | std::vector 
_options; 134 | std::vector _values; 135 | }; 136 | } 137 | } 138 | 139 | #endif /* PCH_H */ 140 | -------------------------------------------------------------------------------- /src/opencl/Kernel.hpp: -------------------------------------------------------------------------------- 1 | #ifndef KERNEL_H 2 | #define KERNEL_H 3 | 4 | #include "CL/opencl.h" 5 | #include // for std::ostream& operator<<(..) 6 | 7 | #define MAX_KERNEL_IDENTIFIER_SIZE 128 8 | 9 | namespace opencl { 10 | 11 | // forward declaration 12 | class Context; 13 | typedef size_t MemoryHandle; 14 | 15 | class Kernel { 16 | public: 17 | void init(Context *, cl_kernel, cl_program, // 18 | const char *, const char *); 19 | void cleanup(); 20 | friend std::ostream &operator<<(std::ostream &os, opencl::Kernel &p); 21 | 22 | size_t current_local_memory(); 23 | 24 | /** 25 | * Set the next argument. To be used as a sequence of calls, 26 | * where each one sets next argument. 27 | * 28 | * @param arg_size size of pointer f.e. sizeof(cl_mem) | sizeof(cl_int) 29 | * @param arg_value void* pointer to argument value 30 | */ 31 | void push_arg(size_t arg_size, const void *); 32 | 33 | /** 34 | * Set the next argument. To be used as a sequence of calls, 35 | * where each one sets next argument. 36 | * 37 | * @param handle gpu memory handler 38 | */ 39 | void push_arg(MemoryHandle); 40 | 41 | /** 42 | * Execute the kernel with arguments that were pushed before this call. 43 | * After this call You will have to provide all arguments againg before 44 | * You execute the kernel again. 45 | * Also this function provides some basics checks for work_size parameters, 46 | * so You can catch them more easily. 
47 | * 48 | * @param work_dim number of dimensions 49 | * @param global_work_size :size_t*, total work size provided as 50 | *array 51 | *each value for one of dimensions 52 | * @param local_work_size :size_t*, work group size 53 | * @param events_to_wait_for [OPT] wait for other operations to finish 54 | * @param events_to_wait_for_count [OPT] 55 | * @return opencl event object 56 | */ 57 | cl_event execute(cl_uint work_dim, // 58 | const size_t *global_work_size, // 59 | const size_t *local_work_size, // 60 | cl_event *events_to_wait_for = nullptr, int event_count = 0); 61 | 62 | inline size_t get_max_work_group_size() const { return max_work_group_size; } 63 | inline Context *get_context() const { return context; } 64 | inline cl_ulong get_total_execution_time() const { 65 | return execution_time_sum; 66 | } 67 | inline const char *get_human_identifier() const { return human_identifier; } 68 | 69 | private: 70 | /** 71 | * Basic checks for work parameters. Based on: 72 | * https://www.khronos.org/registry/cl/sdk/1.0/docs/man/xhtml/clEnqueueNDRangeKernel.html 73 | * 74 | * @return if work parameters fulfill the constraints 75 | */ 76 | void check_work_parameters(cl_uint work_dim, // 77 | const size_t *global_work_size, 78 | const size_t *local_work_size); 79 | 80 | private: 81 | cl_kernel kernel_id; 82 | cl_program program_id; 83 | Context *context; 84 | size_t max_work_group_size; 85 | cl_ulong private_mem_size; 86 | size_t pref_work_group_multiple; 87 | 88 | size_t arg_stack_size; 89 | size_t assigned_local_memory; // by hand, since it does always work 90 | bool initialized = false; 91 | 92 | /** meaningful only if context->is_running_profile_mode */ 93 | cl_ulong execution_time_sum = 0; 94 | char human_identifier[MAX_KERNEL_IDENTIFIER_SIZE]; 95 | }; 96 | 97 | // 98 | } 99 | 100 | // std::ostream &operator<<(std::ostream &, opencl::Kernel &); 101 | 102 | #endif /* KERNEL_H */ 103 | -------------------------------------------------------------------------------- 
/src/kernel/squared_error.cl: -------------------------------------------------------------------------------- 1 | /* clang-format off */ 2 | /** 3 | * @see http://suhorukov.blogspot.com/2011/12/opencl-11-atomic-operations-on-floating.html 4 | * @see http://simpleopencl.blogspot.com/2013/05/atomic-operations-and-floats-in-opencl.html 5 | * 6 | * @param {[type]} volatile __global float *source [description] 7 | * @param {[type]} const float operand [description] 8 | */ 9 | inline void atomic_add_global(volatile __global float* source, const float operand) { 10 | /* clang-format on */ 11 | union { 12 | unsigned int intVal; 13 | float floatVal; 14 | } newVal; 15 | 16 | union { 17 | unsigned int intVal; 18 | float floatVal; 19 | } prevVal; 20 | 21 | // NOTE: atomic_cmpxchg(volatile __global unsigned int *p, 22 | // unsigned int cmp, unsigned int val) 23 | do { 24 | prevVal.floatVal = *source; 25 | newVal.floatVal = prevVal.floatVal + operand; 26 | } while (atomic_cmpxchg((volatile __global unsigned int*)source, 27 | prevVal.intVal, // 28 | newVal.intVal) != prevVal.intVal); 29 | } 30 | 31 | /** 32 | * Part of mean square error calculations. Here we take 2 same sized, 33 | * single color channel buffers with image data and get the difference 34 | * between respective pixels. 
35 | */ 36 | __kernel void squared_err(__read_only __global float* ground_truth_image, 37 | __read_only __global float* algo_result, 38 | __global float* target, // 39 | __local float* scratch, // 40 | __const uint ground_truth_w, // 41 | __const uint ground_truth_h, // 42 | __const uint algo_result_w, // 43 | __const uint algo_result_h) { 44 | const int2 pos = {get_global_id(0), get_global_id(1)}; // x=col=i, y=row=j 45 | const uint sample_id = get_global_id(2); 46 | const int2 out_size = {algo_result_w, algo_result_h}; 47 | const int idx = (pos.y * algo_result_w) + pos.x; 48 | const size_t padding = (ground_truth_w - algo_result_w) / 2; 49 | const size_t local_size = get_local_size(1) * get_local_size(0), 50 | local_index = 51 | get_local_id(1) * get_local_size(0) + get_local_id(0); 52 | 53 | #define IMAGE_OFFSET_GT sample_id* ground_truth_w* ground_truth_h 54 | #define IMAGE_OFFSET_ALGO sample_id* algo_result_w* algo_result_h 55 | 56 | // size of ground_truth != algo res (padding) 57 | // The offset is not const, since it depends on the row we are in 58 | // algo for ground_truth_idx: 59 | // (row + padding_on_top_of_image) * width + padding_left + col 60 | const size_t ground_truth_idx = 61 | (pos.y + padding) * ground_truth_w + padding + pos.x; 62 | 63 | float squared_diff = 0.0f; 64 | if (pos.x >= 0 && pos.x < out_size.x && // 65 | pos.y >= 0 && pos.y < out_size.y) { 66 | float t = ground_truth_image[IMAGE_OFFSET_GT + ground_truth_idx]; 67 | float y = algo_result[IMAGE_OFFSET_ALGO + idx]; 68 | float d = y - t; 69 | squared_diff = d * d; 70 | } 71 | scratch[local_index] = squared_diff; 72 | 73 | // wait till all kernels from local groups finished 74 | barrier(CLK_LOCAL_MEM_FENCE); 75 | 76 | // add all squared_diffs for local group 77 | for (int offset = local_size / 2; offset > 0; offset = offset / 2) { 78 | if (local_index < offset) { 79 | float other = scratch[local_index + offset]; 80 | float mine = scratch[local_index]; 81 | scratch[local_index] = mine + 
other; 82 | } 83 | // wait for all local kernels to finish previous step 84 | // and reach stable state 85 | barrier(CLK_LOCAL_MEM_FENCE); 86 | } 87 | 88 | // add local result to global result 89 | if (local_index == 0) { 90 | atomic_add_global(target, scratch[0]); 91 | } 92 | } 93 | -------------------------------------------------------------------------------- /libs/include/json/gason.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | #include 5 | #include 6 | 7 | enum JsonTag { 8 | JSON_NUMBER = 0, 9 | JSON_STRING, 10 | JSON_ARRAY, 11 | JSON_OBJECT, 12 | JSON_TRUE, 13 | JSON_FALSE, 14 | JSON_NULL = 0xF 15 | }; 16 | 17 | struct JsonNode; 18 | 19 | #define JSON_VALUE_PAYLOAD_MASK 0x00007FFFFFFFFFFFULL 20 | #define JSON_VALUE_NAN_MASK 0x7FF8000000000000ULL 21 | #define JSON_VALUE_TAG_MASK 0xF 22 | #define JSON_VALUE_TAG_SHIFT 47 23 | 24 | union JsonValue { 25 | uint64_t ival; 26 | double fval; 27 | 28 | JsonValue(double x) 29 | : fval(x) { 30 | } 31 | JsonValue(JsonTag tag = JSON_NULL, void *payload = nullptr) { 32 | assert((uint64_t)payload <= JSON_VALUE_PAYLOAD_MASK); 33 | ival = JSON_VALUE_NAN_MASK | ((uint64_t)tag << JSON_VALUE_TAG_SHIFT) | (uintptr_t)payload; 34 | } 35 | bool isDouble() const { 36 | return (int64_t)ival <= (int64_t)JSON_VALUE_NAN_MASK; 37 | } 38 | JsonTag getTag() const { 39 | return isDouble() ? 
JSON_NUMBER : JsonTag((ival >> JSON_VALUE_TAG_SHIFT) & JSON_VALUE_TAG_MASK); 40 | } 41 | uint64_t getPayload() const { 42 | assert(!isDouble()); 43 | return ival & JSON_VALUE_PAYLOAD_MASK; 44 | } 45 | double toNumber() const { 46 | assert(getTag() == JSON_NUMBER); 47 | return fval; 48 | } 49 | char *toString() const { 50 | assert(getTag() == JSON_STRING); 51 | return (char *)getPayload(); 52 | } 53 | JsonNode *toNode() const { 54 | assert(getTag() == JSON_ARRAY || getTag() == JSON_OBJECT); 55 | return (JsonNode *)getPayload(); 56 | } 57 | }; 58 | 59 | struct JsonNode { 60 | JsonValue value; 61 | JsonNode *next; 62 | char *key; 63 | }; 64 | 65 | struct JsonIterator { 66 | JsonNode *p; 67 | 68 | void operator++() { 69 | p = p->next; 70 | } 71 | bool operator!=(const JsonIterator &x) const { 72 | return p != x.p; 73 | } 74 | JsonNode *operator*() const { 75 | return p; 76 | } 77 | JsonNode *operator->() const { 78 | return p; 79 | } 80 | }; 81 | 82 | inline JsonIterator begin(JsonValue o) { 83 | return JsonIterator{o.toNode()}; 84 | } 85 | inline JsonIterator end(JsonValue) { 86 | return JsonIterator{nullptr}; 87 | } 88 | 89 | #define JSON_ERRNO_MAP(XX) \ 90 | XX(OK, "ok") \ 91 | XX(BAD_NUMBER, "bad number") \ 92 | XX(BAD_STRING, "bad string") \ 93 | XX(BAD_IDENTIFIER, "bad identifier") \ 94 | XX(STACK_OVERFLOW, "stack overflow") \ 95 | XX(STACK_UNDERFLOW, "stack underflow") \ 96 | XX(MISMATCH_BRACKET, "mismatch bracket") \ 97 | XX(UNEXPECTED_CHARACTER, "unexpected character") \ 98 | XX(UNQUOTED_KEY, "unquoted key") \ 99 | XX(BREAKING_BAD, "breaking bad") 100 | 101 | enum JsonErrno { 102 | #define XX(no, str) JSON_##no, 103 | JSON_ERRNO_MAP(XX) 104 | #undef XX 105 | }; 106 | 107 | const char *jsonStrError(int err); 108 | 109 | class JsonAllocator { 110 | struct Zone { 111 | Zone *next; 112 | size_t used; 113 | } *head = nullptr; 114 | 115 | public: 116 | JsonAllocator() = default; 117 | JsonAllocator(const JsonAllocator &) = delete; 118 | JsonAllocator 
&operator=(const JsonAllocator &) = delete; 119 | JsonAllocator(JsonAllocator &&x) : head(x.head) { 120 | x.head = nullptr; 121 | } 122 | JsonAllocator &operator=(JsonAllocator &&x) { 123 | // free the zones we already own before adopting x's; the guard keeps a self-move from dropping the list 124 | if (this != &x) { deallocate(); head = x.head; x.head = nullptr; } 125 | return *this; 126 | } 127 | ~JsonAllocator() { 128 | deallocate(); 129 | } 130 | void *allocate(size_t size); 131 | void deallocate(); 132 | }; 133 | 134 | int jsonParse(char *str, char **endptr, JsonValue *value, JsonAllocator &allocator); 135 | -------------------------------------------------------------------------------- /test/TestCase.cpp: -------------------------------------------------------------------------------- 1 | #include "TestCase.hpp" 2 | 3 | #include 4 | #include // std::abs 5 | #include // snprintf 6 | 7 | #include "../src/opencl/Context.hpp" 8 | #include "../src/DataPipeline.hpp" 9 | 10 | namespace test { 11 | 12 | /// 13 | /// utils functions 14 | /// 15 | float activation_function(float x) { return std::max(x, 0.0f); } 16 | float activation_function_derivative(float x) { return x > 0.0f ?
1.0f : 0.0f; } 17 | 18 | /// 19 | /// TestException 20 | /// 21 | TestException::TestException() : runtime_error("TestException") { 22 | // cnvt.str(""); 23 | cnvt << runtime_error::what() << ": Undefined error"; 24 | } 25 | 26 | TestException::TestException(const char *msg) : runtime_error("TestException") { 27 | // cnvt.str(""); 28 | cnvt << runtime_error::what() << ": " << msg; 29 | } 30 | 31 | TestException::TestException(const TestException &e) 32 | : runtime_error("TestException"), cnvt(e.cnvt.str()) {} 33 | 34 | const char *TestException::what() const throw() { return cnvt.str().c_str(); } 35 | 36 | /// 37 | /// TestCase 38 | /// 39 | void TestCase::assert_equals(int expected, int result) { 40 | if (expected != result) { 41 | char msg_buffer[128]; 42 | snprintf(msg_buffer, sizeof(msg_buffer), // 43 | "[INT] Expected %d to be %d", result, expected); 44 | throw TestException(msg_buffer); 45 | } 46 | } 47 | 48 | void TestCase::assert_equals(float expected, float result) { 49 | // (yeah, this are going to be totally arbitrary numbers) 50 | expected = std::abs(expected); 51 | float margin = 0.005f; 52 | if (expected > 10) margin = 0.15f; 53 | if (expected > 100) margin = 1; 54 | if (expected > 1000) margin = expected / 10000; 55 | float err = expected - std::abs(result); 56 | 57 | if (err > margin) { 58 | char msg_buffer[128]; 59 | snprintf(msg_buffer, sizeof(msg_buffer), // 60 | "[FLOAT] Expected %f to be %f", result, expected); 61 | throw TestException(msg_buffer); 62 | } 63 | } 64 | 65 | void TestCase::assert_true(bool v, const char *msg) { 66 | if (!v) { 67 | throw TestException(msg); 68 | } 69 | } 70 | 71 | void TestCase::assert_equals(const std::vector &expected, 72 | const std::vector &result, bool print) { 73 | if (expected.size() != result.size()) { 74 | char msg_buffer[128]; 75 | snprintf(msg_buffer, sizeof(msg_buffer), // 76 | "Expected vector has %d elements, while result %d. 
This vectors " 77 | "are not equal", 78 | expected.size(), result.size()); 79 | throw TestException(msg_buffer); 80 | } 81 | 82 | for (size_t i = 0; i < expected.size(); i++) { 83 | float r = result[i]; 84 | float e = expected[i]; 85 | if (print) 86 | std::cout << "[" << i << "] expected >\t" << e << "\tgot> " << r 87 | << std::endl; 88 | assert_equals(e, r); 89 | } 90 | } 91 | void TestCase::assert_equals(cnn_sr::DataPipeline *const pipeline, 92 | const std::vector &expected, 93 | opencl::MemoryHandle handle, bool print) { 94 | auto context = pipeline->context(); 95 | auto raw_gpu_mem = context->raw_memory(handle); 96 | size_t len = raw_gpu_mem->size / sizeof(cl_float); 97 | if (expected.size() != len) { 98 | char msg_buffer[128]; 99 | snprintf(msg_buffer, sizeof(msg_buffer), // 100 | "Expected vector has %d elements, while gpu memory holds %d. This " 101 | "vectors are not equal", 102 | expected.size(), len); 103 | throw TestException(msg_buffer); 104 | } 105 | 106 | context->block(); 107 | std::vector gpu_read(len); 108 | context->read_buffer(handle, (void *)&gpu_read[0], true); 109 | assert_equals(expected, gpu_read, print); 110 | } 111 | 112 | void TestCase::assert_data_set_ok(size_t idx) { 113 | char msg_buffer[128]; 114 | snprintf(msg_buffer, sizeof(msg_buffer), // 115 | "Incorrect data set index(%d), there are only %d data sets", idx, 116 | this->data_set_count()); 117 | assert_true(idx < this->data_set_count(), msg_buffer); 118 | } 119 | } 120 | -------------------------------------------------------------------------------- /src/kernel/layer_uber_kernel.cl: -------------------------------------------------------------------------------- 1 | /** 2 | * 3 | * Weights are 4D, indexing formula: 4 | * index(w[a,b,n,k]) = a * F_SPATIAL_SIZE * CURRENT_FILTER_COUNT * PREVIOUS_FILTER_COUNT 5 | * + b * CURRENT_FILTER_COUNT * PREVIOUS_FILTER_COUNT 6 | * + k * CURRENT_FILTER_COUNT 7 | * + n 8 | * where: 9 | * a = 0..F_SPATIAL_SIZE 10 | * b = 0..F_SPATIAL_SIZE 11 | * n 
= 0..CURRENT_FILTER_COUNT 12 | * k = 0..PREVIOUS_FILTER_COUNT 13 | * 14 | * macros: 15 | * CURRENT_FILTER_COUNT filter count for curent layer 16 | * 17 | * @param input output of previous layer, size: 18 | * * 1st layer: img_w * img_h 19 | * * 2nd layer: (img_w-f1+1) * (img_h-f1+1) * n1 20 | * * 3rd layer: (img_w-f1-f2+2) * (img_h-f1-f2+2) * n2 21 | * @param target zeroed output buffer, size: 22 | * * 1st layer: (img_w-f1+1) * (img_h-f1+1) * n1 23 | * * 2nd layer: (img_w-f1-f2+2) * (img_h-f1-f2+2) * n2 24 | * * 3rd layer: (img_w-f1-f2-f3+3) * (img_h-f1-f2-f3+3) 25 | * @param W weights, size: 26 | * * 1st layer: f1*f1 per each filter (total: f1*f1*n1) 27 | * * 2nd layer: f2*f2*n1 per each filter (total: f2*f2*n1*n2) 28 | * * 3rd layer: f3*f3*n2 29 | * @param B biases, size: 30 | * * 1st layer: n1 31 | * * 2nd layer: n2 32 | * * 3rd layer: 1 33 | * @param input_w source width 34 | * @param input_h source height 35 | */ 36 | __kernel 37 | void forward(__read_only __global float* input, 38 | __global float* target, 39 | __read_only __global float* W, 40 | __read_only __global float* B, 41 | uint input_w, uint input_h){ 42 | 43 | // value range: (0..out_w, 0..out_h) 44 | const int2 pos = {get_global_id(0), get_global_id(1)}; 45 | uint sample_id = get_global_id(2); 46 | 47 | const int2 src_size = {input_w, input_h}; 48 | const int2 out_size = {src_size.x - F_SPATIAL_SIZE + 1, 49 | src_size.y - F_SPATIAL_SIZE + 1}; 50 | 51 | #define IMAGE_OFFSET_IN sample_id* PREVIOUS_FILTER_COUNT* input_w* input_h 52 | #define IMAGE_OFFSET_OUT sample_id* CURRENT_FILTER_COUNT* out_size.x* out_size.y 53 | 54 | // index on which write to target, 55 | // will write total of CURRENT_FILTER_COUNT values 56 | const int out_idx = ((pos.y * out_size.x) + pos.x) * CURRENT_FILTER_COUNT; 57 | 58 | // zeroed result cache 59 | float vals_by_filter[CURRENT_FILTER_COUNT]; 60 | for (size_t filter_id = 0; filter_id < CURRENT_FILTER_COUNT; filter_id++) { 61 | vals_by_filter[filter_id] = 0.0f; 62 | } 63 | 
64 | // value range check 65 | if(pos.x < 0 || pos.x >= out_size.x || // 66 | pos.y < 0 || pos.y >= out_size.y) 67 | return; 68 | 69 | // apply weights & write to vals_by_filter 70 | for (size_t dy = 0; dy < F_SPATIAL_SIZE; dy++) { 71 | for (size_t dx = 0; dx < F_SPATIAL_SIZE; dx++) { 72 | int2 input_pos = {pos.x + dx, pos.y + dy}; 73 | int base_input_idx = ((input_pos.y * input_w) + input_pos.x) * PREVIOUS_FILTER_COUNT; 74 | size_t w_idx_2D = ((dy * F_SPATIAL_SIZE) + dx) * CURRENT_FILTER_COUNT * PREVIOUS_FILTER_COUNT; 75 | 76 | for (size_t k = 0; k < PREVIOUS_FILTER_COUNT; k++) { 77 | float point_value = input[IMAGE_OFFSET_IN + base_input_idx + k]; 78 | size_t w_idx_3D = w_idx_2D + k * CURRENT_FILTER_COUNT; 79 | 80 | for (size_t n = 0; n < CURRENT_FILTER_COUNT; n++) { 81 | vals_by_filter[n] += W[w_idx_3D + n] * point_value; 82 | } 83 | } 84 | } 85 | } 86 | 87 | // add bias and write cached results to target buffer 88 | for (size_t filter_id = 0; filter_id < CURRENT_FILTER_COUNT; filter_id++) { 89 | float result = vals_by_filter[filter_id] + B[filter_id]; 90 | #ifdef SKIP_RELU 91 | target[IMAGE_OFFSET_OUT + out_idx + filter_id] = result; 92 | #else 93 | target[IMAGE_OFFSET_OUT + out_idx + filter_id] = max(result, 0.0f); 94 | #endif // SKIP_RELU 95 | } 96 | } 97 | -------------------------------------------------------------------------------- /test/specs/UpdateParametersTest.cpp: -------------------------------------------------------------------------------- 1 | #include "TestSpecsDeclarations.hpp" 2 | 3 | #include // for std::mt19937 4 | #include // for random seed 5 | # 6 | #include "../../src/DataPipeline.hpp" 7 | #include "../../src/LayerData.hpp" 8 | 9 | namespace test { 10 | namespace specs { 11 | 12 | /// 13 | /// PIMPL 14 | /// 15 | struct UpdateParametersTestImpl { 16 | const size_t n_prev_filter_cnt = 2, current_filter_count = 400, 17 | f_spatial_size = 5, batch_size = 2; 18 | // const size_t n_prev_filter_cnt = 2, current_filter_count = 2, 19 | // 
f_spatial_size = 3; 20 | const float momentum = 0.8f, learning_rate = 0.001; 21 | 22 | void create_data(std::mt19937 &generator, opencl::Context *context, 23 | opencl::MemoryHandle &gpu_current_values, // 24 | opencl::MemoryHandle &gpu_grad, // 25 | opencl::MemoryHandle &gpu_previous_delta, // 26 | std::vector &expected, 27 | std::vector &current_vals, 28 | std::vector &deltas) { 29 | size_t len = current_vals.size(); 30 | std::vector grad(len), previous_delta(len); 31 | for (size_t i = 0; i < len; i++) { 32 | current_vals[i] = (generator() % 2560) / 10.0f; 33 | grad[i] = (generator() % 2560) / 100.0f; 34 | previous_delta[i] = (generator() % 2560) / 10.0f; 35 | deltas[i] = momentum * previous_delta[i] + learning_rate * grad[i]; 36 | expected[i] = current_vals[i] - (deltas[i] / batch_size); 37 | } 38 | 39 | // alloc 40 | /* clang-format off */ 41 | gpu_current_values = context->allocate(CL_MEM_READ_ONLY, sizeof(cl_float) * len); 42 | gpu_grad = context->allocate(CL_MEM_READ_ONLY, sizeof(cl_float) * len); 43 | gpu_previous_delta = context->allocate(CL_MEM_READ_ONLY, sizeof(cl_float) * len); 44 | context->write_buffer(gpu_current_values, (void *)&current_vals[0], true); 45 | context->write_buffer(gpu_grad, (void *)&grad[0], true); 46 | context->write_buffer(gpu_previous_delta, (void *)&previous_delta[0], true); 47 | /* clang-format on */ 48 | } 49 | }; 50 | 51 | /// 52 | /// UpdateParametersTest 53 | /// 54 | 55 | TEST_SPEC_PIMPL(UpdateParametersTest) 56 | 57 | void UpdateParametersTest::init() {} 58 | 59 | size_t UpdateParametersTest::data_set_count() { return 1; } 60 | 61 | std::string UpdateParametersTest::name(size_t) { 62 | return "Update parameters test"; 63 | } 64 | 65 | bool UpdateParametersTest::operator()(size_t, 66 | cnn_sr::DataPipeline *const pipeline) { 67 | using namespace cnn_sr; 68 | assert_not_null(pipeline); 69 | auto context = pipeline->context(); 70 | 71 | // create test data 72 | LayerData layer_data(_impl->n_prev_filter_cnt, _impl->current_filter_count, 73
| _impl->f_spatial_size); 74 | size_t ws = layer_data.weight_size(), bs = layer_data.bias_size(); 75 | 76 | unsigned seed1 = std::chrono::system_clock::now().time_since_epoch().count(); 77 | std::mt19937 generator(seed1); 78 | 79 | LayerAllocationPool gpu_alloc; 80 | std::vector expected_w(ws), current_w(ws), new_deltas_w(ws); 81 | std::vector expected_b(bs), current_b(bs), new_deltas_b(bs); 82 | _impl->create_data(generator, context, 83 | gpu_alloc.weights, // 84 | gpu_alloc.accumulating_grad_w, // 85 | gpu_alloc.previous_batch_delta_w, // 86 | expected_w, current_w, new_deltas_w); 87 | _impl->create_data(generator, context, 88 | gpu_alloc.bias, // 89 | gpu_alloc.accumulating_grad_b, // 90 | gpu_alloc.previous_batch_delta_b, // 91 | expected_b, current_b, new_deltas_b); 92 | 93 | layer_data.set_weights(&current_w[0]); 94 | layer_data.set_bias(&current_b[0]); 95 | 96 | pipeline->update_parameters(layer_data, gpu_alloc, _impl->batch_size, 97 | _impl->momentum, 0.0f, _impl->learning_rate); 98 | 99 | assert_equals(pipeline, expected_w, gpu_alloc.weights); 100 | assert_equals(pipeline, expected_b, gpu_alloc.bias); 101 | assert_equals(pipeline, new_deltas_w, gpu_alloc.previous_batch_delta_w); 102 | assert_equals(pipeline, new_deltas_b, gpu_alloc.previous_batch_delta_b); 103 | 104 | return true; 105 | } 106 | 107 | // 108 | // 109 | } // namespace specs 110 | } // namespace test 111 | -------------------------------------------------------------------------------- /test/specs/ConfigTest.cpp: -------------------------------------------------------------------------------- 1 | #include "TestSpecsDeclarations.hpp" 2 | #include "../../src/Config.hpp" 3 | 4 | namespace test { 5 | namespace specs { 6 | 7 | /// 8 | /// Data set 9 | /// 10 | struct ConfigDataSet : DataSet { 11 | ConfigDataSet(std::string name, const char* cfg_file, bool expect_io_error, 12 | bool expect_invalid_val) 13 | : DataSet(name), 14 | cfg_file(cfg_file), 15 | expect_io_error(expect_io_error), 16 |
expect_invalid_val(expect_invalid_val) {} 17 | 18 | const char* cfg_file; 19 | bool expect_io_error, expect_invalid_val; 20 | }; 21 | 22 | /// 23 | /// PIMPL 24 | /// 25 | struct ConfigTestImpl { 26 | /* clang-format off */ 27 | ConfigDataSet data_sets[4] = { 28 | ConfigDataSet("ok", "test/data/config.json", false,false), 29 | ConfigDataSet("invalid value", "test/data/config_invalid_val.json", false,true), 30 | ConfigDataSet("invalid file", "test/data/config_non_parseable.json", true,false), 31 | ConfigDataSet("file nonexistent", "test/data/NOPE.json", true,false)}; 32 | /* clang-format on */ 33 | 34 | cnn_sr::ParametersDistribution pd1 = {0.9, 0.9, 0.9, 0.9}; 35 | cnn_sr::ParametersDistribution pd2 = {2.001, 2.001, 2.001, 2.001}; 36 | cnn_sr::ParametersDistribution pd3 = {0.001, 0.001, 0.001, 0.001}; 37 | float learning_rates[3] = {12, 34, 56}; 38 | /* clang-format off */ 39 | cnn_sr::Config correct_result={32, 16, 40 | 9, 1, 5, 41 | 123.5f,0.1f, learning_rates, 42 | pd1, pd2, pd3, 43 | "cnn-parameters-a.json"}; 44 | /* clang-format on */ 45 | }; 46 | 47 | /// 48 | /// ConfigTest 49 | /// 50 | 51 | TEST_SPEC_PIMPL(ConfigTest) 52 | 53 | void ConfigTest::init() {} 54 | 55 | size_t ConfigTest::data_set_count() { return 4; } 56 | 57 | std::string ConfigTest::name(size_t data_set_id) { 58 | assert_data_set_ok(data_set_id); 59 | return "Config test - " + _impl->data_sets[data_set_id].name; 60 | } 61 | 62 | bool params_cmp(cnn_sr::ParametersDistribution a, 63 | cnn_sr::ParametersDistribution b) { 64 | return a.mean_w == b.mean_w && a.sd_w == b.sd_w && // 65 | a.mean_b == b.mean_b && a.sd_b == b.sd_b; 66 | } 67 | 68 | bool ConfigTest::operator()(size_t data_set_id, 69 | cnn_sr::DataPipeline* const pipeline) { 70 | using namespace cnn_sr; 71 | assert_not_null(pipeline); 72 | assert_data_set_ok(data_set_id); 73 | auto data = _impl->data_sets[data_set_id]; 74 | Config& c2 = _impl->correct_result; 75 | 76 | bool io_err = false, invalid_val = false; 77 | ConfigReader reader; 
78 | 79 | try { 80 | Config c1 = reader.read(data.cfg_file); 81 | assert_true(c1.n1 == c2.n1 && c1.n2 == c2.n2, 82 | "filter count does not match"); 83 | assert_true(c1.f1 == c2.f1 && c1.f2 == c2.f2 && c1.f3 == c2.f3, 84 | "filter spatial size does not match"); 85 | assert_true(c1.momentum == c2.momentum, "momentum does not match"); 86 | assert_true(c1.weight_decay_parameter == c2.weight_decay_parameter, 87 | "weight decay parameter does not match"); 88 | assert_true(c1.learning_rate[0] == c2.learning_rate[0] // 89 | && c1.learning_rate[1] == c2.learning_rate[1] // 90 | && c1.learning_rate[2] == c2.learning_rate[2], 91 | "learning rate does not match"); 92 | // std::cout << c1.parameters_file << "'" << std::endl; 93 | // std::cout << c2.parameters_file << "'" << std::endl; 94 | assert_true(c1.parameters_file.compare(c2.parameters_file) == 0, 95 | "parameters_file does not match"); 96 | assert_true(params_cmp(c1.params_distr_1, c2.params_distr_1), 97 | "parameters distribution 1 does not match"); 98 | assert_true(params_cmp(c1.params_distr_2, c2.params_distr_2), 99 | "parameters distribution 2 does not match"); 100 | assert_true(params_cmp(c1.params_distr_3, c2.params_distr_3), 101 | "parameters distribution 3 does not match"); 102 | } catch (TestException& e) { 103 | std::cout << e.what() << std::endl; 104 | invalid_val = true; 105 | } catch (IOException& e) { 106 | std::cout << e.what() << std::endl; 107 | io_err = true; 108 | } /* catch (...) 
{ 109 | assert_true(false, "Unknown error"); 110 | }*/ 111 | 112 | assert_true(io_err == data.expect_io_error, "Expected IO error"); 113 | assert_true(invalid_val == data.expect_invalid_val, 114 | "Expected values mismatch"); 115 | return true; 116 | } 117 | 118 | // 119 | // 120 | } // namespace specs 121 | } // namespace test 122 | -------------------------------------------------------------------------------- /src/kernel/backpropagate.cl: -------------------------------------------------------------------------------- 1 | /* clang-format off */ 2 | /** 3 | * @see http://suhorukov.blogspot.com/2011/12/opencl-11-atomic-operations-on-floating.html 4 | * @see http://simpleopencl.blogspot.com/2013/05/atomic-operations-and-floats-in-opencl.html 5 | * 6 | * @param float* source [description] 7 | * @param float operand [description] 8 | */ 9 | inline void atomic_add_global(volatile __global float* source, const float operand) { 10 | /* clang-format on */ 11 | union { 12 | unsigned int intVal; 13 | float floatVal; 14 | } newVal; 15 | 16 | union { 17 | unsigned int intVal; 18 | float floatVal; 19 | } prevVal; 20 | 21 | // NOTE: atomic_cmpxchg(volatile __global unsigned int *p, 22 | // unsigned int cmp, unsigned int val) 23 | do { 24 | prevVal.floatVal = *source; 25 | newVal.floatVal = prevVal.floatVal + operand; 26 | } while (atomic_cmpxchg((volatile __global unsigned int*)source, 27 | prevVal.intVal, // 28 | newVal.intVal) != prevVal.intVal); 29 | } 30 | 31 | /* clang-format off */ 32 | /** 33 | * 34 | * Calculate grad_w and grad_b. Very expensive due too barriers & locks. 35 | * 36 | * 37 | * In following notation (l), (l-1) describes relative layer and [...] lower indices. 
38 | * 39 | * Algo for dJ/dw[abnk] on layer (l), where: 40 | * a = 0..spatial_size(l) 41 | * b = 0..spatial_size(l) 42 | * n = 0..filter_count(l) 43 | * k = 0..filter_count(l+1) 44 | * 45 | * dJ/dw[abnk] = 0 46 | * for i = 0..output_w(l): # for each node where this weight is used 47 | * for j = 0..output_h(l): 48 | * for n = 0..filter_count(l): 49 | * for a = 0..spatial_size(l): # offset to weight 50 | * for b = 0..spatial_size(l): # (it's kernel size, what You expect ?) 51 | * for k = 0..filter_count(l-1): # for all inputs 52 | * dJ/dw[abnk] += deltas[i,j,n] # (1) error for this point 53 | * * layer_input[i+b,j+a,k] # (2) input at this point 54 | */ 55 | /* clang-format on */ 56 | __kernel void backpropagate(__read_only __global float* deltas, // 57 | __read_only __global float* layer_input, // 58 | __global float* target_grad_w, // 59 | __global float* target_grad_b, // 60 | uint n_current_filter_cnt, // 61 | uint n_prev_filter_cnt, // 62 | uint f_spatial_size, // 63 | uint layer_out_w, uint layer_out_h) { 64 | const int id = get_global_id(0); 65 | const uint sample_id = get_global_id(1); 66 | const uint input_w = layer_out_w + f_spatial_size - 1; 67 | const uint input_h = layer_out_h + f_spatial_size - 1; 68 | // weight dimensions 69 | const size_t d2 = n_prev_filter_cnt * n_current_filter_cnt, 70 | d3 = d2 * f_spatial_size; 71 | const size_t weights_size = d3 * f_spatial_size; 72 | 73 | #define IMAGE_OFFSET_CURR \ 74 | sample_id* n_current_filter_cnt* layer_out_w* layer_out_h 75 | #define IMAGE_OFFSET_PREV sample_id* n_prev_filter_cnt* input_w* input_h 76 | 77 | // reverse id to get weight parameters: a(as dx), b(as dy), n, k 78 | int w_tmp = id; 79 | const int dy = w_tmp / d3; 80 | w_tmp -= dy * d3; 81 | const int dx = w_tmp / d2; 82 | w_tmp -= dx * d2; 83 | const int k = w_tmp / n_current_filter_cnt; 84 | const int n = 85 | w_tmp - k * n_current_filter_cnt; // = id % n_current_filter_cnt 86 | 87 | if (id < weights_size) { 88 | float grad_w = 0.0f, grad_b = 
0.0f; 89 | for (size_t row = 0; row < layer_out_h; row++) { 90 | for (size_t col = 0; col < layer_out_w; col++) { 91 | // (1) delta[i,j,n](l) 92 | int idx = ((row * layer_out_w) + col) * n_current_filter_cnt; 93 | float delta = deltas[IMAGE_OFFSET_CURR + idx + n]; 94 | grad_b += delta; 95 | 96 | // (2) layer_input[i+b,j+a,k] 97 | // NOTE: we normally should be subtracting [dx,dy], but it does 98 | // depend on indexing 99 | int2 prev_layer_pos = {col + dx, row + dy}; 100 | int prev_layer_idx = ((prev_layer_pos.y * input_w) + prev_layer_pos.x) * 101 | n_prev_filter_cnt; 102 | 103 | float input = layer_input[IMAGE_OFFSET_PREV + prev_layer_idx + k]; 104 | grad_w += input * delta; 105 | } 106 | } 107 | 108 | // write 109 | // NOTE: atomic_add_global is custom function, see beginning of the file 110 | target_grad_w[id] += grad_w; 111 | if (k == 0 && dx == 0 && dy == 0) 112 | atomic_add_global(target_grad_b + n, grad_b); 113 | } 114 | } 115 | -------------------------------------------------------------------------------- /weights_visualize.py: -------------------------------------------------------------------------------- 1 | import json 2 | import argparse 3 | from os.path import join 4 | from pprint import pprint 5 | from PIL import Image, ImageDraw, ImageColor 6 | 7 | # cfg_file = 'config_f.json' 8 | # scale = None 9 | per_weight_cell_padding = 2 10 | 11 | def layer_data(cfg, layer_id): 12 | 'returns (f,k,n)' 13 | read = lambda prop: int(cfg[prop]) 14 | if layer_id == 1: 15 | return read('f1'),1,read('n1') 16 | elif layer_id == 2: 17 | return read('f2'),read('n1'),read('n2') 18 | elif layer_id == 3: 19 | return read('f3'),read('n2'),1 20 | else: 21 | raise Exception("Only 1,2,3 are valid layers") 22 | 23 | def idx(layer, dy,dx,n,k): 24 | f, layer_k, layer_n = layer 25 | # print('layer: ',layer, ' a,b:',dy,dx, ' n,k: ',n,k) 26 | return dy * layer_n * layer_k * f + \ 27 | dx * layer_n * layer_k + \ 28 | k * layer_n + \ 29 | n 30 | 31 | def filter_weights(weights, 
layer, curr_n,curr_k): 32 | f = layer[0] 33 | filter_wx = [0]*(f*f) 34 | for dy in range(f): 35 | for dx in range(f): 36 | w_idx = idx(layer, dy,dx,curr_n,curr_k) 37 | # print(dy*f+dx,'/',len(filter_wx)) 38 | # print(w_idx,'/',len(weights)) 39 | filter_wx[dy*f+dx] = weights[w_idx] 40 | min_w, max_w = min(filter_wx), max(filter_wx) 41 | # norm_w = max_w + min_w 42 | # print('min_w: {}, max_w: {}'.format(min_w, max_w)) 43 | 44 | a,b=-999,999 45 | for dy in range(f): 46 | for dx in range(f): 47 | w = filter_wx[dy*f+dx] 48 | # w = (w-min_w) / (max_w + min_w) 49 | w = (w-min_w) / (max_w - min_w) if max_w != min_w else 0.5 50 | # w = min(1,max(0,w)) 51 | yield dy,dx,w 52 | a=max(a,w) 53 | b=min(b,w) 54 | # print('{:8}\t: {:8} \t-> {:8} : {}'.format(min_w, max_w, b, a)) 55 | 56 | def visualize(cfg, params, scale, layer_id, out_path): 57 | print('--- layer ', layer_id, ' ---') 58 | weights = params['layer' + str(layer_id)]['weights'] 59 | min_w, max_w = min(weights), max(weights) 60 | print('min_w: {}, max_w: {}'.format(min_w, max_w)) 61 | print('overfit: {}'.format(sum([x*x for x in weights]))) 62 | 63 | f, l_k, l_n = layer = layer_data(cfg, layer_id) 64 | cell_size = f * scale + 2 * per_weight_cell_padding 65 | print(layer) 66 | if f == 1: 67 | print('f==1, drawing weights would not show anything') 68 | return 69 | 70 | rows = int((l_n*l_k)**0.5) 71 | cells_in_row = int((l_n*l_k+rows-1) / rows) 72 | print('columns: ', cells_in_row, 'rows: ', rows) 73 | # size = cell_size * l_n, cell_size * l_k 74 | size = cell_size * cells_in_row, cell_size * rows 75 | 76 | img = Image.new('RGB', size, color='#000000') 77 | filter_img = Image.new('RGB', (f*scale,f*scale)) 78 | filter_draw = ImageDraw.Draw(filter_img) 79 | 80 | for n in range(l_n): 81 | for k in range(l_k): 82 | idx = n * l_k + k 83 | row, col = idx // cells_in_row, idx % cells_in_row 84 | # print(idx, '\t-> ',row,', ',col) 85 | pos = int(cell_size * col + per_weight_cell_padding), \ 86 | int(cell_size * row + 
per_weight_cell_padding) 87 | for (dy,dx,val) in filter_weights(weights, layer, n,k): 88 | v = int(val*255) 89 | col = "rgb({0},{0},{0})".format(v) 90 | pos_ab = dx*scale, dy*scale 91 | pos_ab_ = pos_ab[0] + scale - 1, \ 92 | pos_ab[1] + scale - 1 93 | filter_draw.rectangle((pos_ab, pos_ab_), fill=col) 94 | img.paste(filter_img, pos) 95 | 96 | img.save(out_path, "PNG") 97 | 98 | if __name__ == '__main__': 99 | help_text = 'Draw weights. Usage: ' + \ 100 | '"weights_visualize.py -o data -s 10 data\config_f.json"' 101 | 102 | parser = argparse.ArgumentParser(description=help_text) 103 | parser.add_argument('config', help='config file to analize' ) 104 | parser.add_argument('--parameters-file', '-p', required=False, help='parameters file holding all weights and biases') 105 | parser.add_argument('--out-dir', '-o', required=False, default='', help='where to store result images') 106 | parser.add_argument('--scale', '-s', required=False, default=10, type=int, help='scale factor - cause sometimes 10x10 image is too small') 107 | args = parser.parse_args() 108 | 109 | with open(args.config) as data_file: 110 | cfg = json.load(data_file) 111 | # pprint(cfg) 112 | 113 | if args.parameters_file: 114 | par_file = args.parameters_file 115 | elif 'parameters_file' in cfg: 116 | par_file = cfg['parameters_file'] 117 | else: 118 | raise Exception('Either write parameter file path to config or provide as parametr') 119 | print('Parameter file: \'',par_file,'\'') 120 | with open(par_file) as data_file: 121 | params = json.load(data_file) 122 | # pprint(params) 123 | 124 | visualize(cfg, params, args.scale, 1, join(args.out_dir, 'weights1.png')) 125 | visualize(cfg, params, args.scale, 2, join(args.out_dir, 'weights2.png')) 126 | visualize(cfg, params, args.scale, 3, join(args.out_dir, 'weights3.png')) 127 | -------------------------------------------------------------------------------- /libs/include/CL/cl_d3d10_ext.h: 
-------------------------------------------------------------------------------- 1 | /********************************************************************************** 2 | * Copyright (c) 2008-2009 The Khronos Group Inc. 3 | * 4 | * Permission is hereby granted, free of charge, to any person obtaining a 5 | * copy of this software and/or associated documentation files (the 6 | * "Materials"), to deal in the Materials without restriction, including 7 | * without limitation the rights to use, copy, modify, merge, publish, 8 | * distribute, sublicense, and/or sell copies of the Materials, and to 9 | * permit persons to whom the Materials are furnished to do so, subject to 10 | * the following conditions: 11 | * 12 | * The above copyright notice and this permission notice shall be included 13 | * in all copies or substantial portions of the Materials. 14 | * 15 | * THE MATERIALS ARE PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 16 | * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 17 | * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 18 | * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY 19 | * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, 20 | * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE 21 | * MATERIALS OR THE USE OR OTHER DEALINGS IN THE MATERIALS. 
22 | **********************************************************************************/ 23 | 24 | #ifndef __OPENCL_CL_D3D10_EXT_H 25 | #define __OPENCL_CL_D3D10_EXT_H 26 | 27 | #include 28 | #include 29 | #include 30 | 31 | #ifdef __cplusplus 32 | extern "C" { 33 | #endif 34 | 35 | /****************************************************************************** 36 | * cl_nv_d3d10_sharing */ 37 | 38 | typedef cl_uint cl_d3d10_device_source_nv; 39 | typedef cl_uint cl_d3d10_device_set_nv; 40 | 41 | /******************************************************************************/ 42 | 43 | // Error Codes 44 | #define CL_INVALID_D3D10_DEVICE_NV -1002 45 | #define CL_INVALID_D3D10_RESOURCE_NV -1003 46 | #define CL_D3D10_RESOURCE_ALREADY_ACQUIRED_NV -1004 47 | #define CL_D3D10_RESOURCE_NOT_ACQUIRED_NV -1005 48 | 49 | // cl_d3d10_device_source_nv 50 | #define CL_D3D10_DEVICE_NV 0x4010 51 | #define CL_D3D10_DXGI_ADAPTER_NV 0x4011 52 | 53 | // cl_d3d10_device_set_nv 54 | #define CL_PREFERRED_DEVICES_FOR_D3D10_NV 0x4012 55 | #define CL_ALL_DEVICES_FOR_D3D10_NV 0x4013 56 | 57 | // cl_context_info 58 | #define CL_CONTEXT_D3D10_DEVICE_NV 0x4014 59 | 60 | // cl_mem_info 61 | #define CL_MEM_D3D10_RESOURCE_NV 0x4015 62 | 63 | // cl_image_info 64 | #define CL_IMAGE_D3D10_SUBRESOURCE_NV 0x4016 65 | 66 | // cl_command_type 67 | #define CL_COMMAND_ACQUIRE_D3D10_OBJECTS_NV 0x4017 68 | #define CL_COMMAND_RELEASE_D3D10_OBJECTS_NV 0x4018 69 | 70 | /******************************************************************************/ 71 | 72 | typedef CL_API_ENTRY cl_int (CL_API_CALL *clGetDeviceIDsFromD3D10NV_fn)( 73 | cl_platform_id platform, 74 | cl_d3d10_device_source_nv d3d_device_source, 75 | void * d3d_object, 76 | cl_d3d10_device_set_nv d3d_device_set, 77 | cl_uint num_entries, 78 | cl_device_id * devices, 79 | cl_uint * num_devices) CL_API_SUFFIX__VERSION_1_0; 80 | 81 | typedef CL_API_ENTRY cl_mem (CL_API_CALL *clCreateFromD3D10BufferNV_fn)( 82 | cl_context context, 83 | cl_mem_flags 
flags, 84 | ID3D10Buffer * resource, 85 | cl_int * errcode_ret) CL_API_SUFFIX__VERSION_1_0; 86 | 87 | typedef CL_API_ENTRY cl_mem (CL_API_CALL *clCreateFromD3D10Texture2DNV_fn)( 88 | cl_context context, 89 | cl_mem_flags flags, 90 | ID3D10Texture2D * resource, 91 | UINT subresource, 92 | cl_int * errcode_ret) CL_API_SUFFIX__VERSION_1_0; 93 | 94 | typedef CL_API_ENTRY cl_mem (CL_API_CALL *clCreateFromD3D10Texture3DNV_fn)( 95 | cl_context context, 96 | cl_mem_flags flags, 97 | ID3D10Texture3D * resource, 98 | UINT subresource, 99 | cl_int * errcode_ret) CL_API_SUFFIX__VERSION_1_0; 100 | 101 | typedef CL_API_ENTRY cl_int (CL_API_CALL *clEnqueueAcquireD3D10ObjectsNV_fn)( 102 | cl_command_queue command_queue, 103 | cl_uint num_objects, 104 | const cl_mem * mem_objects, 105 | cl_uint num_events_in_wait_list, 106 | const cl_event * event_wait_list, 107 | cl_event * event) CL_API_SUFFIX__VERSION_1_0; 108 | 109 | typedef CL_API_ENTRY cl_int (CL_API_CALL *clEnqueueReleaseD3D10ObjectsNV_fn)( 110 | cl_command_queue command_queue, 111 | cl_uint num_objects, 112 | cl_mem * mem_objects, 113 | cl_uint num_events_in_wait_list, 114 | const cl_event * event_wait_list, 115 | cl_event * event) CL_API_SUFFIX__VERSION_1_0; 116 | 117 | #ifdef __cplusplus 118 | } 119 | #endif 120 | 121 | #endif // __OPENCL_CL_D3D10_H 122 | 123 | -------------------------------------------------------------------------------- /libs/include/CL/cl_d3d11_ext.h: -------------------------------------------------------------------------------- 1 | /********************************************************************************** 2 | * Copyright (c) 2008-2009 The Khronos Group Inc. 
3 | * 4 | * Permission is hereby granted, free of charge, to any person obtaining a 5 | * copy of this software and/or associated documentation files (the 6 | * "Materials"), to deal in the Materials without restriction, including 7 | * without limitation the rights to use, copy, modify, merge, publish, 8 | * distribute, sublicense, and/or sell copies of the Materials, and to 9 | * permit persons to whom the Materials are furnished to do so, subject to 10 | * the following conditions: 11 | * 12 | * The above copyright notice and this permission notice shall be included 13 | * in all copies or substantial portions of the Materials. 14 | * 15 | * THE MATERIALS ARE PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 16 | * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 17 | * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 18 | * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY 19 | * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, 20 | * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE 21 | * MATERIALS OR THE USE OR OTHER DEALINGS IN THE MATERIALS. 
22 | **********************************************************************************/ 23 | 24 | #ifndef __OPENCL_CL_D3D11_EXT_H 25 | #define __OPENCL_CL_D3D11_EXT_H 26 | 27 | #include 28 | #include 29 | #include 30 | 31 | #ifdef __cplusplus 32 | extern "C" { 33 | #endif 34 | 35 | /****************************************************************************** 36 | * cl_nv_d3d11_sharing */ 37 | 38 | typedef cl_uint cl_d3d11_device_source_nv; 39 | typedef cl_uint cl_d3d11_device_set_nv; 40 | 41 | /******************************************************************************/ 42 | 43 | // Error Codes 44 | #define CL_INVALID_D3D11_DEVICE_NV -1006 45 | #define CL_INVALID_D3D11_RESOURCE_NV -1007 46 | #define CL_D3D11_RESOURCE_ALREADY_ACQUIRED_NV -1008 47 | #define CL_D3D11_RESOURCE_NOT_ACQUIRED_NV -1009 48 | 49 | // cl_d3d11_device_source_nv 50 | #define CL_D3D11_DEVICE_NV 0x4019 51 | #define CL_D3D11_DXGI_ADAPTER_NV 0x401A 52 | 53 | // cl_d3d11_device_set_nv 54 | #define CL_PREFERRED_DEVICES_FOR_D3D11_NV 0x401B 55 | #define CL_ALL_DEVICES_FOR_D3D11_NV 0x401C 56 | 57 | // cl_context_info 58 | #define CL_CONTEXT_D3D11_DEVICE_NV 0x401D 59 | 60 | // cl_mem_info 61 | #define CL_MEM_D3D11_RESOURCE_NV 0x401E 62 | 63 | // cl_image_info 64 | #define CL_IMAGE_D3D11_SUBRESOURCE_NV 0x401F 65 | 66 | // cl_command_type 67 | #define CL_COMMAND_ACQUIRE_D3D11_OBJECTS_NV 0x4020 68 | #define CL_COMMAND_RELEASE_D3D11_OBJECTS_NV 0x4021 69 | 70 | /******************************************************************************/ 71 | 72 | typedef CL_API_ENTRY cl_int (CL_API_CALL *clGetDeviceIDsFromD3D11NV_fn)( 73 | cl_platform_id platform, 74 | cl_d3d11_device_source_nv d3d_device_source, 75 | void * d3d_object, 76 | cl_d3d11_device_set_nv d3d_device_set, 77 | cl_uint num_entries, 78 | cl_device_id * devices, 79 | cl_uint * num_devices) CL_API_SUFFIX__VERSION_1_0; 80 | 81 | typedef CL_API_ENTRY cl_mem (CL_API_CALL *clCreateFromD3D11BufferNV_fn)( 82 | cl_context context, 83 | cl_mem_flags 
flags, 84 | ID3D11Buffer * resource, 85 | cl_int * errcode_ret) CL_API_SUFFIX__VERSION_1_0; 86 | 87 | typedef CL_API_ENTRY cl_mem (CL_API_CALL *clCreateFromD3D11Texture2DNV_fn)( 88 | cl_context context, 89 | cl_mem_flags flags, 90 | ID3D11Texture2D * resource, 91 | UINT subresource, 92 | cl_int * errcode_ret) CL_API_SUFFIX__VERSION_1_0; 93 | 94 | typedef CL_API_ENTRY cl_mem (CL_API_CALL *clCreateFromD3D11Texture3DNV_fn)( 95 | cl_context context, 96 | cl_mem_flags flags, 97 | ID3D11Texture3D * resource, 98 | UINT subresource, 99 | cl_int * errcode_ret) CL_API_SUFFIX__VERSION_1_0; 100 | 101 | typedef CL_API_ENTRY cl_int (CL_API_CALL *clEnqueueAcquireD3D11ObjectsNV_fn)( 102 | cl_command_queue command_queue, 103 | cl_uint num_objects, 104 | const cl_mem * mem_objects, 105 | cl_uint num_events_in_wait_list, 106 | const cl_event * event_wait_list, 107 | cl_event * event) CL_API_SUFFIX__VERSION_1_0; 108 | 109 | typedef CL_API_ENTRY cl_int (CL_API_CALL *clEnqueueReleaseD3D11ObjectsNV_fn)( 110 | cl_command_queue command_queue, 111 | cl_uint num_objects, 112 | cl_mem * mem_objects, 113 | cl_uint num_events_in_wait_list, 114 | const cl_event * event_wait_list, 115 | cl_event * event) CL_API_SUFFIX__VERSION_1_0; 116 | 117 | #ifdef __cplusplus 118 | } 119 | #endif 120 | 121 | #endif // __OPENCL_CL_D3D11_H 122 | 123 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | ## Image super-resolution using deep convolutional neural networks 2 | 3 | ## Overview 4 | 5 | Super-resolution problem tries to upscale the image so that perceived loss of quality is minimal. For example after scaling with bicubic interpolation it is apparent that some pixels are just smudged together. 
The question is: *can AI do a better job?* 6 | 7 | 8 | ## Preliminary results 9 | 10 | ![Preliminary result](images/compare.jpg) 11 | *left: upscaling with bicubic interpolation, right: result of the presented algorithm* 12 | 13 | 14 | ![Details](images/details.jpg) 15 | *Details closeup - left: upscaling with bicubic interpolation, right: result of the presented algorithm* 16 | 17 | 18 | As You can see, the presented method achieved a significant improvement in the areas of: face boundaries, fingers, loose hair strands. On the other hand, the finer details on the dress that require a smooth gradient are lost. There are also numerous artifacts. 19 | 20 | I believe that a longer training time could fix some of the mentioned problems. 21 | 22 | 23 | ## Usage 24 | 25 | 26 | #### General guidelines 27 | 28 | * **application input is an already scaled image that we are going to run through various filters** 29 | * only the luma channel is upscaled (so the image presented above is quite close to an edge case) 30 | * best effects are achieved if the image has a lot of edges 31 | * the presented method does not handle textures particularly well 32 | * it may be necessary to optimize the kernels for Your configuration to achieve a faster learning process 33 | 34 | 35 | #### Installation 36 | 37 | You will need OpenCL capable hardware, modern GPU recommended. All required libraries are included in this repository ([libs](/libs)). It is recommended to link with the OpenCL library that can be found in Your PATH environment variable. You should also change the paths in the [makefile](makefile) - they are the result of my non-standard MinGW configuration.
38 | 39 | This app was developed using clang & g++; some changes may be needed to make it work in Visual Studio (like list_files() in [pch.cpp](src/pch.cpp)). 40 | 41 | #### Command line 42 | 43 | * **make build** - compile and create executable 44 | * **make run -- CMD_ARGUMENTS_HERE** - run app with provided arguments (note double dash) 45 | * **make test** - run all tests 46 | 47 | #### Arguments: 48 | 49 | `cnn [-h] [train] [dry] [profile] --config CONFIG --in IN [--out OUT] [--epochs EPOCHS]` 50 | 51 | * **help** - print help 52 | * **train** - train mode 53 | * **dry** - do not store result 54 | * **profile** - print kernel execution times 55 | * **--config CONFIG** - configuration file 56 | * **--in IN** - either image we want to upscale or samples directory during training 57 | * **--out OUT** - output file path (either result image or new set of parameters) 58 | * **--epochs EPOCHS** - number of epochs during training 59 | 60 | #### Examples 61 | 62 | Start the app: `bin\cnn.exe -c data\config.json -i data\my_image.jpg -o data\result.jpg` 63 | 64 | Learning (500 epochs): `bin\cnn.exe train -c data\config.json --epochs 500 -i data\train_samples -o data\parameters.json` 65 | 66 | Start the learning (100 epochs), do not save results: `bin\cnn.exe train -c data\config.json --epochs 100 -i data\train_samples dry` 67 | 68 | 69 | #### Useful scripts: 70 | 71 | * **[generate_training_samples.py](generate_training_samples.py)** - generate ready-to-use training samples based on images from the provided directory 72 | * **[weights_visualize.py](weights_visualize.py)** - present weights as images.
Layer 1 is particularly informative 73 | * **[profile.py](profile.py)** - measure total execution time or time spent per OpenCL kernel 74 | * **[schedule_training.py](schedule_training.py)** - runs the C++ application for a specified number of epochs or for as long as we want the learning to continue 75 | 76 | 77 | #### Config file ([example](example_config.json)) 78 | 79 | The config file is just a simple JSON with the following keys: 80 | * *n1* - number of filters in the first layer 81 | * *n2* - number of filters in the second layer 82 | * *f1* - kernel spatial size in the first layer 83 | * *f2* - kernel spatial size in the second layer 84 | * *f3* - kernel spatial size in the third layer 85 | * *momentum* - momentum used during learning 86 | * *weight_decay_parameter* - used to prevent overfitting 87 | * *learning_rates* - learning rates used during training 88 | * *parameters_file* - file that holds all parameters: weights and biases for layers (optional) 89 | 90 | If you do not provide *parameters_file*, the parameters will be initialized with random numbers from a normal distribution (see the example for details on how this process can be customized). 91 | 92 | #### Parameters file 93 | 94 | Parameters are described with the following simple structure: 95 | ```js 96 | { 97 | "epochs": 0, 98 | "layer1": { 99 | "weights": [..], 100 | "bias": [..] 101 | }, 102 | "layer2": { 103 | "weights": [..], 104 | "bias": [..] 105 | }, 106 | "layer3": { 107 | "weights": [..], 108 | "bias": [..] 109 | } 110 | } 111 | ``` 112 | 113 | The value for key *epochs* is optional and indicates how many epochs were finished during the training process. 114 | 115 | 116 | ## References 117 | 118 | SRCNN [1] was used as the main reference, but the idea was taken from waifu2x [2]. 
119 | 120 | * [1] Chao Dong, Chen Change Loy, Kaiming He, Xiaoou Tang, "Image Super-Resolution Using Deep Convolutional Networks", http://arxiv.org/abs/1501.00092 121 | 122 | * [2] waifu2x, https://github.com/nagadomi/waifu2x 123 | 124 | If you are interested, I've also written 2 articles on the topic at hand: 125 | 126 | * [Neural networks: implementation tips](https://scthe.github.io/2015/08/23/neural-networks-implementation-tips.html) 127 | 128 | * [Backpropagation notes](https://scthe.github.io/2015/08/30/backpropagation-notes.html) 129 | 130 | -------------------------------------------------------------------------------- /test/specs/LayerTest_script.R: -------------------------------------------------------------------------------- 1 | # run the script with: 2 | # rscript test\specs\LayerTest_script.R 3 | 4 | if (!require("rjson")) { 5 | install.packages("rjson", repos="http://cran.rstudio.com/") 6 | library("rjson") 7 | } 8 | 9 | 10 | json_data <- fromJSON(file="test/data/test_cases.json") 11 | 12 | 13 | activation_function <- function(x){ 14 | #1 / (1 + exp(-x)) 15 | max(x,0) 16 | } 17 | 18 | split_by_columns <- function(arr, column_count, as_lists=FALSE){ 19 | if((length(arr) %% column_count) != 0){ 20 | stop(sprintf("Error: Tried to divide array of size(%d) into %d columns", length(arr), column_count)) 21 | } 22 | 23 | result <- c() 24 | for (i in 1:column_count) { 25 | a <- (i) %% column_count 26 | sub_arr <- split(arr, 1:length(arr) %% column_count == a)$`TRUE` 27 | result <- c(result, sub_arr) 28 | } 29 | 30 | if(length(arr) != length(result)) { 31 | stop(sprintf("Error: Length of provided array(%d) != length of result(%d)", length(arr), length(result))) 32 | } 33 | 34 | if(as_lists){ 35 | result <- matrix(result, ncol=column_count) 36 | } 37 | 38 | result 39 | } 40 | 41 | 42 | test_layer <- function(data, preproces_mean=FALSE, result_multiplier=0, decimal_places=3){ 43 | n_prev_filter_cnt <- data$n_prev_filter_cnt 44 | current_filter_count <- 
data$current_filter_count 45 | f_spatial_size <- data$f_spatial_size 46 | input_size <- c(data$input_w, data$input_h) 47 | 48 | input_raw <- data$input 49 | output_raw <- data$output 50 | weights_raw <- data$weights 51 | bias <- data$bias 52 | 53 | out_size <- input_size - c(f_spatial_size, f_spatial_size) + c(1,1) 54 | out_dims <- c(out_size[2], out_size[1], current_filter_count) 55 | input_modifier <- if(preproces_mean) mean(input_raw) else 0 56 | # print(out_size) 57 | 58 | # preprocess data so that we can use native * operator for element-wise multiplication 59 | # (in json we have format that is suitable to be dumped into kernel indexing) 60 | input_vec <- split_by_columns(input_raw, n_prev_filter_cnt) - input_modifier 61 | input <- array(input_vec, c(input_size[1], input_size[2], n_prev_filter_cnt)) 62 | # print(round(input, 3)) 63 | 64 | # create one sub-view per output position (dy is the outer loop, dx the inner one), each of size f_spatial_size^2 * n_prev_filter_cnt 65 | sub_views <- list() 66 | for (dy in 1:out_size[2]) { 67 | for (dx in 1:out_size[1]) { 68 | end_dx <- dx + f_spatial_size - 1 69 | end_dy <- dy + f_spatial_size - 1 70 | sub_view <- input[dx:end_dx, dy:end_dy,] 71 | sub_views[[length(sub_views)+1]] <- sub_view 72 | # cat("SUBVIEW: ", dx, ":", end_dx, ", ", dy, ":",end_dy, "\n") 73 | # print(round(sub_view, 3)) 74 | } 75 | } 76 | 77 | # weights: group the raw values per filter, then remap every value to the flat index used to build the 4D weights array 78 | weights_vec <- c() 79 | weights_by_filter <- split_by_columns(weights_raw, current_filter_count, as_lists=TRUE) 80 | for (filter_id in 1:current_filter_count) { 81 | ws <- weights_by_filter[,filter_id] 82 | # print(sprintf("Weights for filter %d (len=%d): %s", filter_id, length(ws), paste(ws, collapse=" "))) 83 | for(i in 1:length(ws)){ 84 | # a <- (i-1) %/% (f_spatial_size*f_spatial_size) 85 | b <- (i-1) %/% f_spatial_size 86 | c <- (i-1) %% f_spatial_size 87 | d <- filter_id-1 88 | idx <- c * f_spatial_size * n_prev_filter_cnt * current_filter_count + 89 | # a * n_prev_filter_cnt * current_filter_count + 90 | b * current_filter_count + 91 | d 92 | 93 | 
weights_vec[idx+1] = ws[i] 94 | } 95 | } 96 | weights <- array(weights_vec, c(current_filter_count, f_spatial_size, f_spatial_size, n_prev_filter_cnt)) 97 | 98 | # weights - debug print 99 | # for (filter_id in 1:current_filter_count) { 100 | # cat("Weights for filter", filter_id, ":\n") 101 | # print(weights[filter_id,,,]) 102 | # } 103 | 104 | # execute: for each filter, multiply every sub-view element-wise with that filter's weights, sum, add the bias, then either scale by result_multiplier (when != 0) or apply activation_function 105 | result <- c() 106 | for (filter_id in 1:current_filter_count) { 107 | B <- bias[filter_id] 108 | filter_weight <- weights[filter_id,,,] 109 | # print(filter_weight) 110 | 111 | for (sub_view in sub_views) { 112 | # print(round(sub_view,3)) 113 | res <- sum(sub_view * filter_weight) + B 114 | res <- if(result_multiplier != 0) res * result_multiplier 115 | else activation_function(res) 116 | result <- c(result, res) 117 | } 118 | } 119 | res_arr <- array(round(result, decimal_places), out_dims) 120 | 121 | # print status: regroup the expected output (JSON 'output' field) per filter and print its difference against the calculated result 122 | output_vec <- split_by_columns(output_raw, current_filter_count) 123 | output <- array(output_vec, c(out_dims[1], out_dims[2], current_filter_count)) 124 | exp_arr <- array(round(output, decimal_places), out_dims) 125 | 126 | cat("DIFFERENCE - calculated result vs JSON output field (should be ~0 across the board):\n") 127 | print(round(result-output,decimal_places)) 128 | cat("RESULT:\n") 129 | print(round(res_arr,decimal_places)) 130 | # cat("EXPECTED:\n") 131 | # print(round(exp_arr,2)) 132 | 133 | result 134 | } 135 | 136 | help_text <- "How to interpret results:\nResults have OUT_W*OUT_H*CURRENT_FILTER_COUNT numbers printed as OUT_W*OUT_H matrices. With the convention that data (in JSON) for each filter is in the respective column write content of each matrix (column-by-column) into single column (in JSON)." 
137 | 138 | cat("\n\n", help_text, "\n") 139 | 140 | # print(json_data) 141 | # print(class(json_data)) 142 | # print(json_data[[1]]) 143 | # print(length(json_data)) 144 | # print(names(json_data)) 145 | 146 | for( name in names(json_data)){ 147 | print('------------------') 148 | print(name) 149 | print('------------------') 150 | test_layer(json_data[[name]], preproces_mean = FALSE) 151 | } 152 | 153 | -------------------------------------------------------------------------------- /src/kernel/layer_deltas.cl: -------------------------------------------------------------------------------- 1 | 2 | /* clang-format off */ 3 | /** 4 | * 5 | * Calculate deltas*activation_func_derivative of previous layer. 6 | * 7 | * In following notation (l), (l-1) describes relative layer and [...] lower indices. 8 | * 9 | * Algo for delta_ijn on layer (l-1), where: 10 | * i = 0..output_w(l-1), 11 | * j = 0..output_h(l-1), 12 | * n = 0..filter_count(l-1): 13 | * 14 | * delta[i,j,n](l-1) = 0 15 | * for a = 0..spatial_size(l+1): 16 | * for b = 0..spatial_size(l+1): 17 | * for k = 0..filter_count(l+1): 18 | * delta[i,j,n](l-1) += \ 19 | * w[abnk](l-1) # (1) weight of edge between [i,j,n](l-1) and [i+a,j+b,k](l) 20 | * * delta[i-a,j-b,k](l) # (2) error term for [i-a,j-b,k](l). minus since we have point (i,j) and we asking: 'which output point are we affecting with w[a,b,_,_]' 21 | * * f`(x[i,j,n](l-1) ) # (3) derivative of activation function at measured point 22 | * 23 | * TODO in (3) should index be x[i+a,j+b,n] or x[i,j,n]? 24 | * 25 | * macros: 26 | * CURRENT_FILTER_COUNT filter_count(l-1) 27 | * 28 | * @param float* deltas_next_layer size: output_w(l) * output_w(l) * filter_count(l) 29 | * @param float* layer_output size: output_w(l-1) * output_w(l-1) * filter_count(l-1) 30 | * @param float* target size: output_w(l-1) * output_w(l-1) * filter_count(l-1) 31 | * @param float* W weights between (l-1) and (l). 
32 | * WARN: w3 is between (l2) and (l3), w2 -> (l1) and (l2), w1 -> (input) and (l1) 33 | * size: f_spatial_size*f_spatial_size*filter_count(l-1)*filter_count(l) 34 | * @param uint f_spatial_size spatial/kernel size for (l-1) 35 | * @param uint f_next_spatial_size spatial/kernel size for (l) 36 | * @param uint n_next_filter_cnt filter_count(l) 37 | * @param uint layer_out_w output_w(l-1) 38 | * @param uint layer_out_h output_h(l-1) 39 | * @return {[type]} [description] 40 | */ 41 | /* clang-format on */ 42 | __kernel void deltas(__read_only __global float* deltas_next_layer, // 43 | __read_only __global float* layer_output, // 44 | __global float* target, // 45 | __read_only __global float* W, // 46 | uint f_spatial_size, // 47 | uint f_next_spatial_size, // 48 | uint n_next_filter_cnt, // 49 | uint layer_out_w, uint layer_out_h) { 50 | // x=col=i; range: 0..layer_out_w 51 | // y=row=j; range: 0..layer_out_h 52 | const int2 pos = {get_global_id(0), get_global_id(1)}; 53 | const uint sample_id = get_global_id(2); 54 | const int2 out_dim = {layer_out_w, layer_out_h}; 55 | const int idx = ((pos.y * out_dim.x) + pos.x) * CURRENT_FILTER_COUNT; 56 | const int2 next_layer_out = {out_dim.x - f_next_spatial_size + 1, 57 | out_dim.y - f_next_spatial_size + 1}; 58 | 59 | #define IMAGE_OFFSET_CURR \ 60 | sample_id* CURRENT_FILTER_COUNT* layer_out_w* layer_out_h 61 | #define IMAGE_OFFSET_NEXT \ 62 | sample_id* n_next_filter_cnt* next_layer_out.x* next_layer_out.y 63 | 64 | // zeroed result cache and read read output values for output[i,j,n] 65 | float delta_for_filter[CURRENT_FILTER_COUNT]; 66 | float activation_func_derivatives[CURRENT_FILTER_COUNT]; 67 | 68 | // range check for i,j 69 | if (pos.x >= 0 && pos.x < out_dim.x && // 70 | pos.y >= 0 && pos.y < out_dim.y) { 71 | // fill tmp buffer values 72 | for (size_t n = 0; n < CURRENT_FILTER_COUNT; n++) { 73 | delta_for_filter[n] = 0.0f; 74 | // (3) f`( x[i,j,n](l-1) ) 75 | float y_ijn = layer_output[IMAGE_OFFSET_CURR + idx + 
n]; 76 | activation_func_derivatives[n] = y_ijn > 0.0f ? 1.0f : 0.0f; 77 | } 78 | 79 | for (size_t dy = 0; dy < f_next_spatial_size; dy++) { 80 | for (size_t dx = 0; dx < f_next_spatial_size; dx++) { 81 | // NOTE: dy=a, dx=b 82 | int2 next_layer_pos = {pos.x - dx, pos.y - dy}; 83 | size_t w_idx_2D = ((dy * f_next_spatial_size) + dx) * 84 | n_next_filter_cnt * CURRENT_FILTER_COUNT; 85 | 86 | for (size_t k = 0; k < n_next_filter_cnt; k++) { 87 | // (2) delta[i+a,j+b,k](l) 88 | // this requires us to map curent output_pos to next layer coords, 89 | // but some of the point may not be in range. f.e. point(i=0,j=0) does 90 | // not affect output with w[a,b] if a!=0 && b!=0 91 | int next_layer_idx = 92 | ((next_layer_pos.y * next_layer_out.x) + next_layer_pos.x) * 93 | n_next_filter_cnt; 94 | bool in_range = 95 | next_layer_pos.x >= 0 && next_layer_pos.x < next_layer_out.x && 96 | next_layer_pos.y >= 0 && next_layer_pos.y < next_layer_out.y; 97 | float delta = 98 | in_range 99 | ? deltas_next_layer[IMAGE_OFFSET_NEXT + next_layer_idx + k] 100 | : 0.0f; 101 | 102 | for (size_t n = 0; n < CURRENT_FILTER_COUNT; n++) { 103 | // (1) w[abnk](l-1) 104 | // NOTE: n iterates over lower layer's filters 105 | size_t w_idx = w_idx_2D + n * n_next_filter_cnt + k; 106 | float w = W[w_idx]; 107 | 108 | // (3) f`( x[i,j,n](l-1) ) 109 | float activation_func_derivative = activation_func_derivatives[n]; 110 | 111 | // result 112 | delta_for_filter[n] += delta * w * activation_func_derivative; 113 | } 114 | } 115 | 116 | // 117 | } 118 | } 119 | 120 | // write results 121 | for (size_t n = 0; n < CURRENT_FILTER_COUNT; n++) { 122 | target[IMAGE_OFFSET_CURR + idx + n] = delta_for_filter[n]; 123 | } 124 | 125 | // end 126 | } 127 | } 128 | -------------------------------------------------------------------------------- /libs/include/CL/cl_d3d9_ext.h: -------------------------------------------------------------------------------- 1 | 
/********************************************************************************** 2 | * Copyright (c) 2008-2009 The Khronos Group Inc. 3 | * 4 | * Permission is hereby granted, free of charge, to any person obtaining a 5 | * copy of this software and/or associated documentation files (the 6 | * "Materials"), to deal in the Materials without restriction, including 7 | * without limitation the rights to use, copy, modify, merge, publish, 8 | * distribute, sublicense, and/or sell copies of the Materials, and to 9 | * permit persons to whom the Materials are furnished to do so, subject to 10 | * the following conditions: 11 | * 12 | * The above copyright notice and this permission notice shall be included 13 | * in all copies or substantial portions of the Materials. 14 | * 15 | * THE MATERIALS ARE PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 16 | * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 17 | * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 18 | * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY 19 | * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, 20 | * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE 21 | * MATERIALS OR THE USE OR OTHER DEALINGS IN THE MATERIALS. 
22 | **********************************************************************************/ 23 | 24 | #ifndef __OPENCL_CL_D3D9_EXT_H 25 | #define __OPENCL_CL_D3D9_EXT_H 26 | 27 | #include 28 | #include 29 | #include 30 | 31 | #ifdef __cplusplus 32 | extern "C" { 33 | #endif 34 | 35 | /****************************************************************************** 36 | * cl_nv_d3d9_sharing */ 37 | 38 | typedef cl_uint cl_d3d9_device_source_nv; 39 | typedef cl_uint cl_d3d9_device_set_nv; 40 | 41 | /******************************************************************************/ 42 | 43 | // Error Codes 44 | #define CL_INVALID_D3D9_DEVICE_NV -1010 45 | #define CL_INVALID_D3D9_RESOURCE_NV -1011 46 | #define CL_D3D9_RESOURCE_ALREADY_ACQUIRED_NV -1012 47 | #define CL_D3D9_RESOURCE_NOT_ACQUIRED_NV -1013 48 | 49 | // cl_d3d9_device_source_nv 50 | #define CL_D3D9_DEVICE_NV 0x4022 51 | #define CL_D3D9_ADAPTER_NAME_NV 0x4023 52 | 53 | // cl_d3d9_device_set_nv 54 | #define CL_PREFERRED_DEVICES_FOR_D3D9_NV 0x4024 55 | #define CL_ALL_DEVICES_FOR_D3D9_NV 0x4025 56 | 57 | // cl_context_info 58 | #define CL_CONTEXT_D3D9_DEVICE_NV 0x4026 59 | 60 | // cl_mem_info 61 | #define CL_MEM_D3D9_RESOURCE_NV 0x4027 62 | 63 | // cl_image_info 64 | #define CL_IMAGE_D3D9_FACE_NV 0x4028 65 | #define CL_IMAGE_D3D9_LEVEL_NV 0x4029 66 | 67 | // cl_command_type 68 | #define CL_COMMAND_ACQUIRE_D3D9_OBJECTS_NV 0x402A 69 | #define CL_COMMAND_RELEASE_D3D9_OBJECTS_NV 0x402B 70 | 71 | /******************************************************************************/ 72 | 73 | typedef CL_API_ENTRY cl_int (CL_API_CALL *clGetDeviceIDsFromD3D9NV_fn)( 74 | cl_platform_id platform, 75 | cl_d3d9_device_source_nv d3d_device_source, 76 | void * d3d_object, 77 | cl_d3d9_device_set_nv d3d_device_set, 78 | cl_uint num_entries, 79 | cl_device_id * devices, 80 | cl_uint * num_devices) CL_API_SUFFIX__VERSION_1_0; 81 | 82 | typedef CL_API_ENTRY cl_mem (CL_API_CALL *clCreateFromD3D9VertexBufferNV_fn)( 83 | cl_context context, 84 
| cl_mem_flags flags, 85 | IDirect3DVertexBuffer9 * resource, 86 | cl_int * errcode_ret) CL_API_SUFFIX__VERSION_1_0; 87 | 88 | typedef CL_API_ENTRY cl_mem (CL_API_CALL *clCreateFromD3D9IndexBufferNV_fn)( 89 | cl_context context, 90 | cl_mem_flags flags, 91 | IDirect3DIndexBuffer9 * resource, 92 | cl_int * errcode_ret) CL_API_SUFFIX__VERSION_1_0; 93 | 94 | typedef CL_API_ENTRY cl_mem (CL_API_CALL *clCreateFromD3D9SurfaceNV_fn)( 95 | cl_context context, 96 | cl_mem_flags flags, 97 | IDirect3DSurface9 * resource, 98 | cl_int * errcode_ret) CL_API_SUFFIX__VERSION_1_0; 99 | 100 | typedef CL_API_ENTRY cl_mem (CL_API_CALL *clCreateFromD3D9TextureNV_fn)( 101 | cl_context context, 102 | cl_mem_flags flags, 103 | IDirect3DTexture9 *resource, 104 | UINT miplevel, 105 | cl_int * errcode_ret) CL_API_SUFFIX__VERSION_1_0; 106 | 107 | typedef CL_API_ENTRY cl_mem (CL_API_CALL *clCreateFromD3D9CubeTextureNV_fn)( 108 | cl_context context, 109 | cl_mem_flags flags, 110 | IDirect3DCubeTexture9 * resource, 111 | D3DCUBEMAP_FACES facetype, 112 | UINT miplevel, 113 | cl_int * errcode_ret) CL_API_SUFFIX__VERSION_1_0; 114 | 115 | typedef CL_API_ENTRY cl_mem (CL_API_CALL *clCreateFromD3D9VolumeTextureNV_fn)( 116 | cl_context context, 117 | cl_mem_flags flags, 118 | IDirect3DVolumeTexture9 * resource, 119 | UINT miplevel, 120 | cl_int * errcode_ret) CL_API_SUFFIX__VERSION_1_0; 121 | 122 | typedef CL_API_ENTRY cl_int (CL_API_CALL *clEnqueueAcquireD3D9ObjectsNV_fn)( 123 | cl_command_queue command_queue, 124 | cl_uint num_objects, 125 | const cl_mem *mem_objects, 126 | cl_uint num_events_in_wait_list, 127 | const cl_event *event_wait_list, 128 | cl_event *event) CL_API_SUFFIX__VERSION_1_0; 129 | 130 | typedef CL_API_ENTRY cl_int (CL_API_CALL *clEnqueueReleaseD3D9ObjectsNV_fn)( 131 | cl_command_queue command_queue, 132 | cl_uint num_objects, 133 | cl_mem *mem_objects, 134 | cl_uint num_events_in_wait_list, 135 | const cl_event *event_wait_list, 136 | cl_event *event) CL_API_SUFFIX__VERSION_1_0; 
137 | 138 | #ifdef __cplusplus 139 | } 140 | #endif 141 | 142 | #endif // __OPENCL_CL_D3D9_H 143 | 144 | -------------------------------------------------------------------------------- /src/ConfigBasedDataPipeline.hpp: -------------------------------------------------------------------------------- 1 | #ifndef CONFIG_BASED_DATA_PIPELINE_H 2 | #define CONFIG_BASED_DATA_PIPELINE_H 3 | 4 | #include "DataPipeline.hpp" 5 | #include "LayerData.hpp" 6 | 7 | namespace cnn_sr { 8 | 9 | /** 10 | * All gpu buffer handles related to single image 11 | */ 12 | struct SampleAllocationPool { 13 | /** Raw 3 channel image loaded from hard drive */ 14 | opencl::MemoryHandle input_data = gpu_nullptr; 15 | /** Single channel (luma) of size input_w*input_h */ 16 | opencl::MemoryHandle input_luma = gpu_nullptr; 17 | /** Dimensions of original image; zero-initialized so that a default-constructed pool never holds indeterminate sizes */ 18 | size_t input_w = 0, input_h = 0; 19 | 20 | /** Training: Raw 3 channel image loaded from hard drive */ 21 | opencl::MemoryHandle expected_data = gpu_nullptr; 22 | /** Training: luma to compare our result to */ 23 | opencl::MemoryHandle expected_luma = gpu_nullptr; 24 | 25 | SampleAllocationPool() = default; 26 | 27 | // private: 28 | // SampleAllocationPool(const SampleAllocationPool&) = delete; 29 | // SampleAllocationPool& operator=(const SampleAllocationPool&) = delete; 30 | }; 31 | 32 | /** Represents all general allocations that we will make */ 33 | struct GpuAllocationPool { 34 | LayerAllocationPool layer_1; 35 | LayerAllocationPool layer_2; 36 | LayerAllocationPool layer_3; 37 | 38 | std::vector samples; 39 | }; 40 | 41 | /** 42 | * Class that wraps all low level functions from DataPipeline into something 43 | * more usable 44 | */ 45 | class ConfigBasedDataPipeline : public DataPipeline { 46 | public: 47 | ConfigBasedDataPipeline(Config&, opencl::Context*); 48 | 49 | void init(int load_flags); 50 | 51 | void set_mini_batch_size(size_t); 52 | 53 | float execute_batch(bool backpropagate, GpuAllocationPool&, 54 | std::vector&); 55 
| 56 | cl_event forward(LayerAllocationPool& layer_1_alloc, // 57 | LayerAllocationPool& layer_2_alloc, // 58 | LayerAllocationPool& layer_3_alloc, // 59 | SampleAllocationPool& sample); 60 | 61 | private: 62 | void allocate_buffers(size_t, size_t); 63 | 64 | cl_event forward(LayerAllocationPool& layer_1_alloc, // 65 | LayerAllocationPool& layer_2_alloc, // 66 | LayerAllocationPool& layer_3_alloc, // 67 | size_t w, size_t h, size_t id); 68 | 69 | /* clang-format off */ 70 | /** 71 | * General backpropagation steps: 72 | * - calculate weight decay (NOTE: value expected as a parameter) 73 | * - calculate deltas for last layer 74 | * - calculate deltas for other layers in reverse order 75 | * - backpropagate: calculate gradient w, gradient b for all layers 76 | * - update weights and biases (NOTE: requires explicit call to ConfigBasedDataPipeline::update_parameters(...)) 77 | * 78 | * @param (1st) LayerAllocationPool& unnamed in the declaration; presumably allocations of layer 1 (order assumed to mirror forward()) 79 | * @param (2nd) LayerAllocationPool& unnamed in the declaration; presumably allocations of layer 2 80 | * @param (3rd) LayerAllocationPool& unnamed in the declaration; presumably allocations of layer 3 81 | * @param (4th-6th) size_t unnamed in the declaration; presumably (w, h, id) as in the private forward() overload - TODO confirm against the definition 82 | * 83 | * NOTE(review): this comment used to list cnn_input, gpu_buf_ground_truth, ground_truth_w, 84 | * ground_truth_h and weight_decay; none of them appear in the signature below. 85 | * 86 | * @param ev_to_wait_for optional event, defaults to nullptr; presumably has to complete before the work enqueued here starts - confirm 87 | * @return cl_event; presumably the event of the last enqueued operation - confirm against the definition 88 | */ 89 | cl_event backpropagate(cnn_sr::LayerAllocationPool&, 90 | cnn_sr::LayerAllocationPool&, 91 | cnn_sr::LayerAllocationPool&, 92 | size_t, size_t, size_t, 93 | cl_event* ev_to_wait_for = nullptr); 94 | /* clang-format on */ 95 | 96 | public: 97 | /** update weights and biases*/ 98 | void update_parameters(cnn_sr::LayerAllocationPool&, 99 | cnn_sr::LayerAllocationPool&, 100 | cnn_sr::LayerAllocationPool&, size_t batch_size, 101 | cl_event* ev_to_wait_for = nullptr); 102 | 103 | void write_params_to_file(const char* const file_path, // 104 | cnn_sr::LayerAllocationPool, 105 | 
cnn_sr::LayerAllocationPool, 106 | cnn_sr::LayerAllocationPool); 107 | 108 | void write_result_image(const char* const, opencl::utils::ImageData&, 109 | SampleAllocationPool& sample); 110 | 111 | inline const Config* config() { return _config; } 112 | inline const LayerData* layer_1() { return &layer_data_1; } 113 | inline const LayerData* layer_2() { return &layer_data_2; } 114 | inline const LayerData* layer_3() { return &layer_data_3; } 115 | 116 | protected: 117 | void load_kernels(int load_flags); 118 | 119 | private: 120 | void fill_random_parameters(LayerData&, ParametersDistribution&); 121 | 122 | size_t load_parameters_file(const char* const); 123 | 124 | void create_luma_image(const char* const, opencl::MemoryHandle, size_t, 125 | size_t); 126 | 127 | // void create_lumas_delta_image(const char* const, SampleAllocationPool& e, 128 | // AllocationItem&); 129 | 130 | private: 131 | Config* const _config; 132 | LayerData layer_data_1; 133 | LayerData layer_data_2; 134 | LayerData layer_data_3; 135 | size_t epochs = 0; 136 | size_t _mini_batch_size = 0; 137 | 138 | /* ground truth for batch */ 139 | opencl::MemoryHandle _ground_truth_gpu_buf = gpu_nullptr; 140 | /** input for layer 1 */ 141 | opencl::MemoryHandle _forward_gpu_buf = gpu_nullptr; 142 | /** outputs for layers */ 143 | opencl::MemoryHandle _out_1_gpu_buf = gpu_nullptr, // 144 | _out_2_gpu_buf = gpu_nullptr, // 145 | _out_3_gpu_buf = gpu_nullptr; 146 | /** deltas for layers */ 147 | opencl::MemoryHandle _delta_1_gpu_buf = gpu_nullptr, // 148 | _delta_2_gpu_buf = gpu_nullptr, // 149 | _delta_3_gpu_buf = gpu_nullptr; 150 | 151 | opencl::Kernel* _layer_1_kernel = nullptr; 152 | opencl::Kernel* _layer_2_kernel = nullptr; 153 | opencl::Kernel* _layer_3_kernel = nullptr; 154 | opencl::Kernel* _layer_1_deltas_kernel = nullptr; 155 | opencl::Kernel* _layer_2_deltas_kernel = nullptr; 156 | }; 157 | } 158 | 159 | #endif /* CONFIG_BASED_DATA_PIPELINE_H */ 160 | 
-------------------------------------------------------------------------------- /test/specs/LayerTest.cpp: -------------------------------------------------------------------------------- 1 | #include "TestSpecsDeclarations.hpp" 2 | 3 | #include 4 | #include 5 | #include 6 | #include 7 | 8 | #include "json/gason.h" 9 | 10 | #include "../../src/DataPipeline.hpp" 11 | #include "../../src/LayerData.hpp" 12 | 13 | auto test_data_file = "test/data/test_cases.json"; 14 | 15 | /* clang-format off */ 16 | /* 17 | * 18 | * NOTE: use LayerTest_script.R to generate expected output values 19 | * 20 | * 21 | * Test data schema description (values for each layer provided after '/'): 22 | * 23 | * n_prev_filter_cnt := INT, filter count for previous layer, values: 1/n1/n2 24 | * current_filter_count := INT, filter count for this layer, values: n1/n2/1 25 | * f_spatial_size := INT, spatial size, values: f1/f2/f3 26 | * input_w := INT, input dimensions 27 | * input_h := INT, input dimensions 28 | * input := VECTOR[FLOAT], min size: input_w * input_h * n_prev_filter_cnt. 29 | * Each column for different filter(from 1 to n_prev_filter_cnt). 30 | * Each row for different point in range 0..input_w*input_h 31 | * output := VECTOR[FLOAT], min size: out_w * out_h * current_filter_count 32 | * Expected output 33 | * weights := VECTOR[FLOAT], min size: f_spatial_size^2 * n_prev_filter_cnt * current_filter_count 34 | * There are f_spatial_size paragraphs 35 | * Each paragraph consists of f_spatial_size lines, representing 1 row. 36 | * Each row contains current_filter_count*n_prev_filter_cnt numbers, 37 | * grouped by n_prev_filter_cnt (n_prev_filter_cnt groups, 38 | * current_filter_count numbers per each group). 
39 | * bias := VECTOR[FLOAT], min size: current_filter_count 40 | * 41 | * 42 | * calculated values: 43 | * out_w := input_w - f_spatial_size + 1 44 | * out_h := input_h - f_spatial_size + 1 45 | */ 46 | /* clang-format on */ 47 | 48 | namespace test { 49 | namespace specs { 50 | 51 | /// 52 | /// Data set 53 | /// 54 | struct LayerDataSet : DataSet { 55 | size_t n_prev_filter_cnt, // 56 | current_filter_count, // 57 | f_spatial_size, // 58 | input_w, input_h; 59 | std::vector input; 60 | std::vector output; 61 | std::vector weights; 62 | std::vector bias; 63 | }; 64 | 65 | /// 66 | /// PIMPL 67 | /// 68 | struct LayerTestImpl { 69 | bool read_test_data_from_file(char const* const file); 70 | 71 | std::vector data_sets; 72 | }; 73 | 74 | /// 75 | /// LayerTest 76 | /// 77 | 78 | TEST_SPEC_PIMPL(LayerTest) 79 | 80 | void LayerTest::init() { 81 | auto status = _impl->read_test_data_from_file(test_data_file); 82 | if (!status) { 83 | exit(EXIT_FAILURE); 84 | } 85 | } 86 | 87 | size_t LayerTest::data_set_count() { return _impl->data_sets.size(); } 88 | 89 | std::string LayerTest::name(size_t data_set_id) { 90 | if (data_set_count() == 0) { 91 | return "Layer test - no data sets provided"; 92 | } 93 | assert_data_set_ok(data_set_id); 94 | return "Layer test - " + _impl->data_sets[data_set_id].name; 95 | } 96 | 97 | bool LayerTest::operator()(size_t data_set_id, 98 | cnn_sr::DataPipeline* const pipeline) { 99 | if (data_set_count() == 0) return false; 100 | 101 | assert_not_null(pipeline); 102 | assert_data_set_ok(data_set_id); 103 | auto data = &_impl->data_sets[data_set_id]; 104 | auto _context = pipeline->context(); 105 | 106 | // convert layer test definition to cnn_sr::LayerData object 107 | cnn_sr::LayerData layer_data(data->n_prev_filter_cnt, 108 | data->current_filter_count, 109 | data->f_spatial_size); 110 | layer_data.set_weights(data->weights.data()); 111 | layer_data.set_bias(data->bias.data()); 112 | 113 | size_t out_dim[2]; 114 | 
layer_data.get_output_dimensions(out_dim, data->input_w, data->input_h); 115 | 116 | // alloc input: the layer kernel only reads this buffer, the host fills it through write_buffer 117 | cnn_sr::LayerAllocationPool gpu_alloc; 118 | opencl::MemoryHandle gpu_output = gpu_nullptr; 119 | auto gpu_buf_in = _context->allocate(CL_MEM_READ_ONLY, 120 | sizeof(cl_float) * data->input.size()); 121 | _context->write_buffer(gpu_buf_in, (void*)data->input.data(), true); 122 | 123 | // create kernel & run 124 | auto kernel = pipeline->create_layer_kernel(layer_data, false); 125 | pipeline->execute_layer(*kernel, layer_data, gpu_alloc, gpu_buf_in, 126 | data->input_w, data->input_h, gpu_output); 127 | assert_equals(pipeline, data->output, gpu_output); 128 | 129 | return true; 130 | } 131 | 132 | // 133 | // 134 | // 135 | 136 | bool read_layer_data(const JsonValue& object, LayerDataSet& data) { 137 | // ASSERT(object.getTag() == JSON_TAG_OBJECT); 138 | using namespace cnn_sr::utils; 139 | 140 | for (auto node : object) { 141 | try_read_uint(*node, data.n_prev_filter_cnt, "n_prev_filter_cnt"); 142 | try_read_uint(*node, data.f_spatial_size, "f_spatial_size"); 143 | try_read_uint(*node, data.current_filter_count, "current_filter_count"); 144 | try_read_uint(*node, data.input_w, "input_w"); 145 | try_read_uint(*node, data.input_h, "input_h"); 146 | try_read_vector(*node, data.input, "input"); 147 | try_read_vector(*node, data.output, "output"); 148 | try_read_vector(*node, data.weights, "weights"); 149 | try_read_vector(*node, data.bias, "bias"); 150 | } 151 | 152 | return true; 153 | } 154 | 155 | bool LayerTestImpl::read_test_data_from_file(char const* const file) { 156 | std::cout << "Loading layer test data from: '" << file << "'" << std::endl; 157 | 158 | JsonValue value; 159 | JsonAllocator allocator; 160 | std::string source; 161 | cnn_sr::utils::read_json_file(file, value, allocator, source, JSON_OBJECT); 162 | 163 | bool read_status = true; 164 | if (value.getTag() == JSON_OBJECT) { 165 | for (auto object : value) { 166 | if (object->value.getTag() != 
JSON_OBJECT) continue; 167 | // std::cout << object->key << std::endl; 168 | data_sets.push_back(LayerDataSet()); 169 | LayerDataSet* ptr = &data_sets[data_sets.size() - 1]; 170 | ptr->name = object->key; 171 | read_status &= read_layer_data(object->value, *ptr); 172 | } 173 | } 174 | 175 | return read_status; 176 | } 177 | 178 | // 179 | // 180 | } // namespace specs 181 | } // namespace test 182 | -------------------------------------------------------------------------------- /src/Config.cpp: -------------------------------------------------------------------------------- 1 | #include "Config.hpp" 2 | #include // for std::abs 3 | #include // for strcmp when reading json 4 | 5 | #include "json/gason.h" 6 | 7 | namespace cnn_sr { 8 | using namespace utils; 9 | 10 | const char* const parameters_keys[3] = {"parameters_distribution_1", 11 | "parameters_distribution_2", 12 | "parameters_distribution_3"}; 13 | 14 | ParametersDistribution::ParametersDistribution(float mean_w, float mean_b, 15 | float sd_w, float sd_b) 16 | : mean_w(mean_w), sd_w(sd_w), mean_b(mean_b), sd_b(sd_b) {} 17 | 18 | /// 19 | /// Config 20 | /// 21 | Config::Config(size_t n1, size_t n2, // 22 | size_t f1, size_t f2, size_t f3, // 23 | float momentum, float weight_decay, float* learning_rates, // 24 | ParametersDistribution pd1, // 25 | ParametersDistribution pd2, // 26 | ParametersDistribution pd3, // 27 | const char* const parameters_file) 28 | : n1(n1), 29 | n2(n2), 30 | f1(f1), 31 | f2(f2), 32 | f3(f3), 33 | momentum(momentum), 34 | weight_decay_parameter(weight_decay), 35 | parameters_file(parameters_file), 36 | params_distr_1(pd1), 37 | params_distr_2(pd2), 38 | params_distr_3(pd3) { 39 | for (size_t i = 0; i < 3; i++) { 40 | this->learning_rate[i] = learning_rates[i]; 41 | } 42 | } 43 | 44 | size_t Config::total_padding() const { return f1 + f2 + f3 - 3; } 45 | 46 | void Config::validate(Config& config) { 47 | // spatial size works best if is odd number 48 | 
utils::require(is_odd(config.f1), "f1 should be odd"); 49 | utils::require(is_odd(config.f2), "f2 should be odd"); 50 | utils::require(is_odd(config.f3), "f3 should be odd"); 51 | // both filter count and spatial size cannot be 0 52 | utils::require(config.n1 > 0, "n1 should be >0"); 53 | utils::require(config.n2 > 0, "n2 should be >0"); 54 | utils::require(config.f1 > 0, "f1 should be >0"); 55 | utils::require(config.f2 > 0, "f2 should be >0"); 56 | utils::require(config.f3 > 0, "f3 should be >0"); 57 | 58 | utils::require(config.f3 > 0, "f3 should be >0"); 59 | utils::require(config.weight_decay_parameter >= 0, 60 | "weight_decay should be >0"); 61 | utils::require(config.learning_rate[0] > 0 && config.learning_rate[1] > 0 && 62 | config.learning_rate[2] > 0, 63 | "All learning rates should be >0"); 64 | 65 | // ParametersDistribution 66 | ParametersDistribution* pd_arr[3] = {&config.params_distr_1, // 67 | &config.params_distr_2, // 68 | &config.params_distr_3}; 69 | for (auto i = 0; i < 3; i++) { 70 | auto pd = pd_arr[i]; 71 | utils::require(pd->sd_w > 0, "std dev. for weights should be > 0"); 72 | utils::require(pd->sd_b >= 0, "std dev. 
for bias should be >= 0"); 73 | } 74 | } 75 | 76 | /// 77 | /// ConfigReader 78 | /// 79 | 80 | struct ConfigHelper { 81 | size_t n1, n2, f1, f2, f3; 82 | float momentum, weight_decay, lr1, lr2, lr3; 83 | std::string parameters_file = ""; 84 | std::vector learning_rates; 85 | }; 86 | 87 | void fix_params_distribution(ParametersDistribution& d) { 88 | d.mean_w = std::abs(d.mean_w); 89 | d.mean_b = std::abs(d.mean_b); 90 | d.sd_w = std::abs(d.sd_w); 91 | d.sd_b = std::abs(d.sd_b); 92 | } 93 | 94 | void load_parameters_distr(JsonNode* node, ParametersDistribution& data) { 95 | for (auto subnode : node->value) { 96 | utils::try_read_float(*subnode, data.mean_w, "mean_w"); 97 | utils::try_read_float(*subnode, data.mean_b, "mean_b"); 98 | utils::try_read_float(*subnode, data.sd_w, "std_deviation_w"); 99 | utils::try_read_float(*subnode, data.sd_b, "std_deviation_b"); 100 | } 101 | } 102 | 103 | Config ConfigReader::read(const char* const file) { 104 | JsonValue value; 105 | JsonAllocator allocator; 106 | std::string source; 107 | utils::read_json_file(file, value, allocator, source, JSON_OBJECT); 108 | 109 | ConfigHelper cfg_h; 110 | ParametersDistribution pd1, pd2, pd3; 111 | for (auto node : value) { 112 | auto key = node->key; 113 | utils::try_read_uint(*node, cfg_h.n1, "n1"); 114 | utils::try_read_uint(*node, cfg_h.n2, "n2"); 115 | utils::try_read_uint(*node, cfg_h.f1, "f1"); 116 | utils::try_read_uint(*node, cfg_h.f2, "f2"); 117 | utils::try_read_uint(*node, cfg_h.f3, "f3"); 118 | utils::try_read_float(*node, cfg_h.momentum, "momentum"); 119 | utils::try_read_float(*node, cfg_h.weight_decay, "weight_decay_parameter"); 120 | utils::try_read_string(*node, cfg_h.parameters_file, "parameters_file"); 121 | utils::try_read_vector(*node, cfg_h.learning_rates, "learning_rates"); 122 | 123 | if (strcmp(key, parameters_keys[0]) == 0) { 124 | load_parameters_distr(node, pd1); 125 | } else if (strcmp(key, parameters_keys[1]) == 0) { 126 | load_parameters_distr(node, pd2); 127 | 
} else if (strcmp(key, parameters_keys[2]) == 0) { 128 | load_parameters_distr(node, pd3); 129 | } 130 | } 131 | 132 | fix_params_distribution(pd1); 133 | fix_params_distribution(pd2); 134 | fix_params_distribution(pd3); 135 | utils::require(cfg_h.learning_rates.size() == 3, 136 | "Expected 3 learning rates (one per layer) to be provided"); 137 | 138 | Config cfg(cfg_h.n1, cfg_h.n2, // 139 | cfg_h.f1, cfg_h.f2, cfg_h.f3, // 140 | cfg_h.momentum, cfg_h.weight_decay, 141 | &cfg_h.learning_rates[0], // 142 | pd1, pd2, pd3, // 143 | cfg_h.parameters_file.c_str()); 144 | Config::validate(cfg); 145 | 146 | return cfg; 147 | } 148 | } 149 | 150 | std::ostream& operator<<(std::ostream& os, 151 | const cnn_sr::ParametersDistribution& pd) { 152 | /* clang-format off */ 153 | os << "{ weights(" << pd.mean_w << ", " << pd.sd_w 154 | << "), bias(" << pd.mean_b << ", " << pd.sd_b << ")}"; 155 | /* clang-format on */ 156 | return os; 157 | } 158 | 159 | std::ostream& operator<<(std::ostream& os, const cnn_sr::Config& cfg) { 160 | /* clang-format off */ 161 | os << "Config {" << std::endl 162 | << " parameters file: '" << cfg.parameters_file << "'" << std::endl 163 | << " momentum: " << cfg.momentum << std::endl 164 | << " learning rates: { " << cfg.learning_rate[0] << ", " 165 | << cfg.learning_rate[1] << ", " 166 | << cfg.learning_rate[2] << "}" << std::endl 167 | << " layer 1: " << cfg.n1 << " filters, " << cfg.f1 << " spatial size" << std::endl 168 | << " layer 2: " << cfg.n2 << " filters, " << cfg.f2 << " spatial size" << std::endl 169 | << " layer 3: " << cfg.f3 << " spatial size" << std::endl 170 | << " parameters dist. 1 " << cfg.params_distr_1 << std::endl 171 | << " parameters dist. 2 " << cfg.params_distr_2 << std::endl 172 | << " parameters dist. 
3 " << cfg.params_distr_3 << "}" << std::endl; 173 | /* clang-format on */ 174 | return os; 175 | } 176 | -------------------------------------------------------------------------------- /libs/include/CL/cl_gl.h: -------------------------------------------------------------------------------- 1 | /********************************************************************************** 2 | * Copyright (c) 2008-2010 The Khronos Group Inc. 3 | * 4 | * Permission is hereby granted, free of charge, to any person obtaining a 5 | * copy of this software and/or associated documentation files (the 6 | * "Materials"), to deal in the Materials without restriction, including 7 | * without limitation the rights to use, copy, modify, merge, publish, 8 | * distribute, sublicense, and/or sell copies of the Materials, and to 9 | * permit persons to whom the Materials are furnished to do so, subject to 10 | * the following conditions: 11 | * 12 | * The above copyright notice and this permission notice shall be included 13 | * in all copies or substantial portions of the Materials. 14 | * 15 | * THE MATERIALS ARE PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 16 | * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 17 | * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 18 | * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY 19 | * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, 20 | * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE 21 | * MATERIALS OR THE USE OR OTHER DEALINGS IN THE MATERIALS. 22 | **********************************************************************************/ 23 | 24 | /* $Revision: 11708 $ on $Date: 2010-06-13 23:36:24 -0700 (Sun, 13 Jun 2010) $ */ 25 | 26 | /* 27 | * cl_gl.h contains Khronos-approved (KHR) OpenCL extensions which have 28 | * OpenGL dependencies. 
The application is responsible for #including 29 | * OpenGL or OpenGL ES headers before #including cl_gl.h. 30 | */ 31 | 32 | #ifndef __OPENCL_CL_GL_H 33 | #define __OPENCL_CL_GL_H 34 | 35 | #ifdef __APPLE__ 36 | #include 37 | #include 38 | #else 39 | #include "CL/cl.h" 40 | #endif 41 | 42 | #ifdef __cplusplus 43 | extern "C" { 44 | #endif 45 | 46 | typedef cl_uint cl_gl_object_type; 47 | typedef cl_uint cl_gl_texture_info; 48 | typedef cl_uint cl_gl_platform_info; 49 | typedef struct __GLsync *cl_GLsync; 50 | 51 | /* cl_gl_object_type */ 52 | #define CL_GL_OBJECT_BUFFER 0x2000 53 | #define CL_GL_OBJECT_TEXTURE2D 0x2001 54 | #define CL_GL_OBJECT_TEXTURE3D 0x2002 55 | #define CL_GL_OBJECT_RENDERBUFFER 0x2003 56 | 57 | /* cl_gl_texture_info */ 58 | #define CL_GL_TEXTURE_TARGET 0x2004 59 | #define CL_GL_MIPMAP_LEVEL 0x2005 60 | 61 | extern CL_API_ENTRY cl_mem CL_API_CALL 62 | clCreateFromGLBuffer(cl_context /* context */, 63 | cl_mem_flags /* flags */, 64 | cl_GLuint /* bufobj */, 65 | int * /* errcode_ret */) CL_API_SUFFIX__VERSION_1_0; 66 | 67 | extern CL_API_ENTRY cl_mem CL_API_CALL 68 | clCreateFromGLTexture2D(cl_context /* context */, 69 | cl_mem_flags /* flags */, 70 | cl_GLenum /* target */, 71 | cl_GLint /* miplevel */, 72 | cl_GLuint /* texture */, 73 | cl_int * /* errcode_ret */) CL_API_SUFFIX__VERSION_1_0; 74 | 75 | extern CL_API_ENTRY cl_mem CL_API_CALL 76 | clCreateFromGLTexture3D(cl_context /* context */, 77 | cl_mem_flags /* flags */, 78 | cl_GLenum /* target */, 79 | cl_GLint /* miplevel */, 80 | cl_GLuint /* texture */, 81 | cl_int * /* errcode_ret */) CL_API_SUFFIX__VERSION_1_0; 82 | 83 | extern CL_API_ENTRY cl_mem CL_API_CALL 84 | clCreateFromGLRenderbuffer(cl_context /* context */, 85 | cl_mem_flags /* flags */, 86 | cl_GLuint /* renderbuffer */, 87 | cl_int * /* errcode_ret */) CL_API_SUFFIX__VERSION_1_0; 88 | 89 | extern CL_API_ENTRY cl_int CL_API_CALL 90 | clGetGLObjectInfo(cl_mem /* memobj */, 91 | cl_gl_object_type * /* gl_object_type */, 92 | 
cl_GLuint * /* gl_object_name */) CL_API_SUFFIX__VERSION_1_0; 93 | 94 | extern CL_API_ENTRY cl_int CL_API_CALL 95 | clGetGLTextureInfo(cl_mem /* memobj */, 96 | cl_gl_texture_info /* param_name */, 97 | size_t /* param_value_size */, 98 | void * /* param_value */, 99 | size_t * /* param_value_size_ret */) CL_API_SUFFIX__VERSION_1_0; 100 | 101 | extern CL_API_ENTRY cl_int CL_API_CALL 102 | clEnqueueAcquireGLObjects(cl_command_queue /* command_queue */, 103 | cl_uint /* num_objects */, 104 | const cl_mem * /* mem_objects */, 105 | cl_uint /* num_events_in_wait_list */, 106 | const cl_event * /* event_wait_list */, 107 | cl_event * /* event */) CL_API_SUFFIX__VERSION_1_0; 108 | 109 | extern CL_API_ENTRY cl_int CL_API_CALL 110 | clEnqueueReleaseGLObjects(cl_command_queue /* command_queue */, 111 | cl_uint /* num_objects */, 112 | const cl_mem * /* mem_objects */, 113 | cl_uint /* num_events_in_wait_list */, 114 | const cl_event * /* event_wait_list */, 115 | cl_event * /* event */) CL_API_SUFFIX__VERSION_1_0; 116 | 117 | /* cl_khr_gl_sharing extension */ 118 | 119 | #define cl_khr_gl_sharing 1 120 | 121 | typedef cl_uint cl_gl_context_info; 122 | 123 | /* Additional Error Codes */ 124 | #define CL_INVALID_GL_SHAREGROUP_REFERENCE_KHR -1000 125 | 126 | /* cl_gl_context_info */ 127 | #define CL_CURRENT_DEVICE_FOR_GL_CONTEXT_KHR 0x2006 128 | #define CL_DEVICES_FOR_GL_CONTEXT_KHR 0x2007 129 | 130 | /* Additional cl_context_properties */ 131 | #define CL_GL_CONTEXT_KHR 0x2008 132 | #define CL_EGL_DISPLAY_KHR 0x2009 133 | #define CL_GLX_DISPLAY_KHR 0x200A 134 | #define CL_WGL_HDC_KHR 0x200B 135 | #define CL_CGL_SHAREGROUP_KHR 0x200C 136 | 137 | extern CL_API_ENTRY cl_int CL_API_CALL 138 | clGetGLContextInfoKHR(const cl_context_properties * /* properties */, 139 | cl_gl_context_info /* param_name */, 140 | size_t /* param_value_size */, 141 | void * /* param_value */, 142 | size_t * /* param_value_size_ret */) CL_API_SUFFIX__VERSION_1_0; 143 | 144 | typedef CL_API_ENTRY cl_int 
(CL_API_CALL *clGetGLContextInfoKHR_fn)( 145 | const cl_context_properties * properties, 146 | cl_gl_context_info param_name, 147 | size_t param_value_size, 148 | void * param_value, 149 | size_t * param_value_size_ret); 150 | 151 | #ifdef __cplusplus 152 | } 153 | #endif 154 | 155 | #endif /* __OPENCL_CL_GL_H */ 156 | -------------------------------------------------------------------------------- /test/specs/BackpropagationTest.cpp: -------------------------------------------------------------------------------- 1 | #include "TestSpecsDeclarations.hpp" 2 | 3 | #include "../../src/DataPipeline.hpp" 4 | #include "../../src/LayerData.hpp" 5 | 6 | using namespace cnn_sr; 7 | 8 | /// 9 | /// NOTE: generating expected output is just checking if all inputs, deltas 10 | /// are read correctly. Change following line: 11 | /// 'scratch_w[idx] = delta * layer_input[prev_layer_idx + k];' 12 | /// to: 13 | /// 'scratch_w[idx] = layer_input[prev_layer_idx + k];' 14 | /// OR 15 | /// 'scratch_w[idx] = delta;' 16 | /// Also just use BackpropagationTest_script.py to calc the values. 
17 | /// 18 | /// NOTE: data set 1 checks if kernel works, data set 2 checks if it does not 19 | /// crash when used with big number of data 20 | /// 21 | 22 | namespace test { 23 | namespace specs { 24 | 25 | /// 26 | /// PIMPL 27 | /// 28 | struct BackpropagationTestImpl { 29 | // INPUT_SIZE = input_dim*n(l-1) 30 | #define INPUT_SIZE 50 31 | float input[INPUT_SIZE] = {-0.083, -0.064, // 32 | 0.075, -0.055, // 33 | -0.058, -0.138, // 34 | -0.068, -0.144, // 35 | -0.013, 0.176, // 36 | 37 | 0.169, 0.049, // 38 | 0.181, -0.051, // 39 | 0.136, -0.062, // 40 | -0.165, -0.176, // 41 | 0.159, -0.060, // 42 | 43 | -0.112, 0.228, // 44 | 0.003, -0.138, // 45 | -0.123, -0.027, // 46 | -0.102, -0.061, // 47 | 0.242, -0.069, // 48 | 49 | 0.406, 0.419, // 50 | -0.442, 0.685, // 51 | -0.627, -0.489, // 52 | 0.376, 0.563, // 53 | 0.680, -0.371, // 54 | 55 | 0.121, -0.075, // 56 | -0.103, 0.031, // 57 | 0.106, 0.033, // 58 | -0.036, -0.052, // 59 | 0.052, -0.035}; // 60 | 61 | // DELTAS_SIZE = output_dim * n(l) 62 | #define DELTAS_SIZE 27 63 | float deltas[DELTAS_SIZE] = {0.122f, 0.083f, 0.064f, // row 1, col 1 64 | 0.057f, 0.075f, 0.055f, // row 1, col 2 65 | 0.025f, 0.058f, 0.138f, // row 1, col 3 66 | 67 | 0.170f, 0.068f, 0.144f, // row 2, col 1 68 | 0.121f, 0.013f, 0.176f, // row 2, col 2 69 | 0.065f, 0.169f, 0.049f, // row 2, col 3 70 | 71 | 0.003f, 0.181f, 0.051f, // row 3, col 1 72 | 0.021f, 0.136f, 0.062f, // row 3, col 2 73 | 0.066f, 0.165f, 0.176f}; // row 3, col 3 74 | #define WEIGHTS_SIZE 54 75 | const float grad_weights_init_val = 1.5f; 76 | /* clang-format off */ 77 | const std::vector expected_weights = { 78 | 1.5438, 1.4920, 1.5265, 1.4797, 1.4928, 1.4672, 79 | 1.5313, 1.4511, 1.5087, 1.4492, 1.4040, 1.4227, 80 | 1.5157, 1.5271, 1.5191, 1.4377, 1.4467, 1.4474, 81 | 1.4582, 1.4170, 1.4009, 1.5052, 1.5941, 1.4768, 82 | 1.5150, 1.3938, 1.4748, 1.4841, 1.6112, 1.5451, 83 | 1.5445, 1.5892, 1.6088, 1.4503, 1.3907, 1.4047, 84 | 1.4634, 1.4251, 1.4444, 1.6442, 1.4578, 
1.6641, 85 | 1.3638, 1.5003, 1.3188, 1.5713, 1.6199, 1.5159, 86 | 1.4713, 1.5962, 1.5414, 1.4491, 1.3937, 1.4882 87 | }; 88 | /* clang-format on */ 89 | 90 | const std::vector expected_bias = {0.650f, 0.948f, 0.915f}; 91 | }; 92 | 93 | /// 94 | /// BackpropagationTest 95 | /// 96 | 97 | TEST_SPEC_PIMPL(BackpropagationTest) 98 | 99 | void BackpropagationTest::init() {} 100 | 101 | std::string BackpropagationTest::name(size_t data_set_id) { 102 | return data_set_id == 0 ? // 103 | "Backpropagation test - value correctness" // 104 | : "Backpropagation test - big data"; 105 | } 106 | 107 | size_t BackpropagationTest::data_set_count() { return 2; } 108 | 109 | void execute(DataPipeline *pipeline, LayerData &data, // 110 | cnn_sr::LayerAllocationPool &gpu_buf, // 111 | float *deltas, float *input, float w_init, // 112 | size_t input_w, size_t input_h) { 113 | auto context = pipeline->context(); 114 | size_t output_dim[2]; 115 | data.get_output_dimensions(output_dim, input_w, input_h); 116 | size_t deltas_size = 117 | output_dim[0] * output_dim[1] * data.current_filter_count, 118 | input_size = data.input_size(input_w, input_h); 119 | 120 | // gpu memory alloc 121 | /* clang-format off */ 122 | auto gpu_deltas = context->allocate(CL_MEM_READ_ONLY, sizeof(cl_float) * deltas_size); 123 | auto gpu_input = context->allocate(CL_MEM_READ_ONLY, sizeof(cl_float) * input_size); 124 | gpu_buf.accumulating_grad_w = context->allocate(CL_MEM_READ_ONLY, sizeof(cl_float) * data.weight_size()); 125 | /* clang-format on */ 126 | context->write_buffer(gpu_deltas, (void *)deltas, true); 127 | context->write_buffer(gpu_input, (void *)input, true); 128 | context->fill_float(gpu_buf.accumulating_grad_w, w_init, true); 129 | 130 | // run 131 | pipeline->backpropagate(data, gpu_input, gpu_deltas, gpu_buf, // 132 | output_dim[0], output_dim[1]); 133 | } 134 | 135 | bool BackpropagationTest::operator()(size_t data_set_id, 136 | cnn_sr::DataPipeline *const pipeline) { 137 | 
assert_not_null(pipeline); 138 | auto context = pipeline->context(); 139 | cnn_sr::LayerAllocationPool gpu_buf; 140 | 141 | if (data_set_id == 0) { 142 | // data for layer, needs filled up weights&bias to pass validation 143 | LayerData data(2, 3, 3); // n_prev_filter_cnt/FILTER_CNT/f_spatial_size 144 | float w[WEIGHTS_SIZE], bias[10]; 145 | data.set_bias(bias); 146 | data.set_weights(w); 147 | execute(pipeline, data, gpu_buf, _impl->deltas, _impl->input, 148 | _impl->grad_weights_init_val, 5, 5); 149 | // check results 150 | std::cout << "checking weights" << std::endl; 151 | assert_equals(pipeline, _impl->expected_weights, 152 | gpu_buf.accumulating_grad_w); 153 | std::cout << "checking bias" << std::endl; 154 | assert_equals(pipeline, _impl->expected_bias, gpu_buf.accumulating_grad_b); 155 | } else { 156 | LayerData data(32, 16, 3); 157 | float w[4608], bias[16]; 158 | data.set_bias(bias); 159 | data.set_weights(w); 160 | // (we dont care about values and sizes must only be at least enough) 161 | const size_t input_w = 1024, input_h = 1024; 162 | std::vector deltas(input_w * input_h * data.current_filter_count), 163 | input(input_w * input_h * data.n_prev_filter_cnt); 164 | execute(pipeline, data, gpu_buf, // 165 | &deltas[0], &input[0], 0.0f, input_w, input_h); 166 | context->block(); 167 | // didn't crash? then it's ok 168 | } 169 | 170 | return true; 171 | } 172 | 173 | // 174 | // 175 | } // namespace specs 176 | } // namespace test 177 | -------------------------------------------------------------------------------- /test/specs/LayerDeltasTest.cpp: -------------------------------------------------------------------------------- 1 | #include "TestSpecsDeclarations.hpp" 2 | 3 | #include "../../src/DataPipeline.hpp" 4 | #include "../../src/LayerData.hpp" 5 | 6 | using namespace cnn_sr; 7 | 8 | /// 9 | /// NOTE: generating expected output is just checking if all weights, deltas 10 | /// and outputs are read correctly. 
To this change following line: 11 | /// 'delta_for_filter[n] += delta * w * activation_func_derivative;' 12 | /// to: 13 | /// 'delta_for_filter[n] += ONLY_ONE_OF_MULTIPLIERS' 14 | /// 15 | /// compare results with: 16 | /// * weight: should be [-0.077999..., -0.584] (sum columns 1-3 for first 17 | /// value, columns 4-6 for second) 18 | /// * activation_func_derivative: should be expected_derivative 19 | /// (code-generated) 20 | /// * delta: run LayerDeltasTest_script.py 21 | /// if all of multipliers have correct value their produt will be ok. 22 | /// 23 | 24 | namespace test { 25 | namespace specs { 26 | 27 | /// 28 | /// PIMPL 29 | /// 30 | struct LayerDeltasTestImpl { 31 | // INPUT_SIZE = input_dim*n(l-1) 32 | #define INPUT_SIZE 50 33 | float input_x[INPUT_SIZE] = {-0.083, -0.064, // 34 | 0.075, -0.055, // 35 | -0.058, -0.138, // 36 | -0.068, -0.144, // 37 | -0.013, 0.176, // 38 | 39 | 0.169, 0.049, // 40 | 0.181, -0.051, // 41 | 0.136, -0.062, // 42 | -0.165, -0.176, // 43 | 0.159, -0.060, // 44 | 45 | -0.112, 0.228, // 46 | 0.003, -0.138, // 47 | -0.123, -0.027, // 48 | -0.102, -0.061, // 49 | 0.242, -0.069, // 50 | 51 | 0.406, 0.419, // 52 | -0.442, 0.685, // 53 | -0.627, -0.489, // 54 | 0.376, 0.563, // 55 | 0.680, -0.371, // 56 | 57 | 0.121, -0.075, // 58 | -0.103, 0.031, // 59 | 0.106, 0.033, // 60 | -0.036, -0.052, // 61 | 0.052, -0.035}; // 62 | 63 | // weights 64 | // WEIGTHS_SIZE = f(l)*f(l)*n(l-1)*n(l) 65 | // n(l)=3 | n(l-1)=2 66 | #define WEIGHTS_SIZE 54 67 | /* clang-format off */ 68 | float weights[WEIGHTS_SIZE] = { 69 | -0.369, 0.025, 0.213, 0.058, 0.410, -0.068, 70 | 0.236, 0.071, -0.429, -0.104, 0.161, 0.087, 71 | 0.361, -0.055, 0.273, 0.071, 0.431, -0.095, 72 | 73 | 0.229, 0.378, -0.178, 0.343, 0.114, -0.409, 74 | -0.220, -0.364, 0.711, 0.281, 0.851, -1.001, 75 | -0.411, 0.661, -0.831, -0.091, 0.281, -0.341, 76 | 77 | -0.931, 0.511, 0.141, -0.591, 0.491, -0.921, 78 | 0.291, -0.211, 0.151, 0.491, -0.431, -0.321, 79 | -0.631, 0.301, -0.001, 
-0.761, -0.021, 0.501}; 80 | /* clang-format on */ 81 | 82 | // DELTAS_SIZE = output_dim * n(l) 83 | #define DELTAS_SIZE 27 84 | float deltas[DELTAS_SIZE] = {0.122, 0.083, 0.064, // row 1, col 1 85 | 0.057, 0.075, 0.055, // row 1, col 2 86 | 0.025, 0.058, 0.138, // row 1, col 3 87 | 0.170, 0.068, 0.144, // row 2, col 1 88 | 0.121, 0.013, 0.176, // row 2, col 2 89 | 0.065, 0.169, 0.049, // row 2, col 3 90 | 0.003, 0.181, 0.051, // row 3, col 1 91 | 0.021, 0.136, 0.062, // row 3, col 2 92 | 0.066, 0.165, 0.176}; // row 3, col 3 93 | 94 | /* clang-format off */ 95 | std::vector expected_output = { 96 | 0, 0, 97 | -0.000213999, 0, 98 | 0, 0, 99 | 0, 0, 100 | 0, 0.013663, 101 | 102 | 0.017562, 0.05308, 103 | -0.00359898, 0, 104 | -0.004519, 0, 105 | 0, 0, 106 | -0.059068, 0, 107 | 108 | 0, -0.012211, 109 | 0.06273, 0, 110 | 0, 0, 111 | 0, 0, 112 | 0.108619, 0, 113 | 114 | -0.043191, -0.198902, 115 | 0, -0.118114, 116 | 0, 0, 117 | -0.00165999, -0.062883, 118 | -0.054512, 0, 119 | 120 | 0.096889, 0, 121 | 0, -0.095646, 122 | 0.086999, -0.168827, 123 | 0, 0, 124 | 0.007843, 0 125 | }; 126 | /* clang-format on */ 127 | }; 128 | 129 | /// 130 | /// LayerDeltasTest 131 | /// 132 | 133 | TEST_SPEC_PIMPL(LayerDeltasTest) 134 | 135 | void LayerDeltasTest::init() {} 136 | 137 | std::string LayerDeltasTest::name(size_t) { return "Layer deltas test"; } 138 | 139 | size_t LayerDeltasTest::data_set_count() { return 1; } 140 | 141 | bool LayerDeltasTest::operator()(size_t, cnn_sr::DataPipeline *const pipeline) { 142 | assert_not_null(pipeline); 143 | auto context = pipeline->context(); 144 | 145 | const size_t IGNORED = 10; 146 | 147 | // data for layer, needs filled up weights&bias to pass validation 148 | LayerData prev_data(IGNORED, 2, IGNORED); // n(l-2), n(l-1), f(l-1) 149 | LayerData curr_data(2, 3, 3); // n(l-1), n(l), f(l) 150 | float bias[3] = {0.0f, 0.0f, 0.0f}; 151 | curr_data.set_bias(bias); 152 | curr_data.set_weights(_impl->weights); 153 | 154 | // previous layer 
results - used to take care of sigmoid func. 155 | size_t output_dim[2] = {3, 3}; 156 | 157 | // all variations with activation function 158 | float output[INPUT_SIZE]; 159 | std::vector expected_derivative(INPUT_SIZE); 160 | size_t derivative_repeat_cnt = curr_data.f_spatial_size * 161 | curr_data.f_spatial_size * 162 | curr_data.current_filter_count; 163 | for (size_t i = 0; i < INPUT_SIZE; i++) { 164 | float x = _impl->input_x[i]; 165 | output[i] = activation_function(x); 166 | expected_derivative[i] = 167 | activation_function_derivative(x) * derivative_repeat_cnt; 168 | } 169 | 170 | // gpu memory alloc 171 | cnn_sr::LayerAllocationPool curr_gpu_buf; 172 | opencl::MemoryHandle prev_deltas = gpu_nullptr; 173 | /* clang-format off */ 174 | auto curr_deltas = context->allocate(CL_MEM_READ_ONLY, sizeof(cl_float) * DELTAS_SIZE); 175 | auto prev_output = context->allocate(CL_MEM_READ_ONLY, sizeof(cl_float) * INPUT_SIZE); 176 | /* clang-format on */ 177 | context->write_buffer(curr_deltas, (void *)_impl->deltas, true); 178 | context->write_buffer(prev_output, (void *)output, true); 179 | 180 | // create kernel & run 181 | auto kernel = pipeline->create_deltas_kernel(prev_data); 182 | pipeline->calculate_deltas(*kernel, // 183 | prev_data, curr_data, // 184 | curr_gpu_buf, // 185 | prev_deltas, curr_deltas, // 186 | output_dim[0], output_dim[1], prev_output); 187 | assert_equals(pipeline, _impl->expected_output, prev_deltas); 188 | 189 | // sub test with expected_derivative 190 | // assert_equals(pipeline, expected_derivative, prev_deltas); 191 | 192 | return true; 193 | } 194 | 195 | // 196 | // 197 | } // namespace specs 198 | } // namespace test 199 | -------------------------------------------------------------------------------- /src/opencl/Kernel.cpp: -------------------------------------------------------------------------------- 1 | #include "Kernel.hpp" 2 | #include "Context.hpp" 3 | 4 | #include 5 | #include 6 | 7 | namespace opencl { 8 | 9 | void 
Kernel::init(Context *ctx, cl_kernel k, cl_program p, const char *file, 10 | const char *args) { 11 | if (initialized) cleanup(); 12 | this->context = ctx; 13 | this->kernel_id = k; 14 | this->program_id = p; 15 | arg_stack_size = 0; 16 | assigned_local_memory = 0; 17 | initialized = true; 18 | // read parameters 19 | cl_int ciErr1; 20 | ciErr1 = clGetKernelWorkGroupInfo(k, context->device().device_id, 21 | CL_KERNEL_WORK_GROUP_SIZE, 1024, 22 | &max_work_group_size, nullptr); 23 | ciErr1 = clGetKernelWorkGroupInfo(k, context->device().device_id, 24 | CL_KERNEL_PRIVATE_MEM_SIZE, 1024, 25 | &private_mem_size, nullptr); 26 | ciErr1 = 27 | clGetKernelWorkGroupInfo(k, context->device().device_id, 28 | CL_KERNEL_PREFERRED_WORK_GROUP_SIZE_MULTIPLE, 29 | 1024, &pref_work_group_multiple, nullptr); 30 | context->check_error(ciErr1, "Could not get kernel informations"); 31 | 32 | file = file == nullptr ? "??" : file; 33 | args = args == nullptr ? "--" : args; 34 | if (file != nullptr && args != nullptr) { 35 | snprintf(this->human_identifier, MAX_KERNEL_IDENTIFIER_SIZE, "'%s'[%s]", 36 | file, args); 37 | } 38 | } 39 | 40 | void Kernel::cleanup() { 41 | if (!initialized) return; 42 | initialized = false; 43 | 44 | if (kernel_id) clReleaseKernel(kernel_id); 45 | if (program_id) clReleaseProgram(program_id); 46 | } 47 | 48 | size_t Kernel::current_local_memory() { 49 | cl_ulong loc_mem_size; 50 | cl_int ciErr1 = clGetKernelWorkGroupInfo( 51 | kernel_id, context->device().device_id, CL_KERNEL_LOCAL_MEM_SIZE, 1024, 52 | &loc_mem_size, nullptr); 53 | context->check_error(ciErr1, "Could not get kernel's local memory usage"); 54 | return loc_mem_size > assigned_local_memory ? 
loc_mem_size 55 | : assigned_local_memory; 56 | } 57 | 58 | void Kernel::push_arg(size_t arg_size, const void *arg_value) { 59 | cl_int ciErr1 = 60 | clSetKernelArg(kernel_id, arg_stack_size, arg_size, arg_value); 61 | context->check_error(ciErr1, "Could not push kernel argument"); 62 | ++arg_stack_size; 63 | // local memory 64 | if (!arg_value) assigned_local_memory += arg_size; 65 | } 66 | 67 | void Kernel::push_arg(MemoryHandle gpu_buf) { 68 | auto mem = context->raw_memory(gpu_buf); 69 | this->push_arg(sizeof(cl_mem), (void *)&mem->handle); 70 | } 71 | 72 | cl_event Kernel::execute(cl_uint work_dim, // 73 | const size_t *global_work_size, // 74 | const size_t *local_work_size, // 75 | cl_event *events_to_wait_for, 76 | int events_to_wait_for_count) { 77 | context->check_error(context->is_initialized(), 78 | "Context was not initialized"); 79 | check_work_parameters(work_dim, global_work_size, local_work_size); 80 | 81 | // check used amount of local memory 82 | char msg_buffer[192]; 83 | auto used_loc_mem = current_local_memory(); 84 | if (used_loc_mem > context->device().local_mem_size) { 85 | snprintf(msg_buffer, sizeof(msg_buffer), 86 | "You are using too much local memory(%d), only %llu is available", 87 | used_loc_mem, context->device().local_mem_size); 88 | context->check_error(false, msg_buffer); 89 | } 90 | 91 | // correct event parameters 92 | if (!events_to_wait_for) events_to_wait_for_count = 0; 93 | if (events_to_wait_for_count <= 0) events_to_wait_for = nullptr; 94 | 95 | arg_stack_size = 0; // prepare for next invoke 96 | assigned_local_memory = 0; 97 | cl_command_queue *cmd_queue = context->command_queue(); 98 | 99 | cl_event finish_token; 100 | cl_int ciErr1 = clEnqueueNDRangeKernel( 101 | *cmd_queue, kernel_id, // what and where to execute 102 | work_dim, nullptr, // must be NULL 103 | global_work_size, local_work_size, // 104 | events_to_wait_for_count, events_to_wait_for, // sync events 105 | &finish_token); 106 | context->check_error(ciErr1, 
"Error in clEnqueueNDRangeKernel"); 107 | 108 | if (context->is_running_profile_mode()) { 109 | clWaitForEvents(1, &finish_token); 110 | cl_ulong start = 0, end = 0; 111 | clGetEventProfilingInfo(finish_token, CL_PROFILING_COMMAND_START, 112 | sizeof(cl_ulong), &start, NULL); 113 | clGetEventProfilingInfo(finish_token, CL_PROFILING_COMMAND_END, 114 | sizeof(cl_ulong), &end, NULL); 115 | execution_time_sum += (end - start); 116 | } 117 | 118 | return finish_token; 119 | } 120 | 121 | void Kernel::check_work_parameters(cl_uint work_dim, // 122 | const size_t *global_work_size, 123 | const size_t *local_work_size) { 124 | // std::cout << std::endl 125 | // << "Work size: " << ((unsigned int)work_dim) 126 | // << "/" << (*global_work_size) 127 | // << "/" << (*local_work_size) << std::endl; 128 | 129 | char msg_buffer[192]; 130 | if (work_dim < 1 || work_dim > 3) { 131 | snprintf(msg_buffer, sizeof(msg_buffer), 132 | "Work parameters: 1 <= (work_dim=%d) <= 3", work_dim); 133 | context->check_error(false, msg_buffer); 134 | } 135 | 136 | auto device = context->device(); 137 | long long device_work_id_range = ((long long)1) << device.address_bits; 138 | long long real_global_work_size = 1, 139 | real_local_work_size = 1; // # of work-items in work-group 140 | bool local_dims_lte_device_max = true, 141 | global_dims_divisible_by_local_dims = true; 142 | 143 | for (size_t i = 0; i < work_dim; i++) { 144 | real_global_work_size *= global_work_size[i]; 145 | if (local_work_size) { 146 | real_local_work_size *= local_work_size[i]; 147 | local_dims_lte_device_max &= 148 | local_work_size[i] <= device.work_items_for_dims[i]; 149 | global_dims_divisible_by_local_dims &= 150 | global_work_size[i] % local_work_size[i] == 0; 151 | } 152 | } 153 | 154 | #define WORK_DIMENSIONS_STR "global:[%d,%d,%d], local:[%d,%d,%d]" 155 | #define WORK_DIMENSIONS_VAL global_work_size[0], \ 156 | (work_dim > 1 ? global_work_size[1] : 1), \ 157 | (work_dim == 3 ? 
global_work_size[2] : 1),\ 158 | local_work_size[0], \ 159 | (work_dim > 1 ? local_work_size[1] : 1), \ 160 | (work_dim == 3 ? local_work_size[2] : 1) 161 | 162 | bool is_ok = true; 163 | if (!local_dims_lte_device_max) { 164 | is_ok = false; 165 | snprintf(msg_buffer, sizeof(msg_buffer), 166 | "Work parameters: one of local dimensions are bigger " 167 | "then device allows. " WORK_DIMENSIONS_STR, 168 | WORK_DIMENSIONS_VAL); 169 | } else if (!global_dims_divisible_by_local_dims) { 170 | is_ok = false; 171 | snprintf(msg_buffer, sizeof(msg_buffer), 172 | "Work parameters: For each dimension " 173 | "global_work_size should be multiply of " 174 | "local_work_size. " WORK_DIMENSIONS_STR, 175 | WORK_DIMENSIONS_VAL); 176 | } else if (real_global_work_size > device_work_id_range) { 177 | is_ok = false; 178 | snprintf(msg_buffer, sizeof(msg_buffer), 179 | "Work parameters: global_work_size(%llu) is bigger then device " 180 | "address_bits(%d) can represent. " WORK_DIMENSIONS_STR, 181 | real_global_work_size, device.address_bits, WORK_DIMENSIONS_VAL); 182 | } else if (real_local_work_size > device.max_work_group_size || 183 | real_local_work_size > this->max_work_group_size) { 184 | is_ok = false; 185 | snprintf(msg_buffer, sizeof(msg_buffer), 186 | "Work parameters: local_work_size(%llu) is bigger then device(%d) " 187 | "or kernel(%d) allows. 
" WORK_DIMENSIONS_STR, 188 | real_local_work_size, device.max_work_group_size, 189 | this->max_work_group_size, WORK_DIMENSIONS_VAL); 190 | } 191 | 192 | context->check_error(is_ok, msg_buffer); 193 | } 194 | 195 | std::ostream &operator<<(std::ostream &os, opencl::Kernel &k) { 196 | os << "program id: " << k.program_id // 197 | << ", kernel id: " << k.kernel_id // 198 | << ", max_work_group_size: " << k.max_work_group_size // 199 | << ", private_mem_size: " << k.private_mem_size // 200 | << ", pref_work_group_multiple: " << k.pref_work_group_multiple // 201 | << ", allocated local memory: " << (k.current_local_memory() / 1024) 202 | << "KB"; // 203 | return os; 204 | } 205 | } 206 | -------------------------------------------------------------------------------- /src/DataPipeline.hpp: -------------------------------------------------------------------------------- 1 | #ifndef DATA_PIPELINE_H 2 | #define DATA_PIPELINE_H 3 | 4 | #include "pch.hpp" 5 | 6 | // TODO move this to opencl::Context 7 | const opencl::MemoryHandle gpu_nullptr = 1 << 30; 8 | 9 | namespace cnn_sr { 10 | 11 | struct LayerAllocationPool { 12 | /** Forward: weights, size: f*f*n*k */ 13 | opencl::MemoryHandle weights = gpu_nullptr; 14 | /** Forward: bias, size: n */ 15 | opencl::MemoryHandle bias = gpu_nullptr; 16 | 17 | /** Backpropagation: Accumulate gradients through out batch execution, 18 | size: f*f*n*k */ 19 | opencl::MemoryHandle accumulating_grad_w = gpu_nullptr; 20 | /** Backpropagation: Accumulate gradients through out batch execution, 21 | size: f*f*n*k */ 22 | opencl::MemoryHandle accumulating_grad_b = gpu_nullptr; 23 | /** Backpropagation-momentum: Deltas that we had after previous batch, 24 | size: f*f*n*k */ 25 | opencl::MemoryHandle previous_batch_delta_w = gpu_nullptr; 26 | /** Backpropagation-momentum: Deltas that we had after previous batch, 27 | size: f*f*n*k */ 28 | opencl::MemoryHandle previous_batch_delta_b = gpu_nullptr; 29 | }; 30 | 31 | /** 32 | * Class used to execute 
/**
 * Class used to execute various pipeline methods, e.g.:
 *  - luma extraction
 *  - mean squared error
 *  - all CNN methods
 *
 * This is quite low level - mostly thin wrappers with validation.
 *
 * Most methods return a cl_event and take an optional `cl_event* ev`.
 * NOTE(review): `ev` is presumably an event to wait on before the kernel is
 * executed - confirm against the definitions, they are not part of this
 * header.
 */
class DataPipeline {
 public:
  // Flags accepted by init() / load_kernels() / check_initialized().
  // Only declared here; their values are defined elsewhere.
  static int LOAD_KERNEL_LUMA;
  static int LOAD_KERNEL_LAYERS;
  static int LOAD_KERNEL_BACKPROPAGATE;
  static int LOAD_KERNEL_MISC;
  static int LOAD_KERNEL_NONE;
  static int LOAD_KERNEL_ALL;

  DataPipeline(opencl::Context*);
  virtual ~DataPipeline() {}
  /** @param load_flags LOAD_KERNEL_* value(s); defaults to LOAD_KERNEL_ALL */
  virtual void init(int load_flags = DataPipeline::LOAD_KERNEL_ALL);
  opencl::Context* context();

  /**
   * Take image, write it to GPU (gpu_buf_raw_img), and write luma channel
   * separately to gpu_buf_luma
   *
   * used buffers:
   *  in  - NONE
   *  out - param->gpu_buf_raw_img(with raw image data, all 3 channels)
   *        param->gpu_buf_luma(with luma channel of provided image)
   *
   * NOTE(review): the parameters are unnamed in this declaration. The two
   * MemoryHandle& arguments presumably are gpu_buf_raw_img and gpu_buf_luma
   * (in that order) and the bool presumably selects between the normalized
   * and the raw luma kernel (_luma_kernel_norm / _luma_kernel_raw) - confirm
   * against the definition.
   */
  cl_event extract_luma(opencl::utils::ImageData&, opencl::MemoryHandle&,
                        opencl::MemoryHandle&, bool, cl_event* ev = nullptr);

  /** Swap luma in image to the specified set of values */
  cl_event swap_luma(opencl::utils::ImageData&,
                     opencl::MemoryHandle& gpu_buf_org_img,
                     opencl::MemoryHandle gpu_buf_new_luma,
                     opencl::MemoryHandle& target,  //
                     size_t new_luma_w, size_t new_luma_h,
                     cl_event* ev = nullptr);

  /**
   * Forward propagation for a single layer.
   *
   * used buffers:
   *  in  - layer.weights, layer.bias, this layer's input (that means previous
   *        layer's output)
   *  out - layer.output
   */
  cl_event execute_layer(opencl::Kernel&, const LayerData&,
                         cnn_sr::LayerAllocationPool&, opencl::MemoryHandle&,
                         size_t, size_t, size_t id, opencl::MemoryHandle&,
                         cl_event* ev = nullptr);

  /**
   * This function blocks.
   *
   * used buffers:
   *  in  - original image luma, layer_3.output
   *  out - this->_tmp_gpu_float
   *
   * @param total_padding difference in size between ground_truth image
   *                      and result. Should be equal to f1+f2+f3-3
   */
  cl_event squared_error(opencl::MemoryHandle gpu_buf_ground_truth,
                         size_t ground_truth_w, size_t ground_truth_h,
                         size_t id, opencl::MemoryHandle gpu_buf_algo_res,
                         opencl::MemoryHandle tmp_buffer, float& target,
                         size_t total_padding, cl_event* ev = nullptr);

  /**
   * Deltas for the last layer
   *
   * used buffers:
   *  in  - original image luma, layer_3.output
   *  out - param->gpu_buf_target
   */
  cl_event last_layer_delta(opencl::MemoryHandle gpu_buf_ground_truth,
                            size_t ground_truth_w, size_t ground_truth_h,
                            size_t id, opencl::MemoryHandle gpu_buf_algo_res,
                            opencl::MemoryHandle& gpu_buf_target,
                            size_t total_padding, cl_event* ev = nullptr);

  /**
   * Deltas for current layer based on next layer
   *
   * used buffers:
   *  in  - next_layer.deltas, curr_layer.output, next_layer.weights
   *  out - curr_layer.deltas
   *
   * NOTE(review): most parameters are unnamed here; their order relative to
   * the buffers listed above has to be confirmed against the definition.
   */
  cl_event calculate_deltas(opencl::Kernel&,  //
                            const LayerData&, const LayerData&,
                            cnn_sr::LayerAllocationPool&,  //
                            opencl::MemoryHandle, opencl::MemoryHandle,  //
                            size_t, size_t, size_t id,                   //
                            opencl::MemoryHandle,                        //
                            cl_event* ev = nullptr);

  /**
   * Calculate gradients of weights and bias
   *
   * used buffers:
   *  in  - layer.deltas, this layer's input (that means previous layer's
   *        output)
   *  out - layer.grad_w, layer.grad_b
   *
   * NOTE(review): unlike the other methods this one also takes `ev_cnt`,
   * presumably the number of events `ev` points to - confirm.
   */
  cl_event backpropagate(LayerData&, opencl::MemoryHandle layer_input,
                         opencl::MemoryHandle layer_deltas,
                         LayerAllocationPool&,  //
                         size_t layer_out_w, size_t layer_out_h, size_t id,
                         cl_event* ev = nullptr, size_t ev_cnt = 0);

  /**
   * Update weights and biases based on gradients and various factors like
   * batch size, momentum, learning rate. Note that previous_delta_w /
   * previous_delta_b are both READ (to calculate this layer's new
   * weights/biases) and WRITTEN (their values are updated).
   *
   * used buffers:
   *  in     - layer.grad_w, layer.grad_b
   *  out    - layer.weights, layer.bias
   *  in/out - layer.previous_delta_w, layer.previous_delta_b
   */
  cl_event update_parameters(LayerData&, LayerAllocationPool&,
                             size_t batch_size, float momentum, float w_decay,
                             float learning_rate, cl_event* ev = nullptr);

  ///
  /// misc. kernels
  ///

  /** Subtract mean value from all elements of the buffer. The mean parameter
   * will receive the mean value */
  cl_event subtract_mean(opencl::MemoryHandle, float* mean = nullptr,
                         cl_event* ev = nullptr);

  /**
   * Sum all floats in the buffer. You may choose to square the values before
   * adding them up.
   */
  float sum(opencl::MemoryHandle, bool squared = false, cl_event* ev = nullptr);

  /** Subtract provided value from all elements of the buffer */
  cl_event subtract_from_all(opencl::MemoryHandle, float,
                             cl_event* ev = nullptr);

  ///
  /// kernel creation - ones that are not created during standard init
  ///

  /** The bool argument is `skip_relu`: skip relu step, writing raw result */
  opencl::Kernel* create_layer_kernel(const LayerData&, bool);
  opencl::Kernel* create_deltas_kernel(const LayerData&);

  ///
  /// misc
  ///
  void print_buffer(opencl::MemoryHandle, const char* const, size_t);

 protected:
  void check_initialized(int kernel_load_flags);
  virtual void load_kernels(int load_flags);

  /** Either the allocation has the exact size or it is released. Memory is
   * deallocated here, but we cannot allocate it with the proper size, since
   * e.g. allocating an image is different than allocating a normal buffer.
   */
  bool allocation_has_right_size__(opencl::MemoryHandle, size_t,  //
                                   size_t, const char*);

 private:
  void pre_execute_layer_validation(const LayerData&, opencl::MemoryHandle,
                                    size_t, size_t);
  size_t element_count(opencl::MemoryHandle, size_t el_size);

 protected:
  opencl::Context* const _context;
  // No default member initializer here - presumably set by the constructor
  // and/or init(); confirm in the definition.
  bool _initialized;

  /** Single float. Quite useful. */
  opencl::MemoryHandle _tmp_gpu_float = gpu_nullptr;

  // Kernels owned by the pipeline; all start as nullptr.
  opencl::Kernel* _luma_kernel_norm = nullptr;
  opencl::Kernel* _luma_kernel_raw = nullptr;
  opencl::Kernel* _swap_luma_kernel = nullptr;
  opencl::Kernel* _squared_error_kernel = nullptr;
  opencl::Kernel* _sum_kernel = nullptr;
  opencl::Kernel* _sum_squared_kernel = nullptr;
  opencl::Kernel* _subtract_from_all_kernel = nullptr;
  opencl::Kernel* _last_layer_delta_kernel = nullptr;
  opencl::Kernel* _update_parameters_kernel = nullptr;
  opencl::Kernel* _backpropagate_kernel = nullptr;
};
/**
 * Read a whole file into a freshly allocated, NUL-terminated buffer that
 * starts with `cPreamble` followed by the file content.
 *
 * @param cFilename     path of the file to read (opened in binary mode)
 * @param cPreamble     text placed in front of the file content; NULL is
 *                      treated as an empty preamble
 * @param szFinalLength optional out parameter; receives the length of the
 *                      combined string (without the terminating NUL)
 * @return buffer allocated with malloc - the caller owns it and must free()
 *         it - or NULL when the file cannot be opened/measured/read or when
 *         the allocation fails. An empty file is not an error: the result
 *         then contains the preamble only.
 */
char *load_file(const char *cFilename, const char *cPreamble,
                size_t *szFinalLength) {
  if (cFilename == NULL) {
    return NULL;
  }
  if (cPreamble == NULL) {
    cPreamble = "";  // strlen/memcpy below must never see NULL
  }

  FILE *pFileStream = NULL;
#ifdef _MSC_VER  // Visual studio
  if (fopen_s(&pFileStream, cFilename, "rb") != 0) {
    return NULL;
  }
#else  // Linux version
  pFileStream = fopen(cFilename, "rb");
  if (pFileStream == NULL) {
    return NULL;
  }
#endif

  const size_t szPreambleLength = strlen(cPreamble);

  // get the length of the source code; ftell reports failure as -1L, which
  // must not be converted to a (huge) size_t
  long lSourceLength = -1L;
  if (fseek(pFileStream, 0, SEEK_END) == 0) {
    lSourceLength = ftell(pFileStream);
  }
  if (lSourceLength < 0 || fseek(pFileStream, 0, SEEK_SET) != 0) {
    fclose(pFileStream);
    return NULL;
  }
  const size_t szSourceLength = (size_t)lSourceLength;

  // allocate a buffer for preamble + source + terminating NUL
  char *cSourceString = (char *)malloc(szSourceLength + szPreambleLength + 1);
  if (cSourceString == NULL) {
    fclose(pFileStream);
    return NULL;
  }
  memcpy(cSourceString, cPreamble, szPreambleLength);

  // fread with an item size of 0 returns 0, so only read when there is
  // something to read - otherwise an empty file would look like an I/O error
  if (szSourceLength > 0 &&
      fread(cSourceString + szPreambleLength, szSourceLength, 1,
            pFileStream) != 1) {
    fclose(pFileStream);
    free(cSourceString);
    return NULL;
  }

  // close the file and return the total length of the combined
  // (preamble + source) string
  fclose(pFileStream);
  if (szFinalLength != NULL) {
    *szFinalLength = szSourceLength + szPreambleLength;
  }
  cSourceString[szSourceLength + szPreambleLength] = '\0';

  return cSourceString;
}
data(nullptr) {} 80 | 81 | ImageData::ImageData(int w, int h, int bpp, unsigned char *data) 82 | : w(w), h(h), bpp(bpp), data(data), read_from_file(false) {} 83 | 84 | ImageData::~ImageData() { 85 | if (data && read_from_file) stbi_image_free(data); 86 | } 87 | 88 | void load_image(const char *filename, ImageData &data) { 89 | data.data = stbi_load(filename, &data.w, &data.h, &data.bpp, 4); 90 | // TODO CHECK_ALLOCATION(data.data); 91 | } 92 | 93 | int write_image(const char *filename, ImageData &data) { 94 | return stbi_write_png(filename, data.w, data.h, data.bpp, data.data, 0); 95 | } 96 | 97 | void write_image(const char *const file_path, float *source, // 98 | size_t w, size_t h) { 99 | size_t px_cnt = w * h; 100 | // normalize values: 0..1 101 | auto min_max_it = std::minmax_element(source, source + px_cnt); 102 | float min = *min_max_it.first, max = *min_max_it.second, 103 | norm_factor = max - min; 104 | for (size_t i = 0; i < px_cnt; i++) { 105 | source[i] = (source[i] - min) / norm_factor; 106 | } 107 | 108 | std::cout << "writing image(" << w << "x" << h << ") to: '" << file_path 109 | << "'" << std::endl; 110 | std::vector data(px_cnt * 3); 111 | for (size_t row = 0; row < h; row++) { 112 | for (size_t col = 0; col < w; col++) { 113 | size_t idx = row * w + col; 114 | float val = source[idx] * 255; 115 | for (size_t k = 0; k < 3; k++) { 116 | data[idx * 3 + k] = (unsigned char)val; 117 | } 118 | } 119 | } 120 | 121 | ImageData dd(w, h, sizeof(unsigned char) * 3, &data[0]); 122 | opencl::utils::write_image(file_path, dd); 123 | } 124 | 125 | /// 126 | /// misc 127 | /// 128 | 129 | void work_sizes(const opencl::Kernel &kernel, size_t dim, 130 | size_t *global_work_size, size_t *local_work_size, size_t *work, 131 | bool print) { 132 | if (dim == 0 || dim > 3) { 133 | throw std::runtime_error("Work dimesions should be 1,2 or 3"); 134 | } 135 | 136 | auto context = kernel.get_context(); 137 | auto device = context->device(); 138 | auto max_local = 139 | 
std::min(device.max_work_group_size, kernel.get_max_work_group_size()); 140 | auto max_device_local_size = device.work_items_for_dims; 141 | 142 | size_t pow_2[3]; 143 | for (size_t i = 0; i < dim; i++) { 144 | pow_2[i] = cnn_sr::utils::closest_power_of_2(static_cast(work[i])); 145 | } 146 | 147 | // local_work_size 148 | // we are doing round robin (see to_update variable) multiplying each 149 | // dimension by 2 each time. It may not work that good for: 150 | // max_device_local_size = [1024, 1024, 1], since it stops after 3 iterations 151 | // On the other note I've had to look up syntax to do{..}while(...); 152 | size_t tmp[3] = {1, 1, 1}, local_dims_multiplied = 1, to_update = 0; 153 | bool satisfies_conditions; 154 | do { 155 | // copy last correct configuration to local 156 | memcpy(local_work_size, tmp, dim * sizeof(float)); 157 | tmp[to_update] *= 2; 158 | local_dims_multiplied *= 2; 159 | satisfies_conditions = tmp[to_update] <= max_device_local_size[to_update] && 160 | tmp[to_update] <= pow_2[to_update] && 161 | local_dims_multiplied <= max_local; 162 | to_update = (to_update + 1) % dim; 163 | } while (satisfies_conditions); 164 | 165 | // global_work_size 166 | for (size_t i = 0; i < dim; i++) { 167 | global_work_size[i] = 168 | (pow_2[i] == local_work_size[i]) 169 | ? pow_2[i] 170 | : ((work[i] / local_work_size[i]) + 1) * local_work_size[i]; 171 | } 172 | 173 | bool ok = true; 174 | for (size_t i = 0; i < dim; i++) { 175 | ok &= global_work_size[i] >= local_work_size[i]; 176 | ok &= global_work_size[i] >= work[i]; 177 | ok &= local_work_size[i] > 0; 178 | } 179 | 180 | if (!ok) { 181 | char buf[255]; 182 | snprintf(buf, 255, 183 | "Tried to create nonstandard work dimensions: global=[%d,%d,%d], " 184 | "local=[%d,%d,%d]", 185 | global_work_size[0], (dim > 1 ? global_work_size[1] : 1), 186 | (dim == 3 ? global_work_size[2] : 1), // 187 | local_work_size[0], (dim > 1 ? local_work_size[1] : 1), 188 | (dim == 3 ? 
local_work_size[2] : 1)); 189 | throw std::runtime_error(buf); 190 | } 191 | 192 | if (print) { 193 | std::cout << "global work size: [" // 194 | << global_work_size[0] << ", " // 195 | << (dim > 1 ? global_work_size[1] : 1) << ", " // 196 | << (dim == 3 ? global_work_size[2] : 1) << "]" << std::endl; 197 | std::cout << "local work size: [" // 198 | << local_work_size[0] << ", " // 199 | << (dim > 1 ? local_work_size[1] : 1) << ", " // 200 | << (dim == 3 ? local_work_size[2] : 1) << "]" << std::endl; 201 | } 202 | } 203 | 204 | const char *get_opencl_error_str(cl_int errorCode) { 205 | #define DECLARE_ERROR(err) \ 206 | case (err): \ 207 | return #err 208 | 209 | switch (errorCode) { 210 | DECLARE_ERROR(CL_SUCCESS); 211 | DECLARE_ERROR(CL_DEVICE_NOT_FOUND); 212 | DECLARE_ERROR(CL_DEVICE_NOT_AVAILABLE); 213 | DECLARE_ERROR(CL_COMPILER_NOT_AVAILABLE); 214 | DECLARE_ERROR(CL_MEM_OBJECT_ALLOCATION_FAILURE); 215 | case CL_OUT_OF_RESOURCES: 216 | return "CL_OUT_OF_RESOURCES - either running out of memory or possible " 217 | "watchdog exception. 
See f.e " 218 | "https://devtalk.nvidia.com/default/topic/471020/" 219 | "driver-crashs-while-opencl-app-is-running/"; 220 | DECLARE_ERROR(CL_OUT_OF_HOST_MEMORY); 221 | DECLARE_ERROR(CL_PROFILING_INFO_NOT_AVAILABLE); 222 | DECLARE_ERROR(CL_MEM_COPY_OVERLAP); 223 | DECLARE_ERROR(CL_IMAGE_FORMAT_MISMATCH); 224 | DECLARE_ERROR(CL_IMAGE_FORMAT_NOT_SUPPORTED); 225 | DECLARE_ERROR(CL_BUILD_PROGRAM_FAILURE); 226 | DECLARE_ERROR(CL_MAP_FAILURE); 227 | DECLARE_ERROR(CL_MISALIGNED_SUB_BUFFER_OFFSET); 228 | DECLARE_ERROR(CL_EXEC_STATUS_ERROR_FOR_EVENTS_IN_WAIT_LIST); 229 | DECLARE_ERROR(CL_INVALID_VALUE); 230 | DECLARE_ERROR(CL_INVALID_DEVICE_TYPE); 231 | DECLARE_ERROR(CL_INVALID_PLATFORM); 232 | DECLARE_ERROR(CL_INVALID_DEVICE); 233 | DECLARE_ERROR(CL_INVALID_CONTEXT); 234 | DECLARE_ERROR(CL_INVALID_QUEUE_PROPERTIES); 235 | DECLARE_ERROR(CL_INVALID_COMMAND_QUEUE); 236 | DECLARE_ERROR(CL_INVALID_HOST_PTR); 237 | DECLARE_ERROR(CL_INVALID_MEM_OBJECT); 238 | DECLARE_ERROR(CL_INVALID_IMAGE_FORMAT_DESCRIPTOR); 239 | DECLARE_ERROR(CL_INVALID_IMAGE_SIZE); 240 | DECLARE_ERROR(CL_INVALID_SAMPLER); 241 | DECLARE_ERROR(CL_INVALID_BINARY); 242 | DECLARE_ERROR(CL_INVALID_BUILD_OPTIONS); 243 | DECLARE_ERROR(CL_INVALID_PROGRAM); 244 | DECLARE_ERROR(CL_INVALID_PROGRAM_EXECUTABLE); 245 | DECLARE_ERROR(CL_INVALID_KERNEL_NAME); 246 | DECLARE_ERROR(CL_INVALID_KERNEL_DEFINITION); 247 | DECLARE_ERROR(CL_INVALID_KERNEL); 248 | DECLARE_ERROR(CL_INVALID_ARG_INDEX); 249 | DECLARE_ERROR(CL_INVALID_ARG_VALUE); 250 | DECLARE_ERROR(CL_INVALID_ARG_SIZE); 251 | DECLARE_ERROR(CL_INVALID_KERNEL_ARGS); 252 | DECLARE_ERROR(CL_INVALID_WORK_DIMENSION); 253 | DECLARE_ERROR(CL_INVALID_WORK_GROUP_SIZE); 254 | DECLARE_ERROR(CL_INVALID_WORK_ITEM_SIZE); 255 | DECLARE_ERROR(CL_INVALID_GLOBAL_OFFSET); 256 | DECLARE_ERROR(CL_INVALID_EVENT_WAIT_LIST); 257 | DECLARE_ERROR(CL_INVALID_EVENT); 258 | DECLARE_ERROR(CL_INVALID_OPERATION); 259 | DECLARE_ERROR(CL_INVALID_GL_OBJECT); 260 | DECLARE_ERROR(CL_INVALID_BUFFER_SIZE); 261 
| DECLARE_ERROR(CL_INVALID_MIP_LEVEL); 262 | DECLARE_ERROR(CL_INVALID_GLOBAL_WORK_SIZE); 263 | // DECLARE_ERROR(CL_INVALID_GL_SHAREGROUP_REFERENCE_KHR); 264 | // DECLARE_ERROR(CL_PLATFORM_NOT_FOUND_KHR); 265 | // DECLARE_ERROR(CL_INVALID_PROPERTY_EXT); 266 | // DECLARE_ERROR(CL_DEVICE_PARTITION_FAILED_EXT); 267 | // DECLARE_ERROR(CL_INVALID_PARTITION_COUNT_EXT); 268 | DECLARE_ERROR(CL_INVALID_PROPERTY); 269 | default: 270 | return "unknown error code"; 271 | } 272 | #undef DECLARE_ERROR 273 | } 274 | 275 | // 276 | } 277 | } 278 | --------------------------------------------------------------------------------