├── test ├── example.cpp └── CMakeLists.txt ├── .gitignore ├── res ├── kuppi256.pgm ├── kuppi256_25.pgm └── bm3d.cl ├── googletest-download ├── CMakeLists.txt.in └── CMakeLists.txt ├── README.md ├── src ├── PGM.h ├── utils.h ├── utils.cpp ├── haar.h ├── config.h ├── config_modified.h ├── PGM.cpp ├── dct.h └── main.cpp ├── LICENSE.txt └── CMakeLists.txt /test/example.cpp: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | build/ 2 | -------------------------------------------------------------------------------- /res/kuppi256.pgm: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Sampas/bm3dcl/HEAD/res/kuppi256.pgm -------------------------------------------------------------------------------- /res/kuppi256_25.pgm: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Sampas/bm3dcl/HEAD/res/kuppi256_25.pgm -------------------------------------------------------------------------------- /test/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | # Now simply link against gtest or gtest_main as needed. 
Eg 2 | add_executable(example example.cpp) 3 | target_link_libraries(example gtest_main) 4 | add_test(NAME example_test COMMAND example) 5 | -------------------------------------------------------------------------------- /googletest-download/CMakeLists.txt.in: -------------------------------------------------------------------------------- 1 | cmake_minimum_required(VERSION 2.8.2) 2 | 3 | project(googletest-download NONE) 4 | 5 | include(ExternalProject) 6 | ExternalProject_Add(googletest 7 | GIT_REPOSITORY https://github.com/google/googletest.git 8 | GIT_TAG origin/v1.8.x 9 | SOURCE_DIR "${CMAKE_CURRENT_BINARY_DIR}/googletest-src" 10 | BINARY_DIR "${CMAKE_CURRENT_BINARY_DIR}/googletest-build" 11 | CONFIGURE_COMMAND "" 12 | BUILD_COMMAND "" 13 | INSTALL_COMMAND "" 14 | TEST_COMMAND "" 15 | ) 16 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # BM3DCL 2 | OpenCL implementation of the BM3D image denoising algorithm. 3 | 4 | * Developed mainly on NVIDIA drivers, CUDA installation required. 5 | * Use CMake to generate project files for compiling. 6 | 7 | ## Usage 8 | 9 | ```sh 10 | ./bm3dcl [sigma] [original] 11 | ``` 12 | 13 | ## Acknowledgements 14 | 15 | This work was supported by the Center for Machine Vision Research at the University of Oulu. The work is part of the DASIP 2015 proceedings publication *"BM3D Image Denoising Using Heterogeneous Computing Platforms"* authored by Sampsa Sarjanoja, Jani Boutellier and Jari Hannuksela. 16 | 17 | ## License 18 | 19 | This work is licensed under an MIT License. 
20 | -------------------------------------------------------------------------------- /src/PGM.h: -------------------------------------------------------------------------------- 1 | #ifndef PGM_H 2 | #define PGM_H 3 | 4 | #include 5 | 6 | class PGM { 7 | public: 8 | PGM(); 9 | PGM(const std::string& _filename); 10 | PGM(const int _width, const int _height, const int _type=5, const int _scale=255); 11 | virtual ~PGM(); 12 | /*explicit*/ operator bool() const; 13 | 14 | bool load(const std::string& _filename); 15 | void unload(); 16 | bool save(const std::string& _filename) const; 17 | void debug() const; 18 | void debug_content(const int w=16, const int h=16, const int ox=0, const int oy=0) const; 19 | 20 | std::string filename; 21 | int type; 22 | int width; 23 | int height; 24 | int scale; 25 | unsigned char* image; 26 | }; 27 | 28 | #endif 29 | 30 | -------------------------------------------------------------------------------- /src/utils.h: -------------------------------------------------------------------------------- 1 | #ifndef UTILS_H 2 | #define UTILS_H 3 | 4 | #include 5 | #include 6 | 7 | #ifdef __APPLE__ 8 | # include 9 | #else 10 | # include 11 | #endif 12 | 13 | #define CL_CHECK(expr) do { \ 14 | cl_int status = expr; \ 15 | if (status != CL_SUCCESS) { \ 16 | std::cerr << "OpenCL error code " << status << " at " << __FILE__ << " [" << __LINE__ << "]: " << #expr << std::endl; \ 17 | exit(EXIT_FAILURE); \ 18 | } \ 19 | } while (0); 20 | 21 | void CL_CALLBACK pfn_notify(const char *errinfo, const void *private_info, size_t cb, void *user_data); 22 | unsigned next_multiple(const unsigned x, const unsigned n); 23 | double psnr(const unsigned char * const original, const unsigned char * const result, const size_t size); 24 | 25 | #endif 26 | 27 | -------------------------------------------------------------------------------- /src/utils.cpp: -------------------------------------------------------------------------------- 1 | #include "utils.h" 2 | 3 | 
// NOTE(review): the two header names were stripped from this copy of the
// file; <cmath> (log10) and <limits> (std::numeric_limits) are restored from
// usage.
#include <cmath>
#include <limits>

/**
 * OpenCL error-notification callback.
 *
 * Prints the runtime's error string to std::cerr. The remaining parameters
 * are part of the callback signature and are not used.
 */
void CL_CALLBACK pfn_notify(const char *errinfo, const void *private_info, size_t cb, void *user_data) {
    std::cerr << "OpenCL Error: " << errinfo << std::endl;
}

/**
 * Rounds x up to the nearest multiple of n.
 *
 * Works for every n, not only powers of two (the previous bit-mask version
 * `(x + n-1) & ~(n-1)` silently returned wrong values for e.g. n = 3). For
 * power-of-two n the result is identical to the old implementation.
 *
 * @param x value to round up
 * @param n step; n == 0 yields 0, which is what the bit-mask version produced
 * @return smallest multiple of n that is >= x (modulo unsigned wrap-around
 *         when that multiple does not fit in `unsigned`)
 */
unsigned next_multiple(const unsigned x, const unsigned n) {
    if (n == 0) {
        return 0;
    }
    const unsigned remainder = x % n;
    return remainder == 0 ? x : x + (n - remainder);
}

/**
 * Peak signal-to-noise ratio, in dB, between two 8-bit sample buffers.
 *
 * @param original reference samples
 * @param result   samples to compare, same length as `original`
 * @param size     number of samples in each buffer
 * @return 10 * log10(255^2 / MSE); +infinity when the buffers are identical.
 *         With size == 0 the mean is 0/0 and the result is NaN.
 */
double psnr(const unsigned char * const original, const unsigned char * const result, const size_t size) {
    double mse = 0.0;

    for (size_t i = 0; i < size; i++) {
        const double e = static_cast<double>(original[i]) - static_cast<double>(result[i]);
        mse += e * e;
    }

    mse /= static_cast<double>(size);
    // Exact comparison is intended: only bit-identical buffers give mse == 0.
    if (mse == 0.0) return std::numeric_limits<double>::infinity();
    return 10.0 * std::log10(255.0 * 255.0 / mse);
}

// --------------------------------------------------------------------------------
// /src/haar.h:
// --------------------------------------------------------------------------------
// This header is also #included by res/bm3d.cl, so it must stay valid
// OpenCL C as well as C++.
#ifndef HAAR_H
#define HAAR_H

// 1 / sqrt(2)
#define INV_SQRT2_F 0.70710678118654752440084436210485f

/**
 * 8-point, 3-level orthonormal Haar transform.
 *
 * At each level the first k2 entries are replaced by k pairwise sums
 * (stored in the lower half) and k pairwise differences (upper half), each
 * scaled by 1/sqrt(2).
 *
 * @param x input samples; used as scratch space and overwritten (on return
 *          it holds the same coefficients as y)
 * @param y output coefficients; must not alias x
 */
inline void haar(float x[8], float y[8]) {
    int i, j;
    int k = 8;

    #pragma unroll
    for (j = 0; j < 3; j++) {
        int k2 = k;   // number of samples processed at this level
        k >>= 1;      // number of sum/difference pairs at this level

        #pragma unroll
        for (i = 0; i < k; i++) {
            int i2 = i << 1;
            int i21 = i2 + 1;
            y[i]   = ( x[i2] + x[i21] ) * INV_SQRT2_F;
            y[i+k] = ( x[i2] - x[i21] ) * INV_SQRT2_F;
        }

        // Feed this level's result back as the next level's input.
        #pragma unroll
        for (i = 0; i < k2; i++) {
            x[i] = y[i];
        }
    }
}

/**
 * Inverse of haar(): rebuilds 8 samples from 3-level Haar coefficients.
 *
 * @param x input coefficients; used as scratch space and overwritten (on
 *          return it holds the same samples as y)
 * @param y output samples; must not alias x
 */
inline void ihaar(float x[8], float y[8]) {
    int i, j;
    int k = 1;

    #pragma unroll
    for (j = 0; j < 3; j++) {

        #pragma unroll
        for (i = 0; i < k; i++) {
            int i2 = i << 1;
            int ik = i + k;
            y[i2]   = ( x[i] + x[ik] ) * INV_SQRT2_F;
            y[i2+1] = ( x[i] - x[ik] ) * INV_SQRT2_F;
        }

        k <<= 1;

        // Feed this level's result back as the next level's input.
        #pragma unroll
        for (i = 0; i < k; i++) {
            x[i] = y[i];
        }
    }
}

#endif

-------------------------------------------------------------------------------- /LICENSE.txt: -------------------------------------------------------------------------------- 1 | The MIT License (MIT) 2 | 3 | Copyright (c) 2015 Sampsa Sarjanoja 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | 23 | -------------------------------------------------------------------------------- /src/config.h: -------------------------------------------------------------------------------- 1 | #ifndef CONFIG_H 2 | #define CONFIG_H 3 | 4 | #define PROFILE "original" 5 | 6 | #define PLATFORM_NVIDIA 0 7 | #define PLATFORM_ATI 1 8 | 9 | #define ENABLE_PROFILING 1 10 | #define UNROLL 0 11 | 12 | #define DISTANCE_IN_SPARSE 0 13 | 14 | #define BLOCK_SIZE 8 15 | #define BLOCK_SIZE_HALF 4 16 | #define BLOCK_SIZE_SQ 64 17 | 18 | #define WINDOW_SIZE 39 19 | #define WINDOW_SIZE_HALF 19 20 | 21 | #define STEP_SIZE 3 22 | // Multiple of STEP_SIZE 23 | #define SPLIT_SIZE_X (3*STEP_SIZE) 24 | #define SPLIT_SIZE_Y (3*STEP_SIZE) 25 | 26 | #define WINDOW_STEP_SIZE_1 1 27 | #define WINDOW_STEP_SIZE_2 1 28 | 29 | #define MAX_BLOCK_COUNT_1 16 30 | // 32 causes crash on CPU 31 | #define MAX_BLOCK_COUNT_2 32 32 | 33 | #define USE_KAISER_WINDOW 1 34 | 35 | #define DCT_1D 0 36 | #define HAAR_1D 1 37 | #define TRANSFORM_METHOD_1D HAAR_1D 38 | 39 | #define D_THRESHOLD_1 (3 * 2500) 40 | #define D_THRESHOLD_2 (3 * 400) 41 | 42 | // Default sigma value to use 43 | #ifndef SIGMA 44 | # define SIGMA 25 45 | #endif 46 | 47 | #define VARIANCE ((float)SIGMA*(float)SIGMA) 48 | 49 | #if (SIGMA > 40) 50 | # define USE_2D_THRESHOLD 1 51 | # define TAU_1D (2.8f * (float)SIGMA) 52 | #else 53 | # define USE_2D_THRESHOLD 0 54 | # define TAU_1D (2.7f * (float)SIGMA) 55 | #endif 56 | 57 | #define TAU_2D (2.0f * (float)SIGMA) 58 | 59 | #endif 60 | 61 | -------------------------------------------------------------------------------- /src/config_modified.h: -------------------------------------------------------------------------------- 1 | #ifndef CONFIG_H 2 | #define CONFIG_H 3 | 4 | #define PROFILE "modified" 5 | 6 | #define PLATFORM_NVIDIA 0 7 | #define PLATFORM_ATI 1 8 | 9 | #define ENABLE_PROFILING 1 10 | #define UNROLL 0 11 | 12 | #define DISTANCE_IN_SPARSE 0 13 | 14 | #define BLOCK_SIZE 8 15 | #define 
BLOCK_SIZE_HALF 4 16 | #define BLOCK_SIZE_SQ 64 17 | 18 | #define WINDOW_SIZE 21 19 | #define WINDOW_SIZE_HALF 10 20 | 21 | #define STEP_SIZE 7 22 | // Multiple of STEP_SIZE 23 | #define SPLIT_SIZE_X (12*STEP_SIZE) 24 | #define SPLIT_SIZE_Y (12*STEP_SIZE) 25 | 26 | #define WINDOW_STEP_SIZE_1 1 27 | #define WINDOW_STEP_SIZE_2 1 28 | 29 | #define MAX_BLOCK_COUNT_1 8 30 | // 32 causes crash on CPU 31 | #define MAX_BLOCK_COUNT_2 8 32 | 33 | #define USE_KAISER_WINDOW 0 34 | 35 | #define DCT_1D 0 36 | #define HAAR_1D 1 37 | #define TRANSFORM_METHOD_1D HAAR_1D 38 | 39 | #define D_THRESHOLD_1 (3 * 2500) 40 | #define D_THRESHOLD_2 (3 * 400) 41 | 42 | // Default sigma value to use 43 | #ifndef SIGMA 44 | # define SIGMA 25 45 | #endif 46 | 47 | #define VARIANCE ((float)SIGMA*(float)SIGMA) 48 | 49 | #if (SIGMA > 40) 50 | # define USE_2D_THRESHOLD 0 51 | # define TAU_1D (2.7f * (float)SIGMA) 52 | #else 53 | # define USE_2D_THRESHOLD 0 54 | # define TAU_1D (2.7f * (float)SIGMA) 55 | #endif 56 | 57 | #define TAU_2D (2.0f * (float)SIGMA) 58 | 59 | #endif 60 | 61 | -------------------------------------------------------------------------------- /googletest-download/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | # Download and unpack googletest at configure time 2 | configure_file(CMakeLists.txt.in googletest-download/CMakeLists.txt) 3 | execute_process(COMMAND ${CMAKE_COMMAND} -G "${CMAKE_GENERATOR}" . 4 | RESULT_VARIABLE result 5 | WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/googletest-download ) 6 | if(result) 7 | message(FATAL_ERROR "CMake step for googletest failed: ${result}") 8 | endif() 9 | execute_process(COMMAND ${CMAKE_COMMAND} --build . 
10 | RESULT_VARIABLE result 11 | WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/googletest-download ) 12 | if(result) 13 | message(FATAL_ERROR "Build step for googletest failed: ${result}") 14 | endif() 15 | 16 | # Prevent overriding the parent project's compiler/linker 17 | # settings on Windows 18 | set(gtest_force_shared_crt ON CACHE BOOL "" FORCE) 19 | 20 | # Add googletest directly to our build. This defines 21 | # the gtest and gtest_main targets. 22 | add_subdirectory(${CMAKE_CURRENT_BINARY_DIR}/googletest-src 23 | ${CMAKE_CURRENT_BINARY_DIR}/googletest-build 24 | EXCLUDE_FROM_ALL) 25 | 26 | # The gtest/gtest_main targets carry header search path 27 | # dependencies automatically when using CMake 2.8.11 or 28 | # later. Otherwise we have to add them here ourselves. 29 | if (CMAKE_VERSION VERSION_LESS 2.8.11) 30 | include_directories("${gtest_SOURCE_DIR}/include") 31 | endif() 32 | 33 | -------------------------------------------------------------------------------- /CMakeLists.txt: -------------------------------------------------------------------------------- 1 | cmake_minimum_required(VERSION 3.4.1) 2 | project(BM3DCL) 3 | 4 | mark_as_advanced(CMAKE_CONFIGURATION_TYPES) 5 | set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/runtime) 6 | 7 | find_package(OpenCL) 8 | 9 | add_executable(bm3dcl 10 | res/bm3d.cl 11 | src/config.h 12 | src/dct.h 13 | src/haar.h 14 | src/PGM.h 15 | src/utils.h 16 | src/main.cpp 17 | src/PGM.cpp 18 | src/utils.cpp 19 | ) 20 | 21 | if (OpenCL_FOUND) 22 | target_include_directories(bm3dcl PRIVATE ${OpenCL_INCLUDE_DIRS}) 23 | target_link_libraries(bm3dcl ${OpenCL_LIBRARIES}) 24 | endif () 25 | 26 | target_compile_definitions(bm3dcl PRIVATE -DUSE_PLATFORM=PLATFORM_NVIDIA) 27 | 28 | set(resources 29 | src/config.h 30 | src/dct.h 31 | src/haar.h 32 | res/bm3d.cl 33 | res/kuppi256.pgm 34 | res/kuppi256_25.pgm 35 | ) 36 | 37 | file(MAKE_DIRECTORY "${CMAKE_RUNTIME_OUTPUT_DIRECTORY}") 38 | if (DEFINED CMAKE_CONFIGURATION_TYPES) 39 | 
foreach(cfg ${CMAKE_CONFIGURATION_TYPES}) 40 | foreach(resource ${resources}) 41 | file(MAKE_DIRECTORY "${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/${cfg}") 42 | configure_file(${resource} "${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/${cfg}" COPYONLY) 43 | endforeach() 44 | endforeach() 45 | else() 46 | foreach(resource ${resources}) 47 | configure_file(${resource} "${CMAKE_RUNTIME_OUTPUT_DIRECTORY}" COPYONLY) 48 | endforeach() 49 | endif() 50 | 51 | add_subdirectory(googletest-download) 52 | add_subdirectory(test) 53 | 54 | -------------------------------------------------------------------------------- /src/PGM.cpp: -------------------------------------------------------------------------------- 1 | #define _CRT_SECURE_NO_WARNINGS 2 | 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | 9 | #include "PGM.h" 10 | 11 | PGM::PGM() 12 | : filename("") 13 | , type(0) 14 | , width(0) 15 | , height(0) 16 | , scale(0) 17 | , image(NULL) 18 | { 19 | unload(); 20 | } 21 | 22 | PGM::PGM(const std::string& _filename) 23 | : filename("") 24 | , type(0) 25 | , width(0) 26 | , height(0) 27 | , scale(0) 28 | , image(NULL) 29 | { 30 | unload(); 31 | load(_filename); 32 | } 33 | 34 | PGM::PGM(const int _width, const int _height, const int _type, const int _scale) 35 | : filename("") 36 | , type(_type) 37 | , width(_width) 38 | , height(_height) 39 | , scale(_scale) 40 | , image(new unsigned char[width*height]) 41 | { 42 | 43 | } 44 | 45 | PGM::operator bool() const { 46 | return !filename.empty() && image != NULL && width != 0 && height != 0 && type != 0 && scale != 0; 47 | } 48 | 49 | bool PGM::load(const std::string& _filename) { 50 | if (image) { 51 | unload(); 52 | } 53 | 54 | FILE* f = fopen(_filename.c_str(), "rb"); 55 | if (f == NULL) { 56 | return false; 57 | } 58 | 59 | filename = _filename; 60 | fscanf(f, "P%d\n%d %d\n%d\n", &type, &width, &height, &scale); 61 | image = new unsigned char[width*height]; 62 | size_t n = fread(image, sizeof(unsigned char), width*height, f); 63 
| fclose(f); 64 | 65 | assert(n == width*height); 66 | return true; 67 | } 68 | 69 | void PGM::unload() { 70 | if (image) { 71 | delete[] image; 72 | } 73 | filename = ""; 74 | image = NULL; 75 | type = 0; 76 | width = 0; 77 | height = 0; 78 | scale = 0; 79 | } 80 | 81 | PGM::~PGM() { 82 | unload(); 83 | } 84 | 85 | bool PGM::save(const std::string& _filename) const { 86 | std::string fn; 87 | 88 | if (_filename.empty()) { 89 | fn = filename; 90 | } 91 | else { 92 | fn = _filename; 93 | } 94 | 95 | if (fn.empty()) return false; 96 | 97 | FILE* f = fopen(fn.c_str(), "wb"); 98 | if (f == NULL) return false; 99 | 100 | fprintf(f, "P%d\n%d %d\n%d\n", type, width, height, scale); 101 | 102 | for (int i = 0; i < width*height; i++) { 103 | fputc(image[i], f); 104 | } 105 | 106 | fclose(f); 107 | return true; 108 | } 109 | 110 | void PGM::debug() const { 111 | std::cout << "Filename: " << filename << std::endl; 112 | std::cout << "Type: " << type << std::endl; 113 | std::cout << "Scale: " << scale << std::endl; 114 | std::cout << "Width: " << width << std::endl; 115 | std::cout << "Height: " << height << std::endl; 116 | } 117 | 118 | void PGM::debug_content(const int w, const int h, const int ox, const int oy) const { 119 | if (image) { 120 | for (int j = oy; j < std::min(oy + h, height); j++) { 121 | for (int i = ox; i < std::min(ox + w, width); i++) { 122 | printf("%u ", image[j*width + i]); 123 | } 124 | std::cout << std::endl; 125 | } 126 | } 127 | } 128 | 129 | -------------------------------------------------------------------------------- /src/dct.h: -------------------------------------------------------------------------------- 1 | #ifndef DCT_H 2 | #define DCT_H 3 | 4 | // cos(3*PI/16) 5 | #define C3A 0.83146961230254523707878837761791f 6 | // sin(3*PI/16) 7 | #define C3B 0.55557023301960222474283081394853f 8 | // cos(PI/16) 9 | #define C1A 0.98078528040323044912618223613424f 10 | // sin(PI/16) 11 | #define C1B 0.19509032201612826784828486847702f 12 | // sqrt(2) 
//   * cos(3*PI/8)
// NOT: sqrt(2) * cos(PI/16)
#define S2C3A 0.54119610014619698439972320536639f
// sqrt(2) * sin(3*PI/8)
// NOT: sqrt(2) * sin(PI/16)
#define S2C3B 1.3065629648763765278566431734272f
// 1 / sqrt(8)
#define C_NORM_1D 0.35355339059327376220042218105242f
// 1 / 8
#define C_NORM_2D 0.125f

#ifndef M_SQRT2_F
# define M_SQRT2_F 1.4142135623730950488016887242097f
#endif

// 8-point forward DCT computed in four butterfly/rotation stages.
//
//   in        - 8 input samples (read only)
//   out       - 8 output coefficients; also used as scratch between stages
//   normalize - when true every output is multiplied by C_NORM_1D (1/sqrt(8))
//
// in and out must not alias: stage 1 writes out[0..3] and afterwards still
// reads in[0..3] to form out[4..7].
// NOTE(review): the stage structure looks like the Loeffler 8-point
// factorization - confirm before relying on that.
inline void dct(const float in[8], float out[8], bool normalize) {

    float st2[8], st3[2];
    float tmp;

    // Stage 1

    out[0] = in[0] + in[7];
    out[1] = in[1] + in[6];
    out[2] = in[2] + in[5];
    out[3] = in[3] + in[4];

    out[4] = in[3] - in[4];
    out[5] = in[2] - in[5];
    out[6] = in[1] - in[6];
    out[7] = in[0] - in[7];

    // Stage 2

    st2[0] = out[0] + out[3];
    st2[1] = out[1] + out[2];
    st2[2] = out[1] - out[2];
    st2[3] = out[0] - out[3];

    // Rotation of (out[4], out[7]) with one shared product in tmp.
    tmp = C3A * (out[4] + out[7]);

    st2[4] = tmp + (C3B-C3A)*out[7];
    st2[7] = tmp - (C3A+C3B)*out[4];

    // Rotation of (out[5], out[6]) with one shared product in tmp.
    tmp = C1A * (out[5] + out[6]);

    st2[5] = tmp + (C1B-C1A)*out[6];
    st2[6] = tmp - (C1A+C1B)*out[5];

    // Stage 3

    out[0] = st2[0] + st2[1];
    out[4] = st2[0] - st2[1];

    tmp = S2C3A * (st2[2] + st2[3]);

    out[2] = tmp + (S2C3B-S2C3A)*st2[3];
    out[6] = tmp - (S2C3A+S2C3B)*st2[2];

    st3[0] = st2[4] + st2[6];
    st3[1] = st2[5] + st2[7];
    out[3] = st2[7] - st2[5];
    out[5] = st2[4] - st2[6];

    // Stage 4

    out[7] = st3[1] - st3[0];
    out[3] *= M_SQRT2_F;
    out[5] *= M_SQRT2_F;
    out[1] = st3[0] + st3[1];

    if (normalize)
        for (int i = 0; i < 8; i++)
            out[i] *= C_NORM_1D;
}

// 8-point inverse transform: runs the stages of dct() in reverse order
// (4 -> 1).
//
//   in        - 8 input coefficients (read only)
//   out       - 8 output samples; also used as scratch between stages
//   normalize - when true every output is multiplied by C_NORM_1D (1/sqrt(8))
//
// in and out must not alias: stage 3 writes out[0] and then reads in[0]
// again for out[1].
inline void idct(const float in[8], float out[8], bool normalize) {

    float st1[8], st4[2];
    float tmp;

    // Stage 4

    st4[0] = in[1] - in[7];
    st1[5] = in[3] * M_SQRT2_F;
    st1[6] = in[5] * M_SQRT2_F;
    st4[1] = in[1] + in[7];

    // Stage 3

    out[0] = in[0] + in[4];
    out[1] = in[0] - in[4];

    tmp = S2C3A * (in[2] + in[6]);

    out[2] = tmp - (S2C3A+S2C3B)*in[6];
    out[3] = tmp + (S2C3B-S2C3A)*in[2];

    out[4] = st4[0] + st1[6];
    out[5] = st4[1] - st1[5];
    out[6] = st4[0] - st1[6];
    out[7] = st1[5] + st4[1];

    // Stage 2

    st1[0] = out[0] + out[3];
    st1[1] = out[1] + out[2];
    st1[2] = out[1] - out[2];
    st1[3] = out[0] - out[3];

    tmp = C3A * (out[4] + out[7]);

    st1[4] = tmp - (C3A+C3B)*out[7];
    st1[7] = tmp + (C3B-C3A)*out[4];

    tmp = C1A * (out[5] + out[6]);

    st1[5] = tmp - (C1A+C1B)*out[6];
    st1[6] = tmp + (C1B-C1A)*out[5];

    // Stage 1

    out[0] = st1[0] + st1[7];
    out[1] = st1[1] + st1[6];
    out[2] = st1[2] + st1[5];
    out[3] = st1[3] + st1[4];

    out[4] = st1[3] - st1[4];
    out[5] = st1[2] - st1[5];
    out[6] = st1[1] - st1[6];
    out[7] = st1[0] - st1[7];

    if (normalize)
        for (int i = 0; i < 8; i++)
            out[i] *= C_NORM_1D;
}

// Writes the transpose of the 8x8 matrix `in` to `out`
// (out[j][i] = in[i][j]). in and out must be different arrays.
inline void transpose(float in[8][8], float out[8][8]) {
    int i, j;
    for (j = 0; j < 8; j++) {
        for (i = 0; i < 8; i++) {
            out[j][i] = in[i][j];
        }
    }
}

// 2D forward transform of an 8x8 block: the 1D stages of dct() are applied to
// every row of `in` (into `res`), then along every column of `res`, and the
// result is scaled by C_NORM_2D (1/8).
//
// Output layout: out[i][k] is coefficient k of the column pass applied to
// column i of `res`, i.e. the first index of `out` corresponds to the second
// index of `in` (the result is stored transposed). The #else branch produces
// the same layout via transpose().
//
// The active "#if 1" branch is a manually inlined copy of dct(); the #else
// branch is the equivalent formulation in terms of dct() and transpose().
inline void dct2(float in[8][8], float out[8][8]) {
    int i, j;

    float res[8][8];

#if 1

    // Process rows
    for (j = 0; j < 8; j++) {

        float st2[8], st3[2];
        float tmp;

        // Stage 1

        res[j][0] = in[j][0] + in[j][7];
        res[j][1] = in[j][1] + in[j][6];
        res[j][2] = in[j][2] + in[j][5];
        res[j][3] = in[j][3] + in[j][4];

        res[j][4] = in[j][3] - in[j][4];
        res[j][5] = in[j][2] - in[j][5];
        res[j][6] = in[j][1] - in[j][6];
        res[j][7] = in[j][0] - in[j][7];

        // Stage 2

        st2[0] = res[j][0] + res[j][3];
        st2[1] = res[j][1] + res[j][2];
        st2[2] = res[j][1] - res[j][2];
        st2[3] = res[j][0] - res[j][3];

        tmp = C3A * (res[j][4] + res[j][7]);

        st2[4] = tmp + (C3B-C3A)*res[j][7];
        st2[7] = tmp - (C3A+C3B)*res[j][4];

        tmp = C1A * (res[j][5] + res[j][6]);

        st2[5] = tmp + (C1B-C1A)*res[j][6];
        st2[6] = tmp - (C1A+C1B)*res[j][5];

        // Stage 3

        res[j][0] = st2[0] + st2[1];
        res[j][4] = st2[0] - st2[1];

        tmp = S2C3A * (st2[2] + st2[3]);

        res[j][2] = tmp + (S2C3B-S2C3A)*st2[3];
        res[j][6] = tmp - (S2C3A+S2C3B)*st2[2];

        st3[0] = st2[4] + st2[6];
        st3[1] = st2[5] + st2[7];
        res[j][3] = st2[7] - st2[5];
        res[j][5] = st2[4] - st2[6];

        // Stage 4

        res[j][7] = st3[1] - st3[0];
        res[j][3] *= M_SQRT2_F;
        res[j][5] *= M_SQRT2_F;
        res[j][1] = st3[0] + st3[1];
    }

    // Process columns
    // (column i of res is read, row i of out is written)
    for (i = 0; i < 8; i++) {

        float st2[8], st3[2];
        float tmp;

        // Stage 1

        out[i][0] = res[0][i] + res[7][i];
        out[i][1] = res[1][i] + res[6][i];
        out[i][2] = res[2][i] + res[5][i];
        out[i][3] = res[3][i] + res[4][i];

        out[i][4] = res[3][i] - res[4][i];
        out[i][5] = res[2][i] - res[5][i];
        out[i][6] = res[1][i] - res[6][i];
        out[i][7] = res[0][i] - res[7][i];

        // Stage 2

        st2[0] = out[i][0] + out[i][3];
        st2[1] = out[i][1] + out[i][2];
        st2[2] = out[i][1] - out[i][2];
        st2[3] = out[i][0] - out[i][3];

        tmp = C3A * (out[i][4] + out[i][7]);

        st2[4] = tmp + (C3B-C3A)*out[i][7];
        st2[7] = tmp - (C3A+C3B)*out[i][4];

        tmp = C1A * (out[i][5] + out[i][6]);

        st2[5] = tmp + (C1B-C1A)*out[i][6];
        st2[6] = tmp - (C1A+C1B)*out[i][5];

        // Stage 3

        out[i][0] = st2[0] + st2[1];
        out[i][4] = st2[0] - st2[1];

        tmp = S2C3A * (st2[2] + st2[3]);

        out[i][2] = tmp + (S2C3B-S2C3A)*st2[3];
        out[i][6] = tmp - (S2C3A+S2C3B)*st2[2];

        st3[0] = st2[4] + st2[6];
        st3[1] = st2[5] + st2[7];
        out[i][3] = st2[7] - st2[5];
        out[i][5] = st2[4] - st2[6];

        // Stage 4

        out[i][7] = st3[1] - st3[0];
        out[i][3] *= M_SQRT2_F;
        out[i][5] *= M_SQRT2_F;
        out[i][1] = st3[0] + st3[1];

        // Normalize
        for (j = 0; j < 8; j++) {
            out[i][j] *= C_NORM_2D;
        }
    }

#else

    // Process rows
    for (j = 0; j < 8; j++) {
        dct(in[j], res[j], false);
    }

    transpose(res, out);

    // Process columns
    for (j = 0; j < 8; j++) {
        dct(out[j], res[j], false);
    }

    // Normalize
    for (j = 0; j < 8; j++) {
        for (i = 0; i < 8; i++) {
            out[j][i] = res[j][i] * C_NORM_2D;
        }
    }

#endif
}

// 2D inverse transform of an 8x8 block, structured like dct2(): the 1D
// stages of idct() are applied to every row of `in` (into `res`), then along
// every column of `res`, and the result is scaled by C_NORM_2D (1/8).
//
// As in dct2(), row i of `out` is produced from column i of `res`, so the
// output is stored transposed relative to the input indexing.
//
// The active "#if 1" branch is a manually inlined copy of idct(); the #else
// branch is the equivalent formulation in terms of idct() and transpose().
inline void idct2(float in[8][8], float out[8][8]) {
    int i, j;

    float res[8][8];

#if 1

    // Process rows
    for (j = 0; j < 8; j++) {

        float st1[8], st4[2];
        float tmp;

        // Stage 4

        st4[0] = in[j][1] - in[j][7];
        st1[5] = in[j][3] * M_SQRT2_F;
        st1[6] = in[j][5] * M_SQRT2_F;
        st4[1] = in[j][1] + in[j][7];

        // Stage 3

        res[j][0] = in[j][0] + in[j][4];
        res[j][1] = in[j][0] - in[j][4];

        tmp = S2C3A * (in[j][2] + in[j][6]);

        res[j][2] = tmp - (S2C3A+S2C3B)*in[j][6];
        res[j][3] = tmp + (S2C3B-S2C3A)*in[j][2];

        res[j][4] = st4[0] + st1[6];
        res[j][5] = st4[1] - st1[5];
        res[j][6] = st4[0] - st1[6];
        res[j][7] = st1[5] + st4[1];

        // Stage 2

        st1[0] = res[j][0] + res[j][3];
        st1[1] = res[j][1] + res[j][2];
        st1[2] = res[j][1] - res[j][2];
        st1[3] = res[j][0] - res[j][3];

        tmp = C3A * (res[j][4] + res[j][7]);

        st1[4] = tmp - (C3A+C3B)*res[j][7];
        st1[7] = tmp + (C3B-C3A)*res[j][4];

        tmp = C1A * (res[j][5] + res[j][6]);

        st1[5] = tmp - (C1A+C1B)*res[j][6];
        st1[6] = tmp + (C1B-C1A)*res[j][5];

        // Stage 1

        res[j][0] = st1[0] + st1[7];
        res[j][1] = st1[1] + st1[6];
        res[j][2] = st1[2] + st1[5];
        res[j][3] = st1[3] + st1[4];

        res[j][4] = st1[3] - st1[4];
        res[j][5] = st1[2] - st1[5];
        res[j][6] = st1[1] - st1[6];
        res[j][7] = st1[0] - st1[7];
    }

    // Process columns
    // (column i of res is read, row i of out is written)
    for (i = 0; i < 8; i++) {

        float st1[8], st4[2];
        float tmp;

        // Stage 4

        st4[0] = res[1][i] - res[7][i];
        st1[5] = res[3][i] * M_SQRT2_F;
        st1[6] = res[5][i] * M_SQRT2_F;
        st4[1] = res[1][i] + res[7][i];

        // Stage 3

        out[i][0] = res[0][i] + res[4][i];
        out[i][1] = res[0][i] - res[4][i];

        tmp = S2C3A * (res[2][i] + res[6][i]);

        out[i][2] = tmp - (S2C3A+S2C3B)*res[6][i];
        out[i][3] = tmp + (S2C3B-S2C3A)*res[2][i];

        out[i][4] = st4[0] + st1[6];
        out[i][5] = st4[1] - st1[5];
        out[i][6] = st4[0] - st1[6];
        out[i][7] = st1[5] + st4[1];

        // Stage 2

        st1[0] = out[i][0] + out[i][3];
        st1[1] = out[i][1] + out[i][2];
        st1[2] = out[i][1] - out[i][2];
        st1[3] = out[i][0] - out[i][3];

        tmp = C3A * (out[i][4] + out[i][7]);

        st1[4] = tmp - (C3A+C3B)*out[i][7];
        st1[7] = tmp + (C3B-C3A)*out[i][4];

        tmp = C1A * (out[i][5] + out[i][6]);

        st1[5] = tmp - (C1A+C1B)*out[i][6];
        st1[6] = tmp + (C1B-C1A)*out[i][5];

        // Stage 1

        out[i][0] = st1[0] + st1[7];
        out[i][1] = st1[1] + st1[6];
        out[i][2] = st1[2] + st1[5];
        out[i][3] = st1[3] + st1[4];

        out[i][4] = st1[3] - st1[4];
        out[i][5] = st1[2] - st1[5];
        out[i][6] = st1[1] - st1[6];
        out[i][7] = st1[0] - st1[7];

        // Normalize
        for (j = 0; j < 8; j++) {
            out[i][j] *= C_NORM_2D;
        }
    }

#else

    // Process rows
    for (j = 0; j < 8; j++) {
        idct(in[j], res[j], false);
    }

    transpose(res, out);

    // Process columns
    for (j = 0; j < 8; j++) {
        idct(out[j], res[j], false);
    }

    // Normalize
    for (j = 0; j < 8; j++) {
        for (i = 0; i < 8; i++) {
            out[j][i] = res[j][i] * C_NORM_2D;
        }
    }

#endif
}

#endif

// --------------------------------------------------------------------------------
// /res/bm3d.cl:
// --------------------------------------------------------------------------------
// vim: ft=c

// Note: changes in included files won't trigger rebuilding
#include "config.h"
#include "dct.h"
#include "haar.h"

// Image sampler used by the kernels in this file: unnormalized integer
// coordinates, nearest-pixel lookup, out-of-range reads use CLK_ADDRESS_CLAMP.
__constant sampler_t sampler =
      CLK_NORMALIZED_COORDS_FALSE
//    | CLK_ADDRESS_CLAMP_TO_EDGE // Clamp to edge value
    | CLK_ADDRESS_CLAMP // Clamp to zeros
    | CLK_FILTER_NEAREST;

#if USE_KAISER_WINDOW
// 8x8 window weights, row-major; KAISER(x, y) selects column x of row y.
# define KAISER(x, y) kaiser_b_2[y*8 + x]
__constant float kaiser_b_2[] = {
    0.1924f, 0.2989f, 0.3846f, 0.4325f, 0.4325f, 0.3846f, 0.2989f, 0.1924f,
    0.2989f, 0.4642f, 0.5974f, 0.6717f, 0.6717f, 0.5974f, 0.4642f, 0.2989f,
    0.3846f, 0.5974f, 0.7688f, 0.8644f, 0.8644f, 0.7688f, 0.5974f, 0.3846f,
    0.4325f, 0.6717f, 0.8644f, 0.9718f, 0.9718f, 0.8644f, 0.6717f, 0.4325f,
    0.4325f, 0.6717f, 0.8644f, 0.9718f, 0.9718f, 0.8644f, 0.6717f, 0.4325f,
    0.3846f, 0.5974f, 0.7688f, 0.8644f, 0.8644f, 0.7688f, 0.5974f, 0.3846f,
    0.2989f, 0.4642f, 0.5974f, 0.6717f, 0.6717f, 0.5974f, 0.4642f, 0.2989f,
    0.1924f, 0.2989f, 0.3846f, 0.4325f, 0.4325f, 0.3846f, 0.2989f, 0.1924f
};
#endif

inline void
threshold_2d(float in[8][8]) {
    // Hard threshold on an 8x8 block, in place: every entry whose magnitude
    // is at most TAU_2D is set to zero.
    int i, j;
    for (j = 0; j < BLOCK_SIZE; j++) {
        for (i = 0; i < BLOCK_SIZE; i++) {
            if (fabs(in[j][i]) <= TAU_2D) in[j][i] = 0.0f;
        }
    }
}

// Hard threshold on 8 values, in place: entries with magnitude at most TAU_1D
// are zeroed; *weight_count is incremented once for every entry that is kept.
// block_count is only referenced by the commented-out variant below.
inline void threshold_1d(float in[8], int* weight_count, int block_count) {
    int i;
    for (i = 0; i < BLOCK_SIZE; i++) {
        //if (fabs(in[i]) <= TAU_1D * sqrt((float)block_count)) in[i] = 0.0f;
        if (fabs(in[i]) <= TAU_1D) in[i] = 0.0f;
        else (*weight_count)++;
    }
}

// Block matching. Each work-item owns one reference block whose top-left
// pixel is (global id * STEP_SIZE). Candidate blocks are taken on a
// window_step_size grid inside a WINDOW_SIZE x WINDOW_SIZE search window
// around the reference; the distance is the sum of squared pixel differences
// over BLOCK_SIZE x BLOCK_SIZE pixels. Candidates with distance <= threshold
// are kept in ascending distance order, at most max_block_count of them.
//
//   input            - source image
//   similar_coords   - out: window-relative (i, j) offsets of the kept blocks,
//                      laid out as described in the comment near the end
//   block_counts     - out: number of kept blocks per work-item
//   threshold        - maximum accepted distance
//   max_block_count  - maximum number of blocks to keep
//   window_step_size - stride of the candidate grid inside the window
//
// NOTE(review): distances[] and positions[] have MAX_BLOCK_COUNT_2 entries
// but are indexed up to max_block_count-1; the host must pass
// max_block_count <= MAX_BLOCK_COUNT_2.
__kernel void calc_distances(
    __read_only image2d_t input,
    __global short* similar_coords,
    __global uchar* block_counts,
    const int threshold,
    const int max_block_count,
    const int window_step_size
) {

    const short2 gid = {get_global_id(0) * STEP_SIZE, get_global_id(1) * STEP_SIZE};
    const size_t tot_globals = get_global_size(0) * get_global_size(1);
    const size_t global_id = get_global_id(1) * get_global_size(0) + get_global_id(0);
    // Use this if you want to crash AMD's clBuildProgram()
    //int distances[MAX_BLOCK_COUNT] = { [0 ... MAX_BLOCK_COUNT-1] = INT_MAX };
    int distances[MAX_BLOCK_COUNT_2];
    short2 positions[MAX_BLOCK_COUNT_2];
    short block_count = 0;

    for (int n = 0; n < max_block_count; n++)
        distances[n] = INT_MAX;

    // Start block matching from window
    for (int j = WINDOW_SIZE_HALF % window_step_size; j < WINDOW_SIZE; j += window_step_size) {
        for (int i = WINDOW_SIZE_HALF % window_step_size; i < WINDOW_SIZE; i += window_step_size) {

            // Create reference block
            uchar ref[BLOCK_SIZE][BLOCK_SIZE];

            // Interestingly moving this outside outer for loops reduces performance significantly
            for (int y = 0; y < BLOCK_SIZE; y++) {
                for (int x = 0; x < BLOCK_SIZE; x++) {
                    const int2 ref_pos = {gid.x + x, gid.y + y};
                    ref[y][x] = (uchar)read_imageui(input, sampler, ref_pos).s0;
                }
            }

            int d = 0;

            // Create matching block
            for (int y = 0; y < BLOCK_SIZE; y++) {
                for (int x = 0; x < BLOCK_SIZE; x++) {
                    const int2 pos = {gid.x + i + x - WINDOW_SIZE_HALF,
                                      gid.y + j + y - WINDOW_SIZE_HALF};
                    const uchar b = (uchar)read_imageui(input, sampler, pos).s0;
                    d += (ref[y][x]-b) * (ref[y][x]-b);
                }
            }

            // Only count block if its distance is under threshold and
            // is smaller than any already found block.
            if (d <= threshold) {
                for (int n = 0; n < max_block_count; n++) {
                    if (d < distances[n]) {
                        // Insert at slot n: shift the worse entries down by
                        // one, dropping the last one.
                        for (int k = max_block_count-1; k > n; k--) {
                            distances[k] = distances[k-1];
                            positions[k] = positions[k-1];
                        }
                        // Counts every insertion, so it can exceed
                        // max_block_count; it is clamped after the search.
                        block_count++;
                        distances[n] = d;
                        positions[n].x = i;
                        positions[n].y = j;
                        break;
                    }
                }
            }
        }
    }

    if (block_count > max_block_count) block_count = max_block_count;
#if 0
    else {
        uchar n = 1;
        while ((n << 1) <= block_count) n <<= 1;
        block_count = n;
    }
#endif

    block_counts[global_id] = (uchar)block_count;

    /* Nearest block coordinates are written as such into memory:
     *        most similar             second most similar        - sorted order of similarity
     * | x0 | y0 | x0 | y0 | ... | x1 | y1 | x1 | y1 | ...       - coords for similar block
     *      t0        t1     tn       t0        t1     tn        - thread id
     *      R00       R01    Rmn      R00       R01    Rmn       - reference block id
     */
    for (int n = 0; n < block_count; n++) {
        // SoA or AoS?
        const int ind = 2 * (n * tot_globals + global_id);
        similar_coords[ind] = positions[n].x;
        similar_coords[ind+1] = positions[n].y;
    }
}

// Accessors for the per-work-item accumulator and weight map: indexed into
// global buffers when USE_PLATFORM == PLATFORM_ATI, plain 2D arrays otherwise.
#if USE_PLATFORM == PLATFORM_ATI
# define ACCU(x, y) accumulator[(y*SPLIT_SIZE_X + x) * tot_globals + global_id]
# define WM(x, y) weight_map[(y*SPLIT_SIZE_X + x) * tot_globals + global_id]
#else
# define ACCU(x, y) accumulator[y][x]
# define WM(x, y) weight_map[y][x]
#endif

__kernel void bm3d_basic_filter(
    __read_only image2d_t input,
    __write_only image2d_t output,
    __global short* similar_coords,
    __global uchar* block_counts,
    const int global_size_x_d,
    const int tot_globals_d
#if USE_PLATFORM == PLATFORM_ATI
    , __global float* accumulator
    , __global float* weight_map
#endif
) {
#if 1
    const int2 gid = {get_global_id(0) * SPLIT_SIZE_X, get_global_id(1) * SPLIT_SIZE_Y};
    if (gid.x > WIDTH-1 || gid.y > HEIGHT-1) return;
    //const size_t tot_globals = get_global_size(0) * get_global_size(1);
    //const size_t global_id = get_global_id(1) * get_global_size(0) + get_global_id(0);

#if 1
    const int2 back_limit = {max(gid.x - WINDOW_SIZE_HALF, 0),
                             max(gid.y - WINDOW_SIZE_HALF, 0)};
    const int2 front_limit = {min(gid.x + SPLIT_SIZE_X - 1 + WINDOW_SIZE_HALF, WIDTH-1),
                              min(gid.y + SPLIT_SIZE_Y - 1 + WINDOW_SIZE_HALF, HEIGHT-1)};
#else
    const int2 back_limit = gid;
    const int2 front_limit = gid;
#endif

#if USE_PLATFORM == PLATFORM_ATI
    for (int j = 0; j < SPLIT_SIZE_Y; j++) {
        for (int i = 0; i < SPLIT_SIZE_X; i++) {
            ACCU(i, j) = 0.0f;
            WM(i, j) = 0.0f;
        }
    }
#else
    float accumulator[SPLIT_SIZE_Y][SPLIT_SIZE_X] = {{0.0f}};
    float weight_map[SPLIT_SIZE_Y][SPLIT_SIZE_X] = {{0.0f}};
#endif

    int ri = gid.x;
    int rj = gid.y;

    while
(ri - STEP_SIZE >= back_limit.x) ri -= STEP_SIZE; 191 | while (rj - STEP_SIZE >= back_limit.y) rj -= STEP_SIZE; 192 | 193 | const int ri_min = ri; 194 | const int rj_min = rj; 195 | 196 | // Loop through all reference blocks that can contribute to a split block. 197 | for (rj = rj_min; rj <= front_limit.y; rj += STEP_SIZE) { 198 | for (ri = ri_min; ri <= front_limit.x; ri += STEP_SIZE) { 199 | 200 | const int rgid = (rj/STEP_SIZE)*global_size_x_d + (ri/STEP_SIZE); 201 | 202 | float stack[MAX_BLOCK_COUNT_1][BLOCK_SIZE][BLOCK_SIZE]; 203 | 204 | const uchar block_count = block_counts[rgid]; 205 | int weight_count = 0; 206 | 207 | // Build stack of similar blocks 208 | for (int n = 0; n < block_count; n++) { 209 | float block[BLOCK_SIZE][BLOCK_SIZE]; 210 | for (int j = 0; j < BLOCK_SIZE; j++) { 211 | for (int i = 0; i < BLOCK_SIZE; i++) { 212 | const int2 pos = {ri - WINDOW_SIZE_HALF + similar_coords[2*(n*tot_globals_d + rgid)] + i, 213 | rj - WINDOW_SIZE_HALF + similar_coords[2*(n*tot_globals_d + rgid)+1] + j}; 214 | block[j][i] = (float)read_imageui(input, sampler, pos).s0; 215 | } 216 | } 217 | 218 | dct2(block, stack[n]); 219 | #if USE_2D_THRESHOLD 220 | threshold_2d(stack[n]); 221 | #endif 222 | } 223 | 224 | // Do collaborative filtering 225 | for (int j = 0; j < BLOCK_SIZE; j++) { 226 | for (int i = 0; i < BLOCK_SIZE; i++) { 227 | 228 | int blocks_left = block_count; 229 | int k = 0; 230 | 231 | // Process only max 8 layers at the time because of 8-point DCT 232 | while (blocks_left > 0) { 233 | 234 | float pipe[8] = { 0.0f }; 235 | float tr_pipe[8]; 236 | 237 | for (int n = 0; n < min(blocks_left, 8); n++) { 238 | pipe[n] = stack[k*8 + n][j][i]; 239 | } 240 | 241 | #if TRANSFORM_METHOD_1D == DCT_1D 242 | dct(pipe, tr_pipe, true); 243 | threshold_1d(tr_pipe, &weight_count, block_count); 244 | idct(tr_pipe, pipe, true); 245 | #elif TRANSFORM_METHOD_1D == HAAR_1D 246 | haar(pipe, tr_pipe); 247 | threshold_1d(tr_pipe, &weight_count, block_count); 248 | 
ihaar(tr_pipe, pipe); 249 | #endif 250 | 251 | for (int n = 0; n < min(blocks_left, 8); n++) { 252 | stack[k*8 + n][j][i] = pipe[n]; 253 | } 254 | 255 | k++; 256 | blocks_left -= 8; 257 | } 258 | } 259 | } 260 | 261 | // Convert weight count to weight multiplier 262 | const float wx = (weight_count >= 1) ? (1.0f / (VARIANCE * (float)weight_count)) : 1.0f; 263 | 264 | // Relocate stack blocks to their positions in split rectangle 265 | for (int n = 0; n < block_count; n++) { 266 | float block[BLOCK_SIZE][BLOCK_SIZE]; 267 | idct2(stack[n], block); 268 | 269 | for (int j = 0; j < BLOCK_SIZE; j++) { 270 | for (int i = 0; i < BLOCK_SIZE; i++) { 271 | const int2 pixel_offset = {ri - gid.x, rj - gid.y}; 272 | const int2 pixel_pos = {similar_coords[2*(n*tot_globals_d + rgid)] - WINDOW_SIZE_HALF + i + pixel_offset.x, 273 | similar_coords[2*(n*tot_globals_d + rgid)+1] - WINDOW_SIZE_HALF + j + pixel_offset.y}; 274 | 275 | if (pixel_pos.x >= 0 && pixel_pos.y >= 0 && pixel_pos.x < SPLIT_SIZE_X && pixel_pos.y < SPLIT_SIZE_Y) { 276 | #if USE_KAISER_WINDOW 277 | const float pixel_wx = wx * KAISER(i, j); 278 | #else 279 | const float pixel_wx = wx; 280 | #endif 281 | ACCU(pixel_pos.x, pixel_pos.y) += block[j][i] * pixel_wx; 282 | WM(pixel_pos.x, pixel_pos.y) += pixel_wx; 283 | } 284 | } 285 | } 286 | } 287 | } 288 | } 289 | 290 | #if UNROLL 291 | # pragma unroll 292 | #endif 293 | for (int j = 0; j < SPLIT_SIZE_Y; j++) { 294 | #if UNROLL 295 | # pragma unroll 296 | #endif 297 | for (int i = 0; i < SPLIT_SIZE_X; i++) { 298 | const int2 pos = {gid.x + i, gid.y + j}; 299 | if (pos.x < WIDTH && pos.y < HEIGHT) { 300 | // Normalize aggregation output 301 | uchar pixel_value = convert_uchar_sat( 302 | ACCU(i, j) / WM(i, j) 303 | ); 304 | //if (WM(i, j) == 0) pixel_value = 255; 305 | //if (pixel_value == 0) pixel_value = 255; 306 | write_imageui(output, pos, pixel_value); 307 | } 308 | } 309 | } 310 | #endif 311 | } 312 | 313 | __kernel void bm3d_wiener_filter( 314 | __read_only image2d_t 
input, 315 | __read_only image2d_t basic, 316 | __write_only image2d_t output, 317 | __global short* similar_coords, 318 | __global uchar* block_counts, 319 | const int global_size_x_d, 320 | const int tot_globals_d 321 | #if USE_PLATFORM == PLATFORM_ATI 322 | , __global float* accumulator 323 | , __global float* weight_map 324 | #endif 325 | ) { 326 | #if 1 327 | const int2 gid = {get_global_id(0) * SPLIT_SIZE_X, get_global_id(1) * SPLIT_SIZE_Y}; 328 | if (gid.x > WIDTH-1 || gid.y > HEIGHT-1) return; 329 | //const size_t tot_globals = get_global_size(0) * get_global_size(1); 330 | //const size_t global_id = get_global_id(1)*get_global_size(0) + get_global_id(0); 331 | 332 | #if 1 333 | const int2 back_limit = {max(gid.x - WINDOW_SIZE_HALF, 0), 334 | max(gid.y - WINDOW_SIZE_HALF, 0)}; 335 | const int2 front_limit = {min(gid.x + SPLIT_SIZE_X - 1 + WINDOW_SIZE_HALF, WIDTH-1), 336 | min(gid.y + SPLIT_SIZE_Y - 1 + WINDOW_SIZE_HALF, HEIGHT-1)}; 337 | #else 338 | const int2 back_limit = gid; 339 | const int2 front_limit = gid; 340 | #endif 341 | 342 | #if USE_PLATFORM == PLATFORM_ATI 343 | for (int j = 0; j < SPLIT_SIZE_Y; j++) { 344 | for (int i = 0; i < SPLIT_SIZE_X; i++) { 345 | ACCU(i, j) = 0.0f; 346 | WM(i, j) = 0.0f; 347 | } 348 | } 349 | #else 350 | float accumulator[SPLIT_SIZE_Y][SPLIT_SIZE_X] = {{0.0f}}; 351 | float weight_map[SPLIT_SIZE_Y][SPLIT_SIZE_X] = {{0.0f}}; 352 | #endif 353 | 354 | int ri = gid.x; 355 | int rj = gid.y; 356 | 357 | while (ri - STEP_SIZE >= back_limit.x) ri -= STEP_SIZE; 358 | while (rj - STEP_SIZE >= back_limit.y) rj -= STEP_SIZE; 359 | 360 | const int ri_min = ri; 361 | const int rj_min = rj; 362 | 363 | for (rj = rj_min; rj <= front_limit.y; rj += STEP_SIZE) { 364 | for (ri = ri_min; ri <= front_limit.x; ri += STEP_SIZE) { 365 | const int rgid = (rj/STEP_SIZE)*global_size_x_d + (ri/STEP_SIZE); 366 | 367 | float basic_stack[MAX_BLOCK_COUNT_2][BLOCK_SIZE][BLOCK_SIZE]; 368 | float noise_stack[MAX_BLOCK_COUNT_2][BLOCK_SIZE][BLOCK_SIZE]; 
369 | 370 | const uchar block_count = block_counts[rgid]; 371 | float sumsqr_weights = 0.0f; 372 | 373 | // Build stack 374 | for (int n = 0; n < block_count; n++) { 375 | float basic_block[BLOCK_SIZE][BLOCK_SIZE]; 376 | float noise_block[BLOCK_SIZE][BLOCK_SIZE]; 377 | 378 | for (int j = 0; j < BLOCK_SIZE; j++) { 379 | for (int i = 0; i < BLOCK_SIZE; i++) { 380 | const int2 pos = {ri - WINDOW_SIZE_HALF + similar_coords[2*(n*tot_globals_d + rgid)] + i, 381 | rj - WINDOW_SIZE_HALF + similar_coords[2*(n*tot_globals_d + rgid)+1] + j}; 382 | basic_block[j][i] = (float)read_imageui(basic, sampler, pos).s0; 383 | noise_block[j][i] = (float)read_imageui(input, sampler, pos).s0; 384 | } 385 | } 386 | 387 | dct2(basic_block, basic_stack[n]); 388 | dct2(noise_block, noise_stack[n]); 389 | } 390 | 391 | for (int j = 0; j < BLOCK_SIZE; j++) { 392 | for (int i = 0; i < BLOCK_SIZE; i++) { 393 | 394 | int blocks_left = block_count; 395 | int k = 0; 396 | 397 | while (blocks_left > 0) { 398 | 399 | float pipe[8] = {0.0f}; 400 | float pipe2[8] = {0.0f}; 401 | float tr_basic_pipe[8]; 402 | float tr_noise_pipe[8]; 403 | 404 | for (int n = 0; n < min(blocks_left, 8); n++) { 405 | pipe[n] = basic_stack[k*8 + n][j][i]; 406 | pipe2[n] = noise_stack[k*8 + n][j][i]; 407 | } 408 | 409 | #if TRANSFORM_METHOD_1D == DCT_1D 410 | dct(pipe, tr_basic_pipe, true); 411 | dct(pipe2, tr_noise_pipe, true); 412 | #elif TRANSFORM_METHOD_1D == HAAR_1D 413 | haar(pipe, tr_basic_pipe); 414 | haar(pipe2, tr_noise_pipe); 415 | #endif 416 | 417 | for (int n = 0; n < 8; n++) { 418 | // weights 419 | pipe[n] = tr_basic_pipe[n]*tr_basic_pipe[n] / (tr_basic_pipe[n]*tr_basic_pipe[n] + VARIANCE); 420 | sumsqr_weights += pipe[n]*pipe[n]; 421 | pipe2[n] = pipe[n]*tr_noise_pipe[n]; 422 | } 423 | 424 | #if TRANSFORM_METHOD_1D == DCT_1D 425 | idct(pipe2, pipe, true); 426 | #elif TRANSFORM_METHOD_1D == HAAR_1D 427 | ihaar(pipe2, pipe); 428 | #endif 429 | 430 | for (int n = 0; n < 8; n++) { 431 | noise_stack[k*8 + n][j][i] 
= pipe[n]; 432 | } 433 | 434 | k++; 435 | blocks_left -= 8; 436 | } 437 | } 438 | } 439 | 440 | const float wx = 1.0f / (VARIANCE * sumsqr_weights); 441 | 442 | for (int n = 0; n < block_count; n++) { 443 | float block[BLOCK_SIZE][BLOCK_SIZE]; 444 | idct2(noise_stack[n], block); 445 | 446 | for (int j = 0; j < BLOCK_SIZE; j++) { 447 | for (int i = 0; i < BLOCK_SIZE; i++) { 448 | const int2 pixel_offset = {ri - gid.x, rj - gid.y}; 449 | const int2 pixel_pos = {similar_coords[2*(n*tot_globals_d + rgid)] - WINDOW_SIZE_HALF + i + pixel_offset.x, 450 | similar_coords[2*(n*tot_globals_d + rgid)+1] - WINDOW_SIZE_HALF + j + pixel_offset.y}; 451 | 452 | if (pixel_pos.x >= 0 && pixel_pos.y >= 0 && pixel_pos.x < SPLIT_SIZE_X && pixel_pos.y < SPLIT_SIZE_Y) { 453 | #if USE_KAISER_WINDOW 454 | const float pixel_wx = wx * KAISER(i, j); 455 | #else 456 | const float pixel_wx = wx; 457 | #endif 458 | ACCU(pixel_pos.x, pixel_pos.y) += block[j][i] * pixel_wx; 459 | WM(pixel_pos.x, pixel_pos.y) += pixel_wx; 460 | } 461 | } 462 | } 463 | } 464 | } 465 | } 466 | 467 | #if UNROLL 468 | # pragma unroll 469 | #endif 470 | for (int j = 0; j < SPLIT_SIZE_Y; j++) { 471 | #if UNROLL 472 | # pragma unroll 473 | #endif 474 | for (int i = 0; i < SPLIT_SIZE_X; i++) { 475 | const int2 pos = {gid.x + i, gid.y + j}; 476 | if (pos.x < WIDTH && pos.y < HEIGHT) { 477 | const uchar pixel_value = convert_uchar_sat( 478 | ACCU(i, j) / WM(i, j) 479 | ); 480 | write_imageui(output, pos, pixel_value); 481 | } 482 | } 483 | } 484 | #endif 485 | } 486 | 487 | -------------------------------------------------------------------------------- /src/main.cpp: -------------------------------------------------------------------------------- 1 | #define _CRT_SECURE_NO_WARNINGS 2 | 3 | #include 4 | #include 5 | #include 6 | 7 | #include 8 | #include 9 | #include 10 | #include 11 | 12 | #include "config.h" 13 | #include "PGM.h" 14 | #include "utils.h" 15 | 16 | #ifdef __APPLE__ 17 | # include 18 | #else 19 | # include 20 
| #endif 21 | 22 | #ifdef _MSC_VER 23 | # include 24 | # define getcwd _getcwd 25 | # define putenv _putenv 26 | #else 27 | # include 28 | #endif 29 | 30 | #ifndef KERNEL_BUILD_OPTIONS 31 | # define KERNEL_BUILD_OPTIONS "" 32 | #endif 33 | 34 | using namespace std; 35 | 36 | void usage(const char * const exe) { 37 | cout << "Usage: " << exe << " [sigma] [original]" << endl; 38 | exit(EXIT_FAILURE); 39 | } 40 | 41 | int main(int argc, char** argv) { 42 | 43 | char* cwd = getcwd(NULL, 0); 44 | cout << "CWD: " << cwd << endl; 45 | free(cwd); 46 | 47 | string original_fn, noisy_fn; 48 | int sigma = (int)SIGMA; 49 | 50 | putenv("CUDA_CACHE_DISABLE=1"); 51 | 52 | if (argc >= 4) { 53 | original_fn = argv[3]; 54 | } 55 | if (argc >= 3) { 56 | sigma = atoi(argv[2]); 57 | } 58 | if (argc >= 2) { 59 | noisy_fn = argv[1]; 60 | } 61 | 62 | if (noisy_fn.empty()) { 63 | usage(argv[0]); 64 | } 65 | 66 | PGM noisy(noisy_fn); 67 | PGM original(original_fn); 68 | 69 | if (!noisy) { 70 | cout << "Failed to load noisy image: " << noisy_fn << endl; 71 | return EXIT_FAILURE; 72 | } 73 | cout << "Noisy image: " << noisy_fn << endl; 74 | 75 | if (original) { 76 | cout << "Original image: " << original_fn << endl; 77 | } 78 | cout << "Sigma: " << sigma << endl; 79 | 80 | //noisy.debug(); 81 | //noisy.debug_content(); 82 | 83 | cout << "Getting OpenCL platforms..." 
<< endl; 84 | 85 | // Get platform ID count 86 | cl_uint platform_id_count = 0; 87 | CL_CHECK(clGetPlatformIDs(0, NULL, &platform_id_count)); 88 | 89 | // Get actual platform IDs 90 | cl_platform_id *platform_ids = new cl_platform_id[platform_id_count]; 91 | CL_CHECK(clGetPlatformIDs(platform_id_count, platform_ids, NULL)); 92 | 93 | int platform_id = 0; 94 | 95 | for (int i = 0; i < static_cast(platform_id_count); i++) { 96 | size_t info_size; 97 | CL_CHECK(clGetPlatformInfo(platform_ids[i], CL_PLATFORM_NAME, 0, NULL, &info_size)); 98 | 99 | char *info = new char[info_size]; 100 | CL_CHECK(clGetPlatformInfo(platform_ids[i], CL_PLATFORM_NAME, info_size, info, NULL)); 101 | 102 | cout << "[" << i << "] Name: " << info << endl; 103 | 104 | #if USE_PLATFORM == PLATFORM_NVIDIA 105 | if (!strncmp(info, "NVIDIA", 6)) platform_id = i; 106 | #elif USE_PLATFORM == PLATFORM_ATI 107 | if (!strncmp(info, "AMD", 3)) platform_id = i; 108 | #endif 109 | delete[] info; 110 | } 111 | 112 | cout << "Platform selected: " << platform_id << endl; 113 | 114 | // Get devices 115 | cl_int error = CL_SUCCESS; 116 | cl_uint device_id_count = 0; 117 | CL_CHECK(clGetDeviceIDs(platform_ids[platform_id], CL_DEVICE_TYPE_CPU | CL_DEVICE_TYPE_GPU, 0, NULL, &device_id_count)); 118 | cl_device_id *device_ids = new cl_device_id[device_id_count]; 119 | CL_CHECK(clGetDeviceIDs(platform_ids[platform_id], CL_DEVICE_TYPE_CPU | CL_DEVICE_TYPE_GPU, device_id_count, device_ids, NULL)); 120 | cl_context context = clCreateContext(NULL, device_id_count, device_ids, &pfn_notify, NULL, &error); 121 | CL_CHECK(error); 122 | 123 | FILE *f = fopen("bm3d.cl", "rb"); 124 | if (f == NULL) { 125 | cout << "File not found: bm3d.cl" << endl; 126 | exit(EXIT_FAILURE); 127 | } 128 | fseek(f, 0, SEEK_END); 129 | size_t size = ftell(f); 130 | rewind(f); 131 | char *buf = new char[size+1]; 132 | buf[size] = '\0'; 133 | fread(buf, sizeof(char), size, f); 134 | fclose(f); 135 | 136 | cl_uint string_count = 1; 137 | 138 | 
cl_program program = clCreateProgramWithSource(context, string_count, (const char**)&buf, NULL, &error); 139 | delete[] buf; 140 | CL_CHECK(error); 141 | 142 | cout << "Available devices:" << endl; 143 | for (int i = 0; i < static_cast(device_id_count); i++) { 144 | size_t info_size; 145 | CL_CHECK(clGetDeviceInfo(device_ids[i], CL_DEVICE_NAME, 0, NULL, &info_size)); 146 | char *info = new char[info_size]; 147 | CL_CHECK(clGetDeviceInfo(device_ids[i], CL_DEVICE_NAME, info_size, info, NULL)); 148 | cout << "[" << i << "] Device name: " << info << endl; 149 | delete[] info; 150 | } 151 | 152 | #if USE_PLATFORM == PLATFORM_NVIDIA 153 | int device_id = 0; 154 | #else 155 | // 0 = GPU, 1 = CPU usually 156 | int device_id = 0; 157 | #endif 158 | cout << "Selected device: " << device_id << endl; 159 | 160 | cl_device_id device = device_ids[device_id]; 161 | 162 | cl_uint CU_count; 163 | CL_CHECK(clGetDeviceInfo(device, CL_DEVICE_MAX_COMPUTE_UNITS, sizeof(cl_uint), &CU_count, NULL)); 164 | cout << "Compute units: " << CU_count << endl; 165 | 166 | cl_ulong global_mem_size; 167 | CL_CHECK(clGetDeviceInfo(device, CL_DEVICE_GLOBAL_MEM_SIZE, sizeof(cl_ulong), &global_mem_size, NULL)); 168 | cout << "Global mem size available: " << global_mem_size << endl; 169 | 170 | string options; 171 | stringstream ss; 172 | ss << KERNEL_BUILD_OPTIONS " -cl-nv-verbose -cl-std=CL1.1 -DWIDTH=" << noisy.width << " -DHEIGHT=" << noisy.height << " -DUSE_PLATFORM=" << USE_PLATFORM << " -DSIGMA=" << sigma; 173 | options = ss.str(); 174 | 175 | cout << "Starting kernel build with options: " << options << endl; 176 | error = clBuildProgram(program, 0, NULL, options.c_str(), NULL, NULL); 177 | 178 | size_t log_size; 179 | CL_CHECK(clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size)); 180 | 181 | char *log = new char[log_size+1]; 182 | log[log_size] = '\0'; 183 | 184 | CL_CHECK(clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_LOG, log_size + 1, log, NULL)); 185 | 
186 | cout << endl << "----Kernel build log----" << endl << log << endl << "----Kernel build end----" << endl << endl; 187 | 188 | CL_CHECK(error); // Check clBuildProgram after printing build log 189 | 190 | cl_kernel dist_kernel = clCreateKernel(program, "calc_distances", &error); 191 | CL_CHECK(error); 192 | 193 | cl_kernel basic_kernel = clCreateKernel(program, "bm3d_basic_filter", &error); 194 | CL_CHECK(error); 195 | 196 | cl_kernel wiener_kernel = clCreateKernel(program, "bm3d_wiener_filter", &error); 197 | CL_CHECK(error); 198 | 199 | size_t multiple; 200 | CL_CHECK(clGetKernelWorkGroupInfo(dist_kernel, device, CL_KERNEL_PREFERRED_WORK_GROUP_SIZE_MULTIPLE, sizeof(size_t), &multiple, NULL)); 201 | size_t maxWG; 202 | CL_CHECK(clGetKernelWorkGroupInfo(dist_kernel, device, CL_KERNEL_WORK_GROUP_SIZE, sizeof(size_t), &maxWG, NULL)); 203 | 204 | cl_command_queue queue = clCreateCommandQueue(context, device, CL_QUEUE_PROFILING_ENABLE, &error); 205 | CL_CHECK(error); 206 | 207 | const cl_image_format image_format_in = { CL_R, CL_UNSIGNED_INT8 }; 208 | const cl_image_format image_format_out = { CL_R, CL_UNSIGNED_INT8 }; 209 | const size_t image_origin[3] = {0, 0, 0}; 210 | const size_t image_region[3] = {noisy.width, noisy.height, 1}; 211 | const size_t image_size = noisy.width * noisy.height; 212 | #if USE_PLATFORM == PLATFORM_NVIDIA 213 | cl_mem noisy_image_buffer = clCreateImage2D(context, CL_MEM_READ_ONLY, &image_format_in, noisy.width, noisy.height, 0, NULL, &error); 214 | CL_CHECK(error); 215 | cl_mem basic_image_buffer = clCreateImage2D(context, CL_MEM_READ_WRITE, &image_format_out, noisy.width, noisy.height, 0, NULL, &error); 216 | CL_CHECK(error); 217 | cl_mem wiener_image_buffer = clCreateImage2D(context, CL_MEM_WRITE_ONLY, &image_format_out, noisy.width, noisy.height, 0, NULL, &error); 218 | CL_CHECK(error); 219 | #else 220 | const cl_image_desc image_desc = { CL_MEM_OBJECT_IMAGE2D, noisy.width, noisy.height }; 221 | cl_mem noisy_image_buffer = 
clCreateImage(context, CL_MEM_READ_ONLY, &image_format_in, &image_desc, NULL, &error); 222 | CL_CHECK(error); 223 | cl_mem basic_image_buffer = clCreateImage(context, CL_MEM_READ_WRITE, &image_format_out, &image_desc, NULL, &error); 224 | CL_CHECK(error); 225 | cl_mem wiener_image_buffer = clCreateImage(context, CL_MEM_WRITE_ONLY, &image_format_out, &image_desc, NULL, &error); 226 | CL_CHECK(error); 227 | #endif 228 | 229 | // This can be changed to other powers of two for testing performance 230 | const size_t ls[2] = {16, 8}; 231 | const int gx_d = next_multiple((unsigned)ceil(noisy.width / (double)STEP_SIZE), ls[0]); 232 | const int gy_d = next_multiple((unsigned)ceil(noisy.height / (double)STEP_SIZE), ls[1]); 233 | const int tot_items_d = gx_d * gy_d; 234 | 235 | const size_t gx = next_multiple((unsigned)ceil(noisy.width / (double)SPLIT_SIZE_X), ls[0]); 236 | const size_t gy = next_multiple((unsigned)ceil(noisy.height / (double)SPLIT_SIZE_Y), ls[1]); 237 | const size_t tot_items = gx * gy; 238 | 239 | cout << "----Size info----" << endl; 240 | cout << "Preferred work group size multiple: " << multiple << endl; 241 | cout << "Max work group size: " << maxWG << endl; 242 | cout << "Total amount of distance work items: [" << gx_d << ", " << gy_d << "] = " << tot_items_d << endl; 243 | cout << "Work items in a distance group: [" << ls[0] << ", " << ls[1] << "] = " << ls[0]*ls[1] << endl; 244 | cout << "Total amount of basic/wiener work items: [" << gx << ", " << gy << "] = " << tot_items << endl; 245 | cout << "Work items in a basic/wiener group: [" << ls[0] << ", " << ls[1] << "] = " << ls[0]*ls[1] << endl; 246 | cout << "----Size end ----" << endl << endl; 247 | 248 | const size_t similar_coords_size = MAX_BLOCK_COUNT_2 * tot_items_d * sizeof(cl_short) * 2; 249 | cout << "Similar coords size: " << similar_coords_size << endl; 250 | cl_mem similar_coords_buffer = clCreateBuffer(context, CL_MEM_READ_WRITE, similar_coords_size, NULL, &error); 251 | CL_CHECK(error); 
252 | 253 | const size_t block_counts_size = tot_items_d * sizeof(cl_uchar); 254 | cout << "block_counts size: " << block_counts_size << endl; 255 | cl_mem block_counts_buffer = clCreateBuffer(context, CL_MEM_READ_WRITE, block_counts_size, NULL, &error); 256 | CL_CHECK(error); 257 | 258 | CL_CHECK(clEnqueueWriteImage(queue, noisy_image_buffer, CL_TRUE, image_origin, image_region, 0, 0, noisy.image, 0, NULL, NULL)); 259 | 260 | PGM basic(noisy.width, noisy.height); 261 | PGM wiener(noisy.width, noisy.height); 262 | cout << "Output image size: " << image_size << " bytes" << endl; 263 | 264 | const cl_int hard_threshold = D_THRESHOLD_1 * BLOCK_SIZE_SQ; 265 | const cl_int wiener_threshold = D_THRESHOLD_2 * BLOCK_SIZE_SQ; 266 | const cl_int max_block_count_1 = MAX_BLOCK_COUNT_1; 267 | const cl_int max_block_count_2 = MAX_BLOCK_COUNT_2; 268 | const cl_int window_step_size_1 = WINDOW_STEP_SIZE_1; 269 | const cl_int window_step_size_2 = WINDOW_STEP_SIZE_2; 270 | 271 | CL_CHECK(clSetKernelArg(dist_kernel, 0, sizeof(cl_mem), &noisy_image_buffer)); 272 | CL_CHECK(clSetKernelArg(dist_kernel, 1, sizeof(cl_mem), &similar_coords_buffer)); 273 | CL_CHECK(clSetKernelArg(dist_kernel, 2, sizeof(cl_mem), &block_counts_buffer)); 274 | CL_CHECK(clSetKernelArg(dist_kernel, 3, sizeof(cl_int), &hard_threshold)); 275 | CL_CHECK(clSetKernelArg(dist_kernel, 4, sizeof(cl_int), &max_block_count_1)); 276 | CL_CHECK(clSetKernelArg(dist_kernel, 5, sizeof(cl_int), &window_step_size_1)); 277 | 278 | CL_CHECK(clSetKernelArg(basic_kernel, 0, sizeof(cl_mem), &noisy_image_buffer)); 279 | CL_CHECK(clSetKernelArg(basic_kernel, 1, sizeof(cl_mem), &basic_image_buffer)); 280 | CL_CHECK(clSetKernelArg(basic_kernel, 2, sizeof(cl_mem), &similar_coords_buffer)); 281 | CL_CHECK(clSetKernelArg(basic_kernel, 3, sizeof(cl_mem), &block_counts_buffer)); 282 | CL_CHECK(clSetKernelArg(basic_kernel, 4, sizeof(cl_int), &gx_d)); 283 | CL_CHECK(clSetKernelArg(basic_kernel, 5, sizeof(cl_int), &tot_items_d)); 284 | #if 
USE_PLATFORM == PLATFORM_ATI 285 | const size_t accu_size = tot_items * SPLIT_SIZE_X * SPLIT_SIZE_Y * sizeof(cl_float); 286 | cout << "accu_size: " << accu_size << endl; 287 | cl_mem accumulator_buffer = clCreateBuffer(context, CL_MEM_READ_WRITE, accu_size, NULL, &error); 288 | CL_CHECK(error); 289 | cl_mem weight_map_buffer = clCreateBuffer(context, CL_MEM_READ_WRITE, accu_size, NULL, &error); 290 | CL_CHECK(error); 291 | CL_CHECK(clSetKernelArg(basic_kernel, 6, sizeof(cl_mem), &accumulator_buffer)); 292 | CL_CHECK(clSetKernelArg(basic_kernel, 7, sizeof(cl_mem), &weight_map_buffer)); 293 | #endif 294 | 295 | cl_short *distances = new cl_short[similar_coords_size]; 296 | for (int i = 0; i < static_cast(similar_coords_size / sizeof(cl_short)); i++) { 297 | distances[i] = 0; 298 | } 299 | cl_short *distances2 = new cl_short[similar_coords_size]; 300 | for (int i = 0; i < static_cast(similar_coords_size / sizeof(cl_short)); i++) { 301 | distances2[i] = 0; 302 | } 303 | 304 | cl_event event; 305 | 306 | #if ENABLE_PROFILING 307 | double total_time = 0.0; 308 | #endif 309 | 310 | cout << endl << "1st step..." 
<< endl; 311 | 312 | // Change these if want to use offset 313 | for (int j = 0; j < 1; j++) { 314 | for (int i = 0; i < 1; i++) { 315 | size_t gs_d[2] = {gx_d, gy_d}; 316 | size_t gs[2] = {gx, gy}; 317 | size_t offset[2] = {i*gx, j*gy}; 318 | assert(ls[0] * ls[1] <= maxWG); 319 | assert(!(gs[0] % ls[0])); 320 | assert(!(gs[1] % ls[1])); 321 | 322 | CL_CHECK(clEnqueueNDRangeKernel(queue, dist_kernel, 2, offset, gs_d, ls, 0, NULL, &event)); 323 | cout << "Distances kernel enqueued" << endl; 324 | 325 | #if ENABLE_PROFILING 326 | CL_CHECK(clWaitForEvents(1, &event)); 327 | 328 | cl_ulong time_start, time_end; 329 | double exec_time; 330 | 331 | CL_CHECK(clGetEventProfilingInfo(event, CL_PROFILING_COMMAND_START, sizeof(cl_ulong), &time_start, NULL)); 332 | CL_CHECK(clGetEventProfilingInfo(event, CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &time_end, NULL)); 333 | exec_time = static_cast(time_end - time_start); 334 | total_time += exec_time; 335 | printf("Distances execution time: %0.3f ms\n", exec_time/1000000.0); 336 | #endif 337 | 338 | CL_CHECK(clEnqueueReadBuffer(queue, similar_coords_buffer, CL_TRUE, 0, similar_coords_size, distances, 0, NULL, NULL)); 339 | 340 | CL_CHECK(clEnqueueNDRangeKernel(queue, basic_kernel, 2, offset, gs, ls, 0, NULL, &event)); 341 | cout << "Basic kernel enqueued" << endl; 342 | 343 | #if ENABLE_PROFILING 344 | CL_CHECK(clWaitForEvents(1, &event)); 345 | CL_CHECK(clGetEventProfilingInfo(event, CL_PROFILING_COMMAND_START, sizeof(cl_ulong), &time_start, NULL)); 346 | CL_CHECK(clGetEventProfilingInfo(event, CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &time_end, NULL)); 347 | exec_time = static_cast(time_end - time_start); 348 | total_time += exec_time; 349 | printf("Basic execution time: %0.3f ms\n", exec_time/1000000.0); 350 | #endif 351 | } 352 | } 353 | 354 | cout << endl << "2nd step..." 
<< endl; 355 | 356 | CL_CHECK(clSetKernelArg(dist_kernel, 0, sizeof(cl_mem), &basic_image_buffer)); 357 | CL_CHECK(clSetKernelArg(dist_kernel, 3, sizeof(cl_int), &wiener_threshold)); 358 | CL_CHECK(clSetKernelArg(dist_kernel, 4, sizeof(cl_int), &max_block_count_2)); 359 | CL_CHECK(clSetKernelArg(dist_kernel, 5, sizeof(cl_int), &window_step_size_2)); 360 | 361 | CL_CHECK(clSetKernelArg(wiener_kernel, 0, sizeof(cl_mem), &noisy_image_buffer)); 362 | CL_CHECK(clSetKernelArg(wiener_kernel, 1, sizeof(cl_mem), &basic_image_buffer)); 363 | CL_CHECK(clSetKernelArg(wiener_kernel, 2, sizeof(cl_mem), &wiener_image_buffer)); 364 | CL_CHECK(clSetKernelArg(wiener_kernel, 3, sizeof(cl_mem), &similar_coords_buffer)); 365 | CL_CHECK(clSetKernelArg(wiener_kernel, 4, sizeof(cl_mem), &block_counts_buffer)); 366 | CL_CHECK(clSetKernelArg(wiener_kernel, 5, sizeof(cl_int), &gx_d)); 367 | CL_CHECK(clSetKernelArg(wiener_kernel, 6, sizeof(cl_int), &tot_items_d)); 368 | #if USE_PLATFORM == PLATFORM_ATI 369 | CL_CHECK(clSetKernelArg(wiener_kernel, 7, sizeof(cl_mem), &accumulator_buffer)); 370 | CL_CHECK(clSetKernelArg(wiener_kernel, 8, sizeof(cl_mem), &weight_map_buffer)); 371 | #endif 372 | 373 | // Change these if want to use offset 374 | for (int j = 0; j < 1; j++) { 375 | for (int i = 0; i < 1; i++) { 376 | size_t gs_d[2] = {gx_d, gy_d}; 377 | size_t gs[2] = {gx, gy}; 378 | size_t offset[2] = {i*gx, j*gy}; 379 | assert(ls[0] * ls[1] <= maxWG); 380 | assert(!(gs[0] % ls[0])); 381 | assert(!(gs[1] % ls[1])); 382 | 383 | CL_CHECK(clEnqueueNDRangeKernel(queue, dist_kernel, 2, offset, gs_d, ls, 0, NULL, &event)); 384 | cout << "Distances kernel enqueued" << endl; 385 | 386 | #if ENABLE_PROFILING 387 | CL_CHECK(clWaitForEvents(1, &event)); 388 | 389 | cl_ulong time_start, time_end; 390 | double exec_time; 391 | 392 | CL_CHECK(clGetEventProfilingInfo(event, CL_PROFILING_COMMAND_START, sizeof(cl_ulong), &time_start, NULL)); 393 | CL_CHECK(clGetEventProfilingInfo(event, CL_PROFILING_COMMAND_END, 
sizeof(cl_ulong), &time_end, NULL)); 394 | exec_time = static_cast(time_end - time_start); 395 | total_time += exec_time; 396 | printf("Distances execution time: %0.3f ms\n", exec_time/1000000.0); 397 | #endif 398 | 399 | CL_CHECK(clEnqueueReadBuffer(queue, similar_coords_buffer, CL_TRUE, 0, similar_coords_size, distances2, 0, NULL, NULL)); 400 | 401 | CL_CHECK(clEnqueueNDRangeKernel(queue, wiener_kernel, 2, offset, gs, ls, 0, NULL, &event)); 402 | cout << "Wiener kernel enqueued" << endl; 403 | 404 | #if ENABLE_PROFILING 405 | CL_CHECK(clWaitForEvents(1, &event)); 406 | CL_CHECK(clGetEventProfilingInfo(event, CL_PROFILING_COMMAND_START, sizeof(cl_ulong), &time_start, NULL)); 407 | CL_CHECK(clGetEventProfilingInfo(event, CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &time_end, NULL)); 408 | exec_time = static_cast(time_end - time_start); 409 | total_time += exec_time; 410 | printf("Wiener execution time: %0.3f ms\n", exec_time/1000000.0); 411 | #endif 412 | } 413 | } 414 | 415 | #if ENABLE_PROFILING 416 | printf("Total execution time: %0.3f ms\n", total_time/1000000.0); 417 | #endif 418 | 419 | cout << "Reading basic image..." << endl; 420 | 421 | CL_CHECK(clEnqueueReadImage(queue, basic_image_buffer, CL_TRUE, image_origin, image_region, 0, 0, basic.image, 0, NULL, NULL)); 422 | 423 | cout << "Reading wiener image..." 
<< endl; 424 | 425 | CL_CHECK(clEnqueueReadImage(queue, wiener_image_buffer, CL_TRUE, image_origin, image_region, 0, 0, wiener.image, 0, NULL, NULL)); 426 | 427 | cout << "Wiener output:" << endl; 428 | for (int j = 0; j < 16; j++) { 429 | for (int i = 0; i < 16; i++) { 430 | printf("%u ", wiener.image[j*wiener.width + i]); 431 | } 432 | cout << endl; 433 | } 434 | 435 | if (original) { 436 | cout << "Reference:" << endl; 437 | for (int j = 0; j < 16; j++) { 438 | for (int i = 0; i < 16; i++) { 439 | printf("%u ", original.image[j*original.width + i]); 440 | } 441 | cout << endl; 442 | } 443 | } 444 | 445 | #if 1 446 | f = fopen("distances1.dat", "wb"); 447 | 448 | for (int i = 0; i < static_cast(similar_coords_size); i += 2) { 449 | fputc(*((char*)distances + i + 1), f); 450 | fputc(*((char*)distances + i), f); 451 | } 452 | 453 | fclose(f); 454 | #endif 455 | 456 | #if 1 457 | f = fopen("distances2.dat", "wb"); 458 | 459 | for (int i = 0; i < static_cast(similar_coords_size); i += 2) { 460 | fputc(*((char*)distances2 + i + 1), f); 461 | fputc(*((char*)distances2 + i), f); 462 | } 463 | 464 | fclose(f); 465 | #endif 466 | 467 | cout << "Writing: basic.pgm" << endl; 468 | basic.save("basic.pgm"); 469 | cout << "Writing: wiener.pgm" << endl; 470 | wiener.save("wiener.pgm"); 471 | 472 | if (original) { 473 | std::cout << "PSNR (noisy) : " << psnr(original.image, noisy.image, original.width*original.height) << " dB" << std::endl; 474 | std::cout << "PSNR (basic) : " << psnr(original.image, basic.image, original.width*original.height) << " dB" << std::endl; 475 | std::cout << "PSNR (wiener): " << psnr(original.image, wiener.image, original.width*original.height) << " dB" << std::endl; 476 | } 477 | 478 | 479 | // Create collage 480 | if (original) { 481 | PGM collage(original.width*2, original.height*2); 482 | for (int j = 0; j < original.height; j++) { 483 | for (int i = 0; i < original.width; i++) { 484 | collage.image[j*collage.width + i] = 
original.image[j*original.width + i]; 485 | collage.image[j*collage.width + i + original.width] = noisy.image[j*original.width + i]; 486 | collage.image[(j + original.height)*collage.width + i] = basic.image[j*original.width + i]; 487 | collage.image[(j + original.height)*collage.width + i + original.width] = wiener.image[j*original.width + i]; 488 | } 489 | } 490 | 491 | collage.save("collage.pgm"); 492 | } 493 | else { 494 | cout << "No original image provided for reference, so no PSNR calculations or collage" << endl; 495 | } 496 | 497 | system("md5sum distances1.dat"); 498 | system("md5sum distances2.dat"); 499 | system("md5sum basic.pgm"); 500 | system("md5sum wiener.pgm"); 501 | 502 | delete[] device_ids; 503 | delete[] platform_ids; 504 | delete[] distances; 505 | delete[] distances2; 506 | 507 | CL_CHECK(clReleaseMemObject(noisy_image_buffer)); 508 | CL_CHECK(clReleaseMemObject(basic_image_buffer)); 509 | CL_CHECK(clReleaseMemObject(wiener_image_buffer)); 510 | CL_CHECK(clReleaseMemObject(similar_coords_buffer)); 511 | CL_CHECK(clReleaseMemObject(block_counts_buffer)); 512 | #if USE_PLATFORM == PLATFORM_ATI 513 | CL_CHECK(clReleaseMemObject(accumulator_buffer)); 514 | CL_CHECK(clReleaseMemObject(weight_map_buffer)); 515 | #endif 516 | CL_CHECK(clReleaseKernel(dist_kernel)); 517 | CL_CHECK(clReleaseKernel(basic_kernel)); 518 | CL_CHECK(clReleaseKernel(wiener_kernel)); 519 | CL_CHECK(clReleaseCommandQueue(queue)); 520 | CL_CHECK(clReleaseProgram(program)); 521 | CL_CHECK(clReleaseContext(context)); 522 | 523 | system("pause"); 524 | return EXIT_SUCCESS; 525 | } 526 | 527 | --------------------------------------------------------------------------------