├── CMakeLists.txt
├── include
│   ├── indices.cuh
│   ├── params.hpp
│   ├── stopwatch.hpp
│   └── bm3d.hpp
├── LICENSE.md
├── README.md
└── src
    ├── main_nodisplay.cpp
    ├── dct8x8.cu
    ├── blockmatching.cu
    └── filtering.cu
/CMakeLists.txt:
--------------------------------------------------------------------------------
1 | cmake_minimum_required(VERSION 3.8 FATAL_ERROR)
2 | project(bm3d LANGUAGES CXX CUDA)
3 | 
4 | include_directories(
5 | 	${CMAKE_CURRENT_SOURCE_DIR}/include
6 | 	${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES}
7 | )
8 | 
9 | set(CMAKE_CXX_STANDARD 11)
10 | set(CMAKE_CXX_STANDARD_REQUIRED True)
11 | 
12 | # The following lines build the main executable.
13 | add_executable(bm3d
14 | 	include/CImg.h
15 | 	include/stopwatch.hpp
16 | 	include/indices.cuh
17 | 	include/params.hpp
18 | 	include/bm3d.hpp
19 | 	src/filtering.cu
20 | 	src/blockmatching.cu
21 | 	src/dct8x8.cu
22 | 	src/main_nodisplay.cpp
23 | )
24 | 
25 | target_link_libraries(bm3d cufft cudart png)
26 | 
--------------------------------------------------------------------------------
/include/indices.cuh:
--------------------------------------------------------------------------------
1 | #ifndef _INDICES_CUH_
2 | #define _INDICES_CUH_
3 | 
4 | typedef unsigned char uchar;
5 | typedef unsigned int uint;
6 | typedef unsigned short ushort;
7 | 
8 | //Index handling
9 | #define idx2(x,y,dim_x) ( (x) + ((y)*(dim_x)) )
10 | #define idx3(x,y,z,dim_x,dim_y) ( (x) + ((y)*(dim_x)) + ((z)*(dim_x)*(dim_y)) )
11 | 
12 | template <typename T>
13 | __device__ __forceinline__ T* idx2p(T* BaseAddress, uint Column, uint Row, uint pitch)
14 | {
15 | 	return (T*)((char*)BaseAddress + Row * pitch) + Column;
16 | }
17 | 
18 | struct uint2float1
19 | {
20 | 	short x;
21 | 	short y;
22 | 	float val;
23 | 
24 | 	__host__ __device__ uint2float1(short x, short y, float val) : x(x), y(y), val(val) { }
25 | };
26 | 
27 | 
28 | #endif
--------------------------------------------------------------------------------
/include/params.hpp:
--------------------------------------------------------------------------------
1 | #ifndef _PARAMS_HPP_
2 | #define _PARAMS_HPP_
3 | 
4 | struct Params
5 | {
6 | 	/*
7 | 	RESTRICTIONS:
8 | 	k must be divisible by p
9 | 	*/
10 | 	unsigned int n;  // Half of the search window (in each dimension) in which similar blocks are searched
11 | 	unsigned int k;  // Width and height of a patch
12 | 	unsigned int N;  // Maximal number of similar blocks in a stack (without the reference block)
13 | 	unsigned int T;  // Distance threshold under which two blocks are assumed similar //DEV: NOT NECESSARY
14 | 	unsigned int Tn; // Distance threshold under which two blocks are assumed similar (with normalization factor)
15 | 	unsigned int p;  // Step between reference patches
16 | 	float L3D;       // Threshold in collaborative filtering under which coefficients are replaced by zeros.
17 | 
18 | 
19 | 	Params(unsigned int n = 32,
20 | 		unsigned int k = 8,
21 | 		unsigned int N = 8,
22 | 		unsigned int T = 2500,
23 | 		unsigned int p = 3,
24 | 		float L3D = 2.7f) :
25 | 		n(n), k(k), N(N-1), T(T), Tn(T*k*k), p(p), L3D(L3D) {}
26 | };
27 | 
28 | #endif
29 | 
--------------------------------------------------------------------------------
/LICENSE.md:
--------------------------------------------------------------------------------
1 | # The 2-Clause BSD License
2 | 
3 | Note: This license has also been called the "Simplified BSD License" and the "FreeBSD License".
4 | 
5 | Copyright (C) 2018 David Honzátko
6 | 
7 | Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:
8 | 
9 | 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer.
10 | 
11 | 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution.
12 | 
13 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
14 | 
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | BM3D-GPU
2 | ========
3 | 
4 | CUDA-accelerated implementation of the BM3D image denoising method
5 | 
6 | Author: David Honzátko
7 | 
8 | # Unix/Linux User Guide
9 | 
10 | The code compiles on Unix/Linux.
11 | 
12 | - Compilation.
13 |   Automated compilation requires the cmake program.
14 | 
15 | - Libraries.
16 |   This code requires the CUDA toolkit to be installed.
17 | 
18 | - Image format.
19 |   All image formats supported by the CImg library are accepted.
20 |   For users who have convert or gm installed, this covers most common image formats; otherwise we recommend using the .bmp format.
21 | 
22 | 
23 | Usage:
24 | 
25 | 1. Download the code package and extract it. Go to that directory.
26 | 
27 | 2. Create a build directory, generate the makefiles using cmake, and compile the application.
28 |    Run
29 | ```
30 | mkdir build
31 | cd build
32 | cmake ..
33 | make
34 | ```
35 | 3.
Run the CUDA-accelerated BM3D image denoising application:
36 | ```
37 | ./bm3d
38 | ```
39 | The generic way to run the code is:
40 | ```
41 | ./bm3d NoisyImage.png DenoisedImage.png sigma [color [twostep [quiet [ReferenceImage]]]]
42 | ```
43 | Options:
44 | - color - color image denoising (experimental only)
45 | - twostep - process both steps of the BM3D method
46 | - quiet - no information about the state of processing is displayed
47 | - ReferenceImage - if provided, computes and prints the PSNR between the ReferenceImage and the DenoisedImage
48 | 
49 | Example of grayscale denoising by the first step of BM3D:
50 | ```
51 | ./bm3d lena_20.png lena_den.png 20
52 | ```
53 | Example of color denoising by both steps of BM3D:
54 | ```
55 | ./bm3d lena_20_color.png lena_den_color.png 20 color twostep
56 | ```
57 | Example of grayscale denoising by both steps of BM3D with PSNR computation:
58 | ```
59 | ./bm3d lena_25.png lena_den.png 25 nocolor twostep quiet lena.png
60 | ```
61 | # Citation
62 | If you find this implementation useful, please cite the following paper in your work:
63 | 
64 | @article{bm3d-gpu,
65 | author = {Honzátko, David and Kruliš, Martin},
66 | year = {2017}, month = {11},
67 | title = {Accelerating block-matching and 3D filtering method for image denoising on GPUs},
68 | journal = {Journal of Real-Time Image Processing}
69 | }
70 | 
--------------------------------------------------------------------------------
/include/stopwatch.hpp:
--------------------------------------------------------------------------------
1 | #ifndef _STOPWATCH_HPP_
2 | #define _STOPWATCH_HPP_
3 | 
4 | 
5 | #ifdef WIN32
6 | #include
7 | #else
8 | #include
9 | #include
10 | #include
11 | #include
12 | #endif
13 | 
14 | 
15 | /**
16 |  * \brief Implementation of a high-precision wall-time stopwatch based on system timers.
17 |  */
18 | class Stopwatch
19 | {
20 | private:
21 | 	typedef unsigned long long ticks_t;
22 | 
23 | 	ticks_t mStartTime;
24 | 	double mInterval;
25 | 	bool mTiming;
26 | 
27 | 	/**
28 | 	 * \brief Get current system timer status in ticks.
29 | 	 */
30 | 	ticks_t now()
31 | 	{
32 | #ifdef WIN32
33 | 		LARGE_INTEGER ticks;
34 | 		::QueryPerformanceCounter(&ticks);
35 | 		return static_cast<ticks_t>(ticks.QuadPart);
36 | #else
37 | 		struct timespec ts;
38 | 		::clock_gettime(CLOCK_REALTIME, &ts);
39 | 		return static_cast<ticks_t>(ts.tv_sec) * 1000000000UL + static_cast<ticks_t>(ts.tv_nsec);
40 | #endif
41 | 	}
42 | 
43 | 
44 | 	/**
45 | 	 * Measure current time and update mInterval.
46 | 	 */
47 | 	void measureTime()
48 | 	{
49 | #ifdef WIN32
50 | 		LARGE_INTEGER ticks;
51 | 		::QueryPerformanceFrequency(&ticks);
52 | 		mInterval += static_cast<double>(now() - mStartTime) / static_cast<double>(ticks.QuadPart);
53 | #else
54 | 		mInterval += static_cast<double>((now() - mStartTime)*1E-9);
55 | #endif
56 | 	}
57 | 
58 | 
59 | public:
60 | 	/**
61 | 	 * \brief Create a new stopwatch. The stopwatch is not running when created.
62 | 	 */
63 | 	Stopwatch() : mTiming(false), mInterval(0.0) { }
64 | 
65 | 	/**
66 | 	 * \brief Create a new stopwatch (and optionally start it).
67 | 	 * \param start If start is true, the stopwatch is started immediately.
68 | 	 */
69 | 	Stopwatch(bool start) : mTiming(false), mInterval(0.0)
70 | 	{
71 | 		if (start) this->start();
72 | 	}
73 | 
74 | 
75 | 	/**
76 | 	 * \brief Start the stopwatch. If the stopwatch is already timing, it is reset.
77 | 	 */
78 | 	void start()
79 | 	{
80 | 		mTiming = true;
81 | 		mStartTime = now();
82 | 	}
83 | 
84 | 
85 | 	/**
86 | 	 * \brief Stop the stopwatch. Multiple invocations have no effect.
87 | */ 88 | void stop() 89 | { 90 | if (mTiming == false) return; 91 | mTiming = false; 92 | measureTime(); 93 | } 94 | 95 | /** 96 | * \brief Stop and reset the stopwatch. Multiple invocation has no effect. 97 | */ 98 | void reset() 99 | { 100 | mInterval = 0.0; 101 | if (mTiming == false) return; 102 | mTiming = false; 103 | } 104 | 105 | 106 | /** 107 | * \brief Return measured time in seconds. 108 | */ 109 | double getSeconds() 110 | { 111 | if (mTiming) 112 | measureTime(); 113 | return mInterval; 114 | } 115 | 116 | /** 117 | * \brief Return mesured time in miliseconds. 118 | */ 119 | double getMiliseconds() 120 | { 121 | return getSeconds() * 1000.0; 122 | } 123 | }; 124 | 125 | #endif 126 | -------------------------------------------------------------------------------- /src/main_nodisplay.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | 4 | #include "bm3d.hpp" 5 | #define cimg_display 0 6 | #include "CImg.h" 7 | 8 | using namespace cimg_library; 9 | 10 | int main(int argc, char** argv) 11 | { 12 | if( argc < 4 ) 13 | { 14 | std::cerr << "Usage: " << argv[0] << " NosiyImage DenoisedImage sigma [color [twostep [quiet [ReferenceImage]]]]" << std::endl; 15 | return 1; 16 | } 17 | float sigma = strtof(argv[3],NULL); 18 | 19 | unsigned int channels = 1; 20 | if (argc >= 5 && strcmp(argv[4],"color") == 0) 21 | { 22 | channels = 3; 23 | } 24 | bool twostep = false; 25 | if (argc >= 6 && strcmp(argv[5],"twostep") == 0) 26 | { 27 | twostep = true; 28 | } 29 | bool verbose = true; 30 | if (argc >= 7 && strcmp(argv[6],"quiet") == 0) 31 | { 32 | verbose = false; 33 | } 34 | 35 | if (verbose) 36 | { 37 | std::cout << "Sigma = " << sigma << std::endl; 38 | if (twostep) 39 | std::cout << "Number of Steps: 2" << std::endl; 40 | else 41 | std::cout << "Number of Steps: 1" << std::endl; 42 | 43 | if (channels > 1) 44 | std::cout << "Color denoising: yes" << std::endl; 45 | else 46 | std::cout << "Color denoising: no" << std::endl; 47 | } 48 | 49 | //Allocate images 50 | CImg image(argv[1]); 51 | CImg image2(image.width(), image.height(), 1, channels, 0); 52 | std::vector sigma2(channels); 53 | sigma2[0] = (unsigned int)(sigma * sigma); 54 | 55 | //Convert color image to YCbCr color space 56 | if (channels == 3) 57 | { 58 | image = image.get_channels(0, 2).RGBtoYCbCr(); 59 | //Convert the sigma^2 variance to the YCbCr color space 60 | long s = sigma * sigma; 61 | sigma2[0] = ((66l*66l*s + 129l*129l*s + 25l*25l*s) / (256l*256l)); 62 | sigma2[1] = ((38l*38l*s + 74l*74l*s + 112l*112l*s) / (256l*256l)), 63 | sigma2[2] = ((112l*112l*s + 94l*94l*s + 18l*18l*s) / (256l*256l)); 64 | } 65 | 66 | std::cout << "Noise variance for individual channels (YCrCb if color): "; 67 | for (unsigned int k = 0; k < sigma2.size(); k++) 68 | std::cout << sigma2[k] << " "; 69 | std::cout << std::endl; 70 | 71 | // Check for invalid input 72 | if(! 
image.data() ) 73 | { 74 | std::cerr << "Could not open or find the image" << std::endl; 75 | return 1; 76 | } 77 | 78 | if(verbose) 79 | std::cout << "width: " << image.width() << " height: " << image.height() << std::endl; 80 | 81 | //Launch BM3D 82 | try { 83 | BM3D bm3d; 84 | // (n, k,N, T, p,sigma, L3D) 85 | bm3d.set_hard_params(19,8,16,2500,3, 2.7f); 86 | bm3d.set_wien_params(19,8,32,400, 3); 87 | bm3d.set_verbose(verbose); 88 | bm3d.denoise_host_image( 89 | image.data(), 90 | image2.data(), 91 | image.width(), 92 | image.height(), 93 | channels, 94 | sigma2.data(), 95 | twostep); 96 | } 97 | catch(std::exception & e) { 98 | std::cerr << "There was an error while processing image: " << std::endl << e.what() << std::endl; 99 | return 1; 100 | } 101 | 102 | if (channels == 3) //color 103 | //Convert back to RGB color space 104 | image2 = image2.get_channels(0,2).YCbCrtoRGB(); 105 | else 106 | image2 = image2.get_channel(0); 107 | //Save denoised image 108 | image2.save( argv[2] ); 109 | 110 | if (argc >= 8) 111 | { 112 | CImg reference_image(argv[7]); 113 | std::cout << "PSNR:" << reference_image.PSNR(image2) << std::endl; 114 | } 115 | 116 | return 0; 117 | } 118 | -------------------------------------------------------------------------------- /src/dct8x8.cu: -------------------------------------------------------------------------------- 1 | #include "indices.cuh" 2 | #include "cuda_runtime.h" 3 | #include "device_launch_parameters.h" 4 | /* 5 | * Based on dct8x8_kernel2.cu provided in CUDA samples form NVIDIA Corporation. 6 | * 7 | * Provide functions to compute many 2D DCT and 2D IDCT of size 8x8 8 | */ 9 | 10 | 11 | #define C_a 1.387039845322148f //!< a = (2^0.5) * cos( pi / 16); Used in forward and inverse DCT. 12 | #define C_b 1.306562964876377f //!< b = (2^0.5) * cos( pi / 8); Used in forward and inverse DCT. 13 | #define C_c 1.175875602419359f //!< c = (2^0.5) * cos(3 * pi / 16); Used in forward and inverse DCT. 14 | #define C_d 0.785694958387102f //!< d = (2^0.5) * cos(5 * pi / 16); Used in forward and inverse DCT. 15 | #define C_e 0.541196100146197f //!< e = (2^0.5) * cos(3 * pi / 8); Used in forward and inverse DCT. 16 | #define C_f 0.275899379282943f //!< f = (2^0.5) * cos(7 * pi / 16); Used in forward and inverse DCT. 17 | 18 | 19 | /** 20 | * Normalization constant that is used in forward and inverse DCT 21 | */ 22 | #define C_norm 0.3535533905932737f // 1 / (8^0.5) 23 | 24 | #define BLOCK_SIZE 8 25 | 26 | /** 27 | * Width of macro-block 28 | */ 29 | #define KER2_BLOCK_WIDTH 128 30 | 31 | 32 | /** 33 | * Height of macro-block 34 | */ 35 | #define KER2_BLOCK_HEIGHT 8 36 | 37 | 38 | /** 39 | * Stride of shared memory buffer (2nd kernel) 40 | */ 41 | #define KER2_SMEMBLOCK_STRIDE (KER2_BLOCK_WIDTH+1) 42 | 43 | 44 | /** 45 | ************************************************************************** 46 | * Performs in-place DCT of vector of 8 elements. 
47 | * 48 | * \param Vect0 [IN/OUT] - Pointer to the first element of vector 49 | * \param Step [IN/OUT] - Value to add to ptr to access other elements 50 | * 51 | * \return None 52 | */ 53 | __device__ void InplaceDCTvector(float *Vect0, int Step) 54 | { 55 | float *Vect1 = Vect0 + Step; 56 | float *Vect2 = Vect1 + Step; 57 | float *Vect3 = Vect2 + Step; 58 | float *Vect4 = Vect3 + Step; 59 | float *Vect5 = Vect4 + Step; 60 | float *Vect6 = Vect5 + Step; 61 | float *Vect7 = Vect6 + Step; 62 | 63 | float X07P = (*Vect0) + (*Vect7); 64 | float X16P = (*Vect1) + (*Vect6); 65 | float X25P = (*Vect2) + (*Vect5); 66 | float X34P = (*Vect3) + (*Vect4); 67 | 68 | float X07M = (*Vect0) - (*Vect7); 69 | float X61M = (*Vect6) - (*Vect1); 70 | float X25M = (*Vect2) - (*Vect5); 71 | float X43M = (*Vect4) - (*Vect3); 72 | 73 | float X07P34PP = X07P + X34P; 74 | float X07P34PM = X07P - X34P; 75 | float X16P25PP = X16P + X25P; 76 | float X16P25PM = X16P - X25P; 77 | 78 | (*Vect0) = C_norm * (X07P34PP + X16P25PP); 79 | (*Vect2) = C_norm * (C_b * X07P34PM + C_e * X16P25PM); 80 | (*Vect4) = C_norm * (X07P34PP - X16P25PP); 81 | (*Vect6) = C_norm * (C_e * X07P34PM - C_b * X16P25PM); 82 | 83 | (*Vect1) = C_norm * (C_a * X07M - C_c * X61M + C_d * X25M - C_f * X43M); 84 | (*Vect3) = C_norm * (C_c * X07M + C_f * X61M - C_a * X25M + C_d * X43M); 85 | (*Vect5) = C_norm * (C_d * X07M + C_a * X61M + C_f * X25M - C_c * X43M); 86 | (*Vect7) = C_norm * (C_f * X07M + C_d * X61M + C_c * X25M + C_a * X43M); 87 | } 88 | 89 | 90 | /** 91 | ************************************************************************** 92 | * Performs in-place IDCT of vector of 8 elements. 93 | * 94 | * \param Vect0 [IN/OUT] - Pointer to the first element of vector 95 | * \param Step [IN/OUT] - Value to add to ptr to access other elements 96 | * 97 | * \return None 98 | */ 99 | __device__ void InplaceIDCTvector(float *Vect0, int Step) 100 | { 101 | float *Vect1 = Vect0 + Step; 102 | float *Vect2 = Vect1 + Step; 103 | float *Vect3 = Vect2 + Step; 104 | float *Vect4 = Vect3 + Step; 105 | float *Vect5 = Vect4 + Step; 106 | float *Vect6 = Vect5 + Step; 107 | float *Vect7 = Vect6 + Step; 108 | 109 | float Y04P = (*Vect0) + (*Vect4); 110 | float Y2b6eP = C_b * (*Vect2) + C_e * (*Vect6); 111 | 112 | float Y04P2b6ePP = Y04P + Y2b6eP; 113 | float Y04P2b6ePM = Y04P - Y2b6eP; 114 | float Y7f1aP3c5dPP = C_f * (*Vect7) + C_a * (*Vect1) + C_c * (*Vect3) + C_d * (*Vect5); 115 | float Y7a1fM3d5cMP = C_a * (*Vect7) - C_f * (*Vect1) + C_d * (*Vect3) - C_c * (*Vect5); 116 | 117 | float Y04M = (*Vect0) - (*Vect4); 118 | float Y2e6bM = C_e * (*Vect2) - C_b * (*Vect6); 119 | 120 | float Y04M2e6bMP = Y04M + Y2e6bM; 121 | float Y04M2e6bMM = Y04M - Y2e6bM; 122 | float Y1c7dM3f5aPM = C_c * (*Vect1) - C_d * (*Vect7) - C_f * (*Vect3) - C_a * (*Vect5); 123 | float Y1d7cP3a5fMM = C_d * (*Vect1) + C_c * (*Vect7) - C_a * (*Vect3) + C_f * (*Vect5); 124 | 125 | (*Vect0) = C_norm * (Y04P2b6ePP + Y7f1aP3c5dPP); 126 | (*Vect7) = C_norm * (Y04P2b6ePP - Y7f1aP3c5dPP); 127 | (*Vect4) = C_norm * (Y04P2b6ePM + Y7a1fM3d5cMP); 128 | (*Vect3) = C_norm * (Y04P2b6ePM - Y7a1fM3d5cMP); 129 | 130 | (*Vect1) = C_norm * (Y04M2e6bMP + Y1c7dM3f5aPM); 131 | (*Vect5) = C_norm * (Y04M2e6bMM - Y1d7cP3a5fMM); 132 | (*Vect2) = C_norm * (Y04M2e6bMM + Y1d7cP3a5fMM); 133 | (*Vect6) = C_norm * (Y04M2e6bMP - Y1c7dM3f5aPM); 134 | } 135 | 136 | 137 | /** 138 | ************************************************************************** 139 | * Performs 8x8 block-wise Forward Discrete Cosine Transform of the given 
140 | * image plane and outputs the result to the array of coefficients. 2nd implementation.
141 | * This kernel processes the data in macro-blocks of 8x8 blocks in order to
142 | * utilize the full warp capacity, assuming that 8 threads per 8x8 block
143 | * are sufficient.
144 | *
145 | * \param dst [OUT] - Coefficients plane
146 | * \param src [IN] - Image plane (the size argument gives the total number of elements)
147 | *
148 | * \return None
149 | */
150 | 
151 | __global__ void DCT2D8x8(float *dst, const float *src, const uint size)
152 | {
153 | 	__shared__ float block[KER2_BLOCK_HEIGHT * KER2_SMEMBLOCK_STRIDE];
154 | 
155 | 	if (blockIdx.x * KER2_BLOCK_HEIGHT * KER2_BLOCK_WIDTH + (threadIdx.y+1) * BLOCK_SIZE*BLOCK_SIZE-1 >= size) return;
156 | 
157 | 	int offset = threadIdx.y * (BLOCK_SIZE*BLOCK_SIZE) + threadIdx.x;
158 | 
159 | 	//Get macro-block address
160 | 	src += blockIdx.x * KER2_BLOCK_HEIGHT * KER2_BLOCK_WIDTH;
161 | 	dst += blockIdx.x * KER2_BLOCK_HEIGHT * KER2_BLOCK_WIDTH;
162 | 
163 | 	//8x1 blocks in one macro-block (threadIdx.y - index of block inside the macro-block)
164 | 	//Get the first element of the column in the block with index threadIdx.y
165 | 	src += offset;
166 | 	dst += offset;
167 | 
168 | 	float *bl_ptr = block + offset;
169 | 
170 | #pragma unroll
171 | 
172 | 	for (unsigned int i = 0; i < BLOCK_SIZE; i++)
173 | 		bl_ptr[i * BLOCK_SIZE] = src[i * BLOCK_SIZE]; //Load column to the shared mem
174 | 
175 | 	//process rows
176 | 	InplaceDCTvector(bl_ptr - threadIdx.x + BLOCK_SIZE * threadIdx.x, 1);
177 | 
178 | 	//process columns
179 | 	InplaceDCTvector(bl_ptr, BLOCK_SIZE);
180 | 
181 | 	for (unsigned int i = 0; i < BLOCK_SIZE; i++)
182 | 		dst[i * BLOCK_SIZE] = bl_ptr[i * BLOCK_SIZE];
183 | }
184 | 
185 | 
186 | /**
187 | **************************************************************************
188 | * Performs 8x8 block-wise Inverse Discrete Cosine Transform of the given
189 | * coefficients plane and outputs the result to the image. 2nd implementation.
190 | * This kernel processes the data in macro-blocks of 8x8 blocks in order to
191 | * utilize the full warp capacity, assuming that 8 threads per 8x8 block
192 | * are sufficient.
193 | * 194 | * \param SrcDst [OUT] - Coefficients plane 195 | * \param ImgStride [IN] - Stride of SrcDst 196 | * 197 | * \return None 198 | */ 199 | 200 | __global__ void IDCT2D8x8(float *dst, const float *src, const uint size) 201 | { 202 | __shared__ float block[KER2_BLOCK_HEIGHT * KER2_SMEMBLOCK_STRIDE]; 203 | 204 | if (blockIdx.x * KER2_BLOCK_HEIGHT * KER2_BLOCK_WIDTH + (threadIdx.y+1) * BLOCK_SIZE*BLOCK_SIZE-1 >= size) return; 205 | 206 | int offset = threadIdx.y * (BLOCK_SIZE*BLOCK_SIZE) + threadIdx.x; 207 | 208 | src += blockIdx.x * KER2_BLOCK_HEIGHT * KER2_BLOCK_WIDTH; 209 | dst += blockIdx.x * KER2_BLOCK_HEIGHT * KER2_BLOCK_WIDTH; 210 | 211 | src += offset; 212 | dst += offset; 213 | 214 | float *bl_ptr = block + offset; 215 | 216 | #pragma unroll 217 | 218 | for (unsigned int i = 0; i < BLOCK_SIZE; i++) 219 | bl_ptr[i * BLOCK_SIZE] = src[i * BLOCK_SIZE]; 220 | 221 | //process rows 222 | InplaceIDCTvector(bl_ptr - threadIdx.x + BLOCK_SIZE * threadIdx.x, 1); 223 | 224 | //process columns 225 | InplaceIDCTvector(bl_ptr, BLOCK_SIZE); 226 | 227 | for (unsigned int i = 0; i < BLOCK_SIZE; i++) 228 | dst[i * BLOCK_SIZE] = bl_ptr[i * BLOCK_SIZE]; 229 | } 230 | 231 | extern "C" void run_DCT2D8x8( 232 | float *transformed_stacks, 233 | const float *gathered_stacks, 234 | const uint size, 235 | const dim3 num_threads, 236 | const dim3 num_blocks) 237 | { 238 | DCT2D8x8<<>>(transformed_stacks, gathered_stacks, size); 239 | } 240 | 241 | extern "C" void run_IDCT2D8x8( 242 | float *gathered_stacks, 243 | const float *transformed_stacks, 244 | const uint size, 245 | const dim3 num_threads, 246 | const dim3 num_blocks) 247 | { 248 | IDCT2D8x8<<>>(gathered_stacks, transformed_stacks, size); 249 | } 250 | -------------------------------------------------------------------------------- /src/blockmatching.cu: -------------------------------------------------------------------------------- 1 | #include "params.hpp" 2 | #include "indices.cuh" 3 | 4 | #include 5 | #include 6 | #include 7 | #include 8 | 9 | 10 | // Nearest lower power of 2 11 | __device__ __inline__ uint flp2 (uint x) 12 | { 13 | return (0x80000000u >> __clz(x)); 14 | } 15 | 16 | //Computes the squared difference between two numbers 17 | template 18 | __device__ __inline__ T L2p2(const T i1, const T i2) 19 | { 20 | T diff = i1 - i2; 21 | return diff*diff; 22 | } 23 | 24 | /* 25 | Adds new patch to patch stack (only N most similar are kept) 26 | Note: Stack is just an array, not FIFO 27 | */ 28 | __device__ 29 | void add_to_matched_image( 30 | uint *stack, //IN/OUT: Stack of N patches matched to current reference patch 31 | uchar *num_patches_in_stack,//IN/OUT: Number of patches in stack 32 | const uint value, //IN: [..DIFF(ushort)..|..LOC_Y(sbyte)..|..LOC_X(sbyte)..] 
33 | const Params & params //IN: Denoising parameters 34 | ) 35 | { 36 | //stack[*num_patches_in_stack-1] is most similar (lowest number) 37 | int k; 38 | 39 | uchar num = (*num_patches_in_stack); 40 | if (num < params.N) //add new value 41 | { 42 | k = num++; 43 | while(k > 0 && value > stack[k-1]) 44 | { 45 | stack[k] = stack[k-1]; 46 | --k; 47 | } 48 | 49 | stack[k] = value; 50 | *num_patches_in_stack = num; 51 | } 52 | else if (value >= stack[0]) 53 | return; 54 | else //delete highest value and add new 55 | { 56 | k = 1; 57 | while (k < params.N && value < stack[k]) 58 | { 59 | stack[k-1] = stack[k]; 60 | k++; 61 | } 62 | stack[k-1] = value; 63 | } 64 | } 65 | 66 | /* 67 | Block-matching algorithm 68 | For each processed reference patch it finds maximaly N similar patches that pass the distance threshold and stores them to the g_stacks array. 69 | It also returns the number of them for each reference patch in g_num_patches_in_stack. 70 | Used denoising parameters: n,k,N,T,p 71 | Division: Kernel handles gridDim.y lines starting with the line passed in argument. Each block handles warpSize reference patches in line. 72 | Each thread process one reference patch. All the warps of a block process the same reference patches. 73 | */ 74 | __global__ 75 | void block_matching( 76 | const uchar* __restrict image, //IN: Original image 77 | ushort* g_stacks, //OUT: For each reference patch contains addresses of similar patches (patch is adressed by top left corner) [..LOC_Y(sbyte)..|..LOC_X(sbyte)..] 78 | uint* g_num_patches_in_stack, //OUT: For each reference patch contains number of similar patches 79 | const uint2 image_dim, //IN: Image dimensions 80 | const uint2 stacks_dim, //IN: Size of area, where reference patches could be located 81 | const Params params, //IN: Denoising parameters 82 | const uint2 start_point) //IN: Address of the top-left reference patch of a batch 83 | { 84 | //One block is processing warpSize patches (because each warp is computing distance of same warpSize patches from different displaced patches) 85 | int tid = threadIdx.x % warpSize; 86 | int wid = threadIdx.x / warpSize; 87 | int num_warps = blockDim.x/warpSize; 88 | 89 | //p_block denotes reference rectangle on which current cuda block is computing 90 | uint p_rectangle_width = ((warpSize-1) * params.p) + params.k; 91 | uint p_rectangle_start = start_point.x + blockIdx.x * warpSize * params.p; 92 | 93 | //Shared arrays 94 | extern __shared__ uint s_data[]; 95 | uint *s_diff = (uint*)&s_data; //SIZE: p_rectangle_width*num_warps 96 | uint *s_stacks = (uint*)&s_data[p_rectangle_width*num_warps]; //SIZE: params.N*num_warps*warpSize 97 | uchar *s_patches_in_stack = (uchar*)&s_data[num_warps*(p_rectangle_width + params.N*warpSize)]; //SIZE: num_warps*warpSize 98 | uchar *s_image_p = (uchar*)&s_patches_in_stack[num_warps*warpSize]; //SIZE: p_rectangle_width*params.k 99 | 100 | s_diff += idx2(0, wid, p_rectangle_width); 101 | 102 | //Initialize s_patches_in_stack to zero 103 | s_patches_in_stack[ idx2(tid, wid, warpSize) ] = 0; 104 | 105 | int2 p; //Address of reference patch 106 | int2 q; //Address of patch against which the difference is computed 107 | 108 | p.x = p_rectangle_start + (tid*params.p); 109 | p.y = start_point.y + (blockIdx.y*params.p); 110 | 111 | //Ensure, that the bottom most patches will be taken as reference patches regardless the p parameter. 
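	// Worked example of the border clamping below (illustrative values): with stacks_dim.y = 500 and
	// p = 3, a thread whose computed reference row is 500 or 501 falls into [stacks_dim.y, stacks_dim.y + p - 1)
	// and is clamped to row 499, so the bottom border is still covered; rows of 502 and beyond simply return.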
112 | if (p.y >= stacks_dim.y && p.y < stacks_dim.y + params.p - 1) 113 | p.y = stacks_dim.y - 1; 114 | else if (p.y >= stacks_dim.y) return; 115 | 116 | //Ensure, that the right most patches will be taken as reference patches regardless the p parameter. 117 | uint inner_p_x = tid*params.p; 118 | if (p.x >= stacks_dim.x && p.x < stacks_dim.x + params.p - 1) 119 | { 120 | inner_p_x -= (p.x - (stacks_dim.x - 1)); 121 | p.x = stacks_dim.x - 1; 122 | } 123 | 124 | //Load reference patches needed by actual block to shared memory 125 | for(int i = threadIdx.x; i < p_rectangle_width*params.k; i+=blockDim.x) 126 | { 127 | int sx = i % p_rectangle_width; 128 | int sy = i / p_rectangle_width; 129 | if (p_rectangle_start+sx >= image_dim.x) continue; 130 | s_image_p[i] = image[idx2(p_rectangle_start+sx,p.y+sy,image_dim.x)]; 131 | } 132 | 133 | __syncthreads(); 134 | 135 | //scale difference so that it can fit ushort 136 | uint shift = (__clz(params.Tn) < 16u) ? 16u - (uint)__clz(params.Tn) : 0; 137 | 138 | 139 | //Ensure that displaced patch coordinates (q) will be positive 140 | int2 from; 141 | from.y = (p.y - (int)params.n < 0) ? -p.y : -(int)params.n; 142 | from.x = (((int)p_rectangle_start) - (int)params.n < 0) ? -((int)p_rectangle_start) : -(int)params.n; 143 | from.x += wid; 144 | 145 | //For each displacement (x,y) in n neighbourhood 146 | for(int y = from.y; y <= (int)params.n; ++y) 147 | { 148 | q.y = p.y + y; 149 | if (q.y >= stacks_dim.y) break; 150 | 151 | for(int x = from.x; x <= (int)params.n; x += num_warps) 152 | { 153 | //Reference patch is always the most similar to itself (there is no need to copute it) 154 | if (x == 0 && y == 0) continue; 155 | 156 | //Each warp is computing the same patch with slightly different displacement. 157 | //Compute distance of reference patch p from current patch q which is dispaced by (x+tid,y) 158 | 159 | //q_block denotes displaced rectangle which is processed by the current warp 160 | uint q_rectangle_start = p_rectangle_start + x; 161 | q.x = q_rectangle_start + inner_p_x; 162 | 163 | //Compute distance for each column of reference patch 164 | for(uint i = tid; i < p_rectangle_width && p_rectangle_start+i < image_dim.x && q_rectangle_start+i < image_dim.x; i+=warpSize) 165 | { 166 | uint dist = 0; 167 | for(uint iy = 0; iy < params.k; ++iy) 168 | { 169 | dist += L2p2((int)s_image_p[ idx2(i, iy, p_rectangle_width) ], (int)image[ idx2(q_rectangle_start+i, q.y+iy, image_dim.x) ]); 170 | } 171 | s_diff[i] = dist; 172 | } 173 | 174 | if (p.x >= stacks_dim.x || q.x >= stacks_dim.x) continue; 175 | 176 | //Sum column distances to obtain patch distance 177 | uint diff = 0; 178 | for (uint i = 0; i < params.k; ++i) 179 | diff += s_diff[inner_p_x + i]; 180 | 181 | //Distance threshold 182 | if(diff < params.Tn) 183 | { 184 | uint loc_y = (uint)((q.y - p.y) & 0xFF); //relative location y (-127 to 127) 185 | uint loc_x = (uint)((q.x - p.x) & 0xFF); //relative location x (-127 to 127) 186 | diff >>= shift; 187 | diff <<= 16u; // [..DIFF(ushort)..|..LOC_Y(sbyte)..|..LOC_X(sbyte)..] 
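					// After the two ORs below, the scaled distance occupies the upper 16 bits and the relative
					// (y,x) offsets the lower 16 bits, so comparing the packed 32-bit words orders candidates by
					// distance first; get_block() and aggregate_block() in filtering.cu later recover the offsets
					// as (signed char)(value & 0xFF) and (signed char)((value >> 8) & 0xFF).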
188 | diff |= (loc_y << 8u); 189 | diff |= loc_x; 190 | 191 | //Add current patch to s_stacks 192 | add_to_matched_image( 193 | &s_stacks[ params.N * idx2(tid, wid, warpSize) ], 194 | &s_patches_in_stack[ idx2(tid, wid, warpSize) ], 195 | diff, 196 | params 197 | ); 198 | } 199 | } 200 | } 201 | 202 | __syncthreads(); 203 | 204 | uint batch_size = gridDim.x*warpSize; 205 | uint block_address_x = blockIdx.x*warpSize+tid; 206 | 207 | if (wid > 0) return; 208 | //Select N most similar patches for each reference patch from stacks in shared memory and save them to global memory 209 | //Each thread represents one reference patch 210 | //Each thread will find N most similar blocks in num_warps stacks (which were computed by different warps) and save them into global memory 211 | //In shared memory the most similar patch is at the end, in global memory the order does not matter 212 | //DEV: performance impact cca 8% 213 | if (p.x >= stacks_dim.x) return; 214 | 215 | int j; 216 | for (j = 0; j < params.N; ++j) 217 | { 218 | uint count = 0; 219 | uint minIdx = 0; 220 | uint minVal = 0xFFFFFFFF; //INF 221 | 222 | //Finds patch with minimal value of remaining 223 | for (int i = minIdx; i < num_warps; ++i) 224 | { 225 | count = (uint)s_patches_in_stack[ idx2(tid, i, warpSize) ]; 226 | if (count == 0) continue; 227 | 228 | uint newMinVal = s_stacks[ idx3(count-1,tid,i,params.N,warpSize) ]; 229 | if (newMinVal < minVal) 230 | { 231 | minVal = newMinVal; 232 | minIdx = i; 233 | } 234 | } 235 | if (minVal == 0xFFFFFFFF) break; //All stacks are empty 236 | 237 | //Remove patch from shared stack 238 | s_patches_in_stack[ idx2(tid, minIdx, warpSize) ]--; 239 | 240 | //Adds patch to stack in global memory 241 | g_stacks[idx3(j, block_address_x, blockIdx.y, params.N, batch_size)] = (ushort)(minVal & 0xFFFF); 242 | } 243 | //Save to the global memory the number of similar patches rounded to the nearest lower power of two 244 | g_num_patches_in_stack[ idx2(block_address_x ,blockIdx.y, batch_size) ] = flp2((uint)j+1)-1; 245 | } 246 | 247 | 248 | extern "C" void run_block_matching( 249 | const uchar* __restrict image, //Original image 250 | ushort* stacks, //For each reference patch contains addresses of similar patches (patch is adressed by top left corner) 251 | uint* num_patches_in_stack, //For each reference patch contains number of similar patches 252 | const uint2 image_dim, //Image dimensions 253 | const uint2 stacks_dim, //size of area where reference patches could be located 254 | const Params params, //Denoising parameters 255 | const uint2 start_point, //Address of the top-left reference patch of a batch 256 | const dim3 num_threads, 257 | const dim3 num_blocks, 258 | const uint shared_memory_size 259 | ) 260 | { 261 | block_matching<<>>( 262 | image, 263 | stacks, 264 | num_patches_in_stack, 265 | image_dim, 266 | stacks_dim, 267 | params, 268 | start_point 269 | ); 270 | } 271 | -------------------------------------------------------------------------------- /src/filtering.cu: -------------------------------------------------------------------------------- 1 | #include "indices.cuh" 2 | #include "params.hpp" 3 | 4 | #include 5 | #include 6 | #include 7 | #include 8 | 9 | #include 10 | 11 | // Kernels used for collaborative filtering and aggregation 12 | 13 | //Sum the passed values in a warp to the first thread of this warp. 
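// How the shuffle-based reduction below works: in each step every lane adds the value held by the lane
// `offset` positions above it (__shfl_down_sync with a full 0xffffffff mask), and the offset is halved
// (16, 8, 4, 2, 1 for a 32-thread warp), so after log2(warpSize) steps lane 0 holds the sum of all inputs.
// Only the value returned in lane 0 is meaningful; the remaining lanes hold partial sums.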
14 | template 15 | __device__ inline T warpReduceSum(T val) 16 | { 17 | for (int offset = warpSize/2; offset > 0; offset /= 2) 18 | val += __shfl_down_sync(0xffffffff, val, offset); 19 | return val; 20 | } 21 | 22 | 23 | //Sum the passed values in a block to the first thread of a block. 24 | template 25 | __inline__ __device__ float blockReduceSum(T* shared, T val, int tid, int tcount) 26 | { 27 | int lane = tid % warpSize; 28 | int wid = tid / warpSize; 29 | 30 | val = warpReduceSum(val); // Each warp performs partial reduction 31 | 32 | if (lane==0) shared[wid]=val; // Write reduced value to shared memory 33 | 34 | __syncthreads(); // Wait for all partial reductions 35 | 36 | //read from shared memory only if that warp existed 37 | val = (tid < tcount / warpSize) ? shared[lane] : 0; 38 | 39 | if (wid==0) val = warpReduceSum(val); //Final reduce within first warp 40 | 41 | return val; 42 | } 43 | 44 | //Returns absolute value of the passed real number raised to the power of two 45 | __device__ __forceinline__ 46 | float abspow2(float & a) 47 | { 48 | return a * a; 49 | } 50 | 51 | 52 | //Integer logarithm base 2. 53 | template 54 | __device__ __inline__ uint ilog2(IntType n) 55 | { 56 | uint l; 57 | for (l = 0; n; n >>= 1, ++l); 58 | return l; 59 | } 60 | 61 | 62 | //Orthogonal transformation. 63 | template 64 | __device__ __inline__ void rotate(T& a, T& b) 65 | { 66 | T tmp; 67 | tmp = a; 68 | a = tmp + b; 69 | b = tmp - b; 70 | } 71 | 72 | 73 | //Fast Walsh-Hadamard transform. 74 | template 75 | __device__ __inline__ void fwht(T *data, uint n) 76 | { 77 | unsigned l2 = ilog2(n) - 1; 78 | for ( uint i = 0; i < l2; ++i ) 79 | { 80 | for (uint j = 0; j < n; j += (1 << (i + 1))) 81 | for (uint k = 0; k < (uint)(1 << i); ++k) 82 | rotate(data[j + k], data[j + k + (uint)(1 << i)]); 83 | } 84 | } 85 | 86 | //Based on blockIdx it computes the addresses to the arrays in global memory 87 | __device__ inline void get_block_addresses( 88 | const uint2 & start_point, //IN: first reference patch of a batch 89 | const uint & patch_stack_size, //IN: maximal size of a 3D group 90 | const uint2 & stacks_dim, //IN: Size of area, where reference patches could be located 91 | const Params & params, //IN: Denoising parameters 92 | uint2 & outer_address, //OUT: Coordinetes of reference patch in the image 93 | uint & start_idx) //OUT: Address of a first element of the 3D group in stacks array 94 | { 95 | //One block handles one patch_stack, data are in array one after one. 96 | start_idx = patch_stack_size * idx2(blockIdx.x,blockIdx.y,gridDim.x); 97 | 98 | outer_address.x = start_point.x + (blockIdx.x * params.p); 99 | outer_address.y = start_point.y + (blockIdx.y * params.p); 100 | 101 | //Ensure, that the bottom most patches will be taken as reference patches regardless the p parameter. 102 | if (outer_address.y >= stacks_dim.y && outer_address.y < stacks_dim.y + params.p - 1) 103 | outer_address.y = stacks_dim.y - 1; 104 | //Ensure, that the right most patches will be taken as reference patches regardless the p parameter. 
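	// This clamping mirrors the reference-patch clamping in block_matching() (blockmatching.cu), so the
	// filtering kernels visit exactly the same set of reference patches that block matching produced.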
105 | if (outer_address.x >= stacks_dim.x && outer_address.x < stacks_dim.x + params.p - 1) 106 | outer_address.x = stacks_dim.x - 1; 107 | } 108 | 109 | /* 110 | Gather patches form image based on matching stored in 3D array stacks 111 | Used parameters: p,k,N 112 | Division: One block handles one patch_stack, threads match to the pixels of a patch 113 | */ 114 | __global__ 115 | void get_block( 116 | const uint2 start_point, //IN: first reference patch of a batch 117 | const uchar* __restrict image, //IN: image 118 | const ushort* __restrict stacks, //IN: array of adresses of similar patches 119 | const uint* __restrict g_num_patches_in_stack, //IN: numbers of patches in 3D groups 120 | float* patch_stack, //OUT: assembled 3D groups 121 | const uint2 image_dim, //IN: image dimensions 122 | const uint2 stacks_dim, //IN: dimensions limiting addresses of reference patches 123 | const Params params) //IN: denoising parameters 124 | { 125 | 126 | 127 | uint startidx; 128 | uint2 outer_address; 129 | get_block_addresses(start_point, params.k*params.k*(params.N+1), stacks_dim, params, outer_address, startidx); 130 | 131 | if (outer_address.x >= stacks_dim.x || outer_address.y >= stacks_dim.y) return; 132 | 133 | patch_stack += startidx; 134 | 135 | const ushort* z_ptr = &stacks[ idx3(0, blockIdx.x, blockIdx.y, params.N, gridDim.x) ]; 136 | 137 | uint num_patches = g_num_patches_in_stack[ idx2(blockIdx.x, blockIdx.y, gridDim.x) ]; 138 | 139 | patch_stack[ idx3(threadIdx.x, threadIdx.y, 0, params.k, params.k) ] = (float)(image[ idx2(outer_address.x+threadIdx.x, outer_address.y+threadIdx.y, image_dim.x)]); 140 | for(uint i = 0; i < num_patches; ++i) 141 | { 142 | int x = (int)((signed char)(z_ptr[i] & 0xFF)); 143 | int y = (int)((signed char)((z_ptr[i] >> 8) & 0xFF)); 144 | patch_stack[ idx3(threadIdx.x, threadIdx.y, i+1, params.k, params.k) ] = (float)(image[ idx2(outer_address.x+x+threadIdx.x, outer_address.y+y+threadIdx.y, image_dim.x)]); 145 | } 146 | } 147 | 148 | /* 149 | 1) Do the Walsh-Hadamard 1D transform on the z axis of 3D stack. 150 | 2) Treshold every pixel and count the number of non-zero coefficients 151 | 3) Do the inverse Walsh-Hadamard 1D transform on the z axis of 3D stack. 152 | Used parameters: L3D,N,k,p 153 | Division: Each block delas with one transformed patch stack. (number of threads in block should be k*k) 154 | */ 155 | __global__ 156 | void hard_treshold_block( 157 | const uint2 start_point, //IN: first reference patch of a batch 158 | float* patch_stack, //IN/OUT: 3D groups with thransfomed patches 159 | float* w_P, //OUT: weight of each 3D group 160 | const uint* __restrict g_num_patches_in_stack, //IN: numbers of patches in 3D groups 161 | uint2 stacks_dim, //IN: dimensions limiting addresses of reference patches 162 | const Params params, //IN: denoising parameters 163 | const uint sigma //IN: noise variance 164 | ) 165 | { 166 | extern __shared__ float data[]; 167 | 168 | int paramN = params.N+1; 169 | uint tcount = blockDim.x*blockDim.y; 170 | uint tid = idx2(threadIdx.x, threadIdx.y, blockDim.x); 171 | uint patch_stack_size = tcount * paramN; 172 | 173 | uint startidx; 174 | uint2 outer_address; 175 | get_block_addresses(start_point, patch_stack_size, stacks_dim, params, outer_address, startidx); 176 | 177 | if (outer_address.x >= stacks_dim.x || outer_address.y >= stacks_dim.y) return; 178 | 179 | uint num_patches = g_num_patches_in_stack[ idx2(blockIdx.x, blockIdx.y, gridDim.x) ]+1; //+1 for the reference patch. 
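	// Shared-memory layout used below: each of the k*k threads owns a contiguous run of (num_patches + 1)
	// floats holding the z-column of its pixel. The extra +1 of padding makes the per-thread stride odd in
	// the common case, and therefore coprime with the 32 shared-memory banks (e.g. a stride of 8 would map
	// threads tid and tid+4 to the same bank, while a stride of 9 avoids such conflicts).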
180 | float* s_patch_stack = data + (tid * (num_patches+1)); //+1 for avoiding bank conflicts //TODO:sometimes 181 | patch_stack = patch_stack + startidx + tid; 182 | 183 | //Load to the shared memory 184 | for(uint i = 0; i < num_patches; ++i) 185 | s_patch_stack[i] = patch_stack[ i*tcount ]; 186 | 187 | //1D Transform 188 | fwht(s_patch_stack, num_patches); 189 | 190 | //Hard-thresholding + counting of nonzero coefficients 191 | uint nonzero = 0; 192 | float threshold = params.L3D * sqrtf((float)(num_patches * sigma)); 193 | for(int i = 0; i < num_patches; ++i) 194 | { 195 | if (fabsf(s_patch_stack[ i ]) < threshold) 196 | { 197 | s_patch_stack[ i ] = 0.0f; 198 | } 199 | else 200 | ++nonzero; 201 | } 202 | 203 | //Inverse 1D Transform 204 | fwht(s_patch_stack, num_patches); 205 | 206 | //Normalize and save to global memory 207 | for (uint i = 0; i < num_patches; ++i) 208 | { 209 | patch_stack[ i*tcount ] = s_patch_stack[i] / num_patches; 210 | } 211 | 212 | //Reuse the shared memory for 32 partial sums 213 | __syncthreads(); 214 | uint* shared = (uint*)data; 215 | //Sum the number of non-zero coefficients for a 3D group 216 | nonzero = blockReduceSum(shared, nonzero, tid, tcount); 217 | 218 | //Save the weight of a 3D group (1/nonzero coefficients) 219 | if (tid == 0) 220 | { 221 | if (nonzero < 1) nonzero = 1; 222 | w_P[ idx2(blockIdx.x, blockIdx.y, gridDim.x ) ] = 1.0f/(float)nonzero; 223 | } 224 | } 225 | 226 | /* 227 | Fills two buffers: numerator and denominator in order to compute weighted average of pixels 228 | Used parameters: k,N,p 229 | Division: Each block delas with one transformed patch stack. 230 | */ 231 | __global__ 232 | void aggregate_block( 233 | const uint2 start_point, //IN: first reference patch of a batch 234 | const float* __restrict patch_stack, //IN: 3D groups with thransfomed patches 235 | const float* __restrict w_P, //IN: weight for each 3D group 236 | const ushort* __restrict stacks, //IN: array of adresses of similar patches 237 | const float* __restrict kaiser_window, //IN: kaiser window 238 | float* numerator, //IN/OUT: numerator aggregation buffer (have to be initialized to 0) 239 | float* denominator, //IN/OUT: denominator aggregation buffer (have to be initialized to 0) 240 | const uint* __restrict g_num_patches_in_stack, //IN: numbers of patches in 3D groups 241 | const uint2 image_dim, //IN: image dimensions 242 | const uint2 stacks_dim, //IN: dimensions limiting addresses of reference patches 243 | const Params params //IN: denoising parameters 244 | ) 245 | { 246 | uint startidx; 247 | uint2 outer_address; 248 | get_block_addresses(start_point, params.k*params.k*(params.N+1), stacks_dim, params, outer_address, startidx); 249 | 250 | if (outer_address.x >= stacks_dim.x || outer_address.y >= stacks_dim.y) return; 251 | 252 | patch_stack += startidx; 253 | 254 | uint num_patches = g_num_patches_in_stack[ idx2(blockIdx.x, blockIdx.y, gridDim.x) ]+1; 255 | 256 | float wp = w_P[ idx2(blockIdx.x, blockIdx.y, gridDim.x ) ]; 257 | 258 | const ushort* z_ptr = &stacks[ idx3(0, blockIdx.x, blockIdx.y, params.N, gridDim.x) ]; 259 | 260 | float kaiser_value = kaiser_window[ idx2(threadIdx.x, threadIdx.y, params.k) ]; 261 | 262 | for(uint z = 0; z < num_patches; ++z) 263 | { 264 | int x = 0; 265 | int y = 0; 266 | if (z > 0) { 267 | x = (int)((signed char)(z_ptr[z-1] & 0xFF)); 268 | y = (int)((signed char)((z_ptr[z-1] >> 8) & 0xFF)); 269 | } 270 | 271 | float value = ( patch_stack[ idx3(threadIdx.x, threadIdx.y, z, params.k, params.k) ]); 272 | int idx = 
idx2(outer_address.x + x + threadIdx.x, outer_address.y + y + threadIdx.y, image_dim.x); 273 | atomicAdd(numerator + idx, value * kaiser_value * wp); 274 | atomicAdd(denominator + idx, kaiser_value * wp); 275 | } 276 | } 277 | 278 | /* 279 | Divide numerator with denominator and round result to image_o 280 | */ 281 | __global__ 282 | void aggregate_final( 283 | const float* __restrict numerator, //IN: numerator aggregation buffer 284 | const float* __restrict denominator, //IN: denominator aggregation buffer 285 | const uint2 image_dim, //IN: image dimensions 286 | uchar* image_o) //OUT: image estimate 287 | { 288 | uint idx = blockIdx.x * blockDim.x + threadIdx.x; 289 | uint idy = blockIdx.y * blockDim.y + threadIdx.y; 290 | if (idx >= image_dim.x || idy >= image_dim.y) return; 291 | 292 | int value = lrintf(numerator[ idx2(idx,idy,image_dim.x) ] / denominator[ idx2(idx,idy,image_dim.x) ] ); 293 | if (value < 0) value = 0; 294 | if (value > 255) value = 255; 295 | image_o[ idx2(idx,idy,image_dim.x) ] = (uchar)value; 296 | } 297 | 298 | 299 | /* 300 | Calculate the wiener coefficients 301 | */ 302 | __global__ 303 | void wiener_filtering( 304 | const uint2 start_point, //IN: first reference patch of a batch 305 | float* patch_stack, //IN/OUT: 3D groups with thransfomed nosiy patches 306 | const float* __restrict patch_stack_basic, //IN: 3D groups with thransfomed patches of the basic estimate 307 | float* w_P, //OUT: Weigths of 3D groups 308 | const uint* __restrict g_num_patches_in_stack, //IN: numbers of patches in 3D groups 309 | uint2 stacks_dim, //IN: dimensions limiting addresses of reference patches 310 | const Params params, //IN: denoising parameters 311 | const uint sigma //IN: Noise variance 312 | ) 313 | { 314 | extern __shared__ float data[]; 315 | 316 | int paramN = params.N+1; 317 | uint tcount = blockDim.x*blockDim.y; 318 | uint tid = idx2(threadIdx.x, threadIdx.y, blockDim.x); 319 | uint patch_stack_size = tcount * paramN; 320 | 321 | uint startidx; 322 | uint2 outer_address; 323 | get_block_addresses(start_point, patch_stack_size, stacks_dim, params, outer_address, startidx); 324 | 325 | if (outer_address.x >= stacks_dim.x || outer_address.y >= stacks_dim.y) return; 326 | 327 | uint num_patches = g_num_patches_in_stack[ idx2(blockIdx.x, blockIdx.y, gridDim.x) ]+1; 328 | 329 | float* s_patch_stack_basic = data + (tid * (num_patches+1)); //+1 for avoiding bank conflicts 330 | float* s_patch_stack = s_patch_stack_basic + (tcount * (num_patches+1)); //+1 for avoiding bank conflicts 331 | 332 | patch_stack = patch_stack + startidx + tid; 333 | patch_stack_basic = patch_stack_basic + startidx + tid; 334 | 335 | //Load to the shared memory 336 | for(uint i = 0; i < num_patches; ++i) 337 | { 338 | s_patch_stack[i] = patch_stack[ i*tcount ]; 339 | s_patch_stack_basic[i] = patch_stack_basic[ i*tcount ]; 340 | } 341 | 342 | //1D Transforms 343 | fwht(s_patch_stack, num_patches); 344 | fwht(s_patch_stack_basic, num_patches); 345 | 346 | float normcoeff = 1.0f/((float)num_patches); 347 | 348 | //Wiener filtering 349 | float wien_sum = 0.0f; 350 | for(int i = 0; i < num_patches; ++i) 351 | { 352 | float wien = abspow2(s_patch_stack_basic[i]) * normcoeff; 353 | wien /= (wien + sigma); 354 | s_patch_stack[i] *= wien * normcoeff; 355 | wien_sum += wien*wien; 356 | } 357 | 358 | //1D inverse transform 359 | fwht(s_patch_stack, num_patches); 360 | 361 | //Save to global memory 362 | for(uint i = 0; i < num_patches; ++i) 363 | { 364 | patch_stack[ i*tcount ] = s_patch_stack[i]; 365 | } 366 | 
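	// The per-group weight computed below is the reciprocal of the squared L2 norm of the Wiener
	// coefficients, w_P = 1 / sum_i(wien_i^2). aggregate_block() multiplies every pixel of the group by
	// this weight, which corresponds to the usual BM3D inverse-variance weighting: the residual noise
	// variance of a Wiener-filtered group grows with the squared norm of its coefficients, so such
	// groups contribute less to the final weighted average.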
367 | __syncthreads(); 368 | //reuse of the shared memory for 32 partial sums 369 | float* shared = (float*)data; 370 | 371 | //Sum all wiener coefficients 372 | wien_sum = blockReduceSum(shared, wien_sum, tid, tcount); 373 | 374 | //Save inverse L2 norm powered by two of wien_coef to global memory 375 | if (tid == 0) 376 | { 377 | if (wien_sum > 0.0) 378 | w_P[ idx2(blockIdx.x, blockIdx.y, gridDim.x ) ] = 1.0f/wien_sum; 379 | else 380 | w_P[ idx2(blockIdx.x, blockIdx.y, gridDim.x ) ] = 1.0f; 381 | } 382 | } 383 | 384 | extern "C" void run_get_block( 385 | const uint2 start_point, 386 | const uchar* __restrict image, 387 | const ushort* __restrict stacks, 388 | const uint* __restrict num_patches_in_stack, 389 | float* patch_stack, 390 | const uint2 image_dim, 391 | const uint2 stacks_dim, 392 | const Params params, 393 | const dim3 num_threads, 394 | const dim3 num_blocks) 395 | { 396 | get_block<<>>( 397 | start_point, 398 | image, 399 | stacks, 400 | num_patches_in_stack, 401 | patch_stack, 402 | image_dim, 403 | stacks_dim, 404 | params 405 | ); 406 | } 407 | 408 | extern "C" void run_hard_treshold_block( 409 | const uint2 start_point, 410 | float* patch_stack, 411 | float* w_P, 412 | const uint* __restrict num_patches_in_stack, 413 | const uint2 stacks_dim, 414 | const Params params, 415 | const uint sigma, 416 | const dim3 num_threads, 417 | const dim3 num_blocks, 418 | const uint shared_memory_size) 419 | { 420 | hard_treshold_block<<>>( 421 | start_point, 422 | patch_stack, 423 | w_P, 424 | num_patches_in_stack, 425 | stacks_dim, 426 | params, 427 | sigma 428 | ); 429 | } 430 | 431 | extern "C" void run_aggregate_block( 432 | const uint2 start_point, 433 | const float* __restrict patch_stack, 434 | const float* __restrict w_P, 435 | const ushort* __restrict stacks, 436 | const float* __restrict kaiser_window, 437 | float* numerator, 438 | float* denominator, 439 | const uint* __restrict num_patches_in_stack, 440 | const uint2 image_dim, 441 | const uint2 stacks_dim, 442 | const Params params, 443 | const dim3 num_threads, 444 | const dim3 num_blocks) 445 | { 446 | aggregate_block<<>>( 447 | start_point, 448 | patch_stack, 449 | w_P, 450 | stacks, 451 | kaiser_window, 452 | numerator, 453 | denominator, 454 | num_patches_in_stack, 455 | image_dim, 456 | stacks_dim, 457 | params 458 | ); 459 | } 460 | 461 | extern "C" void run_aggregate_final( 462 | const float* __restrict numerator, 463 | const float* __restrict denominator, 464 | const uint2 image_dim, 465 | uchar* denoised_image, 466 | const dim3 num_threads, 467 | const dim3 num_blocks 468 | ) 469 | { 470 | aggregate_final<<>>( 471 | numerator, 472 | denominator, 473 | image_dim, 474 | denoised_image 475 | ); 476 | } 477 | 478 | extern "C" void run_wiener_filtering( 479 | const uint2 start_point, 480 | float* patch_stack, 481 | const float* __restrict patch_stack_basic, 482 | float* w_P, 483 | const uint* __restrict num_patches_in_stack, 484 | uint2 stacks_dim, 485 | const Params params, 486 | const uint sigma, 487 | const dim3 num_threads, 488 | const dim3 num_blocks, 489 | const uint shared_memory_size 490 | ) 491 | { 492 | wiener_filtering<<>>( 493 | start_point, 494 | patch_stack, 495 | patch_stack_basic, 496 | w_P, 497 | num_patches_in_stack, 498 | stacks_dim, 499 | params, 500 | sigma 501 | ); 502 | } 503 | -------------------------------------------------------------------------------- /include/bm3d.hpp: -------------------------------------------------------------------------------- 1 | #define NOMINMAX 2 | #include 
"params.hpp" 3 | 4 | #include 5 | #include 6 | #include 7 | 8 | #include //min max 9 | #include 10 | #include 11 | #include 12 | #include 13 | 14 | #include "indices.cuh" 15 | 16 | //2DDCT - has to be consistent with dct8x8.cu 17 | #define KER2_BLOCK_WIDTH 128 18 | 19 | //Exception handling 20 | #include 21 | #include 22 | #include 23 | #include 24 | 25 | 26 | //Debug 27 | #include "stopwatch.hpp" 28 | #include 29 | #include 30 | 31 | //Extern kernels 32 | 33 | extern "C" void run_block_matching( 34 | const uchar* __restrict image, 35 | ushort* stacks, 36 | uint* num_patches_in_stack, 37 | const uint2 image_dim, 38 | const uint2 stacks_dim, 39 | const Params params, 40 | const uint2 start_point, 41 | const dim3 num_threads, 42 | const dim3 num_blocks, 43 | const uint shared_memory_size 44 | ); 45 | 46 | extern "C" void run_get_block( 47 | const uint2 start_point, 48 | const uchar* __restrict image, 49 | const ushort* __restrict stacks, 50 | const uint* __restrict num_patches_in_stack, 51 | float* patch_stack, 52 | const uint2 image_dim, 53 | const uint2 stacks_dim, 54 | const Params params, 55 | const dim3 num_threads, 56 | const dim3 num_blocks 57 | ); 58 | 59 | extern "C" void run_DCT2D8x8( 60 | float *d_transformed_stacks, 61 | const float *d_gathered_stacks, 62 | const uint size, 63 | const dim3 num_threads, 64 | const dim3 num_blocks 65 | ); 66 | 67 | extern "C" void run_hard_treshold_block( 68 | const uint2 start_point, 69 | float* patch_stack, 70 | float* w_P, 71 | const uint* __restrict num_patches_in_stack, 72 | const uint2 stacks_dim, 73 | const Params params, 74 | const uint sigma, 75 | const dim3 num_threads, 76 | const dim3 num_blocks, 77 | const uint shared_memory_size 78 | ); 79 | 80 | extern "C" void run_IDCT2D8x8( 81 | float *d_gathered_stacks, 82 | const float *d_transformed_stacks, 83 | const uint size, 84 | const dim3 num_threads, 85 | const dim3 num_blocks 86 | ); 87 | 88 | extern "C" void run_aggregate_block( 89 | const uint2 start_point, 90 | const float* __restrict patch_stack, 91 | const float* __restrict w_P, 92 | const ushort* __restrict stacks, 93 | const float* __restrict kaiser_window, 94 | float* numerator, 95 | float* denominator, 96 | const uint* __restrict num_patches_in_stack, 97 | const uint2 image_dim, 98 | const uint2 stacks_dim, 99 | const Params params, 100 | const dim3 num_threads, 101 | const dim3 num_blocks 102 | ); 103 | 104 | extern "C" void run_aggregate_final( 105 | const float* __restrict numerator, 106 | const float* __restrict denominator, 107 | const uint2 image_dim, 108 | uchar* denoised_noisy_image, 109 | const dim3 num_threads, 110 | const dim3 num_blocks 111 | ); 112 | 113 | extern "C" void run_wiener_filtering( 114 | const uint2 start_point, 115 | float* patch_stack, 116 | const float* __restrict patch_stack_basic, 117 | float* w_P, 118 | const uint* __restrict num_patches_in_stack, 119 | uint2 stacks_dim, 120 | const Params params, 121 | const uint sigma, 122 | const dim3 num_threads, 123 | const dim3 num_blocks, 124 | const uint shared_memory_size 125 | ); 126 | 127 | //Cuda error handling 128 | //Sometimes does not work 129 | #define cuda_error_check(ans) { throw_on_cuda_error((ans),__FILE__, __LINE__); } 130 | void throw_on_cuda_error(cudaError_t code, const char *file, int line) 131 | { 132 | if(code != cudaSuccess) 133 | { 134 | std::stringstream ss; 135 | ss << file << "(" << line << "): " << cudaGetErrorString(code); 136 | std::string file_and_line; 137 | ss >> file_and_line; 138 | throw thrust::system_error(code, 
thrust::cuda_category(), file_and_line); 139 | } 140 | } 141 | 142 | class BM3D 143 | { 144 | private: 145 | //Image (vector of image channels) 146 | std::vector d_noisy_image; 147 | std::vector d_denoised_image; 148 | 149 | //Auxiliary arrays 150 | ushort* d_stacks; //Addresses of similar patches to each reference patch of a batch 151 | std::vector d_numerator; //Numerator used for aggregation 152 | std::vector d_denominator; //Denminator used for aggregation 153 | uint* d_num_patches_in_stack; //Number of similar patches for each referenca patch of a batch that are stored in d_stacks 154 | float* d_gathered_stacks; //3D groups of a batch 155 | float* d_gathered_stacks_basic; //Only for two step denoising, contains wiener coefficients 156 | float* d_w_P; //Weights for aggregation 157 | float* d_kaiser_window; //Kaiser window used for aggregation 158 | 159 | 160 | //Reserved sizes 161 | int h_reserved_width; 162 | int h_reserved_height; 163 | int h_reserved_channels; 164 | bool h_reserved_two_step; 165 | uint2 h_batch_size; //h_batch_size.x has to be divisible by properties.warpSize 166 | 167 | //Denoising parameters 168 | Params h_hard_params; 169 | Params h_wien_params; 170 | 171 | //Device properties 172 | cudaDeviceProp properties; 173 | 174 | bool _verbose; 175 | 176 | //Allocate device buffers dependent on denoising parameters 177 | void allocate_device_auxiliary_arrays() 178 | { 179 | int maxk = std::max(h_wien_params.k,h_hard_params.k); 180 | int maxN = std::max(h_wien_params.N,h_hard_params.N); 181 | 182 | cuda_error_check( cudaMalloc((void**)&d_stacks, sizeof(ushort) * h_batch_size.x * h_batch_size.y * maxN) ); 183 | 184 | cuda_error_check( cudaMalloc((void**)&d_num_patches_in_stack, sizeof(uint) * h_batch_size.x * h_batch_size.y ) ); 185 | 186 | cuda_error_check( cudaMalloc((void**)&d_gathered_stacks, sizeof(float)*(maxN+1)*maxk*maxk*h_batch_size.x*h_batch_size.y) ); 187 | 188 | cuda_error_check( cudaMalloc((void**)&d_w_P, sizeof(float) * h_batch_size.x*h_batch_size.y) ); 189 | 190 | cuda_error_check( cudaMalloc((void**)&d_kaiser_window, sizeof(float) * maxk * maxk) ); 191 | 192 | if (h_reserved_two_step) 193 | cuda_error_check( cudaMalloc((void**)&d_gathered_stacks_basic, sizeof(float)*(maxN+1)*maxk*maxk*h_batch_size.x*h_batch_size.y) ); 194 | } 195 | 196 | //Allocate device buffers dependent on image dimensions 197 | void allocate_device_image(uint width, uint height, uint channels) 198 | { 199 | d_noisy_image.resize(channels); 200 | d_denoised_image.resize(channels); 201 | d_numerator.resize(channels); 202 | d_denominator.resize(channels); 203 | 204 | int size = width * height; 205 | for(auto & it : d_noisy_image) { 206 | cuda_error_check( cudaMalloc((void**)&it, sizeof(uchar) * size) ); 207 | } 208 | 209 | for(auto & it : d_denoised_image) { 210 | cuda_error_check( cudaMalloc((void**)&it, sizeof(uchar) * size) ); 211 | } 212 | 213 | for(auto & it : d_numerator) { 214 | cuda_error_check( cudaMalloc((void**)&it, sizeof(float) * size) ); 215 | } 216 | 217 | for(auto & it : d_denominator) { 218 | cuda_error_check( cudaMalloc((void**)&it, sizeof(float) * size) ); 219 | } 220 | 221 | } 222 | 223 | //Creates an kaiser window (only for k = 8, alpha = 2.0) and copies it to the device. 224 | void prepare_kaiser_window(uint k) 225 | { 226 | std::vector kaiserWindow(k*k); 227 | if (k == 8) 228 | { 229 | //! 
230 | kaiserWindow[0 + k * 0] = 0.1924f; kaiserWindow[0 + k * 1] = 0.2989f; kaiserWindow[0 + k * 2] = 0.3846f; kaiserWindow[0 + k * 3] = 0.4325f;
231 | kaiserWindow[1 + k * 0] = 0.2989f; kaiserWindow[1 + k * 1] = 0.4642f; kaiserWindow[1 + k * 2] = 0.5974f; kaiserWindow[1 + k * 3] = 0.6717f;
232 | kaiserWindow[2 + k * 0] = 0.3846f; kaiserWindow[2 + k * 1] = 0.5974f; kaiserWindow[2 + k * 2] = 0.7688f; kaiserWindow[2 + k * 3] = 0.8644f;
233 | kaiserWindow[3 + k * 0] = 0.4325f; kaiserWindow[3 + k * 1] = 0.6717f; kaiserWindow[3 + k * 2] = 0.8644f; kaiserWindow[3 + k * 3] = 0.9718f;
234 |
235 | //! Completing the rest of the matrix by symmetry
236 | for(unsigned i = 0; i < k / 2; i++)
237 | for (unsigned j = k / 2; j < k; j++)
238 | kaiserWindow[i + k * j] = kaiserWindow[i + k * (k - j - 1)];
239 |
240 | for (unsigned i = k / 2; i < k; i++)
241 | for (unsigned j = 0; j < k; j++)
242 | kaiserWindow[i + k * j] = kaiserWindow[k - i - 1 + k * j];
243 | }
244 | else
245 | for (unsigned i = 0; i < k * k; i++)
246 | kaiserWindow[i] = 1.0f;
247 |
248 | cuda_error_check( cudaMemcpy(d_kaiser_window,&kaiserWindow[0],k*k*sizeof(float),cudaMemcpyHostToDevice));
249 | }
250 |
251 | //Copy image to device
252 | void copy_device_image(const uchar * src_image, int width, int height, int channels)
253 | {
254 | size_t image_size = width * height;
255 | for(int i = 0; i < channels; ++i) {
256 | //Copy one channel of the image to the device
257 | cuda_error_check( cudaMemcpy(d_noisy_image[i],src_image+i*image_size,image_size*sizeof(uchar),cudaMemcpyHostToDevice));
258 | }
259 | }
260 |
261 | //Compute launch parameters for the block-matching kernel
262 | void get_BM_launch_parameters(
263 | const Params & params, //IN: Denoising parameters
264 | dim3 & num_threads, //OUT: number of threads
265 | dim3 & num_blocks, //OUT: number of blocks
266 | uint & s_mem_size) //OUT: shared memory size
267 | {
268 | //Determine the number of warps for block-matching according to the size of the shared memory.
269 | const uint p_block_width = ((properties.warpSize-1) * params.p) + params.k;
270 | const uint s_image_p_size = p_block_width * params.k * sizeof(uchar);
271 |
272 | const float shared_mem_usage = 1.0f; // 0 - 1
273 | const uint shared_mem_available = (uint)(properties.sharedMemPerBlock * shared_mem_usage) - s_image_p_size;
274 |
275 | //Block-matching shared memory sizes per warp
276 | const uint s_diff_size = p_block_width * sizeof(uint);
277 | const uint s_patches_in_stack_size = properties.warpSize * sizeof(uchar);
278 | const uint s_patch_stacks_size = params.N * properties.warpSize * sizeof(uint);
279 |
280 | const uint num_warps = std::min(shared_mem_available / (s_diff_size + s_patches_in_stack_size + s_patch_stacks_size),32u);
281 |
282 | //Block-matching launch parameters
283 | s_mem_size = ((s_diff_size + s_patches_in_stack_size + s_patch_stacks_size) * num_warps) + s_image_p_size;
284 | num_threads = dim3(properties.warpSize*num_warps, 1);
285 | num_blocks = dim3(h_batch_size.x / properties.warpSize, h_batch_size.y);
286 |
287 | if (_verbose)
288 | {
289 | std::cout << "Shared memory : " << s_mem_size/1024 << "KB/" << properties.sharedMemPerBlock/1024 << "KB" << std::endl;
290 | std::cout << "Number of warps: " << num_warps << std::endl;
291 | }
292 | }
293 |
294 | /*
295 | Launch the first step of BM3D. It produces the basic estimate in the denoised_image arrays.
296 | */
297 | void first_step(std::vector<uchar*> & denoised_image, int width, int height, int channels, uint* sigma)
298 | {
299 | //Image dimensions
300 | const uint2 image_dim = make_uint2(width,height);
301 |
302 | //Dimensions limiting addresses of reference patches
303 | const uint2 stacks_dim = make_uint2(width - (h_hard_params.k - 1), height - (h_hard_params.k - 1));
304 |
305 | int paramN1 = h_hard_params.N + 1; //Maximal size of a stack with a reference patch
306 |
307 | //Determine launch parameters for block-matching kernel
308 | dim3 num_threads_bm;
309 | dim3 num_blocks_bm;
310 | uint s_size_bm;
311 | get_BM_launch_parameters(h_hard_params, num_threads_bm, num_blocks_bm, s_size_bm);
312 |
313 | //Determine launch parameters for get and aggregate kernels
314 | const dim3 num_threads(h_hard_params.k, h_hard_params.k);
315 | const dim3 num_blocks(h_batch_size.x, h_batch_size.y);
316 |
317 | //Determine launch parameters for DCT kernel
318 | const uint trans_size = h_hard_params.k*h_hard_params.k*paramN1*h_batch_size.x*h_batch_size.y;
319 | const dim3 num_blocks_tr((trans_size + (KER2_BLOCK_WIDTH*h_hard_params.k) - 1) / (KER2_BLOCK_WIDTH*h_hard_params.k), 1, 1);
320 | const dim3 num_threads_tr(h_hard_params.k, KER2_BLOCK_WIDTH/h_hard_params.k, 1);
321 |
322 | //Determine launch parameters for filtering kernel
323 | const uint s_size_t = h_hard_params.k*h_hard_params.k*(paramN1+1)*sizeof(float); //+1 for avoiding bank conflicts
324 |
325 | //Determine launch parameters for final division kernel
326 | const dim3 num_threads_f(128, 4);
327 | const dim3 num_blocks_f((width + num_threads_f.x - 1) / num_threads_f.x, (height + num_threads_f.y - 1) / num_threads_f.y);
328 |
329 | //Create the Kaiser window and copy it to the device
330 | prepare_kaiser_window(h_hard_params.k);
331 |
332 | //Timers
333 | Stopwatch time_blockmatching;
334 | Stopwatch time_get;
335 | Stopwatch time_transform;
336 | Stopwatch time_itransform;
337 | Stopwatch time_aggregate;
338 | Stopwatch time_treshold;
339 |
340 |
341 | //Batch processing: in each iteration only a batch of h_batch_size reference patches is processed.
342 | uint2 start_point;
343 | for(start_point.y = 0; start_point.y < stacks_dim.y + h_hard_params.p - 1; start_point.y+=(h_batch_size.y*h_hard_params.p))
344 | {
345 | //Show progress
346 | if (_verbose)
347 | {
348 | int percent = (int)(((float)start_point.y / (float)stacks_dim.y) * (float)100);
349 | std::cout << "\rProcessing " << percent << "%" << std::flush;
350 | }
351 | for(start_point.x = 0; start_point.x < stacks_dim.x + h_hard_params.p - 1; start_point.x+=(h_batch_size.x*h_hard_params.p))
352 | {
353 | if (_verbose)
354 | time_blockmatching.start();
355 |
356 | //Finds similar patches for each reference patch of a batch and stores them in the d_stacks array
357 | run_block_matching(
358 | d_noisy_image[0], // IN: Image
359 | d_stacks, // OUT: Array of addresses of similar patches
360 | d_num_patches_in_stack, // OUT: Array containing numbers of these addresses
361 | image_dim, // IN: Image dimensions
362 | stacks_dim, // IN: Dimensions limiting addresses of reference patches
363 | h_hard_params, // IN: Denoising parameters
364 | start_point, // IN: Address of the top-left reference patch of a batch
365 | num_threads_bm, // CUDA: Threads in block
366 | num_blocks_bm, // CUDA: Blocks in grid
367 | s_size_bm // CUDA: Shared memory size
368 | );
369 |
370 | cuda_error_check( cudaGetLastError() );
371 | cuda_error_check( cudaDeviceSynchronize() );
372 |
373 | if (_verbose)
374 | time_blockmatching.stop();
375 |
376 | for (int channel = 0; channel < channels; ++channel)
377 | {
378 | if (_verbose)
379 | time_get.start();
380 |
381 | //Assembles the 3D groups of a batch according to the d_stacks array
382 | run_get_block(
383 | start_point, //IN: First reference patch of a batch
384 | d_noisy_image[channel], //IN: Image
385 | d_stacks, //IN: Array of addresses of similar patches
386 | d_num_patches_in_stack, //IN: Numbers of patches in 3D groups
387 | d_gathered_stacks, //OUT: Assembled 3D groups
388 | image_dim, //IN: Image dimensions
389 | stacks_dim, //IN: Dimensions limiting addresses of reference patches
390 | h_hard_params, //IN: Denoising parameters
391 | num_threads, //CUDA: Threads in block
392 | num_blocks //CUDA: Blocks in grid
393 | );
394 |
395 | cuda_error_check( cudaGetLastError() );
396 | cuda_error_check( cudaDeviceSynchronize() );
397 |
398 | if (_verbose)
399 | {
400 | time_get.stop();
401 | time_transform.start();
402 | }
403 |
404 | //Apply the 2D DCT transform to each layer of the 3D groups
405 | run_DCT2D8x8(d_gathered_stacks, d_gathered_stacks, trans_size, num_threads_tr, num_blocks_tr);
406 | cuda_error_check( cudaGetLastError() );
407 | cuda_error_check( cudaDeviceSynchronize() );
408 |
409 |
410 | if (_verbose)
411 | {
412 | time_transform.stop();
413 | time_treshold.start();
414 | }
415 |
416 |
417 | /*
418 | 1) 1D Walsh-Hadamard transform of the proper size on the 3rd dimension of each 3D group of a batch to complete the 3D transform.
419 | 2) Hard thresholding
420 | 3) Inverse 1D Walsh-Hadamard transform.
421 | 4) Compute the weight of each 3D group
422 | */
423 |
424 | run_hard_treshold_block(
425 | start_point, //IN: First reference patch of a batch
426 | d_gathered_stacks, //IN/OUT: 3D groups with transformed patches
427 | d_w_P, //OUT: Weight of each 3D group
428 | d_num_patches_in_stack, //IN: Numbers of patches in 3D groups
429 | stacks_dim, //IN: Dimensions limiting addresses of reference patches
430 | h_hard_params, //IN: Denoising parameters
431 | sigma[channel], //IN: sigma
432 | num_threads, //CUDA: Threads in block
433 | num_blocks, //CUDA: Blocks in grid
434 | s_size_t //CUDA: Shared memory size
435 | );
436 |
437 | cuda_error_check( cudaGetLastError() );
438 | cuda_error_check( cudaDeviceSynchronize() );
439 |
440 | if (_verbose)
441 | {
442 | time_treshold.stop();
443 | time_itransform.start();
444 | }
445 |
446 | //Apply the inverse 2D DCT transform to each layer of the 3D groups
447 | run_IDCT2D8x8(d_gathered_stacks, d_gathered_stacks, trans_size, num_threads_tr, num_blocks_tr);
448 |
449 | cuda_error_check( cudaGetLastError() );
450 | cuda_error_check( cudaDeviceSynchronize() );
451 |
452 | if (_verbose)
453 | {
454 | time_itransform.stop();
455 | time_aggregate.start();
456 | }
457 |
458 | //Aggregates the filtered patches of all 3D groups of a batch into the numerator and denominator buffers
459 | run_aggregate_block(
460 | start_point, //IN: First reference patch of a batch
461 | d_gathered_stacks, //IN: 3D groups with filtered patches
462 | d_w_P, //IN: Weight of each 3D group (derived from the number of non-zero coefficients after thresholding)
463 | d_stacks, //IN: Array of addresses of similar patches
464 | d_kaiser_window, //IN: Kaiser window
465 | d_numerator[channel], //IN/OUT: Numerator aggregation buffer
466 | d_denominator[channel], //IN/OUT: Denominator aggregation buffer
467 | d_num_patches_in_stack, //IN: Numbers of patches in 3D groups
468 | image_dim, //IN: Image dimensions
469 | stacks_dim, //IN: Dimensions limiting addresses of reference patches
470 | h_hard_params, //IN: Denoising parameters
471 | num_threads, //CUDA: Threads in block
472 | num_blocks //CUDA: Blocks in grid
473 | );
474 | cuda_error_check( cudaGetLastError() );
475 | cuda_error_check( cudaDeviceSynchronize() );
476 |
477 |
478 | if (_verbose)
479 | time_aggregate.stop();
480 | }
481 | }
482 | }
483 |
484 | //Divide the numerator by the denominator and save the result to the output image
485 | for (int channel = 0; channel < channels; ++channel)
486 | {
487 | run_aggregate_final(
488 | d_numerator[channel], //IN: Numerator aggregation buffer
489 | d_denominator[channel], //IN: Denominator aggregation buffer
490 | image_dim, //IN: Image dimensions
491 | denoised_image[channel], //OUT: Image estimate
492 | num_threads_f, //CUDA: Threads in block
493 | num_blocks_f //CUDA: Blocks in grid
494 | );
495 | cuda_error_check( cudaGetLastError() );
496 | cuda_error_check( cudaDeviceSynchronize() );
497 | }
498 |
499 | if(_verbose)
500 | {
501 | //Print timers
502 | std::cout << "\rFirst step details:" << std::endl; //DEBUG: erase line
503 | std::cout << " Block-Matching took: " << time_blockmatching.getSeconds() << std::endl;
504 | std::cout << " Get took: " << time_get.getSeconds() << std::endl;
505 | std::cout << " Transform took: " << time_transform.getSeconds() << std::endl;
506 | std::cout << " Thresholding took: " << time_treshold.getSeconds() << std::endl;
507 | std::cout << " Inverse transform took: " << time_itransform.getSeconds() << std::endl;
508 | std::cout << " Aggregation took: " << time_aggregate.getSeconds() << std::endl;
509 | }
510 |
511 | }
512 |
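For reference, the per-group operation that `run_hard_treshold_block` performs (steps 1-4 in the comment above) corresponds roughly to the following host-side sketch. It is a simplified illustration, not the CUDA kernel from `src/filtering.cu`: it assumes an orthonormal 2D DCT, a power-of-two number of patches in the stack, and the usual BM3D weight `1/(sigma^2 * N_nonzero)`; the function name `hard_threshold_group_cpu` is made up for this example and does not exist in the project.

```cpp
#include <cmath>
#include <cstddef>
#include <vector>

// group: n_patches layers of k*k DCT coefficients, stored layer after layer.
// Returns the aggregation weight of the group; the group is filtered in place.
float hard_threshold_group_cpu(std::vector<float>& group, std::size_t k,
                               std::size_t n_patches, float sigma, float L3D)
{
    const std::size_t layer = k * k;
    // The 1D transform below is unnormalized, so the threshold is scaled by sqrt(n_patches).
    const float threshold = L3D * sigma * std::sqrt(static_cast<float>(n_patches));
    std::size_t non_zero = 0;

    for (std::size_t c = 0; c < layer; ++c) // one 1D transform per 2D-DCT coefficient
    {
        // Forward 1D Walsh-Hadamard transform along the 3rd (stack) dimension
        for (std::size_t len = 1; len < n_patches; len *= 2)
            for (std::size_t i = 0; i < n_patches; i += 2 * len)
                for (std::size_t j = i; j < i + len; ++j)
                {
                    const float a = group[c + j * layer];
                    const float b = group[c + (j + len) * layer];
                    group[c + j * layer] = a + b;
                    group[c + (j + len) * layer] = a - b;
                }

        // Hard thresholding in the 3D transform domain
        for (std::size_t z = 0; z < n_patches; ++z)
        {
            float& v = group[c + z * layer];
            if (std::fabs(v) < threshold) v = 0.0f;
            else ++non_zero;
        }

        // Inverse transform: applying the unnormalized WHT again and dividing by n_patches
        for (std::size_t len = 1; len < n_patches; len *= 2)
            for (std::size_t i = 0; i < n_patches; i += 2 * len)
                for (std::size_t j = i; j < i + len; ++j)
                {
                    const float a = group[c + j * layer];
                    const float b = group[c + (j + len) * layer];
                    group[c + j * layer] = a + b;
                    group[c + (j + len) * layer] = a - b;
                }
        for (std::size_t z = 0; z < n_patches; ++z)
            group[c + z * layer] /= static_cast<float>(n_patches);
    }

    // Fewer retained coefficients -> sparser, more reliable group -> higher weight
    return (non_zero > 0) ? 1.0f / (sigma * sigma * static_cast<float>(non_zero)) : 1.0f;
}
```

The real kernel keeps the whole group in shared memory (hence the `s_size_t` shared-memory size computed above) and may use different normalization conventions, but the structure of the computation is the same.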
513 | void second_step(std::vector<uchar*> & denoised_image, int width, int height, int channels, uint* sigma)
514 | {
515 | //Image dimensions
516 | const uint2 image_dim = make_uint2(width,height);
517 |
518 | //Dimensions limiting addresses of reference patches
519 | const uint2 stacks_dim = make_uint2(width - (h_wien_params.k - 1), height - (h_wien_params.k - 1));
520 |
521 | int paramN1 = h_wien_params.N + 1; //Maximal size of a stack with a reference patch
522 |
523 | //Determine launch parameters for block-matching kernel
524 | dim3 num_threads_bm;
525 | dim3 num_blocks_bm;
526 | uint s_size_bm;
527 | get_BM_launch_parameters(h_wien_params, num_threads_bm, num_blocks_bm, s_size_bm);
528 |
529 | //Determine launch parameters for get and aggregate kernels
530 | const dim3 num_threads(h_wien_params.k, h_wien_params.k);
531 | const dim3 num_blocks(h_batch_size.x, h_batch_size.y);
532 |
533 | //Determine launch parameters for DCT kernel
534 | const uint trans_size = h_wien_params.k*h_wien_params.k*paramN1*h_batch_size.x*h_batch_size.y;
535 | const dim3 num_blocks_tr((trans_size + (KER2_BLOCK_WIDTH*h_wien_params.k) - 1) / (KER2_BLOCK_WIDTH*h_wien_params.k), 1, 1);
536 | const dim3 num_threads_tr(h_wien_params.k, KER2_BLOCK_WIDTH/h_wien_params.k, 1);
537 |
538 | //Determine launch parameters for filtering kernel
539 | const uint s_size_t = 2*h_wien_params.k*h_wien_params.k*(paramN1+1)*sizeof(float); //+1 for avoiding bank conflicts
540 |
541 | //Determine launch parameters for final division kernel
542 | const dim3 num_threads_f(128, 4);
543 | const dim3 num_blocks_f((width + num_threads_f.x - 1) / num_threads_f.x, (height + num_threads_f.y - 1) / num_threads_f.y);
544 |
545 | //Create the Kaiser window and copy it to the device
546 | prepare_kaiser_window(h_wien_params.k);
547 |
548 | //Timers
549 | Stopwatch time_blockmatching;
550 | Stopwatch time_get;
551 | Stopwatch time_get2;
552 | Stopwatch time_transform;
553 | Stopwatch time_transform2;
554 | Stopwatch time_itransform;
555 | Stopwatch time_aggregate;
556 | Stopwatch time_wien;
557 | Stopwatch time_times_wien;
558 |
559 |
560 | uint2 start_point;
561 |
562 | for(start_point.y = 0; start_point.y < stacks_dim.y + h_wien_params.p - 1; start_point.y+=(h_batch_size.y*h_wien_params.p))
563 | {
564 | //Show progress
565 | if (_verbose)
566 | {
567 | int percent = (int)(((float)start_point.y / (float)stacks_dim.y) * (float)100);
568 | std::cout << "\rProcessing " << percent << "%" << std::flush;
569 | }
570 | for(start_point.x = 0; start_point.x < stacks_dim.x + h_wien_params.p - 1; start_point.x+=(h_batch_size.x*h_wien_params.p))
571 | {
572 | if (_verbose)
573 | time_blockmatching.start();
574 |
575 | run_block_matching(
576 | d_denoised_image[0], // IN: Basic image estimate
577 | d_stacks, // OUT: Array of addresses of similar patches
578 | d_num_patches_in_stack, // OUT: Array containing numbers of these addresses
579 | image_dim, // IN: Image dimensions
580 | stacks_dim, // IN: Dimensions limiting addresses of reference patches
581 | h_wien_params, // IN: Denoising parameters
582 | start_point, // IN: Address of the top-left reference patch of a batch
583 | num_threads_bm, // CUDA: Threads in block
584 | num_blocks_bm, // CUDA: Blocks in grid
585 | s_size_bm // CUDA: Shared memory size
586 | );
587 | cuda_error_check( cudaGetLastError() );
588 | cuda_error_check( cudaDeviceSynchronize() );
589 |
590 |
591 | if (_verbose)
592 | time_blockmatching.stop();
593 |
594 | for (int channel = 0; channel < channels; ++channel)
595 | {
596 | if (_verbose)
597 | time_get.start();
598 |
599 | //Get patches from the basic image estimate into the 3D auxiliary array according to the addresses from block-matching
600 | run_get_block(
601 | start_point, //IN: First reference patch of a batch
602 | d_denoised_image[channel], //IN: Basic image estimate (produced by the 1st step)
603 | d_stacks, //IN: Array of addresses of similar patches
604 | d_num_patches_in_stack, //IN: Numbers of patches in 3D groups
605 | d_gathered_stacks_basic, //OUT: Assembled 3D groups
606 | image_dim, //IN: Image dimensions
607 | stacks_dim, //IN: Dimensions limiting addresses of reference patches
608 | h_wien_params, //IN: Denoising parameters
609 | num_threads, //CUDA: Threads in block
610 | num_blocks //CUDA: Blocks in grid
611 | );
612 |
613 | cuda_error_check( cudaGetLastError() );
614 | cuda_error_check( cudaDeviceSynchronize() );
615 |
616 | //Get patches from the noisy image into the 3D auxiliary array according to the addresses from block-matching
617 | run_get_block(
618 | start_point, //IN: First reference patch of a batch
619 | d_noisy_image[channel], //IN: Noisy image
620 | d_stacks, //IN: Array of addresses of similar patches
621 | d_num_patches_in_stack, //IN: Numbers of patches in 3D groups
622 | d_gathered_stacks, //OUT: Assembled 3D groups
623 | image_dim, //IN: Image dimensions
624 | stacks_dim, //IN: Dimensions limiting addresses of reference patches
625 | h_wien_params, //IN: Denoising parameters
626 | num_threads, //CUDA: Threads in block
627 | num_blocks //CUDA: Blocks in grid
628 | );
629 |
630 | cuda_error_check( cudaGetLastError() );
631 | cuda_error_check( cudaDeviceSynchronize() );
632 |
633 |
634 | if (_verbose)
635 | {
636 | time_get.stop();
637 | time_transform.start();
638 | }
639 |
640 | //Apply the 2D DCT transform to each layer of the 3D groups that contain noisy patches
641 | run_DCT2D8x8(d_gathered_stacks, d_gathered_stacks, trans_size, num_threads_tr, num_blocks_tr);
642 | cuda_error_check( cudaGetLastError() );
643 | cuda_error_check( cudaDeviceSynchronize() );
644 |
645 | //Apply the 2D DCT transform to each layer of the 3D groups that contain patches from the basic image estimate
646 | run_DCT2D8x8(d_gathered_stacks_basic, d_gathered_stacks_basic, trans_size, num_threads_tr, num_blocks_tr);
647 | cuda_error_check( cudaGetLastError() );
648 | cuda_error_check( cudaDeviceSynchronize() );
649 |
650 |
651 | if (_verbose)
652 | {
653 | time_transform.stop();
654 | time_wien.start();
655 | }
656 |
657 | /*
658 | 1) 1D Walsh-Hadamard transform of the proper size on the 3rd dimension of each 3D noisy group (from noisy patches) and each 3D basic group (patches from the basic image estimate)
659 | 2) Compute Wiener coefficients from the basic groups
660 | 3) Filtering: element-wise multiplication of the noisy groups by the corresponding Wiener coefficients
661 | 4) Inverse 1D transform of the filtered groups
662 | 5) Compute the weight of each 3D group
663 | */
664 | run_wiener_filtering(
665 | start_point, //IN: First reference patch of a batch
666 | d_gathered_stacks, //IN/OUT: 3D groups with transformed noisy patches that will be filtered
667 | d_gathered_stacks_basic, //IN: 3D groups with transformed patches from the basic estimate
668 | d_w_P, //OUT: Weight of each 3D group
669 | d_num_patches_in_stack, //IN: Numbers of patches in 3D groups
670 | stacks_dim, //IN: Dimensions limiting addresses of reference patches
671 | h_wien_params, //IN: Denoising parameters
672 | sigma[channel], //IN: sigma
673 | num_threads, //CUDA: Threads in block
674 | num_blocks, //CUDA: Blocks in grid
675 | s_size_t //CUDA: Shared memory size
676 | );
677 |
678 | cuda_error_check( cudaGetLastError() );
679 | cuda_error_check( cudaDeviceSynchronize() );
680 |
681 |
682 | if (_verbose)
683 | {
684 | time_wien.stop();
685 | time_itransform.start();
686 | }
687 |
688 | //Apply the inverse 2D DCT transform to each layer of the 3D groups that contain filtered patches
689 | run_IDCT2D8x8(d_gathered_stacks, d_gathered_stacks, trans_size, num_threads_tr, num_blocks_tr);
690 |
691 | cuda_error_check( cudaGetLastError() );
692 | cuda_error_check( cudaDeviceSynchronize() );
693 |
694 |
695 | if (_verbose)
696 | {
697 | time_itransform.stop();
698 | time_aggregate.start();
699 | }
700 |
701 |
702 | //Aggregate the filtered patches of all 3D groups of a batch into the numerator and denominator buffers
703 | run_aggregate_block(
704 | start_point, //IN: First reference patch of a batch
705 | d_gathered_stacks, //IN: 3D groups with filtered patches
706 | d_w_P, //IN: Weight of each 3D group (from the Wiener filtering)
707 | d_stacks, //IN: Array of addresses of similar patches
708 | d_kaiser_window, //IN: Kaiser window
709 | d_numerator[channel], //IN/OUT: Numerator aggregation buffer
710 | d_denominator[channel], //IN/OUT: Denominator aggregation buffer
711 | d_num_patches_in_stack, //IN: Numbers of patches in 3D groups
712 | image_dim, //IN: Image dimensions
713 | stacks_dim, //IN: Dimensions limiting addresses of reference patches
714 | h_wien_params, //IN: Denoising parameters
715 | num_threads, //CUDA: Threads in block
716 | num_blocks //CUDA: Blocks in grid
717 | );
718 | cuda_error_check( cudaGetLastError() );
719 | cuda_error_check( cudaDeviceSynchronize() );
720 |
721 |
722 | if (_verbose)
723 | time_aggregate.stop();
724 | }
725 | }
726 | }
727 | //Divide the numerator by the denominator and save the result to the output image
728 | for (int channel = 0; channel < channels; ++channel)
729 | {
730 | run_aggregate_final(
731 | d_numerator[channel], //IN: Numerator aggregation buffer
732 | d_denominator[channel], //IN: Denominator aggregation buffer
733 | image_dim, //IN: Image dimensions
734 | denoised_image[channel], //OUT: Image estimate
735 | num_threads_f, //CUDA: Threads in block
736 | num_blocks_f //CUDA: Blocks in grid
737 | );
738 | cuda_error_check( cudaGetLastError() );
739 | cuda_error_check( cudaDeviceSynchronize() );
740 | }
741 |
742 | if(_verbose)
743 | {
744 | //Print timers
745 | std::cout << "\rSecond step details:" << std::endl;
746 | std::cout << " Block-Matching took: " << time_blockmatching.getSeconds() << std::endl;
747 | std::cout << " 2x Get took: " << time_get.getSeconds() << std::endl;
748 | std::cout << " 2x Transform took: " << time_transform.getSeconds() << std::endl;
749 | std::cout << " Wiener filtering took: " << time_wien.getSeconds() << std::endl;
750 | std::cout << " Inverse transform took: " << time_itransform.getSeconds() << std::endl;
751 | std::cout << " Aggregation took: " << time_aggregate.getSeconds() << std::endl;
752 | }
753 | }
754 |
755 | //Copy image from device to host
756 | void copy_host_image(uchar * dst_image, int width, int height, int channels)
757 | {
758 | size_t image_size = width * height;
759 | for (int channel = 0; channel < channels; ++channel)
760 | {
761 | cuda_error_check( cudaMemcpy(
762 | dst_image+channel*image_size, // Destination
763 | d_denoised_image[channel], // Source
764 | image_size*sizeof(uchar), // Size
765 | cudaMemcpyDeviceToHost) ); // Copy direction
766 | }
767 | }
768 |
769 | //Free all buffers allocated on device that are dependent on 'denoising parameters'.
770 | void free_device_auxiliary_arrays()
771 | {
772 | cuda_error_check( cudaFree(d_stacks) );
773 | cuda_error_check( cudaFree(d_num_patches_in_stack) );
774 |
775 | cuda_error_check( cudaFree(d_gathered_stacks));
776 | cuda_error_check( cudaFree(d_w_P));
777 |
778 | cuda_error_check( cudaFree(d_kaiser_window));
779 |
780 | if (h_reserved_two_step)
781 | cuda_error_check( cudaFree(d_gathered_stacks_basic));
782 | }
783 |
784 | //Free all buffers allocated on device that are dependent on image dimensions
785 | void free_device_image()
786 | {
787 | for (auto & it : d_noisy_image)
788 | cuda_error_check( cudaFree(it) );
789 | d_noisy_image.clear();
790 | for (auto & it : d_denoised_image)
791 | cuda_error_check( cudaFree(it) );
792 | d_denoised_image.clear();
793 | for(auto & it : d_numerator) {
794 | cuda_error_check( cudaFree(it) );
795 | }
796 | d_numerator.clear();
797 | for(auto & it : d_denominator) {
798 | cuda_error_check( cudaFree(it) );
799 | }
800 | d_denominator.clear();
801 | }
802 |
803 | //Initialize the aggregation buffers to zero
804 | void null_aggregation_buffers(int width, int height)
805 | {
806 | int size = width * height;
807 | for(auto & it : d_numerator) {
808 | cuda_error_check( cudaMemset(it, 0, size * sizeof(float)) );
809 | }
810 | for(auto & it : d_denominator) {
811 | cuda_error_check( cudaMemset(it, 0, size * sizeof(float)) );
812 | }
813 | }
814 |
815 | public:
816 | BM3D() :
817 | h_hard_params(),
818 | h_wien_params(),
819 | d_gathered_stacks(0), d_gathered_stacks_basic(0), d_w_P(0), d_stacks(0), d_num_patches_in_stack(0),
820 | h_reserved_width(0), h_reserved_height(0), h_reserved_channels(0), h_reserved_two_step(0), d_kaiser_window(0), _verbose(false)
821 | {
822 | int device;
823 | cuda_error_check( cudaGetDevice(&device) );
824 | cuda_error_check( cudaGetDeviceProperties(&properties,device) );
825 |
826 | h_batch_size = make_uint2(256,128);
827 | }
828 | BM3D(uint n, uint k, uint N, uint T, uint p, float L3D, bool two_step) :
829 | h_hard_params(n, k, N, T, p, L3D),
830 | h_wien_params(n, k, N, T, p, L3D),
831 | d_gathered_stacks(0), d_gathered_stacks_basic(0), d_w_P(0), d_stacks(0), d_num_patches_in_stack(0),
832 | h_reserved_width(0), h_reserved_height(0), h_reserved_channels(0), h_reserved_two_step(0), d_kaiser_window(0), _verbose(false)
833 | {
834 | int device;
835 | cuda_error_check( cudaGetDevice(&device) );
836 | cuda_error_check( cudaGetDeviceProperties(&properties,device) );
837 |
838 | h_batch_size = make_uint2(256,128);
839 |
840 | if (k != 8)
841 | throw std::invalid_argument("k has to be 8, other values not implemented yet.");
842 | }
843 |
844 | ~BM3D()
845 | {
846 | free_device_image();
847 | free_device_auxiliary_arrays();
848 | }
849 |
850 |
851 | /*
852 | The source image is denoised using the BM3D algorithm.
853 | src_image and dst_image are arrays allocated in host memory; the pixels are stored channel by channel (planar layout).
854 | The first width*height pixels represent the luma (Y) component and each following width*height block represents one color component.
855 | */
856 | void denoise_host_image(uchar *src_image, uchar *dst_image, int width, int height, int channels, uint* sigma, bool two_step)
857 | {
858 | Stopwatch total;
859 | total.start();
860 |
861 | //Allocation
862 | if (h_reserved_width != width || h_reserved_height != height || h_reserved_channels != channels || h_reserved_two_step != two_step)
863 | reserve(width, height, channels, two_step);
864 |
865 | if (h_reserved_width == 0 || h_reserved_height == 0 || h_reserved_channels == 0 )
866 | return;
867 |
868 | Stopwatch p1;
869 | p1.start();
870 |
871 | //Copying
872 | copy_device_image(src_image, width, height, channels);
873 |
874 | //1st denoising step
875 | null_aggregation_buffers(width,height);
876 | first_step(d_denoised_image, width, height, channels, sigma);
877 |
878 | p1.stop();
879 | if (_verbose)
880 | std::cout << "1st step took: " << p1.getSeconds() << std::endl;
881 |
882 | //2nd denoising step
883 | if (two_step)
884 | {
885 | Stopwatch p2;
886 | p2.start();
887 | null_aggregation_buffers(width,height);
888 | second_step(d_denoised_image, width, height, channels, sigma);
889 | if (_verbose)
890 | std::cout << "2nd step took: " << p2.getSeconds() << std::endl;
891 | }
892 |
893 | //Copy back
894 | copy_host_image(dst_image, width, height, channels);
895 |
896 | //if(_verbose)
897 | std::cout << "Total time: " << total.getSeconds() << std::endl;
898 | }
899 |
900 | /*void denoise_device_image(uchar *src_image, uchar *dst_image, int width, int height, int channels, bool two_step)
901 | {
902 | //TODO
903 | }*/
904 |
905 | void set_hard_params(uint n, uint k, uint N, uint T, uint p, float L3D)
906 | {
907 | if (h_hard_params.k != k || h_hard_params.N != N)
908 | {
909 | h_hard_params = Params(n,k,N,T,p,L3D);
910 | free_device_auxiliary_arrays();
911 | allocate_device_auxiliary_arrays();
912 | }
913 | else
914 | h_hard_params = Params(n,k,N,T,p,L3D);
915 |
916 | if (k != 8)
917 | throw std::invalid_argument("k has to be 8, other values not implemented yet.");
918 | }
919 | void set_wien_params(uint n, uint k, uint N, uint T, uint p)
920 | {
921 | if (h_wien_params.k != k || h_wien_params.N != N){
922 | h_wien_params = Params(n,k,N,T,p,0.0f);
923 | free_device_auxiliary_arrays();
924 | allocate_device_auxiliary_arrays();
925 | }
926 | else
927 | h_wien_params = Params(n,k,N,T,p,0.0f);
928 |
929 | if (k != 8)
930 | throw std::invalid_argument("k has to be 8, other values not implemented yet.");
931 | }
932 |
933 | void set_verbose(bool verbose)
934 | {
935 | _verbose = verbose;
936 | }
937 |
938 | void reserve(int width, int height, int channels, bool two_step)
939 | {
940 | h_reserved_width = width;
941 | h_reserved_height = height;
942 | h_reserved_channels = channels;
943 | h_reserved_two_step = two_step;
944 |
945 | free_device_image();
946 | free_device_auxiliary_arrays(); //TODO: not necessary
947 |
948 | allocate_device_image(width,height,channels);
949 | allocate_device_auxiliary_arrays(); //TODO: not necessary
950 | }
951 | void clear()
952 | {
953 | h_reserved_width = 0;
954 | h_reserved_height = 0;
955 | h_reserved_channels = 0;
956 | h_reserved_two_step = 0;
957 |
958 | free_device_image();
959 | free_device_auxiliary_arrays();
960 | }
961 | };
962 |
--------------------------------------------------------------------------------
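The Wiener step described in `second_step` shrinks each 3D-transform coefficient of the noisy group using the corresponding coefficient of the basic-estimate group. A minimal per-coefficient sketch, assuming normalized transform coefficients (the kernel in `src/filtering.cu` additionally handles the 1D transform, shared memory and its own normalization; the function name is illustrative only):

```cpp
// y       : 3D-transform coefficient of the noisy group
// y_basic : corresponding coefficient of the basic-estimate group
// w_accum : running sum of squared Wiener coefficients; the group weight is 1/(sigma^2 * w_accum)
inline float wiener_shrink(float y, float y_basic, float sigma, float* w_accum)
{
    const float w = (y_basic * y_basic) / (y_basic * y_basic + sigma * sigma);
    *w_accum += w * w;
    return w * y;
}
```

Finally, a minimal sketch of how the `BM3D` class above can be driven from host code. The real driver is `src/main_nodisplay.cpp` (not reproduced here); the image dimensions, buffer contents and sigma value below are placeholders, and loading an actual file would normally go through the CImg library.

```cpp
// Hypothetical driver for the BM3D class (not src/main_nodisplay.cpp).
#include "bm3d.hpp"
#include <iostream>
#include <vector>

int main()
{
    // Planar 8-bit image buffers; fill 'noisy' with real image data in practice.
    const int width = 512, height = 512, channels = 1;
    std::vector<uchar> noisy(width * height * channels);
    std::vector<uchar> denoised(width * height * channels);

    uint sigma[1] = { 25 };      // noise standard deviation per channel
    const bool two_step = true;  // run both the hard-thresholding and the Wiener step

    try
    {
        BM3D bm3d;               // default parameters; set_hard_params/set_wien_params may be used instead
        bm3d.set_verbose(true);
        bm3d.denoise_host_image(noisy.data(), denoised.data(),
                                width, height, channels, sigma, two_step);
    }
    catch (const std::exception& e)
    {
        std::cerr << "BM3D failed: " << e.what() << std::endl;
        return 1;
    }
    return 0;
}
```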