├── README.md ├── data ├── house.mat ├── lena_512.png ├── image_128.jpg └── image_256.png ├── cuda ├── nlm_naive │ ├── Makefile │ └── nlmNaiveKernel.cu └── nlm_shared │ ├── Makefile │ └── nlmSharedKernel.cu └── matlab ├── nonLocalMeans.m ├── pipeline_non_local_means.m ├── nlm_naive_kernel.m └── nlm_shared_kernel.m /README.md: -------------------------------------------------------------------------------- 1 | # nlm-image-denoising 2 | Non Local Means Filter for Image Denoising in CUDA 3 | -------------------------------------------------------------------------------- /data/house.mat: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rosevoul/nlm-image-denoising/HEAD/data/house.mat -------------------------------------------------------------------------------- /data/lena_512.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rosevoul/nlm-image-denoising/HEAD/data/lena_512.png -------------------------------------------------------------------------------- /data/image_128.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rosevoul/nlm-image-denoising/HEAD/data/image_128.jpg -------------------------------------------------------------------------------- /data/image_256.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rosevoul/nlm-image-denoising/HEAD/data/image_256.png -------------------------------------------------------------------------------- /cuda/nlm_naive/Makefile: -------------------------------------------------------------------------------- 1 | SHELL := /bin/bash # Use bash syntax 2 | 3 | CC = sm_20 4 | 5 | NVCC = nvcc -arch=$(CC) 6 | nlm_naive = nlmNaiveKernel 7 | ARGS = -ptx 8 | 9 | all: nlm_naive 10 | 11 | nlm_naive: 12 | $(NVCC) $(ARGS) $(nlm_naive).cu 13 | 14 | clean: 15 | rm -rf *~ *.ptx 16 | -------------------------------------------------------------------------------- /cuda/nlm_shared/Makefile: -------------------------------------------------------------------------------- 1 | SHELL := /bin/bash # Use bash syntax 2 | 3 | CC = sm_20 4 | 5 | NVCC = nvcc -arch=$(CC) 6 | nlm_shared = nlmSharedKernel 7 | ARGS = -ptx 8 | 9 | all: nlm_shared 10 | 11 | nlm_shared: 12 | $(NVCC) $(ARGS) $(nlm_shared).cu 13 | 14 | clean: 15 | rm -rf *~ *.ptx 16 | -------------------------------------------------------------------------------- /cuda/nlm_naive/nlmNaiveKernel.cu: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | 4 | // Array access macros 5 | #define INPUT(i,j) input_grid[(j) + (i)*(N)] 6 | 7 | #define WINDOW_SIZE (7) 8 | #define NEIGHBOR_SIZE (3) 9 | 10 | __global__ void nlmSimple(int N, double const *input_grid, double *output_grid, float filtSigma) 11 | { 12 | int gindex = threadIdx.x + blockIdx.x * blockDim.x; 13 | 14 | int pix_ix, 15 | pix_iy, 16 | pix_jx, 17 | pix_jy; 18 | 19 | double neighbor_j, 20 | neighbor_i, 21 | output = 0, 22 | sum_weights = 0; 23 | 24 | pix_iy = gindex % N; 25 | pix_ix = (gindex - pix_iy) / N; 26 | 27 | if (pix_ix < N && pix_iy < N) 28 | { 29 | int window_radius = (WINDOW_SIZE - 1) / 2; 30 | int neighbor_radius = (NEIGHBOR_SIZE - 1) / 2; 31 | 32 | // Iterate through window 33 | for (int k = -window_radius; k <= window_radius; k++) 34 | for (int l = -window_radius; l <= window_radius; l++) 35 | { 36 | double weight = 0; 37 | double distance = 0; 38 | 39 | pix_jx = pix_ix + k; 40 | pix_jy = pix_iy + l; 41 | 42 | if (pix_jx < 0 || pix_jx >= N || 43 | pix_jy < 0 || pix_jy >= N) 44 | continue; 45 | 46 | // Iterate through every pix_j neighbors 47 | for (int p = -neighbor_radius; p <= neighbor_radius; p++) 48 | for (int q = -neighbor_radius; q <= neighbor_radius; q++) 49 | { 50 | if (pix_jx + p < 0 || pix_jx + p >= N || 51 | pix_jy + q < 0 || pix_jy + q >= N || 52 | pix_ix + p < 0 || pix_ix + p >= N || 53 | pix_iy + q < 0 || pix_iy + q >= N) 54 | continue; 55 | 56 | neighbor_j = INPUT(pix_jx + p, pix_jy + q); 57 | neighbor_i = INPUT(pix_ix + p, pix_iy + q); 58 | distance += (neighbor_i - neighbor_j) * (neighbor_i - neighbor_j); 59 | } 60 | 61 | // Derive weight for pixels i and j 62 | weight = __expf(-(distance / filtSigma + 63 | (k*k + l*l) * (1.0f)/(float)(WINDOW_SIZE* WINDOW_SIZE))); 64 | 65 | sum_weights += weight; 66 | 67 | // Sum for every pixel in the window 68 | output += INPUT(pix_jx, pix_jy) * weight; 69 | } 70 | 71 | // Normalize 72 | sum_weights = (double)(1 / sum_weights); 73 | output *= sum_weights; 74 | 75 | // Write output to global memory 76 | output_grid[gindex] = output; 77 | } 78 | } 79 | -------------------------------------------------------------------------------- /matlab/nonLocalMeans.m: -------------------------------------------------------------------------------- 1 | function If = nonLocalMeans(I, patchSize, filtSigma, patchSigma) 2 | % NONLOCALMEANS - Non local means CPU implementation 3 | % 4 | % SYNTAX 5 | % 6 | % IF = NONLOCALMEANS( IN, FILTSIGMA, PATCHSIGMA ) 7 | % 8 | % INPUT 9 | % 10 | % IN Input image [m-by-n] 11 | % PATCHSIZE Neighborhood size in pixels [1-by-2] 12 | % FILTSIGMA Filter sigma value [scalar] 13 | % PATCHSIGMA Patch sigma value [scalar] 14 | % 15 | % OUTPUT 16 | % 17 | % IF Filtered image after nlm [m-by-n] 18 | % 19 | % DESCRIPTION 20 | % 21 | % IF = NONLOCALMEANS( IN, PATCHSIZE, FILTSIGMA, PATCHSIGMA ) applies 22 | % non local means algorithm with sigma value of FILTSIGMA, using a 23 | % Gaussian patch of size PATCHSIZE with sigma value of PATCHSIGMA. 24 | % 25 | % 26 | 27 | %% USEFUL FUNCTIONS 28 | 29 | % create 3-D cube with local patches 30 | patchCube = @(X,w) ... 31 | permute( ... 32 | reshape( ... 33 | im2col( ... 34 | padarray( ... 35 | X, ... 36 | (w-1)./2, 'symmetric'), ... 37 | w, 'sliding' ), ... 38 | [prod(w) size(X)] ), ... 39 | [2 3 1] ); 40 | 41 | % create 3D cube 42 | B = patchCube(I, patchSize); 43 | [m, n, d] = size( B ); 44 | B = reshape(B, [ m*n d ] ); 45 | 46 | % gaussian patch 47 | H = fspecial('gaussian',patchSize, patchSigma); 48 | H = H(:) ./ max(H(:)); 49 | 50 | % apply gaussian patch on 3D cube 51 | B = bsxfun( @times, B, H' ); 52 | 53 | % compute kernel 54 | D = squareform( pdist( B, 'euclidean' ) ); 55 | D = exp( -D.^2 / filtSigma ); 56 | D(1:length(D)+1:end) = max(max(D-diag(diag(D)),[],2), eps); 57 | 58 | % generate filtered image 59 | If = D*I(:) ./ sum(D, 2); 60 | 61 | % reshape for image 62 | If = reshape( If, [m n] ); 63 | 64 | end 65 | 66 | 67 | %%------------------------------------------------------------ 68 | % 69 | % AUTHORS 70 | % 71 | % Dimitris Floros fcdimitr@auth.gr 72 | % 73 | % VERSION 74 | % 75 | % 0.2 - January 05, 2017 76 | % 77 | % CHANGELOG 78 | % 79 | % 0.1 (Dec 28, 2016) - Dimitris 80 | % * initial implementation 81 | % 0.2 (Jan 05, 2017) - Dimitris 82 | % * minor fix (distance squared) 83 | % 84 | % ------------------------------------------------------------ 85 | 86 | -------------------------------------------------------------------------------- /matlab/pipeline_non_local_means.m: -------------------------------------------------------------------------------- 1 | %% SCRIPT: PIPELINE_NON_LOCAL_MEANS 2 | % 3 | % Pipeline for non local means algorithm as described in [1]. 4 | % 5 | % The code thus far is implemented in CPU. 6 | % 7 | % DEPENDENCIES 8 | % 9 | % [1] Antoni Buades, Bartomeu Coll, and J-M Morel. A non-local 10 | % algorithm for image denoising. In 2005 IEEE Computer Society 11 | % Conference on Computer Vision and Pattern Recognition (CVPR’05), 12 | % volume 2, pages 60–65. IEEE, 2005. 13 | % 14 | 15 | clear all %#ok 16 | close all 17 | 18 | %% PARAMETERS 19 | 20 | % input image 21 | pathImg = '../data/house.mat'; 22 | strImgVar = 'house'; 23 | 24 | % noise 25 | noiseParams = {'gaussian', ... 26 | 0,... 27 | 0.001}; 28 | 29 | % filter sigma value 30 | filtSigma = 0.02; 31 | patchSize = [3 3]; 32 | patchSigma = 5/3; 33 | 34 | %% USEFUL FUNCTIONS 35 | 36 | % image normalizer 37 | normImg = @(I) (I - min(I(:))) ./ max(I(:) - min(I(:))); 38 | 39 | %% (BEGIN) 40 | 41 | fprintf('...begin %s...\n',mfilename); 42 | 43 | %% INPUT DATA 44 | 45 | fprintf('...loading input data...\n') 46 | 47 | I = mat2gray(rgb2gray(imread('../data/image_128.jpg'))); 48 | %ioImg = matfile( pathImg ); 49 | %I = ioImg.(strImgVar); 50 | 51 | %% PREPROCESS 52 | 53 | fprintf(' - normalizing image...\n') 54 | I = normImg( I ); 55 | 56 | figure('Name','Original Image'); 57 | imagesc(I); axis image; 58 | colormap gray; 59 | 60 | %% NOISE 61 | 62 | fprintf(' - applying noise...\n') 63 | J = imnoise( I, noiseParams{:} ); 64 | figure('Name','Noisy-Input Image'); 65 | imagesc(J); axis image; 66 | colormap gray; 67 | 68 | %% NON LOCAL MEANS 69 | 70 | tic; 71 | If = nonLocalMeans( J, patchSize, filtSigma, patchSigma ); 72 | toc 73 | 74 | %% VISUALIZE RESULT 75 | 76 | figure('Name', 'Filtered image'); 77 | imagesc(If); axis image; 78 | colormap gray; 79 | 80 | figure('Name', 'Residual'); 81 | imagesc(If-J); axis image; 82 | colormap gray; 83 | 84 | %% (END) 85 | 86 | fprintf('...end %s...\n',mfilename); 87 | 88 | 89 | %%------------------------------------------------------------ 90 | % 91 | % AUTHORS 92 | % 93 | % Dimitris Floros fcdimitr@auth.gr 94 | % 95 | % VERSION 96 | % 97 | % 0.1 - December 28, 2016 98 | % 99 | % CHANGELOG 100 | % 101 | % 0.1 (Dec 28, 2016) - Dimitris 102 | % * initial implementation 103 | % 104 | % ------------------------------------------------------------ 105 | -------------------------------------------------------------------------------- /matlab/nlm_naive_kernel.m: -------------------------------------------------------------------------------- 1 | %% SCRIPT: NLM_NAIVE_KERNEL 2 | % 3 | % GPU naive kernel for NLM filter 4 | % 5 | % DEPENDENCIES 6 | % 7 | % nlmKernel.cu 8 | %% 9 | 10 | clear variables; 11 | clear all %#ok 12 | close all 13 | 14 | %% ARRAY PARAMETERS 15 | n = 512; 16 | m = 512; 17 | 18 | %% NLM FILTER PARAMETERS 19 | filtSigma = 0.02; 20 | 21 | %% CUDA PARAMETERS 22 | 23 | % maxThreadsPerBlock depend on the GPU 24 | maxThreadsPerBlock = 1024; 25 | 26 | %% (BEGIN) 27 | 28 | fprintf('...begin %s...\n',mfilename); 29 | 30 | %%------------------PIPELINE------------------------%% 31 | %% PARAMETERS 32 | % input image 33 | strImgVar = 'house'; 34 | 35 | 36 | % noise 37 | noiseParams = {'gaussian', ... 38 | 0,... 39 | 0.001}; 40 | 41 | % filter sigma value 42 | filtSigma = 0.02; 43 | patchSigma = filtSigma; 44 | 45 | %% USEFUL FUNCTIONS 46 | 47 | % image normalizer 48 | normImg = @(I) (I - min(I(:))) ./ max(I(:) - min(I(:))); 49 | 50 | %% (BEGIN) 51 | 52 | fprintf('...begin %s...\n',mfilename); 53 | 54 | %% INPUT DATA 55 | 56 | fprintf('...loading input data...\n') 57 | 58 | % I = mat2gray(imread('../data/image_128.jpg')); 59 | % I = mat2gray(imread('../data/image_256.png')); 60 | I = mat2gray(imread('../data/lena_512.png')); 61 | 62 | % ioImg = matfile('../data/house.mat'); 63 | % I = ioImg.(strImgVar); 64 | 65 | imwrite(I, '../data/cuda_lena_1.png'); 66 | 67 | %% PREPROCESS 68 | 69 | fprintf(' - normalizing image...\n') 70 | I = normImg( I ); 71 | 72 | figure('Name','Original Image'); 73 | imagesc(I); axis image; 74 | colormap gray; 75 | imwrite(I, '../data/cuda_lena_2_norm.png'); 76 | 77 | %% NOISE 78 | 79 | fprintf(' - applying noise...\n') 80 | J = imnoise( I, noiseParams{:} ); 81 | figure('Name','Noisy-Input Image'); 82 | imagesc(J); axis image; 83 | colormap gray; 84 | imwrite(J, '../data/cuda_lena_3_noise.png') 85 | 86 | %%------------------PIPELINE END------------------------%% 87 | 88 | 89 | 90 | %% KERNEL 91 | 92 | k = parallel.gpu.CUDAKernel( '../cuda/nlm_naive/nlmNaiveKernel.ptx', ... 93 | '../cuda/nlm_naive/nlmNaiveKernel.cu'); 94 | 95 | threadsPerBlock = n; %cols 96 | 97 | 98 | k.ThreadBlockSize = min(threadsPerBlock, maxThreadsPerBlock); 99 | k.GridSize = ceil(m*n/k.ThreadBlockSize(1)); 100 | 101 | %% DATA 102 | 103 | B = zeros([m n], 'gpuArray'); 104 | fprintf('Naive NLM...\n'); 105 | 106 | tic; 107 | B = gather( feval(k, n, J, B, filtSigma) ); 108 | toc 109 | 110 | 111 | % fprintf('This implementation currently supports only NxN images. Sorry...\n'); 112 | imwrite(B, '../data/cuda_lena_4_naive.png') 113 | 114 | 115 | %% (END) 116 | 117 | fprintf('Done %s...\n',mfilename); 118 | -------------------------------------------------------------------------------- /matlab/nlm_shared_kernel.m: -------------------------------------------------------------------------------- 1 | %% SCRIPT: NLM_SHARED_KERNEL 2 | % 3 | % GPU shared kernel for NLM filter 4 | % 5 | % DEPENDENCIES 6 | % 7 | % nlmKernel.cu 8 | %% 9 | 10 | clear variables; 11 | clear all %#ok 12 | close all 13 | 14 | %% ARRAY PARAMETERS 15 | n = 512; 16 | m = 512; 17 | 18 | %% NLM FILTER PARAMETERS 19 | filtSigma = 0.02; 20 | 21 | %% CUDA PARAMETERS 22 | 23 | % maxThreadsPerBlock depend on the GPU 24 | maxThreadsPerBlock = 1024; 25 | 26 | %% (BEGIN) 27 | 28 | fprintf('...begin %s...\n',mfilename); 29 | 30 | %%------------------PIPELINE------------------------%% 31 | %% PARAMETERS 32 | % input image 33 | strImgVar = 'house'; 34 | 35 | 36 | % noise 37 | noiseParams = {'gaussian', ... 38 | 0,... 39 | 0.001}; 40 | 41 | % filter sigma value 42 | filtSigma = 0.02; 43 | patchSigma = filtSigma; 44 | 45 | %% USEFUL FUNCTIONS 46 | 47 | % image normalizer 48 | normImg = @(I) (I - min(I(:))) ./ max(I(:) - min(I(:))); 49 | 50 | %% (BEGIN) 51 | 52 | fprintf('...begin %s...\n',mfilename); 53 | 54 | %% INPUT DATA 55 | 56 | fprintf('...loading input data...\n') 57 | 58 | % I = mat2gray(imread('../data/image_128.jpg')); 59 | % I = mat2gray(imread('../data/image_256.png')); 60 | I = mat2gray(imread('../data/lena_512.png')); 61 | 62 | % ioImg = matfile('../data/house.mat'); 63 | % I = ioImg.(strImgVar); 64 | 65 | imwrite(I, '../data/cuda_lena_1.png'); 66 | 67 | %% PREPROCESS 68 | 69 | fprintf(' - normalizing image...\n') 70 | I = normImg( I ); 71 | 72 | figure('Name','Original Image'); 73 | imagesc(I); axis image; 74 | colormap gray; 75 | imwrite(I, '../data/cuda_lena_2_norm.png'); 76 | 77 | %% NOISE 78 | 79 | fprintf(' - applying noise...\n') 80 | J = imnoise( I, noiseParams{:} ); 81 | figure('Name','Noisy-Input Image'); 82 | imagesc(J); axis image; 83 | colormap gray; 84 | imwrite(J, '../data/cuda_lena_3_noise.png') 85 | 86 | %%------------------PIPELINE END------------------------%% 87 | 88 | 89 | 90 | %% KERNEL 91 | 92 | k = parallel.gpu.CUDAKernel( '../cuda/nlm_shared/nlmSharedKernel.ptx', ... 93 | '../cuda/nlm_shared/nlmSharedKernel.cu'); 94 | 95 | threadsPerBlock = n; %cols 96 | 97 | 98 | k.ThreadBlockSize = min(threadsPerBlock, maxThreadsPerBlock); 99 | k.GridSize = ceil(m*n/k.ThreadBlockSize(1)); 100 | 101 | %% DATA 102 | 103 | B = zeros([m n], 'gpuArray'); 104 | fprintf('Shared NLM...\n'); 105 | 106 | tic; 107 | B = gather( feval(k, n, J, B, filtSigma) ); 108 | toc 109 | 110 | 111 | % fprintf('This implementation currently supports only NxN images. Sorry...\n'); 112 | imwrite(B, '../data/cuda_lena_5_shared.png') 113 | 114 | 115 | %% (END) 116 | 117 | fprintf('Done %s...\n',mfilename); 118 | -------------------------------------------------------------------------------- /cuda/nlm_shared/nlmSharedKernel.cu: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | 4 | // Array access macros 5 | #define INPUT(i,j) input_grid[(j) + (i)*(N)] 6 | #define TEMP(i,j) temp_grid[(j) + (i)*(N)] 7 | 8 | #define WINDOW_SIZE (7) 9 | #define NEIGHBOR_SIZE (3) 10 | #define BLOCK_SIZE (512) 11 | 12 | #define FILTER_SIZE ((WINDOW_SIZE) + (NEIGHBOR_SIZE) - 1) 13 | #define FILTER_RADIUS (((FILTER_SIZE) - 1) / 2) 14 | 15 | __global__ void nlmSimple(int N, double const *input_grid, double *output_grid, float filtSigma) 16 | { 17 | __shared__ double temp_grid[BLOCK_SIZE * FILTER_SIZE]; 18 | 19 | // Define global and local indices of current pixel 20 | int gindex = threadIdx.x + blockIdx.x * blockDim.x; 21 | int lindex = threadIdx.x + FILTER_RADIUS * blockDim.x; 22 | 23 | int pix_ix, pix_iy, pix_jx, pix_jy; 24 | 25 | double neighbor_j, 26 | neighbor_i, 27 | output = 0, 28 | sum_weights = 0; 29 | 30 | // Read input elements into shared memory 31 | for (int i = -FILTER_RADIUS; i <= FILTER_RADIUS; i++) 32 | { 33 | if ((int)blockIdx.x + i >= 0 && (int)blockIdx.x + i < N) 34 | { 35 | temp_grid[lindex + i * (int)blockDim.x] = input_grid[gindex + i * (int)blockDim.x]; 36 | } 37 | } 38 | 39 | // Synchronize (ensure all the data is available) 40 | __syncthreads(); 41 | 42 | pix_iy = lindex % N; 43 | pix_ix = (lindex - pix_iy) / N; 44 | 45 | if (pix_ix < FILTER_SIZE && pix_iy < N) 46 | { 47 | int window_radius = (WINDOW_SIZE - 1) / 2; 48 | int neighbor_radius = (NEIGHBOR_SIZE - 1) / 2; 49 | 50 | // Iterate through window 51 | for (int k = -window_radius; k <= window_radius; k++) 52 | for (int l = -window_radius; l <= window_radius; l++) 53 | { 54 | double weight = 0; 55 | double distance = 0; 56 | 57 | pix_jx = pix_ix + k; 58 | pix_jy = pix_iy + l; 59 | 60 | if (pix_jx < 0 || pix_jx >= FILTER_SIZE || 61 | pix_jy < 0 || pix_jy >= N) 62 | continue; 63 | 64 | // Iterate through every pix_j neighbors 65 | for (int p = -neighbor_radius; p <= neighbor_radius; p++) 66 | for (int q = -neighbor_radius; q <= neighbor_radius; q++) 67 | { 68 | if (pix_jx + p < 0 || pix_jx + p >= FILTER_SIZE || 69 | pix_jy + q < 0 || pix_jy + q >= N || 70 | pix_ix + p < 0 || pix_ix + p >= FILTER_SIZE || 71 | pix_iy + q < 0 || pix_iy + q >= N) 72 | continue; 73 | 74 | neighbor_j = TEMP(pix_jx + p, pix_jy + q); 75 | neighbor_i = TEMP(pix_ix + p, pix_iy + q); 76 | distance += (neighbor_i - neighbor_j) * (neighbor_i - neighbor_j); 77 | } 78 | 79 | // Derive weight for pixels i and j 80 | weight = __expf(-(distance / filtSigma + 81 | (k*k + l*l) * (1.0f)/(float)(WINDOW_SIZE* WINDOW_SIZE))); 82 | 83 | sum_weights += weight; 84 | 85 | // Sum for every pixel in the window 86 | output += TEMP(pix_jx, pix_jy) * weight; 87 | } 88 | 89 | // Normalize 90 | sum_weights = (double)(1 / sum_weights); 91 | output *= sum_weights; 92 | 93 | // Write output to global memory 94 | output_grid[gindex] = output; 95 | } 96 | } 97 | --------------------------------------------------------------------------------