├── README.md
├── data
    ├── house.mat
    ├── lena_512.png
    ├── image_128.jpg
    └── image_256.png
├── cuda
    ├── nlm_naive
    │   ├── Makefile
    │   └── nlmNaiveKernel.cu
    └── nlm_shared
    │   ├── Makefile
    │   └── nlmSharedKernel.cu
└── matlab
    ├── nonLocalMeans.m
    ├── pipeline_non_local_means.m
    ├── nlm_naive_kernel.m
    └── nlm_shared_kernel.m


/README.md:
--------------------------------------------------------------------------------
1 | # nlm-image-denoising
2 | Non Local Means Filter for Image Denoising in CUDA
3 | 


--------------------------------------------------------------------------------
/data/house.mat:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rosevoul/nlm-image-denoising/HEAD/data/house.mat


--------------------------------------------------------------------------------
/data/lena_512.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rosevoul/nlm-image-denoising/HEAD/data/lena_512.png


--------------------------------------------------------------------------------
/data/image_128.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rosevoul/nlm-image-denoising/HEAD/data/image_128.jpg


--------------------------------------------------------------------------------
/data/image_256.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rosevoul/nlm-image-denoising/HEAD/data/image_256.png


--------------------------------------------------------------------------------
/cuda/nlm_naive/Makefile:
--------------------------------------------------------------------------------
 1 | # Builds the PTX of the naive NLM kernel (nlmNaiveKernel.cu -> nlmNaiveKernel.ptx).
 2 | 
 3 | # Use bash syntax. The comment sits on its own line because make keeps the
 4 | # whitespace in front of a trailing '#' as part of the variable value.
 5 | SHELL := /bin/bash
 6 | 
 7 | # GPU architecture handed to nvcc; override with `make CC=sm_XX`.
 8 | CC = sm_20
 9 | 
10 | NVCC	= nvcc -arch=$(CC)
11 | nlm_naive	= nlmNaiveKernel
12 | ARGS	= -ptx
13 | 
14 | # None of these targets is the name of a file the recipes create, so a stray
15 | # file called `all`, `nlm_naive` or `clean` must not make them look up to date.
16 | .PHONY: all nlm_naive clean
17 | 
18 | all: nlm_naive
19 | 
20 | nlm_naive:
21 | 	$(NVCC) $(ARGS) $(nlm_naive).cu
22 | 
23 | clean:
24 | 	rm -rf *~ *.ptx
25 | 


--------------------------------------------------------------------------------
/cuda/nlm_shared/Makefile:
--------------------------------------------------------------------------------
 1 | # Builds the PTX of the shared-memory NLM kernel (nlmSharedKernel.cu -> nlmSharedKernel.ptx).
 2 | 
 3 | # Use bash syntax. The comment sits on its own line because make keeps the
 4 | # whitespace in front of a trailing '#' as part of the variable value.
 5 | SHELL := /bin/bash
 6 | 
 7 | # GPU architecture handed to nvcc; override with `make CC=sm_XX`.
 8 | CC = sm_20
 9 | 
10 | NVCC	= nvcc -arch=$(CC)
11 | nlm_shared	= nlmSharedKernel
12 | ARGS	= -ptx
13 | 
14 | # None of these targets is the name of a file the recipes create, so a stray
15 | # file called `all`, `nlm_shared` or `clean` must not make them look up to date.
16 | .PHONY: all nlm_shared clean
17 | 
18 | all: nlm_shared
19 | 
20 | nlm_shared:
21 | 	$(NVCC) $(ARGS) $(nlm_shared).cu
22 | 
23 | clean:
24 | 	rm -rf *~ *.ptx
25 | 


--------------------------------------------------------------------------------
/cuda/nlm_naive/nlmNaiveKernel.cu:
--------------------------------------------------------------------------------
 1 | #include <math.h>
 2 | #include <stdio.h>
 3 | 
 4 | // Array access macros
 5 | #define INPUT(i,j) input_grid[(j) + (i)*(N)]
 6 | 
 7 | #define WINDOW_SIZE (7)
 8 | #define NEIGHBOR_SIZE (3)
 9 | 
10 | /*
11 |  * Naive non-local-means filter: every thread denoises one pixel of an N x N
12 |  * image, reading all samples straight from global memory.
13 |  *
14 |  *   N            image side length; also the row stride of both grids
15 |  *   input_grid   noisy image
16 |  *   output_grid  filtered image, written at the thread's flat pixel index
17 |  *   filtSigma    divisor of the squared patch distance inside the exponent
18 |  *
19 |  * The launch is one-dimensional; threads whose flat index lies past the last
20 |  * image row return without writing.
21 |  */
22 | __global__ void nlmSimple(int N, double const *input_grid, double *output_grid, float filtSigma)
23 | {
24 | 	const int search_radius = (WINDOW_SIZE - 1) / 2;
25 | 	const int patch_radius = (NEIGHBOR_SIZE - 1) / 2;
26 | 
27 | 	// Flat pixel index, split into row and column
28 | 	const int flat = threadIdx.x + blockIdx.x * blockDim.x;
29 | 	const int col = flat % N;
30 | 	const int row = (flat - col) / N;
31 | 
32 | 	// Nothing to do for threads that fall outside the image
33 | 	if (row >= N || col >= N)
34 | 		return;
35 | 
36 | 	double weighted_sum = 0;
37 | 	double weight_total = 0;
38 | 
39 | 	// Visit every candidate pixel of the search window
40 | 	for (int k = -search_radius; k <= search_radius; k++)
41 | 	{
42 | 		const int cand_row = row + k;
43 | 		if (cand_row < 0 || cand_row >= N)
44 | 			continue;
45 | 
46 | 		for (int l = -search_radius; l <= search_radius; l++)
47 | 		{
48 | 			const int cand_col = col + l;
49 | 			if (cand_col < 0 || cand_col >= N)
50 | 				continue;
51 | 
52 | 			// Squared distance between the two patches; an offset is
53 | 			// skipped when it leaves the image for either patch
54 | 			double dist2 = 0;
55 | 			for (int p = -patch_radius; p <= patch_radius; p++)
56 | 			{
57 | 				const int own_row = row + p;
58 | 				const int oth_row = cand_row + p;
59 | 				if (oth_row < 0 || oth_row >= N || own_row < 0 || own_row >= N)
60 | 					continue;
61 | 
62 | 				for (int q = -patch_radius; q <= patch_radius; q++)
63 | 				{
64 | 					const int own_col = col + q;
65 | 					const int oth_col = cand_col + q;
66 | 					if (oth_col < 0 || oth_col >= N || own_col < 0 || own_col >= N)
67 | 						continue;
68 | 
69 | 					const double diff = INPUT(own_row, own_col) - INPUT(oth_row, oth_col);
70 | 					dist2 += diff * diff;
71 | 				}
72 | 			}
73 | 
74 | 			// Patch similarity combined with a penalty on the window offset
75 | 			const double weight = __expf(-(dist2 / filtSigma +
76 | 							(k*k + l*l) * (1.0f)/(float)(WINDOW_SIZE* WINDOW_SIZE)));
77 | 
78 | 			weight_total += weight;
79 | 			weighted_sum += INPUT(cand_row, cand_col) * weight;
80 | 		}
81 | 	}
82 | 
83 | 	// Scale by the reciprocal of the weight total and store the pixel
84 | 	weight_total = (double)(1 / weight_total);
85 | 	output_grid[flat] = weighted_sum * weight_total;
86 | }
87 | 


--------------------------------------------------------------------------------
/matlab/nonLocalMeans.m:
--------------------------------------------------------------------------------
 1 | function If = nonLocalMeans(I, patchSize, filtSigma, patchSigma)
 2 | % NONLOCALMEANS - Non local means CPU implementation
 3 | %   
 4 | % SYNTAX
 5 | %
 6 | %   IF = NONLOCALMEANS( IN, PATCHSIZE, FILTSIGMA, PATCHSIGMA )
 7 | %
 8 | % INPUT
 9 | %
10 | %   IN          Input image                     [m-by-n]
11 | %   PATCHSIZE   Neighborhood size in pixels     [1-by-2]
12 | %               (both entries odd)
13 | %   FILTSIGMA   Filter sigma value              [scalar]
14 | %   PATCHSIGMA  Patch sigma value               [scalar]
15 | %
16 | % OUTPUT
17 | %
18 | %   IF          Filtered image after nlm        [m-by-n]
19 | %
20 | % DESCRIPTION
21 | %
22 | %   IF = NONLOCALMEANS( IN, PATCHSIZE, FILTSIGMA, PATCHSIGMA ) applies
23 | %   non local means algorithm with sigma value of FILTSIGMA, using a
24 | %   Gaussian patch of size PATCHSIZE with sigma value of PATCHSIGMA.
25 | %
26 | %   Every pixel is compared with every other pixel, so the weight
27 | %   matrix built here is (m*n)-by-(m*n); keep the input image small.
28 | %
29 | %
30 |   
31 |   %% INPUT CHECKS
32 |   
33 |   % an even patch has no centre pixel and (w-1)/2 is not a valid padding
34 |   validateattributes( patchSize, {'numeric'}, ...
35 |       {'vector', 'numel', 2, 'integer', 'positive', 'odd'}, ...
36 |       mfilename, 'PATCHSIZE' );
37 |   
38 |   %% USEFUL FUNCTIONS
39 |   
40 |   % create 3-D cube with local patches
41 |   patchCube = @(X,w) ...
42 |       permute( ...
43 |           reshape( ...
44 |               im2col( ...
45 |                   padarray( ...
46 |                       X, ...
47 |                       (w-1)./2, 'symmetric'), ...
48 |                   w, 'sliding' ), ...
49 |               [prod(w) size(X)] ), ...
50 |           [2 3 1] );
51 | 
52 |   % create 3D cube
53 |   B = patchCube(I, patchSize);
54 |   [m, n, d] = size( B );
55 |   B = reshape(B, [ m*n d ] );
56 |   
57 |   % gaussian patch
58 |   H = fspecial('gaussian',patchSize, patchSigma);
59 |   H = H(:) ./ max(H(:));
60 |   
61 |   % apply gaussian patch on 3D cube
62 |   B = bsxfun( @times, B, H' );
63 |   
64 |   % compute kernel
65 |   D = squareform( pdist( B, 'euclidean' ) );
66 |   D = exp( -D.^2 / filtSigma );
67 |   % a pixel is at distance 0 from itself; replace that self-weight with the
68 |   % largest weight of any other pixel in the row (never below eps)
69 |   D(1:length(D)+1:end) = max(max(D-diag(diag(D)),[],2), eps);
70 |   
71 |   % generate filtered image
72 |   If = D*I(:) ./ sum(D, 2);
73 |   
74 |   % reshape for image
75 |   If = reshape( If, [m n] );
76 |   
77 | end
78 | 
79 | 
80 | %%------------------------------------------------------------
81 | %
82 | % AUTHORS
83 | %
84 | %   Dimitris Floros                         fcdimitr@auth.gr
85 | %
86 | % VERSION
87 | %
88 | %   0.2 - January 05, 2017
89 | %
90 | % CHANGELOG
91 | %
92 | %   0.1 (Dec 28, 2016) - Dimitris
93 | %       * initial implementation
94 | %   0.2 (Jan 05, 2017) - Dimitris
95 | %       * minor fix (distance squared)
96 | %
97 | % ------------------------------------------------------------
98 | 
99 | 


--------------------------------------------------------------------------------
/matlab/pipeline_non_local_means.m:
--------------------------------------------------------------------------------
  1 | %% SCRIPT: PIPELINE_NON_LOCAL_MEANS
  2 | %
  3 | % Pipeline for non local means algorithm as described in [1].
  4 | %
  5 | % The code thus far is implemented in CPU.
  6 | %
  7 | % REFERENCES
  8 | %
  9 | % [1] Antoni Buades, Bartomeu Coll, and J-M Morel. A non-local
 10 | %     algorithm for image denoising. In 2005 IEEE Computer Society
 11 | %     Conference on Computer Vision and Pattern Recognition (CVPR’05),
 12 | %      volume 2, pages 60–65. IEEE, 2005.
 13 | %
 14 |   
 15 |   clear all %#ok
 16 |   close all
 17 | 
 18 |   %% PARAMETERS
 19 |   
 20 |   % input image (any format readable by imread)
 21 |   pathImg   = '../data/image_128.jpg';
 22 |   
 23 |   % alternative input: variable strImgVar of a MAT-file (see INPUT DATA)
 24 |   % pathImg   = '../data/house.mat';
 25 |   % strImgVar = 'house';
 26 |   
 27 |   % noise
 28 |   noiseParams = {'gaussian', ...
 29 |                  0,...
 30 |                  0.001};
 31 |   
 32 |   % filter sigma value
 33 |   filtSigma = 0.02;
 34 |   patchSize = [3 3];
 35 |   patchSigma = 5/3;
 36 |   
 37 |   %% USEFUL FUNCTIONS
 38 | 
 39 |   % image normalizer
 40 |   normImg = @(I) (I - min(I(:))) ./ max(I(:) - min(I(:)));
 41 |   
 42 |   %% (BEGIN)
 43 | 
 44 |   fprintf('...begin %s...\n',mfilename);  
 45 |   
 46 |   %% INPUT DATA
 47 |   
 48 |   fprintf('...loading input data...\n')
 49 |   
 50 |   I = imread( pathImg );
 51 |   if size( I, 3 ) == 3
 52 |     I = rgb2gray( I );   % colour input: the filter works on one channel
 53 |   end
 54 |   I = mat2gray( I );
 55 |   %ioImg = matfile( pathImg );
 56 |   %I     = ioImg.(strImgVar);
 57 |   
 58 |   %% PREPROCESS
 59 |   
 60 |   fprintf(' - normalizing image...\n')
 61 |   I = normImg( I );
 62 |   
 63 |   figure('Name','Original Image');
 64 |   imagesc(I); axis image;
 65 |   colormap gray;
 66 |   
 67 |   %% NOISE
 68 |   
 69 |   fprintf(' - applying noise...\n')
 70 |   J = imnoise( I, noiseParams{:} );
 71 |   figure('Name','Noisy-Input Image');
 72 |   imagesc(J); axis image;
 73 |   colormap gray;
 74 |   
 75 |   %% NON LOCAL MEANS
 76 |   
 77 |   tic;
 78 |   If = nonLocalMeans( J, patchSize, filtSigma, patchSigma );
 79 |   toc
 80 |   
 81 |   %% VISUALIZE RESULT
 82 |   
 83 |   figure('Name', 'Filtered image');
 84 |   imagesc(If); axis image;
 85 |   colormap gray;
 86 |   
 87 |   figure('Name', 'Residual');
 88 |   imagesc(If-J); axis image;
 89 |   colormap gray;
 90 |   
 91 |   %% (END)
 92 | 
 93 |   fprintf('...end %s...\n',mfilename);
 94 | 
 95 | 
 96 | %%------------------------------------------------------------
 97 | %
 98 | % AUTHORS
 99 | %
100 | %   Dimitris Floros                         fcdimitr@auth.gr
101 | %
102 | % VERSION
103 | %
104 | %   0.1 - December 28, 2016
105 | %
106 | % CHANGELOG
107 | %
108 | %   0.1 (Dec 28, 2016) - Dimitris
109 | %       * initial implementation
110 | %
111 | % ------------------------------------------------------------
112 | 


--------------------------------------------------------------------------------
/matlab/nlm_naive_kernel.m:
--------------------------------------------------------------------------------
  1 | %% SCRIPT: NLM_NAIVE_KERNEL
  2 | %
  3 | % GPU naive kernel for NLM filter
  4 | %
  5 | % DEPENDENCIES
  6 | %
  7 | %  ../cuda/nlm_naive/nlmNaiveKernel.cu   kernel source (prototype lookup)
  8 | %  ../cuda/nlm_naive/nlmNaiveKernel.ptx  compiled kernel (run make there)
  9 | %% 
 10 |   
 11 |   clear all %#ok
 12 |   close all
 13 | 
 14 |   %% PARAMETERS
 15 | 
 16 |   % input image
 17 |   % pathImg = '../data/image_128.jpg';
 18 |   % pathImg = '../data/image_256.png';
 19 |   pathImg = '../data/lena_512.png';
 20 | 
 21 |   % noise
 22 |   noiseParams = {'gaussian', ...
 23 |                  0,...
 24 |                  0.001};
 25 | 
 26 |   % filter sigma value
 27 |   filtSigma = 0.02;
 28 | 
 29 |   %% USEFUL FUNCTIONS
 30 | 
 31 |   % image normalizer
 32 |   normImg = @(I) (I - min(I(:))) ./ max(I(:) - min(I(:)));
 33 | 
 34 |   %% (BEGIN)
 35 | 
 36 |   fprintf('...begin %s...\n',mfilename);  
 37 | 
 38 |   %% INPUT DATA
 39 |   
 40 |   fprintf('...loading input data...\n')
 41 |   
 42 |   I = imread( pathImg );
 43 |   if size( I, 3 ) == 3
 44 |     I = rgb2gray( I );   % the kernel filters a single channel
 45 |   end
 46 |   I = mat2gray( I );
 47 | 
 48 |   % ioImg = matfile('../data/house.mat');
 49 |   % I = ioImg.('house');
 50 |   
 51 |   imwrite(I, '../data/cuda_lena_1.png');
 52 |   
 53 |   %% PREPROCESS
 54 |   
 55 |   fprintf(' - normalizing image...\n')
 56 |   I = normImg( I );
 57 |   
 58 |   figure('Name','Original Image');
 59 |   imagesc(I); axis image;
 60 |   colormap gray;
 61 |   imwrite(I, '../data/cuda_lena_2_norm.png');
 62 |   
 63 |   %% NOISE
 64 |   
 65 |   fprintf(' - applying noise...\n')
 66 |   J = imnoise( I, noiseParams{:} );
 67 |   figure('Name','Noisy-Input Image');
 68 |   imagesc(J); axis image;
 69 |   colormap gray;
 70 |   imwrite(J, '../data/cuda_lena_3_noise.png')
 71 | 
 72 |   %% ARRAY PARAMETERS
 73 | 
 74 |   % taken from the image so that the launch always matches the data
 75 |   [m, n] = size( J );
 76 |   if m ~= n
 77 |     error('%s: only NxN images are supported (got %d-by-%d).', ...
 78 |           mfilename, m, n);
 79 |   end
 80 | 
 81 |   %% KERNEL
 82 |   
 83 |   k = parallel.gpu.CUDAKernel( '../cuda/nlm_naive/nlmNaiveKernel.ptx', ...
 84 |                               '../cuda/nlm_naive/nlmNaiveKernel.cu');
 85 | 
 86 |   % one thread per pixel, at most one image row per block; the per-block
 87 |   % limit is read from the kernel object instead of being hard-coded
 88 |   k.ThreadBlockSize = min(n, k.MaxThreadsPerBlock);
 89 |   k.GridSize        = ceil(m*n/k.ThreadBlockSize(1)); 
 90 | 
 91 |   %% DATA
 92 |   
 93 |   B = zeros([m n], 'gpuArray');
 94 |   fprintf('Naive NLM...\n');
 95 | 
 96 |   tic;
 97 |   B = gather( feval(k, n, J, B, filtSigma) );
 98 |   toc
 99 |   
100 | 
101 |   imwrite(B, '../data/cuda_lena_4_naive.png')
102 | 
103 |   
104 |   %% (END)
105 | 
106 |   fprintf('Done %s...\n',mfilename);
107 | 


--------------------------------------------------------------------------------
/matlab/nlm_shared_kernel.m:
--------------------------------------------------------------------------------
  1 | %% SCRIPT: NLM_SHARED_KERNEL
  2 | %
  3 | % GPU shared kernel for NLM filter
  4 | %
  5 | % DEPENDENCIES
  6 | %
  7 | %  ../cuda/nlm_shared/nlmSharedKernel.cu   kernel source (prototype lookup)
  8 | %  ../cuda/nlm_shared/nlmSharedKernel.ptx  compiled kernel (run make there)
  9 | %% 
 10 |   
 11 |   clear all %#ok
 12 |   close all
 13 | 
 14 |   %% PARAMETERS
 15 | 
 16 |   % input image
 17 |   % pathImg = '../data/image_128.jpg';
 18 |   % pathImg = '../data/image_256.png';
 19 |   pathImg = '../data/lena_512.png';
 20 | 
 21 |   % noise
 22 |   noiseParams = {'gaussian', ...
 23 |                  0,...
 24 |                  0.001};
 25 | 
 26 |   % filter sigma value
 27 |   filtSigma = 0.02;
 28 | 
 29 |   % BLOCK_SIZE of nlmSharedKernel.cu: widest row its shared tile can hold
 30 |   kernelBlockSize = 512;
 31 | 
 32 |   %% USEFUL FUNCTIONS
 33 | 
 34 |   % image normalizer
 35 |   normImg = @(I) (I - min(I(:))) ./ max(I(:) - min(I(:)));
 36 | 
 37 |   %% (BEGIN)
 38 | 
 39 |   fprintf('...begin %s...\n',mfilename);  
 40 | 
 41 |   %% INPUT DATA
 42 |   
 43 |   fprintf('...loading input data...\n')
 44 |   
 45 |   I = imread( pathImg );
 46 |   if size( I, 3 ) == 3
 47 |     I = rgb2gray( I );   % the kernel filters a single channel
 48 |   end
 49 |   I = mat2gray( I );
 50 | 
 51 |   % ioImg = matfile('../data/house.mat');
 52 |   % I = ioImg.('house');
 53 |   
 54 |   imwrite(I, '../data/cuda_lena_1.png');
 55 |   
 56 |   %% PREPROCESS
 57 |   
 58 |   fprintf(' - normalizing image...\n')
 59 |   I = normImg( I );
 60 |   
 61 |   figure('Name','Original Image');
 62 |   imagesc(I); axis image;
 63 |   colormap gray;
 64 |   imwrite(I, '../data/cuda_lena_2_norm.png');
 65 |   
 66 |   %% NOISE
 67 |   
 68 |   fprintf(' - applying noise...\n')
 69 |   J = imnoise( I, noiseParams{:} );
 70 |   figure('Name','Noisy-Input Image');
 71 |   imagesc(J); axis image;
 72 |   colormap gray;
 73 |   imwrite(J, '../data/cuda_lena_3_noise.png')
 74 | 
 75 |   %% ARRAY PARAMETERS
 76 | 
 77 |   % taken from the image so that the launch always matches the data
 78 |   [m, n] = size( J );
 79 |   if m ~= n
 80 |     error('%s: only NxN images are supported (got %d-by-%d).', ...
 81 |           mfilename, m, n);
 82 |   end
 83 | 
 84 |   %% KERNEL
 85 |   
 86 |   k = parallel.gpu.CUDAKernel( '../cuda/nlm_shared/nlmSharedKernel.ptx', ...
 87 |                               '../cuda/nlm_shared/nlmSharedKernel.cu');
 88 | 
 89 |   % the kernel stages whole image rows in shared memory with one thread
 90 |   % per column, so a block must be exactly one row wide
 91 |   if n > min(kernelBlockSize, k.MaxThreadsPerBlock)
 92 |     error('%s: image width %d exceeds the %d threads a block may have.', ...
 93 |           mfilename, n, min(kernelBlockSize, k.MaxThreadsPerBlock));
 94 |   end
 95 | 
 96 |   k.ThreadBlockSize = n;    % one thread per column
 97 |   k.GridSize        = m;    % one block per row
 98 | 
 99 |   %% DATA
100 |   
101 |   B = zeros([m n], 'gpuArray');
102 |   fprintf('Shared NLM...\n');
103 | 
104 |   tic;
105 |   B = gather( feval(k, n, J, B, filtSigma) );
106 |   toc
107 |   
108 | 
109 |   imwrite(B, '../data/cuda_lena_5_shared.png')
110 | 
111 |   
112 |   %% (END)
113 | 
114 |   fprintf('Done %s...\n',mfilename);
115 | 


--------------------------------------------------------------------------------
/cuda/nlm_shared/nlmSharedKernel.cu:
--------------------------------------------------------------------------------
 1 | #include <math.h>
 2 | #include <stdio.h>
 3 | 
 4 | // Array access macros
 5 | #define INPUT(i,j) input_grid[(j) + (i)*(N)]
 6 | #define TEMP(i,j) temp_grid[(j) + (i)*(N)]
 7 | 
 8 | #define WINDOW_SIZE (7)
 9 | #define NEIGHBOR_SIZE (3)
10 | #define BLOCK_SIZE (512)
11 | 
12 | #define FILTER_SIZE ((WINDOW_SIZE) + (NEIGHBOR_SIZE) - 1)
13 | #define FILTER_RADIUS (((FILTER_SIZE) - 1) / 2)
14 | 
15 | /*
16 |  * Shared-memory NLM kernel: block b filters image row b of an N x N image.
17 |  *
18 |  * Launch layout: 1-D, blockDim.x == N (one thread per column, N <= BLOCK_SIZE)
19 |  * and gridDim.x >= N (one block per row). The kernel does nothing when this
20 |  * layout is violated, because the shared tile below is sized and strided for it.
21 |  * Static shared memory: BLOCK_SIZE * FILTER_SIZE doubles.
22 |  *
23 |  *   N            side length of the square image; row stride of both grids
24 |  *   input_grid   noisy image
25 |  *   output_grid  filtered image, same layout as input_grid
26 |  *   filtSigma    divisor of the squared patch distance inside the exponent
27 |  */
28 | __global__ void nlmSimple(int N, double const *input_grid, double *output_grid, float filtSigma)
29 | {
30 |   __shared__ double temp_grid[BLOCK_SIZE * FILTER_SIZE];
31 | 
32 |   // The test is uniform across the block, so returning before the barrier is safe.
33 |   if (N <= 0 || N > BLOCK_SIZE || (int)blockDim.x != N)
34 |     return;
35 | 
36 |   const int pix_ix = (int)blockIdx.x;          // image row of this block
37 |   const int pix_iy = (int)threadIdx.x;         // image column of this thread
38 |   const int tile_top = pix_ix - FILTER_RADIUS; // image row held in tile row 0
39 | 
40 |   // Stage the FILTER_SIZE rows centred on pix_ix; rows outside the image are
41 |   // left unwritten and are never read back (see the bounds tests below).
42 |   for (int r = 0; r < FILTER_SIZE; r++)
43 |     if (tile_top + r >= 0 && tile_top + r < N)
44 |       TEMP(r, pix_iy) = INPUT(tile_top + r, pix_iy);
45 | 
46 |   // Synchronize (ensure all the data is available)
47 |   __syncthreads();
48 | 
49 |   if (pix_ix >= N)   // surplus block: there is no row left to filter
50 |     return;
51 | 
52 |   const int window_radius = (WINDOW_SIZE - 1) / 2;
53 |   const int neighbor_radius = (NEIGHBOR_SIZE - 1) / 2;
54 |   double output = 0, sum_weights = 0;
55 | 
56 |   // Iterate through window
57 |   for (int k = -window_radius; k <= window_radius; k++)
58 |     for (int l = -window_radius; l <= window_radius; l++)
59 |     {
60 |       const int pix_jx = pix_ix + k;
61 |       const int pix_jy = pix_iy + l;
62 |       double distance = 0;
63 | 
64 |       // Coordinates are tested against the image, not the tile, so tile rows
65 |       // that were never staged are skipped instead of read uninitialized.
66 |       if (pix_jx < 0 || pix_jx >= N || pix_jy < 0 || pix_jy >= N)
67 |         continue;
68 | 
69 |       // Iterate through every pix_j neighbors
70 |       for (int p = -neighbor_radius; p <= neighbor_radius; p++)
71 |         for (int q = -neighbor_radius; q <= neighbor_radius; q++)
72 |         {
73 |           if (pix_jx + p < 0 || pix_jx + p >= N ||
74 |               pix_jy + q < 0 || pix_jy + q >= N ||
75 |               pix_ix + p < 0 || pix_ix + p >= N ||
76 |               pix_iy + q < 0 || pix_iy + q >= N)
77 |             continue;
78 | 
79 |           const double neighbor_j = TEMP(pix_jx + p - tile_top, pix_jy + q);
80 |           const double neighbor_i = TEMP(pix_ix + p - tile_top, pix_iy + q);
81 |           distance += (neighbor_i - neighbor_j) * (neighbor_i - neighbor_j);
82 |         }
83 | 
84 |       // Derive weight for pixels i and j
85 |       const double weight = __expf(-(distance / filtSigma +
86 |             (k*k + l*l) * (1.0f)/(float)(WINDOW_SIZE* WINDOW_SIZE)));
87 | 
88 |       sum_weights += weight;
89 |       output += TEMP(pix_jx - tile_top, pix_jy) * weight;
90 |     }
91 | 
92 |   // Normalize and write the pixel back to global memory
93 |   output_grid[pix_iy + pix_ix * N] = output * (1 / sum_weights);
94 | }
95 | 


--------------------------------------------------------------------------------