├── iv.png
├── rocks.png
├── Debug
    ├── 3.png
    ├── main.o
    ├── dehazing
    └── kernels.o
├── images
    ├── rock.png
    └── forest.jpg
├── README.md
├── Makefile
├── dehazing.h
├── main.cpp
└── dehazing.cu


/iv.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/arsenalliu123/dehazing-GPU/HEAD/iv.png


--------------------------------------------------------------------------------
/rocks.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/arsenalliu123/dehazing-GPU/HEAD/rocks.png


--------------------------------------------------------------------------------
/Debug/3.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/arsenalliu123/dehazing-GPU/HEAD/Debug/3.png


--------------------------------------------------------------------------------
/Debug/main.o:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/arsenalliu123/dehazing-GPU/HEAD/Debug/main.o


--------------------------------------------------------------------------------
/Debug/dehazing:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/arsenalliu123/dehazing-GPU/HEAD/Debug/dehazing


--------------------------------------------------------------------------------
/Debug/kernels.o:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/arsenalliu123/dehazing-GPU/HEAD/Debug/kernels.o


--------------------------------------------------------------------------------
/images/rock.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/arsenalliu123/dehazing-GPU/HEAD/images/rock.png


--------------------------------------------------------------------------------
/images/forest.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/arsenalliu123/dehazing-GPU/HEAD/images/forest.jpg


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | # CUDA implementation of a dehazing algorithm using the dark channel prior
 2 | 
 3 | Dehazing algorithm implemented on CUDA.
 4 | 
 5 | ## Features
 6 | - Uses OpenCV to read images and processes them on the GPU
 7 | - Shared memory optimization
 8 | - Multi-platform support (Windows, Linux, Mac)
 9 | 
10 | ## Usage
11 | 
12 | Make sure you have OpenCV and the CUDA toolkit installed, and an NVIDIA graphics card.
13 | 
14 | ```sh
15 | git clone https://github.com/arsenalliu123/dehazing-GPU.git
16 | cd dehazing-GPU
17 | make clean && make
18 | Debug/dehazing -h
19 | ```
20 | 
21 | **Developed by Yichen Liu and Yin Lin**
22 | 


--------------------------------------------------------------------------------
/Makefile:
--------------------------------------------------------------------------------
 1 | CXX=g++
 2 | 
 3 | CUDA_INSTALL_PATH=/usr/local/cuda-6.5
 4 | CFLAGS= -I. -I$(CUDA_INSTALL_PATH)/include `pkg-config --cflags opencv`
 5 | LDFLAGS= -L$(CUDA_INSTALL_PATH)/lib64 -lcudart `pkg-config --libs opencv`
 6 | COMPILE_FLAGS= -mcmodel=large -fPIC -g -Wall
 7 | 
 8 | #Uncomment the line below if you dont have CUDA enabled GPU
 9 | #EMU=-deviceemu
10 | 
11 | ifdef EMU
12 | CUDAFLAGS+=-deviceemu
13 | endif
14 | 
15 | all:
16 | 	$(CXX) $(COMPILE_FLAGS) -c main.cpp -o Debug/main.o $(CFLAGS)
17 | 	nvcc -c dehazing.cu -o Debug/kernels.o $(CUDAFLAGS) 
18 | 	$(CXX) $(COMPILE_FLAGS) Debug/main.o Debug/kernels.o -o Debug/dehazing $(LDFLAGS)
19 | 
20 | clean:
21 | 	rm -f Debug/*.o Debug/dehazing
22 | 
23 | 


--------------------------------------------------------------------------------
/dehazing.h:
--------------------------------------------------------------------------------
 1 | /*
 2 |  * dehazing.h
 3 |  *
 4 |  *  Created on: Apr 8, 2015
 5 |  *      Author: river
 6 |  */
 7 | 
 8 | #ifndef DEHAZING_H_
 9 | #define DEHAZING_H_
10 | 
11 | #include <cuda_runtime.h>
12 | #include <cublas_v2.h>
13 | 
/*
 * Evaluates a CUDA runtime call exactly once. If the result is not
 * cudaSuccess, prints the error string plus the line/file of the call site
 * to stderr and terminates the process with exit status 1.
 *
 * The body is wrapped in do { ... } while (0) so the macro expands to a
 * single statement and stays correct inside an unbraced if/else.
 * It uses fprintf and exit, so the including file must also include
 * <stdio.h> and <stdlib.h>.
 */
#define CUDA_CHECK_RETURN(value) do {										\
	cudaError_t _m_cudaStat = (value);										\
	if (_m_cudaStat != cudaSuccess) {										\
		fprintf(stderr, "Error %s at line %d in file %s\n",					\
				cudaGetErrorString(_m_cudaStat), __LINE__, __FILE__);		\
		exit(1);															\
	} } while (0)
21 | 
/*
 * Integer ceiling of a floating-point expression X (result has type int).
 * Every use of X is parenthesized so that arbitrary expressions (e.g. a
 * conditional or bitwise expression) keep their meaning after expansion.
 * X is evaluated more than once, so it must not have side effects.
 */
#define CEIL(X) ((((X)-(int)(X)) > 0) ? (int)((X)+1) : (int)(X))
23 | 
24 | //dark channel prior
25 | void dark_channel(
26 | 		float *image,
27 | 		float *img_grey,
28 | 		float *dark_channel,
29 | 		int height,
30 | 		int width,
31 | 		dim3 blocks,
32 | 		dim3 grids
33 | 		);
34 | 
35 | //air light (RGB of maximum dark prior channle pixel)
36 | void air_light(
37 | 		float *image,
38 | 		float *dark,
39 | 		int height,
40 | 		int width,
41 | 		dim3 blocks,
42 | 		dim3 grids
43 | 		);
44 | 
45 | void dehaze(
46 | 	float *image,
47 | 	float *dark,
48 | 	float *t,
49 | 	int height,
50 | 	int width,
51 | 	dim3 blocks,
52 | 	dim3 grids
53 | 	);
54 | 
55 | void transmission(
56 | 	float *image,
57 | 	float *t,
58 | 	int height,
59 | 	int width,
60 | 	dim3 blocks,
61 | 	dim3 grids
62 | 	);
63 | 
64 | void gfilter(
65 | 	float *filter,
66 | 	float *img_gray,
67 | 	float *trans,
68 | 	int height,
69 | 	int width,
70 | 	dim3 blocks,
71 | 	dim3 grids
72 | 	);//filter: guided imaging filter result
73 | 
74 | #endif /* DEHAZING_H_ */
75 | 
76 | 
77 | 


--------------------------------------------------------------------------------
/main.cpp:
--------------------------------------------------------------------------------
  1 | /*
  2 |  * main.cpp
  3 |  *
  4 |  *  Created on: Apr 12, 2015
  5 |  *      Author: river
  6 |  */
  7 | 
  8 | #ifdef __APPLE__
  9 |         #include <sys/uio.h>
 10 | #else
 11 |         #include <sys/io.h>
 12 | #endif
 13 | 
 14 | #include "iostream"
 15 | #include "time.h"
 16 | #include "string.h"
 17 | #include <stdio.h>
 18 | #include <stdlib.h>
 19 | #include "limits.h"
 20 | #include <unistd.h>
 21 | #include "dehazing.h"
 22 | #include "opencv2/opencv.hpp"
 23 | 
 24 | using namespace cv;
 25 | using namespace std;
 26 | 
 27 | // Define Const
 28 | clock_t start , finish ;
 29 | float lambda=0.0001;	//lambda
 30 | double _w=0.95;			//w
 31 | int height=0;			//image Height
 32 | int width=0;			//image Width
 33 | int size=0;			//total number of pixels
 34 | int blockdim = 32;
 35 | 
 36 | char img_name[100]="1.png";
 37 | char out_name[100]="2.png";
 38 | char trans_name[100]="3.png";
 39 | char dark_name[100]="4.png";
 40 | 
 41 | /*
 42 |  * dehazing procedures
 43 |  */
 44 | 
 45 | //read from img_name
 46 | Mat read_image(){
 47 | 
 48 | 	Mat img = imread(img_name);
 49 | 	height = img.rows;
 50 | 	width = img.cols;
 51 | 	size = img.rows*img.cols;
 52 | 	Mat real_img(img.rows,img.cols,CV_32FC3);
 53 | 	img.convertTo(real_img,CV_32FC3);
 54 | 	return real_img;
 55 | }
 56 | 
 57 | 
 58 | //************* Utility Functions **********
 59 | //Print Matrix
 60 | void printMat(char * name,Mat m)
 61 | {
 62 | 	cout<<name<<"\n"<<m<<endl;
 63 | }
 64 | 
 65 | //Print Matrix Information
 66 | void printMatInfo(Mat *m)
 67 | {
 68 | 	cout<<"\t"<<"cols="<<m->cols<<endl;
 69 | 	cout<<"\t"<<"rows="<<m->rows<<endl;
 70 | 	cout<<"\t"<<"channels="<<m->channels()<<endl;
 71 | }
 72 | 
 73 | //Process Args from command line
 74 | void processArgs(int argc, char * argv[])
 75 | {
 76 | 	for(int i=1;i<argc;i++)
 77 | 	{
 78 | 		if(strcmp(argv[i], "-h")==0){
 79 | 			printf("usage: -o output -i input -r filtered_transmission -t transmission.\n");
 80 | 			exit(1);
 81 | 		}
 82 | 		else if(strcmp(argv[i],"-o")==0){
 83 | 			i++;
 84 | 			strcpy(out_name,argv[i]);
 85 | 		}
 86 | 		else if(strcmp(argv[i],"-i")==0){
 87 | 			i++;
 88 | 			strcpy(img_name,argv[i]);
 89 | 		}
 90 | 		else if(strcmp(argv[i],"-t")==0){
 91 | 			i++;
 92 | 			strcpy(dark_name,argv[i]);
 93 | 		}
 94 | 		else if(strcmp(argv[i],"-r")==0){
 95 | 			i++;
 96 | 			strcpy(trans_name,argv[i]);
 97 | 		}
 98 | 		else{
 99 | 			printf("use -h to see usage.\n");
100 | 			exit(1);
101 | 		}
102 | 	}
103 | }
104 | 
105 | void finish_clock(){
106 | 	finish=clock();
107 | 	double duration=( double )( finish - start )/ CLOCKS_PER_SEC * 1000;
108 | 	cout<<"Time Cost: "<<duration<<"ms"<<endl;
109 | 	waitKey(1000);
110 | 	cout<<endl;
111 | }
112 | 
113 | void start_clock(){
114 | 	start=clock();
115 | }
116 | 
117 | //Main Function
118 | int main(int argc, char * argv[])
119 | {
120 | 	char filename[100];
121 | 	processArgs(argc,argv);
122 | 
123 | 	while(access(img_name,0)!=0)
124 | 	{
125 | 		cout<<"The image "<<img_name<<" don't exist."<<endl<<"Please enter another one:"<<endl;
126 | 		cin>>filename;
127 | 		strcpy(img_name,filename);
128 | 	}
129 | 
130 | 	cout<<"Reading Image ..."<<endl;
131 | 
132 | 	start_clock();
133 | 	Mat img = read_image();
134 | 	
135 | 	float* cpu_image = (float *)malloc((size+1) * 3 * sizeof(float));
136 | 	float *dark_image = (float *)malloc(size * sizeof(float));
137 | 	float *trans_image = (float *)malloc(size * sizeof(float));
138 | 
139 | 	/* load img into CPU float array and GPU float array */
140 | 	if (!cpu_image)
141 | 	{
142 | 		std::cout << "ERROR: Failed to allocate memory" << std::endl;
143 | 		return -1;
144 | 	}
145 | 	for (int i = 0; i < height; i++){
146 | 		for(int j = 0; j < width; j++)
147 | 		{
148 | 			for(int k = 0; k < 3; k++){
149 | 				cpu_image[(i * width + j) * 3 + k] = img.at<Vec<float,3> >(i,j)[k];
150 | 			}
151 | 		}
152 | 	}
153 | 	cpu_image[size] = 0;
154 | 	cpu_image[size+1] = 0;
155 | 	cpu_image[size+2] = 0;
156 | 
157 | 	
158 | 	float *gpu_image = NULL;
159 | 	float *dark = NULL;
160 | 	float *img_gray = NULL;
161 | 	//size+1 for storing the airlight
162 | 	CUDA_CHECK_RETURN(cudaMalloc((void **)(&gpu_image), ((size+1) * 3) * sizeof(float)));
163 | 
164 | 	CUDA_CHECK_RETURN(cudaMalloc((void **)(&dark), size * sizeof(float)));
165 | 	
166 | 	CUDA_CHECK_RETURN(cudaMalloc((void **)(&img_gray),size * sizeof(float)));
167 | 
168 | 	CUDA_CHECK_RETURN(cudaMemcpy(gpu_image, cpu_image, ((size+1) * 3) * sizeof(float), cudaMemcpyHostToDevice));
169 | 	
170 |     	float *trans = NULL;
171 |     	CUDA_CHECK_RETURN(cudaMalloc((void **)(&trans), size * sizeof(float)));
172 | 
173 |     	float *filter = NULL;
174 |     	CUDA_CHECK_RETURN(cudaMalloc((void **)(&filter), size * sizeof(float)));
175 |     	/////////////////
176 | 	printf("height: %d width: %d\n", height, width);
177 | 
178 | 	finish_clock();
179 | 	/*
180 | 	 * Dehazing Algorithm:
181 | 	 * 1. Calculate Dark Prior
182 | 	 * 2. Calculate Air Light
183 | 	 * 3. Get the image
184 | 	 */
185 | 
186 | 	//define the block size and grid size
187 | 	cout<<"Calculating Dark Channel Prior ..."<<endl;
188 | 	start_clock();
189 | 	dim3 block(blockdim, blockdim);
190 | 	int grid_size_x = CEIL(double(height) / blockdim);
191 | 	int grid_size_y = CEIL(double(width) / blockdim);
192 | 	dim3 grid(grid_size_x, grid_size_y);
193 | 	//dark channel: dark
194 | 	dark_channel(gpu_image, img_gray, dark, height, width, block, grid);
195 | 	finish_clock();
196 | 
197 | 	cout<<"Calculating Airlight ..."<<endl;
198 | 	start_clock();
199 | 	dim3 block_air(1024);
200 | 	dim3 grid_air(CEIL(double(size) / block_air.x));
201 | 	//airlight: gpu_image[height*width]
202 | 	air_light(gpu_image, dark, height, width, block_air, grid_air);
203 | 	finish_clock();
204 |     
205 | 	cout<<"Calculating transmission ..."<<endl;
206 | 	start_clock();
207 |     	//t: transmission
208 |     	transmission(gpu_image, trans, height, width, block, grid);
209 | 	finish_clock();
210 | 
211 | 	cout<<"Refining transmission ..."<<endl;
212 | 	dim3 block_guide(blockdim, blockdim);
213 | 	int grid_size_x_guide = CEIL(double(height) / blockdim);
214 | 	int grid_size_y_guide = CEIL(double(width) / blockdim);
215 | 	dim3 grid_guide(grid_size_x_guide, grid_size_y_guide);
216 | 	//filter: guided imaging filter result
217 |     	gfilter(filter, img_gray, trans, height, width, block_guide, grid_guide);
218 | 	finish_clock();
219 |     
220 | 	cout<<"Calculating dehaze ..."<<endl;
221 |     	start_clock();
222 |     	dehaze(gpu_image, dark, filter, height, width, block, grid);//dehaze image: ori_image
223 |     	finish_clock();
224 |     
225 | 
226 | 	/*
227 | 	 * copy back to CPU memory
228 | 	 */
229 | 	cout<<"Copy back to host memory ..."<<endl;
230 | 	start_clock();
231 | 	
232 | 	CUDA_CHECK_RETURN(cudaFree(dark));
233 | 	
234 | 	CUDA_CHECK_RETURN(cudaMemcpy(trans_image, filter, size * sizeof(float), cudaMemcpyDeviceToHost));
235 | 	CUDA_CHECK_RETURN(cudaFree(filter));
236 | 	
237 | 	CUDA_CHECK_RETURN(cudaMemcpy(dark_image, trans, size * sizeof(float), cudaMemcpyDeviceToHost));
238 | 	CUDA_CHECK_RETURN(cudaFree(trans));
239 | 	
240 | 	CUDA_CHECK_RETURN(cudaMemcpy(cpu_image, gpu_image, ((size+1) * 3) * sizeof(float), cudaMemcpyDeviceToHost));
241 | 	CUDA_CHECK_RETURN(cudaFree(gpu_image));
242 | 	
243 | 	/*
244 | 	printf("air light: %.2f %.2f %.2f\n", 
245 | 		cpu_image[3*size], 
246 | 		cpu_image[3*size+1], 
247 | 		cpu_image[3*size+2]);;
248 | 	*/
249 | 
250 | 	for(int i=0;i<size;i++){
251 | 		trans_image[i] *= 255.f;
252 | 		dark_image[i] *= 255.f;
253 | 	}
254 | 
255 | 	Mat dest(height, width, CV_32FC3, cpu_image);
256 | 	Mat trans_dest(height, width, CV_32FC1, trans_image);
257 | 	Mat dark_dest(height, width, CV_32FC1, dark_image);
258 | 	
259 | 	imwrite(out_name, dest);
260 | 	imwrite(trans_name, trans_dest);
261 | 	imwrite(dark_name, dark_dest);
262 | 	
263 | 	free(cpu_image);
264 | 	free(trans_image);
265 | 	free(dark_image);
266 | 
267 | 	free(cpu_image);
268 | 	free(trans_image);
269 | 	free(dark_image);
270 | 	
271 | 	finish_clock();
272 | 	return 0;
273 | }
274 | 


--------------------------------------------------------------------------------
/dehazing.cu:
--------------------------------------------------------------------------------
  1 | #include "dehazing.h"
  2 | #include "stdio.h"
  3 | 
  4 | 
  5 | 
  6 | 
  7 | //convenient macros
  8 | #define IN_GRAPH(x,y,h,w) ((x>=0)&&(x<h)&&(y>=0)&&(y<w))
  9 | #define min(x,y) ((x<y)?x:y)
 10 | #define max(x,y) ((x>y)?x:y)
 11 | #define WINDOW 7
 12 | #define R 15
 13 | 
 14 | /*
 15 |  * dark_channel host wrapper and kernel
 16 |  */
 17 | //first kernel calculate min of RGB
 18 | 
 19 | void printinfo(float *dark, int height, int width){
 20 | 	float *xx = (float *)malloc(sizeof(float)*height*width);
 21 | 	CUDA_CHECK_RETURN(cudaMemcpy(xx, dark, height * width * sizeof(float), cudaMemcpyDeviceToHost));
 22 | 	for(int i=0;i<height*width;i++){printf("%.2f ", xx[i]);}
 23 | 	
 24 | 
 25 | }
 26 | 
 27 | __global__
 28 | void dark_kernel1(float3 *image, float *img_grey, float *dark, int height, int width){
 29 | 	const int x = blockIdx.x * blockDim.x + threadIdx.x;
 30 | 	const int y = blockIdx.y * blockDim.y + threadIdx.y;
 31 | 	const int i = x * width + y;
 32 | 	if(x < height && y < width){
 33 | 		dark[i] = min(image[i].x, min(image[i].y, image[i].z));
 34 | 		img_grey[i] = image[i].x * 0.299 +  image[i].y * 0.587 + image[i].z * 0.114;
 35 | 	}
 36 | }
 37 | 
 38 | __device__
 39 | void padding(float *buffer, float *dark,
 40 | 	int x, int y,
 41 | 	int tx, int ty,
 42 | 	int window,
 43 | 	int bdimx, int bdimy,
 44 | 	int height, int width){
 45 | 	const int si = (tx + window) * (bdimy + window * 2) + ty + window;
 46 | 	int i = x*width + y;
 47 | 	buffer[si] = dark[i];
 48 | 	if(tx < window && IN_GRAPH(x-window, y, height, width) ){
 49 | 		buffer[si - (bdimy + window * 2) * window] = dark[i - window * width];
 50 | 		if(ty < window &&
 51 | 			IN_GRAPH(x-window, y-window, height, width) ){
 52 | 			buffer[si - (bdimy + window * 2) * window - window]
 53 | 			= dark[i - window * width - window];
 54 | 		}
 55 | 		if(ty >= bdimy - window &&
 56 | 			IN_GRAPH(x-window, y+window, height, width) ){
 57 | 			buffer[si - (bdimy + window * 2) * window + window]
 58 | 		       = dark[i - window * width + window];
 59 | 		}
 60 | 	}
 61 | 	if(tx >= bdimx - window && IN_GRAPH(x+window, y, height, width) ){
 62 | 		buffer[si + (bdimy + window * 2) * window] = dark[i + window * width];
 63 | 		if(ty >= bdimy - window &&
 64 | 			IN_GRAPH(x+window, y+window, height, width) ){
 65 | 				buffer[si + (bdimy + window * 2) * window + window]
 66 | 				       = dark[i + window * width + window];
 67 | 		}
 68 | 		if(ty < window &&
 69 | 			IN_GRAPH(x+window, y-window, height, width) ){
 70 | 				buffer[si + (bdimy + window * 2) * window - window]
 71 | 				       = dark[i + window * width - window];
 72 | 		}
 73 | 
 74 | 	}
 75 | 	if(ty >= bdimy - window && IN_GRAPH(x, y+window, height, width) ){
 76 | 		buffer[si + window] = dark[i + window];
 77 | 	}
 78 | 	if(ty < window && IN_GRAPH(x, y-window, height, width) ){
 79 | 		buffer[si - window] = dark[i - window];
 80 | 	}
 81 | 
 82 | }
 83 | 
 84 | 
 85 | //second kernel calculate min of 15 X 15 ceil
 86 | __global__
 87 | void dark_kernel2(float *dark, float *new_dark, int height, int width, int window){
 88 | 	extern __shared__ float buffer[];
 89 | 	const int x = blockIdx.x * blockDim.x + threadIdx.x;
 90 | 	const int y = blockIdx.y * blockDim.y + threadIdx.y;
 91 | 	const int i = x * width + y;
 92 | 	if(x < height && y < width){
 93 | 		
 94 | 		//using shared memory
 95 | 		padding(buffer, dark,
 96 | 			x, y,
 97 | 			threadIdx.x, threadIdx.y,
 98 | 			window,
 99 | 			blockDim.x, blockDim.y,
100 | 			height, width);
101 | 
102 | 		__syncthreads();
103 | 		
104 | 		float minval = 255.0;
105 | 		for(int startx = 0; startx < window * 2 + 1; startx++){
106 | 			for(int starty = 0; starty < window * 2 + 1; starty++){
107 | 				if(IN_GRAPH(x-window+startx, y-window+starty, height, width)){
108 | 					int shared_row_index = (threadIdx.x+startx)*(blockDim.y + window * 2);
109 | 					int shared_index = shared_row_index + threadIdx.y + starty;
110 | 					minval = min(buffer[shared_index],minval);
111 | 				}
112 | 			}
113 | 		}
114 | 		new_dark[i] = minval;
115 | 
116 | 		/*
117 | 		//using global memory
118 | 		float minval = 255.0;
119 | 		for(int startx = 0; startx < window * 2 + 1; startx++){
120 | 			for(int starty = 0; starty < window * 2 + 1; starty++){
121 | 				if(IN_GRAPH(x-window+startx, y-window+starty, height, width)){
122 | 					minval = min(dark[i+(startx-window)*width+starty-window], minval);
123 | 					//if(minval-(int)minval>0){printf("%d %d %.2f\n", x-window+startx, y-window+starty, minval);}
124 | 				}
125 | 			}
126 | 		}	
127 | 		//if(minval-(int)minval>0){printf("%.2f\n", minval);}
128 | 
129 | 		buffer[threadIdx.x*blockDim.y + threadIdx.y] = minval;
130 | 		__syncthreads();
131 | 		new_dark[i] = buffer[threadIdx.x*blockDim.y + threadIdx.y];
132 | 		*/
133 | 	}
134 | }
135 | 
//Host wrapper for the dark channel.
//  image        device buffer, reinterpreted as float3 pixels
//  img_grey     device output of dark_kernel1 (weighted channel sum per pixel)
//  dark_channel device output: windowed minimum (radius WINDOW) of the
//               per-pixel channel minimum
//  blocks/grids launch configuration used for both kernels
//Allocation, launch and free errors terminate through CUDA_CHECK_RETURN.
void dark_channel(float *image, float *img_grey, float *dark_channel, int height, int width, dim3 blocks, dim3 grids){

	float *tmp_dark = NULL;
	CUDA_CHECK_RETURN(cudaMalloc((void **)(&tmp_dark), sizeof(float)*height*width));

	dark_kernel1<<<grids, blocks>>> ((float3 *)image, img_grey, tmp_dark, height, width);
	CUDA_CHECK_RETURN(cudaGetLastError());	//launches report bad configurations only here

	int window = WINDOW;
	//tile of the block plus a border of `window` pixels on every side
	int shared_size = (blocks.x + window * 2) * (blocks.y + window * 2) * sizeof(float);
	dark_kernel2<<<grids, blocks, shared_size>>>(tmp_dark, dark_channel, height, width, window);
	CUDA_CHECK_RETURN(cudaGetLastError());

	CUDA_CHECK_RETURN(cudaFree(tmp_dark));
}
149 | 
150 | /*
151 |  * air_light host wrapper and kernel
152 |  */
153 | 
//first kernel: every block reduces its slice of the width*height pixels to
//the one with the largest dark value and writes that pixel and its dark value
//to int_image[blockIdx.x] / int_dark[blockIdx.x].
//Launch layout: 1D grid of 1D blocks covering width*height elements.
//Dynamic shared memory: blockDim.x * (sizeof(float3) + sizeof(float)) bytes.
__global__
void airlight_kernel1(
		float3 *image, float *dark,
		int height, int width,
		float3 *int_image, float *int_dark){
	extern __shared__ float3 tmp_image[];
	float *tmp_dark = (float *)(tmp_image + blockDim.x);
	const int total = width * height;
	const int i = blockDim.x * blockIdx.x + threadIdx.x;
	if(total <= 0){
		return;	//uniform for the whole grid, so no barrier is skipped by a subset
	}

	//Threads past the end reload the last valid element. A duplicate cannot
	//change the maximum, every shared slot is initialized, and all threads
	//reach the barriers below (the original returned early for those threads
	//and then read uninitialized slots in the last block).
	const int src = (i < total) ? i : total - 1;
	tmp_image[threadIdx.x] = image[src];
	tmp_dark[threadIdx.x] = dark[src];
	__syncthreads();

	//Tree reduction that is also correct when blockDim.x is not a power of
	//two: each round folds the upper part [half, n) onto [0, n/2).
	for(unsigned int n = blockDim.x; n > 1; ){
		const unsigned int half = (n + 1) / 2;
		if(threadIdx.x < n / 2 &&
			tmp_dark[threadIdx.x + half] > tmp_dark[threadIdx.x]){
			tmp_dark[threadIdx.x] = tmp_dark[threadIdx.x + half];
			tmp_image[threadIdx.x] = tmp_image[threadIdx.x + half];
		}
		__syncthreads();
		n = half;
	}

	//blocks that start past the end write nothing, as before
	if(threadIdx.x == 0 && i < total){
		int_image[blockIdx.x] = tmp_image[0];
		int_dark[blockIdx.x] = tmp_dark[0];
	}
}
183 | 
//calculate air light: reduces the blockDim.x candidates in int_image /
//int_dark to the one with the largest dark value and stores that pixel in
//image[size], i.e. the extra float3 slot behind the last pixel.
//Launch layout: a single 1D block with one thread per candidate.
//Dynamic shared memory: blockDim.x * (sizeof(float3) + sizeof(float)) bytes.
__global__
void airlight_kernel2(float3 *image, int size, float3 *int_image, float *int_dark){

	extern __shared__ float3 tmp_image[];
	float *tmp_dark = (float *)(tmp_image + blockDim.x);
	tmp_image[threadIdx.x] = int_image[threadIdx.x];
	tmp_dark[threadIdx.x] = int_dark[threadIdx.x];
	__syncthreads();

	//The candidate count is the block count of the previous kernel and is in
	//general not a power of two. Halving blockDim.x repeatedly would then skip
	//elements (with 5 candidates, element 4 is never compared). Folding
	//[half, n) onto [0, n/2) with half = ceil(n/2) visits every element.
	for(unsigned int n = blockDim.x; n > 1; ){
		const unsigned int half = (n + 1) / 2;
		if(threadIdx.x < n / 2 &&
			tmp_dark[threadIdx.x + half] > tmp_dark[threadIdx.x]){
			tmp_dark[threadIdx.x] = tmp_dark[threadIdx.x + half];
			tmp_image[threadIdx.x] = tmp_image[threadIdx.x + half];
		}
		__syncthreads();
		n = half;
	}

	if(threadIdx.x == 0){
		image[size] = tmp_image[0];
	}
}
208 | 
//Host wrapper for the air light: finds the pixel with the largest dark value
//and stores its colour in the float3 slot at index height*width of image.
//  image   device buffer with height*width float3 pixels plus one extra slot
//  dark    device buffer with one dark value per pixel
//  blocks  1D block size used by the reduction kernels
//  grids   1D grid covering height*width elements with `blocks`
//The first pass leaves one candidate per block. While there are more
//candidates than fit into one block, airlight_kernel1 is applied again to the
//candidates, so airlight_kernel2 is always launched with at most blocks.x
//threads (the original launched it with grids.x threads, which is not a
//valid block size for large images).
void air_light(float *image, float *dark, int height, int width, dim3 blocks, dim3 grids){

	const size_t per_thread = sizeof(float3) + sizeof(float);	//shared bytes per thread
	const int total = height * width;

	float3 *cand_image = NULL;
	float *cand_dark = NULL;
	unsigned int count = grids.x;	//number of candidates after the first pass

	CUDA_CHECK_RETURN(cudaMalloc((void **)(&cand_image), sizeof(float3)*count));
	CUDA_CHECK_RETURN(cudaMalloc((void **)(&cand_dark), sizeof(float)*count));

	airlight_kernel1<<<grids, blocks, blocks.x*per_thread>>> ((float3 *)image, dark, height, width, cand_image, cand_dark);
	CUDA_CHECK_RETURN(cudaGetLastError());

	//blocks.x > 1 guarantees that every pass shrinks the candidate count
	while(blocks.x > 1 && count > blocks.x){
		const unsigned int next = (count + blocks.x - 1) / blocks.x;
		float3 *next_image = NULL;
		float *next_dark = NULL;
		CUDA_CHECK_RETURN(cudaMalloc((void **)(&next_image), sizeof(float3)*next));
		CUDA_CHECK_RETURN(cudaMalloc((void **)(&next_dark), sizeof(float)*next));

		//the kernel only uses height*width as the element count
		airlight_kernel1<<<next, blocks, blocks.x*per_thread>>> (cand_image, cand_dark, 1, (int)count, next_image, next_dark);
		CUDA_CHECK_RETURN(cudaGetLastError());

		CUDA_CHECK_RETURN(cudaFree(cand_image));
		CUDA_CHECK_RETURN(cudaFree(cand_dark));
		cand_image = next_image;
		cand_dark = next_dark;
		count = next;
	}

	airlight_kernel2<<<1, count, count*per_thread>>> ((float3 *)image, total, cand_image, cand_dark);
	CUDA_CHECK_RETURN(cudaGetLastError());

	//the original never released the intermediate buffers
	CUDA_CHECK_RETURN(cudaFree(cand_image));
	CUDA_CHECK_RETURN(cudaFree(cand_dark));
}
228 | 
//For every pixel, divides each channel by the matching channel of the value
//stored at image[height*width] and writes the smallest of the three quotients
//to t. Launch layout: 2D grid of 2D blocks (x = rows, y = columns).
__global__
void transmission1_kernel(float3 *image, float *t, int height, int width){
	const int x = blockIdx.x * blockDim.x + threadIdx.x;
	const int y = blockIdx.y * blockDim.y + threadIdx.y;
	if(x >= height || y >= width){
		return;
	}
	const int i = x * width + y;
	const float3 pixel = image[i];
	const float3 airlight = image[height*width];
	const float tx = pixel.x/airlight.x;
	const float ty = pixel.y/airlight.y;
	const float tz = pixel.z/airlight.z;
	//smallest of the three ratios
	const float tyz = (ty < tz) ? ty : tz;
	t[i] = (tx < tyz) ? tx : tyz;
}
242 | 
//Windowed minimum of `dark` over the (2*window+1) x (2*window+1) cell around
//every pixel (neighbours outside the image are skipped, the running minimum
//starts at 1), written to new_dark as 1 - 0.95 * minimum.
//Launch layout: 2D grid of 2D blocks (x = rows, y = columns).
//Dynamic shared memory: (blockDim.x + 2*window) * (blockDim.y + 2*window) floats.
__global__
void transmission2_kernel(float *dark, float *new_dark, int height, int width, int window){
	extern __shared__ float buffer[];
	const int x = blockIdx.x * blockDim.x + threadIdx.x;
	const int y = blockIdx.y * blockDim.y + threadIdx.y;
	const bool inside = (x < height) && (y < width);

	//using shared memory
	if(inside){
		padding(buffer, dark,
			x, y,
			threadIdx.x, threadIdx.y,
			window,
			blockDim.x, blockDim.y,
			height, width);
	}

	//reached by every thread of the block, including those outside the image
	__syncthreads();

	if(inside){
		const int pitch = blockDim.y + window * 2;
		float minval = 1.0f;
		for(int startx = 0; startx < window * 2 + 1; startx++){
			for(int starty = 0; starty < window * 2 + 1; starty++){
				if(IN_GRAPH(x-window+startx, y-window+starty, height, width)){
					const int shared_index = (threadIdx.x+startx) * pitch + threadIdx.y + starty;
					minval = fminf(buffer[shared_index], minval);
				}
			}
		}

		//float literals keep the arithmetic in single precision
		new_dark[x * width + y] = 1.0f - 0.95f * minval;
	}
}
275 | 
276 | 
277 | 
//Host wrapper for the transmission estimate.
//  image  device buffer (float3 pixels, air light stored at index height*width)
//  t      device output, one float per pixel
//  blocks/grids launch configuration used for both kernels
//Allocation, launch and free errors terminate through CUDA_CHECK_RETURN.
void transmission(float *image, float *t, int height, int width, dim3 blocks,dim3 grids){
	float *tmp_trans = NULL;
	CUDA_CHECK_RETURN(cudaMalloc((void **)&tmp_trans, sizeof(float)*height*width));

	transmission1_kernel<<<grids, blocks>>> ((float3 *)image, tmp_trans, height, width);
	CUDA_CHECK_RETURN(cudaGetLastError());

	int window = WINDOW;
	//tile of the block plus a border of `window` pixels on every side
	int shared_size = (blocks.x + window * 2) * (blocks.y + window * 2) * sizeof(float);
	transmission2_kernel<<<grids, blocks, shared_size>>>(tmp_trans, t, height, width, window);
	CUDA_CHECK_RETURN(cudaGetLastError());

	CUDA_CHECK_RETURN(cudaFree(tmp_trans));
}
287 | 
//Rewrites every pixel in place as (pixel - A) / max(0.1, t) + A, where A is
//the value stored at image[height*width] and t the per-pixel entry of `t`.
//`dark` is accepted for interface compatibility but not used.
//Launch layout: 2D grid of 2D blocks (x = rows, y = columns).
__global__
void dehaze_kernel(float3 *image, float *dark, float *t, int height, int width){
	const int x = blockIdx.x * blockDim.x + threadIdx.x;
	const int y = blockIdx.y * blockDim.y + threadIdx.y;
	if(x < height && y < width){
		const int i = x * width + y;
		const float3 airlight = image[height*width];
		const float3 pixel = image[i];
		//lower bound on the divisor; the float literal avoids double arithmetic
		const float tr = fmaxf(0.1f, t[i]);

		float3 restored;
		restored.x = (pixel.x - airlight.x)/tr + airlight.x;
		restored.y = (pixel.y - airlight.y)/tr + airlight.y;
		restored.z = (pixel.z - airlight.z)/tr + airlight.z;
		image[i] = restored;
	}
}
300 | 
//Host wrapper: launches dehaze_kernel on image (reinterpreted as float3
//pixels) with the given configuration and reports launch errors through
//CUDA_CHECK_RETURN.
void dehaze(float *image,float *dark, float *t, int height, int width, dim3 blocks,dim3 grids){
	dehaze_kernel<<<grids, blocks>>> ((float3 *)image, dark, t, height, width);
	CUDA_CHECK_RETURN(cudaGetLastError());
}
304 | 
//Fill kernel: stores val in every element of the height x width buffer img_in.
//Launch layout: 2D grid of 2D blocks (x = rows, y = columns).
__global__
void setones(float *img_in, int height, int width, float val){
	const int row = blockIdx.x * blockDim.x + threadIdx.x;
	const int col = blockIdx.y * blockDim.y + threadIdx.y;
	if(row >= height || col >= width){
		return;	//thread lies outside the image
	}
	img_in[row * width + col] = val;
}
314 | 
//Box filter: sums img_in over the (2*r+1) x (2*r+1) window around every pixel
//(neighbours outside the image are skipped) and writes sum / patch[i] to
//img_res. r: local window radius.
//Launch layout: 2D grid of 2D blocks (x = rows, y = columns).
//Dynamic shared memory: (blockDim.x + 2*r) * (blockDim.y + 2*r) floats.
__global__
void boxfilter_kernel(float *img_in, float *img_res, float *patch, int r, int height, int width){
	const int x = blockIdx.x * blockDim.x + threadIdx.x;
	const int y = blockIdx.y * blockDim.y + threadIdx.y;
	extern __shared__ float buffer[];
	const bool inside = (x < height) && (y < width);

	if(inside){
		padding(
			buffer, img_in,
			x, y,
			threadIdx.x, threadIdx.y,
			r,
			blockDim.x, blockDim.y,
			height, width);
	}

	//reached by every thread of the block, including those outside the image
	__syncthreads();

	if(inside){
		const int i = x * width + y;
		const int pitch = blockDim.y + r * 2;
		float val = 0.0f;
		for(int startx = 0; startx < r * 2 + 1; startx++){
			for(int starty = 0; starty < r * 2 + 1; starty++){
				if(IN_GRAPH(x-r+startx, y-r+starty, height, width)){
					val += buffer[(threadIdx.x+startx) * pitch + threadIdx.y + starty];
				}
			}
		}

		img_res[i] = val/patch[i];
	}
}
346 | 
//Box filter applied to two inputs at once: sums img_in and img_in2 over the
//(2*r+1) x (2*r+1) window around every pixel (neighbours outside the image
//are skipped) and writes sum / patch[i] to img_res and img_res2.
//r: local window radius.
//Launch layout: 2D grid of 2D blocks (x = rows, y = columns).
//Dynamic shared memory: 2 * (blockDim.x + 2*r) * (blockDim.y + 2*r) floats,
//the second half holding the tile of img_in2.
__global__
void boxfilter_kernel2(float *img_in,
	float *img_res,
	float *img_in2,
	float *img_res2,
	float *patch,
	int r,
	int height,
	int width){

	const int x = blockIdx.x * blockDim.x + threadIdx.x;
	const int y = blockIdx.y * blockDim.y + threadIdx.y;
	extern __shared__ float buffer[];
	float *buffer2 = buffer + (blockDim.x + r * 2) * (blockDim.y + r * 2);
	const bool inside = (x < height) && (y < width);

	if(inside){
		padding(
			buffer, img_in,
			x, y,
			threadIdx.x, threadIdx.y,
			r,
			blockDim.x, blockDim.y,
			height, width);

		padding(buffer2, img_in2,
			x, y,
			threadIdx.x, threadIdx.y,
			r,
			blockDim.x, blockDim.y,
			height, width);
	}

	//reached by every thread of the block, including those outside the image
	__syncthreads();

	if(inside){
		const int i = x * width + y;
		const int pitch = blockDim.y + r * 2;
		float val = 0.0f;
		float val2 = 0.0f;
		for(int startx = 0; startx < r * 2 + 1; startx++){
			for(int starty = 0; starty < r * 2 + 1; starty++){
				if(IN_GRAPH(x-r+startx, y-r+starty, height, width)){
					const int shared_index = (threadIdx.x+startx) * pitch + threadIdx.y + starty;
					val += buffer[shared_index];
					val2 += buffer2[shared_index];
				}
			}
		}

		img_res[i] = val/patch[i];
		img_res2[i] = val2/patch[i];
	}
}
400 | 
//Element-wise products: res1 = a.*b and res2 = a.*a
//Launch layout: 2D grid of 2D blocks (x = rows, y = columns).
__global__
void matmul_kernel(float *a, float *b, float *res1, float *res2, int height, int width){
	const int row = blockIdx.x * blockDim.x + threadIdx.x;
	const int col = blockIdx.y * blockDim.y + threadIdx.y;
	if(row >= height || col >= width){
		return;	//thread lies outside the image
	}
	const int idx = row * width + col;
	res1[idx] = a[idx]*b[idx];
	res2[idx] = a[idx]*a[idx];
}
412 | 
//Per-pixel statistics of the guided filter:
//  cov_IP = mean_IP - mean_I.*mean_P
//  var_I  = mean_II - mean_I.*mean_I
//  a      = cov_IP ./ (var_I + 0.000001)
//  b      = mean_P - a.*mean_I
//Launch layout: 2D grid of 2D blocks (x = rows, y = columns).
__global__
void var_kernel(float *a, float *b, float *mean_IP, float *mean_II, float *mean_I, float *mean_P, float *cov_IP, float *var_I, int height, int width){
	const int row = blockIdx.x * blockDim.x + threadIdx.x;
	const int col = blockIdx.y * blockDim.y + threadIdx.y;
	if(row >= height || col >= width){
		return;	//thread lies outside the image
	}
	const int idx = row * width + col;
	cov_IP[idx] = mean_IP[idx]-mean_I[idx]*mean_P[idx];
	var_I[idx] = mean_II[idx]-mean_I[idx]*mean_I[idx];
	a[idx] = cov_IP[idx]/(var_I[idx] + 0.000001);
	b[idx] = mean_P[idx] - a[idx]*mean_I[idx];
}
428 | /*
429 | __global__
430 | void compab_kernel(float *a, float *b, float *cov_IP, float *var_I, float *mean_P, float *mean_I, int height, int width){
431 | //a=cov_IP./(var_I.+eps);
432 | 	//eps = 10^-6
433 | //b=mean_P-a.*mean_I;
434 | 	const int x = blockIdx.x * blockDim.x + threadIdx.x;
435 | 	const int y = blockIdx.y * blockDim.y + threadIdx.y;
436 | 	const int i = x * width + y;
437 | 	if(x < height && y < width){
438 | 		a[i] = cov_IP[i]/(var_I[i] + 0.000001);
439 | 		b[i] = mean_P[i] - a[i]*mean_I[i];
440 | 	}
441 | 
442 | }
443 | */
//Final step of the guided filter: result = mean_a.*I + mean_b
//Launch layout: 2D grid of 2D blocks (x = rows, y = columns).
__global__
void result_kernel(float *result, float *mean_a, float *I, float *mean_b, int height, int width){
	const int row = blockIdx.x * blockDim.x + threadIdx.x;
	const int col = blockIdx.y * blockDim.y + threadIdx.y;
	if(row >= height || col >= width){
		return;	//thread lies outside the image
	}
	const int idx = row * width + col;
	result[idx] = mean_a[idx]*I[idx] + mean_b[idx];
}
//Allocate one height*width plane of floats on the device; terminates through
//CUDA_CHECK_RETURN if the allocation fails.
static float *alloc_plane(int height, int width){
	float *plane = NULL;
	CUDA_CHECK_RETURN(cudaMalloc((void **)(&plane), sizeof(float)*height*width));
	return plane;
}

//Guided image filter (window radius R).
//  I: guided image - origin gray scale image - 1 channel
//  P: imaged need to be filtered - transmission image - 1 channel
//  result: refined trans image - 1 channel
//  blocks/grids: launch configuration used for every kernel
//All buffers are device pointers holding height*width floats. Allocation,
//launch and free errors terminate through CUDA_CHECK_RETURN.
void gfilter(float *result, float *I, float *P, int height, int width, dim3 blocks, dim3 grids){

	int r = R;

	//shared memory for one tile with a border of r pixels, and for two tiles
	int shared_size = (blocks.x + r * 2) * (blocks.y + r * 2) * sizeof(float);
	int shared_size2 = 2 * shared_size;

	//compute N: box filter of an all-ones image, also dividing by ones
	float *N = alloc_plane(height, width);
	float *ones = alloc_plane(height, width);
	setones<<<grids, blocks>>> (ones, height, width, 1.0);
	CUDA_CHECK_RETURN(cudaGetLastError());
	boxfilter_kernel<<<grids, blocks, shared_size>>> (
		ones, N, ones, r, height, width);
	CUDA_CHECK_RETURN(cudaGetLastError());
	CUDA_CHECK_RETURN(cudaFree(ones));

	//compute mean_I and mean_P
	float *mean_I = alloc_plane(height, width);
	float *mean_P = alloc_plane(height, width);
	boxfilter_kernel2<<<grids, blocks, shared_size2>>> (
		I, mean_I, P, mean_P, N, r, height, width);
	CUDA_CHECK_RETURN(cudaGetLastError());

	//compute mean_IP and mean_II from the products I.*P and I.*I
	float *mean_IP = alloc_plane(height, width);
	float *mean_II = alloc_plane(height, width);
	float *ImulP = alloc_plane(height, width);
	float *ImulI = alloc_plane(height, width);
	matmul_kernel<<<grids, blocks>>> (I, P, ImulP, ImulI, height, width);
	CUDA_CHECK_RETURN(cudaGetLastError());
	boxfilter_kernel2<<<grids, blocks, shared_size2>>> (ImulP, mean_IP, ImulI, mean_II, N, r, height, width);
	CUDA_CHECK_RETURN(cudaGetLastError());
	CUDA_CHECK_RETURN(cudaFree(ImulP));
	CUDA_CHECK_RETURN(cudaFree(ImulI));

	//compute cov_IP, var_I and from them a and b
	float *a = alloc_plane(height, width);
	float *b = alloc_plane(height, width);
	float *cov_IP = alloc_plane(height, width);
	float *var_I = alloc_plane(height, width);
	var_kernel<<<grids, blocks>>> (a, b, mean_IP, mean_II, mean_I, mean_P, cov_IP, var_I, height, width);
	CUDA_CHECK_RETURN(cudaGetLastError());
	CUDA_CHECK_RETURN(cudaFree(mean_IP));
	CUDA_CHECK_RETURN(cudaFree(mean_II));
	CUDA_CHECK_RETURN(cudaFree(mean_I));
	CUDA_CHECK_RETURN(cudaFree(mean_P));
	CUDA_CHECK_RETURN(cudaFree(cov_IP));
	CUDA_CHECK_RETURN(cudaFree(var_I));

	//compute mean_a and mean_b
	float *mean_a = alloc_plane(height, width);
	float *mean_b = alloc_plane(height, width);
	boxfilter_kernel2<<<grids, blocks, shared_size2>>> (
		a, mean_a, b, mean_b, N, r, height, width);
	CUDA_CHECK_RETURN(cudaGetLastError());
	CUDA_CHECK_RETURN(cudaFree(N));
	CUDA_CHECK_RETURN(cudaFree(a));
	CUDA_CHECK_RETURN(cudaFree(b));

	//return result
	result_kernel<<<grids, blocks>>> (result, mean_a, I, mean_b, height, width);
	CUDA_CHECK_RETURN(cudaGetLastError());
	CUDA_CHECK_RETURN(cudaFree(mean_a));
	CUDA_CHECK_RETURN(cudaFree(mean_b));
}
537 | 


--------------------------------------------------------------------------------