├── iv.png ├── rocks.png ├── Debug ├── 3.png ├── main.o ├── dehazing └── kernels.o ├── images ├── rock.png └── forest.jpg ├── README.md ├── Makefile ├── dehazing.h ├── main.cpp └── dehazing.cu /iv.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/arsenalliu123/dehazing-GPU/HEAD/iv.png -------------------------------------------------------------------------------- /rocks.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/arsenalliu123/dehazing-GPU/HEAD/rocks.png -------------------------------------------------------------------------------- /Debug/3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/arsenalliu123/dehazing-GPU/HEAD/Debug/3.png -------------------------------------------------------------------------------- /Debug/main.o: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/arsenalliu123/dehazing-GPU/HEAD/Debug/main.o -------------------------------------------------------------------------------- /Debug/dehazing: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/arsenalliu123/dehazing-GPU/HEAD/Debug/dehazing -------------------------------------------------------------------------------- /Debug/kernels.o: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/arsenalliu123/dehazing-GPU/HEAD/Debug/kernels.o -------------------------------------------------------------------------------- /images/rock.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/arsenalliu123/dehazing-GPU/HEAD/images/rock.png -------------------------------------------------------------------------------- 
/images/forest.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/arsenalliu123/dehazing-GPU/HEAD/images/forest.jpg -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # CUDA implementation of a dehazing algorithm using the dark channel prior 2 | 3 | Dehazing algorithm implemented on CUDA. 4 | 5 | ## Features 6 | - OpenCV to read images, which are then processed on the GPU 7 | - Shared memory optimization 8 | - Multi-platform support (Windows, Linux, Mac) 9 | 10 | ## Usage 11 | 12 | Make sure you have OpenCV and the CUDA toolkit installed, and an NVIDIA graphics card. 13 | 14 | ```sh 15 | git clone https://github.com/arsenalliu123/dehazing-GPU.git 16 | cd dehazing-GPU 17 | make clean && make 18 | Debug/dehazing -h 19 | ``` 20 | 21 | **Developed by Yichen Liu and Yin Lin** 22 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | CXX=g++ 2 | 3 | CUDA_INSTALL_PATH=/usr/local/cuda-6.5 4 | CFLAGS= -I. 
-I$(CUDA_INSTALL_PATH)/include `pkg-config --cflags opencv` 5 | LDFLAGS= -L$(CUDA_INSTALL_PATH)/lib64 -lcudart `pkg-config --libs opencv` 6 | COMPILE_FLAGS= -mcmodel=large -fPIC -g -Wall 7 | 8 | #Uncomment the line below if you dont have CUDA enabled GPU 9 | #EMU=-deviceemu 10 | 11 | ifdef EMU 12 | CUDAFLAGS+=-deviceemu 13 | endif 14 | 15 | all: 16 | $(CXX) $(COMPILE_FLAGS) -c main.cpp -o Debug/main.o $(CFLAGS) 17 | nvcc -c dehazing.cu -o Debug/kernels.o $(CUDAFLAGS) 18 | $(CXX) $(COMPILE_FLAGS) Debug/main.o Debug/kernels.o -o Debug/dehazing $(LDFLAGS) 19 | 20 | clean: 21 | rm -f Debug/*.o Debug/dehazing 22 | 23 | -------------------------------------------------------------------------------- /dehazing.h: -------------------------------------------------------------------------------- 1 | /* 2 | * dehazing.h 3 | * 4 | * Created on: Apr 8, 2015 5 | * Author: river 6 | */ 7 | 8 | #ifndef DEHAZING_H_ 9 | #define DEHAZING_H_ 10 | 11 | #include 12 | #include 13 | 14 | #define CUDA_CHECK_RETURN(value) { \ 15 | cudaError_t _m_cudaStat = value; \ 16 | if (_m_cudaStat != cudaSuccess) { \ 17 | fprintf(stderr, "Error %s at line %d in file %s\n", \ 18 | cudaGetErrorString(_m_cudaStat), __LINE__, __FILE__); \ 19 | exit(1); \ 20 | } } 21 | 22 | #define CEIL(X) ((X-(int)(X)) > 0 ? 
(int)(X+1) : (int)(X)) 23 | 24 | //dark channel prior 25 | void dark_channel( 26 | float *image, 27 | float *img_grey, 28 | float *dark_channel, 29 | int height, 30 | int width, 31 | dim3 blocks, 32 | dim3 grids 33 | ); 34 | 35 | //air light (RGB of maximum dark prior channle pixel) 36 | void air_light( 37 | float *image, 38 | float *dark, 39 | int height, 40 | int width, 41 | dim3 blocks, 42 | dim3 grids 43 | ); 44 | 45 | void dehaze( 46 | float *image, 47 | float *dark, 48 | float *t, 49 | int height, 50 | int width, 51 | dim3 blocks, 52 | dim3 grids 53 | ); 54 | 55 | void transmission( 56 | float *image, 57 | float *t, 58 | int height, 59 | int width, 60 | dim3 blocks, 61 | dim3 grids 62 | ); 63 | 64 | void gfilter( 65 | float *filter, 66 | float *img_gray, 67 | float *trans, 68 | int height, 69 | int width, 70 | dim3 blocks, 71 | dim3 grids 72 | );//filter: guided imaging filter result 73 | 74 | #endif /* DEHAZING_H_ */ 75 | 76 | 77 | -------------------------------------------------------------------------------- /main.cpp: -------------------------------------------------------------------------------- 1 | /* 2 | * main.cpp 3 | * 4 | * Created on: Apr 12, 2015 5 | * Author: river 6 | */ 7 | 8 | #ifdef __APPLE__ 9 | #include 10 | #else 11 | #include 12 | #endif 13 | 14 | #include "iostream" 15 | #include "time.h" 16 | #include "string.h" 17 | #include 18 | #include 19 | #include "limits.h" 20 | #include 21 | #include "dehazing.h" 22 | #include "opencv2/opencv.hpp" 23 | 24 | using namespace cv; 25 | using namespace std; 26 | 27 | // Define Const 28 | clock_t start , finish ; 29 | float lambda=0.0001; //lambda 30 | double _w=0.95; //w 31 | int height=0; //image Height 32 | int width=0; //image Width 33 | int size=0; //total number of pixels 34 | int blockdim = 32; 35 | 36 | char img_name[100]="1.png"; 37 | char out_name[100]="2.png"; 38 | char trans_name[100]="3.png"; 39 | char dark_name[100]="4.png"; 40 | 41 | /* 42 | * dehazing procedures 43 | */ 44 | 45 | 
//read from img_name 46 | Mat read_image(){ 47 | 48 | Mat img = imread(img_name); 49 | height = img.rows; 50 | width = img.cols; 51 | size = img.rows*img.cols; 52 | Mat real_img(img.rows,img.cols,CV_32FC3); 53 | img.convertTo(real_img,CV_32FC3); 54 | return real_img; 55 | } 56 | 57 | 58 | //************* Utility Functions ********** 59 | //Print Matrix 60 | void printMat(char * name,Mat m) 61 | { 62 | cout<>filename; 127 | strcpy(img_name,filename); 128 | } 129 | 130 | cout<<"Reading Image ..."< >(i,j)[k]; 150 | } 151 | } 152 | } 153 | cpu_image[size] = 0; 154 | cpu_image[size+1] = 0; 155 | cpu_image[size+2] = 0; 156 | 157 | 158 | float *gpu_image = NULL; 159 | float *dark = NULL; 160 | float *img_gray = NULL; 161 | //size+1 for storing the airlight 162 | CUDA_CHECK_RETURN(cudaMalloc((void **)(&gpu_image), ((size+1) * 3) * sizeof(float))); 163 | 164 | CUDA_CHECK_RETURN(cudaMalloc((void **)(&dark), size * sizeof(float))); 165 | 166 | CUDA_CHECK_RETURN(cudaMalloc((void **)(&img_gray),size * sizeof(float))); 167 | 168 | CUDA_CHECK_RETURN(cudaMemcpy(gpu_image, cpu_image, ((size+1) * 3) * sizeof(float), cudaMemcpyHostToDevice)); 169 | 170 | float *trans = NULL; 171 | CUDA_CHECK_RETURN(cudaMalloc((void **)(&trans), size * sizeof(float))); 172 | 173 | float *filter = NULL; 174 | CUDA_CHECK_RETURN(cudaMalloc((void **)(&filter), size * sizeof(float))); 175 | ///////////////// 176 | printf("height: %d width: %d\n", height, width); 177 | 178 | finish_clock(); 179 | /* 180 | * Dehazing Algorithm: 181 | * 1. Calculate Dark Prior 182 | * 2. Calculate Air Light 183 | * 3. 
Get the image 184 | */ 185 | 186 | //define the block size and grid size 187 | cout<<"Calculating Dark Channel Prior ..."<
/*
 * /dehazing.cu
 *
 * Device kernels and host wrappers for dark-channel-prior dehazing.
 *
 * NOTE(review): this file was recovered from a dump in which every
 * "<identifier ... >" span had been deleted.  Consequences:
 *   - the #include list below is reconstructed (TODO confirm);
 *   - IN_GRAPH is reconstructed from its residue and its call sites;
 *   - two further comparison macros (one using '<', one using '>') existed
 *     but their names are not recoverable; fminf/fmaxf are used instead;
 *   - kernel launch configurations that appeared as "<<>>" are restored as
 *     <<<grids, blocks[, shared bytes]>>> (TODO confirm);
 *   - dark_kernel1's definition is missing entirely (see its declaration).
 *
 * Indexing convention used by every 2D kernel in this file:
 *   x = blockIdx.x * blockDim.x + threadIdx.x   is compared against height,
 *   y = blockIdx.y * blockDim.y + threadIdx.y   is compared against width,
 *   and the flat pixel index is  x * width + y.
 * In the host wrappers `blocks` is used as the thread-block dimension and
 * `grids` as the grid dimension.
 */

#include <float.h>
#include <stdio.h>
#include <stdlib.h>
#include <cuda_runtime.h>
#include "dehazing.h"

//true when (x, y) lies inside an h-by-w image
#define IN_GRAPH(x,y,h,w) (((x)>=0)&&((x)<(h))&&((y)>=0)&&((y)<(w)))
#define WINDOW 7
#define R 15

/*
 * Debug helper: copy a height*width float plane from the device and print it.
 *
 * NOTE(review): everything after "for(int i=0;i" was lost from SOURCE; the
 * loop bound, the print format and the final free() are reconstructed.
 */
void printinfo(float *dark, int height, int width){
	float *xx = (float *)malloc(sizeof(float)*height*width);
	if(xx == NULL){
		fprintf(stderr, "printinfo: out of host memory\n");
		return;
	}
	CUDA_CHECK_RETURN(cudaMemcpy(xx, dark, height * width * sizeof(float), cudaMemcpyDeviceToHost));
	for(int i=0;i<height*width;i++){
		printf("%.2f ", xx[i]);
	}
	printf("\n");
	free(xx);
}

/*
 * dark_channel host wrapper and kernels
 */

//first kernel calculate min of RGB
//NOTE(review): the definition of dark_kernel1 is not present in SOURCE (it
//was swallowed by the extraction damage).  The parameter list below is taken
//from its single call site in dark_channel(); the body must be restored from
//the original repository before this file links.
__global__
void dark_kernel1(float3 *image, float *img_grey, float *tmp_dark, int height, int width);

/*
 * Cooperatively load the block's tile of `dark`, plus a halo of `window`
 * pixels on every side, into shared memory.
 *
 * Tile layout (relied on by window_min / window_sum and the box filters):
 *   buffer has (bdimx + 2*window) rows of (bdimy + 2*window) floats;
 *   tile cell (tx + dx, ty + dy) holds image pixel (x - window + dx,
 *   y - window + dy).  Cells that fall outside the image are set to 0.
 *
 * MUST be called by every thread of the block, including threads whose
 * (x, y) is outside the image: the load is strided over the whole block.
 * The original per-thread variant only copied cells at offsets of exactly
 * +-window, which left halo cells uninitialised whenever a block had fewer
 * than `window` in-image rows or columns.
 *
 * Assumes a 2D block (blockDim.z == 1).
 */
__device__
void padding(float *buffer, float *dark,
		int x, int y,
		int tx, int ty,
		int window,
		int bdimx, int bdimy,
		int height, int width){
	const int tile_rows = bdimx + window * 2;
	const int tile_cols = bdimy + window * 2;
	//image coordinates stored in tile cell (0, 0)
	const int row0 = x - tx - window;
	const int col0 = y - ty - window;
	const int nthreads = bdimx * bdimy;
	for(int s = ty * bdimx + tx; s < tile_rows * tile_cols; s += nthreads){
		const int gx = row0 + s / tile_cols;
		const int gy = col0 + s % tile_cols;
		buffer[s] = IN_GRAPH(gx, gy, height, width) ? dark[gx * width + gy] : 0.0f;
	}
}

//minimum over the in-image part of the (2*window+1)^2 neighbourhood of
//(x, y), read from a tile filled by padding(); `init` seeds the minimum
__device__
float window_min(const float *buffer, int x, int y, int window,
		int height, int width, float init){
	const int pitch = blockDim.y + window * 2;
	float minval = init;
	for(int startx = 0; startx < window * 2 + 1; startx++){
		for(int starty = 0; starty < window * 2 + 1; starty++){
			if(IN_GRAPH(x-window+startx, y-window+starty, height, width)){
				const int shared_index = (threadIdx.x+startx)*pitch + threadIdx.y + starty;
				minval = fminf(buffer[shared_index], minval);
			}
		}
	}
	return minval;
}

//sum over the in-image part of the (2*r+1)^2 neighbourhood of (x, y), read
//from a tile filled by padding()
__device__
float window_sum(const float *buffer, int x, int y, int r, int height, int width){
	const int pitch = blockDim.y + r * 2;
	float val = 0.0f;
	for(int startx = 0; startx < r * 2 + 1; startx++){
		for(int starty = 0; starty < r * 2 + 1; starty++){
			if(IN_GRAPH(x-r+startx, y-r+starty, height, width)){
				val += buffer[(threadIdx.x+startx)*pitch + threadIdx.y + starty];
			}
		}
	}
	return val;
}

//second kernel calculate min of 15 X 15 ceil
//Needs (blockDim.x + 2*window) * (blockDim.y + 2*window) floats of dynamic
//shared memory.
__global__
void dark_kernel2(float *dark, float *new_dark, int height, int width, int window){
	extern __shared__ float buffer[];
	const int x = blockIdx.x * blockDim.x + threadIdx.x;
	const int y = blockIdx.y * blockDim.y + threadIdx.y;

	//tile load and barrier are executed by every thread of the block; the
	//original guarded both with the bounds check, so edge blocks reached
	//__syncthreads() divergently
	padding(buffer, dark,
			x, y,
			threadIdx.x, threadIdx.y,
			window,
			blockDim.x, blockDim.y,
			height, width);
	__syncthreads();

	if(x < height && y < width){
		new_dark[x * width + y] = window_min(buffer, x, y, window, height, width, 255.0f);
	}
}

/*
 * Compute the dark channel of `image` into `dark_channel`.
 * dark_kernel1 writes the per-pixel value into a temporary plane (and fills
 * img_grey); dark_kernel2 takes the windowed minimum with radius WINDOW.
 */
void dark_channel(float *image, float *img_grey, float *dark_channel, int height, int width, dim3 blocks, dim3 grids){

	float *tmp_dark = NULL;
	CUDA_CHECK_RETURN(cudaMalloc((void **)(&tmp_dark), sizeof(float)*height*width));

	dark_kernel1<<<grids, blocks>>> ((float3 *)image, img_grey, tmp_dark, height, width);
	CUDA_CHECK_RETURN(cudaGetLastError());

	int window = WINDOW;
	size_t shared_size = (blocks.x + window * 2) * (blocks.y + window * 2) * sizeof(float);
	dark_kernel2<<<grids, blocks, shared_size>>>(tmp_dark, dark_channel, height, width, window);
	CUDA_CHECK_RETURN(cudaGetLastError());

	CUDA_CHECK_RETURN(cudaFree(tmp_dark));
}

/*
 * air_light host wrapper and kernels
 */

//Block-wide arg-max over tmp_dark[0 .. blockDim.x); the winning pixel ends up
//in tmp_image[0] / tmp_dark[0].  Works for any blockDim.x: the active range
//is halved rounding up, so no element is skipped when the size is odd.
//Must be called by every thread of the block (contains barriers).
__device__
void argmax_reduce(float3 *tmp_image, float *tmp_dark){
	for(unsigned int n = blockDim.x; n > 1; ){
		const unsigned int half = (n + 1) / 2;
		if(threadIdx.x + half < n){
			if(tmp_dark[threadIdx.x + half] > tmp_dark[threadIdx.x]){
				tmp_dark[threadIdx.x] = tmp_dark[threadIdx.x + half];
				tmp_image[threadIdx.x] = tmp_image[threadIdx.x + half];
			}
		}
		__syncthreads();
		n = half;
	}
}

//first kernel reduce to < 1024 values for next kernel
//1D indexing on blockIdx.x / threadIdx.x.  Needs
//blockDim.x * (sizeof(float3) + sizeof(float)) bytes of dynamic shared memory.
//Writes one candidate (pixel + dark value) per block to int_image/int_dark.
__global__
void airlight_kernel1(
		float3 *image, float *dark,
		int height, int width,
		float3 *int_image, float *int_dark){
	const int i = blockDim.x * blockIdx.x + threadIdx.x;
	extern __shared__ float3 tmp_image[];
	float *tmp_dark = (float *)(tmp_image + blockDim.x);

	if(i < width * height){
		tmp_image[threadIdx.x] = image[i];
		tmp_dark[threadIdx.x] = dark[i];
	}
	else{
		//out-of-range threads contribute a value that can never win, so the
		//reduction never reads an unwritten slot and the barriers below are
		//reached by the whole block
		tmp_image[threadIdx.x] = make_float3(0.0f, 0.0f, 0.0f);
		tmp_dark[threadIdx.x] = -FLT_MAX;
	}
	__syncthreads();

	argmax_reduce(tmp_image, tmp_dark);

	if(threadIdx.x == 0){
		int_image[blockIdx.x] = tmp_image[0];
		int_dark[blockIdx.x] = tmp_dark[0];
	}
}

//calculate air light
//Launched as a single block with one thread per candidate produced by
//airlight_kernel1; stores the winning pixel at image[size].
__global__
void airlight_kernel2(float3 *image, int size, float3 *int_image, float *int_dark){

	extern __shared__ float3 tmp_image[];
	float *tmp_dark = (float *)(tmp_image + blockDim.x);
	tmp_image[threadIdx.x] = int_image[threadIdx.x];
	tmp_dark[threadIdx.x] = int_dark[threadIdx.x];
	__syncthreads();

	argmax_reduce(tmp_image, tmp_dark);

	if(threadIdx.x == 0){
		image[size] = tmp_image[0];
	}
}

/*
 * Find the pixel with the largest dark-channel value and store its colour at
 * image[height*width] (the extra float3 slot after the last pixel).
 *
 * NOTE(review): the shared-size computations and the first launch
 * configuration were lost from SOURCE and are reconstructed from the kernels'
 * shared-memory layout; the second launch (<<<1, grids, shared_size_2>>>) is
 * as in SOURCE, so grids.x must not exceed the device's threads-per-block
 * limit.
 */
void air_light(float *image, float *dark, int height, int width, dim3 blocks, dim3 grids){

	float3 *int_image = NULL;
	float *int_dark = NULL;

	CUDA_CHECK_RETURN(cudaMalloc((void **)(&int_image), sizeof(float3)*grids.x));
	CUDA_CHECK_RETURN(cudaMalloc((void **)(&int_dark), sizeof(float)*grids.x));

	size_t shared_size_1 = blocks.x * (sizeof(float3) + sizeof(float));
	size_t shared_size_2 = grids.x * (sizeof(float3) + sizeof(float));

	airlight_kernel1<<<grids, blocks, shared_size_1>>> ((float3 *)image, dark, height, width, int_image, int_dark);
	CUDA_CHECK_RETURN(cudaGetLastError());
	airlight_kernel2<<<1, grids, shared_size_2>>> ((float3 *)image, height*width, int_image, int_dark);
	CUDA_CHECK_RETURN(cudaGetLastError());

	//the original never released these two buffers
	CUDA_CHECK_RETURN(cudaFree(int_image));
	CUDA_CHECK_RETURN(cudaFree(int_dark));
}

/*
 * transmission host wrapper and kernels
 */

//per pixel: minimum over the three channels of image / air light, where the
//air light is read from image[height*width]
__global__
void transmission1_kernel(float3 *image, float *t, int height, int width){
	const int x = blockIdx.x * blockDim.x + threadIdx.x;
	const int y = blockIdx.y * blockDim.y + threadIdx.y;
	const int i = x * width + y;
	if(x < height && y < width){
		const float3 air = image[height*width];
		const float3 px = image[i];
		t[i] = fminf(px.x/air.x, fminf(px.y/air.y, px.z/air.z));
	}
}

//windowed minimum of `dark` (radius `window`), then new_dark = 1 - 0.95*min
//Needs (blockDim.x + 2*window) * (blockDim.y + 2*window) floats of dynamic
//shared memory.
__global__
void transmission2_kernel(float *dark, float *new_dark, int height, int width, int window){
	extern __shared__ float buffer[];
	const int x = blockIdx.x * blockDim.x + threadIdx.x;
	const int y = blockIdx.y * blockDim.y + threadIdx.y;

	//whole block loads the tile and reaches the barrier (see dark_kernel2)
	padding(buffer, dark,
			x, y,
			threadIdx.x, threadIdx.y,
			window,
			blockDim.x, blockDim.y,
			height, width);
	__syncthreads();

	if(x < height && y < width){
		const float minval = window_min(buffer, x, y, window, height, width, 1.0f);
		new_dark[x * width + y] = 1.0f - 0.95f * minval;
	}
}

//Estimate the transmission map `t` of `image`; the air light must already be
//stored at image[height*width] (see air_light).
void transmission(float *image, float *t, int height, int width, dim3 blocks,dim3 grids){
	float *tmp_trans = NULL;
	CUDA_CHECK_RETURN(cudaMalloc((void **)&tmp_trans, sizeof(float)*height*width));

	transmission1_kernel<<<grids, blocks>>> ((float3 *)image, tmp_trans, height, width);
	CUDA_CHECK_RETURN(cudaGetLastError());

	int window = WINDOW;
	size_t shared_size = (blocks.x + window * 2) * (blocks.y + window * 2) * sizeof(float);
	transmission2_kernel<<<grids, blocks, shared_size>>>(tmp_trans, t, height, width, window);
	CUDA_CHECK_RETURN(cudaGetLastError());

	CUDA_CHECK_RETURN(cudaFree(tmp_trans));
}

/*
 * dehaze host wrapper and kernel
 */

//in place: image = (image - air) / max(0.1, t) + air, per channel.
//`dark` is accepted but not read.
__global__
void dehaze_kernel(float3 *image, float *dark, float *t, int height, int width){
	const int x = blockIdx.x * blockDim.x + threadIdx.x;
	const int y = blockIdx.y * blockDim.y + threadIdx.y;
	const int i = x * width + y;
	if(x < height && y < width){
		const float3 air = image[height*width];
		//clamp so that near-zero transmission does not blow up the result
		const float tt = fmaxf(0.1f, t[i]);
		image[i].x = (image[i].x - air.x)/tt + air.x;
		image[i].y = (image[i].y - air.y)/tt + air.y;
		image[i].z = (image[i].z - air.z)/tt + air.z;
	}
}

void dehaze(float *image,float *dark, float *t, int height, int width, dim3 blocks,dim3 grids){
	dehaze_kernel<<<grids, blocks>>> ((float3 *)image, dark, t, height, width);
	CUDA_CHECK_RETURN(cudaGetLastError());
}

/*
 * guided filter host wrapper and kernels
 */

//fill a height*width plane with `val`
__global__
void setones(float *img_in, int height, int width, float val){
	const int x = blockIdx.x * blockDim.x + threadIdx.x;
	const int y = blockIdx.y * blockDim.y + threadIdx.y;
	const int i = x * width + y;
	if(x < height && y < width){
		img_in[i] = val;
	}
}

//img_res = (sum of img_in over the in-image (2r+1)^2 window) / patch
//r: local window radius.  Needs (blockDim.x + 2r) * (blockDim.y + 2r) floats
//of dynamic shared memory.
__global__
void boxfilter_kernel(float *img_in, float *img_res, float *patch, int r, int height, int width){
	const int x = blockIdx.x * blockDim.x + threadIdx.x;
	const int y = blockIdx.y * blockDim.y + threadIdx.y;
	const int i = x * width + y;
	extern __shared__ float buffer[];

	//whole block loads the tile and reaches the barrier (see dark_kernel2)
	padding(
			buffer, img_in,
			x, y,
			threadIdx.x, threadIdx.y,
			r,
			blockDim.x, blockDim.y,
			height, width);
	__syncthreads();

	if(x < height && y < width){
		img_res[i] = window_sum(buffer, x, y, r, height, width)/patch[i];
	}
}

//Same as boxfilter_kernel for two planes at once (img_in -> img_res,
//img_in2 -> img_res2).  Needs twice the shared memory of boxfilter_kernel:
//the second tile is stored directly after the first.
__global__
void boxfilter_kernel2(float *img_in,
		float *img_res,
		float *img_in2,
		float *img_res2,
		float *patch,
		int r,
		int height,
		int width){

	//r: local window radius
	const int x = blockIdx.x * blockDim.x + threadIdx.x;
	const int y = blockIdx.y * blockDim.y + threadIdx.y;
	const int i = x * width + y;
	extern __shared__ float buffer[];
	float *buffer2 = buffer + (blockDim.x + r * 2) * (blockDim.y + r * 2);

	padding(
			buffer, img_in,
			x, y,
			threadIdx.x, threadIdx.y,
			r,
			blockDim.x, blockDim.y,
			height, width);

	padding(buffer2, img_in2,
			x, y,
			threadIdx.x, threadIdx.y,
			r,
			blockDim.x, blockDim.y,
			height, width);
	__syncthreads();

	if(x < height && y < width){
		img_res[i] = window_sum(buffer, x, y, r, height, width)/patch[i];
		img_res2[i] = window_sum(buffer2, x, y, r, height, width)/patch[i];
	}
}

//res1 = a.*b, res2 = a.*a (element-wise)
__global__
void matmul_kernel(float *a, float *b, float *res1, float *res2, int height, int width){
	const int x = blockIdx.x * blockDim.x + threadIdx.x;
	const int y = blockIdx.y * blockDim.y + threadIdx.y;
	const int i = x * width + y;
	if(x < height && y < width){
		res1[i] = a[i]*b[i];
		res2[i] = a[i]*a[i];
	}
}

//cov_IP = mean_IP - mean_I.*mean_P
//var_I  = mean_II - mean_I.*mean_I
//a      = cov_IP ./ (var_I + eps), eps = 1e-6
//b      = mean_P - a.*mean_I
__global__
void var_kernel(float *a, float *b, float *mean_IP, float *mean_II, float *mean_I, float *mean_P, float *cov_IP, float *var_I, int height, int width){
	const int x = blockIdx.x * blockDim.x + threadIdx.x;
	const int y = blockIdx.y * blockDim.y + threadIdx.y;
	const int i = x * width + y;
	if(x < height && y < width){
		cov_IP[i] = mean_IP[i]-mean_I[i]*mean_P[i];
		var_I[i] = mean_II[i]-mean_I[i]*mean_I[i];
		a[i] = cov_IP[i]/(var_I[i] + 0.000001f);
		b[i] = mean_P[i] - a[i]*mean_I[i];
	}
}

//result = mean_a.*I + mean_b
__global__
void result_kernel(float *result, float *mean_a, float *I, float *mean_b, int height, int width){
	const int x = blockIdx.x * blockDim.x + threadIdx.x;
	const int y = blockIdx.y * blockDim.y + threadIdx.y;
	const int i = x * width + y;
	if(x < height && y < width){
		result[i] = mean_a[i]*I[i] + mean_b[i];
	}
}

//allocate one height*width float plane on the device, aborting on failure
static float *alloc_plane(int height, int width){
	float *p = NULL;
	CUDA_CHECK_RETURN(cudaMalloc((void **)(&p), sizeof(float)*height*width));
	return p;
}

/*
 * Guided image filter with window radius R.
 *
 * I: guided image - origin gray scale image - 1 channel
 * P: imaged need to be filtered - transmission image - 1 channel
 * result: refined trans image - 1 channel
 *
 * Temporaries are released as soon as the last kernel reading them has been
 * queued, in the same order as the original, to keep peak device memory low.
 */
void gfilter(float *result, float *I, float *P, int height, int width, dim3 blocks, dim3 grids){

	const int r = R;
	const size_t shared_size = (blocks.x + r * 2) * (blocks.y + r * 2) * sizeof(float);
	const size_t shared_size2 = 2 * shared_size;

	//compute N: number of in-image pixels in each window (box sum of ones)
	float *N = alloc_plane(height, width);
	float *ones = alloc_plane(height, width);
	setones<<<grids, blocks>>> (ones, height, width, 1.0f);
	CUDA_CHECK_RETURN(cudaGetLastError());
	boxfilter_kernel<<<grids, blocks, shared_size>>> (
			ones, N, ones, r, height, width);
	CUDA_CHECK_RETURN(cudaGetLastError());
	CUDA_CHECK_RETURN(cudaFree(ones));

	//compute mean_I and mean_P
	float *mean_I = alloc_plane(height, width);
	float *mean_P = alloc_plane(height, width);
	boxfilter_kernel2<<<grids, blocks, shared_size2>>> (
			I, mean_I, P, mean_P, N, r, height, width);
	CUDA_CHECK_RETURN(cudaGetLastError());

	//compute mean_IP and mean_II from I.*P and I.*I
	float *ImulP = alloc_plane(height, width);
	float *ImulI = alloc_plane(height, width);
	float *mean_IP = alloc_plane(height, width);
	float *mean_II = alloc_plane(height, width);
	matmul_kernel<<<grids, blocks>>> (I, P, ImulP, ImulI, height, width);
	CUDA_CHECK_RETURN(cudaGetLastError());
	boxfilter_kernel2<<<grids, blocks, shared_size2>>> (ImulP, mean_IP, ImulI, mean_II, N, r, height, width);
	CUDA_CHECK_RETURN(cudaGetLastError());
	CUDA_CHECK_RETURN(cudaFree(ImulP));
	CUDA_CHECK_RETURN(cudaFree(ImulI));

	//compute cov_IP, var_I and the linear coefficients a, b
	float *a = alloc_plane(height, width);
	float *b = alloc_plane(height, width);
	float *cov_IP = alloc_plane(height, width);
	float *var_I = alloc_plane(height, width);
	var_kernel<<<grids, blocks>>> (a, b, mean_IP, mean_II, mean_I, mean_P, cov_IP, var_I, height, width);
	CUDA_CHECK_RETURN(cudaGetLastError());
	CUDA_CHECK_RETURN(cudaFree(mean_I));
	CUDA_CHECK_RETURN(cudaFree(mean_P));
	//mean_IP and mean_II were never released by the original
	CUDA_CHECK_RETURN(cudaFree(mean_IP));
	CUDA_CHECK_RETURN(cudaFree(mean_II));
	CUDA_CHECK_RETURN(cudaFree(cov_IP));
	CUDA_CHECK_RETURN(cudaFree(var_I));

	//compute mean_a and mean_b
	float *mean_a = alloc_plane(height, width);
	float *mean_b = alloc_plane(height, width);
	boxfilter_kernel2<<<grids, blocks, shared_size2>>> (
			a, mean_a, b, mean_b, N, r, height, width);
	CUDA_CHECK_RETURN(cudaGetLastError());
	CUDA_CHECK_RETURN(cudaFree(N));
	CUDA_CHECK_RETURN(cudaFree(a));
	CUDA_CHECK_RETURN(cudaFree(b));

	//return result
	result_kernel<<<grids, blocks>>> (result, mean_a, I, mean_b, height, width);
	CUDA_CHECK_RETURN(cudaGetLastError());
	CUDA_CHECK_RETURN(cudaFree(mean_a));
	CUDA_CHECK_RETURN(cudaFree(mean_b));
}
--------------------------------------------------------------------------------