├── guide
    ├── 1.jpg
    ├── 2.jpg
    ├── 3.jpg
    └── 4.jpg
├── output
    ├── 1_guided.png
    ├── 2_guided.png
    ├── 3_guided.png
    └── 4_guided.png
├── input
    ├── 1_transmission.png
    ├── 2_transmission.png
    ├── 3_transmission .png
    └── 4_transmission.png
├── CMakeLists.txt
├── guidedFilter.cuh
├── README-zh.md
├── LICENSE
├── README.md
├── main.cu
└── guidedFilter.cu


/guide/1.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Plumess/Guided-Filter-Using-CUDA/HEAD/guide/1.jpg


--------------------------------------------------------------------------------
/guide/2.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Plumess/Guided-Filter-Using-CUDA/HEAD/guide/2.jpg


--------------------------------------------------------------------------------
/guide/3.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Plumess/Guided-Filter-Using-CUDA/HEAD/guide/3.jpg


--------------------------------------------------------------------------------
/guide/4.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Plumess/Guided-Filter-Using-CUDA/HEAD/guide/4.jpg


--------------------------------------------------------------------------------
/output/1_guided.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Plumess/Guided-Filter-Using-CUDA/HEAD/output/1_guided.png


--------------------------------------------------------------------------------
/output/2_guided.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Plumess/Guided-Filter-Using-CUDA/HEAD/output/2_guided.png


--------------------------------------------------------------------------------
/output/3_guided.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Plumess/Guided-Filter-Using-CUDA/HEAD/output/3_guided.png


--------------------------------------------------------------------------------
/output/4_guided.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Plumess/Guided-Filter-Using-CUDA/HEAD/output/4_guided.png


--------------------------------------------------------------------------------
/input/1_transmission.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Plumess/Guided-Filter-Using-CUDA/HEAD/input/1_transmission.png


--------------------------------------------------------------------------------
/input/2_transmission.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Plumess/Guided-Filter-Using-CUDA/HEAD/input/2_transmission.png


--------------------------------------------------------------------------------
/input/3_transmission .png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Plumess/Guided-Filter-Using-CUDA/HEAD/input/3_transmission .png


--------------------------------------------------------------------------------
/input/4_transmission.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Plumess/Guided-Filter-Using-CUDA/HEAD/input/4_transmission.png


--------------------------------------------------------------------------------
/CMakeLists.txt:
--------------------------------------------------------------------------------
 1 | cmake_minimum_required(VERSION 3.20 FATAL_ERROR)
 2 | project(GuidedFilter LANGUAGES CXX CUDA)
 3 | 
 4 | # Default to a Debug build, but only for single-config generators and only
 5 | # when the user has not chosen a build type. A plain set(... CACHE ...) placed
 6 | # after project() normally does nothing, because the cache entry already exists.
 7 | get_property(is_multi_config GLOBAL PROPERTY GENERATOR_IS_MULTI_CONFIG)
 8 | if(NOT is_multi_config AND NOT CMAKE_BUILD_TYPE)
 9 |   set(CMAKE_BUILD_TYPE Debug CACHE STRING "set build type to debug" FORCE)
10 | endif()
11 | 
12 | set(CMAKE_CXX_STANDARD 17)
13 | set(CMAKE_CXX_STANDARD_REQUIRED ON)
14 | set(CMAKE_CUDA_STANDARD 17)
15 | set(CMAKE_CUDA_STANDARD_REQUIRED ON)
16 | 
17 | # CUDA toolkit headers (replaces the deprecated FindCUDA module).
18 | find_package(CUDAToolkit REQUIRED)
19 | 
20 | # OpenCV location. The default matches the original author's machine; pass
21 | # -DOpenCV_DIR=<dir containing OpenCVConfig.cmake> to use your own installation.
22 | set(OpenCV_DIR "E:/OpenCV/install" CACHE PATH "Directory containing OpenCVConfig.cmake")
23 | find_package(OpenCV REQUIRED)
24 | 
25 | # Sources are listed explicitly so that adding a file shows up in the diff.
26 | add_executable(GuidedFilter
27 |   main.cu
28 |   guidedFilter.cu
29 |   guidedFilter.cuh
30 | )
31 | set_target_properties(GuidedFilter PROPERTIES CUDA_SEPARABLE_COMPILATION ON)
32 | target_include_directories(GuidedFilter PRIVATE ${CUDAToolkit_INCLUDE_DIRS})
33 | target_link_libraries(GuidedFilter PRIVATE ${OpenCV_LIBS})
34 | 


--------------------------------------------------------------------------------
/guidedFilter.cuh:
--------------------------------------------------------------------------------
 1 | #pragma once
 2 | 
 3 | // Thread-block dimensions for the kernels in guidedFilter.cu.
 4 | #define BLOCK_W     32
 5 | #define BLOCK_H     32
 6 | 
 7 | #include <cuda_runtime.h>
 8 | #include "device_launch_parameters.h"
 9 | 
10 | // Runs the guided filter on device buffers; all work is enqueued on `stream`.
11 | //   ptrT_device         - float plane to be filtered
12 | //   guidedRGBImg_device - interleaved 8-bit, 3-channel guide image (weighted as BGR)
13 | //   ptrGuidedT_device   - float plane receiving the filtered result
14 | //   rows, cols          - image size in pixels
15 | //   inStride, outStride - row pitch, in pixels, of the guide image / of the float planes
16 | //   m_nGBlockSize       - box-filter radius
17 | //   fEps                - regularisation term added to the guide variance
18 | //   remaining pointers  - caller-provided float scratch planes
19 | //                         (main.cu allocates rows*cols floats for each)
20 | void guided_filter_cuda(float* ptrT_device, unsigned char* guidedRGBImg_device, float* ptrGuidedT_device, int rows, int cols,
21 |                 int inStride, int outStride, int m_nGBlockSize, float fEps, cudaStream_t stream,
22 |                 float* fGuidedImg_device, float* pfInitN_device, float* pfInitMeanIp_device, float* pfInitMeanII_device,
23 |                 float* pfMeanP_device, float* pfN_device, float* pfMeanI_device, float* pfMeanIp_device,
24 |                 float* pfMeanII_device, float* pfvarI_device, float* pfCovIp_device, float* pfA_device, float* pfB_device,
25 |                 float* pfOutA_device, float* pfOutB_device, float* pfArrayCum_device);


--------------------------------------------------------------------------------
/README-zh.md:
--------------------------------------------------------------------------------
 1 | # Guided-Filter-Using-CUDA
 2 | 
 3 | [中文](README-zh.md) | [English](README.md)
 4 | 
 5 | 这是导向滤波/引导滤波的一种GPU实现，经测试，在i9, RTX4090上运行，包括malloc和memcpy操作，**1080P单帧处理可以达到9.8ms**，可以直接或经过Resize缩放后加入到实时视觉任务中
 6 | 
 7 | ## 介绍
 8 | 
 9 | 测试场景为去雾工作，输入测试为RGB图的透射图，形式为灰度图，为了可视化保存为了数值范围为[0, 255]的png格式，在main函数中处理为[0, 1]的float类型；引导图为RGB原图，在main函数中处理为[0, 255]的unsigned char类型；输出为[0, 1]的float类型。
10 | 
11 | 其中，RGB原图在进行Guided Filter之前按照BGR通道顺序，处理成了灰度图，如果有需要，可以自行修改[guidedFilter.cu](./guidedFilter.cu)中的to_float_point函数。
12 | 
13 | ## 实验结果
14 | 
15 | 项目中给出的input测试结果如下
16 | 
17 | ```
18 | input/1_transmission.png
19 | This Time: 140025 us
20 | input/2_transmission.png
21 | This Time: 9481 us
22 | input/3_transmission.png
23 | This Time: 10538 us
24 | input/4_transmission.png
25 | This Time: 9458 us
26 | 
27 | Average Time: 9825.67 us
28 | ```
29 | 
30 | PS：第一张处理包含GPU WarmUp过程，故计算平均时长时忽略了该值。
31 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | MIT License
 2 | 
 3 | Copyright (c) 2023 Liu Hengyu
 4 | 
 5 | Permission is hereby granted, free of charge, to any person obtaining a copy
 6 | of this software and associated documentation files (the "Software"), to deal
 7 | in the Software without restriction, including without limitation the rights
 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | # Guided-Filter-Using-CUDA
 2 | 
 3 | [中文](README-zh.md) | [English](README.md)
 4 | 
 5 | This is a GPU implementation of the Guided Filter, using CUDA C/C++. It can **_process a 1080P image in 9.8ms_** on Intel Core i9, Nvidia RTX4090, including malloc and memcpy operations. It can be applied directly or after resizing to real-time visual tasks.
 6 | 
 7 | ## Description
 8 | 
 9 | In the de-fogging test scenario, the input is the transmission map of an RGB image, which is in the form of a grayscale image with values ranging from 0 to 255, saved as a png format for visualization. In the main function, it is processed as a float ranging from 0 to 1. The guide map is the original RGB image, processed as an unsigned char ranging from 0 to 255 in the main function. The output is a float ranging from 0 to 1.
10 | 
11 | Note that the RGB image is processed into a grayscale image before the filter is applied, according to the BGR channel order. If necessary, you can modify the "to_float_point" function in "[guidedFilter.cu](./guidedFilter.cu)".
12 | 
13 | ## Results
14 | 
15 | The timings below, in microseconds, were measured on the input images included in this project.
16 | 
17 | ```
18 | input/1_transmission.png
19 | This Time: 140025 us
20 | input/2_transmission.png
21 | This Time: 9481 us
22 | input/3_transmission.png
23 | This Time: 10538 us
24 | input/4_transmission.png
25 | This Time: 9458 us
26 | 
27 | Average Time: 9825.67 us
28 | ```
29 | 
30 | Note that the processing time of the first image includes the GPU WarmUp process, so it is ignored when calculating the average duration.
31 | 


--------------------------------------------------------------------------------
/main.cu:
--------------------------------------------------------------------------------
  1 | #include "guidedFilter.cuh"
  2 | #include <opencv2/opencv.hpp>
  3 | #include <filesystem>
  4 | 
  5 | using namespace std;
  6 | 
  7 | // Allocates the sixteen float work planes of the guided filter on the device.
  8 | // Each plane is fSize bytes and is requested with cudaMallocAsync on streamIdx;
  9 | // the device addresses are written back through the reference parameters.
 10 | // The return codes of the individual allocations are not inspected.
 11 | void tmpMalloc(float*& fGuidedImg_device, float*& pfInitN_device, float*& pfInitMeanIp_device, float*& pfInitMeanII_device,
 12 |                float*& pfMeanP_device, float*& pfN_device, float*& pfMeanI_device, float*& pfMeanIp_device,
 13 |                float*& pfMeanII_device, float*& pfvarI_device, float*& pfCovIp_device, float*& pfA_device, float*& pfB_device,
 14 |                float*& pfOutA_device, float*& pfOutB_device, float*& pfArrayCum_device, int fSize, cudaStream_t streamIdx){
 15 | 
 16 |     // Addresses of the caller's pointers, in allocation (= parameter) order.
 17 |     float** const planes[] = {
 18 |         &fGuidedImg_device, &pfInitN_device, &pfInitMeanIp_device, &pfInitMeanII_device,
 19 |         &pfMeanP_device, &pfN_device, &pfMeanI_device, &pfMeanIp_device,
 20 |         &pfMeanII_device, &pfvarI_device, &pfCovIp_device, &pfA_device, &pfB_device,
 21 |         &pfOutA_device, &pfOutB_device, &pfArrayCum_device
 22 |     };
 23 | 
 24 |     for (float** plane : planes) {
 25 |         // One stream-ordered allocation per plane.
 26 |         cudaMallocAsync(reinterpret_cast<void**>(plane), fSize, streamIdx);
 27 |     }
 28 | }
 29 | 
 30 | // Releases the sixteen float work planes of the guided filter.
 31 | // One cudaFreeAsync per pointer is queued on streamIdx, in parameter order.
 32 | // The frees are only enqueued; this function does not wait for the stream,
 33 | // and the return codes of the individual calls are not inspected.
 34 | void tmpFree(float* fGuidedImg_device, float* pfInitN_device, float* pfInitMeanIp_device, float* pfInitMeanII_device,
 35 |                float* pfMeanP_device, float* pfN_device, float* pfMeanI_device, float* pfMeanIp_device,
 36 |                float* pfMeanII_device, float* pfvarI_device, float* pfCovIp_device, float* pfA_device, float* pfB_device,
 37 |                float* pfOutA_device, float* pfOutB_device, float* pfArrayCum_device, cudaStream_t streamIdx){
 38 | 
 39 |     // Device pointers in release (= parameter) order.
 40 |     float* const planes[] = {
 41 |         fGuidedImg_device, pfInitN_device, pfInitMeanIp_device, pfInitMeanII_device,
 42 |         pfMeanP_device, pfN_device, pfMeanI_device, pfMeanIp_device,
 43 |         pfMeanII_device, pfvarI_device, pfCovIp_device, pfA_device, pfB_device,
 44 |         pfOutA_device, pfOutB_device, pfArrayCum_device
 45 |     };
 46 | 
 47 |     for (float* plane : planes) {
 48 |         // One stream-ordered free per plane.
 49 |         cudaFreeAsync(plane, streamIdx);
 50 |     }
 51 | }
 52 | 
 53 | 
 54 | // Runs one guided-filter pass from host memory.
 55 | //   input     - rows*cols floats (plane to filter); guideRGB - rows*cols*3 bytes (guide image)
 56 | //   guidedOut - receives rows*cols floats; r / eps are forwarded as filter radius / epsilon
 57 | // All device work is enqueued on streamIdx. The function waits for the stream before
 58 | // returning, so guidedOut is complete even when it refers to pinned host memory.
 59 | void Guided_Filter(float* input, unsigned char* guideRGB, float* guidedOut, int rows, int cols,
 60 |                    int r, float eps, cudaStream_t streamIdx){
 61 |     // Rows are tightly packed: both strides equal the image width.
 62 |     int inStride = cols;
 63 |     int outStride = cols;
 64 |     int rgbSize = rows * cols * 3 * sizeof(unsigned char);
 65 |     int fSize = rows * cols * sizeof(float);
 66 |     // Device copies of the input plane, the guide image and the result.
 67 |     float *d_input = nullptr, *d_guidedOut = nullptr;
 68 |     unsigned char* d_guideRGB = nullptr;
 69 |     cudaMallocAsync((void **)&d_input, fSize, streamIdx);
 70 |     cudaMallocAsync((void **)&d_guidedOut, fSize, streamIdx);
 71 |     cudaMallocAsync((void **)&d_guideRGB, rgbSize, streamIdx);
 72 |     // Upload the host images.
 73 |     cudaMemcpyAsync(d_input, input, fSize, cudaMemcpyHostToDevice, streamIdx);
 74 |     cudaMemcpyAsync(d_guideRGB, guideRGB, rgbSize, cudaMemcpyHostToDevice, streamIdx);
 75 |     // Scratch planes handed to guided_filter_cuda.
 76 |     float *fGuidedImg_device = nullptr, *pfInitN_device = nullptr, *pfInitMeanIp_device = nullptr, *pfInitMeanII_device = nullptr,
 77 |           *pfMeanP_device = nullptr, *pfN_device = nullptr, *pfMeanI_device = nullptr, *pfMeanIp_device = nullptr,
 78 |           *pfMeanII_device = nullptr, *pfvarI_device = nullptr, *pfCovIp_device = nullptr, *pfA_device = nullptr, *pfB_device = nullptr,
 79 |           *pfOutA_device = nullptr, *pfOutB_device = nullptr, *pfArrayCum_device = nullptr;
 80 |     tmpMalloc(fGuidedImg_device, pfInitN_device, pfInitMeanIp_device, pfInitMeanII_device,
 81 |               pfMeanP_device, pfN_device, pfMeanI_device, pfMeanIp_device,
 82 |               pfMeanII_device, pfvarI_device, pfCovIp_device, pfA_device, pfB_device,
 83 |               pfOutA_device, pfOutB_device, pfArrayCum_device, fSize, streamIdx);
 84 |     // Enqueue the filter kernels.
 85 |     guided_filter_cuda(d_input, d_guideRGB, d_guidedOut, rows, cols,
 86 |                        inStride, outStride, r, eps, streamIdx,
 87 |                        fGuidedImg_device, pfInitN_device, pfInitMeanIp_device, pfInitMeanII_device,
 88 |                        pfMeanP_device, pfN_device, pfMeanI_device, pfMeanIp_device,
 89 |                        pfMeanII_device, pfvarI_device, pfCovIp_device, pfA_device, pfB_device,
 90 |                        pfOutA_device, pfOutB_device, pfArrayCum_device);
 91 |     // Download the result, then queue the release of every device buffer.
 92 |     cudaMemcpyAsync(guidedOut, d_guidedOut, fSize, cudaMemcpyDeviceToHost, streamIdx);
 93 |     cudaFreeAsync(d_input, streamIdx);
 94 |     cudaFreeAsync(d_guideRGB, streamIdx);
 95 |     cudaFreeAsync(d_guidedOut, streamIdx);
 96 |     tmpFree(fGuidedImg_device, pfInitN_device, pfInitMeanIp_device, pfInitMeanII_device,
 97 |             pfMeanP_device, pfN_device, pfMeanI_device, pfMeanIp_device,
 98 |             pfMeanII_device, pfvarI_device, pfCovIp_device, pfA_device, pfB_device,
 99 |             pfOutA_device, pfOutB_device, pfArrayCum_device, streamIdx);
100 |     // An asynchronous download may still be in flight when the call returns
101 |     // (notably into pinned host memory); wait so the caller can read guidedOut.
102 |     cudaStreamSynchronize(streamIdx);
103 | }
104 | 
105 | // Usage: GuidedFilter <workspace/>   (the workspace path must end with a path separator)
106 | //
107 | // For every file in <workspace>input/ the guide <workspace>guide/<id>.jpg is loaded, where
108 | // <id> is the file stem up to "_transmission". The filtered result is written to
109 | // <workspace>output/<id>_guided.png and the time spent in Guided_Filter is printed.
110 | // The first processed frame is treated as GPU warm-up and excluded from the average.
111 | // Files that cannot be decoded, or whose size does not fit, are reported and skipped.
112 | int main(int argc, char *argv[]) {
113 |     if (argc < 2) {
114 |         std::cerr << "Usage: GuidedFilter <workspace/>" << std::endl;
115 |         return 1;
116 |     }
117 |     string workspace = argv[1];
118 |     string input_folder = workspace + "input/";
119 |     string guide_folder = workspace + "guide/";
120 |     string output_folder = workspace + "output/";
121 | 
122 |     int frameIdx = 0;           // number of frames processed so far
123 |     cudaStream_t stream = 0;    // default stream
124 | 
125 |     int r = 60;
126 |     float eps = 1e-2f;
127 |     int rows = 0;
128 |     int cols = 0;
129 |     float* input = nullptr;
130 |     unsigned char* guideRGB = nullptr;
131 |     float* guidedOut = nullptr;
132 | 
133 |     auto total_time = std::chrono::duration<double, std::micro>::zero();
134 | 
135 |     for (const auto & entry : std::filesystem::directory_iterator(input_folder)) {
136 |         std::string input_path = entry.path().string();
137 |         std::string basename = entry.path().stem().string(); // file name without extension
138 |         // remove "_transmission" postfix
139 |         std::size_t pos = basename.find("_transmission");
140 |         if (pos != std::string::npos) {
141 |             basename = basename.substr(0, pos);
142 |         }
143 | 
144 |         std::cout<< input_path << std::endl;
145 | 
146 |         // load gray png
147 |         cv::Mat input_img = cv::imread(input_path, cv::IMREAD_GRAYSCALE);
148 |         // load rgb guide
149 |         std::string guide_path = guide_folder + basename + ".jpg";
150 |         cv::Mat guide_img = cv::imread(guide_path, cv::IMREAD_COLOR);
151 |         if (input_img.empty() || guide_img.empty()) {
152 |             std::cerr << "Skipping " << input_path << ": cannot read input or guide image" << std::endl;
153 |             continue;
154 |         }
155 | 
156 |         // The host buffers are sized once, from the first usable frame. The guide must
157 |         // match its input, and every later frame must match the first one; otherwise
158 |         // the memcpy calls below would run past the end of the buffers.
159 |         bool size_ok = guide_img.rows == input_img.rows && guide_img.cols == input_img.cols;
160 |         if (input != nullptr) {
161 |             size_ok = size_ok && input_img.rows == rows && input_img.cols == cols;
162 |         }
163 |         if (!size_ok) {
164 |             std::cerr << "Skipping " << input_path << ": image size mismatch" << std::endl;
165 |             continue;
166 |         }
167 | 
168 |         // init size
169 |         if (input == nullptr) {
170 |             rows = input_img.rows;
171 |             cols = input_img.cols;
172 |             input = new float[rows * cols];
173 |             guideRGB = new unsigned char[rows * cols * 3];
174 |             guidedOut = new float[rows * cols];
175 |         }
176 | 
177 |         // convert input to float
178 |         cv::Mat input_float;
179 |         input_img.convertTo(input_float, CV_32F, 1/255.0);
180 | 
181 |         memcpy(input, input_float.data, rows * cols * sizeof(float));
182 |         memcpy(guideRGB, guide_img.data, rows * cols * 3 * sizeof(unsigned char));
183 | 
184 |         // steady_clock is monotonic, which is what an interval measurement needs
185 |         auto t1 = std::chrono::steady_clock::now();
186 | 
187 |         // guided filter cuda
188 |         Guided_Filter(input, guideRGB, guidedOut, rows, cols, r, eps, stream);
189 | 
190 |         auto t2 = std::chrono::steady_clock::now();
191 |         auto full_time = std::chrono::duration_cast<std::chrono::microseconds>(t2 - t1);
192 |         std::cout << "This Time: " << full_time.count() << " us" << std::endl;
193 |         // ignore warmup
194 |         if(frameIdx!=0)
195 |             total_time += full_time;
196 | 
197 |         // save result
198 |         cv::Mat guided_transmission(rows, cols, CV_32F, guidedOut);
199 |         cv::Mat guided_transmission_8UC1;
200 |         guided_transmission.convertTo(guided_transmission_8UC1, CV_8UC1, 255.0);
201 |         std::string save_path = output_folder + basename + "_guided.png";
202 |         if (!cv::imwrite(save_path, guided_transmission_8UC1)) {
203 |             std::cerr << "Failed to write " << save_path << std::endl;
204 |         }
205 | 
206 |         frameIdx++;
207 |     }
208 | 
209 |     // The average skips the warm-up frame, so it needs at least two processed frames.
210 |     if(frameIdx > 1)
211 |         std::cout << "\nAverage Time: " << total_time.count() / (frameIdx-1) << " us" << std::endl;
212 | 
213 |     delete[] input;
214 |     delete[] guideRGB;
215 |     delete[] guidedOut;
216 | 
217 |     return 0;
218 | }
219 | 


--------------------------------------------------------------------------------
/guidedFilter.cu:
--------------------------------------------------------------------------------
  1 | #include "guidedFilter.cuh"
  2 | 
  3 | // Per-pixel preparation kernel; one thread per pixel (x -> column, y -> row).
  4 | // Converts the interleaved 8-bit BGR guide image to a gray value in [0, 1]
  5 | // and fills the planes that are box-filtered afterwards:
  6 | //   pfInitN = 1,  pfInitMeanIp = gray * ptrT,  pfInitMeanII = gray * gray.
  7 | // The guide image is indexed with inStride (pixels of 3 bytes); all float
  8 | // planes, ptrT included, are indexed with outStride.
  9 | // For an RGB-ordered guide, swap the 0.114 and 0.299 channel weights.
 10 | __global__ void to_float_point(float* ptrT, float *fGuidedImg, unsigned char *guidedRGBImg,
 11 |                                float *pfInitN, float *pfInitMeanIp, float *pfInitMeanII,
 12 |                                int rows, int cols, int inStride, int outStride){
 13 |     const int col = blockDim.x * blockIdx.x + threadIdx.x;
 14 |     const int row = blockDim.y * blockIdx.y + threadIdx.y;
 15 |     if (row >= rows || col >= cols) {
 16 |         return;  // this thread lies outside the image
 17 |     }
 18 | 
 19 |     const int pix = row * outStride + col;
 20 |     const unsigned char* bgr = guidedRGBImg + (row * inStride * 3 + col * 3);
 21 |     // BGR luma weights, evaluated in double precision, then divided by 255.
 22 |     const float gray = float((0.114 * bgr[0] + 0.587 * bgr[1] + 0.299 * bgr[2]) / 255.);
 23 |     fGuidedImg[pix] = gray;
 24 |     pfInitN[pix] = 1.0f;
 25 |     pfInitMeanIp[pix] = gray * ptrT[pix];
 26 |     pfInitMeanII[pix] = gray * gray;
 27 | }
 28 | 
 29 | 
 30 | // Seeds row 0 of the column-wise running sum: pfArrayCum[j] = pfInArray[j].
 31 | // Only threads of grid row 0 write; `rows` is kept for interface compatibility.
 32 | __global__ void init_pfArrayCum_Y(float* pfInArray, float *pfArrayCum, int rows, int cols){
 33 |     const int j = blockDim.x * blockIdx.x + threadIdx.x;
 34 |     const int i = blockDim.y * blockIdx.y + threadIdx.y;
 35 |     if (j < cols && i == 0) pfArrayCum[j] = pfInArray[j];
 36 | }
 37 | 
 38 | // Inclusive running sum down each column (row 0 of pfArrayCum holds the seed):
 39 | //   pfArrayCum[k][j] = pfArrayCum[k-1][j] + pfInArray[k][j],  k = 1 .. rows-1.
 40 | __global__ void pfArrayCum_Y(float* pfInArray, float *pfArrayCum, int rows, int cols, int stride){
 41 |     const int col = blockDim.x * blockIdx.x + threadIdx.x;
 42 |     if (blockDim.y * blockIdx.y + threadIdx.y != 0 || col >= cols || rows < 2) return;  // only grid row 0 works
 43 |     float running = pfArrayCum[col];
 44 |     for (int k = 1, at = stride + col; k < rows; ++k, at += stride) {
 45 |         running += pfInArray[at];
 46 |         pfArrayCum[at] = running;
 47 |     }
 48 | }
 49 | 
 50 | // Vertical box sum taken from the column-wise running sum pfArrayCum:
 51 | //   fOutArray[i][j] = sum of the input over rows max(0, i-nR) .. min(rows-1, i+nR).
 52 | // Both window ends are clamped to the image, so the top rows use the same
 53 | // i+nR upper bound that diff_X_axis uses for columns (reading i+nR+1 takes
 54 | // one row too many) and images with rows <= 2*nR stay within bounds.
 55 | __global__ void diff_Y_axis( float* fOutArray, float *pfArrayCum, int nR, int rows, int cols, int stride){
 56 |     int j = blockDim.x * blockIdx.x + threadIdx.x;
 57 |     int i = blockDim.y * blockIdx.y + threadIdx.y;
 58 |     if((j >= cols) || (i >= rows)){
 59 |         return;
 60 |     }
 61 |     const int last = min(i + nR, rows - 1);   // last row inside the window
 62 |     const int before = i - nR - 1;            // row just above the window
 63 |     float sum = pfArrayCum[last * stride + j];
 64 |     if (before >= 0) sum -= pfArrayCum[before * stride + j];
 65 |     fOutArray[i * stride + j] = sum;
 66 | }
 67 | 
 68 | 
 69 | // Seeds column 0 of the row-wise running sum:
 70 | //   pfArrayCum[i][0] = fOutArray[i][0].
 71 | // Only threads of grid column 0 write; `cols` is kept for interface compatibility.
 72 | __global__ void init_pfArrayCum_X(float* fOutArray, float *pfArrayCum, int rows, int cols, int stride){
 73 |     const int j = blockDim.x * blockIdx.x + threadIdx.x;
 74 |     const int i = blockDim.y * blockIdx.y + threadIdx.y;
 75 |     if (i < rows && j == 0) pfArrayCum[i * stride] = fOutArray[i * stride];
 76 | }
 77 | 
 78 | // Inclusive running sum along each row (column 0 of pfArrayCum holds the seed):
 79 | //   pfArrayCum[i][k] = pfArrayCum[i][k-1] + fOutArray[i][k],  k = 1 .. cols-1.
 80 | __global__ void pfArrayCum_X(float* fOutArray, float *pfArrayCum, int rows, int cols, int stride){
 81 |     const int row = blockDim.y * blockIdx.y + threadIdx.y;
 82 |     if (blockDim.x * blockIdx.x + threadIdx.x != 0 || row >= rows || cols < 2) return;  // only grid column 0 works
 83 |     float running = pfArrayCum[row * stride];
 84 |     for (int k = 1; k < cols; ++k) {
 85 |         running += fOutArray[row * stride + k];
 86 |         pfArrayCum[row * stride + k] = running;
 87 |     }
 88 | }
 89 | 
 90 | // Horizontal box sum taken from the row-wise running sum pfArrayCum:
 91 | //   fOutArray[i][j] = sum of the input over columns max(0, j-nR) .. min(cols-1, j+nR).
 92 | // Clamping both window ends gives the usual three-range result (left edge,
 93 | // interior, right edge) when cols > 2*nR and keeps reads in bounds otherwise.
 94 | __global__ void diff_X_axis( float* fOutArray, float *pfArrayCum, int nR, int rows, int cols, int stride){
 95 |     int j = blockDim.x * blockIdx.x + threadIdx.x;
 96 |     int i = blockDim.y * blockIdx.y + threadIdx.y;
 97 |     if((j >= cols) || (i >= rows)){
 98 |         return;
 99 |     }
100 |     const int last = min(j + nR, cols - 1);   // last column inside the window
101 |     const int before = j - nR - 1;            // column just left of the window
102 |     float sum = pfArrayCum[i * stride + last];
103 |     if (before >= 0) sum -= pfArrayCum[i * stride + before];
104 |     fOutArray[i * stride + j] = sum;
105 | }
106 | 
107 | 
108 | // Box sum of radius nR: pfInArray -> fOutArray, with pfArrayCum as scratch. A column pass (seed, scan, window
109 | // difference) is followed by a row pass; seed/scan use one thread per column/row, hence their 1-D launches.
110 | void BoxFilter_gpu(float* pfArrayCum, float* pfInArray, float* fOutArray, int nR, int rows, int cols, int stride, cudaStream_t stream){
111 |     dim3 blockSize(BLOCK_W, BLOCK_H), gridSize((cols + BLOCK_W - 1) / BLOCK_W, (rows + BLOCK_H - 1) / BLOCK_H);
112 |     dim3 colBlock(BLOCK_W, 1), colGrid(gridSize.x, 1), rowBlock(1, BLOCK_H), rowGrid(1, gridSize.y);
113 |     init_pfArrayCum_Y<<<colGrid, colBlock, 0, stream>>>(pfInArray, pfArrayCum, rows, cols);
114 |     pfArrayCum_Y<<<colGrid, colBlock, 0, stream>>>(pfInArray, pfArrayCum, rows, cols, stride);
115 |     diff_Y_axis<<<gridSize, blockSize, 0, stream>>>(fOutArray, pfArrayCum, nR, rows, cols, stride);
116 |     init_pfArrayCum_X<<<rowGrid, rowBlock, 0, stream>>>(fOutArray, pfArrayCum, rows, cols, stride);
117 |     pfArrayCum_X<<<rowGrid, rowBlock, 0, stream>>>(fOutArray, pfArrayCum, rows, cols, stride);
118 |     diff_X_axis<<<gridSize, blockSize, 0, stream>>>(fOutArray, pfArrayCum, nR, rows, cols, stride);
119 | }
120 | 
121 | // Per-pixel linear coefficients of the guided filter (one thread per pixel).
122 | // pfMeanI, pfMeanP, pfMeanIp and pfMeanII are each divided in place by pfN
123 | // (with I the guide and p the filtered input, as the callers name them), then
124 | //   pfCovIp = pfMeanIp - pfMeanI * pfMeanP
125 | //   pfvarI  = pfMeanII - pfMeanI * pfMeanI
126 | //   pfA     = pfCovIp / (pfvarI + fEps)
127 | //   pfB     = pfMeanP - pfA * pfMeanI
128 | __global__ void set_value(float* pfMeanI, float* pfMeanP, float* pfN, float* pfMeanIp,
129 |                           float* pfCovIp, float* pfMeanII, float* pfvarI, float* pfA, float* pfB,
130 |                           float fEps, int rows, int cols, int outStride){
131 |     const int col = blockDim.x * blockIdx.x + threadIdx.x;
132 |     const int row = blockDim.y * blockIdx.y + threadIdx.y;
133 |     if (row >= rows || col >= cols) return;
134 |     const int pix = row * outStride + col;
135 |     const float count = pfN[pix];
136 |     const float meanI = pfMeanI[pix] / count;
137 |     const float meanP = pfMeanP[pix] / count;
138 |     const float meanIp = pfMeanIp[pix] / count;
139 |     const float meanII = pfMeanII[pix] / count;
140 |     const float covIp = meanIp - meanI * meanP;
141 |     const float varI = meanII - meanI * meanI;
142 |     const float a = covIp / (varI + fEps);
143 |     // Write the normalised means and the derived quantities back.
144 |     pfMeanI[pix] = meanI;   pfMeanP[pix] = meanP;
145 |     pfMeanIp[pix] = meanIp; pfMeanII[pix] = meanII;
146 |     pfCovIp[pix] = covIp;   pfvarI[pix] = varI;
147 |     pfA[pix] = a;           pfB[pix] = meanP - a * meanI;
148 | }
149 | 
150 | 
151 | // Final guided-filter output, one thread per pixel:
152 | //   ptrGuidedT = (pfOutA * fGuidedImg + pfOutB) / pfN
153 | // guided_filter_cuda passes box sums of the a / b coefficients as pfOutA /
154 | // pfOutB and the window pixel count as pfN, so the division yields
155 | // mean(a) * I + mean(b). All planes are indexed with outStride.
156 | __global__ void get_guide_output(float* ptrGuidedT, float* pfOutA, float* fGuidedImg,
157 |                                  float* pfOutB, float* pfN, int rows, int cols, int outStride){
158 |     int j = blockDim.x * blockIdx.x + threadIdx.x;
159 |     int i = blockDim.y * blockIdx.y + threadIdx.y;
160 |     if((j >= cols) || (i >= rows)){
161 |         return;
162 |     }
163 | 
164 |     const int idx = i * outStride + j;
165 |     ptrGuidedT[idx] = (pfOutA[idx] * fGuidedImg[idx] + pfOutB[idx]) / pfN[idx];
166 | }
167 | 
168 | // Guided filter of ptrT_device (input p) steered by guidedRGBImg_device (guide I);
169 | // the result is written to ptrGuidedT_device. Everything is enqueued on `stream`
170 | // in this order:
171 | //   1. to_float_point : gray guide plane plus the 1, I*p and I*I planes
172 | //   2. BoxFilter_gpu  : box sums of 1 (-> pfN), I, p, I*p and I*I
173 | //   3. set_value      : means, covariance, variance and the a / b coefficients
174 | //   4. BoxFilter_gpu  : box sums of a and b
175 | //   5. get_guide_output : (sum(a) * I + sum(b)) / pfN
176 | // m_nGBlockSize is the box radius, fEps the regulariser used in step 3. All
177 | // *_device scratch planes must be provided by the caller; pfArrayCum_device is
178 | // the running-sum scratch shared by every box filter call.
179 | void guided_filter_cuda(float* ptrT_device, unsigned char* guidedRGBImg_device, float* ptrGuidedT_device, int rows, int cols,
180 |                int inStride, int outStride, int m_nGBlockSize, float fEps, cudaStream_t stream,
181 |                float* fGuidedImg_device, float* pfInitN_device, float* pfInitMeanIp_device, float* pfInitMeanII_device,
182 |                float* pfMeanP_device, float* pfN_device, float* pfMeanI_device, float* pfMeanIp_device,
183 |                float* pfMeanII_device, float* pfvarI_device, float* pfCovIp_device, float* pfA_device, float* pfB_device,
184 |                float* pfOutA_device, float* pfOutB_device, float* pfArrayCum_device){
185 |     dim3 gridSize((cols + BLOCK_W - 1) / BLOCK_W, (rows + BLOCK_H - 1) / BLOCK_H);
186 |     dim3 blockSize(BLOCK_W, BLOCK_H);
187 | 
188 |     // Shorthand for the repeated call: box-sum `src` into `dst` with the common radius and scratch plane.
189 |     auto boxSum = [&](float* src, float* dst) {
190 |         BoxFilter_gpu(pfArrayCum_device, src, dst, m_nGBlockSize, rows, cols, outStride, stream);
191 |     };
192 | 
193 |     // 1. per-pixel preparation
194 |     to_float_point<<<gridSize, blockSize, 0, stream>>>(
195 |         ptrT_device, fGuidedImg_device, guidedRGBImg_device,
196 |         pfInitN_device, pfInitMeanIp_device, pfInitMeanII_device,
197 |         rows, cols, inStride, outStride);
198 | 
199 |     // 2. box sums of the prepared planes
200 |     boxSum(pfInitN_device, pfN_device);            // N
201 |     boxSum(fGuidedImg_device, pfMeanI_device);     // Mean_I
202 |     boxSum(ptrT_device, pfMeanP_device);           // Mean_P
203 |     boxSum(pfInitMeanIp_device, pfMeanIp_device);  // mean_IP
204 |     boxSum(pfInitMeanII_device, pfMeanII_device);  // mean_II
205 | 
206 |     // 3. linear coefficients a and b
207 |     set_value<<<gridSize, blockSize, 0, stream>>>(
208 |         pfMeanI_device, pfMeanP_device, pfN_device, pfMeanIp_device,
209 |         pfCovIp_device, pfMeanII_device, pfvarI_device, pfA_device, pfB_device,
210 |         fEps, rows, cols, outStride);
211 | 
212 |     // 4. box sums of the coefficients
213 |     boxSum(pfA_device, pfOutA_device);
214 |     boxSum(pfB_device, pfOutB_device);
215 | 
216 |     // 5. filtered output
217 |     get_guide_output<<<gridSize, blockSize, 0, stream>>>(
218 |         ptrGuidedT_device, pfOutA_device, fGuidedImg_device, pfOutB_device, pfN_device,
219 |         rows, cols, outStride);
220 | }


--------------------------------------------------------------------------------