├── guide ├── 1.jpg ├── 2.jpg ├── 3.jpg └── 4.jpg ├── output ├── 1_guided.png ├── 2_guided.png ├── 3_guided.png └── 4_guided.png ├── input ├── 1_transmission.png ├── 2_transmission.png ├── 3_transmission .png └── 4_transmission.png ├── CMakeLists.txt ├── guidedFilter.cuh ├── README-zh.md ├── LICENSE ├── README.md ├── main.cu └── guidedFilter.cu /guide/1.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Plumess/Guided-Filter-Using-CUDA/HEAD/guide/1.jpg -------------------------------------------------------------------------------- /guide/2.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Plumess/Guided-Filter-Using-CUDA/HEAD/guide/2.jpg -------------------------------------------------------------------------------- /guide/3.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Plumess/Guided-Filter-Using-CUDA/HEAD/guide/3.jpg -------------------------------------------------------------------------------- /guide/4.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Plumess/Guided-Filter-Using-CUDA/HEAD/guide/4.jpg -------------------------------------------------------------------------------- /output/1_guided.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Plumess/Guided-Filter-Using-CUDA/HEAD/output/1_guided.png -------------------------------------------------------------------------------- /output/2_guided.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Plumess/Guided-Filter-Using-CUDA/HEAD/output/2_guided.png -------------------------------------------------------------------------------- /output/3_guided.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/Plumess/Guided-Filter-Using-CUDA/HEAD/output/3_guided.png -------------------------------------------------------------------------------- /output/4_guided.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Plumess/Guided-Filter-Using-CUDA/HEAD/output/4_guided.png -------------------------------------------------------------------------------- /input/1_transmission.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Plumess/Guided-Filter-Using-CUDA/HEAD/input/1_transmission.png -------------------------------------------------------------------------------- /input/2_transmission.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Plumess/Guided-Filter-Using-CUDA/HEAD/input/2_transmission.png -------------------------------------------------------------------------------- /input/3_transmission .png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Plumess/Guided-Filter-Using-CUDA/HEAD/input/3_transmission .png -------------------------------------------------------------------------------- /input/4_transmission.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Plumess/Guided-Filter-Using-CUDA/HEAD/input/4_transmission.png -------------------------------------------------------------------------------- /CMakeLists.txt: -------------------------------------------------------------------------------- 1 | cmake_minimum_required(VERSION 3.20 FATAL_ERROR) 2 | project(GuidedFilter LANGUAGES CXX CUDA) 3 | 4 | set(CMAKE_CXX_STANDARD 17) 5 | set(CMAKE_CUDA_STANDARD 17) 6 | 7 | find_package(CUDA REQUIRED) 8 | 9 | 
include_directories(${CUDA_INCLUDE_DIRS}) 10 | 11 | file(GLOB CU_SOURCES ./*.cu) 12 | file(GLOB HEADERS ./*.h ./*.cuh) 13 | 14 | add_executable(GuidedFilter ${CU_SOURCES} ${CXX_SOURCES} ${HEADERS}) 15 | set_target_properties(GuidedFilter PROPERTIES CUDA_SEPARABLE_COMPILATION ON) 16 | 17 | # 设置 OpenCV 路径 18 | set(OpenCV_DIR "E:/OpenCV/install") # 根据你的实际情况修改此处路径 19 | find_package(OpenCV REQUIRED) 20 | target_link_libraries(GuidedFilter ${OpenCV_LIBS}) 21 | 22 | set(CMAKE_BUILD_TYPE Debug CACHE STRING "set build type to debug") 23 | 24 | -------------------------------------------------------------------------------- /guidedFilter.cuh: -------------------------------------------------------------------------------- 1 | 2 | #define BLOCK_W 32 3 | #define BLOCK_H 32 4 | 5 | #include 6 | #include "device_launch_parameters.h" 7 | 8 | void guided_filter_cuda(float* ptrT_device, unsigned char* guidedRGBImg_device, float* ptrGuidedT_device, int rows, int cols, 9 | int inStride, int outStride, int m_nGBlockSize, float fEps, cudaStream_t stream, 10 | float* fGuidedImg_device, float* pfInitN_device, float* pfInitMeanIp_device, float* pfInitMeanII_device, 11 | float* pfMeanP_device, float* pfN_device, float* pfMeanI_device, float* pfMeanIp_device, 12 | float* pfMeanII_device, float* pfvarI_device, float* pfCovIp_device, float* pfA_device, float* pfB_device, 13 | float* pfOutA_device, float* pfOutB_device, float* pfArrayCum_device); -------------------------------------------------------------------------------- /README-zh.md: -------------------------------------------------------------------------------- 1 | # Guided-Filter-Using-CUDA 2 | 3 | [中文](README-zh.md) | [English](README.md) 4 | 5 | 这是导向滤波/引导滤波的一种GPU实现,经测试,在i9, RTX4090上运行,包括malloc和memcpy操作,**1080P单帧处理可以达到9.8ms**,可以直接或经过Resize缩放后加入到实时视觉任务中 6 | 7 | ## 介绍 8 | 9 | 测试场景为去雾工作,输入测试为RGB图的透射图,形式为灰度图,为了可视化保存为了数值范围为[0, 255]的png格式,在main函数中处理为[0, 1]的float类型;引导图为RGB原图,在main函数中处理为[0, 255]的unsigned char类型;输出为[0, 1]的float类型。 10 | 11 | 
其中,RGB原图在进行Guided Filter之前按照BGR通道顺序,处理成了灰度图,如果有需要,可以自行修改[guidedFilter.cu](./guidedFilter.cu)中的to_float_point函数。 12 | 13 | ## 实验结果 14 | 15 | 项目中给出的input测试结果如下 16 | 17 | ``` 18 | input/1_transmission.png 19 | This Time: 140025 us 20 | input/2_transmission.png 21 | This Time: 9481 us 22 | input/3_transmission.png 23 | This Time: 10538 us 24 | input/4_transmission.png 25 | This Time: 9458 us 26 | 27 | Average Time: 9825.67 us 28 | ``` 29 | 30 | PS:第一张处理包含GPU WarmUp过程,故计算平均时长时忽略了该值。 31 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2023 Liu Hengyu 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Guided-Filter-Using-CUDA 2 | 3 | [中文](README-zh.md) | [English](README.md) 4 | 5 | This is a GPU implementation of the Guided Filter, using CUDA C/C++. It can **_process a 1080P image in 9.8ms_** on Intel Core i9, Nvidia RTX4090, including malloc and memcpy operations. It can be applied directly or after resizing to real-time visual tasks. 6 | 7 | ## Description 8 | 9 | In the de-fogging test scenario, the input is the transmission map of an RGB image, which is in the form of a grayscale image with values ranging from 0 to 255, saved as a png format for visualization. In the main function, it is processed as a float ranging from 0 to 1. The guide map is the original RGB image, processed as an unsigned char ranging from 0 to 255 in the main function. The output is a float ranging from 0 to 1. 10 | 11 | Note that the RGB image is processed into a grayscale image before the filter is applied, according to the BGR channel order. If necessary, you can modify the "to_float_point" function in "[guidedFilter.cu](./guidedFilter.cu)". 12 | 13 | ## Results 14 | 15 | The data below shows the processing time in microseconds for a given input image. 16 | 17 | ``` 18 | input/1_transmission.png 19 | This Time: 140025 us 20 | input/2_transmission.png 21 | This Time: 9481 us 22 | input/3_transmission.png 23 | This Time: 10538 us 24 | input/4_transmission.png 25 | This Time: 9458 us 26 | 27 | Average Time: 9825.67 us 28 | ``` 29 | 30 | Note that the processing time of the first image includes the GPU WarmUp process, so it is ignored when calculating the average duration. 
31 | -------------------------------------------------------------------------------- /main.cu: -------------------------------------------------------------------------------- 1 | #include "guidedFilter.cuh" 2 | #include 3 | #include 4 | 5 | using namespace std; 6 | 7 | void tmpMalloc(float*& fGuidedImg_device, float*& pfInitN_device, float*& pfInitMeanIp_device, float*& pfInitMeanII_device, 8 | float*& pfMeanP_device, float*& pfN_device, float*& pfMeanI_device, float*& pfMeanIp_device, 9 | float*& pfMeanII_device, float*& pfvarI_device, float*& pfCovIp_device, float*& pfA_device, float*& pfB_device, 10 | float*& pfOutA_device, float*& pfOutB_device, float*& pfArrayCum_device, int fSize, cudaStream_t streamIdx){ 11 | 12 | cudaMallocAsync((void **)&fGuidedImg_device, fSize, streamIdx); 13 | cudaMallocAsync((void **)&pfInitN_device, fSize, streamIdx); 14 | cudaMallocAsync((void **)&pfInitMeanIp_device, fSize, streamIdx); 15 | cudaMallocAsync((void **)&pfInitMeanII_device, fSize, streamIdx); 16 | cudaMallocAsync((void **)&pfMeanP_device, fSize, streamIdx); 17 | cudaMallocAsync((void **)&pfN_device, fSize, streamIdx); 18 | cudaMallocAsync((void **)&pfMeanI_device, fSize, streamIdx); 19 | cudaMallocAsync((void **)&pfMeanIp_device, fSize, streamIdx); 20 | cudaMallocAsync((void **)&pfMeanII_device, fSize, streamIdx); 21 | cudaMallocAsync((void **)&pfvarI_device, fSize, streamIdx); 22 | cudaMallocAsync((void **)&pfCovIp_device, fSize, streamIdx); 23 | cudaMallocAsync((void **)&pfA_device, fSize, streamIdx); 24 | cudaMallocAsync((void **)&pfB_device, fSize, streamIdx); 25 | cudaMallocAsync((void **)&pfOutA_device, fSize, streamIdx); 26 | cudaMallocAsync((void **)&pfOutB_device, fSize, streamIdx); 27 | cudaMallocAsync((void **)&pfArrayCum_device, fSize, streamIdx); 28 | } 29 | 30 | void tmpFree(float* fGuidedImg_device, float* pfInitN_device, float* pfInitMeanIp_device, float* pfInitMeanII_device, 31 | float* pfMeanP_device, float* pfN_device, float* pfMeanI_device, float* 
pfMeanIp_device, 32 | float* pfMeanII_device, float* pfvarI_device, float* pfCovIp_device, float* pfA_device, float* pfB_device, 33 | float* pfOutA_device, float* pfOutB_device, float* pfArrayCum_device, cudaStream_t streamIdx){ 34 | 35 | cudaFreeAsync(fGuidedImg_device, streamIdx); 36 | cudaFreeAsync(pfInitN_device, streamIdx); 37 | cudaFreeAsync(pfInitMeanIp_device, streamIdx); 38 | cudaFreeAsync(pfInitMeanII_device, streamIdx); 39 | cudaFreeAsync(pfMeanP_device, streamIdx); 40 | cudaFreeAsync(pfN_device, streamIdx); 41 | cudaFreeAsync(pfMeanI_device, streamIdx); 42 | cudaFreeAsync(pfMeanIp_device, streamIdx); 43 | cudaFreeAsync(pfMeanII_device, streamIdx); 44 | cudaFreeAsync(pfvarI_device, streamIdx); 45 | cudaFreeAsync(pfCovIp_device, streamIdx); 46 | cudaFreeAsync(pfA_device, streamIdx); 47 | cudaFreeAsync(pfB_device, streamIdx); 48 | cudaFreeAsync(pfOutA_device, streamIdx); 49 | cudaFreeAsync(pfOutB_device, streamIdx); 50 | cudaFreeAsync(pfArrayCum_device, streamIdx); 51 | } 52 | 53 | 54 | void Guided_Filter(float* input, unsigned char* guideRGB, float* guidedOut, int rows, int cols, 55 | int r, float eps, cudaStream_t streamIdx){ 56 | 57 | int inStride = cols; 58 | int outStride = cols; 59 | int rgbSize = rows * cols * 3 * sizeof(unsigned char); 60 | int fSize = rows * cols * sizeof(float); 61 | 62 | // Malloc Device Memory 63 | float* d_input, *d_guidedOut; 64 | unsigned char* d_guideRGB; 65 | cudaMallocAsync((void **)&d_input, fSize, streamIdx); 66 | cudaMallocAsync((void **)&d_guidedOut, fSize, streamIdx); 67 | cudaMallocAsync((void **)&d_guideRGB, rgbSize, streamIdx); 68 | 69 | // Copy from Host Memory 70 | cudaMemcpyAsync(d_input, input, rows * cols * sizeof(float), cudaMemcpyHostToDevice, streamIdx); 71 | cudaMemcpyAsync(d_guideRGB, guideRGB, rows * cols * 3 * sizeof(unsigned char), cudaMemcpyHostToDevice, streamIdx); 72 | 73 | // Malloc Temporary Memory 74 | float *fGuidedImg_device = nullptr, *pfInitN_device = nullptr, *pfInitMeanIp_device = nullptr, 
*pfInitMeanII_device = nullptr, 75 | *pfMeanP_device = nullptr, *pfN_device = nullptr, *pfMeanI_device = nullptr, *pfMeanIp_device = nullptr, 76 | *pfMeanII_device = nullptr, *pfvarI_device = nullptr, *pfCovIp_device = nullptr, *pfA_device = nullptr, *pfB_device = nullptr, 77 | *pfOutA_device = nullptr, *pfOutB_device = nullptr, *pfArrayCum_device = nullptr; 78 | 79 | tmpMalloc(fGuidedImg_device, pfInitN_device, pfInitMeanIp_device, pfInitMeanII_device, 80 | pfMeanP_device, pfN_device, pfMeanI_device, pfMeanIp_device, 81 | pfMeanII_device, pfvarI_device, pfCovIp_device, pfA_device, pfB_device, 82 | pfOutA_device, pfOutB_device, pfArrayCum_device, fSize, streamIdx); 83 | 84 | // Guided Filter using CUDA 85 | guided_filter_cuda(d_input, d_guideRGB, d_guidedOut, rows, cols, 86 | inStride, outStride, r, eps, streamIdx, 87 | fGuidedImg_device, pfInitN_device, pfInitMeanIp_device, pfInitMeanII_device, 88 | pfMeanP_device, pfN_device, pfMeanI_device, pfMeanIp_device, 89 | pfMeanII_device, pfvarI_device, pfCovIp_device, pfA_device, pfB_device, 90 | pfOutA_device, pfOutB_device, pfArrayCum_device); 91 | 92 | // Out 93 | cudaMemcpyAsync(guidedOut, d_guidedOut, rows * cols * sizeof(float), cudaMemcpyDeviceToHost, streamIdx); 94 | 95 | // Free 96 | cudaFreeAsync(d_input, streamIdx); 97 | cudaFreeAsync(d_guideRGB, streamIdx); 98 | cudaFreeAsync(d_guidedOut, streamIdx); 99 | tmpFree(fGuidedImg_device, pfInitN_device, pfInitMeanIp_device, pfInitMeanII_device, 100 | pfMeanP_device, pfN_device, pfMeanI_device, pfMeanIp_device, 101 | pfMeanII_device, pfvarI_device, pfCovIp_device, pfA_device, pfB_device, 102 | pfOutA_device, pfOutB_device, pfArrayCum_device, streamIdx); 103 | } 104 | 105 | int main(int argc, char *argv[]) { 106 | string workspace = argv[1]; 107 | string input_folder = workspace + "input/"; 108 | string guide_folder = workspace + "guide/"; 109 | string output_folder = workspace + "output/"; 110 | 111 | int frameIdx = 0; 112 | int streamIdx = 0; 113 | 114 | int r = 
60; 115 | float eps = 1e-2; 116 | int rows = 0; 117 | int cols = 0; 118 | float* input = nullptr; 119 | unsigned char* guideRGB = nullptr; 120 | float* guidedOut = nullptr; 121 | 122 | auto total_time = std::chrono::duration::zero(); 123 | 124 | for (const auto & entry : std::filesystem::directory_iterator(input_folder)) { 125 | std::string input_path = entry.path().string(); 126 | std::string filename = entry.path().filename().string(); // get filename include extensions 127 | std::string basename = entry.path().stem().string(); // get filename without extensions 128 | // remove "_transmission" postfix 129 | std::size_t pos = basename.find("_transmission"); 130 | if (pos != std::string::npos) { 131 | basename = basename.substr(0, pos); 132 | } 133 | 134 | std::cout<< input_path << std::endl; 135 | 136 | // load gray png 137 | cv::Mat input_img = cv::imread(input_path, cv::IMREAD_GRAYSCALE); 138 | // load rgb guide 139 | std::string guide_path = guide_folder + basename + ".jpg"; 140 | cv::Mat guide_img = cv::imread(guide_path, cv::IMREAD_COLOR); 141 | 142 | // init size 143 | if(frameIdx==0){ 144 | rows = input_img.rows; 145 | cols = input_img.cols; 146 | input = new float[rows * cols]; 147 | guideRGB = new unsigned char[rows * cols * 3]; 148 | guidedOut = new float[rows * cols]; 149 | } 150 | 151 | // convert input to float 152 | cv::Mat input_float; 153 | input_img.convertTo(input_float, CV_32F, 1/255.0); 154 | 155 | memcpy(input, input_float.data, rows * cols * sizeof(float)); 156 | memcpy(guideRGB, guide_img.data, rows * cols * 3 * sizeof(unsigned char)); 157 | 158 | auto t1 = std::chrono::system_clock::now(); 159 | 160 | // guided filter cuda 161 | Guided_Filter(input, guideRGB, guidedOut, rows, cols, r, eps, reinterpret_cast(streamIdx)); 162 | 163 | auto t2 = std::chrono::system_clock::now(); 164 | auto full_time = std::chrono::duration_cast(t2 - t1); 165 | std::cout << "This Time: " << full_time.count() << " us" << std::endl; 166 | // ignore warmup 167 | 
if(frameIdx!=0) 168 | total_time += full_time; 169 | 170 | // save result 171 | cv::Mat guided_transmission(rows, cols, CV_32F, guidedOut); 172 | cv::Mat guided_transmission_8UC1; 173 | guided_transmission.convertTo(guided_transmission_8UC1, CV_8UC1, 255.0); 174 | std::string save_path = output_folder + basename + "_guided.png"; 175 | cv::imwrite(save_path, guided_transmission_8UC1); 176 | 177 | frameIdx++; 178 | } 179 | 180 | // ignore warmup 181 | if(frameIdx!=0) 182 | std::cout << "\nAverage Time: " << total_time.count() / (frameIdx-1) << " us" << std::endl; 183 | 184 | delete[] input; 185 | delete[] guideRGB; 186 | delete[] guidedOut; 187 | 188 | return 0; 189 | } 190 | 191 | -------------------------------------------------------------------------------- /guidedFilter.cu: -------------------------------------------------------------------------------- 1 | #include "guidedFilter.cuh" 2 | 3 | __global__ void to_float_point(float* ptrT, 4 | float *fGuidedImg, 5 | unsigned char *guidedRGBImg, 6 | float *pfInitN, 7 | float *pfInitMeanIp, 8 | float *pfInitMeanII, 9 | int rows, 10 | int cols, 11 | int inStride, 12 | int outStride){ 13 | 14 | int j = blockDim.x * blockIdx.x + threadIdx.x; 15 | int i = blockDim.y * blockIdx.y + threadIdx.y; 16 | if((j >= cols) || (i >= rows)){ 17 | return; 18 | } 19 | //RGB 20 | //fGuidedImg[i * outStride + j] = float((0.299 * guidedRGBImg[i * inStride * 3 + j * 3 + 0] + 0.587 * guidedRGBImg[i * inStride * 3 + j * 3 + 1] + 0.114 * guidedRGBImg[i * inStride * 3 + j * 3 + 2]) / 255.); 21 | //BGR 22 | fGuidedImg[i * outStride + j] = float((0.114 * guidedRGBImg[i * inStride * 3 + j * 3 + 0] + 0.587 * guidedRGBImg[i * inStride * 3 + j * 3 + 1] + 0.299 * guidedRGBImg[i * inStride * 3 + j * 3 + 2]) / 255.); 23 | //N 24 | pfInitN[i * outStride + j] = 1.0f; 25 | pfInitMeanIp[i * outStride + j] = fGuidedImg[i * outStride + j] * ptrT[i * outStride + j]; 26 | pfInitMeanII[i * outStride + j] = fGuidedImg[i * outStride + j] * fGuidedImg[i * 
outStride + j]; 27 | } 28 | 29 | 30 | __global__ void init_pfArrayCum_Y(float* pfInArray, float *pfArrayCum, int rows, int cols){ 31 | int j = blockDim.x * blockIdx.x + threadIdx.x; 32 | if(j >= cols){ 33 | return; 34 | } 35 | pfArrayCum[j] = pfInArray[j]; 36 | } 37 | 38 | __global__ void pfArrayCum_Y(float* pfInArray, float *pfArrayCum, int rows, int cols, int stride){ 39 | int j = blockDim.x * blockIdx.x + threadIdx.x; 40 | int i = blockDim.y * blockIdx.y + threadIdx.y; 41 | if((j >= cols) || (i > 0)){ 42 | return; 43 | } 44 | 45 | for(int k = 1; k < rows; k++){ 46 | pfArrayCum[k * stride + j] = pfArrayCum[(k - 1) * stride + j] + pfInArray[k * stride + j]; 47 | } 48 | } 49 | 50 | __global__ void diff_Y_axis( float* fOutArray, float *pfArrayCum, int nR, int rows, int cols, int stride){ 51 | int j = blockDim.x * blockIdx.x + threadIdx.x; 52 | int i = blockDim.y * blockIdx.y + threadIdx.y; 53 | if((j >= cols) || (i >= rows)){ 54 | return; 55 | } 56 | 57 | if(i < (nR + 1)){ 58 | fOutArray[i * stride + j] = pfArrayCum[(i + nR + 1) * stride + j]; 59 | } 60 | if(i >=(nR + 1) && i < (rows - nR)){ 61 | fOutArray[i * stride + j] = pfArrayCum[(i + nR) * stride + j] - pfArrayCum[(i - nR - 1) * stride + j]; 62 | } 63 | if(i>=(rows - nR) && i < rows){ 64 | fOutArray[i * stride + j] = pfArrayCum[(rows - 1) * stride + j] - pfArrayCum[(i - nR - 1) * stride + j]; 65 | } 66 | } 67 | 68 | 69 | __global__ void init_pfArrayCum_X(float* fOutArray, float *pfArrayCum, int rows, int cols, int stride){ 70 | int i = blockDim.y * blockIdx.y + threadIdx.y; 71 | if(i >= rows){ 72 | return; 73 | } 74 | pfArrayCum[i * stride] = fOutArray[i * stride]; 75 | 76 | } 77 | 78 | __global__ void pfArrayCum_X(float* fOutArray, float *pfArrayCum, int rows, int cols, int stride){ 79 | int j = blockDim.x * blockIdx.x + threadIdx.x; 80 | int i = blockDim.y * blockIdx.y + threadIdx.y; 81 | if((i >= rows) || (j > 0)){ 82 | return; 83 | } 84 | 85 | for(int k = 1; k < cols; k++){ 86 | pfArrayCum[i * stride + k] 
= pfArrayCum[i * stride + k - 1] + fOutArray[i * stride + k]; 87 | } 88 | } 89 | 90 | __global__ void diff_X_axis( float* fOutArray, float *pfArrayCum, int nR, int rows, int cols, int stride){ 91 | int j = blockDim.x * blockIdx.x + threadIdx.x; 92 | int i = blockDim.y * blockIdx.y + threadIdx.y; 93 | if((j >= cols) || (i >= rows)){ 94 | return; 95 | } 96 | if(j < (nR + 1)){ 97 | fOutArray[i * stride + j] = pfArrayCum[i * stride + j + nR]; 98 | } 99 | if(j >= (nR + 1) && j < (cols - nR)){ 100 | fOutArray[i * stride + j] = pfArrayCum[i * stride + j + nR] - pfArrayCum[i * stride + j - nR - 1]; 101 | } 102 | if(j >= (cols - nR) && j < cols){ 103 | fOutArray[i * stride + j] = pfArrayCum[i * stride + cols - 1] - pfArrayCum[i * stride + j - nR - 1]; 104 | } 105 | } 106 | 107 | 108 | void BoxFilter_gpu(float* pfArrayCum, float* pfInArray, float* fOutArray, int nR, int rows, int cols, int stride, cudaStream_t stream){ 109 | dim3 gridSize((cols + BLOCK_W - 1) / BLOCK_W, (rows + BLOCK_H - 1) / BLOCK_H); 110 | dim3 blockSize(BLOCK_W, BLOCK_H); 111 | 112 | init_pfArrayCum_Y<<>>(pfInArray, pfArrayCum, rows, cols); 113 | pfArrayCum_Y<<>>(pfInArray, pfArrayCum, rows, cols, stride); 114 | diff_Y_axis<<>>(fOutArray, pfArrayCum, nR, rows, cols, stride); 115 | 116 | init_pfArrayCum_X<<>>(fOutArray, pfArrayCum, rows, cols, stride); 117 | pfArrayCum_X<<>>(fOutArray, pfArrayCum, rows, cols, stride); 118 | diff_X_axis<<>>(fOutArray, pfArrayCum, nR, rows, cols, stride); 119 | } 120 | 121 | __global__ void set_value(float* pfMeanI, 122 | float* pfMeanP, 123 | float* pfN, 124 | float* pfMeanIp, 125 | float* pfCovIp, 126 | float* pfMeanII, 127 | float* pfvarI, 128 | float* pfA, 129 | float* pfB, 130 | float fEps, 131 | int rows, 132 | int cols, 133 | int outStride){ 134 | int j = blockDim.x * blockIdx.x + threadIdx.x; 135 | int i = blockDim.y * blockIdx.y + threadIdx.y; 136 | if((j >= cols) || (i >= rows)){ 137 | return; 138 | } 139 | pfMeanI[i * outStride + j] = pfMeanI[i * outStride + j] / 
pfN[i * outStride + j]; 140 | pfMeanP[i * outStride + j] = pfMeanP[i * outStride + j] / pfN[i * outStride + j]; 141 | pfMeanIp[i * outStride + j] = pfMeanIp[i * outStride + j] / pfN[i * outStride + j]; 142 | pfCovIp[i * outStride + j] = pfMeanIp[i * outStride + j] - pfMeanI[i * outStride + j] * pfMeanP[i * outStride + j]; 143 | pfMeanII[i * outStride + j] = pfMeanII[i * outStride + j] / pfN[i * outStride + j]; 144 | pfvarI[i * outStride + j] = pfMeanII[i * outStride + j] - pfMeanI[i * outStride + j] * pfMeanI[i * outStride + j]; 145 | //a and b 146 | pfA[i * outStride + j] = pfCovIp[i * outStride + j] / (pfvarI[i * outStride + j] + fEps); 147 | pfB[i * outStride + j] = pfMeanP[i * outStride + j] - pfA[i * outStride + j] * pfMeanI[i * outStride + j]; 148 | } 149 | 150 | 151 | __global__ void get_guide_output(float* ptrGuidedT, 152 | float* pfOutA, 153 | float* fGuidedImg, 154 | float* pfOutB, 155 | float* pfN, 156 | int rows, 157 | int cols, 158 | int outStride){ 159 | int j = blockDim.x * blockIdx.x + threadIdx.x; 160 | int i = blockDim.y * blockIdx.y + threadIdx.y; 161 | if((j >= cols) || (i >= rows)){ 162 | return; 163 | } 164 | ptrGuidedT[i * outStride + j] = 1.0; 165 | ptrGuidedT[i * outStride + j] = (pfOutA[i * outStride + j] * fGuidedImg[i * outStride + j] + pfOutB[i * outStride + j]) / pfN[i * outStride + j]; 166 | } 167 | 168 | void guided_filter_cuda(float* ptrT_device, unsigned char* guidedRGBImg_device, float* ptrGuidedT_device, int rows, int cols, 169 | int inStride, int outStride, int m_nGBlockSize, float fEps, cudaStream_t stream, 170 | float* fGuidedImg_device, float* pfInitN_device, float* pfInitMeanIp_device, float* pfInitMeanII_device, 171 | float* pfMeanP_device, float* pfN_device, float* pfMeanI_device, float* pfMeanIp_device, 172 | float* pfMeanII_device, float* pfvarI_device, float* pfCovIp_device, float* pfA_device, float* pfB_device, 173 | float* pfOutA_device, float* pfOutB_device, float* pfArrayCum_device){ 174 | dim3 gridSize((cols + 
BLOCK_W - 1) / BLOCK_W, (rows + BLOCK_H - 1) / BLOCK_H); 175 | dim3 blockSize(BLOCK_W, BLOCK_H); 176 | to_float_point<<>>(ptrT_device, 177 | fGuidedImg_device, 178 | guidedRGBImg_device, 179 | pfInitN_device, 180 | pfInitMeanIp_device, 181 | pfInitMeanII_device, 182 | rows, 183 | cols, 184 | inStride, 185 | outStride); 186 | 187 | BoxFilter_gpu(pfArrayCum_device, pfInitN_device, pfN_device, m_nGBlockSize, rows, cols, outStride, stream); 188 | //Mean_I 189 | BoxFilter_gpu(pfArrayCum_device, fGuidedImg_device, pfMeanI_device, m_nGBlockSize, rows, cols, outStride, stream); 190 | //Mean_P 191 | BoxFilter_gpu(pfArrayCum_device, ptrT_device, pfMeanP_device, m_nGBlockSize, rows, cols, outStride, stream); 192 | //mean_IP 193 | BoxFilter_gpu(pfArrayCum_device, pfInitMeanIp_device, pfMeanIp_device, m_nGBlockSize, rows, cols, outStride, stream); 194 | //mean_II 195 | BoxFilter_gpu(pfArrayCum_device, pfInitMeanII_device, pfMeanII_device, m_nGBlockSize, rows, cols, outStride, stream); 196 | 197 | set_value<<>>(pfMeanI_device, 198 | pfMeanP_device, 199 | pfN_device, 200 | pfMeanIp_device, 201 | pfCovIp_device, 202 | pfMeanII_device, 203 | pfvarI_device, 204 | pfA_device, 205 | pfB_device, 206 | fEps, 207 | rows, 208 | cols, 209 | outStride); 210 | BoxFilter_gpu(pfArrayCum_device, pfA_device, pfOutA_device, m_nGBlockSize, rows, cols, outStride, stream); 211 | BoxFilter_gpu(pfArrayCum_device, pfB_device, pfOutB_device, m_nGBlockSize, rows, cols, outStride, stream); 212 | get_guide_output<<>>(ptrGuidedT_device, 213 | pfOutA_device, 214 | fGuidedImg_device, 215 | pfOutB_device, 216 | pfN_device, 217 | rows, 218 | cols, 219 | outStride); 220 | } --------------------------------------------------------------------------------