├── .gitignore ├── l.png ├── r.png ├── timer.h ├── helper.h ├── costVolumeFilter_box.h ├── createCostVolume.h ├── helper.cu ├── costVolumeMinimize.h ├── cost_volume.h ├── costVolumeFilter_guided.h ├── createCostVolume_tadcg.h ├── timer.cpp ├── costVolumeFilter_jointBilateral.h ├── makefile ├── LICENSE.md ├── costVolumeFilter_box.cu ├── kerneltest.cu ├── README.md ├── costVolumeMinimize.cu ├── cost_volume.cu ├── createCostVolume.cu ├── costVolumeFilter_guided.cu ├── createCostVolume_tadcg.cu ├── costVolumeFilter_jointBilateral.cu └── asw.cu /.gitignore: -------------------------------------------------------------------------------- 1 | a.out 2 | *.o 3 | asw 4 | cost_volume 5 | 6 | -------------------------------------------------------------------------------- /l.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rdbeethe/asw/HEAD/l.png -------------------------------------------------------------------------------- /r.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rdbeethe/asw/HEAD/r.png -------------------------------------------------------------------------------- /timer.h: -------------------------------------------------------------------------------- 1 | #ifndef TIMER_H 2 | #define TIMER_H 3 | 4 | #include 5 | struct timespec check_timer(const char* str, struct timespec* ts); 6 | 7 | #endif // TIMER_H 8 | -------------------------------------------------------------------------------- /helper.h: -------------------------------------------------------------------------------- 1 | #ifndef HELPER_H 2 | #define HELPER_H 3 | 4 | __global__ void gpu_memset(unsigned char* start, unsigned char value, int length); 5 | void gpu_perror(const char* input); 6 | 7 | #endif // HELPER_H 8 | -------------------------------------------------------------------------------- /costVolumeFilter_box.h: -------------------------------------------------------------------------------- 1 | #ifndef COSTVOLUMEFILTER_BOX_H 2 | #define COSTVOLUMEFILTER_BOX_H 3 | 4 | #include 5 | #include "cost_volume.h" 6 | #include "timer.h" 7 | 8 | void costVolumeFilter_box_gpu(struct cost_volume_t& vol, int ksize); 9 | void costVolumeFilter_box(struct cost_volume_t& cost_volume, int kernelSize); 10 | 11 | #endif // COSTVOLUMEFILTER_BOX_H 12 | -------------------------------------------------------------------------------- /createCostVolume.h: -------------------------------------------------------------------------------- 1 | #ifndef CREATECOSTVOLUME_H 2 | #define CREATECOSTVOLUME_H 3 | 4 | #include 5 | #include 6 | #include 7 | 8 | __global__ void createCostVolume_kernel(int* ref_global, int* tgt_global, struct cost_volume_t vol, int* debug); 9 | struct cost_volume_t createCostVolume_gpu(cv::Mat leftim, cv::Mat rightim, int ndisp); 10 | struct cost_volume_t createCostVolume(cv::Mat leftim, cv::Mat rightim,int ndisp); 11 | 12 | #endif // CREATECOSTVOLUME_H 13 | -------------------------------------------------------------------------------- /helper.cu: -------------------------------------------------------------------------------- 1 | #include "helper.h" 2 | #include 3 | 4 | // little bitty kernel to initialize blocks of device memory 5 | __global__ void gpu_memset(unsigned char* start, unsigned char value, int length){ 6 | int tx = threadIdx.x; 7 | int bx = blockIdx.x; 8 | int gx = bx*blockDim.x + tx; 9 | if(gx < length){ 10 | start[gx] = value; 11 | } 12 | } 13 | 14 | // teeny little helper function 15 | void gpu_perror(const char* input){ 16 | printf("%s: %s\n", input, cudaGetErrorString(cudaGetLastError())); 17 | } 18 | 19 | -------------------------------------------------------------------------------- /costVolumeMinimize.h: -------------------------------------------------------------------------------- 1 | #ifndef COSTVOLUMEMINIMIZE_H 2 | #define COSTVOLUMEMINIMIZE_H 3 | 4 | #include 5 | #include 6 | #include "timer.h" 7 | #include "cost_volume.h" 8 | #include "helper.h" 9 | 10 | __global__ void costVolumeMinimize_kernel(struct cost_volume_t vol, unsigned char* output); 11 | void costVolumeMinimize_gpu(struct cost_volume_t cost_volume, cv::Mat& outim); 12 | void costVolumeMinimize(struct cost_volume_t cost_volume, cv::Mat& outim); 13 | 14 | #endif // COSTVOLUMEMINIMIZE_H 15 | 16 | -------------------------------------------------------------------------------- /cost_volume.h: -------------------------------------------------------------------------------- 1 | #ifndef COST_VOLUME_H 2 | #define COST_VOLUME_H 3 | 4 | struct cost_volume_t { 5 | float* volume; 6 | int nrows; 7 | int ncols; 8 | int ndisp; 9 | int stride; 10 | }; 11 | 12 | struct rgba_pixel { 13 | unsigned char r; 14 | unsigned char g; 15 | unsigned char b; 16 | unsigned char a; 17 | }; 18 | 19 | struct cost_volume_t get_gpu_volume(struct cost_volume_t vin); 20 | void viewSlices(struct cost_volume_t& cost_volume, int first, int last); 21 | void costVolumeBoxFilter(struct cost_volume_t& cost_volume, int kernelSize); 22 | 23 | #endif // COST_VOLUME_H 24 | -------------------------------------------------------------------------------- /costVolumeFilter_guided.h: -------------------------------------------------------------------------------- 1 | #ifndef COSTVOLUMEFILTER_GUIDED_H 2 | #define COSTVOLUMEFILTER_GUIDED_H 3 | 4 | #include 5 | #include "cost_volume.h" 6 | #include "timer.h" 7 | 8 | __global__ void costVolumeFilter_guided_kernel(struct cost_volume_t vol, int* guide_global, int inter_win_padding, float* output, float sigma_s, float sigma_c, int ksize); 9 | void costVolumeFilter_guided_gpu(struct cost_volume_t& vol, cv::Mat guide, int ksize, float eps); 10 | void costVolumeFilter_guided(struct cost_volume_t& vol, cv::Mat guide, int ksize, float eps); 11 | 12 | #endif // COSTVOLUMEFILTER_GUIDED_H 13 | 14 | -------------------------------------------------------------------------------- /createCostVolume_tadcg.h: -------------------------------------------------------------------------------- 1 | #ifndef CREATECOSTVOLUME_TADCG_H 2 | #define CREATECOSTVOLUME_TADCG_H 3 | 4 | #include 5 | #include 6 | #include 7 | 8 | __global__ void createCostVolume_tadcg_kernel(cv::cuda::PtrStepi ref_global, cv::cuda::PtrStepi tgt_global, struct cost_volume_t vol, cv::cuda::PtrStepi debug, float tc, float tg, float alpha); 9 | struct cost_volume_t createCostVolume_tadcg_gpu(cv::Mat leftim, cv::Mat rightim, int ndisp, float tc, float tg, float alpha); 10 | struct cost_volume_t createCostVolume_tadcg(cv::Mat leftim, cv::Mat rightim, int ndisp, float tc, float tg, float alpha); 11 | 12 | #endif // CREATECOSTVOLUME_TADCG_H 13 | -------------------------------------------------------------------------------- /timer.cpp: -------------------------------------------------------------------------------- 1 | #include "timer.h" 2 | #include 3 | 4 | struct timespec check_timer(const char* str, struct timespec* ts){ 5 | struct timespec oldtime; 6 | // copy old time over 7 | oldtime.tv_nsec = ts->tv_nsec; 8 | oldtime.tv_sec = ts->tv_sec; 9 | // update ts 10 | clock_gettime(CLOCK_REALTIME, ts); 11 | // print old time 12 | int diffsec; 13 | int diffnsec; 14 | if(str != NULL){ 15 | diffsec = ts->tv_sec - oldtime.tv_sec; 16 | diffnsec = ts->tv_nsec - oldtime.tv_nsec; 17 | // correct the values if we measured over an integer second break: 18 | if(diffnsec < 0){ 19 | diffsec--; 20 | diffnsec += 1000000000; 21 | } 22 | printf("%s:%ds %.3fms\n",str,diffsec,diffnsec/1000000.); 23 | } 24 | return (struct timespec) {diffsec, diffnsec}; 25 | } 26 | -------------------------------------------------------------------------------- /costVolumeFilter_jointBilateral.h: -------------------------------------------------------------------------------- 1 | #ifndef COSTVOLUMEFILTER_JOINTBILATERAL_H 2 | #define COSTVOLUMEFILTER_JOINTBILATERAL_H 3 | 4 | #include 5 | #include "cost_volume.h" 6 | #include "timer.h" 7 | 8 | __global__ void costVolumeFilter_jointBilateral_kernel(struct cost_volume_t vol, int* guide_global, int inter_win_padding, float* output, int ksize, float sigma_c, float sigma_s); 9 | void costVolumeFilter_jointBilateral_gpu(struct cost_volume_t& cost_volume, cv::Mat guide, int ksize, float sigma_c, float sigma_s); 10 | //void jointBilateralFilter(cv::Mat& srcim, cv::Mat& guideim, cv::Mat& dst, int kernelSize, float sigma_color, float sigma_space); 11 | 12 | void costVolumeFilter_jointBilateral(struct cost_volume_t& cost_volume, cv::Mat guide, int kernelSize, float sigma_color, float sigma_space); 13 | 14 | #endif // COSTVOLUMEFILTER_JOINTBILATERAL_H 15 | -------------------------------------------------------------------------------- /makefile: -------------------------------------------------------------------------------- 1 | CFLAGS=`pkg-config opencv --cflags` 2 | LDFLAGS=`pkg-config opencv --libs` -L/usr/local/cuda-7.5/targets/x86_64/lib -lnpps -lnppi -lnppc 3 | 4 | .PHONY: all 5 | all: cost_volume asw 6 | 7 | gpu_volume: cost_volume.cu 8 | nvcc $(CFLAGS) $^ -o $@ $(LDFLAGS) 9 | 10 | cost_volume: cost_volume.o costVolumeFilter_jointBilateral.o costVolumeFilter_guided.o costVolumeFilter_box.o costVolumeMinimize.o createCostVolume_tadcg.o createCostVolume.o timer.o helper.o 11 | nvcc $^ -o $@ $(LDFLAGS) 12 | 13 | %.o: %.cu %.h 14 | nvcc $(CFLAGS) -c $< 15 | 16 | .PHONY: debug 17 | debug: CFLAGS+= -g -G 18 | debug: cost_volume 19 | 20 | .PHONY: run 21 | run: cost_volume 22 | ./cost_volume 23 | 24 | # the old implementation, still faster on some hardware 25 | asw: asw.cu 26 | nvcc `pkg-config opencv --cflags` $< `pkg-config opencv --libs` -o $@ 27 | # example for running the old version: 28 | # ./asw l.png r.png 64 5 50 29 | 30 | .PHONY: clean 31 | clean: 32 | rm asw cost_volume *.o 33 | 34 | -------------------------------------------------------------------------------- /LICENSE.md: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2016 Ryan Beethe 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /costVolumeFilter_box.cu: -------------------------------------------------------------------------------- 1 | #include "costVolumeFilter_box.h" 2 | #include "helper.h" 3 | #include "opencv2/ximgproc/edge_filter.hpp" 4 | #include "opencv2/cudafilters.hpp" 5 | #include 6 | 7 | using namespace std; 8 | using namespace cv; 9 | 10 | void costVolumeFilter_box_gpu(struct cost_volume_t& vol, int ksize){ 11 | int nrows = vol.nrows; 12 | int ncols = vol.ncols; 13 | int ndisp = vol.ndisp; 14 | int stride = vol.stride; 15 | 16 | // output volume 17 | float* d_output; 18 | cudaMalloc(&d_output, ndisp*nrows*stride*sizeof(float)); 19 | 20 | struct timespec timer; 21 | check_timer(NULL,&timer); 22 | 23 | for(int disp = 0; disp < ndisp; disp++){ 24 | float* src_data = &(vol.volume[disp*nrows*stride]); 25 | float* out_data = &(d_output[disp*nrows*stride]); 26 | int src_pitch = stride*sizeof(float); 27 | int out_pitch = stride*sizeof(float); 28 | NppiSize size = {ncols , nrows }; 29 | NppiSize sizeROI = {ncols , nrows }; 30 | NppiSize kernel = {ksize , ksize }; 31 | NppiPoint offset = {0 , 0 }; 32 | NppiPoint anchor = {ksize/2 , ksize/2 }; 33 | 34 | 35 | nppiFilterBoxBorder_32f_C1R( 36 | src_data, src_pitch, 37 | size, offset, 38 | out_data, out_pitch, 39 | sizeROI, kernel, anchor, NPP_BORDER_REPLICATE); 40 | } 41 | 42 | check_timer("costVolumeFilter_box_gpu time",&timer); 43 | 44 | // shuffle pointers 45 | cudaFree(vol.volume); 46 | vol.volume = d_output; 47 | } 48 | 49 | 50 | void costVolumeFilter_box(struct cost_volume_t& cost_volume, int kernelSize){ 51 | int nrows = cost_volume.nrows; 52 | int ncols = cost_volume.ncols; 53 | int ndisp = cost_volume.ndisp; 54 | float* vin = cost_volume.volume; 55 | // doesn't do in-place editing... need second float* 56 | float* vout = (float*)malloc(nrows*ncols*ndisp*sizeof(float)); 57 | 58 | struct timespec timer; 59 | check_timer(NULL,&timer); 60 | 61 | for(int disp = 0; disp < ndisp; disp++){ 62 | Mat slicein(nrows,ncols,CV_32F,&(vin[nrows*ncols*disp])); 63 | Mat sliceout(nrows,ncols,CV_32F,&(vout[nrows*ncols*disp])); 64 | boxFilter(slicein, sliceout, -1, Size(kernelSize,kernelSize)); 65 | } 66 | 67 | check_timer("costVolumeFilter_box time",&timer); 68 | 69 | // free old cost_volume float* 70 | free(cost_volume.volume); 71 | // replace with new cost_volume float* 72 | cost_volume.volume = vout; 73 | } 74 | 75 | -------------------------------------------------------------------------------- /kerneltest.cu: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | 5 | struct timespec check_timer(const char* str, struct timespec* ts){ 6 | struct timespec oldtime; 7 | // copy old time over 8 | oldtime.tv_nsec = ts->tv_nsec; 9 | oldtime.tv_sec = ts->tv_sec; 10 | // update ts 11 | clock_gettime(CLOCK_REALTIME, ts); 12 | // print old time 13 | int diffsec; 14 | int diffnsec; 15 | if(str != NULL){ 16 | diffsec = ts->tv_sec - oldtime.tv_sec; 17 | diffnsec = ts->tv_nsec - oldtime.tv_nsec; 18 | // correct the values if we measured over an integer second break: 19 | if(diffnsec < 0){ 20 | diffsec--; 21 | diffnsec += 1000000000; 22 | } 23 | printf("%s:%ds %dns\n",str,diffsec,diffnsec); 24 | } 25 | return (struct timespec) {diffsec, diffnsec}; 26 | } 27 | 28 | 29 | // Device code 30 | __global__ void VecAdd(float* A, float* B, float* C, int N) 31 | { 32 | int i = blockDim.x * blockIdx.x + threadIdx.x; 33 | if (i < N) 34 | C[i] = A[i] + B[i]; 35 | } 36 | 37 | // Host code 38 | int main() 39 | { 40 | // declare timer 41 | struct timespec timer; 42 | 43 | int N = 1000000000; 44 | size_t size = N * sizeof(float); 45 | 46 | // Allocate input vectors h_A and h_B in host memory 47 | float* h_A = (float*)malloc(size); 48 | float* h_B = (float*)malloc(size); 49 | float* h_C = (float*)malloc(size); 50 | 51 | 52 | check_timer(NULL,&timer); 53 | // Initialize input vectors 54 | for(int i = 0; i < N; i++){ 55 | h_A[i] = i; 56 | h_B[i] = N - i; 57 | } 58 | check_timer("Time to initialize",&timer); 59 | 60 | // Allocate vectors in device memory 61 | float* d_A; 62 | cudaMalloc(&d_A, size); 63 | float* d_B; 64 | cudaMalloc(&d_B, size); 65 | float* d_C; 66 | cudaMalloc(&d_C, size); 67 | 68 | // Copy vectors from host memory to device memory 69 | check_timer(NULL,&timer); 70 | cudaMemcpy(d_A, h_A, size, cudaMemcpyHostToDevice); 71 | cudaMemcpy(d_B, h_B, size, cudaMemcpyHostToDevice); 72 | check_timer("Time to copy to device",&timer); 73 | 74 | // Invoke kernel 75 | int threadsPerBlock = 256; 76 | int blocksPerGrid = 77 | (N + threadsPerBlock - 1) / threadsPerBlock; 78 | check_timer(NULL,&timer); 79 | VecAdd<<>>(d_A, d_B, d_C, N); 80 | check_timer("Time to execute kernel",&timer); 81 | 82 | // Copy result from device memory to host memory 83 | // h_C contains the result in host memory 84 | check_timer(NULL,&timer); 85 | cudaMemcpy(h_C, d_C, size, cudaMemcpyDeviceToHost); 86 | check_timer("Time to copy back to host",&timer); 87 | 88 | // Free device memory 89 | cudaFree(d_A); 90 | cudaFree(d_B); 91 | cudaFree(d_C); 92 | 93 | int errors = 0; 94 | for(int i = 0; i < N; i++){ 95 | if(h_C[i] != N){ 96 | errors ++; 97 | } 98 | 99 | } 100 | printf("checking done, errors = %d\n"); 101 | 102 | // Free host memory 103 | free(h_A); 104 | free(h_B); 105 | free(h_C); 106 | } -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Adaptive Support Weight (ASW) correspondence matching 2 | 3 | This git project is an open-source CUDA implementation of the algorithm described in "Adaptive support-weight approach for correspondence" by Kuk-Jin Yoon and In So Kweon, which is the basis for the most effective local reasoning stereo vision algorithms produced today. The latest effort is based on the articles "Fast cost-volume filtering for visual correspondence and beyond" by Rhemann et al and "Secrets of adaptive support weight for stereo vision for local stereo matching" by Hosni et al. 4 | 5 | 6 | ## Goals and Motivations 7 | 8 | To my knowledge, there is no other open-source implementation of ASW, although it was developed in 2005. Freely-available computer vision libraries (such as OpenCV or Nvidia VisionWorks) do not offer ASW, either in CPU form or a GPU accelerated form. The goal of this project is to implement a GPU-accelerated ASW algorithm which can ultimately be contributed to the OpenCV library. A Free and Open Source Software (FOSS) implementation of ASW would empower those in industry with a more powerful stereo matching algorithm, and it would empower those in research with a quicker starting point for testing modifications to ASW. 9 | 10 | ## Important Branches 11 | 12 | 1. **master**: This git repo has several different attempts at optimizing the `asw.cu` cuda kernel. The master branch has a mostly-stable snapshot of the cost-volume implementation of the ASW algorithm. Note that there is a known memory issue with the `createCostVolume_kernel()` function in createCostVolume.cu, a bug which is fixed on the cost_volume branch but hasn't been backported. 13 | 14 | 2. **cost_volume**: The effort to use the cost-volume approach ("Fast cost-volume filtering for visual correspondence and beyond" by Rhemann et al) is developed on this branch. Currently, the conversion from using a custom `struct cost_volume_t` to the more useful `cv::cuda::GpuMat` object is not complete, so it is not yet on the master branch. 15 | 16 | ## Current State 17 | 18 | The cost-volume filtering method appears to have a higher minimum run time but ports much better to embedded hardware (1.6 sec runtime instead of 6.8 sec). 19 | 20 | Currently the fastest attempt at GPU-acceleration exists with the old implementation (asw.cu) and can be tested by running `git checkout 9b87bdd` then `./asw l.png r.png 64 5 50`. 21 | 22 | ## Known Issues with initial implementation (asw.cu): 23 | 24 | 1. There seems to be some salt-and-pepper noise on the disparity output that I can't explain. 25 | 26 | 2. Shared memory is not handled well. Currently shared memory size limits the combinations of numbers of disparities & window sizes available, but with a good implementation the size of shared memory should not offer any limit to those factors. In fact, an attempt at reducing shared memory exists on the `dev` branch, but it actually made the shared memory issue worse. 27 | 28 | 3. I suspect improved performance could be achieved by tuning the auto-calculation of window size vs spacial sigma. 29 | 30 | 4. The current pixel-matching function is a sum of absolute difference (SAD), but a complete implementation should use a truncated absolute difference of cost and gradient (TAD C+G) as in the ASW paper. An attempt at implementing TAD C+G can be found on the `half` branch, in the cpu version of the code. Furthermore, modifications to the matching should be included to take into account sub-pixel disparities, such as in the paper, "A pixel dissimilarity measure that is insensitive to image sampling" by Birchfield and Tomasi. However, such modifications are a lesser priority to issues such as the shared memory handling in CUDA. 31 | 32 | 5. Left and right disparity calculation comparison should be done. Currently, only the left disparity is calculated. 33 | 34 | # Known issues with cost-volume implementation: 35 | 36 | 1. This list isn't ready yet... 37 | -------------------------------------------------------------------------------- /costVolumeMinimize.cu: -------------------------------------------------------------------------------- 1 | #include "costVolumeMinimize.h" 2 | 3 | using namespace std; 4 | using namespace cv; 5 | 6 | #define ILP_LEVEL 4 7 | // Device code 8 | __global__ void costVolumeMinimize_kernel(struct cost_volume_t vol, unsigned char* output){ 9 | int gx = blockIdx.x*blockDim.x + threadIdx.x; 10 | int gy = blockIdx.y*blockDim.y + threadIdx.y; 11 | gy *= ILP_LEVEL; 12 | 13 | // only threads which land in the image participate 14 | if(gy < vol.nrows && gx < vol.ncols){ 15 | 16 | // this will store the disp val of the lowest cost 17 | int mindisp[ILP_LEVEL]; 18 | float mincost[ILP_LEVEL]; 19 | #pragma unroll 20 | for(int ilp = 0; ilp < ILP_LEVEL; ilp++){ 21 | // arbitrary large number 22 | mincost[ilp] = 1e6; 23 | } 24 | 25 | 26 | // now go through each disparity 27 | for(int disp = 0; disp < vol.ndisp; disp ++){ 28 | float cost[ILP_LEVEL]; 29 | #pragma unroll 30 | for(int ilp = 0; ilp < ILP_LEVEL; ilp++){ 31 | if(gy + ilp < vol.nrows){ 32 | cost[ilp] = vol.volume[vol.stride*vol.nrows*disp + vol.stride*(gy+ilp) + gx]; 33 | } 34 | __syncthreads(); 35 | } 36 | #pragma unroll 37 | for(int ilp = 0; ilp < ILP_LEVEL; ilp++){ 38 | if(cost[ilp] < mincost[ilp]){ 39 | mincost[ilp] = cost[ilp]; 40 | mindisp[ilp] = disp; 41 | } 42 | } 43 | __syncthreads(); 44 | } 45 | 46 | // write the resulting minimum to the output 47 | #pragma unroll 48 | for(int ilp = 0; ilp < ILP_LEVEL; ilp++){ 49 | if(gy + ilp < vol.nrows){ 50 | output[vol.ncols*(gy+ilp) + gx] = mindisp[ilp]; 51 | } 52 | } 53 | } 54 | } 55 | 56 | void costVolumeMinimize_gpu(struct cost_volume_t cost_volume, Mat& outim){ 57 | int nrows = cost_volume.nrows; 58 | int ncols = cost_volume.ncols; 59 | // init out mat 60 | outim = Mat::zeros(nrows,ncols,CV_8U); 61 | // allocate output matrix on gpu 62 | unsigned char* d_output; 63 | cudaMalloc(&d_output, nrows*ncols*sizeof(unsigned char)); 64 | // zero the d_output 65 | // gpu_memset<<>>((unsigned char*)d_output,0,nrows*ncols*sizeof(unsigned char)); 66 | // gpu_perror("memset on output"); 67 | 68 | // settings for the kernel 69 | // trying to use 128 threads-wide so the uchar global write is 128 bytes 70 | dim3 threadsPerBlock(128,1); 71 | dim3 blocksPerGrid(ncols/threadsPerBlock.x+1,nrows/threadsPerBlock.y/ILP_LEVEL+1); 72 | // call the kernel 73 | struct timespec timer; 74 | check_timer(NULL,&timer); 75 | costVolumeMinimize_kernel<<>>(cost_volume, (unsigned char*)d_output); 76 | cudaDeviceSynchronize(); 77 | check_timer("costVolumeMinimize_gpu time",&timer); 78 | gpu_perror("costVolumeMinimize_kernel"); 79 | 80 | // copy debug back over 81 | cudaMemcpy((unsigned char*)outim.data, d_output, nrows*ncols*sizeof(unsigned char), cudaMemcpyDeviceToHost); 82 | // imshow("window",outim); waitKey(0); 83 | 84 | // cleanup the temporary image memory 85 | cudaFree(d_output); 86 | } 87 | 88 | void costVolumeMinimize(struct cost_volume_t cost_volume, Mat& outim){ 89 | int ndisp = cost_volume.ndisp; 90 | int nrows = cost_volume.nrows; 91 | int ncols = cost_volume.ncols; 92 | // init out mat 93 | outim = Mat::zeros(nrows,ncols,CV_8U); 94 | unsigned char* out = (unsigned char*) (outim.data); 95 | float* volume = cost_volume.volume; 96 | for(int col = 0; col < ncols; col++){ 97 | for(int row = 0; row < nrows; row++){ 98 | float minval = volume[nrows*ncols*0 + ncols*row + col]; 99 | int minidx = 0; 100 | // iterate over the disparities 101 | for(int disp = 1; disp < min(ndisp,col); disp++){ 102 | float test = volume[nrows*ncols*disp + ncols*row + col]; 103 | if(test < minval){ 104 | minval = test; 105 | minidx = disp; 106 | } 107 | } 108 | out[ncols*row + col] = (unsigned char)minidx; 109 | } 110 | } 111 | } 112 | 113 | -------------------------------------------------------------------------------- /cost_volume.cu: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include "timer.h" 7 | #include "createCostVolume.h" 8 | #include "createCostVolume_tadcg.h" 9 | #include "costVolumeMinimize.h" 10 | #include "costVolumeFilter_jointBilateral.h" 11 | #include "costVolumeFilter_guided.h" 12 | #include "costVolumeFilter_box.h" 13 | #include "cost_volume.h" 14 | 15 | using namespace std; 16 | using namespace cv; 17 | 18 | struct cost_volume_t get_gpu_volume(struct cost_volume_t vin){ 19 | struct cost_volume_t vout; 20 | vout.nrows = vin.nrows; 21 | vout.ncols = vin.ncols; 22 | vout.ndisp = vin.ndisp; 23 | vout.stride = vin.ncols; 24 | // copy the gpu data directly 25 | float* gpu_copy = (float*)malloc(vin.stride*vin.nrows*vin.ndisp*sizeof(float)); 26 | cudaMemcpy(gpu_copy, vin.volume, vin.stride*vin.nrows*vin.ndisp*sizeof(float), cudaMemcpyDeviceToHost); 27 | // now copy without padding 28 | vout.volume = (float*)malloc(vout.ncols*vout.nrows*vout.ndisp*sizeof(float)); 29 | for(int col = 0; col < vout.ncols; col++){ 30 | for(int row = 0; row < vout.nrows; row++){ 31 | // iterate over the disparities 32 | for(int disp = 0; disp < vout.ndisp; disp++){ 33 | vout.volume[vout.nrows*vout.ncols*disp + vout.ncols*row + col] = gpu_copy[vin.nrows*vin.stride*disp + vin.stride*row + col]; 34 | } 35 | } 36 | } 37 | free(gpu_copy); 38 | return vout; 39 | } 40 | 41 | void viewSlices(struct cost_volume_t& cost_volume, int first, int last){ 42 | int nrows = cost_volume.nrows; 43 | int stride = cost_volume.stride; 44 | float* vin = cost_volume.volume; 45 | if(last < 0){ 46 | last = cost_volume.ndisp - last; 47 | } 48 | for(int disp = first; disp <= last; disp++){ 49 | printf("\n%d\n",disp); 50 | Mat slicein(nrows,stride,CV_32F,&(vin[nrows*stride*disp])); 51 | double m,M; minMaxLoc(slicein,&m,&M); 52 | printf("min,Max of slice = %f,%f\n",m,M); 53 | printf("slice rows,cols: %d,%d\n",slicein.rows,slicein.cols); 54 | Mat temp = (slicein - m)/(M-m); 55 | imshow("window",temp); if((char)waitKey(0)=='q') break; 56 | } 57 | } 58 | 59 | int main(int argc, char** argv){ 60 | cudaDeviceReset(); 61 | // spacial and intensity sigmas 62 | double s_sigma, c_sigma; 63 | // size of bilateral kernel 64 | int ksize; 65 | // number of disparities to check 66 | int ndisp; 67 | // input images 68 | Mat l_im, r_im; 69 | 70 | if(argc < 6){ 71 | printf("usage: %s \n\n",argv[0]); 72 | printf("... for now, using defaults (l.png r.png 64 15 5 50)\n"); 73 | l_im = imread("l.png"); 74 | r_im = imread("r.png"); 75 | ndisp = 64; 76 | ksize = 15; 77 | s_sigma = 5; 78 | c_sigma = 50; 79 | }else{ 80 | // read images, convert to floats 81 | l_im = imread(argv[1]); 82 | r_im = imread(argv[2]); 83 | ndisp = atoi(argv[3]); 84 | ksize = atoi(argv[4]); 85 | s_sigma = atof(argv[5]); 86 | c_sigma = atof(argv[6]); 87 | } 88 | printf("ndisp,ksize,s_sigma,c_sigma: %d,%d,%.3f,%.3f\n",ndisp,ksize,s_sigma,c_sigma); 89 | 90 | Mat out,out_gpu; 91 | //struct cost_volume_t gpu_volume = createCostVolume_gpu(l_im, r_im, 64); 92 | struct cost_volume_t gpu_volume = createCostVolume_tadcg_gpu(l_im, r_im, 64,20,90,.9); 93 | costVolumeFilter_jointBilateral_gpu(gpu_volume, l_im, ksize, c_sigma, s_sigma); 94 | //costVolumeFilter_guided_gpu(gpu_volume, l_im, ksize, c_sigma); 95 | //costVolumeFilter_box_gpu(gpu_volume, ksize); 96 | costVolumeMinimize_gpu(gpu_volume, out_gpu); 97 | 98 | //struct cost_volume_t cpu_volume = get_gpu_volume(gpu_volume); 99 | //costVolumeFilter_guided(cpu_volume,l_im,ksize,c_sigma); 100 | //costVolumeMinimize(cpu_volume,out_gpu); 101 | 102 | struct cost_volume_t ref_volume = createCostVolume_tadcg(l_im,r_im,64,20,90,.9); 103 | //struct cost_volume_t ref_volume2 = createCostVolume(l_im,r_im,64); 104 | costVolumeFilter_jointBilateral(ref_volume, l_im, ksize, c_sigma, s_sigma); 105 | //costVolumeBoxFilter(ref_volume,ksize); 106 | //costVolumeFilter_guided(ref_volume,l_im,ksize,c_sigma); 107 | //costVolumeFilter_guided(ref_volume2,l_im,ksize,c_sigma); 108 | costVolumeMinimize(ref_volume, out); 109 | //costVolumeMinimize(ref_volume2, out_gpu); 110 | //viewSlices(cpu_volume,0,12); 111 | //viewSlices(ref_volume,0,10); 112 | // costVolumeFilter_box(cpu_volume,ksize); 113 | int show = 1; 114 | if(show){ 115 | printf("l_im\n"); imshow("window",l_im); waitKey(0); 116 | printf("cpu\n"); imshow("window",out); waitKey(0); 117 | printf("gpu\n"); imshow("window",out_gpu); waitKey(0); 118 | printf("python\n"); imshow("window",imread("tadcg.png")); waitKey(0); 119 | } 120 | } 121 | 122 | -------------------------------------------------------------------------------- /createCostVolume.cu: -------------------------------------------------------------------------------- 1 | #include "cost_volume.h" 2 | #include "createCostVolume.h" 3 | #include "timer.h" 4 | #include "helper.h" 5 | 6 | using namespace std; 7 | using namespace cv; 8 | 9 | // Device code 10 | __global__ void createCostVolume_kernel(int* ref_global, int* tgt_global, struct cost_volume_t vol, int* debug){ 11 | int gx = blockIdx.x*blockDim.x + threadIdx.x; 12 | int gy = blockIdx.y*blockDim.y + threadIdx.y; 13 | 14 | extern __shared__ int tgt_data[]; // contains relevant tgt image data 15 | 16 | // copy target image global memory into shared memory (all threads must participate) 17 | for(int i = 0; i < vol.ndisp + blockDim.x; i += blockDim.x){ 18 | // check to make sure the actual read lands in 0 <= col < ncols && row < nrows 19 | if(gy < vol.nrows && (gx - (vol.ndisp-1) + i) >= 0 && (gx - (vol.ndisp-1) + i) < vol.ncols){ 20 | tgt_data[(blockDim.x + vol.ndisp - 1)*threadIdx.y + threadIdx.x + i] = tgt_global[vol.ncols*gy + gx - (vol.ndisp-1) + i]; 21 | } 22 | __syncthreads(); 23 | } 24 | 25 | // now only threads which land in the image participate 26 | if(gy < vol.nrows && gx < vol.ncols){ 27 | 28 | // get reference pixel from global memory 29 | int ref = ref_global[vol.ncols*gy + gx]; 30 | 31 | // pull out channel data from reference pixel (brought in as an int) 32 | int rr,rg,rb; 33 | rr = (ref&0x000000FF) >> 0; 34 | rb = (ref&0x0000FF00) >> 8; 35 | rg = (ref&0x00FF0000) >> 16; 36 | 37 | // now go through each disparity 38 | for(int disp = 0; disp < vol.ndisp; disp ++){ 39 | float cost; 40 | // check if this disp has a pixel in the tgt image 41 | if( gx - disp >= 0){ 42 | // read tgt pixel from shared memory 43 | int tgt = tgt_data[(blockDim.x + vol.ndisp - 1)*threadIdx.y + (vol.ndisp-1) + threadIdx.x - disp]; 44 | 45 | // separate channel data 46 | int tr,tg,tb; 47 | tr = (tgt&0x000000FF) >> 0; 48 | tb = (tgt&0x0000FF00) >> 8; 49 | tg = (tgt&0x00FF0000) >> 16; 50 | 51 | // using SAD for aggregate cost function 52 | cost = abs(rr - tr) + abs(rb-tb) + abs(rg-tg); 53 | }else{ 54 | // these values of the cost volume don't correspond to two real pixels, so make the cost high 55 | cost = 9999; 56 | } 57 | __syncthreads(); 58 | // now write the cost to the actual cost_volume 59 | vol.volume[vol.stride*vol.nrows*disp + vol.stride*gy + gx] = cost; 60 | } 61 | } 62 | } 63 | 64 | struct cost_volume_t createCostVolume_gpu(Mat leftim, Mat rightim, int ndisp){ 65 | int nchans = leftim.channels(); 66 | int nrows = leftim.rows; 67 | int ncols = leftim.cols; 68 | // find stride so that rows in global memory align to 128-byte boundaries 69 | int boundary = 128/sizeof(float); 70 | int stride = ncols + (boundary - ncols%boundary)%boundary; 71 | // allocate gpu memory for cost volume 72 | float* volume_gpu; 73 | cudaMalloc(&volume_gpu,nrows*ndisp*stride*sizeof(float)); 74 | // zero the volume_gpu 75 | // gpu_memset<<>>((unsigned char*)volume_gpu,0,ncols*ndisp*stride*sizeof(float)); 76 | // gpu_perror("memset on volume"); 77 | // init struct cost_volume_t object 78 | struct cost_volume_t cost_volume = {volume_gpu,nrows,ncols,ndisp,stride}; 79 | // convert BGR images to RGBA 80 | cvtColor(leftim,leftim,CV_BGR2RGBA); 81 | cvtColor(rightim,rightim,CV_BGR2RGBA); 82 | // copy left image to to GPU 83 | unsigned char* d_im_l; 84 | cudaMalloc(&d_im_l, 4*nrows*ncols*sizeof(unsigned char)); 85 | cudaMemcpy(d_im_l, leftim.data, 4*nrows*ncols*sizeof(unsigned char), cudaMemcpyHostToDevice); 86 | // copy right image to to GPU 87 | unsigned char* d_im_r; 88 | cudaMalloc(&d_im_r, 4*nrows*ncols*sizeof(unsigned char)); 89 | cudaMemcpy(d_im_r, rightim.data, 4*nrows*ncols*sizeof(unsigned char), cudaMemcpyHostToDevice); 90 | // debug setup 91 | Mat debug(nrows,ncols,CV_8UC4); 92 | unsigned char* d_debug; 93 | cudaMalloc(&d_debug,nrows*ncols*sizeof(int)); 94 | // zero the volume_gpu 95 | // gpu_memset<<>>((unsigned char*)d_debug,0,ncols*nrows*sizeof(int)); 96 | // gpu_perror("memset on debug"); 97 | 98 | // settings for the kernel 99 | // should be 32-threads wide to ensure 128-byte block global reads 100 | dim3 threadsPerBlock(32,4); 101 | dim3 blocksPerGrid(ncols/threadsPerBlock.x+1,nrows/threadsPerBlock.y+1); 102 | int tgt_shared_mem = (threadsPerBlock.x+ndisp-1)*threadsPerBlock.y*sizeof(int); 103 | // call the kernel 104 | struct timespec timer; 105 | check_timer(NULL,&timer); 106 | createCostVolume_kernel<<>>((int*)d_im_l, (int*)d_im_r, cost_volume, (int*)d_debug); 107 | cudaDeviceSynchronize(); 108 | check_timer("cost_volume_gpu time",&timer); 109 | gpu_perror("createCostVolume_kernel"); 110 | 111 | // copy debug back over 112 | cudaMemcpy((int*)debug.data, d_debug, nrows*ncols*sizeof(int), cudaMemcpyDeviceToHost); 113 | // imshow("window",leftim); waitKey(0); 114 | // imshow("window",debug); waitKey(0); 115 | // imshow("window",leftim); waitKey(0); 116 | 117 | // cleanup the temporary image memory 118 | cudaFree(d_im_l); 119 | cudaFree(d_im_r); 120 | cudaFree(d_debug); 121 | 122 | return cost_volume; 123 | } 124 | 125 | struct cost_volume_t createCostVolume(Mat leftim, Mat rightim,int ndisp){ 126 | int nchans = leftim.channels(); 127 | int nrows = leftim.rows; 128 | int ncols = leftim.cols; 129 | int stride = ncols; 130 | float* volume = (float*)malloc(ncols*nrows*nchans*ndisp*sizeof(float)); 131 | // init struct cost_volume_t object 132 | struct cost_volume_t cost_volume = {volume,nrows,ncols,ndisp,stride}; 133 | 134 | // make sure images are the same size 135 | if(leftim.cols != rightim.cols || leftim.rows != rightim.rows && leftim.channels() == rightim.channels()){ 136 | printf("ERROR: left and right images in createCostVolume do not have matching rows and cols and channels\n"); 137 | return cost_volume; 138 | } 139 | 140 | struct timespec timer; 141 | check_timer(NULL,&timer); 142 | 143 | unsigned char* left = (unsigned char*)leftim.data; 144 | unsigned char* right = (unsigned char*)rightim.data; 145 | // init values to very large numbers 146 | // the reason for this is that some regions near volume edges won't be dealt with 147 | for( int i = 0; i < ncols*nrows*nchans*ndisp; i++){ 148 | // arbitrary large number 149 | volume[i] = 9999; 150 | } 151 | 152 | // organization will be ndisp images of rows of pixels 153 | // iterate over the whole image 154 | for(int col = 0; col < ncols; col++){ 155 | for(int row = 0; row < nrows; row++){ 156 | // iterate over the disparities 157 | for(int disp = 0; disp < min(ndisp,col+1); disp++){ 158 | // get difference over channels 159 | float diff = 0; 160 | for(int chan = 0; chan < nchans; chan++){ 161 | diff += abs(left[(ncols*row + col)*nchans + chan] - right[(ncols*row + col - disp)*nchans + chan]); 162 | } 163 | volume[nrows*ncols*disp + ncols*row + col] = diff; 164 | } 165 | } 166 | } 167 | check_timer("createCostVolume",&timer); 168 | return cost_volume; 169 | } 170 | 171 | -------------------------------------------------------------------------------- /costVolumeFilter_guided.cu: -------------------------------------------------------------------------------- 1 | #include "costVolumeFilter_guided.h" 2 | #include "helper.h" 3 | #include "opencv2/ximgproc/edge_filter.hpp" 4 | #include "opencv2/cudaarithm.hpp" 5 | #include "opencv2/cudafilters.hpp" 6 | #include 7 | 8 | 9 | using namespace std; 10 | using namespace cv; 11 | 12 | __global__ void costVolumeFilter_guided_kernel(struct cost_volume_t vol, int* guide_global, int inter_win_padding, float* output, float sigma_s, float sigma_c, int ksize){ 13 | 14 | } 15 | 16 | void costVolumeFilter_guided_gpu(struct cost_volume_t& vol, Mat guide, int ksize, float eps){ 17 | int nrows = vol.nrows; 18 | int ncols = vol.ncols; 19 | int ndisp = vol.ndisp; 20 | int stride = vol.stride; 21 | 22 | struct timespec timer; 23 | 24 | cuda::GpuMat I; 25 | 26 | // copy guide image to grayscale 27 | cvtColor(guide,guide,CV_BGR2GRAY); 28 | // convert to float 29 | guide.convertTo(guide,CV_32FC1); 30 | // copy guide image to GPU 31 | I.upload(guide); 32 | // set up working memory 33 | cuda::GpuMat mean(I.rows,I.cols,I.type()); 34 | cuda::GpuMat var(I.rows,I.cols,I.type()); 35 | cuda::GpuMat workmem(I.rows,I.cols,I.type()); 36 | cuda::GpuMat workmem2(I.rows,I.cols,I.type()); 37 | cuda::GpuMat workmem3(I.rows,I.cols,I.type()); 38 | 39 | cuda::GpuMat p_(I.rows,I.cols,I.type()); 40 | cuda::GpuMat p_mean(I.rows,I.cols,I.type()); 41 | cuda::GpuMat a(I.rows,I.cols,I.type()); 42 | cuda::GpuMat a_(I.rows,I.cols,I.type()); 43 | cuda::GpuMat a_mean(I.rows,I.cols,I.type()); 44 | cuda::GpuMat a_I(I.rows,I.cols,I.type()); 45 | cuda::GpuMat b(I.rows,I.cols,I.type()); 46 | cuda::GpuMat b_(I.rows,I.cols,I.type()); 47 | cuda::GpuMat Ip(I.rows,I.cols,I.type()); 48 | cuda::GpuMat Ip_(I.rows,I.cols,I.type()); 49 | 50 | check_timer(NULL,&timer); 51 | 52 | // pre-step 1: box filter I to get mean 53 | cudaDeviceSynchronize(); 54 | { 55 | float* src_data = (float*)I.data; 56 | float* out_data = (float*)mean.data; 57 | int src_pitch = I.step; 58 | int out_pitch = mean.step; 59 | NppiSize size = {ncols , nrows }; 60 | NppiSize sizeROI = {ncols , nrows }; 61 | NppiSize kernel = {ksize , ksize }; 62 | NppiPoint offset = {0 , 0 }; 63 | NppiPoint anchor = {ksize/2 , ksize/2 }; 64 | 65 | nppiFilterBoxBorder_32f_C1R( 66 | src_data, src_pitch, 67 | size, offset, 68 | out_data, out_pitch, 69 | sizeROI, kernel, anchor, NPP_BORDER_REPLICATE); 70 | } 71 | // pre-step 2: square I, for variance calculation 72 | cudaDeviceSynchronize(); 73 | cuda::sqr(I,var); 74 | // pre-step 3: box filter I^2 75 | cudaDeviceSynchronize(); 76 | { 77 | float* src_data = (float*)var.data; 78 | float* out_data = (float*)workmem3.data; 79 | int src_pitch = var.step; 80 | int out_pitch = workmem3.step; 81 | NppiSize size = {ncols , nrows }; 82 | NppiSize sizeROI = {ncols , nrows }; 83 | NppiSize kernel = {ksize , ksize }; 84 | NppiPoint offset = {0 , 0 }; 85 | NppiPoint anchor = {ksize/2 , ksize/2 }; 86 | 87 | nppiFilterBoxBorder_32f_C1R( 88 | src_data, src_pitch, 89 | size, offset, 90 | out_data, out_pitch, 91 | sizeROI, kernel, anchor, NPP_BORDER_REPLICATE); 92 | } 93 | // pre-step 4: square the mean 94 | cudaDeviceSynchronize(); 95 | cuda::sqr(mean,workmem); 96 | // pre-step 5: variance = mean(x^2) - mean(x)^2 97 | cudaDeviceSynchronize(); 98 | cuda::subtract(workmem3, workmem, workmem2); 99 | // pre-step 6: add eps to variance 100 | cudaDeviceSynchronize(); 101 | cuda::add(workmem2, eps, var); 102 | 103 | for(int disp = 0; disp < ndisp; disp++){ 104 | // step 1: element-wise multiply I by p 105 | cuda::GpuMat p(Size(ncols,nrows), CV_32F, &(vol.volume[disp*nrows*stride]), stride*sizeof(float)); 106 | //cuda::GpuMat Ip = workmem; 107 | cuda::multiply(I,p,Ip); 108 | // step 2: box filter Ip to be Ip_ 109 | //cuda::GpuMat Ip_ = Ip; 110 | { 111 | float* src_data = (float*)Ip.data; 112 | float* out_data = (float*)Ip_.data; 113 | int src_pitch = Ip.step; 114 | int out_pitch = Ip_.step; 115 | NppiSize size = {ncols, nrows }; 116 | NppiSize sizeROI = {ncols, nrows }; 117 | NppiSize kernel = {ksize , ksize }; 118 | NppiPoint offset = {0 , 0 }; 119 | NppiPoint anchor = {ksize/2 , ksize/2 }; 120 | 121 | nppiFilterBoxBorder_32f_C1R( 122 | src_data, src_pitch, 123 | size, offset, 124 | out_data, out_pitch, 125 | sizeROI, kernel, anchor, NPP_BORDER_REPLICATE); 126 | } 127 | // step 3: box filter p to be p_ 128 | //cuda::GpuMat p_ = p; 129 | { 130 | float* src_data = (float*)p.data; 131 | float* out_data = (float*)p_.data; 132 | int src_pitch = p.step; 133 | int out_pitch = p_.step; 134 | NppiSize size = {ncols, nrows }; 135 | NppiSize sizeROI = {ncols, nrows }; 136 | NppiSize kernel = {ksize , ksize }; 137 | NppiPoint offset = {0 , 0 }; 138 | NppiPoint anchor = {ksize/2 , ksize/2 }; 139 | 140 | nppiFilterBoxBorder_32f_C1R( 141 | src_data, src_pitch, 142 | size, offset, 143 | out_data, out_pitch, 144 | sizeROI, kernel, anchor, NPP_BORDER_REPLICATE); 145 | } 146 | // step 4: combine p_ and mean 147 | //cuda::GpuMat p_mean = workmem2; 148 | cuda::multiply(p_, mean, p_mean); 149 | // step 5: compute Ip_ - mean*p_ 150 | //cuda::GpuMat a = Ip_; 151 | cuda::subtract(Ip_, p_mean, workmem); 152 | // step 6: divide by var+eps (stored as var) 153 | cuda::divide(workmem, var, a); 154 | // step 7: start calculating b with a*mean 155 | //cuda::GpuMat a_mean = workmem2; 156 | cuda::multiply(a, mean, a_mean); 157 | // step 8: b = p_ - a_mean 158 | //cuda::GpuMat b = p_; 159 | cuda::subtract(p_, a_mean, b); 160 | // step 9: box filter a 161 | //cuda::GpuMat a_ = a; 162 | { 163 | float* src_data = (float*)a.data; 164 | float* out_data = (float*)a_.data; 165 | int src_pitch = a.step; 166 | int out_pitch = a_.step; 167 | NppiSize size = {ncols, nrows }; 168 | NppiSize sizeROI = {ncols, nrows }; 169 | NppiSize kernel = {ksize , ksize }; 170 | NppiPoint offset = {0 , 0 }; 171 | NppiPoint anchor = {ksize/2 , ksize/2 }; 172 | 173 | nppiFilterBoxBorder_32f_C1R( 174 | src_data, src_pitch, 175 | size, offset, 176 | out_data, out_pitch, 177 | sizeROI, kernel, anchor, NPP_BORDER_REPLICATE); 178 | } 179 | // step 10: box filter b 180 | //cuda::GpuMat b_ = b; 181 | { 182 | float* src_data = (float*)b.data; 183 | float* out_data = (float*)b_.data; 184 | int src_pitch = b.step; 185 | int out_pitch = b_.step; 186 | NppiSize size = {ncols, nrows }; 187 | NppiSize sizeROI = {ncols, nrows }; 188 | NppiSize kernel = {ksize , ksize }; 189 | NppiPoint offset = {0 , 0 }; 190 | NppiPoint anchor = {ksize/2 , ksize/2 }; 191 | 192 | nppiFilterBoxBorder_32f_C1R( 193 | src_data, src_pitch, 194 | size, offset, 195 | out_data, out_pitch, 196 | sizeROI, kernel, anchor, NPP_BORDER_REPLICATE); 197 | } 198 | // step 11: start to build q with a_ * I 199 | //cuda::GpuMat a_I = a_; 200 | cuda::multiply(a_, I, a_I); 201 | // step 12: q = a_I + b_; 202 | cuda::GpuMat q = p; 203 | cuda::add(a_I, b_, q); 204 | } 205 | 206 | check_timer("costVolumeFilter_guided_gpu time",&timer); 207 | 208 | I.release(); 209 | mean.release(); 210 | var.release(); 211 | workmem.release(); 212 | } 213 | 214 | void costVolumeFilter_guided(struct cost_volume_t& vol, Mat guide, int ksize, float eps){ 215 | int nrows = vol.nrows; 216 | int ncols = vol.ncols; 217 | int ndisp = vol.ndisp; 218 | float* vin = vol.volume; 219 | // doesn't do in-place editing... need second float* 220 | float* vout = (float*)malloc(nrows*ncols*ndisp*sizeof(float)); 221 | // create guided filter 222 | //Ptr guided = ximgproc::createGuidedFilter(guide,ksize,eps); 223 | 224 | cvtColor(guide,guide,CV_BGR2GRAY); 225 | 226 | struct timespec timer; 227 | check_timer(NULL,&timer); 228 | 229 | for(int disp = 0; disp < ndisp; disp++){ 230 | Rect relevant; 231 | relevant.x = disp; relevant.width = ncols-disp; 232 | relevant.y = 0; relevant.height = nrows; 233 | Mat slicein(nrows,ncols,CV_32F,&(vin[nrows*ncols*disp])); 234 | Mat sliceout(nrows,ncols,CV_32F,&(vout[nrows*ncols*disp])); 235 | ximgproc::guidedFilter(guide(relevant),slicein(relevant),sliceout(relevant),ksize,eps); 236 | } 237 | 238 | check_timer("costVolumeFilter_guided time:",&timer); 239 | printf("\n"); 240 | // free old cost_volume float* 241 | free(vol.volume); 242 | // replace with new cost_volume float* 243 | vol.volume = vout; 244 | } 245 | 246 | -------------------------------------------------------------------------------- /createCostVolume_tadcg.cu: -------------------------------------------------------------------------------- 1 | #include "cost_volume.h" 2 | #include "createCostVolume_tadcg.h" 3 | #include "timer.h" 4 | #include "helper.h" 5 | 6 | using namespace std; 7 | using namespace cv; 8 | 9 | struct two16s { 10 | short int a; 11 | short int b; 12 | }; 13 | 14 | // Device code 15 | __global__ void createCostVolume_tadcg_kernel(cuda::PtrStepi ref_global, cuda::PtrStepi tgt_global, struct cost_volume_t vol, cuda::PtrStepi debug, float tc, float tg, float alpha){ 16 | int gx = blockIdx.x*blockDim.x + threadIdx.x; 17 | int gy = blockIdx.y*blockDim.y + threadIdx.y; 18 | 19 | extern __shared__ struct rgba_pixel tgt_data[]; // contains relevant tgt image data 20 | 21 | // set shared row pointer 22 | struct rgba_pixel* s_row = (struct rgba_pixel*)((char*)tgt_data + (blockDim.x+vol.ndisp)*threadIdx.y*sizeof(struct rgba_pixel)); 23 | 24 | { 25 | // set global row for data transfer loop 26 | struct rgba_pixel* g_row = (struct rgba_pixel*)((char*)tgt_global.data + tgt_global.step*gy); 27 | 28 | // copy target image global memory into shared memory (all threads must participate) 29 | for(int i = 0; i < vol.ndisp + blockDim.x; i += blockDim.x){ 30 | // check to make sure the actual read lands in 0 <= col < ncols && row < nrows 31 | if(gy < vol.nrows && (gx - vol.ndisp + i) >= 0 && (gx - vol.ndisp + i) < vol.ncols && threadIdx.x + i < vol.ndisp + blockDim.x){ 32 | s_row[threadIdx.x + i] = g_row[gx - vol.ndisp + i]; 33 | } 34 | __syncthreads(); 35 | } 36 | } 37 | 38 | // now only threads which land in the image participate 39 | if(gy < vol.nrows && gx < vol.ncols){ 40 | struct rgba_pixel ref0, ref; 41 | 42 | { 43 | struct rgba_pixel* g_row = (struct rgba_pixel*)((char*)ref_global.data + ref_global.step*gy); 44 | 45 | // get reference pixels from global memory 46 | ref = g_row[gx]; 47 | // ref0 is the previous pixel, for the gradient calculation 48 | // casting rgba_pixel to int allows multiplying by (gx>0) which avoids a divergence opportunity 49 | ((int*)&ref0)[0] = (((int*)g_row)[max(gx-1,0)]) * (gx > 0); 50 | // old, divergent code (easier to understand: 51 | // if(gx > 0){ 52 | // ref0 = g_row[gx-1]; 53 | // }else{ 54 | // ((int*)&(ref0))[0] = 0; 55 | // } 56 | } 57 | 58 | struct rgba_pixel tgt; 59 | struct rgba_pixel tgt0; 60 | // if(gx == 100 && gy == 100){ 61 | // printf("ref,ref0 = %d,%d,%d %d,%d,%d\n",ref.b,ref.g,ref.r,ref0.b,ref0.g,ref0.r); 62 | // } 63 | 64 | // now go through each disparity 65 | for(int disp = 0; disp < vol.ndisp; disp ++){ 66 | float* g_row = (float*)((char*)vol.volume + (disp*vol.nrows+gy)*vol.stride*sizeof(float)); 67 | float cost; 68 | int adc, adg; 69 | // check if this disp has a pixel in the tgt image 70 | if( gx - disp >= 0){ 71 | // read tgt pixel from shared memory 72 | tgt = s_row[vol.ndisp + threadIdx.x - disp]; 73 | // tgt0 is for calculating the gradient 74 | tgt0 = s_row[vol.ndisp-1 + threadIdx.x - disp]; 75 | 76 | // this is the CUDA-C way to do this 77 | // caluculate absolute difference of color 78 | adc = abs(ref.r - tgt.r) + abs(ref.g - tgt.g) + abs(ref.b - tgt.b); 79 | // caluculate absolute difference of gradient 80 | adg = abs(ref.r-ref0.r - tgt.r+tgt0.r) + abs(ref.g-ref0.g - tgt.g+tgt0.g) + abs(ref.b-ref0.b - tgt.b+tgt0.b); 81 | 82 | // this is the PTX way to do this 83 | // although SIMD assembly instructions show a slight performance improvement, though these instructions are Kepler-specific 84 | // int C = 0; 85 | // int rgrad; 86 | // int tgrad; 87 | // //calculate gradients 88 | // asm("vsub4.s32.u32.u32.sat" " %0, %1, %2, %3;": "=r" (rgrad):"r" (((int*)&ref)[0]), "r" (((int*)&ref0)[0]), "r" (C)); 89 | // asm("vsub4.s32.u32.u32.sat" " %0, %1, %2, %3;": "=r" (tgrad):"r" (((int*)&tgt)[0]), "r" (((int*)&tgt0)[0]), "r" (C)); 90 | // // caluculate absolute difference of color 91 | // asm("vabsdiff4.u32.u32.u32.add" " %0, %1, %2, %3;": "=r" (adc):"r" (((int*)&ref)[0]), "r" (((int*)&tgt)[0]), "r" (C)); 92 | // // caluculate absolute difference of gradient 93 | // asm("vabsdiff4.u32.s32.s32.add" " %0, %1, %2, %3;": "=r" (adg):"r" (rgrad), "r" (tgrad), "r" (C)); 94 | 95 | 96 | // calculate cost with TAD C+G 97 | cost = alpha*min(tc,(float)adc)+(1-alpha)*min(tg,(float)adg); 98 | }else{ 99 | // these values of the cost volume don't correspond to two real pixels, so make the cost high 100 | cost = 9999; 101 | } 102 | __syncthreads(); 103 | // now write the cost to the actual cost_volume 104 | g_row[gx] = cost; 105 | // if(gx == 100 && gy == 100){ 106 | // printf("tgt,tgt0 = %d,%d,%d %d,%d,%d\t",tgt.b,tgt.g,tgt.r,tgt0.b,tgt0.g,tgt0.r); 107 | // printf("disp,cost,adc,adg = %d,%f,%f,%f\n",disp,cost,adc,adg); 108 | // } 109 | __syncthreads(); 110 | } 111 | } 112 | } 113 | 114 | 115 | struct cost_volume_t createCostVolume_tadcg_gpu(Mat leftim, Mat rightim, int ndisp, float tc, float tg, float alpha){ 116 | int nchans = leftim.channels(); 117 | int nrows = leftim.rows; 118 | int ncols = leftim.cols; 119 | size_t pitch; 120 | // allocate gpu memory for cost volume 121 | float* volume_gpu; 122 | cudaMallocPitch(&volume_gpu,&pitch,ncols*sizeof(float),ndisp*nrows); 123 | int stride = pitch / sizeof(float); 124 | // init struct cost_volume_t object 125 | struct cost_volume_t cost_volume = {volume_gpu,nrows,ncols,ndisp,stride}; 126 | // convert BGR images to RGBA 127 | cvtColor(leftim,leftim,CV_BGR2RGBA); 128 | cvtColor(rightim,rightim,CV_BGR2RGBA); 129 | // copy left image to to GPU 130 | cuda::GpuMat d_im_l; 131 | d_im_l.upload(leftim); 132 | // copy right image to to GPU 133 | cuda::GpuMat d_im_r; 134 | d_im_r.upload(rightim); 135 | // debug setup 136 | cuda::GpuMat d_debug(Size(ncols,nrows),CV_8UC4); 137 | 138 | // settings for the kernel 139 | // should be 32-threads wide to ensure 128-byte block global reads 140 | dim3 threadsPerBlock(32,4); 141 | dim3 blocksPerGrid(ncols/threadsPerBlock.x+1,nrows/threadsPerBlock.y+1); 142 | int tgt_shared_mem = (threadsPerBlock.x+ndisp)*threadsPerBlock.y*sizeof(int); 143 | // call the kernel 144 | struct timespec timer; 145 | check_timer(NULL,&timer); 146 | createCostVolume_tadcg_kernel<<>>(d_im_l, d_im_r, cost_volume, d_debug, tc,tg,alpha); 147 | cudaDeviceSynchronize(); 148 | check_timer("createCostVolume_tadcg_gpu time",&timer); 149 | gpu_perror("createCostVolume_tadcg_kernel"); 150 | 151 | // copy debug back over 152 | Mat debug; 153 | //d_debug.download(debug); 154 | // imshow("window",leftim); waitKey(0); 155 | // imshow("window",debug); waitKey(0); 156 | // imshow("window",leftim); waitKey(0); 157 | 158 | // cleanup the temporary image memory 159 | d_im_l.release(); 160 | d_im_r.release(); 161 | d_debug.release(); 162 | 163 | return cost_volume; 164 | } 165 | 166 | 167 | struct bgr_pixel { 168 | unsigned char b; 169 | unsigned char g; 170 | unsigned char r; 171 | }; 172 | 173 | struct cost_volume_t createCostVolume_tadcg(Mat leftim, Mat rightim, int ndisp, float tc, float tg, float alpha){ 174 | int nchans = leftim.channels(); 175 | int nrows = leftim.rows; 176 | int ncols = leftim.cols; 177 | int stride = ncols; 178 | float* volume = (float*)malloc(ncols*nrows*nchans*ndisp*sizeof(float)); 179 | // init struct cost_volume_t object 180 | struct cost_volume_t cost_volume = {volume,nrows,ncols,ndisp,stride}; 181 | 182 | // make sure images are the same size 183 | if(leftim.cols != rightim.cols || leftim.rows != rightim.rows && leftim.channels() == rightim.channels()){ 184 | printf("ERROR: left and right images in createCostVolume do not have matching rows and cols and channels\n"); 185 | return cost_volume; 186 | } 187 | 188 | struct timespec timer; 189 | check_timer(NULL,&timer); 190 | 191 | unsigned char* left = (unsigned char*)leftim.data; 192 | unsigned char* right = (unsigned char*)rightim.data; 193 | 194 | // init values to very large numbers 195 | // the reason for this is that some regions near volume edges won't be dealt with 196 | for( int i = 0; i < ncols*nrows*nchans*ndisp; i++){ 197 | // arbitrary large number 198 | volume[i] = 9999; 199 | } 200 | 201 | // organization will be ndisp images of rows of pixels 202 | // iterate over the whole image 203 | for(int col = 0; col < ncols; col++){ 204 | for(int row = 0; row < nrows; row++){ 205 | struct bgr_pixel ref,ref0, tgt,tgt0; 206 | float cost; 207 | ref = ((struct bgr_pixel*)(left))[ncols*row + col]; 208 | if(col >0){ 209 | ref0 = ((struct bgr_pixel*)(left))[ncols*row + col - 1]; 210 | }else{ 211 | ref0.b = 0; 212 | ref0.g = 0; 213 | ref0.r = 0; 214 | } 215 | // if(col == 100 && row == 100){ 216 | // printf("ref,ref0 = %d,%d,%d %d,%d,%d\n",ref.b,ref.g,ref.r,ref0.b,ref0.g,ref0.r); 217 | // } 218 | // iterate over the disparities 219 | for(int disp = 0; disp < min(ndisp,col+1); disp++){ 220 | // get absolute difference of color and of grad 221 | float adc = 0; 222 | float adg = 0; 223 | tgt = ((struct bgr_pixel*)(right))[ncols*row + col-disp]; 224 | if(col > 0){ 225 | tgt0 = ((struct bgr_pixel*)(right))[ncols*row + col-disp - 1]; 226 | }else{ 227 | tgt0.b = 0; 228 | tgt0.g = 0; 229 | tgt0.r = 0; 230 | } 231 | 232 | // caluculate absolute difference of color 233 | adc = abs((int)ref.r - (int)tgt.r) + abs((int)ref.g - (int)tgt.g) + abs((int)ref.b - (int)tgt.b); 234 | 235 | // caluculate absolute difference of gradient 236 | adg = abs((int)ref.r-(int)ref0.r - (int)tgt.r+(int)tgt0.r) + abs((int)ref.g-(int)ref0.g - (int)tgt.g+(int)tgt0.g) + abs((int)ref.b-(int)ref0.b - (int)tgt.b+(int)tgt0.b); 237 | 238 | // calculate cost with TAD C+G 239 | cost = alpha*min(adc,tc) + (1-alpha)*min(adg,tg); 240 | 241 | // if(col == 100 && row == 100){ 242 | // printf("tgt,tgt0 = %d,%d,%d %d,%d,%d\t",tgt.b,tgt.g,tgt.r,tgt0.b,tgt0.g,tgt0.r); 243 | // printf("disp,cost,adc,adg = %d,%f,%f,%f\n",disp,cost,adc,adg); 244 | // } 245 | volume[nrows*ncols*disp + ncols*row + col] = cost; 246 | } 247 | } 248 | } 249 | check_timer("createCostVolume_tadcg time",&timer); 250 | return cost_volume; 251 | } 252 | -------------------------------------------------------------------------------- /costVolumeFilter_jointBilateral.cu: -------------------------------------------------------------------------------- 1 | #include "costVolumeFilter_jointBilateral.h" 2 | #include "opencv2/ximgproc/edge_filter.hpp" 3 | #include "helper.h" 4 | 5 | using namespace std; 6 | using namespace cv; 7 | 8 | __global__ void costVolumeFilter_jointBilateral_kernel(struct cost_volume_t vol, int* guide_global, int inter_win_padding, float* output, int ksize, float sigma_c, float sigma_s){ 9 | int gx = blockIdx.x*blockDim.x + threadIdx.x; 10 | int gy = blockIdx.y*blockDim.y + threadIdx.y; 11 | 12 | // radius of kernel 13 | int krad = (ksize-1)/2; 14 | 15 | extern __shared__ char shared_mem[]; 16 | 17 | // the guide is first in shared memory 18 | int* guide = (int*)(&shared_mem[0]); 19 | // the slice is second in shared memory 20 | float* slice = (float*)&(shared_mem[(ksize+blockDim.y-1)*(ksize+blockDim.x-1)*sizeof(float) + inter_win_padding]); 21 | 22 | int guide_c; 23 | 24 | // center pixel of guide image 25 | if(gy < vol.nrows && gx < vol.ncols){ 26 | guide_c = guide_global[vol.ncols*gy + gx]; 27 | } 28 | // pull out channel data from guide center pixel (brought in as an int) 29 | int gcr,gcg, gcb; 30 | gcr = (guide_c&0x000000FF) >> 0; 31 | gcb = (guide_c&0x0000FF00) >> 8; 32 | gcg = (guide_c&0x00FF0000) >> 16; 33 | 34 | // copy relevant subimages to shared memory 35 | // starting with the guide sub image 36 | for(int i = 1; i < ksize+blockDim.x-1; i += blockDim.x){ 37 | // only threads in bounds in x dim continue to next loop 38 | if(i + threadIdx.x < ksize+blockDim.x-1 && gx + i - krad >= 0 && gx + i - krad < vol.ncols){ 39 | for(int j = 0; j < ksize+blockDim.y-1; j += blockDim.y){ 40 | // only threads in bounds in y dim continue 41 | if(j + threadIdx.y < ksize+blockDim.y-1 && gy + j - krad >= 0 && gy + j - krad < vol.nrows){ 42 | guide[(ksize+blockDim.x-1) * (j+threadIdx.y) + i + threadIdx.x] = guide_global[vol.ncols * (gy + j - krad) + gx + i - krad]; 43 | } 44 | } 45 | } 46 | } 47 | __syncthreads(); 48 | // continuing with the slice sub image 49 | for(int i = 0; i < ksize+blockDim.x-1; i += blockDim.x){ 50 | // only threads in bounds in x dim continue to next loop 51 | if(i + threadIdx.x < ksize+blockDim.x-1 && gx + i - krad >= 0 && gx + i - krad < vol.ncols){ 52 | for(int j = 0; j < ksize+blockDim.y-1; j += blockDim.y){ 53 | // only threads in bounds in y dim continue 54 | if(j + threadIdx.y < ksize+blockDim.y-1 && gy + j - krad >= 0 && gy + j - krad < vol.nrows){ 55 | slice[(ksize+blockDim.x-1) * (j+threadIdx.y) + i + threadIdx.x] = vol.volume[vol.nrows*vol.stride*blockIdx.z + vol.stride * (gy + j - krad) + gx + i - krad]; 56 | } 57 | } 58 | } 59 | } 60 | __syncthreads(); 61 | 62 | float weight = 0; 63 | float sum = 0; 64 | 65 | // now the bilateral calculation 66 | for(int i = 0; i < ksize; i++){ 67 | if(gx - krad + i >= 0 && gx - krad + i < vol.ncols){ 68 | for(int j = 0; j < ksize; j++){ 69 | if(gy - krad + j >= 0 && gy - krad + j < vol.nrows){ 70 | int guide_p = guide[(ksize+blockDim.x-1)*(j+threadIdx.y) + i + threadIdx.x]; 71 | float slice_p = slice[(ksize+blockDim.x-1)*(j+threadIdx.y) + i + threadIdx.x]; 72 | int gr,gg,gb; 73 | gr = (guide_p&0x000000FF) >> 0; 74 | gb = (guide_p&0x0000FF00) >> 8; 75 | gg = (guide_p&0x00FF0000) >> 16; 76 | int c_diff = abs(gr - gcr) + abs(gb - gcb) + abs(gg - gcg); 77 | float s = __expf( -((j-krad)*(j-krad)+(i-krad)*(i-krad)) / (sigma_s*sigma_s) ); 78 | float c = __expf( -(c_diff*c_diff) / (sigma_c*sigma_c)); 79 | weight += s*c; 80 | sum += slice_p*s*c; 81 | } 82 | } 83 | } 84 | __syncthreads(); 85 | } 86 | 87 | // normalize the weighted sum by the sum of the weights 88 | sum /= weight; 89 | 90 | if(gy < vol.nrows && gx < vol.ncols){ 91 | // for debug, just copy the guide sub image to the output buffer 92 | //output[vol.nrows*vol.stride*blockIdx.z + vol.stride*gy + gx] = (float)(guide[(ksize+blockDim.x-1)*(threadIdx.y + krad) + krad + threadIdx.x] & 0x000000FF); 93 | // for debug, just copy the slice sub image to the output buffer 94 | //output[vol.nrows*vol.stride*blockIdx.z + vol.stride*gy + gx] = slice[(ksize+blockDim.x-1)*(threadIdx.y + krad) + krad + threadIdx.x]; 95 | // ok but for reals, output the bilaterally smoothed value here 96 | output[vol.nrows*vol.stride*blockIdx.z + vol.stride*gy + gx] = sum; 97 | } 98 | } 99 | 100 | void costVolumeFilter_jointBilateral_gpu(struct cost_volume_t& cost_volume, Mat guide, int ksize, float sigma_c, float sigma_s){ 101 | int nrows = cost_volume.nrows; 102 | int ncols = cost_volume.ncols; 103 | int ndisp = cost_volume.ndisp; 104 | int stride = cost_volume.stride; 105 | 106 | if(ksize%2 != 1){ 107 | printf("ERROR: in costVolumeFilter_jointBilateral_gpu, ksize must be odd\n"); 108 | return; 109 | } 110 | 111 | // settings for the kernel 112 | // trying to use 32 threads-wide so the global reads are 128 bytes 113 | dim3 threadsPerBlock(32,16); 114 | dim3 blocksPerGrid(ncols/threadsPerBlock.x+1,nrows/threadsPerBlock.y+1,ndisp); 115 | int guide_win_rows = (ksize + threadsPerBlock.y - 1); 116 | int guide_win_width_bytes = (ksize + threadsPerBlock.x - 1)*sizeof(int); 117 | // pad between images to 256 bytes 118 | int inter_window_pad = (256 - guide_win_width_bytes%256)%256; 119 | int slice_win_rows = (ksize + threadsPerBlock.y - 1); 120 | int slice_win_width_bytes = (ksize + threadsPerBlock.x - 1)*sizeof(float); 121 | int shared_size = guide_win_rows*guide_win_width_bytes + inter_window_pad + slice_win_rows*slice_win_width_bytes; 122 | // make sure the shared size is less than device maximum 123 | int device; 124 | cudaGetDevice(&device); 125 | cudaDeviceProp properties; 126 | cudaGetDeviceProperties(&properties, device); 127 | if(shared_size > properties.sharedMemPerMultiprocessor){ 128 | printf("ERROR: in costVolumeFilter_jointBilateral_gpu, shared_size exceeds device limit\n"); 129 | return; 130 | } 131 | 132 | // allocate output volume (post-filtering) on gpu 133 | float* d_output; 134 | cudaMalloc(&d_output, ndisp*nrows*stride*sizeof(float)); 135 | // copy guide image to to GPU 136 | cvtColor(guide,guide,CV_BGR2RGBA); 137 | int* d_guide; 138 | cudaMalloc(&d_guide, 4*nrows*ncols*sizeof(unsigned char)); 139 | cudaMemcpy(d_guide, guide.data, 4*nrows*ncols*sizeof(unsigned char), cudaMemcpyHostToDevice); 140 | 141 | // call the kernel 142 | struct timespec timer; 143 | check_timer(NULL,&timer); 144 | costVolumeFilter_jointBilateral_kernel<<>>(cost_volume, d_guide, inter_window_pad, d_output, ksize, sigma_c, sigma_s); 145 | //costVolumeFilter_jointBilateral_kernel<<>>(cost_volume, d_guide, inter_window_pad, d_output, sigma_s, sigma_c, ksize); 146 | cudaDeviceSynchronize(); 147 | check_timer("costVolumeFilter_jointBilateral_gpu time",&timer); 148 | gpu_perror("costVolumeFilter_jointBilateral_kernel"); 149 | 150 | // shuffle cost_volume pointers 151 | cudaFree(cost_volume.volume); // don't need the input anymore 152 | cost_volume.volume = d_output; // keep the output instead 153 | } 154 | 155 | void jointBilateralFilter(Mat& srcim, Mat& guideim, Mat& dst, int kernelSize, float sigma_color, float sigma_space){ 156 | // make sure images are the same size 157 | if(srcim.cols != guideim.cols || srcim.rows != guideim.rows){ 158 | printf("ERROR: src and guide images in jointBilateralFilter do not have matching rows and cols\n"); 159 | return; 160 | } 161 | if(kernelSize%2 != 1){ 162 | printf("ERROR: kernelSize jointBilateralFilter must be odd\n"); 163 | return; 164 | } 165 | int nrows = srcim.rows; 166 | int ncols = srcim.cols; 167 | int nchans = guideim.channels(); 168 | // set up some useful variables 169 | int win_rad = (kernelSize -1) / 2; 170 | // assume we are taking in floating point images 171 | float* src = (float*)srcim.data; 172 | float* guide = (float*)guideim.data; 173 | Mat outim = Mat::zeros(nrows,ncols,CV_32F); 174 | float* out = (float*)outim.data; 175 | // iterate over the whole image 176 | for(int col = 0; col < ncols; col++){ 177 | for(int row = 0; row < nrows; row++){ 178 | double normalizing_factor = 0; 179 | double weighted_sum = 0; 180 | float* guide_center = &(guide[(ncols*row + col)*nchans]); 181 | // iterate over the window 182 | for(int j = max(0,row-win_rad); j < min(nrows,row+win_rad+1); j++){ 183 | for(int i = max(0,col-win_rad); i < min(ncols,col+win_rad+1); i++){ 184 | int x = i - col; 185 | int y = j - row; 186 | int radius2 = x*x+y*y; 187 | float src_pixel = src[ncols*j + i]; 188 | float* guide_pixel = &(guide[(ncols*j + i)*nchans]); 189 | double weight = 1; 190 | // apply spacial sigma 191 | weight *= std::exp(-radius2/(2.*sigma_space*sigma_space)); 192 | // get intensity difference from guide image 193 | float diff = 0; 194 | for(int chan = 0; chan < nchans; chan++){ 195 | diff += abs(guide_pixel[chan] - guide_center[chan]); 196 | } 197 | // apply sigma_color 198 | weight *= std::exp(-diff*diff/(2.*sigma_color*sigma_color)); 199 | // add in values 200 | normalizing_factor += weight; 201 | weighted_sum += weight*src_pixel; 202 | } 203 | } 204 | out[ncols*row + col] = weighted_sum / normalizing_factor; 205 | //printf("row,col,val : %d,%d,%f\n",row,col,weighted_sum / normalizing_factor); 206 | } 207 | } 208 | outim.copyTo(dst); 209 | return; 210 | } 211 | 212 | void costVolumeFilter_jointBilateral(struct cost_volume_t& cost_volume, Mat guide, int kernelSize, float sigma_color, float sigma_space){ 213 | int nrows = cost_volume.nrows; 214 | int ncols = cost_volume.ncols; 215 | int ndisp = cost_volume.ndisp; 216 | float* vin = cost_volume.volume; 217 | // doesn't do in-place editing... need second float* 218 | float* vout = (float*)malloc(nrows*ncols*ndisp*sizeof(float)); 219 | // guide must be CV_32F if the cost_volume is 220 | guide.convertTo(guide,CV_32F); 221 | struct timespec timer; 222 | check_timer(NULL,&timer); 223 | for(int disp = 0; disp < ndisp; disp++){ 224 | Mat slicein(nrows,ncols,CV_32F,&(vin[nrows*ncols*disp])); 225 | Mat sliceout(nrows,ncols,CV_32F,&(vout[nrows*ncols*disp])); 226 | //jointBilateralFilter(InputArray joint, InputArray src, OutputArray dst, int d, double sigmaColor, double sigmaSpace, int borderType=BORDER_DEFAULT) 227 | ximgproc::jointBilateralFilter(guide, slicein, sliceout, kernelSize, sigma_color, sigma_space); 228 | } 229 | check_timer("costVolumeFilter_jointBilateral time",&timer); 230 | printf("\n"); 231 | // free old cost_volume float* 232 | free(cost_volume.volume); 233 | // replace with new cost_volume float* 234 | cost_volume.volume = vout; 235 | } 236 | -------------------------------------------------------------------------------- /asw.cu: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | 8 | 9 | #define MAX_DISP 1000 10 | #define NCHANS 3 11 | #define IDP_LVL 4 12 | 13 | #define BLOCK_SIZE 16 14 | 15 | 16 | // timing utility 17 | struct timespec check_timer(const char* str, struct timespec* ts){ 18 | struct timespec oldtime; 19 | // copy old time over 20 | oldtime.tv_nsec = ts->tv_nsec; 21 | oldtime.tv_sec = ts->tv_sec; 22 | // update ts 23 | clock_gettime(CLOCK_REALTIME, ts); 24 | // print old time 25 | int diffsec; 26 | int diffnsec; 27 | if(str != NULL){ 28 | diffsec = ts->tv_sec - oldtime.tv_sec; 29 | diffnsec = ts->tv_nsec - oldtime.tv_nsec; 30 | // correct the values if we measured over an integer second break: 31 | if(diffnsec < 0){ 32 | diffsec--; 33 | diffnsec += 1000000000; 34 | } 35 | printf("%s:%ds %dns\n",str,diffsec,diffnsec); 36 | } 37 | return (struct timespec) {diffsec, diffnsec}; 38 | } 39 | 40 | // little bitty kernel to initialize blocks of device memory 41 | __global__ void gpu_memset(unsigned char* start, unsigned char value, int length){ 42 | int tx = threadIdx.x; 43 | int bx = blockIdx.x; 44 | int gx = bx*blockDim.x + tx; 45 | if(gx < length){ 46 | start[gx] = value; 47 | } 48 | } 49 | 50 | // teeny little helper function 51 | void gpu_perror(char* input){ 52 | printf("%s: %s\n", input, cudaGetErrorString(cudaGetLastError())); 53 | } 54 | 55 | 56 | // In the future it may be useful to bring a whole line of pixels into local memory... 57 | // ... from shared memory, and then do everything that needs to be dones with that line... 58 | // ... for a given pixel, before moving to the next row... 59 | // ... or maybe it would be better to use a single location of spacial sigma. Oh I like that. 60 | 61 | 62 | // now let's try just running with 32 threads, but each 32 thread warp stretches horizontally across a row 63 | // we want to reduced shared memory accesses and increase IDP, so with this in mind we will... 64 | // ... read in an entire line (blockdim.x + 2*win_rad + ndisp) into shared memory 65 | // ... ok hold on... 66 | // ... first strategy: horizontal line of threads, each thread calculates n pixels below it as well 67 | // ... second strategy: vertical line of threads, each thread calculates n pixels to the right of it 68 | // ... the first strategy is good for IDP, and can reduce shared reads because each pixel below needs 69 | // ... to access a given pixel once for each ndisp (in general) 70 | // ... second strategy is good because each thread needs to access each pixel for differenct ndisp 71 | // ... the IDP-vertically strategy has neater boundary conditions in the disp direction, because all IDPs share same disp 72 | // ... the IDP-horizontally strategy has neater boundary conditions in the window direction because all IDPs share same 73 | // ... but in either case I think the GPU usage should be the same since every thread should run into the same problems. 74 | // ... therefore I don't think that should be a deciding factor. 75 | // My next concern is that we will not generate enough blocks with this strategy to fill a large GPU 76 | // ... obviously that's a problem with the old kernel writeup as well. 77 | // I guess for now we will just work on minimizing shared memory accessess. 78 | __global__ void asw_kernel2(unsigned char* global_left, unsigned char* global_right, unsigned char* output, unsigned char* debug, 79 | int nrows, int ncols, int ndisp, int win_size, int win_rad, float s_sigma, float c_sigma) 80 | { 81 | extern __shared__ unsigned char ref[]; // the beginning of the shared memory block 82 | unsigned char* tgt = &ref[(win_size + blockDim.x)*NCHANS*IDP_LVL]; // tgt follows a block big enough for reference 83 | // if we start somewhere with a middle row of the image, then we can use a shared variable to share center values 84 | // __shared__ unsigned char ref_center_pix[NCHANS*IDP_LVL]; 85 | // __shared__ unsigned char tgt_center_pix[MAX_DISP*NCHANS*IDP_LVL] 86 | 87 | int ref_width_bytes = (2*win_rad+blockDim.x)*NCHANS*sizeof(unsigned char); 88 | int tgt_width_bytes = (ndisp+2*win_rad+blockDim.x)*NCHANS*sizeof(unsigned char); 89 | 90 | // we are sticking with local memory for the sums of the disparities, because since we only access that occasionally I don't think the latency is a problem 91 | float costs[MAX_DISP*IDP_LVL]; 92 | float weights[MAX_DISP*IDP_LVL]; 93 | 94 | // other things should fall into register memory 95 | 96 | 97 | } 98 | 99 | // Device code 100 | __global__ void asw_kernel(unsigned char* global_left, unsigned char* global_right, unsigned char* output, unsigned char* debug, 101 | int nrows, int ncols, int ndisp, int win_size, int win_rad, float s_sigma, float c_sigma) 102 | { 103 | // ok, we're going to try a block size of 32 ( 32x32 = 1024, max threads per block ) 104 | // no... we'll use 16x16 since there's problems with shared memory with two images 105 | // each thread will calculate the full asw stereo output for a single pixel 106 | // shared memory will contain all the input image data for the full block of asw calculations 107 | // texture memory will contain the spacial filter, eventually 108 | extern __shared__ unsigned char ref[]; // contains both left and right image data 109 | 110 | // get the size of the sub-images that we are considering 111 | // reference window 112 | int ref_width_bytes = (2*win_rad+blockDim.x)*NCHANS*sizeof(unsigned char); 113 | // int ref_rows = (2*win_rad+blockDim.y); 114 | // target window 115 | int tgt_width_bytes = (ndisp+2*win_rad+blockDim.x)*NCHANS*sizeof(unsigned char); 116 | // int tgt_rows = (2*win_rad+blockDim.y); 117 | 118 | unsigned char* tgt = (unsigned char*)(&ref[ ref_width_bytes*(2*win_rad+blockDim.y) ]); // tgt image, reference to somwhere of shared allocated memory 119 | 120 | float ref_c_factor; 121 | float tgt_c_factor; 122 | float s_factor; 123 | float ref_c2p_diff; 124 | float tgt_c2p_diff; 125 | float ref2tgt_diff; 126 | // variables for keeping track of the output 127 | float weight; 128 | float cost; 129 | float min_cost; 130 | unsigned char min_cost_index; 131 | unsigned char ref_center_pix[3]; 132 | unsigned char tgt_center_pix[3]; 133 | unsigned char ref_pix[3]; 134 | unsigned char tgt_pix[3]; 135 | 136 | int disp; 137 | int win_x; 138 | int win_y; 139 | int dx; 140 | int tgt_x; 141 | 142 | // get identity of this thread (changing these to #define's) 143 | 144 | #define tx (threadIdx.x) 145 | #define ty (threadIdx.y) 146 | #define bx (blockIdx.x + 5) 147 | #define by (blockIdx.y + 1) 148 | #define gx (bx*blockDim.x + tx) 149 | #define gy (by*blockDim.y + ty) 150 | 151 | // copy relevant subimages to shared memory 152 | // TODO: additional boundary checks on this data 153 | // TODO: better division technique 154 | // TODO: investigate where syncthreads() needs to be called for best performance 155 | // we can copy the 24-bit image over 32 bits at a time 156 | // except then I don't know how to deal with the edge case 157 | // so let's just do one character at a time 158 | // starting with reference image: (4 deleted register variables) 159 | // int xblocks = (ref_width_bytes / blockDim.x + 1); 160 | // int yblocks = ((2*win_rad+blockDim.y) / blockDim.y + 1); 161 | // int xstart = ((bx*blockDim.x - win_rad)*NCHANS); 162 | // int ystart = (gy - win_rad); 163 | // 29 variables here 164 | for(win_x = 0; win_x < (ref_width_bytes / blockDim.x + 1); win_x++){ 165 | // int x_idx = (win_x*blockDim.x + tx); 166 | // int g_x_idx = (((bx*blockDim.x - win_rad)*NCHANS) + win_x*blockDim.x + tx); 167 | if((win_x*blockDim.x + tx) < ref_width_bytes){ 168 | for(win_y = 0; win_y < ((2*win_rad+blockDim.y) / blockDim.y + 1); win_y++){ 169 | // int y_idx = (win_y*blockDim.y + ty); 170 | // int g_y_idx = ((gy - win_rad) + win_y*blockDim.y); 171 | if((win_y*blockDim.y + ty) < (2*win_rad+blockDim.y)){ 172 | // copy bytes (not pixels) from global_left into reference image 173 | ref[(win_y*blockDim.y + ty)*ref_width_bytes + (win_x*blockDim.x + tx)] = global_left[((gy - win_rad) + win_y*blockDim.y)*ncols*NCHANS + (((bx*blockDim.x - win_rad)*NCHANS) + win_x*blockDim.x + tx)]; 174 | // copy into the debug image (only made to work with a single block of threads) 175 | // debug[((gy - win_rad) + win_y*blockDim.y)*ncols*NCHANS + (((bx*blockDim.x - win_rad)*NCHANS) + win_x*blockDim.x + tx)] = ref[(win_y*blockDim.y + ty)*ref_width_bytes + (win_x*blockDim.x + tx)]; 176 | } 177 | } 178 | } 179 | } 180 | // then to the target image: (4 deleted register variables) 181 | // xblocks = (tgt_width_bytes / blockDim.x + 1); 182 | // yblocks = ((2*win_rad+blockDim.y) / blockDim.y + 1); 183 | // xstart = ((bx*blockDim.x - win_rad - ndisp)*NCHANS); 184 | // ystart = (gy - win_rad); 185 | for(win_x = 0; win_x < (tgt_width_bytes / blockDim.x + 1); win_x++){ 186 | // int x_idx = (win_x*blockDim.x + tx); 187 | // int g_x_idx = (((bx*blockDim.x - win_rad - ndisp)*NCHANS) + win_x*blockDim.x + tx); 188 | if((win_x*blockDim.x + tx) < tgt_width_bytes){ 189 | for(win_y = 0; win_y < ((2*win_rad+blockDim.y) / blockDim.y + 1); win_y++){ 190 | // int y_idx = (win_y*blockDim.y + ty); 191 | // int g_y_idx = ((gy - win_rad) + win_y*blockDim.y); 192 | if((win_y*blockDim.y + ty) < (2*win_rad+blockDim.y)){ 193 | // copy bytes (not pixels) from global_left into reference image 194 | tgt[(win_y*blockDim.y + ty)*tgt_width_bytes + (win_x*blockDim.x + tx)] = global_right[((gy - win_rad) + win_y*blockDim.y)*ncols*NCHANS + (((bx*blockDim.x - win_rad - ndisp)*NCHANS) + win_x*blockDim.x + tx)]; 195 | // copy into the debug image (only made to work with a single block of threads) 196 | // debug[((gy - win_rad) + win_y*blockDim.y)*ncols*NCHANS + (((bx*blockDim.x - win_rad - ndisp)*NCHANS) + win_x*blockDim.x + tx)] = tgt[(win_y*blockDim.y + ty)*tgt_width_bytes + (win_x*blockDim.x + tx)]; 197 | } 198 | } 199 | } 200 | } 201 | 202 | __syncthreads(); 203 | 204 | // get a pointer to the ref_center_pix, which is constant for any given thread 205 | ref_center_pix[0] = ref[(win_rad + ty)*ref_width_bytes + (win_rad + tx)*NCHANS + 0]; 206 | ref_center_pix[1] = ref[(win_rad + ty)*ref_width_bytes + (win_rad + tx)*NCHANS + 1]; 207 | ref_center_pix[2] = ref[(win_rad + ty)*ref_width_bytes + (win_rad + tx)*NCHANS + 2]; 208 | // initialize min_cost to some arbitrarily large value 209 | min_cost = 1e12; 210 | // initialize min_cost_index to 0 211 | min_cost_index = 0; 212 | 213 | // for each value of ndisp 214 | for(disp = 0; disp < ndisp; disp++){ 215 | // get a pointer to the tgt_center_pix, which is constant for each disp 216 | tgt_center_pix[0] = tgt[(win_rad + ty)*tgt_width_bytes + (ndisp + win_rad + tx - disp)*NCHANS + 0]; 217 | tgt_center_pix[1] = tgt[(win_rad + ty)*tgt_width_bytes + (ndisp + win_rad + tx - disp)*NCHANS + 1]; 218 | tgt_center_pix[2] = tgt[(win_rad + ty)*tgt_width_bytes + (ndisp + win_rad + tx - disp)*NCHANS + 2]; 219 | // reset weight and cost 220 | weight = 0; 221 | cost = 0; 222 | // in each row in the window: 223 | for(win_x = 0; win_x < win_size; win_x++){ 224 | // locate the pixel in the ref image (deleted this var) 225 | dx = win_x + tx; 226 | // locate the pixel in the tgt image (deleted this var) 227 | tgt_x = ndisp + win_x + tx - disp; 228 | // find the window-center to pixel x-distance (deleted this var) 229 | // int dx = win_x - win_rad; 230 | // in each column of the window: 231 | for(win_y = 0; win_y < win_size; win_y++){ 232 | // locate the pixel in the ref image (deleted this var) 233 | // int ref_y = win_y + ty; 234 | // find the window-center to pixel y-distance (deleted this var) 235 | // int dy = win_y - win_rad; 236 | // get the radius^2 value (deleted this var) 237 | // float radius_2 = (win_x-win_rad)*(win_x-win_rad) + (win_y-win_rad)*(win_y-win_rad); 238 | // get the s_factor for this particular window location 239 | s_factor = __expf(-((win_x-win_rad)*(win_x-win_rad) + (win_y-win_rad)*(win_y-win_rad))/(2.*s_sigma*s_sigma)); 240 | // store tgt and ref pixels in register memory 241 | ref_pix[0] = ref[(win_y+ty)*ref_width_bytes + (dx)*NCHANS + 0]; 242 | ref_pix[1] = ref[(win_y+ty)*ref_width_bytes + (dx)*NCHANS + 1]; 243 | ref_pix[2] = ref[(win_y+ty)*ref_width_bytes + (dx)*NCHANS + 2]; 244 | tgt_pix[0] = tgt[(win_y+ty)*tgt_width_bytes + (tgt_x)*NCHANS + 0]; 245 | tgt_pix[1] = tgt[(win_y+ty)*tgt_width_bytes + (tgt_x)*NCHANS + 1]; 246 | tgt_pix[2] = tgt[(win_y+ty)*tgt_width_bytes + (tgt_x)*NCHANS + 2]; 247 | // get the center-to-pixel and overall color differences (organized together for IDP) 248 | ref_c2p_diff = abs(ref_center_pix[0] - ref_pix[0]); 249 | tgt_c2p_diff = abs(tgt_center_pix[0] - ref_pix[0]); 250 | ref2tgt_diff = abs(ref_pix[0] - tgt_pix[0]); 251 | ref_c2p_diff += abs(ref_center_pix[1] - ref_pix[1]); 252 | tgt_c2p_diff += abs(tgt_center_pix[1] - ref_pix[1]); 253 | ref2tgt_diff+= abs(ref_pix[1] - tgt_pix[1]); 254 | ref_c2p_diff += abs(ref_center_pix[2] - ref_pix[2]); 255 | tgt_c2p_diff += abs(tgt_center_pix[2] - ref_pix[2]); 256 | ref2tgt_diff+= abs(ref_pix[2] - tgt_pix[2]); 257 | // get the c_factors 258 | ref_c_factor = __expf(-ref_c2p_diff*ref_c2p_diff/(2.*c_sigma*c_sigma)); 259 | tgt_c_factor = __expf(-tgt_c2p_diff*tgt_c2p_diff/(2.*c_sigma*c_sigma)); 260 | // calulate the pix_weight (this variable has been done away with to increase ILP) 261 | // pix_weight = s_factor*ref_c_factor*tgt_c_factor; 262 | // add in the cost 263 | cost += s_factor*ref_c_factor*tgt_c_factor*ref2tgt_diff; 264 | // add in the weight 265 | weight += s_factor*ref_c_factor*tgt_c_factor; 266 | } 267 | } 268 | // now that the window is done, compare this cost (after normalizing) to min_cost 269 | if( min_cost > cost / weight){ 270 | min_cost = cost / weight; 271 | min_cost_index = disp; 272 | } 273 | __syncthreads(); 274 | } 275 | 276 | // set the output to the index of min_cost 277 | output[gy*ncols + gx] = min_cost_index; 278 | } 279 | 280 | int asw(cv::Mat im_l, cv::Mat im_r, int ndisp, int s_sigma, int c_sigma){ 281 | // window size and win_rad 282 | int win_size = 3*s_sigma; 283 | int win_rad = (win_size - 1)/2; 284 | // declare timer 285 | struct timespec timer; 286 | 287 | // check that images are matching dimensions 288 | if(im_l.rows != im_r.rows){ 289 | printf("Error: im_l and im_r do not have matching row count\n"); 290 | return 1; 291 | } 292 | if(im_l.cols != im_r.cols){ 293 | printf("Error: im_l and im_r do not have matching col count\n"); 294 | return 1; 295 | } 296 | if(im_l.channels() != im_r.channels()){ 297 | printf("Error: im_l and im_r do not have matching channel count\n"); 298 | return 1; 299 | } 300 | 301 | // set easy-access variables for number of rows, cols, and chans 302 | int nrows = im_l.rows; 303 | int ncols = im_l.cols; 304 | int nchans = im_l.channels(); 305 | // initialize the device input arrays 306 | unsigned char* d_im_l; 307 | cudaMalloc(&d_im_l,nchans*nrows*ncols*sizeof(unsigned char)); 308 | unsigned char* d_im_r; 309 | cudaMalloc(&d_im_r,nchans*nrows*ncols*sizeof(unsigned char)); 310 | // initialize the output data matrix 311 | unsigned char* out = (unsigned char*)malloc(nrows*ncols*sizeof(unsigned char)); 312 | unsigned char* d_out; 313 | cudaMalloc(&d_out,nrows*ncols*sizeof(unsigned char)); 314 | unsigned char* debug = (unsigned char*)malloc(nrows*ncols*nchans*sizeof(unsigned char)); 315 | unsigned char* d_debug; 316 | cudaMalloc(&d_debug,nchans*nrows*ncols*sizeof(unsigned char)); 317 | 318 | // define a shortcut to the host data arrays 319 | unsigned char* data_l = ((unsigned char*)(im_l.data)); 320 | unsigned char* data_r = ((unsigned char*)(im_r.data)); 321 | 322 | //copy the host input data to the device 323 | cudaMemcpy(d_im_l, data_l, nchans*nrows*ncols*sizeof(unsigned char), cudaMemcpyHostToDevice); 324 | cudaMemcpy(d_im_r, data_r, nchans*nrows*ncols*sizeof(unsigned char), cudaMemcpyHostToDevice); 325 | 326 | // get gaussian kernel for spacial look-up table: 327 | // equation from cv::getGaussianKernel(), but without normalization 328 | // float s_weights[win_size][win_size]; 329 | // for(int i=0; i>>(d_out,25,nrows*ncols*sizeof(unsigned char)); 354 | gpu_perror("memset1"); 355 | gpu_memset<<>>(d_debug,25,nchans*nrows*ncols*sizeof(unsigned char)); 356 | gpu_perror("memset2"); 357 | 358 | // check some values before calling the asw_kernel 359 | size_t reference_window_size = (2*win_rad+BLOCK_SIZE)*(2*win_rad+BLOCK_SIZE)*sizeof(unsigned char)*nchans; 360 | size_t target_window_size = (2*win_rad+ndisp+BLOCK_SIZE)*(BLOCK_SIZE+2*win_rad)*sizeof(unsigned char)*nchans; 361 | size_t shared_size = target_window_size+reference_window_size; 362 | printf("win_size %d win_rad %d ndisp %d shared size = %d\n",win_size,win_rad,ndisp,shared_size); 363 | if(shared_size > 47000){ 364 | printf("FATAL ERROR: shared_size for asw_kernel exceeds the device limit (48 kB), exiting\n"); 365 | return 1; 366 | } 367 | 368 | // call the asw_kernel 369 | dim3 blocksPerGrid(22,21); 370 | dim3 threadsPerBlock(BLOCK_SIZE,BLOCK_SIZE); 371 | // __global__ void asw_kernel(unsigned char* global_left, unsigned char* global_right, unsigned char* output, unsigned char* debug, 372 | // int nrows, int ncols, int ndisp, int win_size, int win_rad, float s_sigma, float c_sigma) 373 | printf("starting asw kernel\n"); 374 | check_timer(NULL,&timer); 375 | asw_kernel<<>>(d_im_l, d_im_r, d_out, d_debug, 376 | nrows, ncols, nchans, win_size, win_rad, s_sigma, c_sigma); 377 | cudaDeviceSynchronize(); 378 | check_timer("asw kernel finished",&timer); 379 | gpu_perror("asw_kernel"); 380 | 381 | // copy the device output data to the host 382 | check_timer(NULL,&timer); 383 | cudaMemcpy(out, d_out, nrows*ncols*sizeof(unsigned char), cudaMemcpyDeviceToHost); 384 | cudaMemcpy(debug, d_debug, nrows*ncols*nchans*sizeof(unsigned char), cudaMemcpyDeviceToHost); 385 | check_timer("copying complete",&timer); 386 | 387 | // make an image and view it: 388 | cv::Mat im_out(nrows,ncols,CV_8UC1,out); 389 | cv::Mat im_debug(nrows,ncols,CV_8UC3,debug); 390 | // cv::rectangle(im_debug,cv::Point(16*15,16*15),cv::Point(16*16,16*16),cv::Scalar(255,0,0)); 391 | // cv::rectangle(im_out,cv::Point(16*15,16*15),cv::Point(16*16,16*16),127); 392 | // cv::imshow("window",im_debug); 393 | // cv::waitKey(0); 394 | cv::imshow("window",im_out); 395 | cv::waitKey(0); 396 | 397 | // cleanup memory 398 | cudaFree(d_im_l); 399 | cudaFree(d_im_r); 400 | cudaFree(d_out); 401 | cudaFree(d_debug); 402 | free(out); 403 | free(debug); 404 | 405 | return 0; 406 | } 407 | 408 | int main(int argc, char** argv){ 409 | // spacial and color sigmas 410 | int s_sigma, c_sigma; 411 | // number of disparities to check 412 | int ndisp; 413 | // input images 414 | cv::Mat im_l, im_r; 415 | 416 | if(argc < 6){ 417 | printf("usage: %s ",argv[0]); 418 | return 1; 419 | }else{ 420 | im_l = cv::imread(argv[1]); 421 | im_r = cv::imread(argv[2]); 422 | ndisp = atoi(argv[3]); 423 | s_sigma = atoi(argv[4]); 424 | c_sigma = atoi(argv[5]); 425 | } 426 | 427 | return asw(im_l, im_r, ndisp, s_sigma, c_sigma); 428 | } --------------------------------------------------------------------------------