├── .gitignore
├── l.png
├── r.png
├── timer.h
├── helper.h
├── costVolumeFilter_box.h
├── createCostVolume.h
├── helper.cu
├── costVolumeMinimize.h
├── cost_volume.h
├── costVolumeFilter_guided.h
├── createCostVolume_tadcg.h
├── timer.cpp
├── costVolumeFilter_jointBilateral.h
├── makefile
├── LICENSE.md
├── costVolumeFilter_box.cu
├── kerneltest.cu
├── README.md
├── costVolumeMinimize.cu
├── cost_volume.cu
├── createCostVolume.cu
├── costVolumeFilter_guided.cu
├── createCostVolume_tadcg.cu
├── costVolumeFilter_jointBilateral.cu
└── asw.cu


/.gitignore:
--------------------------------------------------------------------------------
1 | a.out
2 | *.o
3 | asw
4 | cost_volume
5 | 
6 | 


--------------------------------------------------------------------------------
/l.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rdbeethe/asw/HEAD/l.png


--------------------------------------------------------------------------------
/r.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rdbeethe/asw/HEAD/r.png


--------------------------------------------------------------------------------
/timer.h:
--------------------------------------------------------------------------------
1 | #ifndef TIMER_H
2 | #define TIMER_H
3 | 
4 | #include <time.h>
5 | struct timespec check_timer(const char* str, struct timespec* ts);
6 | 
7 | #endif // TIMER_H
8 | 


--------------------------------------------------------------------------------
/helper.h:
--------------------------------------------------------------------------------
1 | #ifndef HELPER_H
2 | #define HELPER_H
3 | 
4 | __global__ void gpu_memset(unsigned char* start, unsigned char value, int length);
5 | void gpu_perror(const char* input);
6 | 
7 | #endif // HELPER_H
8 | 


--------------------------------------------------------------------------------
/costVolumeFilter_box.h:
--------------------------------------------------------------------------------
 1 | #ifndef COSTVOLUMEFILTER_BOX_H
 2 | #define COSTVOLUMEFILTER_BOX_H
 3 | 
 4 | #include <opencv2/opencv.hpp>
 5 | #include "cost_volume.h"
 6 | #include "timer.h"
 7 | 
 8 | void costVolumeFilter_box_gpu(struct cost_volume_t& vol, int ksize);
 9 | void costVolumeFilter_box(struct cost_volume_t& cost_volume, int kernelSize);
10 | 
11 | #endif // COSTVOLUMEFILTER_BOX_H
12 | 


--------------------------------------------------------------------------------
/createCostVolume.h:
--------------------------------------------------------------------------------
 1 | #ifndef CREATECOSTVOLUME_H
 2 | #define CREATECOSTVOLUME_H
 3 | 
 4 | #include <stdio.h>
 5 | #include <opencv2/opencv.hpp>
 6 | #include <opencv2/core/cuda.hpp>
 7 | 
 8 | __global__ void createCostVolume_kernel(int* ref_global, int* tgt_global, struct cost_volume_t vol, int* debug);
 9 | struct cost_volume_t createCostVolume_gpu(cv::Mat leftim, cv::Mat rightim, int ndisp);
10 | struct cost_volume_t createCostVolume(cv::Mat leftim, cv::Mat rightim,int ndisp);
11 | 
12 | #endif // CREATECOSTVOLUME_H
13 | 


--------------------------------------------------------------------------------
/helper.cu:
--------------------------------------------------------------------------------
 1 | #include "helper.h"
 2 | #include <stdio.h>
 3 | 
 4 | // small fill kernel: one thread per byte, stores value into the first length bytes at start
 5 | __global__ void gpu_memset(unsigned char* start, unsigned char value, int length){
 6 | 	// flat 1D position of this thread across the whole grid
 7 | 	const int idx = blockDim.x*blockIdx.x + threadIdx.x;
 8 | 	if(idx >= length){
 9 | 		return; // surplus threads past the end write nothing
10 | 	}
11 | 	start[idx] = value;
12 | }
13 | 
14 | // prints "<input>: <message>" for the most recent CUDA runtime error (fetching it also clears it)
15 | void gpu_perror(const char* input){
16 | 	cudaError_t status = cudaGetLastError();
17 | 	printf("%s: %s\n", input, cudaGetErrorString(status));
18 | }
19 | 


--------------------------------------------------------------------------------
/costVolumeMinimize.h:
--------------------------------------------------------------------------------
 1 | #ifndef COSTVOLUMEMINIMIZE_H
 2 | #define COSTVOLUMEMINIMIZE_H
 3 | 
 4 | #include <stdio.h>
 5 | #include <opencv2/opencv.hpp>
 6 | #include "timer.h"
 7 | #include "cost_volume.h"
 8 | #include "helper.h"
 9 | 
10 | __global__ void costVolumeMinimize_kernel(struct cost_volume_t vol, unsigned char* output);
11 | void costVolumeMinimize_gpu(struct cost_volume_t cost_volume, cv::Mat& outim);
12 | void costVolumeMinimize(struct cost_volume_t cost_volume, cv::Mat& outim);
13 | 
14 | #endif // COSTVOLUMEMINIMIZE_H
15 | 
16 | 


--------------------------------------------------------------------------------
/cost_volume.h:
--------------------------------------------------------------------------------
 1 | #ifndef COST_VOLUME_H
 2 | #define COST_VOLUME_H
 3 | 
 4 | struct cost_volume_t { // stack of ndisp cost slices, one nrows-row image per disparity
 5 | 	float* volume; // slice-major data: element (disp,row,col) is volume[stride*nrows*disp + stride*row + col]
 6 | 	int nrows;     // rows in each slice
 7 | 	int ncols;     // valid columns in each row
 8 | 	int ndisp;     // number of disparity slices
 9 | 	int stride;    // floats from one row to the next; equals ncols when rows are unpadded
10 | };
11 | 
12 | struct rgba_pixel { // one pixel stored as four unsigned bytes
13 | 	unsigned char r; // presumably red — TODO confirm channel order against the image loader
14 | 	unsigned char g; // presumably green
15 | 	unsigned char b; // presumably blue
16 | 	unsigned char a; // presumably alpha / padding byte
17 | };
18 | 
19 | struct cost_volume_t get_gpu_volume(struct cost_volume_t vin);
20 | void viewSlices(struct cost_volume_t& cost_volume, int first, int last);
21 | void costVolumeBoxFilter(struct cost_volume_t& cost_volume, int kernelSize);
22 | 
23 | #endif // COST_VOLUME_H
24 | 


--------------------------------------------------------------------------------
/costVolumeFilter_guided.h:
--------------------------------------------------------------------------------
 1 | #ifndef COSTVOLUMEFILTER_GUIDED_H
 2 | #define COSTVOLUMEFILTER_GUIDED_H
 3 | 
 4 | #include <opencv2/opencv.hpp>
 5 | #include "cost_volume.h"
 6 | #include "timer.h"
 7 | 
 8 | __global__ void costVolumeFilter_guided_kernel(struct cost_volume_t vol, int* guide_global, int inter_win_padding, float* output, float sigma_s, float sigma_c, int ksize);
 9 | void costVolumeFilter_guided_gpu(struct cost_volume_t& vol, cv::Mat guide, int ksize, float eps);
10 | void costVolumeFilter_guided(struct cost_volume_t& vol, cv::Mat guide, int ksize, float eps);
11 | 
12 | #endif // COSTVOLUMEFILTER_GUIDED_H
13 | 
14 | 


--------------------------------------------------------------------------------
/createCostVolume_tadcg.h:
--------------------------------------------------------------------------------
 1 | #ifndef CREATECOSTVOLUME_TADCG_H
 2 | #define CREATECOSTVOLUME_TADCG_H
 3 | 
 4 | #include <stdio.h>
 5 | #include <opencv2/opencv.hpp>
 6 | #include <opencv2/core/cuda.hpp>
 7 | 
 8 | __global__ void createCostVolume_tadcg_kernel(cv::cuda::PtrStepi ref_global, cv::cuda::PtrStepi tgt_global, struct cost_volume_t vol, cv::cuda::PtrStepi debug, float tc, float tg, float alpha);
 9 | struct cost_volume_t createCostVolume_tadcg_gpu(cv::Mat leftim, cv::Mat rightim, int ndisp, float tc, float tg, float alpha);
10 | struct cost_volume_t createCostVolume_tadcg(cv::Mat leftim, cv::Mat rightim, int ndisp, float tc, float tg, float alpha);
11 | 
12 | #endif // CREATECOSTVOLUME_TADCG_H
13 | 


--------------------------------------------------------------------------------
/timer.cpp:
--------------------------------------------------------------------------------
 1 | #include "timer.h"
 2 | #include <stdio.h>
 3 | 
 4 | struct timespec check_timer(const char* str, struct timespec* ts){
 5 | 	// Stopwatch helper: stores the current CLOCK_REALTIME time in *ts. When str is non-NULL the time
 6 | 	// since the previous value of *ts is printed and returned; a NULL str only resets and returns 0.
 7 | 	struct timespec oldtime = *ts;
 8 | 	// update ts
 9 | 	clock_gettime(CLOCK_REALTIME, ts);
10 | 	// elapsed time; starts at zero so the value returned after a pure reset is well defined
11 | 	struct timespec diff;
12 | 	diff.tv_sec = 0;
13 | 	diff.tv_nsec = 0;
14 | 	if(str != NULL){
15 | 		diff.tv_sec = ts->tv_sec - oldtime.tv_sec;
16 | 		diff.tv_nsec = ts->tv_nsec - oldtime.tv_nsec;
17 | 		// correct the values if we measured over an integer second break:
18 | 		if(diff.tv_nsec < 0){
19 | 			diff.tv_sec--;
20 | 			diff.tv_nsec += 1000000000;
21 | 		}
22 | 		printf("%s:%ds %.3fms\n",str,(int)diff.tv_sec,diff.tv_nsec/1000000.);
23 | 	}
24 | 	return diff;
25 | }
26 | 


--------------------------------------------------------------------------------
/costVolumeFilter_jointBilateral.h:
--------------------------------------------------------------------------------
 1 | #ifndef COSTVOLUMEFILTER_JOINTBILATERAL_H
 2 | #define COSTVOLUMEFILTER_JOINTBILATERAL_H
 3 | 
 4 | #include <opencv2/opencv.hpp>
 5 | #include "cost_volume.h"
 6 | #include "timer.h"
 7 | 
 8 | __global__ void costVolumeFilter_jointBilateral_kernel(struct cost_volume_t vol, int* guide_global, int inter_win_padding, float* output, int ksize, float sigma_c, float sigma_s);
 9 | void costVolumeFilter_jointBilateral_gpu(struct cost_volume_t& cost_volume, cv::Mat guide, int ksize, float sigma_c, float sigma_s);
10 | //void jointBilateralFilter(cv::Mat& srcim, cv::Mat& guideim, cv::Mat& dst, int kernelSize, float sigma_color, float sigma_space);
11 | 
12 | void costVolumeFilter_jointBilateral(struct cost_volume_t& cost_volume, cv::Mat guide, int kernelSize, float sigma_color, float sigma_space);
13 | 
14 | #endif // COSTVOLUMEFILTER_JOINTBILATERAL_H
15 | 


--------------------------------------------------------------------------------
/makefile:
--------------------------------------------------------------------------------
 1 | CFLAGS=`pkg-config opencv --cflags`
 2 | LDFLAGS=`pkg-config opencv --libs` -L/usr/local/cuda-7.5/targets/x86_64/lib -lnpps -lnppi -lnppc
 3 | 
 4 | .PHONY: all
 5 | all: cost_volume asw
 6 | 
 7 | gpu_volume: cost_volume.cu
 8 | 	nvcc $(CFLAGS) $^ -o $@ $(LDFLAGS)
 9 | 
10 | cost_volume: cost_volume.o costVolumeFilter_jointBilateral.o costVolumeFilter_guided.o costVolumeFilter_box.o costVolumeMinimize.o createCostVolume_tadcg.o createCostVolume.o timer.o helper.o
11 | 	nvcc $^ -o $@ $(LDFLAGS)
12 | 
13 | %.o: %.cu %.h
14 | 	nvcc $(CFLAGS) -c $<
15 | 
16 | .PHONY: debug
17 | debug: CFLAGS+= -g -G
18 | debug: cost_volume
19 | 
20 | .PHONY: run
21 | run: cost_volume
22 | 	./cost_volume
23 | 
24 | # the old implementation, still faster on some hardware
25 | asw: asw.cu
26 | 	nvcc `pkg-config opencv --cflags` $< `pkg-config opencv --libs` -o $@
27 | 	# example for running the old version:
28 | 	# ./asw l.png r.png 64 5 50
29 | 
30 | .PHONY: clean
31 | clean: # -f: a missing artifact (e.g. asw was never built) must not fail the target
32 | 	rm -f asw cost_volume *.o
33 | 
34 | 


--------------------------------------------------------------------------------
/LICENSE.md:
--------------------------------------------------------------------------------
 1 | MIT License
 2 | 
 3 | Copyright (c) 2016 Ryan Beethe
 4 | 
 5 | Permission is hereby granted, free of charge, to any person obtaining a copy
 6 | of this software and associated documentation files (the "Software"), to deal
 7 | in the Software without restriction, including without limitation the rights
 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 


--------------------------------------------------------------------------------
/costVolumeFilter_box.cu:
--------------------------------------------------------------------------------
 1 | #include "costVolumeFilter_box.h"
 2 | #include "helper.h"
 3 | #include "opencv2/ximgproc/edge_filter.hpp"
 4 | #include "opencv2/cudafilters.hpp"
 5 | #include <npp.h>
 6 | 
 7 | using namespace std;
 8 | using namespace cv;
 9 | 
10 | void costVolumeFilter_box_gpu(struct cost_volume_t& vol, int ksize){
11 | 	// Box-filters every disparity slice of a device-resident volume with NPP, then swaps the
12 | 	// filtered copy into vol and releases the previous device buffer.
13 | 	const int rows = vol.nrows;
14 | 	const int cols = vol.ncols;
15 | 	const int pitchElems = vol.stride;
16 | 	const int sliceElems = rows*pitchElems;
17 | 
18 | 	// device buffer that receives the filtered slices
19 | 	float* d_filtered;
20 | 	cudaMalloc(&d_filtered, vol.ndisp*rows*pitchElems*sizeof(float));
21 | 
22 | 	struct timespec timer;
23 | 	check_timer(NULL,&timer);
24 | 
25 | 	// these NPP descriptors are identical for every slice
26 | 	const int pitchBytes   = pitchElems*sizeof(float);
27 | 	const NppiSize extent  = {cols , rows };
28 | 	const NppiSize window  = {ksize , ksize };
29 | 	const NppiPoint origin = {0 , 0 };
30 | 	const NppiPoint center = {ksize/2 , ksize/2 };
31 | 
32 | 	// filter the volume one disparity slice at a time
33 | 	for(int d = 0; d < vol.ndisp; d++){
34 | 		// source and destination share the same pitch; borders are replicated
35 | 		nppiFilterBoxBorder_32f_C1R(
36 | 			vol.volume + d*sliceElems, pitchBytes,
37 | 			extent, origin,
38 | 			d_filtered + d*sliceElems, pitchBytes,
39 | 			extent, window, center, NPP_BORDER_REPLICATE);
40 | 	}
41 | 
42 | 	check_timer("costVolumeFilter_box_gpu time",&timer);
43 | 
44 | 	// drop the old buffer and hand the new one to the caller's struct
45 | 	cudaFree(vol.volume);
46 | 	vol.volume = d_filtered;
47 | }
48 | 
49 | 
50 | void costVolumeFilter_box(struct cost_volume_t& cost_volume, int kernelSize){
51 | 	// CPU version: box-filters each disparity slice of an unpadded host volume with
52 | 	// boxFilter and replaces cost_volume.volume with the filtered copy.
53 | 	const int rows = cost_volume.nrows;
54 | 	const int cols = cost_volume.ncols;
55 | 	const int sliceElems = rows*cols;
56 | 	// the filter is not run in place, so the result goes to a second buffer
57 | 	float* filtered = (float*)malloc(rows*cols*cost_volume.ndisp*sizeof(float));
58 | 
59 | 	struct timespec timer;
60 | 	check_timer(NULL,&timer);
61 | 	const Size window(kernelSize,kernelSize);
62 | 	for(int d = 0; d < cost_volume.ndisp; d++){
63 | 		Mat src(rows,cols,CV_32F,cost_volume.volume + sliceElems*d);
64 | 		Mat dst(rows,cols,CV_32F,filtered + sliceElems*d);
65 | 		boxFilter(src, dst, -1, window);
66 | 	}
67 | 
68 | 	check_timer("costVolumeFilter_box time",&timer);
69 | 
70 | 	// the old buffer is no longer needed; the struct now points at the filtered one
71 | 	free(cost_volume.volume);
72 | 	cost_volume.volume = filtered;
73 | }
74 | 
75 | 


--------------------------------------------------------------------------------
/kerneltest.cu:
--------------------------------------------------------------------------------
  1 | #include <stdio.h>
  2 | #include <stdlib.h>
  3 | #include <time.h>
  4 | 
  5 | struct timespec check_timer(const char* str, struct timespec* ts){
  6 | 	// Stopwatch helper: stores the current CLOCK_REALTIME time in *ts. When str is non-NULL the time
  7 | 	// since the previous value of *ts is printed and returned; a NULL str only resets and returns 0.
  8 | 	struct timespec oldtime = *ts;
  9 | 	// update ts
 10 | 	clock_gettime(CLOCK_REALTIME, ts);
 11 | 	// elapsed time; starts at zero so the value returned after a pure reset is well defined
 12 | 	struct timespec diff;
 13 | 	diff.tv_sec = 0;
 14 | 	diff.tv_nsec = 0;
 15 | 	if(str != NULL){
 16 | 		diff.tv_sec = ts->tv_sec - oldtime.tv_sec;
 17 | 		diff.tv_nsec = ts->tv_nsec - oldtime.tv_nsec;
 18 | 		// correct the values if we measured over an integer second break:
 19 | 		if(diff.tv_nsec < 0){
 20 | 			diff.tv_sec--;
 21 | 			diff.tv_nsec += 1000000000;
 22 | 		}
 23 | 		printf("%s:%ds %dns\n",str,(int)diff.tv_sec,(int)diff.tv_nsec);
 24 | 	}
 25 | 	return diff;
 26 | }
 27 |  
 28 | 
 29 | // Device code: element-wise C = A + B over the first N entries, one thread per element
 30 | __global__ void VecAdd(float* A, float* B, float* C, int N)
 31 | {
 32 |     const int idx = threadIdx.x + blockIdx.x * blockDim.x;
 33 |     if (idx >= N) return;   // the last block may overhang the arrays
 34 |     C[idx] = A[idx] + B[idx];
 35 | }
 36 |             
 37 | // Host code
 38 | int main()
 39 | {
 40 | 	// Times each stage of a one-billion-element vector add: host init, copy in, kernel, copy out
 41 | 	struct timespec timer;
 42 |     int N = 1000000000;
 43 |     size_t size = N * sizeof(float);
 44 | 
 45 |     // Allocate input vectors h_A and h_B in host memory
 46 |     float* h_A = (float*)malloc(size);
 47 |     float* h_B = (float*)malloc(size);
 48 |     float* h_C = (float*)malloc(size);
 49 |     if(h_A == NULL || h_B == NULL || h_C == NULL){
 50 |         printf("host allocation failed\n");
 51 |         return 1;
 52 |     }
 53 | 
 54 | 	check_timer(NULL,&timer);
 55 |     // Initialize input vectors
 56 |     for(int i = 0; i < N; i++){
 57 |     	h_A[i] = i;
 58 |     	h_B[i] = N - i;
 59 |     }
 60 | 	check_timer("Time to initialize",&timer);
 61 | 
 62 |     // Allocate vectors in device memory
 63 |     float* d_A;
 64 |     cudaMalloc(&d_A, size);
 65 |     float* d_B;
 66 |     cudaMalloc(&d_B, size);
 67 |     float* d_C;
 68 |     cudaMalloc(&d_C, size);
 69 | 
 70 |     // Copy vectors from host memory to device memory
 71 | 	check_timer(NULL,&timer);
 72 |     cudaMemcpy(d_A, h_A, size, cudaMemcpyHostToDevice);
 73 |     cudaMemcpy(d_B, h_B, size, cudaMemcpyHostToDevice);
 74 | 	check_timer("Time to copy to device",&timer);
 75 | 
 76 |     // Invoke kernel
 77 |     int threadsPerBlock = 256;
 78 |     int blocksPerGrid = (N + threadsPerBlock - 1) / threadsPerBlock;
 79 | 	check_timer(NULL,&timer);
 80 |     VecAdd<<<blocksPerGrid, threadsPerBlock>>>(d_A, d_B, d_C, N);
 81 |     cudaDeviceSynchronize(); // the launch returns immediately; wait so the timer covers execution
 82 | 	check_timer("Time to execute kernel",&timer);
 83 | 
 84 |     // Copy result from device memory to host memory (h_C receives the result)
 85 | 	check_timer(NULL,&timer);
 86 |     cudaMemcpy(h_C, d_C, size, cudaMemcpyDeviceToHost);
 87 | 	check_timer("Time to copy back to host",&timer);
 88 | 
 89 |     // Free device memory
 90 |     cudaFree(d_A);
 91 |     cudaFree(d_B);
 92 |     cudaFree(d_C);
 93 | 
 94 |     int errors = 0;
 95 |     for(int i = 0; i < N; i++){
 96 |     	if(h_C[i] != N){
 97 |     		errors ++;
 98 |     	}
 99 |     }
100 |     printf("checking done, errors = %d\n", errors);
101 | 
102 |     // Free host memory
103 |     free(h_A);
104 |     free(h_B);
105 |     free(h_C);
106 | }


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | # Adaptive Support Weight (ASW) correspondence matching
 2 | 
 3 | This git project is an open-source CUDA implementation of the algorithm described in "Adaptive support-weight approach for correspondence" by Kuk-Jin Yoon and In So Kweon, which is the basis for the most effective local reasoning stereo vision algorithms produced today.  The latest effort is based on the articles "Fast cost-volume filtering for visual correspondence and beyond" by Rhemann et al and "Secrets of adaptive support weight for stereo vision for local stereo matching" by Hosni et al.
 4 | 
 5 | 
 6 | ## Goals and Motivations
 7 | 
 8 | To my knowledge, there is no other open-source implementation of ASW, although it was developed in 2005.  Freely-available computer vision libraries (such as OpenCV or Nvidia VisionWorks) do not offer ASW, either in CPU form or a GPU accelerated form.  The goal of this project is to implement a GPU-accelerated ASW algorithm which can ultimately be contributed to the OpenCV library.  A Free and Open Source Software (FOSS) implementation of ASW would empower those in industry with a more powerful stereo matching algorithm, and it would empower those in research with a quicker starting point for testing modifications to ASW.
 9 | 
10 | ## Important Branches
11 | 
12 | 1. **master**:  This git repo has several different attempts at optimizing the `asw.cu` cuda kernel.  The master branch has a mostly-stable snapshot of the cost-volume implementation of the ASW algorithm.  Note that there is a known memory issue with the `createCostVolume_kernel()` function in createCostVolume.cu, a bug which is fixed on the cost_volume branch but hasn't been backported.
13 | 
14 | 2. **cost_volume**: The effort to use the cost-volume approach ("Fast cost-volume filtering for visual correspondence and beyond" by Rhemann et al) is developed on this branch.  Currently, the conversion from using a custom `struct cost_volume_t` to the more useful `cv::cuda::GpuMat` object is not complete, so it is not yet on the master branch.
15 | 
16 | ## Current State
17 | 
18 | The cost-volume filtering method appears to have a higher minimum run time but ports much better to embedded hardware (1.6 sec runtime instead of 6.8 sec).
19 | 
20 | Currently the fastest attempt at GPU-acceleration exists with the old implementation (asw.cu) and can be tested by running `git checkout 9b87bdd` then `./asw l.png r.png 64 5 50`.
21 | 
22 | ## Known Issues with initial implementation (asw.cu):
23 | 
24 | 1. There seems to be some salt-and-pepper noise on the disparity output that I can't explain.
25 | 
26 | 2. Shared memory is not handled well.  Currently shared memory size limits the combinations of numbers of disparities & window sizes available, but with a good implementation the size of shared memory should not offer any limit to those factors.  In fact, an attempt at reducing shared memory exists on the `dev` branch, but it actually made the shared memory issue worse.
27 | 
28 | 3. I suspect improved performance could be achieved by tuning the auto-calculation of window size vs spatial sigma.
29 | 
30 | 4. The current pixel-matching function is a sum of absolute difference (SAD), but a complete implementation should use a truncated absolute difference of cost and gradient (TAD C+G) as in the ASW paper.  An attempt at implementing TAD C+G can be found on the `half` branch, in the cpu version of the code.  Furthermore, modifications to the matching should be included to take into account sub-pixel disparities, such as in the paper, "A pixel dissimilarity measure that is insensitive to image sampling" by Birchfield and Tomasi.  However, such modifications are a lesser priority to issues such as the shared memory handling in CUDA.
31 | 
32 | 5. Left and right disparity calculation comparison should be done.  Currently, only the left disparity is calculated.
33 | 
34 | ## Known issues with cost-volume implementation:
35 | 
36 | 1. This list isn't ready yet...
37 | 


--------------------------------------------------------------------------------
/costVolumeMinimize.cu:
--------------------------------------------------------------------------------
  1 | #include "costVolumeMinimize.h"
  2 | 
  3 | using namespace std;
  4 | using namespace cv;
  5 | 
  6 | #define ILP_LEVEL 4
  7 | // Device code
  8 | __global__ void costVolumeMinimize_kernel(struct cost_volume_t vol, unsigned char* output){
  9 | 	// Winner-take-all: each thread scans every disparity for ILP_LEVEL vertically adjacent pixels
 10 | 	// and writes the index of the cheapest one to output (ncols bytes per row, no padding).
 11 | 	int gx = blockIdx.x*blockDim.x + threadIdx.x;
 12 | 	int gy = (blockIdx.y*blockDim.y + threadIdx.y)*ILP_LEVEL;
 13 | 	// only threads which land in the image participate
 14 | 	if(gy < vol.nrows && gx < vol.ncols){
 15 | 
 16 | 		// this will store the disp val of the lowest cost
 17 | 		int mindisp[ILP_LEVEL];
 18 | 		float mincost[ILP_LEVEL];
 19 | #pragma unroll
 20 | 		for(int ilp = 0; ilp < ILP_LEVEL; ilp++){
 21 | 			// arbitrary large number; disparity 0 is the defined fallback if no cost beats it
 22 | 			mincost[ilp] = 1e6f;
 23 | 			mindisp[ilp] = 0;
 24 | 		}
 25 | 
 26 | 		// now go through each disparity
 27 | 		for(int disp = 0; disp < vol.ndisp; disp ++){
 28 | 			float cost[ILP_LEVEL];
 29 | #pragma unroll
 30 | 			for(int ilp = 0; ilp < ILP_LEVEL; ilp++){
 31 | 				// rows past the bottom edge get a cost that can never win
 32 | 				cost[ilp] = 1e6f;
 33 | 				if(gy + ilp < vol.nrows){
 34 | 					cost[ilp] = vol.volume[vol.stride*vol.nrows*disp + vol.stride*(gy+ilp) + gx];
 35 | 				}
 36 | 			}
 37 | #pragma unroll
 38 | 			for(int ilp = 0; ilp < ILP_LEVEL; ilp++){
 39 | 				if(cost[ilp] < mincost[ilp]){
 40 | 					mincost[ilp] = cost[ilp];
 41 | 					mindisp[ilp] = disp;
 42 | 				}
 43 | 			}
 44 | 			// no barrier: threads share no data here, and __syncthreads() in this divergent branch is undefined
 45 | 		}
 46 | 		// write the resulting minimum to the output
 47 | #pragma unroll
 48 | 		for(int ilp = 0; ilp < ILP_LEVEL; ilp++){
 49 | 			if(gy + ilp < vol.nrows){
 50 | 				output[vol.ncols*(gy+ilp) + gx] = mindisp[ilp];
 51 | 			}
 52 | 		}
 53 | 	}
 54 | }
 55 | 
 56 | void costVolumeMinimize_gpu(struct cost_volume_t cost_volume, Mat& outim){
 57 | 	// GPU winner-take-all: runs costVolumeMinimize_kernel on the given volume and copies the
 58 | 	// resulting 8-bit disparity map into outim, which is (re)created here as CV_8U.
 59 | 	const int rows = cost_volume.nrows;
 60 | 	const int cols = cost_volume.ncols;
 61 | 	const size_t mapBytes = rows*cols*sizeof(unsigned char);
 62 | 	outim = Mat::zeros(rows,cols,CV_8U);
 63 | 
 64 | 	// device-side buffer for the disparity map (not cleared before the kernel runs)
 65 | 	unsigned char* d_map;
 66 | 	cudaMalloc(&d_map, mapBytes);
 67 | 
 68 | 	// blocks are 128 threads wide so one row of uchar writes spans 128 bytes;
 69 | 	// each thread handles ILP_LEVEL rows, hence the extra division in the grid height
 70 | 	dim3 block(128,1);
 71 | 	dim3 grid(cols/block.x+1, rows/block.y/ILP_LEVEL+1);
 72 | 
 73 | 	// time the kernel from launch through completion
 74 | 	struct timespec timer;
 75 | 	check_timer(NULL,&timer);
 76 | 	costVolumeMinimize_kernel<<<grid, block>>>(cost_volume, d_map);
 77 | 	cudaDeviceSynchronize();
 78 | 	check_timer("costVolumeMinimize_gpu time",&timer);
 79 | 	gpu_perror("costVolumeMinimize_kernel");
 80 | 
 81 | 	// bring the map back into the host image
 82 | 	cudaMemcpy(outim.data, d_map, mapBytes, cudaMemcpyDeviceToHost);
 83 | 
 84 | 	// release the temporary device buffer
 85 | 	cudaFree(d_map);
 86 | }
 87 | 
 88 | void costVolumeMinimize(struct cost_volume_t cost_volume, Mat& outim){
 89 | 	// CPU winner-take-all over an unpadded host volume: outim(row,col) becomes the
 90 | 	// disparity with the smallest cost among those searched for that pixel.
 91 | 	const int rows = cost_volume.nrows;
 92 | 	const int cols = cost_volume.ncols;
 93 | 	const int sliceElems = rows*cols;
 94 | 	outim = Mat::zeros(rows,cols,CV_8U);
 95 | 	for(int c = 0; c < cols; c++){
 96 | 		// only disparities below both ndisp and the column index are searched
 97 | 		const int dispEnd = min(cost_volume.ndisp,c);
 98 | 		for(int r = 0; r < rows; r++){
 99 | 			const float* pix = cost_volume.volume + cols*r + c;
100 | 			int best = 0;
101 | 			float bestCost = pix[0];
102 | 			for(int d = 1; d < dispEnd; d++){
103 | 				if(pix[sliceElems*d] < bestCost){
104 | 					bestCost = pix[sliceElems*d];
105 | 					best = d;
106 | 				}
107 | 			}
108 | 			outim.data[cols*r + c] = (unsigned char)best;
109 | 		}
110 | 	}
111 | }
112 | 
113 | 


--------------------------------------------------------------------------------
/cost_volume.cu:
--------------------------------------------------------------------------------
  1 | #include <stdio.h>
  2 | #include <opencv2/opencv.hpp>
  3 | #include <stdlib.h>
  4 | #include <stdint.h>
  5 | #include <inttypes.h>
  6 | #include "timer.h"
  7 | #include "createCostVolume.h"
  8 | #include "createCostVolume_tadcg.h"
  9 | #include "costVolumeMinimize.h"
 10 | #include "costVolumeFilter_jointBilateral.h"
 11 | #include "costVolumeFilter_guided.h"
 12 | #include "costVolumeFilter_box.h"
 13 | #include "cost_volume.h"
 14 | 
 15 | using namespace std;
 16 | using namespace cv;
 17 | 
 18 | struct cost_volume_t get_gpu_volume(struct cost_volume_t vin){
 19 | 	// Downloads a (possibly row-padded) device volume and returns a host copy whose rows are
 20 | 	// packed tightly (stride == ncols); the returned buffer comes from malloc.
 21 | 	const size_t paddedBytes = vin.stride*vin.nrows*vin.ndisp*sizeof(float);
 22 | 	float* staged = (float*)malloc(paddedBytes);
 23 | 	cudaMemcpy(staged, vin.volume, paddedBytes, cudaMemcpyDeviceToHost);
 24 | 	struct cost_volume_t packed;
 25 | 	packed.nrows  = vin.nrows;
 26 | 	packed.ncols  = vin.ncols;
 27 | 	packed.ndisp  = vin.ndisp;
 28 | 	packed.stride = vin.ncols;
 29 | 	packed.volume = (float*)malloc(packed.ncols*packed.nrows*packed.ndisp*sizeof(float));
 30 | 	for(int d = 0; d < packed.ndisp; d++){ // drop the padding: keep only the first ncols floats of each row
 31 | 		for(int r = 0; r < packed.nrows; r++){
 32 | 			for(int c = 0; c < packed.ncols; c++){
 33 | 				packed.volume[packed.nrows*packed.ncols*d + packed.ncols*r + c] = staged[vin.nrows*vin.stride*d + vin.stride*r + c];
 34 | 			}
 35 | 		}
 36 | 	}
 37 | 	free(staged);
 38 | 	return packed;
 39 | }
 40 | 
 41 | void viewSlices(struct cost_volume_t& cost_volume, int first, int last){
 42 | 	// Shows slices first..last (inclusive) one at a time, each rescaled to [0,1]; pressing 'q' stops early.
 43 | 	int nrows = cost_volume.nrows;
 44 | 	int stride = cost_volume.stride;
 45 | 	float* vin = cost_volume.volume;
 46 | 	// a negative last counts back from the end: -1 selects the final slice (ndisp - 1)
 47 | 	if(last < 0){ last += cost_volume.ndisp; }
 48 | 	for(int disp = first; disp <= last; disp++){
 49 | 		printf("\n%d\n",disp);
 50 | 		Mat slicein(nrows,stride,CV_32F,&(vin[nrows*stride*disp]));
 51 | 		double m,M; minMaxLoc(slicein,&m,&M);
 52 | 		printf("min,Max of slice = %f,%f\n",m,M);
 53 | 		printf("slice rows,cols: %d,%d\n",slicein.rows,slicein.cols);
 54 | 		Mat temp = (slicein - m)/(M-m);
 55 | 		imshow("window",temp); if((char)waitKey(0)=='q') break;
 56 | 	}
 57 | }
 58 | 
 59 | int main(int argc, char** argv){
 60 | 	// Demo driver: builds a TAD C+G cost volume on the GPU and on the CPU, smooths each with the
 61 | 	// joint bilateral filter, picks the cheapest disparity per pixel and shows the results.
 62 | 	cudaDeviceReset();
 63 | 	double s_sigma, c_sigma; // spatial and intensity (color) sigmas
 64 | 	int ksize;               // size of bilateral kernel
 65 | 	int ndisp;               // number of disparities to check
 66 | 	Mat l_im, r_im;          // input images
 67 | 	// argv[1]..argv[6] are all read below, so fewer than 7 entries means "use the defaults"
 68 | 	if(argc < 7){
 69 | 		printf("usage: %s <left image> <right image> <num disparities> <kernel size> <spacial sigma> <color sigma>\n\n",argv[0]);
 70 | 		printf("... for now, using defaults (l.png r.png 64 15 5 50)\n");
 71 | 		l_im = imread("l.png");
 72 | 		r_im = imread("r.png");
 73 | 		ndisp = 64;
 74 | 		ksize = 15;
 75 | 		s_sigma = 5;
 76 | 		c_sigma = 50;
 77 | 	}else{
 78 | 		l_im = imread(argv[1]);
 79 | 		r_im = imread(argv[2]);
 80 | 		ndisp = atoi(argv[3]);
 81 | 		ksize = atoi(argv[4]);
 82 | 		s_sigma = atof(argv[5]);
 83 | 		c_sigma = atof(argv[6]);
 84 | 	}
 85 | 	if(l_im.empty() || r_im.empty()){ // imread yields an empty Mat when a file cannot be read
 86 | 		printf("could not read the input images\n");
 87 | 		return 1;
 88 | 	}
 89 | 	printf("ndisp,ksize,s_sigma,c_sigma: %d,%d,%.3f,%.3f\n",ndisp,ksize,s_sigma,c_sigma);
 90 | 	Mat out,out_gpu;
 91 | 	//struct cost_volume_t gpu_volume = createCostVolume_gpu(l_im, r_im, 64);
 92 | 	struct cost_volume_t gpu_volume = createCostVolume_tadcg_gpu(l_im, r_im, ndisp,20,90,.9);
 93 | 	costVolumeFilter_jointBilateral_gpu(gpu_volume, l_im, ksize, c_sigma, s_sigma);
 94 | 	//costVolumeFilter_guided_gpu(gpu_volume, l_im, ksize, c_sigma);
 95 | 	//costVolumeFilter_box_gpu(gpu_volume, ksize);
 96 | 	costVolumeMinimize_gpu(gpu_volume, out_gpu);
 97 | 
 98 | 	//struct cost_volume_t cpu_volume = get_gpu_volume(gpu_volume);
 99 | 	//costVolumeFilter_guided(cpu_volume,l_im,ksize,c_sigma);
100 | 	//costVolumeMinimize(cpu_volume,out_gpu);
101 | 
102 | 	struct cost_volume_t ref_volume = createCostVolume_tadcg(l_im,r_im,ndisp,20,90,.9);
103 | 	//struct cost_volume_t ref_volume2 = createCostVolume(l_im,r_im,64);
104 | 	costVolumeFilter_jointBilateral(ref_volume, l_im, ksize, c_sigma, s_sigma);
105 | 	//costVolumeBoxFilter(ref_volume,ksize);
106 | 	//costVolumeFilter_guided(ref_volume,l_im,ksize,c_sigma);
107 | 	//costVolumeFilter_guided(ref_volume2,l_im,ksize,c_sigma);
108 | 	costVolumeMinimize(ref_volume, out);
109 | 	//costVolumeMinimize(ref_volume2, out_gpu);
110 | 	//viewSlices(cpu_volume,0,12);
111 | 	//viewSlices(ref_volume,0,10);
112 | 	// costVolumeFilter_box(cpu_volume,ksize);
113 | 	int show = 1;
114 | 	if(show){
115 | 		printf("l_im\n"); imshow("window",l_im); waitKey(0);
116 | 		printf("cpu\n");  imshow("window",out); waitKey(0);
117 | 		printf("gpu\n");  imshow("window",out_gpu); waitKey(0);
118 | 		Mat py_im = imread("tadcg.png"); if(!py_im.empty()){ printf("python\n");  imshow("window",py_im); waitKey(0); } // optional reference image
119 | 	}
120 | }
121 | 
122 | 


--------------------------------------------------------------------------------
/createCostVolume.cu:
--------------------------------------------------------------------------------
  1 | #include "cost_volume.h"
  2 | #include "createCostVolume.h"
  3 | #include "timer.h"
  4 | #include "helper.h"
  5 | 
  6 | using namespace std;
  7 | using namespace cv;
  8 | 
  9 | // Device code
 10 | __global__ void createCostVolume_kernel(int* ref_global, int* tgt_global, struct cost_volume_t vol, int* debug){
 11 | 	// Fills vol with the SAD between ref(gx,gy) and tgt(gx-disp,gy) for every disparity. Expects dynamic
 12 | 	// shared memory holding blockDim.y rows of (blockDim.x + ndisp - 1) ints; debug is not used.
 13 | 	int gx = blockIdx.x*blockDim.x + threadIdx.x;
 14 | 	int gy = blockIdx.y*blockDim.y + threadIdx.y;
 15 | 	int width = blockDim.x + vol.ndisp - 1; // target pixels staged per block row
 16 | 	extern __shared__ int tgt_data[]; // contains relevant tgt image data
 17 | 	// copy target image global memory into shared memory (all threads must participate)
 18 | 	for(int i = 0; i < width; i += blockDim.x){
 19 | 		int scol = threadIdx.x + i;        // column inside this thread row's slice of tgt_data
 20 | 		int tcol = gx - (vol.ndisp-1) + i; // matching column in the target image
 21 | 		// stay inside this shared row (an unchecked last pass would spill into the next row) and inside the image
 22 | 		if(scol < width && gy < vol.nrows && tcol >= 0 && tcol < vol.ncols){
 23 | 			tgt_data[width*threadIdx.y + scol] = tgt_global[vol.ncols*gy + tcol];
 24 | 		}
 25 | 	}
 26 | 	// one block-wide barrier, reached by every thread, before anyone reads tgt_data
 27 | 	__syncthreads();
 28 | 
 29 | 	// now only threads which land in the image participate
 30 | 	if(gy < vol.nrows && gx < vol.ncols){
 31 | 		// get reference pixel from global memory
 32 | 		int ref = ref_global[vol.ncols*gy + gx];
 33 | 
 34 | 		// pull out channel data from reference pixel (brought in as an int)
 35 | 		int rr,rg,rb;
 36 | 		rr = (ref&0x000000FF) >> 0;
 37 | 		rb = (ref&0x0000FF00) >> 8;
 38 | 		rg = (ref&0x00FF0000) >> 16;
 39 | 
 40 | 		// now go through each disparity
 41 | 		for(int disp = 0; disp < vol.ndisp; disp ++){
 42 | 			float cost;
 43 | 			// check if this disp has a pixel in the tgt image
 44 | 			if( gx - disp >= 0){
 45 | 				// read tgt pixel from shared memory
 46 | 				int tgt = tgt_data[width*threadIdx.y + (vol.ndisp-1) + threadIdx.x - disp];
 47 | 				// separate channel data
 48 | 				int tr,tg,tb;
 49 | 				tr = (tgt&0x000000FF) >> 0;
 50 | 				tb = (tgt&0x0000FF00) >> 8;
 51 | 				tg = (tgt&0x00FF0000) >> 16;
 52 | 				// using SAD for aggregate cost function
 53 | 				cost = abs(rr - tr) + abs(rb-tb) + abs(rg-tg);
 54 | 			}else{
 55 | 				// these values of the cost volume don't correspond to two real pixels, so make the cost high
 56 | 				cost = 9999;
 57 | 			}
 58 | 			// now write the cost to the actual cost_volume (no barrier here: this branch is divergent)
 59 | 			vol.volume[vol.stride*vol.nrows*disp + vol.stride*gy + gx] = cost;
 60 | 		}
 61 | 	}
 62 | }
 63 | 
 64 | // Host wrapper: uploads both images and launches createCostVolume_kernel to build a SAD
 65 | // cost volume in device memory.
 66 | //   leftim, rightim : BGR images with identical rows, cols and channels (converted to RGBA here)
 67 | //   ndisp           : number of disparities to evaluate
 68 | // Returns a cost_volume_t whose volume pointer is device memory; the pointer is NULL when the
 69 | // inputs are rejected or a device allocation fails.
 70 | struct cost_volume_t createCostVolume_gpu(Mat leftim, Mat rightim, int ndisp){
 71 | 	int nrows = leftim.rows;
 72 | 	int ncols = leftim.cols;
 73 | 	// find stride so that rows in global memory align to 128-byte boundaries
 74 | 	int boundary = 128/sizeof(float);
 75 | 	int stride = ncols + (boundary - ncols%boundary)%boundary;
 76 | 	// init struct cost_volume_t object; the volume pointer is filled in after allocation
 77 | 	struct cost_volume_t cost_volume = {NULL,nrows,ncols,ndisp,stride};
 78 | 
 79 | 	// both images are copied and indexed with the left image's dimensions, so they must agree
 80 | 	if(leftim.cols != rightim.cols || leftim.rows != rightim.rows || leftim.channels() != rightim.channels()){
 81 | 		printf("ERROR: left and right images in createCostVolume_gpu do not have matching rows and cols and channels\n");
 82 | 		return cost_volume;
 83 | 	}
 84 | 
 85 | 	// convert BGR images to RGBA so the kernel can read each pixel as one int
 86 | 	cvtColor(leftim,leftim,CV_BGR2RGBA);
 87 | 	cvtColor(rightim,rightim,CV_BGR2RGBA);
 88 | 	size_t im_bytes = (size_t)4*nrows*ncols*sizeof(unsigned char);
 89 | 
 90 | 	// allocate gpu memory for the cost volume and both images
 91 | 	float* volume_gpu = NULL;
 92 | 	unsigned char* d_im_l = NULL;
 93 | 	unsigned char* d_im_r = NULL;
 94 | 	if(cudaMalloc(&volume_gpu,(size_t)nrows*ndisp*stride*sizeof(float)) != cudaSuccess ||
 95 | 	   cudaMalloc(&d_im_l, im_bytes) != cudaSuccess ||
 96 | 	   cudaMalloc(&d_im_r, im_bytes) != cudaSuccess){
 97 | 		printf("ERROR: in createCostVolume_gpu, cudaMalloc failed\n");
 98 | 		// cudaFree(NULL) is a no-op, so this releases whatever was allocated
 99 | 		cudaFree(volume_gpu);
100 | 		cudaFree(d_im_l);
101 | 		cudaFree(d_im_r);
102 | 		return cost_volume;
103 | 	}
104 | 	cost_volume.volume = volume_gpu;
105 | 
106 | 	// copy both images to the GPU
107 | 	cudaMemcpy(d_im_l, leftim.data, im_bytes, cudaMemcpyHostToDevice);
108 | 	cudaMemcpy(d_im_r, rightim.data, im_bytes, cudaMemcpyHostToDevice);
109 | 
110 | 	// settings for the kernel
111 | 	// should be 32-threads wide to ensure 128-byte block global reads
112 | 	dim3 threadsPerBlock(32,4);
113 | 	dim3 blocksPerGrid(ncols/threadsPerBlock.x+1,nrows/threadsPerBlock.y+1);
114 | 	int tgt_shared_mem = (threadsPerBlock.x+ndisp-1)*threadsPerBlock.y*sizeof(int);
115 | 	// call the kernel; its debug argument is never accessed, so no debug buffer is allocated
116 | 	struct timespec timer;
117 | 	check_timer(NULL,&timer);
118 | 	createCostVolume_kernel<<<blocksPerGrid, threadsPerBlock, tgt_shared_mem>>>((int*)d_im_l, (int*)d_im_r, cost_volume, (int*)NULL);
119 | 	cudaDeviceSynchronize();
120 | 	check_timer("cost_volume_gpu time",&timer);
121 | 	gpu_perror("createCostVolume_kernel");
122 | 
123 | 	// cleanup the temporary image memory
124 | 	cudaFree(d_im_l);
125 | 	cudaFree(d_im_r);
126 | 
127 | 	return cost_volume;
128 | }
124 | 
125 | // CPU reference: builds a SAD cost volume from two interleaved 8-bit images.
126 | //   leftim, rightim : images with identical rows, cols and channel count
127 | //   ndisp           : number of disparities to evaluate
128 | // Returns a cost_volume_t with a malloc'd host volume (stride == ncols) organised as ndisp
129 | // slices of nrows x ncols floats. Entries without a matching target pixel hold 9999. If the
130 | // images do not match, an error is printed and every entry is left at 9999; if the allocation
131 | // fails the volume pointer is NULL.
132 | struct cost_volume_t createCostVolume(Mat leftim, Mat rightim,int ndisp){
133 | 	int nchans = leftim.channels();
134 | 	int nrows = leftim.rows;
135 | 	int ncols = leftim.cols;
136 | 	int stride = ncols;
137 | 	float* volume = (float*)malloc(ncols*nrows*nchans*ndisp*sizeof(float));
138 | 	// init struct cost_volume_t object
139 | 	struct cost_volume_t cost_volume = {volume,nrows,ncols,ndisp,stride};
140 | 
141 | 	if(volume == NULL){
142 | 		printf("ERROR: in createCostVolume, could not allocate the cost volume\n");
143 | 		return cost_volume;
144 | 	}
145 | 
146 | 	// init values to very large numbers
147 | 	// the reason for this is that some regions near volume edges won't be dealt with
148 | 	// (done before validation so that the error path never hands back uninitialized memory)
149 | 	for( int i = 0; i < ncols*nrows*nchans*ndisp; i++){
150 | 		// arbitrary large number
151 | 		volume[i] = 9999;
152 | 	}
153 | 
154 | 	// make sure images are the same size; any single mismatch is an error
155 | 	if(leftim.cols != rightim.cols || leftim.rows != rightim.rows || leftim.channels() != rightim.channels()){
156 | 		printf("ERROR: left and right images in createCostVolume do not have matching rows and cols and channels\n");
157 | 		return cost_volume;
158 | 	}
159 | 
160 | 	struct timespec timer;
161 | 	check_timer(NULL,&timer);
162 | 
163 | 	unsigned char* left =  (unsigned char*)leftim.data;
164 | 	unsigned char* right = (unsigned char*)rightim.data;
165 | 
166 | 	// organization will be ndisp images of rows of pixels
167 | 	// iterate over the whole image, row by row so that memory is walked in storage order
168 | 	for(int row = 0; row < nrows; row++){
169 | 		for(int col = 0; col < ncols; col++){
170 | 			// iterate over the disparities that still land inside the right image
171 | 			for(int disp = 0; disp < min(ndisp,col+1); disp++){
172 | 				// get difference over channels
173 | 				float diff = 0;
174 | 				for(int chan = 0; chan < nchans; chan++){
175 | 					diff += abs(left[(ncols*row + col)*nchans + chan] - right[(ncols*row + col - disp)*nchans + chan]);
176 | 				}
177 | 				volume[nrows*ncols*disp + ncols*row + col] = diff;
178 | 			}
179 | 		}
180 | 	}
181 | 	check_timer("createCostVolume",&timer);
182 | 	return cost_volume;
183 | }
170 | 
171 | 


--------------------------------------------------------------------------------
/costVolumeFilter_guided.cu:
--------------------------------------------------------------------------------
  1 | #include "costVolumeFilter_guided.h"
  2 | #include "helper.h"
  3 | #include "opencv2/ximgproc/edge_filter.hpp"
  4 | #include "opencv2/cudaarithm.hpp"
  5 | #include "opencv2/cudafilters.hpp"
  6 | #include <npp.h>
  7 | 
  8 | 
  9 | using namespace std;
 10 | using namespace cv;
 11 | 
 12 | __global__ void costVolumeFilter_guided_kernel(struct cost_volume_t vol, int* guide_global, int inter_win_padding, float* output, float sigma_s, float sigma_c, int ksize){ // placeholder kernel: no parameter is used
 13 | 	// intentionally empty: launching this kernel performs no work
 14 | }
 15 | 
 16 | // Box-filters a single-channel float image on the GPU with NPP (ksize x ksize window,
 17 | // replicated borders). A non-success NPP status is reported instead of being ignored.
 18 | static void guided_boxFilter(const cuda::GpuMat& src, cuda::GpuMat& dst, int ncols, int nrows, int ksize){
 19 | 	NppiSize size    = {ncols , nrows };
 20 | 	NppiSize kernel  = {ksize , ksize };
 21 | 	NppiPoint offset = {0 , 0 };
 22 | 	NppiPoint anchor = {ksize/2 , ksize/2 };
 23 | 
 24 | 	NppStatus status = nppiFilterBoxBorder_32f_C1R(
 25 | 		(const float*)src.data, (int)src.step,
 26 | 		size, offset,
 27 | 		(float*)dst.data, (int)dst.step,
 28 | 		size, kernel, anchor, NPP_BORDER_REPLICATE);
 29 | 	if(status != NPP_SUCCESS){
 30 | 		printf("ERROR: in costVolumeFilter_guided_gpu, nppiFilterBoxBorder_32f_C1R returned %d\n",(int)status);
 31 | 	}
 32 | }
 33 | 
 34 | // Guided-filters every disparity slice of a cost volume in place, using box filters for the
 35 | // local means: q = mean(a)*I + mean(b), a = (mean(I*p) - mean(I)*mean(p)) / (var(I) + eps),
 36 | // b = mean(p) - a*mean(I).
 37 | //   vol   : cost volume; volume is wrapped as GpuMat slices with a row pitch of stride floats
 38 | //   guide : BGR guide image, converted to single-channel float here
 39 | //   ksize : side length of the box window
 40 | //   eps   : regularisation added to the guide variance
 41 | void costVolumeFilter_guided_gpu(struct cost_volume_t& vol, Mat guide, int ksize, float eps){
 42 | 	int nrows = vol.nrows;
 43 | 	int ncols = vol.ncols;
 44 | 	int ndisp = vol.ndisp;
 45 | 	int stride = vol.stride;
 46 | 
 47 | 	struct timespec timer;
 48 | 
 49 | 	cuda::GpuMat I;
 50 | 
 51 | 	// copy guide image to grayscale
 52 | 	cvtColor(guide,guide,CV_BGR2GRAY);
 53 | 	// convert to float
 54 | 	guide.convertTo(guide,CV_32FC1);
 55 | 	// copy guide image to GPU
 56 | 	I.upload(guide);
 57 | 	// set up working memory
 58 | 	cuda::GpuMat mean(I.rows,I.cols,I.type());
 59 | 	cuda::GpuMat var(I.rows,I.cols,I.type());
 60 | 	cuda::GpuMat workmem(I.rows,I.cols,I.type());
 61 | 	cuda::GpuMat workmem2(I.rows,I.cols,I.type());
 62 | 	cuda::GpuMat workmem3(I.rows,I.cols,I.type());
 63 | 
 64 | 	cuda::GpuMat p_(I.rows,I.cols,I.type());
 65 | 	cuda::GpuMat p_mean(I.rows,I.cols,I.type());
 66 | 	cuda::GpuMat a(I.rows,I.cols,I.type());
 67 | 	cuda::GpuMat a_(I.rows,I.cols,I.type());
 68 | 	cuda::GpuMat a_mean(I.rows,I.cols,I.type());
 69 | 	cuda::GpuMat a_I(I.rows,I.cols,I.type());
 70 | 	cuda::GpuMat b(I.rows,I.cols,I.type());
 71 | 	cuda::GpuMat b_(I.rows,I.cols,I.type());
 72 | 	cuda::GpuMat Ip(I.rows,I.cols,I.type());
 73 | 	cuda::GpuMat Ip_(I.rows,I.cols,I.type());
 74 | 
 75 | 	check_timer(NULL,&timer);
 76 | 
 77 | 	// pre-step 1: box filter I to get mean
 78 | 	cudaDeviceSynchronize();
 79 | 	guided_boxFilter(I, mean, ncols, nrows, ksize);
 80 | 	// pre-step 2: square I, for variance calculation
 81 | 	cudaDeviceSynchronize();
 82 | 	cuda::sqr(I,var);
 83 | 	// pre-step 3: box filter I^2
 84 | 	cudaDeviceSynchronize();
 85 | 	guided_boxFilter(var, workmem3, ncols, nrows, ksize);
 86 | 	// pre-step 4: square the mean
 87 | 	cudaDeviceSynchronize();
 88 | 	cuda::sqr(mean,workmem);
 89 | 	// pre-step 5: variance = mean(x^2) - mean(x)^2
 90 | 	cudaDeviceSynchronize();
 91 | 	cuda::subtract(workmem3, workmem, workmem2);
 92 | 	// pre-step 6: add eps to variance
 93 | 	cudaDeviceSynchronize();
 94 | 	cuda::add(workmem2, eps, var);
 95 | 
 96 | 	for(int disp = 0; disp < ndisp; disp++){
 97 | 		// p wraps this disparity's slice of the volume (no copy), so writing q below
 98 | 		// updates the cost volume in place
 99 | 		cuda::GpuMat p(Size(ncols,nrows), CV_32F, &(vol.volume[disp*nrows*stride]), stride*sizeof(float));
100 | 		// step 1: element-wise multiply I by p
101 | 		cuda::multiply(I,p,Ip);
102 | 		// step 2: box filter Ip to be Ip_
103 | 		guided_boxFilter(Ip, Ip_, ncols, nrows, ksize);
104 | 		// step 3: box filter p to be p_
105 | 		guided_boxFilter(p, p_, ncols, nrows, ksize);
106 | 		// step 4: combine p_ and mean
107 | 		cuda::multiply(p_, mean, p_mean);
108 | 		// step 5: compute Ip_ - mean*p_
109 | 		cuda::subtract(Ip_, p_mean, workmem);
110 | 		// step 6: divide by var+eps (stored as var)
111 | 		cuda::divide(workmem, var, a);
112 | 		// step 7: start calculating b with a*mean
113 | 		cuda::multiply(a, mean, a_mean);
114 | 		// step 8: b = p_ - a_mean
115 | 		cuda::subtract(p_, a_mean, b);
116 | 		// step 9: box filter a
117 | 		guided_boxFilter(a, a_, ncols, nrows, ksize);
118 | 		// step 10: box filter b
119 | 		guided_boxFilter(b, b_, ncols, nrows, ksize);
120 | 		// step 11: start to build q with a_ * I
121 | 		cuda::multiply(a_, I, a_I);
122 | 		// step 12: q = a_I + b_;
123 | 		cuda::GpuMat q = p;
124 | 		cuda::add(a_I, b_, q);
125 | 	}
126 | 
127 | 	check_timer("costVolumeFilter_guided_gpu time",&timer);
128 | 
129 | 	I.release();
130 | 	mean.release();
131 | 	var.release();
132 | 	workmem.release();
133 | }
213 | 
214 | // CPU guided filter: filters every disparity slice of a host cost volume with
215 | // ximgproc::guidedFilter and replaces vol.volume with a newly malloc'd result buffer.
216 | //   vol   : host cost volume, indexed here as ndisp slices of nrows x ncols floats
217 | //   guide : BGR guide image, converted to grayscale here
218 | //   ksize : passed to guidedFilter as its radius argument
219 | //   eps   : guided filter regularisation term
220 | // Only columns >= disp of a slice are filtered; the columns left of that keep their
221 | // original (unfiltered) values.
222 | void costVolumeFilter_guided(struct cost_volume_t& vol, Mat guide, int ksize, float eps){
223 | 	int nrows = vol.nrows;
224 | 	int ncols = vol.ncols;
225 | 	int ndisp = vol.ndisp;
226 | 	float* vin = vol.volume;
227 | 	// doesn't do in-place editing... need second float*
228 | 	float* vout = (float*)malloc(nrows*ncols*ndisp*sizeof(float));
229 | 	if(vout == NULL){
230 | 		printf("ERROR: in costVolumeFilter_guided, could not allocate the output volume\n");
231 | 		return;
232 | 	}
233 | 
234 | 	cvtColor(guide,guide,CV_BGR2GRAY);
235 | 
236 | 	struct timespec timer;
237 | 	check_timer(NULL,&timer);
238 | 
239 | 	for(int disp = 0; disp < ndisp; disp++){
240 | 		Mat slicein(nrows,ncols,CV_32F,&(vin[nrows*ncols*disp]));
241 | 		Mat sliceout(nrows,ncols,CV_32F,&(vout[nrows*ncols*disp]));
242 | 		// start from a copy of the input: the filter below only writes columns >= disp,
243 | 		// and the remaining columns must not be left as uninitialized malloc memory
244 | 		slicein.copyTo(sliceout);
245 | 		if(disp >= ncols){
246 | 			// the filtered region would be empty at this disparity
247 | 			continue;
248 | 		}
249 | 		Rect relevant;
250 | 		relevant.x = disp; relevant.width = ncols-disp;
251 | 		relevant.y = 0; relevant.height = nrows;
252 | 		ximgproc::guidedFilter(guide(relevant),slicein(relevant),sliceout(relevant),ksize,eps);
253 | 	}
254 | 
255 | 	check_timer("costVolumeFilter_guided time:",&timer);
256 | 	printf("\n");
257 | 	// free old cost_volume float*
258 | 	free(vol.volume);
259 | 	// replace with new cost_volume float*
260 | 	vol.volume = vout;
261 | }
245 | 
246 | 


--------------------------------------------------------------------------------
/createCostVolume_tadcg.cu:
--------------------------------------------------------------------------------
  1 | #include "cost_volume.h"
  2 | #include "createCostVolume_tadcg.h"
  3 | #include "timer.h"
  4 | #include "helper.h"
  5 | 
  6 | using namespace std;
  7 | using namespace cv;
  8 | 
  9 | struct two16s { // two short integers stored back to back
 10 | 	short a, b;
 11 | };
 13 | 
 14 | // Device code
 15 | // Builds a truncated absolute difference of colour + gradient (TAD C+G) cost volume.
 16 | // Launch: 2D grid of 2D blocks, one thread per reference pixel (x = column, y = row).
 17 | // Dynamic shared memory: (blockDim.x + vol.ndisp) * blockDim.y * sizeof(struct rgba_pixel) bytes.
 18 | //   ref_global, tgt_global : pitched reference / target images, read as one rgba_pixel per pixel
 19 | //   vol                    : output volume, rows of vol.stride floats, slices ordered by disparity
 20 | //   debug                  : never accessed, kept so existing launches still compile
 21 | //   tc, tg                 : truncation limits for the colour and gradient differences
 22 | //   alpha                  : weight of the colour term; the gradient term gets (1-alpha)
 23 | // NOTE: like the ref0 trick below, the zero-fill assumes sizeof(struct rgba_pixel) == sizeof(int).
 24 | __global__ void createCostVolume_tadcg_kernel(cuda::PtrStepi ref_global, cuda::PtrStepi tgt_global, struct cost_volume_t vol, cuda::PtrStepi debug, float tc, float tg, float alpha){
 25 | 	int gx = blockIdx.x*blockDim.x + threadIdx.x;
 26 | 	int gy = blockIdx.y*blockDim.y + threadIdx.y;
 27 | 
 28 | 	extern __shared__ struct rgba_pixel tgt_data[]; // contains relevant tgt image data
 29 | 
 30 | 	// each thread row caches target columns gx-ndisp .. gx for all of its threads; the extra
 31 | 	// leading column is the left neighbour needed by the gradient
 32 | 	int row_width = blockDim.x + vol.ndisp;
 33 | 	struct rgba_pixel* s_row = tgt_data + row_width*threadIdx.y;
 34 | 
 35 | 	{
 36 | 		// set global row for data transfer loop
 37 | 		struct rgba_pixel* g_row = (struct rgba_pixel*)((char*)tgt_global.data + tgt_global.step*gy);
 38 | 
 39 | 		// copy target image global memory into shared memory (all threads must participate)
 40 | 		for(int i = 0; i < row_width; i += blockDim.x){
 41 | 			int s_col = threadIdx.x + i;   // column inside the shared row
 42 | 			int g_col = gx - vol.ndisp + i; // matching column of the target image
 43 | 			if(s_col < row_width){
 44 | 				if(gy < vol.nrows && g_col >= 0 && g_col < vol.ncols){
 45 | 					s_row[s_col] = g_row[g_col];
 46 | 				}else{
 47 | 					// zero-fill entries outside the image: tgt0 reads column -1 when
 48 | 					// gx == disp and must see a black pixel (as ref0 does), not garbage
 49 | 					((int*)&s_row[s_col])[0] = 0;
 50 | 				}
 51 | 			}
 52 | 		}
 53 | 		// one barrier, reached by every thread of the block, before the cache is read
 54 | 		__syncthreads();
 55 | 	}
 56 | 
 57 | 	// now only threads which land in the image participate
 58 | 	if(gy < vol.nrows && gx < vol.ncols){
 59 | 		struct rgba_pixel ref0, ref;
 60 | 
 61 | 		{
 62 | 			struct rgba_pixel* g_row = (struct rgba_pixel*)((char*)ref_global.data + ref_global.step*gy);
 63 | 
 64 | 			// get reference pixels from global memory
 65 | 			ref = g_row[gx];
 66 | 			// ref0 is the previous pixel, for the gradient calculation
 67 | 			// casting rgba_pixel to int allows multiplying by (gx>0) which avoids a divergence opportunity
 68 | 			((int*)&ref0)[0] = (((int*)g_row)[max(gx-1,0)]) * (gx > 0);
 69 | 		}
 70 | 
 71 | 		struct rgba_pixel tgt;
 72 | 		struct rgba_pixel tgt0;
 73 | 
 74 | 		// now go through each disparity
 75 | 		for(int disp = 0; disp < vol.ndisp; disp ++){
 76 | 			float* g_row = (float*)((char*)vol.volume + (disp*vol.nrows+gy)*vol.stride*sizeof(float));
 77 | 			float cost;
 78 | 			int adc, adg;
 79 | 			// check if this disp has a pixel in the tgt image
 80 | 			if( gx - disp >= 0){
 81 | 				// read tgt pixel from shared memory
 82 | 				tgt = s_row[vol.ndisp + threadIdx.x - disp];
 83 | 				// tgt0 is for calculating the gradient
 84 | 				tgt0 = s_row[vol.ndisp-1 + threadIdx.x - disp];
 85 | 
 86 | 				// calculate absolute difference of color
 87 | 				adc = abs(ref.r - tgt.r) + abs(ref.g - tgt.g) + abs(ref.b - tgt.b);
 88 | 				// calculate absolute difference of gradient
 89 | 				adg = abs(ref.r-ref0.r - tgt.r+tgt0.r) + abs(ref.g-ref0.g - tgt.g+tgt0.g) + abs(ref.b-ref0.b - tgt.b+tgt0.b);
 90 | 
 91 | 				// calculate cost with TAD C+G
 92 | 				cost = alpha*min(tc,(float)adc)+(1-alpha)*min(tg,(float)adg);
 93 | 			}else{
 94 | 				// these values of the cost volume don't correspond to two real pixels, so make the cost high
 95 | 				cost = 9999;
 96 | 			}
 97 | 			// no barriers in this loop: the branch is not entered by every thread of the
 98 | 			// block, and shared memory is read-only from this point on
 99 | 			g_row[gx] = cost;
100 | 		}
101 | 	}
102 | }
113 | 
114 | 
115 | // Host wrapper: uploads both images and launches createCostVolume_tadcg_kernel.
116 | //   leftim, rightim : BGR images with identical rows, cols and channels (converted to RGBA here)
117 | //   ndisp           : number of disparities to evaluate
118 | //   tc, tg, alpha   : forwarded unchanged to the kernel
119 | // Returns a cost_volume_t whose volume pointer is pitched device memory (stride = pitch in
120 | // floats); the pointer is NULL when the inputs are rejected or the allocation fails.
121 | struct cost_volume_t createCostVolume_tadcg_gpu(Mat leftim, Mat rightim, int ndisp, float tc, float tg, float alpha){
122 | 	int nrows = leftim.rows;
123 | 	int ncols = leftim.cols;
124 | 	// init struct cost_volume_t object; volume and stride are filled in after allocation
125 | 	struct cost_volume_t cost_volume = {NULL,nrows,ncols,ndisp,0};
126 | 
127 | 	// the kernel bounds-checks both images with the left image's dimensions, so they must agree
128 | 	if(leftim.cols != rightim.cols || leftim.rows != rightim.rows || leftim.channels() != rightim.channels()){
129 | 		printf("ERROR: left and right images in createCostVolume_tadcg_gpu do not have matching rows and cols and channels\n");
130 | 		return cost_volume;
131 | 	}
132 | 
133 | 	// allocate gpu memory for cost volume
134 | 	size_t pitch = 0;
135 | 	float* volume_gpu = NULL;
136 | 	if(cudaMallocPitch(&volume_gpu,&pitch,ncols*sizeof(float),(size_t)ndisp*nrows) != cudaSuccess){
137 | 		printf("ERROR: in createCostVolume_tadcg_gpu, cudaMallocPitch failed\n");
138 | 		return cost_volume;
139 | 	}
140 | 	cost_volume.volume = volume_gpu;
141 | 	cost_volume.stride = pitch / sizeof(float);
142 | 
143 | 	// convert BGR images to RGBA
144 | 	cvtColor(leftim,leftim,CV_BGR2RGBA);
145 | 	cvtColor(rightim,rightim,CV_BGR2RGBA);
146 | 	// copy left image to GPU
147 | 	cuda::GpuMat d_im_l;
148 | 	d_im_l.upload(leftim);
149 | 	// copy right image to GPU
150 | 	cuda::GpuMat d_im_r;
151 | 	d_im_r.upload(rightim);
152 | 	// the kernel takes a debug image argument but never accesses it
153 | 	cuda::GpuMat d_debug(Size(ncols,nrows),CV_8UC4);
154 | 
155 | 	// settings for the kernel
156 | 	// should be 32-threads wide to ensure 128-byte block global reads
157 | 	dim3 threadsPerBlock(32,4);
158 | 	dim3 blocksPerGrid(ncols/threadsPerBlock.x+1,nrows/threadsPerBlock.y+1);
159 | 	// one cached pixel per thread column plus ndisp leading columns, for each thread row
160 | 	int tgt_shared_mem = (threadsPerBlock.x+ndisp)*threadsPerBlock.y*sizeof(struct rgba_pixel);
161 | 	// call the kernel
162 | 	struct timespec timer;
163 | 	check_timer(NULL,&timer);
164 | 	createCostVolume_tadcg_kernel<<<blocksPerGrid, threadsPerBlock, tgt_shared_mem>>>(d_im_l, d_im_r, cost_volume, d_debug, tc,tg,alpha);
165 | 	cudaDeviceSynchronize();
166 | 	check_timer("createCostVolume_tadcg_gpu time",&timer);
167 | 	gpu_perror("createCostVolume_tadcg_kernel");
168 | 
169 | 	// cleanup the temporary image memory
170 | 	d_im_l.release();
171 | 	d_im_r.release();
172 | 	d_debug.release();
173 | 
174 | 	return cost_volume;
175 | }
165 | 
166 | 
167 | struct bgr_pixel { // three unsigned bytes, declared in b, g, r order
168 | 	unsigned char b, g, r;
169 | };
172 | 
173 | // CPU reference for the truncated absolute difference of colour + gradient (TAD C+G) cost.
174 | //   leftim, rightim : 3-channel 8-bit images with identical rows and cols
175 | //   ndisp           : number of disparities to evaluate
176 | //   tc, tg          : truncation limits for the colour and gradient differences
177 | //   alpha           : weight of the colour term; the gradient term gets (1-alpha)
178 | // Returns a cost_volume_t with a malloc'd host volume (stride == ncols) organised as ndisp
179 | // slices of nrows x ncols floats. Entries without a matching target pixel hold 9999. On
180 | // invalid input an error is printed and every entry is left at 9999; if the allocation
181 | // fails the volume pointer is NULL.
182 | struct cost_volume_t createCostVolume_tadcg(Mat leftim, Mat rightim, int ndisp, float tc, float tg, float alpha){
183 | 	int nchans = leftim.channels();
184 | 	int nrows = leftim.rows;
185 | 	int ncols = leftim.cols;
186 | 	int stride = ncols;
187 | 	float* volume = (float*)malloc(ncols*nrows*nchans*ndisp*sizeof(float));
188 | 	// init struct cost_volume_t object
189 | 	struct cost_volume_t cost_volume = {volume,nrows,ncols,ndisp,stride};
190 | 
191 | 	if(volume == NULL){
192 | 		printf("ERROR: in createCostVolume_tadcg, could not allocate the cost volume\n");
193 | 		return cost_volume;
194 | 	}
195 | 
196 | 	// init values to very large numbers
197 | 	// the reason for this is that some regions near volume edges won't be dealt with
198 | 	// (done before validation so that the error path never hands back uninitialized memory)
199 | 	for( int i = 0; i < ncols*nrows*nchans*ndisp; i++){
200 | 		// arbitrary large number
201 | 		volume[i] = 9999;
202 | 	}
203 | 
204 | 	// make sure images are the same size; any single mismatch is an error
205 | 	if(leftim.cols != rightim.cols || leftim.rows != rightim.rows || leftim.channels() != rightim.channels()){
206 | 		printf("ERROR: left and right images in createCostVolume do not have matching rows and cols and channels\n");
207 | 		return cost_volume;
208 | 	}
209 | 	// the pixel data is read through struct bgr_pixel, which is exactly three bytes wide
210 | 	if(nchans != 3){
211 | 		printf("ERROR: in createCostVolume_tadcg, images must have 3 channels\n");
212 | 		return cost_volume;
213 | 	}
214 | 
215 | 	struct timespec timer;
216 | 	check_timer(NULL,&timer);
217 | 
218 | 	struct bgr_pixel* left  = (struct bgr_pixel*)leftim.data;
219 | 	struct bgr_pixel* right = (struct bgr_pixel*)rightim.data;
220 | 	struct bgr_pixel black;
221 | 	black.b = 0;
222 | 	black.g = 0;
223 | 	black.r = 0;
224 | 
225 | 	// organization will be ndisp images of rows of pixels
226 | 	// iterate over the whole image, row by row so that memory is walked in storage order
227 | 	for(int row = 0; row < nrows; row++){
228 | 		for(int col = 0; col < ncols; col++){
229 | 			struct bgr_pixel ref,ref0, tgt,tgt0;
230 | 			float cost;
231 | 			ref = left[ncols*row + col];
232 | 			// ref0 is the left neighbour, black at the image border
233 | 			ref0 = (col > 0) ? left[ncols*row + col - 1] : black;
234 | 			// iterate over the disparities that still land inside the right image
235 | 			for(int disp = 0; disp < min(ndisp,col+1); disp++){
236 | 				// get absolute difference of color and of grad
237 | 				float adc = 0;
238 | 				float adg = 0;
239 | 				tgt = right[ncols*row + col-disp];
240 | 				// tgt0 is the left neighbour of the *target* pixel: test col-disp, not col,
241 | 				// otherwise column -1 (previous row, or before the buffer) is read
242 | 				tgt0 = (col - disp > 0) ? right[ncols*row + col-disp - 1] : black;
243 | 
244 | 				// calculate absolute difference of color
245 | 				adc = abs((int)ref.r - (int)tgt.r) + abs((int)ref.g - (int)tgt.g) + abs((int)ref.b - (int)tgt.b);
246 | 
247 | 				// calculate absolute difference of gradient
248 | 				adg = abs((int)ref.r-(int)ref0.r - (int)tgt.r+(int)tgt0.r) + abs((int)ref.g-(int)ref0.g - (int)tgt.g+(int)tgt0.g) + abs((int)ref.b-(int)ref0.b - (int)tgt.b+(int)tgt0.b);
249 | 
250 | 				// calculate cost with TAD C+G
251 | 				cost = alpha*min(adc,tc) + (1-alpha)*min(adg,tg);
252 | 
253 | 				volume[nrows*ncols*disp + ncols*row + col] = cost;
254 | 			}
255 | 		}
256 | 	}
257 | 	check_timer("createCostVolume_tadcg time",&timer);
258 | 	return cost_volume;
259 | }
252 | 


--------------------------------------------------------------------------------
/costVolumeFilter_jointBilateral.cu:
--------------------------------------------------------------------------------
  1 | #include "costVolumeFilter_jointBilateral.h"
  2 | #include "opencv2/ximgproc/edge_filter.hpp"
  3 | #include "helper.h"
  4 | 
  5 | using namespace std;
  6 | using namespace cv;
  7 | 
  8 | // Joint bilateral filter of one disparity slice of a cost volume, guided by a colour image.
  9 | // Launch: 3D grid of 2D blocks; x/y cover the image (one thread per pixel), blockIdx.z selects
 10 | // the disparity slice.
 11 | // Dynamic shared memory: a guide window of (ksize+blockDim.y-1) x (ksize+blockDim.x-1) ints,
 12 | // then inter_win_padding bytes, then a slice window of the same dimensions in floats.
 13 | //   vol               : input cost volume, rows of vol.stride floats
 14 | //   guide_global      : guide image, one int per pixel, rows of vol.ncols ints
 15 | //   inter_win_padding : bytes between the two shared windows
 16 | //   output            : filtered volume, same layout as vol.volume
 17 | //   ksize             : window size (the host wrapper requires it to be odd)
 18 | //   sigma_c, sigma_s  : colour and spatial falloff parameters
 19 | __global__ void costVolumeFilter_jointBilateral_kernel(struct cost_volume_t vol, int* guide_global, int inter_win_padding, float* output, int ksize, float sigma_c, float sigma_s){
 20 | 	int gx = blockIdx.x*blockDim.x + threadIdx.x;
 21 | 	int gy = blockIdx.y*blockDim.y + threadIdx.y;
 22 | 
 23 | 	// radius of kernel
 24 | 	int krad = (ksize-1)/2;
 25 | 	// dimensions of the shared windows: the block's pixels plus a krad border on each side
 26 | 	int win_w = ksize + blockDim.x - 1;
 27 | 	int win_h = ksize + blockDim.y - 1;
 28 | 
 29 | 	extern __shared__ char shared_mem[];
 30 | 
 31 | 	// the guide is first in shared memory
 32 | 	int* guide = (int*)(&shared_mem[0]);
 33 | 	// the slice is second in shared memory
 34 | 	float* slice = (float*)&(shared_mem[win_h*win_w*sizeof(float) + inter_win_padding]);
 35 | 
 36 | 	// center pixel of guide image; threads outside the image get 0 and never store a result
 37 | 	int guide_c = 0;
 38 | 	if(gy < vol.nrows && gx < vol.ncols){
 39 | 		guide_c = guide_global[vol.ncols*gy + gx];
 40 | 	}
 41 | 	// unpack the three low bytes of the guide center pixel (the top byte is ignored)
 42 | 	int gc0 = (guide_c&0x000000FF) >> 0;
 43 | 	int gc1 = (guide_c&0x0000FF00) >> 8;
 44 | 	int gc2 = (guide_c&0x00FF0000) >> 16;
 45 | 
 46 | 	// copy both sub images to shared memory; window element (wy,wx) holds image pixel
 47 | 	// (block origin + wy - krad, block origin + wx - krad). The loop starts at 0 so that
 48 | 	// window column 0 is loaded as well.
 49 | 	for(int i = 0; i < win_w; i += blockDim.x){
 50 | 		int wx = i + threadIdx.x;
 51 | 		int ix = gx + i - krad;
 52 | 		// only threads in bounds in x dim continue to next loop
 53 | 		if(wx < win_w && ix >= 0 && ix < vol.ncols){
 54 | 			for(int j = 0; j < win_h; j += blockDim.y){
 55 | 				int wy = j + threadIdx.y;
 56 | 				int iy = gy + j - krad;
 57 | 				// only threads in bounds in y dim continue
 58 | 				if(wy < win_h && iy >= 0 && iy < vol.nrows){
 59 | 					guide[win_w*wy + wx] = guide_global[vol.ncols*iy + ix];
 60 | 					slice[win_w*wy + wx] = vol.volume[vol.nrows*vol.stride*blockIdx.z + vol.stride*iy + ix];
 61 | 				}
 62 | 			}
 63 | 		}
 64 | 	}
 65 | 	// one barrier, reached by every thread of the block, before the windows are read
 66 | 	__syncthreads();
 67 | 
 68 | 	float weight = 0;
 69 | 	float sum = 0;
 70 | 
 71 | 	// now the bilateral calculation; neighbours outside the image are skipped, which also
 72 | 	// skips every window element that was not loaded above
 73 | 	for(int i = 0; i < ksize; i++){
 74 | 		if(gx - krad + i >= 0 && gx - krad + i < vol.ncols){
 75 | 			for(int j = 0; j < ksize; j++){
 76 | 				if(gy - krad + j >= 0 && gy - krad + j < vol.nrows){
 77 | 					int   guide_p  = guide[win_w*(j+threadIdx.y) + i + threadIdx.x];
 78 | 					float slice_p  = slice[win_w*(j+threadIdx.y) + i + threadIdx.x];
 79 | 					int g0 = (guide_p&0x000000FF) >> 0;
 80 | 					int g1 = (guide_p&0x0000FF00) >> 8;
 81 | 					int g2 = (guide_p&0x00FF0000) >> 16;
 82 | 					int c_diff = abs(g0 - gc0) + abs(g1 - gc1) + abs(g2 - gc2);
 83 | 					float s = __expf( -((j-krad)*(j-krad)+(i-krad)*(i-krad)) / (sigma_s*sigma_s) );
 84 | 					float c = __expf( -(c_diff*c_diff) / (sigma_c*sigma_c));
 85 | 					weight += s*c;
 86 | 					sum += slice_p*s*c;
 87 | 				}
 88 | 			}
 89 | 		}
 90 | 	}
 91 | 
 92 | 	if(gy < vol.nrows && gx < vol.ncols){
 93 | 		// normalize the weighted sum by the sum of the weights (the center pixel always
 94 | 		// contributes a weight of 1, so weight is non-zero here)
 95 | 		output[vol.nrows*vol.stride*blockIdx.z + vol.stride*gy + gx] = sum / weight;
 96 | 	}
 97 | }
 99 | 
100 | // Host wrapper: joint-bilateral filters every slice of a device cost volume with
101 | // costVolumeFilter_jointBilateral_kernel and swaps the filtered buffer into cost_volume.
102 | //   cost_volume      : device cost volume; on success its volume pointer is replaced by the
103 | //                      filtered buffer and the old buffer is freed
104 | //   guide            : BGR guide image with the same rows/cols as the volume (converted to RGBA here)
105 | //   ksize            : window size, must be odd
106 | //   sigma_c, sigma_s : colour and spatial falloff parameters forwarded to the kernel
107 | // On any rejected input or failed allocation an error is printed and cost_volume is left untouched.
108 | void costVolumeFilter_jointBilateral_gpu(struct cost_volume_t& cost_volume, Mat guide, int ksize, float sigma_c, float sigma_s){
109 | 	int nrows = cost_volume.nrows;
110 | 	int ncols = cost_volume.ncols;
111 | 	int ndisp = cost_volume.ndisp;
112 | 	int stride = cost_volume.stride;
113 | 
114 | 	if(ksize%2 != 1){
115 | 		printf("ERROR: in costVolumeFilter_jointBilateral_gpu, ksize must be odd\n");
116 | 		return;
117 | 	}
118 | 	// the guide is copied and indexed with the volume's dimensions, so they must agree
119 | 	if(guide.rows != nrows || guide.cols != ncols){
120 | 		printf("ERROR: in costVolumeFilter_jointBilateral_gpu, guide size does not match the cost volume\n");
121 | 		return;
122 | 	}
123 | 
124 | 	// settings for the kernel
125 | 	// trying to use 32 threads-wide so the global reads are 128 bytes
126 | 	dim3 threadsPerBlock(32,16);
127 | 	dim3 blocksPerGrid(ncols/threadsPerBlock.x+1,nrows/threadsPerBlock.y+1,ndisp);
128 | 	int guide_win_rows = (ksize + threadsPerBlock.y - 1);
129 | 	int guide_win_width_bytes = (ksize + threadsPerBlock.x - 1)*sizeof(int);
130 | 	int guide_win_bytes = guide_win_rows*guide_win_width_bytes;
131 | 	// pad after the whole guide window so the slice window starts on a 256-byte offset
132 | 	int inter_window_pad = (256 - guide_win_bytes%256)%256;
133 | 	int slice_win_rows = (ksize + threadsPerBlock.y - 1);
134 | 	int slice_win_width_bytes = (ksize + threadsPerBlock.x - 1)*sizeof(float);
135 | 	int shared_size = guide_win_bytes + inter_window_pad + slice_win_rows*slice_win_width_bytes;
136 | 	// make sure the shared size fits in what a single block may use
137 | 	int device;
138 | 	cudaGetDevice(&device);
139 | 	cudaDeviceProp properties;
140 | 	cudaGetDeviceProperties(&properties, device);
141 | 	if((size_t)shared_size > properties.sharedMemPerBlock){
142 | 		printf("ERROR: in costVolumeFilter_jointBilateral_gpu, shared_size exceeds device limit\n");
143 | 		return;
144 | 	}
145 | 
146 | 	// allocate output volume (post-filtering) and the guide image on gpu
147 | 	float* d_output = NULL;
148 | 	int* d_guide = NULL;
149 | 	size_t guide_bytes = (size_t)4*nrows*ncols*sizeof(unsigned char);
150 | 	if(cudaMalloc(&d_output, (size_t)ndisp*nrows*stride*sizeof(float)) != cudaSuccess ||
151 | 	   cudaMalloc(&d_guide, guide_bytes) != cudaSuccess){
152 | 		printf("ERROR: in costVolumeFilter_jointBilateral_gpu, cudaMalloc failed\n");
153 | 		// cudaFree(NULL) is a no-op, so this releases whatever was allocated
154 | 		cudaFree(d_output);
155 | 		cudaFree(d_guide);
156 | 		return;
157 | 	}
158 | 	// copy guide image to GPU, one RGBA int per pixel
159 | 	cvtColor(guide,guide,CV_BGR2RGBA);
160 | 	cudaMemcpy(d_guide, guide.data, guide_bytes, cudaMemcpyHostToDevice);
161 | 
162 | 	// call the kernel
163 | 	struct timespec timer;
164 | 	check_timer(NULL,&timer);
165 | 	costVolumeFilter_jointBilateral_kernel<<<blocksPerGrid, threadsPerBlock, shared_size>>>(cost_volume, d_guide, inter_window_pad, d_output, ksize, sigma_c, sigma_s);
166 | 	cudaDeviceSynchronize();
167 | 	check_timer("costVolumeFilter_jointBilateral_gpu time",&timer);
168 | 	gpu_perror("costVolumeFilter_jointBilateral_kernel");
169 | 
170 | 	// the guide copy is only needed by the kernel
171 | 	cudaFree(d_guide);
172 | 
173 | 	// shuffle cost_volume pointers
174 | 	cudaFree(cost_volume.volume); // don't need the input anymore
175 | 	cost_volume.volume = d_output; // keep the output instead
176 | }
154 | 
// CPU reference joint bilateral filter.
//
// Each output pixel is the weighted mean of the src pixels in a
// kernelSize x kernelSize window (clipped at the image border). A neighbour's
// weight is exp(-r^2 / (2*sigma_space^2)) * exp(-d^2 / (2*sigma_color^2)), where
// r is its distance from the window centre and d is the sum over channels of the
// absolute guide difference between the neighbour and the centre pixel.
//
//   srcim       : single-channel CV_32F image to be filtered
//   guideim     : CV_32F guide image (any channel count), same rows/cols as srcim
//   dst         : receives the CV_32F result (may alias srcim)
//   kernelSize  : window edge length, must be odd
//   sigma_color : sigma applied to the guide difference
//   sigma_space : sigma applied to the spatial distance
//
// On invalid input an error is printed and dst is left untouched.
void jointBilateralFilter(Mat& srcim, Mat& guideim, Mat& dst, int kernelSize, float sigma_color, float sigma_space){
	// make sure images are the same size
	if(srcim.cols != guideim.cols || srcim.rows != guideim.rows){
		printf("ERROR: src and guide images in jointBilateralFilter do not have matching rows and cols\n");
		return;
	}
	if(kernelSize%2 != 1){
		printf("ERROR: kernelSize jointBilateralFilter must be odd\n");
		return;
	}
	// the pixel data is read as float below, so any other element type would be misinterpreted
	if(srcim.type() != CV_32FC1 || guideim.depth() != CV_32F){
		printf("ERROR: jointBilateralFilter expects a CV_32FC1 src and a CV_32F guide\n");
		return;
	}
	const int nrows = srcim.rows;
	const int ncols = srcim.cols;
	const int nchans = guideim.channels();
	const int win_rad = (kernelSize - 1) / 2;
	// exponent denominators are loop invariant
	const double space_denom = 2.*sigma_space*sigma_space;
	const double color_denom = 2.*sigma_color*sigma_color;

	Mat outim = Mat::zeros(nrows,ncols,CV_32F);
	// rows in the outer loop so memory is walked in storage order;
	// ptr<float>(row) honours the row step of non-continuous matrices
	for(int row = 0; row < nrows; row++){
		float* out_row = outim.ptr<float>(row);
		const float* guide_center_row = guideim.ptr<float>(row);
		const int j_begin = max(0,row-win_rad);
		const int j_end = min(nrows,row+win_rad+1);
		for(int col = 0; col < ncols; col++){
			const float* guide_center = guide_center_row + col*nchans;
			const int i_begin = max(0,col-win_rad);
			const int i_end = min(ncols,col+win_rad+1);
			double normalizing_factor = 0;
			double weighted_sum = 0;
			// iterate over the window
			for(int j = j_begin; j < j_end; j++){
				const float* src_row = srcim.ptr<float>(j);
				const float* guide_row = guideim.ptr<float>(j);
				const int y = j - row;
				for(int i = i_begin; i < i_end; i++){
					const int x = i - col;
					const int radius2 = x*x+y*y;
					const float* guide_pixel = guide_row + i*nchans;
					// spatial term
					double weight = std::exp(-radius2/space_denom);
					// guide difference summed over channels; std::fabs keeps the
					// arithmetic in floating point (plain abs may pick the int overload)
					float diff = 0;
					for(int chan = 0; chan < nchans; chan++){
						diff += std::fabs(guide_pixel[chan] - guide_center[chan]);
					}
					// color term
					weight *= std::exp(-diff*diff/color_denom);
					// accumulate
					normalizing_factor += weight;
					weighted_sum += weight*src_row[i];
				}
			}
			// the centre pixel always contributes weight 1, so the divisor is never zero
			out_row[col] = weighted_sum / normalizing_factor;
		}
	}
	outim.copyTo(dst);
	return;
}
211 | 
// Host (CPU) joint bilateral filtering of a cost volume, one disparity slice at a
// time, via ximgproc::jointBilateralFilter.
//
//   cost_volume : volume whose `volume` buffer is read as ndisp tightly packed
//                 nrows x ncols float slices; on success that buffer is released
//                 with free() and replaced by a newly malloc'ed filtered volume
//   guide       : guide image; a CV_32F copy is used to match the float slices
//   kernelSize, sigma_color, sigma_space : forwarded to the OpenCV filter
//
// If the output buffer cannot be allocated an error is printed and cost_volume
// is left untouched.
void costVolumeFilter_jointBilateral(struct cost_volume_t& cost_volume, Mat guide, int kernelSize, float sigma_color, float sigma_space){
	const int nrows = cost_volume.nrows;
	const int ncols = cost_volume.ncols;
	const int ndisp = cost_volume.ndisp;
	// element counts in size_t so large volumes do not overflow int
	const size_t slice_elems = (size_t)nrows*ncols;
	const size_t total_bytes = slice_elems*ndisp*sizeof(float);
	float* vin = cost_volume.volume;
	// doesn't do in-place editing... need second float*
	float* vout = (float*)malloc(total_bytes);
	if(vout == NULL && total_bytes > 0){
		printf("ERROR: in costVolumeFilter_jointBilateral, failed to allocate the output volume\n");
		return;
	}
	// guide must be CV_32F if the cost_volume is
	guide.convertTo(guide,CV_32F);
	struct timespec timer;
	check_timer(NULL,&timer);
	for(int disp = 0; disp < ndisp; disp++){
		Mat slicein(nrows,ncols,CV_32F,vin + slice_elems*disp);
		// header over this slice's slot in vout
		Mat slot(nrows,ncols,CV_32F,vout + slice_elems*disp);
		Mat sliceout = slot;
		//jointBilateralFilter(InputArray joint, InputArray src, OutputArray dst, int d, double sigmaColor, double sigmaSpace, int borderType=BORDER_DEFAULT)
		ximgproc::jointBilateralFilter(guide, slicein, sliceout, kernelSize, sigma_color, sigma_space);
		// if the filter re-allocated its destination, copy the result back into vout
		if(sliceout.data != slot.data){
			sliceout.copyTo(slot);
		}
	}
	check_timer("costVolumeFilter_jointBilateral time",&timer);
	printf("\n");
	// free old cost_volume float*
	free(cost_volume.volume);
	// replace with new cost_volume float*
	cost_volume.volume = vout;
}
236 | 


--------------------------------------------------------------------------------
/asw.cu:
--------------------------------------------------------------------------------
  1 | #include <stdio.h>
  2 | #include <opencv2/opencv.hpp>
  3 | #include <stdlib.h>
  4 | #include <time.h>
  5 | #include <stdint.h>
  6 | #include <inttypes.h>
  7 | 
  8 | 
  9 | #define MAX_DISP 1000
 10 | #define NCHANS 3
 11 | #define IDP_LVL 4
 12 | 
 13 | #define BLOCK_SIZE 16
 14 | 
 15 | 
 16 | // timing utility
 17 | struct timespec check_timer(const char* str, struct timespec* ts){
 18 | 	struct timespec oldtime;
 19 | 	// copy old time over
 20 | 	oldtime.tv_nsec = ts->tv_nsec;
 21 | 	oldtime.tv_sec = ts->tv_sec;
 22 | 	// update ts
 23 | 	clock_gettime(CLOCK_REALTIME, ts);
 24 | 	// print old time
 25 | 	int diffsec;
 26 | 	int diffnsec;
 27 | 	if(str != NULL){
 28 | 		diffsec =  ts->tv_sec - oldtime.tv_sec;
 29 | 		diffnsec =  ts->tv_nsec - oldtime.tv_nsec;
 30 | 		// correct the values if we measured over an integer second break:
 31 | 		if(diffnsec < 0){
 32 | 			diffsec--;
 33 | 			diffnsec += 1000000000;
 34 | 		}
 35 | 		printf("%s:%ds %dns\n",str,diffsec,diffnsec);
 36 | 	}
 37 | 	return (struct timespec) {diffsec, diffnsec};
 38 | }
 39 | 
 40 | // little bitty kernel to initialize blocks of device memory
 41 | __global__ void gpu_memset(unsigned char* start, unsigned char value, int length){
 42 | 	int tx = threadIdx.x;
 43 | 	int bx = blockIdx.x;
 44 | 	int gx = bx*blockDim.x + tx;
 45 | 	if(gx < length){
 46 | 		start[gx] = value;
 47 | 	}
 48 | }
 49 | 
 50 | // teeny little helper function
 51 | void gpu_perror(char* input){
 52 | 	printf("%s: %s\n", input, cudaGetErrorString(cudaGetLastError()));
 53 | }
 54 | 
 55 | 
 56 | // In the future it may be useful to bring a whole line of pixels into local memory...
 57 | // ... from shared memory, and then do everything that needs to be dones with that line...
 58 | // ... for a given pixel, before moving to the next row...
 59 | // ... or maybe it would be better to use a single location of spacial sigma.  Oh I like that. 
 60 | 
 61 | 
 62 | // now let's try just running with 32 threads, but each 32 thread warp stretches horizontally across a row
 63 | // we want to reduced shared memory accesses and increase IDP, so with this in mind we will...
 64 | // ... read in an entire line (blockdim.x + 2*win_rad + ndisp) into shared memory
 65 | // ... ok hold on...
 66 | // ... first strategy: horizontal line of threads, each thread calculates n pixels below it as well
 67 | // ... second strategy: vertical line of threads, each thread calculates n pixels to the right of it
 68 | // ... the first strategy is good for IDP, and can reduce shared reads because each pixel below needs
 69 | // ... to access a given pixel once for each ndisp (in general)
 70 | // ... second strategy is good because each thread needs to access each pixel for differenct ndisp
 71 | // ... the IDP-vertically strategy has neater boundary conditions in the disp direction, because all IDPs share same disp
 72 | // ... the IDP-horizontally strategy has neater boundary conditions in the window direction because all IDPs share same 
 73 | // ... but in either case I think the GPU usage should be the same since every thread should run into the same problems.
 74 | // ... therefore I don't think that should be a deciding factor.
 75 | // My next concern is that we will not generate enough blocks with this strategy to fill a large GPU
 76 | // ... obviously that's a problem with the old kernel writeup as well.
 77 | // I guess for now we will just work on minimizing shared memory accessess.
 78 | __global__ void asw_kernel2(unsigned char* global_left, unsigned char* global_right, unsigned char* output, unsigned char* debug,
 79 | 	int nrows, int ncols, int ndisp, int win_size, int win_rad, float s_sigma, float c_sigma)
 80 | {
 81 | 	extern __shared__ unsigned char ref[]; // the beginning of the shared memory block
 82 | 	unsigned char* tgt = &ref[(win_size + blockDim.x)*NCHANS*IDP_LVL]; // tgt follows a block big enough for reference
 83 | 	// if we start somewhere with a middle row of the image, then we can use a shared variable to share center values
 84 | 	// __shared__ unsigned char ref_center_pix[NCHANS*IDP_LVL];
 85 | 	// __shared__ unsigned char tgt_center_pix[MAX_DISP*NCHANS*IDP_LVL]
 86 | 
 87 | 	int ref_width_bytes = (2*win_rad+blockDim.x)*NCHANS*sizeof(unsigned char);
 88 | 	int tgt_width_bytes = (ndisp+2*win_rad+blockDim.x)*NCHANS*sizeof(unsigned char);
 89 | 
 90 | 	// we are sticking with local memory for the sums of the disparities, because since we only access that occasionally I don't think the latency is a problem
 91 | 	float costs[MAX_DISP*IDP_LVL];
 92 | 	float weights[MAX_DISP*IDP_LVL];
 93 | 
 94 | 	// other things should fall into register memory
 95 | 
 96 | 
 97 | }
 98 | 
 99 | // Device code
100 | __global__ void asw_kernel(unsigned char* global_left, unsigned char* global_right, unsigned char* output, unsigned char* debug,
101 | 	int nrows, int ncols, int ndisp, int win_size, int win_rad, float s_sigma, float c_sigma)
102 | 	{
103 | 	// ok, we're going to try a block size of 32 ( 32x32 = 1024, max threads per block )
104 | 	// no... we'll use 16x16 since there's problems with shared memory with two images
105 | 	// each thread will calculate the full asw stereo output for a single pixel
106 | 	// shared memory will contain all the input image data for the full block of asw calculations
107 | 	// texture memory will contain the spacial filter, eventually
108 | 	extern __shared__ unsigned char ref[]; // contains both left and right image data
109 | 
110 | 	// get the size of the sub-images that we are considering
111 | 	// reference window
112 | 	int ref_width_bytes = (2*win_rad+blockDim.x)*NCHANS*sizeof(unsigned char);
113 | 	// int ref_rows = (2*win_rad+blockDim.y);
114 | 	// target window
115 | 	int tgt_width_bytes = (ndisp+2*win_rad+blockDim.x)*NCHANS*sizeof(unsigned char);
116 | 	// int tgt_rows = (2*win_rad+blockDim.y);
117 | 
118 | 	unsigned char* tgt = (unsigned char*)(&ref[ ref_width_bytes*(2*win_rad+blockDim.y) ]); // tgt image, reference to somwhere of shared allocated memory
119 | 
120 | 	float ref_c_factor;
121 | 	float tgt_c_factor;
122 | 	float s_factor;
123 | 	float ref_c2p_diff;
124 | 	float tgt_c2p_diff;
125 | 	float ref2tgt_diff;
126 | 	// variables for keeping track of the output
127 | 	float weight;
128 | 	float cost;
129 | 	float min_cost;
130 | 	unsigned char min_cost_index;
131 | 	unsigned char ref_center_pix[3];
132 | 	unsigned char tgt_center_pix[3];
133 | 	unsigned char ref_pix[3];
134 | 	unsigned char tgt_pix[3];
135 | 
136 | 	int disp;
137 | 	int win_x;
138 | 	int win_y;
139 | 	int dx;
140 | 	int tgt_x;
141 | 
142 | 	// get identity of this thread (changing these to #define's)
143 | 
144 | 	#define tx (threadIdx.x)
145 | 	#define ty (threadIdx.y)
146 | 	#define bx (blockIdx.x + 5)
147 | 	#define by (blockIdx.y + 1)
148 | 	#define gx (bx*blockDim.x + tx)
149 | 	#define gy (by*blockDim.y + ty)
150 | 
151 | 	// copy relevant subimages to shared memory
152 | 	// TODO: additional boundary checks on this data
153 | 	// TODO: better division technique
154 | 	// TODO: investigate where syncthreads() needs to be called for best performance
155 | 	// we can copy the 24-bit image over 32 bits at a time
156 | 	// except then I don't know how to deal with the edge case
157 | 	// so let's just do one character at a time
158 | 	// starting with reference image: (4 deleted register variables)	
159 | 	// int xblocks = (ref_width_bytes / blockDim.x + 1);
160 | 	// int yblocks = ((2*win_rad+blockDim.y) / blockDim.y + 1);
161 | 	// int xstart = ((bx*blockDim.x - win_rad)*NCHANS);
162 | 	// int ystart = (gy - win_rad);
163 | 	// 29 variables here
164 | 	for(win_x = 0; win_x < (ref_width_bytes / blockDim.x + 1); win_x++){
165 | 		// int x_idx = (win_x*blockDim.x + tx);
166 | 		// int g_x_idx = (((bx*blockDim.x - win_rad)*NCHANS) + win_x*blockDim.x + tx);
167 | 		if((win_x*blockDim.x + tx) < ref_width_bytes){
168 | 			for(win_y = 0; win_y < ((2*win_rad+blockDim.y) / blockDim.y + 1); win_y++){
169 | 				// int y_idx = (win_y*blockDim.y + ty);
170 | 				// int g_y_idx = ((gy - win_rad) + win_y*blockDim.y);
171 | 				if((win_y*blockDim.y + ty) < (2*win_rad+blockDim.y)){
172 | 					// copy bytes (not pixels) from global_left into reference image
173 | 					ref[(win_y*blockDim.y + ty)*ref_width_bytes + (win_x*blockDim.x + tx)] = global_left[((gy - win_rad) + win_y*blockDim.y)*ncols*NCHANS + (((bx*blockDim.x - win_rad)*NCHANS) + win_x*blockDim.x + tx)];
174 | 					// copy into the debug image (only made to work with a single block of threads)
175 | 					// debug[((gy - win_rad) + win_y*blockDim.y)*ncols*NCHANS + (((bx*blockDim.x - win_rad)*NCHANS) + win_x*blockDim.x + tx)]  = ref[(win_y*blockDim.y + ty)*ref_width_bytes + (win_x*blockDim.x + tx)];
176 | 				}
177 | 			}
178 | 		}
179 | 	}
180 | 	// then to the target image: (4 deleted register variables)
181 | 	// xblocks = (tgt_width_bytes / blockDim.x + 1);
182 | 	// yblocks = ((2*win_rad+blockDim.y) / blockDim.y + 1);
183 | 	// xstart = ((bx*blockDim.x - win_rad - ndisp)*NCHANS);
184 | 	// ystart = (gy - win_rad);
185 | 	for(win_x = 0; win_x < (tgt_width_bytes / blockDim.x + 1); win_x++){
186 | 		// int x_idx = (win_x*blockDim.x + tx);
187 | 		// int g_x_idx = (((bx*blockDim.x - win_rad - ndisp)*NCHANS) + win_x*blockDim.x + tx);
188 | 		if((win_x*blockDim.x + tx) < tgt_width_bytes){
189 | 			for(win_y = 0; win_y < ((2*win_rad+blockDim.y) / blockDim.y + 1); win_y++){
190 | 				// int y_idx = (win_y*blockDim.y + ty);
191 | 				// int g_y_idx = ((gy - win_rad) + win_y*blockDim.y);
192 | 				if((win_y*blockDim.y + ty) < (2*win_rad+blockDim.y)){
193 | 					// copy bytes (not pixels) from global_left into reference image
194 | 					tgt[(win_y*blockDim.y + ty)*tgt_width_bytes + (win_x*blockDim.x + tx)] = global_right[((gy - win_rad) + win_y*blockDim.y)*ncols*NCHANS + (((bx*blockDim.x - win_rad - ndisp)*NCHANS) + win_x*blockDim.x + tx)];
195 | 					// copy into the debug image (only made to work with a single block of threads)
196 | 					// debug[((gy - win_rad) + win_y*blockDim.y)*ncols*NCHANS + (((bx*blockDim.x - win_rad - ndisp)*NCHANS) + win_x*blockDim.x + tx)]  = tgt[(win_y*blockDim.y + ty)*tgt_width_bytes + (win_x*blockDim.x + tx)];
197 | 				}
198 | 			}
199 | 		}
200 | 	}
201 | 
202 | 	__syncthreads();
203 | 
204 | 	// get a pointer to the ref_center_pix, which is constant for any given thread
205 | 	ref_center_pix[0] = ref[(win_rad + ty)*ref_width_bytes + (win_rad + tx)*NCHANS + 0];
206 | 	ref_center_pix[1] = ref[(win_rad + ty)*ref_width_bytes + (win_rad + tx)*NCHANS + 1];
207 | 	ref_center_pix[2] = ref[(win_rad + ty)*ref_width_bytes + (win_rad + tx)*NCHANS + 2];
208 | 	// initialize min_cost to some arbitrarily large value
209 | 	min_cost = 1e12;
210 | 	// initialize min_cost_index to 0
211 | 	min_cost_index = 0;
212 | 
213 | 	// for each value of ndisp	
214 | 	for(disp = 0; disp < ndisp; disp++){
215 | 		// get a pointer to the tgt_center_pix, which is constant for each disp
216 | 		tgt_center_pix[0] = tgt[(win_rad + ty)*tgt_width_bytes + (ndisp + win_rad + tx - disp)*NCHANS + 0];
217 | 		tgt_center_pix[1] = tgt[(win_rad + ty)*tgt_width_bytes + (ndisp + win_rad + tx - disp)*NCHANS + 1];
218 | 		tgt_center_pix[2] = tgt[(win_rad + ty)*tgt_width_bytes + (ndisp + win_rad + tx - disp)*NCHANS + 2];
219 | 		// reset weight and cost
220 | 		weight = 0;
221 | 		cost = 0;
222 | 		// in each row in the window:
223 | 		for(win_x = 0; win_x < win_size; win_x++){
224 | 			// locate the pixel in the ref image (deleted this var)
225 | 			dx = win_x + tx;
226 | 			// locate the pixel in the tgt image (deleted this var)
227 | 			tgt_x = ndisp + win_x + tx - disp;
228 | 			// find the window-center to pixel x-distance (deleted this var)
229 | 			// int dx = win_x - win_rad;
230 | 			// in each column of the window:
231 | 			for(win_y = 0; win_y < win_size; win_y++){
232 | 				// locate the pixel in the ref image (deleted this var)
233 | 				// int ref_y = win_y + ty;
234 | 				// find the window-center to pixel y-distance (deleted this var)
235 | 				// int dy = win_y - win_rad;
236 | 				// get the radius^2 value (deleted this var)
237 | 				// float radius_2 = (win_x-win_rad)*(win_x-win_rad) + (win_y-win_rad)*(win_y-win_rad);
238 | 				// get the s_factor for this particular window location
239 | 				s_factor = __expf(-((win_x-win_rad)*(win_x-win_rad) + (win_y-win_rad)*(win_y-win_rad))/(2.*s_sigma*s_sigma));
240 | 				// store tgt and ref pixels in register memory
241 | 				ref_pix[0] = ref[(win_y+ty)*ref_width_bytes + (dx)*NCHANS + 0];
242 | 				ref_pix[1] = ref[(win_y+ty)*ref_width_bytes + (dx)*NCHANS + 1];
243 | 				ref_pix[2] = ref[(win_y+ty)*ref_width_bytes + (dx)*NCHANS + 2];
244 | 				tgt_pix[0] = tgt[(win_y+ty)*tgt_width_bytes + (tgt_x)*NCHANS + 0];
245 | 				tgt_pix[1] = tgt[(win_y+ty)*tgt_width_bytes + (tgt_x)*NCHANS + 1];
246 | 				tgt_pix[2] = tgt[(win_y+ty)*tgt_width_bytes + (tgt_x)*NCHANS + 2];
247 | 				// get the center-to-pixel and overall color differences (organized together for IDP)
248 | 				ref_c2p_diff = abs(ref_center_pix[0] - ref_pix[0]);
249 | 				tgt_c2p_diff = abs(tgt_center_pix[0] - ref_pix[0]);
250 | 				ref2tgt_diff = abs(ref_pix[0] - tgt_pix[0]);
251 | 				ref_c2p_diff += abs(ref_center_pix[1] - ref_pix[1]);
252 | 				tgt_c2p_diff += abs(tgt_center_pix[1] - ref_pix[1]);
253 | 				ref2tgt_diff+= abs(ref_pix[1] - tgt_pix[1]);
254 | 				ref_c2p_diff += abs(ref_center_pix[2] - ref_pix[2]);
255 | 				tgt_c2p_diff += abs(tgt_center_pix[2] - ref_pix[2]);
256 | 				ref2tgt_diff+= abs(ref_pix[2] - tgt_pix[2]);
257 | 				// get the c_factors
258 | 				ref_c_factor = __expf(-ref_c2p_diff*ref_c2p_diff/(2.*c_sigma*c_sigma));
259 | 				tgt_c_factor = __expf(-tgt_c2p_diff*tgt_c2p_diff/(2.*c_sigma*c_sigma));
260 | 				// calulate the pix_weight (this variable has been done away with to increase ILP)
261 | 				// pix_weight = s_factor*ref_c_factor*tgt_c_factor;
262 | 				// add in the cost
263 | 				cost += s_factor*ref_c_factor*tgt_c_factor*ref2tgt_diff;
264 | 				// add in the weight
265 | 				weight += s_factor*ref_c_factor*tgt_c_factor;
266 | 			}
267 | 		}
268 | 		// now that the window is done, compare this cost (after normalizing) to min_cost
269 | 		if( min_cost > cost / weight){
270 | 			min_cost = cost / weight;
271 | 			min_cost_index = disp;
272 | 		}
273 | 		__syncthreads();
274 | 	}
275 | 
276 | 	// set the output to the index of min_cost
277 | 	output[gy*ncols + gx] = min_cost_index;
278 | }
279 | 
// Runs the adaptive-support-weight stereo kernel on a rectified image pair and
// displays the resulting disparity-index image in a window.
//
//   im_l, im_r : left/right images; must be non-empty, of equal size, and have
//                NCHANS channels (the kernel indexes pixels with NCHANS)
//   ndisp      : number of disparities to search, 1..256 (the kernel stores the
//                winning index in an unsigned char)
//   s_sigma    : spatial sigma; the support window is 3*s_sigma wide, rounded up
//                to an odd size so it is centered on the pixel
//   c_sigma    : color sigma
//
// Returns 0 on completion and 1 on invalid input or allocation failure.
int asw(cv::Mat im_l, cv::Mat im_r, int ndisp, int s_sigma, int c_sigma){
	// validate the scalar arguments before deriving anything from them
	if(ndisp < 1 || ndisp > 256){
		printf("Error: ndisp must be between 1 and 256\n");
		return 1;
	}
	if(s_sigma < 1 || c_sigma < 1){
		printf("Error: s_sigma and c_sigma must be positive\n");
		return 1;
	}

	// window size and win_rad; the kernel's shared tiles have a halo of win_rad on
	// each side, so an even window would read one row/column past the tile
	int win_size = 3*s_sigma;
	if(win_size%2 == 0){
		win_size++;
	}
	int win_rad = (win_size - 1)/2;
	// declare timer
	struct timespec timer;

	// check that the images were loaded and have matching dimensions
	if(im_l.empty() || im_r.empty()){
		printf("Error: im_l or im_r is empty\n");
		return 1;
	}
	if(im_l.rows != im_r.rows){
		printf("Error: im_l and im_r do not have matching row count\n");
		return 1;
	}
	if(im_l.cols != im_r.cols){
		printf("Error: im_l and im_r do not have matching col count\n");
		return 1;
	}
	if(im_l.channels() != im_r.channels()){
		printf("Error: im_l and im_r do not have matching channel count\n");
		return 1;
	}
	if(im_l.channels() != NCHANS){
		printf("Error: im_l and im_r must have %d channels\n", NCHANS);
		return 1;
	}

	// set easy-access variables for number of rows, cols, and chans
	int nrows = im_l.rows;
	int ncols = im_l.cols;
	int nchans = im_l.channels();
	const size_t gray_bytes = (size_t)nrows*ncols*sizeof(unsigned char);
	const size_t image_bytes = gray_bytes*nchans;

	// check the shared memory requirement before allocating anything, so the
	// early exit below cannot leak buffers
	size_t reference_window_size = (2*win_rad+BLOCK_SIZE)*(2*win_rad+BLOCK_SIZE)*sizeof(unsigned char)*nchans;
	size_t target_window_size = (2*win_rad+ndisp+BLOCK_SIZE)*(BLOCK_SIZE+2*win_rad)*sizeof(unsigned char)*nchans;
	size_t shared_size = target_window_size+reference_window_size;
	printf("win_size %d win_rad %d ndisp %d shared size = %zu\n",win_size,win_rad,ndisp,shared_size);
	if(shared_size > 47000){
		printf("FATAL ERROR: shared_size for asw_kernel exceeds the device limit (48 kB), exiting\n");
		return 1;
	}

	// host and device buffers; all start as NULL so a partial failure can be unwound
	unsigned char* d_im_l = NULL;
	unsigned char* d_im_r = NULL;
	unsigned char* d_out = NULL;
	unsigned char* d_debug = NULL;
	unsigned char* out = (unsigned char*)malloc(gray_bytes);
	unsigned char* debug = (unsigned char*)malloc(image_bytes);

	bool ok = (out != NULL && debug != NULL);
	if(!ok){
		printf("Error: failed to allocate host buffers\n");
	}
	if(ok){
		cudaError_t err = cudaMalloc(&d_im_l,image_bytes);
		if(err == cudaSuccess) err = cudaMalloc(&d_im_r,image_bytes);
		if(err == cudaSuccess) err = cudaMalloc(&d_out,gray_bytes);
		if(err == cudaSuccess) err = cudaMalloc(&d_debug,image_bytes);
		//copy the host input data to the device
		if(err == cudaSuccess) err = cudaMemcpy(d_im_l, im_l.data, image_bytes, cudaMemcpyHostToDevice);
		if(err == cudaSuccess) err = cudaMemcpy(d_im_r, im_r.data, image_bytes, cudaMemcpyHostToDevice);
		if(err != cudaSuccess){
			printf("Error: device setup failed: %s\n", cudaGetErrorString(err));
			ok = false;
		}
	}
	if(!ok){
		cudaFree(d_im_l);
		cudaFree(d_im_r);
		cudaFree(d_out);
		cudaFree(d_debug);
		free(out);
		free(debug);
		return 1;
	}

	// initialize the outputs (otherwise changes persist between runtimes, hard to debug):
	int tpb = 1024;
	int bpg = gray_bytes / tpb + 1;
	printf("zeroing output images\n");
	gpu_memset<<<bpg, tpb>>>(d_out,25,gray_bytes);
	gpu_perror("memset1");
	gpu_memset<<<nchans*bpg, tpb>>>(d_debug,25,image_bytes);
	gpu_perror("memset2");

	// call the asw_kernel
	// NOTE: the grid size is fixed and pairs with the block-index offsets applied
	// inside asw_kernel (blockIdx.x + 5, blockIdx.y + 1)
	dim3 blocksPerGrid(22,21);
	dim3 threadsPerBlock(BLOCK_SIZE,BLOCK_SIZE);
	// __global__ void asw_kernel(unsigned char* global_left, unsigned char* global_right, unsigned char* output, unsigned char* debug,
	//		int nrows, int ncols, int ndisp, int win_size, int win_rad, float s_sigma, float c_sigma)
	printf("starting asw kernel\n");
	check_timer(NULL,&timer);
	// the third integer argument is the disparity count, not the channel count
	asw_kernel<<<blocksPerGrid, threadsPerBlock, shared_size>>>(d_im_l, d_im_r, d_out, d_debug,
		nrows, ncols, ndisp, win_size, win_rad, s_sigma, c_sigma);
	cudaDeviceSynchronize();
	check_timer("asw kernel finished",&timer);
	gpu_perror("asw_kernel");

	// copy the device output data to the host
	check_timer(NULL,&timer);
	cudaMemcpy(out, d_out, gray_bytes, cudaMemcpyDeviceToHost);
	cudaMemcpy(debug, d_debug, image_bytes, cudaMemcpyDeviceToHost);
	check_timer("copying complete",&timer);

	// make an image and view it:
	cv::Mat im_out(nrows,ncols,CV_8UC1,out);
	cv::Mat im_debug(nrows,ncols,CV_8UC3,debug);
	// cv::rectangle(im_debug,cv::Point(16*15,16*15),cv::Point(16*16,16*16),cv::Scalar(255,0,0));
	// cv::rectangle(im_out,cv::Point(16*15,16*15),cv::Point(16*16,16*16),127);
	// cv::imshow("window",im_debug);
	// cv::waitKey(0);
	cv::imshow("window",im_out);
	cv::waitKey(0);

	// cleanup memory
	cudaFree(d_im_l);
	cudaFree(d_im_r);
	cudaFree(d_out);
	cudaFree(d_debug);
	free(out);
	free(debug);

	return 0;
}
407 | 
// Entry point: asw <left image> <right image> <num disparities> <spacial sigma> <color sigma>
// Loads both images, parses the three integer arguments and runs asw().
// Returns 1 on bad usage, unreadable images or non-positive numeric arguments;
// otherwise returns the result of asw().
int main(int argc, char** argv){
	if(argc < 6){
		printf("usage: %s <left image> <right image> <num disparities> <spacial sigma> <color sigma>\n",argv[0]);
		return 1;
	}

	// input images; imread returns an empty matrix when a file cannot be read
	cv::Mat im_l = cv::imread(argv[1]);
	if(im_l.empty()){
		printf("Error: could not read left image %s\n",argv[1]);
		return 1;
	}
	cv::Mat im_r = cv::imread(argv[2]);
	if(im_r.empty()){
		printf("Error: could not read right image %s\n",argv[2]);
		return 1;
	}

	// number of disparities to check, then spacial and color sigmas
	int ndisp = atoi(argv[3]);
	int s_sigma = atoi(argv[4]);
	int c_sigma = atoi(argv[5]);
	// atoi yields 0 for non-numeric text, which this check also rejects
	if(ndisp <= 0 || s_sigma <= 0 || c_sigma <= 0){
		printf("Error: <num disparities>, <spacial sigma> and <color sigma> must be positive integers\n");
		return 1;
	}

	return asw(im_l, im_r, ndisp, s_sigma, c_sigma);
}


--------------------------------------------------------------------------------