├── CMakeLists.txt
├── Makefile
├── README.md
├── affTest.cpp
├── bitMatcher.cu
├── bitMatcher.h
├── driveGnuPlotStreams.pl
├── gpuFacade.cpp
├── gpuFacade.hpp
├── latch.cu
├── latch.h
├── latchAff.cu
├── latchAff.h
├── min.cpp
├── vo.cpp
└── vo2.cpp


/CMakeLists.txt:
--------------------------------------------------------------------------------
 1 | # FindCUDA-based build of the CUDA LATCH libraries and their demo executables.
 2 | # A minimum version is declared so policy defaults are well defined (CMake warns when it is missing).
 3 | cmake_minimum_required(VERSION 3.5)
 4 | project(latch_cuda)
 5 | 
 6 | # find_package(CUDA) already loads the FindCUDA module, so no separate include(FindCUDA) is needed.
 7 | find_package(CUDA REQUIRED)
 8 | find_package(OpenCV 3 REQUIRED)
 9 | 
10 | # Declared before the CUDA targets: older FindCUDA releases read the directory's include
11 | # paths at the moment cuda_add_library() is called, so later additions never reach nvcc.
12 | include_directories(${CUDA_INCLUDE_DIRS})
13 | include_directories(${OpenCV_INCLUDE_DIRS})
14 | 
15 | set(CUDALATCHSRCS
16 |     latch.cu
17 |     bitMatcher.cu
18 | )
19 | 
20 | set(CUDALATCHAFFSRCS
21 |     latchAff.cu
22 |     bitMatcher.cu
23 | )
24 | 
25 | # FindCUDA documents CUDA_NVCC_FLAGS as a semicolon-separated list with one nvcc
26 | # argument per element, so every argument is appended as its own list item.
27 | list(APPEND CUDA_NVCC_FLAGS
28 |     -D_MWAITXINTRIN_H_INCLUDED
29 |     -lineinfo
30 |     -Xptxas -v
31 |     -Xcompiler -fopenmp
32 |     -use_fast_math
33 |     -O3
34 |     -gencode arch=compute_30,code=sm_30
35 |     -gencode arch=compute_60,code=sm_60
36 |     --default-stream per-thread
37 | )
38 | message(STATUS "CUDA_NVCC_FLAGS: ${CUDA_NVCC_FLAGS}")
39 | 
40 | # Keep the host compiler's flags out of the nvcc command line.
41 | set(CUDA_PROPAGATE_HOST_FLAGS OFF)
42 | 
43 | cuda_add_library(latch_cuda ${CUDALATCHSRCS} STATIC)
44 | cuda_add_library(latch_aff_cuda ${CUDALATCHAFFSRCS} STATIC)
45 | cuda_add_executable(latch_min_test min.cpp)
46 | cuda_add_executable(latch_vo vo.cpp)
47 | cuda_add_executable(latch_vo2 vo2.cpp gpuFacade.cpp)
48 | cuda_add_executable(latch_affTest affTest.cpp)
49 | 
50 | # The keyword-less target_link_libraries() signature is kept on purpose: the cuda_add_*
51 | # macros link the CUDA runtime with the plain signature, and CMake rejects mixing the
52 | # plain and keyword signatures on one target.
53 | target_link_libraries(latch_cuda ${OpenCV_LIBS})
54 | target_link_libraries(latch_aff_cuda ${OpenCV_LIBS})
55 | 
56 | target_link_libraries(latch_min_test latch_cuda)
57 | target_link_libraries(latch_vo latch_cuda)
58 | target_link_libraries(latch_vo2 latch_cuda)
59 | target_link_libraries(latch_affTest latch_aff_cuda)


--------------------------------------------------------------------------------
/Makefile:
--------------------------------------------------------------------------------
 1 | # Build, demo and clean-up targets for the CUDA LATCH examples.
 2 | # The settings below are shared by every rule; override them on the make command line,
 3 | # e.g. "make vo CUDA_INC=-I/usr/local/cuda/include".
 4 | CXX         = g++
 5 | NVCC        = nvcc
 6 | CUDA_INC    = -I/usr/local/cuda-7.5/include/
 7 | CUDA_LIBS   = -L/usr/local/cuda/lib64 -lcuda -lcudart
 8 | OPENCV_INC  = `pkg-config --cflags opencv`
 9 | OPENCV_LIBS = -L/usr/local/lib -lopencv_stitching -lopencv_superres -lopencv_videostab -lopencv_aruco -lopencv_bgsegm -lopencv_bioinspired -lopencv_ccalib -lopencv_dnn -lopencv_dpm -lopencv_fuzzy -lopencv_line_descriptor -lopencv_optflow -lopencv_plot -lopencv_reg -lopencv_saliency -lopencv_stereo -lopencv_structured_light -lopencv_rgbd -lopencv_surface_matching -lopencv_tracking -lopencv_datasets -lopencv_text -lopencv_face -lopencv_xfeatures2d -lopencv_shape -lopencv_video -lopencv_ximgproc -lopencv_calib3d -lopencv_features2d -lopencv_flann -lopencv_xobjdetect -lopencv_objdetect -lopencv_ml -lopencv_xphoto -lopencv_highgui -lopencv_videoio -lopencv_imgcodecs -lopencv_photo -lopencv_imgproc -lopencv_core
10 | NVCCFLAGS   = -lineinfo -Xptxas -v -use_fast_math -O3 -gencode arch=compute_52,code=sm_52
11 | # Plotting pipeline shared by the demo and demo2 targets.
12 | PLOT        = perl ./driveGnuPlotStreams.pl 12 4 200 4200 200 200 0 0 0 0 0 0 0 0 950x200+960+30 950x200+960+780 950x200+960+280 950x200+960+530 'pitch' 'yaw' 'roll' 'polar translation angle' 'azimuthal translation angle' 'z' 'keypoints' 'matches' '100 * threshold' 'cpu [ms]' 'gpu [ms]' 'defect' 0 0 0 1 1 1 2 2 2 3 3 1
13 | 
14 | # These targets never create a file named after themselves.
15 | .PHONY: all affine demo demo2 demo_no_gnuplot demo2_no_gnuplot clean run
16 | 
17 | all: drone.mp4 vo
18 | 
19 | # Each program lists its own source as a prerequisite so that editing it triggers a rebuild.
20 | affine: affTest.cpp latchAff.o bitMatcher.o
21 | 	$(CXX) -std=c++11 $(OPENCV_INC) affTest.cpp latchAff.o bitMatcher.o $(CUDA_INC) $(CUDA_LIBS) $(OPENCV_LIBS) -o affTest
22 | 
23 | vo: vo.cpp latch.o bitMatcher.o #gpuFacade.o #fast.o
24 | 	$(CXX) -std=c++11 $(OPENCV_INC) vo.cpp latch.o bitMatcher.o $(CUDA_INC) $(CUDA_LIBS) $(OPENCV_LIBS) -o vo
25 | 
26 | # gpuFacade.o is linked as well as required, matching the latch_vo2 target in CMakeLists.txt.
27 | vo2: vo2.cpp latch.o bitMatcher.o gpuFacade.o   #fast.o
28 | 	$(CXX) $(OPENCV_INC) vo2.cpp latch.o bitMatcher.o gpuFacade.o $(CUDA_INC) $(CUDA_LIBS) $(OPENCV_LIBS) -o vo2
29 | 
30 | demo2: drone.mp4 vo2
31 | 	./vo2 drone.mp4 620 | $(PLOT)
32 | 
33 | demo2_no_gnuplot: drone.mp4 vo2
34 | 	./vo2 drone.mp4 620
35 | 
36 | demo: drone.mp4 vo
37 | 	./vo drone.mp4 620 | $(PLOT)
38 | #620
39 | #4400
40 | demo_no_gnuplot: drone.mp4 vo
41 | 	./vo drone.mp4 620
42 | 
43 | drone.mp4:
44 | 	youtube-dl -f 137 https://www.youtube.com/watch?v=wneCezU_VQ4
45 | 	mv Raw\ FPV\ Training\ Session\ -\ Dirt\ Bike\ Visit\ in\ Park-wneCezU_VQ4.mp4 drone.mp4
46 | 
47 | # Compile only: object files and -l flags are ignored by a -c invocation, so they are not passed.
48 | gpuFacade.o: gpuFacade.cpp gpuFacade.hpp
49 | 	$(CXX) $(OPENCV_INC) -c gpuFacade.cpp $(CUDA_INC) -o gpuFacade.o
50 | 
51 | 
52 | #fast.o:
53 | #	nvcc -c -lineinfo -O3 -o fast.o       fast.cu       -gencode arch=compute_52,code=sm_52 -I/home/chris/cub-1.5.2/
54 | 
55 | latch.o: latch.cu
56 | 	$(NVCC) -c $(NVCCFLAGS) -o latch.o latch.cu
57 | 
58 | latchAff.o: latchAff.cu
59 | 	$(NVCC) -c $(NVCCFLAGS) -o latchAff.o latchAff.cu
60 | 
61 | 
62 | # Builds ./min and immediately runs it on the two sample images.
63 | min: min.cpp latch.o bitMatcher.o
64 | 	$(CXX) $(OPENCV_INC) min.cpp latch.o bitMatcher.o $(CUDA_INC) $(CUDA_LIBS) $(OPENCV_LIBS) -o min
65 | 	./min ob/1.png ob/2.png
66 | 
67 | bitMatcher.o: bitMatcher.cu bitMatcher.h
68 | 	$(NVCC) -c $(NVCCFLAGS) -o bitMatcher.o bitMatcher.cu
69 | 
70 | # -f keeps clean from failing when some of the files were never built.
71 | clean:
72 | 	rm -f vo latch.o bitMatcher.o gpuFacade.o latchAff.o vo2 affTest
73 | 
74 | run: vo
75 | 	./vo
76 | 
77 | #plot: vo
78 | #	./vo $(video) $(skip) $(w) $(h) | feedgnuplot --stream 0.01 --lines --nopoints --legend 0 pitch --legend 1 yaw --legend 2 roll --xlen 200


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | Major updates are coming in the immediate future. Please watch this space. 
 2 | 
 3 | # CUDA implementation of the LATCH descriptor & brute-force matcher
 4 | 
 5 | This is a high performance GPU implementation of the [LATCH descriptor](http://www.openu.ac.il/home/hassner/projects/LATCH/) invented by [Gil Levi](https://gilscvblog.com/2015/11/07/performance-evaluation-of-binary-descriptor-introducing-the-latch-descriptor/) and [Tal Hassner](http://www.openu.ac.il/home/hassner/). Please reference: "LATCH: Learned Arrangements of Three Patch Codes", IEEE Winter Conference on Applications of Computer Vision (WACV), Lake Placid, NY, USA, March, 2016.
 6 | 
 7 | You should probably be looking at the [OpenMVG branch](https://github.com/mdaiter/openMVG) which includes this code.
 8 | 
 9 | [![Demo video of vo.cpp](http://img.youtube.com/vi/zmfLZY7T6Qg/0.jpg)](http://www.youtube.com/watch?v=zmfLZY7T6Qg "Demo video of vo.cpp")
10 | 
11 | On a GTX 970M I see 10^6 descriptor extractions per second (1 to 1.2 microseconds per descriptor), and 3*10^9 comparisons per second. A GTX 760 sees 70% of this speed. An NVIDIA graphics card with CUDA compute capability >= 3.0 is required.
12 | 
13 | Look at min.cpp for a minimal introduction. Compile it with "make min -j7". Run it as "./min 1.png 2.png" (Note, min.cpp is broken. Take a look at vo.cpp instead or the OpenMVG class.)
14 | 
15 | vo.cpp has a better example of how you can hide 100% of the processing time of the GPU. The quickest way to see it in action is to install "youtube-dl" and then run "make demo -j7". Or you could just watch [this video](https://www.youtube.com/watch?v=zmfLZY7T6Qg). I see a cumulative 43 ms of CPU overhead for GPU processing of 4250 frames of 1080p video.
16 | 
17 | Note that currently each descriptor is 2048 bits but the last 1536 bits are 0. I was originally planning on building larger variants: true 1024 bit and 2048 bit LATCH descriptors. You can relatively easily adjust this down to 1024 bits by changing defines, but refactoring is necessary for 512 bits.
18 | 
19 | Current features:
20 | - hardware interpolation for affine invariant descriptors at virtually no performance overhead
21 | - customizable importance masking for patch triplet comparisons at no performance overhead
22 | - asynchronous GPU operation
23 | - fast cross-checking (symmetry test) with event-driven multi-stream matching kernel
24 | 
25 | Approximate order of planned features:
26 | - multichannel support ( http://arxiv.org/abs/1603.04408 )
27 | - extractor kernel granularity optimization (possibly increased extractor speed)
28 | - documentation
29 | - 512 bit matcher (increased matcher speed)
30 | - API improvements (currently a mess)
31 | - CUDA implementation of adaptive grid FAST detector
32 | - offline parameter optimization with PyGMO
33 | - integration into OpenCV
34 | 
35 | Multi-GPU support is not currently planned. Please contact me if you have a use case that requires it.
36 | 
37 | This work is released under a Creative Commons Attribution-ShareAlike license. If you use this code in an academic work, please cite me by name ([Christopher Parker](https://github.com/csp256/)) and link to [this repository](https://github.com/csp256/cudaLATCH/).
38 | 
39 | Please email me if you have any questions: csparker.work@gmail.com
40 | 


--------------------------------------------------------------------------------
/affTest.cpp:
--------------------------------------------------------------------------------
  1 | #include <vector>
  2 | #include <iostream>
  3 | #include <stdio.h>
  4 | #include <time.h>
  5 | #include "cuda.h"
  6 | #include "cuda_runtime.h"
  7 | #include "opencv2/opencv.hpp"
  8 | using namespace std;
  9 | using namespace cv;
 10 | #include "latchAff.h"
 11 | #include "bitMatcher.h"
 12 | //#include "gpuFacade.hpp"
 13 | 
 14 | #define cudaCalloc(A, B) /* cudaMalloc B bytes into *A, then zero-fill them if the allocation succeeded */ \
 15 |     do { \
 16 |         cudaError_t cudaCallocStatus_ = cudaMalloc((A), (B)); /* arguments parenthesized so expressions expand safely */ \
 17 |         if (cudaCallocStatus_ == cudaSuccess) cudaMemset(*(A), 0, (B)); \
 18 |     } while (0)
 19 | 
 20 | #define checkError(ans) { gpuAssert((ans), __FILE__, __LINE__); }
 21 | inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=true) {
 22 |    if (code != cudaSuccess) {
 23 |       fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
 24 |       if (abort) exit(code);
 25 |    }
 26 | }
 27 | 
 28 | #define checkLaunchError() /* print file/line and exit if a launch or the kernel itself failed */ \
 29 | do {                                                                  \
 30 |     /* Check synchronous errors, i.e. pre-launch */                   \
 31 |     cudaError_t err = cudaGetLastError();                             \
 32 |     if (cudaSuccess != err) {                                         \
 33 |         fprintf (stderr, "Cuda error in file '%s' in line %i : %s.\n",\
 34 |                  __FILE__, __LINE__, cudaGetErrorString(err) );       \
 35 |         exit(EXIT_FAILURE);                                           \
 36 |     }                                                                 \
 37 |     /* Check asynchronous errors, i.e. kernel failed (ULF); cudaThreadSynchronize is deprecated */ \
 38 |     err = cudaDeviceSynchronize();                                    \
 39 |     if (cudaSuccess != err) {                                         \
 40 |         fprintf (stderr, "Cuda error in file '%s' in line %i : %s.\n",\
 41 |                  __FILE__, __LINE__, cudaGetErrorString( err) );      \
 42 |         exit(EXIT_FAILURE);                                           \
 43 |     }                                                                 \
 44 | } while (0)
 45 | 
 46 | 
 47 | // Sometimes the recovered pose is 180 degrees off...? I thought cheirality test would handle that, but apparently not always.
 48 | double dist2(Mat a, Mat b) { // squared Euclidean distance over the first three double elements
 49 |     double sumSq = 0.0;
 50 |     for (int k = 0; k != 3; ++k) {
 51 |         const double diff = a.at<double>(k) - b.at<double>(k);
 52 |         sumSq += diff * diff;
 53 |     }
 54 |     return sumSq;
 55 | }
 56 | 
 57 | // Affine LATCH smoke test: describes one hand-tuned MSER ellipse with latchAff() and shows the result in two windows.
 58 | // Naming: a suffix of 1 means previous frame and 2 means current frame. The next frame is processed while the GPU
 59 | // works on the current one, so at a certain point frame 1 shifts down to 0, 2 shifts down to 1, and the new 2 is loaded.
 60 | int main( int argc, char** argv ) {
 61 |     // gpuFacade gpu;
 62 |     // gpu.set_values(3,4);
 63 |     // cerr << "!! " << gpu.area() << endl;
 64 | 
 65 |     // This must be an integer multiple of 512.
 66 |     // Specifically, half-multiples of the number of SM's for your GPU are sensible.
 67 |     // I have 10 streaming multiprocessors, so I chose 15*512 = 7680.
 68 |     const int maxKP = 512 * 15;
 69 |     const bool showMatches = true;
 70 |     // Shows every Nth processed frame's matches.
 71 |     const int showMatchesInterval = 10;
 72 |     const bool showVideo = true;
 73 |     // Shows every Nth processed frame.
 74 |     const int showVideoInterval = 1;
 75 |     int WIDTH = 0, HEIGHT = 0, totalMatches = 0, totalInliers = 0; // all initialized: argc >= 5 sets no frame size
 76 |     const int matchThreshold = 12;
 77 |     // Discard this many frames for each one processed. Change with +/- keys while running.
 78 |     int skipFrames = 0;
 79 |     // Threshold for FAST detector
 80 |     int threshold = 20;
 81 |     int targetKP = 3000;
 82 |     int tolerance = 200;
 83 |     int maxLoops = 100;//4200;
 84 |     const bool gnuplot = true;
 85 |     double defect = 0.0;
 86 |     int extractions = 0;
 87 | 
 88 |     VideoCapture cap;
 89 |     if (argc == 1) {
 90 |         cap = VideoCapture(0);
 91 |         WIDTH  = cap.get(CAP_PROP_FRAME_WIDTH);
 92 |         HEIGHT = cap.get(CAP_PROP_FRAME_HEIGHT);
 93 |     }
 94 |     if (argc == 2 || argc == 3) {
 95 |         cap = VideoCapture(argv[1]);
 96 |         WIDTH  = cap.get(CAP_PROP_FRAME_WIDTH);
 97 |         HEIGHT = cap.get(CAP_PROP_FRAME_HEIGHT);
 98 |         if (argc == 3) {
 99 |             for (int i=0, skip=atoi(argv[2]); i<skip; i++) { // parse the skip count once, not on every iteration
100 |                 cap.grab();
101 |             }
102 |         }
103 |     }
104 |     if (argc == 4) {
105 |         cap = VideoCapture(0);
106 |         WIDTH  = atoi(argv[2]);
107 |         HEIGHT = atoi(argv[3]);
108 |         cap.set(CAP_PROP_FRAME_WIDTH,  WIDTH);
109 |         cap.set(CAP_PROP_FRAME_HEIGHT, HEIGHT);
110 |     }
111 |     if (WIDTH <= 0 || HEIGHT <= 0) { cerr << "Could not determine the frame size (capture failed to open, or too many arguments)." << endl; return 1; }
112 | 
113 |     double f = 0.4;
114 |     double data[]= {f*WIDTH,  0.0,  WIDTH*0.5,  0.0, f*HEIGHT, HEIGHT*0.5, 0.0, 0.0, 1.0};
115 |     Mat K(3, 3, CV_64F, data);
116 |     Mat F, R, T, rod, mask;
117 |     Mat img0, img1, img2, img1g, img2g, imgMatches, E, rodOld, outMat1, outMat2;
118 | 
119 |     cap >> img1; // consumes one frame; img1 itself is replaced by the still image loaded below
120 |     cap >> img2;
121 |     img1 = imread("/home/chris/cv/data/affine/graffiti/img1.ppm", -1);
122 |     if (img1.empty() || img2.empty()) { cerr << "Could not load the test image or grab a frame from the capture source." << endl; return 1; }
123 |     cv::cvtColor(img1, img1g, CV_BGR2GRAY);
124 |     cv::cvtColor(img2, img2g, CV_BGR2GRAY);
125 |     if (showMatches) {
126 |         namedWindow("Matches", WINDOW_NORMAL);
127 |     }
128 |     waitKey(1);
129 |     if (showVideo) {
130 |         namedWindow("Video", WINDOW_NORMAL);
131 |     }
132 |     waitKey(1);
133 |     resizeWindow("Matches", 1920/2, 540/2);
134 |     resizeWindow("Video", 960, 540);
135 |     moveWindow("Matches", 0, 540+55);
136 |     moveWindow("Video", 0, 0);
137 |     waitKey(1);
138 | 
139 |     cudaEvent_t start, stop;
140 |     cudaEventCreate(&start);
141 |     cudaEventCreate(&stop);
142 | 
143 |     vector<KeyPoint> keypoints0, keypoints1, keypoints2;
144 |     vector<DMatch> goodMatches;
145 |     vector<Point2f> p1, p2; // Point correspondences for recovering pose.
146 |     int numKP0, numKP1, numKP2; // The actual number of keypoints we are dealing with: just keypoints#.size(), but capped at maxKP.
147 |     int key = -1;
148 |     clock_t timer, timer2;
149 |     float time;
150 | 
151 |     // Sizes for device and host pointers
152 |     size_t sizeK = maxKP * sizeof(float) * 5; // K for keypoints
153 |     size_t sizeI = WIDTH * HEIGHT * sizeof(unsigned char); // I for Image
154 |     size_t sizeD = maxKP * (2048 / 32) * sizeof(unsigned int); // D for Descriptor
155 |     size_t sizeM = maxKP * sizeof(int); // M for Matches
156 |     size_t sizeMask = 64 * sizeof(float);
157 | 
158 |     // Host pointers
159 |     float *h_K1, *h_K2;
160 |     cudaMallocHost((void **) &h_K1, sizeK);
161 |     cudaMallocHost((void **) &h_K2, sizeK);
162 |     // For reasons opaque to me, allocating both (but not either) h_M1 or h_M2
163 |     // with cudaMallocHost segfaults, apparently after graceful exit? So neither of them are pinned.
164 |     int h_M1[maxKP];
165 |     int h_M2[maxKP];
166 |     float h_mask[64];
167 |     for (int i=0; i<64; i++) { h_mask[i] = 1.0f; }
168 | 
169 |     // Device pointers
170 |     unsigned char *d_I;
171 |     unsigned int *d_D1, *d_D2, *uIntSwapPointer;
172 |     int *d_M1, *d_M2;
173 |     float *d_K, *d_mask;
174 |     cudaCalloc((void **) &d_K, sizeK);
175 |     cudaCalloc((void **) &d_D1, sizeD);
176 |     cudaCalloc((void **) &d_D2, sizeD);
177 |     cudaCalloc((void **) &d_M1, sizeM);
178 |     cudaCalloc((void **) &d_M2, sizeM);
179 |     cudaCalloc((void **) &d_mask, sizeM); // NOTE(review): sizeMask looks like the intended size here; confirm against initMask before changing
180 | 
181 |     // The patch triplet locations for LATCH fits in texture memory cache.
182 |     cudaArray* patchTriplets;
183 |     initPatchTriplets(patchTriplets);
184 |     size_t pitch;
185 |     initImage(&d_I, WIDTH, HEIGHT, &pitch);
186 |     initMask(&d_mask, h_mask);
187 | 
188 |     // Events allow asynchronous, nonblocking launch of subsequent kernels after a given event has happened,
189 |     // such as completion of a different kernel on a different stream.
190 |     cudaEvent_t latchFinished;
191 |     cudaEventCreate(&latchFinished);
192 |     // You should create a new stream for each bitMatcher kernel you want to launch at once.
193 |     cudaStream_t streanumKP1, streanumKP2;
194 |     cudaStreamCreate(&streanumKP1);
195 |     cudaStreamCreate(&streanumKP2);
196 | 
197 |     FAST(img1g, keypoints1, threshold);
198 |     // extractions += keypoints1.size();
199 |     // latchAff( img1g, d_I, pitch, h_K1, d_D1, &numKP1, maxKP, d_K, &keypoints1, d_mask, latchFinished, outMat1 );
200 | 
201 |     Ptr<MSER> mserExtractor  = MSER::create();
202 |     vector<vector<cv::Point> > mserContours;
203 |     vector<KeyPoint> mserKeypoint;
204 |     vector<Rect> mserBbox;
205 |     mserExtractor->detect(img1g, mserContours, mserBbox);
206 | 
207 |     outMat1 = img1.clone();
208 |     outMat2 = img1.clone();
209 |     resize(outMat2, outMat2, Size(64,64));
210 | 
211 |     // cerr << outMat1.depth() << " (()) " << outMat1.channels() << " (()) " << outMat1.type() << endl;
212 |     // for (int i=0; i<mserContours.size(); i+=320) {
213 |     //     ellipse(outMat1, fitEllipse(mserContours[i]), Scalar(255,0,0));
214 |     // }
215 |     // cerr << "** " << fitEllipse(mserContours[640]) << endl;
216 |     if (mserContours.size() <= 640) { cerr << "MSER found only " << mserContours.size() << " regions; region 640 is required." << endl; return 1; }
217 |     RotatedRect rekt = fitEllipse(mserContours[640]);
218 |     rekt.center.y = rekt.center.y - 330;
219 |     rekt.angle = 130;
220 |     ellipse(outMat1, rekt, Scalar(0,255,0), 3);
221 | 
222 |     latchAff( img1g, d_I, pitch, h_K1, d_D1, &numKP1, maxKP, d_K, &keypoints1, d_mask, latchFinished, outMat2, rekt);
223 | 
224 |     // for (vector<cv::Point> v : mserContours){
225 |     //     for (cv::Point p : v){
226 |     //         outMat1.at<uchar>(p.y, p.x*3+0) = 255;
227 |     //         outMat1.at<uchar>(p.y, p.x*3+1) = 255;
228 |     //         outMat1.at<uchar>(p.y, p.x*3+2) = 255;
229 |     //     }
230 |     // }
231 | 
232 |     // ms(box, regions, Mat());
233 |     //  for (int i = 0; i < regions.size(); i++)
234 |     //  {
235 |     //      ellipse(box, fitEllipse(regions[i]), Scalar(255));
236 |     //  }
237 | 
238 |     // FAST(img2g, keypoints2, threshold); // This call to fast is concurrent with above execution.
239 |     // extractions += keypoints2.size();
240 |     // latchAff( img2g, d_I, pitch, h_K2, d_D2, &numKP2, maxKP, d_K, &keypoints2, d_mask, latchFinished, outMat2 );
241 |     // bitMatcher( d_D1, d_D2, numKP1, numKP2, maxKP, d_M1, matchThreshold, streanumKP1, latchFinished );
242 |     // bitMatcher( d_D2, d_D1, numKP2, numKP1, maxKP, d_M2, matchThreshold, streanumKP2, latchFinished );
243 |     // timer = clock();
244 |     // getMatches(maxKP, h_M1, d_M1);
245 |     // getMatches(maxKP, h_M2, d_M2);
246 |     // for (int i=0; i<numKP1; i++) {
247 |     //     if (h_M1[i] >= 0 && h_M1[i] < numKP2 && h_M2[h_M1[i]] == i) {
248 |     //         goodMatches.push_back( DMatch(i, h_M1[i], 0)); // For drawing.
249 |     //         p1.push_back(keypoints1[i].pt); // For recovering pose.
250 |     //         p2.push_back(keypoints2[h_M1[i]].pt);
251 |     //     }
252 |     // }
253 |     //
254 |     // drawMatches( img1, keypoints1, img2, keypoints2,
255 |     //     goodMatches, imgMatches, Scalar::all(-1), Scalar::all(-1),
256 |     //     vector<char>(), DrawMatchesFlags::NOT_DRAW_SINGLE_POINTS );
257 |     imshow( "Video", outMat1 );
258 |     imshow( "Matches", outMat2 );
259 |     waitKey(0);
260 |     return 0;
261 | 
262 | /*    img1.copyTo(img0);
263 |     img2.copyTo(img1);
264 |     cap.read(img2);
265 |     cvtColor(img2, img2g, CV_BGR2GRAY);
266 | 
267 |     keypoints0 = keypoints1;
268 |     keypoints1 = keypoints2;
269 | 
270 |     uIntSwapPointer = d_D1;
271 |     d_D1 = d_D2;
272 |     d_D2 = uIntSwapPointer;
273 | 
274 |     numKP0 = numKP1;
275 |     numKP1 = numKP2;
276 | 
277 |     FAST(img2g, keypoints2, threshold);
278 |     int loopIteration = 0;
279 |     for (; loopIteration < maxLoops || maxLoops == -1; loopIteration++) { // Main Loop.
280 |         { // GPU code for descriptors and matching.
281 |             cudaEventRecord(start, 0);
282 |             extractions += keypoints2.size();
283 |             latch( img2g, d_I, pitch, h_K2, d_D2, &numKP2, maxKP, d_K, &keypoints2, d_mask, latchFinished);
284 |             bitMatcher( d_D1, d_D2, numKP1, numKP2, maxKP, d_M1, matchThreshold, streanumKP1, latchFinished );
285 |             bitMatcher( d_D2, d_D1, numKP2, numKP1, maxKP, d_M2, matchThreshold, streanumKP2, latchFinished );
286 |             cudaEventRecord(stop, 0);
287 |         }
288 |         timer = clock();
289 |         { // Put as much CPU code here as possible.
290 |             { // Display matches and/or video to user.
291 |                 bool needToDraw = false;
292 |                 if (showMatches && loopIteration % showMatchesInterval == 0) { // Draw matches.
293 |                     drawMatches( img0, keypoints0, img1, keypoints1,
294 |                         goodMatches, imgMatches, Scalar::all(-1), Scalar::all(-1),
295 |                         vector<char>(), DrawMatchesFlags::NOT_DRAW_SINGLE_POINTS );
296 |                     imshow( "Matches", imgMatches );
297 |                     needToDraw = true;
298 |                 }
299 |                 if (showVideo && loopIteration % showVideoInterval == 0) {
300 |                     imshow("Video", img1);
301 |                     needToDraw = true;
302 |                 }
303 |                 if (needToDraw) {
304 |                     key = waitKey(1);
305 |                 }
306 |             }
307 |             { // Handle user input.
308 |                 switch (key) {
309 |                     case (-1):
310 |                     break;
311 |                     case (1048689): // q
312 |                     case (113): // also q
313 |                         return 0;
314 |                     break;
315 |                     case (1048695): // w
316 |                         waitKey(0);
317 |                     break;
318 |                     case (1114027): // +
319 |                         skipFrames++;
320 |                         cerr << "For each processed frame we are now skipping " << skipFrames << endl;
321 |                     break;
322 |                     case (1114029): // -
323 |                         skipFrames = max(1, --skipFrames);
324 |                         cerr << "For each processed frame we are now skipping " << skipFrames << endl;
325 |                     break;
326 |                     default:
327 |                         cerr << "Currently pressed key is:   " << key << endl;
328 |                     break;
329 |                 }
330 |                 key = -1;
331 |             }
332 |             { // Iterate the "logical" loop (get ready to process next frame)
333 |                 img1.copyTo(img0);
334 |                 img2.copyTo(img1);
335 |                 for (int i=0; i<skipFrames; i++) {
336 |                     cap.grab();
337 |                 }
338 |                 cap.read(img2);
339 |                 if (img2.cols == 0) break;
340 |                 cvtColor(img2, img2g, CV_BGR2GRAY);
341 | 
342 |                 keypoints0 = keypoints1;
343 |                 keypoints1 = keypoints2;
344 | 
345 |                 uIntSwapPointer = d_D1;
346 |                 d_D1 = d_D2;
347 |                 d_D2 = uIntSwapPointer;
348 | 
349 |                 numKP0 = numKP1;
350 |                 numKP1 = numKP2;
351 |             }
352 |             { // Solve for and output rotation vector (this gets piped to feedgnuplot).
353 |                 if (10 < p1.size() && 10 < p2.size()) {
354 |                     E = findEssentialMat(p1, p2, f*WIDTH, Point2d(WIDTH*0.5f, HEIGHT*0.5f), RANSAC, 0.999, 3.0, mask);
355 |                     int inliers = 0;
356 |                     for (int i=0; i<mask.rows; i++) {
357 |                         inliers += mask.data[i];
358 |                     }
359 |                     totalInliers += inliers;
360 |                     double size = p1.size();
361 |                     double r = inliers/max((double)size, 150.0);
362 |                     r = 1.0 - min(r + 0.05, 1.0);
363 |                     defect += r*r;
364 |                     cout << "11:" << r*r << endl;
365 | 
366 |                     recoverPose(E, p1, p2, R, T, f*WIDTH, Point2d(WIDTH*0.5f, HEIGHT*0.5f), mask);
367 |                     Rodrigues(R, rod);
368 |                     if (loopIteration==0) {
369 |                         rod.copyTo(rodOld);
370 |                     }
371 |                     if (dist2(rod, rodOld) < 1.0) {
372 |                         rod.copyTo(rodOld);
373 |                     } else {
374 |                         cerr << "Rejecting the recovered pose: " << rod.t() * 57.2957795 << endl;
375 |                         // This commented out chunk of code is good for webcams. If you initialize with a bad value it will recover.
376 |                         // const double alpha = 0.1; // Move our region of acceptable responses (only a little) closer to the observed (but presumed erroneous) value.
377 |                         // for (int i=0; i<3; i++) {
378 |                         //     rodOld.at<double>(i) = rodOld.at<double>(i)*(1.0-alpha) + rod.at<double>(i)*alpha;
379 |                         // }
380 |                         rodOld.copyTo(rod);
381 |                     }
382 |                 } else {
383 |                     defect += 1.0;
384 |                     cout << "11:" << 1.0 << endl;
385 |                     cerr << "Too few matches! Not going to try to recover pose this frame." << endl;
386 |                 }
387 |                 // To prevent the graphs from desynchronizing from each other, we have to output this unconditionally.
388 |                 if (gnuplot) {
389 |                     for (int i=0; i<3; i++) {
390 |                         cout << i << ":" << rod.at<double>(i) * 57.2957795 << endl; // Output Rodrigues vector, rescaled to degrees
391 |                     }
392 |                     // T is unit norm (scale-less) and often erroneously sign-reversed.
393 |                     // if (T.at<double>(2) < 0) T = -T; // Assume dominate motion is forward... (this is not an elegant assumption)
394 |                     // double theta = atan2(T.at<double>(0), T.at<double>(2));
395 |                     // double phi = atan2(T.at<double>(1), T.at<double>(2));
396 |                     // cout << 3 << ":" << theta * 57.2957795 << endl; // Plot polar translation angle
397 |                     // cout << 4 << ":" << phi * 57.2957795 << endl; // Plot azimuthal translation angle
398 |                 }
399 |             }
400 |             { // run FAST detector on the CPU for next frame (get ready for next loop iteration).
401 |                 FAST(img2g, keypoints2, threshold);
402 |                 // Apply proportional control to threshold to drive it towards targetKP.
403 |                 int control = (int)(((float)keypoints2.size() - (float)targetKP) / (float)tolerance);
404 |                 threshold += min(100, control);
405 |                 if (threshold < 1) threshold = 1;
406 |             }
407 |         }
408 |         if (gnuplot) {
409 |             time = (1000*(clock() - timer)/(double)CLOCKS_PER_SEC);
410 |             cout << "9:" << time << endl; // Plot CPU time.
411 |             timer = clock();
412 |         }
413 |         { // Get new GPU results
414 |             p1.clear();
415 |             p2.clear();
416 |             goodMatches.clear();
417 |             getMatches(maxKP, h_M1, d_M1);
418 |             getMatches(maxKP, h_M2, d_M2);
419 |             cudaEventElapsedTime(&time, start, stop);
420 |             if (gnuplot) {
421 |                 cout << "10:" << (time+(1000*(clock() - timer)/(double)CLOCKS_PER_SEC)) << endl; // Plot total asynchronous GPU time.
422 |             }
423 |             for (int i=0; i<numKP0; i++) {
424 |                 if (h_M1[i] >= 0 && h_M1[i] < numKP1 && h_M2[h_M1[i]] == i) {
425 |                     goodMatches.push_back( DMatch(i, h_M1[i], 0)); // For drawing matches.
426 |                     p1.push_back(keypoints0[i].pt); // For recovering pose.
427 |                     p2.push_back(keypoints1[h_M1[i]].pt);
428 |                 }
429 |             }
430 |         }
431 |         if (gnuplot) {
432 |             cout << "6:" << numKP1 << endl; // Plot number of keypoints.
433 |             cout << "7:" << p1.size() << endl; // Plot number of matches.
434 |             cout << "8:" << 100*threshold << endl; // Plot current threshold for FAST.
435 |         }
436 |         totalMatches += p1.size();
437 |     }
438 |     cudaFreeArray(patchTriplets);
439 |     cudaFree(d_K);
440 |     cudaFree(d_D1);
441 |     cudaFree(d_D2);
442 |     cudaFree(d_M1);
443 |     cudaFree(d_M2);
444 |     cudaFreeHost(h_K1);
445 |     cudaFreeHost(h_K2);
446 |     cerr << "Total matches: " << totalMatches << endl;
447 |     cerr << "Total inliers: " << totalInliers << endl;
448 |     cerr << "Defect: " << defect << endl;
449 |     cerr << "Loop iteration: " << loopIteration << endl;
450 |     cerr << "Extractions: " << extractions << endl;
451 | 
452 |     return 0;
453 | */
454 | }
455 | 


--------------------------------------------------------------------------------
/bitMatcher.cu:
--------------------------------------------------------------------------------
  1 | #include <stdio.h>
  2 | #include <iostream>
  3 | #include <errno.h>
  4 | 
  5 | #include <cuda_runtime.h>
  6 | #include <cuda_runtime_api.h>
  7 | #include <device_launch_parameters.h>
  8 | 
  9 | #include "bitMatcher.h"
 10 | 
 11 | using namespace std;
 12 | 
 13 | // Number of values each thread in a warp gets per vector.
 14 | #define chunksPerVector (2)
 15 | #define vectorsPerWarp (16)
 16 | // Vectors per group is used to increase ILP. it must divide vectorsPerWarp. This implementation is specialized for vectorsPerGroup==8.
 17 | #define vectorsPerGroup (8)
 18 | #define warpsPerBlock (32)
 19 | // The total number of int32's needed to store a vector. We should drop this down to 16 for an optimized implementation for canonical LATCH.
 20 | #define vectorDimension (64)
 21 | #define _warpSize (32)
 22 | #define cacheSize (128)
 23 | #define halfCacheSize (64)
 24 | 
 25 | #define checkError(ans) { gpuAssert((ans), __FILE__, __LINE__); }
 26 | inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=true) {
 27 |    if (code != cudaSuccess) {
 28 |       fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
 29 |       if (abort) exit(code);
 30 |    }
 31 | }
 32 | 
 33 | #define checkLaunchError()                                          \
 34 | do {                                                                  \
 35 |     /* Check synchronous errors, i.e. pre-launch */                   \
 36 |     cudaError_t err = cudaGetLastError();                             \
 37 |     if (cudaSuccess != err) {                                         \
 38 |         fprintf (stderr, "Cuda error in file '%s' in line %i : %s.\n",\
 39 |                  __FILE__, __LINE__, cudaGetErrorString(err) );       \
 40 |         exit(EXIT_FAILURE);                                           \
 41 |     }                                                                 \
 42 |     /* Check asynchronous errors, i.e. kernel failed (ULF) */         \
 43 |     err = cudaThreadSynchronize();                                    \
 44 |     if (cudaSuccess != err) {                                         \
 45 |         fprintf (stderr, "Cuda error in file '%s' in line %i : %s.\n",\
 46 |                  __FILE__, __LINE__, cudaGetErrorString( err) );      \
 47 |         exit(EXIT_FAILURE);                                           \
 48 |     }                                                                 \
 49 | } while (0)
 50 | 
 51 | 
 52 | // Launch as 32x32
 53 | __global__ void __launch_bounds__(1024, 1)
 54 |                 bitMatch(   const unsigned int *g_query,
 55 |                             const unsigned int *g_training,
 56 |                             int *g_match,
 57 |                             const int trainingSize,
 58 |                             const int threshold) {
 59 |     // Load query vectors
 60 |     register unsigned int query[vectorsPerWarp][chunksPerVector];
 61 | 
 62 |     volatile __shared__ unsigned int s_training[cacheSize][chunksPerVector][_warpSize]; // We have enough room to load extra query vectors from shared memory...
 63 |     {
 64 |         int offset = threadIdx.x;
 65 |         offset += blockIdx.x * vectorDimension * warpsPerBlock * vectorsPerWarp;
 66 |         offset +=              vectorDimension *  threadIdx.y  * vectorsPerWarp;
 67 | 
 68 |         #pragma unroll
 69 |         for (int i=0; i<vectorsPerWarp; i++) {
 70 |             #pragma unroll
 71 |             for (int j=0; j<chunksPerVector; j++, offset += warpSize) {
 72 |                 query[i][j] = g_query[offset];
 73 |             }
 74 |         }
 75 |     }
 76 | 
 77 |     // Load the first training vectors.
 78 |     int trainingOffset = threadIdx.y * vectorDimension;
 79 |     if (threadIdx.y < halfCacheSize) {
 80 |         for (int i=0; i<chunksPerVector; i++, trainingOffset += warpSize) {
 81 |             s_training[threadIdx.y][i][threadIdx.x] = g_training[trainingOffset + threadIdx.x];
 82 |         }
 83 |     }
 84 |     __threadfence_block();
 85 | 
 86 |     register int bestIndex = -1;
 87 |     register int best       =  9999999;
 88 |     register int secondBest = 99999999;
 89 |     #pragma unroll 4
 90 |     for (int t=0; t < trainingSize; t+= cacheSize) {
 91 |         // Synchronize halfway through using shared memory...
 92 |         // So you can freely write to the other half.
 93 |         #pragma unroll
 94 |         for (int half=0; half < 2; half++) { // Half will be 0 when you should be working with top half, and loading into bottom half.
 95 |             register unsigned int prefetch = 0.0f;
 96 |             #pragma unroll
 97 |             for (int st=0; st < halfCacheSize; st++) { // Every iteration of this loop must load a single training vector into shared memory.
 98 |                 {
 99 |                     // Stream a new pair of training vectors to registers at start of every even loop (and write them to shared memory at end of every odd loop)
100 |                     if (st % 2 == 0) {
101 |                         if (threadIdx.y < 2*chunksPerVector) {
102 |                             const int index = (t + (half+1)*halfCacheSize + st)*vectorDimension + threadIdx.y*_warpSize + threadIdx.x;
103 |                             if (index < trainingSize*vectorDimension) {
104 |                                 prefetch = g_training[index];
105 |                             }
106 |                         }
107 |                     }
108 |                 }
109 |                 {
110 |                     // This is the offset into our shared memory cache of training vectors.
111 |                     const register int trainingOffset = half*halfCacheSize + st;
112 |                     register unsigned int train[chunksPerVector];
113 | 
114 |                     // Load training vector into registers.
115 |                     #pragma unroll
116 |                     for (int chunk = 0; chunk < chunksPerVector; chunk++) {
117 |                         train[chunk] = s_training[trainingOffset][chunk][threadIdx.x];
118 |                     }
119 | 
120 |                     // The compiler throws a hissy fit if you try to make dist an array, and tosses everything into local memory.
121 |                     register int dist0, dist1, dist2, dist3, dist4, dist5, dist6, dist7;
122 |                     // Also, the compiler does not like this being in a (fully unrolled) loop... drama queen.
123 |                     dist0 = __popc(query[0][0] ^ train[0]);// + __popc(query[0][1] ^ train[1]);
124 |                     dist1 = __popc(query[1][0] ^ train[0]);// + __popc(query[1][1] ^ train[1]);
125 |                     dist2 = __popc(query[2][0] ^ train[0]);// + __popc(query[2][1] ^ train[1]);
126 |                     dist3 = __popc(query[3][0] ^ train[0]);// + __popc(query[3][1] ^ train[1]);
127 |                     dist4 = __popc(query[4][0] ^ train[0]);// + __popc(query[4][1] ^ train[1]);
128 |                     dist5 = __popc(query[5][0] ^ train[0]);// + __popc(query[5][1] ^ train[1]);
129 |                     dist6 = __popc(query[6][0] ^ train[0]);// + __popc(query[6][1] ^ train[1]);
130 |                     dist7 = __popc(query[7][0] ^ train[0]);// + __popc(query[7][1] ^ train[1]);
131 |                     dist0 |= (__popc(query[ 8][0] ^ train[0]) /*+ __popc(query[ 8][1] ^ train[1])*/)<<16;
132 |                     dist1 |= (__popc(query[ 9][0] ^ train[0]) /*+ __popc(query[ 9][1] ^ train[1])*/)<<16;
133 |                     dist2 |= (__popc(query[10][0] ^ train[0]) /*+ __popc(query[10][1] ^ train[1])*/)<<16;
134 |                     dist3 |= (__popc(query[11][0] ^ train[0]) /*+ __popc(query[11][1] ^ train[1])*/)<<16;
135 |                     dist4 |= (__popc(query[12][0] ^ train[0]) /*+ __popc(query[12][1] ^ train[1])*/)<<16;
136 |                     dist5 |= (__popc(query[13][0] ^ train[0]) /*+ __popc(query[13][1] ^ train[1])*/)<<16;
137 |                     dist6 |= (__popc(query[14][0] ^ train[0]) /*+ __popc(query[14][1] ^ train[1])*/)<<16;
138 |                     dist7 |= (__popc(query[15][0] ^ train[0]) /*+ __popc(query[15][1] ^ train[1])*/)<<16;
139 | 
140 |                     dist0 += __shfl_xor(dist0,   1);
141 |                     dist1 += __shfl_xor(dist1,   1);
142 |                     if (threadIdx.x & 1) dist0 = dist1;
143 |                     dist2 += __shfl_xor(dist2,   1);
144 |                     dist3 += __shfl_xor(dist3,   1);
145 |                     if (threadIdx.x & 1) dist2 = dist3;
146 |                     dist4 += __shfl_xor(dist4,   1);
147 |                     dist5 += __shfl_xor(dist5,   1);
148 |                     if (threadIdx.x & 1) dist4 = dist5;
149 |                     dist6 += __shfl_xor(dist6,   1);
150 |                     dist7 += __shfl_xor(dist7,   1);
151 |                     if (threadIdx.x & 1) dist6 = dist7;
152 |                     dist0 += __shfl_xor(dist0,   2);
153 |                     dist2 += __shfl_xor(dist2,   2);
154 |                     if (threadIdx.x & 2) dist0 = dist2;
155 |                     dist4 += __shfl_xor(dist4,   2);
156 |                     dist6 += __shfl_xor(dist6,   2);
157 |                     if (threadIdx.x & 2) dist4 = dist6;
158 |                     dist0 += __shfl_xor(dist0,   4);
159 |                     dist4 += __shfl_xor(dist4,   4);
160 |                     if (threadIdx.x & 4) dist0 = dist4;
161 |                     dist0 += __shfl_xor(dist0,   8);
162 |                     dist0 += __shfl_xor(dist0,  16);
163 |                     if (threadIdx.x < 8) dist0 &= 2047;
164 |                     else dist0 >>= 16;
165 | 
166 |                     if (dist0 < secondBest) {
167 |                         if (dist0 < best) {
168 |                             secondBest = best;
169 |                             best = dist0;
170 |                             bestIndex = t + trainingOffset;
171 |                         } else {
172 |                             secondBest = dist0;
173 |                         }
174 |                     }
175 |                 }
176 |                 { // Write new training vectors prefetched into registers to shared memory cache at end of every even loop.
177 |                     if (st % 2 == 1) {
178 |                         if (threadIdx.y < chunksPerVector) { // We can load identically for each chunk, but not so for write to shared memory differently.
179 |                             s_training[(half^1)*halfCacheSize + (st-1)    ][threadIdx.y                  ][threadIdx.x] = prefetch;
180 |                         } else if (threadIdx.y < 2*chunksPerVector) {
181 |                             s_training[(half^1)*halfCacheSize + (st-1) + 1][threadIdx.y - chunksPerVector][threadIdx.x] = prefetch;
182 |                         }
183 |                     }
184 |                 }
185 |             }
186 |             __syncthreads();
187 |         }
188 |     }
189 |     if (threadIdx.x < vectorsPerWarp) {
190 |         if (secondBest - best < threshold) {
191 |             bestIndex = -1; // Failed hard threshold test.
192 |         }
193 |         // We can trash what is in shared memory now... it is called s_training, but here it is just scratch space.
194 |         // I guess I should use a union for this?
195 |         const register int packing = _warpSize / vectorsPerWarp; // NOTE: This assumes vectorsPerWarp divides _warpSize. If it doesnt, you'll have to handle this differently.
196 |         s_training[0][threadIdx.y / packing][(threadIdx.y%packing)*vectorsPerWarp + threadIdx.x] = bestIndex;
197 |     }
198 |     __threadfence_block();
199 |     if (threadIdx.y < vectorsPerWarp) {
200 |         g_match[blockIdx.x*vectorsPerWarp*warpsPerBlock + threadIdx.y*warpsPerBlock + threadIdx.x] = s_training[0][threadIdx.y][threadIdx.x];
201 |     }
202 | }
203 | 
204 | 
// Queues the bitMatch kernel on `stream`, ordered after `event`.
//   d_Q, d_T    device descriptor buffers (query / training)
//   keypointsQ  number of query vectors; decides how many blocks are launched
//   keypointsT  number of training vectors, passed to the kernel as trainingSize
//   maxKP       not used by this function (kept for interface compatibility)
//   d_M         device output, one int per query (see bitMatch)
//   threshold   minimum (secondBest - best) gap for a match to be accepted
// The launch is asynchronous; errors raised while the kernel runs surface at the
// next synchronizing check (e.g. getMatches).
void bitMatcher(unsigned int* d_Q, unsigned int* d_T, int keypointsQ, int keypointsT, int maxKP, int* d_M, const int threshold, cudaStream_t stream, cudaEvent_t event) {
    (void) maxKP;

    const int queriesPerBlock = vectorsPerWarp * warpsPerBlock;
    dim3 threadsPerBlock(_warpSize, warpsPerBlock);
    const int neededBlocks = (keypointsQ + queriesPerBlock - 1) / queriesPerBlock; // This is the "round up integer division" pattern
    dim3 blocksPerGrid(neededBlocks, 1, 1);

    // Keep the stream ordered after the event even when there is nothing to launch.
    checkError(cudaStreamWaitEvent(stream, event, 0));

    // A grid of zero blocks is an invalid launch configuration; the recorded error
    // would abort the program at the next checkLaunchError(). Nothing to match anyway.
    if (neededBlocks <= 0) {
        return;
    }

    bitMatch<<<blocksPerGrid, threadsPerBlock, 0, stream>>>(d_Q, d_T, d_M, keypointsT, threshold);
    // Non-blocking check: reports launch-configuration errors without synchronizing.
    checkError(cudaGetLastError());
}
215 | 
// Copies maxKP match indices from device buffer d_M into host buffer h_M.
// checkLaunchError() synchronizes with the device before and after the copy, so
// on return all previously queued work has finished and h_M is valid. Any CUDA
// error (pending, or from the copy itself) terminates the process.
void getMatches(int maxKP, int* h_M, int* d_M) {
    checkLaunchError();
    if (maxKP > 0) { // A non-positive count would turn into a zero or enormous size_t.
        const size_t sizeM = (size_t) maxKP * sizeof(int);
        checkError(cudaMemcpyAsync(h_M, d_M, sizeM, cudaMemcpyDeviceToHost));
    }
    checkLaunchError();
}
222 | 


--------------------------------------------------------------------------------
/bitMatcher.h:
--------------------------------------------------------------------------------
1 | void bitMatcher(unsigned int*, unsigned int*, int, int, int, int*, int, cudaStream_t, cudaEvent_t);
2 | void getMatches(int, int*, int*);
3 | 


--------------------------------------------------------------------------------
/driveGnuPlotStreams.pl:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/perl -w
  2 | use strict;
  3 | 
  4 | sub usage {
  5 | 	print "Usage: $0 <options>\n";
  6 | 	print <<OEF;
  7 | where options are (in order):
  8 | 
  9 |   NumberOfStreams                        How many streams to plot (M)
 10 | 
 11 |   NumberOfWindows                        How many windows to use (N)
 12 | 
 13 |   Window0_WindowSampleSize               this many samples per window for window0
 14 |   <Window1_WindowSampleSize>             ...for window1
 15 | ...
 16 |   <WindowN_WindowSampleSize>             ...for windowN
 17 | 
 18 |   Window0_YRangeMin Window0_YRangeMax    Min and Max values for window0
 19 |   <Window1_YRangeMin Window1_YRangeMax>  ...for window1
 20 | ...
 21 |   <WindowN_YRangeMin WindowN_YRangeMax>  ...for windowN
 22 | 
 23 |   Window0_geometry			 WIDTHxHEIGHT+XOFF+YOFF (in pixels)
 24 |   <Window1_geometry>                     ...for window1
 25 | ...
 26 |   <WindowN_geometry>                     ...for windowN
 27 | 
 28 |   Stream0_Title                          Title used for stream 0
 29 |   <Stream1_Title>                        ...for stream1
 30 | ...
 31 |   <StreamN_Title>                        ...for streamM
 32 | 
 33 |   WindowNumber0                          Window into which stream 0 is plotted
 34 |   <WindowNumber1>                        ... for stream1
 35 | ...
 36 |   <WindowNumberM>                        ... for streamM
 37 | OEF
 38 | 	exit(1);
 39 | }
 40 | 
 41 | sub WrongParameter {
 42 | 	my $cause = shift;
 43 | 	print "Expected parameter missing ($cause)...\n\n";
 44 | 	usage;
 45 | 	exit(1);
 46 | }
 47 | 
 48 | 
 49 | sub main {
 50 | 	my $argIdx = 0;
 51 | 	my $numberOfStreams = shift or WrongParameter("number of streams");
 52 |     my $numberOfWindows = shift or WrongParameter("number of windows");
 53 | 	print "Will display $numberOfStreams Streams in $numberOfWindows windows...\n";
 54 | 	my @sampleSizes;
 55 | 	for(my $i=0; $i<$numberOfWindows; $i++) {
 56 | 		my $samples = shift or WrongParameter("sample size $i");
 57 | 		push @sampleSizes, $samples;
 58 | 		print "Window $i will use a window of $samples samples\n";
 59 | 	}
 60 | 	my @ranges;
 61 | 	for(my $i=0; $i<$numberOfWindows; $i++) {
 62 | 		my $miny = shift;
 63 | 		WrongParameter("min y of window $i") if !defined($miny);
 64 | 		my $maxy = shift;
 65 | 		WrongParameter("max y of window $i") if !defined($maxy);
 66 | 		push @ranges, [ $miny, $maxy ];
 67 | 		print "Window $i will use a range of [$miny, $maxy]\n";
 68 | 	}
 69 | 	my @geometries;
 70 | 	for(my $i=0; $i<$numberOfWindows; $i++) {
 71 | 		my $geometry = shift or WrongParameter("geometry $i");
 72 | 		push @geometries, $geometry;
 73 | 		print "Window $i will use a geometry of '$geometry'\n";
 74 | 	}
 75 | 	my @titles;
 76 | 	for(my $i=0; $i<$numberOfStreams; $i++) {
 77 | 		my $title = shift or WrongParameter("title $i");
 78 | 		push @titles, $title;
 79 | 		print "Stream $i will use a title of '$title'\n";
 80 | 	}
 81 |     my @streams;    # streams in a window
 82 |     my @windows;    # window of a stream
 83 |     for(my $i=0; $i<$numberOfStreams; $i++) {
 84 |         my $window = shift;
 85 | 		WrongParameter("window of stream $i") if !defined $window;
 86 |         push @{$streams[$window]}, $i;
 87 |         $windows[$i] = $window;
 88 |         print "Stream $i will be plotted in window $window\n";
 89 |     }
 90 |     # check that every window has a stream
 91 | 	for my $windowIdx(0..$numberOfWindows-1) {
 92 | 		if (!defined($streams[$windowIdx]) or @{$streams[$windowIdx]} == 0) {
 93 | 			warn "Warning: Window $windowIdx has no streams!\n";
 94 | 		}
 95 | 	}
 96 | 	my @gnuplots;
 97 | 	my @buffers;
 98 | 	my @xcounters;
 99 |     for (0..$numberOfStreams-1) {
100 | 		my @data = [];
101 | 		push @buffers, @data;
102 | 		push @xcounters, 0;
103 |     }
104 | 	for(my $i=0; $i<$numberOfWindows; $i++) {
105 | 		local *PIPE;
106 | 		my $geometry = $geometries[$i];
107 | 		open PIPE, "|gnuplot -geometry $geometry" || die "Can't initialize gnuplot number ".($i+1)."\n";
108 | 		select((select(PIPE), $| = 1)[0]);
109 | 		push @gnuplots, *PIPE;
110 | 		print PIPE "set xtics\n";
111 | 		print PIPE "set ytics\n";
112 | #		print PIPE "set yrange [".($ranges[$i]->[0]).":".($ranges[$i]->[1])."]\n";
113 | 		print PIPE "set style data lines\n";
114 | 		print PIPE "set grid\n";
115 | 		print PIPE "set term x11\n";
116 | 	}
117 | 	my $streamIdx = 0;
118 | 	# replace @ARGV with remaining args for <> below
119 | 	@ARGV = @_;
120 | 	while(<>) {
121 | 		chomp;
122 | 		my @parts = split /:/;
123 | 		#print "$.: parts=", join("-", @parts), "\n";
124 | 		$streamIdx = $parts[0];
125 |         my $windowIdx = $windows[$streamIdx];
126 |         my $buf = $buffers[$streamIdx];
127 | 		my $pip = $gnuplots[$windowIdx];
128 | 		# data buffering (up to stream sample size)
129 | 		my $xcounter = $xcounters[$streamIdx];
130 | 		push @{$buf}, "$xcounter $parts[1]";
131 | 		$xcounters[$streamIdx]++;
132 |         my $max_xcounter = $xcounter;
133 | 		my $q = 0;
134 |         for my $stream (@{$streams[$windowIdx]}) {
135 |             if ($xcounters[$stream] > $max_xcounter) {
136 |                 $max_xcounter = $xcounters[$stream];
137 | 				$q = 1;
138 |             }
139 |         }
140 | 		my $plotInterval = 15;
141 | 		if ($max_xcounter % $plotInterval != $plotInterval-1 && $q == 1) {
142 | 			next;
143 | 		}
144 | 
145 | 		print $pip "set xrange [".($max_xcounter-$sampleSizes[$windowIdx]).":".($max_xcounter)."]\n";
146 |         my @plots;
147 |         for my $stream (@{$streams[$windowIdx]}) {
148 | 			if (@{$buffers[$stream]} > 0) {
149 | 				push @plots, "\"-\" title '$titles[$stream]'";
150 | 			}
151 |         }
152 | 		print $pip "plot ", join(", ", @plots), "\n";
153 |         for my $stream (@{$streams[$windowIdx]}) {
154 | 			if (@{$buffers[$stream]} > 0) {
155 | 				for my $elem (reverse @{$buffers[$stream]}) {
156 | 					print $pip "$elem\n";
157 | 				}
158 | 				print $pip "e\n";
159 | 			}
160 |         }
161 | 		if (scalar(@{$buf})>$sampleSizes[$windowIdx]) {
162 | 			shift @{$buf};
163 | 		}
164 | 	}
165 | 	for(my $i=0; $i<$numberOfWindows; $i++) {
166 | 		my $pip = $gnuplots[$i];
167 | 		print $pip "exit;\n";
168 | 		close $pip;
169 | 	}
170 | }
171 | 
172 | main @ARGV;
173 | 


--------------------------------------------------------------------------------
/gpuFacade.cpp:
--------------------------------------------------------------------------------
  1 | #include <vector>
  2 | #include <iostream>
  3 | #include <stdio.h>
  4 | #include <time.h>
  5 | #include "cuda.h"
  6 | #include "cuda_runtime.h"
  7 | #include "opencv2/opencv.hpp"
  8 | using namespace std;
  9 | using namespace cv;
 10 | #include "latch.h"
 11 | #include "bitMatcher.h"
 12 | #include "gpuFacade.hpp"
 13 | 
 14 | // images
 15 | // keypoints
 16 | // descriptors
 17 | // matches
 18 | 
 19 | using namespace std;
 20 | 
 21 | #define cudaCalloc(A, B) \
 22 |     do { \
 23 |         cudaError_t __cudaCalloc_err = cudaMalloc(A, B); \
 24 |         if (__cudaCalloc_err == cudaSuccess) cudaMemset(*A, 0, B); \
 25 |     } while (0)
 26 | 
 27 | #define checkError(ans) { gpuAssert((ans), __FILE__, __LINE__); }
 28 | inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=true) {
 29 |    if (code != cudaSuccess) {
 30 |       fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
 31 |       if (abort) exit(code);
 32 |    }
 33 | }
 34 | 
 35 | #define checkLaunchError()                                            \
 36 | do {                                                                  \
 37 |     /* Check synchronous errors, i.e. pre-launch */                   \
 38 |     cudaError_t err = cudaGetLastError();                             \
 39 |     if (cudaSuccess != err) {                                         \
 40 |         fprintf (stderr, "Cuda error in file '%s' in line %i : %s.\n",\
 41 |                  __FILE__, __LINE__, cudaGetErrorString(err) );       \
 42 |         exit(EXIT_FAILURE);                                           \
 43 |     }                                                                 \
 44 |     /* Check asynchronous errors, i.e. kernel failed (ULF) */         \
 45 |     err = cudaThreadSynchronize();                                    \
 46 |     if (cudaSuccess != err) {                                         \
 47 |         fprintf (stderr, "Cuda error in file '%s' in line %i : %s.\n",\
 48 |                  __FILE__, __LINE__, cudaGetErrorString( err) );      \
 49 |         exit(EXIT_FAILURE);                                           \
 50 |     }                                                                 \
 51 | } while (0)
 52 | 
 53 | gpuFacade::~gpuFacade() {
 54 |     // cudaFreeArray(patchTriplets); // This crashes..?
 55 |     cudaFree(d_K);
 56 |     cudaFree(d_D1);
 57 |     cudaFree(d_D2);
 58 |     cudaFree(d_M1);
 59 |     cudaFree(d_M2);
 60 |     cudaFreeHost(h_K1);
 61 |     cudaFreeHost(h_K2);
 62 |     cudaDeviceReset();
 63 | }
 64 | 
 65 | gpuFacade::gpuFacade(int maxKeypoints, int input_WIDTH, int input_HEIGHT, int imageSlots) {
 66 |     maxKP = maxKeypoints;
 67 |     WIDTH = input_WIDTH;
 68 |     HEIGHT = input_HEIGHT;
 69 | 
 70 |     cudaEventCreate(&start);
 71 |     cudaEventCreate(&stop);
 72 | 
 73 |     // Sizes for device and host pointers
 74 |     sizeK = maxKP * sizeof(float) * 4; // K for keypoints
 75 |     sizeI = WIDTH * HEIGHT * sizeof(unsigned char); // I for Image
 76 |     sizeD = maxKP * (2048 / 32) * sizeof(unsigned int); // D for Descriptor
 77 |     sizeM = maxKP * sizeof(int); // M for Matches
 78 |     sizeMask = 64 * sizeof(float);
 79 | 
 80 |     // Host pointers
 81 |     cudaMallocHost((void **) &h_K1, sizeK);
 82 |     cudaMallocHost((void **) &h_K2, sizeK);
 83 |     h_M1 = (int*) malloc(sizeM);
 84 |     h_M2 = (int*) malloc(sizeM);
 85 |     for (int i=0; i<64; i++) { h_mask[i] = 1.0f; }
 86 | 
 87 |     // Device pointers
 88 |     cudaCalloc((void **) &d_K, sizeK);
 89 |     cudaCalloc((void **) &d_D1, sizeD);
 90 |     cudaCalloc((void **) &d_D2, sizeD);
 91 |     cudaCalloc((void **) &d_M1, sizeM);
 92 |     cudaCalloc((void **) &d_M2, sizeM);
 93 |     cudaCalloc((void **) &d_mask, sizeM);
 94 | 
 95 |     // The patch triplet locations for LATCH fits in texture memory cache.
 96 |     initPatchTriplets(patchTriplets);
 97 |     initImage(&d_I, WIDTH, HEIGHT, &pitch);
 98 |     initMask(&d_mask, h_mask);
 99 | 
100 |     // Events allow asynchronous, nonblocking launch of subsequent kernels after a given event has happened,
101 |     // such as completion of a different kernel on a different stream.
102 |     cudaEventCreate(&latchFinished);
103 |     // You should create a new stream for each bitMatcher kernel you want to launch at once.
104 |     cudaStreamCreate(&streamKP1);
105 |     cudaStreamCreate(&streamKP2);
106 | }
107 | 
108 | void gpuFacade::LATCH(
109 |                 Mat img,
110 |                 unsigned int* d_descriptor,
111 |                 int* keypoints,
112 |                 vector<KeyPoint>* vectorKP) {
113 |     latch( img, d_I, pitch, h_K1, d_descriptor, keypoints, maxKP, d_K, vectorKP, d_mask, latchFinished );
114 | }
115 | 
116 | void gpuFacade::match(
117 |                 unsigned int* d_descriptorQ,
118 |                 unsigned int* d_descriptorT,
119 |                 int numKP_Q,
120 |                 int numKP_T,
121 |                 int* d_matches,
122 |                 int threshold,
123 |                 cudaStream_t stream) {
124 |     bitMatcher( d_descriptorQ, d_descriptorT, numKP_Q, numKP_T, maxKP, d_matches, threshold, stream, latchFinished );
125 | }
126 | 
127 | void gpuFacade::getResults(int* h_matches, int* d_matches) {
128 |     getMatches(maxKP, h_matches, d_matches);
129 | }
130 | 


--------------------------------------------------------------------------------
/gpuFacade.hpp:
--------------------------------------------------------------------------------
 1 | class gpuFacade {
 2 |     public:
 3 |         int WIDTH, HEIGHT;
 4 |         void set_values (int,int);
 5 |         int area();
 6 | 
 7 |         int maxKP;
 8 |         cudaEvent_t start, stop;
 9 |         vector<KeyPoint> keypoints0, keypoints1, keypoints2;
10 |         vector<DMatch> goodMatches;
11 |         vector<Point2f> p1, p2; // Point correspondences for recovering pose.
12 |         int numKP0, numKP1, numKP2; // The actual number of keypoints we are dealing with: just keypoints#.size(), but capped at maxKP.
13 |         size_t sizeK; // K for keypoints
14 |         size_t sizeI; // I for Image
15 |         size_t sizeD; // D for Descriptor
16 |         size_t sizeM; // M for Matches
17 |         size_t sizeMask;
18 |         float *h_K1, *h_K2;
19 |         // For reasons opaque to me, allocating both (but not either) h_M1 or h_M2
20 |         // with cudaMallocHost segfaults, apparently after graceful exit? So neither of them are pinned.
21 |         int* h_M1;
22 |         int* h_M2;
23 |         float h_mask[64];
24 |         unsigned char *d_I;
25 |         unsigned int *d_D1, *d_D2, *uIntSwapPointer;
26 |         int *d_M1, *d_M2;
27 |         float *d_K, *d_mask;
28 |         cudaArray* patchTriplets;
29 |         size_t pitch;
30 |         cudaEvent_t latchFinished;
31 |         cudaStream_t streamKP1, streamKP2;
32 | 
33 |         void LATCH(cv::Mat, unsigned int*, int*, std::vector<cv::KeyPoint>*);
34 |         void match( unsigned int*,
35 |                     unsigned int*,
36 |                     int,
37 |                     int,
38 |                     int*,
39 |                     int,
40 |                     cudaStream_t);
41 |         void getResults(int* h_matches, int* d_matches);
42 |         gpuFacade(int, int, int, int);
43 |         ~gpuFacade();
44 | };
45 | 


--------------------------------------------------------------------------------
/latch.cu:
--------------------------------------------------------------------------------
  1 | // Uncomment below define to use sum of squared differences instead of sum of absolute differences.
  2 | // #define use_SAD // You can keep this commented.
  3 | // Uncomment below define to use an importance mask on each patch comparison.
  4 | // #define use_mask // You can keep this commented.
  5 | 
  6 | #include <vector>
  7 | #include <stdio.h>
  8 | #include <iostream>
  9 | #include <cuda_runtime.h>
 10 | #include "opencv2/opencv.hpp"
 11 | using namespace std;
 12 | using namespace cv;
 13 | 
 14 | #define _warpSize (32)
 15 | #define _warpSizef (32.0f)
 16 | #define warpsPerBlock (32)
 17 | // Region of interest
 18 | #define roiWidth (64)
 19 | #define roiHeight (64)
 20 | // Minimal amount that avoids shared memory bank conflicts
 21 | #define roiWidthPadding (4)
 22 | // The numbers loaded into the oracle assume patchSize==8. If you really want canonical LATCH, you can set the mask to ignore those pixels.
 23 | #define patchSize (8)
 24 | #define bitsPerDescriptor (512)
 25 | // With further work this should be decreased to 0 for even faster matching.
 26 | #define paddingBitsPerDescriptor (1536)
 27 | #define bitsPerUInt32 (32)
 28 | #define deg2rad (0.0174533f)
 29 | #define negDeg2rad (-0.0174533f)
 30 | #define inv64 (0.015625f)
 31 | #define CHECK_BORDER (0)
 32 | 
 33 | // Used to store the oracle of patch triplets.
 34 | texture<int, cudaTextureType2D, cudaReadModeElementType> patchTriplets;
 35 | texture<unsigned char, 2, cudaReadModeNormalizedFloat> image;
 36 | 
 37 | __constant__ int triplets[3*512];
 38 | 
 39 | #define cudaCalloc(A, B) \
 40 |     do { \
 41 |         cudaError_t __cudaCalloc_err = cudaMalloc(A, B); \
 42 |         if (__cudaCalloc_err == cudaSuccess) cudaMemset(*A, 0, B); \
 43 |     } while (0)
 44 | 
 45 | #define checkError(ans) { gpuAssert((ans), __FILE__, __LINE__); }
 46 | inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=true) {
 47 |    if (code != cudaSuccess) {
 48 |       fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
 49 |       if (abort) exit(code);
 50 |    }
 51 | }
 52 | 
 53 | #define checkLaunchError() /* Exits if a kernel launch failed or the device reports an error after a full sync. */ \
 54 | do {                                                                  \
 55 |     /* Check synchronous errors, i.e. pre-launch */                   \
 56 |     cudaError_t err = cudaGetLastError();                             \
 57 |     if (cudaSuccess != err) {                                         \
 58 |         fprintf (stderr, "Cuda error in file '%s' in line %i : %s.\n",\
 59 |                  __FILE__, __LINE__, cudaGetErrorString(err) );       \
 60 |         exit(EXIT_FAILURE);                                           \
 61 |     }                                                                 \
 62 |     /* Check asynchronous errors, i.e. kernel failed (ULF). cudaDeviceSynchronize replaces the deprecated cudaThreadSynchronize. */ \
 63 |     err = cudaDeviceSynchronize();                                    \
 64 |     if (cudaSuccess != err) {                                         \
 65 |         fprintf (stderr, "Cuda error in file '%s' in line %i : %s.\n",\
 66 |                  __FILE__, __LINE__, cudaGetErrorString( err) );      \
 67 |         exit(EXIT_FAILURE);                                           \
 68 |     }                                                                 \
 69 | } while (0)
 70 | 
 71 |  // Computes one descriptor per block: blockIdx.x selects the keypoint (5 floats in g_K) and the 64-word slot in g_D. Launch as 32x32 threads, one block per keypoint.
 72 | __global__ void __launch_bounds__(1024, 2)
 73 |                 latch(  const float *g_K,
 74 |                         unsigned int *g_D,
 75 |                         const int imgWidth,
 76 |                         const int imgHeight,
 77 |                         const float *g_mask/*,
 78 |                         const float *g_oriented*/) {
 79 |     volatile __shared__ int s_kpOffset[2];
 80 |     volatile __shared__ float s_mask[64];
 81 |     volatile __shared__ float s_stride[4];
 82 |     volatile __shared__ float s_roi[roiHeight][roiWidth + roiWidthPadding][1]; // It is faster to operate on floats, even if our image is unsigned char! (we can't guarantee media instructions are available)
 83 |     volatile __shared__ unsigned int s_out[warpsPerBlock];
 84 |     {
 85 |         register float mask0, mask1;
 86 |         if (threadIdx.y == 0 && threadIdx.x < 2) { // 2 threads, 2 coordinates
 87 |             register float k;
 88 |             k = g_K[blockIdx.x*5 + threadIdx.x];
 89 |             if (CHECK_BORDER && (k < _warpSize || (threadIdx.x == 0 && imgWidth-_warpSize < k) || (threadIdx.x == 1 && imgHeight-_warpSize < k))) {
 90 |                 k = -999999; // If too near boundary, make sure the kpOffset will be negative, so everyone gets the signal to bail.
 91 |             }
 92 |             s_kpOffset[threadIdx.x] = k + 0.5f;
 93 |         }
 94 |         if (threadIdx.y == 1 && threadIdx.x < 4) {
 95 |             register float l = 0.0f; // Just a temp variable... thread 0 has major axis, 1 has minor axis, 2 has angle. Zero-initialized so lane 3 never passes an indeterminate value to __shfl.
 96 |             if (threadIdx.x < 3) {
 97 |                 l = g_K[blockIdx.x*5 + 2 + threadIdx.x];
 98 |             }
 99 |             register float c, s, t; // cos and sin and theta
100 |             t = __shfl(l, 2);
101 |             __sincosf(negDeg2rad*t, &s, &c);
102 | 
103 |             const register float a = __shfl(l, 0);
104 |             const register float b = __shfl(l, 1);
105 | 
106 |             const register float p = ((threadIdx.x & 1) == 0) ? a : b;
107 |             const register float q = (threadIdx.x == 1 || threadIdx.x == 2) ? s : c;
108 | 
109 |             s_stride[threadIdx.x] = p*q*inv64;
110 |         }
111 |         if (threadIdx.y == 2) {
112 |             mask0 = g_mask[threadIdx.x];
113 |         }
114 |         if (threadIdx.y == 3) {
115 |             mask1 = g_mask[_warpSize + threadIdx.x];
116 |         }
117 |         __threadfence_block();
118 |         __syncthreads();
119 |         const register int x = s_kpOffset[0];
120 |         const register int y = s_kpOffset[1];
121 |         if (x < 0 || y < 0) {
122 |             return; // This is the case if our keypoint is within boundary of the edge of the image. 5000 blocks returning in this way takes ~300 microseconds.
123 |         }
124 |         const register float r = (threadIdx.x < 4) ? s_stride[threadIdx.x] : 0;
125 |         
126 |         const register float r11 =  __shfl(r, 0);
127 |         const register float r12 =  __shfl(r, 1);
128 |         const register float r21 = -__shfl(r, 2);
129 |         const register float r22 =  __shfl(r, 3);
130 |         // const register float c = s_stride[0];
131 |         // const register float s = s_stride[1];
132 | 
133 |         // 64 by 64 region of interest means four 32 by 32 loads.
134 |         if (threadIdx.y == 2) {
135 |             s_mask[threadIdx.x] = mask0;
136 |         }
137 |         if (threadIdx.y == 3) {
138 |             s_mask[threadIdx.x + _warpSize] = mask1;
139 |         }
140 | 
141 |         // const register float cN = c * -_warpSizef;
142 |         // const register float sN = s * -_warpSizef;
143 |         //
144 |         // const register float cu = c * threadIdx.x;
145 |         // const register float su = s * threadIdx.x;
146 |         // const register float cv = c * threadIdx.y;
147 |         // const register float sv = s * threadIdx.y;
148 |         //
149 |         // const register float cuN = cu + cN;
150 |         // const register float suN = su + sN;
151 |         // const register float cvN = cv + cN;
152 |         // const register float svN = sv + sN;
153 | 
154 |         const register float nx = threadIdx.x - _warpSizef;
155 |         const register float ny = threadIdx.y - _warpSizef;
156 | 
157 | 
158 |         s_roi[threadIdx.y            ][threadIdx.x            ][0] = /*(unsigned char)*/ ( 256.0f * tex2D(image, min((float)imgWidth-1.0f, r11*nx          + r12*ny          + x), min((float)imgHeight-1.0f, r21*nx          + r22*ny           + y)));
159 |         s_roi[threadIdx.y            ][threadIdx.x + _warpSize][0] = /*(unsigned char)*/ ( 256.0f * tex2D(image, min((float)imgWidth-1.0f, r11*threadIdx.x + r12*ny          + x), min((float)imgHeight-1.0f, r21*threadIdx.x + r22*ny           + y)));
160 |         s_roi[threadIdx.y + _warpSize][threadIdx.x            ][0] = /*(unsigned char)*/ ( 256.0f * tex2D(image, min((float)imgWidth-1.0f, r11*nx          + r12*threadIdx.y + x), min((float)imgHeight-1.0f, r21*nx          + r22*threadIdx.y  + y)));
161 |         s_roi[threadIdx.y + _warpSize][threadIdx.x + _warpSize][0] = /*(unsigned char)*/ ( 256.0f * tex2D(image, min((float)imgWidth-1.0f, r11*threadIdx.x + r12*threadIdx.y + x), min((float)imgHeight-1.0f, r21*threadIdx.x + r22*threadIdx.y  + y)));
162 |     }
163 |     register unsigned int out = 0;
164 |     const register int wrappedX =      threadIdx.x % patchSize; // Offset for patch, interlaced to decrease padding needed for shared memory bank conflict avoidance
165 |     const register int wrappedY = 2 * (threadIdx.x / patchSize); // Each thread will use both wrappedY and wrappedY+1
166 |     __syncthreads();
167 |     __threadfence_block();
168 |     // if (blockIdx.x == 0) {
169 |     //     g_out[(threadIdx.y            )*64 + (threadIdx.x            )] = s_roi[threadIdx.y            ][threadIdx.x            ][0];
170 |     //     g_out[(threadIdx.y            )*64 + (threadIdx.x + _warpSize)] = s_roi[threadIdx.y            ][threadIdx.x + _warpSize][0];
171 |     //     g_out[(threadIdx.y + _warpSize)*64 + (threadIdx.x + _warpSize)] = s_roi[threadIdx.y + _warpSize][threadIdx.x + _warpSize][0];
172 |     //     g_out[(threadIdx.y + _warpSize)*64 + (threadIdx.x            )] = s_roi[threadIdx.y + _warpSize][threadIdx.x            ][0];
173 |     // }
174 |     // __syncthreads();
175 |     // __threadfence_block();
176 | 
177 | 
178 |     const register float mask0 = s_mask[threadIdx.x];
179 |     const register float mask1 = s_mask[threadIdx.x + _warpSize];
180 | 
181 |     register int nextCoord = tex2D(patchTriplets, threadIdx.x, threadIdx.y); // This access is hardware cached.
182 |     // register int offset = threadIdx.y * 3 * 16;
183 |     // register int nextAleph = triplets[offset  ];
184 |     // register int nextTavek = triplets[offset+1];
185 |     // register int nextBet   = triplets[offset+2];
186 |     #pragma unroll
187 |     for (register int i=0; i<16; ) {
188 |         const register int coord = nextCoord;
189 |         if (i!=16-4) nextCoord = tex2D(patchTriplets, 6*(i+4) + threadIdx.x, threadIdx.y); // This access is hardware cached.
190 |         #pragma unroll
191 |         for (register int j=0; j<4; j++, i++) {
192 |             // offset += 3;
193 |             // const register int alephIndexX = nextAleph & 255;
194 |             // const register int alephIndexY = nextAleph >> 8;
195 |             // nextAleph = triplets[offset];
196 |             // const register int tavekIndexX = nextTavek & 255;
197 |             // const register int tavekIndexY = nextTavek >> 8;
198 |             // nextTavek = triplets[offset+1];
199 |             // const register int betIndexX = nextBet & 255;
200 |             // const register int betIndexY = nextBet >> 8;
201 |             // nextBet = triplets[offset+2];
202 | 
203 |             const register int alephIndexX = __shfl(coord, 6*j  );
204 |             const register int alephIndexY = __shfl(coord, 6*j+1);
205 |             const register int tavekIndexX = __shfl(coord, 6*j+2);
206 |             const register int tavekIndexY = __shfl(coord, 6*j+3);
207 |             const register int betIndexX = __shfl(coord, 6*j+4);
208 |             const register int betIndexY = __shfl(coord, 6*j+5);
209 |             const register int bitIndex = 16*(threadIdx.y & 1) + i;
210 |             const register int outThread = 0;
211 | 
212 |             // This assumes an 8x8 patch. As there are only 32 threads per warp, each thread will pull two values from each thread.
213 |             // The access pattern is interleaved to decrease the amount of shared memory padding necessary to avoid bank conflicts:
214 |             //      each thread pulls a verticle pair from each patch.
215 |             const register int tavek0 = s_roi[tavekIndexY + wrappedY  ][tavekIndexX + wrappedX][0]; // Tavek means "between".
216 |             const register int tavek1 = s_roi[tavekIndexY + wrappedY+1][tavekIndexX + wrappedX][0]; // It is our root patch.
217 |             const register int aleph0 = s_roi[alephIndexY + wrappedY  ][alephIndexX + wrappedX][0]; // Aleph is "A"
218 |             const register int aleph1 = s_roi[alephIndexY + wrappedY+1][alephIndexX + wrappedX][0]; // Similarity to aleph is denoted by a bit set to 0
219 |             const register int bet0   = s_roi[betIndexY   + wrappedY  ][betIndexX   + wrappedX][0]; // Bet is "B"
220 |             const register int bet1   = s_roi[betIndexY   + wrappedY+1][betIndexX   + wrappedX][0]; // Similarity to bet is denoted by a bit set to 1
221 | 
222 |             // Now we compute the sum of squared differences between both patch pairs.
223 |             // First, differences:
224 |             register int alephDiff0 = (tavek0 - aleph0);
225 |             register int alephDiff1 = (tavek1 - aleph1);
226 |             register int betDiff0   = (tavek0 - bet0);
227 |             register int betDiff1   = (tavek1 - bet1);
228 |             // Then, squared differences
229 |             alephDiff0 *= alephDiff0;
230 |             alephDiff1 *= alephDiff1;
231 |             betDiff0 *= betDiff0;
232 |             betDiff1 *= betDiff1;
233 | 
234 |             alephDiff0 *= mask0;
235 |             alephDiff1 *= mask1;
236 |             betDiff0 *= mask0;
237 |             betDiff1 *= mask1;
238 | 
239 |             alephDiff0 += alephDiff1; // Merge both interleaved squared differences, to make upcoming warp reduction faster
240 |             betDiff0   += betDiff1;
241 | 
242 |             alephDiff0 -= betDiff0; // Easiest to just take this difference now, then reduce, then compare to 0. Same as reduce then compare relative to each other.
243 |             alephDiff0 += __shfl_xor(alephDiff0,  1);
244 |             alephDiff0 += __shfl_xor(alephDiff0,  2);
245 |             alephDiff0 += __shfl_xor(alephDiff0,  4);
246 |             alephDiff0 += __shfl_xor(alephDiff0,  8);
247 |             alephDiff0 += __shfl_xor(alephDiff0, 16); // By xor shfling, every thread has the resulting sum.
248 | 
249 |             // One thread sets a specific bit high if tavek is closer to bet.
250 |             if (alephDiff0 < 0 && threadIdx.x == outThread) {
251 |                 out |= (1<<bitIndex);
252 |             }
253 |         }
254 |     }
255 | 
256 |     if (threadIdx.x == 0) { // In this case, only thread 0 ever has important data.
257 |         s_out[threadIdx.y] = out;
258 |     }
259 |     __syncthreads();
260 |     __threadfence_block();
261 |     if (threadIdx.y == 0) {
262 |         out = s_out[threadIdx.x]; // Warp 0 now has all the data we need to output.
263 |         // No __syncthreads() here: only warp 0 takes this branch, so a barrier would be divergent; the barrier above already made s_out visible.
264 |         __threadfence_block();
265 | 
266 |         out |= __shfl_down(out,  1, _warpSize); // Each warp computed half a 32 bit word. Merge them before output.
267 |         out = __shfl(out, 2*threadIdx.x); // Only even threads have useful data after above shfl_down.
268 | 
269 |         if (threadIdx.x < bitsPerDescriptor / bitsPerUInt32) { // 512 / 32 = 16
270 |             g_D[((paddingBitsPerDescriptor + bitsPerDescriptor) / bitsPerUInt32)*blockIdx.x + threadIdx.x] = out; // And that's it, with this write to global memory we're done with this descriptor.
271 |         }
272 |     }
273 | }
274 | 
275 | 
276 | void initPatchTriplets(cudaArray* cuArray) { // Uploads the patch-triplet table and binds it to the patchTriplets texture. NOTE(review): cuArray is passed by value, so the handle allocated below never reaches the caller.
277 |     const int h_data[]= { 41, 22, 47, 47, 51, 24, 32, 44, 52, 17, 32,  7, 50, 14, 26,  8, 51, 33, 45, 18, 30, 38, 42, 10,  6, 30, 16, 40,  6, 49, 39, 34, 35, 43, 31, 17, 21, 44, 18, 14, 25, 37, 23, 29, 12, 44, 19,  7,  9, 30, 26, 19,  6, 52, 47, 40, 27,  9, 43, 19, 35, 26, 50,  5, 41, 48, 25, 37, 11, 27, 23,  9, 25, 14, 33,  7, 38, 47, 40, 19, 52, 48, 48,  8, 23, 46, 47, 39, 22, 12, 50, 35, 29, 20, 18, 34, 47, 24, 31, 36, 26, 47, 11, 38, 17, 16,  7, 11, 52, 15, 46, 14, 42,  9,  4, 13, 43, 14,  5, 17, 22, 50, 27, 17, 34, 14, 44, 46, 38,  5, 48, 32, 51, 36, 32, 35, 45,  9, 26,  7, 17, 10, 25, 35,  5, 38, 17, 33, 12, 47,  4, 32, 43, 12,  9, 23,  9, 24, 27, 33,  8, 30, 48, 40, 39,  4, 37, 50, 37, 41, 22,  5, 38, 43,  6, 20, 24, 23, 13, 48, 22, 15, 29, 44, 22, 51, 10, 25, 20, 13, 10, 33, 42, 16, 37, 41, 47, 40,  6, 44, 27, 47, 44, 16, 29, 36, 27, 24, 25, 35, 31, 43, 51,  5, 33, 19, 30, 21, 42, 15, 34, 48, 10, 39, 44, 18, 16, 32, 13, 30, 19, 49,  7, 48, 25, 33,  6, 51, 21,  6, 11, 41, 52, 14,  4,  4, 52, 43, 31,  6, 12, 35, 14,  8, 29, 21, 16, 26, 47, 45, 28, 46, 16, 21, 16, 38, 36, 33,  7, 10, 13, 37, 41, 31, 10, 11, 28, 33, 39,  6, 36, 46, 49, 30,  6, 11, 43, 31,  6, 13, 46, 51,  5, 49,  4, 44, 38, 25, 20, 27,  9, 47, 50, 51, 14, 26, 48, 43, 26,  9, 47, 13, 18, 40, 28, 19, 19, 12, 41,  6, 44, 44, 28, 14, 20, 15, 27, 48, 23,  6, 21,  5, 24, 38, 27, 48, 44, 27, 15, 12, 52, 10, 10, 40, 36, 47,  4, 14, 13, 52, 34, 30,  7,  6, 48, 26, 36, 28, 45, 18,  9, 49, 21, 48, 14, 31, 47, 45, 28, 12, 10,  9, 11,  9, 40,  5, 16, 20, 37, 38, 37,  5, 49,  4, 37, 47, 13, 10, 35,  9, 33, 31, 25, 12, 32, 26, 43, 18,  4, 44, 52, 39, 45, 44, 19, 29, 46, 13, 39, 23, 28, 52,  8, 16, 14,  9, 52, 12, 19, 22, 50, 14, 30,  6, 44, 39, 51, 27, 32, 18, 48, 50, 18, 19, 45, 41, 15, 15, 13, 41, 39, 37, 15, 37, 50, 43, 30, 46, 16, 18, 31, 51, 46, 43, 48,  4, 35, 22, 44, 39, 36, 29, 41, 44, 52,  8, 37, 24, 20, 25, 45, 52,  9, 45, 39, 34, 23, 50, 42, 18, 23, 17, 13, 18,  6, 37, 35, 46, 16, 36, 41,  4, 
37, 28, 30, 31, 35, 40, 49, 42, 28, 20, 11, 30, 50, 48, 23, 44, 47,  5, 50, 10,  9, 25, 52, 13, 46, 28, 17, 44, 45, 39, 50, 43, 17, 35, 48, 19, 12, 38, 30, 29,  9, 48,  9, 24, 30, 25,  4, 45, 25, 49, 50, 16, 27, 31, 25,  8, 21, 51, 27, 19, 17, 31,  8, 23, 19, 20, 47, 11, 49, 49, 49, 15, 18, 34, 30, 26, 11,  7, 47, 52, 48, 34, 52, 17, 38,  5, 27, 19, 36, 23, 50,  8, 25, 52, 47, 33,  4, 34, 28, 15,  5, 13, 38, 48,  6, 24, 37,  8,  4, 38, 33, 13,  4,  8, 50, 34, 36, 21, 39, 50, 10, 35, 19, 47, 16, 23, 19, 49,  8, 11, 11, 50,  5, 34,  6, 16, 11, 35, 10, 31, 29, 52,  4, 48, 18, 19, 30, 43, 46, 46, 44, 15, 10, 39, 37, 22, 52, 52,  4, 50, 40, 16, 48, 35,  7, 43, 50, 23, 19, 21, 51, 15, 11,  8, 19, 22, 51, 28,  6, 41, 13, 10, 29,  6, 11, 38, 28, 32, 32, 20, 46, 20, 21, 22,  8, 46,  8, 25,  8, 14, 32, 19, 11, 20, 10, 21, 31, 36, 12, 33, 35, 16, 38, 47, 48, 49,  6, 52, 32, 36,  6, 30,  9, 10, 10, 50, 26, 41, 38, 37, 13, 43, 49, 44, 44, 39,  4, 26, 52, 49, 21, 16, 29, 42, 37, 45, 48, 45, 35, 35, 33,  4, 15, 20, 49, 46, 13, 39,  6, 36, 40, 20, 10, 51, 42, 38, 34,  4, 45, 18, 36, 41, 49, 45, 52, 25,  7,  4, 46, 39, 20, 33, 18,  5, 26, 51, 15, 33, 39, 35, 27,  7, 18, 24, 49,  6, 13, 34, 34, 24, 44, 21, 21,  5, 47, 34, 27, 49, 51, 14, 26, 11, 50, 15,  6, 32, 42, 31, 18, 31, 42, 17,  6, 36, 39, 41,  4, 38, 52, 49, 40, 30, 41, 12, 43, 29, 27, 24, 48,  6, 22,  9, 14,  8, 30, 17,  8, 52,  5, 18, 40, 29,  4, 30,  4,  5, 12, 41, 27, 17, 20, 34, 47, 15,  5, 51, 10,  4, 51, 12,  7, 44, 16, 47, 18, 34, 22, 12, 28, 13, 15, 52, 26, 37, 47, 24, 28, 49, 49, 44, 18,  4,  4,  8, 15, 23, 52, 35, 15, 35, 46, 47, 28, 50,  7, 48, 28, 46, 51, 38, 15, 14, 44, 38, 18, 16, 36, 38, 15, 52,  6, 22, 11, 42, 22, 39, 45, 45, 21, 45, 45, 16, 50, 27, 26, 25,  4, 50, 40, 28, 29, 17, 40, 12,  8, 22, 17, 45, 23,  9, 46, 35, 20, 31, 51, 17, 52, 21, 10, 52, 48, 27, 18, 32, 24,  6, 14, 20, 43, 20, 12, 48, 45, 51, 40, 43, 43,  9, 33, 32, 12, 49, 31, 25, 11, 13, 10, 42,  8,  6, 10, 40, 49, 41, 10, 28, 40, 16,  8, 51, 
43, 18, 14, 12,  4, 44, 40, 23, 12, 41, 17, 15, 24, 19, 26, 10, 31, 16,  4, 28, 26, 25, 14, 14, 50, 37,  7, 45, 46, 38, 30, 51, 43, 34, 20, 10, 43, 51, 17, 51,  4, 41, 32, 44,  4, 15, 37, 28, 49,  5, 34,  4,  6, 41, 49, 47,  7, 18,  7, 47, 35, 26, 21, 29, 30,  7, 36, 48, 39, 16, 47,  9, 26, 52, 45, 29, 25, 21, 31, 45, 24, 15,  5, 23, 13, 14, 21, 39, 13,  5, 52, 50, 11, 46, 33, 21, 39,  6, 46, 23, 48, 17,  8, 28, 39, 32, 46, 46, 19, 35, 47, 45, 29, 11, 52,  4, 32, 31,  9,  5, 37, 51, 18, 37, 35, 26, 15, 33, 44, 23, 36, 15, 19,  5, 40, 41, 34,  7, 27, 28, 24, 46, 37, 11,  4,  6, 37, 45,  9, 30, 48, 14,  6, 51, 50, 39, 19, 14, 36, 24, 40,  6, 26, 41, 36, 49, 37, 20, 42, 46, 33, 19, 44, 15, 21, 21, 49, 16, 41, 16, 18, 39, 35, 39, 31, 36, 33, 22, 30, 42, 52,  6, 36, 51, 21, 18, 50, 39, 34, 48, 22, 19, 38, 23, 26, 27, 40, 43, 14, 42,  5, 34, 15, 25, 19, 30, 50, 27,  4, 18, 11, 50, 34, 19, 16, 15, 16, 29, 24, 37, 14, 26, 41, 30, 51, 26, 40, 33, 44, 14, 24,  6, 46, 45, 15, 20, 35, 23,  7, 11, 31, 27, 25, 26, 18, 47, 46, 34, 14, 52, 48, 38, 21,  8,  5, 38, 24, 20,  8, 19, 18, 44, 14,  7, 37, 45, 16, 49, 44, 52, 47,  6, 17, 16, 52,  8, 33, 13, 42, 40, 31, 15, 22, 48, 32, 50, 31,  8,  5, 16, 35, 40,  5, 44, 50, 31, 24, 46, 50, 36, 29, 29, 44, 37, 23, 46, 49, 21, 23, 13, 52, 22, 15, 42, 16, 51, 11, 46, 40, 46, 27, 28, 48, 35, 42, 21, 13, 11, 46, 26, 10, 16, 17, 42, 13,  7, 26,  7, 32, 12, 38, 26, 50,  5,  9, 11, 47, 44, 36, 50, 35,  6, 50,  6, 36, 11, 11, 28, 40, 50, 41, 36, 22, 47, 47, 22, 47, 24,  5, 18, 51, 15, 12,  9, 22, 46,  9, 51, 33, 10, 24, 16, 10,  4, 18, 37, 34, 32, 12,  4, 21, 13, 28, 31, 27, 52, 23, 40, 38,  4, 52, 50, 15, 37, 29, 46, 43, 35, 10, 25, 17,  6, 10, 23, 19, 35, 42, 52, 22, 27, 27,  4, 50, 47, 27, 41,  9, 31, 13, 12, 16, 29, 28, 40, 49, 49, 41,  6,  9, 19, 42, 40,  5, 45, 41, 17, 50, 31, 52, 14,  9, 33, 27, 48, 46, 43, 47,  9, 12, 52, 51, 35,  8, 15, 50, 49,  5, 25,  8, 47, 44, 26,  8,  9, 46, 46, 16, 12, 42, 23, 49, 12,  5, 23, 47, 36, 16, 40,  8, 23, 
21, 22, 18,  4, 25, 46, 17, 23, 36, 42, 30, 25, 37, 34, 52, 30, 26, 22, 52, 16, 35, 20, 28, 36, 25, 49, 50, 29, 45, 16, 42,  5, 47, 10, 27, 51,  7, 18, 22, 27, 42,  6, 19, 19, 48,  4, 12, 17, 49, 47,  4, 37, 20, 45,  9, 21, 16, 25, 47,  4, 13, 28, 27,  7, 19, 50,  7, 24, 51, 32, 25, 39, 37, 32, 18, 38, 38, 32, 20, 35, 33, 13, 49,  5, 37, 16, 11,  7, 26, 13, 11, 13, 49, 40, 37, 51, 29, 19, 49, 48, 47, 22, 33, 27, 12,  7, 47, 25, 16, 43, 42, 31, 26, 30,  8, 11, 25, 12, 13, 15,  7, 39, 10, 49, 23, 11, 33, 39, 51, 35, 19, 45, 48, 22, 39, 14,  7, 51, 47,  7, 19, 22, 51,  4, 12, 35,  6, 49, 35, 40,  9, 16, 25, 47, 51, 38, 25, 10, 26, 50, 20, 12, 23, 51, 42, 49, 50,  9, 34, 19, 22,  4, 16, 15,  5, 37, 32,  7, 42, 24, 51, 46, 37, 35, 22, 34, 50, 28, 42, 15, 36, 52, 44, 14, 30,  8, 32, 15, 39, 17, 42, 16, 51, 35, 38, 49, 42, 24, 42, 50, 41, 14, 39, 16, 49, 47, 48, 20, 31,  8, 51, 15, 51, 51, 32, 46, 26, 38, 17, 48, 29, 49, 34, 43, 42, 25, 44, 52, 36, 17, 46, 51, 49, 25, 43,  5, 33, 23, 35, 37, 16, 24, 42, 46, 29,  4, 39, 19, 27, 38, 12, 18, 21, 50, 14, 33,  6, 46, 13,  4, 27, 36, 11, 44, 32, 28, 11, 52, 19, 20, 45, 25, 15, 49, 52, 38, 40, 40, 31, 13, 49, 34, 27, 23, 47, 47,  7, 13, 40, 42,  4, 13,  4, 38, 33, 17, 12, 22, 44, 20, 23,  8, 43, 21, 24, 48, 23, 40, 19, 48, 10, 40, 38, 14, 14, 52, 45, 16, 27, 41, 46, 47, 15, 50, 30, 37, 14, 47, 41, 16, 33, 46, 32,  4,  5, 23, 27, 17, 47, 41, 42, 17,  7, 20, 50,  6,  4, 49, 20,  7, 33, 42, 39, 24, 19, 18, 44, 30, 47, 16, 20, 42, 50,  5,  6, 15, 29, 24, 11, 32,  7, 38, 33, 31,  9, 46, 25, 10, 41, 43, 47,  5, 26, 40, 51,  9, 27, 18, 43, 21, 28,  8, 35, 28, 11, 27, 23, 43, 12,  8, 39,  7, 30, 13, 32, 30, 25, 33, 32, 26, 25, 14, 41, 50, 13, 47, 37, 11, 24, 46, 49, 35, 26, 33, 43, 50, 35,  5, 47, 42, 39, 42, 52,  5, 39, 34, 45, 49, 20, 15, 43, 39, 16,  5, 38, 36, 20, 17, 40, 23, 12,  9, 46, 22,  8,  4, 27,  6,  4, 19, 11, 16, 37, 47, 12, 52, 42, 19, 22, 35, 48,  5, 35, 47, 52, 28, 37, 51,  5, 50, 39, 35,  4, 50,  7, 28, 20, 42,  8, 51, 
42, 20, 12, 13, 46, 39, 30, 22, 52, 35, 34, 52, 14, 52, 24, 31,  7, 30, 51, 38, 52,  4, 38, 38, 39, 33, 26, 43, 40, 35, 52, 39, 23, 34, 49, 40, 40, 50, 27, 41, 13, 10, 42,  5, 48, 29, 47, 51,  9,  6, 32, 26,  9, 48, 36, 30, 19, 38, 51, 49, 17, 17, 27, 43, 37, 51, 48, 29, 37, 37, 41, 49, 37,  6, 23, 12, 33, 17, 11,  5, 21, 37,  4, 51, 35, 19, 51, 30, 48, 12, 43, 10,  6, 46, 44, 42, 15, 10,  5, 20,  6, 41, 40, 19, 40, 48, 42, 16, 10, 23, 13, 31,  9, 36, 12, 15, 38, 38, 13, 45, 19, 33,  5, 44, 19, 31, 12,  9, 42, 49,  9,  6, 27, 33, 51, 41, 29,  4, 18, 47, 27,  5,  9,  5, 32,  9, 36, 32, 35, 46, 45, 40, 29, 35, 22, 46, 39,  4, 36, 46, 44, 14,  6, 39, 17, 26,  8, 42, 23, 37, 37,  5, 44, 52, 16, 20, 42, 22, 17, 33, 51, 22, 12, 23, 49, 13, 49,  6,  4, 26, 41, 20, 45, 47, 52, 24, 38, 34, 14,  7, 20, 41, 23, 27,  7, 16, 51,  4,  7, 11, 40, 39, 49, 43, 41, 51, 19, 44,  5, 26, 22, 30, 47, 32, 46,  4, 51, 34, 36,  5, 43, 26, 35, 48, 52, 38, 36, 52, 32, 25,  5, 33, 47, 25,  5, 51,  9,  8, 31, 43, 16, 34, 18, 51, 28, 31, 46,  6, 40, 36,  4, 47, 50, 30, 40, 28, 24,  4, 49, 44, 19, 25, 42, 42, 14, 32, 46, 39, 19, 14, 49,  5, 39, 50, 29, 32, 45, 25, 41,  6, 11, 51, 39, 43, 39, 14, 31, 37, 24, 16, 22, 44, 30, 33, 48, 34, 38, 27, 35, 49, 40, 35,  7, 40, 14,  7,  5, 41, 44, 52, 28, 18, 14, 12, 16, 22, 51, 36, 18, 37, 42, 10, 30, 52, 19, 23, 44, 45, 28, 27, 38, 49, 35, 28, 16, 13, 41, 17, 42,  8,  6, 15, 28, 29,  7, 13, 34,  5, 12,  8, 19, 52, 30, 11, 23, 32,  7, 46, 46,  6,  7, 22, 36, 25, 33, 45, 46, 38, 31, 28, 39, 50, 24, 16,  4, 38, 46, 48,  7,  4, 20,  9, 34,  4, 45, 35, 29, 36, 47, 36, 41,  5,  7,  4, 49, 30,  7, 13, 48, 45, 49, 25, 49, 46, 10, 18, 45, 10, 10, 38, 33, 22, 47, 38, 39, 50, 34, 52, 36, 41, 31, 20, 25, 16, 15, 32,  7, 51, 18, 33, 26,  6, 33, 19, 48, 11,  4, 44, 33, 25, 30, 33, 34,  4, 33, 49, 13, 50, 29, 35, 12, 28,  9,  7, 21, 38, 28,  5, 13, 22, 26, 10,  8, 20, 12, 47, 29, 43, 46, 32, 33, 32,  7, 14, 32, 30, 30, 47, 28, 20, 33, 35, 12, 10, 50, 30, 10, 50,  5, 30, 
43,  7,  9, 18, 13, 40, 20, 14,  8, 17, 17, 31, 29, 48,  4, 48, 30, 31, 27, 52, 45, 47,  6, 30, 37,  5,  8, 25, 17, 17, 39,  8, 15,  5, 33, 27, 44, 21, 31, 37, 51, 26, 42, 51, 41, 26, 48, 16, 40, 46, 50, 29, 44,  9, 39, 36, 35, 51, 37, 37, 30,  8, 43,  5, 38, 28, 18, 51, 37, 32,  4, 10, 31, 43, 12, 21, 47, 11, 11, 29, 51, 39, 50, 21, 28, 52, 28, 25,  4,  6, 43, 37, 20, 24, 48, 14, 20, 14, 47, 37, 52, 26, 20, 24, 52, 42, 45, 27, 38,  5, 29, 43, 37, 27, 28,  4, 41, 16, 23, 38, 10, 22,  5, 39,  8, 49, 46, 32, 19, 21, 52, 43, 35, 25, 26, 39, 18,  4, 39, 30, 18, 41, 45, 11, 14, 10, 49, 14, 19, 11, 24, 19, 18, 26, 50,  7, 36, 17, 29, 51, 25, 13,  7,  8, 14, 47, 31, 18, 17, 50, 31,  7,  5, 13, 28, 37,  9, 40,  4, 25, 23, 50,  5, 43, 44, 19,  9, 10, 39, 27, 10, 34, 28,  4, 46,  5, 43, 17,  4, 32, 44, 29, 38,  7, 51, 29, 30, 18, 46, 26, 29, 33, 21,  5, 52, 12, 17,  6, 52,  9, 47, 40,  5, 30, 40, 28, 45, 37, 40, 16, 36, 39, 12, 12, 47, 28,  9,  7, 43, 48, 48,  4, 37, 31, 52, 29, 34, 49, 46,  9,  6, 49, 26, 14, 47, 50, 28, 11, 46, 16, 26, 38,  7, 20, 19, 21, 10, 13,  9, 27, 21,  7,  5, 13,  5, 23, 41, 49, 10, 29, 40, 34, 43, 16, 38,  8, 44, 15,  8, 22, 42, 41, 19, 26, 39, 14, 15, 23, 43, 24, 41, 45, 50, 25, 47, 11, 26, 39,  5, 50, 40, 44, 40, 24, 46, 37, 28, 37, 39,  8, 39, 29, 40, 17, 50, 19, 52,  5, 14, 15, 25, 33, 32, 40, 42, 40, 14, 31, 43, 45, 17,  4, 46,  5, 23, 31, 46, 37, 37, 48, 37, 31,  7, 18, 36, 27,  4,  5, 41, 32, 25,  9, 47, 29, 46, 10, 30,  7, 38, 41, 18, 11, 28, 40, 36, 47, 49, 36, 30,  5,  9, 36, 33, 24, 16, 46, 42, 16, 47,  9, 42, 33, 37, 49,  7,  7, 20, 29, 27, 42, 41, 34, 44,  4, 43, 42, 23, 49, 14, 20, 26, 39, 14,  7,  5, 47, 22, 22, 38, 18,  5, 26, 44, 44, 41, 14, 31, 13, 41,  5, 13, 15, 45, 40,  9, 47, 23, 46, 16, 38, 24, 12,  6, 13, 19, 10, 18, 44, 21, 23, 41, 10, 10,  5,  5, 50, 25,  4, 42, 48, 40, 44, 49, 17, 47, 47, 40, 10, 25, 11, 37, 14,  9, 17, 42, 15,  7, 51, 36, 22, 10, 40, 45, 29, 24, 27, 32, 47, 16, 21, 49, 31,  4, 49, 41, 36, 45, 51, 30, 43, 
49, 24, 32, 44, 13,  8, 29, 34, 44,  6, 34, 39, 46, 16,  4, 27, 10, 36, 15, 26, 44, 22, 27, 21,  8,  8, 37, 18, 13, 34, 45, 44,  9, 45, 47, 28, 10, 20, 43,  5, 40, 22, 29, 39, 49, 13, 34, 47, 38,  4, 12, 51, 27, 20, 11, 14, 39, 30, 27, 35, 42, 26, 39, 48, 27, 24, 25,  5,  9, 48, 17, 26,  8,  4, 39, 16, 33,  7, 26, 15 };
278 |     cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc(32, 0, 0, 0,  	cudaChannelFormatKindSigned); // One signed 32-bit channel per texel.
279 |     cudaMallocArray(&cuArray, &channelDesc, 6*16, 32); // 96 texels wide by 32 rows; the kernel indexes the row with threadIdx.y and reads 6 values per descriptor bit.
280 |     cudaMemcpyToArrayAsync(cuArray, 0, 0, h_data, 6*16*32*sizeof(int), cudaMemcpyHostToDevice); // Copies 3072 ints from h_data. NOTE(review): return codes of these CUDA calls are not checked.
281 | 
282 |     patchTriplets.addressMode[0] = cudaAddressModeBorder;
283 |     patchTriplets.addressMode[1] = cudaAddressModeBorder;
284 |     patchTriplets.filterMode     = cudaFilterModePoint; // Exact texel fetches, no interpolation.
285 |     patchTriplets.normalized     = false; // The kernel addresses texels by integer coordinates.
286 | 
287 |     cudaBindTextureToArray(patchTriplets, cuArray, channelDesc);
288 | 
289 |     // int h_data_packed[3*512];
290 |     // for (int i=0; i<6*512; i+=2) {
291 |     //     h_data_packed[i>>1] = h_data[i] + (h_data[i+1] << 8);
292 |     // }
293 |     // cudaMemcpyToSymbolAsync(triplets, h_data_packed, sizeof(float)*3*512);
294 | }
295 | 
296 | 
297 | void initImage(unsigned char ** d_I, int width, int height, size_t * pitch) { // Allocates the pitched device image (*d_I, *pitch) and binds it to the `image` texture.
298 |     cudaMallocPitch((void**)d_I, pitch, width*sizeof(**d_I), height); // Row width in bytes is one unsigned char per pixel; sizeof(*d_I) would be the size of a pointer.
299 | 
300 |     image.addressMode[0] = cudaAddressModeClamp;
301 |     image.addressMode[1] = cudaAddressModeClamp;
302 |     image.addressMode[2] = cudaAddressModeClamp;
303 |     image.normalized = false; // Texture coordinates are given in pixels.
304 |     image.filterMode = cudaFilterModeLinear;
305 |     size_t tex_ofs; // Receives the binding offset; it is not used afterwards.
306 |     cudaBindTexture2D (&tex_ofs, &image, *d_I, &image.channelDesc, width, height, *pitch);
307 | }
308 | 
309 | void initMask(float** d_mask, float* h_mask) {
310 |     // Reorders the 8x8 host mask in place so that rows 0,2,4,6 come first and
311 |     // rows 1,3,5,7 follow, then uploads the result to a newly allocated device buffer.
312 |     // Meant to run once per mask: calling it again would permute h_mask a second time.
313 |     float packed[64];
314 |     for (int dst=0; dst<64; dst++) {
315 |         const int dstRow = dst / 8;
316 |         const int col    = dst % 8;
317 |         // The first four destination rows take the even source rows, the last four the odd ones.
318 |         const int srcRow = (dstRow < 4) ? 2*dstRow : 2*(dstRow-4) + 1;
319 |         packed[dst] = h_mask[srcRow*8 + col];
320 |     }
321 |     for (int i=0; i<64; i++) {
322 |         h_mask[i] = packed[i];
323 |     }
324 | 
325 |     const size_t maskBytes = 64 * sizeof(float);
326 |     cudaMalloc((void **) d_mask, maskBytes);
327 |     cudaMemcpy(*d_mask, h_mask, maskBytes, cudaMemcpyHostToDevice);
328 |     // *d_mask now points at the packed device copy; h_mask holds the same packed layout.
329 | }
330 | 
331 | float computeGradient(const unsigned char* img, const int width, const int x, const int y) {
332 |     // Estimates the gradient direction at pixel (x, y) of a row-major image
333 |     // with `width` pixels per row, and returns atan2f(dy, dx) in radians.
334 |     //
335 |     // Eight pairs of pixels on opposite sides of (x, y) are differenced; each
336 |     // difference is projected onto the x and y axes with the weights below.
337 |     // No bounds checks are made: samples up to 3 rows and 3 columns away
338 |     // from (x, y) are read.
339 |     const unsigned char* const centre = img + (x + y*width);
340 | 
341 |     // Offset of the first sample of each pair; its partner sits at -offset.
342 |     const int offsets[8] = {
343 |          3*width,
344 |          3*width + 1,
345 |          2*width + 2,
346 |          1*width + 3,
347 |          3,
348 |         -1*width + 3,
349 |         -2*width + 2,
350 |         -3*width + 1
351 |     };
352 | 
353 |     float delta[8];
354 |     for (int i=0; i<8; i++) {
355 |         delta[i] = (centre[offsets[i]] - centre[-offsets[i]]);
356 |     }
357 | 
358 |     // Vertical component; pair 4 is purely horizontal and does not contribute.
359 |     float dy = 0.0f;
360 |     dy += delta[0];
361 |     dy += delta[1] * 3 / sqrt(10);
362 |     dy += delta[2]     / sqrt(2);
363 |     dy += delta[3]     / sqrt(10);
364 |     dy -= delta[5]     / sqrt(10);
365 |     dy -= delta[6]     / sqrt(2);
366 |     dy -= delta[7] * 3 / sqrt(10);
367 | 
368 |     // Horizontal component; pair 0 is purely vertical and does not contribute.
369 |     float dx = 0.0f;
370 |     dx += delta[1]     / sqrt(10);
371 |     dx += delta[2]     / sqrt(2);
372 |     dx += delta[3] * 3 / sqrt(10);
373 |     dx += delta[4];
374 |     dx += delta[5] * 3 / sqrt(10);
375 |     dx += delta[6]     / sqrt(2);
376 |     dx += delta[7]     / sqrt(10); return atan2f(dy, dx);
377 | }
378 | 
379 | void latch( Mat imgMat,
380 |             unsigned char* d_I,
381 |             size_t pitch,
382 |             float* h_K,
383 |             unsigned int* d_D,
384 |             int* keypoints,
385 |             int maxKP,
386 |             float* d_K,
387 |             vector<KeyPoint>* vectorKP,
388 |             float* d_mask,
389 |             cudaEvent_t latchFinished) {
390 |     // Queues the image upload, the keypoint upload and the descriptor kernel, then records latchFinished. *keypoints receives the number of keypoints submitted (at most maxKP).
391 |     const unsigned char* h_I = imgMat.data;
392 |     const int height = imgMat.rows;
393 |     const int width = imgMat.cols;
394 | 
395 |     // All of these calls are non blocking but serialized. The source pitch is the Mat's own row step, so non-continuous (ROI / padded) images upload correctly.
396 |     cudaMemcpy2DAsync(d_I, pitch, h_I, imgMat.step[0], width*sizeof(unsigned char), height, cudaMemcpyHostToDevice);
397 | 
398 |     // Only prep up to maxKP for the GPU (as that is the most we have prepared the GPU to handle). A non-positive maxKP means no keypoints instead of wrapping in an unsigned comparison.
399 |     const size_t capacity = (maxKP > 0) ? (size_t)maxKP : 0;
400 |     const size_t available = (*vectorKP).size();
401 |     *keypoints = (int)((available < capacity) ? available : capacity);
402 |     for (int i=0; i<*keypoints; i+=1) {
403 |         h_K[5*i  ] = (*vectorKP)[i].pt.x;
404 |         h_K[5*i+1] = (*vectorKP)[i].pt.y;
405 |         h_K[5*i+2] = 64.0f; // WIDTH in pixels
406 |         h_K[5*i+3] = 64.0f; // HEIGHT in pixels
407 |         h_K[5*i+4] = 0.0f; // ANGLE in degrees (if openmvg uses radians, let me know)
408 |     }
409 |     for (int i=5*(*keypoints); i<5*maxKP; i++) { // Mark the unused tail of the host buffer with -1.
410 |         h_K[i] = -1.0f;
411 |     }
412 | 
413 |     size_t sizeK = *keypoints * sizeof(float) * 5;
414 |     cudaMemcpyAsync(d_K, h_K, sizeK, cudaMemcpyHostToDevice);
415 | 
416 |     // A grid with zero blocks is an invalid launch configuration, so the kernel is skipped when there is nothing to describe.
417 |     if (*keypoints > 0) {
418 |         dim3 threadsPerBlock(_warpSize, warpsPerBlock);
419 |         dim3 blocksPerGrid(*keypoints, 1, 1);
420 |         latch<<<blocksPerGrid, threadsPerBlock>>>(d_K, d_D, width, height, d_mask);
421 |     }
422 |     cudaEventRecord(latchFinished); // Recorded even when no kernel was launched, so waiters are always released.
423 | }
424 | 


--------------------------------------------------------------------------------
/latch.h:
--------------------------------------------------------------------------------
 1 | // Queues the image upload, keypoint upload and descriptor kernel, then records latchFinished.
 2 | void latch( Mat imgMat,
 3 |             unsigned char * d_I,
 4 |             size_t pitch,
 5 |             float * h_K,
 6 |             unsigned int * d_D,
 7 |             int * keypoints,
 8 |             int maxKP,
 9 |             float * d_K,
10 |             vector<KeyPoint>* vectorKP,
11 |             float* d_mask,
12 |             cudaEvent_t latchFinished);
13 | 
14 | // Uploads the patch-triplet table and binds it to its texture.
15 | void initPatchTriplets(cudaArray* cuArray);
16 | 
17 | // Allocates the pitched device image and binds it to the image texture.
18 | void initImage(    unsigned char** d_I,
19 |                     int width,
20 |                     int height,
21 |                     size_t * pitch
22 |                 );
23 | 
24 | // Repacks the 8x8 host mask in place and uploads it to a new device buffer.
25 | void initMask(      float ** d_mask,
26 |                     float * h_mask);
27 | 


--------------------------------------------------------------------------------
/latchAff.cu:
--------------------------------------------------------------------------------
  1 | // Uncomment below define to use sum of squared differences instead of sum of absolute differences.
  2 | // #define use_SAD // You can keep this commented.
  3 | // Uncomment below define to use an importance mask on each patch comparison.
  4 | // #define use_mask // You can keep this commented.
  5 | 
  6 | #include <vector>
  7 | #include <stdio.h>
  8 | #include <iostream>
  9 | #include <cuda_runtime.h>
 10 | #include "opencv2/opencv.hpp"
 11 | using namespace std;
 12 | using namespace cv;
 13 | 
 14 | #define _warpSize (32) // Threads per warp; also the x extent of the 32x32 launch.
 15 | #define _warpSizef (32.0f) // Same value as a float, used when offsetting sample coordinates.
 16 | #define warpsPerBlock (32) // y extent of the launch: one warp per threadIdx.y value.
 17 | // Region of interest
 18 | #define roiWidth (64) // Columns of the shared-memory region sampled around each keypoint.
 19 | #define roiHeight (64) // Rows of the shared-memory region sampled around each keypoint.
 20 | // Minimal amount that avoids shared memory bank conflicts
 21 | #define roiWidthPadding (4)
 22 | // The numbers loaded into the oracle assume patchSize==8. If you really want canonical LATCH, you can set the mask to ignore those pixels.
 23 | #define patchSize (8)
 24 | #define bitsPerDescriptor (512) // 32 warps x 16 bits each.
 25 | // With further work this should be decreased to 0 for even faster matching.
 26 | #define paddingBitsPerDescriptor (1536) // Together with the 512 payload bits, each descriptor occupies 2048 bits = 64 words of g_D.
 27 | #define bitsPerUInt32 (32)
 28 | #define deg2rad (0.0174533f) // Approximately pi/180.
 29 | #define negDeg2rad (-0.0174533f) // Negated form; the kernel multiplies the keypoint angle (degrees) by it before __sincosf.
 30 | #define CHECK_BORDER (0) // Non-zero enables the in-kernel rejection of keypoints closer than _warpSize pixels to an image edge.
 31 | 
 32 | // Used to store the oracle of patch triplets.
 33 | texture<int, cudaTextureType2D, cudaReadModeElementType> patchTriplets;
 34 | texture<unsigned char, 2, cudaReadModeNormalizedFloat> image; // 8-bit image fetched as normalized floats; the kernel rescales each fetch by 256.0f.
 35 | 
 36 | __constant__ int triplets[3*512]; // NOTE(review): only referenced from commented-out code in the part of this file reviewed here.
 37 | 
 38 | #define cudaCalloc(A, B) /* cudaMalloc (B) bytes into *(A), then zero them; the memset is skipped when the allocation fails. A and B are evaluated twice. */ \
 39 |     do { \
 40 |         cudaError_t cudaCalloc_err = cudaMalloc((A), (B)); \
 41 |         if (cudaCalloc_err == cudaSuccess) cudaMemset(*(A), 0, (B)); \
 42 |     } while (0)
 43 | 
 44 | // checkError(call): report and (by default) exit when a CUDA runtime call does not return cudaSuccess.
 45 | #define checkError(ans) do { gpuAssert((ans), __FILE__, __LINE__); } while (0) /* single statement, so it is safe inside if/else */
 46 | inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=true) {
 47 |    if (code == cudaSuccess) return;   // nothing to report
 48 |    fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
 49 |    if (abort) exit(code);             // the process exit status is the numeric CUDA error code
 50 | }
 51 | 
 52 | // checkLaunchError(): use right after a kernel launch. Exits with a message if the launch itself failed
 53 | // or if the device reports an error once all outstanding work has finished (this blocks the host).
 54 | #define checkLaunchError()                                          \
 55 | do {                                                                  \
 56 |     /* Check synchronous errors, i.e. pre-launch */                   \
 57 |     cudaError_t err = cudaGetLastError();                             \
 58 |     if (cudaSuccess != err) {                                         \
 59 |         fprintf (stderr, "Cuda error in file '%s' in line %i : %s.\n", __FILE__, __LINE__, cudaGetErrorString(err) ); \
 60 |         exit(EXIT_FAILURE);                                           \
 61 |     }                                                                 \
 62 |     /* Check asynchronous errors, i.e. kernel failed (ULF). cudaDeviceSynchronize replaces the deprecated cudaThreadSynchronize. */ \
 63 |     err = cudaDeviceSynchronize();                                    \
 64 |     if (cudaSuccess != err) {                                         \
 65 |         fprintf (stderr, "Cuda error in file '%s' in line %i : %s.\n", __FILE__, __LINE__, cudaGetErrorString( err) ); \
 66 |         exit(EXIT_FAILURE);                                           \
 67 |     }                                                                 \
 68 | } while (0)
 69 | 
 70 | // LATCH descriptor kernel, affine-sampling variant. Launch as 32x32 threads, one block per keypoint.
 71 | //   g_K       5 floats per keypoint: x, y, major axis, minor axis, rotation angle in degrees.
 72 | //             Both axes are multiplied by 1/64 to obtain the sampling step per region-of-interest pixel.
 73 | //   g_D       descriptor output; block b writes 16 words (512 bits) starting at word 64*b.
 74 | //   imgWidth, imgHeight   upper clamp for the sampling coordinates (imgWidth-1, imgHeight-1).
 75 | //   g_mask    64 weights that scale the squared differences (lane t uses entries t and t+32).
 76 | //   g_out     debug output: block 0 stores its sampled 64x64 region of interest here, row stride 64.
 77 | // Outline:
 78 | //   1. Warp 0 rounds the keypoint position; warp 1 builds the four entries of the 2x2 sampling matrix.
 79 | //   2. Every thread samples four pixels of the 64x64 region from the `image` texture into shared memory.
 80 | //   3. Each warp evaluates 16 patch triplets fetched from the `patchTriplets` texture: one bit per
 81 | //      triplet, set when the masked squared difference tavek-vs-aleph is smaller than tavek-vs-bet.
 82 | //   4. Warp 0 merges the 32 sixteen-bit results into 16 words and writes them to g_D.
 83 | // Both textures must be bound before launch. All threads of a block take the same early-return path,
 84 | // so every __syncthreads() below is reached by the whole block or by none of it.
 85 | __global__ void __launch_bounds__(1024, 2)
 86 |                 latch(  const float *g_K,
 87 |                         unsigned int *g_D,
 88 |                         const int imgWidth,
 89 |                         const int imgHeight,
 90 |                         const float *g_mask/*,
 91 |                         const float *g_oriented*/,
 92 |                     float *g_out) {
 93 |     volatile __shared__ int s_kpOffset[2];
 94 |     volatile __shared__ float s_mask[64];
 95 |     volatile __shared__ float s_stride[4];
 96 |     volatile __shared__ float s_roi[roiHeight][roiWidth + roiWidthPadding][1]; // It is faster to operate on floats, even if our image is unsigned char! (we can't guarantee media instructions are available)
 97 |     volatile __shared__ unsigned int s_out[warpsPerBlock];
 98 |     {
 99 |         register float mask0, mask1;
100 |         if (threadIdx.y == 0 && threadIdx.x < 2) { // 2 threads, 2 coordinates
101 |             register float k;
102 |             k = g_K[blockIdx.x*5 + threadIdx.x];
103 |             if (CHECK_BORDER && (k < _warpSize || (threadIdx.x == 0 && imgWidth-_warpSize < k) || (threadIdx.x == 1 && imgHeight-_warpSize < k))) {
104 |                 k = -999999; // If too near boundary, make sure the kpOffset will be negative, so everyone gets the signal to bail.
105 |             }
106 |             s_kpOffset[threadIdx.x] = k + 0.5f;
107 |         }
108 |         if (threadIdx.y == 1 && threadIdx.x < 4) {
109 |             register float l; // Just a temp variable... thread 0 has major axis, 1 has minor axis, 2 has angle.
110 |             if (threadIdx.x < 3) {
111 |                 l = g_K[blockIdx.x*5 + 2 + threadIdx.x];
112 |             }
113 |             register float c, s, t; // cos and sin and theta
114 |             t = __shfl(l, 2);
115 |             __sincosf(negDeg2rad*t, &s, &c);
116 |             register float a,b;
117 |             a = __shfl(l, 0) * 0.015625f;
118 |             b = __shfl(l, 1) * 0.015625f;
119 | 
120 |             const register float p = ((threadIdx.x & 1) == 0) ? a : b;
121 |             const register float q = (threadIdx.x == 1 || threadIdx.x == 2) ? s : c;
122 | 
123 |             s_stride[threadIdx.x] = p*q; // s_stride = { a*c, b*s, a*s, b*c }
124 |         }
125 |         if (threadIdx.y == 2) {
126 |             mask0 = g_mask[threadIdx.x];
127 |         }
128 |         if (threadIdx.y == 3) {
129 |             mask1 = g_mask[_warpSize + threadIdx.x];
130 |         }
131 |         __threadfence_block();
132 |         __syncthreads();
133 |         const register int x = s_kpOffset[0];
134 |         const register int y = s_kpOffset[1];
135 |         if (x < 0 || y < 0) {
136 |             return; // This is the case if our keypoint is within boundary of the edge of the image. 5000 blocks returning in this way takes ~300 microseconds.
137 |         }
138 |         const register float r = (threadIdx.x < 4) ? s_stride[threadIdx.x] : 0.0f;
139 | 
140 |         const register float r11 =  __shfl(r, 0);
141 |         const register float r12 =  __shfl(r, 1);
142 |         const register float r21 = -__shfl(r, 2);
143 |         const register float r22 =  __shfl(r, 3);
144 | 
145 |         // 64 by 64 region of interest means four 32 by 32 loads.
146 |         if (threadIdx.y == 2) {
147 |             s_mask[threadIdx.x] = mask0;
148 |         }
149 |         if (threadIdx.y == 3) {
150 |             s_mask[threadIdx.x + _warpSize] = mask1;
151 |         }
152 | 
153 |         // nx, ny lie in [-32, -1] and address the left/upper half of the region; threadIdx.x/.y address the right/lower half.
154 |         const register float nx = threadIdx.x - _warpSizef;
155 |         const register float ny = threadIdx.y - _warpSizef;
156 | 
157 | 
158 |         s_roi[threadIdx.y            ][threadIdx.x            ][0] = /*(unsigned char)*/ ( 256.0f * tex2D(image, min((float)imgWidth-1.0f, r11*nx          + r12*ny          + x), min((float)imgHeight-1.0f, r21*nx          + r22*ny           + y)));
159 |         s_roi[threadIdx.y            ][threadIdx.x + _warpSize][0] = /*(unsigned char)*/ ( 256.0f * tex2D(image, min((float)imgWidth-1.0f, r11*threadIdx.x + r12*ny          + x), min((float)imgHeight-1.0f, r21*threadIdx.x + r22*ny           + y)));
160 |         s_roi[threadIdx.y + _warpSize][threadIdx.x            ][0] = /*(unsigned char)*/ ( 256.0f * tex2D(image, min((float)imgWidth-1.0f, r11*nx          + r12*threadIdx.y + x), min((float)imgHeight-1.0f, r21*nx          + r22*threadIdx.y  + y)));
161 |         s_roi[threadIdx.y + _warpSize][threadIdx.x + _warpSize][0] = /*(unsigned char)*/ ( 256.0f * tex2D(image, min((float)imgWidth-1.0f, r11*threadIdx.x + r12*threadIdx.y + x), min((float)imgHeight-1.0f, r21*threadIdx.x + r22*threadIdx.y  + y)));
162 |     }
163 |     register unsigned int out = 0;
164 |     const register int wrappedX =      threadIdx.x % patchSize; // Offset for patch, interlaced to decrease padding needed for shared memory bank conflict avoidance
165 |     const register int wrappedY = 2 * (threadIdx.x / patchSize); // Each thread will use both wrappedY and wrappedY+1
166 |     __syncthreads();
167 |     __threadfence_block();
168 |     if (blockIdx.x == 0) { // Debug export: block 0 copies its sampled region to g_out.
169 |         g_out[(threadIdx.y            )*64 + (threadIdx.x            )] = s_roi[threadIdx.y            ][threadIdx.x            ][0];
170 |         g_out[(threadIdx.y            )*64 + (threadIdx.x + _warpSize)] = s_roi[threadIdx.y            ][threadIdx.x + _warpSize][0];
171 |         g_out[(threadIdx.y + _warpSize)*64 + (threadIdx.x + _warpSize)] = s_roi[threadIdx.y + _warpSize][threadIdx.x + _warpSize][0];
172 |         g_out[(threadIdx.y + _warpSize)*64 + (threadIdx.x            )] = s_roi[threadIdx.y + _warpSize][threadIdx.x            ][0];
173 |     }
174 |     __syncthreads();
175 |     __threadfence_block();
176 | 
177 | 
178 |     const register float mask0 = s_mask[threadIdx.x];
179 |     const register float mask1 = s_mask[threadIdx.x + _warpSize];
180 | 
181 |     register int nextCoord = tex2D(patchTriplets, threadIdx.x, threadIdx.y); // This access is hardware cached.
182 |     // register int offset = threadIdx.y * 3 * 16;
183 |     // register int nextAleph = triplets[offset  ];
184 |     // register int nextTavek = triplets[offset+1];
185 |     // register int nextBet   = triplets[offset+2];
186 |     #pragma unroll
187 |     for (register int i=0; i<16; ) {
188 |         const register int coord = nextCoord;
189 |         if (i!=16-4) nextCoord = tex2D(patchTriplets, 6*(i+4) + threadIdx.x, threadIdx.y); // This access is hardware cached.
190 |         #pragma unroll
191 |         for (register int j=0; j<4; j++, i++) {
192 |             // offset += 3;
193 |             // const register int alephIndexX = nextAleph & 255;
194 |             // const register int alephIndexY = nextAleph >> 8;
195 |             // nextAleph = triplets[offset];
196 |             // const register int tavekIndexX = nextTavek & 255;
197 |             // const register int tavekIndexY = nextTavek >> 8;
198 |             // nextTavek = triplets[offset+1];
199 |             // const register int betIndexX = nextBet & 255;
200 |             // const register int betIndexY = nextBet >> 8;
201 |             // nextBet = triplets[offset+2];
202 | 
203 |             const register int alephIndexX = __shfl(coord, 6*j  );
204 |             const register int alephIndexY = __shfl(coord, 6*j+1);
205 |             const register int tavekIndexX = __shfl(coord, 6*j+2);
206 |             const register int tavekIndexY = __shfl(coord, 6*j+3);
207 |             const register int betIndexX = __shfl(coord, 6*j+4);
208 |             const register int betIndexY = __shfl(coord, 6*j+5);
209 |             const register int bitIndex = 16*(threadIdx.y & 1) + i;
210 |             const register int outThread = 0;
211 | 
212 |             // This assumes an 8x8 patch. As there are only 32 threads per warp, each thread will pull two values from each thread.
213 |             // The access pattern is interleaved to decrease the amount of shared memory padding necessary to avoid bank conflicts:
214 |             //      each thread pulls a vertical pair from each patch.
215 |             const register int tavek0 = s_roi[tavekIndexY + wrappedY  ][tavekIndexX + wrappedX][0]; // Tavek means "between".
216 |             const register int tavek1 = s_roi[tavekIndexY + wrappedY+1][tavekIndexX + wrappedX][0]; // It is our root patch.
217 |             const register int aleph0 = s_roi[alephIndexY + wrappedY  ][alephIndexX + wrappedX][0]; // Aleph is "A"
218 |             const register int aleph1 = s_roi[alephIndexY + wrappedY+1][alephIndexX + wrappedX][0]; // Similarity to aleph is denoted by a bit set to 0
219 |             const register int bet0   = s_roi[betIndexY   + wrappedY  ][betIndexX   + wrappedX][0]; // Bet is "B"
220 |             const register int bet1   = s_roi[betIndexY   + wrappedY+1][betIndexX   + wrappedX][0]; // Similarity to bet is denoted by a bit set to 1
221 | 
222 |             // Now we compute the sum of squared differences between both patch pairs.
223 |             // First, differences:
224 |             register int alephDiff0 = (tavek0 - aleph0);
225 |             register int alephDiff1 = (tavek1 - aleph1);
226 |             register int betDiff0   = (tavek0 - bet0);
227 |             register int betDiff1   = (tavek1 - bet1);
228 |             // Then, squared differences
229 |             alephDiff0 *= alephDiff0;
230 |             alephDiff1 *= alephDiff1;
231 |             betDiff0 *= betDiff0;
232 |             betDiff1 *= betDiff1;
233 | 
234 |             alephDiff0 *= mask0;
235 |             alephDiff1 *= mask1;
236 |             betDiff0 *= mask0;
237 |             betDiff1 *= mask1;
238 | 
239 |             alephDiff0 += alephDiff1; // Merge both interleaved squared differences, to make upcoming warp reduction faster
240 |             betDiff0   += betDiff1;
241 | 
242 |             alephDiff0 -= betDiff0; // Easiest to just take this difference now, then reduce, then compare to 0. Same as reduce then compare relative to each other.
243 |             alephDiff0 += __shfl_xor(alephDiff0,  1);
244 |             alephDiff0 += __shfl_xor(alephDiff0,  2);
245 |             alephDiff0 += __shfl_xor(alephDiff0,  4);
246 |             alephDiff0 += __shfl_xor(alephDiff0,  8);
247 |             alephDiff0 += __shfl_xor(alephDiff0, 16); // By xor shfling, every thread has the resulting sum.
248 | 
249 |             // One thread sets the bit when the reduced (aleph - bet) sum is negative, i.e. tavek's squared difference to aleph is the smaller one. NOTE(review): the aleph/bet comments above state the opposite polarity -- confirm which is intended.
250 |             if (alephDiff0 < 0 && threadIdx.x == outThread) {
251 |                 out |= (1<<bitIndex);
252 |             }
253 |         }
254 |     }
255 | 
256 |     if (threadIdx.x == 0) { // In this case, only thread 0 ever has important data.
257 |         s_out[threadIdx.y] = out;
258 |     }
259 |     __syncthreads();
260 |     __threadfence_block();
261 |     if (threadIdx.y == 0) {
262 |         out = s_out[threadIdx.x]; // Warp 0 now has all the data we need to output.
263 |         // No barrier here: s_out was published by the __syncthreads() above, and a __syncthreads() inside
264 |         // this threadIdx.y == 0 branch would be reached by warp 0 only, which CUDA does not permit.
265 | 
266 |         out |= __shfl_down(out,  1, _warpSize); // Each warp computed half a 32 bit word. Merge them before output.
267 |         out = __shfl(out, 2*threadIdx.x); // Only even threads have useful data after above shfl_down.
268 | 
269 |         if (threadIdx.x < bitsPerDescriptor / bitsPerUInt32) { // 512 / 32 = 16
270 |             g_D[((paddingBitsPerDescriptor + bitsPerDescriptor) / bitsPerUInt32)*blockIdx.x + threadIdx.x] = out; // And that's it, with this write to global memory we're done with this descriptor.
271 |         }
272 |     }
273 | }
274 | 
275 | 
276 | void initPatchTriplets(cudaArray* cuArray) {
277 |     const int h_data[]= { 41, 22, 47, 47, 51, 24, 32, 44, 52, 17, 32,  7, 50, 14, 26,  8, 51, 33, 45, 18, 30, 38, 42, 10,  6, 30, 16, 40,  6, 49, 39, 34, 35, 43, 31, 17, 21, 44, 18, 14, 25, 37, 23, 29, 12, 44, 19,  7,  9, 30, 26, 19,  6, 52, 47, 40, 27,  9, 43, 19, 35, 26, 50,  5, 41, 48, 25, 37, 11, 27, 23,  9, 25, 14, 33,  7, 38, 47, 40, 19, 52, 48, 48,  8, 23, 46, 47, 39, 22, 12, 50, 35, 29, 20, 18, 34, 47, 24, 31, 36, 26, 47, 11, 38, 17, 16,  7, 11, 52, 15, 46, 14, 42,  9,  4, 13, 43, 14,  5, 17, 22, 50, 27, 17, 34, 14, 44, 46, 38,  5, 48, 32, 51, 36, 32, 35, 45,  9, 26,  7, 17, 10, 25, 35,  5, 38, 17, 33, 12, 47,  4, 32, 43, 12,  9, 23,  9, 24, 27, 33,  8, 30, 48, 40, 39,  4, 37, 50, 37, 41, 22,  5, 38, 43,  6, 20, 24, 23, 13, 48, 22, 15, 29, 44, 22, 51, 10, 25, 20, 13, 10, 33, 42, 16, 37, 41, 47, 40,  6, 44, 27, 47, 44, 16, 29, 36, 27, 24, 25, 35, 31, 43, 51,  5, 33, 19, 30, 21, 42, 15, 34, 48, 10, 39, 44, 18, 16, 32, 13, 30, 19, 49,  7, 48, 25, 33,  6, 51, 21,  6, 11, 41, 52, 14,  4,  4, 52, 43, 31,  6, 12, 35, 14,  8, 29, 21, 16, 26, 47, 45, 28, 46, 16, 21, 16, 38, 36, 33,  7, 10, 13, 37, 41, 31, 10, 11, 28, 33, 39,  6, 36, 46, 49, 30,  6, 11, 43, 31,  6, 13, 46, 51,  5, 49,  4, 44, 38, 25, 20, 27,  9, 47, 50, 51, 14, 26, 48, 43, 26,  9, 47, 13, 18, 40, 28, 19, 19, 12, 41,  6, 44, 44, 28, 14, 20, 15, 27, 48, 23,  6, 21,  5, 24, 38, 27, 48, 44, 27, 15, 12, 52, 10, 10, 40, 36, 47,  4, 14, 13, 52, 34, 30,  7,  6, 48, 26, 36, 28, 45, 18,  9, 49, 21, 48, 14, 31, 47, 45, 28, 12, 10,  9, 11,  9, 40,  5, 16, 20, 37, 38, 37,  5, 49,  4, 37, 47, 13, 10, 35,  9, 33, 31, 25, 12, 32, 26, 43, 18,  4, 44, 52, 39, 45, 44, 19, 29, 46, 13, 39, 23, 28, 52,  8, 16, 14,  9, 52, 12, 19, 22, 50, 14, 30,  6, 44, 39, 51, 27, 32, 18, 48, 50, 18, 19, 45, 41, 15, 15, 13, 41, 39, 37, 15, 37, 50, 43, 30, 46, 16, 18, 31, 51, 46, 43, 48,  4, 35, 22, 44, 39, 36, 29, 41, 44, 52,  8, 37, 24, 20, 25, 45, 52,  9, 45, 39, 34, 23, 50, 42, 18, 23, 17, 13, 18,  6, 37, 35, 46, 16, 36, 41,  4, 
37, 28, 30, 31, 35, 40, 49, 42, 28, 20, 11, 30, 50, 48, 23, 44, 47,  5, 50, 10,  9, 25, 52, 13, 46, 28, 17, 44, 45, 39, 50, 43, 17, 35, 48, 19, 12, 38, 30, 29,  9, 48,  9, 24, 30, 25,  4, 45, 25, 49, 50, 16, 27, 31, 25,  8, 21, 51, 27, 19, 17, 31,  8, 23, 19, 20, 47, 11, 49, 49, 49, 15, 18, 34, 30, 26, 11,  7, 47, 52, 48, 34, 52, 17, 38,  5, 27, 19, 36, 23, 50,  8, 25, 52, 47, 33,  4, 34, 28, 15,  5, 13, 38, 48,  6, 24, 37,  8,  4, 38, 33, 13,  4,  8, 50, 34, 36, 21, 39, 50, 10, 35, 19, 47, 16, 23, 19, 49,  8, 11, 11, 50,  5, 34,  6, 16, 11, 35, 10, 31, 29, 52,  4, 48, 18, 19, 30, 43, 46, 46, 44, 15, 10, 39, 37, 22, 52, 52,  4, 50, 40, 16, 48, 35,  7, 43, 50, 23, 19, 21, 51, 15, 11,  8, 19, 22, 51, 28,  6, 41, 13, 10, 29,  6, 11, 38, 28, 32, 32, 20, 46, 20, 21, 22,  8, 46,  8, 25,  8, 14, 32, 19, 11, 20, 10, 21, 31, 36, 12, 33, 35, 16, 38, 47, 48, 49,  6, 52, 32, 36,  6, 30,  9, 10, 10, 50, 26, 41, 38, 37, 13, 43, 49, 44, 44, 39,  4, 26, 52, 49, 21, 16, 29, 42, 37, 45, 48, 45, 35, 35, 33,  4, 15, 20, 49, 46, 13, 39,  6, 36, 40, 20, 10, 51, 42, 38, 34,  4, 45, 18, 36, 41, 49, 45, 52, 25,  7,  4, 46, 39, 20, 33, 18,  5, 26, 51, 15, 33, 39, 35, 27,  7, 18, 24, 49,  6, 13, 34, 34, 24, 44, 21, 21,  5, 47, 34, 27, 49, 51, 14, 26, 11, 50, 15,  6, 32, 42, 31, 18, 31, 42, 17,  6, 36, 39, 41,  4, 38, 52, 49, 40, 30, 41, 12, 43, 29, 27, 24, 48,  6, 22,  9, 14,  8, 30, 17,  8, 52,  5, 18, 40, 29,  4, 30,  4,  5, 12, 41, 27, 17, 20, 34, 47, 15,  5, 51, 10,  4, 51, 12,  7, 44, 16, 47, 18, 34, 22, 12, 28, 13, 15, 52, 26, 37, 47, 24, 28, 49, 49, 44, 18,  4,  4,  8, 15, 23, 52, 35, 15, 35, 46, 47, 28, 50,  7, 48, 28, 46, 51, 38, 15, 14, 44, 38, 18, 16, 36, 38, 15, 52,  6, 22, 11, 42, 22, 39, 45, 45, 21, 45, 45, 16, 50, 27, 26, 25,  4, 50, 40, 28, 29, 17, 40, 12,  8, 22, 17, 45, 23,  9, 46, 35, 20, 31, 51, 17, 52, 21, 10, 52, 48, 27, 18, 32, 24,  6, 14, 20, 43, 20, 12, 48, 45, 51, 40, 43, 43,  9, 33, 32, 12, 49, 31, 25, 11, 13, 10, 42,  8,  6, 10, 40, 49, 41, 10, 28, 40, 16,  8, 51, 
43, 18, 14, 12,  4, 44, 40, 23, 12, 41, 17, 15, 24, 19, 26, 10, 31, 16,  4, 28, 26, 25, 14, 14, 50, 37,  7, 45, 46, 38, 30, 51, 43, 34, 20, 10, 43, 51, 17, 51,  4, 41, 32, 44,  4, 15, 37, 28, 49,  5, 34,  4,  6, 41, 49, 47,  7, 18,  7, 47, 35, 26, 21, 29, 30,  7, 36, 48, 39, 16, 47,  9, 26, 52, 45, 29, 25, 21, 31, 45, 24, 15,  5, 23, 13, 14, 21, 39, 13,  5, 52, 50, 11, 46, 33, 21, 39,  6, 46, 23, 48, 17,  8, 28, 39, 32, 46, 46, 19, 35, 47, 45, 29, 11, 52,  4, 32, 31,  9,  5, 37, 51, 18, 37, 35, 26, 15, 33, 44, 23, 36, 15, 19,  5, 40, 41, 34,  7, 27, 28, 24, 46, 37, 11,  4,  6, 37, 45,  9, 30, 48, 14,  6, 51, 50, 39, 19, 14, 36, 24, 40,  6, 26, 41, 36, 49, 37, 20, 42, 46, 33, 19, 44, 15, 21, 21, 49, 16, 41, 16, 18, 39, 35, 39, 31, 36, 33, 22, 30, 42, 52,  6, 36, 51, 21, 18, 50, 39, 34, 48, 22, 19, 38, 23, 26, 27, 40, 43, 14, 42,  5, 34, 15, 25, 19, 30, 50, 27,  4, 18, 11, 50, 34, 19, 16, 15, 16, 29, 24, 37, 14, 26, 41, 30, 51, 26, 40, 33, 44, 14, 24,  6, 46, 45, 15, 20, 35, 23,  7, 11, 31, 27, 25, 26, 18, 47, 46, 34, 14, 52, 48, 38, 21,  8,  5, 38, 24, 20,  8, 19, 18, 44, 14,  7, 37, 45, 16, 49, 44, 52, 47,  6, 17, 16, 52,  8, 33, 13, 42, 40, 31, 15, 22, 48, 32, 50, 31,  8,  5, 16, 35, 40,  5, 44, 50, 31, 24, 46, 50, 36, 29, 29, 44, 37, 23, 46, 49, 21, 23, 13, 52, 22, 15, 42, 16, 51, 11, 46, 40, 46, 27, 28, 48, 35, 42, 21, 13, 11, 46, 26, 10, 16, 17, 42, 13,  7, 26,  7, 32, 12, 38, 26, 50,  5,  9, 11, 47, 44, 36, 50, 35,  6, 50,  6, 36, 11, 11, 28, 40, 50, 41, 36, 22, 47, 47, 22, 47, 24,  5, 18, 51, 15, 12,  9, 22, 46,  9, 51, 33, 10, 24, 16, 10,  4, 18, 37, 34, 32, 12,  4, 21, 13, 28, 31, 27, 52, 23, 40, 38,  4, 52, 50, 15, 37, 29, 46, 43, 35, 10, 25, 17,  6, 10, 23, 19, 35, 42, 52, 22, 27, 27,  4, 50, 47, 27, 41,  9, 31, 13, 12, 16, 29, 28, 40, 49, 49, 41,  6,  9, 19, 42, 40,  5, 45, 41, 17, 50, 31, 52, 14,  9, 33, 27, 48, 46, 43, 47,  9, 12, 52, 51, 35,  8, 15, 50, 49,  5, 25,  8, 47, 44, 26,  8,  9, 46, 46, 16, 12, 42, 23, 49, 12,  5, 23, 47, 36, 16, 40,  8, 23, 
21, 22, 18,  4, 25, 46, 17, 23, 36, 42, 30, 25, 37, 34, 52, 30, 26, 22, 52, 16, 35, 20, 28, 36, 25, 49, 50, 29, 45, 16, 42,  5, 47, 10, 27, 51,  7, 18, 22, 27, 42,  6, 19, 19, 48,  4, 12, 17, 49, 47,  4, 37, 20, 45,  9, 21, 16, 25, 47,  4, 13, 28, 27,  7, 19, 50,  7, 24, 51, 32, 25, 39, 37, 32, 18, 38, 38, 32, 20, 35, 33, 13, 49,  5, 37, 16, 11,  7, 26, 13, 11, 13, 49, 40, 37, 51, 29, 19, 49, 48, 47, 22, 33, 27, 12,  7, 47, 25, 16, 43, 42, 31, 26, 30,  8, 11, 25, 12, 13, 15,  7, 39, 10, 49, 23, 11, 33, 39, 51, 35, 19, 45, 48, 22, 39, 14,  7, 51, 47,  7, 19, 22, 51,  4, 12, 35,  6, 49, 35, 40,  9, 16, 25, 47, 51, 38, 25, 10, 26, 50, 20, 12, 23, 51, 42, 49, 50,  9, 34, 19, 22,  4, 16, 15,  5, 37, 32,  7, 42, 24, 51, 46, 37, 35, 22, 34, 50, 28, 42, 15, 36, 52, 44, 14, 30,  8, 32, 15, 39, 17, 42, 16, 51, 35, 38, 49, 42, 24, 42, 50, 41, 14, 39, 16, 49, 47, 48, 20, 31,  8, 51, 15, 51, 51, 32, 46, 26, 38, 17, 48, 29, 49, 34, 43, 42, 25, 44, 52, 36, 17, 46, 51, 49, 25, 43,  5, 33, 23, 35, 37, 16, 24, 42, 46, 29,  4, 39, 19, 27, 38, 12, 18, 21, 50, 14, 33,  6, 46, 13,  4, 27, 36, 11, 44, 32, 28, 11, 52, 19, 20, 45, 25, 15, 49, 52, 38, 40, 40, 31, 13, 49, 34, 27, 23, 47, 47,  7, 13, 40, 42,  4, 13,  4, 38, 33, 17, 12, 22, 44, 20, 23,  8, 43, 21, 24, 48, 23, 40, 19, 48, 10, 40, 38, 14, 14, 52, 45, 16, 27, 41, 46, 47, 15, 50, 30, 37, 14, 47, 41, 16, 33, 46, 32,  4,  5, 23, 27, 17, 47, 41, 42, 17,  7, 20, 50,  6,  4, 49, 20,  7, 33, 42, 39, 24, 19, 18, 44, 30, 47, 16, 20, 42, 50,  5,  6, 15, 29, 24, 11, 32,  7, 38, 33, 31,  9, 46, 25, 10, 41, 43, 47,  5, 26, 40, 51,  9, 27, 18, 43, 21, 28,  8, 35, 28, 11, 27, 23, 43, 12,  8, 39,  7, 30, 13, 32, 30, 25, 33, 32, 26, 25, 14, 41, 50, 13, 47, 37, 11, 24, 46, 49, 35, 26, 33, 43, 50, 35,  5, 47, 42, 39, 42, 52,  5, 39, 34, 45, 49, 20, 15, 43, 39, 16,  5, 38, 36, 20, 17, 40, 23, 12,  9, 46, 22,  8,  4, 27,  6,  4, 19, 11, 16, 37, 47, 12, 52, 42, 19, 22, 35, 48,  5, 35, 47, 52, 28, 37, 51,  5, 50, 39, 35,  4, 50,  7, 28, 20, 42,  8, 51, 
42, 20, 12, 13, 46, 39, 30, 22, 52, 35, 34, 52, 14, 52, 24, 31,  7, 30, 51, 38, 52,  4, 38, 38, 39, 33, 26, 43, 40, 35, 52, 39, 23, 34, 49, 40, 40, 50, 27, 41, 13, 10, 42,  5, 48, 29, 47, 51,  9,  6, 32, 26,  9, 48, 36, 30, 19, 38, 51, 49, 17, 17, 27, 43, 37, 51, 48, 29, 37, 37, 41, 49, 37,  6, 23, 12, 33, 17, 11,  5, 21, 37,  4, 51, 35, 19, 51, 30, 48, 12, 43, 10,  6, 46, 44, 42, 15, 10,  5, 20,  6, 41, 40, 19, 40, 48, 42, 16, 10, 23, 13, 31,  9, 36, 12, 15, 38, 38, 13, 45, 19, 33,  5, 44, 19, 31, 12,  9, 42, 49,  9,  6, 27, 33, 51, 41, 29,  4, 18, 47, 27,  5,  9,  5, 32,  9, 36, 32, 35, 46, 45, 40, 29, 35, 22, 46, 39,  4, 36, 46, 44, 14,  6, 39, 17, 26,  8, 42, 23, 37, 37,  5, 44, 52, 16, 20, 42, 22, 17, 33, 51, 22, 12, 23, 49, 13, 49,  6,  4, 26, 41, 20, 45, 47, 52, 24, 38, 34, 14,  7, 20, 41, 23, 27,  7, 16, 51,  4,  7, 11, 40, 39, 49, 43, 41, 51, 19, 44,  5, 26, 22, 30, 47, 32, 46,  4, 51, 34, 36,  5, 43, 26, 35, 48, 52, 38, 36, 52, 32, 25,  5, 33, 47, 25,  5, 51,  9,  8, 31, 43, 16, 34, 18, 51, 28, 31, 46,  6, 40, 36,  4, 47, 50, 30, 40, 28, 24,  4, 49, 44, 19, 25, 42, 42, 14, 32, 46, 39, 19, 14, 49,  5, 39, 50, 29, 32, 45, 25, 41,  6, 11, 51, 39, 43, 39, 14, 31, 37, 24, 16, 22, 44, 30, 33, 48, 34, 38, 27, 35, 49, 40, 35,  7, 40, 14,  7,  5, 41, 44, 52, 28, 18, 14, 12, 16, 22, 51, 36, 18, 37, 42, 10, 30, 52, 19, 23, 44, 45, 28, 27, 38, 49, 35, 28, 16, 13, 41, 17, 42,  8,  6, 15, 28, 29,  7, 13, 34,  5, 12,  8, 19, 52, 30, 11, 23, 32,  7, 46, 46,  6,  7, 22, 36, 25, 33, 45, 46, 38, 31, 28, 39, 50, 24, 16,  4, 38, 46, 48,  7,  4, 20,  9, 34,  4, 45, 35, 29, 36, 47, 36, 41,  5,  7,  4, 49, 30,  7, 13, 48, 45, 49, 25, 49, 46, 10, 18, 45, 10, 10, 38, 33, 22, 47, 38, 39, 50, 34, 52, 36, 41, 31, 20, 25, 16, 15, 32,  7, 51, 18, 33, 26,  6, 33, 19, 48, 11,  4, 44, 33, 25, 30, 33, 34,  4, 33, 49, 13, 50, 29, 35, 12, 28,  9,  7, 21, 38, 28,  5, 13, 22, 26, 10,  8, 20, 12, 47, 29, 43, 46, 32, 33, 32,  7, 14, 32, 30, 30, 47, 28, 20, 33, 35, 12, 10, 50, 30, 10, 50,  5, 30, 
43,  7,  9, 18, 13, 40, 20, 14,  8, 17, 17, 31, 29, 48,  4, 48, 30, 31, 27, 52, 45, 47,  6, 30, 37,  5,  8, 25, 17, 17, 39,  8, 15,  5, 33, 27, 44, 21, 31, 37, 51, 26, 42, 51, 41, 26, 48, 16, 40, 46, 50, 29, 44,  9, 39, 36, 35, 51, 37, 37, 30,  8, 43,  5, 38, 28, 18, 51, 37, 32,  4, 10, 31, 43, 12, 21, 47, 11, 11, 29, 51, 39, 50, 21, 28, 52, 28, 25,  4,  6, 43, 37, 20, 24, 48, 14, 20, 14, 47, 37, 52, 26, 20, 24, 52, 42, 45, 27, 38,  5, 29, 43, 37, 27, 28,  4, 41, 16, 23, 38, 10, 22,  5, 39,  8, 49, 46, 32, 19, 21, 52, 43, 35, 25, 26, 39, 18,  4, 39, 30, 18, 41, 45, 11, 14, 10, 49, 14, 19, 11, 24, 19, 18, 26, 50,  7, 36, 17, 29, 51, 25, 13,  7,  8, 14, 47, 31, 18, 17, 50, 31,  7,  5, 13, 28, 37,  9, 40,  4, 25, 23, 50,  5, 43, 44, 19,  9, 10, 39, 27, 10, 34, 28,  4, 46,  5, 43, 17,  4, 32, 44, 29, 38,  7, 51, 29, 30, 18, 46, 26, 29, 33, 21,  5, 52, 12, 17,  6, 52,  9, 47, 40,  5, 30, 40, 28, 45, 37, 40, 16, 36, 39, 12, 12, 47, 28,  9,  7, 43, 48, 48,  4, 37, 31, 52, 29, 34, 49, 46,  9,  6, 49, 26, 14, 47, 50, 28, 11, 46, 16, 26, 38,  7, 20, 19, 21, 10, 13,  9, 27, 21,  7,  5, 13,  5, 23, 41, 49, 10, 29, 40, 34, 43, 16, 38,  8, 44, 15,  8, 22, 42, 41, 19, 26, 39, 14, 15, 23, 43, 24, 41, 45, 50, 25, 47, 11, 26, 39,  5, 50, 40, 44, 40, 24, 46, 37, 28, 37, 39,  8, 39, 29, 40, 17, 50, 19, 52,  5, 14, 15, 25, 33, 32, 40, 42, 40, 14, 31, 43, 45, 17,  4, 46,  5, 23, 31, 46, 37, 37, 48, 37, 31,  7, 18, 36, 27,  4,  5, 41, 32, 25,  9, 47, 29, 46, 10, 30,  7, 38, 41, 18, 11, 28, 40, 36, 47, 49, 36, 30,  5,  9, 36, 33, 24, 16, 46, 42, 16, 47,  9, 42, 33, 37, 49,  7,  7, 20, 29, 27, 42, 41, 34, 44,  4, 43, 42, 23, 49, 14, 20, 26, 39, 14,  7,  5, 47, 22, 22, 38, 18,  5, 26, 44, 44, 41, 14, 31, 13, 41,  5, 13, 15, 45, 40,  9, 47, 23, 46, 16, 38, 24, 12,  6, 13, 19, 10, 18, 44, 21, 23, 41, 10, 10,  5,  5, 50, 25,  4, 42, 48, 40, 44, 49, 17, 47, 47, 40, 10, 25, 11, 37, 14,  9, 17, 42, 15,  7, 51, 36, 22, 10, 40, 45, 29, 24, 27, 32, 47, 16, 21, 49, 31,  4, 49, 41, 36, 45, 51, 30, 43, 
49, 24, 32, 44, 13,  8, 29, 34, 44,  6, 34, 39, 46, 16,  4, 27, 10, 36, 15, 26, 44, 22, 27, 21,  8,  8, 37, 18, 13, 34, 45, 44,  9, 45, 47, 28, 10, 20, 43,  5, 40, 22, 29, 39, 49, 13, 34, 47, 38,  4, 12, 51, 27, 20, 11, 14, 39, 30, 27, 35, 42, 26, 39, 48, 27, 24, 25,  5,  9, 48, 17, 26,  8,  4, 39, 16, 33,  7, 26, 15 };
278 |     cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc(32, 0, 0, 0,  	cudaChannelFormatKindSigned); // one signed 32-bit channel per texel
279 |     cudaMallocArray(&cuArray, &channelDesc, 6*16, 32); // 96 columns (16 triplets x 6 coordinates) by 32 rows (one per warp). NOTE(review): cuArray is a by-value parameter, so the caller's pointer never receives this allocation.
280 |     cudaMemcpyToArrayAsync(cuArray, 0, 0, h_data, 6*16*32*sizeof(int), cudaMemcpyHostToDevice); // NOTE(review): return codes of the CUDA calls in this function are not checked.
281 |     // Texture setup: point sampling with unnormalized (texel index) coordinates and border addressing for out-of-range fetches.
282 |     patchTriplets.addressMode[0] = cudaAddressModeBorder;
283 |     patchTriplets.addressMode[1] = cudaAddressModeBorder;
284 |     patchTriplets.filterMode     = cudaFilterModePoint;
285 |     patchTriplets.normalized     = false;
286 |     // The kernel reads this table through the file-scope `patchTriplets` texture reference.
287 |     cudaBindTextureToArray(patchTriplets, cuArray, channelDesc);
288 |     // Disabled alternative kept below: pack each (x, y) pair into one int and upload to the __constant__ `triplets` array.
289 |     // int h_data_packed[3*512];
290 |     // for (int i=0; i<6*512; i+=2) {
291 |     //     h_data_packed[i>>1] = h_data[i] + (h_data[i+1] << 8);
292 |     // }
293 |     // cudaMemcpyToSymbolAsync(triplets, h_data_packed, sizeof(float)*3*512);
294 | }
295 | 
296 | 
297 | // Allocates a pitched width x height 8-bit device image (*d_I, row pitch in *pitch) and binds it to the `image` texture.
298 | void initImage(unsigned char ** d_I, int width, int height, size_t * pitch) {
299 |     // The row width is given in bytes of pixel data: sizeof(**d_I) is one pixel (1 byte). sizeof(*d_I) would be
300 |     // the size of a pointer and would over-allocate every row (8x on 64-bit hosts).
301 |     cudaMallocPitch((void**)d_I, pitch, width*sizeof(**d_I), height);
302 |     for (int dim = 0; dim < 3; dim++) image.addressMode[dim] = cudaAddressModeClamp; // out-of-range reads clamp to the edge
303 |     image.normalized = false;                 // texture coordinates are in pixels
304 |     image.filterMode = cudaFilterModeLinear;  // fetches interpolate between neighbouring pixels
305 |     size_t tex_ofs;                           // offset reported by the bind call; not used afterwards
306 |     cudaBindTexture2D (&tex_ofs, &image, *d_I, &image.channelDesc, width, height, *pitch);
307 | }
308 | 
309 | // Repacks the 8x8 row-major mask in h_mask IN PLACE so that its even rows (0,2,4,6) come first and its
310 | // odd rows (1,3,5,7) follow, then allocates *d_mask on the device and uploads the 64 repacked floats.
311 | // It is 'run once' code: a second call on the same h_mask would repack the already repacked data.
312 | void initMask(float** d_mask, float* h_mask) {
313 |     float unpacked[64]; // snapshot of the caller's layout, because h_mask is overwritten below
314 |     for (int k = 0; k < 64; k++) {
315 |         unpacked[k] = h_mask[k];
316 |     }
317 |     for (int dstRow = 0; dstRow < 8; dstRow++) {
318 |         // Destination rows 0-3 take source rows 0,2,4,6; destination rows 4-7 take source rows 1,3,5,7.
319 |         const int srcRow = (dstRow < 4) ? 2*dstRow : 2*(dstRow - 4) + 1;
320 |         for (int col = 0; col < 8; col++) {
321 |             h_mask[dstRow*8 + col] = unpacked[srcRow*8 + col];
322 |         }
323 |     }
324 | 
325 |     // Device allocation followed by a blocking host-to-device copy of the repacked mask.
326 |     const size_t maskBytes = 64 * sizeof(float);
327 |     cudaMalloc((void **) d_mask, maskBytes);
328 |     cudaMemcpy(*d_mask, h_mask, maskBytes, cudaMemcpyHostToDevice);
329 | }
330 | 
331 | // Intensity difference between the pixels `offset` elements after and before `center` (two opposite points).
332 | static inline float oppositeDiff(const unsigned char* center, const int offset) {
333 |     return center[offset] - center[-offset];
334 | }
335 | 
336 | // Estimates the gradient direction at pixel (x, y) of a row-major 8-bit image with `width` pixels per row.
337 | // Sixteen pixels on a ring of radius ~3 around (x, y) are taken as eight opposite pairs; each pair's
338 | // difference is projected on the x and y axes and accumulated. Returns atan2f(dy, dx), in radians.
339 | // There are no bounds checks: (x, y) must lie at least 3 pixels away from every image border.
340 | float computeGradient(const unsigned char* img, const int width, const int x, const int y) {
341 |     const unsigned char* const center = img + (x + y*width);
342 |     float gx = 0.0f;
343 |     float gy = 0.0f;
344 |     float d;
345 | 
346 |     d = oppositeDiff(center, 3*width);          // pair through ( 0, +3)
347 |     gy += d;
348 | 
349 |     d = oppositeDiff(center, 3*width + 1);      // pair through (+1, +3)
350 |     gy += d * 3 / sqrt(10);
351 |     gx += d     / sqrt(10);
352 | 
353 |     d = oppositeDiff(center, 2*width + 2);      // pair through (+2, +2)
354 |     gy += d     / sqrt(2);
355 |     gx += d     / sqrt(2);
356 | 
357 |     d = oppositeDiff(center, 1*width + 3);      // pair through (+3, +1)
358 |     gy += d     / sqrt(10);
359 |     gx += d * 3 / sqrt(10);
360 | 
361 |     d = oppositeDiff(center, 3);                // pair through (+3,  0)
362 |     gx += d;
363 | 
364 |     d = oppositeDiff(center, -1*width + 3);     // pair through (+3, -1)
365 |     gy -= d     / sqrt(10);
366 |     gx += d * 3 / sqrt(10);
367 | 
368 |     d = oppositeDiff(center, -2*width + 2);     // pair through (+2, -2)
369 |     gy -= d     / sqrt(2);
370 |     gx += d     / sqrt(2);
371 | 
372 |     d = oppositeDiff(center, -3*width + 1);     // pair through (+1, -3)
373 |     gy -= d * 3 / sqrt(10);
374 |     gx += d     / sqrt(10);
375 | 
376 |     return atan2f(gy, gx);
377 | }
378 | 
379 | void latchAff( Mat imgMat,
380 |             unsigned char* d_I,
381 |             size_t pitch,
382 |             float* h_K,
383 |             unsigned int* d_D,
384 |             int* keypoints,
385 |             int maxKP,
386 |             float* d_K,
387 |             vector<KeyPoint>* vectorKP,
388 |             float* d_mask,
389 |             cudaEvent_t latchFinished,
390 |             Mat outMat,
391 |             RotatedRect rekt) {
392 |     const unsigned char* h_I = imgMat.data;
393 |     const int height = imgMat.rows;
394 |     const int width = imgMat.cols;
395 | 
396 |     // All of these calls are non blocking but serialized.
397 |     // cudaMemsetAsync(d_K, -1, maxKP * sizeof(int) * 4); // Negative one is represented by all '1' bits in both int32 and uchar8.
398 |     // cudaMemsetAsync(d_D,  0, maxKP * (2048 / 32) * sizeof(unsigned int));
399 |     cudaMemcpy2DAsync(d_I, pitch, h_I, width*sizeof(unsigned char), width*sizeof(unsigned char), height, cudaMemcpyHostToDevice);
400 | 
401 |     // Only prep up to maxKP for the GPU (as that is the most we have prepared the GPU to handle)
402 |     *keypoints = ((*vectorKP).size() < maxKP) ? (*vectorKP).size() : maxKP;
403 |     for (int i=0; i<*keypoints; i+=1) {
404 |         h_K[5*i  ] = (*vectorKP)[i].pt.x;
405 |         h_K[5*i+1] = (*vectorKP)[i].pt.y;
406 |         h_K[5*i+2] = 1.0f; // (*vectorKP)[i].size);
407 |         // h_K[4*i+3] = (*vectorKP)[i].angle;
408 |         h_K[5*i+3] = computeGradient(h_I, width, h_K[5*i  ], h_K[5*i+1]);
409 |     }
410 |     for (int i=*keypoints; i<maxKP; i++) {
411 |         h_K[5*i  ] = -1.0f;
412 |         h_K[5*i+1] = -1.0f;
413 |         h_K[5*i+2] = -1.0f;
414 |         h_K[5*i+3] = -1.0f;
415 |     }
416 |     h_K[0] = rekt.center.x;
417 |     h_K[1] = rekt.center.y;
418 |     h_K[2] = (rekt.size.width)/64;
419 |     h_K[3] = (rekt.size.height)/64;
420 |     h_K[4] = -3.1415926535f*rekt.angle/180.0f;
421 |     cerr << h_K[0] << endl;
422 |     cerr << h_K[1] << endl;
423 |     cerr << h_K[2] << endl;
424 |     cerr << h_K[3] << endl;
425 |     cerr << h_K[4] << endl;
426 | 
427 |     size_t sizeK = *keypoints * sizeof(float) * 5;
428 |     cudaMemcpyAsync(d_K, h_K, sizeK, cudaMemcpyHostToDevice);
429 | 
430 |     float *d_out, *h_out;
431 |     size_t sizeOut = sizeof(float) * 64 * 64;
432 |     cudaMallocHost((void **) &h_out, sizeOut);
433 |     cudaCalloc((void **) &d_out, sizeOut);
434 | 
435 | 
436 |     dim3 threadsPerBlock(_warpSize, warpsPerBlock);
437 |     dim3 blocksPerGrid(*keypoints, 1, 1);
438 |     checkLaunchError();
439 |     latch<<<blocksPerGrid, threadsPerBlock>>>(d_K, d_D, width, height, d_mask, d_out);
440 |     cudaDeviceSynchronize();
441 |     cudaMemcpy(h_out, d_out, sizeOut, cudaMemcpyDeviceToHost);
442 |     cudaDeviceSynchronize();
443 |     for (int j=0; j<64; j++) {
444 |         for (int i=0; i<64; i++) {
445 |             outMat.at<uchar>(j, 3*i) = h_out[j*64+i];
446 |             outMat.at<uchar>(j, 3*i+1) = h_out[j*64+i];
447 |             outMat.at<uchar>(j, 3*i+2) = h_out[j*64+i];
448 |             // cerr << " " << h_out[j*64+i];
449 |         }
450 |     }
451 |     cerr << endl;
452 |     checkLaunchError();
453 |     cudaEventRecord(latchFinished);
454 | }
455 | 


--------------------------------------------------------------------------------
/latchAff.h:
--------------------------------------------------------------------------------
 1 | void latchAff( Mat,
 2 |             unsigned char *,
 3 |             size_t,
 4 |             float *,
 5 |             unsigned int *,
 6 |             int *,
 7 |             int,
 8 |             float *,
 9 |             vector<KeyPoint>*,
10 |             float*,
11 |             cudaEvent_t,
12 |             Mat,
13 |             RotatedRect);
14 | 
15 | void initPatchTriplets(cudaArray*);
16 | 
17 | void initImage(    unsigned char**,
18 |                     int,
19 |                     int,
20 |                     size_t *
21 |                 );
22 | 
23 | void initMask(      float **,
24 |                     float *);
25 | 


--------------------------------------------------------------------------------
/min.cpp:
--------------------------------------------------------------------------------
  1 | #include <vector>
  2 | #include <iostream>
  3 | #include <time.h>
  4 | #include "cuda.h"
  5 | #include "cuda_runtime.h"
  6 | #include "opencv2/opencv.hpp"
  7 | using namespace std;
  8 | using namespace cv;
  9 | #include "latch.h"
 10 | #include "bitMatcher.h"
 11 | 
 12 | #define cudaCalloc(A, B) \
 13 |     do { \
 14 |         cudaError_t __cudaCalloc_err = cudaMalloc(A, B); \
 15 |         if (__cudaCalloc_err == cudaSuccess) cudaMemset(*A, 0, B); \
 16 |     } while (0)
 17 | 
 18 | int main( int argc, char** argv ) {
 19 |     if (argc != 3) {
 20 |         cout << "Please pass the names of two images (same size) as arguments:    ./min img1.png img2.png" << endl;
 21 |         return -1;
 22 |     }
 23 |     // maKP is the maximum number of keypoints/features you will be able to use on the GPU.
 24 |     // This _must_ be an integer multiple of 512.
 25 |     // Integers which are themselves a multiple of the number of streaming multiprocessors
 26 |     // on your GPU (or half that number) should work well.
 27 |     const int maxKP = 512 * 15;
 28 |     int matchThreshold = 8; // Second best match must be at least matchThreshold from best match.
 29 |     int fastThreshold = 12; // For keypoint detection.
 30 |     clock_t t;
 31 | 
 32 |     Mat img1, img2, img1g, img2g, imgMatches;
 33 |     vector<KeyPoint> keypoints1, keypoints2;
 34 |     vector<DMatch> matches;
 35 | 
 36 |     img1 = imread(argv[1], IMREAD_COLOR);
 37 |     img2 = imread(argv[2], IMREAD_COLOR);
 38 |     cv::cvtColor(img1, img1g, CV_BGR2GRAY);
 39 |     cv::cvtColor(img2, img2g, CV_BGR2GRAY);
 40 |     const int imgWidth = img1.cols; // Assumes both images are the same size.
 41 |     const int imgHeight = img1.rows;
 42 | 
 43 |     // Sizes for host and device arrays both.
 44 |     size_t sizeK = maxKP * sizeof(int) * 2; // K for Keypoint
 45 |     size_t sizeI = imgWidth * imgHeight * sizeof(unsigned char); // I for Image
 46 |     size_t sizeD = maxKP * (2048 / 32) * sizeof(unsigned int); // D for Descriptor. 32 bits per uint32. 2048 bits per descriptor.
 47 |     size_t sizeM = maxKP * sizeof(int); // M for Matches
 48 | 
 49 |     // Host (CPU) arrays
 50 |     int *h_K1, *h_K2;
 51 |     cudaMallocHost((void **) &h_K1, sizeK); // Page locked memory is faster to transfer to-and-from the GPU
 52 |     cudaMallocHost((void **) &h_K2, sizeK); // (but that isnt really our bottleneck)
 53 |     int h_M1[maxKP];
 54 |     int h_M2[maxKP]; // For reasons unknown to me, if I use cudaMallocHost for h_M2 everything breaks...? Would love to know why.
 55 |     int numKP1, numKP2; // Minimum of the vector of keypoints.size() and maxKP (the max number of keypoints the GPU is prepared to handle)
 56 | 
 57 |     // Device (GPU) pointers. You can not directly look at device memory (without transfering it back to the host, aka CPU)
 58 |     unsigned char *d_I;
 59 |     unsigned int *d_D1, *d_D2;
 60 |     int *d_K, *d_M1, *d_M2;
 61 |     float *d_K, *d_mask;
 62 |     cudaCalloc((void **) &d_K, sizeK);
 63 |     cudaCalloc((void **) &d_I, sizeI);
 64 |     cudaCalloc((void **) &d_D1, sizeD);
 65 |     cudaCalloc((void **) &d_D2, sizeD);
 66 |     cudaCalloc((void **) &d_M1, sizeM);
 67 |     cudaCalloc((void **) &d_M2, sizeM);
 68 |     cudaCalloc((void **) &d_mask, sizeM);
 69 | 
 70 |     // The patch triplet locations for LATCH fits in texture memory cache.
 71 |     cudaArray* triplets;
 72 |     initPatchTriplets(triplets);
 73 | 
 74 |     size_t pitch;
 75 |     initImage(&d_I, imgWidth, imgHeight, &pitch);
 76 |     initMask(&d_mask, h_mask);
 77 | 
 78 |     // Events allow asynchronous, nonblocking launch of subsequent kernels after a given event has happened.
 79 |     cudaEvent_t latchFinishedEvent;
 80 |     cudaEventCreate(&latchFinishedEvent);
 81 |     // You should create a new stream for each bitMatcher kernel you want to launch at once.
 82 |     cudaStream_t stream1, stream2;
 83 |     cudaStreamCreate(&stream1);
 84 |     cudaStreamCreate(&stream2);
 85 | 
 86 |     // Normal OpenCV CPU code.
 87 |     FAST(img1g, keypoints1, fastThreshold);
 88 |     FAST(img2g, keypoints2, fastThreshold); // If we were clever, we would put this after the first LATCH call, so both the CPU and GPU would be working at the same time.
 89 | 
 90 |     t = clock(); // Begin timing kernel launches.
 91 |     // LATCH runs on the default stream and will block until it is finished.
 92 |     latch( img1g, d_I, pitch, h_K1, d_D1, &numKP1, maxKP, d_K, &keypoints1, d_mask, latchFinishedEvent);
 93 |     latch( img2g, d_I, pitch, h_K2, d_D2, &numKP2, maxKP, d_K, &keypoints2, d_mask, latchFinishedEvent);
 94 | 
 95 |     // latch( img1g, h_K1, d_D1, &numKP1, maxKP, d_K, d_I, &keypoints1, imgWidth, imgHeight, latchFinishedEvent ); // The latchFinishedEvent will be overridden by the next LATCH launch.  (this one will be ignored)
 96 |     // latch( img2g, h_K2, d_D2, &numKP2, maxKP, d_K, d_I, &keypoints2, imgWidth, imgHeight, latchFinishedEvent ); // This call will only begin after the above has completed. (but is still non blocking)
 97 |     bitMatcher( d_D1, d_D2, numKP1, numKP2, maxKP, d_M1, matchThreshold, stream1, latchFinishedEvent ); // Each concurrent bitMatcher launch should get its own d_M# pointer and its own stream#
 98 |     bitMatcher( d_D2, d_D1, numKP2, numKP1, maxKP, d_M2, matchThreshold, stream2, latchFinishedEvent ); // Both bitMatcher launches will start in parallel when the most recent call to LATCH completes.
 99 |     cout << "Launching kernels took " << 1000*(clock() - t)/(float)CLOCKS_PER_SEC << " milliseconds." << endl;
100 | 
101 |     // Put as much CPU code as possible here.
102 |     // The CPU can continue to do useful work while the GPU is thinking.
103 |     // If you put no code here, the CPU will stall until the GPU is done.
104 | 
105 |     t = clock(); // Begin timing wasted CPU time.
106 |     getMatches(maxKP, h_M1, d_M1);
107 |     getMatches(maxKP, h_M2, d_M2);
108 |     cout << "Gathering results took " << 1000*(clock() - t)/(float)CLOCKS_PER_SEC << " milliseconds." << endl;
109 |     for (int i=0; i<numKP1; i++) {
110 |         if (h_M1[i] >= 0 && h_M1[i] < numKP2 && h_M2[h_M1[i]] == i) {
111 |             matches.push_back( DMatch(i, h_M1[i], 0));
112 |         }
113 |     }
114 |     cout << "Between " << keypoints1.size() << " and " << keypoints2.size() << " keypoints found " << matches.size() << " matches." << endl;
115 | 
116 |     drawMatches( img1, keypoints1, img2, keypoints2,
117 |         matches, imgMatches, Scalar::all(-1), Scalar::all(-1),
118 |         vector<char>(), DrawMatchesFlags::NOT_DRAW_SINGLE_POINTS );
119 |     resize(imgMatches, imgMatches, Size(1280, 480));
120 |     imshow( "Matches", imgMatches );
121 |     waitKey(0);
122 | 
123 |     cudaFreeArray(triplets);
124 |     cudaFree(d_K);
125 |     cudaFree(d_I);
126 |     cudaFree(d_D1);
127 |     cudaFree(d_D2);
128 |     cudaFree(d_M1);
129 |     cudaFree(d_M2);
130 |     cudaFreeHost(h_K1);
131 |     cudaFreeHost(h_K2);
132 |     return 0;
133 | }
134 | 


--------------------------------------------------------------------------------
/vo.cpp:
--------------------------------------------------------------------------------
  1 | #include <vector>
  2 | #include <iostream>
  3 | #include <stdio.h>
  4 | #include <time.h>
  5 | #include "cuda.h"
  6 | #include "cuda_runtime.h"
  7 | #include "opencv2/opencv.hpp"
  8 | using namespace std;
  9 | using namespace cv;
 10 | #include "latch.h"
 11 | #include "bitMatcher.h"
 12 | // #include "gpuFacade.hpp"
 13 | 
 14 | #define cudaCalloc(A, B) \
 15 |     do { \
 16 |         cudaError_t __cudaCalloc_err = cudaMalloc(A, B); \
 17 |         if (__cudaCalloc_err == cudaSuccess) cudaMemset(*A, 0, B); \
 18 |     } while (0)
 19 | 
 20 | #define checkError(ans) { gpuAssert((ans), __FILE__, __LINE__); }
 21 | inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=true) {
 22 |    if (code != cudaSuccess) {
 23 |       fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
 24 |       if (abort) exit(code);
 25 |    }
 26 | }
 27 | 
 28 | #define checkLaunchError()                                            \
 29 | do {                                                                  \
 30 |     /* Check synchronous errors, i.e. pre-launch */                   \
 31 |     cudaError_t err = cudaGetLastError();                             \
 32 |     if (cudaSuccess != err) {                                         \
 33 |         fprintf (stderr, "Cuda error in file '%s' in line %i : %s.\n",\
 34 |                  __FILE__, __LINE__, cudaGetErrorString(err) );       \
 35 |         exit(EXIT_FAILURE);                                           \
 36 |     }                                                                 \
 37 |     /* Check asynchronous errors, i.e. kernel failed (ULF) */         \
 38 |     err = cudaThreadSynchronize();                                    \
 39 |     if (cudaSuccess != err) {                                         \
 40 |         fprintf (stderr, "Cuda error in file '%s' in line %i : %s.\n",\
 41 |                  __FILE__, __LINE__, cudaGetErrorString( err) );      \
 42 |         exit(EXIT_FAILURE);                                           \
 43 |     }                                                                 \
 44 | } while (0)
 45 | 
 46 | 
 47 | // Sometimes the recovered pose is 180 degrees off...? I thought cheirality test would handle that, but apparently not always.
 48 | double dist2(Mat a, Mat b) {
 49 |     double s = 0.0;
 50 |     for (int i=0; i<3; i++) {
 51 |         const double t = a.at<double>(i) - b.at<double>(i);
 52 |         s += t*t;
 53 |     }
 54 |     return s;
 55 | }
 56 | 
 57 | // In general a suffix of 1 means previous frame, and 2 means current frame.
 58 | // However, we start processing the next frame while the GPU is working on current...
 59 | // So at a certain point frame 1 shifts down to 0, 2 shifts down to 1, and the new 2 is loaded.
 60 | int main( int argc, char** argv ) {
 61 |     // gpuFacade gpu;
 62 |     // gpu.set_values(3,4);
 63 |     // cerr << "!! " << gpu.area() << endl;
 64 | 
 65 |     // This must be an integer multiple of 512.
 66 |     // Specifically, half-multiples of the number of SM's for your GPU are sensible.
 67 |     // I have 10 streaming multiprocessors, so I chose 15*512 = 7680.
 68 |     const int maxKP = 512 * 15;
 69 |     const bool showMatches = true;
 70 |     // Shows every Nth processed frame's matches.
 71 |     const int showMatchesInterval = 10;
 72 |     const bool showVideo = true;
 73 |     // Shows every Nth processed frame.
 74 |     const int showVideoInterval = 1;
 75 |     int WIDTH, HEIGHT, totalMatches, totalInliers = 0;
 76 |     const int matchThreshold = 12;
 77 |     // Discard this many frames for each one processed. Change with +/- keys while running.
 78 |     int skipFrames = 0;
 79 |     // Threshold for FAST detector
 80 |     int threshold = 20;
 81 |     int targetKP = 3000;
 82 |     int tolerance = 200;
 83 |     int maxLoops = 100;//4200;
 84 |     const bool gnuplot = true;
 85 |     double defect = 0.0;
 86 |     int extractions = 0;
 87 | 
 88 |     VideoCapture cap;
 89 |     if (argc == 1) {
 90 |         cap = VideoCapture(0);
 91 |         WIDTH  = cap.get(CAP_PROP_FRAME_WIDTH);
 92 |         HEIGHT = cap.get(CAP_PROP_FRAME_HEIGHT);
 93 |     }
 94 |     if (argc == 2 || argc == 3) {
 95 |         cap = VideoCapture(argv[1]);
 96 |         WIDTH  = cap.get(CAP_PROP_FRAME_WIDTH);
 97 |         HEIGHT = cap.get(CAP_PROP_FRAME_HEIGHT);
 98 |         if (argc == 3) {
 99 |             for (int i=0; i<atoi(argv[2]); i++) {
100 |                 cap.grab();
101 |             }
102 |         }
103 |     }
104 |     if (argc == 4) {
105 |         cap = VideoCapture(0);
106 |         WIDTH  = atoi(argv[2]);
107 |         HEIGHT = atoi(argv[3]);
108 |         cap.set(CAP_PROP_FRAME_WIDTH,  WIDTH);
109 |         cap.set(CAP_PROP_FRAME_HEIGHT, HEIGHT);
110 |     }
111 | 
112 |     double f = 0.4;
113 |     double data[]= {f*WIDTH,  0.0,  WIDTH*0.5,  0.0, f*HEIGHT, HEIGHT*0.5, 0.0, 0.0, 1.0};
114 |     Mat K(3, 3, CV_64F, data);
115 |     Mat F, R, T, rod, mask;
116 |     Mat img0, img1, img2, img1g, img2g, imgMatches, E, rodOld;
117 | 
118 |     cap >> img1;
119 |     cap >> img2;
120 |     cv::cvtColor(img1, img1g, CV_BGR2GRAY);
121 |     cv::cvtColor(img2, img2g, CV_BGR2GRAY);
122 |     if (showMatches) {
123 |         namedWindow("Matches", WINDOW_NORMAL);
124 |     }
125 |     waitKey(1);
126 |     if (showVideo) {
127 |         namedWindow("Video", WINDOW_NORMAL);
128 |     }
129 |     waitKey(1);
130 |     resizeWindow("Matches", 1920/2, 540/2);
131 |     resizeWindow("Video", 960, 540);
132 |     moveWindow("Matches", 0, 540+55);
133 |     moveWindow("Video", 0, 0);
134 |     waitKey(1);
135 | 
136 |     cudaEvent_t start, stop;
137 |     cudaEventCreate(&start);
138 |     cudaEventCreate(&stop);
139 | 
140 |     vector<KeyPoint> keypoints0, keypoints1, keypoints2;
141 |     vector<DMatch> goodMatches;
142 |     vector<Point2f> p1, p2; // Point correspondences for recovering pose.
143 |     int numKP0, numKP1, numKP2; // The actual number of keypoints we are dealing with: just keypoints#.size(), but capped at maxKP.
144 |     int key = -1;
145 |     clock_t timer, timer2;
146 |     float time;
147 | 
148 |     // Sizes for device and host pointers
149 |     size_t sizeK = maxKP * sizeof(float) * 5; // K for keypoints
150 |     size_t sizeI = WIDTH * HEIGHT * sizeof(unsigned char); // I for Image
151 |     size_t sizeD = maxKP * (2048 / 32) * sizeof(unsigned int); // D for Descriptor
152 |     size_t sizeM = maxKP * sizeof(int); // M for Matches
153 |     size_t sizeMask = 64 * sizeof(float);
154 | 
155 |     // Host pointers
156 |     float *h_K1, *h_K2;
157 |     cudaMallocHost((void **) &h_K1, sizeK);
158 |     cudaMallocHost((void **) &h_K2, sizeK);
159 |     // For reasons opaque to me, allocating both (but not either) h_M1 or h_M2
160 |     // with cudaMallocHost segfaults, apparently after graceful exit? So neither of them are pinned.
161 |     int h_M1[maxKP];
162 |     int h_M2[maxKP];
163 |     float h_mask[64];
164 |     for (int i=0; i<64; i++) { h_mask[i] = 1.0f; }
165 | 
166 |     // Device pointers
167 |     unsigned char *d_I;
168 |     unsigned int *d_D1, *d_D2, *uIntSwapPointer;
169 |     int *d_M1, *d_M2;
170 |     float *d_K, *d_mask;
171 |     cudaCalloc((void **) &d_K, sizeK);
172 |     cudaCalloc((void **) &d_D1, sizeD);
173 |     cudaCalloc((void **) &d_D2, sizeD);
174 |     cudaCalloc((void **) &d_M1, sizeM);
175 |     cudaCalloc((void **) &d_M2, sizeM);
176 |     cudaCalloc((void **) &d_mask, sizeM);
177 | 
178 |     // The patch triplet locations for LATCH fits in texture memory cache.
179 |     cudaArray* patchTriplets;
180 |     initPatchTriplets(patchTriplets);
181 |     size_t pitch;
182 |     initImage(&d_I, WIDTH, HEIGHT, &pitch);
183 |     initMask(&d_mask, h_mask);
184 | 
185 |     // Events allow asynchronous, nonblocking launch of subsequent kernels after a given event has happened,
186 |     // such as completion of a different kernel on a different stream.
187 |     cudaEvent_t latchFinished;
188 |     cudaEventCreate(&latchFinished);
189 |     // You should create a new stream for each bitMatcher kernel you want to launch at once.
190 |     cudaStream_t streanumKP1, streanumKP2;
191 |     cudaStreamCreate(&streanumKP1);
192 |     cudaStreamCreate(&streanumKP2);
193 | 
194 |     FAST(img1g, keypoints1, threshold);
195 |     extractions += keypoints1.size();
196 |     latch( img1g, d_I, pitch, h_K1, d_D1, &numKP1, maxKP, d_K, &keypoints1, d_mask, latchFinished );
197 |     FAST(img2g, keypoints2, threshold); // This call to fast is concurrent with above execution.
198 |     extractions += keypoints2.size();
199 |     latch( img2g, d_I, pitch, h_K2, d_D2, &numKP2, maxKP, d_K, &keypoints2, d_mask, latchFinished );
200 |     bitMatcher( d_D1, d_D2, numKP1, numKP2, maxKP, d_M1, matchThreshold, streanumKP1, latchFinished );
201 |     bitMatcher( d_D2, d_D1, numKP2, numKP1, maxKP, d_M2, matchThreshold, streanumKP2, latchFinished );
202 |     timer = clock();
203 |     getMatches(maxKP, h_M1, d_M1);
204 |     getMatches(maxKP, h_M2, d_M2);
205 |     for (int i=0; i<numKP1; i++) {
206 |         if (h_M1[i] >= 0 && h_M1[i] < numKP2 && h_M2[h_M1[i]] == i) {
207 |             goodMatches.push_back( DMatch(i, h_M1[i], 0)); // For drawing.
208 |             p1.push_back(keypoints1[i].pt); // For recovering pose.
209 |             p2.push_back(keypoints2[h_M1[i]].pt);
210 |         }
211 |     }
212 | 
213 |     img1.copyTo(img0);
214 |     img2.copyTo(img1);
215 |     cap.read(img2);
216 |     cvtColor(img2, img2g, CV_BGR2GRAY);
217 | 
218 |     keypoints0 = keypoints1;
219 |     keypoints1 = keypoints2;
220 | 
221 |     uIntSwapPointer = d_D1;
222 |     d_D1 = d_D2;
223 |     d_D2 = uIntSwapPointer;
224 | 
225 |     numKP0 = numKP1;
226 |     numKP1 = numKP2;
227 | 
228 |     FAST(img2g, keypoints2, threshold);
229 |     int loopIteration = 0;
230 |     for (; loopIteration < maxLoops || maxLoops == -1; loopIteration++) { // Main Loop.
231 |         { // GPU code for descriptors and matching.
232 |             cudaEventRecord(start, 0);
233 |             extractions += keypoints2.size();
234 |             latch( img2g, d_I, pitch, h_K2, d_D2, &numKP2, maxKP, d_K, &keypoints2, d_mask, latchFinished);
235 |             bitMatcher( d_D1, d_D2, numKP1, numKP2, maxKP, d_M1, matchThreshold, streanumKP1, latchFinished );
236 |             bitMatcher( d_D2, d_D1, numKP2, numKP1, maxKP, d_M2, matchThreshold, streanumKP2, latchFinished );
237 |             cudaEventRecord(stop, 0);
238 |         }
239 |         timer = clock();
240 |         { // Put as much CPU code here as possible.
241 |             { // Display matches and/or video to user.
242 |                 bool needToDraw = false;
243 |                 if (showMatches && loopIteration % showMatchesInterval == 0) { // Draw matches.
244 |                     drawMatches( img0, keypoints0, img1, keypoints1,
245 |                         goodMatches, imgMatches, Scalar::all(-1), Scalar::all(-1),
246 |                         vector<char>(), DrawMatchesFlags::NOT_DRAW_SINGLE_POINTS );
247 |                     imshow( "Matches", imgMatches );
248 |                     needToDraw = true;
249 |                 }
250 |                 if (showVideo && loopIteration % showVideoInterval == 0) {
251 |                     imshow("Video", img1);
252 |                     needToDraw = true;
253 |                 }
254 |                 if (needToDraw) {
255 |                     key = waitKey(1);
256 |                 }
257 |             }
258 |             { // Handle user input.
259 |                 switch (key) {
260 |                     case (-1):
261 |                     break;
262 |                     case (1048689): // q
263 |                     case (113): // also q
264 |                         return 0;
265 |                     break;
266 |                     case (1048695): // w
267 |                         waitKey(0);
268 |                     break;
269 |                     case (1114027): // +
270 |                         skipFrames++;
271 |                         cerr << "For each processed frame we are now skipping " << skipFrames << endl;
272 |                     break;
273 |                     case (1114029): // -
274 |                         skipFrames = max(1, --skipFrames);
275 |                         cerr << "For each processed frame we are now skipping " << skipFrames << endl;
276 |                     break;
277 |                     default:
278 |                         cerr << "Currently pressed key is:   " << key << endl;
279 |                     break;
280 |                 }
281 |                 key = -1;
282 |             }
283 |             { // Iterate the "logical" loop (get ready to process next frame)
284 |                 img1.copyTo(img0);
285 |                 img2.copyTo(img1);
286 |                 for (int i=0; i<skipFrames; i++) {
287 |                     cap.grab();
288 |                 }
289 |                 cap.read(img2);
290 |                 if (img2.cols == 0) break;
291 |                 cvtColor(img2, img2g, CV_BGR2GRAY);
292 | 
293 |                 keypoints0 = keypoints1;
294 |                 keypoints1 = keypoints2;
295 | 
296 |                 uIntSwapPointer = d_D1;
297 |                 d_D1 = d_D2;
298 |                 d_D2 = uIntSwapPointer;
299 | 
300 |                 numKP0 = numKP1;
301 |                 numKP1 = numKP2;
302 |             }
303 |             { // Solve for and output rotation vector (this gets piped to feedgnuplot).
304 |                 if (10 < p1.size() && 10 < p2.size()) {
305 |                     E = findEssentialMat(p1, p2, f*WIDTH, Point2d(WIDTH*0.5f, HEIGHT*0.5f), RANSAC, 0.999, 3.0, mask);
306 |                     int inliers = 0;
307 |                     for (int i=0; i<mask.rows; i++) {
308 |                         inliers += mask.data[i];
309 |                     }
310 |                     totalInliers += inliers;
311 |                     double size = p1.size();
312 |                     double r = inliers/max((double)size, 150.0);
313 |                     r = 1.0 - min(r + 0.05, 1.0);
314 |                     defect += r*r;
315 |                     cout << "11:" << r*r << endl;
316 | 
317 |                     recoverPose(E, p1, p2, R, T, f*WIDTH, Point2d(WIDTH*0.5f, HEIGHT*0.5f), mask);
318 |                     Rodrigues(R, rod);
319 |                     if (loopIteration==0) {
320 |                         rod.copyTo(rodOld);
321 |                     }
322 |                     if (dist2(rod, rodOld) < 1.0) {
323 |                         rod.copyTo(rodOld);
324 |                     } else {
325 |                         cerr << "Rejecting the recovered pose: " << rod.t() * 57.2957795 << endl;
326 |                         // This commented out chunk of code is good for webcams. If you initialize with a bad value it will recover.
327 |                         // const double alpha = 0.1; // Move our region of acceptable responses (only a little) closer to the observed (but presumed erroneous) value.
328 |                         // for (int i=0; i<3; i++) {
329 |                         //     rodOld.at<double>(i) = rodOld.at<double>(i)*(1.0-alpha) + rod.at<double>(i)*alpha;
330 |                         // }
331 |                         rodOld.copyTo(rod);
332 |                     }
333 |                 } else {
334 |                     defect += 1.0;
335 |                     cout << "11:" << 1.0 << endl;
336 |                     cerr << "Too few matches! Not going to try to recover pose this frame." << endl;
337 |                 }
338 |                 // To prevent the graphs from desynchronizing from each other, we have to output this unconditionally.
339 |                 if (gnuplot) {
340 |                     for (int i=0; i<3; i++) {
341 |                         cout << i << ":" << rod.at<double>(i) * 57.2957795 << endl; // Output Rodrigues vector, rescaled to degrees
342 |                     }
343 |                     // T is unit norm (scale-less) and often erroneously sign-reversed.
344 |                     // if (T.at<double>(2) < 0) T = -T; // Assume dominate motion is forward... (this is not an elegant assumption)
345 |                     // double theta = atan2(T.at<double>(0), T.at<double>(2));
346 |                     // double phi = atan2(T.at<double>(1), T.at<double>(2));
347 |                     // cout << 3 << ":" << theta * 57.2957795 << endl; // Plot polar translation angle
348 |                     // cout << 4 << ":" << phi * 57.2957795 << endl; // Plot azimuthal translation angle
349 |                 }
350 |             }
351 |             { // run FAST detector on the CPU for next frame (get ready for next loop iteration).
352 |                 FAST(img2g, keypoints2, threshold);
353 |                 // Apply proportional control to threshold to drive it towards targetKP.
354 |                 int control = (int)(((float)keypoints2.size() - (float)targetKP) / (float)tolerance);
355 |                 threshold += min(100, control);
356 |                 if (threshold < 1) threshold = 1;
357 |             }
358 |         }
359 |         if (gnuplot) {
360 |             time = (1000*(clock() - timer)/(double)CLOCKS_PER_SEC);
361 |             cout << "9:" << time << endl; // Plot CPU time.
362 |             timer = clock();
363 |         }
364 |         { // Get new GPU results
365 |             p1.clear();
366 |             p2.clear();
367 |             goodMatches.clear();
368 |             getMatches(maxKP, h_M1, d_M1);
369 |             getMatches(maxKP, h_M2, d_M2);
370 |             cudaEventElapsedTime(&time, start, stop);
371 |             if (gnuplot) {
372 |                 cout << "10:" << (time+(1000*(clock() - timer)/(double)CLOCKS_PER_SEC)) << endl; // Plot total asynchronous GPU time.
373 |             }
374 |             for (int i=0; i<numKP0; i++) {
375 |                 if (h_M1[i] >= 0 && h_M1[i] < numKP1 && h_M2[h_M1[i]] == i) {
376 |                     goodMatches.push_back( DMatch(i, h_M1[i], 0)); // For drawing matches.
377 |                     p1.push_back(keypoints0[i].pt); // For recovering pose.
378 |                     p2.push_back(keypoints1[h_M1[i]].pt);
379 |                 }
380 |             }
381 |         }
382 |         if (gnuplot) {
383 |             cout << "6:" << numKP1 << endl; // Plot number of keypoints.
384 |             cout << "7:" << p1.size() << endl; // Plot number of matches.
385 |             cout << "8:" << 100*threshold << endl; // Plot current threshold for FAST.
386 |         }
387 |         totalMatches += p1.size();
388 |     }
389 |     cudaFreeArray(patchTriplets);
390 |     cudaFree(d_K);
391 |     cudaFree(d_D1);
392 |     cudaFree(d_D2);
393 |     cudaFree(d_M1);
394 |     cudaFree(d_M2);
395 |     cudaFreeHost(h_K1);
396 |     cudaFreeHost(h_K2);
397 |     cerr << "Total matches: " << totalMatches << endl;
398 |     cerr << "Total inliers: " << totalInliers << endl;
399 |     cerr << "Defect: " << defect << endl;
400 |     cerr << "Loop iteration: " << loopIteration << endl;
401 |     cerr << "Extractions: " << extractions << endl;
402 | 
403 |     return 0;
404 | }
405 | 


--------------------------------------------------------------------------------
/vo2.cpp:
--------------------------------------------------------------------------------
  1 | #include <vector>
  2 | #include <iostream>
  3 | #include <stdio.h>
  4 | #include <time.h>
  5 | #include "cuda.h"
  6 | #include "cuda_runtime.h"
  7 | #include "opencv2/opencv.hpp"
  8 | using namespace std;
  9 | using namespace cv;
 10 | #include "latch.h"
 11 | #include "bitMatcher.h"
 12 | #include "gpuFacade.hpp"
 13 | 
 14 | #define cudaCalloc(A, B) \
 15 |     do { \
 16 |         cudaError_t __cudaCalloc_err = cudaMalloc(A, B); \
 17 |         if (__cudaCalloc_err == cudaSuccess) cudaMemset(*A, 0, B); \
 18 |     } while (0)
 19 | 
 20 | #define checkError(ans) { gpuAssert((ans), __FILE__, __LINE__); }
 21 | inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=true) {
 22 |    if (code != cudaSuccess) {
 23 |       fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
 24 |       if (abort) exit(code);
 25 |    }
 26 | }
 27 | 
 28 | #define checkLaunchError()                                            \
 29 | do {                                                                  \
 30 |     /* Check synchronous errors, i.e. pre-launch */                   \
 31 |     cudaError_t err = cudaGetLastError();                             \
 32 |     if (cudaSuccess != err) {                                         \
 33 |         fprintf (stderr, "Cuda error in file '%s' in line %i : %s.\n",\
 34 |                  __FILE__, __LINE__, cudaGetErrorString(err) );       \
 35 |         exit(EXIT_FAILURE);                                           \
 36 |     }                                                                 \
 37 |     /* Check asynchronous errors, i.e. kernel failed (ULF) */         \
 38 |     err = cudaThreadSynchronize();                                    \
 39 |     if (cudaSuccess != err) {                                         \
 40 |         fprintf (stderr, "Cuda error in file '%s' in line %i : %s.\n",\
 41 |                  __FILE__, __LINE__, cudaGetErrorString( err) );      \
 42 |         exit(EXIT_FAILURE);                                           \
 43 |     }                                                                 \
 44 | } while (0)
 45 | 
 46 | 
 47 | // Sometimes the recovered pose is 180 degrees off...? I thought cheirality test would handle that, but apparently not always.
 48 | double dist2(Mat a, Mat b) {
 49 |     double s = 0.0;
 50 |     for (int i=0; i<3; i++) {
 51 |         const double t = a.at<double>(i) - b.at<double>(i);
 52 |         s += t*t;
 53 |     }
 54 |     return s;
 55 | }
 56 | 
 57 | // In general a suffix of 1 means previous frame, and 2 means current frame.
 58 | // However, we start processing the next frame while the GPU is working on current...
 59 | // So at a certain point frame 1 shifts down to 0, 2 shifts down to 1, and the new 2 is loaded.
 60 | int main( int argc, char** argv ) {
 61 |     // This must be an integer multiple of 512.
 62 |     // Specifically, half-multiples of the number of SM's for your GPU are sensible.
 63 |     // I have 10 streaming multiprocessors, so I chose 15*512 = 7680.
 64 |     const int maxKP = 512 * 15;
 65 |     const bool showMatches = true;
 66 |     // Shows every Nth processed frame's matches.
 67 |     const int showMatchesInterval = 10;
 68 |     const bool showVideo = true;
 69 |     // Shows every Nth processed frame.
 70 |     const int showVideoInterval = 1;
 71 |     int WIDTH, HEIGHT, totalMatches = 0, totalInliers = 0;
 72 |     const int matchThreshold = 12;
 73 |     // Discard this many frames for each one processed. Change with +/- keys while running.
 74 |     int skipFrames = 0;
 75 |     // Threshold for FAST detector
 76 |     int threshold = 90;
 77 |     int targetKP = 3000;
 78 |     int tolerance = 200;
 79 |     int maxLoops = 150;
 80 |     const bool gnuplot = true;
 81 |     double defect = 0.0;
 82 | 
 83 |     VideoCapture cap;
 84 |     // if (argc == 1) {
 85 |     //     cap = VideoCapture(0);
 86 |     //     WIDTH  = cap.get(CAP_PROP_FRAME_WIDTH);
 87 |     //     HEIGHT = cap.get(CAP_PROP_FRAME_HEIGHT);
 88 |     // }
 89 |     if (argc == 2 || argc == 3) {
 90 |         cap = VideoCapture(argv[1]);
 91 |         WIDTH  = cap.get(CAP_PROP_FRAME_WIDTH);
 92 |         HEIGHT = cap.get(CAP_PROP_FRAME_HEIGHT);
 93 |         if (argc == 3) {
 94 |             for (int i=0; i<atoi(argv[2]); i++) {
 95 |                 cap.grab();
 96 |             }
 97 |         }
 98 |     }
 99 |     // if (argc == 4) {
100 |     //     cap = VideoCapture(0);
101 |     //     WIDTH  = atoi(argv[2]);
102 |     //     HEIGHT = atoi(argv[3]);
103 |     //     cap.set(CAP_PROP_FRAME_WIDTH,  WIDTH);
104 |     //     cap.set(CAP_PROP_FRAME_HEIGHT, HEIGHT);
105 |     // }
106 | 
107 |     double f = 0.4;
108 |     double data[]= {f*WIDTH,  0.0,  WIDTH*0.5,  0.0, f*HEIGHT, HEIGHT*0.5, 0.0, 0.0, 1.0};
109 |     Mat K(3, 3, CV_64F, data);
110 |     Mat F, R, T, rod, mask;
111 |     Mat img0, img1, img2, img1g, img2g, imgMatches, E, rodOld;
112 | 
113 |     cap >> img1;
114 |     cap >> img2;
115 |     cv::cvtColor(img1, img1g, CV_BGR2GRAY);
116 |     cv::cvtColor(img2, img2g, CV_BGR2GRAY);
117 |     if (showMatches) {
118 |         namedWindow("Matches", WINDOW_NORMAL);
119 |     }
120 |     waitKey(1);
121 |     if (showVideo) {
122 |         namedWindow("Video", WINDOW_NORMAL);
123 |     }
124 |     waitKey(1);
125 |     resizeWindow("Matches", 1920/2, 540/2);
126 |     resizeWindow("Video", 960, 540);
127 |     moveWindow("Matches", 0, 540+55);
128 |     moveWindow("Video", 0, 0);
129 |     waitKey(1);
130 | 
131 |     cudaEvent_t start, stop;
132 |     cudaEventCreate(&start);
133 |     cudaEventCreate(&stop);
134 | 
135 |     vector<KeyPoint> keypoints0, keypoints1, keypoints2;
136 |     vector<DMatch> goodMatches;
137 |     vector<Point2f> p1, p2; // Point correspondences for recovering pose.
138 |     int key = -1;
139 |     clock_t timer, timer2;
140 |     float time;
141 | 
142 |     gpuFacade gpu(maxKP, WIDTH, HEIGHT);
143 |     FAST(img1g, keypoints1, threshold);
144 |     gpu.LATCH(img1g, gpu.d_D1, &(gpu.numKP1), &keypoints1);
145 |     FAST(img2g, keypoints2, threshold); // This call to fast is concurrent with above execution.
146 |     gpu.LATCH(img2g, gpu.d_D2, &(gpu.numKP2), &keypoints2);
147 |     gpu.match(gpu.d_D1, gpu.d_D2, gpu.numKP1, gpu.numKP2, gpu.d_M1, matchThreshold, gpu.streamKP1);
148 |     gpu.match(gpu.d_D2, gpu.d_D1, gpu.numKP2, gpu.numKP1, gpu.d_M2, matchThreshold, gpu.streamKP2);
149 |     gpu.getResults(gpu.h_M1, gpu.d_M1);
150 |     gpu.getResults(gpu.h_M2, gpu.d_M2);
151 |     for (int i=0; i<gpu.numKP1; i++) {
152 |         if (gpu.h_M1[i] >= 0 && gpu.h_M1[i] < gpu.numKP2 && gpu.h_M2[gpu.h_M1[i]] == i) {
153 |             goodMatches.push_back( DMatch(i, gpu.h_M1[i], 0)); // For drawing.
154 |             p1.push_back(keypoints1[i].pt); // For recovering pose.
155 |             p2.push_back(keypoints2[gpu.h_M1[i]].pt);
156 |         }
157 |     }
158 | 
159 |     img1.copyTo(img0);
160 |     img2.copyTo(img1);
161 |     cap.read(img2);
162 |     cvtColor(img2, img2g, CV_BGR2GRAY);
163 | 
164 |     keypoints0 = keypoints1;
165 |     keypoints1 = keypoints2;
166 | 
167 |     gpu.uIntSwapPointer = gpu.d_D1;
168 |     gpu.d_D1 = gpu.d_D2;
169 |     gpu.d_D2 = gpu.uIntSwapPointer;
170 | 
171 |     gpu.numKP0 = gpu.numKP1;
172 |     gpu.numKP1 = gpu.numKP2;
173 | 
174 |     FAST(img2g, keypoints2, threshold);
175 |     int loopIteration = 0;
176 |     for (; loopIteration < maxLoops || maxLoops == -1; loopIteration++) { // Main Loop.
177 |         { // GPU code for descriptors and matching.
178 |             cudaEventRecord(gpu.start, 0);
179 |             gpu.LATCH(img2g, gpu.d_D2, &(gpu.numKP2), &keypoints2);
180 |             gpu.match(gpu.d_D1, gpu.d_D2, gpu.numKP1, gpu.numKP2, gpu.d_M1, matchThreshold, gpu.streamKP1);
181 |             gpu.match(gpu.d_D2, gpu.d_D1, gpu.numKP2, gpu.numKP1, gpu.d_M2, matchThreshold, gpu.streamKP2);
182 |             cudaEventRecord(gpu.stop, 0);
183 |         }
184 |         timer = clock();
185 |         { // Put as much CPU code here as possible.
186 |             { // Display matches and/or video to user.
187 |                 bool needToDraw = false;
188 |                 if (showMatches && loopIteration % showMatchesInterval == 0) { // Draw matches.
189 |                     drawMatches( img0, keypoints0, img1, keypoints1,
190 |                         goodMatches, imgMatches, Scalar::all(-1), Scalar::all(-1),
191 |                         vector<char>(), DrawMatchesFlags::NOT_DRAW_SINGLE_POINTS );
192 |                     imshow( "Matches", imgMatches );
193 |                     needToDraw = true;
194 |                 }
195 |                 if (showVideo && loopIteration % showVideoInterval == 0) {
196 |                     imshow("Video", img1);
197 |                     needToDraw = true;
198 |                 }
199 |                 if (needToDraw) {
200 |                     key = waitKey(1);
201 |                 }
202 |             }
203 |             { // Handle user input.
204 |                 switch (key) {
205 |                     case (-1):
206 |                     break;
207 |                     case (1048689): // q
208 |                     case (113): // also q
209 |                         return 0;
210 |                     break;
211 |                     case (1048695): // w
212 |                         waitKey(0);
213 |                     break;
214 |                     case (1114027): // +
215 |                         skipFrames++;
216 |                         cerr << "For each processed frame we are now skipping " << skipFrames << endl;
217 |                     break;
218 |                     case (1114029): // -
219 |                         skipFrames = max(1, --skipFrames);
220 |                         cerr << "For each processed frame we are now skipping " << skipFrames << endl;
221 |                     break;
222 |                     default:
223 |                         cerr << "Currently pressed key is:   " << key << endl;
224 |                     break;
225 |                 }
226 |                 key = -1;
227 |             }
228 |             { // Iterate the "logical" loop (get ready to process next frame)
229 |                 img1.copyTo(img0);
230 |                 img2.copyTo(img1);
231 |                 for (int i=0; i<skipFrames; i++) {
232 |                     cap.grab();
233 |                 }
234 |                 cap.read(img2);
235 |                 if (img2.cols == 0) break;
236 |                 cvtColor(img2, img2g, CV_BGR2GRAY);
237 | 
238 |                 keypoints0 = keypoints1;
239 |                 keypoints1 = keypoints2;
240 | 
241 |                 gpu.uIntSwapPointer = gpu.d_D1;
242 |                 gpu.d_D1 = gpu.d_D2;
243 |                 gpu.d_D2 = gpu.uIntSwapPointer;
244 | 
245 |                 gpu.numKP0 = gpu.numKP1;
246 |                 gpu.numKP1 = gpu.numKP2;
247 |             }
248 |             { // Solve for and output rotation vector (this gets piped to feedgnuplot).
249 |                 if (10 < p1.size() && 10 < p2.size()) {
250 |                     E = findEssentialMat(p1, p2, f*WIDTH, Point2d(WIDTH*0.5f, HEIGHT*0.5f), RANSAC, 0.999, 3.0, mask);
251 |                     int inliers = 0;
252 |                     for (int i=0; i<mask.rows; i++) {
253 |                         inliers += mask.data[i];
254 |                     }
255 |                     totalInliers += inliers;
256 |                     double size = p1.size();
257 |                     double r = inliers/max((double)size, 150.0);
258 |                     r = 1.0 - min(r + 0.05, 1.0);
259 |                     defect += r*r;
260 |                     if (gnuplot) {
261 |                         cout << "11:" << r*r << endl;
262 |                     }
263 | 
264 |                     // recoverPose(E, p1, p2, R, T, f*WIDTH, Point2d(WIDTH*0.5f, HEIGHT*0.5f), mask);
265 |                     // Rodrigues(R, rod);
266 |                     // if (loopIteration==0) {
267 |                     //     rod.copyTo(rodOld);
268 |                     // }
269 |                     // if (dist2(rod, rodOld) < 1.0) {
270 |                     //     rod.copyTo(rodOld);
271 |                     // } else {
272 |                     //     cerr << "Rejecting the recovered pose: " << rod.t() * 57.2957795 << endl;
273 |                     //     // This commented out chunk of code is good for webcams. If you initialize with a bad value it will recover.
274 |                     //     // const double alpha = 0.1; // Move our region of acceptable responses (only a little) closer to the observed (but presumed erroneous) value.
275 |                     //     // for (int i=0; i<3; i++) {
276 |                     //     //     rodOld.at<double>(i) = rodOld.at<double>(i)*(1.0-alpha) + rod.at<double>(i)*alpha;
277 |                     //     // }
278 |                     //     rodOld.copyTo(rod);
279 |                     // }
280 |                 } else {
281 |                     defect += 1.0;
282 |                     cout << "11:" << 1.0 << endl;
283 |                     cerr << "Too few matches! Not going to try to recover pose this frame." << endl;
284 |                 }
285 |                 // To prevent the graphs from desynchronizing from each other, we have to output this unconditionally.
286 |                 if (gnuplot) {
287 |                     for (int i=0; i<3; i++) {
288 |                         // cout << i << ":" << rod.at<double>(i) * 57.2957795 << endl; // Output Rodrigues vector, rescaled to degrees
289 |                     }
290 |                     // T is unit norm (scale-less) and often erroneously sign-reversed.
291 |                     // if (T.at<double>(2) < 0) T = -T; // Assume dominate motion is forward... (this is not an elegant assumption)
292 |                     // double theta = atan2(T.at<double>(0), T.at<double>(2));
293 |                     // double phi = atan2(T.at<double>(1), T.at<double>(2));
294 |                     // cout << 3 << ":" << theta * 57.2957795 << endl; // Plot polar translation angle
295 |                     // cout << 4 << ":" << phi * 57.2957795 << endl; // Plot azimuthal translation angle
296 |                 }
297 |             }
298 |             { // run FAST detector on the CPU for next frame (get ready for next loop iteration).
299 |                 FAST(img2g, keypoints2, threshold);
300 |                 // Apply proportional control to threshold to drive it towards targetKP.
301 |                 int control = (int)(((float)keypoints2.size() - (float)targetKP) / (float)tolerance);
302 |                 threshold += min(100, control);
303 |                 if (threshold < 1) threshold = 1;
304 |             }
305 |         }
306 |         if (gnuplot) {
307 |             time = (1000*(clock() - timer)/(double)CLOCKS_PER_SEC);
308 |             cout << "9:" << time << endl; // Plot CPU time.
309 |             timer = clock();
310 |         }
311 |         { // Get new GPU results
312 |             p1.clear();
313 |             p2.clear();
314 |             goodMatches.clear();
315 |             gpu.getResults(gpu.h_M1, gpu.d_M1);
316 |             gpu.getResults(gpu.h_M2, gpu.d_M2);
317 |             cudaEventElapsedTime(&time, gpu.start, gpu.stop);
318 |             if (gnuplot) {
319 |                 cout << "10:" << (time+(1000*(clock() - timer)/(double)CLOCKS_PER_SEC)) << endl; // Plot total asynchronous GPU time.
320 |             }
321 |             checkLaunchError();
322 |             for (int i=0; i<gpu.numKP0; i++) {
323 |                 if (gpu.h_M1[i] >= 0 && gpu.h_M1[i] < gpu.numKP1 && gpu.h_M2[gpu.h_M1[i]] == i) {
324 |                     goodMatches.push_back( DMatch(i, gpu.h_M1[i], 0)); // For drawing matches.
325 |                     p1.push_back(keypoints0[i].pt); // For recovering pose.
326 |                     p2.push_back(keypoints1[gpu.h_M1[i]].pt);
327 |                 }
328 |             }
329 |         }
330 |         if (gnuplot) {
331 |             cout << "6:" << gpu.numKP1 << endl; // Plot number of keypoints.
332 |             cout << "7:" << p1.size() << endl; // Plot number of matches.
333 |             cout << "8:" << 100*threshold << endl; // Plot current threshold for FAST.
334 |         }
335 |         totalMatches += p1.size();
336 |     }
337 |     cerr << "Total matches: " << totalMatches << endl;
338 |     cerr << "Total inliers: " << totalInliers << endl;
339 |     cerr << "Defect: " << defect << endl;
340 |     cerr << "Loop iteration: " << loopIteration << endl;
341 |     return 0;
342 | }
343 | 


--------------------------------------------------------------------------------