├── CMakeLists.txt ├── Makefile ├── README.md ├── affTest.cpp ├── bitMatcher.cu ├── bitMatcher.h ├── driveGnuPlotStreams.pl ├── gpuFacade.cpp ├── gpuFacade.hpp ├── latch.cu ├── latch.h ├── latchAff.cu ├── latchAff.h ├── min.cpp ├── vo.cpp └── vo2.cpp /CMakeLists.txt: -------------------------------------------------------------------------------- 1 | PROJECT(latch_cuda) 2 | 3 | FIND_PACKAGE(CUDA REQUIRED) 4 | FIND_PACKAGE(OpenCV 3 REQUIRED) 5 | 6 | INCLUDE(FindCUDA) 7 | 8 | SET(CUDALATCHSRCS 9 | latch.cu 10 | bitMatcher.cu 11 | ) 12 | 13 | SET(CUDALATCHAFFSRCS 14 | latchAff.cu 15 | bitMatcher.cu 16 | ) 17 | 18 | LIST(APPEND CUDA_NVCC_FLAGS "-D_MWAITXINTRIN_H_INCLUDED -lineinfo -Xptxas -v -Xcompiler -fopenmp -use_fast_math -O3 -gencode arch=compute_30,code=sm_30 -gencode arch=compute_60,code=sm_60 --default-stream per-thread") 19 | message("CUDA_NVCC_FLAGS: ${CUDA_NVCC_FLAGS}") 20 | 21 | SET(CUDA_PROPAGATE_HOST_FLAGS OFF) 22 | 23 | CUDA_ADD_LIBRARY(latch_cuda ${CUDALATCHSRCS} STATIC) 24 | CUDA_ADD_LIBRARY(latch_aff_cuda ${CUDALATCHAFFSRCS} STATIC) 25 | CUDA_ADD_EXECUTABLE(latch_min_test min.cpp) 26 | CUDA_ADD_EXECUTABLE(latch_vo vo.cpp) 27 | CUDA_ADD_EXECUTABLE(latch_vo2 vo2.cpp gpuFacade.cpp) 28 | CUDA_ADD_EXECUTABLE(latch_affTest affTest.cpp) 29 | 30 | INCLUDE_DIRECTORIES(${CUDA_INCLUDE_DIRS}) 31 | INCLUDE_DIRECTORIES(${OpenCV_INCLUDE_DIRS}) 32 | 33 | TARGET_LINK_LIBRARIES(latch_cuda ${OpenCV_LIBS}) 34 | TARGET_LINK_LIBRARIES(latch_aff_cuda ${OpenCV_LIBS}) 35 | 36 | TARGET_LINK_LIBRARIES(latch_min_test latch_cuda) 37 | TARGET_LINK_LIBRARIES(latch_vo latch_cuda) 38 | TARGET_LINK_LIBRARIES(latch_vo2 latch_cuda) 39 | TARGET_LINK_LIBRARIES(latch_affTest latch_aff_cuda) 40 | 41 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | all: drone.mp4 vo 2 | 3 | affine: latchAff.o bitMatcher.o 4 | g++ -std=c++11 `pkg-config --cflags opencv` affTest.cpp 
latchAff.o bitMatcher.o -I/usr/local/cuda-7.5/include/ -L/usr/local/cuda/lib64 -lcuda -lcudart -L/usr/local/lib -lopencv_stitching -lopencv_superres -lopencv_videostab -lopencv_aruco -lopencv_bgsegm -lopencv_bioinspired -lopencv_ccalib -lopencv_dnn -lopencv_dpm -lopencv_fuzzy -lopencv_line_descriptor -lopencv_optflow -lopencv_plot -lopencv_reg -lopencv_saliency -lopencv_stereo -lopencv_structured_light -lopencv_rgbd -lopencv_surface_matching -lopencv_tracking -lopencv_datasets -lopencv_text -lopencv_face -lopencv_xfeatures2d -lopencv_shape -lopencv_video -lopencv_ximgproc -lopencv_calib3d -lopencv_features2d -lopencv_flann -lopencv_xobjdetect -lopencv_objdetect -lopencv_ml -lopencv_xphoto -lopencv_highgui -lopencv_videoio -lopencv_imgcodecs -lopencv_photo -lopencv_imgproc -lopencv_core -o affTest 5 | 6 | vo: latch.o bitMatcher.o #gpuFacade.o #fast.o 7 | g++ -std=c++11 `pkg-config --cflags opencv` vo.cpp latch.o bitMatcher.o -I/usr/local/cuda-7.5/include/ -L/usr/local/cuda/lib64 -lcuda -lcudart -L/usr/local/lib -lopencv_stitching -lopencv_superres -lopencv_videostab -lopencv_aruco -lopencv_bgsegm -lopencv_bioinspired -lopencv_ccalib -lopencv_dnn -lopencv_dpm -lopencv_fuzzy -lopencv_line_descriptor -lopencv_optflow -lopencv_plot -lopencv_reg -lopencv_saliency -lopencv_stereo -lopencv_structured_light -lopencv_rgbd -lopencv_surface_matching -lopencv_tracking -lopencv_datasets -lopencv_text -lopencv_face -lopencv_xfeatures2d -lopencv_shape -lopencv_video -lopencv_ximgproc -lopencv_calib3d -lopencv_features2d -lopencv_flann -lopencv_xobjdetect -lopencv_objdetect -lopencv_ml -lopencv_xphoto -lopencv_highgui -lopencv_videoio -lopencv_imgcodecs -lopencv_photo -lopencv_imgproc -lopencv_core -o vo 8 | 9 | vo2: latch.o bitMatcher.o gpuFacade.o #fast.o 10 | g++ `pkg-config --cflags opencv` vo2.cpp latch.o bitMatcher.o -I/usr/local/cuda-7.5/include/ -L/usr/local/cuda/lib64 -lcuda -lcudart -L/usr/local/lib -lopencv_stitching -lopencv_superres -lopencv_videostab -lopencv_aruco 
-lopencv_bgsegm -lopencv_bioinspired -lopencv_ccalib -lopencv_dnn -lopencv_dpm -lopencv_fuzzy -lopencv_line_descriptor -lopencv_optflow -lopencv_plot -lopencv_reg -lopencv_saliency -lopencv_stereo -lopencv_structured_light -lopencv_rgbd -lopencv_surface_matching -lopencv_tracking -lopencv_datasets -lopencv_text -lopencv_face -lopencv_xfeatures2d -lopencv_shape -lopencv_video -lopencv_ximgproc -lopencv_calib3d -lopencv_features2d -lopencv_flann -lopencv_xobjdetect -lopencv_objdetect -lopencv_ml -lopencv_xphoto -lopencv_highgui -lopencv_videoio -lopencv_imgcodecs -lopencv_photo -lopencv_imgproc -lopencv_core -o vo2 11 | 12 | demo2: drone.mp4 vo2 13 | ./vo2 drone.mp4 620 | perl ./driveGnuPlotStreams.pl 12 4 200 4200 200 200 0 0 0 0 0 0 0 0 950x200+960+30 950x200+960+780 950x200+960+280 950x200+960+530 'pitch' 'yaw' 'roll' 'polar translation angle' 'azimuthal translation angle' 'z' 'keypoints' 'matches' '100 * threshold' 'cpu [ms]' 'gpu [ms]' 'defect' 0 0 0 1 1 1 2 2 2 3 3 1 14 | 15 | demo2_no_gnuplot: drone.mp4 vo2 16 | ./vo2 drone.mp4 620 17 | 18 | demo: drone.mp4 vo 19 | ./vo drone.mp4 620 | perl ./driveGnuPlotStreams.pl 12 4 200 4200 200 200 0 0 0 0 0 0 0 0 950x200+960+30 950x200+960+780 950x200+960+280 950x200+960+530 'pitch' 'yaw' 'roll' 'polar translation angle' 'azimuthal translation angle' 'z' 'keypoints' 'matches' '100 * threshold' 'cpu [ms]' 'gpu [ms]' 'defect' 0 0 0 1 1 1 2 2 2 3 3 1 20 | #620 21 | #4400 22 | demo_no_gnuplot: drone.mp4 vo 23 | ./vo drone.mp4 620 24 | 25 | drone.mp4: 26 | youtube-dl -f 137 https://www.youtube.com/watch?v=wneCezU_VQ4 27 | mv Raw\ FPV\ Training\ Session\ -\ Dirt\ Bike\ Visit\ in\ Park-wneCezU_VQ4.mp4 drone.mp4 28 | 29 | gpuFacade.o: latch.o bitMatcher.o 30 | g++ `pkg-config --cflags opencv` -c gpuFacade.cpp latch.o bitMatcher.o -I/usr/local/cuda-7.5/include/ -L/usr/local/cuda/lib64 -lcuda -lcudart -L/usr/local/lib -lopencv_stitching -lopencv_superres -lopencv_videostab -lopencv_aruco -lopencv_bgsegm -lopencv_bioinspired 
-lopencv_ccalib -lopencv_dnn -lopencv_dpm -lopencv_fuzzy -lopencv_line_descriptor -lopencv_optflow -lopencv_plot -lopencv_reg -lopencv_saliency -lopencv_stereo -lopencv_structured_light -lopencv_rgbd -lopencv_surface_matching -lopencv_tracking -lopencv_datasets -lopencv_text -lopencv_face -lopencv_xfeatures2d -lopencv_shape -lopencv_video -lopencv_ximgproc -lopencv_calib3d -lopencv_features2d -lopencv_flann -lopencv_xobjdetect -lopencv_objdetect -lopencv_ml -lopencv_xphoto -lopencv_highgui -lopencv_videoio -lopencv_imgcodecs -lopencv_photo -lopencv_imgproc -lopencv_core 31 | 32 | 33 | #fast.o: 34 | # nvcc -c -lineinfo -O3 -o fast.o fast.cu -gencode arch=compute_52,code=sm_52 -I/home/chris/cub-1.5.2/ 35 | 36 | latch.o: 37 | nvcc -c -lineinfo -Xptxas -v -use_fast_math -O3 -o latch.o latch.cu -gencode arch=compute_52,code=sm_52 38 | 39 | latchAff.o: 40 | nvcc -c -lineinfo -Xptxas -v -use_fast_math -O3 -o latchAff.o latchAff.cu -gencode arch=compute_52,code=sm_52 41 | 42 | 43 | min: latch.o bitMatcher.o 44 | g++ `pkg-config --cflags opencv` min.cpp latch.o bitMatcher.o -I/usr/local/cuda-7.5/include/ -L/usr/local/cuda/lib64 -lcuda -lcudart -L/usr/local/lib -lopencv_stitching -lopencv_superres -lopencv_videostab -lopencv_aruco -lopencv_bgsegm -lopencv_bioinspired -lopencv_ccalib -lopencv_dnn -lopencv_dpm -lopencv_fuzzy -lopencv_line_descriptor -lopencv_optflow -lopencv_plot -lopencv_reg -lopencv_saliency -lopencv_stereo -lopencv_structured_light -lopencv_rgbd -lopencv_surface_matching -lopencv_tracking -lopencv_datasets -lopencv_text -lopencv_face -lopencv_xfeatures2d -lopencv_shape -lopencv_video -lopencv_ximgproc -lopencv_calib3d -lopencv_features2d -lopencv_flann -lopencv_xobjdetect -lopencv_objdetect -lopencv_ml -lopencv_xphoto -lopencv_highgui -lopencv_videoio -lopencv_imgcodecs -lopencv_photo -lopencv_imgproc -lopencv_core -o min 45 | ./min ob/1.png ob/2.png 46 | 47 | bitMatcher.o: 48 | nvcc -c -lineinfo -Xptxas -v -use_fast_math -O3 -o bitMatcher.o bitMatcher.cu 
-gencode arch=compute_52,code=sm_52 49 | 50 | clean: 51 | rm vo; rm latch.o; rm bitMatcher.o; rm gpuFacade.o; rm latchAff.o; rm vo2; rm affTest; 52 | 53 | run: vo 54 | ./vo 55 | 56 | #plot: vo 57 | # ./vo $(video) $(skip) $(w) $(h) | feedgnuplot --stream 0.01 --lines --nopoints --legend 0 pitch --legend 1 yaw --legend 2 roll --xlen 200 58 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | Major updates are coming in the immediate future. Please watch this space. 2 | 3 | # CUDA implementation of the LATCH descriptor & brute-force matcher 4 | 5 | This is a high-performance GPU implementation of the [LATCH descriptor](http://www.openu.ac.il/home/hassner/projects/LATCH/) invented by [Gil Levi](https://gilscvblog.com/2015/11/07/performance-evaluation-of-binary-descriptor-introducing-the-latch-descriptor/) and [Tal Hassner](http://www.openu.ac.il/home/hassner/). Please cite: "LATCH: Learned Arrangements of Three Patch Codes", IEEE Winter Conference on Applications of Computer Vision (WACV), Lake Placid, NY, USA, March, 2016. 6 | 7 | You should probably be looking at the [OpenMVG branch](https://github.com/mdaiter/openMVG), which includes this code. 8 | 9 | [![cudaLATCH demo video](http://img.youtube.com/vi/zmfLZY7T6Qg/0.jpg)](http://www.youtube.com/watch?v=zmfLZY7T6Qg "cudaLATCH demo video") 10 | 11 | On a GTX 970M I see 10^6 descriptor extractions per second (1 to 1.2 microseconds per descriptor), and 3*10^9 comparisons per second. A GTX 760 sees 70% of this speed. An NVIDIA graphics card with CUDA compute capability >= 3.0 is required. 12 | 13 | Look at min.cpp for a minimal introduction. Compile it with "make min -j7". Run it as "./min 1.png 2.png". (Note: min.cpp is currently broken. Take a look at vo.cpp or the OpenMVG class instead.) 14 | 15 | vo.cpp has a better example of how you can hide 100% of the processing time of the GPU.
The quickest way to see it in action is to install "youtube-dl" and then run "make demo -j7". Or you could just watch [this video](https://www.youtube.com/watch?v=zmfLZY7T6Qg). I see a cumulative 43 ms of CPU overhead for GPU processing of 4250 frames of 1080p video. 16 | 17 | Note that currently each descriptor is 2048 bits, but the last 1536 bits are 0 (that is, only the first 512 bits carry information). I was originally planning on building larger variants: true 1024-bit and 2048-bit LATCH descriptors. You can relatively easily adjust this down to 1024 bits by changing defines, but refactoring is necessary for 512 bits. 18 | 19 | Current features: 20 | - hardware interpolation for affine-invariant descriptors at virtually no performance overhead 21 | - customizable importance masking for patch triplet comparisons at no performance overhead 22 | - asynchronous GPU operation 23 | - fast cross-checking (symmetry test) with event-driven multi-stream matching kernel 24 | 25 | Approximate order of planned features: 26 | - multichannel support ( http://arxiv.org/abs/1603.04408 ) 27 | - extractor kernel granularity optimization (possibly increased extractor speed) 28 | - documentation 29 | - 512-bit matcher (increased matcher speed) 30 | - API improvements (currently a mess) 31 | - CUDA implementation of adaptive grid FAST detector 32 | - offline parameter optimization with PyGMO 33 | - integration into OpenCV 34 | 35 | Multi-GPU support is not currently planned. Please contact me if you have a use case that requires it. 36 | 37 | This work is released under a Creative Commons Attribution-ShareAlike license. If you use this code in academic work, please cite me by name ([Christopher Parker](https://github.com/csp256/)) and link to [this repository](https://github.com/csp256/cudaLATCH/). 
38 | 39 | Please email me if you have any questions: csparker.work@gmail.com 40 | -------------------------------------------------------------------------------- /affTest.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include "cuda.h" 6 | #include "cuda_runtime.h" 7 | #include "opencv2/opencv.hpp" 8 | using namespace std; 9 | using namespace cv; 10 | #include "latchAff.h" 11 | #include "bitMatcher.h" 12 | //#include "gpuFacade.hpp" 13 | 14 | #define cudaCalloc(A, B) \ 15 | do { \ 16 | cudaError_t __cudaCalloc_err = cudaMalloc(A, B); \ 17 | if (__cudaCalloc_err == cudaSuccess) cudaMemset(*A, 0, B); \ 18 | } while (0) 19 | 20 | #define checkError(ans) { gpuAssert((ans), __FILE__, __LINE__); } 21 | inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=true) { 22 | if (code != cudaSuccess) { 23 | fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line); 24 | if (abort) exit(code); 25 | } 26 | } 27 | 28 | #define checkLaunchError() \ 29 | do { \ 30 | /* Check synchronous errors, i.e. pre-launch */ \ 31 | cudaError_t err = cudaGetLastError(); \ 32 | if (cudaSuccess != err) { \ 33 | fprintf (stderr, "Cuda error in file '%s' in line %i : %s.\n",\ 34 | __FILE__, __LINE__, cudaGetErrorString(err) ); \ 35 | exit(EXIT_FAILURE); \ 36 | } \ 37 | /* Check asynchronous errors, i.e. kernel failed (ULF) */ \ 38 | err = cudaThreadSynchronize(); \ 39 | if (cudaSuccess != err) { \ 40 | fprintf (stderr, "Cuda error in file '%s' in line %i : %s.\n",\ 41 | __FILE__, __LINE__, cudaGetErrorString( err) ); \ 42 | exit(EXIT_FAILURE); \ 43 | } \ 44 | } while (0) 45 | 46 | 47 | // Sometimes the recovered pose is 180 degrees off...? I thought cheirality test would handle that, but apparently not always. 
48 | double dist2(Mat a, Mat b) { 49 | double s = 0.0; 50 | for (int i=0; i<3; i++) { 51 | const double t = a.at(i) - b.at(i); 52 | s += t*t; 53 | } 54 | return s; 55 | } 56 | 57 | // In general a suffix of 1 means previous frame, and 2 means current frame. 58 | // However, we start processing the next frame while the GPU is working on current... 59 | // So at a certain point frame 1 shifts down to 0, 2 shifts down to 1, and the new 2 is loaded. 60 | int main( int argc, char** argv ) { 61 | // gpuFacade gpu; 62 | // gpu.set_values(3,4); 63 | // cerr << "!! " << gpu.area() << endl; 64 | 65 | // This must be an integer multiple of 512. 66 | // Specifically, half-multiples of the number of SM's for your GPU are sensible. 67 | // I have 10 streaming multiprocessors, so I chose 15*512 = 7680. 68 | const int maxKP = 512 * 15; 69 | const bool showMatches = true; 70 | // Shows every Nth processed frame's matches. 71 | const int showMatchesInterval = 10; 72 | const bool showVideo = true; 73 | // Shows every Nth processed frame. 74 | const int showVideoInterval = 1; 75 | int WIDTH, HEIGHT, totalMatches, totalInliers = 0; 76 | const int matchThreshold = 12; 77 | // Discard this many frames for each one processed. Change with +/- keys while running. 
78 | int skipFrames = 0; 79 | // Threshold for FAST detector 80 | int threshold = 20; 81 | int targetKP = 3000; 82 | int tolerance = 200; 83 | int maxLoops = 100;//4200; 84 | const bool gnuplot = true; 85 | double defect = 0.0; 86 | int extractions = 0; 87 | 88 | VideoCapture cap; 89 | if (argc == 1) { 90 | cap = VideoCapture(0); 91 | WIDTH = cap.get(CAP_PROP_FRAME_WIDTH); 92 | HEIGHT = cap.get(CAP_PROP_FRAME_HEIGHT); 93 | } 94 | if (argc == 2 || argc == 3) { 95 | cap = VideoCapture(argv[1]); 96 | WIDTH = cap.get(CAP_PROP_FRAME_WIDTH); 97 | HEIGHT = cap.get(CAP_PROP_FRAME_HEIGHT); 98 | if (argc == 3) { 99 | for (int i=0; i> img1; 119 | cap >> img2; 120 | img1 = imread("/home/chris/cv/data/affine/graffiti/img1.ppm", -1); 121 | cv::cvtColor(img1, img1g, CV_BGR2GRAY); 122 | cv::cvtColor(img2, img2g, CV_BGR2GRAY); 123 | if (showMatches) { 124 | namedWindow("Matches", WINDOW_NORMAL); 125 | } 126 | waitKey(1); 127 | if (showVideo) { 128 | namedWindow("Video", WINDOW_NORMAL); 129 | } 130 | waitKey(1); 131 | resizeWindow("Matches", 1920/2, 540/2); 132 | resizeWindow("Video", 960, 540); 133 | moveWindow("Matches", 0, 540+55); 134 | moveWindow("Video", 0, 0); 135 | waitKey(1); 136 | 137 | cudaEvent_t start, stop; 138 | cudaEventCreate(&start); 139 | cudaEventCreate(&stop); 140 | 141 | vector keypoints0, keypoints1, keypoints2; 142 | vector goodMatches; 143 | vector p1, p2; // Point correspondences for recovering pose. 144 | int numKP0, numKP1, numKP2; // The actual number of keypoints we are dealing with: just keypoints#.size(), but capped at maxKP. 
145 | int key = -1; 146 | clock_t timer, timer2; 147 | float time; 148 | 149 | // Sizes for device and host pointers 150 | size_t sizeK = maxKP * sizeof(float) * 5; // K for keypoints 151 | size_t sizeI = WIDTH * HEIGHT * sizeof(unsigned char); // I for Image 152 | size_t sizeD = maxKP * (2048 / 32) * sizeof(unsigned int); // D for Descriptor 153 | size_t sizeM = maxKP * sizeof(int); // M for Matches 154 | size_t sizeMask = 64 * sizeof(float); 155 | 156 | // Host pointers 157 | float *h_K1, *h_K2; 158 | cudaMallocHost((void **) &h_K1, sizeK); 159 | cudaMallocHost((void **) &h_K2, sizeK); 160 | // For reasons opaque to me, allocating both (but not either) h_M1 or h_M2 161 | // with cudaMallocHost segfaults, apparently after graceful exit? So neither of them are pinned. 162 | int h_M1[maxKP]; 163 | int h_M2[maxKP]; 164 | float h_mask[64]; 165 | for (int i=0; i<64; i++) { h_mask[i] = 1.0f; } 166 | 167 | // Device pointers 168 | unsigned char *d_I; 169 | unsigned int *d_D1, *d_D2, *uIntSwapPointer; 170 | int *d_M1, *d_M2; 171 | float *d_K, *d_mask; 172 | cudaCalloc((void **) &d_K, sizeK); 173 | cudaCalloc((void **) &d_D1, sizeD); 174 | cudaCalloc((void **) &d_D2, sizeD); 175 | cudaCalloc((void **) &d_M1, sizeM); 176 | cudaCalloc((void **) &d_M2, sizeM); 177 | cudaCalloc((void **) &d_mask, sizeM); 178 | 179 | // The patch triplet locations for LATCH fits in texture memory cache. 180 | cudaArray* patchTriplets; 181 | initPatchTriplets(patchTriplets); 182 | size_t pitch; 183 | initImage(&d_I, WIDTH, HEIGHT, &pitch); 184 | initMask(&d_mask, h_mask); 185 | 186 | // Events allow asynchronous, nonblocking launch of subsequent kernels after a given event has happened, 187 | // such as completion of a different kernel on a different stream. 188 | cudaEvent_t latchFinished; 189 | cudaEventCreate(&latchFinished); 190 | // You should create a new stream for each bitMatcher kernel you want to launch at once. 
191 | cudaStream_t streanumKP1, streanumKP2; 192 | cudaStreamCreate(&streanumKP1); 193 | cudaStreamCreate(&streanumKP2); 194 | 195 | 196 | FAST(img1g, keypoints1, threshold); 197 | // extractions += keypoints1.size(); 198 | // latchAff( img1g, d_I, pitch, h_K1, d_D1, &numKP1, maxKP, d_K, &keypoints1, d_mask, latchFinished, outMat1 ); 199 | 200 | Ptr mserExtractor = MSER::create(); 201 | 202 | vector > mserContours; 203 | vector mserKeypoint; 204 | vector mserBbox; 205 | mserExtractor->detect(img1g, mserContours, mserBbox); 206 | 207 | outMat1 = img1.clone(); 208 | outMat2 = img1.clone(); 209 | resize(outMat2, outMat2, Size(64,64)); 210 | 211 | // cerr << outMat1.depth() << " (()) " << outMat1.channels() << " (()) " << outMat1.type() << endl; 212 | // for (int i=0; i v : mserContours){ 225 | // for (cv::Point p : v){ 226 | // outMat1.at(p.y, p.x*3+0) = 255; 227 | // outMat1.at(p.y, p.x*3+1) = 255; 228 | // outMat1.at(p.y, p.x*3+2) = 255; 229 | // } 230 | // } 231 | 232 | // ms(box, regions, Mat()); 233 | // for (int i = 0; i < regions.size(); i++) 234 | // { 235 | // ellipse(box, fitEllipse(regions[i]), Scalar(255)); 236 | // } 237 | 238 | // FAST(img2g, keypoints2, threshold); // This call to fast is concurrent with above execution. 239 | // extractions += keypoints2.size(); 240 | // latchAff( img2g, d_I, pitch, h_K2, d_D2, &numKP2, maxKP, d_K, &keypoints2, d_mask, latchFinished, outMat2 ); 241 | // bitMatcher( d_D1, d_D2, numKP1, numKP2, maxKP, d_M1, matchThreshold, streanumKP1, latchFinished ); 242 | // bitMatcher( d_D2, d_D1, numKP2, numKP1, maxKP, d_M2, matchThreshold, streanumKP2, latchFinished ); 243 | // timer = clock(); 244 | // getMatches(maxKP, h_M1, d_M1); 245 | // getMatches(maxKP, h_M2, d_M2); 246 | // for (int i=0; i= 0 && h_M1[i] < numKP2 && h_M2[h_M1[i]] == i) { 248 | // goodMatches.push_back( DMatch(i, h_M1[i], 0)); // For drawing. 249 | // p1.push_back(keypoints1[i].pt); // For recovering pose. 
250 | // p2.push_back(keypoints2[h_M1[i]].pt); 251 | // } 252 | // } 253 | // 254 | // drawMatches( img1, keypoints1, img2, keypoints2, 255 | // goodMatches, imgMatches, Scalar::all(-1), Scalar::all(-1), 256 | // vector(), DrawMatchesFlags::NOT_DRAW_SINGLE_POINTS ); 257 | imshow( "Video", outMat1 ); 258 | imshow( "Matches", outMat2 ); 259 | waitKey(0); 260 | return 0; 261 | 262 | /* img1.copyTo(img0); 263 | img2.copyTo(img1); 264 | cap.read(img2); 265 | cvtColor(img2, img2g, CV_BGR2GRAY); 266 | 267 | keypoints0 = keypoints1; 268 | keypoints1 = keypoints2; 269 | 270 | uIntSwapPointer = d_D1; 271 | d_D1 = d_D2; 272 | d_D2 = uIntSwapPointer; 273 | 274 | numKP0 = numKP1; 275 | numKP1 = numKP2; 276 | 277 | FAST(img2g, keypoints2, threshold); 278 | int loopIteration = 0; 279 | for (; loopIteration < maxLoops || maxLoops == -1; loopIteration++) { // Main Loop. 280 | { // GPU code for descriptors and matching. 281 | cudaEventRecord(start, 0); 282 | extractions += keypoints2.size(); 283 | latch( img2g, d_I, pitch, h_K2, d_D2, &numKP2, maxKP, d_K, &keypoints2, d_mask, latchFinished); 284 | bitMatcher( d_D1, d_D2, numKP1, numKP2, maxKP, d_M1, matchThreshold, streanumKP1, latchFinished ); 285 | bitMatcher( d_D2, d_D1, numKP2, numKP1, maxKP, d_M2, matchThreshold, streanumKP2, latchFinished ); 286 | cudaEventRecord(stop, 0); 287 | } 288 | timer = clock(); 289 | { // Put as much CPU code here as possible. 290 | { // Display matches and/or video to user. 291 | bool needToDraw = false; 292 | if (showMatches && loopIteration % showMatchesInterval == 0) { // Draw matches. 
293 | drawMatches( img0, keypoints0, img1, keypoints1, 294 | goodMatches, imgMatches, Scalar::all(-1), Scalar::all(-1), 295 | vector(), DrawMatchesFlags::NOT_DRAW_SINGLE_POINTS ); 296 | imshow( "Matches", imgMatches ); 297 | needToDraw = true; 298 | } 299 | if (showVideo && loopIteration % showVideoInterval == 0) { 300 | imshow("Video", img1); 301 | needToDraw = true; 302 | } 303 | if (needToDraw) { 304 | key = waitKey(1); 305 | } 306 | } 307 | { // Handle user input. 308 | switch (key) { 309 | case (-1): 310 | break; 311 | case (1048689): // q 312 | case (113): // also q 313 | return 0; 314 | break; 315 | case (1048695): // w 316 | waitKey(0); 317 | break; 318 | case (1114027): // + 319 | skipFrames++; 320 | cerr << "For each processed frame we are now skipping " << skipFrames << endl; 321 | break; 322 | case (1114029): // - 323 | skipFrames = max(1, --skipFrames); 324 | cerr << "For each processed frame we are now skipping " << skipFrames << endl; 325 | break; 326 | default: 327 | cerr << "Currently pressed key is: " << key << endl; 328 | break; 329 | } 330 | key = -1; 331 | } 332 | { // Iterate the "logical" loop (get ready to process next frame) 333 | img1.copyTo(img0); 334 | img2.copyTo(img1); 335 | for (int i=0; i(i) = rodOld.at(i)*(1.0-alpha) + rod.at(i)*alpha; 379 | // } 380 | rodOld.copyTo(rod); 381 | } 382 | } else { 383 | defect += 1.0; 384 | cout << "11:" << 1.0 << endl; 385 | cerr << "Too few matches! Not going to try to recover pose this frame." << endl; 386 | } 387 | // To prevent the graphs from desynchronizing from each other, we have to output this unconditionally. 388 | if (gnuplot) { 389 | for (int i=0; i<3; i++) { 390 | cout << i << ":" << rod.at(i) * 57.2957795 << endl; // Output Rodrigues vector, rescaled to degrees 391 | } 392 | // T is unit norm (scale-less) and often erroneously sign-reversed. 393 | // if (T.at(2) < 0) T = -T; // Assume dominate motion is forward... 
(this is not an elegant assumption) 394 | // double theta = atan2(T.at(0), T.at(2)); 395 | // double phi = atan2(T.at(1), T.at(2)); 396 | // cout << 3 << ":" << theta * 57.2957795 << endl; // Plot polar translation angle 397 | // cout << 4 << ":" << phi * 57.2957795 << endl; // Plot azimuthal translation angle 398 | } 399 | } 400 | { // run FAST detector on the CPU for next frame (get ready for next loop iteration). 401 | FAST(img2g, keypoints2, threshold); 402 | // Apply proportional control to threshold to drive it towards targetKP. 403 | int control = (int)(((float)keypoints2.size() - (float)targetKP) / (float)tolerance); 404 | threshold += min(100, control); 405 | if (threshold < 1) threshold = 1; 406 | } 407 | } 408 | if (gnuplot) { 409 | time = (1000*(clock() - timer)/(double)CLOCKS_PER_SEC); 410 | cout << "9:" << time << endl; // Plot CPU time. 411 | timer = clock(); 412 | } 413 | { // Get new GPU results 414 | p1.clear(); 415 | p2.clear(); 416 | goodMatches.clear(); 417 | getMatches(maxKP, h_M1, d_M1); 418 | getMatches(maxKP, h_M2, d_M2); 419 | cudaEventElapsedTime(&time, start, stop); 420 | if (gnuplot) { 421 | cout << "10:" << (time+(1000*(clock() - timer)/(double)CLOCKS_PER_SEC)) << endl; // Plot total asynchronous GPU time. 422 | } 423 | for (int i=0; i= 0 && h_M1[i] < numKP1 && h_M2[h_M1[i]] == i) { 425 | goodMatches.push_back( DMatch(i, h_M1[i], 0)); // For drawing matches. 426 | p1.push_back(keypoints0[i].pt); // For recovering pose. 427 | p2.push_back(keypoints1[h_M1[i]].pt); 428 | } 429 | } 430 | } 431 | if (gnuplot) { 432 | cout << "6:" << numKP1 << endl; // Plot number of keypoints. 433 | cout << "7:" << p1.size() << endl; // Plot number of matches. 434 | cout << "8:" << 100*threshold << endl; // Plot current threshold for FAST. 
435 | } 436 | totalMatches += p1.size(); 437 | } 438 | cudaFreeArray(patchTriplets); 439 | cudaFree(d_K); 440 | cudaFree(d_D1); 441 | cudaFree(d_D2); 442 | cudaFree(d_M1); 443 | cudaFree(d_M2); 444 | cudaFreeHost(h_K1); 445 | cudaFreeHost(h_K2); 446 | cerr << "Total matches: " << totalMatches << endl; 447 | cerr << "Total inliers: " << totalInliers << endl; 448 | cerr << "Defect: " << defect << endl; 449 | cerr << "Loop iteration: " << loopIteration << endl; 450 | cerr << "Extractions: " << extractions << endl; 451 | 452 | return 0; 453 | */ 454 | } 455 | -------------------------------------------------------------------------------- /bitMatcher.cu: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | 5 | #include 6 | #include 7 | #include 8 | 9 | #include "bitMatcher.h" 10 | 11 | using namespace std; 12 | 13 | // Number of values each thread in a warp gets per vector. 14 | #define chunksPerVector (2) 15 | #define vectorsPerWarp (16) 16 | // Vectors per group is used to increase ILP. it must divide vectorsPerWarp. This implementation is specialized for vectorsPerGroup==8. 17 | #define vectorsPerGroup (8) 18 | #define warpsPerBlock (32) 19 | // The total number of int32's needed to store a vector. We should drop this down to 16 for an optimized implementation for canonical LATCH. 20 | #define vectorDimension (64) 21 | #define _warpSize (32) 22 | #define cacheSize (128) 23 | #define halfCacheSize (64) 24 | 25 | #define checkError(ans) { gpuAssert((ans), __FILE__, __LINE__); } 26 | inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=true) { 27 | if (code != cudaSuccess) { 28 | fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line); 29 | if (abort) exit(code); 30 | } 31 | } 32 | 33 | #define checkLaunchError() \ 34 | do { \ 35 | /* Check synchronous errors, i.e. 
pre-launch */ \ 36 | cudaError_t err = cudaGetLastError(); \ 37 | if (cudaSuccess != err) { \ 38 | fprintf (stderr, "Cuda error in file '%s' in line %i : %s.\n",\ 39 | __FILE__, __LINE__, cudaGetErrorString(err) ); \ 40 | exit(EXIT_FAILURE); \ 41 | } \ 42 | /* Check asynchronous errors, i.e. kernel failed (ULF) */ \ 43 | err = cudaThreadSynchronize(); \ 44 | if (cudaSuccess != err) { \ 45 | fprintf (stderr, "Cuda error in file '%s' in line %i : %s.\n",\ 46 | __FILE__, __LINE__, cudaGetErrorString( err) ); \ 47 | exit(EXIT_FAILURE); \ 48 | } \ 49 | } while (0) 50 | 51 | 52 | // Launch as 32x32 53 | __global__ void __launch_bounds__(1024, 1) 54 | bitMatch( const unsigned int *g_query, 55 | const unsigned int *g_training, 56 | int *g_match, 57 | const int trainingSize, 58 | const int threshold) { 59 | // Load query vectors 60 | register unsigned int query[vectorsPerWarp][chunksPerVector]; 61 | 62 | volatile __shared__ unsigned int s_training[cacheSize][chunksPerVector][_warpSize]; // We have enough room to load extra query vectors from shared memory... 63 | { 64 | int offset = threadIdx.x; 65 | offset += blockIdx.x * vectorDimension * warpsPerBlock * vectorsPerWarp; 66 | offset += vectorDimension * threadIdx.y * vectorsPerWarp; 67 | 68 | #pragma unroll 69 | for (int i=0; i>= 16; 165 | 166 | if (dist0 < secondBest) { 167 | if (dist0 < best) { 168 | secondBest = best; 169 | best = dist0; 170 | bestIndex = t + trainingOffset; 171 | } else { 172 | secondBest = dist0; 173 | } 174 | } 175 | } 176 | { // Write new training vectors prefetched into registers to shared memory cache at end of every even loop. 177 | if (st % 2 == 1) { 178 | if (threadIdx.y < chunksPerVector) { // We can load identically for each chunk, but not so for write to shared memory differently. 
179 | s_training[(half^1)*halfCacheSize + (st-1) ][threadIdx.y ][threadIdx.x] = prefetch; 180 | } else if (threadIdx.y < 2*chunksPerVector) { 181 | s_training[(half^1)*halfCacheSize + (st-1) + 1][threadIdx.y - chunksPerVector][threadIdx.x] = prefetch; 182 | } 183 | } 184 | } 185 | } 186 | __syncthreads(); 187 | } 188 | } 189 | if (threadIdx.x < vectorsPerWarp) { 190 | if (secondBest - best < threshold) { 191 | bestIndex = -1; // Failed hard threshold test. 192 | } 193 | // We can trash what is in shared memory now... it is called s_training, but here it is just scratch space. 194 | // I guess I should use a union for this? 195 | const register int packing = _warpSize / vectorsPerWarp; // NOTE: This assumes vectorsPerWarp divides _warpSize. If it doesnt, you'll have to handle this differently. 196 | s_training[0][threadIdx.y / packing][(threadIdx.y%packing)*vectorsPerWarp + threadIdx.x] = bestIndex; 197 | } 198 | __threadfence_block(); 199 | if (threadIdx.y < vectorsPerWarp) { 200 | g_match[blockIdx.x*vectorsPerWarp*warpsPerBlock + threadIdx.y*warpsPerBlock + threadIdx.x] = s_training[0][threadIdx.y][threadIdx.x]; 201 | } 202 | } 203 | 204 | 205 | void bitMatcher(unsigned int* d_Q, unsigned int* d_T, int keypointsQ, int keypointsT, int maxKP, int* d_M, const int threshold, cudaStream_t stream, cudaEvent_t event) { 206 | dim3 threadsPerBlock(_warpSize, warpsPerBlock); 207 | const int neededBlocks = (keypointsQ + (vectorsPerWarp * warpsPerBlock) - 1) / (vectorsPerWarp * warpsPerBlock); // This is the "round up integer division" pattern 208 | dim3 blocksPerGrid(neededBlocks, 1, 1); 209 | 210 | cudaStreamWaitEvent(stream, event, 0); 211 | // checkLaunchError(); 212 | bitMatch<<>>(d_Q, d_T, d_M, keypointsT, threshold); 213 | // checkLaunchError(); 214 | } 215 | 216 | void getMatches(int maxKP, int* h_M, int* d_M) { 217 | size_t sizeM = maxKP * sizeof(int); 218 | checkLaunchError(); 219 | cudaMemcpyAsync(h_M, d_M, sizeM, cudaMemcpyDeviceToHost); 220 | checkLaunchError(); 
221 | }; 222 | -------------------------------------------------------------------------------- /bitMatcher.h: -------------------------------------------------------------------------------- 1 | void bitMatcher(unsigned int*, unsigned int*, int, int, int, int*, int, cudaStream_t, cudaEvent_t); 2 | void getMatches(int, int*, int*); 3 | -------------------------------------------------------------------------------- /driveGnuPlotStreams.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/perl -w 2 | use strict; 3 | 4 | sub usage { 5 | print "Usage: $0 \n"; 6 | print < ...for window1 15 | ... 16 | ...for windowN 17 | 18 | Window0_YRangeMin Window0_YRangeMax Min and Max values for window0 19 | ...for window1 20 | ... 21 | ...for windowN 22 | 23 | Window0_geometry WIDTHxHEIGHT+XOFF+YOFF (in pixels) 24 | ...for window1 25 | ... 26 | ...for windowN 27 | 28 | Stream0_Title Title used for stream 0 29 | ...for stream1 30 | ... 31 | ...for streamM 32 | 33 | WindowNumber0 Window into which stream 0 is plotted 34 | ... for stream1 35 | ... 36 | ... 
for streamM 37 | OEF 38 | exit(1); 39 | } 40 | 
# Reports a missing or invalid command-line parameter, prints the usage text
# and terminates the script with exit status 1.
#   $cause - short description of the offending parameter
sub WrongParameter {
    my ($cause) = @_;
    print "Expected parameter missing ($cause)...\n\n";
    usage();
    exit(1);    # usage() already exits; kept as a safety net
}


# Entry point.
#
# Consumes the configuration from its argument list, in this order:
#   number of streams, number of windows,
#   one sample size per window,
#   one (min y, max y) pair per window,
#   one geometry string per window,
#   one title per stream,
#   one window index per stream.
# Any remaining arguments become @ARGV, so `<>` reads the data from those files
# or from STDIN. Every data line has the form "<streamIdx>:<value>".
#
# One gnuplot process is started per window; a window is re-plotted once every
# $plotInterval samples with the buffered samples of all of its streams.
sub main {
    my $numberOfStreams = shift or WrongParameter("number of streams");
    my $numberOfWindows = shift or WrongParameter("number of windows");
    print "Will display $numberOfStreams Streams in $numberOfWindows windows...\n";

    my @sampleSizes;
    for my $i (0 .. $numberOfWindows - 1) {
        my $samples = shift or WrongParameter("sample size $i");
        push @sampleSizes, $samples;
        print "Window $i will use a window of $samples samples\n";
    }

    # The y ranges are parsed (they are part of the argument list) but only used
    # by the disabled "set yrange" command below.
    my @ranges;
    for my $i (0 .. $numberOfWindows - 1) {
        my $miny = shift;
        WrongParameter("min y of window $i") if !defined($miny);
        my $maxy = shift;
        WrongParameter("max y of window $i") if !defined($maxy);
        push @ranges, [ $miny, $maxy ];
        print "Window $i will use a range of [$miny, $maxy]\n";
    }

    my @geometries;
    for my $i (0 .. $numberOfWindows - 1) {
        my $geometry = shift or WrongParameter("geometry $i");
        push @geometries, $geometry;
        print "Window $i will use a geometry of '$geometry'\n";
    }

    my @titles;
    for my $i (0 .. $numberOfStreams - 1) {
        # Checked with defined() so that a title such as "0" is accepted.
        my $title = shift;
        WrongParameter("title $i") if !defined($title);
        push @titles, $title;
        print "Stream $i will use a title of '$title'\n";
    }

    my @streams;    # streams in a window
    my @windows;    # window of a stream
    for my $i (0 .. $numberOfStreams - 1) {
        my $window = shift;
        WrongParameter("window of stream $i") if !defined $window;
        # An index outside the configured windows would later select a gnuplot
        # pipe that does not exist.
        WrongParameter("window of stream $i: '$window' is not in 0.." . ($numberOfWindows - 1))
            if $window !~ /^\d+$/ || $window >= $numberOfWindows;
        push @{$streams[$window]}, $i;
        $windows[$i] = $window;
        print "Stream $i will be plotted in window $window\n";
    }

    # check that every window has a stream
    for my $windowIdx (0 .. $numberOfWindows - 1) {
        if (!defined($streams[$windowIdx]) or @{$streams[$windowIdx]} == 0) {
            warn "Warning: Window $windowIdx has no streams!\n";
        }
    }

    # One sample buffer and one x counter per stream.
    my @buffers   = map { [] } 1 .. $numberOfStreams;
    my @xcounters = (0) x $numberOfStreams;

    my @gnuplots;
    for my $i (0 .. $numberOfWindows - 1) {
        my $geometry = $geometries[$i];
        # "or" (not "||"): with "||" the die was attached to the command string
        # and could never fire.
        open(my $pipe, '|-', "gnuplot -geometry $geometry")
            or die "Can't initialize gnuplot number " . ($i + 1) . ": $!\n";
        select((select($pipe), $| = 1)[0]);    # unbuffered writes to gnuplot
        push @gnuplots, $pipe;
        print $pipe "set xtics\n";
        print $pipe "set ytics\n";
        # print $pipe "set yrange [".($ranges[$i]->[0]).":".($ranges[$i]->[1])."]\n";
        print $pipe "set style data lines\n";
        print $pipe "set grid\n";
        print $pipe "set term x11\n";
    }

    my $plotInterval = 15;

    # replace @ARGV with remaining args for <> below
    @ARGV = @_;
    while (my $line = <>) {
        chomp $line;
        my @parts = split /:/, $line;
        # Ignore anything that is not "<streamIdx>:<value>" for a configured stream.
        unless (@parts >= 2 && $parts[0] =~ /^\d+$/ && $parts[0] < $numberOfStreams) {
            warn "Ignoring malformed input line $.: '$line'\n";
            next;
        }
        my $streamIdx = $parts[0];
        my $windowIdx = $windows[$streamIdx];
        my $buf       = $buffers[$streamIdx];
        my $pip       = $gnuplots[$windowIdx];

        # data buffering (up to stream sample size)
        my $xcounter = $xcounters[$streamIdx]++;
        push @{$buf}, "$xcounter $parts[1]";

        # Largest x counter among the streams of this window. The stream that
        # was just incremented is one of them, so $advanced is set on every
        # sample.
        my $max_xcounter = $xcounter;
        my $advanced     = 0;
        for my $stream (@{$streams[$windowIdx]}) {
            if ($xcounters[$stream] > $max_xcounter) {
                $max_xcounter = $xcounters[$stream];
                $advanced     = 1;
            }
        }

        my $skipPlot = ($max_xcounter % $plotInterval != $plotInterval - 1) && $advanced;
        if (!$skipPlot) {
            print $pip "set xrange [" . ($max_xcounter - $sampleSizes[$windowIdx]) . ":" . ($max_xcounter) . "]\n";
            my @plots;
            for my $stream (@{$streams[$windowIdx]}) {
                if (@{$buffers[$stream]} > 0) {
                    push @plots, "\"-\" title '$titles[$stream]'";
                }
            }
            print $pip "plot ", join(", ", @plots), "\n";
            for my $stream (@{$streams[$windowIdx]}) {
                if (@{$buffers[$stream]} > 0) {
                    for my $elem (reverse @{$buffers[$stream]}) {
                        print $pip "$elem\n";
                    }
                    print $pip "e\n";
                }
            }
        }

        # Trim on every sample, not only on plotted ones; otherwise the buffer
        # gains $plotInterval entries for each one it drops.
        shift @{$buf} while scalar(@{$buf}) > $sampleSizes[$windowIdx];
    }

    for my $pip (@gnuplots) {
        print $pip "exit;\n";
        close $pip;
    }
}

main(@ARGV);
-------------------------------------------------------------------------------- /gpuFacade.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include "cuda.h" 6 | #include "cuda_runtime.h" 7 | #include "opencv2/opencv.hpp" 8 | using namespace std; 9 | using namespace cv; 10 | #include "latch.h" 11 | #include "bitMatcher.h" 12 | #include "gpuFacade.hpp" 13 | 14 | // images 15 | // keypoints 16 | // descriptors 17 | // matches 18 | 19 | using namespace std; 20 | 21 | #define cudaCalloc(A, B) \ 22 | do { \ 23 | cudaError_t __cudaCalloc_err = cudaMalloc(A, B); \ 24 | if (__cudaCalloc_err == cudaSuccess) cudaMemset(*A, 0, B); \ 25 | } while (0) 26 | 27 | #define checkError(ans) { gpuAssert((ans), __FILE__, __LINE__); } 28 | inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=true) { 29 | if (code != cudaSuccess) { 30 | fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line); 31 | if (abort) exit(code); 32 | } 33 | } 34 | 35 | #define checkLaunchError() \ 36 | do { \ 37 | /* Check synchronous errors, i.e. pre-launch */ \ 38 | cudaError_t err = cudaGetLastError(); \ 39 | if (cudaSuccess != err) { \ 40 | fprintf (stderr, "Cuda error in file '%s' in line %i : %s.\n",\ 41 | __FILE__, __LINE__, cudaGetErrorString(err) ); \ 42 | exit(EXIT_FAILURE); \ 43 | } \ 44 | /* Check asynchronous errors, i.e. 
kernel failed (ULF) */ \ 45 | err = cudaThreadSynchronize(); \ 46 | if (cudaSuccess != err) { \ 47 | fprintf (stderr, "Cuda error in file '%s' in line %i : %s.\n",\ 48 | __FILE__, __LINE__, cudaGetErrorString( err) ); \ 49 | exit(EXIT_FAILURE); \ 50 | } \ 51 | } while (0) 52 | 53 | gpuFacade::~gpuFacade() { 54 | // cudaFreeArray(patchTriplets); // This crashes..? 55 | cudaFree(d_K); 56 | cudaFree(d_D1); 57 | cudaFree(d_D2); 58 | cudaFree(d_M1); 59 | cudaFree(d_M2); 60 | cudaFreeHost(h_K1); 61 | cudaFreeHost(h_K2); 62 | cudaDeviceReset(); 63 | } 64 | 65 | gpuFacade::gpuFacade(int maxKeypoints, int input_WIDTH, int input_HEIGHT, int imageSlots) { 66 | maxKP = maxKeypoints; 67 | WIDTH = input_WIDTH; 68 | HEIGHT = input_HEIGHT; 69 | 70 | cudaEventCreate(&start); 71 | cudaEventCreate(&stop); 72 | 73 | // Sizes for device and host pointers 74 | sizeK = maxKP * sizeof(float) * 4; // K for keypoints 75 | sizeI = WIDTH * HEIGHT * sizeof(unsigned char); // I for Image 76 | sizeD = maxKP * (2048 / 32) * sizeof(unsigned int); // D for Descriptor 77 | sizeM = maxKP * sizeof(int); // M for Matches 78 | sizeMask = 64 * sizeof(float); 79 | 80 | // Host pointers 81 | cudaMallocHost((void **) &h_K1, sizeK); 82 | cudaMallocHost((void **) &h_K2, sizeK); 83 | h_M1 = (int*) malloc(sizeM); 84 | h_M2 = (int*) malloc(sizeM); 85 | for (int i=0; i<64; i++) { h_mask[i] = 1.0f; } 86 | 87 | // Device pointers 88 | cudaCalloc((void **) &d_K, sizeK); 89 | cudaCalloc((void **) &d_D1, sizeD); 90 | cudaCalloc((void **) &d_D2, sizeD); 91 | cudaCalloc((void **) &d_M1, sizeM); 92 | cudaCalloc((void **) &d_M2, sizeM); 93 | cudaCalloc((void **) &d_mask, sizeM); 94 | 95 | // The patch triplet locations for LATCH fits in texture memory cache. 
96 | initPatchTriplets(patchTriplets); 97 | initImage(&d_I, WIDTH, HEIGHT, &pitch); 98 | initMask(&d_mask, h_mask); 99 | 100 | // Events allow asynchronous, nonblocking launch of subsequent kernels after a given event has happened, 101 | // such as completion of a different kernel on a different stream. 102 | cudaEventCreate(&latchFinished); 103 | // You should create a new stream for each bitMatcher kernel you want to launch at once. 104 | cudaStreamCreate(&streamKP1); 105 | cudaStreamCreate(&streamKP2); 106 | } 107 | 108 | void gpuFacade::LATCH( 109 | Mat img, 110 | unsigned int* d_descriptor, 111 | int* keypoints, 112 | vector* vectorKP) { 113 | latch( img, d_I, pitch, h_K1, d_descriptor, keypoints, maxKP, d_K, vectorKP, d_mask, latchFinished ); 114 | } 115 | 116 | void gpuFacade::match( 117 | unsigned int* d_descriptorQ, 118 | unsigned int* d_descriptorT, 119 | int numKP_Q, 120 | int numKP_T, 121 | int* d_matches, 122 | int threshold, 123 | cudaStream_t stream) { 124 | bitMatcher( d_descriptorQ, d_descriptorT, numKP_Q, numKP_T, maxKP, d_matches, threshold, stream, latchFinished ); 125 | } 126 | 127 | void gpuFacade::getResults(int* h_matches, int* d_matches) { 128 | getMatches(maxKP, h_matches, d_matches); 129 | } 130 | -------------------------------------------------------------------------------- /gpuFacade.hpp: -------------------------------------------------------------------------------- 1 | class gpuFacade { 2 | public: 3 | int WIDTH, HEIGHT; 4 | void set_values (int,int); 5 | int area(); 6 | 7 | int maxKP; 8 | cudaEvent_t start, stop; 9 | vector keypoints0, keypoints1, keypoints2; 10 | vector goodMatches; 11 | vector p1, p2; // Point correspondences for recovering pose. 12 | int numKP0, numKP1, numKP2; // The actual number of keypoints we are dealing with: just keypoints#.size(), but capped at maxKP. 
13 | size_t sizeK; // K for keypoints 14 | size_t sizeI; // I for Image 15 | size_t sizeD; // D for Descriptor 16 | size_t sizeM; // M for Matches 17 | size_t sizeMask; 18 | float *h_K1, *h_K2; 19 | // For reasons opaque to me, allocating both (but not either) h_M1 or h_M2 20 | // with cudaMallocHost segfaults, apparently after graceful exit? So neither of them are pinned. 21 | int* h_M1; 22 | int* h_M2; 23 | float h_mask[64]; 24 | unsigned char *d_I; 25 | unsigned int *d_D1, *d_D2, *uIntSwapPointer; 26 | int *d_M1, *d_M2; 27 | float *d_K, *d_mask; 28 | cudaArray* patchTriplets; 29 | size_t pitch; 30 | cudaEvent_t latchFinished; 31 | cudaStream_t streamKP1, streamKP2; 32 | 33 | void LATCH(cv::Mat, unsigned int*, int*, std::vector*); 34 | void match( unsigned int*, 35 | unsigned int*, 36 | int, 37 | int, 38 | int*, 39 | int, 40 | cudaStream_t); 41 | void getResults(int* h_matches, int* d_matches); 42 | gpuFacade(int, int, int, int); 43 | ~gpuFacade(); 44 | }; 45 | -------------------------------------------------------------------------------- /latch.cu: -------------------------------------------------------------------------------- 1 | // Uncomment below define to use sum of squared differences instead of sum of absolute differences. 2 | // #define use_SAD // You can keep this commented. 3 | // Uncomment below define to use an importance mask on each patch comparison. 4 | // #define use_mask // You can keep this commented. 5 | 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include "opencv2/opencv.hpp" 11 | using namespace std; 12 | using namespace cv; 13 | 14 | #define _warpSize (32) 15 | #define _warpSizef (32.0f) 16 | #define warpsPerBlock (32) 17 | // Region of interest 18 | #define roiWidth (64) 19 | #define roiHeight (64) 20 | // Minimal amount that avoids shared memory bank conflicts 21 | #define roiWidthPadding (4) 22 | // The numbers loaded into the oracle assume patchSize==8. 
If you really want canonical LATCH, you can set the mask to ignore those pixels. 23 | #define patchSize (8) 24 | #define bitsPerDescriptor (512) 25 | // With further work this should be decreased to 0 for even faster matching. 26 | #define paddingBitsPerDescriptor (1536) 27 | #define bitsPerUInt32 (32) 28 | #define deg2rad (0.0174533f) 29 | #define negDeg2rad (-0.0174533f) 30 | #define inv64 (0.015625f) 31 | #define CHECK_BORDER (0) 32 | 33 | // Used to store the oracle of patch triplets. 34 | texture patchTriplets; 35 | texture image; 36 | 37 | __constant__ int triplets[3*512]; 38 | 39 | #define cudaCalloc(A, B) \ 40 | do { \ 41 | cudaError_t __cudaCalloc_err = cudaMalloc(A, B); \ 42 | if (__cudaCalloc_err == cudaSuccess) cudaMemset(*A, 0, B); \ 43 | } while (0) 44 | 45 | #define checkError(ans) { gpuAssert((ans), __FILE__, __LINE__); } 46 | inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=true) { 47 | if (code != cudaSuccess) { 48 | fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line); 49 | if (abort) exit(code); 50 | } 51 | } 52 | 53 | #define checkLaunchError() \ 54 | do { \ 55 | /* Check synchronous errors, i.e. pre-launch */ \ 56 | cudaError_t err = cudaGetLastError(); \ 57 | if (cudaSuccess != err) { \ 58 | fprintf (stderr, "Cuda error in file '%s' in line %i : %s.\n",\ 59 | __FILE__, __LINE__, cudaGetErrorString(err) ); \ 60 | exit(EXIT_FAILURE); \ 61 | } \ 62 | /* Check asynchronous errors, i.e. 
kernel failed (ULF) */ \ 63 | err = cudaThreadSynchronize(); \ 64 | if (cudaSuccess != err) { \ 65 | fprintf (stderr, "Cuda error in file '%s' in line %i : %s.\n",\ 66 | __FILE__, __LINE__, cudaGetErrorString( err) ); \ 67 | exit(EXIT_FAILURE); \ 68 | } \ 69 | } while (0) 70 | 71 | // Launch as 32x32 72 | __global__ void __launch_bounds__(1024, 2) 73 | latch( const float *g_K, 74 | unsigned int *g_D, 75 | const int imgWidth, 76 | const int imgHeight, 77 | const float *g_mask/*, 78 | const float *g_oriented*/) { 79 | volatile __shared__ int s_kpOffset[2]; 80 | volatile __shared__ float s_mask[64]; 81 | volatile __shared__ float s_stride[4]; 82 | volatile __shared__ float s_roi[roiHeight][roiWidth + roiWidthPadding][1]; // It is faster to operate on floats, even if our image is unsigned char! (we can't guarantee media instructions are available) 83 | volatile __shared__ unsigned int s_out[warpsPerBlock]; 84 | { 85 | register float mask0, mask1; 86 | if (threadIdx.y == 0 && threadIdx.x < 2) { // 2 threads, 2 coordinates 87 | register float k; 88 | k = g_K[blockIdx.x*5 + threadIdx.x]; 89 | if (CHECK_BORDER && (k < _warpSize || (threadIdx.x == 0 && imgWidth-_warpSize < k) || (threadIdx.x == 1 && imgHeight-_warpSize < k))) { 90 | k = -999999; // If too near boundary, make sure the kpOffset will be negative, so everyone gets the signal to bail. 91 | } 92 | s_kpOffset[threadIdx.x] = k + 0.5f; 93 | } 94 | if (threadIdx.y == 1 && threadIdx.x < 4) { 95 | register float l; // Just a temp variable... thread 0 has major axis, 1 has minor axis, 2 has angle. 96 | if (threadIdx.x < 3) { 97 | l = g_K[blockIdx.x*5 + 2 + threadIdx.x]; 98 | } 99 | register float c, s, t; // cos and sin and theta 100 | t = __shfl(l, 2); 101 | __sincosf(negDeg2rad*t, &s, &c); 102 | 103 | const register float a = __shfl(l, 0); 104 | const register float b = __shfl(l, 1); 105 | 106 | const register float p = ((threadIdx.x & 1) == 0) ? 
a : b; 107 | const register float q = (threadIdx.x == 1 || threadIdx.x == 2) ? s : c; 108 | 109 | s_stride[threadIdx.x] = p*q*inv64; 110 | } 111 | if (threadIdx.y == 2) { 112 | mask0 = g_mask[threadIdx.x]; 113 | } 114 | if (threadIdx.y == 3) { 115 | mask1 = g_mask[_warpSize + threadIdx.x]; 116 | } 117 | __threadfence_block(); 118 | __syncthreads(); 119 | const register int x = s_kpOffset[0]; 120 | const register int y = s_kpOffset[1]; 121 | if (x < 0 || y < 0) { 122 | return; // This is the case if our keypoint is within boundary of the edge of the image. 5000 blocks returning in this way takes ~300 microseconds. 123 | } 124 | const register float r = (threadIdx.x < 4) ? s_stride[threadIdx.x] : 0; 125 | 126 | const register float r11 = __shfl(r, 0); 127 | const register float r12 = __shfl(r, 1); 128 | const register float r21 = -__shfl(r, 2); 129 | const register float r22 = __shfl(r, 3); 130 | // const register float c = s_stride[0]; 131 | // const register float s = s_stride[1]; 132 | 133 | // 64 by 64 region of interest means four 32 by 32 loads. 
134 | if (threadIdx.y == 2) { 135 | s_mask[threadIdx.x] = mask0; 136 | } 137 | if (threadIdx.y == 3) { 138 | s_mask[threadIdx.x + _warpSize] = mask1; 139 | } 140 | 141 | // const register float cN = c * -_warpSizef; 142 | // const register float sN = s * -_warpSizef; 143 | // 144 | // const register float cu = c * threadIdx.x; 145 | // const register float su = s * threadIdx.x; 146 | // const register float cv = c * threadIdx.y; 147 | // const register float sv = s * threadIdx.y; 148 | // 149 | // const register float cuN = cu + cN; 150 | // const register float suN = su + sN; 151 | // const register float cvN = cv + cN; 152 | // const register float svN = sv + sN; 153 | 154 | const register float nx = threadIdx.x - _warpSizef; 155 | const register float ny = threadIdx.y - _warpSizef; 156 | 157 | 158 | s_roi[threadIdx.y ][threadIdx.x ][0] = /*(unsigned char)*/ ( 256.0f * tex2D(image, min((float)imgWidth-1.0f, r11*nx + r12*ny + x), min((float)imgHeight-1.0f, r21*nx + r22*ny + y))); 159 | s_roi[threadIdx.y ][threadIdx.x + _warpSize][0] = /*(unsigned char)*/ ( 256.0f * tex2D(image, min((float)imgWidth-1.0f, r11*threadIdx.x + r12*ny + x), min((float)imgHeight-1.0f, r21*threadIdx.x + r22*ny + y))); 160 | s_roi[threadIdx.y + _warpSize][threadIdx.x ][0] = /*(unsigned char)*/ ( 256.0f * tex2D(image, min((float)imgWidth-1.0f, r11*nx + r12*threadIdx.y + x), min((float)imgHeight-1.0f, r21*nx + r22*threadIdx.y + y))); 161 | s_roi[threadIdx.y + _warpSize][threadIdx.x + _warpSize][0] = /*(unsigned char)*/ ( 256.0f * tex2D(image, min((float)imgWidth-1.0f, r11*threadIdx.x + r12*threadIdx.y + x), min((float)imgHeight-1.0f, r21*threadIdx.x + r22*threadIdx.y + y))); 162 | } 163 | register unsigned int out = 0; 164 | const register int wrappedX = threadIdx.x % patchSize; // Offset for patch, interlaced to decrease padding needed for shared memory bank conflict avoidance 165 | const register int wrappedY = 2 * (threadIdx.x / patchSize); // Each thread will use both wrappedY and 
wrappedY+1 166 | __syncthreads(); 167 | __threadfence_block(); 168 | // if (blockIdx.x == 0) { 169 | // g_out[(threadIdx.y )*64 + (threadIdx.x )] = s_roi[threadIdx.y ][threadIdx.x ][0]; 170 | // g_out[(threadIdx.y )*64 + (threadIdx.x + _warpSize)] = s_roi[threadIdx.y ][threadIdx.x + _warpSize][0]; 171 | // g_out[(threadIdx.y + _warpSize)*64 + (threadIdx.x + _warpSize)] = s_roi[threadIdx.y + _warpSize][threadIdx.x + _warpSize][0]; 172 | // g_out[(threadIdx.y + _warpSize)*64 + (threadIdx.x )] = s_roi[threadIdx.y + _warpSize][threadIdx.x ][0]; 173 | // } 174 | // __syncthreads(); 175 | // __threadfence_block(); 176 | 177 | 178 | const register float mask0 = s_mask[threadIdx.x]; 179 | const register float mask1 = s_mask[threadIdx.x + _warpSize]; 180 | 181 | register int nextCoord = tex2D(patchTriplets, threadIdx.x, threadIdx.y); // This access is hardware cached. 182 | // register int offset = threadIdx.y * 3 * 16; 183 | // register int nextAleph = triplets[offset ]; 184 | // register int nextTavek = triplets[offset+1]; 185 | // register int nextBet = triplets[offset+2]; 186 | #pragma unroll 187 | for (register int i=0; i<16; ) { 188 | const register int coord = nextCoord; 189 | if (i!=16-4) nextCoord = tex2D(patchTriplets, 6*(i+4) + threadIdx.x, threadIdx.y); // This access is hardware cached. 
190 | #pragma unroll 191 | for (register int j=0; j<4; j++, i++) { 192 | // offset += 3; 193 | // const register int alephIndexX = nextAleph & 255; 194 | // const register int alephIndexY = nextAleph >> 8; 195 | // nextAleph = triplets[offset]; 196 | // const register int tavekIndexX = nextTavek & 255; 197 | // const register int tavekIndexY = nextTavek >> 8; 198 | // nextTavek = triplets[offset+1]; 199 | // const register int betIndexX = nextBet & 255; 200 | // const register int betIndexY = nextBet >> 8; 201 | // nextBet = triplets[offset+2]; 202 | 203 | const register int alephIndexX = __shfl(coord, 6*j ); 204 | const register int alephIndexY = __shfl(coord, 6*j+1); 205 | const register int tavekIndexX = __shfl(coord, 6*j+2); 206 | const register int tavekIndexY = __shfl(coord, 6*j+3); 207 | const register int betIndexX = __shfl(coord, 6*j+4); 208 | const register int betIndexY = __shfl(coord, 6*j+5); 209 | const register int bitIndex = 16*(threadIdx.y & 1) + i; 210 | const register int outThread = 0; 211 | 212 | // This assumes an 8x8 patch. As there are only 32 threads per warp, each thread will pull two values from each thread. 213 | // The access pattern is interleaved to decrease the amount of shared memory padding necessary to avoid bank conflicts: 214 | // each thread pulls a verticle pair from each patch. 215 | const register int tavek0 = s_roi[tavekIndexY + wrappedY ][tavekIndexX + wrappedX][0]; // Tavek means "between". 216 | const register int tavek1 = s_roi[tavekIndexY + wrappedY+1][tavekIndexX + wrappedX][0]; // It is our root patch. 
217 | const register int aleph0 = s_roi[alephIndexY + wrappedY ][alephIndexX + wrappedX][0]; // Aleph is "A" 218 | const register int aleph1 = s_roi[alephIndexY + wrappedY+1][alephIndexX + wrappedX][0]; // Similarity to aleph is denoted by a bit set to 0 219 | const register int bet0 = s_roi[betIndexY + wrappedY ][betIndexX + wrappedX][0]; // Bet is "B" 220 | const register int bet1 = s_roi[betIndexY + wrappedY+1][betIndexX + wrappedX][0]; // Similarity to bet is denoted by a bit set to 1 221 | 222 | // Now we compute the sum of squared differences between both patch pairs. 223 | // First, differences: 224 | register int alephDiff0 = (tavek0 - aleph0); 225 | register int alephDiff1 = (tavek1 - aleph1); 226 | register int betDiff0 = (tavek0 - bet0); 227 | register int betDiff1 = (tavek1 - bet1); 228 | // Then, squared differences 229 | alephDiff0 *= alephDiff0; 230 | alephDiff1 *= alephDiff1; 231 | betDiff0 *= betDiff0; 232 | betDiff1 *= betDiff1; 233 | 234 | alephDiff0 *= mask0; 235 | alephDiff1 *= mask1; 236 | betDiff0 *= mask0; 237 | betDiff1 *= mask1; 238 | 239 | alephDiff0 += alephDiff1; // Merge both interleaved squared differences, to make upcoming warp reduction faster 240 | betDiff0 += betDiff1; 241 | 242 | alephDiff0 -= betDiff0; // Easiest to just take this difference now, then reduce, then compare to 0. Same as reduce then compare relative to each other. 243 | alephDiff0 += __shfl_xor(alephDiff0, 1); 244 | alephDiff0 += __shfl_xor(alephDiff0, 2); 245 | alephDiff0 += __shfl_xor(alephDiff0, 4); 246 | alephDiff0 += __shfl_xor(alephDiff0, 8); 247 | alephDiff0 += __shfl_xor(alephDiff0, 16); // By xor shfling, every thread has the resulting sum. 248 | 249 | // One thread sets a specific bit high if tavek is closer to bet. 
250 | if (alephDiff0 < 0 && threadIdx.x == outThread) { 251 | out |= (1<>1] = h_data[i] + (h_data[i+1] << 8); 292 | // } 293 | // cudaMemcpyToSymbolAsync(triplets, h_data_packed, sizeof(float)*3*512); 294 | } 295 | 296 | 
/**
 * Allocates the pitched device image buffer and binds it to the `image` texture.
 *
 * @param d_I     out: receives the device pointer of the image buffer
 * @param width   image width in pixels (one unsigned char per pixel)
 * @param height  image height in rows
 * @param pitch   out: receives the row pitch in bytes chosen by cudaMallocPitch
 *
 * The texture is configured with clamped addressing, unnormalized coordinates
 * and linear filtering. Any failing CUDA call aborts through checkError.
 */
void initImage(unsigned char ** d_I, int width, int height, size_t * pitch) {
    // The row width is given in bytes: one unsigned char per pixel.
    // (sizeof(*d_I) would be the size of a pointer and over-allocate every row.)
    checkError(cudaMallocPitch((void**)d_I, pitch, width*sizeof(**d_I), height));

    image.addressMode[0] = cudaAddressModeClamp;
    image.addressMode[1] = cudaAddressModeClamp;
    image.addressMode[2] = cudaAddressModeClamp;
    image.normalized = false;
    image.filterMode = cudaFilterModeLinear;
    size_t tex_ofs;
    checkError(cudaBindTexture2D(&tex_ofs, &image, *d_I, &image.channelDesc, width, height, *pitch));
}

/**
 * Repacks an 8x8 host mask and uploads it to a newly allocated device buffer.
 *
 * The 8x8 row-major mask in h_mask is rewritten IN PLACE so that its even rows
 * (0, 2, 4, 6) come first, followed by its odd rows (1, 3, 5, 7). Because the
 * caller's array is modified, this is 'run once' code: calling it again on the
 * same array permutes the rows a second time.
 *
 * @param d_mask  out: receives a fresh 64-float device allocation; any pointer
 *                previously stored in *d_mask is overwritten, not freed
 * @param h_mask  in/out: 64 floats, repacked as described above
 */
void initMask(float** d_mask, float* h_mask) {
    float packed[64];
    for (int srcRow = 0; srcRow < 8; srcRow++) {
        // Even source rows fill destination rows 0..3, odd ones rows 4..7.
        const int dstRow = (srcRow >> 1) + ((srcRow & 1) ? 4 : 0);
        for (int c = 0; c < 8; c++) {
            packed[c + dstRow*8] = h_mask[c + srcRow*8];
        }
    }
    for (int i = 0; i < 64; i++) {
        h_mask[i] = packed[i];
    }

    const size_t sizeMask = 64 * sizeof(float);
    checkError(cudaMalloc((void **) d_mask, sizeMask));
    checkError(cudaMemcpy(*d_mask, h_mask, sizeMask, cudaMemcpyHostToDevice));
}

// Difference between the pixel at base+offset and its mirror at base-offset.
static inline float latchCentralDifference(const unsigned char* img, const int base, const int offset) {
    return (float) (img[base + offset] - img[base - offset]);
}

/**
 * Estimates the gradient direction at pixel (x, y) of a row-major image with
 * `width` pixels per row.
 *
 * Eight pairs of opposite pixels at (roughly) distance 3 around (x, y) are
 * differenced and each difference is projected onto the x and y axes; the
 * result is atan2f(dy, dx) in radians.
 *
 * @param img    image data, one unsigned char per pixel, rows `width` apart
 * @param width  number of pixels per row
 * @param x      column of the pixel
 * @param y      row of the pixel
 * @return gradient angle in radians, or 0.0f when the 3-pixel neighbourhood
 *         would leave the image on the left, right or top
 *
 * The image height is not known here, so the caller must guarantee that rows
 * y-3 .. y+3 exist (the bottom border cannot be checked).
 */
float computeGradient(const unsigned char* img, const int width, const int x, const int y) {
    // Every sample is read at up to 3 pixels/rows from (x, y).
    if (img == NULL || x < 3 || y < 3 || x + 3 >= width) {
        return 0.0f;
    }

    const double root2 = sqrt(2.0);
    const double root10 = sqrt(10.0);
    const int base = x + y*width;

    float dx = 0.0f;
    float dy = 0.0f;
    float delta;

    delta = latchCentralDifference(img, base, 3*width);
    dy += delta;

    delta = latchCentralDifference(img, base, 3*width + 1);
    dy += delta * 3 / root10;
    dx += delta / root10;

    delta = latchCentralDifference(img, base, 2*width + 2);
    dy += delta / root2;
    dx += delta / root2;

    delta = latchCentralDifference(img, base, 1*width + 3);
    dy += delta / root10;
    dx += delta * 3 / root10;

    delta = latchCentralDifference(img, base, 3);
    dx += delta;

    delta = latchCentralDifference(img, base, -1*width + 3);
    dy -= delta / root10;
    dx += delta * 3 / root10;

    delta = latchCentralDifference(img, base, -2*width + 2);
    dy -= delta / root2;
    dx += delta / root2;

    delta = latchCentralDifference(img, base, -3*width + 1);
    dy -= delta * 3 / root10;
    dx += delta / root10;

    return atan2f(dy, dx);
}

379 | void latch( Mat imgMat, 380 | unsigned char* d_I, 381 | size_t pitch, 382 | float* h_K, 383 | unsigned int* d_D, 384 | int* keypoints, 385 | int maxKP, 386 | float* d_K, 387 | vector* vectorKP, 388 | float* d_mask, 389 | cudaEvent_t latchFinished) { 390 | const unsigned char* h_I = imgMat.data; 391 | const int height = imgMat.rows; 392 | const int width = imgMat.cols; 393 | 394 | // All of these calls are non blocking but serialized. 395 | // cudaMemsetAsync(d_K, -1, maxKP * sizeof(int) * 4); // Negative one is represented by all '1' bits in both int32 and uchar8. 396 | // cudaMemsetAsync(d_D, 0, maxKP * (2048 / 32) * sizeof(unsigned int)); 397 | cudaMemcpy2DAsync(d_I, pitch, h_I, width*sizeof(unsigned char), width*sizeof(unsigned char), height, cudaMemcpyHostToDevice); 398 | 399 | // Only prep up to maxKP for the GPU (as that is the most we have prepared the GPU to handle) 400 | *keypoints = ((*vectorKP).size() < maxKP) ? 
(*vectorKP).size() : maxKP; 401 | for (int i=0; i<*keypoints; i+=1) { 402 | h_K[5*i ] = (*vectorKP)[i].pt.x; 403 | h_K[5*i+1] = (*vectorKP)[i].pt.y; 404 | h_K[5*i+2] = 64.0f; // WIDTH in pixels 405 | h_K[5*i+3] = 64.0f; // HEIGHT in pixels 406 | h_K[5*i+4] = 0.0f; // ANGLE in degrees (if openmvg uses radians, let me know) 407 | } 408 | for (int i=*keypoints; i>>(d_K, d_D, width, height, d_mask); 421 | // checkLaunchError(); 422 | cudaEventRecord(latchFinished); 423 | } 424 | -------------------------------------------------------------------------------- /latch.h: -------------------------------------------------------------------------------- 1 | void latch( Mat, 2 | unsigned char *, 3 | size_t, 4 | float *, 5 | unsigned int *, 6 | int *, 7 | int, 8 | float *, 9 | vector*, 10 | float*, 11 | cudaEvent_t); 12 | 13 | void initPatchTriplets(cudaArray*); 14 | 15 | void initImage( unsigned char**, 16 | int, 17 | int, 18 | size_t * 19 | ); 20 | 21 | void initMask( float **, 22 | float *); 23 | -------------------------------------------------------------------------------- /latchAff.cu: -------------------------------------------------------------------------------- 1 | // Uncomment below define to use sum of squared differences instead of sum of absolute differences. 2 | // #define use_SAD // You can keep this commented. 3 | // Uncomment below define to use an importance mask on each patch comparison. 4 | // #define use_mask // You can keep this commented. 5 | 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include "opencv2/opencv.hpp" 11 | using namespace std; 12 | using namespace cv; 13 | 14 | #define _warpSize (32) 15 | #define _warpSizef (32.0f) 16 | #define warpsPerBlock (32) 17 | // Region of interest 18 | #define roiWidth (64) 19 | #define roiHeight (64) 20 | // Minimal amount that avoids shared memory bank conflicts 21 | #define roiWidthPadding (4) 22 | // The numbers loaded into the oracle assume patchSize==8. 
If you really want canonical LATCH, you can set the mask to ignore those pixels. 23 | #define patchSize (8) 24 | #define bitsPerDescriptor (512) 25 | // With further work this should be decreased to 0 for even faster matching. 26 | #define paddingBitsPerDescriptor (1536) 27 | #define bitsPerUInt32 (32) 28 | #define deg2rad (0.0174533f) 29 | #define negDeg2rad (-0.0174533f) 30 | #define CHECK_BORDER (0) 31 | 32 | // Used to store the oracle of patch triplets. 33 | texture patchTriplets; 34 | texture image; 35 | 36 | __constant__ int triplets[3*512]; 37 | 38 | #define cudaCalloc(A, B) \ 39 | do { \ 40 | cudaError_t __cudaCalloc_err = cudaMalloc(A, B); \ 41 | if (__cudaCalloc_err == cudaSuccess) cudaMemset(*A, 0, B); \ 42 | } while (0) 43 | 44 | #define checkError(ans) { gpuAssert((ans), __FILE__, __LINE__); } 45 | inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=true) { 46 | if (code != cudaSuccess) { 47 | fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line); 48 | if (abort) exit(code); 49 | } 50 | } 51 | 52 | #define checkLaunchError() \ 53 | do { \ 54 | /* Check synchronous errors, i.e. pre-launch */ \ 55 | cudaError_t err = cudaGetLastError(); \ 56 | if (cudaSuccess != err) { \ 57 | fprintf (stderr, "Cuda error in file '%s' in line %i : %s.\n",\ 58 | __FILE__, __LINE__, cudaGetErrorString(err) ); \ 59 | exit(EXIT_FAILURE); \ 60 | } \ 61 | /* Check asynchronous errors, i.e. 
kernel failed (ULF) */ \ 62 | err = cudaThreadSynchronize(); \ 63 | if (cudaSuccess != err) { \ 64 | fprintf (stderr, "Cuda error in file '%s' in line %i : %s.\n",\ 65 | __FILE__, __LINE__, cudaGetErrorString( err) ); \ 66 | exit(EXIT_FAILURE); \ 67 | } \ 68 | } while (0) 69 | 70 | // Launch as 32x32 71 | __global__ void __launch_bounds__(1024, 2) 72 | latch( const float *g_K, 73 | unsigned int *g_D, 74 | const int imgWidth, 75 | const int imgHeight, 76 | const float *g_mask/*, 77 | const float *g_oriented*/, 78 | float *g_out) { 79 | volatile __shared__ int s_kpOffset[2]; 80 | volatile __shared__ float s_mask[64]; 81 | volatile __shared__ float s_stride[4]; 82 | volatile __shared__ float s_roi[roiHeight][roiWidth + roiWidthPadding][1]; // It is faster to operate on floats, even if our image is unsigned char! (we can't guarantee media instructions are available) 83 | volatile __shared__ unsigned int s_out[warpsPerBlock]; 84 | { 85 | register float mask0, mask1; 86 | if (threadIdx.y == 0 && threadIdx.x < 2) { // 2 threads, 2 coordinates 87 | register float k; 88 | k = g_K[blockIdx.x*5 + threadIdx.x]; 89 | if (CHECK_BORDER && (k < _warpSize || (threadIdx.x == 0 && imgWidth-_warpSize < k) || (threadIdx.x == 1 && imgHeight-_warpSize < k))) { 90 | k = -999999; // If too near boundary, make sure the kpOffset will be negative, so everyone gets the signal to bail. 91 | } 92 | s_kpOffset[threadIdx.x] = k + 0.5f; 93 | } 94 | if (threadIdx.y == 1 && threadIdx.x < 4) { 95 | register float l; // Just a temp variable... thread 0 has major axis, 1 has minor axis, 2 has angle. 96 | if (threadIdx.x < 3) { 97 | l = g_K[blockIdx.x*5 + 2 + threadIdx.x]; 98 | } 99 | register float c, s, t; // cos and sin and theta 100 | t = __shfl(l, 2); 101 | __sincosf(negDeg2rad*t, &s, &c); 102 | register float a,b; 103 | a = __shfl(l, 0) * 0.015625f; 104 | b = __shfl(l, 1) * 0.015625f; 105 | 106 | const register float p = ((threadIdx.x & 1) == 0) ? 
a : b; 107 | const register float q = (threadIdx.x == 1 || threadIdx.x == 2) ? s : c; 108 | 109 | s_stride[threadIdx.x] = p*q; 110 | } 111 | if (threadIdx.y == 2) { 112 | mask0 = g_mask[threadIdx.x]; 113 | } 114 | if (threadIdx.y == 3) { 115 | mask1 = g_mask[_warpSize + threadIdx.x]; 116 | } 117 | __threadfence_block(); 118 | __syncthreads(); 119 | const register int x = s_kpOffset[0]; 120 | const register int y = s_kpOffset[1]; 121 | if (x < 0 || y < 0) { 122 | return; // This is the case if our keypoint is within boundary of the edge of the image. 5000 blocks returning in this way takes ~300 microseconds. 123 | } 124 | const register float r = (threadIdx.x < 4) ? s_stride[threadIdx.x] : 0.0f; 125 | 126 | const register float r11 = __shfl(r, 0); 127 | const register float r12 = __shfl(r, 1); 128 | const register float r21 = -__shfl(r, 2); 129 | const register float r22 = __shfl(r, 3); 130 | // const register float c = s_stride[0]; 131 | // const register float s = s_stride[1]; 132 | 133 | // 64 by 64 region of interest means four 32 by 32 loads. 
134 | if (threadIdx.y == 2) { 135 | s_mask[threadIdx.x] = mask0; 136 | } 137 | if (threadIdx.y == 3) { 138 | s_mask[threadIdx.x + _warpSize] = mask1; 139 | } 140 | 141 | // const register float cN = c * -_warpSizef; 142 | // const register float sN = s * -_warpSizef; 143 | // 144 | // const register float cu = c * threadIdx.x; 145 | // const register float su = s * threadIdx.x; 146 | // const register float cv = c * threadIdx.y; 147 | // const register float sv = s * threadIdx.y; 148 | // 149 | // const register float cuN = cu + cN; 150 | // const register float suN = su + sN; 151 | // const register float cvN = cv + cN; 152 | // const register float svN = sv + sN; 153 | 154 | const register float nx = threadIdx.x - _warpSizef; 155 | const register float ny = threadIdx.y - _warpSizef; 156 | 157 | 158 | s_roi[threadIdx.y ][threadIdx.x ][0] = /*(unsigned char)*/ ( 256.0f * tex2D(image, min((float)imgWidth-1.0f, r11*nx + r12*ny + x), min((float)imgHeight-1.0f, r21*nx + r22*ny + y))); 159 | s_roi[threadIdx.y ][threadIdx.x + _warpSize][0] = /*(unsigned char)*/ ( 256.0f * tex2D(image, min((float)imgWidth-1.0f, r11*threadIdx.x + r12*ny + x), min((float)imgHeight-1.0f, r21*threadIdx.x + r22*ny + y))); 160 | s_roi[threadIdx.y + _warpSize][threadIdx.x ][0] = /*(unsigned char)*/ ( 256.0f * tex2D(image, min((float)imgWidth-1.0f, r11*nx + r12*threadIdx.y + x), min((float)imgHeight-1.0f, r21*nx + r22*threadIdx.y + y))); 161 | s_roi[threadIdx.y + _warpSize][threadIdx.x + _warpSize][0] = /*(unsigned char)*/ ( 256.0f * tex2D(image, min((float)imgWidth-1.0f, r11*threadIdx.x + r12*threadIdx.y + x), min((float)imgHeight-1.0f, r21*threadIdx.x + r22*threadIdx.y + y))); 162 | } 163 | register unsigned int out = 0; 164 | const register int wrappedX = threadIdx.x % patchSize; // Offset for patch, interlaced to decrease padding needed for shared memory bank conflict avoidance 165 | const register int wrappedY = 2 * (threadIdx.x / patchSize); // Each thread will use both wrappedY and 
wrappedY+1 166 | __syncthreads(); 167 | __threadfence_block(); 168 | if (blockIdx.x == 0) { 169 | g_out[(threadIdx.y )*64 + (threadIdx.x )] = s_roi[threadIdx.y ][threadIdx.x ][0]; 170 | g_out[(threadIdx.y )*64 + (threadIdx.x + _warpSize)] = s_roi[threadIdx.y ][threadIdx.x + _warpSize][0]; 171 | g_out[(threadIdx.y + _warpSize)*64 + (threadIdx.x + _warpSize)] = s_roi[threadIdx.y + _warpSize][threadIdx.x + _warpSize][0]; 172 | g_out[(threadIdx.y + _warpSize)*64 + (threadIdx.x )] = s_roi[threadIdx.y + _warpSize][threadIdx.x ][0]; 173 | } 174 | __syncthreads(); 175 | __threadfence_block(); 176 | 177 | 178 | const register float mask0 = s_mask[threadIdx.x]; 179 | const register float mask1 = s_mask[threadIdx.x + _warpSize]; 180 | 181 | register int nextCoord = tex2D(patchTriplets, threadIdx.x, threadIdx.y); // This access is hardware cached. 182 | // register int offset = threadIdx.y * 3 * 16; 183 | // register int nextAleph = triplets[offset ]; 184 | // register int nextTavek = triplets[offset+1]; 185 | // register int nextBet = triplets[offset+2]; 186 | #pragma unroll 187 | for (register int i=0; i<16; ) { 188 | const register int coord = nextCoord; 189 | if (i!=16-4) nextCoord = tex2D(patchTriplets, 6*(i+4) + threadIdx.x, threadIdx.y); // This access is hardware cached. 
190 | #pragma unroll 191 | for (register int j=0; j<4; j++, i++) { 192 | // offset += 3; 193 | // const register int alephIndexX = nextAleph & 255; 194 | // const register int alephIndexY = nextAleph >> 8; 195 | // nextAleph = triplets[offset]; 196 | // const register int tavekIndexX = nextTavek & 255; 197 | // const register int tavekIndexY = nextTavek >> 8; 198 | // nextTavek = triplets[offset+1]; 199 | // const register int betIndexX = nextBet & 255; 200 | // const register int betIndexY = nextBet >> 8; 201 | // nextBet = triplets[offset+2]; 202 | 203 | const register int alephIndexX = __shfl(coord, 6*j ); 204 | const register int alephIndexY = __shfl(coord, 6*j+1); 205 | const register int tavekIndexX = __shfl(coord, 6*j+2); 206 | const register int tavekIndexY = __shfl(coord, 6*j+3); 207 | const register int betIndexX = __shfl(coord, 6*j+4); 208 | const register int betIndexY = __shfl(coord, 6*j+5); 209 | const register int bitIndex = 16*(threadIdx.y & 1) + i; 210 | const register int outThread = 0; 211 | 212 | // This assumes an 8x8 patch. As there are only 32 threads per warp, each thread will pull two values from each patch. 213 | // The access pattern is interleaved to decrease the amount of shared memory padding necessary to avoid bank conflicts: 214 | // each thread pulls a vertical pair from each patch. 215 | const register int tavek0 = s_roi[tavekIndexY + wrappedY ][tavekIndexX + wrappedX][0]; // Tavek means "between". 216 | const register int tavek1 = s_roi[tavekIndexY + wrappedY+1][tavekIndexX + wrappedX][0]; // It is our root patch. 
217 | const register int aleph0 = s_roi[alephIndexY + wrappedY ][alephIndexX + wrappedX][0]; // Aleph is "A" 218 | const register int aleph1 = s_roi[alephIndexY + wrappedY+1][alephIndexX + wrappedX][0]; // Similarity to aleph is denoted by a bit set to 0 219 | const register int bet0 = s_roi[betIndexY + wrappedY ][betIndexX + wrappedX][0]; // Bet is "B" 220 | const register int bet1 = s_roi[betIndexY + wrappedY+1][betIndexX + wrappedX][0]; // Similarity to bet is denoted by a bit set to 1 221 | 222 | // Now we compute the sum of squared differences between both patch pairs. 223 | // First, differences: 224 | register int alephDiff0 = (tavek0 - aleph0); 225 | register int alephDiff1 = (tavek1 - aleph1); 226 | register int betDiff0 = (tavek0 - bet0); 227 | register int betDiff1 = (tavek1 - bet1); 228 | // Then, squared differences 229 | alephDiff0 *= alephDiff0; 230 | alephDiff1 *= alephDiff1; 231 | betDiff0 *= betDiff0; 232 | betDiff1 *= betDiff1; 233 | 234 | alephDiff0 *= mask0; 235 | alephDiff1 *= mask1; 236 | betDiff0 *= mask0; 237 | betDiff1 *= mask1; 238 | 239 | alephDiff0 += alephDiff1; // Merge both interleaved squared differences, to make upcoming warp reduction faster 240 | betDiff0 += betDiff1; 241 | 242 | alephDiff0 -= betDiff0; // Easiest to just take this difference now, then reduce, then compare to 0. Same as reduce then compare relative to each other. 243 | alephDiff0 += __shfl_xor(alephDiff0, 1); 244 | alephDiff0 += __shfl_xor(alephDiff0, 2); 245 | alephDiff0 += __shfl_xor(alephDiff0, 4); 246 | alephDiff0 += __shfl_xor(alephDiff0, 8); 247 | alephDiff0 += __shfl_xor(alephDiff0, 16); // By xor shfling, every thread has the resulting sum. 248 | 249 | // One thread sets a specific bit high if tavek is closer to bet. 
250 | if (alephDiff0 < 0 && threadIdx.x == outThread) { 251 | out |= (1<>1] = h_data[i] + (h_data[i+1] << 8); 292 | // } 293 | // cudaMemcpyToSymbolAsync(triplets, h_data_packed, sizeof(float)*3*512); 294 | } 295 | 296 | 297 | void initImage(unsigned char ** d_I, int width, int height, size_t * pitch) { 298 | cudaMallocPitch((void**)d_I, pitch, width*sizeof(*d_I), height); 299 | 300 | image.addressMode[0] = cudaAddressModeClamp; 301 | image.addressMode[1] = cudaAddressModeClamp; 302 | image.addressMode[2] = cudaAddressModeClamp; 303 | image.normalized = false; 304 | image.filterMode = cudaFilterModeLinear; 305 | size_t tex_ofs; 306 | cudaBindTexture2D (&tex_ofs, &image, *d_I, &image.channelDesc, width, height, *pitch); 307 | } 308 | 309 | void initMask(float** d_mask, float* h_mask) { 310 | // This packs even rows together in h_mask, then odd rows. 311 | // It is 'run once' code. 312 | float t[64]; 313 | for (int i=0; i<64; i++) { 314 | t[i] = h_mask[i]; 315 | } 316 | for (int r=0; r<4; r++) { 317 | for (int c=0; c<8; c++) { 318 | h_mask[c + r*8] = t[c + r*16]; 319 | } 320 | } 321 | for (int r=4; r<8; r++) { 322 | for (int c=0; c<8; c++) { 323 | h_mask[c + r*8] = t[c + (r-4)*16 + 8]; 324 | } 325 | } 326 | size_t sizeMask = 64 * sizeof(float); 327 | cudaMalloc((void **) d_mask, sizeMask); 328 | cudaMemcpy(*d_mask, h_mask, sizeMask, cudaMemcpyHostToDevice); 329 | } 330 | 331 | float computeGradient(const unsigned char* img, const int width, const int x, const int y) { 332 | float dx = 0.0f; 333 | float dy = 0.0f; 334 | float delta = 0.0f; 335 | int base = x + y*width; 336 | int offset; 337 | 338 | offset = 3*width; 339 | delta = (img[base + offset] - img[base - offset]); 340 | dy += delta; 341 | 342 | offset = 3*width + 1; 343 | delta = (img[base + offset] - img[base - offset]); 344 | dy += delta * 3 / sqrt(10); 345 | dx += delta / sqrt(10); 346 | 347 | offset = 2*width + 2; 348 | delta = (img[base + offset] - img[base - offset]); 349 | dy += delta / sqrt(2); 350 | 
dx += delta / sqrt(2); 351 | 352 | offset = 1*width + 3; 353 | delta = (img[base + offset] - img[base - offset]); 354 | dy += delta / sqrt(10); 355 | dx += delta * 3 / sqrt(10); 356 | 357 | offset = 3; 358 | delta = (img[base + offset] - img[base - offset]); 359 | dx += delta; 360 | 361 | offset = -1*width + 3; 362 | delta = (img[base + offset] - img[base - offset]); 363 | dy -= delta / sqrt(10); 364 | dx += delta * 3 / sqrt(10); 365 | 366 | offset = -2*width + 2; 367 | delta = (img[base + offset] - img[base - offset]); 368 | dy -= delta / sqrt(2); 369 | dx += delta / sqrt(2); 370 | 371 | offset = -3*width + 1; 372 | delta = (img[base + offset] - img[base - offset]); 373 | dy -= delta * 3 / sqrt(10); 374 | dx += delta / sqrt(10); 375 | 376 | return atan2f(dy, dx); 377 | } 378 | 379 | void latchAff( Mat imgMat, 380 | unsigned char* d_I, 381 | size_t pitch, 382 | float* h_K, 383 | unsigned int* d_D, 384 | int* keypoints, 385 | int maxKP, 386 | float* d_K, 387 | vector* vectorKP, 388 | float* d_mask, 389 | cudaEvent_t latchFinished, 390 | Mat outMat, 391 | RotatedRect rekt) { 392 | const unsigned char* h_I = imgMat.data; 393 | const int height = imgMat.rows; 394 | const int width = imgMat.cols; 395 | 396 | // All of these calls are non blocking but serialized. 397 | // cudaMemsetAsync(d_K, -1, maxKP * sizeof(int) * 4); // Negative one is represented by all '1' bits in both int32 and uchar8. 398 | // cudaMemsetAsync(d_D, 0, maxKP * (2048 / 32) * sizeof(unsigned int)); 399 | cudaMemcpy2DAsync(d_I, pitch, h_I, width*sizeof(unsigned char), width*sizeof(unsigned char), height, cudaMemcpyHostToDevice); 400 | 401 | // Only prep up to maxKP for the GPU (as that is the most we have prepared the GPU to handle) 402 | *keypoints = ((*vectorKP).size() < maxKP) ? 
(*vectorKP).size() : maxKP; 403 | for (int i=0; i<*keypoints; i+=1) { 404 | h_K[5*i ] = (*vectorKP)[i].pt.x; 405 | h_K[5*i+1] = (*vectorKP)[i].pt.y; 406 | h_K[5*i+2] = 1.0f; // (*vectorKP)[i].size); 407 | // h_K[4*i+3] = (*vectorKP)[i].angle; 408 | h_K[5*i+3] = computeGradient(h_I, width, h_K[5*i ], h_K[5*i+1]); 409 | } 410 | for (int i=*keypoints; i>>(d_K, d_D, width, height, d_mask, d_out); 440 | cudaDeviceSynchronize(); 441 | cudaMemcpy(h_out, d_out, sizeOut, cudaMemcpyDeviceToHost); 442 | cudaDeviceSynchronize(); 443 | for (int j=0; j<64; j++) { 444 | for (int i=0; i<64; i++) { 445 | outMat.at(j, 3*i) = h_out[j*64+i]; 446 | outMat.at(j, 3*i+1) = h_out[j*64+i]; 447 | outMat.at(j, 3*i+2) = h_out[j*64+i]; 448 | // cerr << " " << h_out[j*64+i]; 449 | } 450 | } 451 | cerr << endl; 452 | checkLaunchError(); 453 | cudaEventRecord(latchFinished); 454 | } 455 | -------------------------------------------------------------------------------- /latchAff.h: -------------------------------------------------------------------------------- 1 | void latchAff( Mat, 2 | unsigned char *, 3 | size_t, 4 | float *, 5 | unsigned int *, 6 | int *, 7 | int, 8 | float *, 9 | vector*, 10 | float*, 11 | cudaEvent_t, 12 | Mat, 13 | RotatedRect); 14 | 15 | void initPatchTriplets(cudaArray*); 16 | 17 | void initImage( unsigned char**, 18 | int, 19 | int, 20 | size_t * 21 | ); 22 | 23 | void initMask( float **, 24 | float *); 25 | -------------------------------------------------------------------------------- /min.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include "cuda.h" 5 | #include "cuda_runtime.h" 6 | #include "opencv2/opencv.hpp" 7 | using namespace std; 8 | using namespace cv; 9 | #include "latch.h" 10 | #include "bitMatcher.h" 11 | 12 | #define cudaCalloc(A, B) \ 13 | do { \ 14 | cudaError_t __cudaCalloc_err = cudaMalloc(A, B); \ 15 | if (__cudaCalloc_err == cudaSuccess) cudaMemset(*A, 0, B); \ 16 | 
} while (0) 17 | 18 | int main( int argc, char** argv ) { 19 | if (argc != 3) { 20 | cout << "Please pass the names of two images (same size) as arguments: ./min img1.png img2.png" << endl; 21 | return -1; 22 | } 23 | // maxKP is the maximum number of keypoints/features you will be able to use on the GPU. 24 | // This _must_ be an integer multiple of 512. 25 | // Integers which are themselves a multiple of the number of streaming multiprocessors 26 | // on your GPU (or half that number) should work well. 27 | const int maxKP = 512 * 15; 28 | int matchThreshold = 8; // Second best match must be at least matchThreshold from best match. 29 | int fastThreshold = 12; // For keypoint detection. 30 | clock_t t; 31 | 32 | Mat img1, img2, img1g, img2g, imgMatches; 33 | vector keypoints1, keypoints2; 34 | vector matches; 35 | 36 | img1 = imread(argv[1], IMREAD_COLOR); 37 | img2 = imread(argv[2], IMREAD_COLOR); 38 | cv::cvtColor(img1, img1g, CV_BGR2GRAY); 39 | cv::cvtColor(img2, img2g, CV_BGR2GRAY); 40 | const int imgWidth = img1.cols; // Assumes both images are the same size. 41 | const int imgHeight = img1.rows; 42 | 43 | // Sizes for host and device arrays both. 44 | size_t sizeK = maxKP * sizeof(int) * 2; // K for Keypoint 45 | size_t sizeI = imgWidth * imgHeight * sizeof(unsigned char); // I for Image 46 | size_t sizeD = maxKP * (2048 / 32) * sizeof(unsigned int); // D for Descriptor. 32 bits per uint32. 2048 bits per descriptor. 47 | size_t sizeM = maxKP * sizeof(int); // M for Matches 48 | 49 | // Host (CPU) arrays 50 | int *h_K1, *h_K2; 51 | cudaMallocHost((void **) &h_K1, sizeK); // Page locked memory is faster to transfer to-and-from the GPU 52 | cudaMallocHost((void **) &h_K2, sizeK); // (but that isn't really our bottleneck) 53 | int h_M1[maxKP]; 54 | int h_M2[maxKP]; // For reasons unknown to me, if I use cudaMallocHost for h_M2 everything breaks...? Would love to know why. 
55 | int numKP1, numKP2; // Minimum of the vector of keypoints.size() and maxKP (the max number of keypoints the GPU is prepared to handle) 56 | 57 | // Device (GPU) pointers. You can not directly look at device memory (without transferring it back to the host, aka CPU) 58 | unsigned char *d_I; 59 | unsigned int *d_D1, *d_D2; 60 | int *d_K, *d_M1, *d_M2; 61 | float *d_K, *d_mask; 62 | cudaCalloc((void **) &d_K, sizeK); 63 | cudaCalloc((void **) &d_I, sizeI); 64 | cudaCalloc((void **) &d_D1, sizeD); 65 | cudaCalloc((void **) &d_D2, sizeD); 66 | cudaCalloc((void **) &d_M1, sizeM); 67 | cudaCalloc((void **) &d_M2, sizeM); 68 | cudaCalloc((void **) &d_mask, sizeM); 69 | 70 | // The patch triplet locations for LATCH fit in texture memory cache. 71 | cudaArray* triplets; 72 | initPatchTriplets(triplets); 73 | 74 | size_t pitch; 75 | initImage(&d_I, imgWidth, imgHeight, &pitch); 76 | initMask(&d_mask, h_mask); 77 | 78 | // Events allow asynchronous, nonblocking launch of subsequent kernels after a given event has happened. 79 | cudaEvent_t latchFinishedEvent; 80 | cudaEventCreate(&latchFinishedEvent); 81 | // You should create a new stream for each bitMatcher kernel you want to launch at once. 82 | cudaStream_t stream1, stream2; 83 | cudaStreamCreate(&stream1); 84 | cudaStreamCreate(&stream2); 85 | 86 | // Normal OpenCV CPU code. 87 | FAST(img1g, keypoints1, fastThreshold); 88 | FAST(img2g, keypoints2, fastThreshold); // If we were clever, we would put this after the first LATCH call, so both the CPU and GPU would be working at the same time. 89 | 90 | t = clock(); // Begin timing kernel launches. 91 | // LATCH runs on the default stream and will block until it is finished. 
92 | latch( img1g, d_I, pitch, h_K1, d_D1, &numKP1, maxKP, d_K, &keypoints1, d_mask, latchFinishedEvent); 93 | latch( img2g, d_I, pitch, h_K2, d_D2, &numKP2, maxKP, d_K, &keypoints2, d_mask, latchFinishedEvent); 94 | 95 | // latch( img1g, h_K1, d_D1, &numKP1, maxKP, d_K, d_I, &keypoints1, imgWidth, imgHeight, latchFinishedEvent ); // The latchFinishedEvent will be overridden by the next LATCH launch. (this one will be ignored) 96 | // latch( img2g, h_K2, d_D2, &numKP2, maxKP, d_K, d_I, &keypoints2, imgWidth, imgHeight, latchFinishedEvent ); // This call will only begin after the above has completed. (but is still non blocking) 97 | bitMatcher( d_D1, d_D2, numKP1, numKP2, maxKP, d_M1, matchThreshold, stream1, latchFinishedEvent ); // Each concurrent bitMatcher launch should get its own d_M# pointer and its own stream# 98 | bitMatcher( d_D2, d_D1, numKP2, numKP1, maxKP, d_M2, matchThreshold, stream2, latchFinishedEvent ); // Both bitMatcher launches will start in parallel when the most recent call to LATCH completes. 99 | cout << "Launching kernels took " << 1000*(clock() - t)/(float)CLOCKS_PER_SEC << " milliseconds." << endl; 100 | 101 | // Put as much CPU code as possible here. 102 | // The CPU can continue to do useful work while the GPU is thinking. 103 | // If you put no code here, the CPU will stall until the GPU is done. 104 | 105 | t = clock(); // Begin timing wasted CPU time. 106 | getMatches(maxKP, h_M1, d_M1); 107 | getMatches(maxKP, h_M2, d_M2); 108 | cout << "Gathering results took " << 1000*(clock() - t)/(float)CLOCKS_PER_SEC << " milliseconds." << endl; 109 | for (int i=0; i= 0 && h_M1[i] < numKP2 && h_M2[h_M1[i]] == i) { 111 | matches.push_back( DMatch(i, h_M1[i], 0)); 112 | } 113 | } 114 | cout << "Between " << keypoints1.size() << " and " << keypoints2.size() << " keypoints found " << matches.size() << " matches." 
<< endl; 115 | 116 | drawMatches( img1, keypoints1, img2, keypoints2, 117 | matches, imgMatches, Scalar::all(-1), Scalar::all(-1), 118 | vector(), DrawMatchesFlags::NOT_DRAW_SINGLE_POINTS ); 119 | resize(imgMatches, imgMatches, Size(1280, 480)); 120 | imshow( "Matches", imgMatches ); 121 | waitKey(0); 122 | 123 | cudaFreeArray(triplets); 124 | cudaFree(d_K); 125 | cudaFree(d_I); 126 | cudaFree(d_D1); 127 | cudaFree(d_D2); 128 | cudaFree(d_M1); 129 | cudaFree(d_M2); 130 | cudaFreeHost(h_K1); 131 | cudaFreeHost(h_K2); 132 | return 0; 133 | } 134 | -------------------------------------------------------------------------------- /vo.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include "cuda.h" 6 | #include "cuda_runtime.h" 7 | #include "opencv2/opencv.hpp" 8 | using namespace std; 9 | using namespace cv; 10 | #include "latch.h" 11 | #include "bitMatcher.h" 12 | // #include "gpuFacade.hpp" 13 | 14 | #define cudaCalloc(A, B) \ 15 | do { \ 16 | cudaError_t __cudaCalloc_err = cudaMalloc(A, B); \ 17 | if (__cudaCalloc_err == cudaSuccess) cudaMemset(*A, 0, B); \ 18 | } while (0) 19 | 20 | #define checkError(ans) { gpuAssert((ans), __FILE__, __LINE__); } 21 | inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=true) { 22 | if (code != cudaSuccess) { 23 | fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line); 24 | if (abort) exit(code); 25 | } 26 | } 27 | 28 | #define checkLaunchError() \ 29 | do { \ 30 | /* Check synchronous errors, i.e. pre-launch */ \ 31 | cudaError_t err = cudaGetLastError(); \ 32 | if (cudaSuccess != err) { \ 33 | fprintf (stderr, "Cuda error in file '%s' in line %i : %s.\n",\ 34 | __FILE__, __LINE__, cudaGetErrorString(err) ); \ 35 | exit(EXIT_FAILURE); \ 36 | } \ 37 | /* Check asynchronous errors, i.e. 
kernel failed (ULF) */ \ 38 | err = cudaThreadSynchronize(); \ 39 | if (cudaSuccess != err) { \ 40 | fprintf (stderr, "Cuda error in file '%s' in line %i : %s.\n",\ 41 | __FILE__, __LINE__, cudaGetErrorString( err) ); \ 42 | exit(EXIT_FAILURE); \ 43 | } \ 44 | } while (0) 45 | 46 | 47 | // Sometimes the recovered pose is 180 degrees off...? I thought cheirality test would handle that, but apparently not always. 48 | double dist2(Mat a, Mat b) { 49 | double s = 0.0; 50 | for (int i=0; i<3; i++) { 51 | const double t = a.at(i) - b.at(i); 52 | s += t*t; 53 | } 54 | return s; 55 | } 56 | 57 | // In general a suffix of 1 means previous frame, and 2 means current frame. 58 | // However, we start processing the next frame while the GPU is working on current... 59 | // So at a certain point frame 1 shifts down to 0, 2 shifts down to 1, and the new 2 is loaded. 60 | int main( int argc, char** argv ) { 61 | // gpuFacade gpu; 62 | // gpu.set_values(3,4); 63 | // cerr << "!! " << gpu.area() << endl; 64 | 65 | // This must be an integer multiple of 512. 66 | // Specifically, half-multiples of the number of SM's for your GPU are sensible. 67 | // I have 10 streaming multiprocessors, so I chose 15*512 = 7680. 68 | const int maxKP = 512 * 15; 69 | const bool showMatches = true; 70 | // Shows every Nth processed frame's matches. 71 | const int showMatchesInterval = 10; 72 | const bool showVideo = true; 73 | // Shows every Nth processed frame. 74 | const int showVideoInterval = 1; 75 | int WIDTH, HEIGHT, totalMatches, totalInliers = 0; 76 | const int matchThreshold = 12; 77 | // Discard this many frames for each one processed. Change with +/- keys while running. 
78 | int skipFrames = 0; 79 | // Threshold for FAST detector 80 | int threshold = 20; 81 | int targetKP = 3000; 82 | int tolerance = 200; 83 | int maxLoops = 100;//4200; 84 | const bool gnuplot = true; 85 | double defect = 0.0; 86 | int extractions = 0; 87 | 88 | VideoCapture cap; 89 | if (argc == 1) { 90 | cap = VideoCapture(0); 91 | WIDTH = cap.get(CAP_PROP_FRAME_WIDTH); 92 | HEIGHT = cap.get(CAP_PROP_FRAME_HEIGHT); 93 | } 94 | if (argc == 2 || argc == 3) { 95 | cap = VideoCapture(argv[1]); 96 | WIDTH = cap.get(CAP_PROP_FRAME_WIDTH); 97 | HEIGHT = cap.get(CAP_PROP_FRAME_HEIGHT); 98 | if (argc == 3) { 99 | for (int i=0; i> img1; 119 | cap >> img2; 120 | cv::cvtColor(img1, img1g, CV_BGR2GRAY); 121 | cv::cvtColor(img2, img2g, CV_BGR2GRAY); 122 | if (showMatches) { 123 | namedWindow("Matches", WINDOW_NORMAL); 124 | } 125 | waitKey(1); 126 | if (showVideo) { 127 | namedWindow("Video", WINDOW_NORMAL); 128 | } 129 | waitKey(1); 130 | resizeWindow("Matches", 1920/2, 540/2); 131 | resizeWindow("Video", 960, 540); 132 | moveWindow("Matches", 0, 540+55); 133 | moveWindow("Video", 0, 0); 134 | waitKey(1); 135 | 136 | cudaEvent_t start, stop; 137 | cudaEventCreate(&start); 138 | cudaEventCreate(&stop); 139 | 140 | vector keypoints0, keypoints1, keypoints2; 141 | vector goodMatches; 142 | vector p1, p2; // Point correspondences for recovering pose. 143 | int numKP0, numKP1, numKP2; // The actual number of keypoints we are dealing with: just keypoints#.size(), but capped at maxKP. 
144 | int key = -1; 145 | clock_t timer, timer2; 146 | float time; 147 | 148 | // Sizes for device and host pointers 149 | size_t sizeK = maxKP * sizeof(float) * 5; // K for keypoints 150 | size_t sizeI = WIDTH * HEIGHT * sizeof(unsigned char); // I for Image 151 | size_t sizeD = maxKP * (2048 / 32) * sizeof(unsigned int); // D for Descriptor 152 | size_t sizeM = maxKP * sizeof(int); // M for Matches 153 | size_t sizeMask = 64 * sizeof(float); 154 | 155 | // Host pointers 156 | float *h_K1, *h_K2; 157 | cudaMallocHost((void **) &h_K1, sizeK); 158 | cudaMallocHost((void **) &h_K2, sizeK); 159 | // For reasons opaque to me, allocating both (but not either) h_M1 or h_M2 160 | // with cudaMallocHost segfaults, apparently after graceful exit? So neither of them are pinned. 161 | int h_M1[maxKP]; 162 | int h_M2[maxKP]; 163 | float h_mask[64]; 164 | for (int i=0; i<64; i++) { h_mask[i] = 1.0f; } 165 | 166 | // Device pointers 167 | unsigned char *d_I; 168 | unsigned int *d_D1, *d_D2, *uIntSwapPointer; 169 | int *d_M1, *d_M2; 170 | float *d_K, *d_mask; 171 | cudaCalloc((void **) &d_K, sizeK); 172 | cudaCalloc((void **) &d_D1, sizeD); 173 | cudaCalloc((void **) &d_D2, sizeD); 174 | cudaCalloc((void **) &d_M1, sizeM); 175 | cudaCalloc((void **) &d_M2, sizeM); 176 | cudaCalloc((void **) &d_mask, sizeM); 177 | 178 | // The patch triplet locations for LATCH fits in texture memory cache. 179 | cudaArray* patchTriplets; 180 | initPatchTriplets(patchTriplets); 181 | size_t pitch; 182 | initImage(&d_I, WIDTH, HEIGHT, &pitch); 183 | initMask(&d_mask, h_mask); 184 | 185 | // Events allow asynchronous, nonblocking launch of subsequent kernels after a given event has happened, 186 | // such as completion of a different kernel on a different stream. 187 | cudaEvent_t latchFinished; 188 | cudaEventCreate(&latchFinished); 189 | // You should create a new stream for each bitMatcher kernel you want to launch at once. 
190 | cudaStream_t streanumKP1, streanumKP2; 191 | cudaStreamCreate(&streanumKP1); 192 | cudaStreamCreate(&streanumKP2); 193 | 194 | FAST(img1g, keypoints1, threshold); 195 | extractions += keypoints1.size(); 196 | latch( img1g, d_I, pitch, h_K1, d_D1, &numKP1, maxKP, d_K, &keypoints1, d_mask, latchFinished ); 197 | FAST(img2g, keypoints2, threshold); // This call to fast is concurrent with above execution. 198 | extractions += keypoints2.size(); 199 | latch( img2g, d_I, pitch, h_K2, d_D2, &numKP2, maxKP, d_K, &keypoints2, d_mask, latchFinished ); 200 | bitMatcher( d_D1, d_D2, numKP1, numKP2, maxKP, d_M1, matchThreshold, streanumKP1, latchFinished ); 201 | bitMatcher( d_D2, d_D1, numKP2, numKP1, maxKP, d_M2, matchThreshold, streanumKP2, latchFinished ); 202 | timer = clock(); 203 | getMatches(maxKP, h_M1, d_M1); 204 | getMatches(maxKP, h_M2, d_M2); 205 | for (int i=0; i= 0 && h_M1[i] < numKP2 && h_M2[h_M1[i]] == i) { 207 | goodMatches.push_back( DMatch(i, h_M1[i], 0)); // For drawing. 208 | p1.push_back(keypoints1[i].pt); // For recovering pose. 209 | p2.push_back(keypoints2[h_M1[i]].pt); 210 | } 211 | } 212 | 213 | img1.copyTo(img0); 214 | img2.copyTo(img1); 215 | cap.read(img2); 216 | cvtColor(img2, img2g, CV_BGR2GRAY); 217 | 218 | keypoints0 = keypoints1; 219 | keypoints1 = keypoints2; 220 | 221 | uIntSwapPointer = d_D1; 222 | d_D1 = d_D2; 223 | d_D2 = uIntSwapPointer; 224 | 225 | numKP0 = numKP1; 226 | numKP1 = numKP2; 227 | 228 | FAST(img2g, keypoints2, threshold); 229 | int loopIteration = 0; 230 | for (; loopIteration < maxLoops || maxLoops == -1; loopIteration++) { // Main Loop. 231 | { // GPU code for descriptors and matching. 
232 | cudaEventRecord(start, 0); 233 | extractions += keypoints2.size(); 234 | latch( img2g, d_I, pitch, h_K2, d_D2, &numKP2, maxKP, d_K, &keypoints2, d_mask, latchFinished); 235 | bitMatcher( d_D1, d_D2, numKP1, numKP2, maxKP, d_M1, matchThreshold, streanumKP1, latchFinished ); 236 | bitMatcher( d_D2, d_D1, numKP2, numKP1, maxKP, d_M2, matchThreshold, streanumKP2, latchFinished ); 237 | cudaEventRecord(stop, 0); 238 | } 239 | timer = clock(); 240 | { // Put as much CPU code here as possible. 241 | { // Display matches and/or video to user. 242 | bool needToDraw = false; 243 | if (showMatches && loopIteration % showMatchesInterval == 0) { // Draw matches. 244 | drawMatches( img0, keypoints0, img1, keypoints1, 245 | goodMatches, imgMatches, Scalar::all(-1), Scalar::all(-1), 246 | vector(), DrawMatchesFlags::NOT_DRAW_SINGLE_POINTS ); 247 | imshow( "Matches", imgMatches ); 248 | needToDraw = true; 249 | } 250 | if (showVideo && loopIteration % showVideoInterval == 0) { 251 | imshow("Video", img1); 252 | needToDraw = true; 253 | } 254 | if (needToDraw) { 255 | key = waitKey(1); 256 | } 257 | } 258 | { // Handle user input. 
259 | switch (key) { 260 | case (-1): 261 | break; 262 | case (1048689): // q 263 | case (113): // also q 264 | return 0; 265 | break; 266 | case (1048695): // w 267 | waitKey(0); 268 | break; 269 | case (1114027): // + 270 | skipFrames++; 271 | cerr << "For each processed frame we are now skipping " << skipFrames << endl; 272 | break; 273 | case (1114029): // - 274 | skipFrames = max(1, --skipFrames); 275 | cerr << "For each processed frame we are now skipping " << skipFrames << endl; 276 | break; 277 | default: 278 | cerr << "Currently pressed key is: " << key << endl; 279 | break; 280 | } 281 | key = -1; 282 | } 283 | { // Iterate the "logical" loop (get ready to process next frame) 284 | img1.copyTo(img0); 285 | img2.copyTo(img1); 286 | for (int i=0; i(i) = rodOld.at(i)*(1.0-alpha) + rod.at(i)*alpha; 330 | // } 331 | rodOld.copyTo(rod); 332 | } 333 | } else { 334 | defect += 1.0; 335 | cout << "11:" << 1.0 << endl; 336 | cerr << "Too few matches! Not going to try to recover pose this frame." << endl; 337 | } 338 | // To prevent the graphs from desynchronizing from each other, we have to output this unconditionally. 339 | if (gnuplot) { 340 | for (int i=0; i<3; i++) { 341 | cout << i << ":" << rod.at(i) * 57.2957795 << endl; // Output Rodrigues vector, rescaled to degrees 342 | } 343 | // T is unit norm (scale-less) and often erroneously sign-reversed. 344 | // if (T.at(2) < 0) T = -T; // Assume dominate motion is forward... (this is not an elegant assumption) 345 | // double theta = atan2(T.at(0), T.at(2)); 346 | // double phi = atan2(T.at(1), T.at(2)); 347 | // cout << 3 << ":" << theta * 57.2957795 << endl; // Plot polar translation angle 348 | // cout << 4 << ":" << phi * 57.2957795 << endl; // Plot azimuthal translation angle 349 | } 350 | } 351 | { // run FAST detector on the CPU for next frame (get ready for next loop iteration). 352 | FAST(img2g, keypoints2, threshold); 353 | // Apply proportional control to threshold to drive it towards targetKP. 
354 | int control = (int)(((float)keypoints2.size() - (float)targetKP) / (float)tolerance); 355 | threshold += min(100, control); 356 | if (threshold < 1) threshold = 1; 357 | } 358 | } 359 | if (gnuplot) { 360 | time = (1000*(clock() - timer)/(double)CLOCKS_PER_SEC); 361 | cout << "9:" << time << endl; // Plot CPU time. 362 | timer = clock(); 363 | } 364 | { // Get new GPU results 365 | p1.clear(); 366 | p2.clear(); 367 | goodMatches.clear(); 368 | getMatches(maxKP, h_M1, d_M1); 369 | getMatches(maxKP, h_M2, d_M2); 370 | cudaEventElapsedTime(&time, start, stop); 371 | if (gnuplot) { 372 | cout << "10:" << (time+(1000*(clock() - timer)/(double)CLOCKS_PER_SEC)) << endl; // Plot total asynchronous GPU time. 373 | } 374 | for (int i=0; i= 0 && h_M1[i] < numKP1 && h_M2[h_M1[i]] == i) { 376 | goodMatches.push_back( DMatch(i, h_M1[i], 0)); // For drawing matches. 377 | p1.push_back(keypoints0[i].pt); // For recovering pose. 378 | p2.push_back(keypoints1[h_M1[i]].pt); 379 | } 380 | } 381 | } 382 | if (gnuplot) { 383 | cout << "6:" << numKP1 << endl; // Plot number of keypoints. 384 | cout << "7:" << p1.size() << endl; // Plot number of matches. 385 | cout << "8:" << 100*threshold << endl; // Plot current threshold for FAST. 
386 | } 387 | totalMatches += p1.size(); 388 | } 389 | cudaFreeArray(patchTriplets); 390 | cudaFree(d_K); 391 | cudaFree(d_D1); 392 | cudaFree(d_D2); 393 | cudaFree(d_M1); 394 | cudaFree(d_M2); 395 | cudaFreeHost(h_K1); 396 | cudaFreeHost(h_K2); 397 | cerr << "Total matches: " << totalMatches << endl; 398 | cerr << "Total inliers: " << totalInliers << endl; 399 | cerr << "Defect: " << defect << endl; 400 | cerr << "Loop iteration: " << loopIteration << endl; 401 | cerr << "Extractions: " << extractions << endl; 402 | 403 | return 0; 404 | } 405 | -------------------------------------------------------------------------------- /vo2.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include "cuda.h" 6 | #include "cuda_runtime.h" 7 | #include "opencv2/opencv.hpp" 8 | using namespace std; 9 | using namespace cv; 10 | #include "latch.h" 11 | #include "bitMatcher.h" 12 | #include "gpuFacade.hpp" 13 | 14 | #define cudaCalloc(A, B) \ 15 | do { \ 16 | cudaError_t __cudaCalloc_err = cudaMalloc(A, B); \ 17 | if (__cudaCalloc_err == cudaSuccess) cudaMemset(*A, 0, B); \ 18 | } while (0) 19 | 20 | #define checkError(ans) { gpuAssert((ans), __FILE__, __LINE__); } 21 | inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=true) { 22 | if (code != cudaSuccess) { 23 | fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line); 24 | if (abort) exit(code); 25 | } 26 | } 27 | 28 | #define checkLaunchError() \ 29 | do { \ 30 | /* Check synchronous errors, i.e. pre-launch */ \ 31 | cudaError_t err = cudaGetLastError(); \ 32 | if (cudaSuccess != err) { \ 33 | fprintf (stderr, "Cuda error in file '%s' in line %i : %s.\n",\ 34 | __FILE__, __LINE__, cudaGetErrorString(err) ); \ 35 | exit(EXIT_FAILURE); \ 36 | } \ 37 | /* Check asynchronous errors, i.e. 
kernel failed (ULF) */ \ 38 | err = cudaThreadSynchronize(); \ 39 | if (cudaSuccess != err) { \ 40 | fprintf (stderr, "Cuda error in file '%s' in line %i : %s.\n",\ 41 | __FILE__, __LINE__, cudaGetErrorString( err) ); \ 42 | exit(EXIT_FAILURE); \ 43 | } \ 44 | } while (0) 45 | 46 | 47 | // Sometimes the recovered pose is 180 degrees off...? I thought cheirality test would handle that, but apparently not always. 48 | double dist2(Mat a, Mat b) { 49 | double s = 0.0; 50 | for (int i=0; i<3; i++) { 51 | const double t = a.at(i) - b.at(i); 52 | s += t*t; 53 | } 54 | return s; 55 | } 56 | 57 | // In general a suffix of 1 means previous frame, and 2 means current frame. 58 | // However, we start processing the next frame while the GPU is working on current... 59 | // So at a certain point frame 1 shifts down to 0, 2 shifts down to 1, and the new 2 is loaded. 60 | int main( int argc, char** argv ) { 61 | // This must be an integer multiple of 512. 62 | // Specifically, half-multiples of the number of SM's for your GPU are sensible. 63 | // I have 10 streaming multiprocessors, so I chose 15*512 = 7680. 64 | const int maxKP = 512 * 15; 65 | const bool showMatches = true; 66 | // Shows every Nth processed frame's matches. 67 | const int showMatchesInterval = 10; 68 | const bool showVideo = true; 69 | // Shows every Nth processed frame. 70 | const int showVideoInterval = 1; 71 | int WIDTH, HEIGHT, totalMatches = 0, totalInliers = 0; 72 | const int matchThreshold = 12; 73 | // Discard this many frames for each one processed. Change with +/- keys while running. 
74 | int skipFrames = 0; 75 | // Threshold for FAST detector 76 | int threshold = 90; 77 | int targetKP = 3000; 78 | int tolerance = 200; 79 | int maxLoops = 150; 80 | const bool gnuplot = true; 81 | double defect = 0.0; 82 | 83 | VideoCapture cap; 84 | // if (argc == 1) { 85 | // cap = VideoCapture(0); 86 | // WIDTH = cap.get(CAP_PROP_FRAME_WIDTH); 87 | // HEIGHT = cap.get(CAP_PROP_FRAME_HEIGHT); 88 | // } 89 | if (argc == 2 || argc == 3) { 90 | cap = VideoCapture(argv[1]); 91 | WIDTH = cap.get(CAP_PROP_FRAME_WIDTH); 92 | HEIGHT = cap.get(CAP_PROP_FRAME_HEIGHT); 93 | if (argc == 3) { 94 | for (int i=0; i> img1; 114 | cap >> img2; 115 | cv::cvtColor(img1, img1g, CV_BGR2GRAY); 116 | cv::cvtColor(img2, img2g, CV_BGR2GRAY); 117 | if (showMatches) { 118 | namedWindow("Matches", WINDOW_NORMAL); 119 | } 120 | waitKey(1); 121 | if (showVideo) { 122 | namedWindow("Video", WINDOW_NORMAL); 123 | } 124 | waitKey(1); 125 | resizeWindow("Matches", 1920/2, 540/2); 126 | resizeWindow("Video", 960, 540); 127 | moveWindow("Matches", 0, 540+55); 128 | moveWindow("Video", 0, 0); 129 | waitKey(1); 130 | 131 | cudaEvent_t start, stop; 132 | cudaEventCreate(&start); 133 | cudaEventCreate(&stop); 134 | 135 | vector keypoints0, keypoints1, keypoints2; 136 | vector goodMatches; 137 | vector p1, p2; // Point correspondences for recovering pose. 138 | int key = -1; 139 | clock_t timer, timer2; 140 | float time; 141 | 142 | gpuFacade gpu(maxKP, WIDTH, HEIGHT); 143 | FAST(img1g, keypoints1, threshold); 144 | gpu.LATCH(img1g, gpu.d_D1, &(gpu.numKP1), &keypoints1); 145 | FAST(img2g, keypoints2, threshold); // This call to fast is concurrent with above execution. 
146 | gpu.LATCH(img2g, gpu.d_D2, &(gpu.numKP2), &keypoints2); 147 | gpu.match(gpu.d_D1, gpu.d_D2, gpu.numKP1, gpu.numKP2, gpu.d_M1, matchThreshold, gpu.streamKP1); 148 | gpu.match(gpu.d_D2, gpu.d_D1, gpu.numKP2, gpu.numKP1, gpu.d_M2, matchThreshold, gpu.streamKP2); 149 | gpu.getResults(gpu.h_M1, gpu.d_M1); 150 | gpu.getResults(gpu.h_M2, gpu.d_M2); 151 | for (int i=0; i= 0 && gpu.h_M1[i] < gpu.numKP2 && gpu.h_M2[gpu.h_M1[i]] == i) { 153 | goodMatches.push_back( DMatch(i, gpu.h_M1[i], 0)); // For drawing. 154 | p1.push_back(keypoints1[i].pt); // For recovering pose. 155 | p2.push_back(keypoints2[gpu.h_M1[i]].pt); 156 | } 157 | } 158 | 159 | img1.copyTo(img0); 160 | img2.copyTo(img1); 161 | cap.read(img2); 162 | cvtColor(img2, img2g, CV_BGR2GRAY); 163 | 164 | keypoints0 = keypoints1; 165 | keypoints1 = keypoints2; 166 | 167 | gpu.uIntSwapPointer = gpu.d_D1; 168 | gpu.d_D1 = gpu.d_D2; 169 | gpu.d_D2 = gpu.uIntSwapPointer; 170 | 171 | gpu.numKP0 = gpu.numKP1; 172 | gpu.numKP1 = gpu.numKP2; 173 | 174 | FAST(img2g, keypoints2, threshold); 175 | int loopIteration = 0; 176 | for (; loopIteration < maxLoops || maxLoops == -1; loopIteration++) { // Main Loop. 177 | { // GPU code for descriptors and matching. 178 | cudaEventRecord(gpu.start, 0); 179 | gpu.LATCH(img2g, gpu.d_D2, &(gpu.numKP2), &keypoints2); 180 | gpu.match(gpu.d_D1, gpu.d_D2, gpu.numKP1, gpu.numKP2, gpu.d_M1, matchThreshold, gpu.streamKP1); 181 | gpu.match(gpu.d_D2, gpu.d_D1, gpu.numKP2, gpu.numKP1, gpu.d_M2, matchThreshold, gpu.streamKP2); 182 | cudaEventRecord(gpu.stop, 0); 183 | } 184 | timer = clock(); 185 | { // Put as much CPU code here as possible. 186 | { // Display matches and/or video to user. 187 | bool needToDraw = false; 188 | if (showMatches && loopIteration % showMatchesInterval == 0) { // Draw matches. 
189 | drawMatches( img0, keypoints0, img1, keypoints1, 190 | goodMatches, imgMatches, Scalar::all(-1), Scalar::all(-1), 191 | vector(), DrawMatchesFlags::NOT_DRAW_SINGLE_POINTS ); 192 | imshow( "Matches", imgMatches ); 193 | needToDraw = true; 194 | } 195 | if (showVideo && loopIteration % showVideoInterval == 0) { 196 | imshow("Video", img1); 197 | needToDraw = true; 198 | } 199 | if (needToDraw) { 200 | key = waitKey(1); 201 | } 202 | } 203 | { // Handle user input. 204 | switch (key) { 205 | case (-1): 206 | break; 207 | case (1048689): // q 208 | case (113): // also q 209 | return 0; 210 | break; 211 | case (1048695): // w 212 | waitKey(0); 213 | break; 214 | case (1114027): // + 215 | skipFrames++; 216 | cerr << "For each processed frame we are now skipping " << skipFrames << endl; 217 | break; 218 | case (1114029): // - 219 | skipFrames = max(1, --skipFrames); 220 | cerr << "For each processed frame we are now skipping " << skipFrames << endl; 221 | break; 222 | default: 223 | cerr << "Currently pressed key is: " << key << endl; 224 | break; 225 | } 226 | key = -1; 227 | } 228 | { // Iterate the "logical" loop (get ready to process next frame) 229 | img1.copyTo(img0); 230 | img2.copyTo(img1); 231 | for (int i=0; i(i) = rodOld.at(i)*(1.0-alpha) + rod.at(i)*alpha; 277 | // // } 278 | // rodOld.copyTo(rod); 279 | // } 280 | } else { 281 | defect += 1.0; 282 | cout << "11:" << 1.0 << endl; 283 | cerr << "Too few matches! Not going to try to recover pose this frame." << endl; 284 | } 285 | // To prevent the graphs from desynchronizing from each other, we have to output this unconditionally. 286 | if (gnuplot) { 287 | for (int i=0; i<3; i++) { 288 | // cout << i << ":" << rod.at(i) * 57.2957795 << endl; // Output Rodrigues vector, rescaled to degrees 289 | } 290 | // T is unit norm (scale-less) and often erroneously sign-reversed. 291 | // if (T.at(2) < 0) T = -T; // Assume dominate motion is forward... 
(this is not an elegant assumption) 292 | // double theta = atan2(T.at(0), T.at(2)); 293 | // double phi = atan2(T.at(1), T.at(2)); 294 | // cout << 3 << ":" << theta * 57.2957795 << endl; // Plot polar translation angle 295 | // cout << 4 << ":" << phi * 57.2957795 << endl; // Plot azimuthal translation angle 296 | } 297 | } 298 | { // run FAST detector on the CPU for next frame (get ready for next loop iteration). 299 | FAST(img2g, keypoints2, threshold); 300 | // Apply proportional control to threshold to drive it towards targetKP. 301 | int control = (int)(((float)keypoints2.size() - (float)targetKP) / (float)tolerance); 302 | threshold += min(100, control); 303 | if (threshold < 1) threshold = 1; 304 | } 305 | } 306 | if (gnuplot) { 307 | time = (1000*(clock() - timer)/(double)CLOCKS_PER_SEC); 308 | cout << "9:" << time << endl; // Plot CPU time. 309 | timer = clock(); 310 | } 311 | { // Get new GPU results 312 | p1.clear(); 313 | p2.clear(); 314 | goodMatches.clear(); 315 | gpu.getResults(gpu.h_M1, gpu.d_M1); 316 | gpu.getResults(gpu.h_M2, gpu.d_M2); 317 | cudaEventElapsedTime(&time, gpu.start, gpu.stop); 318 | if (gnuplot) { 319 | cout << "10:" << (time+(1000*(clock() - timer)/(double)CLOCKS_PER_SEC)) << endl; // Plot total asynchronous GPU time. 320 | } 321 | checkLaunchError(); 322 | for (int i=0; i= 0 && gpu.h_M1[i] < gpu.numKP1 && gpu.h_M2[gpu.h_M1[i]] == i) { 324 | goodMatches.push_back( DMatch(i, gpu.h_M1[i], 0)); // For drawing matches. 325 | p1.push_back(keypoints0[i].pt); // For recovering pose. 326 | p2.push_back(keypoints1[gpu.h_M1[i]].pt); 327 | } 328 | } 329 | } 330 | if (gnuplot) { 331 | cout << "6:" << gpu.numKP1 << endl; // Plot number of keypoints. 332 | cout << "7:" << p1.size() << endl; // Plot number of matches. 333 | cout << "8:" << 100*threshold << endl; // Plot current threshold for FAST. 
334 | } 335 | totalMatches += p1.size(); 336 | } 337 | cerr << "Total matches: " << totalMatches << endl; 338 | cerr << "Total inliers: " << totalInliers << endl; 339 | cerr << "Defect: " << defect << endl; 340 | cerr << "Loop iteration: " << loopIteration << endl; 341 | return 0; 342 | } 343 | --------------------------------------------------------------------------------