├── .gitignore
├── .gitmodules
├── CMakeLists.txt
├── README.org
├── include
    ├── HashConverter.h
    ├── HashMatcher.h
    ├── KeyFileReader.h
    └── Share.h
├── job.sh.in
└── src
    ├── HashConverter.cpp
    ├── HashConverter.cu
    ├── HashMatcher.cpp
    ├── HashMatcher.cu
    ├── KeyFileReader.cpp
    ├── TestHashConverter.cpp
    ├── TestHashMatcher.cpp
    ├── TestKeyFileReader.cpp
    └── main.cpp


/.gitignore:
--------------------------------------------------------------------------------
1 | *.html
2 | build/
3 | /compile_commands.json
4 | test/
5 | *~
6 | *.user
7 | 


--------------------------------------------------------------------------------
/.gitmodules:
--------------------------------------------------------------------------------
1 | [submodule "third_party/cub"]
2 | 	path = third_party/cub
3 | 	url = git@github.com:NVlabs/cub.git
4 | 


--------------------------------------------------------------------------------
/CMakeLists.txt:
--------------------------------------------------------------------------------
 1 | cmake_minimum_required(VERSION 3.1) # comes first so policies are set before project() enables languages
 2 | project(CasHash-CUDA)
 3 | 
 4 | if("${CMAKE_SOURCE_DIR}" STREQUAL "${CMAKE_BINARY_DIR}")
 5 |   message(FATAL_ERROR "In-source builds not allowed. Please make a separate directory and run cmake from there.")
 6 | endif()
 7 | 
 8 | if(NOT CMAKE_BUILD_TYPE)
 9 |   set(CMAKE_BUILD_TYPE "Debug")
10 | endif()
11 | 
12 | string(TOLOWER "${CMAKE_BUILD_TYPE}" cmake_build_type_tolower)
13 | if( NOT cmake_build_type_tolower STREQUAL "debug"
14 |     AND NOT cmake_build_type_tolower STREQUAL "release"
15 |     AND NOT cmake_build_type_tolower STREQUAL "relwithdebinfo")
16 |   message(FATAL_ERROR "Unknown build type \"${CMAKE_BUILD_TYPE}\". Allowed values are Debug, Release, RelWithDebInfo (case-insensitive).")
17 | endif()
18 | 
19 | if(cmake_build_type_tolower STREQUAL "debug")
20 |   set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wall -Wno-long-long") # a flag string, not a ;-separated list
21 |   #add_definitions(-DDEBUG_HASH_MATCHER)
22 |   #add_definitions(-DDEBUG_HASH_CONVERTER2)
23 | endif()
24 | 
25 | find_package(CUDA REQUIRED)
26 | 
27 | # Kepler
28 | #list(APPEND CUDA_NVCC_FLAGS "-gencode arch=compute_30,code=sm_30")
29 | list(APPEND CUDA_NVCC_FLAGS "-gencode arch=compute_35,code=sm_35")
30 | 
31 | # C++0x support
32 | list(APPEND CUDA_NVCC_FLAGS "-std=c++11")
33 | set(CMAKE_CXX_STANDARD 11)
34 | 
35 | # export compile commands so that our auto-completion system can index the source files
36 | set(CMAKE_EXPORT_COMPILE_COMMANDS 1)
37 | 
38 | include_directories(${PROJECT_SOURCE_DIR}/include)
39 | 
40 | include_directories(${PROJECT_SOURCE_DIR}/third_party/cub)
41 | 
42 | #cuda_add_executable(KeyMatchCUDA
43 | #  src/main.cpp
44 | #  src/KeyFileReader.cpp
45 | #  )
46 | 
47 | cuda_add_executable(TestKeyFileReader
48 |   src/TestKeyFileReader.cpp
49 |   src/KeyFileReader.cpp
50 |   )
51 | 
52 | cuda_add_executable(TestHashConverter
53 |   src/TestHashConverter.cpp
54 |   src/KeyFileReader.cpp
55 |   src/HashConverter.cpp
56 |   src/HashConverter.cu
57 |   )
58 | 
59 | target_link_libraries(
60 |   TestHashConverter
61 |   curand)
62 | 
63 | cuda_add_executable(TestHashMatcher
64 |   src/TestHashMatcher.cpp
65 |   src/KeyFileReader.cpp
66 |   src/HashConverter.cpp
67 |   src/HashConverter.cu
68 |   src/HashMatcher.cpp
69 |   src/HashMatcher.cu
70 |   )
71 | 
72 | target_link_libraries(
73 |    TestHashMatcher
74 |    curand)
75 | 
76 | cuda_add_executable(KeyMatchCUDA
77 |   src/main.cpp
78 |   src/KeyFileReader.cpp
79 |   src/HashConverter.cpp
80 |   src/HashConverter.cu
81 |   src/HashMatcher.cpp
82 |   src/HashMatcher.cu
83 |   )
84 | 
85 | target_link_libraries(
86 |    KeyMatchCUDA
87 |    curand)
88 | 
89 | # For HHLR
90 | configure_file(job.sh.in job.sh @ONLY)
91 | 


--------------------------------------------------------------------------------
/README.org:
--------------------------------------------------------------------------------
 1 | #+TITLE: CasHash-CUDA
 2 | 
 3 | * Description
 4 | 
 5 | This project provides a library for GPU-accelerated SIFT feature matching between images.
 6 | 
 7 | *How fast is it?*
 8 | 
 9 | According to our benchmark on Tesla K20 GPU, this algorithm can reach 30 fps pairing 30 images each with ~2000 sift vectors at a time.
10 | 
11 | *What for?*
12 | 
13 | This program can be used as a frontend for online image matching as well as large scale 3D reconstruction.
14 | 
15 | *Related Publication*
16 | 
17 | Cheng Jian, Cong Leng, Jiaxiang Wu, Hainan Cui, and Hanqing Lu. "Fast and accurate image matching with cascade hashing for 3d reconstruction." In IEEE Conference on Computer Vision and Pattern Recognition (CVPR2014), pp. 1-8. 2014.
18 | 
19 | * Installation
20 | 
21 | 
22 | #+BEGIN_EXAMPLE
23 | git clone git@github.com:cvcore/cashash_cuda.git cashash_cuda
24 | cd cashash_cuda
25 | git submodule update --init
26 | mkdir build && cd build
27 | cmake ..
28 | make
29 | #+END_EXAMPLE
30 | 
31 | * Usage
32 | 
33 | - Input :: A list of paths to the files storing the SIFT key features extracted from the images.
34 | - Output :: Match pairs.
35 | 
36 | Sole command:
37 | 
38 | #+BEGIN_EXAMPLE
39 | ./KeyMatchCUDA <list.txt> <outfile>
40 | #+END_EXAMPLE
41 | 
42 | On Lichtweise Cluster:
43 | 
44 | Extract dataset file into cashash_cuda/dataset, then in build folder, run:
45 | #+BEGIN_EXAMPLE
46 | sbatch job.sh
47 | #+END_EXAMPLE
48 | 
49 | You can download the dataset here:
50 | 
51 | https://www.dropbox.com/s/ur6l6oigyxfzgrp/cashash_cuda_dataset.zip?dl=0
52 | 
53 | * Todo
54 | 
55 | - ✅ SIFT Vector Preprocessing & CPU Storage
56 |   - ✅ Load vectors in all images.
57 |     - ✅  Stream loading with CUDA streams and asynchronous functions.
58 |     - Device supports concurrent kernel execution & has 2 async engines
59 |   - ✅ Update all SIFT Vectors to become zero mean 
60 |     - Stream preprocessing
61 |     - 1000 images * 2000 sift vectors * 128 dim * 4 byte = 976MiB (We have two GPUs of 5GiB global memory in cluster)
62 | - ✅ Hash Calculation
63 |   - ✅ Hash Remapping
64 |     - For remapping into 128d Hamming space, we use 1x128 grids.
65 |   - ✅ Bucket Generating
66 |     - For bucketing, we use 6x8 grids.
67 |   - ✅ Bucket Storage
68 |     - Bucket Information: 6 bucket group * 2000 vectors * 1000 images * 2 byte = ~24MiB
69 |     - Remapped vector: 2000 vectors * 1000 images * 16 byte = ~31MiB
70 | - ✅ Matching
71 |   - ✅ Use =__device__ int __popcll(unsigned long long int x)= for sorting mapped hash values
72 |   - ✅ Query all vectors according to bucket information stored in previous step
73 |   - ❌  Check multiple image pairs simultaneously
74 | 


--------------------------------------------------------------------------------
/include/HashConverter.h:
--------------------------------------------------------------------------------
 1 | #pragma once
 2 | 
 3 | #include <cuda_runtime.h>
 4 | #include "Share.h"
 5 | 
 6 | class HashConverter { // owns the two device-side projection matrices and runs the hashing kernels
 7 | public:
 8 |     HashConverter(); // allocates both projection matrices on the device and fills them via FillHashingMatrixCuRand
 9 |     ~HashConverter(); // frees the projection matrices
10 |     void CompHash(ImageDevice &d_Image, cudaStream_t stream = 0); // allocates d_Image.compHashData and launches the hash-code kernel on `stream`
11 |     void BucketHash(ImageDevice &d_Image, cudaStream_t stream = 0); // allocates d_Image.bucketIDList / bucketList and launches the bucketing kernel on `stream`
12 |     void CalcHashValues(ImageDevice &d_Image); // CompHash followed by BucketHash, both with the default stream argument
13 |     cudaEvent_t CalcHashValuesAsync(ImageDevice &d_Image, cudaEvent_t sync = NULL); // same work on a private stream, after `sync` if given; returns an event recorded behind it
14 | 
15 | private:
16 |     void FillHashingMatrixCuRand(); // fills both matrices with cuRAND normal samples (fixed seed)
17 |     void FillHashingMatrixCMath(); // fills both matrices from the host using GetNormRand
18 |     void FillHashingMatrixExternal(char const *path); // fills the matrices from a text file of floats
19 |     float GetNormRand(void); // one normal sample via the Box-Muller transform on rand()
20 | 
21 |     Matrix<SiftData_t> d_projMatHamming_; // Matrix for 128-bit hamming vector, width = kDimSiftData
22 |     Matrix<SiftData_t> d_projMatBucket_; // Same structure as d_projMatHamming but we chose to use only 6*8 = 48 bit from it.
23 |     cudaStream_t hashConverterStream_; // 0 until CalcHashValuesAsync creates it
24 | };
25 | 


--------------------------------------------------------------------------------
/include/HashMatcher.h:
--------------------------------------------------------------------------------
 1 | #pragma once
 2 | 
 3 | #include <cuda_runtime.h>
 4 | #include "Share.h"
 5 | #include <vector>
 6 | 
 7 | const BucketEle_t INVALID_CANDIDATE = ~0;
 8 | const int MAX_COMPHASH_DISTANCE = ~(1 << (sizeof(int) * 8 - 1));
 9 | const float MAX_SIFT_DISTANCE = 1.0e38f;
10 | const int POSSIBLE_CANDIDATES = 8;
11 | const int HASH_MATCHER_BLOCK_SIZE = 32;
12 | const int HASH_MATCHER_ITEMS_PER_THREAD = 2;
13 | 
14 | class HashMatcher { // keeps the added images and a host-side cache of match lists per image pair
15 | public:
16 |     HashMatcher();
17 |     ~HashMatcher();
18 |     int NumberOfMatch(int queryImageIndex, int targetImageIndex); // size of MatchPairList(query, target)
19 |     MatchPairListPtr MatchPairList(int queryImageIndex, int targetImageIndex); // (query point, target point) pairs; built on first request, then cached
20 |     void AddImage(const ImageDevice &d_Image); // appends the image and calls GeneratePair against every earlier image; returns nothing
21 |     cudaEvent_t AddImageAsync(const ImageDevice &d_Image, cudaEvent_t sync = NULL); // same, after waiting on `sync` if given; returns an event recorded on the matcher stream
22 |     
23 | private:
24 |     std::vector<ImageDevice> d_imageList_; // images in the order they were added; the position is the image index
25 |     std::map< std::pair< int, int >, MatchPairListPtr > matchDataBase_; // cache keyed by (query index, target index)
26 |     cudaStream_t hashMatcherStream_; // 0 until AddImageAsync creates it
27 | 
28 |     cudaEvent_t GeneratePair(int queryImageIndex, int targetImageIndex); // NOTE(review): defined outside this view; presumably fills targetCandidates of the query image - confirm
29 | };
30 | 


--------------------------------------------------------------------------------
/include/KeyFileReader.h:
--------------------------------------------------------------------------------
 1 | #pragma once
 2 | 
 3 | #include <cuda_runtime.h>
 4 | #include "Share.h"
 5 | #include <vector>
 6 | 
 7 | class KeyFileReader {
 8 | public:
 9 |     KeyFileReader();
10 |     ~KeyFileReader();
11 |     void UploadImage(ImageDevice &imgDev, const int index);
12 |     cudaEvent_t UploadImageAsync(ImageDevice &imgDev, const int index, cudaEvent_t sync = 0);
13 |     void AddKeyFile(const char *path);
14 |     void OpenKeyList(const char *path);
15 |     void ZeroMeanProc();
16 | 
17 |     int cntImage;
18 |     
19 |     std::vector<ImageHost> h_imageList_;
20 | private:
21 |     SiftData_t siftAccumulator_[kDimSiftData];
22 |     int cntTotalVector_;
23 |     cudaStream_t keyFileReaderStream_;
24 | };
25 | 


--------------------------------------------------------------------------------
/include/Share.h:
--------------------------------------------------------------------------------
  1 | #pragma once
  2 | 
  3 | #include <stdint.h>
  4 | #include <stdio.h>
  5 | #include <stdlib.h>
  6 | #include <iostream>
  7 | #include <cuda_runtime.h>
  8 | #include <string>
  9 | #include <vector>
 10 | #include <memory>
 11 | #include <utility>
 12 | #include <map>
 13 | 
 14 | #ifdef __CUDACC__
 15 | #define CUDA_UNIVERSAL_QUALIFIER __host__ __device__
 16 | #else
 17 | #define CUDA_UNIVERSAL_QUALIFIER
 18 | #endif
 19 | 
 20 | const int kDimSiftData = 128; // the number of dimensions of SIFT feature
 21 | const int kDimHashData = 128; // the number of dimensions of Hash code
 22 | const int kBitInCompHash = 64; // the number of Hash code bits to be compressed; in this case, use a <uint64_t> variable to represent 64 bits
 23 | const int kDimCompHashData = kDimHashData / kBitInCompHash; // the number of dimensions of CompHash code
 24 | const int kMinMatchListLen = 16; // the minimal list length for outputing SIFT matching result between two images
 25 | const int kMaxCntPoint = 4000; // the maximal number of possible SIFT points; ensure this value is not exceeded in your application
 26 | 
 27 | const int kCntBucketBit = 8; // the number of bucket bits
 28 | const int kCntBucketGroup = 6; // the number of bucket groups
 29 | const int kCntBucketPerGroup = 1 << kCntBucketBit; // the number of buckets in each group
 30 | const int kMaxMemberPerGroup = 100;
 31 | 
 32 | const int kCntCandidateTopMin = 6; // the minimal number of top-ranked candidates
 33 | const int kCntCandidateTopMax = 10; // the maximal number of top-ranked candidates
 34 | const int kMaxCandidatePerDist = 100;
 35 | 
 36 | typedef float SiftData_t; // CUDA GPUs are optimized for float arithmetics, we use float instead of int
 37 | typedef float* SiftDataPtr;
 38 | typedef const float* SiftDataConstPtr;
 39 | typedef uint8_t HashData_t;
 40 | typedef uint8_t* HashDataPtr; // Hash code is represented with <uint8_t> type; only the lowest bit is used
 41 | typedef uint64_t CompHashData_t;
 42 | typedef uint64_t* CompHashDataPtr; // CompHash code is represented with <uint64_t> type
 43 | typedef unsigned int BucketEle_t;
 44 | typedef unsigned int* BucketElePtr; // index list of points in a specific bucket
 45 | 
 46 | typedef std::pair<unsigned int, unsigned int> MatchPair_t;
 47 | typedef std::shared_ptr<MatchPair_t> MatchPairPtr;
 48 | 
 49 | typedef std::vector<MatchPair_t> MatchPairList_t;
 50 | typedef std::shared_ptr<MatchPairList_t> MatchPairListPtr;
 51 | 
 52 | template <typename T> // 2-D array view over (optionally pitched) memory; has no destructor, so it never frees `elements`
 53 | struct Matrix {
 54 |     int width;     // number of elements per row
 55 |     int height;    // number of rows
 56 |     size_t pitch;  // row size in bytes
 57 |     T* elements;   // start of row 0; NULL until storage is attached
 58 | 
 59 |     CUDA_UNIVERSAL_QUALIFIER inline T& operator() (int i, int j) { // element at row i, column j
 60 |         return *(reinterpret_cast<T *>(reinterpret_cast<char *>(elements) + i * pitch) + j);
 61 |     } // rows are addressed in bytes because pitch may exceed width * sizeof(T)
 62 | 
 63 |     CUDA_UNIVERSAL_QUALIFIER inline const T& operator() (int i, int j) const {
 64 |         return *(reinterpret_cast<const T *>(reinterpret_cast<const char *>(elements) + i * pitch) + j);
 65 |     }
 66 | 
 67 |     // Initializers follow declaration order; elements starts as NULL instead of an indeterminate pointer.
 68 |     Matrix(int H, int W) : width(W), height(H), pitch(sizeof(T) * W), elements(NULL) {
 69 |     } // pitch is the unpadded row size here; will be adjusted later if cudaMallocPitch is used
 70 | 
 71 |     Matrix() : width(0), height(0), pitch(0), elements(NULL) {
 72 |     }
 73 | };
 74 | 
 75 | struct ImageHost {
 76 |     int cntPoint; // the number of SIFT points
 77 |     std::string keyFilePath;
 78 |     Matrix<SiftData_t> siftData; // [cntPoint x 128] Matrix, storing all sift vectors one-off
 79 | 
 80 | };
 81 | 
 82 | struct ImageDevice {
 83 |     int cntPoint;
 84 |     Matrix<SiftData_t> siftData;
 85 |     Matrix<CompHashData_t> compHashData; // [cntPoint x 2 Matrix]
 86 |     Matrix<HashData_t> bucketIDList; // element -> buckets [cntPoint x kCntBucketGroup]
 87 |     Matrix<BucketEle_t> bucketList; // bucket -> elements [kCntBucketGroup*kCntBucketPerGroup x kMaxMemberPerGroup]
 88 |     std::map<int, BucketElePtr> targetCandidates;
 89 | };
 90 | 
 91 | 
 92 | 
 93 | #define CUDA_CHECK_ERROR                                                         \
 94 |     do {                                                                         \
 95 |         const cudaError_t err = cudaGetLastError();                              \
 96 |         if (err != cudaSuccess) {                                                \
 97 |             const char *const err_str = cudaGetErrorString(err);                 \
 98 |             std::cerr << "Cuda error in " << __FILE__ << ":" << __LINE__ - 1     \
 99 |                       << ": " << err_str << " (" << err << ")" << std::endl;     \
100 |             exit(EXIT_FAILURE);                                                  \
101 |         }                                                                        \
102 |     } while(0)
103 | 
104 | 
105 | template< typename T >
106 | void check(T result, char const *const func, const char *const file, int const line)
107 | {
108 |     if (result)
109 |     {
110 |         fprintf(stderr, "CUDA error at %s:%d code=%d(%s) \"%s\" \n",
111 |                 file, line, static_cast<unsigned int>(result), cudaGetErrorString(result), func);
112 |         cudaDeviceReset();
113 |         // Make sure we call CUDA Device Reset before exiting
114 |         exit(EXIT_FAILURE);
115 |     }
116 | }
117 | #define CUDA_CATCH_ERROR(val) check ( (val), #val, __FILE__, __LINE__)
118 | 
119 | template <typename T>
120 | inline void dumpDeviceArray(T const *d_Array, int count) {
121 |     T *h_Array = new T[count];
122 |     cudaMemcpy(h_Array, d_Array, count * sizeof(T), cudaMemcpyDeviceToHost);
123 |     CUDA_CHECK_ERROR;
124 |     std::cout << "Dumping device array:\n";
125 |     for(int i = 0; i < count; i++) {
126 |         std::cout << +h_Array[i] << ", ";
127 |     }
128 |     std::cout << "[ " << count << " element(s) ]\n";
129 |     delete [] h_Array;
130 | }
131 | 
132 | template <typename T>
133 | inline void dumpHostArray(T const *h_Array, int count) {
134 |     std::cout << "Dumping host array:\n";
135 |     for(int i = 0; i < count; i++) {
136 |         std::cout << +h_Array[i] << ", ";
137 |     }
138 |     std::cout << "[ " << count << " element(s) ]\n";
139 | }
140 | 
141 | 


--------------------------------------------------------------------------------
/job.sh.in:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | 
 3 | #SBATCH -J CasHash_CUDA
 4 | #SBATCH -e @CMAKE_BINARY_DIR@/stderr.txt
 5 | #SBATCH -o @CMAKE_BINARY_DIR@/stdout.txt
 6 | #SBATCH -n 1
 7 | #SBATCH --mem-per-cpu=1024
 8 | #SBATCH -t 30
 9 | #SBATCH --exclusive
10 | #SBATCH -p kurs2
11 | 
12 | echo "This is Job $SLURM_JOB_ID"
13 | cd @CMAKE_BINARY_DIR@
14 | ./KeyMatchCUDA ../dataset/list_huge.txt output_gpu.txt
15 | 


--------------------------------------------------------------------------------
/src/HashConverter.cpp:
--------------------------------------------------------------------------------
  1 | #include "HashConverter.h"
  2 | #include "Share.h"
  3 | 
  4 | #include <stdio.h>
  5 | #include <cuda_runtime.h>
  6 | #include <stdlib.h>
  7 | #include <cuda.h>
  8 | #include <curand.h>
  9 | #include <iostream>
 10 | #include <cmath>
 11 | 
 12 | HashConverter::HashConverter() {
 13 |     // Projection matrix for the Hamming code: kDimHashData rows of kDimSiftData floats.
 14 |     d_projMatHamming_.height = kDimHashData;
 15 |     d_projMatHamming_.width = kDimSiftData;
 16 |     const size_t hammingRowBytes = d_projMatHamming_.width * sizeof(SiftData_t);
 17 |     cudaMallocPitch(&(d_projMatHamming_.elements),
 18 |                     &(d_projMatHamming_.pitch),
 19 |                     hammingRowBytes, d_projMatHamming_.height);
 20 |     CUDA_CHECK_ERROR;
 21 |     // Projection matrix for bucketing, allocated with the same shape.
 22 |     d_projMatBucket_.height = kDimHashData;
 23 |     d_projMatBucket_.width = kDimSiftData;
 24 |     const size_t bucketRowBytes = d_projMatBucket_.width * sizeof(SiftData_t);
 25 |     cudaMallocPitch(&(d_projMatBucket_.elements),
 26 |                     &(d_projMatBucket_.pitch),
 27 |                     bucketRowBytes, d_projMatBucket_.height);
 28 |     CUDA_CHECK_ERROR;
 29 | 
 30 |     hashConverterStream_ = 0; // no private stream yet; CalcHashValuesAsync creates it on demand
 31 | 
 32 |     FillHashingMatrixCuRand();
 33 | }
 34 | 
 35 | HashConverter::~HashConverter() { // releases both projection matrices and the lazily created stream
 36 |     cudaFree(d_projMatHamming_.elements);
 37 |     cudaFree(d_projMatBucket_.elements);
 38 |     if(hashConverterStream_ != 0) // only CalcHashValuesAsync sets it; it was never destroyed before
 39 |         cudaStreamDestroy(hashConverterStream_);
 40 |     CUDA_CHECK_ERROR;
 41 | }
 42 | 
 43 | void HashConverter::FillHashingMatrixCuRand() { // fills both projection matrices with N(0, 1) samples generated on the device
 44 |     curandGenerator_t gen;
 45 |     bool ok = curandCreateGenerator(&gen, CURAND_RNG_PSEUDO_DEFAULT) == CURAND_STATUS_SUCCESS;
 46 |     if(!ok) { std::cerr << "cuRAND generator creation failed!\n"; exit(EXIT_FAILURE); }
 47 |     ok = curandSetPseudoRandomGeneratorSeed(gen, 1234ULL) == CURAND_STATUS_SUCCESS; // fixed seed
 48 | 
 49 |     for(int i = 0; ok && i < d_projMatHamming_.height; i++) { // row by row: rows may be padded by cudaMallocPitch
 50 |         ok = curandGenerateNormal(gen, &d_projMatHamming_(i, 0), kDimSiftData, 0, 1) == CURAND_STATUS_SUCCESS;
 51 |     }
 52 | 
 53 |     for(int i = 0; ok && i < d_projMatBucket_.height; i++) {
 54 |         ok = curandGenerateNormal(gen, &d_projMatBucket_(i, 0), kDimSiftData, 0, 1) == CURAND_STATUS_SUCCESS;
 55 |     }
 56 | 
 57 |     curandDestroyGenerator(gen); // the generator used to be leaked
 58 |     if(!ok) { std::cerr << "cuRAND failed to fill the hashing matrices!\n"; exit(EXIT_FAILURE); } // cuRAND status is not visible to CUDA_CHECK_ERROR
 59 |     CUDA_CHECK_ERROR;
 60 | #ifdef DEBUG_HASH_CONVERTER_RANDOM_MATRIX
 61 |     std::cout << "Device random matrix:\n";
 62 |     dumpDeviceArray(&d_projMatBucket_(0, 0), 128);
 63 | #endif
 64 | }
 65 | 
 66 | void HashConverter::FillHashingMatrixCMath() { // host-side alternative: GetNormRand samples copied to the device row by row
 67 |     SiftDataPtr tempRand = new SiftData_t[kDimSiftData]; // staging buffer for one matrix row
 68 | 
 69 |     for(int i = 0; i < d_projMatHamming_.height; i++) {
 70 |         for(int j = 0; j < kDimSiftData; j++) {
 71 |             tempRand[j] = GetNormRand();
 72 |         }
 73 |         cudaMemcpy(&d_projMatHamming_(i, 0), tempRand, kDimSiftData * sizeof(SiftData_t), cudaMemcpyHostToDevice);
 74 |     }
 75 | 
 76 |     for(int i = 0; i < d_projMatBucket_.height; i++) { // bounded by the matrix that is actually written
 77 |         for(int j = 0; j < kDimSiftData; j++) {
 78 |             tempRand[j] = GetNormRand();
 79 |         }
 80 |         cudaMemcpy(&d_projMatBucket_(i, 0), tempRand, kDimSiftData * sizeof(SiftData_t), cudaMemcpyHostToDevice);
 81 |     }
 82 |     delete [] tempRand; // used to be leaked on every call
 83 |     CUDA_CHECK_ERROR; // reports a failed copy
 84 | #ifdef DEBUG_HASH_CONVERTER_RANDOM_MATRIX
 85 |     std::cout << "Device random matrix:\n";
 86 |     dumpDeviceArray(&d_projMatBucket_(0, 0), 128);
 87 | #endif
 88 | }
 89 | 
 90 | void HashConverter::FillHashingMatrixExternal(char const *path) { // loads both matrices from a text file of floats; exits on a missing or short file
 91 |     FILE *randomFp = fopen(path, "r");
 92 |     if(!randomFp) {
 93 |         std::cerr << "Random matrix does not exist!\n";
 94 |         exit(-1);
 95 |     }
 96 | 
 97 |     SiftDataPtr tempRand = new SiftData_t[kDimSiftData]; // staging buffer for one matrix row
 98 |     bool complete = true; // cleared as soon as a value cannot be parsed
 99 |     for(int i = 0; complete && i < d_projMatHamming_.height; i++) {
100 |         for(int j = 0; complete && j < kDimSiftData; j++) {
101 |             complete = fscanf(randomFp, "%f", &tempRand[j]) == 1;
102 |         }
103 |         if(complete) cudaMemcpy(&d_projMatHamming_(i, 0), tempRand, kDimSiftData * sizeof(SiftData_t), cudaMemcpyHostToDevice);
104 |     }
105 |     // Only the first kCntBucketGroup * kCntBucketBit rows of the bucket matrix are read from the file.
106 |     for(int i = 0; complete && i < kCntBucketGroup * kCntBucketBit; i++) {
107 |         for(int j = 0; complete && j < kDimSiftData; j++) {
108 |             complete = fscanf(randomFp, "%f", &tempRand[j]) == 1;
109 |         }
110 |         if(complete) cudaMemcpy(&d_projMatBucket_(i, 0), tempRand, kDimSiftData * sizeof(SiftData_t), cudaMemcpyHostToDevice);
111 |     }
112 |     delete [] tempRand;
113 |     fclose(randomFp);
114 |     if(!complete) { std::cerr << "Random matrix file is truncated or malformed!\n"; exit(-1); }
115 | }
116 | 
117 | void HashConverter::CalcHashValues(ImageDevice &d_Image){ // runs both hashing steps with the default stream argument (0)
118 | 	CompHash(d_Image);
119 | 	BucketHash(d_Image);
120 | }
121 | 
122 | cudaEvent_t HashConverter::CalcHashValuesAsync(ImageDevice &d_Image, cudaEvent_t sync) {
123 |     // First use: create the converter's own stream (0 marks "not created yet").
124 |     if(!hashConverterStream_)
125 |         cudaStreamCreate(&hashConverterStream_);
126 | 
127 |     // Make the stream wait for the caller's event, when one was passed.
128 |     if(sync != NULL)
129 |         cudaStreamWaitEvent(hashConverterStream_, sync, 0);
130 | 
131 |     CompHash(d_Image, hashConverterStream_);
132 |     BucketHash(d_Image, hashConverterStream_);
133 | 
134 |     // Event recorded on the stream after both hashing steps were issued.
135 |     cudaEvent_t done;
136 |     cudaEventCreate(&done);
137 |     cudaEventRecord(done, hashConverterStream_);
138 |     return done;
139 | }
140 | 
141 | float HashConverter::GetNormRand(void) {
142 |     // Box-Muller transform: turns two uniform samples into one normally distributed sample.
143 |     // See http://en.wikipedia.org/wiki/Box_Muller_transform
144 |     const float radiusSample = (rand() % 1000 + 1) / 1000.0; // one of 0.001, 0.002, ..., 1.000; never 0
145 |     const float angleSample = (rand() % 1000 + 1) / 1000.0;
146 | 
147 |     const double twoPi = 2 * acos(-1.0); // acos(-1) is pi
148 |     const float normalSample = sqrt(-2 * log(radiusSample)) * cos(twoPi * angleSample);
149 |     return normalSample;
150 | }
151 | 


--------------------------------------------------------------------------------
/src/HashConverter.cu:
--------------------------------------------------------------------------------
  1 | #include "HashConverter.h"
  2 | #include "Share.h"
  3 | 
  4 | // Computes the 128-bit compressed hash code of every SIFT vector of one image.
  5 | // Launch layout: gridDim.x = number of SIFT vectors, blockDim.x = kDimHashData (one thread per hash bit).
  6 | // The kernel also assumes kDimSiftData == kDimHashData because each thread stages one SIFT element.
  7 | // Uses static shared memory only.
  8 | //   g_sift     [points x kDimSiftData] input vectors
  9 | //   g_projMat  [kDimHashData x kDimSiftData] projection matrix; row t produces hash bit t
 10 | //   g_compHash [points x 2] output; column c holds hash bits 64*c .. 64*c+63
 11 | __global__ void CompHashKernel(Matrix<SiftData_t> g_sift, const Matrix<SiftData_t> g_projMat, Matrix<CompHashData_t> g_compHash) {
 12 |     __shared__ float s_siftCur[kDimSiftData]; // SIFT vector of this block, staged for reuse by all threads
 13 |     __shared__ uint32_t s_hashBits[kDimHashData]; // one bit per thread, folded into one word per 32 threads
 14 |     const int tx = threadIdx.x;
 15 |     const int bx = blockIdx.x;
 16 |     SiftDataPtr g_siftCur = &g_sift(bx, 0);
 17 |     SiftDataConstPtr g_projMatCur = &g_projMat(tx, 0);
 18 | 
 19 |     s_siftCur[tx] = g_siftCur[tx]; // one element per thread; relies on blockDim.x == kDimSiftData
 20 |     __syncthreads();
 21 | 
 22 |     // Project the SIFT vector onto this thread's row of the projection matrix.
 23 |     float element = 0.f;
 24 |     for(int i = 0; i < kDimSiftData; i++) {
 25 |         element = element + s_siftCur[i] * g_projMatCur[i];
 26 |     }
 27 | 
 28 |     // The sign of the projection is the hash bit, pre-shifted to its position inside a 32-bit word.
 29 |     s_hashBits[tx] = static_cast<uint32_t>(element > 0.f) << (tx % 32);
 30 |     __syncthreads();
 31 | 
 32 |     // Tree reduction inside every group of 32 entries. A block barrier follows each step, so the result
 33 |     // no longer depends on warp-lockstep execution or on the compiler keeping shared-memory loads in
 34 |     // program order, as the former unsynchronized XOR chain did. Bits are disjoint, so OR equals XOR.
 35 |     for(int stride = 1; stride < 32; stride <<= 1) {
 36 |         if(tx % (2 * stride) == 0) {
 37 |             s_hashBits[tx] |= s_hashBits[tx + stride];
 38 |         }
 39 |         __syncthreads(); // outside the branch: every thread of the block must reach the barrier
 40 |     }
 41 | 
 42 |     // With blockDim.x == 128, threads 0 and 64 each join two 32-bit words into one 64-bit half
 43 |     // of the code; the word at tx + 32 becomes the upper half.
 44 |     if(tx % 64 == 0) {
 45 |         uint64_t halfCompHash = ((static_cast<uint64_t>(s_hashBits[tx + 32]) << 32) + s_hashBits[tx]);
 46 |         g_compHash(bx, tx / 64) = halfCompHash;
 47 |     }
 48 | }
 49 | 
 50 | void HashConverter::CompHash( ImageDevice &d_Image, cudaStream_t stream ) {
 51 |     // Output layout: one row per SIFT point, two 64-bit words per row, stored densely
 52 |     // (plain cudaMalloc, so the pitch is exactly the row size).
 53 |     Matrix<CompHashData_t> &compHash = d_Image.compHashData;
 54 | 
 55 |     compHash.height = d_Image.cntPoint;
 56 |     compHash.width = 2;
 57 |     compHash.pitch = sizeof(CompHashData_t) * 2;
 58 | 
 59 |     // Bytes per row times the number of rows.
 60 |     const size_t totalBytes = compHash.pitch * compHash.height;
 61 | 
 62 |     cudaMalloc(&(compHash.elements), totalBytes);
 63 |     CUDA_CHECK_ERROR;
 64 | 
 65 |     // Launch geometry: one block per SIFT point, one thread per hash bit.
 66 |     const dim3 threadsPerBlock(kDimHashData);
 67 |     const dim3 blocksPerGrid(d_Image.cntPoint);
 68 | 
 69 |     // A launch with no dynamic shared memory on stream 0 is the same launch as the
 70 |     // two-argument <<<grid, block>>> form, so one statement covers both cases.
 71 |     CompHashKernel<<<blocksPerGrid, threadsPerBlock, 0, stream>>>(
 72 |         d_Image.siftData,
 73 |         d_projMatHamming_,
 74 |         compHash);
 75 | 
 76 |     // cudaGetLastError() inside the macro reports a failed launch; the kernel
 77 |     // itself is not waited for here.
 78 |     CUDA_CHECK_ERROR;
 79 | }
 80 | 
 81 | // Assigns every SIFT vector of one image to one bucket in each of the kCntBucketGroup bucket groups.
 82 | // Launch layout: gridDim.x = number of SIFT vectors, blockDim.x = kDimHashData; only the first
 83 | // kCntBucketGroup * 8 threads contribute bucket bits, all threads help stage the SIFT vector.
 84 | //   g_projMat    projection matrix; row 8 * g + b produces bit b of the bucket id in group g
 85 | //   g_bucketHash [points x kCntBucketGroup] output: bucket id of every point per group
 86 | //   g_bucketEle  one row per bucket: column 0 is the member count, columns 1 .. count hold point
 87 | //                indices. Every count must be zero before the launch (BucketHash clears the table).
 88 | __global__ void BucketHashKernel(Matrix<SiftData_t> g_sift, const Matrix<SiftData_t> g_projMat, Matrix<HashData_t> g_bucketHash, Matrix<BucketEle_t> g_bucketEle) {
 89 |     __shared__ float s_siftCur[kDimSiftData]; // shared sift vector
 90 |     __shared__ int s_hashBits[kDimHashData];
 91 |     const int tx = threadIdx.x; // projection row; tx / 8 is the bucket group
 92 |     const int bx = blockIdx.x; // sift vector index
 93 |     SiftDataPtr g_siftCur = &g_sift(bx, 0);
 94 |     SiftDataConstPtr g_projMatCur = &g_projMat(tx, 0);
 95 | 
 96 |     s_siftCur[tx] = g_siftCur[tx]; // one element per thread; relies on blockDim.x == kDimSiftData
 97 |     __syncthreads();
 98 | 
 99 |     float element = 0.f;
100 |     for(int i = 0; i < kDimSiftData; i++) {
101 |         element = element + s_siftCur[i] * g_projMatCur[i];
102 |     }
103 | 
104 |     // Sign bit of the projection, pre-shifted to its position inside the 8-bit bucket id.
105 |     s_hashBits[tx] = static_cast<int>(element > 0.f) << (tx % 8);
106 |     __syncthreads();
107 | 
108 |     // Tree reduction over each group of 8 threads with a barrier per step, so no warp-lockstep
109 |     // assumption is needed. The barrier sits outside the branch so that all threads reach it.
110 |     for(int stride = 2; stride <= 8; stride <<= 1) {
111 |         if(tx % stride == 0) {
112 |             s_hashBits[tx] += s_hashBits[tx + stride / 2];
113 |         }
114 |         __syncthreads();
115 |     }
116 | 
117 |     if(tx % 8 == 0 && tx / 8 < kCntBucketGroup) {
118 |         const int hashVal = s_hashBits[tx];
119 |         const int bucketRow = kCntBucketPerGroup * (tx / 8) + hashVal;
120 |         g_bucketHash(bx, tx / 8) = hashVal;
121 |         // Reserve the next free slot. atomicInc wrapped the counter back to 0 on overflow and handed
122 |         // out columns past the row width, so count with atomicAdd and undo reservations that do not fit.
123 |         BucketElePtr counter = &(g_bucketEle(bucketRow, 0));
124 |         const int slot = static_cast<int>(atomicAdd(counter, 1u)) + 1;
125 | #ifdef DEBUG_HASH_CONVERTER
126 |         printf("%d %d %d\n", tx / 8, hashVal, slot);
127 |         if(slot >= g_bucketEle.width) {
128 |             printf("Warning: bucket full! Consider increasing bucket #%d in group %d!\n", hashVal, tx / 8);
129 |         }
130 | #endif
131 |         if(slot < g_bucketEle.width) {
132 |             g_bucketEle(bucketRow, slot) = bx;
133 |         } else {
134 |             atomicSub(counter, 1u); // bucket full: drop the point and keep the count inside the row
135 |         }
136 |     }
137 | }
138 | 
139 | // Allocates the bucket tables of d_Image and fills them by launching BucketHashKernel on `stream`
140 | // (0 selects the default stream). Reads d_Image.cntPoint and d_Image.siftData.
141 | void HashConverter::BucketHash( ImageDevice &d_Image, cudaStream_t stream ) {
142 |     d_Image.bucketIDList.width = kCntBucketGroup;
143 |     d_Image.bucketIDList.height = d_Image.cntPoint;
144 |     cudaMallocPitch(&(d_Image.bucketIDList.elements),
145 |                     &(d_Image.bucketIDList.pitch),
146 |                     d_Image.bucketIDList.width * sizeof(HashData_t),
147 |                     d_Image.bucketIDList.height);
148 |     CUDA_CHECK_ERROR;
149 | 
150 |     d_Image.bucketList.width = kMaxMemberPerGroup;
151 |     d_Image.bucketList.height = kCntBucketGroup * kCntBucketPerGroup;
152 |     cudaMallocPitch(&(d_Image.bucketList.elements),
153 |                     &(d_Image.bucketList.pitch),
154 |                     d_Image.bucketList.width * sizeof(BucketEle_t),
155 |                     d_Image.bucketList.height);
156 |     CUDA_CHECK_ERROR;
157 | 
158 |     // Zero the whole table in stream order before the kernel runs. Clearing the counters inside the
159 |     // kernel raced with blocks that were already inserting, and skipped rows for images with few points.
160 |     cudaMemsetAsync(d_Image.bucketList.elements, 0,
161 |                     d_Image.bucketList.pitch * d_Image.bucketList.height, stream);
162 |     CUDA_CHECK_ERROR;
163 | 
164 |     dim3 blockSize(kDimHashData);
165 |     dim3 gridSize(d_Image.cntPoint);
166 |     BucketHashKernel<<<gridSize, blockSize, 0, stream>>>(d_Image.siftData, d_projMatBucket_, d_Image.bucketIDList, d_Image.bucketList);
167 |     CUDA_CHECK_ERROR; // reports a failed launch; the kernel itself is not waited for here
168 | 
169 | #ifdef DEBUG_HASH_CONVERTER2
170 |     for(int m = 0; m < kCntBucketGroup; m++) {
171 |         for(int bucket = 0; bucket < kCntBucketPerGroup; bucket++) {
172 |             BucketEle_t bucketSize;
173 |             cudaMemcpy(&bucketSize, &(d_Image.bucketList(m * kCntBucketPerGroup + bucket, 0)), sizeof(BucketEle_t), cudaMemcpyDeviceToHost);
174 |             std::cout << "Group: " << m << " Bucket: " << bucket << " Size: " << bucketSize << "\n";
175 |         }
176 |     }
177 |     CUDA_CHECK_ERROR;
178 | #endif
179 | }
180 |  
181 | 


--------------------------------------------------------------------------------
/src/HashMatcher.cpp:
--------------------------------------------------------------------------------
 1 | #include "HashMatcher.h"
 2 | 
 3 | #include <stdio.h>
 4 | #include <cuda_runtime.h>
 5 | #include <stdlib.h>
 6 | #include <cuda.h>
 7 | #include <iostream>
 8 | 
 9 | HashMatcher::HashMatcher() {
10 |     hashMatcherStream_ = 0; // 0 means no private stream yet; AddImageAsync creates one on first use
11 | }
12 | 
13 | HashMatcher::~HashMatcher() { // releases the stream created lazily by AddImageAsync; it used to be leaked
14 |     if(hashMatcherStream_ != 0) cudaStreamDestroy(hashMatcherStream_);
15 | }
16 | 
17 | int PairListIndex(int imageIndex1, int imageIndex2) { // order-independent index of an image pair
18 |     const int larger = imageIndex1 < imageIndex2 ? imageIndex2 : imageIndex1;
19 |     const int smaller = imageIndex1 < imageIndex2 ? imageIndex1 : imageIndex2;
20 | 
21 |     return larger * (larger - 1) / 2 + smaller;
22 | }
23 | 
24 | void HashMatcher::AddImage(const ImageDevice &d_Image) {
25 |     // The new image takes the next free index and is the query of every pairing below.
26 |     const int newImageIndex = static_cast<int>(d_imageList_.size());
27 |     d_imageList_.push_back(d_Image);
28 | 
29 |     for(int previous = 0; previous != newImageIndex; ++previous) {
30 |         GeneratePair(newImageIndex, previous); // pair with all previous images
31 |         // TODO pair with user-specified list
32 |     }
33 | }
34 | 
35 | cudaEvent_t HashMatcher::AddImageAsync(const ImageDevice &d_Image, cudaEvent_t sync) {
36 |     // First use: create the matcher's own stream (0 marks "not created yet").
37 |     if(!hashMatcherStream_)
38 |         cudaStreamCreate(&hashMatcherStream_);
39 | 
40 |     // Make the stream wait for the caller's event, when one was passed.
41 |     if(sync != NULL)
42 |         cudaStreamWaitEvent(hashMatcherStream_, sync, 0);
43 | 
44 |     // The new image takes the next free index and is the query of every pairing below.
45 |     const int newImageIndex = static_cast<int>(d_imageList_.size());
46 |     d_imageList_.push_back(d_Image);
47 | 
48 |     for(int previous = 0; previous != newImageIndex; ++previous) {
49 |         GeneratePair(newImageIndex, previous); // pair with all previous images
50 |         // TODO pair with user-specified list
51 |     }
52 | 
53 |     // Event recorded on the matcher stream after the pairings were issued.
54 |     cudaEvent_t done;
55 |     cudaEventCreate(&done);
56 |     cudaEventRecord(done, hashMatcherStream_);
57 |     return done;
58 | }
59 | 
60 | // Number of matched point pairs between the two images; obtained from MatchPairList().
61 | int HashMatcher::NumberOfMatch(int queryImageIndex, int targetImageIndex) {
62 |     MatchPairListPtr matches = MatchPairList(queryImageIndex, targetImageIndex);
63 |     return matches->size();
64 | }
63 | 
64 | // Return the matched point pairs for (queryImageIndex, targetImageIndex).
65 | // The first call for a pair copies the per-point candidate array from the device, frees the
66 | // device copy, keeps every entry that is not INVALID_CANDIDATE as a (query point, candidate)
67 | // pair and caches the list in matchDataBase_. Later calls return the cached list.
68 | MatchPairListPtr HashMatcher::MatchPairList( int queryImageIndex, int targetImageIndex ) {
69 |     auto queryTargetPair = std::make_pair(queryImageIndex, targetImageIndex);
70 | 
71 |     if(!matchDataBase_.count(queryTargetPair)) {
72 |         ImageDevice &queryImage = d_imageList_[queryImageIndex];
73 |         BucketElePtr d_candidateArray = queryImage.targetCandidates[targetImageIndex];
74 |         BucketElePtr h_candidateArray = new BucketEle_t[queryImage.cntPoint];
75 | 
76 |         cudaMemcpy(h_candidateArray, d_candidateArray, queryImage.cntPoint * sizeof(BucketEle_t), cudaMemcpyDeviceToHost);
77 |         cudaFree(d_candidateArray);
78 |         CUDA_CHECK_ERROR;
79 | 
80 |         MatchPairListPtr newMatchPairList(new MatchPairList_t);
81 | 
82 |         for(int point = 0; point < queryImage.cntPoint; point++) {
83 |             if(h_candidateArray[point] != INVALID_CANDIDATE) {
84 |                 newMatchPairList->push_back(std::make_pair(point, h_candidateArray[point]));
85 |             }
86 |         }
87 | 
88 |         // The host staging buffer is only needed while building the list; release it
89 |         // (it used to be leaked on every first lookup of a pair).
90 |         delete[] h_candidateArray;
91 | 
92 |         matchDataBase_[queryTargetPair] = newMatchPairList;
93 |     }
94 | 
95 |     return matchDataBase_[queryTargetPair];
96 | }
89 | 


--------------------------------------------------------------------------------
/src/HashMatcher.cu:
--------------------------------------------------------------------------------
  1 | #include "HashMatcher.h"
  2 | 
  3 | #include "cub/cub.cuh"
  4 | 
  5 | 
  6 | struct DistIndexPair { // (distance, candidate index) pair; the element type of the block-wide reduction in GeneratePairKernelFast
  7 |     SiftData_t dist; // distance value that MinDistOp compares
  8 |     BucketEle_t index; // candidate index the distance belongs to
  9 | };
 10 | 
 11 | // Binary reduction operator that keeps whichever pair has the smaller distance.
 12 | // When the distances are equal (or not ordered) the second argument is returned.
 13 | struct MinDistOp {
 14 |     // const-qualified: the functor has no state, and the kernel declares a const instance of it.
 15 |     CUDA_UNIVERSAL_QUALIFIER DistIndexPair operator() (const DistIndexPair a, const DistIndexPair b) const {
 16 |         return (a.dist < b.dist) ? a : b;
 17 |     }
 18 | };
 16 | 
 17 | // Per-thread candidate search: each thread handles one query SIFT point.
 18 | // Launch layout: 1-D grid of 1-D blocks; threads whose global index is >= queryImageCntPoint
 19 | // return immediately. No shared memory is used.
 20 | // For every bucket group the thread walks the target bucket selected by the query point's bucket
 21 | // id and bins the bucket members by the Hamming distance between compressed hashes. It then
 22 | // collects distinct members in order of increasing Hamming distance and ranks them by squared
 23 | // Euclidean SIFT distance.
 24 | // Output: g_pairResult[point] is (best target index + 1) when the best distance is below 0.32x the
 25 | // second best, and 0 otherwise.
 26 | // NOTE(review): GeneratePairKernelFast stores the raw index or INVALID_CANDIDATE instead, and that
 27 | // is the kernel HashMatcher::GeneratePair launches - confirm the encoding before reusing this one.
 28 | __global__ void GeneratePairKernel(Matrix<HashData_t> g_queryImageBucketID,
 29 |                                    Matrix<CompHashData_t> g_queryImageCompHashData,
 30 |                                    Matrix<SiftData_t> g_queryImageSiftData,
 31 |                                    int queryImageCntPoint,
 32 |                                    Matrix<BucketEle_t> g_targetImageBucket,
 33 |                                    Matrix<CompHashData_t> g_targetImageCompHashData,
 34 |                                    Matrix<SiftData_t> g_targetImageSiftData,
 35 |                                    BucketElePtr g_pairResult) {
 36 | 
 37 |     int candidate[kDimHashData + 1][kMaxCandidatePerDist]; // 6 * 1K, local memory
 38 |     int candidateLen[kDimHashData + 1];
 39 |     bool candidateUsed[kMaxCntPoint];
 40 |     int candidateTop[kCntCandidateTopMax];
 41 |     int candidateTopLen = 0;
 42 | 
 43 |     int querySiftIndex = threadIdx.x + blockIdx.x * blockDim.x;
 44 | 
 45 |     if(querySiftIndex >= queryImageCntPoint)
 46 |         return;
 47 | 
 48 |     memset(candidateLen, 0, sizeof(candidateLen));
 49 |     
 50 |     CompHashData_t currentCompHash[2];
 51 |     currentCompHash[0] = g_queryImageCompHashData(querySiftIndex, 0);
 52 |     currentCompHash[1] = g_queryImageCompHashData(querySiftIndex, 1);
 53 | 
 54 | #ifdef DEBUG_HASH_MATCHER1
 55 |     printf("current comphash: %lld %lld\n", currentCompHash[0], currentCompHash[1]);
 56 | #endif
 57 | 
 58 |     for(int m = 0; m < kCntBucketGroup; m++) {
 59 | 
 60 |         HashData_t currentBucket = g_queryImageBucketID(querySiftIndex, m);
 61 |         // Element 0 of a bucket row holds the number of entries; the entries follow it.
 62 |         BucketEle_t *targetBucket = &g_targetImageBucket(m * kCntBucketPerGroup + currentBucket, 0);
 63 |         int targetBucketElements = targetBucket[0];
 64 | 
 65 |         for(int bucketIndex = 1; bucketIndex <= targetBucketElements; bucketIndex++) {
 66 | 
 67 |             int targetIndex = targetBucket[bucketIndex];
 68 | 
 69 |             int dist = __popcll(currentCompHash[0] ^ g_targetImageCompHashData(targetIndex, 0)) +
 70 |                 __popcll(currentCompHash[1] ^ g_targetImageCompHashData(targetIndex, 1));
 71 |             // Each per-distance list holds at most kMaxCandidatePerDist entries; drop the
 72 |             // overflow instead of writing past the end of the list.
 73 |             if(candidateLen[dist] < kMaxCandidatePerDist) {
 74 |                 candidate[dist][candidateLen[dist]++] = targetIndex;
 75 |             }
 76 |             // Only the flags of indices that were actually seen are initialised.
 77 |             candidateUsed[targetIndex] = false;
 78 | 
 79 | #ifdef DEBUG_HASH_MATCHER2
 80 |             printf("(%d %d) ", targetIndex, dist);
 81 | #endif
 82 |         }
 83 |     }
 84 | 
 85 |     // Collect distinct candidates, closest Hamming distance first.
 86 |     for(int dist = 0; dist <= kDimHashData; dist++) {
 87 |         for(int i = 0; i < candidateLen[dist]; i++) {
 88 |             int targetIndex = candidate[dist][i];
 89 | 
 90 |             //if(blockIdx.x == 0 && threadIdx.x == 0) {
 91 |             //    printf("%d ", targetIndex);
 92 |             //}
 93 | 
 94 |             if(!candidateUsed[targetIndex]) {
 95 |                 candidateUsed[targetIndex] = true;
 96 |                 candidateTop[candidateTopLen++] = targetIndex;
 97 |                 if(candidateTopLen == kCntCandidateTopMax)
 98 |                     break;
 99 |             }
100 |         }
101 | 
102 |         if(candidateTopLen >= kCntCandidateTopMin)
103 |             break;
104 |     }
105 | 
106 |     SiftDataConstPtr queryImageSift = &g_queryImageSiftData(querySiftIndex, 0);
107 | 
108 |     // Smallest (1) and second smallest (2) squared distances seen so far; index -1 means "unset".
109 |     double minVal1 = 0.0;
110 |     int minValInd1 = -1;
111 |     double minVal2 = 0.0;
112 |     int minValInd2 = -1;
113 | 
114 |     for(int candidateListIndex = 0; candidateListIndex < candidateTopLen; candidateListIndex++) {
115 | 
116 |         int candidateIndex = candidateTop[candidateListIndex];
117 |         SiftDataConstPtr candidateSift = &g_targetImageSiftData(candidateIndex, 0);
118 | 
119 |         float candidateDist = 0.f;
120 |         for(int i = 0; i < kDimSiftData; i++) {
121 |             float diff = queryImageSift[i] - candidateSift[i];
122 |             candidateDist = candidateDist + diff * diff;
123 |         }
124 | 
125 |         if (minValInd2 == -1 || minVal2 > candidateDist) {
126 |             minVal2 = candidateDist;
127 |             minValInd2 = candidateIndex;
128 |         }
129 | 
130 |         // Keep slot 1 <= slot 2 by swapping when the new runner-up beats the current best.
131 |         if (minValInd1 == -1 || minVal1 > minVal2) {
132 |             float minValTemp = minVal2;
133 |             minVal2 = minVal1;
134 |             minVal1 = minValTemp;
135 |             int minValIndTemp = minValInd2;
136 |             minValInd2 = minValInd1;
137 |             minValInd1 = minValIndTemp;
138 |         }
139 |     }
140 | 
141 | 
142 |     if (minVal1 < minVal2 * 0.32f) {
143 |         g_pairResult[querySiftIndex] = minValInd1 + 1;
144 |     } else {
145 |         g_pairResult[querySiftIndex] = 0;
146 |     }
147 | 
148 | }
129 | 
130 | // Block-cooperative candidate search: one thread block handles one query SIFT point (blockIdx.x).
131 | // Launch layout: gridDim.x = number of query points, blockDim.x = BLOCK_SIZE; the code assumes
132 | // POSSIBLE_CANDIDATES < blockDim.x. Only statically sized shared memory is used.
133 | // Per bucket group the block loads the target bucket, merges in the candidates kept from the
134 | // previous groups, sorts everything by the Hamming distance between compressed hashes and keeps
135 | // the leading lastTopThreadsCnt * ITEMS_PER_THREAD entries. The survivors are then ranked by
136 | // squared Euclidean SIFT distance.
137 | // Output: g_pairResult[query] is the index of the nearest candidate when its distance is below
138 | // 0.32x the distance of the runner-up, and INVALID_CANDIDATE otherwise.
139 | template <int BLOCK_SIZE = HASH_MATCHER_BLOCK_SIZE, int ITEMS_PER_THREAD = HASH_MATCHER_ITEMS_PER_THREAD>
140 | __global__ void GeneratePairKernelFast(Matrix<HashData_t> g_queryImageBucketID,
141 |                                        Matrix<CompHashData_t> g_queryImageCompHashData,
142 |                                        Matrix<SiftData_t> g_queryImageSiftData,
143 |                                        const int queryImageCntPoint,
144 |                                        Matrix<BucketEle_t> g_targetImageBucket,
145 |                                        Matrix<CompHashData_t> g_targetImageCompHashData,
146 |                                        Matrix<SiftData_t> g_targetImageSiftData,
147 |                                        BucketElePtr g_pairResult) {
148 | 
149 |     typedef cub::BlockLoad<BucketEle_t, BLOCK_SIZE, ITEMS_PER_THREAD, cub::BLOCK_LOAD_DIRECT> LoadBucketVectorsT;
150 |     typedef cub::BlockRadixSort<int, BLOCK_SIZE, ITEMS_PER_THREAD, BucketEle_t> SortVectorDistT;
151 |     typedef cub::BlockReduce<DistIndexPair, BLOCK_SIZE> BlockReduceT;
152 | 
153 |     // The three collectives never run concurrently, so they share one scratch area.
154 |     __shared__ union {
155 |         typename LoadBucketVectorsT::TempStorage load;
156 |         typename SortVectorDistT::TempStorage sort;
157 |         typename BlockReduceT::TempStorage reduce;
158 |     } tempStorage;
159 | 
160 |     /* in case of same candidate numbers being thrown from different bucket */
161 |     const int lastTopThreadsCnt = POSSIBLE_CANDIDATES / ITEMS_PER_THREAD; // FIXME: deal with demainders != 0
162 |     __shared__ BucketEle_t s_lastTopVectors[POSSIBLE_CANDIDATES + 1];
163 |     __shared__ BucketEle_t s_bestIndex;
164 | 
165 |     // blockIdx.x is the same for every thread of the block, so the whole block leaves
166 |     // together here and no thread is left waiting at a later barrier.
167 |     if(blockIdx.x >= queryImageCntPoint) {
168 |         return;
169 |     }
170 | 
171 |     /* Initialize lastTops as INVALID */
172 |     if(threadIdx.x < POSSIBLE_CANDIDATES + 1) {
173 |         s_lastTopVectors[threadIdx.x] = INVALID_CANDIDATE;
174 |     }
175 |     __syncthreads();
176 | 
177 |     BucketEle_t targetVectors[ITEMS_PER_THREAD];
178 |     int targetDists[ITEMS_PER_THREAD];
179 | 
180 |     int queryIndex = blockIdx.x;
181 |     CompHashData_t queryCompHash[2];
182 |     queryCompHash[0] = g_queryImageCompHashData(queryIndex, 0);
183 |     queryCompHash[1] = g_queryImageCompHashData(queryIndex, 1);
184 |      
185 |     for(int group = 0; group < kCntBucketGroup; group++) {
186 |         // NOTE(review): GeneratePairKernel addresses the bucket row as
187 |         // group * kCntBucketPerGroup + id, while the id is used as the row directly here -
188 |         // confirm that bucketIDList already stores the global bucket row.
189 |         BucketElePtr currentBucketPtr = &g_targetImageBucket(g_queryImageBucketID(queryIndex, group), 0);
190 |         int currentBucketSize = *currentBucketPtr;
191 | 
192 |         // Element 0 is the bucket size; slots past the end are padded with INVALID_CANDIDATE.
193 |         LoadBucketVectorsT(tempStorage.load).Load(currentBucketPtr + 1, targetVectors, currentBucketSize, INVALID_CANDIDATE);
194 |         __syncthreads();
195 | 
196 |         // The last lastTopThreadsCnt threads replace what they loaded with the candidates
197 |         // carried over from the previous groups.
198 |         if(threadIdx.x >= blockDim.x - lastTopThreadsCnt) {
199 |             int offset = (threadIdx.x - (blockDim.x - lastTopThreadsCnt)) * ITEMS_PER_THREAD;
200 | 
201 |             #pragma unroll
202 |             for(int i = 0; i < ITEMS_PER_THREAD; i++) {
203 |                 targetVectors[i] = s_lastTopVectors[i + offset];
204 |             }
205 |         }
206 | 
207 |         #pragma unroll
208 |         for(int i = 0; i < ITEMS_PER_THREAD; i++) {
209 |             BucketEle_t targetIndex = targetVectors[i];
210 | 
211 |             if(targetIndex != INVALID_CANDIDATE) {
212 |                 targetDists[i] = __popcll(queryCompHash[0] ^ g_targetImageCompHashData(targetIndex, 0)) +
213 |                     __popcll(queryCompHash[1] ^ g_targetImageCompHashData(targetIndex, 1));
214 |             } else {
215 |                 targetDists[i] = MAX_COMPHASH_DISTANCE;
216 |             }
217 |         }
218 | 
219 |         SortVectorDistT(tempStorage.sort).Sort(targetDists, targetVectors, 0, 8); // end_bit = 8, maximum possible dist = 128
220 | 
221 |         // The first lastTopThreadsCnt threads now hold the closest entries; keep them.
222 |         if(threadIdx.x < lastTopThreadsCnt) {
223 |             int offset = threadIdx.x * ITEMS_PER_THREAD;
224 | 
225 |             #pragma unroll
226 |             for(int i = 0; i < ITEMS_PER_THREAD; i++) {
227 |                 s_lastTopVectors[i + offset] = targetVectors[i];
228 |             }
229 |         }
230 | 
231 |         __syncthreads();
232 | 
233 |         /* remove duplicated candidate. prerequisite: POSSIBLE_CANDIDATES < blockDim.x */
234 |         bool isDuplicate = false;
235 | 
236 |         if(threadIdx.x < POSSIBLE_CANDIDATES && (s_lastTopVectors[threadIdx.x] == s_lastTopVectors[threadIdx.x + 1])){
237 |             isDuplicate = true;
238 |         }
239 | 
240 |         // Every comparison must be finished before any slot is overwritten.
241 |         __syncthreads();
242 | 
243 |         if(isDuplicate) {
244 |             s_lastTopVectors[threadIdx.x] = INVALID_CANDIDATE;
245 |         }
246 | 
247 |         __syncthreads();
248 |     }
249 | 
250 |     DistIndexPair candidate;
251 |     candidate.index = INVALID_CANDIDATE;
252 |     candidate.dist = MAX_SIFT_DISTANCE;
253 | 
254 |     if(threadIdx.x < POSSIBLE_CANDIDATES) {
255 |         candidate.index = s_lastTopVectors[threadIdx.x];
256 | 
257 |         if(candidate.index != INVALID_CANDIDATE) {
258 |             float dist = 0;
259 |             SiftDataPtr querySiftVector = &g_queryImageSiftData(queryIndex, 0),
260 |                 targetSiftVector = &g_targetImageSiftData(candidate.index, 0);
261 | 
262 |             for(int i = 0; i < kDimSiftData; i++) {
263 |                 float diff = querySiftVector[i] - targetSiftVector[i];
264 |                 dist += diff * diff;
265 |             }
266 | 
267 |             candidate.dist = dist;
268 |         }
269 |     }
270 | 
271 |     DistIndexPair min1, min2;
272 |     const MinDistOp minDistOp;
273 | 
274 |     min1 = BlockReduceT(tempStorage.reduce).Reduce(candidate, minDistOp, POSSIBLE_CANDIDATES);
275 | 
276 |     // Only thread 0 receives the reduction result, so publish the winning index to the block.
277 |     if(threadIdx.x == 0) {
278 |         s_bestIndex = min1.index;
279 |     }
280 | 
281 |     // This barrier makes s_bestIndex visible and is also required before tempStorage.reduce
282 |     // may be reused by the second reduction.
283 |     __syncthreads();
284 | 
285 |     // Take the winner (and any duplicate of it that survived) out of the running so that the
286 |     // second reduction yields the runner-up. Invalidating thread 0's entry instead would only
287 |     // be right when the winner happens to sit in thread 0.
288 |     if(candidate.index == s_bestIndex) {
289 |         candidate.index = INVALID_CANDIDATE;
290 |         candidate.dist = MAX_SIFT_DISTANCE;
291 |     }
292 | 
293 |     min2 = BlockReduceT(tempStorage.reduce).Reduce(candidate, minDistOp, POSSIBLE_CANDIDATES);
294 | 
295 |     if(threadIdx.x == 0) {
296 |         if(min1.dist < min2.dist * 0.32f) {
297 |             g_pairResult[queryIndex] = min1.index;
298 |         } else {
299 |             g_pairResult[queryIndex] = INVALID_CANDIDATE;
300 |         }
301 |     }
302 | }
265 | 
266 | // Queue the matching of one query image against one target image on hashMatcherStream_.
267 | // Allocates a device array with one BucketEle_t per query point, stores it in
268 | // queryImage.targetCandidates[targetImageIndex] and launches GeneratePairKernelFast with one
269 | // block per query point to fill it.
270 | // Returns a new event recorded on the stream after the launch; the caller owns it and should
271 | // release it with cudaEventDestroy().
272 | cudaEvent_t HashMatcher::GeneratePair(int queryImageIndex, int targetImageIndex) {
273 |     ImageDevice &queryImage = d_imageList_[queryImageIndex];
274 |     const ImageDevice &targetImage = d_imageList_[targetImageIndex];
275 | 
276 |     BucketElePtr candidateArray = 0;
277 |     cudaMalloc(&candidateArray, sizeof(BucketEle_t) * queryImage.cntPoint);
278 |     CUDA_CHECK_ERROR;
279 | 
280 |     queryImage.targetCandidates[targetImageIndex] = candidateArray;
281 | 
282 |     // A grid of zero blocks is an invalid launch configuration, so an image without any
283 |     // points simply has nothing to launch.
284 |     if(queryImage.cntPoint > 0) {
285 |         dim3 gridSize(queryImage.cntPoint);
286 |         dim3 blockSize(HASH_MATCHER_BLOCK_SIZE);
287 | 
288 |         GeneratePairKernelFast<<<gridSize, blockSize, 0, hashMatcherStream_>>>(
289 |             queryImage.bucketIDList,
290 |             queryImage.compHashData,
291 |             queryImage.siftData,
292 |             queryImage.cntPoint,
293 |             targetImage.bucketList,
294 |             targetImage.compHashData,
295 |             targetImage.siftData,
296 |             candidateArray);
297 | 
298 |         // A kernel launch returns no status of its own; check for a launch failure explicitly.
299 |         // NOTE(review): assumes CUDA_CHECK_ERROR inspects the runtime's last-error state - confirm in Share.h.
300 |         CUDA_CHECK_ERROR;
301 |     }
302 | 
303 |     cudaEvent_t finish;
304 |     cudaEventCreate(&finish);
305 |     cudaEventRecord(finish, hashMatcherStream_);
306 | 
307 |     return finish;
308 | }
295 | 


--------------------------------------------------------------------------------
/src/KeyFileReader.cpp:
--------------------------------------------------------------------------------
  1 | #include "KeyFileReader.h"
  2 | #include "Share.h"
  3 | 
  4 | #include <stdio.h>
  5 | #include <cuda_runtime.h>
  6 | #include <stdlib.h>
  7 | #include <cstring>
  8 | 
  9 | // Start with no images: zero the per-dimension SIFT sums, the counters and the stream handle.
 10 | KeyFileReader::KeyFileReader() {
 11 |     std::memset(siftAccumulator_, 0, sizeof(siftAccumulator_));
 12 |     // AddKeyFile() increments cntTotalVector_ and ZeroMeanProc() divides by it, so both
 13 |     // counters need a defined starting value.
 14 |     cntTotalVector_ = 0;
 15 |     cntImage = 0;
 16 |     keyFileReaderStream_ = 0;
 17 | }
 13 | 
 14 | // Free the host-side SIFT buffers allocated by AddKeyFile() and the stream created by
 15 | // UploadImageAsync(), if there is one.
 16 | KeyFileReader::~KeyFileReader() {
 17 |     std::vector<ImageHost>::iterator it;
 18 |     for(it = h_imageList_.begin(); it != h_imageList_.end(); ++it) {
 19 |         delete[] it->siftData.elements;
 20 |     }
 21 | 
 22 |     if(keyFileReaderStream_ != 0) {
 23 |         // Returns immediately even if copies are still queued; the runtime releases the
 24 |         // stream once they have finished.
 25 |         cudaStreamDestroy(keyFileReaderStream_);
 26 |         keyFileReaderStream_ = 0;
 27 |     }
 28 | }
 20 | 
 21 | // Read one key file into a new host image and append it to h_imageList_.
 22 | // File layout as parsed here: a header "<point count> <dimension>", then for every point four
 23 | // numbers that are skipped followed by kDimSiftData descriptor values.
 24 | // Every descriptor value is also added to siftAccumulator_ so that ZeroMeanProc() can
 25 | // compute the mean. Any open or parse failure prints a message and terminates the process.
 26 | void KeyFileReader::AddKeyFile( const char *path ) {
 27 |     FILE *keyFile = fopen(path, "r");
 28 |     if(keyFile == NULL) {
 29 |         fprintf(stderr, "Key file %s does not exist!\n", path);
 30 |         exit(EXIT_FAILURE);
 31 |     }
 32 |     fprintf(stderr, "Reading SIFT vector from %s\n", path);
 33 |     int cntPoint = 0, cntDim = 0;
 34 |     // Without this check a malformed header would leave both counts uninitialised.
 35 |     if(fscanf(keyFile, "%d%d", &cntPoint, &cntDim) != 2 || cntPoint < 0) {
 36 |         fprintf(stderr, "Malformed header in key file %s!\n", path);
 37 |         fclose(keyFile);
 38 |         exit(EXIT_FAILURE);
 39 |     }
 40 |     if(cntDim != kDimSiftData) {
 41 |         fprintf(stderr, "Unsupported SIFT vector dimension %d, should be %d!\n", cntDim, kDimSiftData);
 42 |         fclose(keyFile);
 43 |         exit(EXIT_FAILURE);
 44 |     }
 45 | 
 46 |     ImageHost newImage;
 47 |     newImage.cntPoint = cntPoint;
 48 |     newImage.keyFilePath = path;
 49 | 
 50 |     // Widen before multiplying so that a large point count cannot overflow int.
 51 |     size_t requiredSize = static_cast<size_t>(cntPoint) * cntDim;
 52 |     // Plain new[] reports failure by throwing, so a NULL check afterwards would be dead code.
 53 |     newImage.siftData.elements = new SiftData_t[requiredSize];
 54 |     newImage.siftData.width = cntDim;
 55 |     newImage.siftData.height = cntPoint;
 56 |     newImage.siftData.pitch = cntDim * sizeof(SiftData_t);
 57 | 
 58 |     for(int i = 0; i < cntPoint; i++) {
 59 |         fscanf(keyFile, "%*f%*f%*f%*f"); //ignoring sift headers
 60 |         SiftDataPtr rowVector = newImage.siftData.elements + kDimSiftData * i;
 61 |         for(int j = 0; j < kDimSiftData; j++) {
 62 |             // A truncated or corrupt file must not feed indeterminate values into the sums.
 63 |             if(fscanf(keyFile, "%f", &rowVector[j]) != 1) {
 64 |                 fprintf(stderr, "Key file %s is truncated or malformed at vector %d!\n", path, i);
 65 |                 delete[] newImage.siftData.elements;
 66 |                 fclose(keyFile);
 67 |                 exit(EXIT_FAILURE);
 68 |             }
 69 |             siftAccumulator_[j] = siftAccumulator_[j] + rowVector[j];
 70 |         }
 71 |         cntTotalVector_++;
 72 |     }
 73 |     fclose(keyFile);
 74 |     h_imageList_.push_back(newImage);
 75 |     cntImage = h_imageList_.size();
 76 | }
 62 | 
 63 | // Read a list file of whitespace-separated key file paths and load each one with AddKeyFile().
 64 | // Terminates the process when the list file cannot be opened.
 65 | void KeyFileReader::OpenKeyList( const char *path ) {
 66 |     FILE *keyList = fopen(path, "r");
 67 |     char keyFilePath[256];
 68 |     if(keyList == NULL) {
 69 |         fprintf(stderr, "Keylist file %s does not exist!\n", path);
 70 |         exit(EXIT_FAILURE);
 71 |     }
 72 |     // The field width keeps an over-long entry from overflowing keyFilePath
 73 |     // (255 characters plus the terminating NUL).
 74 |     while(fscanf(keyList, "%255s", keyFilePath) > 0) {
 75 |         AddKeyFile(keyFilePath);
 76 |     }
 77 |     fclose(keyList);
 78 | }
 75 | 
 76 | // Subtract the per-dimension mean (siftAccumulator_ / cntTotalVector_) from every
 77 | // descriptor of every host image, in place.
 78 | void KeyFileReader::ZeroMeanProc() {
 79 |     SiftData_t mean[kDimSiftData];
 80 |     for(int dim = 0; dim < kDimSiftData; ++dim) {
 81 |         mean[dim] = siftAccumulator_[dim] / cntTotalVector_;
 82 |     }
 83 | 
 84 |     for(size_t imageIdx = 0; imageIdx < h_imageList_.size(); ++imageIdx) {
 85 |         ImageHost &image = h_imageList_[imageIdx];
 86 | 
 87 |         for(int point = 0; point < image.cntPoint; ++point) {
 88 |             SiftDataPtr rowVector = &image.siftData(point, 0);
 89 | 
 90 |             for(int dim = 0; dim < kDimSiftData; ++dim) {
 91 |                 rowVector[dim] -= mean[dim];
 92 |             }
 93 |         }
 94 |     }
 95 | }
 94 | 
 95 | // Copy the SIFT matrix of host image `index` into a newly allocated pitched device buffer
 96 | // and fill in d_Image's point count and matrix dimensions. Blocking copy.
 97 | void KeyFileReader::UploadImage( ImageDevice &d_Image, const int index ) {
 98 |     const ImageHost &hostImage = h_imageList_[index];
 99 | 
100 |     d_Image.cntPoint = hostImage.cntPoint;
101 |     d_Image.siftData.width = kDimSiftData;
102 |     d_Image.siftData.height = hostImage.cntPoint;
103 | 
104 |     // The runtime chooses the device row pitch and reports it through siftData.pitch.
105 |     const size_t deviceRowBytes = d_Image.siftData.width * sizeof(SiftData_t);
106 |     cudaMallocPitch(&(d_Image.siftData.elements), &(d_Image.siftData.pitch), deviceRowBytes, d_Image.siftData.height);
107 | 
108 |     const size_t hostRowBytes = hostImage.siftData.width * sizeof(SiftData_t);
109 |     cudaMemcpy2D(d_Image.siftData.elements, d_Image.siftData.pitch,
110 |                  hostImage.siftData.elements, hostImage.siftData.pitch,
111 |                  hostRowBytes, hostImage.siftData.height,
112 |                  cudaMemcpyHostToDevice);
113 |     CUDA_CHECK_ERROR;
114 | }
114 | 
115 | // Stream-based counterpart of UploadImage(): the copy is issued on the reader's own stream,
116 | // which is created on first use.
117 | // sync: event the stream waits on before the copy; a null event skips the wait.
118 | // Returns a new event recorded on the stream right after the copy was issued.
119 | cudaEvent_t KeyFileReader::UploadImageAsync( ImageDevice &d_Image, const int index, cudaEvent_t sync ) {
120 |     if(keyFileReaderStream_ == 0) {
121 |         cudaStreamCreate(&keyFileReaderStream_);
122 |     }
123 | 
124 |     if(sync) {
125 |         cudaStreamWaitEvent(keyFileReaderStream_, sync, 0);
126 |     }
127 | 
128 |     const ImageHost &hostImage = h_imageList_[index];
129 | 
130 |     d_Image.cntPoint = hostImage.cntPoint;
131 |     d_Image.siftData.width = kDimSiftData;
132 |     d_Image.siftData.height = hostImage.cntPoint;
133 | 
134 |     // The runtime chooses the device row pitch and reports it through siftData.pitch.
135 |     const size_t deviceRowBytes = d_Image.siftData.width * sizeof(SiftData_t);
136 |     cudaMallocPitch(&(d_Image.siftData.elements), &(d_Image.siftData.pitch), deviceRowBytes, d_Image.siftData.height);
137 | 
138 |     const size_t hostRowBytes = hostImage.siftData.width * sizeof(SiftData_t);
139 |     cudaMemcpy2DAsync(d_Image.siftData.elements, d_Image.siftData.pitch,
140 |                       hostImage.siftData.elements, hostImage.siftData.pitch,
141 |                       hostRowBytes, hostImage.siftData.height,
142 |                       cudaMemcpyHostToDevice, keyFileReaderStream_);
143 | 
144 |     cudaEvent_t uploadDone;
145 |     cudaEventCreate(&uploadDone);
146 |     cudaEventRecord(uploadDone, keyFileReaderStream_);
147 | 
148 |     CUDA_CHECK_ERROR;
149 | 
150 |     return uploadDone;
151 | }
150 | 


--------------------------------------------------------------------------------
/src/TestHashConverter.cpp:
--------------------------------------------------------------------------------
 1 | #include <iostream>
 2 | #include <cuda_runtime.h>
 3 | 
 4 | #include "HashConverter.h"
 5 | #include "KeyFileReader.h"
 6 | #include "Share.h"
 7 | 
 8 | // Smoke test for HashConverter: loads the key list given as argv[1], dumps the first two
 9 | // vectors of image #0 before and after mean removal, uploads image #0 and prints the bucket
10 | // ids computed for its first point.
11 | int main(int argc, char *argv[]) {
12 |     if(argc < 2) {
13 |         std::cerr << "Usage: " << argv[0] << " <list.txt>\n";
14 |         return 1;
15 |     }
16 | 
17 |     ImageDevice d_Img;
18 |     KeyFileReader kf;
19 |     std::cout << "reading keylist\n";
20 |     kf.OpenKeyList(argv[1]);
21 | 
22 |     // Everything below indexes image #0 and its vectors #0 and #1.
23 |     if(kf.h_imageList_.empty() || kf.h_imageList_[0].cntPoint < 2) {
24 |         std::cerr << "The key list must start with an image holding at least two vectors\n";
25 |         return 1;
26 |     }
27 | 
28 |     std::cout << "Vector #0 in host image #0:\n";
29 |     dumpHostArray(&kf.h_imageList_[0].siftData(0, 0), kDimSiftData);
30 | 
31 |     std::cout << "vector #1 in image #0:\n";
32 |     dumpHostArray(&kf.h_imageList_[0].siftData(1, 0), kDimSiftData);
33 | 
34 |     std::cout << "Removing DC components..\n";
35 |     kf.ZeroMeanProc();
36 |     std::cout << "uploading image\n";
37 |     kf.UploadImage(d_Img, 0);
38 | 
39 |     std::cout << "Vector #0 in host image #0:\n";
40 |     dumpHostArray(&kf.h_imageList_[0].siftData(0, 0), kDimSiftData);
41 | 
42 |     std::cout << "Vector #0 in device image #0:\n";
43 |     dumpDeviceArray(&d_Img.siftData(0, 0), kDimSiftData);
44 | 
45 |     std::cout << "HashConverter instantiated\n";
46 |     HashConverter hc;
47 | 
48 |     std::cout << "Calculating comphash...\n";
49 |     hc.CompHash(d_Img);
50 |     CUDA_CHECK_ERROR;
51 |     std::cout << "Constructing buckets...\n";
52 |     hc.BucketHash(d_Img);
53 | 
54 |     std::cout << "Bucket information for image #0:\n";
55 |     dumpDeviceArray(&d_Img.bucketIDList(0, 0), kCntBucketGroup);
56 | 
57 |     cudaFree(d_Img.compHashData.elements);
58 |     CUDA_CHECK_ERROR;
59 |     cudaFree(d_Img.siftData.elements);
60 |     CUDA_CHECK_ERROR;
61 | 
62 |     return 0;
63 | }
50 | 


--------------------------------------------------------------------------------
/src/TestHashMatcher.cpp:
--------------------------------------------------------------------------------
 1 | #include <iostream>
 2 | #include <cuda_runtime.h>
 3 | 
 4 | #include "HashMatcher.h"
 5 | #include "HashConverter.h"
 6 | #include "KeyFileReader.h"
 7 | #include "Share.h"
 8 | 
 9 | // Smoke test for HashMatcher: loads the key list given as argv[1], then uploads, hashes and
10 | // adds the images one by one, printing the match count against every earlier image.
11 | int main(int argc, char *argv[]) {
12 |     if(argc < 2) {
13 |         std::cerr << "Usage: " << argv[0] << " <list.txt>\n";
14 |         return 1;
15 |     }
16 |     
17 |     KeyFileReader kf;
18 |     std::cerr << "reading keylist\n";
19 |     kf.OpenKeyList(argv[1]);
20 |     std::cerr << "preprocessing to zero-mean vectors\n";
21 |     kf.ZeroMeanProc();
22 | 
23 |     std::cerr << "initialing cuda device\n";
24 |     cudaSetDevice(0);
25 | 
26 |     std::cerr << "filling hash matrix" << '\n';
27 |     HashConverter hc;
28 | 
29 |     HashMatcher hm;
30 | 
31 |     for (int i = 0; i < kf.cntImage; i++) {
32 |         ImageDevice curImg;
33 | 
34 |         std::cerr << "------------------\nuploading image " << i << "\n";
35 |         kf.UploadImage(curImg, i);
36 | 
37 |         std::cerr << "Converting hash values\n";
38 |         hc.CalcHashValues(curImg);
39 |         //dumpDeviceArray(&curImg.compHashData(0, 0), 2);
40 |         //dumpDeviceArray(&curImg.bucketIDList(0, 0), 6);
41 | 
42 |         std::cerr << "Adding image to hashmatcher\n";
43 |         hm.AddImage(curImg);
44 | 
45 |         for(int j = 0; j < i; j++) {
46 |             std::cerr << hm.NumberOfMatch(i, j) << " match(es) found between image " << i << " and " << j << "\n";
47 | #ifdef DEBUG_HASH_MATCHER
48 |             MatchPairListPtr mpList = hm.MatchPairList(i, j);
49 |             for(MatchPairList_t::iterator it = mpList->begin(); it != mpList->end(); it++) {
50 |                 std::cerr << "(" << it->first << ", " << it->second << ") ";
51 |             }
52 |             std::cerr << std::endl;
53 | #endif
54 |         }
55 | 
56 |     }
57 | 
58 |     return 0;
59 | }
54 | 


--------------------------------------------------------------------------------
/src/TestKeyFileReader.cpp:
--------------------------------------------------------------------------------
 1 | #include <Share.h>
 2 | #include <KeyFileReader.h>
 3 | 
 4 | // Smoke test for KeyFileReader: loads the key list given as argv[1], removes the mean and
 5 | // uploads every image to the device once. The second argument is required but not used.
 6 | int main(int argc, char *argv[]) {
 7 |     KeyFileReader kfr;
 8 | 
 9 |     if(argc != 3) {
10 |         fprintf(stderr, "Usage: %s <list.txt> outfile\n", argv[0]);
11 |         exit(EXIT_FAILURE);
12 |     }
13 | 
14 |     kfr.OpenKeyList(argv[1]);
15 |     kfr.ZeroMeanProc();
16 | 
17 |     ImageDevice d_Img;
18 |     for(int i = 0; i < kfr.cntImage; i++) {
19 |         kfr.UploadImage(d_Img, i);
20 |         // UploadImage() allocates a new device buffer on every call; free it before the
21 |         // next iteration overwrites the only pointer to it.
22 |         cudaFree(d_Img.siftData.elements);
23 |     }
24 | 
25 |     return 0;
26 | }
22 |  
23 | 


--------------------------------------------------------------------------------
/src/main.cpp:
--------------------------------------------------------------------------------
 1 | #include <cstdio>
 2 | #include <cstdlib>
 3 | #include <iostream>
 4 | 
 5 | #include "KeyFileReader.h"
 6 | #include "HashConverter.h"
 7 | #include "HashMatcher.h"
 8 | 
 9 | // Command-line driver: reads the key list argv[1], matches every image against all earlier
10 | // ones on the GPU and writes the matches to argv[2]. For each image pair the output holds the
11 | // line "<earlier> <later>", the pair count, and one "<target point> <query point>" line per match.
12 | // The elapsed GPU time is printed to stderr.
13 | int main(int argc, char **argv) {
14 |     if(argc != 3) {
15 |         fprintf(stderr, "Usage: %s <list.txt> outfile\n", argv[0]);
16 |         exit(EXIT_FAILURE);
17 |     }
18 | 
19 |     KeyFileReader keyFileReader;
20 |     keyFileReader.OpenKeyList(argv[1]);
21 |     keyFileReader.ZeroMeanProc();
22 | 
23 |     std::cerr << "Initializing CUDA device...\n";
24 |     cudaSetDevice(0);
25 | 
26 |     HashConverter hashConverter;
27 |     HashMatcher hashMatcher;
28 | 
29 |     // Fail before any GPU work is queued if the results cannot be written.
30 |     FILE *outFile = fopen(argv[2], "w");
31 |     if(outFile == NULL) {
32 |         fprintf(stderr, "Can't open output file %s for writing!\n", argv[2]);
33 |         exit(EXIT_FAILURE);
34 |     }
35 | 
36 |     cudaEvent_t start, stop;
37 |     cudaEventCreate(&start);
38 |     cudaEventCreate(&stop);
39 | 
40 |     cudaEventRecord(start);
41 | 
42 |     for(int imageIndex = 0; imageIndex < keyFileReader.cntImage; imageIndex++) {
43 |         ImageDevice newImage;
44 | 
45 |         std::cerr << "---------------------\nUploading image #" << imageIndex << " to GPU...\n";
46 |         cudaEvent_t kfFinishEvent = keyFileReader.UploadImageAsync(newImage, imageIndex);
47 | 
48 |         std::cerr << "Calculating compressed Hash Values for image #" << imageIndex << "\n"; 
49 |         cudaEvent_t hcFinishEvent = hashConverter.CalcHashValuesAsync(newImage, kfFinishEvent);
50 | 
51 |         std::cerr << "Matching image #" << imageIndex << " with previous images...\n";
52 |         cudaEvent_t hmFinishEvent = hashMatcher.AddImageAsync(newImage, hcFinishEvent);
53 | 
54 |         for(int imageIndex2 = 0; imageIndex2 < imageIndex; imageIndex2++) {
55 |             MatchPairListPtr mpList = hashMatcher.MatchPairList(imageIndex, imageIndex2);
56 |             int pairCount = hashMatcher.NumberOfMatch(imageIndex, imageIndex2);
57 | 
58 |             fprintf(outFile, "%d %d\n%d\n", imageIndex2, imageIndex, pairCount);
59 | 
60 |             for(MatchPairList_t::iterator it = mpList->begin(); it != mpList->end(); it++) {
61 |                 fprintf(outFile, "%d %d\n", it->second, it->first);
62 |             }
63 |         }
64 | 
65 |         // Each stage returned a newly created event. The waits on them have already been
66 |         // queued, so they can be released; destroying an unfinished event does not block.
67 |         // NOTE(review): assumes CalcHashValuesAsync() does not keep kfFinishEvent - confirm.
68 |         cudaEventDestroy(kfFinishEvent);
69 |         cudaEventDestroy(hcFinishEvent);
70 |         cudaEventDestroy(hmFinishEvent);
71 |     }
72 | 
73 |     cudaEventRecord(stop);
74 |     cudaEventSynchronize(stop);
75 | 
76 |     float timeElapsed;
77 |     cudaEventElapsedTime(&timeElapsed, start, stop);
78 |     std::cerr << "Time elapsed: " << timeElapsed << " ms\n";
79 | 
80 |     cudaEventDestroy(start);
81 |     cudaEventDestroy(stop);
82 | 
83 |     fclose(outFile);
84 | 
85 |     return 0;
86 | }
72 | 


--------------------------------------------------------------------------------