├── .gitignore ├── .gitmodules ├── CMakeLists.txt ├── README.org ├── include ├── HashConverter.h ├── HashMatcher.h ├── KeyFileReader.h └── Share.h ├── job.sh.in └── src ├── HashConverter.cpp ├── HashConverter.cu ├── HashMatcher.cpp ├── HashMatcher.cu ├── KeyFileReader.cpp ├── TestHashConverter.cpp ├── TestHashMatcher.cpp ├── TestKeyFileReader.cpp └── main.cpp /.gitignore: -------------------------------------------------------------------------------- 1 | *.html 2 | build/ 3 | /compile_commands.json 4 | test/ 5 | *~ 6 | *.user 7 | -------------------------------------------------------------------------------- /.gitmodules: -------------------------------------------------------------------------------- 1 | [submodule "third_party/cub"] 2 | path = third_party/cub 3 | url = git@github.com:NVlabs/cub.git 4 | -------------------------------------------------------------------------------- /CMakeLists.txt: -------------------------------------------------------------------------------- 1 | project(CasHash-CUDA) 2 | cmake_minimum_required(VERSION 3.1) 3 | 4 | if(${CMAKE_SOURCE_DIR} STREQUAL ${CMAKE_BINARY_DIR}) 5 | message(FATAL_ERROR "In-source builds not allowed. Please make a seperate directory and run cmake from there.") 6 | endif() 7 | 8 | if(NOT CMAKE_BUILD_TYPE) 9 | set(CMAKE_BUILD_TYPE "Debug") 10 | endif() 11 | 12 | string(TOLOWER "${CMAKE_BUILD_TYPE}" cmake_build_type_tolower) 13 | if( NOT cmake_build_type_tolower STREQUAL "debug" 14 | AND NOT cmake_build_type_tolower STREQUAL "release" 15 | AND NOT cmake_build_type_tolower STREQUAL "relwithdebinfo") 16 | message(FATAL_ERROR "Unknown build type \"${CMAKE_BUILD_TYPE}\". Allowed values are Debug, Release, RelWithDebInfo (case-insensitive).") 17 | endif() 18 | 19 | if(cmake_build_type_tolower STREQUAL "debug") 20 | list(APPEND CMAKE_CXX_FLAGS "-Wall -Wno-long-long") 21 | #add_definitions(-DDEBUG_HASH_MATCHER) 22 | #add_definitions(-DDEBUG_HASH_CONVERTER2) 23 | endif() 24 | 25 | find_package(CUDA REQUIRED) 26 | 27 | # Kepler 28 | #list(APPEND CUDA_NVCC_FLAGS "-gencode arch=compute_30,code=sm_30") 29 | list(APPEND CUDA_NVCC_FLAGS "-gencode arch=compute_35,code=sm_35") 30 | 31 | # C++0x support 32 | list(APPEND CUDA_NVCC_FLAGS "-std=c++11") 33 | set(CMAKE_CXX_STANDARD 11) 34 | 35 | # export compile commands so that our auto-completion system can index the source files 36 | set(CMAKE_EXPORT_COMPILE_COMMANDS 1) 37 | 38 | include_directories(${PROJECT_SOURCE_DIR}/include) 39 | 40 | include_directories(${PROJECT_SOURCE_DIR}/third_party/cub) 41 | 42 | #cuda_add_executable(KeyMatchCUDA 43 | # src/main.cpp 44 | # src/KeyFileReader.cpp 45 | # ) 46 | 47 | cuda_add_executable(TestKeyFileReader 48 | src/TestKeyFileReader.cpp 49 | src/KeyFileReader.cpp 50 | ) 51 | 52 | cuda_add_executable(TestHashConverter 53 | src/TestHashConverter.cpp 54 | src/KeyFileReader.cpp 55 | src/HashConverter.cpp 56 | src/HashConverter.cu 57 | ) 58 | 59 | target_link_libraries( 60 | TestHashConverter 61 | curand) 62 | 63 | cuda_add_executable(TestHashMatcher 64 | src/TestHashMatcher.cpp 65 | src/KeyFileReader.cpp 66 | src/HashConverter.cpp 67 | src/HashConverter.cu 68 | src/HashMatcher.cpp 69 | src/HashMatcher.cu 70 | ) 71 | 72 | target_link_libraries( 73 | TestHashMatcher 74 | curand) 75 | 76 | cuda_add_executable(KeyMatchCUDA 77 | src/main.cpp 78 | src/KeyFileReader.cpp 79 | src/HashConverter.cpp 80 | src/HashConverter.cu 81 | src/HashMatcher.cpp 82 | src/HashMatcher.cu 83 | ) 84 | 85 | target_link_libraries( 86 | KeyMatchCUDA 87 | curand) 88 | 89 | # For HHLR 90 | configure_file(job.sh.in job.sh @ONLY) 91 | -------------------------------------------------------------------------------- /README.org: -------------------------------------------------------------------------------- 1 | #+TITLE: CasHash-CUDA 2 | 3 | * Description 4 | 5 | This project provides a library for GPU acclearated SIFT feature matching between images. 6 | 7 | *How fast is it?* 8 | 9 | According to our benchmark on Tesla K20 GPU, this algorithm can reach 30 fps pairing 30 images each with ~2000 sift vectors at a time. 10 | 11 | *What for?* 12 | 13 | This program can be used as a frontend for online image matching as well as large scale 3D reconstruction. 14 | 15 | *Related Publication* 16 | 17 | Cheng Jian, Cong Leng, Jiaxiang Wu, Hainan Cui, and Hanqing Lu. "Fast and accurate image matching with cascade hashing for 3d reconstruction." In IEEE Conference on Computer Vision and Pattern Recognition (CVPR2014), pp. 1-8. 2014. 18 | 19 | * Installation 20 | 21 | 22 | #+BEGIN_EXAMPLE 23 | git clone git@github.com:cvcore/cashash_cuda.git cashash_cuda 24 | git submodule init 25 | cd cashash_cuda 26 | mkdir build && cd build 27 | cmake .. 28 | make 29 | #+END_EXAMPLE 30 | 31 | * Usage 32 | 33 | - Input :: A list of path storing SIFT keyfeatures extracted from the images. 34 | - Output :: Match pairs. 35 | 36 | Sole command: 37 | 38 | #+BEGIN_EXAMPLE 39 | ./KeyMatchCUDA 40 | #+END_EXAMPLE 41 | 42 | On Lichtweise Cluster: 43 | 44 | Extract dataset file into cashash_cuda/dataset, then in build folder, run: 45 | #+BEGIN_EXAMPLE 46 | sbatch job.sh 47 | #+END_EXAMPLE 48 | 49 | You can download the dataset here: 50 | 51 | https://www.dropbox.com/s/ur6l6oigyxfzgrp/cashash_cuda_dataset.zip?dl=0 52 | 53 | * Todo 54 | 55 | - ✅ SIFT Vector Preprocessing & CPU Storage 56 | - ✅ Load vectors in all images. 57 | - ✅ Stream loading with cuda stream and asynchronious functions. 58 | - Device supports concurrent kernel execution & has 2 async engines 59 | - ✅ Update all SIFT Vectors to become zero mean 60 | - Stream preprocessing 61 | - 1000 images * 2000 sift vectors * 128 dim * 4 byte = 976MiB (We have two GPUs of 5GiB global memory in cluster) 62 | - ✅ Hash Calculation 63 | - ✅ Hash Remapping 64 | - For remapping into 128d Hamming space, we use 1x128 grids. 65 | - ✅ Bucket Generating 66 | - For bucketing, we use 6x8 grids. 67 | - ✅ Bucket Storage 68 | - Bucket Information: 6 bucket group * 2000 vectors * 1000 images * 2 byte = ~24MiB 69 | - Remapped vector: 2000 vectors * 1000 images * 16 byte = ~31MiB 70 | - ✅ Matching 71 | - ✅ Use =__device__ int __popcll(unsigned long long int x)= for sorting mapped hash values 72 | - ✅ Query all vectors according to bucket information stored in previous step 73 | - ❌ Check multiple image pairs simultaneously 74 | -------------------------------------------------------------------------------- /include/HashConverter.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | #include "Share.h" 5 | 6 | class HashConverter { 7 | public: 8 | HashConverter(); 9 | ~HashConverter(); 10 | void CompHash(ImageDevice &d_Image, cudaStream_t stream = 0); 11 | void BucketHash(ImageDevice &d_Image, cudaStream_t stream = 0); 12 | void CalcHashValues(ImageDevice &d_Image); 13 | cudaEvent_t CalcHashValuesAsync(ImageDevice &d_Image, cudaEvent_t sync = NULL); 14 | 15 | private: 16 | void FillHashingMatrixCuRand(); 17 | void FillHashingMatrixCMath(); 18 | void FillHashingMatrixExternal(char const *path); 19 | float GetNormRand(void); 20 | 21 | Matrix d_projMatHamming_; // Matrix for 128-bit hamming vector, width = kDimSiftData 22 | Matrix d_projMatBucket_; // Same structure as d_projMatHamming but we chose to use only 6*8 = 48 bit from it. 23 | cudaStream_t hashConverterStream_; 24 | }; 25 | -------------------------------------------------------------------------------- /include/HashMatcher.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | #include "Share.h" 5 | #include 6 | 7 | const BucketEle_t INVALID_CANDIDATE = ~0; 8 | const int MAX_COMPHASH_DISTANCE = ~(1 << (sizeof(int) * 8 - 1)); 9 | const float MAX_SIFT_DISTANCE = 1.0e38f; 10 | const int POSSIBLE_CANDIDATES = 8; 11 | const int HASH_MATCHER_BLOCK_SIZE = 32; 12 | const int HASH_MATCHER_ITEMS_PER_THREAD = 2; 13 | 14 | class HashMatcher { 15 | public: 16 | HashMatcher(); 17 | ~HashMatcher(); 18 | int NumberOfMatch(int queryImageIndex, int targetImageIndex); 19 | MatchPairListPtr MatchPairList(int queryImageIndex, int targetImageIndex); 20 | void AddImage(const ImageDevice &d_Image); /* return value: image index */ 21 | cudaEvent_t AddImageAsync(const ImageDevice &d_Image, cudaEvent_t sync = NULL); 22 | 23 | private: 24 | std::vector d_imageList_; 25 | std::map< std::pair< int, int >, MatchPairListPtr > matchDataBase_; 26 | cudaStream_t hashMatcherStream_; 27 | 28 | cudaEvent_t GeneratePair(int queryImageIndex, int targetImageIndex); 29 | }; 30 | -------------------------------------------------------------------------------- /include/KeyFileReader.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | #include "Share.h" 5 | #include 6 | 7 | class KeyFileReader { 8 | public: 9 | KeyFileReader(); 10 | ~KeyFileReader(); 11 | void UploadImage(ImageDevice &imgDev, const int index); 12 | cudaEvent_t UploadImageAsync(ImageDevice &imgDev, const int index, cudaEvent_t sync = 0); 13 | void AddKeyFile(const char *path); 14 | void OpenKeyList(const char *path); 15 | void ZeroMeanProc(); 16 | 17 | int cntImage; 18 | 19 | std::vector h_imageList_; 20 | private: 21 | SiftData_t siftAccumulator_[kDimSiftData]; 22 | int cntTotalVector_; 23 | cudaStream_t keyFileReaderStream_; 24 | }; 25 | -------------------------------------------------------------------------------- /include/Share.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include 11 | #include 12 | #include 13 | 14 | #ifdef __CUDACC__ 15 | #define CUDA_UNIVERSAL_QUALIFIER __host__ __device__ 16 | #else 17 | #define CUDA_UNIVERSAL_QUALIFIER 18 | #endif 19 | 20 | const int kDimSiftData = 128; // the number of dimensions of SIFT feature 21 | const int kDimHashData = 128; // the number of dimensions of Hash code 22 | const int kBitInCompHash = 64; // the number of Hash code bits to be compressed; in this case, use a variable to represent 64 bits 23 | const int kDimCompHashData = kDimHashData / kBitInCompHash; // the number of dimensions of CompHash code 24 | const int kMinMatchListLen = 16; // the minimal list length for outputing SIFT matching result between two images 25 | const int kMaxCntPoint = 4000; // the maximal number of possible SIFT points; ensure this value is not exceeded in your application 26 | 27 | const int kCntBucketBit = 8; // the number of bucket bits 28 | const int kCntBucketGroup = 6; // the number of bucket groups 29 | const int kCntBucketPerGroup = 1 << kCntBucketBit; // the number of buckets in each group 30 | const int kMaxMemberPerGroup = 100; 31 | 32 | const int kCntCandidateTopMin = 6; // the minimal number of top-ranked candidates 33 | const int kCntCandidateTopMax = 10; // the maximal number of top-ranked candidates 34 | const int kMaxCandidatePerDist = 100; 35 | 36 | typedef float SiftData_t; // CUDA GPUs are optimized for float arithmetics, we use float instead of int 37 | typedef float* SiftDataPtr; 38 | typedef const float* SiftDataConstPtr; 39 | typedef uint8_t HashData_t; 40 | typedef uint8_t* HashDataPtr; // Hash code is represented with type; only the lowest bit is used 41 | typedef uint64_t CompHashData_t; 42 | typedef uint64_t* CompHashDataPtr; // CompHash code is represented with type 43 | typedef unsigned int BucketEle_t; 44 | typedef unsigned int* BucketElePtr; // index list of points in a specific bucket 45 | 46 | typedef std::pair MatchPair_t; 47 | typedef std::shared_ptr MatchPairPtr; 48 | 49 | typedef std::vector MatchPairList_t; 50 | typedef std::shared_ptr MatchPairListPtr; 51 | 52 | template 53 | struct Matrix { 54 | int width; 55 | int height; 56 | size_t pitch; // row size in bytes 57 | T* elements; 58 | 59 | CUDA_UNIVERSAL_QUALIFIER inline T& operator() (int i, int j) { 60 | return *(reinterpret_cast(reinterpret_cast(elements) + i * pitch) + j); 61 | } // no more ugly pointer calcs 62 | 63 | CUDA_UNIVERSAL_QUALIFIER inline const T& operator() (int i, int j) const { 64 | return *(reinterpret_cast(reinterpret_cast(elements) + i * pitch) + j); 65 | } 66 | 67 | Matrix(int H, int W) : height(H), width(W){ 68 | pitch = sizeof(T) * width; // init pitch, will be adjusted later if use cudaMallocPitch 69 | } 70 | 71 | Matrix() : width(0), height(0), pitch(0), elements(NULL) { 72 | } 73 | }; 74 | 75 | struct ImageHost { 76 | int cntPoint; // the number of SIFT points 77 | std::string keyFilePath; 78 | Matrix siftData; // [cntPoint x 128] Matrix, storing all sift vectors one-off 79 | 80 | }; 81 | 82 | struct ImageDevice { 83 | int cntPoint; 84 | Matrix siftData; 85 | Matrix compHashData; // [cntPoint x 2 Matrix] 86 | Matrix bucketIDList; // element -> buckets [cntPoint x kCntBucketGroup] 87 | Matrix bucketList; // bucket -> elements [kCntBucketGroup*kCntBucketPerGroup x kMaxMemberPerGroup] 88 | std::map targetCandidates; 89 | }; 90 | 91 | 92 | 93 | #define CUDA_CHECK_ERROR \ 94 | do { \ 95 | const cudaError_t err = cudaGetLastError(); \ 96 | if (err != cudaSuccess) { \ 97 | const char *const err_str = cudaGetErrorString(err); \ 98 | std::cerr << "Cuda error in " << __FILE__ << ":" << __LINE__ - 1 \ 99 | << ": " << err_str << " (" << err << ")" << std::endl; \ 100 | exit(EXIT_FAILURE); \ 101 | } \ 102 | } while(0) 103 | 104 | 105 | template< typename T > 106 | void check(T result, char const *const func, const char *const file, int const line) 107 | { 108 | if (result) 109 | { 110 | fprintf(stderr, "CUDA error at %s:%d code=%d(%s) \"%s\" \n", 111 | file, line, static_cast(result), cudaGetErrorString(result), func); 112 | cudaDeviceReset(); 113 | // Make sure we call CUDA Device Reset before exiting 114 | exit(EXIT_FAILURE); 115 | } 116 | } 117 | #define CUDA_CATCH_ERROR(val) check ( (val), #val, __FILE__, __LINE__) 118 | 119 | template 120 | inline void dumpDeviceArray(T const *d_Array, int count) { 121 | T *h_Array = new T[count]; 122 | cudaMemcpy(h_Array, d_Array, count * sizeof(T), cudaMemcpyDeviceToHost); 123 | CUDA_CHECK_ERROR; 124 | std::cout << "Dumping device array:\n"; 125 | for(int i = 0; i < count; i++) { 126 | std::cout << +h_Array[i] << ", "; 127 | } 128 | std::cout << "[ " << count << " element(s) ]\n"; 129 | delete [] h_Array; 130 | } 131 | 132 | template 133 | inline void dumpHostArray(T const *h_Array, int count) { 134 | std::cout << "Dumping host array:\n"; 135 | for(int i = 0; i < count; i++) { 136 | std::cout << +h_Array[i] << ", "; 137 | } 138 | std::cout << "[ " << count << " element(s) ]\n"; 139 | } 140 | 141 | -------------------------------------------------------------------------------- /job.sh.in: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | #SBATCH -J CasHash_CUDA 4 | #SBATCH -e @CMAKE_BINARY_DIR@/stderr.txt 5 | #SBATCH -o @CMAKE_BINARY_DIR@/stdout.txt 6 | #SBATCH -n 1 7 | #SBATCH --mem-per-cpu=1024 8 | #SBATCH -t 30 9 | #SBATCH --exclusive 10 | #SBATCH -p kurs2 11 | 12 | echo "This is Job $SLURM_JOB_ID" 13 | cd @CMAKE_BINARY_DIR@ 14 | ./KeyMatchCUDA ../dataset/list_huge.txt output_gpu.txt 15 | -------------------------------------------------------------------------------- /src/HashConverter.cpp: -------------------------------------------------------------------------------- 1 | #include "HashConverter.h" 2 | #include "Share.h" 3 | 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include 11 | 12 | HashConverter::HashConverter() { 13 | //Allocate matrix for hashing into 128d-Hamming space 14 | d_projMatHamming_.width = kDimSiftData; 15 | d_projMatHamming_.height = kDimHashData; 16 | cudaMallocPitch(&(d_projMatHamming_.elements), 17 | &(d_projMatHamming_.pitch), 18 | d_projMatHamming_.width * sizeof(SiftData_t), 19 | d_projMatHamming_.height); 20 | CUDA_CHECK_ERROR; 21 | 22 | d_projMatBucket_.width = kDimSiftData; 23 | d_projMatBucket_.height = kDimHashData; 24 | cudaMallocPitch(&(d_projMatBucket_.elements), 25 | &(d_projMatBucket_.pitch), 26 | d_projMatBucket_.width * sizeof(SiftData_t), 27 | d_projMatBucket_.height); 28 | CUDA_CHECK_ERROR; 29 | 30 | hashConverterStream_ = 0; 31 | 32 | FillHashingMatrixCuRand(); 33 | } 34 | 35 | HashConverter::~HashConverter(){ 36 | cudaFree(d_projMatHamming_.elements); 37 | cudaFree(d_projMatBucket_.elements); 38 | //cudaFree(d_projMatBucket_[0].elements); 39 | 40 | CUDA_CHECK_ERROR; 41 | } 42 | 43 | void HashConverter::FillHashingMatrixCuRand() { 44 | curandGenerator_t gen; 45 | 46 | curandCreateGenerator(&gen, CURAND_RNG_PSEUDO_DEFAULT); 47 | curandSetPseudoRandomGeneratorSeed(gen, 1234ULL); 48 | 49 | for(int i = 0; i < d_projMatHamming_.height; i++) { 50 | curandGenerateNormal(gen, &d_projMatHamming_(i, 0), kDimSiftData, 0, 1); 51 | } 52 | 53 | for(int i = 0; i < d_projMatBucket_.height; i++) { 54 | curandGenerateNormal(gen, &d_projMatBucket_(i, 0), kDimSiftData, 0, 1); 55 | } 56 | 57 | CUDA_CHECK_ERROR; 58 | 59 | #ifdef DEBUG_HASH_CONVERTER_RANDOM_MATRIX 60 | std::cout << "Device random matrix:\n"; 61 | dumpDeviceArray(&d_projMatBucket_(0, 0), 128); 62 | #endif 63 | 64 | } 65 | 66 | void HashConverter::FillHashingMatrixCMath() { 67 | SiftDataPtr tempRand = new SiftData_t[kDimSiftData]; 68 | 69 | for(int i = 0; i < d_projMatHamming_.height; i++) { 70 | for(int j = 0; j < kDimSiftData; j++) { 71 | tempRand[j] = GetNormRand(); 72 | } 73 | cudaMemcpy(&d_projMatHamming_(i, 0), tempRand, kDimSiftData * sizeof(SiftData_t), cudaMemcpyHostToDevice); 74 | } 75 | 76 | for(int i = 0; i < d_projMatHamming_.height; i++) { 77 | for(int j = 0; j < kDimSiftData; j++) { 78 | tempRand[j] = GetNormRand(); 79 | } 80 | cudaMemcpy(&d_projMatBucket_(i, 0), tempRand, kDimSiftData * sizeof(SiftData_t), cudaMemcpyHostToDevice); 81 | } 82 | 83 | #ifdef DEBUG_HASH_CONVERTER_RANDOM_MATRIX 84 | std::cout << "Device random matrix:\n"; 85 | dumpDeviceArray(&d_projMatBucket_(0, 0), 128); 86 | #endif 87 | 88 | } 89 | 90 | void HashConverter::FillHashingMatrixExternal(char const *path) { 91 | FILE *randomFp = fopen(path, "r"); 92 | if(!randomFp) { 93 | std::cerr << "Random matrix does not exist!\n"; 94 | exit(-1); 95 | } 96 | 97 | SiftDataPtr tempRand = new SiftData_t[kDimSiftData]; 98 | 99 | for(int i = 0; i < d_projMatHamming_.height; i++) { 100 | for(int j = 0; j < kDimSiftData; j++) { 101 | fscanf(randomFp, "%f", &tempRand[j]); 102 | } 103 | cudaMemcpy(&d_projMatHamming_(i, 0), tempRand, kDimSiftData * sizeof(SiftData_t), cudaMemcpyHostToDevice); 104 | } 105 | 106 | for(int i = 0; i < kCntBucketGroup * kCntBucketBit; i++) { 107 | for(int j = 0; j < kDimSiftData; j++) { 108 | fscanf(randomFp, "%f", &tempRand[j]); 109 | } 110 | cudaMemcpy(&d_projMatBucket_(i, 0), tempRand, kDimSiftData * sizeof(SiftData_t), cudaMemcpyHostToDevice); 111 | } 112 | 113 | delete [] tempRand; 114 | fclose(randomFp); 115 | } 116 | 117 | void HashConverter::CalcHashValues(ImageDevice &d_Image){ 118 | CompHash(d_Image); 119 | BucketHash(d_Image); 120 | } 121 | 122 | cudaEvent_t HashConverter::CalcHashValuesAsync(ImageDevice &d_Image, cudaEvent_t sync) { 123 | if(hashConverterStream_ == 0) { 124 | cudaStreamCreate(&hashConverterStream_); 125 | } 126 | 127 | if(sync) { 128 | cudaStreamWaitEvent(hashConverterStream_, sync, 0); 129 | } 130 | 131 | CompHash(d_Image, hashConverterStream_); 132 | BucketHash(d_Image, hashConverterStream_); 133 | 134 | cudaEvent_t finish; 135 | cudaEventCreate(&finish); 136 | cudaEventRecord(finish, hashConverterStream_); 137 | 138 | return finish; 139 | } 140 | 141 | float HashConverter::GetNormRand(void) { 142 | // based on Box-Muller transform; for more details, please refer to the following WIKIPEDIA website: 143 | // http://en.wikipedia.org/wiki/Box_Muller_transform 144 | float u1 = (rand() % 1000 + 1) / 1000.0; 145 | float u2 = (rand() % 1000 + 1) / 1000.0; 146 | 147 | float randVal = sqrt(-2 * log(u1)) * cos(2 * acos(-1.0) * u2); 148 | 149 | return randVal; 150 | } 151 | -------------------------------------------------------------------------------- /src/HashConverter.cu: -------------------------------------------------------------------------------- 1 | #include "HashConverter.h" 2 | #include "Share.h" 3 | 4 | __global__ void CompHashKernel(Matrix g_sift, const Matrix g_projMat, Matrix g_compHash) { 5 | __shared__ float s_siftCur[kDimSiftData]; // shared sift vector 6 | __shared__ uint32_t s_hashBits[kDimHashData + 16]; 7 | SiftDataPtr g_siftCur = &g_sift(blockIdx.x, 0); 8 | SiftDataConstPtr g_projMatCur = &g_projMat(threadIdx.x, 0); 9 | int tx = threadIdx.x; 10 | int bx = blockIdx.x; 11 | 12 | s_siftCur[tx] = g_siftCur[tx]; // we can do this because kDimSiftData == kBitInCompHash, otherwise we need to setup a if condition 13 | __syncthreads(); 14 | 15 | float element = 0.f; 16 | for(int i = 0; i < kDimSiftData; i++) { 17 | element = element + s_siftCur[i] * g_projMatCur[i]; 18 | } 19 | 20 | if(tx < 16) { 21 | s_hashBits[kDimHashData + tx] = 0; 22 | } 23 | 24 | uint32_t hashVal = static_cast(element > 0.f); 25 | hashVal <<= (tx % 32); 26 | s_hashBits[tx] = hashVal; 27 | __syncthreads(); 28 | 29 | //for(int stride = 2; stride <= 32; stride <<= 1) { 30 | // if(tx % stride == 0) { 31 | // s_hashBits[tx] += s_hashBits[tx + stride / 2]; 32 | // } 33 | //} 34 | 35 | /* dangerous reduction but no warp divergence, assuming warp size = 32 */ 36 | s_hashBits[tx] ^= s_hashBits[tx + 1]; 37 | s_hashBits[tx] ^= s_hashBits[tx + 2]; 38 | s_hashBits[tx] ^= s_hashBits[tx + 4]; 39 | s_hashBits[tx] ^= s_hashBits[tx + 8]; 40 | s_hashBits[tx] ^= s_hashBits[tx + 16]; 41 | 42 | __syncthreads(); 43 | 44 | if(tx % 64 == 0) { 45 | uint64_t halfCompHash = ((static_cast(s_hashBits[tx + 32]) << 32) + s_hashBits[tx]); 46 | g_compHash(bx, tx / 64) = halfCompHash; 47 | } 48 | } 49 | 50 | void HashConverter::CompHash( ImageDevice &d_Image, cudaStream_t stream ) { 51 | // d_Image.compHashData.width = 2; 52 | // d_Image.compHashData.height = d_Image.cntPoint; 53 | // cudaMallocPitch(&(d_Image.compHashData.elements), 54 | // &(d_Image.compHashData.pitch), 55 | // d_Image.compHashData.width, 56 | // d_Image.compHashData.height); 57 | 58 | d_Image.compHashData.width = 2; 59 | d_Image.compHashData.pitch = sizeof(CompHashData_t) * 2; 60 | d_Image.compHashData.height = d_Image.cntPoint; 61 | cudaMalloc(&(d_Image.compHashData.elements), 62 | d_Image.compHashData.pitch * d_Image.compHashData.height); 63 | CUDA_CHECK_ERROR; 64 | 65 | dim3 blockSize(kDimHashData); 66 | dim3 gridSize(d_Image.cntPoint); 67 | 68 | if(stream == 0) 69 | CompHashKernel<<>>(d_Image.siftData, 70 | d_projMatHamming_, 71 | d_Image.compHashData); 72 | else { 73 | CompHashKernel<<>>(d_Image.siftData, 74 | d_projMatHamming_, 75 | d_Image.compHashData); 76 | } 77 | 78 | CUDA_CHECK_ERROR; 79 | } 80 | 81 | __global__ void BucketHashKernel(Matrix g_sift, const Matrix g_projMat, Matrix g_bucketHash, Matrix g_bucketEle) { 82 | __shared__ float s_siftCur[kDimSiftData]; // shared sift vector 83 | __shared__ int s_hashBits[kDimHashData]; 84 | SiftDataPtr g_siftCur = &g_sift(blockIdx.x, 0); 85 | SiftDataConstPtr g_projMatCur = &g_projMat(threadIdx.x, 0); 86 | int tx = threadIdx.x; // hash group 87 | int bx = blockIdx.x; // sift vector index 88 | int idx = tx + bx * blockDim.x; 89 | 90 | s_siftCur[tx] = g_siftCur[tx]; // we can do this because kDimSiftData == kBitInCompHash, otherwise we need to setup a if condition 91 | if(idx < g_bucketEle.height) 92 | g_bucketEle(idx, 0) = 0; 93 | 94 | __syncthreads(); 95 | 96 | float element = 0.f; 97 | for(int i = 0; i < kDimSiftData; i++) { 98 | element = element + s_siftCur[i] * g_projMatCur[i]; 99 | } 100 | 101 | int hashVal = static_cast(element > 0.f); 102 | 103 | hashVal <<= tx % 8; 104 | s_hashBits[tx] = hashVal; 105 | __syncthreads(); 106 | 107 | for(int stride = 2; stride <= 8; stride <<= 1) { 108 | if(tx % stride == 0) { 109 | s_hashBits[tx] += s_hashBits[tx + stride / 2]; 110 | } 111 | } 112 | 113 | if(tx % 8 == 0 && tx / 8 < kCntBucketGroup) { 114 | hashVal = s_hashBits[tx]; 115 | g_bucketHash(bx, tx / 8) = hashVal; 116 | BucketElePtr baseAddr = &(g_bucketEle(kCntBucketPerGroup * tx / 8 + hashVal, 0)); 117 | int currIdx = atomicInc(baseAddr, kMaxMemberPerGroup) + 1; 118 | 119 | #ifdef DEBUG_HASH_CONVERTER 120 | printf("%d %d %d\n", tx / 8, hashVal, currIdx); 121 | if(currIdx == kMaxMemberPerGroup) { 122 | printf("Warning: bucket full! Consider increasing bucket #%d in group %d!\n", hashVal, tx / 8); 123 | } 124 | #endif 125 | 126 | g_bucketEle(kCntBucketPerGroup * tx / 8 + hashVal, currIdx) = bx; 127 | } 128 | } 129 | 130 | void HashConverter::BucketHash( ImageDevice &d_Image, cudaStream_t stream ) { 131 | d_Image.bucketIDList.width = kCntBucketGroup; 132 | d_Image.bucketIDList.height = d_Image.cntPoint; 133 | cudaMallocPitch(&(d_Image.bucketIDList.elements), 134 | &(d_Image.bucketIDList.pitch), 135 | d_Image.bucketIDList.width * sizeof(HashData_t), 136 | d_Image.bucketIDList.height); 137 | 138 | d_Image.bucketList.width = kMaxMemberPerGroup; 139 | d_Image.bucketList.height = kCntBucketGroup * kCntBucketPerGroup; 140 | cudaMallocPitch(&(d_Image.bucketList.elements), 141 | &(d_Image.bucketList.pitch), 142 | d_Image.bucketList.width * sizeof(BucketEle_t), 143 | d_Image.bucketList.height); 144 | 145 | 146 | //for(int i = 0; i < d_Image.bucketList.height; i++) { 147 | // cudaMemset(&(d_Image.bucketList(i, 0)), 148 | // 0, 149 | // sizeof(BucketEle_t)); 150 | // CUDA_CHECK_ERROR; 151 | //} 152 | 153 | //CUDA_CHECK_ERROR; 154 | 155 | // TODO bucketEle 156 | dim3 blockSize(kDimHashData); 157 | dim3 gridSize(d_Image.cntPoint); 158 | 159 | if(stream == 0) 160 | BucketHashKernel<<>>(d_Image.siftData, 161 | d_projMatBucket_, 162 | d_Image.bucketIDList, 163 | d_Image.bucketList); 164 | else { 165 | BucketHashKernel<<>>(d_Image.siftData, d_projMatBucket_, d_Image.bucketIDList, d_Image.bucketList); 166 | } 167 | 168 | #ifdef DEBUG_HASH_CONVERTER2 169 | for(int m = 0; m < kCntBucketGroup; m++) { 170 | for(int bucket = 0; bucket < kCntBucketPerGroup; bucket++) { 171 | BucketEle_t bucketSize; 172 | cudaMemcpy(&bucketSize, &(d_Image.bucketList(m * kCntBucketPerGroup + bucket, 0)), sizeof(BucketEle_t), cudaMemcpyDeviceToHost); 173 | std::cout << "Group: " << m << " Bucket: " << bucket << " Size: " << bucketSize << "\n"; 174 | } 175 | } 176 | CUDA_CHECK_ERROR; 177 | #endif 178 | 179 | } 180 | 181 | -------------------------------------------------------------------------------- /src/HashMatcher.cpp: -------------------------------------------------------------------------------- 1 | #include "HashMatcher.h" 2 | 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | 9 | HashMatcher::HashMatcher() { 10 | hashMatcherStream_ = 0; 11 | } 12 | 13 | HashMatcher::~HashMatcher() { 14 | 15 | } 16 | 17 | int PairListIndex(int imageIndex1, int imageIndex2) { 18 | if(imageIndex2 > imageIndex1) 19 | std::swap(imageIndex1, imageIndex2); 20 | 21 | return imageIndex1 * (imageIndex1 - 1) / 2 + imageIndex2; 22 | } 23 | 24 | void HashMatcher::AddImage(const ImageDevice &d_Image) { 25 | d_imageList_.push_back(d_Image); 26 | 27 | int currentImages = d_imageList_.size(); 28 | 29 | for(int imageIndex = 0; imageIndex < currentImages - 1; imageIndex++) { 30 | GeneratePair(currentImages - 1, imageIndex); // pair with all previous images 31 | // TODO pair with user-specified list 32 | } 33 | } 34 | 35 | cudaEvent_t HashMatcher::AddImageAsync(const ImageDevice &d_Image, cudaEvent_t sync) { 36 | if(hashMatcherStream_ == 0) { 37 | cudaStreamCreate(&hashMatcherStream_); 38 | } 39 | 40 | if(sync) { 41 | cudaStreamWaitEvent(hashMatcherStream_, sync, 0); 42 | } 43 | 44 | d_imageList_.push_back(d_Image); 45 | 46 | int currentImages = d_imageList_.size(); 47 | 48 | for(int imageIndex = 0; imageIndex < currentImages - 1; imageIndex++) { 49 | GeneratePair(currentImages - 1, imageIndex); // pair with all previous images 50 | // TODO pair with user-specified list 51 | } 52 | 53 | cudaEvent_t finish; 54 | cudaEventCreate(&finish); 55 | cudaEventRecord(finish, hashMatcherStream_); 56 | 57 | return finish; 58 | } 59 | 60 | int HashMatcher::NumberOfMatch(int queryImageIndex, int targetImageIndex) { 61 | return MatchPairList(queryImageIndex, targetImageIndex)->size(); 62 | } 63 | 64 | MatchPairListPtr HashMatcher::MatchPairList( int queryImageIndex, int targetImageIndex ) { 65 | auto queryTargetPair = std::make_pair(queryImageIndex, targetImageIndex); 66 | 67 | if(!matchDataBase_.count(queryTargetPair)) { 68 | ImageDevice &queryImage = d_imageList_[queryImageIndex]; 69 | BucketElePtr d_candidateArray = queryImage.targetCandidates[targetImageIndex]; 70 | BucketElePtr h_candidateArray = new BucketEle_t[queryImage.cntPoint]; 71 | 72 | cudaMemcpy(h_candidateArray, d_candidateArray, queryImage.cntPoint * sizeof(BucketEle_t), cudaMemcpyDeviceToHost); 73 | cudaFree(d_candidateArray); 74 | CUDA_CHECK_ERROR; 75 | 76 | MatchPairListPtr newMatchPairList(new MatchPairList_t); 77 | 78 | for(int point = 0; point < queryImage.cntPoint; point++) { 79 | if(h_candidateArray[point] != INVALID_CANDIDATE) { 80 | newMatchPairList->push_back(std::make_pair(point, h_candidateArray[point])); 81 | } 82 | } 83 | 84 | matchDataBase_[queryTargetPair] = newMatchPairList; 85 | } 86 | 87 | return matchDataBase_[queryTargetPair]; 88 | } 89 | -------------------------------------------------------------------------------- /src/HashMatcher.cu: -------------------------------------------------------------------------------- 1 | #include "HashMatcher.h" 2 | 3 | #include "cub/cub.cuh" 4 | 5 | 6 | struct DistIndexPair { 7 | SiftData_t dist; 8 | BucketEle_t index; 9 | }; 10 | 11 | struct MinDistOp { 12 | CUDA_UNIVERSAL_QUALIFIER DistIndexPair operator() (const DistIndexPair a, const DistIndexPair b) { 13 | return (a.dist < b.dist) ? a : b; 14 | } 15 | }; 16 | 17 | __global__ void GeneratePairKernel(Matrix g_queryImageBucketID, 18 | Matrix g_queryImageCompHashData, 19 | Matrix g_queryImageSiftData, 20 | int queryImageCntPoint, 21 | Matrix g_targetImageBucket, 22 | Matrix g_targetImageCompHashData, 23 | Matrix g_targetImageSiftData, 24 | BucketElePtr g_pairResult) { 25 | 26 | int candidate[kDimHashData + 1][kMaxCandidatePerDist]; // 6 * 1K, local memory 27 | int candidateLen[kDimHashData + 1]; 28 | bool candidateUsed[kMaxCntPoint]; 29 | int candidateTop[kCntCandidateTopMax]; 30 | int candidateTopLen = 0; 31 | 32 | int querySiftIndex = threadIdx.x + blockIdx.x * blockDim.x; 33 | 34 | if(querySiftIndex >= queryImageCntPoint) 35 | return; 36 | 37 | memset(candidateLen, 0, sizeof(candidateLen)); 38 | 39 | CompHashData_t currentCompHash[2]; 40 | currentCompHash[0] = g_queryImageCompHashData(querySiftIndex, 0); 41 | currentCompHash[1] = g_queryImageCompHashData(querySiftIndex, 1); 42 | 43 | #ifdef DEBUG_HASH_MATCHER1 44 | printf("current comphash: %lld %lld\n", currentCompHash[0], currentCompHash[1]); 45 | #endif 46 | 47 | for(int m = 0; m < kCntBucketGroup; m++) { 48 | 49 | HashData_t currentBucket = g_queryImageBucketID(querySiftIndex, m); 50 | BucketEle_t *targetBucket = &g_targetImageBucket(m * kCntBucketPerGroup + currentBucket, 0); 51 | int targetBucketElements = targetBucket[0]; 52 | 53 | for(int bucketIndex = 1; bucketIndex <= targetBucketElements; bucketIndex++) { 54 | 55 | int targetIndex = targetBucket[bucketIndex]; 56 | 57 | int dist = __popcll(currentCompHash[0] ^ g_targetImageCompHashData(targetIndex, 0)) + 58 | __popcll(currentCompHash[1] ^ g_targetImageCompHashData(targetIndex, 1)); 59 | candidate[dist][candidateLen[dist]++] = targetIndex; 60 | candidateUsed[targetIndex] = false; 61 | 62 | #ifdef DEBUG_HASH_MATCHER2 63 | printf("(%d %d) ", targetIndex, dist); 64 | #endif 65 | } 66 | } 67 | 68 | for(int dist = 0; dist <= kDimHashData; dist++) { 69 | for(int i = 0; i < candidateLen[dist]; i++) { 70 | int targetIndex = candidate[dist][i]; 71 | 72 | //if(blockIdx.x == 0 && threadIdx.x == 0) { 73 | // printf("%d ", targetIndex); 74 | //} 75 | 76 | if(!candidateUsed[targetIndex]) { 77 | candidateUsed[targetIndex] = true; 78 | candidateTop[candidateTopLen++] = targetIndex; 79 | if(candidateTopLen == kCntCandidateTopMax) 80 | break; 81 | } 82 | } 83 | 84 | if(candidateTopLen >= kCntCandidateTopMin) 85 | break; 86 | } 87 | 88 | SiftDataConstPtr queryImageSift = &g_queryImageSiftData(querySiftIndex, 0); 89 | 90 | double minVal1 = 0.0; 91 | int minValInd1 = -1; 92 | double minVal2 = 0.0; 93 | int minValInd2 = -1; 94 | 95 | for(int candidateListIndex = 0; candidateListIndex < candidateTopLen; candidateListIndex++) { 96 | 97 | int candidateIndex = candidateTop[candidateListIndex]; 98 | SiftDataConstPtr candidateSift = &g_targetImageSiftData(candidateIndex, 0); 99 | 100 | float candidateDist = 0.f; 101 | for(int i = 0; i < kDimSiftData; i++) { 102 | float diff = queryImageSift[i] - candidateSift[i]; 103 | candidateDist = candidateDist + diff * diff; 104 | } 105 | 106 | if (minValInd2 == -1 || minVal2 > candidateDist) { 107 | minVal2 = candidateDist; 108 | minValInd2 = candidateIndex; 109 | } 110 | 111 | if (minValInd1 == -1 || minVal1 > minVal2) { 112 | float minValTemp = minVal2; 113 | minVal2 = minVal1; 114 | minVal1 = minValTemp; 115 | int minValIndTemp = minValInd2; 116 | minValInd2 = minValInd1; 117 | minValInd1 = minValIndTemp; 118 | } 119 | } 120 | 121 | 122 | if (minVal1 < minVal2 * 0.32f) { 123 | g_pairResult[querySiftIndex] = minValInd1 + 1; 124 | } else { 125 | g_pairResult[querySiftIndex] = 0; 126 | } 127 | 128 | } 129 | 130 | template 131 | __global__ void GeneratePairKernelFast(Matrix g_queryImageBucketID, 132 | Matrix g_queryImageCompHashData, 133 | Matrix g_queryImageSiftData, 134 | const int queryImageCntPoint, 135 | Matrix g_targetImageBucket, 136 | Matrix g_targetImageCompHashData, 137 | Matrix g_targetImageSiftData, 138 | BucketElePtr g_pairResult) { 139 | 140 | typedef cub::BlockLoad LoadBucketVectorsT; 141 | typedef cub::BlockRadixSort SortVectorDistT; 142 | typedef cub::BlockReduce BlockReduceT; 143 | 144 | __shared__ union { 145 | typename LoadBucketVectorsT::TempStorage load; 146 | typename SortVectorDistT::TempStorage sort; 147 | typename BlockReduceT::TempStorage reduce; 148 | } tempStorage; 149 | 150 | /* in case of same candidate numbers being thrown from different bucket */ 151 | const int lastTopThreadsCnt = POSSIBLE_CANDIDATES / ITEMS_PER_THREAD; // FIXME: deal with demainders != 0 152 | __shared__ BucketEle_t s_lastTopVectors[POSSIBLE_CANDIDATES + 1]; 153 | 154 | /* Initialize lastTops as INVALID */ 155 | if(threadIdx.x < POSSIBLE_CANDIDATES + 1) { 156 | s_lastTopVectors[threadIdx.x] = INVALID_CANDIDATE; 157 | } 158 | __syncthreads(); 159 | 160 | BucketEle_t targetVectors[ITEMS_PER_THREAD]; 161 | int targetDists[ITEMS_PER_THREAD]; 162 | 163 | int queryIndex = blockIdx.x; 164 | CompHashData_t queryCompHash[2]; 165 | queryCompHash[0] = g_queryImageCompHashData(queryIndex, 0); 166 | queryCompHash[1] = g_queryImageCompHashData(queryIndex, 1); 167 | 168 | for(int group = 0; group < kCntBucketGroup; group++) { 169 | BucketElePtr currentBucketPtr = &g_targetImageBucket(g_queryImageBucketID(queryIndex, group), 0); 170 | int currentBucketSize = *currentBucketPtr; 171 | 172 | LoadBucketVectorsT(tempStorage.load).Load(currentBucketPtr + 1, targetVectors, currentBucketSize, INVALID_CANDIDATE); 173 | __syncthreads(); 174 | 175 | if(threadIdx.x >= blockDim.x - lastTopThreadsCnt) { 176 | int offset = (threadIdx.x - (blockDim.x - lastTopThreadsCnt)) * ITEMS_PER_THREAD; 177 | 178 | #pragma unroll 179 | for(int i = 0; i < ITEMS_PER_THREAD; i++) { 180 | targetVectors[i] = s_lastTopVectors[i + offset]; 181 | } 182 | } 183 | 184 | #pragma unroll 185 | for(int i = 0; i < ITEMS_PER_THREAD; i++) { 186 | BucketEle_t targetIndex = targetVectors[i]; 187 | 188 | if(targetIndex != INVALID_CANDIDATE) { 189 | targetDists[i] = __popcll(queryCompHash[0] ^ g_targetImageCompHashData(targetIndex, 0)) + 190 | __popcll(queryCompHash[1] ^ g_targetImageCompHashData(targetIndex, 1)); 191 | } else { 192 | targetDists[i] = MAX_COMPHASH_DISTANCE; 193 | } 194 | } 195 | 196 | SortVectorDistT(tempStorage.sort).Sort(targetDists, targetVectors, 0, 8); // end_bit = 8, maximum possible dist = 128 197 | 198 | if(threadIdx.x < lastTopThreadsCnt) { 199 | int offset = threadIdx.x * ITEMS_PER_THREAD; 200 | 201 | #pragma unroll 202 | for(int i = 0; i < ITEMS_PER_THREAD; i++) { 203 | s_lastTopVectors[i + offset] = targetVectors[i]; 204 | } 205 | } 206 | 207 | __syncthreads(); 208 | 209 | /* remove duplicated candidate. prerequisite: POSSIBLE_CANDIDATES < blockDim.x */ 210 | bool isDuplicate = false; 211 | 212 | if(threadIdx.x < POSSIBLE_CANDIDATES && (s_lastTopVectors[threadIdx.x] == s_lastTopVectors[threadIdx.x + 1])){ 213 | isDuplicate = true; 214 | } 215 | 216 | __syncthreads(); 217 | 218 | if(isDuplicate) { 219 | s_lastTopVectors[threadIdx.x] = INVALID_CANDIDATE; 220 | } 221 | 222 | __syncthreads(); 223 | } 224 | 225 | DistIndexPair candidate; 226 | candidate.dist = MAX_SIFT_DISTANCE; 227 | 228 | if(threadIdx.x < POSSIBLE_CANDIDATES) { 229 | candidate.index = s_lastTopVectors[threadIdx.x]; 230 | 231 | if(candidate.index != INVALID_CANDIDATE) { 232 | float dist = 0; 233 | SiftDataPtr querySiftVector = &g_queryImageSiftData(queryIndex, 0), 234 | targetSiftVector = &g_targetImageSiftData(candidate.index, 0); 235 | 236 | for(int i = 0; i < kDimSiftData; i++) { 237 | float diff = querySiftVector[i] - targetSiftVector[i]; 238 | dist += diff * diff; 239 | } 240 | 241 | candidate.dist = dist; 242 | } 243 | } 244 | 245 | DistIndexPair min1, min2; 246 | const MinDistOp minDistOp; 247 | 248 | min1 = BlockReduceT(tempStorage.reduce).Reduce(candidate, minDistOp, POSSIBLE_CANDIDATES); 249 | 250 | if(threadIdx.x == 0) { 251 | candidate.index = INVALID_CANDIDATE; 252 | candidate.dist = MAX_SIFT_DISTANCE; 253 | } 254 | 255 | min2 = BlockReduceT(tempStorage.reduce).Reduce(candidate, minDistOp, POSSIBLE_CANDIDATES); 256 | 257 | if(threadIdx.x == 0) { 258 | if(min1.dist < min2.dist * 0.32f) { 259 | g_pairResult[queryIndex] = min1.index; 260 | } else { 261 | g_pairResult[queryIndex] = INVALID_CANDIDATE; 262 | } 263 | } 264 | } 265 | 266 | cudaEvent_t HashMatcher::GeneratePair(int queryImageIndex, int targetImageIndex) { 267 | ImageDevice &queryImage = d_imageList_[queryImageIndex]; 268 | const ImageDevice &targetImage = d_imageList_[targetImageIndex]; 269 | 270 | BucketElePtr candidateArray; 271 | cudaMalloc(&candidateArray, sizeof(BucketEle_t) * queryImage.cntPoint); 272 | CUDA_CHECK_ERROR; 273 | 274 | queryImage.targetCandidates[targetImageIndex] = candidateArray; 275 | 276 | dim3 gridSize(queryImage.cntPoint); 277 | dim3 blockSize(HASH_MATCHER_BLOCK_SIZE); 278 | 279 | GeneratePairKernelFast<<>>( 280 | queryImage.bucketIDList, 281 | queryImage.compHashData, 282 | queryImage.siftData, 283 | queryImage.cntPoint, 284 | targetImage.bucketList, 285 | targetImage.compHashData, 286 | targetImage.siftData, 287 | candidateArray); 288 | 289 | cudaEvent_t finish; 290 | cudaEventCreate(&finish); 291 | cudaEventRecord(finish, hashMatcherStream_); 292 | 293 | return finish; 294 | } 295 | -------------------------------------------------------------------------------- /src/KeyFileReader.cpp: -------------------------------------------------------------------------------- 1 | #include "KeyFileReader.h" 2 | #include "Share.h" 3 | 4 | #include 5 | #include 6 | #include 7 | #include 8 | 9 | KeyFileReader::KeyFileReader() { 10 | std::memset(siftAccumulator_, 0, sizeof(siftAccumulator_)); 11 | keyFileReaderStream_ = 0; 12 | } 13 | 14 | KeyFileReader::~KeyFileReader() { 15 | std::vector::iterator it; 16 | for(it = h_imageList_.begin(); it != h_imageList_.end(); ++it) { 17 | delete[] it->siftData.elements; 18 | } 19 | } 20 | 21 | void KeyFileReader::AddKeyFile( const char *path ) { 22 | FILE *keyFile = fopen(path, "r"); 23 | if(keyFile == NULL) { 24 | fprintf(stderr, "Key file %s does not exist!\n", path); 25 | exit(EXIT_FAILURE); 26 | } 27 | fprintf(stderr, "Reading SIFT vector from %s\n", path); 28 | int cntPoint, cntDim; 29 | fscanf(keyFile, "%d%d", &cntPoint, &cntDim); 30 | if(cntDim != kDimSiftData) { 31 | fprintf(stderr, "Unsupported SIFT vector dimension %d, should be %d!\n", cntDim, kDimSiftData); 32 | exit(EXIT_FAILURE); 33 | } 34 | 35 | ImageHost newImage; 36 | newImage.cntPoint = cntPoint; 37 | newImage.keyFilePath = path; 38 | 39 | size_t requiredSize = cntPoint * cntDim; 40 | newImage.siftData.elements = new SiftData_t[requiredSize]; 41 | newImage.siftData.width = cntDim; 42 | newImage.siftData.height = cntPoint; 43 | newImage.siftData.pitch = cntDim * sizeof(SiftData_t); 44 | if( newImage.siftData.elements == NULL) { 45 | fprintf(stderr, "Can't allocate memory for host image!\n"); 46 | exit(EXIT_FAILURE); 47 | } 48 | 49 | for(int i = 0; i < cntPoint; i++) { 50 | fscanf(keyFile, "%*f%*f%*f%*f"); //ignoring sift headers 51 | SiftDataPtr rowVector = newImage.siftData.elements + kDimSiftData * i; 52 | for(int j = 0; j < kDimSiftData; j++) { 53 | fscanf(keyFile, "%f", &rowVector[j]); 54 | siftAccumulator_[j] = siftAccumulator_[j] + rowVector[j]; 55 | } 56 | cntTotalVector_++; 57 | } 58 | fclose(keyFile); 59 | h_imageList_.push_back(newImage); 60 | cntImage = h_imageList_.size(); 61 | } 62 | 63 | void KeyFileReader::OpenKeyList( const char *path ) { 64 | FILE *keyList = fopen(path, "r"); 65 | char keyFilePath[256]; 66 | if(keyList == NULL) { 67 | fprintf(stderr, "Keylist file %s does not exist!\n", path); 68 | exit(EXIT_FAILURE); 69 | } 70 | while(fscanf(keyList, "%s", keyFilePath) > 0) { 71 | AddKeyFile(keyFilePath); 72 | } 73 | fclose(keyList); 74 | } 75 | 76 | void KeyFileReader::ZeroMeanProc() { 77 | SiftData_t mean[kDimSiftData]; 78 | 79 | for(int i = 0; i < kDimSiftData; i++) { 80 | mean[i] = siftAccumulator_[i] / cntTotalVector_; 81 | } 82 | 83 | std::vector::iterator it; 84 | 85 | for(it = h_imageList_.begin(); it != h_imageList_.end(); ++it) { 86 | for(int i = 0; i < it->cntPoint; i++) { 87 | SiftDataPtr rowVector = &it->siftData(i, 0); 88 | for(int j = 0; j < kDimSiftData; j++) { 89 | rowVector[j] -= mean[j]; 90 | } 91 | } 92 | } 93 | } 94 | 95 | void KeyFileReader::UploadImage( ImageDevice &d_Image, const int index ) { 96 | d_Image.cntPoint = h_imageList_[index].cntPoint; 97 | d_Image.siftData.width = kDimSiftData; 98 | d_Image.siftData.height = h_imageList_[index].cntPoint; 99 | 100 | cudaMallocPitch(&(d_Image.siftData.elements), 101 | &(d_Image.siftData.pitch), 102 | d_Image.siftData.width * sizeof(SiftData_t), 103 | d_Image.siftData.height); 104 | 105 | cudaMemcpy2D(d_Image.siftData.elements, 106 | d_Image.siftData.pitch, 107 | h_imageList_[index].siftData.elements, 108 | h_imageList_[index].siftData.pitch, 109 | h_imageList_[index].siftData.width * sizeof(SiftData_t), 110 | h_imageList_[index].siftData.height, 111 | cudaMemcpyHostToDevice); 112 | CUDA_CHECK_ERROR; 113 | } 114 | 115 | cudaEvent_t KeyFileReader::UploadImageAsync( ImageDevice &d_Image, const int index, cudaEvent_t sync ) { 116 | if(keyFileReaderStream_ == 0) { 117 | cudaStreamCreate(&keyFileReaderStream_); 118 | } 119 | 120 | if(sync) { 121 | cudaStreamWaitEvent(keyFileReaderStream_, sync, 0); 122 | } 123 | 124 | d_Image.cntPoint = h_imageList_[index].cntPoint; 125 | d_Image.siftData.width = kDimSiftData; 126 | d_Image.siftData.height = h_imageList_[index].cntPoint; 127 | 128 | cudaMallocPitch(&(d_Image.siftData.elements), 129 | &(d_Image.siftData.pitch), 130 | d_Image.siftData.width * sizeof(SiftData_t), 131 | d_Image.siftData.height); 132 | 133 | cudaMemcpy2DAsync(d_Image.siftData.elements, 134 | d_Image.siftData.pitch, 135 | h_imageList_[index].siftData.elements, 136 | h_imageList_[index].siftData.pitch, 137 | h_imageList_[index].siftData.width * sizeof(SiftData_t), 138 | h_imageList_[index].siftData.height, 139 | cudaMemcpyHostToDevice, 140 | keyFileReaderStream_); 141 | 142 | cudaEvent_t finish; 143 | cudaEventCreate(&finish); 144 | cudaEventRecord(finish, keyFileReaderStream_); 145 | 146 | CUDA_CHECK_ERROR; 147 | 148 | return finish; 149 | } 150 | -------------------------------------------------------------------------------- /src/TestHashConverter.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | 4 | #include "HashConverter.h" 5 | #include "KeyFileReader.h" 6 | #include "Share.h" 7 | 8 | int main(int argc, char *argv[]) { 9 | ImageDevice d_Img; 10 | KeyFileReader kf; 11 | std::cout << "reading keylist\n"; 12 | kf.OpenKeyList(argv[1]); 13 | 14 | std::cout << "Vector #0 in host image #0:\n"; 15 | dumpHostArray(&kf.h_imageList_[0].siftData(0, 0), kDimSiftData); 16 | 17 | std::cout << "vector #1 in image #0:\n"; 18 | dumpHostArray(&kf.h_imageList_[0].siftData(1, 0), kDimSiftData); 19 | 20 | std::cout << "Removing DC components..\n"; 21 | kf.ZeroMeanProc(); 22 | std::cout << "uploading image\n"; 23 | kf.UploadImage(d_Img, 0); 24 | 25 | std::cout << "Vector #0 in host image #0:\n"; 26 | dumpHostArray(&kf.h_imageList_[0].siftData(0, 0), kDimSiftData); 27 | 28 | std::cout << "Vector #0 in device image #0:\n"; 29 | dumpDeviceArray(&d_Img.siftData(0, 0), kDimSiftData); 30 | 31 | std::cout << "HashConverter instantiated\n"; 32 | HashConverter hc; 33 | 34 | std::cout << "Calculating comphash...\n"; 35 | hc.CompHash(d_Img); 36 | CUDA_CHECK_ERROR; 37 | std::cout << "Constructing buckets...\n"; 38 | hc.BucketHash(d_Img); 39 | 40 | std::cout << "Bucket information for image #0:\n"; 41 | dumpDeviceArray(&d_Img.bucketIDList(0, 0), kCntBucketGroup); 42 | 43 | cudaFree(d_Img.compHashData.elements); 44 | CUDA_CHECK_ERROR; 45 | cudaFree(d_Img.siftData.elements); 46 | CUDA_CHECK_ERROR; 47 | 48 | return 0; 49 | } 50 | -------------------------------------------------------------------------------- /src/TestHashMatcher.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | 4 | #include "HashMatcher.h" 5 | #include "HashConverter.h" 6 | #include "KeyFileReader.h" 7 | #include "Share.h" 8 | 9 | int main(int argc, char *argv[]) { 10 | 11 | KeyFileReader kf; 12 | std::cerr << "reading keylist\n"; 13 | kf.OpenKeyList(argv[1]); 14 | std::cerr << "preprocessing to zero-mean vectors\n"; 15 | kf.ZeroMeanProc(); 16 | 17 | std::cerr << "initialing cuda device\n"; 18 | cudaSetDevice(0); 19 | 20 | std::cerr << "filling hash matrix" << '\n'; 21 | HashConverter hc; 22 | 23 | HashMatcher hm; 24 | 25 | for (int i = 0; i < kf.cntImage; i++) { 26 | ImageDevice curImg; 27 | 28 | std::cerr << "------------------\nuploading image " << i << "\n"; 29 | kf.UploadImage(curImg, i); 30 | 31 | std::cerr << "Converting hash values\n"; 32 | hc.CalcHashValues(curImg); 33 | //dumpDeviceArray(&curImg.compHashData(0, 0), 2); 34 | //dumpDeviceArray(&curImg.bucketIDList(0, 0), 6); 35 | 36 | std::cerr << "Adding image to hashmatcher\n"; 37 | hm.AddImage(curImg); 38 | 39 | for(int j = 0; j < i; j++) { 40 | std::cerr << hm.NumberOfMatch(i, j) << " match(es) found between image " << i << " and " << j << "\n"; 41 | #ifdef DEBUG_HASH_MATCHER 42 | MatchPairListPtr mpList = hm.MatchPairList(i, j); 43 | for(MatchPairList_t::iterator it = mpList->begin(); it != mpList->end(); it++) { 44 | std::cerr << "(" << it->first << ", " << it->second << ") "; 45 | } 46 | std::cerr << std::endl; 47 | #endif 48 | } 49 | 50 | } 51 | 52 | return 0; 53 | } 54 | -------------------------------------------------------------------------------- /src/TestKeyFileReader.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | 4 | int main(int argc, char *argv[]) { 5 | KeyFileReader kfr; 6 | 7 | if(argc != 3) { 8 | fprintf(stderr, "Usage: %s outfile\n", argv[0]); 9 | exit(EXIT_FAILURE); 10 | } 11 | 12 | kfr.OpenKeyList(argv[1]); 13 | kfr.ZeroMeanProc(); 14 | 15 | ImageDevice d_Img; 16 | for(int i = 0; i < kfr.cntImage; i++) { 17 | kfr.UploadImage(d_Img, i); 18 | } 19 | 20 | return 0; 21 | } 22 | 23 | -------------------------------------------------------------------------------- /src/main.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | 5 | #include "KeyFileReader.h" 6 | #include "HashConverter.h" 7 | #include "HashMatcher.h" 8 | 9 | int main(int argc, char **argv) { 10 | if(argc != 3) { 11 | fprintf(stderr, "Usage: %s outfile\n", argv[0]); 12 | exit(EXIT_FAILURE); 13 | } 14 | 15 | KeyFileReader keyFileReader; 16 | keyFileReader.OpenKeyList(argv[1]); 17 | keyFileReader.ZeroMeanProc(); 18 | 19 | std::cerr << "Initializing CUDA device...\n"; 20 | cudaSetDevice(0); 21 | 22 | HashConverter hashConverter; 23 | HashMatcher hashMatcher; 24 | 25 | cudaEvent_t start, stop; 26 | cudaEventCreate(&start); 27 | cudaEventCreate(&stop); 28 | 29 | FILE *outFile = fopen(argv[2], "w"); 30 | 31 | cudaEventRecord(start); 32 | 33 | for(int imageIndex = 0; imageIndex < keyFileReader.cntImage; imageIndex++) { 34 | ImageDevice newImage; 35 | 36 | std::cerr << "---------------------\nUploading image #" << imageIndex << " to GPU...\n"; 37 | cudaEvent_t kfFinishEvent = keyFileReader.UploadImageAsync(newImage, imageIndex); 38 | 39 | std::cerr << "Calculating compressed Hash Values for image #" << imageIndex << "\n"; 40 | cudaEvent_t hcFinishEvent = hashConverter.CalcHashValuesAsync(newImage, kfFinishEvent); 41 | 42 | std::cerr << "Matching image #" << imageIndex << " with previous images...\n"; 43 | hashMatcher.AddImageAsync(newImage, hcFinishEvent); 44 | 45 | for(int imageIndex2 = 0; imageIndex2 < imageIndex; imageIndex2++) { 46 | MatchPairListPtr mpList = hashMatcher.MatchPairList(imageIndex, imageIndex2); 47 | int pairCount = hashMatcher.NumberOfMatch(imageIndex, imageIndex2); 48 | 49 | fprintf(outFile, "%d %d\n%d\n", imageIndex2, imageIndex, pairCount); 50 | 51 | for(MatchPairList_t::iterator it = mpList->begin(); it != mpList->end(); it++) { 52 | fprintf(outFile, "%d %d\n", it->second, it->first); 53 | } 54 | } 55 | 56 | } 57 | 58 | cudaEventRecord(stop); 59 | cudaEventSynchronize(stop); 60 | 61 | float timeElapsed; 62 | cudaEventElapsedTime(&timeElapsed, start, stop); 63 | std::cerr << "Time elapsed: " << timeElapsed << " ms\n"; 64 | 65 | cudaEventDestroy(start); 66 | cudaEventDestroy(stop); 67 | 68 | fclose(outFile); 69 | 70 | return 0; 71 | } 72 | --------------------------------------------------------------------------------