├── src └── CMakeLists.txt ├── .gitmodules ├── bench ├── workload_gen │ ├── ycsb_download.sh │ ├── gen_workload.sh │ ├── workload_spec │ │ ├── workloadc_email_latest │ │ ├── workloadc_email_zipfian │ │ ├── workloadc_randint_latest │ │ ├── workloadc_randint_zipfian │ │ ├── workloadc_email_uniform │ │ ├── workloadc_randint_uniform │ │ └── workload_template │ ├── gen_txn.py │ └── gen_load.py ├── CMakeLists.txt ├── filter.hpp ├── filter_factory.hpp ├── filter_bloom.hpp ├── filter_surf.hpp ├── run.sh ├── bench.hpp ├── workload_arf.cpp ├── bloom.hpp ├── MurmurHash3.h ├── workload_multi_thread.cpp └── workload.cpp ├── test ├── CMakeLists.txt └── unitTest │ ├── CMakeLists.txt │ ├── test_louds_sparse_small.cpp │ ├── test_surf_small.cpp │ ├── test_louds_dense_small.cpp │ ├── test_suffix_vector.cpp │ ├── test_select.cpp │ ├── test_rank.cpp │ ├── test_bitvector.cpp │ ├── test_label_vector.cpp │ └── test_suffix.cpp ├── .gitignore ├── .travis.yml ├── CMakeLists.txt ├── include ├── hash.hpp ├── config.hpp ├── rank.hpp ├── bitvector.hpp ├── select.hpp ├── label_vector.hpp ├── popcount.h ├── suffix.hpp └── surf.hpp ├── README.md ├── simple_example.cpp ├── CodeCoverage.cmake └── LICENSE /src/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_library(surf surf.cpp) 2 | -------------------------------------------------------------------------------- /.gitmodules: -------------------------------------------------------------------------------- 1 | [submodule "ARF"] 2 | path = ARF 3 | url = https://github.com/efficient/ARF.git 4 | branch = master 5 | -------------------------------------------------------------------------------- /bench/workload_gen/ycsb_download.sh: -------------------------------------------------------------------------------- 1 | mkdir ../workloads 2 | curl -O --location https://github.com/brianfrankcooper/YCSB/releases/download/0.12.0/ycsb-0.12.0.tar.gz 3 | tar xfvz ycsb-0.12.0.tar.gz 4 | rm ycsb-0.12.0.tar.gz 5 | mv ycsb-0.12.0 YCSB 6 | -------------------------------------------------------------------------------- /bench/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_executable(workload workload.cpp) 2 | target_link_libraries(workload) 3 | 4 | add_executable(workload_multi_thread workload_multi_thread.cpp) 5 | target_link_libraries(workload_multi_thread) 6 | 7 | #add_executable(workload_arf workload_arf.cpp) 8 | #target_link_libraries(workload_arf ARF) 9 | -------------------------------------------------------------------------------- /bench/workload_gen/gen_workload.sh: -------------------------------------------------------------------------------- 1 | #!bin/bash 2 | 3 | python gen_load.py randint uniform 4 | python gen_txn.py randint uniform 5 | python gen_txn.py randint zipfian 6 | #python gen_txn.py randint latest 7 | 8 | #python gen_load.py email uniform 9 | #python gen_txn.py email uniform 10 | #python gen_txn.py email zipfian 11 | #python gen_txn.py email latest 12 | 13 | 14 | -------------------------------------------------------------------------------- /test/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | find_package(GTest REQUIRED) 2 | include_directories(${GTEST_INCLUDE_DIR}) 3 | 4 | function (add_surf_test file_name ) 5 | add_executable(${file_name} ${file_name}.cpp) 6 | target_link_libraries(${file_name} gtest) 7 | add_test(NAME ${file_name} COMMAND ${CMAKE_CURRENT_BINARY_DIR}/${file_name}) 8 | 
endfunction() 9 | 10 | add_subdirectory(unitTest) -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Prerequisites 2 | *.d 3 | 4 | # Compiled Object files 5 | *.slo 6 | *.lo 7 | *.o 8 | *.obj 9 | 10 | # Precompiled Headers 11 | *.gch 12 | *.pch 13 | 14 | # Compiled Dynamic libraries 15 | *.so 16 | *.dylib 17 | *.dll 18 | 19 | # Fortran module files 20 | *.mod 21 | *.smod 22 | 23 | # Compiled Static libraries 24 | *.lai 25 | *.la 26 | *.a 27 | *.lib 28 | 29 | # Executables 30 | *.exe 31 | *.out 32 | *.app 33 | -------------------------------------------------------------------------------- /bench/filter.hpp: -------------------------------------------------------------------------------- 1 | #ifndef FILTER_H_ 2 | #define FILTER_H_ 3 | 4 | #include <cstdint> 5 | #include <string> 6 | 7 | namespace bench { 8 | 9 | class Filter { 10 | public: 11 | virtual bool lookup(const std::string& key) = 0; 12 | virtual bool lookupRange(const std::string& left_key, const std::string& right_key) = 0; 13 | virtual bool approxCount(const std::string& left_key, const std::string& right_key) = 0; 14 | virtual uint64_t getMemoryUsage() = 0; 15 | }; 16 | 17 | } // namespace bench 18 | 19 | #endif // FILTER_H_ 20 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | language: cpp 2 | sudo: required 3 | dist: xenial 4 | compiler: gcc 5 | 6 | install: 7 | - sudo apt-get install build-essential 8 | - sudo apt-get install cmake 9 | - sudo apt-get install libgtest-dev 10 | - cd /usr/src/gtest 11 | - sudo cmake CMakeLists.txt 12 | - sudo make 13 | - sudo cp *.a /usr/lib 14 | - sudo apt-get install lcov 15 | - sudo apt-get install ruby 16 | - sudo gem install coveralls-lcov 17 | 18 | script: 19 | - cd $TRAVIS_BUILD_DIR 20 | - mkdir build 21 | - cd build 22 | - cmake -DCMAKE_BUILD_TYPE=Debug -DCOVERALLS=ON ..
23 | - make -j 24 | - make coverage 25 | 26 | after_success: 27 | - lcov --remove coverage.info 'test/*' '/usr/*' '/lib/*' --output-file coverage.info 28 | - lcov --list coverage.info 29 | - coveralls-lcov --repo-token=${COVERALLS_TOKEN} coverage.info 30 | -------------------------------------------------------------------------------- /test/unitTest/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | find_package(GTest REQUIRED) 2 | include_directories(${GTEST_INCLUDE_DIR}) 3 | 4 | function (add_unit_test file_name) 5 | add_executable(${file_name} ${file_name}.cpp) 6 | target_link_libraries(${file_name} gtest) 7 | add_test(NAME ${file_name} 8 | COMMAND ${CMAKE_CURRENT_BINARY_DIR}/${file_name} 9 | WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}) 10 | endfunction() 11 | 12 | add_unit_test(test_bitvector) 13 | add_unit_test(test_label_vector) 14 | add_unit_test(test_louds_dense) 15 | add_unit_test(test_louds_dense_small) 16 | add_unit_test(test_louds_sparse) 17 | add_unit_test(test_louds_sparse_small) 18 | add_unit_test(test_rank) 19 | add_unit_test(test_select) 20 | add_unit_test(test_suffix) 21 | add_unit_test(test_surf) 22 | add_unit_test(test_surf_builder) 23 | add_unit_test(test_surf_small) 24 | 25 | -------------------------------------------------------------------------------- /CMakeLists.txt: -------------------------------------------------------------------------------- 1 | cmake_minimum_required (VERSION 2.6) 2 | project (SuRF) 3 | 4 | message(STATUS "Configuring..." ${CMAKE_PROJECT_NAME}) 5 | 6 | if (NOT CMAKE_BUILD_TYPE) 7 | set(CMAKE_BUILD_TYPE "Release") 8 | endif() 9 | 10 | set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS} -g -Wall -mpopcnt -pthread -std=c++11") 11 | set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS} -O3 -Wall -Werror -mpopcnt -pthread -std=c++11") 12 | 13 | option(COVERALLS "Generate coveralls data" OFF) 14 | 15 | if (COVERALLS) 16 | include("${CMAKE_CURRENT_SOURCE_DIR}/CodeCoverage.cmake") 17 | append_coverage_compiler_flags() 18 | set(COVERAGE_EXCLUDES 'ARF/*' 'bench/*' 'test/*' '/usr/*' '/lib/*') 19 | setup_target_for_coverage( 20 | NAME coverage 21 | EXECUTABLE make test 22 | ) 23 | else() 24 | add_definitions(-DNDEBUG) 25 | endif() 26 | 27 | enable_testing() 28 | 29 | include_directories("${CMAKE_CURRENT_SOURCE_DIR}/include") 30 | 31 | add_subdirectory(test) 32 | add_subdirectory(bench) 33 | 34 | #include_directories("${CMAKE_CURRENT_SOURCE_DIR}/ARF/include") 35 | #add_subdirectory(ARF) 36 | -------------------------------------------------------------------------------- /bench/filter_factory.hpp: -------------------------------------------------------------------------------- 1 | #ifndef FILTER_FACTORY_H_ 2 | #define FILTER_FACTORY_H_ 3 | 4 | #include "filter.hpp" 5 | #include "filter_bloom.hpp" 6 | #include "filter_surf.hpp" 7 | 8 | namespace bench { 9 | 10 | class FilterFactory { 11 | public: 12 | static Filter* createFilter(const std::string& filter_type, 13 | const uint32_t suffix_len, 14 | const std::vector& keys) { 15 | if (filter_type.compare(std::string("SuRF")) == 0) 16 | return new FilterSuRF(keys, surf::kNone, 0, 0); 17 | else if (filter_type.compare(std::string("SuRFHash")) == 0) 18 | return new FilterSuRF(keys, surf::kHash, suffix_len, 0); 19 | else if (filter_type.compare(std::string("SuRFReal")) == 0) 20 | return new FilterSuRF(keys, surf::kReal, 0, suffix_len); 21 | else if (filter_type.compare(std::string("SuRFMixed")) == 0) 22 | return new FilterSuRF(keys, surf::kMixed, suffix_len, suffix_len); 
23 | else if (filter_type.compare(std::string("Bloom")) == 0) 24 | return new FilterBloom(keys); 25 | else 26 | return new FilterSuRF(keys, surf::kReal, 0, suffix_len); // default 27 | } 28 | }; 29 | 30 | } // namespace bench 31 | 32 | #endif // FILTER_FACTORY_H 33 | -------------------------------------------------------------------------------- /bench/filter_bloom.hpp: -------------------------------------------------------------------------------- 1 | #ifndef FILTER_BLOOM_H_ 2 | #define FILTER_BLOOM_H_ 3 | 4 | #include 5 | #include 6 | 7 | #include "bloom.hpp" 8 | 9 | namespace bench { 10 | 11 | class FilterBloom : public Filter { 12 | public: 13 | // Requires that keys are sorted 14 | FilterBloom(const std::vector& keys) { 15 | filter_ = new BloomFilter(kBitsPerKey); 16 | filter_->CreateFilter(keys, keys.size(), &filter_data_); 17 | } 18 | 19 | ~FilterBloom() { 20 | delete filter_; 21 | } 22 | 23 | bool lookup(const std::string& key) { 24 | return filter_->KeyMayMatch(key, filter_data_); 25 | } 26 | 27 | bool lookupRange(const std::string& left_key, const std::string& right_key) { 28 | std::cout << kRed << "A Bloom filter does not support range queries\n" << kNoColor; 29 | return false; 30 | } 31 | 32 | bool approxCount(const std::string& left_key, const std::string& right_key) { 33 | std::cout << kRed << "A Bloom filter does not support approximate count queries\n" << kNoColor; 34 | return false; 35 | } 36 | 37 | uint64_t getMemoryUsage() { 38 | return filter_data_.size(); 39 | } 40 | 41 | private: 42 | int kBitsPerKey = 10; 43 | 44 | BloomFilter* filter_; 45 | std::string filter_data_; 46 | }; 47 | 48 | } // namespace bench 49 | 50 | #endif // FILTER_BLOOM_H 51 | -------------------------------------------------------------------------------- /bench/filter_surf.hpp: -------------------------------------------------------------------------------- 1 | #ifndef FILTER_SURF_H_ 2 | #define FILTER_SURF_H_ 3 | 4 | #include 5 | #include 6 | 7 | #include "surf.hpp" 8 | 9 | namespace bench { 10 | 11 | class FilterSuRF : public Filter { 12 | public: 13 | // Requires that keys are sorted 14 | FilterSuRF(const std::vector& keys, 15 | const surf::SuffixType suffix_type, 16 | const uint32_t hash_suffix_len, const uint32_t real_suffix_len) { 17 | // uses default sparse-dense size ratio 18 | filter_ = new surf::SuRF(keys, surf::kIncludeDense, surf::kSparseDenseRatio, 19 | suffix_type, hash_suffix_len, real_suffix_len); 20 | } 21 | 22 | ~FilterSuRF() { 23 | filter_->destroy(); 24 | delete filter_; 25 | } 26 | 27 | bool lookup(const std::string& key) { 28 | return filter_->lookupKey(key); 29 | } 30 | 31 | bool lookupRange(const std::string& left_key, const std::string& right_key) { 32 | //return filter_->lookupRange(left_key, false, right_key, false); 33 | return filter_->lookupRange(left_key, true, right_key, true); 34 | } 35 | 36 | bool approxCount(const std::string& left_key, const std::string& right_key) { 37 | return filter_->approxCount(left_key, right_key); 38 | } 39 | 40 | uint64_t getMemoryUsage() { 41 | return filter_->getMemoryUsage(); 42 | } 43 | 44 | private: 45 | surf::SuRF* filter_; 46 | }; 47 | 48 | } // namespace bench 49 | 50 | #endif // FILTER_SURF_H 51 | -------------------------------------------------------------------------------- /include/hash.hpp: -------------------------------------------------------------------------------- 1 | #ifndef HASH_H_ 2 | #define HASH_H_ 3 | 4 | #include 5 | 6 | namespace surf { 7 | 8 | //****************************************************** 9 | 
//HASH FUNCTION FROM LEVELDB 10 | //****************************************************** 11 | inline uint32_t DecodeFixed32(const char* ptr) { 12 | uint32_t result; 13 | memcpy(&result, ptr, sizeof(result)); // gcc optimizes this to a plain load 14 | return result; 15 | } 16 | 17 | inline uint32_t Hash(const char* data, size_t n, uint32_t seed) { 18 | // Similar to murmur hash 19 | const uint32_t m = 0xc6a4a793; 20 | const uint32_t r = 24; 21 | const char* limit = data + n; 22 | uint32_t h = seed ^ (n * m); 23 | 24 | // Pick up four bytes at a time 25 | while (data + 4 <= limit) { 26 | uint32_t w = DecodeFixed32(data); 27 | data += 4; 28 | h += w; 29 | h *= m; 30 | h ^= (h >> 16); 31 | } 32 | 33 | // Pick up remaining bytes 34 | switch (limit - data) { 35 | case 3: 36 | h += static_cast<unsigned char>(data[2]) << 16; 37 | case 2: 38 | h += static_cast<unsigned char>(data[1]) << 8; 39 | case 1: 40 | h += static_cast<unsigned char>(data[0]); 41 | h *= m; 42 | h ^= (h >> r); 43 | break; 44 | } 45 | return h; 46 | } 47 | 48 | inline uint32_t suffixHash(const std::string &key) { 49 | return Hash(key.c_str(), key.size(), 0xbc9f1d34); 50 | } 51 | 52 | inline uint32_t suffixHash(const char* key, const int keylen) { 53 | return Hash(key, keylen, 0xbc9f1d34); 54 | } 55 | 56 | } // namespace surf 57 | 58 | #endif // HASH_H_ 59 | 60 | -------------------------------------------------------------------------------- /bench/run.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | echo 'Bloom Filter, random int, point queries' 4 | ../build/bench/workload Bloom 1 mixed 50 0 randint point zipfian 5 | 6 | echo 'SuRF, random int, point queries' 7 | ../build/bench/workload SuRF 1 mixed 50 0 randint point zipfian 8 | 9 | echo 'SuRFHash, 4-bit suffixes, random int, point queries' 10 | ../build/bench/workload SuRFHash 4 mixed 50 0 randint point zipfian 11 | 12 | echo 'SuRFReal, 4-bit suffixes, random int, point queries' 13 | ../build/bench/workload SuRFReal 4 mixed 50 0 randint point zipfian 14 | 15 | echo 'SuRFMixed, 2-bit hash suffixes and 2-bit real suffixes, random int, point queries' 16 | ../build/bench/workload SuRFMixed 2 mixed 50 0 randint mix zipfian 17 | 18 | 19 | # echo 'Bloom Filter, email, point queries' 20 | # ../build/bench/workload Bloom 1 mixed 50 0 email point zipfian 21 | 22 | # echo 'SuRF, email, point queries' 23 | # ../build/bench/workload SuRF 1 mixed 50 0 email point zipfian 24 | 25 | # echo 'SuRFHash, 4-bit suffixes, email, point queries' 26 | # ../build/bench/workload SuRFHash 4 mixed 50 0 email point zipfian 27 | 28 | # echo 'SuRFReal, 4-bit suffixes, email, point queries' 29 | # ../build/bench/workload SuRFReal 4 mixed 50 0 email point zipfian 30 | 31 | # echo 'SuRFMixed, 2-bit hash suffixes and 2-bit real suffixes, email, point queries' 32 | # ../build/bench/workload SuRFMixed 2 mixed 50 0 email mix zipfian 33 | 34 | 35 | echo 'SuRFReal, 4-bit suffixes, random int, range queries' 36 | ../build/bench/workload SuRFReal 4 mixed 50 0 randint range zipfian 37 | 38 | # echo 'SuRFReal, 4-bit suffixes, email, range queries' 39 | # ../build/bench/workload SuRFReal 4 mixed 50 0 email range zipfian 40 | 41 | -------------------------------------------------------------------------------- /include/config.hpp: -------------------------------------------------------------------------------- 1 | #ifndef CONFIG_H_ 2 | #define CONFIG_H_ 3 | 4 | #include <cstdint> 5 | #include <cstring> 6 | 7 | namespace surf { 8 | 9 | using level_t = uint32_t; 10 | using position_t = uint32_t; 11 | static const position_t
kMaxPos = UINT32_MAX; 12 | 13 | using label_t = uint8_t; 14 | static const position_t kFanout = 256; 15 | 16 | using word_t = uint64_t; 17 | static const unsigned kWordSize = 64; 18 | static const word_t kMsbMask = 0x8000000000000000; 19 | static const word_t kOneMask = 0xFFFFFFFFFFFFFFFF; 20 | 21 | static const bool kIncludeDense = true; 22 | //static const uint32_t kSparseDenseRatio = 64; 23 | static const uint32_t kSparseDenseRatio = 16; 24 | static const label_t kTerminator = 255; 25 | 26 | static const int kHashShift = 7; 27 | 28 | static const int kCouldBePositive = 2018; // used in suffix comparison 29 | 30 | enum SuffixType { 31 | kNone = 0, 32 | kHash = 1, 33 | kReal = 2, 34 | kMixed = 3 35 | }; 36 | 37 | void align(char*& ptr) { 38 | ptr = (char*)(((uint64_t)ptr + 7) & ~((uint64_t)7)); 39 | } 40 | 41 | void sizeAlign(position_t& size) { 42 | size = (size + 7) & ~((position_t)7); 43 | } 44 | 45 | void sizeAlign(uint64_t& size) { 46 | size = (size + 7) & ~((uint64_t)7); 47 | } 48 | 49 | std::string uint64ToString(const uint64_t word) { 50 | uint64_t endian_swapped_word = __builtin_bswap64(word); 51 | return std::string(reinterpret_cast(&endian_swapped_word), 8); 52 | } 53 | 54 | uint64_t stringToUint64(const std::string& str_word) { 55 | uint64_t int_word = 0; 56 | memcpy(reinterpret_cast(&int_word), str_word.data(), 8); 57 | return __builtin_bswap64(int_word); 58 | } 59 | 60 | } // namespace surf 61 | 62 | #endif // CONFIG_H_ 63 | -------------------------------------------------------------------------------- /test/unitTest/test_louds_sparse_small.cpp: -------------------------------------------------------------------------------- 1 | #include "gtest/gtest.h" 2 | 3 | #include 4 | 5 | #include 6 | #include 7 | 8 | #include "config.hpp" 9 | #include "surf.hpp" 10 | 11 | namespace surf { 12 | 13 | namespace surftest { 14 | 15 | static const bool kIncludeDense = false; 16 | static const uint32_t kSparseDenseRatio = 0; 17 | static const SuffixType kSuffixType = kReal; 18 | static const level_t kSuffixLen = 8; 19 | 20 | class SuRFSmallTest : public ::testing::Test { 21 | public: 22 | virtual void SetUp () {} 23 | virtual void TearDown () {} 24 | }; 25 | 26 | TEST_F (SuRFSmallTest, ExampleInPaperTest) { 27 | std::vector keys; 28 | 29 | keys.push_back(std::string("f")); 30 | keys.push_back(std::string("far")); 31 | keys.push_back(std::string("fas")); 32 | keys.push_back(std::string("fast")); 33 | keys.push_back(std::string("fat")); 34 | keys.push_back(std::string("s")); 35 | keys.push_back(std::string("top")); 36 | keys.push_back(std::string("toy")); 37 | keys.push_back(std::string("trie")); 38 | keys.push_back(std::string("trip")); 39 | keys.push_back(std::string("try")); 40 | 41 | SuRFBuilder* builder = new SuRFBuilder(kIncludeDense, kSparseDenseRatio, kSuffixType, 0, kSuffixLen); 42 | builder->build(keys); 43 | LoudsSparse* louds_sparse = new LoudsSparse(builder); 44 | LoudsSparse::Iter iter(louds_sparse); 45 | 46 | louds_sparse->moveToKeyGreaterThan(std::string("to"), true, iter); 47 | ASSERT_TRUE(iter.isValid()); 48 | ASSERT_EQ(0, iter.getKey().compare("top")); 49 | iter++; 50 | ASSERT_EQ(0, iter.getKey().compare("toy")); 51 | } 52 | 53 | } // namespace surftest 54 | 55 | } // namespace surf 56 | 57 | int main (int argc, char** argv) { 58 | ::testing::InitGoogleTest(&argc, argv); 59 | return RUN_ALL_TESTS(); 60 | } 61 | -------------------------------------------------------------------------------- /test/unitTest/test_surf_small.cpp: 
-------------------------------------------------------------------------------- 1 | #include "gtest/gtest.h" 2 | 3 | #include 4 | 5 | #include 6 | #include 7 | 8 | #include "config.hpp" 9 | #include "surf.hpp" 10 | 11 | namespace surf { 12 | 13 | namespace surftest { 14 | 15 | static const SuffixType kSuffixType = kReal; 16 | static const level_t kSuffixLen = 8; 17 | 18 | class SuRFSmallTest : public ::testing::Test { 19 | public: 20 | virtual void SetUp () {} 21 | virtual void TearDown () {} 22 | }; 23 | 24 | TEST_F (SuRFSmallTest, ExampleInPaperTest) { 25 | std::vector keys; 26 | 27 | keys.push_back(std::string("f")); 28 | keys.push_back(std::string("far")); 29 | keys.push_back(std::string("fas")); 30 | keys.push_back(std::string("fast")); 31 | keys.push_back(std::string("fat")); 32 | keys.push_back(std::string("s")); 33 | keys.push_back(std::string("top")); 34 | keys.push_back(std::string("toy")); 35 | keys.push_back(std::string("trie")); 36 | keys.push_back(std::string("trip")); 37 | keys.push_back(std::string("try")); 38 | 39 | SuRF* surf = new SuRF(keys, kIncludeDense, kSparseDenseRatio, kSuffixType, 0, kSuffixLen); 40 | bool exist = surf->lookupRange(std::string("top"), false, std::string("toyy"), false); 41 | ASSERT_TRUE(exist); 42 | exist = surf->lookupRange(std::string("toq"), false, std::string("toyy"), false); 43 | ASSERT_TRUE(exist); 44 | exist = surf->lookupRange(std::string("trie"), false, std::string("tripp"), false); 45 | ASSERT_TRUE(exist); 46 | 47 | SuRF::Iter iter = surf->moveToKeyGreaterThan(std::string("t"), true); 48 | ASSERT_TRUE(iter.isValid()); 49 | iter++; 50 | ASSERT_TRUE(iter.isValid()); 51 | } 52 | 53 | } // namespace surftest 54 | 55 | } // namespace surf 56 | 57 | int main (int argc, char** argv) { 58 | ::testing::InitGoogleTest(&argc, argv); 59 | return RUN_ALL_TESTS(); 60 | } 61 | -------------------------------------------------------------------------------- /test/unitTest/test_louds_dense_small.cpp: -------------------------------------------------------------------------------- 1 | #include "gtest/gtest.h" 2 | 3 | #include 4 | 5 | #include 6 | #include 7 | 8 | #include "config.hpp" 9 | #include "surf.hpp" 10 | 11 | namespace surf { 12 | 13 | namespace surftest { 14 | 15 | static const bool kIncludeDense = true; 16 | static const uint32_t kSparseDenseRatio = 0; 17 | static const SuffixType kSuffixType = kReal; 18 | static const level_t kSuffixLen = 8; 19 | 20 | class SuRFSmallTest : public ::testing::Test { 21 | public: 22 | virtual void SetUp () {} 23 | virtual void TearDown () {} 24 | }; 25 | 26 | TEST_F (SuRFSmallTest, ExampleInPaperTest) { 27 | std::vector keys; 28 | 29 | keys.push_back(std::string("f")); 30 | keys.push_back(std::string("far")); 31 | keys.push_back(std::string("fas")); 32 | keys.push_back(std::string("fast")); 33 | keys.push_back(std::string("fat")); 34 | keys.push_back(std::string("s")); 35 | keys.push_back(std::string("top")); 36 | keys.push_back(std::string("toy")); 37 | keys.push_back(std::string("trie")); 38 | keys.push_back(std::string("trip")); 39 | keys.push_back(std::string("try")); 40 | 41 | SuRFBuilder* builder = new SuRFBuilder(kIncludeDense, kSparseDenseRatio, kSuffixType, 0, kSuffixLen); 42 | builder->build(keys); 43 | LoudsDense* louds_dense = new LoudsDense(builder); 44 | LoudsDense::Iter iter(louds_dense); 45 | 46 | louds_dense->moveToKeyGreaterThan(std::string("to"), true, iter); 47 | ASSERT_TRUE(iter.isValid()); 48 | ASSERT_EQ(0, iter.getKey().compare("top")); 49 | iter++; 50 | ASSERT_EQ(0, 
iter.getKey().compare("toy")); 51 | 52 | iter.clear(); 53 | louds_dense->moveToKeyGreaterThan(std::string("fas"), true, iter); 54 | ASSERT_TRUE(iter.isValid()); 55 | ASSERT_EQ(0, iter.getKey().compare("fas")); 56 | } 57 | 58 | } // namespace surftest 59 | 60 | } // namespace surf 61 | 62 | int main (int argc, char** argv) { 63 | ::testing::InitGoogleTest(&argc, argv); 64 | return RUN_ALL_TESTS(); 65 | } 66 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Succinct Range Filter (SuRF) 2 | [![Build Status](https://travis-ci.org/efficient/SuRF.svg?branch=master)](https://travis-ci.org/efficient/SuRF) 3 | [![Coverage Status](https://coveralls.io/repos/github/efficient/SuRF/badge.svg?branch=master)](https://coveralls.io/github/efficient/SuRF?branch=master) 4 | 5 | **SuRF** is a fast and compact filter that provides exact-match filtering, 6 | range filtering, and approximate range counts. This is the source code for our 7 | [SIGMOD best paper](http://www.cs.cmu.edu/~huanche1/publications/surf_paper.pdf). 8 | We also host a [demo website](https://www.rangefilter.io/). 9 | The RocksDB experiments with SuRF can be found [here](https://github.com/efficient/rocksdb). 10 | 11 | ## Install Dependencies 12 | sudo apt-get install build-essential cmake libgtest-dev 13 | cd /usr/src/gtest 14 | sudo cmake CMakeLists.txt 15 | sudo make 16 | sudo cp *.a /usr/lib 17 | 18 | ## Build 19 | git submodule init 20 | git submodule update 21 | mkdir build 22 | cd build 23 | cmake .. 24 | make -j 25 | 26 | ## Simple Example 27 | A simple example can be found [here](https://github.com/efficient/SuRF/blob/master/simple_example.cpp). To run the example: 28 | ``` 29 | g++ -mpopcnt -std=c++11 simple_example.cpp 30 | ./a.out 31 | ``` 32 | Note that the key list passed to the SuRF constructor must be SORTED. 33 | 34 | ## Run Unit Tests 35 | make test 36 | 37 | ## Benchmark 38 | 39 | ### Step 1: Download YCSB 40 | cd bench/workload_gen 41 | bash ycsb_download.sh 42 | 43 | ### Step 2: Generate Workloads 44 | cd bench/workload_gen 45 | bash gen_workload.sh 46 | You must provide your own email list to generate email-key workloads. 47 | 48 | ### Step 3: Run Workloads 49 | cd bench 50 | bash run.sh 51 | Note that `run.sh` only includes several representative runs. 52 | Refer to `bench/workload.cpp`, `bench/workload_multi_thread.cpp` 53 | and `bench/workload_arf.cpp` for more experiment configurations. 54 | 55 | ## License 56 | Copyright 2018, Carnegie Mellon University 57 | 58 | Licensed under the [Apache License](https://github.com/efficient/SuRF/blob/master/LICENSE).
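
## Approximate Count Example
Beyond point and range membership, SuRF also exposes approximate range counts and memory accounting; the benchmark wrapper in `bench/filter_surf.hpp` drives these through `lookupKey`, `lookupRange`, `approxCount`, and `getMemoryUsage`. The sketch below is illustrative only: the key set, the 8-bit real-suffix configuration, and the printed labels are made up for the example, while the constructor arguments mirror those used in `bench/filter_surf.hpp`. It compiles the same way as `simple_example.cpp`.

```cpp
// Minimal sketch: build a SuRF over a sorted key set and query it.
#include <iostream>
#include <string>
#include <vector>

#include "include/surf.hpp"

int main() {
    // Keys must be sorted, as noted above.
    std::vector<std::string> keys = {"f", "far", "fast", "s", "top", "toy", "trie"};

    // Default dense/sparse split (kIncludeDense, kSparseDenseRatio) with
    // 8-bit real suffixes, mirroring FilterSuRF in bench/filter_surf.hpp.
    surf::SuRF* filter = new surf::SuRF(keys, surf::kIncludeDense,
                                        surf::kSparseDenseRatio,
                                        surf::kReal, 0, 8);

    // Membership queries: no false negatives, occasional false positives.
    std::cout << "lookupKey(\"top\"): " << filter->lookupKey("top") << std::endl;
    std::cout << "lookupRange[\"fare\", \"fase\"): "
              << filter->lookupRange("fare", true, "fase", false) << std::endl;

    // Approximate number of keys in the range, and the filter's in-memory size.
    std::cout << "approxCount(\"far\", \"toy\"): "
              << filter->approxCount("far", "toy") << std::endl;
    std::cout << "memory (bytes): " << filter->getMemoryUsage() << std::endl;

    filter->destroy();
    delete filter;
    return 0;
}
```
Compile it like the simple example, e.g. `g++ -mpopcnt -std=c++11 approx_count_example.cpp` (file name is arbitrary).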
59 | -------------------------------------------------------------------------------- /test/unitTest/test_suffix_vector.cpp: -------------------------------------------------------------------------------- 1 | #include "gtest/gtest.h" 2 | 3 | #include 4 | 5 | #include 6 | #include 7 | #include 8 | 9 | #include "config.hpp" 10 | #include "suffix_vector.hpp" 11 | #include "surf_builder.hpp" 12 | 13 | namespace surf { 14 | 15 | // DEPRECATED 16 | namespace suffixvectortest { 17 | 18 | static const std::string kFilePath = "../../../test/words.txt"; 19 | static const int kTestSize = 234369; 20 | static std::vector words; 21 | 22 | class SuffixVectorUnitTest : public ::testing::Test { 23 | public: 24 | virtual void SetUp () { 25 | ; 26 | } 27 | virtual void TearDown () { 28 | delete builder_; 29 | delete suffixes_; 30 | } 31 | 32 | SuRFBuilder* builder_; 33 | SuffixVector* suffixes_; 34 | }; 35 | 36 | TEST_F (SuffixVectorUnitTest, buildNoneTest) { 37 | bool include_dense = false; 38 | uint32_t sparse_dense_ratio = 0; 39 | builder_ = new SuRFBuilder(include_dense, sparse_dense_ratio, kNone); 40 | builder_->build(words); 41 | suffixes_ = new SuffixVector(kNone, builder_->getSuffixes()); 42 | } 43 | 44 | TEST_F (SuffixVectorUnitTest, buildHashTest) { 45 | bool include_dense = false; 46 | uint32_t sparse_dense_ratio = 0; 47 | builder_ = new SuRFBuilder(include_dense, sparse_dense_ratio, kHash); 48 | builder_->build(words); 49 | suffixes_ = new SuffixVector(kHash, builder_->getSuffixes()); 50 | } 51 | 52 | TEST_F (SuffixVectorUnitTest, buildRealTest) { 53 | bool include_dense = false; 54 | uint32_t sparse_dense_ratio = 0; 55 | builder_ = new SuRFBuilder(include_dense, sparse_dense_ratio, kReal); 56 | builder_->build(words); 57 | suffixes_ = new SuffixVector(kReal, builder_->getSuffixes()); 58 | } 59 | 60 | //TODO checkEqualityTest 61 | //TODO compareTest 62 | 63 | void loadWordList() { 64 | std::ifstream infile(kFilePath); 65 | std::string key; 66 | int count = 0; 67 | while (infile.good() && count < kTestSize) { 68 | infile >> key; 69 | words.push_back(key); 70 | count++; 71 | } 72 | } 73 | 74 | } // namespace suffixvectortest 75 | 76 | } // namespace surf 77 | 78 | int main (int argc, char** argv) { 79 | ::testing::InitGoogleTest(&argc, argv); 80 | surf::suffixvectortest::loadWordList(); 81 | return RUN_ALL_TESTS(); 82 | } 83 | -------------------------------------------------------------------------------- /bench/workload_gen/workload_spec/workloadc_email_latest: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2010 Yahoo! Inc. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); you 4 | # may not use this file except in compliance with the License. You 5 | # may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 12 | # implied. See the License for the specific language governing 13 | # permissions and limitations under the License. See accompanying 14 | # LICENSE file. 15 | 16 | # Yahoo! 
Cloud System Benchmark 17 | # Workload C: Read only 18 | # Application example: user profile cache, where profiles are constructed elsewhere (e.g., Hadoop) 19 | # 20 | # Read/update ratio: 100/0 21 | # Default data size: 1 KB records (10 fields, 100 bytes each, plus key) 22 | # Request distribution: zipfian 23 | 24 | recordcount=25000000 25 | operationcount=10000000 26 | workload=com.yahoo.ycsb.workloads.CoreWorkload 27 | 28 | readallfields=true 29 | 30 | readproportion=1 31 | updateproportion=0 32 | scanproportion=0 33 | insertproportion=0 34 | 35 | requestdistribution=latest 36 | 37 | 38 | 39 | -------------------------------------------------------------------------------- /bench/workload_gen/workload_spec/workloadc_email_zipfian: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2010 Yahoo! Inc. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); you 4 | # may not use this file except in compliance with the License. You 5 | # may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 12 | # implied. See the License for the specific language governing 13 | # permissions and limitations under the License. See accompanying 14 | # LICENSE file. 15 | 16 | # Yahoo! Cloud System Benchmark 17 | # Workload C: Read only 18 | # Application example: user profile cache, where profiles are constructed elsewhere (e.g., Hadoop) 19 | # 20 | # Read/update ratio: 100/0 21 | # Default data size: 1 KB records (10 fields, 100 bytes each, plus key) 22 | # Request distribution: zipfian 23 | 24 | recordcount=25000000 25 | operationcount=10000000 26 | workload=com.yahoo.ycsb.workloads.CoreWorkload 27 | 28 | readallfields=true 29 | 30 | readproportion=1 31 | updateproportion=0 32 | scanproportion=0 33 | insertproportion=0 34 | 35 | requestdistribution=zipfian 36 | 37 | 38 | 39 | -------------------------------------------------------------------------------- /bench/workload_gen/workload_spec/workloadc_randint_latest: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2010 Yahoo! Inc. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); you 4 | # may not use this file except in compliance with the License. You 5 | # may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 12 | # implied. See the License for the specific language governing 13 | # permissions and limitations under the License. See accompanying 14 | # LICENSE file. 15 | 16 | # Yahoo! 
Cloud System Benchmark 17 | # Workload C: Read only 18 | # Application example: user profile cache, where profiles are constructed elsewhere (e.g., Hadoop) 19 | # 20 | # Read/update ratio: 100/0 21 | # Default data size: 1 KB records (10 fields, 100 bytes each, plus key) 22 | # Request distribution: zipfian 23 | 24 | recordcount=100000000 25 | operationcount=10000000 26 | workload=com.yahoo.ycsb.workloads.CoreWorkload 27 | 28 | readallfields=true 29 | 30 | readproportion=1 31 | updateproportion=0 32 | scanproportion=0 33 | insertproportion=0 34 | 35 | requestdistribution=latest 36 | 37 | 38 | 39 | -------------------------------------------------------------------------------- /bench/workload_gen/workload_spec/workloadc_randint_zipfian: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2010 Yahoo! Inc. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); you 4 | # may not use this file except in compliance with the License. You 5 | # may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 12 | # implied. See the License for the specific language governing 13 | # permissions and limitations under the License. See accompanying 14 | # LICENSE file. 15 | 16 | # Yahoo! Cloud System Benchmark 17 | # Workload C: Read only 18 | # Application example: user profile cache, where profiles are constructed elsewhere (e.g., Hadoop) 19 | # 20 | # Read/update ratio: 100/0 21 | # Default data size: 1 KB records (10 fields, 100 bytes each, plus key) 22 | # Request distribution: zipfian 23 | 24 | recordcount=100000000 25 | operationcount=10000000 26 | workload=com.yahoo.ycsb.workloads.CoreWorkload 27 | 28 | readallfields=true 29 | 30 | readproportion=1 31 | updateproportion=0 32 | scanproportion=0 33 | insertproportion=0 34 | 35 | requestdistribution=zipfian 36 | 37 | 38 | 39 | -------------------------------------------------------------------------------- /bench/workload_gen/workload_spec/workloadc_email_uniform: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2010 Yahoo! Inc. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); you 4 | # may not use this file except in compliance with the License. You 5 | # may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 12 | # implied. See the License for the specific language governing 13 | # permissions and limitations under the License. See accompanying 14 | # LICENSE file. 15 | 16 | # Yahoo! 
Cloud System Benchmark 17 | # Workload C: Read only 18 | # Application example: user profile cache, where profiles are constructed elsewhere (e.g., Hadoop) 19 | # 20 | # Read/update ratio: 100/0 21 | # Default data size: 1 KB records (10 fields, 100 bytes each, plus key) 22 | # Request distribution: zipfian 23 | 24 | recordcount=25000000 25 | operationcount=10000000 26 | workload=com.yahoo.ycsb.workloads.CoreWorkload 27 | 28 | fieldcount=1 29 | fieldlength=10 30 | readallfields=true 31 | 32 | readproportion=1 33 | updateproportion=0 34 | scanproportion=0 35 | insertproportion=0 36 | 37 | requestdistribution=uniform 38 | 39 | 40 | 41 | -------------------------------------------------------------------------------- /bench/workload_gen/workload_spec/workloadc_randint_uniform: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2010 Yahoo! Inc. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); you 4 | # may not use this file except in compliance with the License. You 5 | # may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 12 | # implied. See the License for the specific language governing 13 | # permissions and limitations under the License. See accompanying 14 | # LICENSE file. 15 | 16 | # Yahoo! Cloud System Benchmark 17 | # Workload C: Read only 18 | # Application example: user profile cache, where profiles are constructed elsewhere (e.g., Hadoop) 19 | # 20 | # Read/update ratio: 100/0 21 | # Default data size: 1 KB records (10 fields, 100 bytes each, plus key) 22 | # Request distribution: zipfian 23 | 24 | recordcount=100000000 25 | operationcount=10000000 26 | workload=com.yahoo.ycsb.workloads.CoreWorkload 27 | 28 | fieldcount=1 29 | fieldlength=10 30 | readallfields=true 31 | 32 | readproportion=1 33 | updateproportion=0 34 | scanproportion=0 35 | insertproportion=0 36 | 37 | requestdistribution=uniform 38 | 39 | 40 | 41 | -------------------------------------------------------------------------------- /simple_example.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | 4 | #include "include/surf.hpp" 5 | 6 | using namespace surf; 7 | 8 | int main() { 9 | std::vector keys = { 10 | "f", 11 | "far", 12 | "fast", 13 | "s", 14 | "top", 15 | "toy", 16 | "trie", 17 | }; 18 | 19 | // basic surf 20 | SuRF* surf = new SuRF(keys); 21 | 22 | // use default dense-to-sparse ratio; specify suffix type and length 23 | SuRF* surf_hash = new SuRF(keys, surf::kHash, 8, 0); 24 | SuRF* surf_real = new SuRF(keys, surf::kReal, 0, 8); 25 | 26 | // customize dense-to-sparse ratio; specify suffix type and length 27 | SuRF* surf_mixed = new SuRF(keys, true, 16, surf::kMixed, 4, 4); 28 | 29 | //---------------------------------------- 30 | // point queries 31 | //---------------------------------------- 32 | std::cout << "Point Query Example: fase" << std::endl; 33 | 34 | std::string key = "fase"; 35 | 36 | if (surf->lookupKey(key)) 37 | std::cout << "False Positive: "<< key << " found in basic SuRF" << std::endl; 38 | else 39 | std::cout << "Correct: " << key << " NOT found in basic SuRF" << std::endl; 40 | 41 | if (surf_hash->lookupKey(key)) 42 | std::cout << "False Positive: " << key << " found 
in SuRF hash" << std::endl; 43 | else 44 | std::cout << "Correct: " << key << " NOT found in SuRF hash" << std::endl; 45 | 46 | if (surf_real->lookupKey(key)) 47 | std::cout << "False Positive: " << key << " found in SuRF real" << std::endl; 48 | else 49 | std::cout << "Correct: " << key << " NOT found in SuRF real" << std::endl; 50 | 51 | if (surf_mixed->lookupKey(key)) 52 | std::cout << "False Positive: " << key << " found in SuRF mixed" << std::endl; 53 | else 54 | std::cout << "Correct: " << key << " NOT found in SuRF mixed" << std::endl; 55 | 56 | //---------------------------------------- 57 | // range queries 58 | //---------------------------------------- 59 | std::cout << "\nRange Query Example: [fare, fase)" << std::endl; 60 | 61 | std::string left_key = "fare"; 62 | std::string right_key = "fase"; 63 | 64 | if (surf->lookupRange(left_key, true, right_key, false)) 65 | std::cout << "False Positive: There exist key(s) within range [" 66 | << left_key << ", " << right_key << ") " << "according to basic SuRF" << std::endl; 67 | else 68 | std::cout << "Correct: No key exists within range [" 69 | << left_key << ", " << right_key << ") " << "according to basic SuRF" << std::endl; 70 | 71 | if (surf_hash->lookupRange(left_key, true, right_key, false)) 72 | std::cout << "False Positive: There exist key(s) within range [" 73 | << left_key << ", " << right_key << ") " << "according to SuRF hash" << std::endl; 74 | else 75 | std::cout << "Correct: No key exists within range [" 76 | << left_key << ", " << right_key << ") " << "according to SuRF hash" << std::endl; 77 | 78 | if (surf_real->lookupRange(left_key, true, right_key, false)) 79 | std::cout << "False Positive: There exist key(s) within range [" 80 | << left_key << ", " << right_key << ") " << "according to SuRF real" << std::endl; 81 | else 82 | std::cout << "Correct: No key exists within range [" 83 | << left_key << ", " << right_key << ") " << "according to SuRF real" << std::endl; 84 | 85 | if (surf_mixed->lookupRange(left_key, true, right_key, false)) 86 | std::cout << "False Positive: There exist key(s) within range [" 87 | << left_key << ", " << right_key << ") " << "according to SuRF mixed" << std::endl; 88 | else 89 | std::cout << "Correct: No key exists within range [" 90 | << left_key << ", " << right_key << ") " << "according to SuRF mixed" << std::endl; 91 | 92 | return 0; 93 | } 94 | -------------------------------------------------------------------------------- /test/unitTest/test_select.cpp: -------------------------------------------------------------------------------- 1 | #include "gtest/gtest.h" 2 | 3 | #include 4 | 5 | #include 6 | #include 7 | #include 8 | 9 | #include "config.hpp" 10 | #include "select.hpp" 11 | #include "surf_builder.hpp" 12 | 13 | namespace surf { 14 | 15 | namespace selecttest { 16 | 17 | static const std::string kFilePath = "../../../test/words.txt"; 18 | static const int kTestSize = 234369; 19 | static std::vector words; 20 | 21 | class SelectUnitTest : public ::testing::Test { 22 | public: 23 | virtual void SetUp () { 24 | bool include_dense = false; 25 | uint32_t sparse_dense_ratio = 0; 26 | level_t suffix_len = 8; 27 | builder_ = new SuRFBuilder(include_dense, sparse_dense_ratio, kReal, 0, suffix_len); 28 | data_ = nullptr; 29 | num_items_ = 0; 30 | } 31 | virtual void TearDown () { 32 | delete builder_; 33 | if (data_) 34 | delete[] data_; 35 | } 36 | 37 | void setupWordsTest(); 38 | void testSerialize(); 39 | void testSelect(); 40 | 41 | static const position_t kSelectSampleInterval 
= 64; 42 | 43 | SuRFBuilder* builder_; 44 | BitvectorSelect* bv_; 45 | std::vector num_items_per_level_; 46 | position_t num_items_; 47 | char* data_; 48 | }; 49 | 50 | void SelectUnitTest::setupWordsTest() { 51 | builder_->build(words); 52 | for (level_t level = 0; level < builder_->getTreeHeight(); level++) 53 | num_items_per_level_.push_back(builder_->getLabels()[level].size()); 54 | for (level_t level = 0; level < num_items_per_level_.size(); level++) 55 | num_items_ += num_items_per_level_[level]; 56 | bv_ = new BitvectorSelect(kSelectSampleInterval, builder_->getLoudsBits(), num_items_per_level_); 57 | } 58 | 59 | void SelectUnitTest::testSerialize() { 60 | uint64_t size = bv_->serializedSize(); 61 | ASSERT_TRUE((bv_->size() - size) >= 0); 62 | data_ = new char[size]; 63 | BitvectorSelect* ori_bv = bv_; 64 | char* data = data_; 65 | ori_bv->serialize(data); 66 | data = data_; 67 | bv_ = BitvectorSelect::deSerialize(data); 68 | 69 | ASSERT_EQ(ori_bv->bitsSize(), bv_->bitsSize()); 70 | ASSERT_EQ(ori_bv->selectLutSize(), bv_->selectLutSize()); 71 | 72 | ori_bv->destroy(); 73 | delete ori_bv; 74 | } 75 | 76 | void SelectUnitTest::testSelect() { 77 | position_t rank = 1; 78 | for (position_t pos = 0; pos < num_items_; pos++) { 79 | if (bv_->readBit(pos)) { 80 | position_t select = bv_->select(rank); 81 | ASSERT_EQ(pos, select); 82 | rank++; 83 | } 84 | } 85 | } 86 | 87 | TEST_F (SelectUnitTest, readBitTest) { 88 | setupWordsTest(); 89 | position_t bv_pos = 0; 90 | for (level_t level = 0; level < builder_->getTreeHeight(); level++) { 91 | for (position_t pos = 0; pos < num_items_per_level_[level]; pos++) { 92 | bool expected_bit = SuRFBuilder::readBit(builder_->getLoudsBits()[level], pos); 93 | bool bv_bit = bv_->readBit(bv_pos); 94 | ASSERT_EQ(expected_bit, bv_bit); 95 | bv_pos++; 96 | } 97 | } 98 | bv_->destroy(); 99 | delete bv_; 100 | } 101 | 102 | TEST_F (SelectUnitTest, selectTest) { 103 | setupWordsTest(); 104 | testSelect(); 105 | } 106 | 107 | TEST_F (SelectUnitTest, serializeTest) { 108 | setupWordsTest(); 109 | testSerialize(); 110 | testSelect(); 111 | } 112 | 113 | void loadWordList() { 114 | std::ifstream infile(kFilePath); 115 | std::string key; 116 | int count = 0; 117 | while (infile.good() && count < kTestSize) { 118 | infile >> key; 119 | words.push_back(key); 120 | count++; 121 | } 122 | } 123 | 124 | } // namespace ranktest 125 | 126 | } // namespace surf 127 | 128 | int main (int argc, char** argv) { 129 | ::testing::InitGoogleTest(&argc, argv); 130 | surf::selecttest::loadWordList(); 131 | return RUN_ALL_TESTS(); 132 | } 133 | -------------------------------------------------------------------------------- /include/rank.hpp: -------------------------------------------------------------------------------- 1 | #ifndef RANK_H_ 2 | #define RANK_H_ 3 | 4 | #include "bitvector.hpp" 5 | 6 | #include 7 | 8 | #include 9 | 10 | #include "popcount.h" 11 | 12 | namespace surf { 13 | 14 | class BitvectorRank : public Bitvector { 15 | public: 16 | BitvectorRank() : basic_block_size_(0), rank_lut_(nullptr) {}; 17 | 18 | BitvectorRank(const position_t basic_block_size, 19 | const std::vector >& bitvector_per_level, 20 | const std::vector& num_bits_per_level, 21 | const level_t start_level = 0, 22 | const level_t end_level = 0/* non-inclusive */) 23 | : Bitvector(bitvector_per_level, num_bits_per_level, start_level, end_level) { 24 | basic_block_size_ = basic_block_size; 25 | initRankLut(); 26 | } 27 | 28 | ~BitvectorRank() {} 29 | 30 | // Counts the number of 1's in the bitvector up 
to position pos. 31 | // pos is zero-based; count is one-based. 32 | // E.g., for bitvector: 100101000, rank(3) = 2 33 | position_t rank(position_t pos) const { 34 | assert(pos <= num_bits_); 35 | position_t word_per_basic_block = basic_block_size_ / kWordSize; 36 | position_t block_id = pos / basic_block_size_; 37 | position_t offset = pos & (basic_block_size_ - 1); 38 | return (rank_lut_[block_id] 39 | + popcountLinear(bits_, block_id * word_per_basic_block, offset + 1)); 40 | } 41 | 42 | position_t rankLutSize() const { 43 | return ((num_bits_ / basic_block_size_ + 1) * sizeof(position_t)); 44 | } 45 | 46 | position_t serializedSize() const { 47 | position_t size = sizeof(num_bits_) + sizeof(basic_block_size_) 48 | + bitsSize() + rankLutSize(); 49 | sizeAlign(size); 50 | return size; 51 | } 52 | 53 | position_t size() const { 54 | return (sizeof(BitvectorRank) + bitsSize() + rankLutSize()); 55 | } 56 | 57 | void prefetch(position_t pos) const { 58 | __builtin_prefetch(bits_ + (pos / kWordSize)); 59 | __builtin_prefetch(rank_lut_ + (pos / basic_block_size_)); 60 | } 61 | 62 | void serialize(char*& dst) const { 63 | memcpy(dst, &num_bits_, sizeof(num_bits_)); 64 | dst += sizeof(num_bits_); 65 | memcpy(dst, &basic_block_size_, sizeof(basic_block_size_)); 66 | dst += sizeof(basic_block_size_); 67 | memcpy(dst, bits_, bitsSize()); 68 | dst += bitsSize(); 69 | memcpy(dst, rank_lut_, rankLutSize()); 70 | dst += rankLutSize(); 71 | align(dst); 72 | } 73 | 74 | static BitvectorRank* deSerialize(char*& src) { 75 | BitvectorRank* bv_rank = new BitvectorRank(); 76 | memcpy(&(bv_rank->num_bits_), src, sizeof(bv_rank->num_bits_)); 77 | src += sizeof(bv_rank->num_bits_); 78 | memcpy(&(bv_rank->basic_block_size_), src, sizeof(bv_rank->basic_block_size_)); 79 | src += sizeof(bv_rank->basic_block_size_); 80 | 81 | bv_rank->bits_ = new word_t[bv_rank->numWords()]; 82 | memcpy(bv_rank->bits_, src, bv_rank->bitsSize()); 83 | src += bv_rank->bitsSize(); 84 | bv_rank->rank_lut_ = new position_t[bv_rank->rankLutSize() / sizeof(position_t)]; 85 | memcpy(bv_rank->rank_lut_, src, bv_rank->rankLutSize()); 86 | src += bv_rank->rankLutSize(); 87 | 88 | //bv_rank->bits_ = const_cast(reinterpret_cast(src)); 89 | //src += bv_rank->bitsSize(); 90 | //bv_rank->rank_lut_ = const_cast(reinterpret_cast(src)); 91 | //src += bv_rank->rankLutSize(); 92 | 93 | align(src); 94 | return bv_rank; 95 | } 96 | 97 | void destroy() { 98 | delete[] bits_; 99 | delete[] rank_lut_; 100 | } 101 | 102 | private: 103 | void initRankLut() { 104 | position_t word_per_basic_block = basic_block_size_ / kWordSize; 105 | position_t num_blocks = num_bits_ / basic_block_size_ + 1; 106 | rank_lut_ = new position_t[num_blocks]; 107 | 108 | position_t cumu_rank = 0; 109 | for (position_t i = 0; i < num_blocks - 1; i++) { 110 | rank_lut_[i] = cumu_rank; 111 | cumu_rank += popcountLinear(bits_, i * word_per_basic_block, basic_block_size_); 112 | } 113 | rank_lut_[num_blocks - 1] = cumu_rank; 114 | } 115 | 116 | position_t basic_block_size_; 117 | position_t* rank_lut_; //rank look-up table 118 | }; 119 | 120 | } // namespace surf 121 | 122 | #endif // RANK_H_ 123 | -------------------------------------------------------------------------------- /bench/workload_gen/gen_txn.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import os 3 | 4 | class bcolors: 5 | HEADER = '\033[95m' 6 | OKBLUE = '\033[94m' 7 | OKGREEN = '\033[92m' 8 | WARNING = '\033[93m' 9 | FAIL = '\033[91m' 10 | ENDC = '\033[0m' 11 | 
BOLD = '\033[1m' 12 | UNDERLINE = '\033[4m' 13 | 14 | ##################################################################################### 15 | 16 | def reverseHostName ( email ) : 17 | name, sep, host = email.partition('@') 18 | hostparts = host[:-1].split('.') 19 | r_host = '' 20 | for part in hostparts : 21 | r_host = part + '.' + r_host 22 | return r_host + sep + name 23 | 24 | ##################################################################################### 25 | 26 | if (len(sys.argv) < 3) : 27 | print bcolors.FAIL + 'Usage:' 28 | print 'arg 1, key type: randint, timestamp, email' 29 | print 'arg 2, distribution: uniform, zipfian, latest' + bcolors.ENDC 30 | sys.exit() 31 | 32 | key_type = sys.argv[1] 33 | distribution = sys.argv[2] 34 | 35 | print bcolors.OKGREEN + 'key type = ' + key_type 36 | print 'distribution = ' + distribution + bcolors.ENDC 37 | 38 | ycsb_dir = 'YCSB/bin/' 39 | workload_dir = 'workload_spec/' 40 | output_dir='../workloads/' 41 | 42 | email_list = 'email_list.txt' 43 | email_list_size = 27549660 44 | email_keymap_file = output_dir + 'email_keymap.txt' 45 | 46 | timestamp_list = 'poisson_timestamps.csv' 47 | timestamp_keymap_file = output_dir + 'timestamp_keymap.txt' 48 | 49 | if key_type != 'randint' and key_type != 'timestamp' and key_type != 'email' : 50 | print bcolors.FAIL + 'Incorrect key_type: please pick from randint and email' + bcolors.ENDC 51 | sys.exit() 52 | 53 | if distribution != 'uniform' and distribution != 'zipfian' and distribution != 'latest' : 54 | print bcolors.FAIL + 'Incorrect distribution: please pick from uniform, zipfian and latest' + bcolors.ENDC 55 | sys.exit() 56 | 57 | out_ycsb_txn = output_dir + 'ycsb_txn_' + key_type + '_' + distribution 58 | out_txn_ycsbkey = output_dir + 'txn_' + 'ycsbkey' + '_' + distribution 59 | out_txn = output_dir + 'txn_' + key_type + '_' + distribution 60 | 61 | cmd_ycsb_txn = ycsb_dir + 'ycsb run basic -P ' + workload_dir + 'workloadc_' + key_type + '_' + distribution + ' -s > ' + out_ycsb_txn 62 | 63 | os.system(cmd_ycsb_txn) 64 | 65 | ##################################################################################### 66 | 67 | f_txn = open (out_ycsb_txn, 'r') 68 | f_txn_out = open (out_txn_ycsbkey, 'w') 69 | for line in f_txn : 70 | cols = line.split() 71 | if len(cols) > 2 and cols[0] == 'READ' : 72 | f_txn_out.write (cols[2][4:] + "\n") 73 | f_txn.close() 74 | f_txn_out.close() 75 | 76 | cmd = 'rm -f ' + out_ycsb_txn 77 | os.system(cmd) 78 | 79 | ##################################################################################### 80 | 81 | if key_type == 'randint' : 82 | f_txn = open (out_txn_ycsbkey, 'r') 83 | f_txn_out = open (out_txn, 'w') 84 | for line in f_txn : 85 | f_txn_out.write (line) 86 | 87 | elif key_type == 'timestamp' : 88 | timestamp_keymap = {} 89 | f_timestamp_keymap = open (timestamp_keymap_file, 'r') 90 | for line in f_timestamp_keymap : 91 | cols = line.split() 92 | timestamp_keymap[int(cols[0])] = cols[1] 93 | 94 | count = 0 95 | f_txn = open (out_txn_ycsbkey, 'r') 96 | f_txn_out = open (out_txn, 'w') 97 | for line in f_txn : 98 | cols = line.split() 99 | if len(cols) > 0 : 100 | f_txn_out.write (timestamp_keymap[int(cols[0])] + '\n') 101 | f_timestamp_keymap.close() 102 | 103 | elif key_type == 'email' : 104 | email_keymap = {} 105 | f_email_keymap = open (email_keymap_file, 'r') 106 | for line in f_email_keymap : 107 | cols = line.split() 108 | email_keymap[int(cols[0])] = cols[1] 109 | 110 | count = 0 111 | f_txn = open (out_txn_ycsbkey, 'r') 112 | f_txn_out = open 
(out_txn, 'w') 113 | for line in f_txn : 114 | cols = line.split() 115 | if len(cols) > 0 : 116 | f_txn_out.write (email_keymap[int(cols[0])] + '\n') 117 | f_email_keymap.close() 118 | 119 | f_txn.close() 120 | f_txn_out.close() 121 | 122 | cmd = 'rm -f ' + out_txn_ycsbkey 123 | os.system(cmd) 124 | -------------------------------------------------------------------------------- /bench/bench.hpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | 9 | #include 10 | #include 11 | #include 12 | #include 13 | #include 14 | #include 15 | #include 16 | #include 17 | #include 18 | #include 19 | 20 | namespace bench { 21 | 22 | static const uint64_t kNumIntRecords = 100000000; 23 | static const uint64_t kNumEmailRecords = 25000000; 24 | static const uint64_t kNumTxns = 10000000; 25 | static const uint64_t kIntRangeSize = 92233697311; 26 | static const uint64_t kEmailRangeSize = 128; 27 | 28 | //static const uint64_t kRandintRangeSize = 328 * 1024 * 1024 * (uint64_t)1024; 29 | //static const char* kWordloadDir = "workloads/"; 30 | 31 | // for pretty print 32 | static const char* kGreen ="\033[0;32m"; 33 | static const char* kRed ="\033[0;31m"; 34 | static const char* kNoColor ="\033[0;0m"; 35 | 36 | // for time measurement 37 | double getNow() { 38 | struct timeval tv; 39 | gettimeofday(&tv, 0); 40 | return tv.tv_sec + tv.tv_usec / 1000000.0; 41 | } 42 | 43 | std::string uint64ToString(uint64_t key) { 44 | uint64_t endian_swapped_key = __builtin_bswap64(key); 45 | return std::string(reinterpret_cast(&endian_swapped_key), 8); 46 | } 47 | 48 | uint64_t stringToUint64(std::string str_key) { 49 | uint64_t int_key = 0; 50 | memcpy(reinterpret_cast(&int_key), str_key.data(), 8); 51 | return __builtin_bswap64(int_key); 52 | } 53 | 54 | void loadKeysFromFile(const std::string& file_name, const bool is_key_int, 55 | std::vector &keys) { 56 | std::ifstream infile(file_name); 57 | std::string key; 58 | uint64_t count = 0; 59 | if (is_key_int) { 60 | while (count < kNumIntRecords && infile.good()) { 61 | uint64_t int_key; 62 | infile >> int_key; 63 | key = uint64ToString(int_key); 64 | keys.push_back(key); 65 | count++; 66 | } 67 | } else { 68 | while (count < kNumEmailRecords && infile.good()) { 69 | infile >> key; 70 | keys.push_back(key); 71 | count++; 72 | } 73 | } 74 | } 75 | 76 | void loadKeysFromFile(const std::string& file_name, uint64_t num_records, 77 | std::vector &keys) { 78 | std::ifstream infile(file_name); 79 | uint64_t count = 0; 80 | while (count < num_records && infile.good()) { 81 | uint64_t key; 82 | infile >> key; 83 | keys.push_back(key); 84 | count++; 85 | } 86 | } 87 | 88 | // 0 < percent <= 100 89 | void selectKeysToInsert(const unsigned percent, 90 | std::vector &insert_keys, 91 | std::vector &keys) { 92 | random_shuffle(keys.begin(), keys.end()); 93 | uint64_t num_insert_keys = keys.size() * percent / 100; 94 | for (uint64_t i = 0; i < num_insert_keys; i++) 95 | insert_keys.push_back(keys[i]); 96 | 97 | keys.clear(); 98 | sort(insert_keys.begin(), insert_keys.end()); 99 | } 100 | 101 | // 0 < percent <= 100 102 | void selectIntKeysToInsert(const unsigned percent, 103 | std::vector &insert_keys, 104 | std::vector &keys) { 105 | random_shuffle(keys.begin(), keys.end()); 106 | uint64_t num_insert_keys = keys.size() * percent / 100; 107 | for (uint64_t i = 0; i < num_insert_keys; i++) 108 | insert_keys.push_back(keys[i]); 109 | 110 | keys.clear(); 111 | 
sort(insert_keys.begin(), insert_keys.end()); 112 | } 113 | 114 | // pos > 0, position counting from the last byte 115 | void modifyKeyByte(std::vector &keys, int pos) { 116 | for (int i = 0; i < (int)keys.size(); i++) { 117 | int keylen = keys[i].length(); 118 | if (keylen > pos) 119 | keys[i][keylen - 1 - pos] = '+'; 120 | else 121 | keys[i][0] = '+'; 122 | } 123 | } 124 | 125 | std::string getUpperBoundKey(const std::string& key_type, const std::string& key) { 126 | std::string ret_str = key; 127 | if (key_type.compare(std::string("email")) == 0) { 128 | ret_str[ret_str.size() - 1] += (char)kEmailRangeSize; 129 | } else { 130 | uint64_t int_key = stringToUint64(key); 131 | int_key += kIntRangeSize; 132 | ret_str = uint64ToString(int_key); 133 | } 134 | return ret_str; 135 | } 136 | 137 | } // namespace bench 138 | -------------------------------------------------------------------------------- /bench/workload_gen/gen_load.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import os 3 | 4 | class bcolors: 5 | HEADER = '\033[95m' 6 | OKBLUE = '\033[94m' 7 | OKGREEN = '\033[92m' 8 | WARNING = '\033[93m' 9 | FAIL = '\033[91m' 10 | ENDC = '\033[0m' 11 | BOLD = '\033[1m' 12 | UNDERLINE = '\033[4m' 13 | 14 | ##################################################################################### 15 | 16 | def reverseHostName ( email ) : 17 | name, sep, host = email.partition('@') 18 | hostparts = host[:-1].split('.') 19 | r_host = '' 20 | for part in hostparts : 21 | r_host = part + '.' + r_host 22 | return r_host + sep + name 23 | 24 | ##################################################################################### 25 | 26 | if (len(sys.argv) < 3) : 27 | print bcolors.FAIL + 'Usage:' 28 | print 'arg 1, key type: randint, timestamp, email' 29 | print 'arg 2, distribution: uniform, zipfian, latest' + bcolors.ENDC 30 | sys.exit() 31 | 32 | key_type = sys.argv[1] 33 | distribution = sys.argv[2] 34 | 35 | print bcolors.OKGREEN + 'key type = ' + key_type 36 | print 'distribution = ' + distribution + bcolors.ENDC 37 | 38 | ycsb_dir = 'YCSB/bin/' 39 | workload_dir = 'workload_spec/' 40 | output_dir='../workloads/' 41 | 42 | email_list = 'email_list.txt' 43 | email_list_size = 27549660 44 | email_keymap_file = output_dir + 'email_keymap.txt' 45 | 46 | timestamp_list = 'poisson_timestamps.csv' 47 | timestamp_keymap_file = output_dir + 'timestamp_keymap.txt' 48 | 49 | if key_type != 'randint' and key_type != 'timestamp' and key_type != 'email' : 50 | print bcolors.FAIL + 'Incorrect key_type: please pick from randint and email' + bcolors.ENDC 51 | sys.exit() 52 | 53 | if distribution != 'uniform' and distribution != 'zipfian' and distribution != 'latest' : 54 | print bcolors.FAIL + 'Incorrect distribution: please pick from uniform, zipfian and latest' + bcolors.ENDC 55 | sys.exit() 56 | 57 | out_ycsb_load = output_dir + 'ycsb_load_' + key_type 58 | out_load_ycsbkey = output_dir + 'load_' + 'ycsbkey' 59 | out_load = output_dir + 'load_' + key_type 60 | 61 | cmd_ycsb_load = ycsb_dir + 'ycsb load basic -P ' + workload_dir + 'workloadc_' + key_type + '_' + distribution + ' -s > ' + out_ycsb_load 62 | 63 | os.system(cmd_ycsb_load) 64 | 65 | ##################################################################################### 66 | 67 | f_load = open (out_ycsb_load, 'r') 68 | f_load_out = open (out_load_ycsbkey, 'w') 69 | for line in f_load : 70 | cols = line.split() 71 | if len(cols) > 2 and cols[0] == "INSERT": 72 | f_load_out.write (cols[2][4:] + '\n') 
73 | f_load.close() 74 | f_load_out.close() 75 | 76 | cmd = 'rm -f ' + out_ycsb_load 77 | os.system(cmd) 78 | 79 | ##################################################################################### 80 | 81 | if key_type == 'randint' : 82 | f_load = open (out_load_ycsbkey, 'r') 83 | f_load_out = open (out_load, 'w') 84 | for line in f_load : 85 | f_load_out.write (line) 86 | 87 | elif key_type == 'timestamp' : 88 | timestamp_keymap = {} 89 | f_timestamp_keymap = open (timestamp_keymap_file, 'w') 90 | 91 | f_timestamp = open (timestamp_list, 'r') 92 | timestamps = f_timestamp.readlines() 93 | 94 | f_load_out = open (out_load, 'w') 95 | f_load = open (out_load_ycsbkey, 'r') 96 | count = 0 97 | for line in f_load : 98 | cols = line.split() 99 | ts = timestamps[count] 100 | f_load_out.write (ts) 101 | f_timestamp_keymap.write (cols[0] + ' ' + ts) 102 | count += 1 103 | f_timestamp_keymap.close() 104 | 105 | elif key_type == 'email' : 106 | email_keymap = {} 107 | f_email_keymap = open (email_keymap_file, 'w') 108 | 109 | f_email = open (email_list, 'r') 110 | emails = f_email.readlines() 111 | 112 | f_load = open (out_load_ycsbkey, 'r') 113 | f_load_out = open (out_load, 'w') 114 | 115 | sample_size = len(f_load.readlines()) 116 | gap = email_list_size / sample_size 117 | 118 | f_load.close() 119 | f_load = open (out_load_ycsbkey, 'r') 120 | count = 0 121 | for line in f_load : 122 | cols = line.split() 123 | email = reverseHostName(emails[count * gap]) 124 | f_load_out.write (email + '\n') 125 | f_email_keymap.write (cols[0] + ' ' + email + '\n') 126 | count += 1 127 | f_email_keymap.close() 128 | 129 | f_load.close() 130 | f_load_out.close() 131 | 132 | cmd = 'rm -f ' + out_load_ycsbkey 133 | os.system(cmd) 134 | -------------------------------------------------------------------------------- /test/unitTest/test_rank.cpp: -------------------------------------------------------------------------------- 1 | #include "gtest/gtest.h" 2 | 3 | #include 4 | 5 | #include 6 | #include 7 | #include 8 | 9 | #include "config.hpp" 10 | #include "rank.hpp" 11 | #include "surf_builder.hpp" 12 | 13 | namespace surf { 14 | 15 | namespace ranktest { 16 | 17 | static const std::string kFilePath = "../../../test/words.txt"; 18 | static const int kTestSize = 234369; 19 | static std::vector words; 20 | 21 | class RankUnitTest : public ::testing::Test { 22 | public: 23 | virtual void SetUp () { 24 | bool include_dense = false; 25 | uint32_t sparse_dense_ratio = 0; 26 | level_t suffix_len = 8; 27 | builder_ = new SuRFBuilder(include_dense, sparse_dense_ratio, kReal, 0, suffix_len); 28 | data_ = nullptr; 29 | data2_ = nullptr; 30 | num_items_ = 0; 31 | } 32 | virtual void TearDown () { 33 | delete builder_; 34 | if (data_) 35 | delete[] data_; 36 | if (data2_) 37 | delete[] data2_; 38 | } 39 | 40 | void setupWordsTest(); 41 | void testSerialize(); 42 | void testRank(); 43 | 44 | static const position_t kRankBasicBlockSize = 512; 45 | 46 | SuRFBuilder* builder_; 47 | BitvectorRank* bv_; 48 | BitvectorRank* bv2_; 49 | std::vector num_items_per_level_; 50 | position_t num_items_; 51 | char* data_; 52 | char* data2_; 53 | }; 54 | 55 | void RankUnitTest::setupWordsTest() { 56 | builder_->build(words); 57 | for (level_t level = 0; level < builder_->getTreeHeight(); level++) 58 | num_items_per_level_.push_back(builder_->getLabels()[level].size()); 59 | for (level_t level = 0; level < num_items_per_level_.size(); level++) 60 | num_items_ += num_items_per_level_[level]; 61 | bv_ = new BitvectorRank(kRankBasicBlockSize, 
builder_->getChildIndicatorBits(), num_items_per_level_); 62 | bv2_ = new BitvectorRank(kRankBasicBlockSize, builder_->getLoudsBits(), num_items_per_level_); 63 | } 64 | 65 | void RankUnitTest::testSerialize() { 66 | uint64_t size = bv_->serializedSize(); 67 | ASSERT_TRUE((bv_->size() - size) >= 0); 68 | data_ = new char[size]; 69 | BitvectorRank* ori_bv = bv_; 70 | char* data = data_; 71 | ori_bv->serialize(data); 72 | data = data_; 73 | bv_ = BitvectorRank::deSerialize(data); 74 | 75 | ASSERT_EQ(ori_bv->bitsSize(), bv_->bitsSize()); 76 | ASSERT_EQ(ori_bv->rankLutSize(), bv_->rankLutSize()); 77 | 78 | ori_bv->destroy(); 79 | delete ori_bv; 80 | 81 | size = bv2_->serializedSize(); 82 | data2_ = new char[size]; 83 | BitvectorRank* ori_bv2 = bv2_; 84 | char* data2 = data2_; 85 | ori_bv2->serialize(data2); 86 | data2 = data2_; 87 | bv2_ = BitvectorRank::deSerialize(data2); 88 | 89 | ASSERT_EQ(ori_bv2->bitsSize(), bv2_->bitsSize()); 90 | ASSERT_EQ(ori_bv2->rankLutSize(), bv2_->rankLutSize()); 91 | 92 | ori_bv2->destroy(); 93 | delete ori_bv2; 94 | } 95 | 96 | void RankUnitTest::testRank() { 97 | position_t expected_rank = 0; 98 | position_t expected_rank2 = 0; 99 | for (position_t pos = 0; pos < num_items_; pos++) { 100 | if (bv_->readBit(pos)) expected_rank++; 101 | position_t rank = bv_->rank(pos); 102 | ASSERT_EQ(expected_rank, rank); 103 | 104 | if (bv2_->readBit(pos)) expected_rank2++; 105 | position_t rank2 = bv2_->rank(pos); 106 | ASSERT_EQ(expected_rank2, rank2); 107 | } 108 | } 109 | 110 | TEST_F (RankUnitTest, readBitTest) { 111 | setupWordsTest(); 112 | position_t bv_pos = 0; 113 | for (level_t level = 0; level < builder_->getTreeHeight(); level++) { 114 | for (position_t pos = 0; pos < num_items_per_level_[level]; pos++) { 115 | bool expected_bit = SuRFBuilder::readBit(builder_->getChildIndicatorBits()[level], pos); 116 | bool bv_bit = bv_->readBit(bv_pos); 117 | ASSERT_EQ(expected_bit, bv_bit); 118 | 119 | expected_bit = SuRFBuilder::readBit(builder_->getLoudsBits()[level], pos); 120 | bv_bit = bv2_->readBit(bv_pos); 121 | ASSERT_EQ(expected_bit, bv_bit); 122 | 123 | bv_pos++; 124 | } 125 | } 126 | bv_->destroy(); 127 | delete bv_; 128 | bv2_->destroy(); 129 | delete bv2_; 130 | } 131 | 132 | TEST_F (RankUnitTest, rankTest) { 133 | setupWordsTest(); 134 | testRank(); 135 | bv_->destroy(); 136 | delete bv_; 137 | bv2_->destroy(); 138 | delete bv2_; 139 | } 140 | 141 | TEST_F (RankUnitTest, serializeTest) { 142 | setupWordsTest(); 143 | testSerialize(); 144 | testRank(); 145 | } 146 | 147 | void loadWordList() { 148 | std::ifstream infile(kFilePath); 149 | std::string key; 150 | int count = 0; 151 | while (infile.good() && count < kTestSize) { 152 | infile >> key; 153 | words.push_back(key); 154 | count++; 155 | } 156 | } 157 | 158 | } // namespace ranktest 159 | 160 | } // namespace surf 161 | 162 | int main (int argc, char** argv) { 163 | ::testing::InitGoogleTest(&argc, argv); 164 | surf::ranktest::loadWordList(); 165 | return RUN_ALL_TESTS(); 166 | } 167 | -------------------------------------------------------------------------------- /include/bitvector.hpp: -------------------------------------------------------------------------------- 1 | #ifndef BITVECTOR_H_ 2 | #define BITVECTOR_H_ 3 | 4 | #include 5 | 6 | #include 7 | 8 | #include "config.hpp" 9 | 10 | namespace surf { 11 | 12 | class Bitvector { 13 | public: 14 | Bitvector() : num_bits_(0), bits_(nullptr) {}; 15 | 16 | Bitvector(const std::vector >& bitvector_per_level, 17 | const std::vector& num_bits_per_level, 18 | 
const level_t start_level = 0, 19 | level_t end_level = 0/* non-inclusive */) { 20 | if (end_level == 0) 21 | end_level = bitvector_per_level.size(); 22 | num_bits_ = totalNumBits(num_bits_per_level, start_level, end_level); 23 | bits_ = new word_t[numWords()]; 24 | memset(bits_, 0, bitsSize()); 25 | concatenateBitvectors(bitvector_per_level, num_bits_per_level, start_level, end_level); 26 | } 27 | 28 | ~Bitvector() {} 29 | 30 | position_t numBits() const { 31 | return num_bits_; 32 | } 33 | 34 | position_t numWords() const { 35 | if (num_bits_ % kWordSize == 0) 36 | return (num_bits_ / kWordSize); 37 | else 38 | return (num_bits_ / kWordSize + 1); 39 | } 40 | 41 | // in bytes 42 | position_t bitsSize() const { 43 | return (numWords() * (kWordSize / 8)); 44 | } 45 | 46 | // in bytes 47 | position_t size() const { 48 | return (sizeof(Bitvector) + bitsSize()); 49 | } 50 | 51 | bool readBit(const position_t pos) const; 52 | 53 | position_t distanceToNextSetBit(const position_t pos) const; 54 | position_t distanceToPrevSetBit(const position_t pos) const; 55 | 56 | private: 57 | position_t totalNumBits(const std::vector& num_bits_per_level, 58 | const level_t start_level, 59 | const level_t end_level/* non-inclusive */); 60 | 61 | void concatenateBitvectors(const std::vector >& bitvector_per_level, 62 | const std::vector& num_bits_per_level, 63 | const level_t start_level, 64 | const level_t end_level/* non-inclusive */); 65 | protected: 66 | position_t num_bits_; 67 | word_t* bits_; 68 | }; 69 | 70 | bool Bitvector::readBit (const position_t pos) const { 71 | assert(pos <= num_bits_); 72 | position_t word_id = pos / kWordSize; 73 | position_t offset = pos & (kWordSize - 1); 74 | return bits_[word_id] & (kMsbMask >> offset); 75 | } 76 | 77 | position_t Bitvector::distanceToNextSetBit (const position_t pos) const { 78 | assert(pos < num_bits_); 79 | position_t distance = 1; 80 | 81 | position_t word_id = (pos + 1) / kWordSize; 82 | position_t offset = (pos + 1) % kWordSize; 83 | 84 | //first word left-over bits 85 | word_t test_bits = bits_[word_id] << offset; 86 | if (test_bits > 0) { 87 | return (distance + __builtin_clzll(test_bits)); 88 | } else { 89 | if (word_id == numWords() - 1) 90 | return (num_bits_ - pos); 91 | distance += (kWordSize - offset); 92 | } 93 | 94 | while (word_id < numWords() - 1) { 95 | word_id++; 96 | test_bits = bits_[word_id]; 97 | if (test_bits > 0) 98 | return (distance + __builtin_clzll(test_bits)); 99 | distance += kWordSize; 100 | } 101 | return distance; 102 | } 103 | 104 | position_t Bitvector::distanceToPrevSetBit (const position_t pos) const { 105 | assert(pos <= num_bits_); 106 | if (pos == 0) return 0; 107 | position_t distance = 1; 108 | 109 | position_t word_id = (pos - 1) / kWordSize; 110 | position_t offset = (pos - 1) % kWordSize; 111 | 112 | //first word left-over bits 113 | word_t test_bits = bits_[word_id] >> (kWordSize - 1 - offset); 114 | if (test_bits > 0) { 115 | return (distance + __builtin_ctzll(test_bits)); 116 | } else { 117 | //if (word_id == 0) 118 | //return (offset + 1); 119 | distance += (offset + 1); 120 | } 121 | 122 | while (word_id > 0) { 123 | word_id--; 124 | test_bits = bits_[word_id]; 125 | if (test_bits > 0) 126 | return (distance + __builtin_ctzll(test_bits)); 127 | distance += kWordSize; 128 | } 129 | return distance; 130 | } 131 | 132 | position_t Bitvector::totalNumBits(const std::vector& num_bits_per_level, 133 | const level_t start_level, 134 | const level_t end_level/* non-inclusive */) { 135 | position_t num_bits = 0; 
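    // Added comment: sum the per-level bit counts over [start_level, end_level);
    // the constructor uses this total to size the single concatenated bit array.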
136 | for (level_t level = start_level; level < end_level; level++) 137 | num_bits += num_bits_per_level[level]; 138 | return num_bits; 139 | } 140 | 141 | void Bitvector::concatenateBitvectors(const std::vector >& bitvector_per_level, 142 | const std::vector& num_bits_per_level, 143 | const level_t start_level, 144 | const level_t end_level/* non-inclusive */) { 145 | position_t bit_shift = 0; 146 | position_t word_id = 0; 147 | for (level_t level = start_level; level < end_level; level++) { 148 | if (num_bits_per_level[level] == 0) continue; 149 | position_t num_complete_words = num_bits_per_level[level] / kWordSize; 150 | for (position_t word = 0; word < num_complete_words; word++) { 151 | bits_[word_id] |= (bitvector_per_level[level][word] >> bit_shift); 152 | word_id++; 153 | if (bit_shift > 0) 154 | bits_[word_id] |= (bitvector_per_level[level][word] << (kWordSize - bit_shift)); 155 | } 156 | 157 | word_t bits_remain = num_bits_per_level[level] - num_complete_words * kWordSize; 158 | if (bits_remain > 0) { 159 | word_t last_word = bitvector_per_level[level][num_complete_words]; 160 | bits_[word_id] |= (last_word >> bit_shift); 161 | if (bit_shift + bits_remain < kWordSize) { 162 | bit_shift += bits_remain; 163 | } else { 164 | word_id++; 165 | bits_[word_id] |= (last_word << (kWordSize - bit_shift)); 166 | bit_shift = bit_shift + bits_remain - kWordSize; 167 | } 168 | } 169 | } 170 | } 171 | 172 | } // namespace surf 173 | 174 | #endif // BITVECTOR_H_ 175 | -------------------------------------------------------------------------------- /include/select.hpp: -------------------------------------------------------------------------------- 1 | #ifndef SELECT_H_ 2 | #define SELECT_H_ 3 | 4 | #include "bitvector.hpp" 5 | 6 | #include 7 | 8 | #include 9 | 10 | #include "config.hpp" 11 | #include "popcount.h" 12 | 13 | namespace surf { 14 | 15 | class BitvectorSelect : public Bitvector { 16 | public: 17 | BitvectorSelect() : sample_interval_(0), num_ones_(0), select_lut_(nullptr) {}; 18 | 19 | BitvectorSelect(const position_t sample_interval, 20 | const std::vector >& bitvector_per_level, 21 | const std::vector& num_bits_per_level, 22 | const level_t start_level = 0, 23 | const level_t end_level = 0/* non-inclusive */) 24 | : Bitvector(bitvector_per_level, num_bits_per_level, start_level, end_level) { 25 | sample_interval_ = sample_interval; 26 | initSelectLut(); 27 | } 28 | 29 | ~BitvectorSelect() {} 30 | 31 | // Returns the postion of the rank-th 1 bit. 32 | // posistion is zero-based; rank is one-based. 33 | // E.g., for bitvector: 100101000, select(3) = 5 34 | position_t select(position_t rank) const { 35 | assert(rank > 0); 36 | assert(rank <= num_ones_ + 1); 37 | position_t lut_idx = rank / sample_interval_; 38 | position_t rank_left = rank % sample_interval_; 39 | // The first slot in select_lut_ stores the position of the first 1 bit. 
40 | // Slot i > 0 stores the position of (i * sample_interval_)-th 1 bit 41 | if (lut_idx == 0) 42 | rank_left--; 43 | 44 | position_t pos = select_lut_[lut_idx]; 45 | 46 | if (rank_left == 0) 47 | return pos; 48 | 49 | position_t word_id = pos / kWordSize; 50 | position_t offset = pos % kWordSize; 51 | if (offset == kWordSize - 1) { 52 | word_id++; 53 | offset = 0; 54 | } else { 55 | offset++; 56 | } 57 | word_t word = bits_[word_id] << offset >> offset; //zero-out most significant bits 58 | position_t ones_count_in_word = popcount(word); 59 | while (ones_count_in_word < rank_left) { 60 | word_id++; 61 | word = bits_[word_id]; 62 | rank_left -= ones_count_in_word; 63 | ones_count_in_word = popcount(word); 64 | } 65 | return (word_id * kWordSize + select64_popcount_search(word, rank_left)); 66 | } 67 | 68 | position_t selectLutSize() const { 69 | return ((num_ones_ / sample_interval_ + 1) * sizeof(position_t)); 70 | } 71 | 72 | position_t serializedSize() const { 73 | position_t size = sizeof(num_bits_) + sizeof(sample_interval_) + sizeof(num_ones_) 74 | + bitsSize() + selectLutSize(); 75 | sizeAlign(size); 76 | return size; 77 | } 78 | 79 | position_t size() const { 80 | return (sizeof(BitvectorSelect) + bitsSize() + selectLutSize()); 81 | } 82 | 83 | position_t numOnes() const { 84 | return num_ones_; 85 | } 86 | 87 | void serialize(char*& dst) const { 88 | memcpy(dst, &num_bits_, sizeof(num_bits_)); 89 | dst += sizeof(num_bits_); 90 | memcpy(dst, &sample_interval_, sizeof(sample_interval_)); 91 | dst += sizeof(sample_interval_); 92 | memcpy(dst, &num_ones_, sizeof(num_ones_)); 93 | dst += sizeof(num_ones_); 94 | memcpy(dst, bits_, bitsSize()); 95 | dst += bitsSize(); 96 | memcpy(dst, select_lut_, selectLutSize()); 97 | dst += selectLutSize(); 98 | align(dst); 99 | } 100 | 101 | static BitvectorSelect* deSerialize(char*& src) { 102 | BitvectorSelect* bv_select = new BitvectorSelect(); 103 | memcpy(&(bv_select->num_bits_), src, sizeof(bv_select->num_bits_)); 104 | src += sizeof(bv_select->num_bits_); 105 | memcpy(&(bv_select->sample_interval_), src, sizeof(bv_select->sample_interval_)); 106 | src += sizeof(bv_select->sample_interval_); 107 | memcpy(&(bv_select->num_ones_), src, sizeof(bv_select->num_ones_)); 108 | src += sizeof(bv_select->num_ones_); 109 | 110 | bv_select->bits_ = new word_t[bv_select->numWords()]; 111 | memcpy(bv_select->bits_, src, bv_select->bitsSize()); 112 | src += bv_select->bitsSize(); 113 | bv_select->select_lut_ = new position_t[bv_select->selectLutSize() / sizeof(position_t)]; 114 | memcpy(bv_select->select_lut_, src, bv_select->selectLutSize()); 115 | src += bv_select->selectLutSize(); 116 | 117 | //bv_select->bits_ = const_cast(reinterpret_cast(src)); 118 | //src += bv_select->bitsSize(); 119 | //bv_select->select_lut_ = const_cast(reinterpret_cast(src)); 120 | //src += bv_select->selectLutSize(); 121 | align(src); 122 | return bv_select; 123 | } 124 | 125 | void destroy() { 126 | delete[] bits_; 127 | delete[] select_lut_; 128 | } 129 | 130 | private: 131 | // This function currently assumes that the first bit in the 132 | // bitvector is one. 
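    // Added explanatory comment: the LUT samples every sample_interval_-th set
    // bit so that select() only has to popcount forward from the nearest sample.
    // Worked example (matching the select() comment above): for bitvector
    // 100101000 with sample_interval_ = 2, select_lut_ = {0, 3}; select(3) then
    // starts at position 3 (the 2nd set bit) and scans one more set bit forward
    // to reach position 5.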
133 | void initSelectLut() { 134 | position_t num_words = num_bits_ / kWordSize; 135 | if (num_bits_ % kWordSize != 0) 136 | num_words++; 137 | 138 | std::vector select_lut_vector; 139 | select_lut_vector.push_back(0); //ASSERT: first bit is 1 140 | position_t sampling_ones = sample_interval_; 141 | position_t cumu_ones_upto_word = 0; 142 | for (position_t i = 0; i < num_words; i++) { 143 | position_t num_ones_in_word = popcount(bits_[i]); 144 | while (sampling_ones <= (cumu_ones_upto_word + num_ones_in_word)) { 145 | int diff = sampling_ones - cumu_ones_upto_word; 146 | position_t result_pos = i * kWordSize + select64_popcount_search(bits_[i], diff); 147 | select_lut_vector.push_back(result_pos); 148 | sampling_ones += sample_interval_; 149 | } 150 | cumu_ones_upto_word += popcount(bits_[i]); 151 | } 152 | 153 | num_ones_ = cumu_ones_upto_word; 154 | position_t num_samples = select_lut_vector.size(); 155 | select_lut_ = new position_t[num_samples]; 156 | for (position_t i = 0; i < num_samples; i++) 157 | select_lut_[i] = select_lut_vector[i]; 158 | } 159 | 160 | private: 161 | position_t sample_interval_; 162 | position_t num_ones_; 163 | position_t* select_lut_; //select look-up table 164 | }; 165 | 166 | } // namespace surf 167 | 168 | #endif // SELECT_H_ 169 | -------------------------------------------------------------------------------- /bench/workload_arf.cpp: -------------------------------------------------------------------------------- 1 | #include "bench.hpp" 2 | #include "ARF.h" 3 | #include "Database.h" 4 | #include "Query.h" 5 | 6 | static const int kARFSize = 70000000; 7 | static const int kInputSize = 10000000; 8 | static const int kTxnSize = 10000000; 9 | static const int kTrainingSize = 2000000; 10 | static const uint64_t kDomain = (ULLONG_MAX / 2 - 1); 11 | static const uint64_t kRangeSize = 922336973116; 12 | 13 | int main(int argc, char *argv[]) { 14 | if (argc != 4) { 15 | std::cout << "Usage:\n"; 16 | std::cout << "1. percentage of keys inserted: 0 < num <= 100\n"; 17 | std::cout << "2. query type: point, range\n"; 18 | std::cout << "3. 
distribution: uniform, zipfian, latest\n"; 19 | return -1; 20 | } 21 | 22 | unsigned percent = atoi(argv[1]); 23 | std::string query_type = argv[2]; 24 | std::string distribution = argv[3]; 25 | 26 | // check args ==================================================== 27 | if (percent > 100) { 28 | std::cout << bench::kRed << "WRONG percentage\n" << bench::kNoColor; 29 | return -1; 30 | } 31 | 32 | if (query_type.compare(std::string("point")) != 0 33 | && query_type.compare(std::string("range")) != 0) { 34 | std::cout << bench::kRed << "WRONG query type\n" << bench::kNoColor; 35 | return -1; 36 | } 37 | 38 | if (distribution.compare(std::string("uniform")) != 0 39 | && distribution.compare(std::string("zipfian")) != 0 40 | && distribution.compare(std::string("latest")) != 0) { 41 | std::cout << bench::kRed << "WRONG distribution\n" << bench::kNoColor; 42 | return -1; 43 | } 44 | 45 | // load keys from files ======================================= 46 | std::string load_file = "workloads/load_randint"; 47 | std::vector load_keys; 48 | bench::loadKeysFromFile(load_file, kInputSize, load_keys); 49 | std::cout << "load_keys size = " << load_keys.size() << "\n"; 50 | 51 | sort(load_keys.begin(), load_keys.end()); 52 | uint64_t max_key = load_keys[load_keys.size() - 1]; 53 | std::cout << std::hex << "max key = " << max_key << std::dec << "\n"; 54 | uint64_t max_gap = load_keys[load_keys.size() - 1] - load_keys[0]; 55 | std::cout << "max gap = " << max_gap << "\n"; 56 | uint64_t avg_gap = max_gap / kInputSize; 57 | std::cout << "avg gap = " << avg_gap << "\n"; 58 | 59 | std::string txn_file = "workloads/txn_randint_"; 60 | txn_file += distribution; 61 | std::vector txn_keys; 62 | bench::loadKeysFromFile(txn_file, kTxnSize, txn_keys); 63 | std::cout << "txn_keys size = " << txn_keys.size() << "\n"; 64 | 65 | std::vector insert_keys; 66 | bench::selectIntKeysToInsert(percent, insert_keys, load_keys); 67 | std::cout << "insert_keys size = " << insert_keys.size() << "\n"; 68 | 69 | // compute upperbound keys for range queries ================= 70 | std::vector upper_bound_keys; 71 | if (query_type.compare(std::string("range")) == 0) { 72 | for (int i = 0; i < (int)txn_keys.size(); i++) { 73 | txn_keys[i]++; 74 | uint64_t upper_bound = txn_keys[i] + kRangeSize; 75 | upper_bound_keys.push_back(upper_bound); 76 | } 77 | } else { 78 | for (int i = 0; i < (int)txn_keys.size(); i++) { 79 | upper_bound_keys.push_back(txn_keys[i]); 80 | } 81 | } 82 | 83 | // create filter ============================================== 84 | arf::Database* db = new arf::Database(insert_keys); 85 | arf::ARF* filter = new arf::ARF(0, kDomain, db); 86 | 87 | // build perfect ARF ========================================== 88 | double start_time = bench::getNow(); 89 | filter->perfect(db); 90 | double end_time = bench::getNow(); 91 | double time_diff = end_time - start_time; 92 | std::cout << "build perfect time = " << time_diff << " s\n"; 93 | 94 | // training =================================================== 95 | start_time = bench::getNow(); 96 | for (int i = 0; i < kTrainingSize; i++) { 97 | if (i % 100000 == 0) 98 | std::cout << "i = " << i << std::endl; 99 | bool qR = db->rangeQuery(txn_keys[i], upper_bound_keys[i]); 100 | filter->handle_query(txn_keys[i], upper_bound_keys[i], qR, true); 101 | } 102 | filter->reset_training_phase(); 103 | filter->truncate(kARFSize); 104 | filter->end_training_phase(); 105 | filter->print_size(); 106 | end_time = bench::getNow(); 107 | time_diff = end_time - start_time; 108 | std::cout << 
"training time = " << time_diff << " s\n"; 109 | std::cout << "training throughput = " << ((kTrainingSize + 0.0) / time_diff) << " txns/s\n"; 110 | 111 | // execute transactions ======================================= 112 | int64_t positives = 0; 113 | start_time = bench::getNow(); 114 | for (int i = kTrainingSize; i < kTxnSize; i++) { 115 | positives += (int)filter->handle_query(txn_keys[i], upper_bound_keys[i], true, false); 116 | } 117 | end_time = bench::getNow(); 118 | time_diff = end_time - start_time; 119 | std::cout << "time = " << time_diff << " s\n"; 120 | std::cout << "throughput = " << bench::kGreen << ((kTrainingSize + 0.0) / time_diff) << bench::kNoColor << " txns/s\n"; 121 | end_time = bench::getNow(); 122 | 123 | // compute true positives ====================================== 124 | int64_t tps = 0; 125 | int64_t tns = 0; 126 | for (int i = kTrainingSize; i < kTxnSize; i++) { 127 | bool dR = db->rangeQuery(txn_keys[i], upper_bound_keys[i]); 128 | if (dR) 129 | tps++; 130 | else 131 | tns++; 132 | } 133 | int64_t fps = positives - tps; 134 | 135 | std::cout << "positives = " << positives << "\n"; 136 | std::cout << "true positives = " << tps << "\n"; 137 | std::cout << "true negatives = " << tns << "\n"; 138 | std::cout << "false positives = " << fps << "\n"; 139 | 140 | double fp_rate = 0; 141 | if (fps >= 0) 142 | fp_rate = fps / (tns + fps + 0.0); 143 | else 144 | std::cout << "ERROR: fps < 0\n"; 145 | std::cout << "False Positive Rate = " << fp_rate << "\n"; 146 | 147 | return 0; 148 | } 149 | -------------------------------------------------------------------------------- /bench/bloom.hpp: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2012 The LevelDB Authors. All rights reserved. 2 | // Use of this source code is governed by a BSD-style license that can be 3 | // found in the LICENSE file. See the AUTHORS file for names of contributors. 
4 | 5 | // Modified by Huanchen, 2018 6 | 7 | #ifndef LEVELDB_BLOOM_H_ 8 | #define LEVELDB_BLOOM_H_ 9 | 10 | #include 11 | #include 12 | 13 | #include 14 | #include 15 | 16 | #include "MurmurHash3.h" 17 | 18 | using namespace std; 19 | 20 | inline uint32_t DecodeFixed32(const char* ptr) { 21 | uint32_t result; 22 | memcpy(&result, ptr, sizeof(result)); // gcc optimizes this to a plain load 23 | return result; 24 | } 25 | 26 | /* 27 | inline uint32_t Hash(const char* data, size_t n, uint32_t seed) { 28 | // Similar to murmur hash 29 | const uint32_t m = 0xc6a4a793; 30 | const uint32_t r = 24; 31 | const char* limit = data + n; 32 | uint32_t h = seed ^ (n * m); 33 | 34 | // Pick up four bytes at a time 35 | while (data + 4 <= limit) { 36 | uint32_t w = DecodeFixed32(data); 37 | data += 4; 38 | h += w; 39 | h *= m; 40 | h ^= (h >> 16); 41 | } 42 | 43 | // Pick up remaining bytes 44 | switch (limit - data) { 45 | case 3: 46 | h += static_cast(data[2]) << 16; 47 | case 2: 48 | h += static_cast(data[1]) << 8; 49 | case 1: 50 | h += static_cast(data[0]); 51 | h *= m; 52 | h ^= (h >> r); 53 | break; 54 | } 55 | return h; 56 | } 57 | */ 58 | static void BloomHash(const string &key, uint32_t* out) { 59 | MurmurHash3_x86_128(key.c_str(), key.size(), 0xbc9f1d34, out); 60 | } 61 | 62 | static void BloomHash(const uint64_t key, uint32_t* out) { 63 | MurmurHash3_x86_128((const char*)(&key), sizeof(uint64_t), 0xbc9f1d34, out); 64 | } 65 | 66 | class BloomFilter { 67 | private: 68 | size_t bits_per_key_; 69 | size_t k_; 70 | 71 | public: 72 | BloomFilter(int bits_per_key) 73 | : bits_per_key_(bits_per_key) { 74 | // We intentionally round down to reduce probing cost a little bit 75 | k_ = static_cast(bits_per_key * 0.69); // 0.69 =~ ln(2) 76 | if (k_ < 1) k_ = 1; 77 | if (k_ > 30) k_ = 30; 78 | } 79 | 80 | void CreateFilter(vector keys, int n, string* dst) const { 81 | // Compute bloom filter size (in both bits and bytes) 82 | size_t bits = n * bits_per_key_; 83 | 84 | // For small n, we can see a very high false positive rate. Fix it 85 | // by enforcing a minimum bloom filter length. 86 | if (bits < 64) bits = 64; 87 | 88 | size_t bytes = (bits + 7) / 8; 89 | bits = bytes * 8; 90 | 91 | const size_t init_size = dst->size(); 92 | dst->resize(init_size + bytes, 0); 93 | dst->push_back(static_cast(k_)); // Remember # of probes in filter 94 | char* array = &(*dst)[init_size]; 95 | for (int i = 0; i < n; i++) { 96 | // Use double-hashing to generate a sequence of hash values. 97 | // See analysis in [Kirsch,Mitzenmacher 2006]. 98 | // uint32_t h = BloomHash(keys[i]); 99 | uint32_t hbase[4]; 100 | BloomHash(keys[i], hbase); 101 | uint32_t h = hbase[0]; 102 | const uint32_t delta = hbase[1]; 103 | for (size_t j = 0; j < k_; j++) { 104 | const uint32_t bitpos = h % bits; 105 | array[bitpos/8] |= (1 << (bitpos % 8)); 106 | h += delta; 107 | } 108 | } 109 | } 110 | 111 | void CreateFilter(vector keys, int n, string* dst) const { 112 | // Compute bloom filter size (in both bits and bytes) 113 | size_t bits = n * bits_per_key_; 114 | 115 | // For small n, we can see a very high false positive rate. Fix it 116 | // by enforcing a minimum bloom filter length. 
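    // Added note: e.g. n = 1000 keys at bits_per_key_ = 10 gives bits = 10000
    // and bytes = 1250; one extra byte is appended below to record the probe
    // count k_, so KeyMayMatch can decode filters built with other parameters.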
117 | if (bits < 64) bits = 64; 118 | 119 | size_t bytes = (bits + 7) / 8; 120 | bits = bytes * 8; 121 | 122 | const size_t init_size = dst->size(); 123 | dst->resize(init_size + bytes, 0); 124 | dst->push_back(static_cast(k_)); // Remember # of probes in filter 125 | char* array = &(*dst)[init_size]; 126 | for (int i = 0; i < n; i++) { 127 | // Use double-hashing to generate a sequence of hash values. 128 | // See analysis in [Kirsch,Mitzenmacher 2006]. 129 | //uint32_t h = BloomHash(keys[i]); 130 | uint32_t hbase[4]; 131 | BloomHash(keys[i], hbase); 132 | uint32_t h = hbase[0]; 133 | const uint32_t delta = hbase[1]; 134 | for (size_t j = 0; j < k_; j++) { 135 | const uint32_t bitpos = h % bits; 136 | array[bitpos/8] |= (1 << (bitpos % 8)); 137 | h += delta; 138 | } 139 | } 140 | } 141 | 142 | bool KeyMayMatch(const string& key, const string& bloom_filter) const { 143 | const size_t len = bloom_filter.size(); 144 | if (len < 2) return false; 145 | 146 | const char* array = bloom_filter.c_str(); 147 | const size_t bits = (len - 1) * 8; 148 | 149 | // Use the encoded k so that we can read filters generated by 150 | // bloom filters created using different parameters. 151 | const size_t k = array[len-1]; 152 | if (k > 30) { 153 | // Reserved for potentially new encodings for short bloom filters. 154 | // Consider it a match. 155 | return true; 156 | } 157 | 158 | uint32_t hbase[4]; 159 | BloomHash(key, hbase); 160 | uint32_t h = hbase[0]; 161 | const uint32_t delta = hbase[1]; 162 | for (size_t j = 0; j < k; j++) { 163 | const uint32_t bitpos = h % bits; 164 | if ((array[bitpos/8] & (1 << (bitpos % 8))) == 0) return false; 165 | h += delta; 166 | } 167 | return true; 168 | } 169 | 170 | bool KeyMayMatch(const uint64_t key, const string& bloom_filter) const { 171 | const size_t len = bloom_filter.size(); 172 | if (len < 2) return false; 173 | 174 | const char* array = bloom_filter.c_str(); 175 | const size_t bits = (len - 1) * 8; 176 | 177 | // Use the encoded k so that we can read filters generated by 178 | // bloom filters created using different parameters. 179 | const size_t k = array[len-1]; 180 | if (k > 30) { 181 | // Reserved for potentially new encodings for short bloom filters. 182 | // Consider it a match. 
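    // Added note: returning true here keeps the filter conservative; an unknown
    // encoding can only add false positives, never a false negative.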
183 | return true; 184 | } 185 | 186 | uint32_t hbase[4]; 187 | BloomHash(key, hbase); 188 | uint32_t h = hbase[0]; 189 | const uint32_t delta = hbase[1]; 190 | for (size_t j = 0; j < k; j++) { 191 | const uint32_t bitpos = h % bits; 192 | if ((array[bitpos/8] & (1 << (bitpos % 8))) == 0) return false; 193 | h += delta; 194 | } 195 | return true; 196 | } 197 | }; 198 | 199 | 200 | #endif // LEVELDB_BLOOM_H_ 201 | -------------------------------------------------------------------------------- /include/label_vector.hpp: -------------------------------------------------------------------------------- 1 | #ifndef LABELVECTOR_H_ 2 | #define LABELVECTOR_H_ 3 | 4 | #include 5 | 6 | #include 7 | 8 | #include "config.hpp" 9 | 10 | namespace surf { 11 | 12 | class LabelVector { 13 | public: 14 | LabelVector() : num_bytes_(0), labels_(nullptr) {}; 15 | 16 | LabelVector(const std::vector >& labels_per_level, 17 | const level_t start_level = 0, 18 | level_t end_level = 0/* non-inclusive */) { 19 | if (end_level == 0) 20 | end_level = labels_per_level.size(); 21 | 22 | num_bytes_ = 1; 23 | for (level_t level = start_level; level < end_level; level++) 24 | num_bytes_ += labels_per_level[level].size(); 25 | 26 | //labels_ = new label_t[num_bytes_]; 27 | position_t alloc_bytes = num_bytes_ * (num_bytes_ / kWordSize + 1); 28 | labels_ = new label_t[alloc_bytes]; 29 | for (position_t i = 0; i < alloc_bytes; i++) 30 | labels_[i] = 0; 31 | 32 | position_t pos = 0; 33 | for (level_t level = start_level; level < end_level; level++) { 34 | for (position_t idx = 0; idx < labels_per_level[level].size(); idx++) { 35 | labels_[pos] = labels_per_level[level][idx]; 36 | pos++; 37 | } 38 | } 39 | } 40 | 41 | ~LabelVector() {} 42 | 43 | position_t getNumBytes() const { 44 | return num_bytes_; 45 | } 46 | 47 | position_t serializedSize() const { 48 | position_t size = sizeof(num_bytes_) + num_bytes_; 49 | sizeAlign(size); 50 | return size; 51 | } 52 | 53 | position_t size() const { 54 | return (sizeof(LabelVector) + num_bytes_); 55 | } 56 | 57 | label_t read(const position_t pos) const { 58 | return labels_[pos]; 59 | } 60 | 61 | label_t operator[](const position_t pos) const { 62 | return labels_[pos]; 63 | } 64 | 65 | bool search(const label_t target, position_t& pos, const position_t search_len) const; 66 | bool searchGreaterThan(const label_t target, position_t& pos, const position_t search_len) const; 67 | 68 | bool binarySearch(const label_t target, position_t& pos, const position_t search_len) const; 69 | bool simdSearch(const label_t target, position_t& pos, const position_t search_len) const; 70 | bool linearSearch(const label_t target, position_t& pos, const position_t search_len) const; 71 | 72 | bool binarySearchGreaterThan(const label_t target, position_t& pos, const position_t search_len) const; 73 | bool linearSearchGreaterThan(const label_t target, position_t& pos, const position_t search_len) const; 74 | 75 | void serialize(char*& dst) const { 76 | memcpy(dst, &num_bytes_, sizeof(num_bytes_)); 77 | dst += sizeof(num_bytes_); 78 | memcpy(dst, labels_, num_bytes_); 79 | dst += num_bytes_; 80 | align(dst); 81 | } 82 | 83 | static LabelVector* deSerialize(char*& src) { 84 | LabelVector* lv = new LabelVector(); 85 | memcpy(&(lv->num_bytes_), src, sizeof(lv->num_bytes_)); 86 | src += sizeof(lv->num_bytes_); 87 | 88 | lv->labels_ = new label_t[lv->num_bytes_]; 89 | memcpy(lv->labels_, src, lv->num_bytes_); 90 | src += lv->num_bytes_; 91 | 92 | //lv->labels_ = const_cast(reinterpret_cast(src)); 93 | //src += 
lv->num_bytes_; 94 | align(src); 95 | return lv; 96 | } 97 | 98 | void destroy() { 99 | delete[] labels_; 100 | } 101 | 102 | private: 103 | position_t num_bytes_; 104 | label_t* labels_; 105 | }; 106 | 107 | bool LabelVector::search(const label_t target, position_t& pos, position_t search_len) const { 108 | //skip terminator label 109 | if ((search_len > 1) && (labels_[pos] == kTerminator)) { 110 | pos++; 111 | search_len--; 112 | } 113 | 114 | if (search_len < 3) 115 | return linearSearch(target, pos, search_len); 116 | if (search_len < 12) 117 | return binarySearch(target, pos, search_len); 118 | else 119 | return simdSearch(target, pos, search_len); 120 | } 121 | 122 | bool LabelVector::searchGreaterThan(const label_t target, position_t& pos, position_t search_len) const { 123 | //skip terminator label 124 | if ((search_len > 1) && (labels_[pos] == kTerminator)) { 125 | pos++; 126 | search_len--; 127 | } 128 | 129 | if (search_len < 3) 130 | return linearSearchGreaterThan(target, pos, search_len); 131 | else 132 | return binarySearchGreaterThan(target, pos, search_len); 133 | } 134 | 135 | bool LabelVector::binarySearch(const label_t target, position_t& pos, const position_t search_len) const { 136 | position_t l = pos; 137 | position_t r = pos + search_len; 138 | while (l < r) { 139 | position_t m = (l + r) >> 1; 140 | if (target < labels_[m]) { 141 | r = m; 142 | } else if (target == labels_[m]) { 143 | pos = m; 144 | return true; 145 | } else { 146 | l = m + 1; 147 | } 148 | } 149 | return false; 150 | } 151 | 152 | bool LabelVector::simdSearch(const label_t target, position_t& pos, const position_t search_len) const { 153 | position_t num_labels_searched = 0; 154 | position_t num_labels_left = search_len; 155 | while ((num_labels_left >> 4) > 0) { 156 | label_t* start_ptr = labels_ + pos + num_labels_searched; 157 | __m128i cmp = _mm_cmpeq_epi8(_mm_set1_epi8(target), 158 | _mm_loadu_si128(reinterpret_cast<__m128i*>(start_ptr))); 159 | unsigned check_bits = _mm_movemask_epi8(cmp); 160 | if (check_bits) { 161 | pos += (num_labels_searched + __builtin_ctz(check_bits)); 162 | return true; 163 | } 164 | num_labels_searched += 16; 165 | num_labels_left -= 16; 166 | } 167 | 168 | if (num_labels_left > 0) { 169 | label_t* start_ptr = labels_ + pos + num_labels_searched; 170 | __m128i cmp = _mm_cmpeq_epi8(_mm_set1_epi8(target), 171 | _mm_loadu_si128(reinterpret_cast<__m128i*>(start_ptr))); 172 | unsigned leftover_bits_mask = (1 << num_labels_left) - 1; 173 | unsigned check_bits = _mm_movemask_epi8(cmp) & leftover_bits_mask; 174 | if (check_bits) { 175 | pos += (num_labels_searched + __builtin_ctz(check_bits)); 176 | return true; 177 | } 178 | } 179 | 180 | return false; 181 | } 182 | 183 | bool LabelVector::linearSearch(const label_t target, position_t& pos, const position_t search_len) const { 184 | for (position_t i = 0; i < search_len; i++) { 185 | if (target == labels_[pos + i]) { 186 | pos += i; 187 | return true; 188 | } 189 | } 190 | return false; 191 | } 192 | 193 | bool LabelVector::binarySearchGreaterThan(const label_t target, position_t& pos, const position_t search_len) const { 194 | position_t l = pos; 195 | position_t r = pos + search_len; 196 | while (l < r) { 197 | position_t m = (l + r) >> 1; 198 | if (target < labels_[m]) { 199 | r = m; 200 | } else if (target == labels_[m]) { 201 | if (m < pos + search_len - 1) { 202 | pos = m + 1; 203 | return true; 204 | } 205 | return false; 206 | } else { 207 | l = m + 1; 208 | } 209 | } 210 | 211 | if (l < pos + search_len) { 212 | 
pos = l; 213 | return true; 214 | } 215 | return false; 216 | } 217 | 218 | bool LabelVector::linearSearchGreaterThan(const label_t target, position_t& pos, const position_t search_len) const { 219 | for (position_t i = 0; i < search_len; i++) { 220 | if (labels_[pos + i] > target) { 221 | pos += i; 222 | return true; 223 | } 224 | } 225 | return false; 226 | } 227 | 228 | } // namespace surf 229 | 230 | #endif // LABELVECTOR_H_ 231 | -------------------------------------------------------------------------------- /test/unitTest/test_bitvector.cpp: -------------------------------------------------------------------------------- 1 | #include "gtest/gtest.h" 2 | 3 | #include 4 | 5 | #include 6 | #include 7 | #include 8 | 9 | #include "bitvector.hpp" 10 | #include "config.hpp" 11 | #include "surf_builder.hpp" 12 | 13 | namespace surf { 14 | 15 | namespace bitvectortest { 16 | 17 | static const std::string kFilePath = "../../../test/words.txt"; 18 | static const int kTestSize = 234369; 19 | static std::vector words; 20 | 21 | class BitvectorUnitTest : public ::testing::Test { 22 | public: 23 | virtual void SetUp () { 24 | bool include_dense = true; 25 | uint32_t sparse_dense_ratio = 0; 26 | level_t suffix_len = 8; 27 | builder_ = new SuRFBuilder(include_dense, sparse_dense_ratio, kReal, 0, suffix_len); 28 | num_items_ = 0; 29 | } 30 | virtual void TearDown () { 31 | delete builder_; 32 | delete bv_; 33 | delete bv2_; 34 | delete bv3_; 35 | delete bv4_; 36 | delete bv5_; 37 | } 38 | 39 | void setupWordsTest(); 40 | 41 | SuRFBuilder* builder_; 42 | Bitvector* bv_; // sparse: child indicator bits 43 | Bitvector* bv2_; // sparse: louds bits 44 | Bitvector* bv3_; // dense: label bitmap 45 | Bitvector* bv4_; // dense: child indicator bitmap 46 | Bitvector* bv5_; // dense: prefixkey indicator bits 47 | std::vector num_items_per_level_; // sparse 48 | position_t num_items_; // sparse 49 | std::vector num_bits_per_level_; // dense 50 | }; 51 | 52 | void BitvectorUnitTest::setupWordsTest() { 53 | builder_->build(words); 54 | for (level_t level = 0; level < builder_->getTreeHeight(); level++) 55 | num_items_per_level_.push_back(builder_->getLabels()[level].size()); 56 | for (level_t level = 0; level < num_items_per_level_.size(); level++) 57 | num_items_ += num_items_per_level_[level]; 58 | bv_ = new Bitvector(builder_->getChildIndicatorBits(), num_items_per_level_); 59 | bv2_ = new Bitvector(builder_->getLoudsBits(), num_items_per_level_); 60 | 61 | for (level_t level = 0; level < builder_->getTreeHeight(); level++) 62 | num_bits_per_level_.push_back(builder_->getBitmapLabels()[level].size() * kWordSize); 63 | bv3_ = new Bitvector(builder_->getBitmapLabels(), num_bits_per_level_); 64 | bv4_ = new Bitvector(builder_->getBitmapChildIndicatorBits(), num_bits_per_level_); 65 | bv5_ = new Bitvector(builder_->getPrefixkeyIndicatorBits(), builder_->getNodeCounts()); 66 | } 67 | 68 | TEST_F (BitvectorUnitTest, readBitTest) { 69 | setupWordsTest(); 70 | 71 | position_t bv_pos = 0; 72 | int node_num = -1; 73 | label_t prev_label = 0; 74 | position_t bv5_pos = 0; 75 | for (level_t level = 0; level < builder_->getTreeHeight(); level++) { 76 | for (position_t pos = 0; pos < num_items_per_level_[level]; pos++) { 77 | // bv test 78 | bool has_child = SuRFBuilder::readBit(builder_->getChildIndicatorBits()[level], pos); 79 | bool bv_bit = bv_->readBit(bv_pos); 80 | ASSERT_EQ(has_child, bv_bit); 81 | 82 | // bv2 test 83 | bool is_node_start = SuRFBuilder::readBit(builder_->getLoudsBits()[level], pos); 84 | bv_bit = 
bv2_->readBit(bv_pos); 85 | ASSERT_EQ(is_node_start, bv_bit); 86 | 87 | bv_pos++; 88 | 89 | if (is_node_start) 90 | node_num++; 91 | 92 | // bv5 test 93 | bool is_terminator = false; 94 | if (is_node_start) { 95 | is_terminator = (builder_->getLabels()[level][pos] == kTerminator) 96 | && !SuRFBuilder::readBit(builder_->getChildIndicatorBits()[level], pos); 97 | bv_bit = bv5_->readBit(bv5_pos); 98 | ASSERT_EQ(is_terminator, bv_bit); 99 | bv5_pos++; 100 | } 101 | 102 | if (is_terminator) { 103 | for (unsigned c = prev_label + 1; c < kFanout; c++) { 104 | bool bv3_bit = bv3_->readBit((node_num - 1) * kFanout + c); 105 | ASSERT_FALSE(bv3_bit); 106 | bool bv4_bit = bv4_->readBit((node_num - 1) * kFanout + c); 107 | ASSERT_FALSE(bv4_bit); 108 | } 109 | prev_label = '\255'; 110 | continue; 111 | } 112 | 113 | // bv3 test 114 | label_t label = builder_->getLabels()[level][pos]; 115 | bool bv3_bit = bv3_->readBit(node_num * kFanout + label); 116 | ASSERT_TRUE(bv3_bit); 117 | 118 | // bv4 test 119 | bool bv4_bit = bv4_->readBit(node_num * kFanout + label); 120 | ASSERT_EQ(has_child, bv4_bit); 121 | 122 | // bv3 bv4 zero bit test 123 | if (is_node_start) { 124 | if (node_num > 0) { 125 | for (unsigned c = prev_label + 1; c < kFanout; c++) { 126 | bv3_bit = bv3_->readBit((node_num - 1) * kFanout + c); 127 | ASSERT_FALSE(bv3_bit); 128 | bv4_bit = bv4_->readBit((node_num - 1) * kFanout + c); 129 | ASSERT_FALSE(bv4_bit); 130 | } 131 | } 132 | for (unsigned c = 0; c < (unsigned)label; c++) { 133 | bv3_bit = bv3_->readBit(node_num * kFanout + c); 134 | ASSERT_FALSE(bv3_bit); 135 | bv4_bit = bv4_->readBit(node_num * kFanout + c); 136 | ASSERT_FALSE(bv4_bit); 137 | } 138 | } else { 139 | for (unsigned c = prev_label + 1; c < (unsigned)label; c++) { 140 | bv3_bit = bv3_->readBit(node_num * kFanout + c); 141 | ASSERT_FALSE(bv3_bit); 142 | bv4_bit = bv4_->readBit(node_num * kFanout + c); 143 | ASSERT_FALSE(bv4_bit); 144 | } 145 | } 146 | prev_label = label; 147 | } 148 | } 149 | 150 | } 151 | 152 | TEST_F (BitvectorUnitTest, distanceToNextSetBitTest) { 153 | setupWordsTest(); 154 | std::vector distanceVector; 155 | position_t distance = 1; 156 | for (position_t pos = 1; pos < num_items_; pos++) { 157 | if (bv2_->readBit(pos)) { 158 | while (distance > 0) { 159 | distanceVector.push_back(distance); 160 | distance--; 161 | } 162 | distance = 1; 163 | } 164 | else { 165 | distance++; 166 | } 167 | } 168 | while (distance > 0) { 169 | distanceVector.push_back(distance); 170 | distance--; 171 | } 172 | 173 | for (position_t pos = 0; pos < num_items_; pos++) { 174 | distance = bv2_->distanceToNextSetBit(pos); 175 | ASSERT_EQ(distanceVector[pos], distance); 176 | } 177 | } 178 | 179 | TEST_F (BitvectorUnitTest, distanceToPrevSetBitTest) { 180 | setupWordsTest(); 181 | std::vector distanceVector; 182 | for (position_t pos = 0; pos < num_items_; pos++) 183 | distanceVector.push_back(0); 184 | 185 | position_t distance = 1; 186 | for (position_t pos = num_items_ - 2; pos > 0; pos--) { 187 | if (bv2_->readBit(pos)) { 188 | for (position_t i = 1; i <= distance; i++) 189 | distanceVector[pos + i] = i; 190 | distance = 1; 191 | } 192 | else { 193 | distance++; 194 | } 195 | } 196 | if (bv2_->readBit(0)) { 197 | for (position_t i = 1; i <= distance; i++) 198 | distanceVector[i] = i; 199 | } else { 200 | distance++; 201 | for (position_t i = 1; i <= distance; i++) 202 | distanceVector[i - 1] = i; 203 | } 204 | 205 | for (position_t pos = 0; pos < num_items_; pos++) { 206 | distance = bv2_->distanceToPrevSetBit(pos); 207 | 
ASSERT_EQ(distanceVector[pos], distance); 208 | } 209 | } 210 | 211 | void loadWordList() { 212 | std::ifstream infile(kFilePath); 213 | std::string key; 214 | int count = 0; 215 | while (infile.good() && count < kTestSize) { 216 | infile >> key; 217 | words.push_back(key); 218 | count++; 219 | } 220 | } 221 | 222 | } // namespace bitvectortest 223 | 224 | } // namespace surf 225 | 226 | int main (int argc, char** argv) { 227 | ::testing::InitGoogleTest(&argc, argv); 228 | surf::bitvectortest::loadWordList(); 229 | return RUN_ALL_TESTS(); 230 | } 231 | -------------------------------------------------------------------------------- /bench/workload_gen/workload_spec/workload_template: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2012-2016 YCSB contributors. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); you 4 | # may not use this file except in compliance with the License. You 5 | # may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 12 | # implied. See the License for the specific language governing 13 | # permissions and limitations under the License. See accompanying 14 | # LICENSE file. 15 | 16 | # Yahoo! Cloud System Benchmark 17 | # Workload Template: Default Values 18 | # 19 | # File contains all properties that can be set to define a 20 | # YCSB session. All properties are set to their default 21 | # value if one exists. If not, the property is commented 22 | # out. When a property has a finite number of settings, 23 | # the default is enabled and the alternates are shown in 24 | # comments below it. 25 | # 26 | # Use of most explained through comments in Client.java or 27 | # CoreWorkload.java or on the YCSB wiki page: 28 | # https://github.com/brianfrankcooper/YCSB/wiki/Core-Properties 29 | 30 | # The name of the workload class to use 31 | workload=com.yahoo.ycsb.workloads.CoreWorkload 32 | 33 | # There is no default setting for recordcount but it is 34 | # required to be set. 35 | # The number of records in the table to be inserted in 36 | # the load phase or the number of records already in the 37 | # table before the run phase. 38 | recordcount=1000000 39 | 40 | # There is no default setting for operationcount but it is 41 | # required to be set. 42 | # The number of operations to use during the run phase. 43 | operationcount=3000000 44 | 45 | # The number of insertions to do, if different from recordcount. 46 | # Used with insertstart to grow an existing table. 
47 | #insertcount= 48 | 49 | # The offset of the first insertion 50 | insertstart=0 51 | 52 | # The number of fields in a record 53 | fieldcount=10 54 | 55 | # The size of each field (in bytes) 56 | fieldlength=100 57 | 58 | # Should read all fields 59 | readallfields=true 60 | 61 | # Should write all fields on update 62 | writeallfields=false 63 | 64 | # The distribution used to choose the length of a field 65 | fieldlengthdistribution=constant 66 | #fieldlengthdistribution=uniform 67 | #fieldlengthdistribution=zipfian 68 | 69 | # What proportion of operations are reads 70 | readproportion=0.95 71 | 72 | # What proportion of operations are updates 73 | updateproportion=0.05 74 | 75 | # What proportion of operations are inserts 76 | insertproportion=0 77 | 78 | # What proportion of operations read then modify a record 79 | readmodifywriteproportion=0 80 | 81 | # What proportion of operations are scans 82 | scanproportion=0 83 | 84 | # On a single scan, the maximum number of records to access 85 | maxscanlength=1000 86 | 87 | # The distribution used to choose the number of records to access on a scan 88 | scanlengthdistribution=uniform 89 | #scanlengthdistribution=zipfian 90 | 91 | # Should records be inserted in order or pseudo-randomly 92 | insertorder=hashed 93 | #insertorder=ordered 94 | 95 | # The distribution of requests across the keyspace 96 | requestdistribution=zipfian 97 | #requestdistribution=uniform 98 | #requestdistribution=latest 99 | 100 | # Percentage of data items that constitute the hot set 101 | hotspotdatafraction=0.2 102 | 103 | # Percentage of operations that access the hot set 104 | hotspotopnfraction=0.8 105 | 106 | # Maximum execution time in seconds 107 | #maxexecutiontime= 108 | 109 | # The name of the database table to run queries against 110 | table=usertable 111 | 112 | # The column family of fields (required by some databases) 113 | #columnfamily= 114 | 115 | # How the latency measurements are presented 116 | measurementtype=histogram 117 | #measurementtype=timeseries 118 | #measurementtype=raw 119 | # When measurementtype is set to raw, measurements will be output 120 | # as RAW datapoints in the following csv format: 121 | # "operation, timestamp of the measurement, latency in us" 122 | # 123 | # Raw datapoints are collected in-memory while the test is running. Each 124 | # data point consumes about 50 bytes (including java object overhead). 125 | # For a typical run of 1 million to 10 million operations, this should 126 | # fit into memory most of the time. If you plan to do 100s of millions of 127 | # operations per run, consider provisioning a machine with larger RAM when using 128 | # the RAW measurement type, or split the run into multiple runs. 129 | # 130 | # Optionally, you can specify an output file to save raw datapoints. 131 | # Otherwise, raw datapoints will be written to stdout. 132 | # The output file will be appended to if it already exists, otherwise 133 | # a new output file will be created. 134 | #measurement.raw.output_file = /tmp/your_output_file_for_this_run 135 | 136 | # JVM Reporting. 137 | # 138 | # Measure JVM information over time including GC counts, max and min memory 139 | # used, max and min thread counts, max and min system load and others. This 140 | # setting must be enabled in conjunction with the "-s" flag to run the status 141 | # thread. Every "status.interval", the status thread will capture JVM 142 | # statistics and record the results. At the end of the run, max and mins will 143 | # be recorded. 
144 | # measurement.trackjvm = false 145 | 146 | # The range of latencies to track in the histogram (milliseconds) 147 | histogram.buckets=1000 148 | 149 | # Granularity for time series (in milliseconds) 150 | timeseries.granularity=1000 151 | 152 | # Latency reporting. 153 | # 154 | # YCSB records latency of failed operations separately from successful ones. 155 | # Latency of all OK operations will be reported under their operation name, 156 | # such as [READ], [UPDATE], etc. 157 | # 158 | # For failed operations: 159 | # By default we don't track latency numbers of specific error status. 160 | # We just report latency of all failed operation under one measurement name 161 | # such as [READ-FAILED]. But optionally, user can configure to have either: 162 | # 1. Record and report latency for each and every error status code by 163 | # setting reportLatencyForEachError to true, or 164 | # 2. Record and report latency for a select set of error status codes by 165 | # providing a CSV list of Status codes via the "latencytrackederrors" 166 | # property. 167 | # reportlatencyforeacherror=false 168 | # latencytrackederrors="" 169 | 170 | # Insertion error retry for the core workload. 171 | # 172 | # By default, the YCSB core workload does not retry any operations. 173 | # However, during the load process, if any insertion fails, the entire 174 | # load process is terminated. 175 | # If a user desires to have more robust behavior during this phase, they can 176 | # enable retry for insertion by setting the following property to a positive 177 | # number. 178 | # core_workload_insertion_retry_limit = 0 179 | # 180 | # the following number controls the interval between retries (in seconds): 181 | # core_workload_insertion_retry_interval = 3 182 | 183 | # Distributed Tracing via Apache HTrace (http://htrace.incubator.apache.org/) 184 | # 185 | # Defaults to blank / no tracing 186 | # Below sends to a local file, sampling at 0.1% 187 | # 188 | # htrace.sampler.classes=ProbabilitySampler 189 | # htrace.sampler.fraction=0.001 190 | # htrace.span.receiver.classes=org.apache.htrace.core.LocalFileSpanReceiver 191 | # htrace.local.file.span.receiver.path=/some/path/to/local/file 192 | # 193 | # To capture all spans, use the AlwaysSampler 194 | # 195 | # htrace.sampler.classes=AlwaysSampler 196 | # 197 | # To send spans to an HTraced receiver, use the below and ensure 198 | # your classpath contains the htrace-htraced jar (i.e. when invoking the ycsb 199 | # command add -cp /path/to/htrace-htraced.jar) 200 | # 201 | # htrace.span.receiver.classes=org.apache.htrace.impl.HTracedSpanReceiver 202 | # htrace.htraced.receiver.address=example.com:9075 203 | # htrace.htraced.error.log.period.ms=10000 204 | -------------------------------------------------------------------------------- /include/popcount.h: -------------------------------------------------------------------------------- 1 | /* -*- Mode: C++; c-basic-offset: 4; indent-tabs-mode: nil -*- */ 2 | #ifndef _FASTRANK_POPCOUNT_H_ 3 | #define _FASTRANK_POPCOUNT_H_ 4 | 5 | #include 6 | #include 7 | #include 8 | 9 | namespace surf { 10 | 11 | #define L8 0x0101010101010101ULL // Every lowest 8th bit set: 00000001... 12 | #define G2 0xAAAAAAAAAAAAAAAAULL // Every highest 2nd bit: 101010... 13 | #define G4 0x3333333333333333ULL // 00110011 ... used to group the sum of 4 bits. 
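// Added comment: the remaining masks follow the same pattern; G8 keeps the low
// nibble of every byte (collapsing nibble sums into per-byte sums) and H8 marks
// the high bit of every byte.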
14 | #define G8 0x0F0F0F0F0F0F0F0FULL 15 | #define H8 0x8080808080808080ULL 16 | #define L9 0x0040201008040201ULL 17 | #define H9 (L9 << 8) 18 | #define L16 0x0001000100010001ULL 19 | #define H16 0x8000800080008000ULL 20 | 21 | #define ONES_STEP_4 ( 0x1111111111111111ULL ) 22 | #define ONES_STEP_8 ( 0x0101010101010101ULL ) 23 | #define ONES_STEP_9 ( 1ULL << 0 | 1ULL << 9 | 1ULL << 18 | 1ULL << 27 | 1ULL << 36 | 1ULL << 45 | 1ULL << 54 ) 24 | #define ONES_STEP_16 ( 1ULL << 0 | 1ULL << 16 | 1ULL << 32 | 1ULL << 48 ) 25 | #define MSBS_STEP_4 ( 0x8ULL * ONES_STEP_4 ) 26 | #define MSBS_STEP_8 ( 0x80ULL * ONES_STEP_8 ) 27 | #define MSBS_STEP_9 ( 0x100ULL * ONES_STEP_9 ) 28 | #define MSBS_STEP_16 ( 0x8000ULL * ONES_STEP_16 ) 29 | #define INCR_STEP_8 ( 0x80ULL << 56 | 0x40ULL << 48 | 0x20ULL << 40 | 0x10ULL << 32 | 0x8ULL << 24 | 0x4ULL << 16 | 0x2ULL << 8 | 0x1 ) 30 | 31 | #define ONES_STEP_32 ( 0x0000000100000001ULL ) 32 | #define MSBS_STEP_32 ( 0x8000000080000000ULL ) 33 | 34 | #define COMPARE_STEP_8(x,y) ( ( ( ( ( (x) | MSBS_STEP_8 ) - ( (y) & ~MSBS_STEP_8 ) ) ^ (x) ^ ~(y) ) & MSBS_STEP_8 ) >> 7 ) 35 | #define LEQ_STEP_8(x,y) ( ( ( ( ( (y) | MSBS_STEP_8 ) - ( (x) & ~MSBS_STEP_8 ) ) ^ (x) ^ (y) ) & MSBS_STEP_8 ) >> 7 ) 36 | 37 | #define UCOMPARE_STEP_9(x,y) ( ( ( ( ( ( (x) | MSBS_STEP_9 ) - ( (y) & ~MSBS_STEP_9 ) ) | ( x ^ y ) ) ^ ( x | ~y ) ) & MSBS_STEP_9 ) >> 8 ) 38 | #define UCOMPARE_STEP_16(x,y) ( ( ( ( ( ( (x) | MSBS_STEP_16 ) - ( (y) & ~MSBS_STEP_16 ) ) | ( x ^ y ) ) ^ ( x | ~y ) ) & MSBS_STEP_16 ) >> 15 ) 39 | #define ULEQ_STEP_9(x,y) ( ( ( ( ( ( (y) | MSBS_STEP_9 ) - ( (x) & ~MSBS_STEP_9 ) ) | ( x ^ y ) ) ^ ( x & ~y ) ) & MSBS_STEP_9 ) >> 8 ) 40 | #define ULEQ_STEP_16(x,y) ( ( ( ( ( ( (y) | MSBS_STEP_16 ) - ( (x) & ~MSBS_STEP_16 ) ) | ( x ^ y ) ) ^ ( x & ~y ) ) & MSBS_STEP_16 ) >> 15 ) 41 | #define ZCOMPARE_STEP_8(x) ( ( ( x | ( ( x | MSBS_STEP_8 ) - ONES_STEP_8 ) ) & MSBS_STEP_8 ) >> 7 ) 42 | 43 | // Population count of a 64 bit integer in SWAR (SIMD within a register) style 44 | // From Sebastiano Vigna, "Broadword Implementation of Rank/Select Queries" 45 | // http://sux.dsi.unimi.it/paper.pdf p4 46 | // This variant uses multiplication for the last summation instead of 47 | // continuing the shift/mask/addition chain. 48 | inline int suxpopcount(uint64_t x) { 49 | // Step 1: 00 - 00 = 0; 01 - 00 = 01; 10 - 01 = 01; 11 - 01 = 10; 50 | x = x - ((x & G2) >> 1); 51 | // step 2: add 2 groups of 2. 52 | x = (x & G4) + ((x >> 2) & G4); 53 | // 2 groups of 4. 54 | x = (x + (x >> 4)) & G8; 55 | // Using a multiply to collect the 8 groups of 8 together. 56 | x = x * L8 >> 56; 57 | return x; 58 | } 59 | 60 | // Default to using the GCC builtin popcount. On architectures 61 | // with -march popcnt, this compiles to a single popcnt instruction. 
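// Added sanity-check sketch (not part of the original header): suxpopcount
// should agree with the builtin selected by the popcount macro defined just
// below. This helper is illustrative only and is not used by SuRF itself.
inline bool popcountSelfTest() {
    const uint64_t samples[] = {0ULL, 1ULL, 0xFFULL, G2, L8,
                                0x8000000000000000ULL, 0x0123456789ABCDEFULL};
    for (uint64_t x : samples) {
        if (suxpopcount(x) != __builtin_popcountll(x))
            return false;
    }
    return true;
}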
62 | #ifndef popcount 63 | #define popcount __builtin_popcountll 64 | //#define popcount suxpopcount 65 | #endif 66 | 67 | #define popcountsize 64ULL 68 | #define popcountmask (popcountsize - 1) 69 | 70 | inline uint64_t popcountLinear(uint64_t *bits, uint64_t x, uint64_t nbits) { 71 | if (nbits == 0) { return 0; } 72 | uint64_t lastword = (nbits - 1) / popcountsize; 73 | uint64_t p = 0; 74 | 75 | __builtin_prefetch(bits + x + 7, 0); //huanchen 76 | for (uint64_t i = 0; i < lastword; i++) { /* tested; manually unrolling doesn't help, at least in C */ 77 | //__builtin_prefetch(bits + x + i + 3, 0); 78 | p += popcount(bits[x+i]); // note that use binds us to 64 bit popcount impls 79 | } 80 | 81 | // 'nbits' may or may not fall on a multiple of 64 boundary, 82 | // so we may need to zero out the right side of the last word 83 | // (accomplished by shifting it right, since we're just popcounting) 84 | uint64_t lastshifted = bits[x+lastword] >> (63 - ((nbits - 1) & popcountmask)); 85 | p += popcount(lastshifted); 86 | return p; 87 | } 88 | 89 | // Return the index of the kth bit set in x 90 | inline int select64_naive(uint64_t x, int k) { 91 | int count = -1; 92 | for (int i = 63; i >= 0; i--) { 93 | count++; 94 | if (x & (1ULL << i)) { 95 | k--; 96 | if (k == 0) { 97 | return count; 98 | } 99 | } 100 | } 101 | return -1; 102 | } 103 | 104 | inline int select64_popcount_search(uint64_t x, int k) { 105 | int loc = -1; 106 | // if (k > popcount(x)) { return -1; } 107 | 108 | for (int testbits = 32; testbits > 0; testbits >>= 1) { 109 | int lcount = popcount(x >> testbits); 110 | if (k > lcount) { 111 | x &= ((1ULL << testbits)-1); 112 | loc += testbits; 113 | k -= lcount; 114 | } else { 115 | x >>= testbits; 116 | } 117 | } 118 | return loc+k; 119 | } 120 | 121 | inline int select64_broadword(uint64_t x, int k) { 122 | uint64_t word = x; 123 | int residual = k; 124 | register uint64_t byte_sums; 125 | 126 | byte_sums = word - ( ( word & 0xa * ONES_STEP_4 ) >> 1 ); 127 | byte_sums = ( byte_sums & 3 * ONES_STEP_4 ) + ( ( byte_sums >> 2 ) & 3 * ONES_STEP_4 ); 128 | byte_sums = ( byte_sums + ( byte_sums >> 4 ) ) & 0x0f * ONES_STEP_8; 129 | byte_sums *= ONES_STEP_8; 130 | 131 | // Phase 2: compare each byte sum with the residual 132 | const uint64_t residual_step_8 = residual * ONES_STEP_8; 133 | const int place = ( LEQ_STEP_8( byte_sums, residual_step_8 ) * ONES_STEP_8 >> 53 ) & ~0x7; 134 | 135 | // Phase 3: Locate the relevant byte and make 8 copies with incremental masks 136 | const int byte_rank = residual - ( ( ( byte_sums << 8 ) >> place ) & 0xFF ); 137 | 138 | const uint64_t spread_bits = ( word >> place & 0xFF ) * ONES_STEP_8 & INCR_STEP_8; 139 | const uint64_t bit_sums = ZCOMPARE_STEP_8( spread_bits ) * ONES_STEP_8; 140 | 141 | // Compute the inside-byte location and return the sum 142 | const uint64_t byte_rank_step_8 = byte_rank * ONES_STEP_8; 143 | 144 | return place + ( LEQ_STEP_8( bit_sums, byte_rank_step_8 ) * ONES_STEP_8 >> 56 ); 145 | } 146 | 147 | inline int select64(uint64_t x, int k) { 148 | return select64_popcount_search(x, k); 149 | } 150 | 151 | // x is the starting offset of the 512 bits; 152 | // k is the thing we're selecting for. 
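// (Added clarification: k is 1-based; the return value is the bit offset within
// the 512-bit block, counted from the most significant bit of bits[x], or -1 if
// the block contains fewer than k set bits.)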
153 | inline int select512(uint64_t *bits, int x, int k) { 154 | __asm__ __volatile__ ( 155 | "prefetchnta (%0)\n" 156 | : : "r" (&bits[x]) ); 157 | int i = 0; 158 | int pop = popcount(bits[x+i]); 159 | while (k > pop && i < 7) { 160 | k -= pop; 161 | i++; 162 | pop = popcount(bits[x+i]); 163 | } 164 | if (i == 7 && popcount(bits[x+i]) < k) { 165 | return -1; 166 | } 167 | // We're now certain that the bit we want is stored in bv[x+i] 168 | return i*64 + select64(bits[x+i], k); 169 | } 170 | 171 | // brute-force linear select 172 | // x is the starting offset of the bits in bv; 173 | // k is the thing we're selecting for (starting from bv[x]). 174 | // bvlen is the total length of bv 175 | inline uint64_t selectLinear(uint64_t* bits, uint64_t length, uint64_t x, uint64_t k) { 176 | if (k > (length - x) * 64) 177 | return -1; 178 | uint64_t i = 0; 179 | uint64_t pop = popcount(bits[x+i]); 180 | while (k > pop && i < (length - 1)) { 181 | k -= pop; 182 | i++; 183 | pop = popcount(bits[x+i]); 184 | } 185 | if ((i == length - 1) && (pop < k)) { 186 | return -1; 187 | } 188 | // We're now certain that the bit we want is stored in bits[x+i] 189 | return i*64 + select64(bits[x+i], k); 190 | } 191 | 192 | } // namespace surf 193 | 194 | #endif /* _FASTRANK_POPCOUNT_H_ */ 195 | -------------------------------------------------------------------------------- /bench/MurmurHash3.h: -------------------------------------------------------------------------------- 1 | //----------------------------------------------------------------------------- 2 | // MurmurHash3 was written by Austin Appleby, and is placed in the public 3 | // domain. The author hereby disclaims copyright to this source code. 4 | 5 | #ifndef _MURMURHASH3_H_ 6 | #define _MURMURHASH3_H_ 7 | 8 | //----------------------------------------------------------------------------- 9 | // Platform-specific functions and macros 10 | 11 | // Microsoft Visual Studio 12 | 13 | //typedef unsigned char uint8_t; 14 | //typedef unsigned int uint32_t; 15 | //typedef unsigned __int64 uint64_t; 16 | 17 | // Other compilers 18 | 19 | #include 20 | #include 21 | 22 | // Other compilers 23 | 24 | #define FORCE_INLINE inline __attribute__((always_inline)) 25 | 26 | inline uint32_t rotl32 ( uint32_t x, int8_t r ) 27 | { 28 | return (x << r) | (x >> (32 - r)); 29 | } 30 | 31 | inline uint64_t rotl64 ( uint64_t x, int8_t r ) 32 | { 33 | return (x << r) | (x >> (64 - r)); 34 | } 35 | 36 | #define ROTL32(x,y)rotl32(x,y) 37 | #define ROTL64(x,y)rotl64(x,y) 38 | 39 | #define BIG_CONSTANT(x) (x##LLU) 40 | 41 | //----------------------------------------------------------------------------- 42 | // Block read - if your platform needs to do endian-swapping or can only 43 | // handle aligned reads, do the conversion here 44 | 45 | FORCE_INLINE uint32_t getblock32 ( const uint32_t * p, int i ) 46 | { 47 | return p[i]; 48 | } 49 | 50 | FORCE_INLINE uint64_t getblock64 ( const uint64_t * p, int i ) 51 | { 52 | return p[i]; 53 | } 54 | 55 | //----------------------------------------------------------------------------- 56 | // Finalization mix - force all bits of a hash block to avalanche 57 | 58 | FORCE_INLINE uint32_t fmix32 ( uint32_t h ) 59 | { 60 | h ^= h >> 16; 61 | h *= 0x85ebca6b; 62 | h ^= h >> 13; 63 | h *= 0xc2b2ae35; 64 | h ^= h >> 16; 65 | 66 | return h; 67 | } 68 | 69 | //---------- 70 | 71 | FORCE_INLINE uint64_t fmix64 ( uint64_t k ) 72 | { 73 | k ^= k >> 33; 74 | k *= BIG_CONSTANT(0xff51afd7ed558ccd); 75 | k ^= k >> 33; 76 | k *= 
BIG_CONSTANT(0xc4ceb9fe1a85ec53); 77 | k ^= k >> 33; 78 | 79 | return k; 80 | } 81 | 82 | //----------------------------------------------------------------------------- 83 | 84 | void MurmurHash3_x86_32 ( const void * key, int len, 85 | uint32_t seed, void * out ) 86 | { 87 | const uint8_t * data = (const uint8_t*)key; 88 | const int nblocks = len / 4; 89 | 90 | uint32_t h1 = seed; 91 | 92 | const uint32_t c1 = 0xcc9e2d51; 93 | const uint32_t c2 = 0x1b873593; 94 | 95 | //---------- 96 | // body 97 | 98 | const uint32_t * blocks = (const uint32_t *)(data + nblocks*4); 99 | 100 | for(int i = -nblocks; i; i++) 101 | { 102 | uint32_t k1 = getblock32(blocks,i); 103 | 104 | k1 *= c1; 105 | k1 = ROTL32(k1,15); 106 | k1 *= c2; 107 | 108 | h1 ^= k1; 109 | h1 = ROTL32(h1,13); 110 | h1 = h1*5+0xe6546b64; 111 | } 112 | 113 | //---------- 114 | // tail 115 | 116 | const uint8_t * tail = (const uint8_t*)(data + nblocks*4); 117 | 118 | uint32_t k1 = 0; 119 | 120 | switch(len & 3) 121 | { 122 | case 3: k1 ^= tail[2] << 16; 123 | case 2: k1 ^= tail[1] << 8; 124 | case 1: k1 ^= tail[0]; 125 | k1 *= c1; k1 = ROTL32(k1,15); k1 *= c2; h1 ^= k1; 126 | }; 127 | 128 | //---------- 129 | // finalization 130 | 131 | h1 ^= len; 132 | 133 | h1 = fmix32(h1); 134 | 135 | *(uint32_t*)out = h1; 136 | } 137 | 138 | //----------------------------------------------------------------------------- 139 | 140 | void MurmurHash3_x86_128 ( const void * key, const int len, 141 | uint32_t seed, void * out ) 142 | { 143 | const uint8_t * data = (const uint8_t*)key; 144 | const int nblocks = len / 16; 145 | 146 | uint32_t h1 = seed; 147 | uint32_t h2 = seed; 148 | uint32_t h3 = seed; 149 | uint32_t h4 = seed; 150 | 151 | const uint32_t c1 = 0x239b961b; 152 | const uint32_t c2 = 0xab0e9789; 153 | const uint32_t c3 = 0x38b34ae5; 154 | const uint32_t c4 = 0xa1e38b93; 155 | 156 | //---------- 157 | // body 158 | 159 | const uint32_t * blocks = (const uint32_t *)(data + nblocks*16); 160 | 161 | for(int i = -nblocks; i; i++) 162 | { 163 | uint32_t k1 = getblock32(blocks,i*4+0); 164 | uint32_t k2 = getblock32(blocks,i*4+1); 165 | uint32_t k3 = getblock32(blocks,i*4+2); 166 | uint32_t k4 = getblock32(blocks,i*4+3); 167 | 168 | k1 *= c1; k1 = ROTL32(k1,15); k1 *= c2; h1 ^= k1; 169 | 170 | h1 = ROTL32(h1,19); h1 += h2; h1 = h1*5+0x561ccd1b; 171 | 172 | k2 *= c2; k2 = ROTL32(k2,16); k2 *= c3; h2 ^= k2; 173 | 174 | h2 = ROTL32(h2,17); h2 += h3; h2 = h2*5+0x0bcaa747; 175 | 176 | k3 *= c3; k3 = ROTL32(k3,17); k3 *= c4; h3 ^= k3; 177 | 178 | h3 = ROTL32(h3,15); h3 += h4; h3 = h3*5+0x96cd1c35; 179 | 180 | k4 *= c4; k4 = ROTL32(k4,18); k4 *= c1; h4 ^= k4; 181 | 182 | h4 = ROTL32(h4,13); h4 += h1; h4 = h4*5+0x32ac3b17; 183 | } 184 | 185 | //---------- 186 | // tail 187 | 188 | const uint8_t * tail = (const uint8_t*)(data + nblocks*16); 189 | 190 | uint32_t k1 = 0; 191 | uint32_t k2 = 0; 192 | uint32_t k3 = 0; 193 | uint32_t k4 = 0; 194 | 195 | switch(len & 15) 196 | { 197 | case 15: k4 ^= tail[14] << 16; 198 | case 14: k4 ^= tail[13] << 8; 199 | case 13: k4 ^= tail[12] << 0; 200 | k4 *= c4; k4 = ROTL32(k4,18); k4 *= c1; h4 ^= k4; 201 | 202 | case 12: k3 ^= tail[11] << 24; 203 | case 11: k3 ^= tail[10] << 16; 204 | case 10: k3 ^= tail[ 9] << 8; 205 | case 9: k3 ^= tail[ 8] << 0; 206 | k3 *= c3; k3 = ROTL32(k3,17); k3 *= c4; h3 ^= k3; 207 | 208 | case 8: k2 ^= tail[ 7] << 24; 209 | case 7: k2 ^= tail[ 6] << 16; 210 | case 6: k2 ^= tail[ 5] << 8; 211 | case 5: k2 ^= tail[ 4] << 0; 212 | k2 *= c2; k2 = ROTL32(k2,16); k2 *= c3; h2 ^= k2; 213 | 214 | case 
4: k1 ^= tail[ 3] << 24; 215 | case 3: k1 ^= tail[ 2] << 16; 216 | case 2: k1 ^= tail[ 1] << 8; 217 | case 1: k1 ^= tail[ 0] << 0; 218 | k1 *= c1; k1 = ROTL32(k1,15); k1 *= c2; h1 ^= k1; 219 | }; 220 | 221 | //---------- 222 | // finalization 223 | 224 | h1 ^= len; h2 ^= len; h3 ^= len; h4 ^= len; 225 | 226 | h1 += h2; h1 += h3; h1 += h4; 227 | h2 += h1; h3 += h1; h4 += h1; 228 | 229 | h1 = fmix32(h1); 230 | h2 = fmix32(h2); 231 | h3 = fmix32(h3); 232 | h4 = fmix32(h4); 233 | 234 | h1 += h2; h1 += h3; h1 += h4; 235 | h2 += h1; h3 += h1; h4 += h1; 236 | 237 | ((uint32_t*)out)[0] = h1; 238 | ((uint32_t*)out)[1] = h2; 239 | ((uint32_t*)out)[2] = h3; 240 | ((uint32_t*)out)[3] = h4; 241 | } 242 | 243 | //----------------------------------------------------------------------------- 244 | 245 | void MurmurHash3_x64_128 ( const void * key, const int len, 246 | const uint32_t seed, void * out ) 247 | { 248 | const uint8_t * data = (const uint8_t*)key; 249 | const int nblocks = len / 16; 250 | 251 | uint64_t h1 = seed; 252 | uint64_t h2 = seed; 253 | 254 | const uint64_t c1 = BIG_CONSTANT(0x87c37b91114253d5); 255 | const uint64_t c2 = BIG_CONSTANT(0x4cf5ad432745937f); 256 | 257 | //---------- 258 | // body 259 | 260 | const uint64_t * blocks = (const uint64_t *)(data); 261 | 262 | for(int i = 0; i < nblocks; i++) 263 | { 264 | uint64_t k1 = getblock64(blocks,i*2+0); 265 | uint64_t k2 = getblock64(blocks,i*2+1); 266 | 267 | k1 *= c1; k1 = ROTL64(k1,31); k1 *= c2; h1 ^= k1; 268 | 269 | h1 = ROTL64(h1,27); h1 += h2; h1 = h1*5+0x52dce729; 270 | 271 | k2 *= c2; k2 = ROTL64(k2,33); k2 *= c1; h2 ^= k2; 272 | 273 | h2 = ROTL64(h2,31); h2 += h1; h2 = h2*5+0x38495ab5; 274 | } 275 | 276 | //---------- 277 | // tail 278 | 279 | const uint8_t * tail = (const uint8_t*)(data + nblocks*16); 280 | 281 | uint64_t k1 = 0; 282 | uint64_t k2 = 0; 283 | 284 | switch(len & 15) 285 | { 286 | case 15: k2 ^= ((uint64_t)tail[14]) << 48; 287 | case 14: k2 ^= ((uint64_t)tail[13]) << 40; 288 | case 13: k2 ^= ((uint64_t)tail[12]) << 32; 289 | case 12: k2 ^= ((uint64_t)tail[11]) << 24; 290 | case 11: k2 ^= ((uint64_t)tail[10]) << 16; 291 | case 10: k2 ^= ((uint64_t)tail[ 9]) << 8; 292 | case 9: k2 ^= ((uint64_t)tail[ 8]) << 0; 293 | k2 *= c2; k2 = ROTL64(k2,33); k2 *= c1; h2 ^= k2; 294 | 295 | case 8: k1 ^= ((uint64_t)tail[ 7]) << 56; 296 | case 7: k1 ^= ((uint64_t)tail[ 6]) << 48; 297 | case 6: k1 ^= ((uint64_t)tail[ 5]) << 40; 298 | case 5: k1 ^= ((uint64_t)tail[ 4]) << 32; 299 | case 4: k1 ^= ((uint64_t)tail[ 3]) << 24; 300 | case 3: k1 ^= ((uint64_t)tail[ 2]) << 16; 301 | case 2: k1 ^= ((uint64_t)tail[ 1]) << 8; 302 | case 1: k1 ^= ((uint64_t)tail[ 0]) << 0; 303 | k1 *= c1; k1 = ROTL64(k1,31); k1 *= c2; h1 ^= k1; 304 | }; 305 | 306 | //---------- 307 | // finalization 308 | 309 | h1 ^= len; h2 ^= len; 310 | 311 | h1 += h2; 312 | h2 += h1; 313 | 314 | h1 = fmix64(h1); 315 | h2 = fmix64(h2); 316 | 317 | h1 += h2; 318 | h2 += h1; 319 | 320 | ((uint64_t*)out)[0] = h1; 321 | ((uint64_t*)out)[1] = h2; 322 | } 323 | 324 | #endif // _MURMURHASH3_H_ 325 | -------------------------------------------------------------------------------- /test/unitTest/test_label_vector.cpp: -------------------------------------------------------------------------------- 1 | #include "gtest/gtest.h" 2 | 3 | #include 4 | 5 | #include 6 | #include 7 | #include 8 | 9 | #include "config.hpp" 10 | #include "label_vector.hpp" 11 | #include "surf_builder.hpp" 12 | 13 | namespace surf { 14 | 15 | namespace labelvectortest { 16 | 17 | static const std::string 
kFilePath = "../../../test/words.txt"; 18 | static const int kTestSize = 234369; 19 | static std::vector words; 20 | 21 | class LabelVectorUnitTest : public ::testing::Test { 22 | public: 23 | virtual void SetUp () { 24 | bool include_dense = false; 25 | uint32_t sparse_dense_ratio = 0; 26 | level_t suffix_len = 8; 27 | builder_ = new SuRFBuilder(include_dense, sparse_dense_ratio, kReal, 0, suffix_len); 28 | data_ = nullptr; 29 | } 30 | virtual void TearDown () { 31 | delete builder_; 32 | if (data_) 33 | delete[] data_; 34 | } 35 | 36 | void setupWordsTest(); 37 | void testSerialize(); 38 | void testSearch(); 39 | 40 | SuRFBuilder* builder_; 41 | LabelVector* labels_; 42 | char* data_; 43 | }; 44 | 45 | void LabelVectorUnitTest::setupWordsTest() { 46 | builder_->build(words); 47 | labels_ = new LabelVector(builder_->getLabels()); 48 | } 49 | 50 | void LabelVectorUnitTest::testSerialize() { 51 | uint64_t size = labels_->serializedSize(); 52 | ASSERT_TRUE((labels_->size() - size) >= 0); 53 | data_ = new char[size]; 54 | LabelVector* ori_labels = labels_; 55 | char* data = data_; 56 | ori_labels->serialize(data); 57 | data = data_; 58 | labels_ = LabelVector::deSerialize(data); 59 | 60 | ASSERT_EQ(ori_labels->getNumBytes(), labels_->getNumBytes()); 61 | 62 | for (position_t i = 0; i < ori_labels->getNumBytes(); i++) { 63 | label_t ori_label = ori_labels->read(i); 64 | label_t label = labels_->read(i); 65 | ASSERT_EQ(ori_label, label); 66 | } 67 | 68 | ori_labels->destroy(); 69 | delete ori_labels; 70 | } 71 | 72 | void LabelVectorUnitTest::testSearch() { 73 | position_t start_pos = 0; 74 | position_t search_len = 0; 75 | for (level_t level = 0; level < builder_->getTreeHeight(); level++) { 76 | for (position_t pos = 0; pos < builder_->getLabels()[level].size(); pos++) { 77 | bool louds_bit = SuRFBuilder::readBit(builder_->getLoudsBits()[level], pos); 78 | if (louds_bit) { 79 | position_t search_pos; 80 | bool search_success; 81 | for (position_t i = start_pos; i < start_pos + search_len; i++) { 82 | label_t test_label = labels_->read(i); 83 | if (i == start_pos && test_label == kTerminator && search_len > 1) 84 | continue; 85 | // search success 86 | search_pos = start_pos; 87 | search_success = labels_->search(test_label, search_pos, search_len); 88 | ASSERT_TRUE(search_success); 89 | ASSERT_EQ(i, search_pos); 90 | } 91 | // search fail 92 | search_pos = start_pos; 93 | search_success = labels_->search('\0', search_pos, search_len); 94 | ASSERT_FALSE(search_success); 95 | search_pos = start_pos; 96 | search_success = labels_->search('\255', search_pos, search_len); 97 | ASSERT_FALSE(search_success); 98 | 99 | start_pos += search_len; 100 | search_len = 0; 101 | } 102 | search_len++; 103 | } 104 | } 105 | } 106 | 107 | TEST_F (LabelVectorUnitTest, readTest) { 108 | setupWordsTest(); 109 | position_t lv_pos = 0; 110 | for (level_t level = 0; level < builder_->getTreeHeight(); level++) { 111 | for (position_t pos = 0; pos < builder_->getLabels()[level].size(); pos++) { 112 | label_t expected_label = builder_->getLabels()[level][pos]; 113 | label_t label = labels_->read(lv_pos); 114 | ASSERT_EQ(expected_label, label); 115 | lv_pos++; 116 | } 117 | } 118 | labels_->destroy(); 119 | delete labels_; 120 | } 121 | 122 | TEST_F (LabelVectorUnitTest, searchAlgTest) { 123 | setupWordsTest(); 124 | position_t start_pos = 0; 125 | position_t search_len = 0; 126 | for (level_t level = 0; level < builder_->getTreeHeight(); level++) { 127 | for (position_t pos = 0; pos < 
builder_->getLabels()[level].size(); pos++) { 128 | bool louds_bit = SuRFBuilder::readBit(builder_->getLoudsBits()[level], pos); 129 | if (louds_bit) { 130 | position_t binary_search_pos, simd_search_pos, linear_search_pos; 131 | bool binary_search_success, simd_search_success, linear_search_success; 132 | for (position_t i = start_pos; i < start_pos + search_len; i++) { 133 | // binary search success 134 | binary_search_pos = start_pos; 135 | binary_search_success = labels_->binarySearch(labels_->read(i), binary_search_pos, search_len); 136 | ASSERT_TRUE(binary_search_success); 137 | ASSERT_EQ(i, binary_search_pos); 138 | 139 | // simd search success 140 | simd_search_pos = start_pos; 141 | simd_search_success = labels_->simdSearch(labels_->read(i), simd_search_pos, search_len); 142 | ASSERT_TRUE(simd_search_success); 143 | ASSERT_EQ(i, simd_search_pos); 144 | 145 | // linear search success 146 | linear_search_pos = start_pos; 147 | linear_search_success = labels_->linearSearch(labels_->read(i), linear_search_pos, search_len); 148 | ASSERT_TRUE(linear_search_success); 149 | ASSERT_EQ(i, linear_search_pos); 150 | } 151 | // binary search fail 152 | binary_search_pos = start_pos; 153 | binary_search_success = labels_->binarySearch('\0', binary_search_pos, search_len); 154 | ASSERT_FALSE(binary_search_success); 155 | binary_search_pos = start_pos; 156 | binary_search_success = labels_->binarySearch('\255', binary_search_pos, search_len); 157 | ASSERT_FALSE(binary_search_success); 158 | 159 | // simd search fail 160 | simd_search_pos = start_pos; 161 | simd_search_success = labels_->simdSearch('\0', simd_search_pos, search_len); 162 | ASSERT_FALSE(simd_search_success); 163 | simd_search_pos = start_pos; 164 | simd_search_success = labels_->simdSearch('\255', simd_search_pos, search_len); 165 | ASSERT_FALSE(simd_search_success); 166 | 167 | // linear search fail 168 | linear_search_pos = start_pos; 169 | linear_search_success = labels_->linearSearch('\0', linear_search_pos, search_len); 170 | ASSERT_FALSE(linear_search_success); 171 | linear_search_pos = start_pos; 172 | linear_search_success = labels_->linearSearch('\255', linear_search_pos, search_len); 173 | ASSERT_FALSE(linear_search_success); 174 | 175 | start_pos += search_len; 176 | search_len = 0; 177 | } 178 | 179 | if (builder_->getLabels()[level][pos] == kTerminator 180 | && !SuRFBuilder::readBit(builder_->getChildIndicatorBits()[level], pos)) 181 | start_pos++; 182 | else 183 | search_len++; 184 | } 185 | } 186 | labels_->destroy(); 187 | delete labels_; 188 | } 189 | 190 | TEST_F (LabelVectorUnitTest, searchTest) { 191 | setupWordsTest(); 192 | testSearch(); 193 | labels_->destroy(); 194 | delete labels_; 195 | } 196 | 197 | TEST_F (LabelVectorUnitTest, serializeTest) { 198 | setupWordsTest(); 199 | testSerialize(); 200 | testSearch(); 201 | } 202 | 203 | TEST_F (LabelVectorUnitTest, searchGreaterThanTest) { 204 | setupWordsTest(); 205 | position_t start_pos = 0; 206 | position_t search_len = 0; 207 | for (level_t level = 0; level < builder_->getTreeHeight(); level++) { 208 | for (position_t pos = 0; pos < builder_->getLabels()[level].size(); pos++) { 209 | bool louds_bit = SuRFBuilder::readBit(builder_->getLoudsBits()[level], pos); 210 | if (louds_bit) { 211 | position_t search_pos; 212 | position_t terminator_offset = 0; 213 | bool search_success; 214 | for (position_t i = start_pos; i < start_pos + search_len; i++) { 215 | label_t cur_label = labels_->read(i); 216 | if (i == start_pos && cur_label == kTerminator && search_len > 
1) { 217 | terminator_offset = 1; 218 | continue; 219 | } 220 | 221 | if (i < start_pos + search_len - 1) { 222 | label_t next_label = labels_->read(i+1); 223 | // search existing label 224 | search_pos = start_pos; 225 | search_success = labels_->searchGreaterThan(cur_label, search_pos, search_len); 226 | ASSERT_TRUE(search_success); 227 | ASSERT_EQ(i+1, search_pos); 228 | 229 | // search midpoint (could be non-existing label) 230 | label_t test_label = cur_label + ((next_label - cur_label) / 2); 231 | search_pos = start_pos; 232 | search_success = labels_->searchGreaterThan(test_label, search_pos, search_len); 233 | ASSERT_TRUE(search_success); 234 | ASSERT_EQ(i+1, search_pos); 235 | } else { 236 | // search out-of-bound label 237 | search_pos = start_pos; 238 | search_success = labels_->searchGreaterThan(labels_->read(start_pos + search_len - 1), search_pos, search_len); 239 | ASSERT_FALSE(search_success); 240 | ASSERT_EQ(start_pos + terminator_offset, search_pos); 241 | } 242 | } 243 | start_pos += search_len; 244 | search_len = 0; 245 | } 246 | search_len++; 247 | } 248 | } 249 | } 250 | 251 | void loadWordList() { 252 | std::ifstream infile(kFilePath); 253 | std::string key; 254 | int count = 0; 255 | while (infile.good() && count < kTestSize) { 256 | infile >> key; 257 | words.push_back(key); 258 | count++; 259 | } 260 | } 261 | 262 | } // namespace labelvectortest 263 | 264 | } // namespace surf 265 | 266 | int main (int argc, char** argv) { 267 | ::testing::InitGoogleTest(&argc, argv); 268 | surf::labelvectortest::loadWordList(); 269 | return RUN_ALL_TESTS(); 270 | } 271 | -------------------------------------------------------------------------------- /bench/workload_multi_thread.cpp: -------------------------------------------------------------------------------- 1 | #include "bench.hpp" 2 | #include "filter_factory.hpp" 3 | 4 | //#define VERBOSE 1 5 | 6 | static std::vector txn_keys; 7 | static std::vector upper_bound_keys; 8 | 9 | typedef struct ThreadArg { 10 | int thread_id; 11 | bench::Filter* filter; 12 | int start_pos; 13 | int end_pos; 14 | int query_type; 15 | int64_t out_positives; 16 | double tput; 17 | } ThreadArg; 18 | 19 | void* execute_workload(void* arg) { 20 | ThreadArg* thread_arg = (ThreadArg*)arg; 21 | int64_t positives = 0; 22 | double start_time = bench::getNow(); 23 | if (thread_arg->query_type == 0) { // point 24 | for (int i = thread_arg->start_pos; i < thread_arg->end_pos; i++) 25 | positives += (int)thread_arg->filter->lookup(txn_keys[i]); 26 | } else { // range 27 | for (int i = thread_arg->start_pos; i < thread_arg->end_pos; i++) 28 | positives += (int)thread_arg->filter->lookupRange(txn_keys[i], 29 | upper_bound_keys[i]); 30 | } 31 | double end_time = bench::getNow(); 32 | double tput = (thread_arg->end_pos - thread_arg->start_pos) / (end_time - start_time) / 1000000; // Mops/sec 33 | 34 | #ifdef VERBOSE 35 | std::cout << "Thread #" << thread_arg->thread_id << bench::kGreen 36 | << ": Throughput = " << bench::kNoColor << tput << "\n"; 37 | #else 38 | std::cout << tput << "\n"; 39 | #endif 40 | 41 | thread_arg->out_positives = positives; 42 | thread_arg->tput = tput; 43 | pthread_exit(NULL); 44 | return NULL; 45 | } 46 | 47 | int main(int argc, char *argv[]) { 48 | if (argc != 10) { 49 | std::cout << "Usage:\n"; 50 | std::cout << "1. filter type: SuRF, SuRFHash, SuRFReal, Bloom\n"; 51 | std::cout << "2. suffix length: 0 < len <= 64 (for SuRFHash and SuRFReal only)\n"; 52 | std::cout << "3. 
workload type: mixed, alterByte (only for email key)\n"; 53 | std::cout << "4. percentage of keys inserted: 0 < num <= 100\n"; 54 | std::cout << "5. byte position (conting from last, only for alterByte): num\n"; 55 | std::cout << "6. key type: randint, email\n"; 56 | std::cout << "7. query type: point, range\n"; 57 | std::cout << "8. distribution: uniform, zipfian, latest\n"; 58 | std::cout << "9. number of threads\n"; 59 | return -1; 60 | } 61 | 62 | std::string filter_type = argv[1]; 63 | uint32_t suffix_len = (uint32_t)atoi(argv[2]); 64 | std::string workload_type = argv[3]; 65 | unsigned percent = atoi(argv[4]); 66 | unsigned byte_pos = atoi(argv[5]); 67 | std::string key_type = argv[6]; 68 | std::string query_type = argv[7]; 69 | std::string distribution = argv[8]; 70 | int num_threads = atoi(argv[9]); 71 | 72 | // check args ==================================================== 73 | if (filter_type.compare(std::string("SuRF")) != 0 74 | && filter_type.compare(std::string("SuRFHash")) != 0 75 | && filter_type.compare(std::string("SuRFReal")) != 0 76 | && filter_type.compare(std::string("Bloom")) != 0 77 | && filter_type.compare(std::string("ARF")) != 0) { 78 | std::cout << bench::kRed << "WRONG filter type\n" << bench::kNoColor; 79 | return -1; 80 | } 81 | 82 | if (suffix_len == 0 || suffix_len > 64) { 83 | std::cout << bench::kRed << "WRONG suffix length\n" << bench::kNoColor; 84 | return -1; 85 | } 86 | 87 | if (workload_type.compare(std::string("mixed")) != 0 88 | && workload_type.compare(std::string("alterByte")) == 0) { 89 | std::cout << bench::kRed << "WRONG workload type\n" << bench::kNoColor; 90 | return -1; 91 | } 92 | 93 | if (percent > 100) { 94 | std::cout << bench::kRed << "WRONG percentage\n" << bench::kNoColor; 95 | return -1; 96 | } 97 | 98 | if (key_type.compare(std::string("randint")) != 0 99 | && key_type.compare(std::string("timestamp")) != 0 100 | && key_type.compare(std::string("email")) != 0) { 101 | std::cout << bench::kRed << "WRONG key type\n" << bench::kNoColor; 102 | return -1; 103 | } 104 | 105 | if (query_type.compare(std::string("point")) != 0 106 | && query_type.compare(std::string("range")) != 0) { 107 | std::cout << bench::kRed << "WRONG query type\n" << bench::kNoColor; 108 | return -1; 109 | } 110 | 111 | if (distribution.compare(std::string("uniform")) != 0 112 | && distribution.compare(std::string("zipfian")) != 0 113 | && distribution.compare(std::string("latest")) != 0) { 114 | std::cout << bench::kRed << "WRONG distribution\n" << bench::kNoColor; 115 | return -1; 116 | } 117 | 118 | // load keys from files ======================================= 119 | std::string load_file = "workloads/load_"; 120 | load_file += key_type; 121 | std::vector load_keys; 122 | if (key_type.compare(std::string("email")) == 0) 123 | bench::loadKeysFromFile(load_file, false, load_keys); 124 | else 125 | bench::loadKeysFromFile(load_file, true, load_keys); 126 | 127 | std::string txn_file = "workloads/txn_"; 128 | txn_file += key_type; 129 | txn_file += "_"; 130 | txn_file += distribution; 131 | 132 | if (key_type.compare(std::string("email")) == 0) 133 | bench::loadKeysFromFile(txn_file, false, txn_keys); 134 | else 135 | bench::loadKeysFromFile(txn_file, true, txn_keys); 136 | 137 | std::vector insert_keys; 138 | bench::selectKeysToInsert(percent, insert_keys, load_keys); 139 | 140 | if (workload_type.compare(std::string("alterByte")) == 0) 141 | bench::modifyKeyByte(txn_keys, byte_pos); 142 | 143 | // compute upperbound keys for range queries ================= 144 | 
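// Added note: bench::getUpperBoundKey is defined in bench.hpp (not shown here).
// Judging by the inline logic of the single-threaded benchmark in workload.cpp,
// it presumably behaves roughly like the sketch below -- an assumption, not the
// actual implementation:
//
//   std::string getUpperBoundKey(const std::string& key_type, const std::string& key) {
//       if (key_type == "email") {
//           std::string ub = key;
//           ub[ub.size() - 1] += (char)kEmailRangeSize;   // bump the last byte
//           return ub;
//       }
//       return uint64ToString(stringToUint64(key) + kIntRangeSize);  // randint keys
//   }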
if (query_type.compare(std::string("range")) == 0) { 145 | for (int i = 0; i < (int)txn_keys.size(); i++) 146 | upper_bound_keys.push_back(bench::getUpperBoundKey(key_type, txn_keys[i])); 147 | } 148 | 149 | // create filter ============================================== 150 | bench::Filter* filter = bench::FilterFactory::createFilter(filter_type, suffix_len, insert_keys); 151 | 152 | #ifdef VERBOSE 153 | std::cout << bench::kGreen << "Memory = " << bench::kNoColor << filter->getMemoryUsage() << std::endl; 154 | #endif 155 | 156 | // execute transactions ======================================= 157 | pthread_t* threads = new pthread_t[num_threads]; 158 | pthread_attr_t attr; 159 | // Initialize and set thread joinable 160 | pthread_attr_init(&attr); 161 | pthread_attr_setdetachstate(&attr, PTHREAD_CREATE_JOINABLE); 162 | 163 | ThreadArg* thread_args = new ThreadArg[num_threads]; 164 | int num_txns = (int)txn_keys.size(); 165 | int num_txns_per_thread = num_txns / num_threads; 166 | for (int i = 0; i < num_threads; i++) { 167 | thread_args[i].thread_id = i; 168 | thread_args[i].filter = filter; 169 | thread_args[i].start_pos = num_txns_per_thread * i; 170 | thread_args[i].end_pos = num_txns_per_thread * (i + 1); 171 | if (query_type.compare(std::string("point")) == 0) 172 | thread_args[i].query_type = 0; 173 | else 174 | thread_args[i].query_type = 1; 175 | thread_args[i].out_positives = 0; 176 | thread_args[i].tput = 0; 177 | } 178 | 179 | for (int i = 0; i < num_threads; i++) { 180 | int rc = pthread_create(&threads[i], NULL, execute_workload, (void*)(&thread_args[i])); 181 | if (rc) { 182 | std::cout << "Error: unable to create thread " << rc << std::endl; 183 | exit(-1); 184 | } 185 | } 186 | 187 | // free attribute and wait for the other threads 188 | pthread_attr_destroy(&attr); 189 | for (int i = 0; i < num_threads; i++) { 190 | void* status; 191 | int rc = pthread_join(threads[i], &status); 192 | if (rc) { 193 | std::cout << "Error:unable to join " << rc << endl; 194 | exit(-1); 195 | } 196 | } 197 | 198 | double tput = 0; 199 | for (int i = 0; i < num_threads; i++) { 200 | tput += thread_args[i].tput; 201 | } 202 | 203 | #ifdef VERBOSE 204 | std::cout << bench::kGreen << "Throughput = " << bench::kNoColor << tput << "\n"; 205 | 206 | int positives = 0; 207 | for (int i = 0; i < num_threads; i++) { 208 | positives += (thread_args[i].out_positives); 209 | } 210 | 211 | // compute true positives ====================================== 212 | std::map ht; 213 | for (int i = 0; i < (int)insert_keys.size(); i++) 214 | ht[insert_keys[i]] = true; 215 | 216 | int64_t true_positives = 0; 217 | std::map::iterator ht_iter; 218 | if (query_type.compare(std::string("point")) == 0) { 219 | for (int i = 0; i < (int)txn_keys.size(); i++) { 220 | ht_iter = ht.find(txn_keys[i]); 221 | true_positives += (ht_iter != ht.end()); 222 | } 223 | } else if (query_type.compare(std::string("range")) == 0) { 224 | for (int i = 0; i < (int)txn_keys.size(); i++) { 225 | ht_iter = ht.upper_bound(txn_keys[i]); 226 | if (ht_iter != ht.end()) { 227 | std::string fetched_key = ht_iter->first; 228 | true_positives += (fetched_key.compare(upper_bound_keys[i]) < 0); 229 | } 230 | } 231 | } 232 | int64_t false_positives = positives - true_positives; 233 | assert(false_positives >= 0); 234 | int64_t true_negatives = txn_keys.size() - true_positives; 235 | double fp_rate = 0; 236 | if (false_positives > 0) 237 | fp_rate = false_positives / (true_negatives + false_positives + 0.0); 238 | 239 | std::cout << "positives = " << 
positives << "\n"; 240 | std::cout << "true positives = " << true_positives << "\n"; 241 | std::cout << "false positives = " << false_positives << "\n"; 242 | std::cout << "true negatives = " << true_negatives << "\n"; 243 | std::cout << bench::kGreen << "False Positive Rate = " << bench::kNoColor << fp_rate << "\n"; 244 | #else 245 | std::cout << tput << "\n"; 246 | std::cout << bench::kGreen << bench::kNoColor << "\n\n"; 247 | #endif 248 | 249 | delete[] threads; 250 | delete[] thread_args; 251 | 252 | pthread_exit(NULL); 253 | return 0; 254 | } 255 | -------------------------------------------------------------------------------- /include/suffix.hpp: -------------------------------------------------------------------------------- 1 | #ifndef SUFFIX_H_ 2 | #define SUFFIX_H_ 3 | 4 | #include "bitvector.hpp" 5 | 6 | #include 7 | 8 | #include 9 | 10 | #include "config.hpp" 11 | #include "hash.hpp" 12 | 13 | namespace surf { 14 | 15 | // Max suffix_len_ = 64 bits 16 | // For kReal suffixes, if the stored key is not long enough to provide 17 | // suffix_len_ suffix bits, its suffix field is cleared (i.e., all 0's) 18 | // to indicate that there is no suffix info associated with the key. 19 | class BitvectorSuffix : public Bitvector { 20 | public: 21 | BitvectorSuffix() : type_(kNone), hash_suffix_len_(0), real_suffix_len_(0) {}; 22 | 23 | BitvectorSuffix(const SuffixType type, 24 | const level_t hash_suffix_len, const level_t real_suffix_len, 25 | const std::vector >& bitvector_per_level, 26 | const std::vector& num_bits_per_level, 27 | const level_t start_level = 0, 28 | level_t end_level = 0/* non-inclusive */) 29 | : Bitvector(bitvector_per_level, num_bits_per_level, start_level, end_level) { 30 | assert((hash_suffix_len + real_suffix_len) <= kWordSize); 31 | type_ = type; 32 | hash_suffix_len_ = hash_suffix_len; 33 | real_suffix_len_ = real_suffix_len; 34 | } 35 | 36 | static word_t constructHashSuffix(const std::string& key, const level_t len) { 37 | word_t suffix = suffixHash(key); 38 | suffix <<= (kWordSize - len - kHashShift); 39 | suffix >>= (kWordSize - len); 40 | return suffix; 41 | } 42 | 43 | static word_t constructRealSuffix(const std::string& key, 44 | const level_t level, const level_t len) { 45 | if (key.length() < level || ((key.length() - level) * 8) < len) 46 | return 0; 47 | word_t suffix = 0; 48 | level_t num_complete_bytes = len / 8; 49 | if (num_complete_bytes > 0) { 50 | suffix += (word_t)(label_t)key[level]; 51 | for (position_t i = 1; i < num_complete_bytes; i++) { 52 | suffix <<= 8; 53 | suffix += (word_t)(uint8_t)key[level + i]; 54 | } 55 | } 56 | level_t offset = len % 8; 57 | if (offset > 0) { 58 | suffix <<= offset; 59 | word_t remaining_bits = 0; 60 | remaining_bits = (word_t)(uint8_t)key[level + num_complete_bytes]; 61 | remaining_bits >>= (8 - offset); 62 | suffix += remaining_bits; 63 | } 64 | return suffix; 65 | } 66 | 67 | static word_t constructMixedSuffix(const std::string& key, const level_t hash_len, 68 | const level_t real_level, const level_t real_len) { 69 | word_t hash_suffix = constructHashSuffix(key, hash_len); 70 | word_t real_suffix = constructRealSuffix(key, real_level, real_len); 71 | word_t suffix = hash_suffix; 72 | suffix <<= real_len; 73 | suffix |= real_suffix; 74 | return suffix; 75 | } 76 | 77 | static word_t constructSuffix(const SuffixType type, const std::string& key, 78 | const level_t hash_len, 79 | const level_t real_level, const level_t real_len) { 80 | switch (type) { 81 | case kHash: 82 | return constructHashSuffix(key, 
hash_len); 83 | case kReal: 84 | return constructRealSuffix(key, real_level, real_len); 85 | case kMixed: 86 | return constructMixedSuffix(key, hash_len, real_level, real_len); 87 | default: 88 | return 0; 89 | } 90 | } 91 | 92 | static word_t extractHashSuffix(const word_t suffix, const level_t real_suffix_len) { 93 | return (suffix >> real_suffix_len); 94 | } 95 | 96 | static word_t extractRealSuffix(const word_t suffix, const level_t real_suffix_len) { 97 | word_t real_suffix_mask = 1; 98 | real_suffix_mask <<= real_suffix_len; 99 | real_suffix_mask--; 100 | return (suffix & real_suffix_mask); 101 | } 102 | 103 | SuffixType getType() const { 104 | return type_; 105 | } 106 | 107 | level_t getSuffixLen() const { 108 | return hash_suffix_len_ + real_suffix_len_; 109 | } 110 | 111 | level_t getHashSuffixLen() const { 112 | return hash_suffix_len_; 113 | } 114 | 115 | level_t getRealSuffixLen() const { 116 | return real_suffix_len_; 117 | } 118 | 119 | position_t serializedSize() const { 120 | position_t size = sizeof(num_bits_) + sizeof(type_) 121 | + sizeof(hash_suffix_len_) + sizeof(real_suffix_len_) + bitsSize(); 122 | sizeAlign(size); 123 | return size; 124 | } 125 | 126 | position_t size() const { 127 | return (sizeof(BitvectorSuffix) + bitsSize()); 128 | } 129 | 130 | word_t read(const position_t idx) const; 131 | word_t readReal(const position_t idx) const; 132 | bool checkEquality(const position_t idx, const std::string& key, const level_t level) const; 133 | 134 | // Compare stored suffix to querying suffix. 135 | // kReal suffix type only. 136 | int compare(const position_t idx, const std::string& key, const level_t level) const; 137 | 138 | void serialize(char*& dst) const { 139 | memcpy(dst, &num_bits_, sizeof(num_bits_)); 140 | dst += sizeof(num_bits_); 141 | memcpy(dst, &type_, sizeof(type_)); 142 | dst += sizeof(type_); 143 | memcpy(dst, &hash_suffix_len_, sizeof(hash_suffix_len_)); 144 | dst += sizeof(hash_suffix_len_); 145 | memcpy(dst, &real_suffix_len_, sizeof(real_suffix_len_)); 146 | dst += sizeof(real_suffix_len_); 147 | if (type_ != kNone) { 148 | memcpy(dst, bits_, bitsSize()); 149 | dst += bitsSize(); 150 | } 151 | align(dst); 152 | } 153 | 154 | static BitvectorSuffix* deSerialize(char*& src) { 155 | BitvectorSuffix* sv = new BitvectorSuffix(); 156 | memcpy(&(sv->num_bits_), src, sizeof(sv->num_bits_)); 157 | src += sizeof(sv->num_bits_); 158 | memcpy(&(sv->type_), src, sizeof(sv->type_)); 159 | src += sizeof(sv->type_); 160 | memcpy(&(sv->hash_suffix_len_), src, sizeof(sv->hash_suffix_len_)); 161 | src += sizeof(sv->hash_suffix_len_); 162 | memcpy(&(sv->real_suffix_len_), src, sizeof(sv->real_suffix_len_)); 163 | src += sizeof(sv->real_suffix_len_); 164 | if (sv->type_ != kNone) { 165 | sv->bits_ = new word_t[sv->numWords()]; 166 | memcpy(sv->bits_, src, sv->bitsSize()); 167 | src += sv->bitsSize(); 168 | 169 | //sv->bits_ = const_cast(reinterpret_cast(src)); 170 | //src += sv->bitsSize(); 171 | } 172 | align(src); 173 | return sv; 174 | } 175 | 176 | void destroy() { 177 | if (type_ != kNone) 178 | delete[] bits_; 179 | } 180 | 181 | private: 182 | SuffixType type_; 183 | level_t hash_suffix_len_; // in bits 184 | level_t real_suffix_len_; // in bits 185 | }; 186 | 187 | word_t BitvectorSuffix::read(const position_t idx) const { 188 | if (type_ == kNone) 189 | return 0; 190 | 191 | level_t suffix_len = getSuffixLen(); 192 | if (idx * suffix_len >= num_bits_) 193 | return 0; 194 | 195 | position_t bit_pos = idx * suffix_len; 196 | position_t word_id = bit_pos / 
kWordSize; 197 | position_t offset = bit_pos & (kWordSize - 1); 198 | word_t ret_word = (bits_[word_id] << offset) >> (kWordSize - suffix_len); 199 | if (offset + suffix_len > kWordSize) 200 | ret_word += (bits_[word_id+1] >> (kWordSize - offset - suffix_len)); 201 | return ret_word; 202 | } 203 | 204 | word_t BitvectorSuffix::readReal(const position_t idx) const { 205 | return extractRealSuffix(read(idx), real_suffix_len_); 206 | } 207 | 208 | bool BitvectorSuffix::checkEquality(const position_t idx, 209 | const std::string& key, const level_t level) const { 210 | if (type_ == kNone) 211 | return true; 212 | if (idx * getSuffixLen() >= num_bits_) 213 | return false; 214 | 215 | word_t stored_suffix = read(idx); 216 | if (type_ == kReal) { 217 | // if no suffix info for the stored key 218 | if (stored_suffix == 0) 219 | return true; 220 | // if the querying key is shorter than the stored key 221 | if (key.length() < level || ((key.length() - level) * 8) < real_suffix_len_) 222 | return false; 223 | } 224 | word_t querying_suffix 225 | = constructSuffix(type_, key, hash_suffix_len_, level, real_suffix_len_); 226 | return (stored_suffix == querying_suffix); 227 | } 228 | 229 | // If no real suffix is stored for the key, compare returns 0. 230 | // int BitvectorSuffix::compare(const position_t idx, 231 | // const std::string& key, const level_t level) const { 232 | // if ((type_ == kNone) || (type_ == kHash) || (idx * getSuffixLen() >= num_bits_)) 233 | // return 0; 234 | // word_t stored_suffix = read(idx); 235 | // word_t querying_suffix = constructRealSuffix(key, level, real_suffix_len_); 236 | // if (type_ == kMixed) 237 | // stored_suffix = extractRealSuffix(stored_suffix, real_suffix_len_); 238 | 239 | // if (stored_suffix == 0) 240 | // return 0; 241 | // if (stored_suffix < querying_suffix) 242 | // return -1; 243 | // else if (stored_suffix == querying_suffix) 244 | // return 0; 245 | // else 246 | // return 1; 247 | // } 248 | 249 | int BitvectorSuffix::compare(const position_t idx, 250 | const std::string& key, const level_t level) const { 251 | if ((idx * getSuffixLen() >= num_bits_) || (type_ == kNone) || (type_ == kHash)) 252 | return kCouldBePositive; 253 | 254 | word_t stored_suffix = read(idx); 255 | word_t querying_suffix = constructRealSuffix(key, level, real_suffix_len_); 256 | if (type_ == kMixed) 257 | stored_suffix = extractRealSuffix(stored_suffix, real_suffix_len_); 258 | 259 | if ((stored_suffix == 0) && (querying_suffix == 0)) 260 | return kCouldBePositive; 261 | else if ((stored_suffix == 0) || (stored_suffix < querying_suffix)) 262 | return -1; 263 | else if (stored_suffix == querying_suffix) 264 | return kCouldBePositive; 265 | else 266 | return 1; 267 | } 268 | 269 | } // namespace surf 270 | 271 | #endif // SUFFIXVECTOR_H_ 272 | -------------------------------------------------------------------------------- /CodeCoverage.cmake: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2012 - 2017, Lars Bilke 2 | # All rights reserved. 3 | # 4 | # Redistribution and use in source and binary forms, with or without modification, 5 | # are permitted provided that the following conditions are met: 6 | # 7 | # 1. Redistributions of source code must retain the above copyright notice, this 8 | # list of conditions and the following disclaimer. 9 | # 10 | # 2. 
Redistributions in binary form must reproduce the above copyright notice, 11 | # this list of conditions and the following disclaimer in the documentation 12 | # and/or other materials provided with the distribution. 13 | # 14 | # 3. Neither the name of the copyright holder nor the names of its contributors 15 | # may be used to endorse or promote products derived from this software without 16 | # specific prior written permission. 17 | # 18 | # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 19 | # ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 20 | # WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 21 | # DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR 22 | # ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 23 | # (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 24 | # LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON 25 | # ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 26 | # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 27 | # SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 28 | # 29 | # USAGE: 30 | # 31 | # 1. Copy this file into your cmake modules path. 32 | # 33 | # 2. Add the following line to your CMakeLists.txt: 34 | # include(CodeCoverage) 35 | # 36 | # 3. Append necessary compiler flags: 37 | # APPEND_COVERAGE_COMPILER_FLAGS() 38 | # 39 | # 4. If you need to exclude additional directories from the report, specify them 40 | # using the COVERAGE_EXCLUDES variable before calling SETUP_TARGET_FOR_COVERAGE. 41 | # Example: 42 | # set(COVERAGE_EXCLUDES 'dir1/*' 'dir2/*') 43 | # 44 | # 5. Use the functions described below to create a custom make target which 45 | # runs your test executable and produces a code coverage report. 46 | # 47 | # 6. Build a Debug build: 48 | # cmake -DCMAKE_BUILD_TYPE=Debug .. 49 | # make 50 | # make my_coverage_target 51 | # 52 | 53 | include(CMakeParseArguments) 54 | 55 | # Check prereqs 56 | find_program( GCOV_PATH gcov ) 57 | find_program( LCOV_PATH NAMES lcov lcov.bat lcov.exe lcov.perl) 58 | find_program( GENHTML_PATH NAMES genhtml genhtml.perl genhtml.bat ) 59 | find_program( GCOVR_PATH gcovr PATHS ${CMAKE_SOURCE_DIR}/scripts/test) 60 | find_program( SIMPLE_PYTHON_EXECUTABLE python ) 61 | 62 | if(NOT GCOV_PATH) 63 | message(FATAL_ERROR "gcov not found! Aborting...") 64 | endif() # NOT GCOV_PATH 65 | 66 | if("${CMAKE_CXX_COMPILER_ID}" MATCHES "(Apple)?[Cc]lang") 67 | if("${CMAKE_CXX_COMPILER_VERSION}" VERSION_LESS 3) 68 | message(FATAL_ERROR "Clang version must be 3.0.0 or greater! Aborting...") 69 | endif() 70 | elseif(NOT CMAKE_COMPILER_IS_GNUCXX) 71 | message(FATAL_ERROR "Compiler is not GNU gcc! Aborting...") 72 | endif() 73 | 74 | set(COVERAGE_COMPILER_FLAGS "-g -O0 --coverage -fprofile-arcs -ftest-coverage" 75 | CACHE INTERNAL "") 76 | 77 | set(CMAKE_CXX_FLAGS_COVERAGE 78 | ${COVERAGE_COMPILER_FLAGS} 79 | CACHE STRING "Flags used by the C++ compiler during coverage builds." 80 | FORCE ) 81 | set(CMAKE_C_FLAGS_COVERAGE 82 | ${COVERAGE_COMPILER_FLAGS} 83 | CACHE STRING "Flags used by the C compiler during coverage builds." 84 | FORCE ) 85 | set(CMAKE_EXE_LINKER_FLAGS_COVERAGE 86 | "" 87 | CACHE STRING "Flags used for linking binaries during coverage builds." 
88 | FORCE ) 89 | set(CMAKE_SHARED_LINKER_FLAGS_COVERAGE 90 | "" 91 | CACHE STRING "Flags used by the shared libraries linker during coverage builds." 92 | FORCE ) 93 | mark_as_advanced( 94 | CMAKE_CXX_FLAGS_COVERAGE 95 | CMAKE_C_FLAGS_COVERAGE 96 | CMAKE_EXE_LINKER_FLAGS_COVERAGE 97 | CMAKE_SHARED_LINKER_FLAGS_COVERAGE ) 98 | 99 | if(NOT CMAKE_BUILD_TYPE STREQUAL "Debug") 100 | message(WARNING "Code coverage results with an optimised (non-Debug) build may be misleading") 101 | endif() # NOT CMAKE_BUILD_TYPE STREQUAL "Debug" 102 | 103 | if(CMAKE_C_COMPILER_ID STREQUAL "GNU") 104 | link_libraries(gcov) 105 | else() 106 | set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} --coverage") 107 | endif() 108 | 109 | # Defines a target for running and collection code coverage information 110 | # Builds dependencies, runs the given executable and outputs reports. 111 | # NOTE! The executable should always have a ZERO as exit code otherwise 112 | # the coverage generation will not complete. 113 | # 114 | # SETUP_TARGET_FOR_COVERAGE( 115 | # NAME testrunner_coverage # New target name 116 | # EXECUTABLE testrunner -j ${PROCESSOR_COUNT} # Executable in PROJECT_BINARY_DIR 117 | # DEPENDENCIES testrunner # Dependencies to build first 118 | # ) 119 | function(SETUP_TARGET_FOR_COVERAGE) 120 | 121 | set(options NONE) 122 | set(oneValueArgs NAME) 123 | set(multiValueArgs EXECUTABLE EXECUTABLE_ARGS DEPENDENCIES) 124 | cmake_parse_arguments(Coverage "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) 125 | 126 | if(NOT LCOV_PATH) 127 | message(FATAL_ERROR "lcov not found! Aborting...") 128 | endif() # NOT LCOV_PATH 129 | 130 | if(NOT GENHTML_PATH) 131 | message(FATAL_ERROR "genhtml not found! Aborting...") 132 | endif() # NOT GENHTML_PATH 133 | 134 | # Setup target 135 | add_custom_target(${Coverage_NAME} 136 | 137 | # Cleanup lcov 138 | COMMAND ${LCOV_PATH} --directory . --zerocounters 139 | # Create baseline to make sure untouched files show up in the report 140 | COMMAND ${LCOV_PATH} -c -i -d . -o ${Coverage_NAME}.base 141 | 142 | # Run tests 143 | COMMAND ${Coverage_EXECUTABLE} 144 | 145 | # Capturing lcov counters and generating report 146 | COMMAND ${LCOV_PATH} --directory . --capture --output-file ${Coverage_NAME}.info 147 | # add baseline counters 148 | COMMAND ${LCOV_PATH} -a ${Coverage_NAME}.base -a ${Coverage_NAME}.info --output-file ${Coverage_NAME}.total 149 | COMMAND ${LCOV_PATH} --remove ${Coverage_NAME}.total ${COVERAGE_EXCLUDES} --output-file ${PROJECT_BINARY_DIR}/${Coverage_NAME}.info.cleaned 150 | COMMAND ${GENHTML_PATH} -o ${Coverage_NAME} ${PROJECT_BINARY_DIR}/${Coverage_NAME}.info.cleaned 151 | COMMAND ${CMAKE_COMMAND} -E remove ${Coverage_NAME}.base ${Coverage_NAME}.total ${PROJECT_BINARY_DIR}/${Coverage_NAME}.info.cleaned 152 | 153 | WORKING_DIRECTORY ${PROJECT_BINARY_DIR} 154 | DEPENDS ${Coverage_DEPENDENCIES} 155 | COMMENT "Resetting code coverage counters to zero.\nProcessing code coverage counters and generating report." 156 | ) 157 | 158 | # Show where to find the lcov info report 159 | add_custom_command(TARGET ${Coverage_NAME} POST_BUILD 160 | COMMAND ; 161 | COMMENT "Lcov code coverage info report saved in ${Coverage_NAME}.info." 162 | ) 163 | 164 | # Show info where to find the report 165 | add_custom_command(TARGET ${Coverage_NAME} POST_BUILD 166 | COMMAND ; 167 | COMMENT "Open ./${Coverage_NAME}/index.html in your browser to view the coverage report." 
168 | ) 169 | 170 | endfunction() # SETUP_TARGET_FOR_COVERAGE 171 | 172 | # Defines a target for running and collection code coverage information 173 | # Builds dependencies, runs the given executable and outputs reports. 174 | # NOTE! The executable should always have a ZERO as exit code otherwise 175 | # the coverage generation will not complete. 176 | # 177 | # SETUP_TARGET_FOR_COVERAGE_COBERTURA( 178 | # NAME ctest_coverage # New target name 179 | # EXECUTABLE ctest -j ${PROCESSOR_COUNT} # Executable in PROJECT_BINARY_DIR 180 | # DEPENDENCIES executable_target # Dependencies to build first 181 | # ) 182 | function(SETUP_TARGET_FOR_COVERAGE_COBERTURA) 183 | 184 | set(options NONE) 185 | set(oneValueArgs NAME) 186 | set(multiValueArgs EXECUTABLE EXECUTABLE_ARGS DEPENDENCIES) 187 | cmake_parse_arguments(Coverage "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) 188 | 189 | if(NOT SIMPLE_PYTHON_EXECUTABLE) 190 | message(FATAL_ERROR "python not found! Aborting...") 191 | endif() # NOT SIMPLE_PYTHON_EXECUTABLE 192 | 193 | if(NOT GCOVR_PATH) 194 | message(FATAL_ERROR "gcovr not found! Aborting...") 195 | endif() # NOT GCOVR_PATH 196 | 197 | # Combine excludes to several -e arguments 198 | set(COBERTURA_EXCLUDES "") 199 | foreach(EXCLUDE ${COVERAGE_EXCLUDES}) 200 | set(COBERTURA_EXCLUDES "-e ${EXCLUDE} ${COBERTURA_EXCLUDES}") 201 | endforeach() 202 | 203 | add_custom_target(${Coverage_NAME} 204 | 205 | # Run tests 206 | ${Coverage_EXECUTABLE} 207 | 208 | # Running gcovr 209 | COMMAND ${GCOVR_PATH} -x -r ${CMAKE_SOURCE_DIR} ${COBERTURA_EXCLUDES} 210 | -o ${Coverage_NAME}.xml 211 | WORKING_DIRECTORY ${PROJECT_BINARY_DIR} 212 | DEPENDS ${Coverage_DEPENDENCIES} 213 | COMMENT "Running gcovr to produce Cobertura code coverage report." 214 | ) 215 | 216 | # Show info where to find the report 217 | add_custom_command(TARGET ${Coverage_NAME} POST_BUILD 218 | COMMAND ; 219 | COMMENT "Cobertura code coverage report saved in ${Coverage_NAME}.xml." 220 | ) 221 | 222 | endfunction() # SETUP_TARGET_FOR_COVERAGE_COBERTURA 223 | 224 | function(APPEND_COVERAGE_COMPILER_FLAGS) 225 | set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${COVERAGE_COMPILER_FLAGS}" PARENT_SCOPE) 226 | set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${COVERAGE_COMPILER_FLAGS}" PARENT_SCOPE) 227 | message(STATUS "Appending code coverage compiler flags: ${COVERAGE_COMPILER_FLAGS}") 228 | endfunction() # APPEND_COVERAGE_COMPILER_FLAGS -------------------------------------------------------------------------------- /bench/workload.cpp: -------------------------------------------------------------------------------- 1 | #include "bench.hpp" 2 | #include "filter_factory.hpp" 3 | 4 | int main(int argc, char *argv[]) { 5 | if (argc != 9) { 6 | std::cout << "Usage:\n"; 7 | std::cout << "1. filter type: SuRF, SuRFHash, SuRFReal, SuRFMixed, Bloom\n"; 8 | std::cout << "2. suffix length: 0 < len <= 64 (for SuRFHash and SuRFReal only)\n"; 9 | std::cout << "3. workload type: mixed, alterByte (only for email key)\n"; 10 | std::cout << "4. percentage of keys inserted: 0 < num <= 100\n"; 11 | std::cout << "5. byte position (conting from last, only for alterByte): num\n"; 12 | std::cout << "6. key type: randint, email\n"; 13 | std::cout << "7. query type: point, range, mix, count-long, count-short\n"; 14 | std::cout << "8. 
distribution: uniform, zipfian, latest\n"; 15 | return -1; 16 | } 17 | 18 | std::string filter_type = argv[1]; 19 | uint32_t suffix_len = (uint32_t)atoi(argv[2]); 20 | std::string workload_type = argv[3]; 21 | unsigned percent = atoi(argv[4]); 22 | unsigned byte_pos = atoi(argv[5]); 23 | std::string key_type = argv[6]; 24 | std::string query_type = argv[7]; 25 | std::string distribution = argv[8]; 26 | 27 | // check args ==================================================== 28 | if (filter_type.compare(std::string("SuRF")) != 0 29 | && filter_type.compare(std::string("SuRFHash")) != 0 30 | && filter_type.compare(std::string("SuRFReal")) != 0 31 | && filter_type.compare(std::string("SuRFMixed")) != 0 32 | && filter_type.compare(std::string("Bloom")) != 0 33 | && filter_type.compare(std::string("ARF")) != 0) { 34 | std::cout << bench::kRed << "WRONG filter type\n" << bench::kNoColor; 35 | return -1; 36 | } 37 | 38 | if (suffix_len == 0 || suffix_len > 64) { 39 | std::cout << bench::kRed << "WRONG suffix length\n" << bench::kNoColor; 40 | return -1; 41 | } 42 | 43 | if (workload_type.compare(std::string("mixed")) != 0 44 | && workload_type.compare(std::string("alterByte")) == 0) { 45 | std::cout << bench::kRed << "WRONG workload type\n" << bench::kNoColor; 46 | return -1; 47 | } 48 | 49 | if (percent > 100) { 50 | std::cout << bench::kRed << "WRONG percentage\n" << bench::kNoColor; 51 | return -1; 52 | } 53 | 54 | if (key_type.compare(std::string("randint")) != 0 55 | && key_type.compare(std::string("timestamp")) != 0 56 | && key_type.compare(std::string("email")) != 0) { 57 | std::cout << bench::kRed << "WRONG key type\n" << bench::kNoColor; 58 | return -1; 59 | } 60 | 61 | if (query_type.compare(std::string("point")) != 0 62 | && query_type.compare(std::string("range")) != 0 63 | && query_type.compare(std::string("mix")) != 0 64 | && query_type.compare(std::string("count-long")) != 0 65 | && query_type.compare(std::string("count-short")) != 0) { 66 | std::cout << bench::kRed << "WRONG query type\n" << bench::kNoColor; 67 | return -1; 68 | } 69 | 70 | if (distribution.compare(std::string("uniform")) != 0 71 | && distribution.compare(std::string("zipfian")) != 0 72 | && distribution.compare(std::string("latest")) != 0) { 73 | std::cout << bench::kRed << "WRONG distribution\n" << bench::kNoColor; 74 | return -1; 75 | } 76 | 77 | // load keys from files ======================================= 78 | std::string load_file = "workloads/load_"; 79 | load_file += key_type; 80 | std::vector load_keys; 81 | if (key_type.compare(std::string("email")) == 0) 82 | bench::loadKeysFromFile(load_file, false, load_keys); 83 | else 84 | bench::loadKeysFromFile(load_file, true, load_keys); 85 | 86 | std::string txn_file = "workloads/txn_"; 87 | txn_file += key_type; 88 | txn_file += "_"; 89 | txn_file += distribution; 90 | std::vector txn_keys; 91 | if (key_type.compare(std::string("email")) == 0) 92 | bench::loadKeysFromFile(txn_file, false, txn_keys); 93 | else 94 | bench::loadKeysFromFile(txn_file, true, txn_keys); 95 | 96 | std::vector insert_keys; 97 | bench::selectKeysToInsert(percent, insert_keys, load_keys); 98 | 99 | if (workload_type.compare(std::string("alterByte")) == 0) 100 | bench::modifyKeyByte(txn_keys, byte_pos); 101 | 102 | //compute keys for approximate count-long queries ================= 103 | std::vector left_keys, right_keys; 104 | if (query_type.compare(std::string("count-long")) == 0) { 105 | for (int i = 0; i < (int)txn_keys.size() - 1; i++) { 106 | if (txn_keys[i].compare(txn_keys[i + 
1]) < 0) { 107 | left_keys.push_back(txn_keys[i]); 108 | right_keys.push_back(txn_keys[i + 1]); 109 | } else { 110 | left_keys.push_back(txn_keys[i + 1]); 111 | right_keys.push_back(txn_keys[i]); 112 | } 113 | } 114 | } 115 | 116 | // create filter ============================================== 117 | double time1 = bench::getNow(); 118 | bench::Filter* filter = bench::FilterFactory::createFilter(filter_type, suffix_len, insert_keys); 119 | double time2 = bench::getNow(); 120 | std::cout << "Build time = " << (time2 - time1) << std::endl; 121 | 122 | // execute transactions ======================================= 123 | int64_t positives = 0; 124 | uint64_t count = 0; 125 | double start_time = bench::getNow(); 126 | 127 | if (query_type.compare(std::string("point")) == 0) { 128 | for (int i = 0; i < (int)txn_keys.size(); i++) 129 | positives += (int)filter->lookup(txn_keys[i]); 130 | } else if (query_type.compare(std::string("range")) == 0) { 131 | for (int i = 0; i < (int)txn_keys.size(); i++) 132 | if (key_type.compare(std::string("email")) == 0) { 133 | std::string ret_str = txn_keys[i]; 134 | ret_str[ret_str.size() - 1] += (char)bench::kEmailRangeSize; 135 | positives += (int)filter->lookupRange(txn_keys[i], ret_str); 136 | } else { 137 | positives += (int)filter->lookupRange(txn_keys[i], bench::uint64ToString(bench::stringToUint64(txn_keys[i]) + bench::kIntRangeSize)); 138 | } 139 | } else if (query_type.compare(std::string("mix")) == 0) { 140 | for (int i = 0; i < (int)txn_keys.size(); i++) { 141 | if (i % 2 == 0) { 142 | positives += (int)filter->lookup(txn_keys[i]); 143 | } else { 144 | if (key_type.compare(std::string("email")) == 0) { 145 | std::string ret_str = txn_keys[i]; 146 | ret_str[ret_str.size() - 1] += (char)bench::kEmailRangeSize; 147 | positives += (int)filter->lookupRange(txn_keys[i], ret_str); 148 | } else { 149 | positives += (int)filter->lookupRange(txn_keys[i], bench::uint64ToString(bench::stringToUint64(txn_keys[i]) + bench::kIntRangeSize)); 150 | } 151 | } 152 | } 153 | } else if (query_type.compare(std::string("count-long")) == 0) { 154 | for (int i = 0; i < (int)txn_keys.size() - 1; i++) 155 | count += filter->approxCount(left_keys[i], right_keys[i]); 156 | } else if (query_type.compare(std::string("count-short")) == 0) { 157 | for (int i = 0; i < (int)txn_keys.size(); i++) 158 | if (key_type.compare(std::string("email")) == 0) { 159 | std::string ret_str = txn_keys[i]; 160 | ret_str[ret_str.size() - 1] += (char)bench::kEmailRangeSize; 161 | count += filter->approxCount(txn_keys[i], ret_str); 162 | } else { 163 | count += filter->approxCount(txn_keys[i], bench::uint64ToString(bench::stringToUint64(txn_keys[i]) + bench::kIntRangeSize)); 164 | } 165 | } 166 | 167 | double end_time = bench::getNow(); 168 | 169 | // compute true positives ====================================== 170 | std::map ht; 171 | for (int i = 0; i < (int)insert_keys.size(); i++) 172 | ht[insert_keys[i]] = true; 173 | 174 | int64_t true_positives = 0; 175 | std::map::iterator ht_iter; 176 | if (query_type.compare(std::string("point")) == 0) { 177 | for (int i = 0; i < (int)txn_keys.size(); i++) { 178 | ht_iter = ht.find(txn_keys[i]); 179 | true_positives += (ht_iter != ht.end()); 180 | } 181 | } else if (query_type.compare(std::string("range")) == 0) { 182 | for (int i = 0; i < (int)txn_keys.size(); i++) { 183 | ht_iter = ht.lower_bound(txn_keys[i]); 184 | if (ht_iter != ht.end()) { 185 | std::string fetched_key = ht_iter->first; 186 | if (key_type.compare(std::string("email")) == 0) { 187 | 
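// Added clarification: a range query over [txn_keys[i], upper_bound) counts as
// a true positive when some inserted key falls in that half-open interval;
// ht.lower_bound() returns the smallest inserted key >= txn_keys[i], so the
// check reduces to fetched_key < upper_bound (computed per key type below).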
std::string ret_str = txn_keys[i]; 188 | ret_str[ret_str.size() - 1] += (char)bench::kEmailRangeSize; 189 | true_positives += (fetched_key.compare(ret_str) < 0); 190 | } else { 191 | true_positives += (fetched_key.compare(bench::uint64ToString(bench::stringToUint64(txn_keys[i]) + bench::kIntRangeSize)) < 0); 192 | } 193 | } 194 | } 195 | } else if (query_type.compare(std::string("mix")) == 0) { 196 | for (int i = 0; i < (int)txn_keys.size(); i++) { 197 | if (i % 2 == 0) { 198 | ht_iter = ht.find(txn_keys[i]); 199 | true_positives += (ht_iter != ht.end()); 200 | } else { 201 | ht_iter = ht.lower_bound(txn_keys[i]); 202 | if (ht_iter != ht.end()) { 203 | std::string fetched_key = ht_iter->first; 204 | if (key_type.compare(std::string("email")) == 0) { 205 | std::string ret_str = txn_keys[i]; 206 | ret_str[ret_str.size() - 1] += (char)bench::kEmailRangeSize; 207 | true_positives += (fetched_key.compare(ret_str) < 0); 208 | } else { 209 | true_positives += (fetched_key.compare(bench::uint64ToString(bench::stringToUint64(txn_keys[i]) + bench::kIntRangeSize)) < 0); 210 | } 211 | } 212 | } 213 | } 214 | } 215 | int64_t false_positives = positives - true_positives; 216 | assert(false_positives >= 0); 217 | int64_t true_negatives = txn_keys.size() - positives; 218 | 219 | // print 220 | double tput = txn_keys.size() / (end_time - start_time) / 1000000; // Mops/sec 221 | std::cout << bench::kGreen << "Throughput = " << bench::kNoColor << tput << "\n"; 222 | 223 | std::cout << "positives = " << positives << "\n"; 224 | std::cout << "true positives = " << true_positives << "\n"; 225 | std::cout << "false positives = " << false_positives << "\n"; 226 | std::cout << "true negatives = " << true_negatives << "\n"; 227 | std::cout << "count = " << count << "\n"; 228 | 229 | double fp_rate = 0; 230 | if (false_positives > 0) 231 | fp_rate = false_positives / (true_negatives + false_positives + 0.0); 232 | std::cout << bench::kGreen << "False Positive Rate = " << bench::kNoColor << fp_rate << "\n"; 233 | 234 | std::cout << bench::kGreen << "Memory = " << bench::kNoColor << filter->getMemoryUsage() << "\n\n"; 235 | 236 | return 0; 237 | } 238 | -------------------------------------------------------------------------------- /test/unitTest/test_suffix.cpp: -------------------------------------------------------------------------------- 1 | #include "gtest/gtest.h" 2 | 3 | #include 4 | 5 | #include 6 | #include 7 | #include 8 | 9 | #include "config.hpp" 10 | #include "suffix.hpp" 11 | #include "surf_builder.hpp" 12 | 13 | namespace surf { 14 | 15 | namespace suffixtest { 16 | 17 | static const std::string kFilePath = "../../../test/words.txt"; 18 | static const int kTestSize = 234369; 19 | static std::vector words; 20 | 21 | class SuffixUnitTest : public ::testing::Test { 22 | public: 23 | virtual void SetUp () { 24 | computeWordsBySuffixStartLevel(); 25 | data_ = nullptr; 26 | } 27 | virtual void TearDown () { 28 | if (data_) 29 | delete[] data_; 30 | } 31 | 32 | void computeWordsBySuffixStartLevel(); 33 | void testSerialize(); 34 | void testCheckEquality(); 35 | 36 | SuRFBuilder* builder_; 37 | BitvectorSuffix* suffixes_; 38 | std::vector > words_by_suffix_start_level_; 39 | char* data_; 40 | }; 41 | 42 | static int getCommonPrefixLen(const std::string &a, const std::string &b) { 43 | int len = 0; 44 | while ((len < (int)a.length()) && (len < (int)b.length()) && (a[len] == b[len])) 45 | len++; 46 | return len; 47 | } 48 | 49 | static int getMax(int a, int b) { 50 | if (a < b) 51 | return b; 52 | return a; 53 
| } 54 | 55 | void SuffixUnitTest::computeWordsBySuffixStartLevel() { 56 | assert(words.size() > 1); 57 | int commonPrefixLen = 0; 58 | for (unsigned i = 0; i < words.size(); i++) { 59 | if (i == 0) { 60 | commonPrefixLen = getCommonPrefixLen(words[i], words[i+1]); 61 | } else if (i == words.size() - 1) { 62 | commonPrefixLen = getCommonPrefixLen(words[i-1], words[i]); 63 | } else { 64 | commonPrefixLen = getMax(getCommonPrefixLen(words[i-1], words[i]), 65 | getCommonPrefixLen(words[i], words[i+1])); 66 | } 67 | 68 | while (words_by_suffix_start_level_.size() < (unsigned)(commonPrefixLen + 1)) 69 | words_by_suffix_start_level_.push_back(std::vector<std::string>()); 70 | 71 | words_by_suffix_start_level_[commonPrefixLen].push_back(words[i]); 72 | } 73 | } 74 | 75 | void SuffixUnitTest::testSerialize() { 76 | uint64_t size = suffixes_->serializedSize(); 77 | data_ = new char[size]; 78 | BitvectorSuffix* ori_suffixes = suffixes_; 79 | char* data = data_; 80 | ori_suffixes->serialize(data); 81 | data = data_; 82 | suffixes_ = BitvectorSuffix::deSerialize(data); 83 | 84 | ASSERT_EQ(ori_suffixes->bitsSize(), suffixes_->bitsSize()); 85 | 86 | ori_suffixes->destroy(); 87 | delete ori_suffixes; 88 | } 89 | 90 | void SuffixUnitTest::testCheckEquality() { 91 | position_t suffix_idx = 0; 92 | for (level_t level = 0; level < words_by_suffix_start_level_.size(); level++) { 93 | for (unsigned k = 0; k < words_by_suffix_start_level_[level].size(); k++) { 94 | if (level == 1 && k == 32) { 95 | bool is_equal = suffixes_->checkEquality(suffix_idx, 96 | words_by_suffix_start_level_[level][k], 97 | (level + 1)); 98 | ASSERT_TRUE(is_equal); 99 | } 100 | suffix_idx++; 101 | } 102 | } 103 | } 104 | 105 | TEST_F (SuffixUnitTest, constructRealSuffixTest) { 106 | const level_t level = 2; 107 | level_t suffix_len_array[5] = {1, 3, 7, 8, 13}; 108 | for (int i = 0; i < 5; i++) { 109 | level_t suffix_len = suffix_len_array[i]; 110 | for (unsigned j = 0; j < words.size(); j++) { 111 | word_t suffix = BitvectorSuffix::constructSuffix(kReal, words[j], 0, level, suffix_len); 112 | if (words[j].length() < level || ((words[j].length() - level) * 8) < suffix_len) { 113 | ASSERT_EQ(0, suffix); 114 | continue; 115 | } 116 | for (position_t bitpos = 0; bitpos < suffix_len; bitpos++) { 117 | position_t byte_id = bitpos / 8; 118 | position_t byte_offset = bitpos % 8; 119 | uint8_t byte_mask = 0x80; 120 | byte_mask >>= byte_offset; 121 | bool expected_suffix_bit = false; 122 | if (level + byte_id < words[j].size()) 123 | expected_suffix_bit = (bool)(words[j][level + byte_id] & byte_mask); 124 | 125 | word_t word_mask = kMsbMask; 126 | word_mask >>= (kWordSize - suffix_len + bitpos); 127 | bool suffix_bit = (bool)(suffix & word_mask); 128 | 129 | ASSERT_EQ(expected_suffix_bit, suffix_bit); 130 | } 131 | } 132 | } 133 | } 134 | 135 | TEST_F (SuffixUnitTest, constructMixedSuffixTest) { 136 | const level_t level = 2; 137 | level_t suffix_len_array[5] = {1, 3, 7, 8, 13}; 138 | for (int i = 0; i < 5; i++) { 139 | level_t suffix_len = suffix_len_array[i]; 140 | for (unsigned j = 0; j < words.size(); j++) { 141 | word_t suffix = BitvectorSuffix::constructSuffix(kMixed, words[j], suffix_len, 142 | level, suffix_len); 143 | word_t hash_suffix = BitvectorSuffix::extractHashSuffix(suffix, suffix_len); 144 | word_t expected_hash_suffix = BitvectorSuffix::constructHashSuffix(words[j], suffix_len); 145 | ASSERT_EQ(expected_hash_suffix, hash_suffix); 146 | 147 | word_t real_suffix = BitvectorSuffix::extractRealSuffix(suffix, suffix_len); 148 | if
(words[j].length() < level || ((words[j].length() - level) * 8) < suffix_len) { 149 | ASSERT_EQ(0, real_suffix); 150 | continue; 151 | } 152 | for (position_t bitpos = 0; bitpos < suffix_len; bitpos++) { 153 | position_t byte_id = bitpos / 8; 154 | position_t byte_offset = bitpos % 8; 155 | uint8_t byte_mask = 0x80; 156 | byte_mask >>= byte_offset; 157 | bool expected_suffix_bit = false; 158 | if (level + byte_id < words[j].size()) 159 | expected_suffix_bit = (bool)(words[j][level + byte_id] & byte_mask); 160 | 161 | word_t word_mask = kMsbMask; 162 | word_mask >>= (kWordSize - suffix_len + bitpos); 163 | bool suffix_bit = (bool)(real_suffix & word_mask); 164 | ASSERT_EQ(expected_suffix_bit, suffix_bit); 165 | } 166 | } 167 | } 168 | } 169 | 170 | TEST_F (SuffixUnitTest, checkEqualityTest) { 171 | bool include_dense = false; 172 | uint32_t sparse_dense_ratio = 0; 173 | SuffixType suffix_type_array[3] = {kHash, kReal, kMixed}; 174 | level_t suffix_len_array[5] = {1, 3, 7, 8, 13}; 175 | for (int i = 0; i < 3; i++) { 176 | for (int j = 0; j < 5; j++) { 177 | // build test 178 | SuffixType suffix_type = suffix_type_array[i]; 179 | level_t suffix_len = suffix_len_array[j]; 180 | 181 | if (i == 0) 182 | builder_ = new SuRFBuilder(include_dense, sparse_dense_ratio, suffix_type, suffix_len, 0); 183 | else if (i == 1) 184 | builder_ = new SuRFBuilder(include_dense, sparse_dense_ratio, suffix_type, 0, suffix_len); 185 | else 186 | builder_ = new SuRFBuilder(include_dense, sparse_dense_ratio, 187 | suffix_type, suffix_len, suffix_len); 188 | builder_->build(words); 189 | 190 | level_t height = builder_->getLabels().size(); 191 | std::vector<position_t> num_suffix_bits_per_level; 192 | for (level_t level = 0; level < height; level++) { 193 | if (suffix_type == kMixed) 194 | num_suffix_bits_per_level.push_back(builder_->getSuffixCounts()[level] * suffix_len * 2); 195 | else 196 | num_suffix_bits_per_level.push_back(builder_->getSuffixCounts()[level] * suffix_len); 197 | } 198 | 199 | if (i == 0) 200 | suffixes_ = new BitvectorSuffix(builder_->getSuffixType(), suffix_len, 0, builder_->getSuffixes(), num_suffix_bits_per_level, 0, height); 201 | else if (i == 1) 202 | suffixes_ = new BitvectorSuffix(builder_->getSuffixType(), 0, suffix_len, builder_->getSuffixes(), num_suffix_bits_per_level, 0, height); 203 | else 204 | suffixes_ = new BitvectorSuffix(builder_->getSuffixType(), suffix_len, suffix_len, builder_->getSuffixes(), num_suffix_bits_per_level, 0, height); 205 | 206 | testCheckEquality(); 207 | delete builder_; 208 | suffixes_->destroy(); 209 | delete suffixes_; 210 | } 211 | } 212 | } 213 | 214 | TEST_F (SuffixUnitTest, serializeTest) { 215 | bool include_dense = false; 216 | uint32_t sparse_dense_ratio = 0; 217 | SuffixType suffix_type_array[3] = {kHash, kReal, kMixed}; 218 | level_t suffix_len_array[5] = {1, 3, 7, 8, 13}; 219 | for (int i = 0; i < 3; i++) { 220 | for (int j = 0; j < 5; j++) { 221 | // build test 222 | SuffixType suffix_type = suffix_type_array[i]; 223 | level_t suffix_len = suffix_len_array[j]; 224 | if (i == 0) 225 | builder_ = new SuRFBuilder(include_dense, sparse_dense_ratio, suffix_type, suffix_len, 0); 226 | else if (i == 1) 227 | builder_ = new SuRFBuilder(include_dense, sparse_dense_ratio, suffix_type, 0, suffix_len); 228 | else 229 | builder_ = new SuRFBuilder(include_dense, sparse_dense_ratio, 230 | suffix_type, suffix_len, suffix_len); 231 | builder_->build(words); 232 | 233 | level_t height = builder_->getLabels().size(); 234 | std::vector<position_t> num_suffix_bits_per_level; 235 | for
(level_t level = 0; level < height; level++) { 236 | if (suffix_type == kMixed) 237 | num_suffix_bits_per_level.push_back(builder_->getSuffixCounts()[level] * suffix_len * 2); 238 | else 239 | num_suffix_bits_per_level.push_back(builder_->getSuffixCounts()[level] * suffix_len); 240 | } 241 | 242 | if (i == 0) 243 | suffixes_ = new BitvectorSuffix(builder_->getSuffixType(), suffix_len, 0, builder_->getSuffixes(), num_suffix_bits_per_level, 0, height); 244 | else if (i == 1) 245 | suffixes_ = new BitvectorSuffix(builder_->getSuffixType(), 0, suffix_len, builder_->getSuffixes(), num_suffix_bits_per_level, 0, height); 246 | else 247 | suffixes_ = new BitvectorSuffix(builder_->getSuffixType(), suffix_len, suffix_len, builder_->getSuffixes(), num_suffix_bits_per_level, 0, height); 248 | 249 | testSerialize(); 250 | testCheckEquality(); 251 | delete builder_; 252 | } 253 | } 254 | } 255 | 256 | void loadWordList() { 257 | std::ifstream infile(kFilePath); 258 | std::string key; 259 | int count = 0; 260 | while (infile.good() && count < kTestSize) { 261 | infile >> key; 262 | words.push_back(key); 263 | count++; 264 | } 265 | } 266 | 267 | } // namespace suffixtest 268 | 269 | } // namespace surf 270 | 271 | int main (int argc, char** argv) { 272 | ::testing::InitGoogleTest(&argc, argv); 273 | surf::suffixtest::loadWordList(); 274 | return RUN_ALL_TESTS(); 275 | } 276 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 
39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. 
You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty.
Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 
175 | 176 | END OF TERMS AND CONDITIONS 177 | -------------------------------------------------------------------------------- /include/surf.hpp: -------------------------------------------------------------------------------- 1 | #ifndef SURF_H_ 2 | #define SURF_H_ 3 | 4 | #include <string> 5 | #include <vector> 6 | 7 | #include "config.hpp" 8 | #include "louds_dense.hpp" 9 | #include "louds_sparse.hpp" 10 | #include "surf_builder.hpp" 11 | 12 | namespace surf { 13 | 14 | class SuRF { 15 | public: 16 | class Iter { 17 | public: 18 | Iter() {}; 19 | Iter(const SuRF* filter) { 20 | dense_iter_ = LoudsDense::Iter(filter->louds_dense_); 21 | sparse_iter_ = LoudsSparse::Iter(filter->louds_sparse_); 22 | could_be_fp_ = false; 23 | } 24 | 25 | void clear(); 26 | bool isValid() const; 27 | bool getFpFlag() const; 28 | int compare(const std::string& key) const; 29 | std::string getKey() const; 30 | int getSuffix(word_t* suffix) const; 31 | std::string getKeyWithSuffix(unsigned* bitlen) const; 32 | 33 | // Returns true if the status of the iterator after the operation is valid 34 | bool operator ++(int); 35 | bool operator --(int); 36 | 37 | private: 38 | void passToSparse(); 39 | bool incrementDenseIter(); 40 | bool incrementSparseIter(); 41 | bool decrementDenseIter(); 42 | bool decrementSparseIter(); 43 | 44 | private: 45 | // true implies that dense_iter_ is valid 46 | LoudsDense::Iter dense_iter_; 47 | LoudsSparse::Iter sparse_iter_; 48 | bool could_be_fp_; 49 | 50 | friend class SuRF; 51 | }; 52 | 53 | public: 54 | SuRF() {}; 55 | 56 | //------------------------------------------------------------------ 57 | // Input keys must be SORTED 58 | //------------------------------------------------------------------ 59 | SuRF(const std::vector<std::string>& keys) { 60 | create(keys, kIncludeDense, kSparseDenseRatio, kNone, 0, 0); 61 | } 62 | 63 | SuRF(const std::vector<std::string>& keys, const SuffixType suffix_type, 64 | const level_t hash_suffix_len, const level_t real_suffix_len) { 65 | create(keys, kIncludeDense, kSparseDenseRatio, suffix_type, hash_suffix_len, real_suffix_len); 66 | } 67 | 68 | SuRF(const std::vector<std::string>& keys, 69 | const bool include_dense, const uint32_t sparse_dense_ratio, 70 | const SuffixType suffix_type, const level_t hash_suffix_len, const level_t real_suffix_len) { 71 | create(keys, include_dense, sparse_dense_ratio, suffix_type, hash_suffix_len, real_suffix_len); 72 | } 73 | 74 | ~SuRF() {} 75 | 76 | void create(const std::vector<std::string>& keys, 77 | const bool include_dense, const uint32_t sparse_dense_ratio, 78 | const SuffixType suffix_type, 79 | const level_t hash_suffix_len, const level_t real_suffix_len); 80 | 81 | bool lookupKey(const std::string& key) const; 82 | // This function searches in a conservative way: if inclusive is true 83 | // and the stored key prefix matches key, iter stays at this key prefix.
84 | SuRF::Iter moveToKeyGreaterThan(const std::string& key, const bool inclusive) const; 85 | SuRF::Iter moveToKeyLessThan(const std::string& key, const bool inclusive) const; 86 | SuRF::Iter moveToFirst() const; 87 | SuRF::Iter moveToLast() const; 88 | bool lookupRange(const std::string& left_key, const bool left_inclusive, 89 | const std::string& right_key, const bool right_inclusive); 90 | // Accurate except at the boundaries --> undercount by at most 2 91 | uint64_t approxCount(const std::string& left_key, const std::string& right_key); 92 | uint64_t approxCount(const SuRF::Iter* iter, const SuRF::Iter* iter2); 93 | 94 | uint64_t serializedSize() const; 95 | uint64_t getMemoryUsage() const; 96 | level_t getHeight() const; 97 | level_t getSparseStartLevel() const; 98 | 99 | char* serialize() const { 100 | uint64_t size = serializedSize(); 101 | char* data = new char[size]; 102 | char* cur_data = data; 103 | louds_dense_->serialize(cur_data); 104 | louds_sparse_->serialize(cur_data); 105 | assert(cur_data - data == (int64_t)size); 106 | return data; 107 | } 108 | 109 | static SuRF* deSerialize(char* src) { 110 | SuRF* surf = new SuRF(); 111 | surf->louds_dense_ = LoudsDense::deSerialize(src); 112 | surf->louds_sparse_ = LoudsSparse::deSerialize(src); 113 | surf->iter_ = SuRF::Iter(surf); 114 | return surf; 115 | } 116 | 117 | void destroy() { 118 | louds_dense_->destroy(); 119 | louds_sparse_->destroy(); 120 | } 121 | 122 | private: 123 | LoudsDense* louds_dense_; 124 | LoudsSparse* louds_sparse_; 125 | SuRFBuilder* builder_; 126 | SuRF::Iter iter_; 127 | SuRF::Iter iter2_; 128 | }; 129 | 130 | void SuRF::create(const std::vector<std::string>& keys, 131 | const bool include_dense, const uint32_t sparse_dense_ratio, 132 | const SuffixType suffix_type, 133 | const level_t hash_suffix_len, const level_t real_suffix_len) { 134 | builder_ = new SuRFBuilder(include_dense, sparse_dense_ratio, 135 | suffix_type, hash_suffix_len, real_suffix_len); 136 | builder_->build(keys); 137 | louds_dense_ = new LoudsDense(builder_); 138 | louds_sparse_ = new LoudsSparse(builder_); 139 | iter_ = SuRF::Iter(this); 140 | delete builder_; 141 | } 142 | 143 | bool SuRF::lookupKey(const std::string& key) const { 144 | position_t connect_node_num = 0; 145 | if (!louds_dense_->lookupKey(key, connect_node_num)) 146 | return false; 147 | else if (connect_node_num != 0) 148 | return louds_sparse_->lookupKey(key, connect_node_num); 149 | return true; 150 | } 151 | 152 | SuRF::Iter SuRF::moveToKeyGreaterThan(const std::string& key, const bool inclusive) const { 153 | SuRF::Iter iter(this); 154 | iter.could_be_fp_ = louds_dense_->moveToKeyGreaterThan(key, inclusive, iter.dense_iter_); 155 | 156 | if (!iter.dense_iter_.isValid()) 157 | return iter; 158 | if (iter.dense_iter_.isComplete()) 159 | return iter; 160 | 161 | if (!iter.dense_iter_.isSearchComplete()) { 162 | iter.passToSparse(); 163 | iter.could_be_fp_ = louds_sparse_->moveToKeyGreaterThan(key, inclusive, iter.sparse_iter_); 164 | if (!iter.sparse_iter_.isValid()) 165 | iter.incrementDenseIter(); 166 | return iter; 167 | } else if (!iter.dense_iter_.isMoveLeftComplete()) { 168 | iter.passToSparse(); 169 | iter.sparse_iter_.moveToLeftMostKey(); 170 | return iter; 171 | } 172 | 173 | assert(false); // shouldn't reach here 174 | return iter; 175 | } 176 | 177 | SuRF::Iter SuRF::moveToKeyLessThan(const std::string& key, const bool inclusive) const { 178 | SuRF::Iter iter = moveToKeyGreaterThan(key, false); 179 | if (!iter.isValid()) { 180 | iter = moveToLast(); 181 | return
iter; 182 | } 183 | if (!iter.getFpFlag()) { 184 | iter--; 185 | if (lookupKey(key)) 186 | iter--; 187 | } 188 | return iter; 189 | } 190 | 191 | SuRF::Iter SuRF::moveToFirst() const { 192 | SuRF::Iter iter(this); 193 | if (louds_dense_->getHeight() > 0) { 194 | iter.dense_iter_.setToFirstLabelInRoot(); 195 | iter.dense_iter_.moveToLeftMostKey(); 196 | if (iter.dense_iter_.isMoveLeftComplete()) 197 | return iter; 198 | iter.passToSparse(); 199 | iter.sparse_iter_.moveToLeftMostKey(); 200 | } else { 201 | iter.sparse_iter_.setToFirstLabelInRoot(); 202 | iter.sparse_iter_.moveToLeftMostKey(); 203 | } 204 | return iter; 205 | } 206 | 207 | SuRF::Iter SuRF::moveToLast() const { 208 | SuRF::Iter iter(this); 209 | if (louds_dense_->getHeight() > 0) { 210 | iter.dense_iter_.setToLastLabelInRoot(); 211 | iter.dense_iter_.moveToRightMostKey(); 212 | if (iter.dense_iter_.isMoveRightComplete()) 213 | return iter; 214 | iter.passToSparse(); 215 | iter.sparse_iter_.moveToRightMostKey(); 216 | } else { 217 | iter.sparse_iter_.setToLastLabelInRoot(); 218 | iter.sparse_iter_.moveToRightMostKey(); 219 | } 220 | return iter; 221 | } 222 | 223 | bool SuRF::lookupRange(const std::string& left_key, const bool left_inclusive, 224 | const std::string& right_key, const bool right_inclusive) { 225 | iter_.clear(); 226 | louds_dense_->moveToKeyGreaterThan(left_key, left_inclusive, iter_.dense_iter_); 227 | if (!iter_.dense_iter_.isValid()) return false; 228 | if (!iter_.dense_iter_.isComplete()) { 229 | if (!iter_.dense_iter_.isSearchComplete()) { 230 | iter_.passToSparse(); 231 | louds_sparse_->moveToKeyGreaterThan(left_key, left_inclusive, iter_.sparse_iter_); 232 | if (!iter_.sparse_iter_.isValid()) { 233 | iter_.incrementDenseIter(); 234 | } 235 | } else if (!iter_.dense_iter_.isMoveLeftComplete()) { 236 | iter_.passToSparse(); 237 | iter_.sparse_iter_.moveToLeftMostKey(); 238 | } 239 | } 240 | if (!iter_.isValid()) return false; 241 | int compare = iter_.compare(right_key); 242 | if (compare == kCouldBePositive) 243 | return true; 244 | if (right_inclusive) 245 | return (compare <= 0); 246 | else 247 | return (compare < 0); 248 | } 249 | 250 | uint64_t SuRF::approxCount(const SuRF::Iter* iter, const SuRF::Iter* iter2) { 251 | if (!iter->isValid() || !iter2->isValid()) return 0; 252 | position_t out_node_num_left = 0, out_node_num_right = 0; 253 | uint64_t count = louds_dense_->approxCount(&(iter->dense_iter_), 254 | &(iter2->dense_iter_), 255 | out_node_num_left, 256 | out_node_num_right); 257 | count += louds_sparse_->approxCount(&(iter->sparse_iter_), 258 | &(iter2->sparse_iter_), 259 | out_node_num_left, 260 | out_node_num_right); 261 | return count; 262 | } 263 | 264 | uint64_t SuRF::approxCount(const std::string& left_key, 265 | const std::string& right_key) { 266 | iter_.clear(); iter2_.clear(); 267 | iter_ = moveToKeyGreaterThan(left_key, true); 268 | if (!iter_.isValid()) return 0; 269 | iter2_ = moveToKeyGreaterThan(right_key, true); 270 | if (!iter2_.isValid()) 271 | iter2_ = moveToLast(); 272 | 273 | return approxCount(&iter_, &iter2_); 274 | } 275 | 276 | uint64_t SuRF::serializedSize() const { 277 | return (louds_dense_->serializedSize() 278 | + louds_sparse_->serializedSize()); 279 | } 280 | 281 | uint64_t SuRF::getMemoryUsage() const { 282 | return (sizeof(SuRF) + louds_dense_->getMemoryUsage() + louds_sparse_->getMemoryUsage()); 283 | } 284 | 285 | level_t SuRF::getHeight() const { 286 | return louds_sparse_->getHeight(); 287 | } 288 | 289 | level_t SuRF::getSparseStartLevel() const { 290 | 
return louds_sparse_->getStartLevel(); 291 | } 292 | 293 | //============================================================================ 294 | 295 | void SuRF::Iter::clear() { 296 | dense_iter_.clear(); 297 | sparse_iter_.clear(); 298 | } 299 | 300 | bool SuRF::Iter::getFpFlag() const { 301 | return could_be_fp_; 302 | } 303 | 304 | bool SuRF::Iter::isValid() const { 305 | return dense_iter_.isValid() 306 | && (dense_iter_.isComplete() || sparse_iter_.isValid()); 307 | } 308 | 309 | int SuRF::Iter::compare(const std::string& key) const { 310 | assert(isValid()); 311 | int dense_compare = dense_iter_.compare(key); 312 | if (dense_iter_.isComplete() || dense_compare != 0) 313 | return dense_compare; 314 | return sparse_iter_.compare(key); 315 | } 316 | 317 | std::string SuRF::Iter::getKey() const { 318 | if (!isValid()) 319 | return std::string(); 320 | if (dense_iter_.isComplete()) 321 | return dense_iter_.getKey(); 322 | return dense_iter_.getKey() + sparse_iter_.getKey(); 323 | } 324 | 325 | int SuRF::Iter::getSuffix(word_t* suffix) const { 326 | if (!isValid()) 327 | return 0; 328 | if (dense_iter_.isComplete()) 329 | return dense_iter_.getSuffix(suffix); 330 | return sparse_iter_.getSuffix(suffix); 331 | } 332 | 333 | std::string SuRF::Iter::getKeyWithSuffix(unsigned* bitlen) const { 334 | *bitlen = 0; 335 | if (!isValid()) 336 | return std::string(); 337 | if (dense_iter_.isComplete()) 338 | return dense_iter_.getKeyWithSuffix(bitlen); 339 | return dense_iter_.getKeyWithSuffix(bitlen) + sparse_iter_.getKeyWithSuffix(bitlen); 340 | } 341 | 342 | void SuRF::Iter::passToSparse() { 343 | sparse_iter_.setStartNodeNum(dense_iter_.getSendOutNodeNum()); 344 | } 345 | 346 | bool SuRF::Iter::incrementDenseIter() { 347 | if (!dense_iter_.isValid()) 348 | return false; 349 | 350 | dense_iter_++; 351 | if (!dense_iter_.isValid()) 352 | return false; 353 | if (dense_iter_.isMoveLeftComplete()) 354 | return true; 355 | 356 | passToSparse(); 357 | sparse_iter_.moveToLeftMostKey(); 358 | return true; 359 | } 360 | 361 | bool SuRF::Iter::incrementSparseIter() { 362 | if (!sparse_iter_.isValid()) 363 | return false; 364 | sparse_iter_++; 365 | return sparse_iter_.isValid(); 366 | } 367 | 368 | bool SuRF::Iter::operator ++(int) { 369 | if (!isValid()) 370 | return false; 371 | if (incrementSparseIter()) 372 | return true; 373 | return incrementDenseIter(); 374 | } 375 | 376 | bool SuRF::Iter::decrementDenseIter() { 377 | if (!dense_iter_.isValid()) 378 | return false; 379 | 380 | dense_iter_--; 381 | if (!dense_iter_.isValid()) 382 | return false; 383 | if (dense_iter_.isMoveRightComplete()) 384 | return true; 385 | 386 | passToSparse(); 387 | sparse_iter_.moveToRightMostKey(); 388 | return true; 389 | } 390 | 391 | bool SuRF::Iter::decrementSparseIter() { 392 | if (!sparse_iter_.isValid()) 393 | return false; 394 | sparse_iter_--; 395 | return sparse_iter_.isValid(); 396 | } 397 | 398 | bool SuRF::Iter::operator --(int) { 399 | if (!isValid()) 400 | return false; 401 | if (decrementSparseIter()) 402 | return true; 403 | return decrementDenseIter(); 404 | } 405 | 406 | } // namespace surf 407 | 408 | #endif // SURF_H 409 | --------------------------------------------------------------------------------
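The header above is the whole public surface of SuRF: build the filter from a sorted key list, probe it with lookupKey() and lookupRange(), estimate cardinality with approxCount(), and inspect it with getMemoryUsage(). The following is a minimal usage sketch, not a file from the repository; the key values and the suffix configuration (8-bit real suffixes) are made up for illustration, and it assumes the include/ directory is on the compiler's include path. It only exercises calls declared in include/surf.hpp above.

// usage_sketch.cpp -- illustrative only, not part of the SuRF source tree.
#include <iostream>
#include <string>
#include <vector>

#include "surf.hpp"  // assumes include/ is on the include path

int main() {
    // Input keys must be SORTED (see the constructor comment in surf.hpp).
    std::vector<std::string> keys = {"aaa", "bbb", "ccc", "ddd"};

    // Build with 8-bit real suffixes; kNone, kHash, and kMixed are the other
    // SuffixType options used elsewhere in the repository.
    surf::SuRF filter(keys, surf::kReal, 0, 8);

    // Point query: false means the key is definitely absent; true means it is
    // present or a false positive.
    bool point_hit = filter.lookupKey("bbb");

    // Range query over [left, right): true if some stored key may fall inside.
    bool range_hit = filter.lookupRange("b", true, "c", false);

    // Approximate number of keys in [left, right]; per the header comment it
    // may undercount by at most 2 at the boundaries.
    uint64_t approx = filter.approxCount("a", "d");

    std::cout << point_hit << " " << range_hit << " " << approx << " "
              << filter.getMemoryUsage() << " bytes\n";

    // Release the bit vectors owned by the filter.
    filter.destroy();
    return 0;
}

Persistence follows the same pattern: serialize() returns a heap-allocated buffer of serializedSize() bytes, and the static deSerialize() reconstructs a SuRF from such a buffer, as declared in the header above.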