├── src └── CMakeLists.txt ├── .gitmodules ├── bench ├── workload_gen │ ├── ycsb_download.sh │ ├── gen_workload.sh │ ├── workload_spec │ │ ├── workloadc_email_latest │ │ ├── workloadc_email_zipfian │ │ ├── workloadc_randint_latest │ │ ├── workloadc_randint_zipfian │ │ ├── workloadc_email_uniform │ │ ├── workloadc_randint_uniform │ │ └── workload_template │ ├── gen_txn.py │ └── gen_load.py ├── CMakeLists.txt ├── filter.hpp ├── filter_factory.hpp ├── filter_bloom.hpp ├── filter_surf.hpp ├── run.sh ├── bench.hpp ├── workload_arf.cpp ├── bloom.hpp ├── MurmurHash3.h ├── workload_multi_thread.cpp └── workload.cpp ├── test ├── CMakeLists.txt └── unitTest │ ├── CMakeLists.txt │ ├── test_louds_sparse_small.cpp │ ├── test_surf_small.cpp │ ├── test_louds_dense_small.cpp │ ├── test_suffix_vector.cpp │ ├── test_select.cpp │ ├── test_rank.cpp │ ├── test_bitvector.cpp │ ├── test_label_vector.cpp │ └── test_suffix.cpp ├── .gitignore ├── .travis.yml ├── CMakeLists.txt ├── include ├── hash.hpp ├── config.hpp ├── rank.hpp ├── bitvector.hpp ├── select.hpp ├── label_vector.hpp ├── popcount.h ├── suffix.hpp └── surf.hpp ├── README.md ├── simple_example.cpp ├── CodeCoverage.cmake └── LICENSE /src/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_library(surf surf.cpp) 2 | -------------------------------------------------------------------------------- /.gitmodules: -------------------------------------------------------------------------------- 1 | [submodule "ARF"] 2 | path = ARF 3 | url = https://github.com/efficient/ARF.git 4 | branch = master 5 | -------------------------------------------------------------------------------- /bench/workload_gen/ycsb_download.sh: -------------------------------------------------------------------------------- 1 | mkdir ../workloads 2 | curl -O --location https://github.com/brianfrankcooper/YCSB/releases/download/0.12.0/ycsb-0.12.0.tar.gz 3 | tar xfvz ycsb-0.12.0.tar.gz 4 | rm ycsb-0.12.0.tar.gz 5 | mv ycsb-0.12.0 YCSB 6 | -------------------------------------------------------------------------------- /bench/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_executable(workload workload.cpp) 2 | target_link_libraries(workload) 3 | 4 | add_executable(workload_multi_thread workload_multi_thread.cpp) 5 | target_link_libraries(workload_multi_thread) 6 | 7 | #add_executable(workload_arf workload_arf.cpp) 8 | #target_link_libraries(workload_arf ARF) 9 | -------------------------------------------------------------------------------- /bench/workload_gen/gen_workload.sh: -------------------------------------------------------------------------------- 1 | #!bin/bash 2 | 3 | python gen_load.py randint uniform 4 | python gen_txn.py randint uniform 5 | python gen_txn.py randint zipfian 6 | #python gen_txn.py randint latest 7 | 8 | #python gen_load.py email uniform 9 | #python gen_txn.py email uniform 10 | #python gen_txn.py email zipfian 11 | #python gen_txn.py email latest 12 | 13 | 14 | -------------------------------------------------------------------------------- /test/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | find_package(GTest REQUIRED) 2 | include_directories(${GTEST_INCLUDE_DIR}) 3 | 4 | function (add_surf_test file_name ) 5 | add_executable(${file_name} ${file_name}.cpp) 6 | target_link_libraries(${file_name} gtest) 7 | add_test(NAME ${file_name} COMMAND ${CMAKE_CURRENT_BINARY_DIR}/${file_name}) 8 | 
endfunction() 9 | 10 | add_subdirectory(unitTest) -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Prerequisites 2 | *.d 3 | 4 | # Compiled Object files 5 | *.slo 6 | *.lo 7 | *.o 8 | *.obj 9 | 10 | # Precompiled Headers 11 | *.gch 12 | *.pch 13 | 14 | # Compiled Dynamic libraries 15 | *.so 16 | *.dylib 17 | *.dll 18 | 19 | # Fortran module files 20 | *.mod 21 | *.smod 22 | 23 | # Compiled Static libraries 24 | *.lai 25 | *.la 26 | *.a 27 | *.lib 28 | 29 | # Executables 30 | *.exe 31 | *.out 32 | *.app 33 | -------------------------------------------------------------------------------- /bench/filter.hpp: -------------------------------------------------------------------------------- 1 | #ifndef FILTER_H_ 2 | #define FILTER_H_ 3 | 4 | #include <cstdint> 5 | #include <string> 6 | 7 | namespace bench { 8 | 9 | class Filter { 10 | public: 11 | virtual bool lookup(const std::string& key) = 0; 12 | virtual bool lookupRange(const std::string& left_key, const std::string& right_key) = 0; 13 | virtual bool approxCount(const std::string& left_key, const std::string& right_key) = 0; 14 | virtual uint64_t getMemoryUsage() = 0; 15 | }; 16 | 17 | } // namespace bench 18 | 19 | #endif // FILTER_H_ 20 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | language: cpp 2 | sudo: required 3 | dist: xenial 4 | compiler: gcc 5 | 6 | install: 7 | - sudo apt-get install build-essential 8 | - sudo apt-get install cmake 9 | - sudo apt-get install libgtest-dev 10 | - cd /usr/src/gtest 11 | - sudo cmake CMakeLists.txt 12 | - sudo make 13 | - sudo cp *.a /usr/lib 14 | - sudo apt-get install lcov 15 | - sudo apt-get install ruby 16 | - sudo gem install coveralls-lcov 17 | 18 | script: 19 | - cd $TRAVIS_BUILD_DIR 20 | - mkdir build 21 | - cd build 22 | - cmake -DCMAKE_BUILD_TYPE=Debug -DCOVERALLS=ON ..
23 | - make -j 24 | - make coverage 25 | 26 | after_success: 27 | - lcov --remove coverage.info 'test/*' '/usr/*' '/lib/*' --output-file coverage.info 28 | - lcov --list coverage.info 29 | - coveralls-lcov --repo-token=${COVERALLS_TOKEN} coverage.info 30 | -------------------------------------------------------------------------------- /test/unitTest/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | find_package(GTest REQUIRED) 2 | include_directories(${GTEST_INCLUDE_DIR}) 3 | 4 | function (add_unit_test file_name) 5 | add_executable(${file_name} ${file_name}.cpp) 6 | target_link_libraries(${file_name} gtest) 7 | add_test(NAME ${file_name} 8 | COMMAND ${CMAKE_CURRENT_BINARY_DIR}/${file_name} 9 | WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}) 10 | endfunction() 11 | 12 | add_unit_test(test_bitvector) 13 | add_unit_test(test_label_vector) 14 | add_unit_test(test_louds_dense) 15 | add_unit_test(test_louds_dense_small) 16 | add_unit_test(test_louds_sparse) 17 | add_unit_test(test_louds_sparse_small) 18 | add_unit_test(test_rank) 19 | add_unit_test(test_select) 20 | add_unit_test(test_suffix) 21 | add_unit_test(test_surf) 22 | add_unit_test(test_surf_builder) 23 | add_unit_test(test_surf_small) 24 | 25 | -------------------------------------------------------------------------------- /CMakeLists.txt: -------------------------------------------------------------------------------- 1 | cmake_minimum_required (VERSION 2.6) 2 | project (SuRF) 3 | 4 | message(STATUS "Configuring..." ${CMAKE_PROJECT_NAME}) 5 | 6 | if (NOT CMAKE_BUILD_TYPE) 7 | set(CMAKE_BUILD_TYPE "Release") 8 | endif() 9 | 10 | set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS} -g -Wall -mpopcnt -pthread -std=c++11") 11 | set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS} -O3 -Wall -Werror -mpopcnt -pthread -std=c++11") 12 | 13 | option(COVERALLS "Generate coveralls data" OFF) 14 | 15 | if (COVERALLS) 16 | include("${CMAKE_CURRENT_SOURCE_DIR}/CodeCoverage.cmake") 17 | append_coverage_compiler_flags() 18 | set(COVERAGE_EXCLUDES 'ARF/*' 'bench/*' 'test/*' '/usr/*' '/lib/*') 19 | setup_target_for_coverage( 20 | NAME coverage 21 | EXECUTABLE make test 22 | ) 23 | else() 24 | add_definitions(-DNDEBUG) 25 | endif() 26 | 27 | enable_testing() 28 | 29 | include_directories("${CMAKE_CURRENT_SOURCE_DIR}/include") 30 | 31 | add_subdirectory(test) 32 | add_subdirectory(bench) 33 | 34 | #include_directories("${CMAKE_CURRENT_SOURCE_DIR}/ARF/include") 35 | #add_subdirectory(ARF) 36 | -------------------------------------------------------------------------------- /bench/filter_factory.hpp: -------------------------------------------------------------------------------- 1 | #ifndef FILTER_FACTORY_H_ 2 | #define FILTER_FACTORY_H_ 3 | 4 | #include "filter.hpp" 5 | #include "filter_bloom.hpp" 6 | #include "filter_surf.hpp" 7 | 8 | namespace bench { 9 | 10 | class FilterFactory { 11 | public: 12 | static Filter* createFilter(const std::string& filter_type, 13 | const uint32_t suffix_len, 14 | const std::vector& keys) { 15 | if (filter_type.compare(std::string("SuRF")) == 0) 16 | return new FilterSuRF(keys, surf::kNone, 0, 0); 17 | else if (filter_type.compare(std::string("SuRFHash")) == 0) 18 | return new FilterSuRF(keys, surf::kHash, suffix_len, 0); 19 | else if (filter_type.compare(std::string("SuRFReal")) == 0) 20 | return new FilterSuRF(keys, surf::kReal, 0, suffix_len); 21 | else if (filter_type.compare(std::string("SuRFMixed")) == 0) 22 | return new FilterSuRF(keys, surf::kMixed, suffix_len, suffix_len); 
23 | else if (filter_type.compare(std::string("Bloom")) == 0) 24 | return new FilterBloom(keys); 25 | else 26 | return new FilterSuRF(keys, surf::kReal, 0, suffix_len); // default 27 | } 28 | }; 29 | 30 | } // namespace bench 31 | 32 | #endif // FILTER_FACTORY_H 33 | -------------------------------------------------------------------------------- /bench/filter_bloom.hpp: -------------------------------------------------------------------------------- 1 | #ifndef FILTER_BLOOM_H_ 2 | #define FILTER_BLOOM_H_ 3 | 4 | #include 5 | #include 6 | 7 | #include "bloom.hpp" 8 | 9 | namespace bench { 10 | 11 | class FilterBloom : public Filter { 12 | public: 13 | // Requires that keys are sorted 14 | FilterBloom(const std::vector& keys) { 15 | filter_ = new BloomFilter(kBitsPerKey); 16 | filter_->CreateFilter(keys, keys.size(), &filter_data_); 17 | } 18 | 19 | ~FilterBloom() { 20 | delete filter_; 21 | } 22 | 23 | bool lookup(const std::string& key) { 24 | return filter_->KeyMayMatch(key, filter_data_); 25 | } 26 | 27 | bool lookupRange(const std::string& left_key, const std::string& right_key) { 28 | std::cout << kRed << "A Bloom filter does not support range queries\n" << kNoColor; 29 | return false; 30 | } 31 | 32 | bool approxCount(const std::string& left_key, const std::string& right_key) { 33 | std::cout << kRed << "A Bloom filter does not support approximate count queries\n" << kNoColor; 34 | return false; 35 | } 36 | 37 | uint64_t getMemoryUsage() { 38 | return filter_data_.size(); 39 | } 40 | 41 | private: 42 | int kBitsPerKey = 10; 43 | 44 | BloomFilter* filter_; 45 | std::string filter_data_; 46 | }; 47 | 48 | } // namespace bench 49 | 50 | #endif // FILTER_BLOOM_H 51 | -------------------------------------------------------------------------------- /bench/filter_surf.hpp: -------------------------------------------------------------------------------- 1 | #ifndef FILTER_SURF_H_ 2 | #define FILTER_SURF_H_ 3 | 4 | #include 5 | #include 6 | 7 | #include "surf.hpp" 8 | 9 | namespace bench { 10 | 11 | class FilterSuRF : public Filter { 12 | public: 13 | // Requires that keys are sorted 14 | FilterSuRF(const std::vector& keys, 15 | const surf::SuffixType suffix_type, 16 | const uint32_t hash_suffix_len, const uint32_t real_suffix_len) { 17 | // uses default sparse-dense size ratio 18 | filter_ = new surf::SuRF(keys, surf::kIncludeDense, surf::kSparseDenseRatio, 19 | suffix_type, hash_suffix_len, real_suffix_len); 20 | } 21 | 22 | ~FilterSuRF() { 23 | filter_->destroy(); 24 | delete filter_; 25 | } 26 | 27 | bool lookup(const std::string& key) { 28 | return filter_->lookupKey(key); 29 | } 30 | 31 | bool lookupRange(const std::string& left_key, const std::string& right_key) { 32 | //return filter_->lookupRange(left_key, false, right_key, false); 33 | return filter_->lookupRange(left_key, true, right_key, true); 34 | } 35 | 36 | bool approxCount(const std::string& left_key, const std::string& right_key) { 37 | return filter_->approxCount(left_key, right_key); 38 | } 39 | 40 | uint64_t getMemoryUsage() { 41 | return filter_->getMemoryUsage(); 42 | } 43 | 44 | private: 45 | surf::SuRF* filter_; 46 | }; 47 | 48 | } // namespace bench 49 | 50 | #endif // FILTER_SURF_H 51 | -------------------------------------------------------------------------------- /include/hash.hpp: -------------------------------------------------------------------------------- 1 | #ifndef HASH_H_ 2 | #define HASH_H_ 3 | 4 | #include 5 | 6 | namespace surf { 7 | 8 | //****************************************************** 9 | 
//HASH FUNCTION FROM LEVELDB 10 | //****************************************************** 11 | inline uint32_t DecodeFixed32(const char* ptr) { 12 | uint32_t result; 13 | memcpy(&result, ptr, sizeof(result)); // gcc optimizes this to a plain load 14 | return result; 15 | } 16 | 17 | inline uint32_t Hash(const char* data, size_t n, uint32_t seed) { 18 | // Similar to murmur hash 19 | const uint32_t m = 0xc6a4a793; 20 | const uint32_t r = 24; 21 | const char* limit = data + n; 22 | uint32_t h = seed ^ (n * m); 23 | 24 | // Pick up four bytes at a time 25 | while (data + 4 <= limit) { 26 | uint32_t w = DecodeFixed32(data); 27 | data += 4; 28 | h += w; 29 | h *= m; 30 | h ^= (h >> 16); 31 | } 32 | 33 | // Pick up remaining bytes 34 | switch (limit - data) { 35 | case 3: 36 | h += static_cast<unsigned char>(data[2]) << 16; 37 | case 2: 38 | h += static_cast<unsigned char>(data[1]) << 8; 39 | case 1: 40 | h += static_cast<unsigned char>(data[0]); 41 | h *= m; 42 | h ^= (h >> r); 43 | break; 44 | } 45 | return h; 46 | } 47 | 48 | inline uint32_t suffixHash(const std::string &key) { 49 | return Hash(key.c_str(), key.size(), 0xbc9f1d34); 50 | } 51 | 52 | inline uint32_t suffixHash(const char* key, const int keylen) { 53 | return Hash(key, keylen, 0xbc9f1d34); 54 | } 55 | 56 | } // namespace surf 57 | 58 | #endif // HASH_H_ 59 | 60 | -------------------------------------------------------------------------------- /bench/run.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | echo 'Bloom Filter, random int, point queries' 4 | ../build/bench/workload Bloom 1 mixed 50 0 randint point zipfian 5 | 6 | echo 'SuRF, random int, point queries' 7 | ../build/bench/workload SuRF 1 mixed 50 0 randint point zipfian 8 | 9 | echo 'SuRFHash, 4-bit suffixes, random int, point queries' 10 | ../build/bench/workload SuRFHash 4 mixed 50 0 randint point zipfian 11 | 12 | echo 'SuRFReal, 4-bit suffixes, random int, point queries' 13 | ../build/bench/workload SuRFReal 4 mixed 50 0 randint point zipfian 14 | 15 | echo 'SuRFMixed, 2-bit hash suffixes and 2-bit real suffixes, random int, point queries' 16 | ../build/bench/workload SuRFMixed 2 mixed 50 0 randint mix zipfian 17 | 18 | 19 | # echo 'Bloom Filter, email, point queries' 20 | # ../build/bench/workload Bloom 1 mixed 50 0 email point zipfian 21 | 22 | # echo 'SuRF, email, point queries' 23 | # ../build/bench/workload SuRF 1 mixed 50 0 email point zipfian 24 | 25 | # echo 'SuRFHash, 4-bit suffixes, email, point queries' 26 | # ../build/bench/workload SuRFHash 4 mixed 50 0 email point zipfian 27 | 28 | # echo 'SuRFReal, 4-bit suffixes, email, point queries' 29 | # ../build/bench/workload SuRFReal 4 mixed 50 0 email point zipfian 30 | 31 | # echo 'SuRFMixed, 2-bit hash suffixes and 2-bit real suffixes, email, point queries' 32 | # ../build/bench/workload SuRFMixed 2 mixed 50 0 email mix zipfian 33 | 34 | 35 | echo 'SuRFReal, 4-bit suffixes, random int, range queries' 36 | ../build/bench/workload SuRFReal 4 mixed 50 0 randint range zipfian 37 | 38 | # echo 'SuRFReal, 4-bit suffixes, email, range queries' 39 | # ../build/bench/workload SuRFReal 4 mixed 50 0 email range zipfian 40 | 41 | -------------------------------------------------------------------------------- /include/config.hpp: -------------------------------------------------------------------------------- 1 | #ifndef CONFIG_H_ 2 | #define CONFIG_H_ 3 | 4 | #include <cstdint> 5 | #include <cstring> 6 | 7 | namespace surf { 8 | 9 | using level_t = uint32_t; 10 | using position_t = uint32_t; 11 | static const position_t
kMaxPos = UINT32_MAX; 12 | 13 | using label_t = uint8_t; 14 | static const position_t kFanout = 256; 15 | 16 | using word_t = uint64_t; 17 | static const unsigned kWordSize = 64; 18 | static const word_t kMsbMask = 0x8000000000000000; 19 | static const word_t kOneMask = 0xFFFFFFFFFFFFFFFF; 20 | 21 | static const bool kIncludeDense = true; 22 | //static const uint32_t kSparseDenseRatio = 64; 23 | static const uint32_t kSparseDenseRatio = 16; 24 | static const label_t kTerminator = 255; 25 | 26 | static const int kHashShift = 7; 27 | 28 | static const int kCouldBePositive = 2018; // used in suffix comparison 29 | 30 | enum SuffixType { 31 | kNone = 0, 32 | kHash = 1, 33 | kReal = 2, 34 | kMixed = 3 35 | }; 36 | 37 | void align(char*& ptr) { 38 | ptr = (char*)(((uint64_t)ptr + 7) & ~((uint64_t)7)); 39 | } 40 | 41 | void sizeAlign(position_t& size) { 42 | size = (size + 7) & ~((position_t)7); 43 | } 44 | 45 | void sizeAlign(uint64_t& size) { 46 | size = (size + 7) & ~((uint64_t)7); 47 | } 48 | 49 | std::string uint64ToString(const uint64_t word) { 50 | uint64_t endian_swapped_word = __builtin_bswap64(word); 51 | return std::string(reinterpret_cast(&endian_swapped_word), 8); 52 | } 53 | 54 | uint64_t stringToUint64(const std::string& str_word) { 55 | uint64_t int_word = 0; 56 | memcpy(reinterpret_cast(&int_word), str_word.data(), 8); 57 | return __builtin_bswap64(int_word); 58 | } 59 | 60 | } // namespace surf 61 | 62 | #endif // CONFIG_H_ 63 | -------------------------------------------------------------------------------- /test/unitTest/test_louds_sparse_small.cpp: -------------------------------------------------------------------------------- 1 | #include "gtest/gtest.h" 2 | 3 | #include 4 | 5 | #include 6 | #include 7 | 8 | #include "config.hpp" 9 | #include "surf.hpp" 10 | 11 | namespace surf { 12 | 13 | namespace surftest { 14 | 15 | static const bool kIncludeDense = false; 16 | static const uint32_t kSparseDenseRatio = 0; 17 | static const SuffixType kSuffixType = kReal; 18 | static const level_t kSuffixLen = 8; 19 | 20 | class SuRFSmallTest : public ::testing::Test { 21 | public: 22 | virtual void SetUp () {} 23 | virtual void TearDown () {} 24 | }; 25 | 26 | TEST_F (SuRFSmallTest, ExampleInPaperTest) { 27 | std::vector keys; 28 | 29 | keys.push_back(std::string("f")); 30 | keys.push_back(std::string("far")); 31 | keys.push_back(std::string("fas")); 32 | keys.push_back(std::string("fast")); 33 | keys.push_back(std::string("fat")); 34 | keys.push_back(std::string("s")); 35 | keys.push_back(std::string("top")); 36 | keys.push_back(std::string("toy")); 37 | keys.push_back(std::string("trie")); 38 | keys.push_back(std::string("trip")); 39 | keys.push_back(std::string("try")); 40 | 41 | SuRFBuilder* builder = new SuRFBuilder(kIncludeDense, kSparseDenseRatio, kSuffixType, 0, kSuffixLen); 42 | builder->build(keys); 43 | LoudsSparse* louds_sparse = new LoudsSparse(builder); 44 | LoudsSparse::Iter iter(louds_sparse); 45 | 46 | louds_sparse->moveToKeyGreaterThan(std::string("to"), true, iter); 47 | ASSERT_TRUE(iter.isValid()); 48 | ASSERT_EQ(0, iter.getKey().compare("top")); 49 | iter++; 50 | ASSERT_EQ(0, iter.getKey().compare("toy")); 51 | } 52 | 53 | } // namespace surftest 54 | 55 | } // namespace surf 56 | 57 | int main (int argc, char** argv) { 58 | ::testing::InitGoogleTest(&argc, argv); 59 | return RUN_ALL_TESTS(); 60 | } 61 | -------------------------------------------------------------------------------- /test/unitTest/test_surf_small.cpp: 
-------------------------------------------------------------------------------- 1 | #include "gtest/gtest.h" 2 | 3 | #include 4 | 5 | #include 6 | #include 7 | 8 | #include "config.hpp" 9 | #include "surf.hpp" 10 | 11 | namespace surf { 12 | 13 | namespace surftest { 14 | 15 | static const SuffixType kSuffixType = kReal; 16 | static const level_t kSuffixLen = 8; 17 | 18 | class SuRFSmallTest : public ::testing::Test { 19 | public: 20 | virtual void SetUp () {} 21 | virtual void TearDown () {} 22 | }; 23 | 24 | TEST_F (SuRFSmallTest, ExampleInPaperTest) { 25 | std::vector keys; 26 | 27 | keys.push_back(std::string("f")); 28 | keys.push_back(std::string("far")); 29 | keys.push_back(std::string("fas")); 30 | keys.push_back(std::string("fast")); 31 | keys.push_back(std::string("fat")); 32 | keys.push_back(std::string("s")); 33 | keys.push_back(std::string("top")); 34 | keys.push_back(std::string("toy")); 35 | keys.push_back(std::string("trie")); 36 | keys.push_back(std::string("trip")); 37 | keys.push_back(std::string("try")); 38 | 39 | SuRF* surf = new SuRF(keys, kIncludeDense, kSparseDenseRatio, kSuffixType, 0, kSuffixLen); 40 | bool exist = surf->lookupRange(std::string("top"), false, std::string("toyy"), false); 41 | ASSERT_TRUE(exist); 42 | exist = surf->lookupRange(std::string("toq"), false, std::string("toyy"), false); 43 | ASSERT_TRUE(exist); 44 | exist = surf->lookupRange(std::string("trie"), false, std::string("tripp"), false); 45 | ASSERT_TRUE(exist); 46 | 47 | SuRF::Iter iter = surf->moveToKeyGreaterThan(std::string("t"), true); 48 | ASSERT_TRUE(iter.isValid()); 49 | iter++; 50 | ASSERT_TRUE(iter.isValid()); 51 | } 52 | 53 | } // namespace surftest 54 | 55 | } // namespace surf 56 | 57 | int main (int argc, char** argv) { 58 | ::testing::InitGoogleTest(&argc, argv); 59 | return RUN_ALL_TESTS(); 60 | } 61 | -------------------------------------------------------------------------------- /test/unitTest/test_louds_dense_small.cpp: -------------------------------------------------------------------------------- 1 | #include "gtest/gtest.h" 2 | 3 | #include 4 | 5 | #include 6 | #include 7 | 8 | #include "config.hpp" 9 | #include "surf.hpp" 10 | 11 | namespace surf { 12 | 13 | namespace surftest { 14 | 15 | static const bool kIncludeDense = true; 16 | static const uint32_t kSparseDenseRatio = 0; 17 | static const SuffixType kSuffixType = kReal; 18 | static const level_t kSuffixLen = 8; 19 | 20 | class SuRFSmallTest : public ::testing::Test { 21 | public: 22 | virtual void SetUp () {} 23 | virtual void TearDown () {} 24 | }; 25 | 26 | TEST_F (SuRFSmallTest, ExampleInPaperTest) { 27 | std::vector keys; 28 | 29 | keys.push_back(std::string("f")); 30 | keys.push_back(std::string("far")); 31 | keys.push_back(std::string("fas")); 32 | keys.push_back(std::string("fast")); 33 | keys.push_back(std::string("fat")); 34 | keys.push_back(std::string("s")); 35 | keys.push_back(std::string("top")); 36 | keys.push_back(std::string("toy")); 37 | keys.push_back(std::string("trie")); 38 | keys.push_back(std::string("trip")); 39 | keys.push_back(std::string("try")); 40 | 41 | SuRFBuilder* builder = new SuRFBuilder(kIncludeDense, kSparseDenseRatio, kSuffixType, 0, kSuffixLen); 42 | builder->build(keys); 43 | LoudsDense* louds_dense = new LoudsDense(builder); 44 | LoudsDense::Iter iter(louds_dense); 45 | 46 | louds_dense->moveToKeyGreaterThan(std::string("to"), true, iter); 47 | ASSERT_TRUE(iter.isValid()); 48 | ASSERT_EQ(0, iter.getKey().compare("top")); 49 | iter++; 50 | ASSERT_EQ(0, 
iter.getKey().compare("toy")); 51 | 52 | iter.clear(); 53 | louds_dense->moveToKeyGreaterThan(std::string("fas"), true, iter); 54 | ASSERT_TRUE(iter.isValid()); 55 | ASSERT_EQ(0, iter.getKey().compare("fas")); 56 | } 57 | 58 | } // namespace surftest 59 | 60 | } // namespace surf 61 | 62 | int main (int argc, char** argv) { 63 | ::testing::InitGoogleTest(&argc, argv); 64 | return RUN_ALL_TESTS(); 65 | } 66 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Succinct Range Filter (SuRF) 2 | [![Build Status](https://travis-ci.org/efficient/SuRF.svg?branch=master)](https://travis-ci.org/efficient/SuRF) 3 | [![Coverage Status](https://coveralls.io/repos/github/efficient/SuRF/badge.svg?branch=master)](https://coveralls.io/github/efficient/SuRF?branch=master) 4 | 5 | **SuRF** is a fast and compact filter that provides exact-match filtering, 6 | range filtering, and approximate range counts. This is the source code for our 7 | [SIGMOD best paper](http://www.cs.cmu.edu/~huanche1/publications/surf_paper.pdf). 8 | We also host a [demo website](https://www.rangefilter.io/). 9 | The RocksDB experiments with SuRF can be found [here](https://github.com/efficient/rocksdb). 10 | 11 | ## Install Dependencies 12 | sudo apt-get install build-essential cmake libgtest-dev 13 | cd /usr/src/gtest 14 | sudo cmake CMakeLists.txt 15 | sudo make 16 | sudo cp *.a /usr/lib 17 | 18 | ## Build 19 | git submodule init 20 | git submodule update 21 | mkdir build 22 | cd build 23 | cmake .. 24 | make -j 25 | 26 | ## Simple Example 27 | A simple example can be found [here](https://github.com/efficient/SuRF/blob/master/simple_example.cpp). To run the example: 28 | ``` 29 | g++ -mpopcnt -std=c++11 simple_example.cpp 30 | ./a.out 31 | ``` 32 | Note that the key list passed to the SuRF constructor must be SORTED. 33 | 34 | ## Run Unit Tests 35 | make test 36 | 37 | ## Benchmark 38 | 39 | ### Step 1: Download YCSB 40 | cd bench/workload_gen 41 | bash ycsb_download.sh 42 | 43 | ### Step 2: Generate Workloads 44 | cd bench/workload_gen 45 | bash gen_workload.sh 46 | You must provide your own email list to generate email-key workloads. 47 | 48 | ### Step 3: Run Workloads 49 | cd bench 50 | bash run.sh 51 | Note that `run.sh` only includes several representative runs. 52 | Refer to `bench/workload.cpp`, `bench/workload_multi_thread.cpp` 53 | and `bench/workload_arf.cpp` for more experiment configurations. 54 | 55 | ## License 56 | Copyright 2018, Carnegie Mellon University 57 | 58 | Licensed under the [Apache License](https://github.com/efficient/SuRF/blob/master/LICENSE).
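
## Approximate Count Example
Beyond point and range membership, SuRF also exposes approximate range counts and memory accounting; the benchmark wrapper in `bench/filter_surf.hpp` drives these through `lookupKey`, `lookupRange`, `approxCount`, and `getMemoryUsage`. The sketch below is illustrative only: the key set, the 8-bit real-suffix configuration, and the printed labels are made up for the example, while the constructor arguments mirror those used in `bench/filter_surf.hpp`. It compiles the same way as `simple_example.cpp`.

```cpp
// Minimal sketch: build a SuRF over a sorted key set and query it.
#include <iostream>
#include <string>
#include <vector>

#include "include/surf.hpp"

int main() {
    // Keys must be sorted, as noted above.
    std::vector<std::string> keys = {"f", "far", "fast", "s", "top", "toy", "trie"};

    // Default dense/sparse split (kIncludeDense, kSparseDenseRatio) with
    // 8-bit real suffixes, mirroring FilterSuRF in bench/filter_surf.hpp.
    surf::SuRF* filter = new surf::SuRF(keys, surf::kIncludeDense,
                                        surf::kSparseDenseRatio,
                                        surf::kReal, 0, 8);

    // Membership queries: no false negatives, occasional false positives.
    std::cout << "lookupKey(\"top\"): " << filter->lookupKey("top") << std::endl;
    std::cout << "lookupRange[\"fare\", \"fase\"): "
              << filter->lookupRange("fare", true, "fase", false) << std::endl;

    // Approximate number of keys in the range, and the filter's in-memory size.
    std::cout << "approxCount(\"far\", \"toy\"): "
              << filter->approxCount("far", "toy") << std::endl;
    std::cout << "memory (bytes): " << filter->getMemoryUsage() << std::endl;

    filter->destroy();
    delete filter;
    return 0;
}
```
Compile it like the simple example, e.g. `g++ -mpopcnt -std=c++11 approx_count_example.cpp` (file name is arbitrary).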
59 | -------------------------------------------------------------------------------- /test/unitTest/test_suffix_vector.cpp: -------------------------------------------------------------------------------- 1 | #include "gtest/gtest.h" 2 | 3 | #include 4 | 5 | #include 6 | #include 7 | #include 8 | 9 | #include "config.hpp" 10 | #include "suffix_vector.hpp" 11 | #include "surf_builder.hpp" 12 | 13 | namespace surf { 14 | 15 | // DEPRECATED 16 | namespace suffixvectortest { 17 | 18 | static const std::string kFilePath = "../../../test/words.txt"; 19 | static const int kTestSize = 234369; 20 | static std::vector words; 21 | 22 | class SuffixVectorUnitTest : public ::testing::Test { 23 | public: 24 | virtual void SetUp () { 25 | ; 26 | } 27 | virtual void TearDown () { 28 | delete builder_; 29 | delete suffixes_; 30 | } 31 | 32 | SuRFBuilder* builder_; 33 | SuffixVector* suffixes_; 34 | }; 35 | 36 | TEST_F (SuffixVectorUnitTest, buildNoneTest) { 37 | bool include_dense = false; 38 | uint32_t sparse_dense_ratio = 0; 39 | builder_ = new SuRFBuilder(include_dense, sparse_dense_ratio, kNone); 40 | builder_->build(words); 41 | suffixes_ = new SuffixVector(kNone, builder_->getSuffixes()); 42 | } 43 | 44 | TEST_F (SuffixVectorUnitTest, buildHashTest) { 45 | bool include_dense = false; 46 | uint32_t sparse_dense_ratio = 0; 47 | builder_ = new SuRFBuilder(include_dense, sparse_dense_ratio, kHash); 48 | builder_->build(words); 49 | suffixes_ = new SuffixVector(kHash, builder_->getSuffixes()); 50 | } 51 | 52 | TEST_F (SuffixVectorUnitTest, buildRealTest) { 53 | bool include_dense = false; 54 | uint32_t sparse_dense_ratio = 0; 55 | builder_ = new SuRFBuilder(include_dense, sparse_dense_ratio, kReal); 56 | builder_->build(words); 57 | suffixes_ = new SuffixVector(kReal, builder_->getSuffixes()); 58 | } 59 | 60 | //TODO checkEqualityTest 61 | //TODO compareTest 62 | 63 | void loadWordList() { 64 | std::ifstream infile(kFilePath); 65 | std::string key; 66 | int count = 0; 67 | while (infile.good() && count < kTestSize) { 68 | infile >> key; 69 | words.push_back(key); 70 | count++; 71 | } 72 | } 73 | 74 | } // namespace suffixvectortest 75 | 76 | } // namespace surf 77 | 78 | int main (int argc, char** argv) { 79 | ::testing::InitGoogleTest(&argc, argv); 80 | surf::suffixvectortest::loadWordList(); 81 | return RUN_ALL_TESTS(); 82 | } 83 | -------------------------------------------------------------------------------- /bench/workload_gen/workload_spec/workloadc_email_latest: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2010 Yahoo! Inc. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); you 4 | # may not use this file except in compliance with the License. You 5 | # may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 12 | # implied. See the License for the specific language governing 13 | # permissions and limitations under the License. See accompanying 14 | # LICENSE file. 15 | 16 | # Yahoo! 
Cloud System Benchmark 17 | # Workload C: Read only 18 | # Application example: user profile cache, where profiles are constructed elsewhere (e.g., Hadoop) 19 | # 20 | # Read/update ratio: 100/0 21 | # Default data size: 1 KB records (10 fields, 100 bytes each, plus key) 22 | # Request distribution: zipfian 23 | 24 | recordcount=25000000 25 | operationcount=10000000 26 | workload=com.yahoo.ycsb.workloads.CoreWorkload 27 | 28 | readallfields=true 29 | 30 | readproportion=1 31 | updateproportion=0 32 | scanproportion=0 33 | insertproportion=0 34 | 35 | requestdistribution=latest 36 | 37 | 38 | 39 | -------------------------------------------------------------------------------- /bench/workload_gen/workload_spec/workloadc_email_zipfian: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2010 Yahoo! Inc. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); you 4 | # may not use this file except in compliance with the License. You 5 | # may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 12 | # implied. See the License for the specific language governing 13 | # permissions and limitations under the License. See accompanying 14 | # LICENSE file. 15 | 16 | # Yahoo! Cloud System Benchmark 17 | # Workload C: Read only 18 | # Application example: user profile cache, where profiles are constructed elsewhere (e.g., Hadoop) 19 | # 20 | # Read/update ratio: 100/0 21 | # Default data size: 1 KB records (10 fields, 100 bytes each, plus key) 22 | # Request distribution: zipfian 23 | 24 | recordcount=25000000 25 | operationcount=10000000 26 | workload=com.yahoo.ycsb.workloads.CoreWorkload 27 | 28 | readallfields=true 29 | 30 | readproportion=1 31 | updateproportion=0 32 | scanproportion=0 33 | insertproportion=0 34 | 35 | requestdistribution=zipfian 36 | 37 | 38 | 39 | -------------------------------------------------------------------------------- /bench/workload_gen/workload_spec/workloadc_randint_latest: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2010 Yahoo! Inc. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); you 4 | # may not use this file except in compliance with the License. You 5 | # may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 12 | # implied. See the License for the specific language governing 13 | # permissions and limitations under the License. See accompanying 14 | # LICENSE file. 15 | 16 | # Yahoo! 
Cloud System Benchmark 17 | # Workload C: Read only 18 | # Application example: user profile cache, where profiles are constructed elsewhere (e.g., Hadoop) 19 | # 20 | # Read/update ratio: 100/0 21 | # Default data size: 1 KB records (10 fields, 100 bytes each, plus key) 22 | # Request distribution: zipfian 23 | 24 | recordcount=100000000 25 | operationcount=10000000 26 | workload=com.yahoo.ycsb.workloads.CoreWorkload 27 | 28 | readallfields=true 29 | 30 | readproportion=1 31 | updateproportion=0 32 | scanproportion=0 33 | insertproportion=0 34 | 35 | requestdistribution=latest 36 | 37 | 38 | 39 | -------------------------------------------------------------------------------- /bench/workload_gen/workload_spec/workloadc_randint_zipfian: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2010 Yahoo! Inc. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); you 4 | # may not use this file except in compliance with the License. You 5 | # may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 12 | # implied. See the License for the specific language governing 13 | # permissions and limitations under the License. See accompanying 14 | # LICENSE file. 15 | 16 | # Yahoo! Cloud System Benchmark 17 | # Workload C: Read only 18 | # Application example: user profile cache, where profiles are constructed elsewhere (e.g., Hadoop) 19 | # 20 | # Read/update ratio: 100/0 21 | # Default data size: 1 KB records (10 fields, 100 bytes each, plus key) 22 | # Request distribution: zipfian 23 | 24 | recordcount=100000000 25 | operationcount=10000000 26 | workload=com.yahoo.ycsb.workloads.CoreWorkload 27 | 28 | readallfields=true 29 | 30 | readproportion=1 31 | updateproportion=0 32 | scanproportion=0 33 | insertproportion=0 34 | 35 | requestdistribution=zipfian 36 | 37 | 38 | 39 | -------------------------------------------------------------------------------- /bench/workload_gen/workload_spec/workloadc_email_uniform: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2010 Yahoo! Inc. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); you 4 | # may not use this file except in compliance with the License. You 5 | # may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 12 | # implied. See the License for the specific language governing 13 | # permissions and limitations under the License. See accompanying 14 | # LICENSE file. 15 | 16 | # Yahoo! 
Cloud System Benchmark 17 | # Workload C: Read only 18 | # Application example: user profile cache, where profiles are constructed elsewhere (e.g., Hadoop) 19 | # 20 | # Read/update ratio: 100/0 21 | # Default data size: 1 KB records (10 fields, 100 bytes each, plus key) 22 | # Request distribution: zipfian 23 | 24 | recordcount=25000000 25 | operationcount=10000000 26 | workload=com.yahoo.ycsb.workloads.CoreWorkload 27 | 28 | fieldcount=1 29 | fieldlength=10 30 | readallfields=true 31 | 32 | readproportion=1 33 | updateproportion=0 34 | scanproportion=0 35 | insertproportion=0 36 | 37 | requestdistribution=uniform 38 | 39 | 40 | 41 | -------------------------------------------------------------------------------- /bench/workload_gen/workload_spec/workloadc_randint_uniform: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2010 Yahoo! Inc. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); you 4 | # may not use this file except in compliance with the License. You 5 | # may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 12 | # implied. See the License for the specific language governing 13 | # permissions and limitations under the License. See accompanying 14 | # LICENSE file. 15 | 16 | # Yahoo! Cloud System Benchmark 17 | # Workload C: Read only 18 | # Application example: user profile cache, where profiles are constructed elsewhere (e.g., Hadoop) 19 | # 20 | # Read/update ratio: 100/0 21 | # Default data size: 1 KB records (10 fields, 100 bytes each, plus key) 22 | # Request distribution: zipfian 23 | 24 | recordcount=100000000 25 | operationcount=10000000 26 | workload=com.yahoo.ycsb.workloads.CoreWorkload 27 | 28 | fieldcount=1 29 | fieldlength=10 30 | readallfields=true 31 | 32 | readproportion=1 33 | updateproportion=0 34 | scanproportion=0 35 | insertproportion=0 36 | 37 | requestdistribution=uniform 38 | 39 | 40 | 41 | -------------------------------------------------------------------------------- /simple_example.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | 4 | #include "include/surf.hpp" 5 | 6 | using namespace surf; 7 | 8 | int main() { 9 | std::vector keys = { 10 | "f", 11 | "far", 12 | "fast", 13 | "s", 14 | "top", 15 | "toy", 16 | "trie", 17 | }; 18 | 19 | // basic surf 20 | SuRF* surf = new SuRF(keys); 21 | 22 | // use default dense-to-sparse ratio; specify suffix type and length 23 | SuRF* surf_hash = new SuRF(keys, surf::kHash, 8, 0); 24 | SuRF* surf_real = new SuRF(keys, surf::kReal, 0, 8); 25 | 26 | // customize dense-to-sparse ratio; specify suffix type and length 27 | SuRF* surf_mixed = new SuRF(keys, true, 16, surf::kMixed, 4, 4); 28 | 29 | //---------------------------------------- 30 | // point queries 31 | //---------------------------------------- 32 | std::cout << "Point Query Example: fase" << std::endl; 33 | 34 | std::string key = "fase"; 35 | 36 | if (surf->lookupKey(key)) 37 | std::cout << "False Positive: "<< key << " found in basic SuRF" << std::endl; 38 | else 39 | std::cout << "Correct: " << key << " NOT found in basic SuRF" << std::endl; 40 | 41 | if (surf_hash->lookupKey(key)) 42 | std::cout << "False Positive: " << key << " found 
in SuRF hash" << std::endl; 43 | else 44 | std::cout << "Correct: " << key << " NOT found in SuRF hash" << std::endl; 45 | 46 | if (surf_real->lookupKey(key)) 47 | std::cout << "False Positive: " << key << " found in SuRF real" << std::endl; 48 | else 49 | std::cout << "Correct: " << key << " NOT found in SuRF real" << std::endl; 50 | 51 | if (surf_mixed->lookupKey(key)) 52 | std::cout << "False Positive: " << key << " found in SuRF mixed" << std::endl; 53 | else 54 | std::cout << "Correct: " << key << " NOT found in SuRF mixed" << std::endl; 55 | 56 | //---------------------------------------- 57 | // range queries 58 | //---------------------------------------- 59 | std::cout << "\nRange Query Example: [fare, fase)" << std::endl; 60 | 61 | std::string left_key = "fare"; 62 | std::string right_key = "fase"; 63 | 64 | if (surf->lookupRange(left_key, true, right_key, false)) 65 | std::cout << "False Positive: There exist key(s) within range [" 66 | << left_key << ", " << right_key << ") " << "according to basic SuRF" << std::endl; 67 | else 68 | std::cout << "Correct: No key exists within range [" 69 | << left_key << ", " << right_key << ") " << "according to basic SuRF" << std::endl; 70 | 71 | if (surf_hash->lookupRange(left_key, true, right_key, false)) 72 | std::cout << "False Positive: There exist key(s) within range [" 73 | << left_key << ", " << right_key << ") " << "according to SuRF hash" << std::endl; 74 | else 75 | std::cout << "Correct: No key exists within range [" 76 | << left_key << ", " << right_key << ") " << "according to SuRF hash" << std::endl; 77 | 78 | if (surf_real->lookupRange(left_key, true, right_key, false)) 79 | std::cout << "False Positive: There exist key(s) within range [" 80 | << left_key << ", " << right_key << ") " << "according to SuRF real" << std::endl; 81 | else 82 | std::cout << "Correct: No key exists within range [" 83 | << left_key << ", " << right_key << ") " << "according to SuRF real" << std::endl; 84 | 85 | if (surf_mixed->lookupRange(left_key, true, right_key, false)) 86 | std::cout << "False Positive: There exist key(s) within range [" 87 | << left_key << ", " << right_key << ") " << "according to SuRF mixed" << std::endl; 88 | else 89 | std::cout << "Correct: No key exists within range [" 90 | << left_key << ", " << right_key << ") " << "according to SuRF mixed" << std::endl; 91 | 92 | return 0; 93 | } 94 | -------------------------------------------------------------------------------- /test/unitTest/test_select.cpp: -------------------------------------------------------------------------------- 1 | #include "gtest/gtest.h" 2 | 3 | #include 4 | 5 | #include 6 | #include 7 | #include 8 | 9 | #include "config.hpp" 10 | #include "select.hpp" 11 | #include "surf_builder.hpp" 12 | 13 | namespace surf { 14 | 15 | namespace selecttest { 16 | 17 | static const std::string kFilePath = "../../../test/words.txt"; 18 | static const int kTestSize = 234369; 19 | static std::vector words; 20 | 21 | class SelectUnitTest : public ::testing::Test { 22 | public: 23 | virtual void SetUp () { 24 | bool include_dense = false; 25 | uint32_t sparse_dense_ratio = 0; 26 | level_t suffix_len = 8; 27 | builder_ = new SuRFBuilder(include_dense, sparse_dense_ratio, kReal, 0, suffix_len); 28 | data_ = nullptr; 29 | num_items_ = 0; 30 | } 31 | virtual void TearDown () { 32 | delete builder_; 33 | if (data_) 34 | delete[] data_; 35 | } 36 | 37 | void setupWordsTest(); 38 | void testSerialize(); 39 | void testSelect(); 40 | 41 | static const position_t kSelectSampleInterval 
= 64; 42 | 43 | SuRFBuilder* builder_; 44 | BitvectorSelect* bv_; 45 | std::vector num_items_per_level_; 46 | position_t num_items_; 47 | char* data_; 48 | }; 49 | 50 | void SelectUnitTest::setupWordsTest() { 51 | builder_->build(words); 52 | for (level_t level = 0; level < builder_->getTreeHeight(); level++) 53 | num_items_per_level_.push_back(builder_->getLabels()[level].size()); 54 | for (level_t level = 0; level < num_items_per_level_.size(); level++) 55 | num_items_ += num_items_per_level_[level]; 56 | bv_ = new BitvectorSelect(kSelectSampleInterval, builder_->getLoudsBits(), num_items_per_level_); 57 | } 58 | 59 | void SelectUnitTest::testSerialize() { 60 | uint64_t size = bv_->serializedSize(); 61 | ASSERT_TRUE((bv_->size() - size) >= 0); 62 | data_ = new char[size]; 63 | BitvectorSelect* ori_bv = bv_; 64 | char* data = data_; 65 | ori_bv->serialize(data); 66 | data = data_; 67 | bv_ = BitvectorSelect::deSerialize(data); 68 | 69 | ASSERT_EQ(ori_bv->bitsSize(), bv_->bitsSize()); 70 | ASSERT_EQ(ori_bv->selectLutSize(), bv_->selectLutSize()); 71 | 72 | ori_bv->destroy(); 73 | delete ori_bv; 74 | } 75 | 76 | void SelectUnitTest::testSelect() { 77 | position_t rank = 1; 78 | for (position_t pos = 0; pos < num_items_; pos++) { 79 | if (bv_->readBit(pos)) { 80 | position_t select = bv_->select(rank); 81 | ASSERT_EQ(pos, select); 82 | rank++; 83 | } 84 | } 85 | } 86 | 87 | TEST_F (SelectUnitTest, readBitTest) { 88 | setupWordsTest(); 89 | position_t bv_pos = 0; 90 | for (level_t level = 0; level < builder_->getTreeHeight(); level++) { 91 | for (position_t pos = 0; pos < num_items_per_level_[level]; pos++) { 92 | bool expected_bit = SuRFBuilder::readBit(builder_->getLoudsBits()[level], pos); 93 | bool bv_bit = bv_->readBit(bv_pos); 94 | ASSERT_EQ(expected_bit, bv_bit); 95 | bv_pos++; 96 | } 97 | } 98 | bv_->destroy(); 99 | delete bv_; 100 | } 101 | 102 | TEST_F (SelectUnitTest, selectTest) { 103 | setupWordsTest(); 104 | testSelect(); 105 | } 106 | 107 | TEST_F (SelectUnitTest, serializeTest) { 108 | setupWordsTest(); 109 | testSerialize(); 110 | testSelect(); 111 | } 112 | 113 | void loadWordList() { 114 | std::ifstream infile(kFilePath); 115 | std::string key; 116 | int count = 0; 117 | while (infile.good() && count < kTestSize) { 118 | infile >> key; 119 | words.push_back(key); 120 | count++; 121 | } 122 | } 123 | 124 | } // namespace ranktest 125 | 126 | } // namespace surf 127 | 128 | int main (int argc, char** argv) { 129 | ::testing::InitGoogleTest(&argc, argv); 130 | surf::selecttest::loadWordList(); 131 | return RUN_ALL_TESTS(); 132 | } 133 | -------------------------------------------------------------------------------- /include/rank.hpp: -------------------------------------------------------------------------------- 1 | #ifndef RANK_H_ 2 | #define RANK_H_ 3 | 4 | #include "bitvector.hpp" 5 | 6 | #include 7 | 8 | #include 9 | 10 | #include "popcount.h" 11 | 12 | namespace surf { 13 | 14 | class BitvectorRank : public Bitvector { 15 | public: 16 | BitvectorRank() : basic_block_size_(0), rank_lut_(nullptr) {}; 17 | 18 | BitvectorRank(const position_t basic_block_size, 19 | const std::vector >& bitvector_per_level, 20 | const std::vector& num_bits_per_level, 21 | const level_t start_level = 0, 22 | const level_t end_level = 0/* non-inclusive */) 23 | : Bitvector(bitvector_per_level, num_bits_per_level, start_level, end_level) { 24 | basic_block_size_ = basic_block_size; 25 | initRankLut(); 26 | } 27 | 28 | ~BitvectorRank() {} 29 | 30 | // Counts the number of 1's in the bitvector up 
to position pos. 31 | // pos is zero-based; count is one-based. 32 | // E.g., for bitvector: 100101000, rank(3) = 2 33 | position_t rank(position_t pos) const { 34 | assert(pos <= num_bits_); 35 | position_t word_per_basic_block = basic_block_size_ / kWordSize; 36 | position_t block_id = pos / basic_block_size_; 37 | position_t offset = pos & (basic_block_size_ - 1); 38 | return (rank_lut_[block_id] 39 | + popcountLinear(bits_, block_id * word_per_basic_block, offset + 1)); 40 | } 41 | 42 | position_t rankLutSize() const { 43 | return ((num_bits_ / basic_block_size_ + 1) * sizeof(position_t)); 44 | } 45 | 46 | position_t serializedSize() const { 47 | position_t size = sizeof(num_bits_) + sizeof(basic_block_size_) 48 | + bitsSize() + rankLutSize(); 49 | sizeAlign(size); 50 | return size; 51 | } 52 | 53 | position_t size() const { 54 | return (sizeof(BitvectorRank) + bitsSize() + rankLutSize()); 55 | } 56 | 57 | void prefetch(position_t pos) const { 58 | __builtin_prefetch(bits_ + (pos / kWordSize)); 59 | __builtin_prefetch(rank_lut_ + (pos / basic_block_size_)); 60 | } 61 | 62 | void serialize(char*& dst) const { 63 | memcpy(dst, &num_bits_, sizeof(num_bits_)); 64 | dst += sizeof(num_bits_); 65 | memcpy(dst, &basic_block_size_, sizeof(basic_block_size_)); 66 | dst += sizeof(basic_block_size_); 67 | memcpy(dst, bits_, bitsSize()); 68 | dst += bitsSize(); 69 | memcpy(dst, rank_lut_, rankLutSize()); 70 | dst += rankLutSize(); 71 | align(dst); 72 | } 73 | 74 | static BitvectorRank* deSerialize(char*& src) { 75 | BitvectorRank* bv_rank = new BitvectorRank(); 76 | memcpy(&(bv_rank->num_bits_), src, sizeof(bv_rank->num_bits_)); 77 | src += sizeof(bv_rank->num_bits_); 78 | memcpy(&(bv_rank->basic_block_size_), src, sizeof(bv_rank->basic_block_size_)); 79 | src += sizeof(bv_rank->basic_block_size_); 80 | 81 | bv_rank->bits_ = new word_t[bv_rank->numWords()]; 82 | memcpy(bv_rank->bits_, src, bv_rank->bitsSize()); 83 | src += bv_rank->bitsSize(); 84 | bv_rank->rank_lut_ = new position_t[bv_rank->rankLutSize() / sizeof(position_t)]; 85 | memcpy(bv_rank->rank_lut_, src, bv_rank->rankLutSize()); 86 | src += bv_rank->rankLutSize(); 87 | 88 | //bv_rank->bits_ = const_cast(reinterpret_cast(src)); 89 | //src += bv_rank->bitsSize(); 90 | //bv_rank->rank_lut_ = const_cast(reinterpret_cast(src)); 91 | //src += bv_rank->rankLutSize(); 92 | 93 | align(src); 94 | return bv_rank; 95 | } 96 | 97 | void destroy() { 98 | delete[] bits_; 99 | delete[] rank_lut_; 100 | } 101 | 102 | private: 103 | void initRankLut() { 104 | position_t word_per_basic_block = basic_block_size_ / kWordSize; 105 | position_t num_blocks = num_bits_ / basic_block_size_ + 1; 106 | rank_lut_ = new position_t[num_blocks]; 107 | 108 | position_t cumu_rank = 0; 109 | for (position_t i = 0; i < num_blocks - 1; i++) { 110 | rank_lut_[i] = cumu_rank; 111 | cumu_rank += popcountLinear(bits_, i * word_per_basic_block, basic_block_size_); 112 | } 113 | rank_lut_[num_blocks - 1] = cumu_rank; 114 | } 115 | 116 | position_t basic_block_size_; 117 | position_t* rank_lut_; //rank look-up table 118 | }; 119 | 120 | } // namespace surf 121 | 122 | #endif // RANK_H_ 123 | -------------------------------------------------------------------------------- /bench/workload_gen/gen_txn.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import os 3 | 4 | class bcolors: 5 | HEADER = '\033[95m' 6 | OKBLUE = '\033[94m' 7 | OKGREEN = '\033[92m' 8 | WARNING = '\033[93m' 9 | FAIL = '\033[91m' 10 | ENDC = '\033[0m' 11 | 
BOLD = '\033[1m' 12 | UNDERLINE = '\033[4m' 13 | 14 | ##################################################################################### 15 | 16 | def reverseHostName ( email ) : 17 | name, sep, host = email.partition('@') 18 | hostparts = host[:-1].split('.') 19 | r_host = '' 20 | for part in hostparts : 21 | r_host = part + '.' + r_host 22 | return r_host + sep + name 23 | 24 | ##################################################################################### 25 | 26 | if (len(sys.argv) < 3) : 27 | print bcolors.FAIL + 'Usage:' 28 | print 'arg 1, key type: randint, timestamp, email' 29 | print 'arg 2, distribution: uniform, zipfian, latest' + bcolors.ENDC 30 | sys.exit() 31 | 32 | key_type = sys.argv[1] 33 | distribution = sys.argv[2] 34 | 35 | print bcolors.OKGREEN + 'key type = ' + key_type 36 | print 'distribution = ' + distribution + bcolors.ENDC 37 | 38 | ycsb_dir = 'YCSB/bin/' 39 | workload_dir = 'workload_spec/' 40 | output_dir='../workloads/' 41 | 42 | email_list = 'email_list.txt' 43 | email_list_size = 27549660 44 | email_keymap_file = output_dir + 'email_keymap.txt' 45 | 46 | timestamp_list = 'poisson_timestamps.csv' 47 | timestamp_keymap_file = output_dir + 'timestamp_keymap.txt' 48 | 49 | if key_type != 'randint' and key_type != 'timestamp' and key_type != 'email' : 50 | print bcolors.FAIL + 'Incorrect key_type: please pick from randint and email' + bcolors.ENDC 51 | sys.exit() 52 | 53 | if distribution != 'uniform' and distribution != 'zipfian' and distribution != 'latest' : 54 | print bcolors.FAIL + 'Incorrect distribution: please pick from uniform, zipfian and latest' + bcolors.ENDC 55 | sys.exit() 56 | 57 | out_ycsb_txn = output_dir + 'ycsb_txn_' + key_type + '_' + distribution 58 | out_txn_ycsbkey = output_dir + 'txn_' + 'ycsbkey' + '_' + distribution 59 | out_txn = output_dir + 'txn_' + key_type + '_' + distribution 60 | 61 | cmd_ycsb_txn = ycsb_dir + 'ycsb run basic -P ' + workload_dir + 'workloadc_' + key_type + '_' + distribution + ' -s > ' + out_ycsb_txn 62 | 63 | os.system(cmd_ycsb_txn) 64 | 65 | ##################################################################################### 66 | 67 | f_txn = open (out_ycsb_txn, 'r') 68 | f_txn_out = open (out_txn_ycsbkey, 'w') 69 | for line in f_txn : 70 | cols = line.split() 71 | if len(cols) > 2 and cols[0] == 'READ' : 72 | f_txn_out.write (cols[2][4:] + "\n") 73 | f_txn.close() 74 | f_txn_out.close() 75 | 76 | cmd = 'rm -f ' + out_ycsb_txn 77 | os.system(cmd) 78 | 79 | ##################################################################################### 80 | 81 | if key_type == 'randint' : 82 | f_txn = open (out_txn_ycsbkey, 'r') 83 | f_txn_out = open (out_txn, 'w') 84 | for line in f_txn : 85 | f_txn_out.write (line) 86 | 87 | elif key_type == 'timestamp' : 88 | timestamp_keymap = {} 89 | f_timestamp_keymap = open (timestamp_keymap_file, 'r') 90 | for line in f_timestamp_keymap : 91 | cols = line.split() 92 | timestamp_keymap[int(cols[0])] = cols[1] 93 | 94 | count = 0 95 | f_txn = open (out_txn_ycsbkey, 'r') 96 | f_txn_out = open (out_txn, 'w') 97 | for line in f_txn : 98 | cols = line.split() 99 | if len(cols) > 0 : 100 | f_txn_out.write (timestamp_keymap[int(cols[0])] + '\n') 101 | f_timestamp_keymap.close() 102 | 103 | elif key_type == 'email' : 104 | email_keymap = {} 105 | f_email_keymap = open (email_keymap_file, 'r') 106 | for line in f_email_keymap : 107 | cols = line.split() 108 | email_keymap[int(cols[0])] = cols[1] 109 | 110 | count = 0 111 | f_txn = open (out_txn_ycsbkey, 'r') 112 | f_txn_out = open 
(out_txn, 'w') 113 | for line in f_txn : 114 | cols = line.split() 115 | if len(cols) > 0 : 116 | f_txn_out.write (email_keymap[int(cols[0])] + '\n') 117 | f_email_keymap.close() 118 | 119 | f_txn.close() 120 | f_txn_out.close() 121 | 122 | cmd = 'rm -f ' + out_txn_ycsbkey 123 | os.system(cmd) 124 | -------------------------------------------------------------------------------- /bench/bench.hpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | 9 | #include 10 | #include 11 | #include 12 | #include 13 | #include 14 | #include 15 | #include 16 | #include 17 | #include 18 | #include 19 | 20 | namespace bench { 21 | 22 | static const uint64_t kNumIntRecords = 100000000; 23 | static const uint64_t kNumEmailRecords = 25000000; 24 | static const uint64_t kNumTxns = 10000000; 25 | static const uint64_t kIntRangeSize = 92233697311; 26 | static const uint64_t kEmailRangeSize = 128; 27 | 28 | //static const uint64_t kRandintRangeSize = 328 * 1024 * 1024 * (uint64_t)1024; 29 | //static const char* kWordloadDir = "workloads/"; 30 | 31 | // for pretty print 32 | static const char* kGreen ="\033[0;32m"; 33 | static const char* kRed ="\033[0;31m"; 34 | static const char* kNoColor ="\033[0;0m"; 35 | 36 | // for time measurement 37 | double getNow() { 38 | struct timeval tv; 39 | gettimeofday(&tv, 0); 40 | return tv.tv_sec + tv.tv_usec / 1000000.0; 41 | } 42 | 43 | std::string uint64ToString(uint64_t key) { 44 | uint64_t endian_swapped_key = __builtin_bswap64(key); 45 | return std::string(reinterpret_cast(&endian_swapped_key), 8); 46 | } 47 | 48 | uint64_t stringToUint64(std::string str_key) { 49 | uint64_t int_key = 0; 50 | memcpy(reinterpret_cast(&int_key), str_key.data(), 8); 51 | return __builtin_bswap64(int_key); 52 | } 53 | 54 | void loadKeysFromFile(const std::string& file_name, const bool is_key_int, 55 | std::vector &keys) { 56 | std::ifstream infile(file_name); 57 | std::string key; 58 | uint64_t count = 0; 59 | if (is_key_int) { 60 | while (count < kNumIntRecords && infile.good()) { 61 | uint64_t int_key; 62 | infile >> int_key; 63 | key = uint64ToString(int_key); 64 | keys.push_back(key); 65 | count++; 66 | } 67 | } else { 68 | while (count < kNumEmailRecords && infile.good()) { 69 | infile >> key; 70 | keys.push_back(key); 71 | count++; 72 | } 73 | } 74 | } 75 | 76 | void loadKeysFromFile(const std::string& file_name, uint64_t num_records, 77 | std::vector &keys) { 78 | std::ifstream infile(file_name); 79 | uint64_t count = 0; 80 | while (count < num_records && infile.good()) { 81 | uint64_t key; 82 | infile >> key; 83 | keys.push_back(key); 84 | count++; 85 | } 86 | } 87 | 88 | // 0 < percent <= 100 89 | void selectKeysToInsert(const unsigned percent, 90 | std::vector &insert_keys, 91 | std::vector &keys) { 92 | random_shuffle(keys.begin(), keys.end()); 93 | uint64_t num_insert_keys = keys.size() * percent / 100; 94 | for (uint64_t i = 0; i < num_insert_keys; i++) 95 | insert_keys.push_back(keys[i]); 96 | 97 | keys.clear(); 98 | sort(insert_keys.begin(), insert_keys.end()); 99 | } 100 | 101 | // 0 < percent <= 100 102 | void selectIntKeysToInsert(const unsigned percent, 103 | std::vector &insert_keys, 104 | std::vector &keys) { 105 | random_shuffle(keys.begin(), keys.end()); 106 | uint64_t num_insert_keys = keys.size() * percent / 100; 107 | for (uint64_t i = 0; i < num_insert_keys; i++) 108 | insert_keys.push_back(keys[i]); 109 | 110 | keys.clear(); 111 | 
sort(insert_keys.begin(), insert_keys.end()); 112 | } 113 | 114 | // pos > 0, position counting from the last byte 115 | void modifyKeyByte(std::vector &keys, int pos) { 116 | for (int i = 0; i < (int)keys.size(); i++) { 117 | int keylen = keys[i].length(); 118 | if (keylen > pos) 119 | keys[i][keylen - 1 - pos] = '+'; 120 | else 121 | keys[i][0] = '+'; 122 | } 123 | } 124 | 125 | std::string getUpperBoundKey(const std::string& key_type, const std::string& key) { 126 | std::string ret_str = key; 127 | if (key_type.compare(std::string("email")) == 0) { 128 | ret_str[ret_str.size() - 1] += (char)kEmailRangeSize; 129 | } else { 130 | uint64_t int_key = stringToUint64(key); 131 | int_key += kIntRangeSize; 132 | ret_str = uint64ToString(int_key); 133 | } 134 | return ret_str; 135 | } 136 | 137 | } // namespace bench 138 | -------------------------------------------------------------------------------- /bench/workload_gen/gen_load.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import os 3 | 4 | class bcolors: 5 | HEADER = '\033[95m' 6 | OKBLUE = '\033[94m' 7 | OKGREEN = '\033[92m' 8 | WARNING = '\033[93m' 9 | FAIL = '\033[91m' 10 | ENDC = '\033[0m' 11 | BOLD = '\033[1m' 12 | UNDERLINE = '\033[4m' 13 | 14 | ##################################################################################### 15 | 16 | def reverseHostName ( email ) : 17 | name, sep, host = email.partition('@') 18 | hostparts = host[:-1].split('.') 19 | r_host = '' 20 | for part in hostparts : 21 | r_host = part + '.' + r_host 22 | return r_host + sep + name 23 | 24 | ##################################################################################### 25 | 26 | if (len(sys.argv) < 3) : 27 | print bcolors.FAIL + 'Usage:' 28 | print 'arg 1, key type: randint, timestamp, email' 29 | print 'arg 2, distribution: uniform, zipfian, latest' + bcolors.ENDC 30 | sys.exit() 31 | 32 | key_type = sys.argv[1] 33 | distribution = sys.argv[2] 34 | 35 | print bcolors.OKGREEN + 'key type = ' + key_type 36 | print 'distribution = ' + distribution + bcolors.ENDC 37 | 38 | ycsb_dir = 'YCSB/bin/' 39 | workload_dir = 'workload_spec/' 40 | output_dir='../workloads/' 41 | 42 | email_list = 'email_list.txt' 43 | email_list_size = 27549660 44 | email_keymap_file = output_dir + 'email_keymap.txt' 45 | 46 | timestamp_list = 'poisson_timestamps.csv' 47 | timestamp_keymap_file = output_dir + 'timestamp_keymap.txt' 48 | 49 | if key_type != 'randint' and key_type != 'timestamp' and key_type != 'email' : 50 | print bcolors.FAIL + 'Incorrect key_type: please pick from randint and email' + bcolors.ENDC 51 | sys.exit() 52 | 53 | if distribution != 'uniform' and distribution != 'zipfian' and distribution != 'latest' : 54 | print bcolors.FAIL + 'Incorrect distribution: please pick from uniform, zipfian and latest' + bcolors.ENDC 55 | sys.exit() 56 | 57 | out_ycsb_load = output_dir + 'ycsb_load_' + key_type 58 | out_load_ycsbkey = output_dir + 'load_' + 'ycsbkey' 59 | out_load = output_dir + 'load_' + key_type 60 | 61 | cmd_ycsb_load = ycsb_dir + 'ycsb load basic -P ' + workload_dir + 'workloadc_' + key_type + '_' + distribution + ' -s > ' + out_ycsb_load 62 | 63 | os.system(cmd_ycsb_load) 64 | 65 | ##################################################################################### 66 | 67 | f_load = open (out_ycsb_load, 'r') 68 | f_load_out = open (out_load_ycsbkey, 'w') 69 | for line in f_load : 70 | cols = line.split() 71 | if len(cols) > 2 and cols[0] == "INSERT": 72 | f_load_out.write (cols[2][4:] + '\n') 
73 | f_load.close() 74 | f_load_out.close() 75 | 76 | cmd = 'rm -f ' + out_ycsb_load 77 | os.system(cmd) 78 | 79 | ##################################################################################### 80 | 81 | if key_type == 'randint' : 82 | f_load = open (out_load_ycsbkey, 'r') 83 | f_load_out = open (out_load, 'w') 84 | for line in f_load : 85 | f_load_out.write (line) 86 | 87 | elif key_type == 'timestamp' : 88 | timestamp_keymap = {} 89 | f_timestamp_keymap = open (timestamp_keymap_file, 'w') 90 | 91 | f_timestamp = open (timestamp_list, 'r') 92 | timestamps = f_timestamp.readlines() 93 | 94 | f_load_out = open (out_load, 'w') 95 | f_load = open (out_load_ycsbkey, 'r') 96 | count = 0 97 | for line in f_load : 98 | cols = line.split() 99 | ts = timestamps[count] 100 | f_load_out.write (ts) 101 | f_timestamp_keymap.write (cols[0] + ' ' + ts) 102 | count += 1 103 | f_timestamp_keymap.close() 104 | 105 | elif key_type == 'email' : 106 | email_keymap = {} 107 | f_email_keymap = open (email_keymap_file, 'w') 108 | 109 | f_email = open (email_list, 'r') 110 | emails = f_email.readlines() 111 | 112 | f_load = open (out_load_ycsbkey, 'r') 113 | f_load_out = open (out_load, 'w') 114 | 115 | sample_size = len(f_load.readlines()) 116 | gap = email_list_size / sample_size 117 | 118 | f_load.close() 119 | f_load = open (out_load_ycsbkey, 'r') 120 | count = 0 121 | for line in f_load : 122 | cols = line.split() 123 | email = reverseHostName(emails[count * gap]) 124 | f_load_out.write (email + '\n') 125 | f_email_keymap.write (cols[0] + ' ' + email + '\n') 126 | count += 1 127 | f_email_keymap.close() 128 | 129 | f_load.close() 130 | f_load_out.close() 131 | 132 | cmd = 'rm -f ' + out_load_ycsbkey 133 | os.system(cmd) 134 | -------------------------------------------------------------------------------- /test/unitTest/test_rank.cpp: -------------------------------------------------------------------------------- 1 | #include "gtest/gtest.h" 2 | 3 | #include 4 | 5 | #include 6 | #include 7 | #include 8 | 9 | #include "config.hpp" 10 | #include "rank.hpp" 11 | #include "surf_builder.hpp" 12 | 13 | namespace surf { 14 | 15 | namespace ranktest { 16 | 17 | static const std::string kFilePath = "../../../test/words.txt"; 18 | static const int kTestSize = 234369; 19 | static std::vector words; 20 | 21 | class RankUnitTest : public ::testing::Test { 22 | public: 23 | virtual void SetUp () { 24 | bool include_dense = false; 25 | uint32_t sparse_dense_ratio = 0; 26 | level_t suffix_len = 8; 27 | builder_ = new SuRFBuilder(include_dense, sparse_dense_ratio, kReal, 0, suffix_len); 28 | data_ = nullptr; 29 | data2_ = nullptr; 30 | num_items_ = 0; 31 | } 32 | virtual void TearDown () { 33 | delete builder_; 34 | if (data_) 35 | delete[] data_; 36 | if (data2_) 37 | delete[] data2_; 38 | } 39 | 40 | void setupWordsTest(); 41 | void testSerialize(); 42 | void testRank(); 43 | 44 | static const position_t kRankBasicBlockSize = 512; 45 | 46 | SuRFBuilder* builder_; 47 | BitvectorRank* bv_; 48 | BitvectorRank* bv2_; 49 | std::vector num_items_per_level_; 50 | position_t num_items_; 51 | char* data_; 52 | char* data2_; 53 | }; 54 | 55 | void RankUnitTest::setupWordsTest() { 56 | builder_->build(words); 57 | for (level_t level = 0; level < builder_->getTreeHeight(); level++) 58 | num_items_per_level_.push_back(builder_->getLabels()[level].size()); 59 | for (level_t level = 0; level < num_items_per_level_.size(); level++) 60 | num_items_ += num_items_per_level_[level]; 61 | bv_ = new BitvectorRank(kRankBasicBlockSize, 
builder_->getChildIndicatorBits(), num_items_per_level_); 62 | bv2_ = new BitvectorRank(kRankBasicBlockSize, builder_->getLoudsBits(), num_items_per_level_); 63 | } 64 | 65 | void RankUnitTest::testSerialize() { 66 | uint64_t size = bv_->serializedSize(); 67 | ASSERT_TRUE((bv_->size() - size) >= 0); 68 | data_ = new char[size]; 69 | BitvectorRank* ori_bv = bv_; 70 | char* data = data_; 71 | ori_bv->serialize(data); 72 | data = data_; 73 | bv_ = BitvectorRank::deSerialize(data); 74 | 75 | ASSERT_EQ(ori_bv->bitsSize(), bv_->bitsSize()); 76 | ASSERT_EQ(ori_bv->rankLutSize(), bv_->rankLutSize()); 77 | 78 | ori_bv->destroy(); 79 | delete ori_bv; 80 | 81 | size = bv2_->serializedSize(); 82 | data2_ = new char[size]; 83 | BitvectorRank* ori_bv2 = bv2_; 84 | char* data2 = data2_; 85 | ori_bv2->serialize(data2); 86 | data2 = data2_; 87 | bv2_ = BitvectorRank::deSerialize(data2); 88 | 89 | ASSERT_EQ(ori_bv2->bitsSize(), bv2_->bitsSize()); 90 | ASSERT_EQ(ori_bv2->rankLutSize(), bv2_->rankLutSize()); 91 | 92 | ori_bv2->destroy(); 93 | delete ori_bv2; 94 | } 95 | 96 | void RankUnitTest::testRank() { 97 | position_t expected_rank = 0; 98 | position_t expected_rank2 = 0; 99 | for (position_t pos = 0; pos < num_items_; pos++) { 100 | if (bv_->readBit(pos)) expected_rank++; 101 | position_t rank = bv_->rank(pos); 102 | ASSERT_EQ(expected_rank, rank); 103 | 104 | if (bv2_->readBit(pos)) expected_rank2++; 105 | position_t rank2 = bv2_->rank(pos); 106 | ASSERT_EQ(expected_rank2, rank2); 107 | } 108 | } 109 | 110 | TEST_F (RankUnitTest, readBitTest) { 111 | setupWordsTest(); 112 | position_t bv_pos = 0; 113 | for (level_t level = 0; level < builder_->getTreeHeight(); level++) { 114 | for (position_t pos = 0; pos < num_items_per_level_[level]; pos++) { 115 | bool expected_bit = SuRFBuilder::readBit(builder_->getChildIndicatorBits()[level], pos); 116 | bool bv_bit = bv_->readBit(bv_pos); 117 | ASSERT_EQ(expected_bit, bv_bit); 118 | 119 | expected_bit = SuRFBuilder::readBit(builder_->getLoudsBits()[level], pos); 120 | bv_bit = bv2_->readBit(bv_pos); 121 | ASSERT_EQ(expected_bit, bv_bit); 122 | 123 | bv_pos++; 124 | } 125 | } 126 | bv_->destroy(); 127 | delete bv_; 128 | bv2_->destroy(); 129 | delete bv2_; 130 | } 131 | 132 | TEST_F (RankUnitTest, rankTest) { 133 | setupWordsTest(); 134 | testRank(); 135 | bv_->destroy(); 136 | delete bv_; 137 | bv2_->destroy(); 138 | delete bv2_; 139 | } 140 | 141 | TEST_F (RankUnitTest, serializeTest) { 142 | setupWordsTest(); 143 | testSerialize(); 144 | testRank(); 145 | } 146 | 147 | void loadWordList() { 148 | std::ifstream infile(kFilePath); 149 | std::string key; 150 | int count = 0; 151 | while (infile.good() && count < kTestSize) { 152 | infile >> key; 153 | words.push_back(key); 154 | count++; 155 | } 156 | } 157 | 158 | } // namespace ranktest 159 | 160 | } // namespace surf 161 | 162 | int main (int argc, char** argv) { 163 | ::testing::InitGoogleTest(&argc, argv); 164 | surf::ranktest::loadWordList(); 165 | return RUN_ALL_TESTS(); 166 | } 167 | -------------------------------------------------------------------------------- /include/bitvector.hpp: -------------------------------------------------------------------------------- 1 | #ifndef BITVECTOR_H_ 2 | #define BITVECTOR_H_ 3 | 4 | #include 5 | 6 | #include 7 | 8 | #include "config.hpp" 9 | 10 | namespace surf { 11 | 12 | class Bitvector { 13 | public: 14 | Bitvector() : num_bits_(0), bits_(nullptr) {}; 15 | 16 | Bitvector(const std::vector >& bitvector_per_level, 17 | const std::vector& num_bits_per_level, 18 | 
const level_t start_level = 0, 19 | level_t end_level = 0/* non-inclusive */) { 20 | if (end_level == 0) 21 | end_level = bitvector_per_level.size(); 22 | num_bits_ = totalNumBits(num_bits_per_level, start_level, end_level); 23 | bits_ = new word_t[numWords()]; 24 | memset(bits_, 0, bitsSize()); 25 | concatenateBitvectors(bitvector_per_level, num_bits_per_level, start_level, end_level); 26 | } 27 | 28 | ~Bitvector() {} 29 | 30 | position_t numBits() const { 31 | return num_bits_; 32 | } 33 | 34 | position_t numWords() const { 35 | if (num_bits_ % kWordSize == 0) 36 | return (num_bits_ / kWordSize); 37 | else 38 | return (num_bits_ / kWordSize + 1); 39 | } 40 | 41 | // in bytes 42 | position_t bitsSize() const { 43 | return (numWords() * (kWordSize / 8)); 44 | } 45 | 46 | // in bytes 47 | position_t size() const { 48 | return (sizeof(Bitvector) + bitsSize()); 49 | } 50 | 51 | bool readBit(const position_t pos) const; 52 | 53 | position_t distanceToNextSetBit(const position_t pos) const; 54 | position_t distanceToPrevSetBit(const position_t pos) const; 55 | 56 | private: 57 | position_t totalNumBits(const std::vector& num_bits_per_level, 58 | const level_t start_level, 59 | const level_t end_level/* non-inclusive */); 60 | 61 | void concatenateBitvectors(const std::vector >& bitvector_per_level, 62 | const std::vector& num_bits_per_level, 63 | const level_t start_level, 64 | const level_t end_level/* non-inclusive */); 65 | protected: 66 | position_t num_bits_; 67 | word_t* bits_; 68 | }; 69 | 70 | bool Bitvector::readBit (const position_t pos) const { 71 | assert(pos <= num_bits_); 72 | position_t word_id = pos / kWordSize; 73 | position_t offset = pos & (kWordSize - 1); 74 | return bits_[word_id] & (kMsbMask >> offset); 75 | } 76 | 77 | position_t Bitvector::distanceToNextSetBit (const position_t pos) const { 78 | assert(pos < num_bits_); 79 | position_t distance = 1; 80 | 81 | position_t word_id = (pos + 1) / kWordSize; 82 | position_t offset = (pos + 1) % kWordSize; 83 | 84 | //first word left-over bits 85 | word_t test_bits = bits_[word_id] << offset; 86 | if (test_bits > 0) { 87 | return (distance + __builtin_clzll(test_bits)); 88 | } else { 89 | if (word_id == numWords() - 1) 90 | return (num_bits_ - pos); 91 | distance += (kWordSize - offset); 92 | } 93 | 94 | while (word_id < numWords() - 1) { 95 | word_id++; 96 | test_bits = bits_[word_id]; 97 | if (test_bits > 0) 98 | return (distance + __builtin_clzll(test_bits)); 99 | distance += kWordSize; 100 | } 101 | return distance; 102 | } 103 | 104 | position_t Bitvector::distanceToPrevSetBit (const position_t pos) const { 105 | assert(pos <= num_bits_); 106 | if (pos == 0) return 0; 107 | position_t distance = 1; 108 | 109 | position_t word_id = (pos - 1) / kWordSize; 110 | position_t offset = (pos - 1) % kWordSize; 111 | 112 | //first word left-over bits 113 | word_t test_bits = bits_[word_id] >> (kWordSize - 1 - offset); 114 | if (test_bits > 0) { 115 | return (distance + __builtin_ctzll(test_bits)); 116 | } else { 117 | //if (word_id == 0) 118 | //return (offset + 1); 119 | distance += (offset + 1); 120 | } 121 | 122 | while (word_id > 0) { 123 | word_id--; 124 | test_bits = bits_[word_id]; 125 | if (test_bits > 0) 126 | return (distance + __builtin_ctzll(test_bits)); 127 | distance += kWordSize; 128 | } 129 | return distance; 130 | } 131 | 132 | position_t Bitvector::totalNumBits(const std::vector& num_bits_per_level, 133 | const level_t start_level, 134 | const level_t end_level/* non-inclusive */) { 135 | position_t num_bits = 0; 
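    // Added comment: sum the per-level bit counts over [start_level, end_level);
    // the constructor uses this total to size the single concatenated bit array.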
136 | for (level_t level = start_level; level < end_level; level++) 137 | num_bits += num_bits_per_level[level]; 138 | return num_bits; 139 | } 140 | 141 | void Bitvector::concatenateBitvectors(const std::vector >& bitvector_per_level, 142 | const std::vector& num_bits_per_level, 143 | const level_t start_level, 144 | const level_t end_level/* non-inclusive */) { 145 | position_t bit_shift = 0; 146 | position_t word_id = 0; 147 | for (level_t level = start_level; level < end_level; level++) { 148 | if (num_bits_per_level[level] == 0) continue; 149 | position_t num_complete_words = num_bits_per_level[level] / kWordSize; 150 | for (position_t word = 0; word < num_complete_words; word++) { 151 | bits_[word_id] |= (bitvector_per_level[level][word] >> bit_shift); 152 | word_id++; 153 | if (bit_shift > 0) 154 | bits_[word_id] |= (bitvector_per_level[level][word] << (kWordSize - bit_shift)); 155 | } 156 | 157 | word_t bits_remain = num_bits_per_level[level] - num_complete_words * kWordSize; 158 | if (bits_remain > 0) { 159 | word_t last_word = bitvector_per_level[level][num_complete_words]; 160 | bits_[word_id] |= (last_word >> bit_shift); 161 | if (bit_shift + bits_remain < kWordSize) { 162 | bit_shift += bits_remain; 163 | } else { 164 | word_id++; 165 | bits_[word_id] |= (last_word << (kWordSize - bit_shift)); 166 | bit_shift = bit_shift + bits_remain - kWordSize; 167 | } 168 | } 169 | } 170 | } 171 | 172 | } // namespace surf 173 | 174 | #endif // BITVECTOR_H_ 175 | -------------------------------------------------------------------------------- /include/select.hpp: -------------------------------------------------------------------------------- 1 | #ifndef SELECT_H_ 2 | #define SELECT_H_ 3 | 4 | #include "bitvector.hpp" 5 | 6 | #include 7 | 8 | #include 9 | 10 | #include "config.hpp" 11 | #include "popcount.h" 12 | 13 | namespace surf { 14 | 15 | class BitvectorSelect : public Bitvector { 16 | public: 17 | BitvectorSelect() : sample_interval_(0), num_ones_(0), select_lut_(nullptr) {}; 18 | 19 | BitvectorSelect(const position_t sample_interval, 20 | const std::vector >& bitvector_per_level, 21 | const std::vector& num_bits_per_level, 22 | const level_t start_level = 0, 23 | const level_t end_level = 0/* non-inclusive */) 24 | : Bitvector(bitvector_per_level, num_bits_per_level, start_level, end_level) { 25 | sample_interval_ = sample_interval; 26 | initSelectLut(); 27 | } 28 | 29 | ~BitvectorSelect() {} 30 | 31 | // Returns the postion of the rank-th 1 bit. 32 | // posistion is zero-based; rank is one-based. 33 | // E.g., for bitvector: 100101000, select(3) = 5 34 | position_t select(position_t rank) const { 35 | assert(rank > 0); 36 | assert(rank <= num_ones_ + 1); 37 | position_t lut_idx = rank / sample_interval_; 38 | position_t rank_left = rank % sample_interval_; 39 | // The first slot in select_lut_ stores the position of the first 1 bit. 
40 | // Slot i > 0 stores the position of (i * sample_interval_)-th 1 bit 41 | if (lut_idx == 0) 42 | rank_left--; 43 | 44 | position_t pos = select_lut_[lut_idx]; 45 | 46 | if (rank_left == 0) 47 | return pos; 48 | 49 | position_t word_id = pos / kWordSize; 50 | position_t offset = pos % kWordSize; 51 | if (offset == kWordSize - 1) { 52 | word_id++; 53 | offset = 0; 54 | } else { 55 | offset++; 56 | } 57 | word_t word = bits_[word_id] << offset >> offset; //zero-out most significant bits 58 | position_t ones_count_in_word = popcount(word); 59 | while (ones_count_in_word < rank_left) { 60 | word_id++; 61 | word = bits_[word_id]; 62 | rank_left -= ones_count_in_word; 63 | ones_count_in_word = popcount(word); 64 | } 65 | return (word_id * kWordSize + select64_popcount_search(word, rank_left)); 66 | } 67 | 68 | position_t selectLutSize() const { 69 | return ((num_ones_ / sample_interval_ + 1) * sizeof(position_t)); 70 | } 71 | 72 | position_t serializedSize() const { 73 | position_t size = sizeof(num_bits_) + sizeof(sample_interval_) + sizeof(num_ones_) 74 | + bitsSize() + selectLutSize(); 75 | sizeAlign(size); 76 | return size; 77 | } 78 | 79 | position_t size() const { 80 | return (sizeof(BitvectorSelect) + bitsSize() + selectLutSize()); 81 | } 82 | 83 | position_t numOnes() const { 84 | return num_ones_; 85 | } 86 | 87 | void serialize(char*& dst) const { 88 | memcpy(dst, &num_bits_, sizeof(num_bits_)); 89 | dst += sizeof(num_bits_); 90 | memcpy(dst, &sample_interval_, sizeof(sample_interval_)); 91 | dst += sizeof(sample_interval_); 92 | memcpy(dst, &num_ones_, sizeof(num_ones_)); 93 | dst += sizeof(num_ones_); 94 | memcpy(dst, bits_, bitsSize()); 95 | dst += bitsSize(); 96 | memcpy(dst, select_lut_, selectLutSize()); 97 | dst += selectLutSize(); 98 | align(dst); 99 | } 100 | 101 | static BitvectorSelect* deSerialize(char*& src) { 102 | BitvectorSelect* bv_select = new BitvectorSelect(); 103 | memcpy(&(bv_select->num_bits_), src, sizeof(bv_select->num_bits_)); 104 | src += sizeof(bv_select->num_bits_); 105 | memcpy(&(bv_select->sample_interval_), src, sizeof(bv_select->sample_interval_)); 106 | src += sizeof(bv_select->sample_interval_); 107 | memcpy(&(bv_select->num_ones_), src, sizeof(bv_select->num_ones_)); 108 | src += sizeof(bv_select->num_ones_); 109 | 110 | bv_select->bits_ = new word_t[bv_select->numWords()]; 111 | memcpy(bv_select->bits_, src, bv_select->bitsSize()); 112 | src += bv_select->bitsSize(); 113 | bv_select->select_lut_ = new position_t[bv_select->selectLutSize() / sizeof(position_t)]; 114 | memcpy(bv_select->select_lut_, src, bv_select->selectLutSize()); 115 | src += bv_select->selectLutSize(); 116 | 117 | //bv_select->bits_ = const_cast(reinterpret_cast(src)); 118 | //src += bv_select->bitsSize(); 119 | //bv_select->select_lut_ = const_cast(reinterpret_cast(src)); 120 | //src += bv_select->selectLutSize(); 121 | align(src); 122 | return bv_select; 123 | } 124 | 125 | void destroy() { 126 | delete[] bits_; 127 | delete[] select_lut_; 128 | } 129 | 130 | private: 131 | // This function currently assumes that the first bit in the 132 | // bitvector is one. 
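    // Added explanatory comment: the LUT samples every sample_interval_-th set
    // bit so that select() only has to popcount forward from the nearest sample.
    // Worked example (matching the select() comment above): for bitvector
    // 100101000 with sample_interval_ = 2, select_lut_ = {0, 3}; select(3) then
    // starts at position 3 (the 2nd set bit) and scans one more set bit forward
    // to reach position 5.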
133 | void initSelectLut() { 134 | position_t num_words = num_bits_ / kWordSize; 135 | if (num_bits_ % kWordSize != 0) 136 | num_words++; 137 | 138 | std::vector select_lut_vector; 139 | select_lut_vector.push_back(0); //ASSERT: first bit is 1 140 | position_t sampling_ones = sample_interval_; 141 | position_t cumu_ones_upto_word = 0; 142 | for (position_t i = 0; i < num_words; i++) { 143 | position_t num_ones_in_word = popcount(bits_[i]); 144 | while (sampling_ones <= (cumu_ones_upto_word + num_ones_in_word)) { 145 | int diff = sampling_ones - cumu_ones_upto_word; 146 | position_t result_pos = i * kWordSize + select64_popcount_search(bits_[i], diff); 147 | select_lut_vector.push_back(result_pos); 148 | sampling_ones += sample_interval_; 149 | } 150 | cumu_ones_upto_word += popcount(bits_[i]); 151 | } 152 | 153 | num_ones_ = cumu_ones_upto_word; 154 | position_t num_samples = select_lut_vector.size(); 155 | select_lut_ = new position_t[num_samples]; 156 | for (position_t i = 0; i < num_samples; i++) 157 | select_lut_[i] = select_lut_vector[i]; 158 | } 159 | 160 | private: 161 | position_t sample_interval_; 162 | position_t num_ones_; 163 | position_t* select_lut_; //select look-up table 164 | }; 165 | 166 | } // namespace surf 167 | 168 | #endif // SELECT_H_ 169 | -------------------------------------------------------------------------------- /bench/workload_arf.cpp: -------------------------------------------------------------------------------- 1 | #include "bench.hpp" 2 | #include "ARF.h" 3 | #include "Database.h" 4 | #include "Query.h" 5 | 6 | static const int kARFSize = 70000000; 7 | static const int kInputSize = 10000000; 8 | static const int kTxnSize = 10000000; 9 | static const int kTrainingSize = 2000000; 10 | static const uint64_t kDomain = (ULLONG_MAX / 2 - 1); 11 | static const uint64_t kRangeSize = 922336973116; 12 | 13 | int main(int argc, char *argv[]) { 14 | if (argc != 4) { 15 | std::cout << "Usage:\n"; 16 | std::cout << "1. percentage of keys inserted: 0 < num <= 100\n"; 17 | std::cout << "2. query type: point, range\n"; 18 | std::cout << "3. 
distribution: uniform, zipfian, latest\n"; 19 | return -1; 20 | } 21 | 22 | unsigned percent = atoi(argv[1]); 23 | std::string query_type = argv[2]; 24 | std::string distribution = argv[3]; 25 | 26 | // check args ==================================================== 27 | if (percent > 100) { 28 | std::cout << bench::kRed << "WRONG percentage\n" << bench::kNoColor; 29 | return -1; 30 | } 31 | 32 | if (query_type.compare(std::string("point")) != 0 33 | && query_type.compare(std::string("range")) != 0) { 34 | std::cout << bench::kRed << "WRONG query type\n" << bench::kNoColor; 35 | return -1; 36 | } 37 | 38 | if (distribution.compare(std::string("uniform")) != 0 39 | && distribution.compare(std::string("zipfian")) != 0 40 | && distribution.compare(std::string("latest")) != 0) { 41 | std::cout << bench::kRed << "WRONG distribution\n" << bench::kNoColor; 42 | return -1; 43 | } 44 | 45 | // load keys from files ======================================= 46 | std::string load_file = "workloads/load_randint"; 47 | std::vector load_keys; 48 | bench::loadKeysFromFile(load_file, kInputSize, load_keys); 49 | std::cout << "load_keys size = " << load_keys.size() << "\n"; 50 | 51 | sort(load_keys.begin(), load_keys.end()); 52 | uint64_t max_key = load_keys[load_keys.size() - 1]; 53 | std::cout << std::hex << "max key = " << max_key << std::dec << "\n"; 54 | uint64_t max_gap = load_keys[load_keys.size() - 1] - load_keys[0]; 55 | std::cout << "max gap = " << max_gap << "\n"; 56 | uint64_t avg_gap = max_gap / kInputSize; 57 | std::cout << "avg gap = " << avg_gap << "\n"; 58 | 59 | std::string txn_file = "workloads/txn_randint_"; 60 | txn_file += distribution; 61 | std::vector txn_keys; 62 | bench::loadKeysFromFile(txn_file, kTxnSize, txn_keys); 63 | std::cout << "txn_keys size = " << txn_keys.size() << "\n"; 64 | 65 | std::vector insert_keys; 66 | bench::selectIntKeysToInsert(percent, insert_keys, load_keys); 67 | std::cout << "insert_keys size = " << insert_keys.size() << "\n"; 68 | 69 | // compute upperbound keys for range queries ================= 70 | std::vector upper_bound_keys; 71 | if (query_type.compare(std::string("range")) == 0) { 72 | for (int i = 0; i < (int)txn_keys.size(); i++) { 73 | txn_keys[i]++; 74 | uint64_t upper_bound = txn_keys[i] + kRangeSize; 75 | upper_bound_keys.push_back(upper_bound); 76 | } 77 | } else { 78 | for (int i = 0; i < (int)txn_keys.size(); i++) { 79 | upper_bound_keys.push_back(txn_keys[i]); 80 | } 81 | } 82 | 83 | // create filter ============================================== 84 | arf::Database* db = new arf::Database(insert_keys); 85 | arf::ARF* filter = new arf::ARF(0, kDomain, db); 86 | 87 | // build perfect ARF ========================================== 88 | double start_time = bench::getNow(); 89 | filter->perfect(db); 90 | double end_time = bench::getNow(); 91 | double time_diff = end_time - start_time; 92 | std::cout << "build perfect time = " << time_diff << " s\n"; 93 | 94 | // training =================================================== 95 | start_time = bench::getNow(); 96 | for (int i = 0; i < kTrainingSize; i++) { 97 | if (i % 100000 == 0) 98 | std::cout << "i = " << i << std::endl; 99 | bool qR = db->rangeQuery(txn_keys[i], upper_bound_keys[i]); 100 | filter->handle_query(txn_keys[i], upper_bound_keys[i], qR, true); 101 | } 102 | filter->reset_training_phase(); 103 | filter->truncate(kARFSize); 104 | filter->end_training_phase(); 105 | filter->print_size(); 106 | end_time = bench::getNow(); 107 | time_diff = end_time - start_time; 108 | std::cout << 
"training time = " << time_diff << " s\n"; 109 | std::cout << "training throughput = " << ((kTrainingSize + 0.0) / time_diff) << " txns/s\n"; 110 | 111 | // execute transactions ======================================= 112 | int64_t positives = 0; 113 | start_time = bench::getNow(); 114 | for (int i = kTrainingSize; i < kTxnSize; i++) { 115 | positives += (int)filter->handle_query(txn_keys[i], upper_bound_keys[i], true, false); 116 | } 117 | end_time = bench::getNow(); 118 | time_diff = end_time - start_time; 119 | std::cout << "time = " << time_diff << " s\n"; 120 | std::cout << "throughput = " << bench::kGreen << ((kTrainingSize + 0.0) / time_diff) << bench::kNoColor << " txns/s\n"; 121 | end_time = bench::getNow(); 122 | 123 | // compute true positives ====================================== 124 | int64_t tps = 0; 125 | int64_t tns = 0; 126 | for (int i = kTrainingSize; i < kTxnSize; i++) { 127 | bool dR = db->rangeQuery(txn_keys[i], upper_bound_keys[i]); 128 | if (dR) 129 | tps++; 130 | else 131 | tns++; 132 | } 133 | int64_t fps = positives - tps; 134 | 135 | std::cout << "positives = " << positives << "\n"; 136 | std::cout << "true positives = " << tps << "\n"; 137 | std::cout << "true negatives = " << tns << "\n"; 138 | std::cout << "false positives = " << fps << "\n"; 139 | 140 | double fp_rate = 0; 141 | if (fps >= 0) 142 | fp_rate = fps / (tns + fps + 0.0); 143 | else 144 | std::cout << "ERROR: fps < 0\n"; 145 | std::cout << "False Positive Rate = " << fp_rate << "\n"; 146 | 147 | return 0; 148 | } 149 | -------------------------------------------------------------------------------- /bench/bloom.hpp: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2012 The LevelDB Authors. All rights reserved. 2 | // Use of this source code is governed by a BSD-style license that can be 3 | // found in the LICENSE file. See the AUTHORS file for names of contributors. 
4 | 5 | // Modified by Huanchen, 2018 6 | 7 | #ifndef LEVELDB_BLOOM_H_ 8 | #define LEVELDB_BLOOM_H_ 9 | 10 | #include 11 | #include 12 | 13 | #include 14 | #include 15 | 16 | #include "MurmurHash3.h" 17 | 18 | using namespace std; 19 | 20 | inline uint32_t DecodeFixed32(const char* ptr) { 21 | uint32_t result; 22 | memcpy(&result, ptr, sizeof(result)); // gcc optimizes this to a plain load 23 | return result; 24 | } 25 | 26 | /* 27 | inline uint32_t Hash(const char* data, size_t n, uint32_t seed) { 28 | // Similar to murmur hash 29 | const uint32_t m = 0xc6a4a793; 30 | const uint32_t r = 24; 31 | const char* limit = data + n; 32 | uint32_t h = seed ^ (n * m); 33 | 34 | // Pick up four bytes at a time 35 | while (data + 4 <= limit) { 36 | uint32_t w = DecodeFixed32(data); 37 | data += 4; 38 | h += w; 39 | h *= m; 40 | h ^= (h >> 16); 41 | } 42 | 43 | // Pick up remaining bytes 44 | switch (limit - data) { 45 | case 3: 46 | h += static_cast(data[2]) << 16; 47 | case 2: 48 | h += static_cast(data[1]) << 8; 49 | case 1: 50 | h += static_cast(data[0]); 51 | h *= m; 52 | h ^= (h >> r); 53 | break; 54 | } 55 | return h; 56 | } 57 | */ 58 | static void BloomHash(const string &key, uint32_t* out) { 59 | MurmurHash3_x86_128(key.c_str(), key.size(), 0xbc9f1d34, out); 60 | } 61 | 62 | static void BloomHash(const uint64_t key, uint32_t* out) { 63 | MurmurHash3_x86_128((const char*)(&key), sizeof(uint64_t), 0xbc9f1d34, out); 64 | } 65 | 66 | class BloomFilter { 67 | private: 68 | size_t bits_per_key_; 69 | size_t k_; 70 | 71 | public: 72 | BloomFilter(int bits_per_key) 73 | : bits_per_key_(bits_per_key) { 74 | // We intentionally round down to reduce probing cost a little bit 75 | k_ = static_cast(bits_per_key * 0.69); // 0.69 =~ ln(2) 76 | if (k_ < 1) k_ = 1; 77 | if (k_ > 30) k_ = 30; 78 | } 79 | 80 | void CreateFilter(vector keys, int n, string* dst) const { 81 | // Compute bloom filter size (in both bits and bytes) 82 | size_t bits = n * bits_per_key_; 83 | 84 | // For small n, we can see a very high false positive rate. Fix it 85 | // by enforcing a minimum bloom filter length. 86 | if (bits < 64) bits = 64; 87 | 88 | size_t bytes = (bits + 7) / 8; 89 | bits = bytes * 8; 90 | 91 | const size_t init_size = dst->size(); 92 | dst->resize(init_size + bytes, 0); 93 | dst->push_back(static_cast(k_)); // Remember # of probes in filter 94 | char* array = &(*dst)[init_size]; 95 | for (int i = 0; i < n; i++) { 96 | // Use double-hashing to generate a sequence of hash values. 97 | // See analysis in [Kirsch,Mitzenmacher 2006]. 98 | // uint32_t h = BloomHash(keys[i]); 99 | uint32_t hbase[4]; 100 | BloomHash(keys[i], hbase); 101 | uint32_t h = hbase[0]; 102 | const uint32_t delta = hbase[1]; 103 | for (size_t j = 0; j < k_; j++) { 104 | const uint32_t bitpos = h % bits; 105 | array[bitpos/8] |= (1 << (bitpos % 8)); 106 | h += delta; 107 | } 108 | } 109 | } 110 | 111 | void CreateFilter(vector keys, int n, string* dst) const { 112 | // Compute bloom filter size (in both bits and bytes) 113 | size_t bits = n * bits_per_key_; 114 | 115 | // For small n, we can see a very high false positive rate. Fix it 116 | // by enforcing a minimum bloom filter length. 
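    // Added note: e.g. n = 1000 keys at bits_per_key_ = 10 gives bits = 10000
    // and bytes = 1250; one extra byte is appended below to record the probe
    // count k_, so KeyMayMatch can decode filters built with other parameters.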
117 | if (bits < 64) bits = 64; 118 | 119 | size_t bytes = (bits + 7) / 8; 120 | bits = bytes * 8; 121 | 122 | const size_t init_size = dst->size(); 123 | dst->resize(init_size + bytes, 0); 124 | dst->push_back(static_cast(k_)); // Remember # of probes in filter 125 | char* array = &(*dst)[init_size]; 126 | for (int i = 0; i < n; i++) { 127 | // Use double-hashing to generate a sequence of hash values. 128 | // See analysis in [Kirsch,Mitzenmacher 2006]. 129 | //uint32_t h = BloomHash(keys[i]); 130 | uint32_t hbase[4]; 131 | BloomHash(keys[i], hbase); 132 | uint32_t h = hbase[0]; 133 | const uint32_t delta = hbase[1]; 134 | for (size_t j = 0; j < k_; j++) { 135 | const uint32_t bitpos = h % bits; 136 | array[bitpos/8] |= (1 << (bitpos % 8)); 137 | h += delta; 138 | } 139 | } 140 | } 141 | 142 | bool KeyMayMatch(const string& key, const string& bloom_filter) const { 143 | const size_t len = bloom_filter.size(); 144 | if (len < 2) return false; 145 | 146 | const char* array = bloom_filter.c_str(); 147 | const size_t bits = (len - 1) * 8; 148 | 149 | // Use the encoded k so that we can read filters generated by 150 | // bloom filters created using different parameters. 151 | const size_t k = array[len-1]; 152 | if (k > 30) { 153 | // Reserved for potentially new encodings for short bloom filters. 154 | // Consider it a match. 155 | return true; 156 | } 157 | 158 | uint32_t hbase[4]; 159 | BloomHash(key, hbase); 160 | uint32_t h = hbase[0]; 161 | const uint32_t delta = hbase[1]; 162 | for (size_t j = 0; j < k; j++) { 163 | const uint32_t bitpos = h % bits; 164 | if ((array[bitpos/8] & (1 << (bitpos % 8))) == 0) return false; 165 | h += delta; 166 | } 167 | return true; 168 | } 169 | 170 | bool KeyMayMatch(const uint64_t key, const string& bloom_filter) const { 171 | const size_t len = bloom_filter.size(); 172 | if (len < 2) return false; 173 | 174 | const char* array = bloom_filter.c_str(); 175 | const size_t bits = (len - 1) * 8; 176 | 177 | // Use the encoded k so that we can read filters generated by 178 | // bloom filters created using different parameters. 179 | const size_t k = array[len-1]; 180 | if (k > 30) { 181 | // Reserved for potentially new encodings for short bloom filters. 182 | // Consider it a match. 
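    // Added note: returning true here keeps the filter conservative; an unknown
    // encoding can only add false positives, never a false negative.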
183 | return true; 184 | } 185 | 186 | uint32_t hbase[4]; 187 | BloomHash(key, hbase); 188 | uint32_t h = hbase[0]; 189 | const uint32_t delta = hbase[1]; 190 | for (size_t j = 0; j < k; j++) { 191 | const uint32_t bitpos = h % bits; 192 | if ((array[bitpos/8] & (1 << (bitpos % 8))) == 0) return false; 193 | h += delta; 194 | } 195 | return true; 196 | } 197 | }; 198 | 199 | 200 | #endif // LEVELDB_BLOOM_H_ 201 | -------------------------------------------------------------------------------- /include/label_vector.hpp: -------------------------------------------------------------------------------- 1 | #ifndef LABELVECTOR_H_ 2 | #define LABELVECTOR_H_ 3 | 4 | #include 5 | 6 | #include 7 | 8 | #include "config.hpp" 9 | 10 | namespace surf { 11 | 12 | class LabelVector { 13 | public: 14 | LabelVector() : num_bytes_(0), labels_(nullptr) {}; 15 | 16 | LabelVector(const std::vector >& labels_per_level, 17 | const level_t start_level = 0, 18 | level_t end_level = 0/* non-inclusive */) { 19 | if (end_level == 0) 20 | end_level = labels_per_level.size(); 21 | 22 | num_bytes_ = 1; 23 | for (level_t level = start_level; level < end_level; level++) 24 | num_bytes_ += labels_per_level[level].size(); 25 | 26 | //labels_ = new label_t[num_bytes_]; 27 | position_t alloc_bytes = num_bytes_ * (num_bytes_ / kWordSize + 1); 28 | labels_ = new label_t[alloc_bytes]; 29 | for (position_t i = 0; i < alloc_bytes; i++) 30 | labels_[i] = 0; 31 | 32 | position_t pos = 0; 33 | for (level_t level = start_level; level < end_level; level++) { 34 | for (position_t idx = 0; idx < labels_per_level[level].size(); idx++) { 35 | labels_[pos] = labels_per_level[level][idx]; 36 | pos++; 37 | } 38 | } 39 | } 40 | 41 | ~LabelVector() {} 42 | 43 | position_t getNumBytes() const { 44 | return num_bytes_; 45 | } 46 | 47 | position_t serializedSize() const { 48 | position_t size = sizeof(num_bytes_) + num_bytes_; 49 | sizeAlign(size); 50 | return size; 51 | } 52 | 53 | position_t size() const { 54 | return (sizeof(LabelVector) + num_bytes_); 55 | } 56 | 57 | label_t read(const position_t pos) const { 58 | return labels_[pos]; 59 | } 60 | 61 | label_t operator[](const position_t pos) const { 62 | return labels_[pos]; 63 | } 64 | 65 | bool search(const label_t target, position_t& pos, const position_t search_len) const; 66 | bool searchGreaterThan(const label_t target, position_t& pos, const position_t search_len) const; 67 | 68 | bool binarySearch(const label_t target, position_t& pos, const position_t search_len) const; 69 | bool simdSearch(const label_t target, position_t& pos, const position_t search_len) const; 70 | bool linearSearch(const label_t target, position_t& pos, const position_t search_len) const; 71 | 72 | bool binarySearchGreaterThan(const label_t target, position_t& pos, const position_t search_len) const; 73 | bool linearSearchGreaterThan(const label_t target, position_t& pos, const position_t search_len) const; 74 | 75 | void serialize(char*& dst) const { 76 | memcpy(dst, &num_bytes_, sizeof(num_bytes_)); 77 | dst += sizeof(num_bytes_); 78 | memcpy(dst, labels_, num_bytes_); 79 | dst += num_bytes_; 80 | align(dst); 81 | } 82 | 83 | static LabelVector* deSerialize(char*& src) { 84 | LabelVector* lv = new LabelVector(); 85 | memcpy(&(lv->num_bytes_), src, sizeof(lv->num_bytes_)); 86 | src += sizeof(lv->num_bytes_); 87 | 88 | lv->labels_ = new label_t[lv->num_bytes_]; 89 | memcpy(lv->labels_, src, lv->num_bytes_); 90 | src += lv->num_bytes_; 91 | 92 | //lv->labels_ = const_cast(reinterpret_cast(src)); 93 | //src += 
lv->num_bytes_; 94 | align(src); 95 | return lv; 96 | } 97 | 98 | void destroy() { 99 | delete[] labels_; 100 | } 101 | 102 | private: 103 | position_t num_bytes_; 104 | label_t* labels_; 105 | }; 106 | 107 | bool LabelVector::search(const label_t target, position_t& pos, position_t search_len) const { 108 | //skip terminator label 109 | if ((search_len > 1) && (labels_[pos] == kTerminator)) { 110 | pos++; 111 | search_len--; 112 | } 113 | 114 | if (search_len < 3) 115 | return linearSearch(target, pos, search_len); 116 | if (search_len < 12) 117 | return binarySearch(target, pos, search_len); 118 | else 119 | return simdSearch(target, pos, search_len); 120 | } 121 | 122 | bool LabelVector::searchGreaterThan(const label_t target, position_t& pos, position_t search_len) const { 123 | //skip terminator label 124 | if ((search_len > 1) && (labels_[pos] == kTerminator)) { 125 | pos++; 126 | search_len--; 127 | } 128 | 129 | if (search_len < 3) 130 | return linearSearchGreaterThan(target, pos, search_len); 131 | else 132 | return binarySearchGreaterThan(target, pos, search_len); 133 | } 134 | 135 | bool LabelVector::binarySearch(const label_t target, position_t& pos, const position_t search_len) const { 136 | position_t l = pos; 137 | position_t r = pos + search_len; 138 | while (l < r) { 139 | position_t m = (l + r) >> 1; 140 | if (target < labels_[m]) { 141 | r = m; 142 | } else if (target == labels_[m]) { 143 | pos = m; 144 | return true; 145 | } else { 146 | l = m + 1; 147 | } 148 | } 149 | return false; 150 | } 151 | 152 | bool LabelVector::simdSearch(const label_t target, position_t& pos, const position_t search_len) const { 153 | position_t num_labels_searched = 0; 154 | position_t num_labels_left = search_len; 155 | while ((num_labels_left >> 4) > 0) { 156 | label_t* start_ptr = labels_ + pos + num_labels_searched; 157 | __m128i cmp = _mm_cmpeq_epi8(_mm_set1_epi8(target), 158 | _mm_loadu_si128(reinterpret_cast<__m128i*>(start_ptr))); 159 | unsigned check_bits = _mm_movemask_epi8(cmp); 160 | if (check_bits) { 161 | pos += (num_labels_searched + __builtin_ctz(check_bits)); 162 | return true; 163 | } 164 | num_labels_searched += 16; 165 | num_labels_left -= 16; 166 | } 167 | 168 | if (num_labels_left > 0) { 169 | label_t* start_ptr = labels_ + pos + num_labels_searched; 170 | __m128i cmp = _mm_cmpeq_epi8(_mm_set1_epi8(target), 171 | _mm_loadu_si128(reinterpret_cast<__m128i*>(start_ptr))); 172 | unsigned leftover_bits_mask = (1 << num_labels_left) - 1; 173 | unsigned check_bits = _mm_movemask_epi8(cmp) & leftover_bits_mask; 174 | if (check_bits) { 175 | pos += (num_labels_searched + __builtin_ctz(check_bits)); 176 | return true; 177 | } 178 | } 179 | 180 | return false; 181 | } 182 | 183 | bool LabelVector::linearSearch(const label_t target, position_t& pos, const position_t search_len) const { 184 | for (position_t i = 0; i < search_len; i++) { 185 | if (target == labels_[pos + i]) { 186 | pos += i; 187 | return true; 188 | } 189 | } 190 | return false; 191 | } 192 | 193 | bool LabelVector::binarySearchGreaterThan(const label_t target, position_t& pos, const position_t search_len) const { 194 | position_t l = pos; 195 | position_t r = pos + search_len; 196 | while (l < r) { 197 | position_t m = (l + r) >> 1; 198 | if (target < labels_[m]) { 199 | r = m; 200 | } else if (target == labels_[m]) { 201 | if (m < pos + search_len - 1) { 202 | pos = m + 1; 203 | return true; 204 | } 205 | return false; 206 | } else { 207 | l = m + 1; 208 | } 209 | } 210 | 211 | if (l < pos + search_len) { 212 | 
pos = l; 213 | return true; 214 | } 215 | return false; 216 | } 217 | 218 | bool LabelVector::linearSearchGreaterThan(const label_t target, position_t& pos, const position_t search_len) const { 219 | for (position_t i = 0; i < search_len; i++) { 220 | if (labels_[pos + i] > target) { 221 | pos += i; 222 | return true; 223 | } 224 | } 225 | return false; 226 | } 227 | 228 | } // namespace surf 229 | 230 | #endif // LABELVECTOR_H_ 231 | -------------------------------------------------------------------------------- /test/unitTest/test_bitvector.cpp: -------------------------------------------------------------------------------- 1 | #include "gtest/gtest.h" 2 | 3 | #include 4 | 5 | #include 6 | #include 7 | #include 8 | 9 | #include "bitvector.hpp" 10 | #include "config.hpp" 11 | #include "surf_builder.hpp" 12 | 13 | namespace surf { 14 | 15 | namespace bitvectortest { 16 | 17 | static const std::string kFilePath = "../../../test/words.txt"; 18 | static const int kTestSize = 234369; 19 | static std::vector words; 20 | 21 | class BitvectorUnitTest : public ::testing::Test { 22 | public: 23 | virtual void SetUp () { 24 | bool include_dense = true; 25 | uint32_t sparse_dense_ratio = 0; 26 | level_t suffix_len = 8; 27 | builder_ = new SuRFBuilder(include_dense, sparse_dense_ratio, kReal, 0, suffix_len); 28 | num_items_ = 0; 29 | } 30 | virtual void TearDown () { 31 | delete builder_; 32 | delete bv_; 33 | delete bv2_; 34 | delete bv3_; 35 | delete bv4_; 36 | delete bv5_; 37 | } 38 | 39 | void setupWordsTest(); 40 | 41 | SuRFBuilder* builder_; 42 | Bitvector* bv_; // sparse: child indicator bits 43 | Bitvector* bv2_; // sparse: louds bits 44 | Bitvector* bv3_; // dense: label bitmap 45 | Bitvector* bv4_; // dense: child indicator bitmap 46 | Bitvector* bv5_; // dense: prefixkey indicator bits 47 | std::vector num_items_per_level_; // sparse 48 | position_t num_items_; // sparse 49 | std::vector num_bits_per_level_; // dense 50 | }; 51 | 52 | void BitvectorUnitTest::setupWordsTest() { 53 | builder_->build(words); 54 | for (level_t level = 0; level < builder_->getTreeHeight(); level++) 55 | num_items_per_level_.push_back(builder_->getLabels()[level].size()); 56 | for (level_t level = 0; level < num_items_per_level_.size(); level++) 57 | num_items_ += num_items_per_level_[level]; 58 | bv_ = new Bitvector(builder_->getChildIndicatorBits(), num_items_per_level_); 59 | bv2_ = new Bitvector(builder_->getLoudsBits(), num_items_per_level_); 60 | 61 | for (level_t level = 0; level < builder_->getTreeHeight(); level++) 62 | num_bits_per_level_.push_back(builder_->getBitmapLabels()[level].size() * kWordSize); 63 | bv3_ = new Bitvector(builder_->getBitmapLabels(), num_bits_per_level_); 64 | bv4_ = new Bitvector(builder_->getBitmapChildIndicatorBits(), num_bits_per_level_); 65 | bv5_ = new Bitvector(builder_->getPrefixkeyIndicatorBits(), builder_->getNodeCounts()); 66 | } 67 | 68 | TEST_F (BitvectorUnitTest, readBitTest) { 69 | setupWordsTest(); 70 | 71 | position_t bv_pos = 0; 72 | int node_num = -1; 73 | label_t prev_label = 0; 74 | position_t bv5_pos = 0; 75 | for (level_t level = 0; level < builder_->getTreeHeight(); level++) { 76 | for (position_t pos = 0; pos < num_items_per_level_[level]; pos++) { 77 | // bv test 78 | bool has_child = SuRFBuilder::readBit(builder_->getChildIndicatorBits()[level], pos); 79 | bool bv_bit = bv_->readBit(bv_pos); 80 | ASSERT_EQ(has_child, bv_bit); 81 | 82 | // bv2 test 83 | bool is_node_start = SuRFBuilder::readBit(builder_->getLoudsBits()[level], pos); 84 | bv_bit = 
bv2_->readBit(bv_pos); 85 | ASSERT_EQ(is_node_start, bv_bit); 86 | 87 | bv_pos++; 88 | 89 | if (is_node_start) 90 | node_num++; 91 | 92 | // bv5 test 93 | bool is_terminator = false; 94 | if (is_node_start) { 95 | is_terminator = (builder_->getLabels()[level][pos] == kTerminator) 96 | && !SuRFBuilder::readBit(builder_->getChildIndicatorBits()[level], pos); 97 | bv_bit = bv5_->readBit(bv5_pos); 98 | ASSERT_EQ(is_terminator, bv_bit); 99 | bv5_pos++; 100 | } 101 | 102 | if (is_terminator) { 103 | for (unsigned c = prev_label + 1; c < kFanout; c++) { 104 | bool bv3_bit = bv3_->readBit((node_num - 1) * kFanout + c); 105 | ASSERT_FALSE(bv3_bit); 106 | bool bv4_bit = bv4_->readBit((node_num - 1) * kFanout + c); 107 | ASSERT_FALSE(bv4_bit); 108 | } 109 | prev_label = '\255'; 110 | continue; 111 | } 112 | 113 | // bv3 test 114 | label_t label = builder_->getLabels()[level][pos]; 115 | bool bv3_bit = bv3_->readBit(node_num * kFanout + label); 116 | ASSERT_TRUE(bv3_bit); 117 | 118 | // bv4 test 119 | bool bv4_bit = bv4_->readBit(node_num * kFanout + label); 120 | ASSERT_EQ(has_child, bv4_bit); 121 | 122 | // bv3 bv4 zero bit test 123 | if (is_node_start) { 124 | if (node_num > 0) { 125 | for (unsigned c = prev_label + 1; c < kFanout; c++) { 126 | bv3_bit = bv3_->readBit((node_num - 1) * kFanout + c); 127 | ASSERT_FALSE(bv3_bit); 128 | bv4_bit = bv4_->readBit((node_num - 1) * kFanout + c); 129 | ASSERT_FALSE(bv4_bit); 130 | } 131 | } 132 | for (unsigned c = 0; c < (unsigned)label; c++) { 133 | bv3_bit = bv3_->readBit(node_num * kFanout + c); 134 | ASSERT_FALSE(bv3_bit); 135 | bv4_bit = bv4_->readBit(node_num * kFanout + c); 136 | ASSERT_FALSE(bv4_bit); 137 | } 138 | } else { 139 | for (unsigned c = prev_label + 1; c < (unsigned)label; c++) { 140 | bv3_bit = bv3_->readBit(node_num * kFanout + c); 141 | ASSERT_FALSE(bv3_bit); 142 | bv4_bit = bv4_->readBit(node_num * kFanout + c); 143 | ASSERT_FALSE(bv4_bit); 144 | } 145 | } 146 | prev_label = label; 147 | } 148 | } 149 | 150 | } 151 | 152 | TEST_F (BitvectorUnitTest, distanceToNextSetBitTest) { 153 | setupWordsTest(); 154 | std::vector distanceVector; 155 | position_t distance = 1; 156 | for (position_t pos = 1; pos < num_items_; pos++) { 157 | if (bv2_->readBit(pos)) { 158 | while (distance > 0) { 159 | distanceVector.push_back(distance); 160 | distance--; 161 | } 162 | distance = 1; 163 | } 164 | else { 165 | distance++; 166 | } 167 | } 168 | while (distance > 0) { 169 | distanceVector.push_back(distance); 170 | distance--; 171 | } 172 | 173 | for (position_t pos = 0; pos < num_items_; pos++) { 174 | distance = bv2_->distanceToNextSetBit(pos); 175 | ASSERT_EQ(distanceVector[pos], distance); 176 | } 177 | } 178 | 179 | TEST_F (BitvectorUnitTest, distanceToPrevSetBitTest) { 180 | setupWordsTest(); 181 | std::vector distanceVector; 182 | for (position_t pos = 0; pos < num_items_; pos++) 183 | distanceVector.push_back(0); 184 | 185 | position_t distance = 1; 186 | for (position_t pos = num_items_ - 2; pos > 0; pos--) { 187 | if (bv2_->readBit(pos)) { 188 | for (position_t i = 1; i <= distance; i++) 189 | distanceVector[pos + i] = i; 190 | distance = 1; 191 | } 192 | else { 193 | distance++; 194 | } 195 | } 196 | if (bv2_->readBit(0)) { 197 | for (position_t i = 1; i <= distance; i++) 198 | distanceVector[i] = i; 199 | } else { 200 | distance++; 201 | for (position_t i = 1; i <= distance; i++) 202 | distanceVector[i - 1] = i; 203 | } 204 | 205 | for (position_t pos = 0; pos < num_items_; pos++) { 206 | distance = bv2_->distanceToPrevSetBit(pos); 207 | 
ASSERT_EQ(distanceVector[pos], distance); 208 | } 209 | } 210 | 211 | void loadWordList() { 212 | std::ifstream infile(kFilePath); 213 | std::string key; 214 | int count = 0; 215 | while (infile.good() && count < kTestSize) { 216 | infile >> key; 217 | words.push_back(key); 218 | count++; 219 | } 220 | } 221 | 222 | } // namespace bitvectortest 223 | 224 | } // namespace surf 225 | 226 | int main (int argc, char** argv) { 227 | ::testing::InitGoogleTest(&argc, argv); 228 | surf::bitvectortest::loadWordList(); 229 | return RUN_ALL_TESTS(); 230 | } 231 | -------------------------------------------------------------------------------- /bench/workload_gen/workload_spec/workload_template: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2012-2016 YCSB contributors. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); you 4 | # may not use this file except in compliance with the License. You 5 | # may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 12 | # implied. See the License for the specific language governing 13 | # permissions and limitations under the License. See accompanying 14 | # LICENSE file. 15 | 16 | # Yahoo! Cloud System Benchmark 17 | # Workload Template: Default Values 18 | # 19 | # File contains all properties that can be set to define a 20 | # YCSB session. All properties are set to their default 21 | # value if one exists. If not, the property is commented 22 | # out. When a property has a finite number of settings, 23 | # the default is enabled and the alternates are shown in 24 | # comments below it. 25 | # 26 | # Use of most explained through comments in Client.java or 27 | # CoreWorkload.java or on the YCSB wiki page: 28 | # https://github.com/brianfrankcooper/YCSB/wiki/Core-Properties 29 | 30 | # The name of the workload class to use 31 | workload=com.yahoo.ycsb.workloads.CoreWorkload 32 | 33 | # There is no default setting for recordcount but it is 34 | # required to be set. 35 | # The number of records in the table to be inserted in 36 | # the load phase or the number of records already in the 37 | # table before the run phase. 38 | recordcount=1000000 39 | 40 | # There is no default setting for operationcount but it is 41 | # required to be set. 42 | # The number of operations to use during the run phase. 43 | operationcount=3000000 44 | 45 | # The number of insertions to do, if different from recordcount. 46 | # Used with insertstart to grow an existing table. 
47 | #insertcount= 48 | 49 | # The offset of the first insertion 50 | insertstart=0 51 | 52 | # The number of fields in a record 53 | fieldcount=10 54 | 55 | # The size of each field (in bytes) 56 | fieldlength=100 57 | 58 | # Should read all fields 59 | readallfields=true 60 | 61 | # Should write all fields on update 62 | writeallfields=false 63 | 64 | # The distribution used to choose the length of a field 65 | fieldlengthdistribution=constant 66 | #fieldlengthdistribution=uniform 67 | #fieldlengthdistribution=zipfian 68 | 69 | # What proportion of operations are reads 70 | readproportion=0.95 71 | 72 | # What proportion of operations are updates 73 | updateproportion=0.05 74 | 75 | # What proportion of operations are inserts 76 | insertproportion=0 77 | 78 | # What proportion of operations read then modify a record 79 | readmodifywriteproportion=0 80 | 81 | # What proportion of operations are scans 82 | scanproportion=0 83 | 84 | # On a single scan, the maximum number of records to access 85 | maxscanlength=1000 86 | 87 | # The distribution used to choose the number of records to access on a scan 88 | scanlengthdistribution=uniform 89 | #scanlengthdistribution=zipfian 90 | 91 | # Should records be inserted in order or pseudo-randomly 92 | insertorder=hashed 93 | #insertorder=ordered 94 | 95 | # The distribution of requests across the keyspace 96 | requestdistribution=zipfian 97 | #requestdistribution=uniform 98 | #requestdistribution=latest 99 | 100 | # Percentage of data items that constitute the hot set 101 | hotspotdatafraction=0.2 102 | 103 | # Percentage of operations that access the hot set 104 | hotspotopnfraction=0.8 105 | 106 | # Maximum execution time in seconds 107 | #maxexecutiontime= 108 | 109 | # The name of the database table to run queries against 110 | table=usertable 111 | 112 | # The column family of fields (required by some databases) 113 | #columnfamily= 114 | 115 | # How the latency measurements are presented 116 | measurementtype=histogram 117 | #measurementtype=timeseries 118 | #measurementtype=raw 119 | # When measurementtype is set to raw, measurements will be output 120 | # as RAW datapoints in the following csv format: 121 | # "operation, timestamp of the measurement, latency in us" 122 | # 123 | # Raw datapoints are collected in-memory while the test is running. Each 124 | # data point consumes about 50 bytes (including java object overhead). 125 | # For a typical run of 1 million to 10 million operations, this should 126 | # fit into memory most of the time. If you plan to do 100s of millions of 127 | # operations per run, consider provisioning a machine with larger RAM when using 128 | # the RAW measurement type, or split the run into multiple runs. 129 | # 130 | # Optionally, you can specify an output file to save raw datapoints. 131 | # Otherwise, raw datapoints will be written to stdout. 132 | # The output file will be appended to if it already exists, otherwise 133 | # a new output file will be created. 134 | #measurement.raw.output_file = /tmp/your_output_file_for_this_run 135 | 136 | # JVM Reporting. 137 | # 138 | # Measure JVM information over time including GC counts, max and min memory 139 | # used, max and min thread counts, max and min system load and others. This 140 | # setting must be enabled in conjunction with the "-s" flag to run the status 141 | # thread. Every "status.interval", the status thread will capture JVM 142 | # statistics and record the results. At the end of the run, max and mins will 143 | # be recorded. 
144 | # measurement.trackjvm = false 145 | 146 | # The range of latencies to track in the histogram (milliseconds) 147 | histogram.buckets=1000 148 | 149 | # Granularity for time series (in milliseconds) 150 | timeseries.granularity=1000 151 | 152 | # Latency reporting. 153 | # 154 | # YCSB records latency of failed operations separately from successful ones. 155 | # Latency of all OK operations will be reported under their operation name, 156 | # such as [READ], [UPDATE], etc. 157 | # 158 | # For failed operations: 159 | # By default we don't track latency numbers of specific error status. 160 | # We just report latency of all failed operation under one measurement name 161 | # such as [READ-FAILED]. But optionally, user can configure to have either: 162 | # 1. Record and report latency for each and every error status code by 163 | # setting reportLatencyForEachError to true, or 164 | # 2. Record and report latency for a select set of error status codes by 165 | # providing a CSV list of Status codes via the "latencytrackederrors" 166 | # property. 167 | # reportlatencyforeacherror=false 168 | # latencytrackederrors="" 169 | 170 | # Insertion error retry for the core workload. 171 | # 172 | # By default, the YCSB core workload does not retry any operations. 173 | # However, during the load process, if any insertion fails, the entire 174 | # load process is terminated. 175 | # If a user desires to have more robust behavior during this phase, they can 176 | # enable retry for insertion by setting the following property to a positive 177 | # number. 178 | # core_workload_insertion_retry_limit = 0 179 | # 180 | # the following number controls the interval between retries (in seconds): 181 | # core_workload_insertion_retry_interval = 3 182 | 183 | # Distributed Tracing via Apache HTrace (http://htrace.incubator.apache.org/) 184 | # 185 | # Defaults to blank / no tracing 186 | # Below sends to a local file, sampling at 0.1% 187 | # 188 | # htrace.sampler.classes=ProbabilitySampler 189 | # htrace.sampler.fraction=0.001 190 | # htrace.span.receiver.classes=org.apache.htrace.core.LocalFileSpanReceiver 191 | # htrace.local.file.span.receiver.path=/some/path/to/local/file 192 | # 193 | # To capture all spans, use the AlwaysSampler 194 | # 195 | # htrace.sampler.classes=AlwaysSampler 196 | # 197 | # To send spans to an HTraced receiver, use the below and ensure 198 | # your classpath contains the htrace-htraced jar (i.e. when invoking the ycsb 199 | # command add -cp /path/to/htrace-htraced.jar) 200 | # 201 | # htrace.span.receiver.classes=org.apache.htrace.impl.HTracedSpanReceiver 202 | # htrace.htraced.receiver.address=example.com:9075 203 | # htrace.htraced.error.log.period.ms=10000 204 | -------------------------------------------------------------------------------- /include/popcount.h: -------------------------------------------------------------------------------- 1 | /* -*- Mode: C++; c-basic-offset: 4; indent-tabs-mode: nil -*- */ 2 | #ifndef _FASTRANK_POPCOUNT_H_ 3 | #define _FASTRANK_POPCOUNT_H_ 4 | 5 | #include 6 | #include 7 | #include 8 | 9 | namespace surf { 10 | 11 | #define L8 0x0101010101010101ULL // Every lowest 8th bit set: 00000001... 12 | #define G2 0xAAAAAAAAAAAAAAAAULL // Every highest 2nd bit: 101010... 13 | #define G4 0x3333333333333333ULL // 00110011 ... used to group the sum of 4 bits. 
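// Added comment: the remaining masks follow the same pattern; G8 keeps the low
// nibble of every byte (collapsing nibble sums into per-byte sums) and H8 marks
// the high bit of every byte.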
14 | #define G8 0x0F0F0F0F0F0F0F0FULL 15 | #define H8 0x8080808080808080ULL 16 | #define L9 0x0040201008040201ULL 17 | #define H9 (L9 << 8) 18 | #define L16 0x0001000100010001ULL 19 | #define H16 0x8000800080008000ULL 20 | 21 | #define ONES_STEP_4 ( 0x1111111111111111ULL ) 22 | #define ONES_STEP_8 ( 0x0101010101010101ULL ) 23 | #define ONES_STEP_9 ( 1ULL << 0 | 1ULL << 9 | 1ULL << 18 | 1ULL << 27 | 1ULL << 36 | 1ULL << 45 | 1ULL << 54 ) 24 | #define ONES_STEP_16 ( 1ULL << 0 | 1ULL << 16 | 1ULL << 32 | 1ULL << 48 ) 25 | #define MSBS_STEP_4 ( 0x8ULL * ONES_STEP_4 ) 26 | #define MSBS_STEP_8 ( 0x80ULL * ONES_STEP_8 ) 27 | #define MSBS_STEP_9 ( 0x100ULL * ONES_STEP_9 ) 28 | #define MSBS_STEP_16 ( 0x8000ULL * ONES_STEP_16 ) 29 | #define INCR_STEP_8 ( 0x80ULL << 56 | 0x40ULL << 48 | 0x20ULL << 40 | 0x10ULL << 32 | 0x8ULL << 24 | 0x4ULL << 16 | 0x2ULL << 8 | 0x1 ) 30 | 31 | #define ONES_STEP_32 ( 0x0000000100000001ULL ) 32 | #define MSBS_STEP_32 ( 0x8000000080000000ULL ) 33 | 34 | #define COMPARE_STEP_8(x,y) ( ( ( ( ( (x) | MSBS_STEP_8 ) - ( (y) & ~MSBS_STEP_8 ) ) ^ (x) ^ ~(y) ) & MSBS_STEP_8 ) >> 7 ) 35 | #define LEQ_STEP_8(x,y) ( ( ( ( ( (y) | MSBS_STEP_8 ) - ( (x) & ~MSBS_STEP_8 ) ) ^ (x) ^ (y) ) & MSBS_STEP_8 ) >> 7 ) 36 | 37 | #define UCOMPARE_STEP_9(x,y) ( ( ( ( ( ( (x) | MSBS_STEP_9 ) - ( (y) & ~MSBS_STEP_9 ) ) | ( x ^ y ) ) ^ ( x | ~y ) ) & MSBS_STEP_9 ) >> 8 ) 38 | #define UCOMPARE_STEP_16(x,y) ( ( ( ( ( ( (x) | MSBS_STEP_16 ) - ( (y) & ~MSBS_STEP_16 ) ) | ( x ^ y ) ) ^ ( x | ~y ) ) & MSBS_STEP_16 ) >> 15 ) 39 | #define ULEQ_STEP_9(x,y) ( ( ( ( ( ( (y) | MSBS_STEP_9 ) - ( (x) & ~MSBS_STEP_9 ) ) | ( x ^ y ) ) ^ ( x & ~y ) ) & MSBS_STEP_9 ) >> 8 ) 40 | #define ULEQ_STEP_16(x,y) ( ( ( ( ( ( (y) | MSBS_STEP_16 ) - ( (x) & ~MSBS_STEP_16 ) ) | ( x ^ y ) ) ^ ( x & ~y ) ) & MSBS_STEP_16 ) >> 15 ) 41 | #define ZCOMPARE_STEP_8(x) ( ( ( x | ( ( x | MSBS_STEP_8 ) - ONES_STEP_8 ) ) & MSBS_STEP_8 ) >> 7 ) 42 | 43 | // Population count of a 64 bit integer in SWAR (SIMD within a register) style 44 | // From Sebastiano Vigna, "Broadword Implementation of Rank/Select Queries" 45 | // http://sux.dsi.unimi.it/paper.pdf p4 46 | // This variant uses multiplication for the last summation instead of 47 | // continuing the shift/mask/addition chain. 48 | inline int suxpopcount(uint64_t x) { 49 | // Step 1: 00 - 00 = 0; 01 - 00 = 01; 10 - 01 = 01; 11 - 01 = 10; 50 | x = x - ((x & G2) >> 1); 51 | // step 2: add 2 groups of 2. 52 | x = (x & G4) + ((x >> 2) & G4); 53 | // 2 groups of 4. 54 | x = (x + (x >> 4)) & G8; 55 | // Using a multiply to collect the 8 groups of 8 together. 56 | x = x * L8 >> 56; 57 | return x; 58 | } 59 | 60 | // Default to using the GCC builtin popcount. On architectures 61 | // with -march popcnt, this compiles to a single popcnt instruction. 
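// Added sanity-check sketch (not part of the original header): suxpopcount
// should agree with the builtin selected by the popcount macro defined just
// below. This helper is illustrative only and is not used by SuRF itself.
inline bool popcountSelfTest() {
    const uint64_t samples[] = {0ULL, 1ULL, 0xFFULL, G2, L8,
                                0x8000000000000000ULL, 0x0123456789ABCDEFULL};
    for (uint64_t x : samples) {
        if (suxpopcount(x) != __builtin_popcountll(x))
            return false;
    }
    return true;
}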
62 | #ifndef popcount 63 | #define popcount __builtin_popcountll 64 | //#define popcount suxpopcount 65 | #endif 66 | 67 | #define popcountsize 64ULL 68 | #define popcountmask (popcountsize - 1) 69 | 70 | inline uint64_t popcountLinear(uint64_t *bits, uint64_t x, uint64_t nbits) { 71 | if (nbits == 0) { return 0; } 72 | uint64_t lastword = (nbits - 1) / popcountsize; 73 | uint64_t p = 0; 74 | 75 | __builtin_prefetch(bits + x + 7, 0); //huanchen 76 | for (uint64_t i = 0; i < lastword; i++) { /* tested; manually unrolling doesn't help, at least in C */ 77 | //__builtin_prefetch(bits + x + i + 3, 0); 78 | p += popcount(bits[x+i]); // note that use binds us to 64 bit popcount impls 79 | } 80 | 81 | // 'nbits' may or may not fall on a multiple of 64 boundary, 82 | // so we may need to zero out the right side of the last word 83 | // (accomplished by shifting it right, since we're just popcounting) 84 | uint64_t lastshifted = bits[x+lastword] >> (63 - ((nbits - 1) & popcountmask)); 85 | p += popcount(lastshifted); 86 | return p; 87 | } 88 | 89 | // Return the index of the kth bit set in x 90 | inline int select64_naive(uint64_t x, int k) { 91 | int count = -1; 92 | for (int i = 63; i >= 0; i--) { 93 | count++; 94 | if (x & (1ULL << i)) { 95 | k--; 96 | if (k == 0) { 97 | return count; 98 | } 99 | } 100 | } 101 | return -1; 102 | } 103 | 104 | inline int select64_popcount_search(uint64_t x, int k) { 105 | int loc = -1; 106 | // if (k > popcount(x)) { return -1; } 107 | 108 | for (int testbits = 32; testbits > 0; testbits >>= 1) { 109 | int lcount = popcount(x >> testbits); 110 | if (k > lcount) { 111 | x &= ((1ULL << testbits)-1); 112 | loc += testbits; 113 | k -= lcount; 114 | } else { 115 | x >>= testbits; 116 | } 117 | } 118 | return loc+k; 119 | } 120 | 121 | inline int select64_broadword(uint64_t x, int k) { 122 | uint64_t word = x; 123 | int residual = k; 124 | register uint64_t byte_sums; 125 | 126 | byte_sums = word - ( ( word & 0xa * ONES_STEP_4 ) >> 1 ); 127 | byte_sums = ( byte_sums & 3 * ONES_STEP_4 ) + ( ( byte_sums >> 2 ) & 3 * ONES_STEP_4 ); 128 | byte_sums = ( byte_sums + ( byte_sums >> 4 ) ) & 0x0f * ONES_STEP_8; 129 | byte_sums *= ONES_STEP_8; 130 | 131 | // Phase 2: compare each byte sum with the residual 132 | const uint64_t residual_step_8 = residual * ONES_STEP_8; 133 | const int place = ( LEQ_STEP_8( byte_sums, residual_step_8 ) * ONES_STEP_8 >> 53 ) & ~0x7; 134 | 135 | // Phase 3: Locate the relevant byte and make 8 copies with incremental masks 136 | const int byte_rank = residual - ( ( ( byte_sums << 8 ) >> place ) & 0xFF ); 137 | 138 | const uint64_t spread_bits = ( word >> place & 0xFF ) * ONES_STEP_8 & INCR_STEP_8; 139 | const uint64_t bit_sums = ZCOMPARE_STEP_8( spread_bits ) * ONES_STEP_8; 140 | 141 | // Compute the inside-byte location and return the sum 142 | const uint64_t byte_rank_step_8 = byte_rank * ONES_STEP_8; 143 | 144 | return place + ( LEQ_STEP_8( bit_sums, byte_rank_step_8 ) * ONES_STEP_8 >> 56 ); 145 | } 146 | 147 | inline int select64(uint64_t x, int k) { 148 | return select64_popcount_search(x, k); 149 | } 150 | 151 | // x is the starting offset of the 512 bits; 152 | // k is the thing we're selecting for. 
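// (Added clarification: k is 1-based; the return value is the bit offset within
// the 512-bit block, counted from the most significant bit of bits[x], or -1 if
// the block contains fewer than k set bits.)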
153 | inline int select512(uint64_t *bits, int x, int k) { 154 | __asm__ __volatile__ ( 155 | "prefetchnta (%0)\n" 156 | : : "r" (&bits[x]) ); 157 | int i = 0; 158 | int pop = popcount(bits[x+i]); 159 | while (k > pop && i < 7) { 160 | k -= pop; 161 | i++; 162 | pop = popcount(bits[x+i]); 163 | } 164 | if (i == 7 && popcount(bits[x+i]) < k) { 165 | return -1; 166 | } 167 | // We're now certain that the bit we want is stored in bv[x+i] 168 | return i*64 + select64(bits[x+i], k); 169 | } 170 | 171 | // brute-force linear select 172 | // x is the starting offset of the bits in bv; 173 | // k is the thing we're selecting for (starting from bv[x]). 174 | // bvlen is the total length of bv 175 | inline uint64_t selectLinear(uint64_t* bits, uint64_t length, uint64_t x, uint64_t k) { 176 | if (k > (length - x) * 64) 177 | return -1; 178 | uint64_t i = 0; 179 | uint64_t pop = popcount(bits[x+i]); 180 | while (k > pop && i < (length - 1)) { 181 | k -= pop; 182 | i++; 183 | pop = popcount(bits[x+i]); 184 | } 185 | if ((i == length - 1) && (pop < k)) { 186 | return -1; 187 | } 188 | // We're now certain that the bit we want is stored in bits[x+i] 189 | return i*64 + select64(bits[x+i], k); 190 | } 191 | 192 | } // namespace surf 193 | 194 | #endif /* _FASTRANK_POPCOUNT_H_ */ 195 | -------------------------------------------------------------------------------- /bench/MurmurHash3.h: -------------------------------------------------------------------------------- 1 | //----------------------------------------------------------------------------- 2 | // MurmurHash3 was written by Austin Appleby, and is placed in the public 3 | // domain. The author hereby disclaims copyright to this source code. 4 | 5 | #ifndef _MURMURHASH3_H_ 6 | #define _MURMURHASH3_H_ 7 | 8 | //----------------------------------------------------------------------------- 9 | // Platform-specific functions and macros 10 | 11 | // Microsoft Visual Studio 12 | 13 | //typedef unsigned char uint8_t; 14 | //typedef unsigned int uint32_t; 15 | //typedef unsigned __int64 uint64_t; 16 | 17 | // Other compilers 18 | 19 | #include 20 | #include 21 | 22 | // Other compilers 23 | 24 | #define FORCE_INLINE inline __attribute__((always_inline)) 25 | 26 | inline uint32_t rotl32 ( uint32_t x, int8_t r ) 27 | { 28 | return (x << r) | (x >> (32 - r)); 29 | } 30 | 31 | inline uint64_t rotl64 ( uint64_t x, int8_t r ) 32 | { 33 | return (x << r) | (x >> (64 - r)); 34 | } 35 | 36 | #define ROTL32(x,y)rotl32(x,y) 37 | #define ROTL64(x,y)rotl64(x,y) 38 | 39 | #define BIG_CONSTANT(x) (x##LLU) 40 | 41 | //----------------------------------------------------------------------------- 42 | // Block read - if your platform needs to do endian-swapping or can only 43 | // handle aligned reads, do the conversion here 44 | 45 | FORCE_INLINE uint32_t getblock32 ( const uint32_t * p, int i ) 46 | { 47 | return p[i]; 48 | } 49 | 50 | FORCE_INLINE uint64_t getblock64 ( const uint64_t * p, int i ) 51 | { 52 | return p[i]; 53 | } 54 | 55 | //----------------------------------------------------------------------------- 56 | // Finalization mix - force all bits of a hash block to avalanche 57 | 58 | FORCE_INLINE uint32_t fmix32 ( uint32_t h ) 59 | { 60 | h ^= h >> 16; 61 | h *= 0x85ebca6b; 62 | h ^= h >> 13; 63 | h *= 0xc2b2ae35; 64 | h ^= h >> 16; 65 | 66 | return h; 67 | } 68 | 69 | //---------- 70 | 71 | FORCE_INLINE uint64_t fmix64 ( uint64_t k ) 72 | { 73 | k ^= k >> 33; 74 | k *= BIG_CONSTANT(0xff51afd7ed558ccd); 75 | k ^= k >> 33; 76 | k *= 
BIG_CONSTANT(0xc4ceb9fe1a85ec53); 77 | k ^= k >> 33; 78 | 79 | return k; 80 | } 81 | 82 | //----------------------------------------------------------------------------- 83 | 84 | void MurmurHash3_x86_32 ( const void * key, int len, 85 | uint32_t seed, void * out ) 86 | { 87 | const uint8_t * data = (const uint8_t*)key; 88 | const int nblocks = len / 4; 89 | 90 | uint32_t h1 = seed; 91 | 92 | const uint32_t c1 = 0xcc9e2d51; 93 | const uint32_t c2 = 0x1b873593; 94 | 95 | //---------- 96 | // body 97 | 98 | const uint32_t * blocks = (const uint32_t *)(data + nblocks*4); 99 | 100 | for(int i = -nblocks; i; i++) 101 | { 102 | uint32_t k1 = getblock32(blocks,i); 103 | 104 | k1 *= c1; 105 | k1 = ROTL32(k1,15); 106 | k1 *= c2; 107 | 108 | h1 ^= k1; 109 | h1 = ROTL32(h1,13); 110 | h1 = h1*5+0xe6546b64; 111 | } 112 | 113 | //---------- 114 | // tail 115 | 116 | const uint8_t * tail = (const uint8_t*)(data + nblocks*4); 117 | 118 | uint32_t k1 = 0; 119 | 120 | switch(len & 3) 121 | { 122 | case 3: k1 ^= tail[2] << 16; 123 | case 2: k1 ^= tail[1] << 8; 124 | case 1: k1 ^= tail[0]; 125 | k1 *= c1; k1 = ROTL32(k1,15); k1 *= c2; h1 ^= k1; 126 | }; 127 | 128 | //---------- 129 | // finalization 130 | 131 | h1 ^= len; 132 | 133 | h1 = fmix32(h1); 134 | 135 | *(uint32_t*)out = h1; 136 | } 137 | 138 | //----------------------------------------------------------------------------- 139 | 140 | void MurmurHash3_x86_128 ( const void * key, const int len, 141 | uint32_t seed, void * out ) 142 | { 143 | const uint8_t * data = (const uint8_t*)key; 144 | const int nblocks = len / 16; 145 | 146 | uint32_t h1 = seed; 147 | uint32_t h2 = seed; 148 | uint32_t h3 = seed; 149 | uint32_t h4 = seed; 150 | 151 | const uint32_t c1 = 0x239b961b; 152 | const uint32_t c2 = 0xab0e9789; 153 | const uint32_t c3 = 0x38b34ae5; 154 | const uint32_t c4 = 0xa1e38b93; 155 | 156 | //---------- 157 | // body 158 | 159 | const uint32_t * blocks = (const uint32_t *)(data + nblocks*16); 160 | 161 | for(int i = -nblocks; i; i++) 162 | { 163 | uint32_t k1 = getblock32(blocks,i*4+0); 164 | uint32_t k2 = getblock32(blocks,i*4+1); 165 | uint32_t k3 = getblock32(blocks,i*4+2); 166 | uint32_t k4 = getblock32(blocks,i*4+3); 167 | 168 | k1 *= c1; k1 = ROTL32(k1,15); k1 *= c2; h1 ^= k1; 169 | 170 | h1 = ROTL32(h1,19); h1 += h2; h1 = h1*5+0x561ccd1b; 171 | 172 | k2 *= c2; k2 = ROTL32(k2,16); k2 *= c3; h2 ^= k2; 173 | 174 | h2 = ROTL32(h2,17); h2 += h3; h2 = h2*5+0x0bcaa747; 175 | 176 | k3 *= c3; k3 = ROTL32(k3,17); k3 *= c4; h3 ^= k3; 177 | 178 | h3 = ROTL32(h3,15); h3 += h4; h3 = h3*5+0x96cd1c35; 179 | 180 | k4 *= c4; k4 = ROTL32(k4,18); k4 *= c1; h4 ^= k4; 181 | 182 | h4 = ROTL32(h4,13); h4 += h1; h4 = h4*5+0x32ac3b17; 183 | } 184 | 185 | //---------- 186 | // tail 187 | 188 | const uint8_t * tail = (const uint8_t*)(data + nblocks*16); 189 | 190 | uint32_t k1 = 0; 191 | uint32_t k2 = 0; 192 | uint32_t k3 = 0; 193 | uint32_t k4 = 0; 194 | 195 | switch(len & 15) 196 | { 197 | case 15: k4 ^= tail[14] << 16; 198 | case 14: k4 ^= tail[13] << 8; 199 | case 13: k4 ^= tail[12] << 0; 200 | k4 *= c4; k4 = ROTL32(k4,18); k4 *= c1; h4 ^= k4; 201 | 202 | case 12: k3 ^= tail[11] << 24; 203 | case 11: k3 ^= tail[10] << 16; 204 | case 10: k3 ^= tail[ 9] << 8; 205 | case 9: k3 ^= tail[ 8] << 0; 206 | k3 *= c3; k3 = ROTL32(k3,17); k3 *= c4; h3 ^= k3; 207 | 208 | case 8: k2 ^= tail[ 7] << 24; 209 | case 7: k2 ^= tail[ 6] << 16; 210 | case 6: k2 ^= tail[ 5] << 8; 211 | case 5: k2 ^= tail[ 4] << 0; 212 | k2 *= c2; k2 = ROTL32(k2,16); k2 *= c3; h2 ^= k2; 213 | 214 | case 
4: k1 ^= tail[ 3] << 24; 215 | case 3: k1 ^= tail[ 2] << 16; 216 | case 2: k1 ^= tail[ 1] << 8; 217 | case 1: k1 ^= tail[ 0] << 0; 218 | k1 *= c1; k1 = ROTL32(k1,15); k1 *= c2; h1 ^= k1; 219 | }; 220 | 221 | //---------- 222 | // finalization 223 | 224 | h1 ^= len; h2 ^= len; h3 ^= len; h4 ^= len; 225 | 226 | h1 += h2; h1 += h3; h1 += h4; 227 | h2 += h1; h3 += h1; h4 += h1; 228 | 229 | h1 = fmix32(h1); 230 | h2 = fmix32(h2); 231 | h3 = fmix32(h3); 232 | h4 = fmix32(h4); 233 | 234 | h1 += h2; h1 += h3; h1 += h4; 235 | h2 += h1; h3 += h1; h4 += h1; 236 | 237 | ((uint32_t*)out)[0] = h1; 238 | ((uint32_t*)out)[1] = h2; 239 | ((uint32_t*)out)[2] = h3; 240 | ((uint32_t*)out)[3] = h4; 241 | } 242 | 243 | //----------------------------------------------------------------------------- 244 | 245 | void MurmurHash3_x64_128 ( const void * key, const int len, 246 | const uint32_t seed, void * out ) 247 | { 248 | const uint8_t * data = (const uint8_t*)key; 249 | const int nblocks = len / 16; 250 | 251 | uint64_t h1 = seed; 252 | uint64_t h2 = seed; 253 | 254 | const uint64_t c1 = BIG_CONSTANT(0x87c37b91114253d5); 255 | const uint64_t c2 = BIG_CONSTANT(0x4cf5ad432745937f); 256 | 257 | //---------- 258 | // body 259 | 260 | const uint64_t * blocks = (const uint64_t *)(data); 261 | 262 | for(int i = 0; i < nblocks; i++) 263 | { 264 | uint64_t k1 = getblock64(blocks,i*2+0); 265 | uint64_t k2 = getblock64(blocks,i*2+1); 266 | 267 | k1 *= c1; k1 = ROTL64(k1,31); k1 *= c2; h1 ^= k1; 268 | 269 | h1 = ROTL64(h1,27); h1 += h2; h1 = h1*5+0x52dce729; 270 | 271 | k2 *= c2; k2 = ROTL64(k2,33); k2 *= c1; h2 ^= k2; 272 | 273 | h2 = ROTL64(h2,31); h2 += h1; h2 = h2*5+0x38495ab5; 274 | } 275 | 276 | //---------- 277 | // tail 278 | 279 | const uint8_t * tail = (const uint8_t*)(data + nblocks*16); 280 | 281 | uint64_t k1 = 0; 282 | uint64_t k2 = 0; 283 | 284 | switch(len & 15) 285 | { 286 | case 15: k2 ^= ((uint64_t)tail[14]) << 48; 287 | case 14: k2 ^= ((uint64_t)tail[13]) << 40; 288 | case 13: k2 ^= ((uint64_t)tail[12]) << 32; 289 | case 12: k2 ^= ((uint64_t)tail[11]) << 24; 290 | case 11: k2 ^= ((uint64_t)tail[10]) << 16; 291 | case 10: k2 ^= ((uint64_t)tail[ 9]) << 8; 292 | case 9: k2 ^= ((uint64_t)tail[ 8]) << 0; 293 | k2 *= c2; k2 = ROTL64(k2,33); k2 *= c1; h2 ^= k2; 294 | 295 | case 8: k1 ^= ((uint64_t)tail[ 7]) << 56; 296 | case 7: k1 ^= ((uint64_t)tail[ 6]) << 48; 297 | case 6: k1 ^= ((uint64_t)tail[ 5]) << 40; 298 | case 5: k1 ^= ((uint64_t)tail[ 4]) << 32; 299 | case 4: k1 ^= ((uint64_t)tail[ 3]) << 24; 300 | case 3: k1 ^= ((uint64_t)tail[ 2]) << 16; 301 | case 2: k1 ^= ((uint64_t)tail[ 1]) << 8; 302 | case 1: k1 ^= ((uint64_t)tail[ 0]) << 0; 303 | k1 *= c1; k1 = ROTL64(k1,31); k1 *= c2; h1 ^= k1; 304 | }; 305 | 306 | //---------- 307 | // finalization 308 | 309 | h1 ^= len; h2 ^= len; 310 | 311 | h1 += h2; 312 | h2 += h1; 313 | 314 | h1 = fmix64(h1); 315 | h2 = fmix64(h2); 316 | 317 | h1 += h2; 318 | h2 += h1; 319 | 320 | ((uint64_t*)out)[0] = h1; 321 | ((uint64_t*)out)[1] = h2; 322 | } 323 | 324 | #endif // _MURMURHASH3_H_ 325 | -------------------------------------------------------------------------------- /test/unitTest/test_label_vector.cpp: -------------------------------------------------------------------------------- 1 | #include "gtest/gtest.h" 2 | 3 | #include 4 | 5 | #include 6 | #include 7 | #include 8 | 9 | #include "config.hpp" 10 | #include "label_vector.hpp" 11 | #include "surf_builder.hpp" 12 | 13 | namespace surf { 14 | 15 | namespace labelvectortest { 16 | 17 | static const std::string 
kFilePath = "../../../test/words.txt"; 18 | static const int kTestSize = 234369; 19 | static std::vector words; 20 | 21 | class LabelVectorUnitTest : public ::testing::Test { 22 | public: 23 | virtual void SetUp () { 24 | bool include_dense = false; 25 | uint32_t sparse_dense_ratio = 0; 26 | level_t suffix_len = 8; 27 | builder_ = new SuRFBuilder(include_dense, sparse_dense_ratio, kReal, 0, suffix_len); 28 | data_ = nullptr; 29 | } 30 | virtual void TearDown () { 31 | delete builder_; 32 | if (data_) 33 | delete[] data_; 34 | } 35 | 36 | void setupWordsTest(); 37 | void testSerialize(); 38 | void testSearch(); 39 | 40 | SuRFBuilder* builder_; 41 | LabelVector* labels_; 42 | char* data_; 43 | }; 44 | 45 | void LabelVectorUnitTest::setupWordsTest() { 46 | builder_->build(words); 47 | labels_ = new LabelVector(builder_->getLabels()); 48 | } 49 | 50 | void LabelVectorUnitTest::testSerialize() { 51 | uint64_t size = labels_->serializedSize(); 52 | ASSERT_TRUE((labels_->size() - size) >= 0); 53 | data_ = new char[size]; 54 | LabelVector* ori_labels = labels_; 55 | char* data = data_; 56 | ori_labels->serialize(data); 57 | data = data_; 58 | labels_ = LabelVector::deSerialize(data); 59 | 60 | ASSERT_EQ(ori_labels->getNumBytes(), labels_->getNumBytes()); 61 | 62 | for (position_t i = 0; i < ori_labels->getNumBytes(); i++) { 63 | label_t ori_label = ori_labels->read(i); 64 | label_t label = labels_->read(i); 65 | ASSERT_EQ(ori_label, label); 66 | } 67 | 68 | ori_labels->destroy(); 69 | delete ori_labels; 70 | } 71 | 72 | void LabelVectorUnitTest::testSearch() { 73 | position_t start_pos = 0; 74 | position_t search_len = 0; 75 | for (level_t level = 0; level < builder_->getTreeHeight(); level++) { 76 | for (position_t pos = 0; pos < builder_->getLabels()[level].size(); pos++) { 77 | bool louds_bit = SuRFBuilder::readBit(builder_->getLoudsBits()[level], pos); 78 | if (louds_bit) { 79 | position_t search_pos; 80 | bool search_success; 81 | for (position_t i = start_pos; i < start_pos + search_len; i++) { 82 | label_t test_label = labels_->read(i); 83 | if (i == start_pos && test_label == kTerminator && search_len > 1) 84 | continue; 85 | // search success 86 | search_pos = start_pos; 87 | search_success = labels_->search(test_label, search_pos, search_len); 88 | ASSERT_TRUE(search_success); 89 | ASSERT_EQ(i, search_pos); 90 | } 91 | // search fail 92 | search_pos = start_pos; 93 | search_success = labels_->search('\0', search_pos, search_len); 94 | ASSERT_FALSE(search_success); 95 | search_pos = start_pos; 96 | search_success = labels_->search('\255', search_pos, search_len); 97 | ASSERT_FALSE(search_success); 98 | 99 | start_pos += search_len; 100 | search_len = 0; 101 | } 102 | search_len++; 103 | } 104 | } 105 | } 106 | 107 | TEST_F (LabelVectorUnitTest, readTest) { 108 | setupWordsTest(); 109 | position_t lv_pos = 0; 110 | for (level_t level = 0; level < builder_->getTreeHeight(); level++) { 111 | for (position_t pos = 0; pos < builder_->getLabels()[level].size(); pos++) { 112 | label_t expected_label = builder_->getLabels()[level][pos]; 113 | label_t label = labels_->read(lv_pos); 114 | ASSERT_EQ(expected_label, label); 115 | lv_pos++; 116 | } 117 | } 118 | labels_->destroy(); 119 | delete labels_; 120 | } 121 | 122 | TEST_F (LabelVectorUnitTest, searchAlgTest) { 123 | setupWordsTest(); 124 | position_t start_pos = 0; 125 | position_t search_len = 0; 126 | for (level_t level = 0; level < builder_->getTreeHeight(); level++) { 127 | for (position_t pos = 0; pos < 
builder_->getLabels()[level].size(); pos++) { 128 | bool louds_bit = SuRFBuilder::readBit(builder_->getLoudsBits()[level], pos); 129 | if (louds_bit) { 130 | position_t binary_search_pos, simd_search_pos, linear_search_pos; 131 | bool binary_search_success, simd_search_success, linear_search_success; 132 | for (position_t i = start_pos; i < start_pos + search_len; i++) { 133 | // binary search success 134 | binary_search_pos = start_pos; 135 | binary_search_success = labels_->binarySearch(labels_->read(i), binary_search_pos, search_len); 136 | ASSERT_TRUE(binary_search_success); 137 | ASSERT_EQ(i, binary_search_pos); 138 | 139 | // simd search success 140 | simd_search_pos = start_pos; 141 | simd_search_success = labels_->simdSearch(labels_->read(i), simd_search_pos, search_len); 142 | ASSERT_TRUE(simd_search_success); 143 | ASSERT_EQ(i, simd_search_pos); 144 | 145 | // linear search success 146 | linear_search_pos = start_pos; 147 | linear_search_success = labels_->linearSearch(labels_->read(i), linear_search_pos, search_len); 148 | ASSERT_TRUE(linear_search_success); 149 | ASSERT_EQ(i, linear_search_pos); 150 | } 151 | // binary search fail 152 | binary_search_pos = start_pos; 153 | binary_search_success = labels_->binarySearch('\0', binary_search_pos, search_len); 154 | ASSERT_FALSE(binary_search_success); 155 | binary_search_pos = start_pos; 156 | binary_search_success = labels_->binarySearch('\255', binary_search_pos, search_len); 157 | ASSERT_FALSE(binary_search_success); 158 | 159 | // simd search fail 160 | simd_search_pos = start_pos; 161 | simd_search_success = labels_->simdSearch('\0', simd_search_pos, search_len); 162 | ASSERT_FALSE(simd_search_success); 163 | simd_search_pos = start_pos; 164 | simd_search_success = labels_->simdSearch('\255', simd_search_pos, search_len); 165 | ASSERT_FALSE(simd_search_success); 166 | 167 | // linear search fail 168 | linear_search_pos = start_pos; 169 | linear_search_success = labels_->linearSearch('\0', linear_search_pos, search_len); 170 | ASSERT_FALSE(linear_search_success); 171 | linear_search_pos = start_pos; 172 | linear_search_success = labels_->linearSearch('\255', linear_search_pos, search_len); 173 | ASSERT_FALSE(linear_search_success); 174 | 175 | start_pos += search_len; 176 | search_len = 0; 177 | } 178 | 179 | if (builder_->getLabels()[level][pos] == kTerminator 180 | && !SuRFBuilder::readBit(builder_->getChildIndicatorBits()[level], pos)) 181 | start_pos++; 182 | else 183 | search_len++; 184 | } 185 | } 186 | labels_->destroy(); 187 | delete labels_; 188 | } 189 | 190 | TEST_F (LabelVectorUnitTest, searchTest) { 191 | setupWordsTest(); 192 | testSearch(); 193 | labels_->destroy(); 194 | delete labels_; 195 | } 196 | 197 | TEST_F (LabelVectorUnitTest, serializeTest) { 198 | setupWordsTest(); 199 | testSerialize(); 200 | testSearch(); 201 | } 202 | 203 | TEST_F (LabelVectorUnitTest, searchGreaterThanTest) { 204 | setupWordsTest(); 205 | position_t start_pos = 0; 206 | position_t search_len = 0; 207 | for (level_t level = 0; level < builder_->getTreeHeight(); level++) { 208 | for (position_t pos = 0; pos < builder_->getLabels()[level].size(); pos++) { 209 | bool louds_bit = SuRFBuilder::readBit(builder_->getLoudsBits()[level], pos); 210 | if (louds_bit) { 211 | position_t search_pos; 212 | position_t terminator_offset = 0; 213 | bool search_success; 214 | for (position_t i = start_pos; i < start_pos + search_len; i++) { 215 | label_t cur_label = labels_->read(i); 216 | if (i == start_pos && cur_label == kTerminator && search_len > 
1) { 217 | terminator_offset = 1; 218 | continue; 219 | } 220 | 221 | if (i < start_pos + search_len - 1) { 222 | label_t next_label = labels_->read(i+1); 223 | // search existing label 224 | search_pos = start_pos; 225 | search_success = labels_->searchGreaterThan(cur_label, search_pos, search_len); 226 | ASSERT_TRUE(search_success); 227 | ASSERT_EQ(i+1, search_pos); 228 | 229 | // search midpoint (could be non-existing label) 230 | label_t test_label = cur_label + ((next_label - cur_label) / 2); 231 | search_pos = start_pos; 232 | search_success = labels_->searchGreaterThan(test_label, search_pos, search_len); 233 | ASSERT_TRUE(search_success); 234 | ASSERT_EQ(i+1, search_pos); 235 | } else { 236 | // search out-of-bound label 237 | search_pos = start_pos; 238 | search_success = labels_->searchGreaterThan(labels_->read(start_pos + search_len - 1), search_pos, search_len); 239 | ASSERT_FALSE(search_success); 240 | ASSERT_EQ(start_pos + terminator_offset, search_pos); 241 | } 242 | } 243 | start_pos += search_len; 244 | search_len = 0; 245 | } 246 | search_len++; 247 | } 248 | } 249 | } 250 | 251 | void loadWordList() { 252 | std::ifstream infile(kFilePath); 253 | std::string key; 254 | int count = 0; 255 | while (infile.good() && count < kTestSize) { 256 | infile >> key; 257 | words.push_back(key); 258 | count++; 259 | } 260 | } 261 | 262 | } // namespace labelvectortest 263 | 264 | } // namespace surf 265 | 266 | int main (int argc, char** argv) { 267 | ::testing::InitGoogleTest(&argc, argv); 268 | surf::labelvectortest::loadWordList(); 269 | return RUN_ALL_TESTS(); 270 | } 271 | -------------------------------------------------------------------------------- /bench/workload_multi_thread.cpp: -------------------------------------------------------------------------------- 1 | #include "bench.hpp" 2 | #include "filter_factory.hpp" 3 | 4 | //#define VERBOSE 1 5 | 6 | static std::vector txn_keys; 7 | static std::vector upper_bound_keys; 8 | 9 | typedef struct ThreadArg { 10 | int thread_id; 11 | bench::Filter* filter; 12 | int start_pos; 13 | int end_pos; 14 | int query_type; 15 | int64_t out_positives; 16 | double tput; 17 | } ThreadArg; 18 | 19 | void* execute_workload(void* arg) { 20 | ThreadArg* thread_arg = (ThreadArg*)arg; 21 | int64_t positives = 0; 22 | double start_time = bench::getNow(); 23 | if (thread_arg->query_type == 0) { // point 24 | for (int i = thread_arg->start_pos; i < thread_arg->end_pos; i++) 25 | positives += (int)thread_arg->filter->lookup(txn_keys[i]); 26 | } else { // range 27 | for (int i = thread_arg->start_pos; i < thread_arg->end_pos; i++) 28 | positives += (int)thread_arg->filter->lookupRange(txn_keys[i], 29 | upper_bound_keys[i]); 30 | } 31 | double end_time = bench::getNow(); 32 | double tput = (thread_arg->end_pos - thread_arg->start_pos) / (end_time - start_time) / 1000000; // Mops/sec 33 | 34 | #ifdef VERBOSE 35 | std::cout << "Thread #" << thread_arg->thread_id << bench::kGreen 36 | << ": Throughput = " << bench::kNoColor << tput << "\n"; 37 | #else 38 | std::cout << tput << "\n"; 39 | #endif 40 | 41 | thread_arg->out_positives = positives; 42 | thread_arg->tput = tput; 43 | pthread_exit(NULL); 44 | return NULL; 45 | } 46 | 47 | int main(int argc, char *argv[]) { 48 | if (argc != 10) { 49 | std::cout << "Usage:\n"; 50 | std::cout << "1. filter type: SuRF, SuRFHash, SuRFReal, Bloom\n"; 51 | std::cout << "2. suffix length: 0 < len <= 64 (for SuRFHash and SuRFReal only)\n"; 52 | std::cout << "3. 
workload type: mixed, alterByte (only for email key)\n"; 53 | std::cout << "4. percentage of keys inserted: 0 < num <= 100\n"; 54 | std::cout << "5. byte position (conting from last, only for alterByte): num\n"; 55 | std::cout << "6. key type: randint, email\n"; 56 | std::cout << "7. query type: point, range\n"; 57 | std::cout << "8. distribution: uniform, zipfian, latest\n"; 58 | std::cout << "9. number of threads\n"; 59 | return -1; 60 | } 61 | 62 | std::string filter_type = argv[1]; 63 | uint32_t suffix_len = (uint32_t)atoi(argv[2]); 64 | std::string workload_type = argv[3]; 65 | unsigned percent = atoi(argv[4]); 66 | unsigned byte_pos = atoi(argv[5]); 67 | std::string key_type = argv[6]; 68 | std::string query_type = argv[7]; 69 | std::string distribution = argv[8]; 70 | int num_threads = atoi(argv[9]); 71 | 72 | // check args ==================================================== 73 | if (filter_type.compare(std::string("SuRF")) != 0 74 | && filter_type.compare(std::string("SuRFHash")) != 0 75 | && filter_type.compare(std::string("SuRFReal")) != 0 76 | && filter_type.compare(std::string("Bloom")) != 0 77 | && filter_type.compare(std::string("ARF")) != 0) { 78 | std::cout << bench::kRed << "WRONG filter type\n" << bench::kNoColor; 79 | return -1; 80 | } 81 | 82 | if (suffix_len == 0 || suffix_len > 64) { 83 | std::cout << bench::kRed << "WRONG suffix length\n" << bench::kNoColor; 84 | return -1; 85 | } 86 | 87 | if (workload_type.compare(std::string("mixed")) != 0 88 | && workload_type.compare(std::string("alterByte")) == 0) { 89 | std::cout << bench::kRed << "WRONG workload type\n" << bench::kNoColor; 90 | return -1; 91 | } 92 | 93 | if (percent > 100) { 94 | std::cout << bench::kRed << "WRONG percentage\n" << bench::kNoColor; 95 | return -1; 96 | } 97 | 98 | if (key_type.compare(std::string("randint")) != 0 99 | && key_type.compare(std::string("timestamp")) != 0 100 | && key_type.compare(std::string("email")) != 0) { 101 | std::cout << bench::kRed << "WRONG key type\n" << bench::kNoColor; 102 | return -1; 103 | } 104 | 105 | if (query_type.compare(std::string("point")) != 0 106 | && query_type.compare(std::string("range")) != 0) { 107 | std::cout << bench::kRed << "WRONG query type\n" << bench::kNoColor; 108 | return -1; 109 | } 110 | 111 | if (distribution.compare(std::string("uniform")) != 0 112 | && distribution.compare(std::string("zipfian")) != 0 113 | && distribution.compare(std::string("latest")) != 0) { 114 | std::cout << bench::kRed << "WRONG distribution\n" << bench::kNoColor; 115 | return -1; 116 | } 117 | 118 | // load keys from files ======================================= 119 | std::string load_file = "workloads/load_"; 120 | load_file += key_type; 121 | std::vector load_keys; 122 | if (key_type.compare(std::string("email")) == 0) 123 | bench::loadKeysFromFile(load_file, false, load_keys); 124 | else 125 | bench::loadKeysFromFile(load_file, true, load_keys); 126 | 127 | std::string txn_file = "workloads/txn_"; 128 | txn_file += key_type; 129 | txn_file += "_"; 130 | txn_file += distribution; 131 | 132 | if (key_type.compare(std::string("email")) == 0) 133 | bench::loadKeysFromFile(txn_file, false, txn_keys); 134 | else 135 | bench::loadKeysFromFile(txn_file, true, txn_keys); 136 | 137 | std::vector insert_keys; 138 | bench::selectKeysToInsert(percent, insert_keys, load_keys); 139 | 140 | if (workload_type.compare(std::string("alterByte")) == 0) 141 | bench::modifyKeyByte(txn_keys, byte_pos); 142 | 143 | // compute upperbound keys for range queries ================= 144 | 
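// Added note: bench::getUpperBoundKey is defined in bench.hpp (not shown here).
// Judging by the inline logic of the single-threaded benchmark in workload.cpp,
// it presumably behaves roughly like the sketch below -- an assumption, not the
// actual implementation:
//
//   std::string getUpperBoundKey(const std::string& key_type, const std::string& key) {
//       if (key_type == "email") {
//           std::string ub = key;
//           ub[ub.size() - 1] += (char)kEmailRangeSize;   // bump the last byte
//           return ub;
//       }
//       return uint64ToString(stringToUint64(key) + kIntRangeSize);  // randint keys
//   }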
if (query_type.compare(std::string("range")) == 0) { 145 | for (int i = 0; i < (int)txn_keys.size(); i++) 146 | upper_bound_keys.push_back(bench::getUpperBoundKey(key_type, txn_keys[i])); 147 | } 148 | 149 | // create filter ============================================== 150 | bench::Filter* filter = bench::FilterFactory::createFilter(filter_type, suffix_len, insert_keys); 151 | 152 | #ifdef VERBOSE 153 | std::cout << bench::kGreen << "Memory = " << bench::kNoColor << filter->getMemoryUsage() << std::endl; 154 | #endif 155 | 156 | // execute transactions ======================================= 157 | pthread_t* threads = new pthread_t[num_threads]; 158 | pthread_attr_t attr; 159 | // Initialize and set thread joinable 160 | pthread_attr_init(&attr); 161 | pthread_attr_setdetachstate(&attr, PTHREAD_CREATE_JOINABLE); 162 | 163 | ThreadArg* thread_args = new ThreadArg[num_threads]; 164 | int num_txns = (int)txn_keys.size(); 165 | int num_txns_per_thread = num_txns / num_threads; 166 | for (int i = 0; i < num_threads; i++) { 167 | thread_args[i].thread_id = i; 168 | thread_args[i].filter = filter; 169 | thread_args[i].start_pos = num_txns_per_thread * i; 170 | thread_args[i].end_pos = num_txns_per_thread * (i + 1); 171 | if (query_type.compare(std::string("point")) == 0) 172 | thread_args[i].query_type = 0; 173 | else 174 | thread_args[i].query_type = 1; 175 | thread_args[i].out_positives = 0; 176 | thread_args[i].tput = 0; 177 | } 178 | 179 | for (int i = 0; i < num_threads; i++) { 180 | int rc = pthread_create(&threads[i], NULL, execute_workload, (void*)(&thread_args[i])); 181 | if (rc) { 182 | std::cout << "Error: unable to create thread " << rc << std::endl; 183 | exit(-1); 184 | } 185 | } 186 | 187 | // free attribute and wait for the other threads 188 | pthread_attr_destroy(&attr); 189 | for (int i = 0; i < num_threads; i++) { 190 | void* status; 191 | int rc = pthread_join(threads[i], &status); 192 | if (rc) { 193 | std::cout << "Error:unable to join " << rc << endl; 194 | exit(-1); 195 | } 196 | } 197 | 198 | double tput = 0; 199 | for (int i = 0; i < num_threads; i++) { 200 | tput += thread_args[i].tput; 201 | } 202 | 203 | #ifdef VERBOSE 204 | std::cout << bench::kGreen << "Throughput = " << bench::kNoColor << tput << "\n"; 205 | 206 | int positives = 0; 207 | for (int i = 0; i < num_threads; i++) { 208 | positives += (thread_args[i].out_positives); 209 | } 210 | 211 | // compute true positives ====================================== 212 | std::map ht; 213 | for (int i = 0; i < (int)insert_keys.size(); i++) 214 | ht[insert_keys[i]] = true; 215 | 216 | int64_t true_positives = 0; 217 | std::map::iterator ht_iter; 218 | if (query_type.compare(std::string("point")) == 0) { 219 | for (int i = 0; i < (int)txn_keys.size(); i++) { 220 | ht_iter = ht.find(txn_keys[i]); 221 | true_positives += (ht_iter != ht.end()); 222 | } 223 | } else if (query_type.compare(std::string("range")) == 0) { 224 | for (int i = 0; i < (int)txn_keys.size(); i++) { 225 | ht_iter = ht.upper_bound(txn_keys[i]); 226 | if (ht_iter != ht.end()) { 227 | std::string fetched_key = ht_iter->first; 228 | true_positives += (fetched_key.compare(upper_bound_keys[i]) < 0); 229 | } 230 | } 231 | } 232 | int64_t false_positives = positives - true_positives; 233 | assert(false_positives >= 0); 234 | int64_t true_negatives = txn_keys.size() - true_positives; 235 | double fp_rate = 0; 236 | if (false_positives > 0) 237 | fp_rate = false_positives / (true_negatives + false_positives + 0.0); 238 | 239 | std::cout << "positives = " << 
positives << "\n"; 240 | std::cout << "true positives = " << true_positives << "\n"; 241 | std::cout << "false positives = " << false_positives << "\n"; 242 | std::cout << "true negatives = " << true_negatives << "\n"; 243 | std::cout << bench::kGreen << "False Positive Rate = " << bench::kNoColor << fp_rate << "\n"; 244 | #else 245 | std::cout << tput << "\n"; 246 | std::cout << bench::kGreen << bench::kNoColor << "\n\n"; 247 | #endif 248 | 249 | delete[] threads; 250 | delete[] thread_args; 251 | 252 | pthread_exit(NULL); 253 | return 0; 254 | } 255 | -------------------------------------------------------------------------------- /include/suffix.hpp: -------------------------------------------------------------------------------- 1 | #ifndef SUFFIX_H_ 2 | #define SUFFIX_H_ 3 | 4 | #include "bitvector.hpp" 5 | 6 | #include 7 | 8 | #include 9 | 10 | #include "config.hpp" 11 | #include "hash.hpp" 12 | 13 | namespace surf { 14 | 15 | // Max suffix_len_ = 64 bits 16 | // For kReal suffixes, if the stored key is not long enough to provide 17 | // suffix_len_ suffix bits, its suffix field is cleared (i.e., all 0's) 18 | // to indicate that there is no suffix info associated with the key. 19 | class BitvectorSuffix : public Bitvector { 20 | public: 21 | BitvectorSuffix() : type_(kNone), hash_suffix_len_(0), real_suffix_len_(0) {}; 22 | 23 | BitvectorSuffix(const SuffixType type, 24 | const level_t hash_suffix_len, const level_t real_suffix_len, 25 | const std::vector >& bitvector_per_level, 26 | const std::vector& num_bits_per_level, 27 | const level_t start_level = 0, 28 | level_t end_level = 0/* non-inclusive */) 29 | : Bitvector(bitvector_per_level, num_bits_per_level, start_level, end_level) { 30 | assert((hash_suffix_len + real_suffix_len) <= kWordSize); 31 | type_ = type; 32 | hash_suffix_len_ = hash_suffix_len; 33 | real_suffix_len_ = real_suffix_len; 34 | } 35 | 36 | static word_t constructHashSuffix(const std::string& key, const level_t len) { 37 | word_t suffix = suffixHash(key); 38 | suffix <<= (kWordSize - len - kHashShift); 39 | suffix >>= (kWordSize - len); 40 | return suffix; 41 | } 42 | 43 | static word_t constructRealSuffix(const std::string& key, 44 | const level_t level, const level_t len) { 45 | if (key.length() < level || ((key.length() - level) * 8) < len) 46 | return 0; 47 | word_t suffix = 0; 48 | level_t num_complete_bytes = len / 8; 49 | if (num_complete_bytes > 0) { 50 | suffix += (word_t)(label_t)key[level]; 51 | for (position_t i = 1; i < num_complete_bytes; i++) { 52 | suffix <<= 8; 53 | suffix += (word_t)(uint8_t)key[level + i]; 54 | } 55 | } 56 | level_t offset = len % 8; 57 | if (offset > 0) { 58 | suffix <<= offset; 59 | word_t remaining_bits = 0; 60 | remaining_bits = (word_t)(uint8_t)key[level + num_complete_bytes]; 61 | remaining_bits >>= (8 - offset); 62 | suffix += remaining_bits; 63 | } 64 | return suffix; 65 | } 66 | 67 | static word_t constructMixedSuffix(const std::string& key, const level_t hash_len, 68 | const level_t real_level, const level_t real_len) { 69 | word_t hash_suffix = constructHashSuffix(key, hash_len); 70 | word_t real_suffix = constructRealSuffix(key, real_level, real_len); 71 | word_t suffix = hash_suffix; 72 | suffix <<= real_len; 73 | suffix |= real_suffix; 74 | return suffix; 75 | } 76 | 77 | static word_t constructSuffix(const SuffixType type, const std::string& key, 78 | const level_t hash_len, 79 | const level_t real_level, const level_t real_len) { 80 | switch (type) { 81 | case kHash: 82 | return constructHashSuffix(key, 
hash_len); 83 | case kReal: 84 | return constructRealSuffix(key, real_level, real_len); 85 | case kMixed: 86 | return constructMixedSuffix(key, hash_len, real_level, real_len); 87 | default: 88 | return 0; 89 | } 90 | } 91 | 92 | static word_t extractHashSuffix(const word_t suffix, const level_t real_suffix_len) { 93 | return (suffix >> real_suffix_len); 94 | } 95 | 96 | static word_t extractRealSuffix(const word_t suffix, const level_t real_suffix_len) { 97 | word_t real_suffix_mask = 1; 98 | real_suffix_mask <<= real_suffix_len; 99 | real_suffix_mask--; 100 | return (suffix & real_suffix_mask); 101 | } 102 | 103 | SuffixType getType() const { 104 | return type_; 105 | } 106 | 107 | level_t getSuffixLen() const { 108 | return hash_suffix_len_ + real_suffix_len_; 109 | } 110 | 111 | level_t getHashSuffixLen() const { 112 | return hash_suffix_len_; 113 | } 114 | 115 | level_t getRealSuffixLen() const { 116 | return real_suffix_len_; 117 | } 118 | 119 | position_t serializedSize() const { 120 | position_t size = sizeof(num_bits_) + sizeof(type_) 121 | + sizeof(hash_suffix_len_) + sizeof(real_suffix_len_) + bitsSize(); 122 | sizeAlign(size); 123 | return size; 124 | } 125 | 126 | position_t size() const { 127 | return (sizeof(BitvectorSuffix) + bitsSize()); 128 | } 129 | 130 | word_t read(const position_t idx) const; 131 | word_t readReal(const position_t idx) const; 132 | bool checkEquality(const position_t idx, const std::string& key, const level_t level) const; 133 | 134 | // Compare stored suffix to querying suffix. 135 | // kReal suffix type only. 136 | int compare(const position_t idx, const std::string& key, const level_t level) const; 137 | 138 | void serialize(char*& dst) const { 139 | memcpy(dst, &num_bits_, sizeof(num_bits_)); 140 | dst += sizeof(num_bits_); 141 | memcpy(dst, &type_, sizeof(type_)); 142 | dst += sizeof(type_); 143 | memcpy(dst, &hash_suffix_len_, sizeof(hash_suffix_len_)); 144 | dst += sizeof(hash_suffix_len_); 145 | memcpy(dst, &real_suffix_len_, sizeof(real_suffix_len_)); 146 | dst += sizeof(real_suffix_len_); 147 | if (type_ != kNone) { 148 | memcpy(dst, bits_, bitsSize()); 149 | dst += bitsSize(); 150 | } 151 | align(dst); 152 | } 153 | 154 | static BitvectorSuffix* deSerialize(char*& src) { 155 | BitvectorSuffix* sv = new BitvectorSuffix(); 156 | memcpy(&(sv->num_bits_), src, sizeof(sv->num_bits_)); 157 | src += sizeof(sv->num_bits_); 158 | memcpy(&(sv->type_), src, sizeof(sv->type_)); 159 | src += sizeof(sv->type_); 160 | memcpy(&(sv->hash_suffix_len_), src, sizeof(sv->hash_suffix_len_)); 161 | src += sizeof(sv->hash_suffix_len_); 162 | memcpy(&(sv->real_suffix_len_), src, sizeof(sv->real_suffix_len_)); 163 | src += sizeof(sv->real_suffix_len_); 164 | if (sv->type_ != kNone) { 165 | sv->bits_ = new word_t[sv->numWords()]; 166 | memcpy(sv->bits_, src, sv->bitsSize()); 167 | src += sv->bitsSize(); 168 | 169 | //sv->bits_ = const_cast(reinterpret_cast(src)); 170 | //src += sv->bitsSize(); 171 | } 172 | align(src); 173 | return sv; 174 | } 175 | 176 | void destroy() { 177 | if (type_ != kNone) 178 | delete[] bits_; 179 | } 180 | 181 | private: 182 | SuffixType type_; 183 | level_t hash_suffix_len_; // in bits 184 | level_t real_suffix_len_; // in bits 185 | }; 186 | 187 | word_t BitvectorSuffix::read(const position_t idx) const { 188 | if (type_ == kNone) 189 | return 0; 190 | 191 | level_t suffix_len = getSuffixLen(); 192 | if (idx * suffix_len >= num_bits_) 193 | return 0; 194 | 195 | position_t bit_pos = idx * suffix_len; 196 | position_t word_id = bit_pos / 
kWordSize; 197 | position_t offset = bit_pos & (kWordSize - 1); 198 | word_t ret_word = (bits_[word_id] << offset) >> (kWordSize - suffix_len); 199 | if (offset + suffix_len > kWordSize) 200 | ret_word += (bits_[word_id+1] >> (kWordSize - offset - suffix_len)); 201 | return ret_word; 202 | } 203 | 204 | word_t BitvectorSuffix::readReal(const position_t idx) const { 205 | return extractRealSuffix(read(idx), real_suffix_len_); 206 | } 207 | 208 | bool BitvectorSuffix::checkEquality(const position_t idx, 209 | const std::string& key, const level_t level) const { 210 | if (type_ == kNone) 211 | return true; 212 | if (idx * getSuffixLen() >= num_bits_) 213 | return false; 214 | 215 | word_t stored_suffix = read(idx); 216 | if (type_ == kReal) { 217 | // if no suffix info for the stored key 218 | if (stored_suffix == 0) 219 | return true; 220 | // if the querying key is shorter than the stored key 221 | if (key.length() < level || ((key.length() - level) * 8) < real_suffix_len_) 222 | return false; 223 | } 224 | word_t querying_suffix 225 | = constructSuffix(type_, key, hash_suffix_len_, level, real_suffix_len_); 226 | return (stored_suffix == querying_suffix); 227 | } 228 | 229 | // If no real suffix is stored for the key, compare returns 0. 230 | // int BitvectorSuffix::compare(const position_t idx, 231 | // const std::string& key, const level_t level) const { 232 | // if ((type_ == kNone) || (type_ == kHash) || (idx * getSuffixLen() >= num_bits_)) 233 | // return 0; 234 | // word_t stored_suffix = read(idx); 235 | // word_t querying_suffix = constructRealSuffix(key, level, real_suffix_len_); 236 | // if (type_ == kMixed) 237 | // stored_suffix = extractRealSuffix(stored_suffix, real_suffix_len_); 238 | 239 | // if (stored_suffix == 0) 240 | // return 0; 241 | // if (stored_suffix < querying_suffix) 242 | // return -1; 243 | // else if (stored_suffix == querying_suffix) 244 | // return 0; 245 | // else 246 | // return 1; 247 | // } 248 | 249 | int BitvectorSuffix::compare(const position_t idx, 250 | const std::string& key, const level_t level) const { 251 | if ((idx * getSuffixLen() >= num_bits_) || (type_ == kNone) || (type_ == kHash)) 252 | return kCouldBePositive; 253 | 254 | word_t stored_suffix = read(idx); 255 | word_t querying_suffix = constructRealSuffix(key, level, real_suffix_len_); 256 | if (type_ == kMixed) 257 | stored_suffix = extractRealSuffix(stored_suffix, real_suffix_len_); 258 | 259 | if ((stored_suffix == 0) && (querying_suffix == 0)) 260 | return kCouldBePositive; 261 | else if ((stored_suffix == 0) || (stored_suffix < querying_suffix)) 262 | return -1; 263 | else if (stored_suffix == querying_suffix) 264 | return kCouldBePositive; 265 | else 266 | return 1; 267 | } 268 | 269 | } // namespace surf 270 | 271 | #endif // SUFFIXVECTOR_H_ 272 | -------------------------------------------------------------------------------- /CodeCoverage.cmake: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2012 - 2017, Lars Bilke 2 | # All rights reserved. 3 | # 4 | # Redistribution and use in source and binary forms, with or without modification, 5 | # are permitted provided that the following conditions are met: 6 | # 7 | # 1. Redistributions of source code must retain the above copyright notice, this 8 | # list of conditions and the following disclaimer. 9 | # 10 | # 2. 
Redistributions in binary form must reproduce the above copyright notice, 11 | # this list of conditions and the following disclaimer in the documentation 12 | # and/or other materials provided with the distribution. 13 | # 14 | # 3. Neither the name of the copyright holder nor the names of its contributors 15 | # may be used to endorse or promote products derived from this software without 16 | # specific prior written permission. 17 | # 18 | # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 19 | # ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 20 | # WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 21 | # DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR 22 | # ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 23 | # (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 24 | # LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON 25 | # ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 26 | # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 27 | # SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 28 | # 29 | # USAGE: 30 | # 31 | # 1. Copy this file into your cmake modules path. 32 | # 33 | # 2. Add the following line to your CMakeLists.txt: 34 | # include(CodeCoverage) 35 | # 36 | # 3. Append necessary compiler flags: 37 | # APPEND_COVERAGE_COMPILER_FLAGS() 38 | # 39 | # 4. If you need to exclude additional directories from the report, specify them 40 | # using the COVERAGE_EXCLUDES variable before calling SETUP_TARGET_FOR_COVERAGE. 41 | # Example: 42 | # set(COVERAGE_EXCLUDES 'dir1/*' 'dir2/*') 43 | # 44 | # 5. Use the functions described below to create a custom make target which 45 | # runs your test executable and produces a code coverage report. 46 | # 47 | # 6. Build a Debug build: 48 | # cmake -DCMAKE_BUILD_TYPE=Debug .. 49 | # make 50 | # make my_coverage_target 51 | # 52 | 53 | include(CMakeParseArguments) 54 | 55 | # Check prereqs 56 | find_program( GCOV_PATH gcov ) 57 | find_program( LCOV_PATH NAMES lcov lcov.bat lcov.exe lcov.perl) 58 | find_program( GENHTML_PATH NAMES genhtml genhtml.perl genhtml.bat ) 59 | find_program( GCOVR_PATH gcovr PATHS ${CMAKE_SOURCE_DIR}/scripts/test) 60 | find_program( SIMPLE_PYTHON_EXECUTABLE python ) 61 | 62 | if(NOT GCOV_PATH) 63 | message(FATAL_ERROR "gcov not found! Aborting...") 64 | endif() # NOT GCOV_PATH 65 | 66 | if("${CMAKE_CXX_COMPILER_ID}" MATCHES "(Apple)?[Cc]lang") 67 | if("${CMAKE_CXX_COMPILER_VERSION}" VERSION_LESS 3) 68 | message(FATAL_ERROR "Clang version must be 3.0.0 or greater! Aborting...") 69 | endif() 70 | elseif(NOT CMAKE_COMPILER_IS_GNUCXX) 71 | message(FATAL_ERROR "Compiler is not GNU gcc! Aborting...") 72 | endif() 73 | 74 | set(COVERAGE_COMPILER_FLAGS "-g -O0 --coverage -fprofile-arcs -ftest-coverage" 75 | CACHE INTERNAL "") 76 | 77 | set(CMAKE_CXX_FLAGS_COVERAGE 78 | ${COVERAGE_COMPILER_FLAGS} 79 | CACHE STRING "Flags used by the C++ compiler during coverage builds." 80 | FORCE ) 81 | set(CMAKE_C_FLAGS_COVERAGE 82 | ${COVERAGE_COMPILER_FLAGS} 83 | CACHE STRING "Flags used by the C compiler during coverage builds." 84 | FORCE ) 85 | set(CMAKE_EXE_LINKER_FLAGS_COVERAGE 86 | "" 87 | CACHE STRING "Flags used for linking binaries during coverage builds." 
88 | FORCE ) 89 | set(CMAKE_SHARED_LINKER_FLAGS_COVERAGE 90 | "" 91 | CACHE STRING "Flags used by the shared libraries linker during coverage builds." 92 | FORCE ) 93 | mark_as_advanced( 94 | CMAKE_CXX_FLAGS_COVERAGE 95 | CMAKE_C_FLAGS_COVERAGE 96 | CMAKE_EXE_LINKER_FLAGS_COVERAGE 97 | CMAKE_SHARED_LINKER_FLAGS_COVERAGE ) 98 | 99 | if(NOT CMAKE_BUILD_TYPE STREQUAL "Debug") 100 | message(WARNING "Code coverage results with an optimised (non-Debug) build may be misleading") 101 | endif() # NOT CMAKE_BUILD_TYPE STREQUAL "Debug" 102 | 103 | if(CMAKE_C_COMPILER_ID STREQUAL "GNU") 104 | link_libraries(gcov) 105 | else() 106 | set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} --coverage") 107 | endif() 108 | 109 | # Defines a target for running and collection code coverage information 110 | # Builds dependencies, runs the given executable and outputs reports. 111 | # NOTE! The executable should always have a ZERO as exit code otherwise 112 | # the coverage generation will not complete. 113 | # 114 | # SETUP_TARGET_FOR_COVERAGE( 115 | # NAME testrunner_coverage # New target name 116 | # EXECUTABLE testrunner -j ${PROCESSOR_COUNT} # Executable in PROJECT_BINARY_DIR 117 | # DEPENDENCIES testrunner # Dependencies to build first 118 | # ) 119 | function(SETUP_TARGET_FOR_COVERAGE) 120 | 121 | set(options NONE) 122 | set(oneValueArgs NAME) 123 | set(multiValueArgs EXECUTABLE EXECUTABLE_ARGS DEPENDENCIES) 124 | cmake_parse_arguments(Coverage "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) 125 | 126 | if(NOT LCOV_PATH) 127 | message(FATAL_ERROR "lcov not found! Aborting...") 128 | endif() # NOT LCOV_PATH 129 | 130 | if(NOT GENHTML_PATH) 131 | message(FATAL_ERROR "genhtml not found! Aborting...") 132 | endif() # NOT GENHTML_PATH 133 | 134 | # Setup target 135 | add_custom_target(${Coverage_NAME} 136 | 137 | # Cleanup lcov 138 | COMMAND ${LCOV_PATH} --directory . --zerocounters 139 | # Create baseline to make sure untouched files show up in the report 140 | COMMAND ${LCOV_PATH} -c -i -d . -o ${Coverage_NAME}.base 141 | 142 | # Run tests 143 | COMMAND ${Coverage_EXECUTABLE} 144 | 145 | # Capturing lcov counters and generating report 146 | COMMAND ${LCOV_PATH} --directory . --capture --output-file ${Coverage_NAME}.info 147 | # add baseline counters 148 | COMMAND ${LCOV_PATH} -a ${Coverage_NAME}.base -a ${Coverage_NAME}.info --output-file ${Coverage_NAME}.total 149 | COMMAND ${LCOV_PATH} --remove ${Coverage_NAME}.total ${COVERAGE_EXCLUDES} --output-file ${PROJECT_BINARY_DIR}/${Coverage_NAME}.info.cleaned 150 | COMMAND ${GENHTML_PATH} -o ${Coverage_NAME} ${PROJECT_BINARY_DIR}/${Coverage_NAME}.info.cleaned 151 | COMMAND ${CMAKE_COMMAND} -E remove ${Coverage_NAME}.base ${Coverage_NAME}.total ${PROJECT_BINARY_DIR}/${Coverage_NAME}.info.cleaned 152 | 153 | WORKING_DIRECTORY ${PROJECT_BINARY_DIR} 154 | DEPENDS ${Coverage_DEPENDENCIES} 155 | COMMENT "Resetting code coverage counters to zero.\nProcessing code coverage counters and generating report." 156 | ) 157 | 158 | # Show where to find the lcov info report 159 | add_custom_command(TARGET ${Coverage_NAME} POST_BUILD 160 | COMMAND ; 161 | COMMENT "Lcov code coverage info report saved in ${Coverage_NAME}.info." 162 | ) 163 | 164 | # Show info where to find the report 165 | add_custom_command(TARGET ${Coverage_NAME} POST_BUILD 166 | COMMAND ; 167 | COMMENT "Open ./${Coverage_NAME}/index.html in your browser to view the coverage report." 
168 | ) 169 | 170 | endfunction() # SETUP_TARGET_FOR_COVERAGE 171 | 172 | # Defines a target for running and collection code coverage information 173 | # Builds dependencies, runs the given executable and outputs reports. 174 | # NOTE! The executable should always have a ZERO as exit code otherwise 175 | # the coverage generation will not complete. 176 | # 177 | # SETUP_TARGET_FOR_COVERAGE_COBERTURA( 178 | # NAME ctest_coverage # New target name 179 | # EXECUTABLE ctest -j ${PROCESSOR_COUNT} # Executable in PROJECT_BINARY_DIR 180 | # DEPENDENCIES executable_target # Dependencies to build first 181 | # ) 182 | function(SETUP_TARGET_FOR_COVERAGE_COBERTURA) 183 | 184 | set(options NONE) 185 | set(oneValueArgs NAME) 186 | set(multiValueArgs EXECUTABLE EXECUTABLE_ARGS DEPENDENCIES) 187 | cmake_parse_arguments(Coverage "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) 188 | 189 | if(NOT SIMPLE_PYTHON_EXECUTABLE) 190 | message(FATAL_ERROR "python not found! Aborting...") 191 | endif() # NOT SIMPLE_PYTHON_EXECUTABLE 192 | 193 | if(NOT GCOVR_PATH) 194 | message(FATAL_ERROR "gcovr not found! Aborting...") 195 | endif() # NOT GCOVR_PATH 196 | 197 | # Combine excludes to several -e arguments 198 | set(COBERTURA_EXCLUDES "") 199 | foreach(EXCLUDE ${COVERAGE_EXCLUDES}) 200 | set(COBERTURA_EXCLUDES "-e ${EXCLUDE} ${COBERTURA_EXCLUDES}") 201 | endforeach() 202 | 203 | add_custom_target(${Coverage_NAME} 204 | 205 | # Run tests 206 | ${Coverage_EXECUTABLE} 207 | 208 | # Running gcovr 209 | COMMAND ${GCOVR_PATH} -x -r ${CMAKE_SOURCE_DIR} ${COBERTURA_EXCLUDES} 210 | -o ${Coverage_NAME}.xml 211 | WORKING_DIRECTORY ${PROJECT_BINARY_DIR} 212 | DEPENDS ${Coverage_DEPENDENCIES} 213 | COMMENT "Running gcovr to produce Cobertura code coverage report." 214 | ) 215 | 216 | # Show info where to find the report 217 | add_custom_command(TARGET ${Coverage_NAME} POST_BUILD 218 | COMMAND ; 219 | COMMENT "Cobertura code coverage report saved in ${Coverage_NAME}.xml." 220 | ) 221 | 222 | endfunction() # SETUP_TARGET_FOR_COVERAGE_COBERTURA 223 | 224 | function(APPEND_COVERAGE_COMPILER_FLAGS) 225 | set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${COVERAGE_COMPILER_FLAGS}" PARENT_SCOPE) 226 | set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${COVERAGE_COMPILER_FLAGS}" PARENT_SCOPE) 227 | message(STATUS "Appending code coverage compiler flags: ${COVERAGE_COMPILER_FLAGS}") 228 | endfunction() # APPEND_COVERAGE_COMPILER_FLAGS -------------------------------------------------------------------------------- /bench/workload.cpp: -------------------------------------------------------------------------------- 1 | #include "bench.hpp" 2 | #include "filter_factory.hpp" 3 | 4 | int main(int argc, char *argv[]) { 5 | if (argc != 9) { 6 | std::cout << "Usage:\n"; 7 | std::cout << "1. filter type: SuRF, SuRFHash, SuRFReal, SuRFMixed, Bloom\n"; 8 | std::cout << "2. suffix length: 0 < len <= 64 (for SuRFHash and SuRFReal only)\n"; 9 | std::cout << "3. workload type: mixed, alterByte (only for email key)\n"; 10 | std::cout << "4. percentage of keys inserted: 0 < num <= 100\n"; 11 | std::cout << "5. byte position (conting from last, only for alterByte): num\n"; 12 | std::cout << "6. key type: randint, email\n"; 13 | std::cout << "7. query type: point, range, mix, count-long, count-short\n"; 14 | std::cout << "8. 
distribution: uniform, zipfian, latest\n"; 15 | return -1; 16 | } 17 | 18 | std::string filter_type = argv[1]; 19 | uint32_t suffix_len = (uint32_t)atoi(argv[2]); 20 | std::string workload_type = argv[3]; 21 | unsigned percent = atoi(argv[4]); 22 | unsigned byte_pos = atoi(argv[5]); 23 | std::string key_type = argv[6]; 24 | std::string query_type = argv[7]; 25 | std::string distribution = argv[8]; 26 | 27 | // check args ==================================================== 28 | if (filter_type.compare(std::string("SuRF")) != 0 29 | && filter_type.compare(std::string("SuRFHash")) != 0 30 | && filter_type.compare(std::string("SuRFReal")) != 0 31 | && filter_type.compare(std::string("SuRFMixed")) != 0 32 | && filter_type.compare(std::string("Bloom")) != 0 33 | && filter_type.compare(std::string("ARF")) != 0) { 34 | std::cout << bench::kRed << "WRONG filter type\n" << bench::kNoColor; 35 | return -1; 36 | } 37 | 38 | if (suffix_len == 0 || suffix_len > 64) { 39 | std::cout << bench::kRed << "WRONG suffix length\n" << bench::kNoColor; 40 | return -1; 41 | } 42 | 43 | if (workload_type.compare(std::string("mixed")) != 0 44 | && workload_type.compare(std::string("alterByte")) == 0) { 45 | std::cout << bench::kRed << "WRONG workload type\n" << bench::kNoColor; 46 | return -1; 47 | } 48 | 49 | if (percent > 100) { 50 | std::cout << bench::kRed << "WRONG percentage\n" << bench::kNoColor; 51 | return -1; 52 | } 53 | 54 | if (key_type.compare(std::string("randint")) != 0 55 | && key_type.compare(std::string("timestamp")) != 0 56 | && key_type.compare(std::string("email")) != 0) { 57 | std::cout << bench::kRed << "WRONG key type\n" << bench::kNoColor; 58 | return -1; 59 | } 60 | 61 | if (query_type.compare(std::string("point")) != 0 62 | && query_type.compare(std::string("range")) != 0 63 | && query_type.compare(std::string("mix")) != 0 64 | && query_type.compare(std::string("count-long")) != 0 65 | && query_type.compare(std::string("count-short")) != 0) { 66 | std::cout << bench::kRed << "WRONG query type\n" << bench::kNoColor; 67 | return -1; 68 | } 69 | 70 | if (distribution.compare(std::string("uniform")) != 0 71 | && distribution.compare(std::string("zipfian")) != 0 72 | && distribution.compare(std::string("latest")) != 0) { 73 | std::cout << bench::kRed << "WRONG distribution\n" << bench::kNoColor; 74 | return -1; 75 | } 76 | 77 | // load keys from files ======================================= 78 | std::string load_file = "workloads/load_"; 79 | load_file += key_type; 80 | std::vector load_keys; 81 | if (key_type.compare(std::string("email")) == 0) 82 | bench::loadKeysFromFile(load_file, false, load_keys); 83 | else 84 | bench::loadKeysFromFile(load_file, true, load_keys); 85 | 86 | std::string txn_file = "workloads/txn_"; 87 | txn_file += key_type; 88 | txn_file += "_"; 89 | txn_file += distribution; 90 | std::vector txn_keys; 91 | if (key_type.compare(std::string("email")) == 0) 92 | bench::loadKeysFromFile(txn_file, false, txn_keys); 93 | else 94 | bench::loadKeysFromFile(txn_file, true, txn_keys); 95 | 96 | std::vector insert_keys; 97 | bench::selectKeysToInsert(percent, insert_keys, load_keys); 98 | 99 | if (workload_type.compare(std::string("alterByte")) == 0) 100 | bench::modifyKeyByte(txn_keys, byte_pos); 101 | 102 | //compute keys for approximate count-long queries ================= 103 | std::vector left_keys, right_keys; 104 | if (query_type.compare(std::string("count-long")) == 0) { 105 | for (int i = 0; i < (int)txn_keys.size() - 1; i++) { 106 | if (txn_keys[i].compare(txn_keys[i + 
1]) < 0) { 107 | left_keys.push_back(txn_keys[i]); 108 | right_keys.push_back(txn_keys[i + 1]); 109 | } else { 110 | left_keys.push_back(txn_keys[i + 1]); 111 | right_keys.push_back(txn_keys[i]); 112 | } 113 | } 114 | } 115 | 116 | // create filter ============================================== 117 | double time1 = bench::getNow(); 118 | bench::Filter* filter = bench::FilterFactory::createFilter(filter_type, suffix_len, insert_keys); 119 | double time2 = bench::getNow(); 120 | std::cout << "Build time = " << (time2 - time1) << std::endl; 121 | 122 | // execute transactions ======================================= 123 | int64_t positives = 0; 124 | uint64_t count = 0; 125 | double start_time = bench::getNow(); 126 | 127 | if (query_type.compare(std::string("point")) == 0) { 128 | for (int i = 0; i < (int)txn_keys.size(); i++) 129 | positives += (int)filter->lookup(txn_keys[i]); 130 | } else if (query_type.compare(std::string("range")) == 0) { 131 | for (int i = 0; i < (int)txn_keys.size(); i++) 132 | if (key_type.compare(std::string("email")) == 0) { 133 | std::string ret_str = txn_keys[i]; 134 | ret_str[ret_str.size() - 1] += (char)bench::kEmailRangeSize; 135 | positives += (int)filter->lookupRange(txn_keys[i], ret_str); 136 | } else { 137 | positives += (int)filter->lookupRange(txn_keys[i], bench::uint64ToString(bench::stringToUint64(txn_keys[i]) + bench::kIntRangeSize)); 138 | } 139 | } else if (query_type.compare(std::string("mix")) == 0) { 140 | for (int i = 0; i < (int)txn_keys.size(); i++) { 141 | if (i % 2 == 0) { 142 | positives += (int)filter->lookup(txn_keys[i]); 143 | } else { 144 | if (key_type.compare(std::string("email")) == 0) { 145 | std::string ret_str = txn_keys[i]; 146 | ret_str[ret_str.size() - 1] += (char)bench::kEmailRangeSize; 147 | positives += (int)filter->lookupRange(txn_keys[i], ret_str); 148 | } else { 149 | positives += (int)filter->lookupRange(txn_keys[i], bench::uint64ToString(bench::stringToUint64(txn_keys[i]) + bench::kIntRangeSize)); 150 | } 151 | } 152 | } 153 | } else if (query_type.compare(std::string("count-long")) == 0) { 154 | for (int i = 0; i < (int)txn_keys.size() - 1; i++) 155 | count += filter->approxCount(left_keys[i], right_keys[i]); 156 | } else if (query_type.compare(std::string("count-short")) == 0) { 157 | for (int i = 0; i < (int)txn_keys.size(); i++) 158 | if (key_type.compare(std::string("email")) == 0) { 159 | std::string ret_str = txn_keys[i]; 160 | ret_str[ret_str.size() - 1] += (char)bench::kEmailRangeSize; 161 | count += filter->approxCount(txn_keys[i], ret_str); 162 | } else { 163 | count += filter->approxCount(txn_keys[i], bench::uint64ToString(bench::stringToUint64(txn_keys[i]) + bench::kIntRangeSize)); 164 | } 165 | } 166 | 167 | double end_time = bench::getNow(); 168 | 169 | // compute true positives ====================================== 170 | std::map ht; 171 | for (int i = 0; i < (int)insert_keys.size(); i++) 172 | ht[insert_keys[i]] = true; 173 | 174 | int64_t true_positives = 0; 175 | std::map::iterator ht_iter; 176 | if (query_type.compare(std::string("point")) == 0) { 177 | for (int i = 0; i < (int)txn_keys.size(); i++) { 178 | ht_iter = ht.find(txn_keys[i]); 179 | true_positives += (ht_iter != ht.end()); 180 | } 181 | } else if (query_type.compare(std::string("range")) == 0) { 182 | for (int i = 0; i < (int)txn_keys.size(); i++) { 183 | ht_iter = ht.lower_bound(txn_keys[i]); 184 | if (ht_iter != ht.end()) { 185 | std::string fetched_key = ht_iter->first; 186 | if (key_type.compare(std::string("email")) == 0) { 187 | 
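// Added clarification: a range query over [txn_keys[i], upper_bound) counts as
// a true positive when some inserted key falls in that half-open interval;
// ht.lower_bound() returns the smallest inserted key >= txn_keys[i], so the
// check reduces to fetched_key < upper_bound (computed per key type below).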
std::string ret_str = txn_keys[i]; 188 | ret_str[ret_str.size() - 1] += (char)bench::kEmailRangeSize; 189 | true_positives += (fetched_key.compare(ret_str) < 0); 190 | } else { 191 | true_positives += (fetched_key.compare(bench::uint64ToString(bench::stringToUint64(txn_keys[i]) + bench::kIntRangeSize)) < 0); 192 | } 193 | } 194 | } 195 | } else if (query_type.compare(std::string("mix")) == 0) { 196 | for (int i = 0; i < (int)txn_keys.size(); i++) { 197 | if (i % 2 == 0) { 198 | ht_iter = ht.find(txn_keys[i]); 199 | true_positives += (ht_iter != ht.end()); 200 | } else { 201 | ht_iter = ht.lower_bound(txn_keys[i]); 202 | if (ht_iter != ht.end()) { 203 | std::string fetched_key = ht_iter->first; 204 | if (key_type.compare(std::string("email")) == 0) { 205 | std::string ret_str = txn_keys[i]; 206 | ret_str[ret_str.size() - 1] += (char)bench::kEmailRangeSize; 207 | true_positives += (fetched_key.compare(ret_str) < 0); 208 | } else { 209 | true_positives += (fetched_key.compare(bench::uint64ToString(bench::stringToUint64(txn_keys[i]) + bench::kIntRangeSize)) < 0); 210 | } 211 | } 212 | } 213 | } 214 | } 215 | int64_t false_positives = positives - true_positives; 216 | assert(false_positives >= 0); 217 | int64_t true_negatives = txn_keys.size() - positives; 218 | 219 | // print 220 | double tput = txn_keys.size() / (end_time - start_time) / 1000000; // Mops/sec 221 | std::cout << bench::kGreen << "Throughput = " << bench::kNoColor << tput << "\n"; 222 | 223 | std::cout << "positives = " << positives << "\n"; 224 | std::cout << "true positives = " << true_positives << "\n"; 225 | std::cout << "false positives = " << false_positives << "\n"; 226 | std::cout << "true negatives = " << true_negatives << "\n"; 227 | std::cout << "count = " << count << "\n"; 228 | 229 | double fp_rate = 0; 230 | if (false_positives > 0) 231 | fp_rate = false_positives / (true_negatives + false_positives + 0.0); 232 | std::cout << bench::kGreen << "False Positive Rate = " << bench::kNoColor << fp_rate << "\n"; 233 | 234 | std::cout << bench::kGreen << "Memory = " << bench::kNoColor << filter->getMemoryUsage() << "\n\n"; 235 | 236 | return 0; 237 | } 238 | -------------------------------------------------------------------------------- /test/unitTest/test_suffix.cpp: -------------------------------------------------------------------------------- 1 | #include "gtest/gtest.h" 2 | 3 | #include 4 | 5 | #include 6 | #include 7 | #include 8 | 9 | #include "config.hpp" 10 | #include "suffix.hpp" 11 | #include "surf_builder.hpp" 12 | 13 | namespace surf { 14 | 15 | namespace suffixtest { 16 | 17 | static const std::string kFilePath = "../../../test/words.txt"; 18 | static const int kTestSize = 234369; 19 | static std::vector words; 20 | 21 | class SuffixUnitTest : public ::testing::Test { 22 | public: 23 | virtual void SetUp () { 24 | computeWordsBySuffixStartLevel(); 25 | data_ = nullptr; 26 | } 27 | virtual void TearDown () { 28 | if (data_) 29 | delete[] data_; 30 | } 31 | 32 | void computeWordsBySuffixStartLevel(); 33 | void testSerialize(); 34 | void testCheckEquality(); 35 | 36 | SuRFBuilder* builder_; 37 | BitvectorSuffix* suffixes_; 38 | std::vector > words_by_suffix_start_level_; 39 | char* data_; 40 | }; 41 | 42 | static int getCommonPrefixLen(const std::string &a, const std::string &b) { 43 | int len = 0; 44 | while ((len < (int)a.length()) && (len < (int)b.length()) && (a[len] == b[len])) 45 | len++; 46 | return len; 47 | } 48 | 49 | static int getMax(int a, int b) { 50 | if (a < b) 51 | return b; 52 | return a; 53 
| } 54 | 55 | void SuffixUnitTest::computeWordsBySuffixStartLevel() { 56 | assert(words.size() > 1); 57 | int commonPrefixLen = 0; 58 | for (unsigned i = 0; i < words.size(); i++) { 59 | if (i == 0) { 60 | commonPrefixLen = getCommonPrefixLen(words[i], words[i+1]); 61 | } else if (i == words.size() - 1) { 62 | commonPrefixLen = getCommonPrefixLen(words[i-1], words[i]); 63 | } else { 64 | commonPrefixLen = getMax(getCommonPrefixLen(words[i-1], words[i]), 65 | getCommonPrefixLen(words[i], words[i+1])); 66 | } 67 | 68 | while (words_by_suffix_start_level_.size() < (unsigned)(commonPrefixLen + 1)) 69 | words_by_suffix_start_level_.push_back(std::vector<std::string>()); 70 | 71 | words_by_suffix_start_level_[commonPrefixLen].push_back(words[i]); 72 | } 73 | } 74 | 75 | void SuffixUnitTest::testSerialize() { 76 | uint64_t size = suffixes_->serializedSize(); 77 | data_ = new char[size]; 78 | BitvectorSuffix* ori_suffixes = suffixes_; 79 | char* data = data_; 80 | ori_suffixes->serialize(data); 81 | data = data_; 82 | suffixes_ = BitvectorSuffix::deSerialize(data); 83 | 84 | ASSERT_EQ(ori_suffixes->bitsSize(), suffixes_->bitsSize()); 85 | 86 | ori_suffixes->destroy(); 87 | delete ori_suffixes; 88 | } 89 | 90 | void SuffixUnitTest::testCheckEquality() { 91 | position_t suffix_idx = 0; 92 | for (level_t level = 0; level < words_by_suffix_start_level_.size(); level++) { 93 | for (unsigned k = 0; k < words_by_suffix_start_level_[level].size(); k++) { 94 | if (level == 1 && k == 32) { 95 | bool is_equal = suffixes_->checkEquality(suffix_idx, 96 | words_by_suffix_start_level_[level][k], 97 | (level + 1)); 98 | ASSERT_TRUE(is_equal); 99 | } 100 | suffix_idx++; 101 | } 102 | } 103 | } 104 | 105 | TEST_F (SuffixUnitTest, constructRealSuffixTest) { 106 | const level_t level = 2; 107 | level_t suffix_len_array[5] = {1, 3, 7, 8, 13}; 108 | for (int i = 0; i < 5; i++) { 109 | level_t suffix_len = suffix_len_array[i]; 110 | for (unsigned j = 0; j < words.size(); j++) { 111 | word_t suffix = BitvectorSuffix::constructSuffix(kReal, words[j], 0, level, suffix_len); 112 | if (words[j].length() < level || ((words[j].length() - level) * 8) < suffix_len) { 113 | ASSERT_EQ(0, suffix); 114 | continue; 115 | } 116 | for (position_t bitpos = 0; bitpos < suffix_len; bitpos++) { 117 | position_t byte_id = bitpos / 8; 118 | position_t byte_offset = bitpos % 8; 119 | uint8_t byte_mask = 0x80; 120 | byte_mask >>= byte_offset; 121 | bool expected_suffix_bit = false; 122 | if (level + byte_id < words[j].size()) 123 | expected_suffix_bit = (bool)(words[j][level + byte_id] & byte_mask); 124 | 125 | word_t word_mask = kMsbMask; 126 | word_mask >>= (kWordSize - suffix_len + bitpos); 127 | bool suffix_bit = (bool)(suffix & word_mask); 128 | 129 | ASSERT_EQ(expected_suffix_bit, suffix_bit); 130 | } 131 | } 132 | } 133 | } 134 | 135 | TEST_F (SuffixUnitTest, constructMixedSuffixTest) { 136 | const level_t level = 2; 137 | level_t suffix_len_array[5] = {1, 3, 7, 8, 13}; 138 | for (int i = 0; i < 5; i++) { 139 | level_t suffix_len = suffix_len_array[i]; 140 | for (unsigned j = 0; j < words.size(); j++) { 141 | word_t suffix = BitvectorSuffix::constructSuffix(kMixed, words[j], suffix_len, 142 | level, suffix_len); 143 | word_t hash_suffix = BitvectorSuffix::extractHashSuffix(suffix, suffix_len); 144 | word_t expected_hash_suffix = BitvectorSuffix::constructHashSuffix(words[j], suffix_len); 145 | ASSERT_EQ(expected_hash_suffix, hash_suffix); 146 | 147 | word_t real_suffix = BitvectorSuffix::extractRealSuffix(suffix, suffix_len); 148 | if
(words[j].length() < level || ((words[j].length() - level) * 8) < suffix_len) { 149 | ASSERT_EQ(0, real_suffix); 150 | continue; 151 | } 152 | for (position_t bitpos = 0; bitpos < suffix_len; bitpos++) { 153 | position_t byte_id = bitpos / 8; 154 | position_t byte_offset = bitpos % 8; 155 | uint8_t byte_mask = 0x80; 156 | byte_mask >>= byte_offset; 157 | bool expected_suffix_bit = false; 158 | if (level + byte_id < words[j].size()) 159 | expected_suffix_bit = (bool)(words[j][level + byte_id] & byte_mask); 160 | 161 | word_t word_mask = kMsbMask; 162 | word_mask >>= (kWordSize - suffix_len + bitpos); 163 | bool suffix_bit = (bool)(real_suffix & word_mask); 164 | ASSERT_EQ(expected_suffix_bit, suffix_bit); 165 | } 166 | } 167 | } 168 | } 169 | 170 | TEST_F (SuffixUnitTest, checkEqualityTest) { 171 | bool include_dense = false; 172 | uint32_t sparse_dense_ratio = 0; 173 | SuffixType suffix_type_array[3] = {kHash, kReal, kMixed}; 174 | level_t suffix_len_array[5] = {1, 3, 7, 8, 13}; 175 | for (int i = 0; i < 3; i++) { 176 | for (int j = 0; j < 5; j++) { 177 | // build test 178 | SuffixType suffix_type = suffix_type_array[i]; 179 | level_t suffix_len = suffix_len_array[j]; 180 | 181 | if (i == 0) 182 | builder_ = new SuRFBuilder(include_dense, sparse_dense_ratio, suffix_type, suffix_len, 0); 183 | else if (i == 1) 184 | builder_ = new SuRFBuilder(include_dense, sparse_dense_ratio, suffix_type, 0, suffix_len); 185 | else 186 | builder_ = new SuRFBuilder(include_dense, sparse_dense_ratio, 187 | suffix_type, suffix_len, suffix_len); 188 | builder_->build(words); 189 | 190 | level_t height = builder_->getLabels().size(); 191 | std::vector<position_t> num_suffix_bits_per_level; 192 | for (level_t level = 0; level < height; level++) { 193 | if (suffix_type == kMixed) 194 | num_suffix_bits_per_level.push_back(builder_->getSuffixCounts()[level] * suffix_len * 2); 195 | else 196 | num_suffix_bits_per_level.push_back(builder_->getSuffixCounts()[level] * suffix_len); 197 | } 198 | 199 | if (i == 0) 200 | suffixes_ = new BitvectorSuffix(builder_->getSuffixType(), suffix_len, 0, builder_->getSuffixes(), num_suffix_bits_per_level, 0, height); 201 | else if (i == 1) 202 | suffixes_ = new BitvectorSuffix(builder_->getSuffixType(), 0, suffix_len, builder_->getSuffixes(), num_suffix_bits_per_level, 0, height); 203 | else 204 | suffixes_ = new BitvectorSuffix(builder_->getSuffixType(), suffix_len, suffix_len, builder_->getSuffixes(), num_suffix_bits_per_level, 0, height); 205 | 206 | testCheckEquality(); 207 | delete builder_; 208 | suffixes_->destroy(); 209 | delete suffixes_; 210 | } 211 | } 212 | } 213 | 214 | TEST_F (SuffixUnitTest, serializeTest) { 215 | bool include_dense = false; 216 | uint32_t sparse_dense_ratio = 0; 217 | SuffixType suffix_type_array[3] = {kHash, kReal, kMixed}; 218 | level_t suffix_len_array[5] = {1, 3, 7, 8, 13}; 219 | for (int i = 0; i < 3; i++) { 220 | for (int j = 0; j < 5; j++) { 221 | // build test 222 | SuffixType suffix_type = suffix_type_array[i]; 223 | level_t suffix_len = suffix_len_array[j]; 224 | if (i == 0) 225 | builder_ = new SuRFBuilder(include_dense, sparse_dense_ratio, suffix_type, suffix_len, 0); 226 | else if (i == 1) 227 | builder_ = new SuRFBuilder(include_dense, sparse_dense_ratio, suffix_type, 0, suffix_len); 228 | else 229 | builder_ = new SuRFBuilder(include_dense, sparse_dense_ratio, 230 | suffix_type, suffix_len, suffix_len); 231 | builder_->build(words); 232 | 233 | level_t height = builder_->getLabels().size(); 234 | std::vector<position_t> num_suffix_bits_per_level; 235 | for
(level_t level = 0; level < height; level++) { 236 | if (suffix_type == kMixed) 237 | num_suffix_bits_per_level.push_back(builder_->getSuffixCounts()[level] * suffix_len * 2); 238 | else 239 | num_suffix_bits_per_level.push_back(builder_->getSuffixCounts()[level] * suffix_len); 240 | } 241 | 242 | if (i == 0) 243 | suffixes_ = new BitvectorSuffix(builder_->getSuffixType(), suffix_len, 0, builder_->getSuffixes(), num_suffix_bits_per_level, 0, height); 244 | else if (i == 1) 245 | suffixes_ = new BitvectorSuffix(builder_->getSuffixType(), 0, suffix_len, builder_->getSuffixes(), num_suffix_bits_per_level, 0, height); 246 | else 247 | suffixes_ = new BitvectorSuffix(builder_->getSuffixType(), suffix_len, suffix_len, builder_->getSuffixes(), num_suffix_bits_per_level, 0, height); 248 | 249 | testSerialize(); 250 | testCheckEquality(); 251 | delete builder_; 252 | } 253 | } 254 | } 255 | 256 | void loadWordList() { 257 | std::ifstream infile(kFilePath); 258 | std::string key; 259 | int count = 0; 260 | while (infile.good() && count < kTestSize) { 261 | infile >> key; 262 | words.push_back(key); 263 | count++; 264 | } 265 | } 266 | 267 | } // namespace suffixtest 268 | 269 | } // namespace surf 270 | 271 | int main (int argc, char** argv) { 272 | ::testing::InitGoogleTest(&argc, argv); 273 | surf::suffixtest::loadWordList(); 274 | return RUN_ALL_TESTS(); 275 | } 276 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 
39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. 
You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty.
Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 
175 | 176 | END OF TERMS AND CONDITIONS 177 | -------------------------------------------------------------------------------- /include/surf.hpp: -------------------------------------------------------------------------------- 1 | #ifndef SURF_H_ 2 | #define SURF_H_ 3 | 4 | #include <string> 5 | #include <vector> 6 | 7 | #include "config.hpp" 8 | #include "louds_dense.hpp" 9 | #include "louds_sparse.hpp" 10 | #include "surf_builder.hpp" 11 | 12 | namespace surf { 13 | 14 | class SuRF { 15 | public: 16 | class Iter { 17 | public: 18 | Iter() {}; 19 | Iter(const SuRF* filter) { 20 | dense_iter_ = LoudsDense::Iter(filter->louds_dense_); 21 | sparse_iter_ = LoudsSparse::Iter(filter->louds_sparse_); 22 | could_be_fp_ = false; 23 | } 24 | 25 | void clear(); 26 | bool isValid() const; 27 | bool getFpFlag() const; 28 | int compare(const std::string& key) const; 29 | std::string getKey() const; 30 | int getSuffix(word_t* suffix) const; 31 | std::string getKeyWithSuffix(unsigned* bitlen) const; 32 | 33 | // Returns true if the status of the iterator after the operation is valid 34 | bool operator ++(int); 35 | bool operator --(int); 36 | 37 | private: 38 | void passToSparse(); 39 | bool incrementDenseIter(); 40 | bool incrementSparseIter(); 41 | bool decrementDenseIter(); 42 | bool decrementSparseIter(); 43 | 44 | private: 45 | // true implies that dense_iter_ is valid 46 | LoudsDense::Iter dense_iter_; 47 | LoudsSparse::Iter sparse_iter_; 48 | bool could_be_fp_; 49 | 50 | friend class SuRF; 51 | }; 52 | 53 | public: 54 | SuRF() {}; 55 | 56 | //------------------------------------------------------------------ 57 | // Input keys must be SORTED 58 | //------------------------------------------------------------------ 59 | SuRF(const std::vector<std::string>& keys) { 60 | create(keys, kIncludeDense, kSparseDenseRatio, kNone, 0, 0); 61 | } 62 | 63 | SuRF(const std::vector<std::string>& keys, const SuffixType suffix_type, 64 | const level_t hash_suffix_len, const level_t real_suffix_len) { 65 | create(keys, kIncludeDense, kSparseDenseRatio, suffix_type, hash_suffix_len, real_suffix_len); 66 | } 67 | 68 | SuRF(const std::vector<std::string>& keys, 69 | const bool include_dense, const uint32_t sparse_dense_ratio, 70 | const SuffixType suffix_type, const level_t hash_suffix_len, const level_t real_suffix_len) { 71 | create(keys, include_dense, sparse_dense_ratio, suffix_type, hash_suffix_len, real_suffix_len); 72 | } 73 | 74 | ~SuRF() {} 75 | 76 | void create(const std::vector<std::string>& keys, 77 | const bool include_dense, const uint32_t sparse_dense_ratio, 78 | const SuffixType suffix_type, 79 | const level_t hash_suffix_len, const level_t real_suffix_len); 80 | 81 | bool lookupKey(const std::string& key) const; 82 | // This function searches in a conservative way: if inclusive is true 83 | // and the stored key prefix matches key, iter stays at this key prefix.
84 | SuRF::Iter moveToKeyGreaterThan(const std::string& key, const bool inclusive) const; 85 | SuRF::Iter moveToKeyLessThan(const std::string& key, const bool inclusive) const; 86 | SuRF::Iter moveToFirst() const; 87 | SuRF::Iter moveToLast() const; 88 | bool lookupRange(const std::string& left_key, const bool left_inclusive, 89 | const std::string& right_key, const bool right_inclusive); 90 | // Accurate except at the boundaries --> undercount by at most 2 91 | uint64_t approxCount(const std::string& left_key, const std::string& right_key); 92 | uint64_t approxCount(const SuRF::Iter* iter, const SuRF::Iter* iter2); 93 | 94 | uint64_t serializedSize() const; 95 | uint64_t getMemoryUsage() const; 96 | level_t getHeight() const; 97 | level_t getSparseStartLevel() const; 98 | 99 | char* serialize() const { 100 | uint64_t size = serializedSize(); 101 | char* data = new char[size]; 102 | char* cur_data = data; 103 | louds_dense_->serialize(cur_data); 104 | louds_sparse_->serialize(cur_data); 105 | assert(cur_data - data == (int64_t)size); 106 | return data; 107 | } 108 | 109 | static SuRF* deSerialize(char* src) { 110 | SuRF* surf = new SuRF(); 111 | surf->louds_dense_ = LoudsDense::deSerialize(src); 112 | surf->louds_sparse_ = LoudsSparse::deSerialize(src); 113 | surf->iter_ = SuRF::Iter(surf); 114 | return surf; 115 | } 116 | 117 | void destroy() { 118 | louds_dense_->destroy(); 119 | louds_sparse_->destroy(); 120 | } 121 | 122 | private: 123 | LoudsDense* louds_dense_; 124 | LoudsSparse* louds_sparse_; 125 | SuRFBuilder* builder_; 126 | SuRF::Iter iter_; 127 | SuRF::Iter iter2_; 128 | }; 129 | 130 | void SuRF::create(const std::vector<std::string>& keys, 131 | const bool include_dense, const uint32_t sparse_dense_ratio, 132 | const SuffixType suffix_type, 133 | const level_t hash_suffix_len, const level_t real_suffix_len) { 134 | builder_ = new SuRFBuilder(include_dense, sparse_dense_ratio, 135 | suffix_type, hash_suffix_len, real_suffix_len); 136 | builder_->build(keys); 137 | louds_dense_ = new LoudsDense(builder_); 138 | louds_sparse_ = new LoudsSparse(builder_); 139 | iter_ = SuRF::Iter(this); 140 | delete builder_; 141 | } 142 | 143 | bool SuRF::lookupKey(const std::string& key) const { 144 | position_t connect_node_num = 0; 145 | if (!louds_dense_->lookupKey(key, connect_node_num)) 146 | return false; 147 | else if (connect_node_num != 0) 148 | return louds_sparse_->lookupKey(key, connect_node_num); 149 | return true; 150 | } 151 | 152 | SuRF::Iter SuRF::moveToKeyGreaterThan(const std::string& key, const bool inclusive) const { 153 | SuRF::Iter iter(this); 154 | iter.could_be_fp_ = louds_dense_->moveToKeyGreaterThan(key, inclusive, iter.dense_iter_); 155 | 156 | if (!iter.dense_iter_.isValid()) 157 | return iter; 158 | if (iter.dense_iter_.isComplete()) 159 | return iter; 160 | 161 | if (!iter.dense_iter_.isSearchComplete()) { 162 | iter.passToSparse(); 163 | iter.could_be_fp_ = louds_sparse_->moveToKeyGreaterThan(key, inclusive, iter.sparse_iter_); 164 | if (!iter.sparse_iter_.isValid()) 165 | iter.incrementDenseIter(); 166 | return iter; 167 | } else if (!iter.dense_iter_.isMoveLeftComplete()) { 168 | iter.passToSparse(); 169 | iter.sparse_iter_.moveToLeftMostKey(); 170 | return iter; 171 | } 172 | 173 | assert(false); // shouldn't reach here 174 | return iter; 175 | } 176 | 177 | SuRF::Iter SuRF::moveToKeyLessThan(const std::string& key, const bool inclusive) const { 178 | SuRF::Iter iter = moveToKeyGreaterThan(key, false); 179 | if (!iter.isValid()) { 180 | iter = moveToLast(); 181 | return
iter; 182 | } 183 | if (!iter.getFpFlag()) { 184 | iter--; 185 | if (lookupKey(key)) 186 | iter--; 187 | } 188 | return iter; 189 | } 190 | 191 | SuRF::Iter SuRF::moveToFirst() const { 192 | SuRF::Iter iter(this); 193 | if (louds_dense_->getHeight() > 0) { 194 | iter.dense_iter_.setToFirstLabelInRoot(); 195 | iter.dense_iter_.moveToLeftMostKey(); 196 | if (iter.dense_iter_.isMoveLeftComplete()) 197 | return iter; 198 | iter.passToSparse(); 199 | iter.sparse_iter_.moveToLeftMostKey(); 200 | } else { 201 | iter.sparse_iter_.setToFirstLabelInRoot(); 202 | iter.sparse_iter_.moveToLeftMostKey(); 203 | } 204 | return iter; 205 | } 206 | 207 | SuRF::Iter SuRF::moveToLast() const { 208 | SuRF::Iter iter(this); 209 | if (louds_dense_->getHeight() > 0) { 210 | iter.dense_iter_.setToLastLabelInRoot(); 211 | iter.dense_iter_.moveToRightMostKey(); 212 | if (iter.dense_iter_.isMoveRightComplete()) 213 | return iter; 214 | iter.passToSparse(); 215 | iter.sparse_iter_.moveToRightMostKey(); 216 | } else { 217 | iter.sparse_iter_.setToLastLabelInRoot(); 218 | iter.sparse_iter_.moveToRightMostKey(); 219 | } 220 | return iter; 221 | } 222 | 223 | bool SuRF::lookupRange(const std::string& left_key, const bool left_inclusive, 224 | const std::string& right_key, const bool right_inclusive) { 225 | iter_.clear(); 226 | louds_dense_->moveToKeyGreaterThan(left_key, left_inclusive, iter_.dense_iter_); 227 | if (!iter_.dense_iter_.isValid()) return false; 228 | if (!iter_.dense_iter_.isComplete()) { 229 | if (!iter_.dense_iter_.isSearchComplete()) { 230 | iter_.passToSparse(); 231 | louds_sparse_->moveToKeyGreaterThan(left_key, left_inclusive, iter_.sparse_iter_); 232 | if (!iter_.sparse_iter_.isValid()) { 233 | iter_.incrementDenseIter(); 234 | } 235 | } else if (!iter_.dense_iter_.isMoveLeftComplete()) { 236 | iter_.passToSparse(); 237 | iter_.sparse_iter_.moveToLeftMostKey(); 238 | } 239 | } 240 | if (!iter_.isValid()) return false; 241 | int compare = iter_.compare(right_key); 242 | if (compare == kCouldBePositive) 243 | return true; 244 | if (right_inclusive) 245 | return (compare <= 0); 246 | else 247 | return (compare < 0); 248 | } 249 | 250 | uint64_t SuRF::approxCount(const SuRF::Iter* iter, const SuRF::Iter* iter2) { 251 | if (!iter->isValid() || !iter2->isValid()) return 0; 252 | position_t out_node_num_left = 0, out_node_num_right = 0; 253 | uint64_t count = louds_dense_->approxCount(&(iter->dense_iter_), 254 | &(iter2->dense_iter_), 255 | out_node_num_left, 256 | out_node_num_right); 257 | count += louds_sparse_->approxCount(&(iter->sparse_iter_), 258 | &(iter2->sparse_iter_), 259 | out_node_num_left, 260 | out_node_num_right); 261 | return count; 262 | } 263 | 264 | uint64_t SuRF::approxCount(const std::string& left_key, 265 | const std::string& right_key) { 266 | iter_.clear(); iter2_.clear(); 267 | iter_ = moveToKeyGreaterThan(left_key, true); 268 | if (!iter_.isValid()) return 0; 269 | iter2_ = moveToKeyGreaterThan(right_key, true); 270 | if (!iter2_.isValid()) 271 | iter2_ = moveToLast(); 272 | 273 | return approxCount(&iter_, &iter2_); 274 | } 275 | 276 | uint64_t SuRF::serializedSize() const { 277 | return (louds_dense_->serializedSize() 278 | + louds_sparse_->serializedSize()); 279 | } 280 | 281 | uint64_t SuRF::getMemoryUsage() const { 282 | return (sizeof(SuRF) + louds_dense_->getMemoryUsage() + louds_sparse_->getMemoryUsage()); 283 | } 284 | 285 | level_t SuRF::getHeight() const { 286 | return louds_sparse_->getHeight(); 287 | } 288 | 289 | level_t SuRF::getSparseStartLevel() const { 290 | 
return louds_sparse_->getStartLevel(); 291 | } 292 | 293 | //============================================================================ 294 | 295 | void SuRF::Iter::clear() { 296 | dense_iter_.clear(); 297 | sparse_iter_.clear(); 298 | } 299 | 300 | bool SuRF::Iter::getFpFlag() const { 301 | return could_be_fp_; 302 | } 303 | 304 | bool SuRF::Iter::isValid() const { 305 | return dense_iter_.isValid() 306 | && (dense_iter_.isComplete() || sparse_iter_.isValid()); 307 | } 308 | 309 | int SuRF::Iter::compare(const std::string& key) const { 310 | assert(isValid()); 311 | int dense_compare = dense_iter_.compare(key); 312 | if (dense_iter_.isComplete() || dense_compare != 0) 313 | return dense_compare; 314 | return sparse_iter_.compare(key); 315 | } 316 | 317 | std::string SuRF::Iter::getKey() const { 318 | if (!isValid()) 319 | return std::string(); 320 | if (dense_iter_.isComplete()) 321 | return dense_iter_.getKey(); 322 | return dense_iter_.getKey() + sparse_iter_.getKey(); 323 | } 324 | 325 | int SuRF::Iter::getSuffix(word_t* suffix) const { 326 | if (!isValid()) 327 | return 0; 328 | if (dense_iter_.isComplete()) 329 | return dense_iter_.getSuffix(suffix); 330 | return sparse_iter_.getSuffix(suffix); 331 | } 332 | 333 | std::string SuRF::Iter::getKeyWithSuffix(unsigned* bitlen) const { 334 | *bitlen = 0; 335 | if (!isValid()) 336 | return std::string(); 337 | if (dense_iter_.isComplete()) 338 | return dense_iter_.getKeyWithSuffix(bitlen); 339 | return dense_iter_.getKeyWithSuffix(bitlen) + sparse_iter_.getKeyWithSuffix(bitlen); 340 | } 341 | 342 | void SuRF::Iter::passToSparse() { 343 | sparse_iter_.setStartNodeNum(dense_iter_.getSendOutNodeNum()); 344 | } 345 | 346 | bool SuRF::Iter::incrementDenseIter() { 347 | if (!dense_iter_.isValid()) 348 | return false; 349 | 350 | dense_iter_++; 351 | if (!dense_iter_.isValid()) 352 | return false; 353 | if (dense_iter_.isMoveLeftComplete()) 354 | return true; 355 | 356 | passToSparse(); 357 | sparse_iter_.moveToLeftMostKey(); 358 | return true; 359 | } 360 | 361 | bool SuRF::Iter::incrementSparseIter() { 362 | if (!sparse_iter_.isValid()) 363 | return false; 364 | sparse_iter_++; 365 | return sparse_iter_.isValid(); 366 | } 367 | 368 | bool SuRF::Iter::operator ++(int) { 369 | if (!isValid()) 370 | return false; 371 | if (incrementSparseIter()) 372 | return true; 373 | return incrementDenseIter(); 374 | } 375 | 376 | bool SuRF::Iter::decrementDenseIter() { 377 | if (!dense_iter_.isValid()) 378 | return false; 379 | 380 | dense_iter_--; 381 | if (!dense_iter_.isValid()) 382 | return false; 383 | if (dense_iter_.isMoveRightComplete()) 384 | return true; 385 | 386 | passToSparse(); 387 | sparse_iter_.moveToRightMostKey(); 388 | return true; 389 | } 390 | 391 | bool SuRF::Iter::decrementSparseIter() { 392 | if (!sparse_iter_.isValid()) 393 | return false; 394 | sparse_iter_--; 395 | return sparse_iter_.isValid(); 396 | } 397 | 398 | bool SuRF::Iter::operator --(int) { 399 | if (!isValid()) 400 | return false; 401 | if (decrementSparseIter()) 402 | return true; 403 | return decrementDenseIter(); 404 | } 405 | 406 | } // namespace surf 407 | 408 | #endif // SURF_H 409 | --------------------------------------------------------------------------------
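The header above is the whole public surface of SuRF: build the filter from a sorted key list, probe it with lookupKey() and lookupRange(), estimate cardinality with approxCount(), and inspect it with getMemoryUsage(). The following is a minimal usage sketch, not a file from the repository; the key values and the suffix configuration (8-bit real suffixes) are made up for illustration, and it assumes the include/ directory is on the compiler's include path. It only exercises calls declared in include/surf.hpp above.

// usage_sketch.cpp -- illustrative only, not part of the SuRF source tree.
#include <iostream>
#include <string>
#include <vector>

#include "surf.hpp"  // assumes include/ is on the include path

int main() {
    // Input keys must be SORTED (see the constructor comment in surf.hpp).
    std::vector<std::string> keys = {"aaa", "bbb", "ccc", "ddd"};

    // Build with 8-bit real suffixes; kNone, kHash, and kMixed are the other
    // SuffixType options used elsewhere in the repository.
    surf::SuRF filter(keys, surf::kReal, 0, 8);

    // Point query: false means the key is definitely absent; true means it is
    // present or a false positive.
    bool point_hit = filter.lookupKey("bbb");

    // Range query over [left, right): true if some stored key may fall inside.
    bool range_hit = filter.lookupRange("b", true, "c", false);

    // Approximate number of keys in [left, right]; per the header comment it
    // may undercount by at most 2 at the boundaries.
    uint64_t approx = filter.approxCount("a", "d");

    std::cout << point_hit << " " << range_hit << " " << approx << " "
              << filter.getMemoryUsage() << " bytes\n";

    // Release the bit vectors owned by the filter.
    filter.destroy();
    return 0;
}

Persistence follows the same pattern: serialize() returns a heap-allocated buffer of serializedSize() bytes, and the static deSerialize() reconstructs a SuRF from such a buffer, as declared in the header above.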