├── doc ├── figs │ ├── bf-basic.png │ ├── bf-bitwise.png │ ├── shrinking.png │ ├── architecture.png │ ├── bf-counting.png │ ├── bf-scalable.png │ ├── bf-basic-part.png │ ├── sliding-window.png │ └── spectral-rm-bug.png ├── documentation.dox └── Makefile ├── .gitignore ├── test ├── bf │ ├── CMakeLists.txt │ ├── configuration.h │ ├── util │ │ ├── error.h │ │ ├── trial.h │ │ └── configuration.h │ ├── configuration.cc │ └── bf.cc ├── CMakeLists.txt ├── test.hpp ├── tests.cpp ├── unit_test_impl.hpp └── unit_test.hpp ├── bf ├── all.hpp ├── object.hpp ├── wrap.hpp ├── bloom_filter │ ├── bitwise.hpp │ ├── stable.hpp │ ├── a2.hpp │ ├── basic.hpp │ └── counting.hpp ├── bloom_filter.hpp ├── hash.hpp ├── h3.hpp ├── counter_vector.hpp └── bitvector.hpp ├── src ├── bloom_filter │ ├── stable.cpp │ ├── a2.cpp │ ├── bitwise.cpp │ ├── basic.cpp │ └── counting.cpp ├── hash.cpp ├── counter_vector.cpp └── bitvector.cpp ├── .clang-format ├── cmake └── cmake_uninstall.cmake.in ├── COPYING ├── CMakeLists.txt ├── configure └── README.md /doc/figs/bf-basic.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mavam/libbf/HEAD/doc/figs/bf-basic.png -------------------------------------------------------------------------------- /doc/figs/bf-bitwise.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mavam/libbf/HEAD/doc/figs/bf-bitwise.png -------------------------------------------------------------------------------- /doc/figs/shrinking.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mavam/libbf/HEAD/doc/figs/shrinking.png -------------------------------------------------------------------------------- /doc/figs/architecture.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mavam/libbf/HEAD/doc/figs/architecture.png 
-------------------------------------------------------------------------------- /doc/figs/bf-counting.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mavam/libbf/HEAD/doc/figs/bf-counting.png -------------------------------------------------------------------------------- /doc/figs/bf-scalable.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mavam/libbf/HEAD/doc/figs/bf-scalable.png -------------------------------------------------------------------------------- /doc/figs/bf-basic-part.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mavam/libbf/HEAD/doc/figs/bf-basic-part.png -------------------------------------------------------------------------------- /doc/figs/sliding-window.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mavam/libbf/HEAD/doc/figs/sliding-window.png -------------------------------------------------------------------------------- /doc/figs/spectral-rm-bug.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mavam/libbf/HEAD/doc/figs/spectral-rm-bug.png -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .*swp 2 | .*swo 3 | .DS_Store 4 | build 5 | doc/gh-pages 6 | Makefile 7 | .idea 8 | cmake-build-*/ 9 | -------------------------------------------------------------------------------- /test/bf/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | include_directories(BEFORE .) 
2 | set(bf_sources 3 | bf.cc 4 | configuration.cc 5 | ) 6 | 7 | add_executable(bf ${bf_sources}) 8 | target_link_libraries(bf libbf_shared) 9 | -------------------------------------------------------------------------------- /test/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_subdirectory(bf) 2 | 3 | enable_testing() 4 | add_executable(bf-test tests.cpp) 5 | target_link_libraries(bf-test libbf_shared ${CMAKE_THREAD_LIBS_INIT}) 6 | add_test(unit ${CMAKE_BINARY_DIR}/bin/bf-test) 7 | -------------------------------------------------------------------------------- /bf/all.hpp: -------------------------------------------------------------------------------- 1 | #ifndef BF_ALL_HPP 2 | #define BF_ALL_HPP 3 | 4 | #include "bf/bloom_filter/a2.hpp" 5 | #include "bf/bloom_filter/basic.hpp" 6 | #include "bf/bloom_filter/bitwise.hpp" 7 | #include "bf/bloom_filter/counting.hpp" 8 | #include "bf/bloom_filter/stable.hpp" 9 | 10 | #endif 11 | -------------------------------------------------------------------------------- /doc/documentation.dox: -------------------------------------------------------------------------------- 1 | /** 2 | 3 | \mainpage libbf 4 | 5 | **libbf** is a C++11 Bloom filter library. 6 | 7 | \section Usage 8 | 9 | Please refer to [the project page](https://github.com/mavam/libbf) for usage 10 | examples. 11 | 12 | This manual only documents the API. 
13 | 14 | */ 15 | -------------------------------------------------------------------------------- /test/bf/configuration.h: -------------------------------------------------------------------------------- 1 | #ifndef CONFIGURATION_H 2 | #define CONFIGURATION_H 3 | 4 | #include "util/configuration.h" 5 | 6 | class config : public util::configuration { 7 | public: 8 | config() = default; 9 | 10 | void initialize(); 11 | std::string banner() const; 12 | }; 13 | 14 | #endif 15 | -------------------------------------------------------------------------------- /doc/Makefile: -------------------------------------------------------------------------------- 1 | REPO := git@github.com:mavam/libbf.git 2 | 3 | doc: check 4 | doxygen Doxyfile 5 | 6 | check: FORCE 7 | @test -d gh-pages/api \ 8 | || (git clone $(REPO) gh-pages && git checkout gh-pages) 9 | 10 | commit: doc 11 | @cd gh-pages \ 12 | && git add api \ 13 | && git commit -a -e -m 'Update Doxygen API documentation.' 14 | 15 | deploy: FORCE 16 | @cd gh-pages && git push origin HEAD 17 | 18 | update: 19 | doxygen -u Doxyfile 20 | 21 | FORCE: 22 | 23 | .PHONY: doc check deploy push update FORCE 24 | -------------------------------------------------------------------------------- /bf/object.hpp: -------------------------------------------------------------------------------- 1 | #ifndef BF_OBJECT_HPP 2 | #define BF_OBJECT_HPP 3 | 4 | #include 5 | 6 | namespace bf { 7 | 8 | /// Wraps sequential data to be used in hashing. 
9 | class object 10 | { 11 | public: 12 | object(void const* data, size_t size) 13 | : data_(data), size_(size) 14 | { 15 | } 16 | 17 | void const* data() const 18 | { 19 | return data_; 20 | } 21 | 22 | size_t size() const 23 | { 24 | return size_; 25 | } 26 | 27 | private: 28 | void const* data_ = nullptr; 29 | size_t size_ = 0; 30 | }; 31 | 32 | } // namespace bf 33 | 34 | #endif 35 | -------------------------------------------------------------------------------- /test/test.hpp: -------------------------------------------------------------------------------- 1 | #ifndef TEST_TEST_H 2 | #define TEST_TEST_H 3 | 4 | #ifdef SUITE 5 | #define CAF_SUITE SUITE 6 | #endif 7 | 8 | #include "unit_test_impl.hpp" 9 | 10 | // Logging 11 | #define ERROR CAF_TEST_ERROR 12 | #define INFO CAF_TEST_INFO 13 | #define VERBOSE CAF_TEST_VERBOSE 14 | #define MESSAGE CAF_TEST_VERBOSE 15 | 16 | // Test setup 17 | #define TEST CAF_TEST 18 | #define FIXTURE_SCOPE CAF_TEST_FIXTURE_SCOPE 19 | #define FIXTURE_SCOPE_END CAF_TEST_FIXTURE_SCOPE_END 20 | 21 | // Checking 22 | #define REQUIRE CAF_REQUIRE 23 | #define REQUIRE_EQUAL CAF_REQUIRE_EQUAL 24 | #define CHECK CAF_CHECK 25 | #define CHECK_EQUAL CAF_CHECK_EQUAL 26 | #define CHECK_FAIL CAF_CHECK_FAIL 27 | #define FAIL CAF_FAIL 28 | 29 | #endif 30 | -------------------------------------------------------------------------------- /bf/wrap.hpp: -------------------------------------------------------------------------------- 1 | #ifndef BF_WRAP_HPP 2 | #define BF_WRAP_HPP 3 | 4 | #include 5 | #include 6 | #include 7 | #include 8 | 9 | namespace bf { 10 | 11 | template < 12 | typename T, 13 | typename = typename std::enable_if::value>::type 14 | > 15 | object wrap(T const& x) 16 | { 17 | return {&x, sizeof(T)}; 18 | } 19 | 20 | template < 21 | typename T, 22 | size_t N, 23 | typename = typename std::enable_if::value>::type 24 | > 25 | object wrap(T const (&str)[N]) 26 | { 27 | return {&str, N * sizeof(T)}; 28 | } 29 | 30 | template < 31 | 
typename T, 32 | typename = typename std::enable_if::value>::type 33 | > 34 | object wrap(std::vector const& s) 35 | { 36 | return {s.data(), s.size()}; 37 | } 38 | 39 | inline object wrap(std::string const& str) 40 | { 41 | return {str.data(), str.size()}; 42 | } 43 | 44 | } // namespace bf 45 | 46 | #endif 47 | -------------------------------------------------------------------------------- /src/bloom_filter/stable.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | 3 | #include 4 | 5 | namespace bf { 6 | 7 | stable_bloom_filter::stable_bloom_filter(hasher h, size_t cells, size_t width, 8 | size_t d) 9 | : counting_bloom_filter(std::move(h), cells, width), 10 | d_(d), 11 | unif_(0, cells - 1) { 12 | assert(d <= cells); 13 | } 14 | 15 | void stable_bloom_filter::add(object const& o) { 16 | // Decrement d distinct cells uniformly at random. 17 | std::vector indices; 18 | for (size_t d = 0; d < d_; ++d) { 19 | bool unique; 20 | do { 21 | size_t u = unif_(generator_); 22 | unique = true; 23 | for (auto i : indices) 24 | if (i == u) { 25 | unique = false; 26 | break; 27 | } 28 | if (unique) { 29 | indices.push_back(u); 30 | cells_.decrement(u); 31 | } 32 | } while (!unique); 33 | } 34 | 35 | increment(find_indices(o), cells_.max()); 36 | } 37 | 38 | } // namespace bf 39 | -------------------------------------------------------------------------------- /bf/bloom_filter/bitwise.hpp: -------------------------------------------------------------------------------- 1 | #ifndef BF_BLOOM_FILTER_BITWISE_HPP 2 | #define BF_BLOOM_FILTER_BITWISE_HPP 3 | 4 | #include 5 | 6 | namespace bf { 7 | 8 | /// The bitwise Bloom filter. 9 | class bitwise_bloom_filter : public bloom_filter 10 | { 11 | public: 12 | /// Constructs a bitwise Bloom filter. 13 | /// @param k The number of hash functions in the first level. 14 | /// @param cells The number of cells in the first level. 15 | /// @param seed The seed for the first level. 
16 | bitwise_bloom_filter(size_t k, size_t cells, size_t seed = 0); 17 | 18 | using bloom_filter::add; 19 | using bloom_filter::lookup; 20 | 21 | virtual void add(object const& o) override; 22 | virtual size_t lookup(object const& o) const override; 23 | virtual void clear() override; 24 | 25 | private: 26 | /// Appends a new level. 27 | /// @post `levels_.size() += 1` 28 | void grow(); 29 | 30 | size_t k_; 31 | size_t cells_; 32 | size_t seed_; 33 | std::vector levels_; 34 | }; 35 | 36 | } // namespace bf 37 | 38 | #endif 39 | -------------------------------------------------------------------------------- /test/bf/util/error.h: -------------------------------------------------------------------------------- 1 | #ifndef UTIL_ERROR_H 2 | #define UTIL_ERROR_H 3 | 4 | #include 5 | 6 | namespace util { 7 | 8 | /// Holds an error message. 9 | class error { 10 | public: 11 | /// Default-constructs an empty error message. 12 | error() = default; 13 | 14 | /// Constructs an error from a C-string. 15 | /// @param msg The error message. 16 | explicit error(char const* msg) : msg_{msg} { 17 | } 18 | 19 | /// Constructs an error from a C++ string. 20 | /// @param msg The error message. 21 | explicit error(std::string msg) : msg_{std::move(msg)} { 22 | } 23 | 24 | error(error const&) = default; 25 | error(error&&) = default; 26 | error& operator=(error const&) = default; 27 | error& operator=(error&&) = default; 28 | 29 | explicit operator std::string() const { 30 | return msg_; 31 | } 32 | 33 | /// Retrieves the error message. 34 | /// @returns The error string. 
35 | std::string const& msg() const { 36 | return msg_; 37 | } 38 | 39 | private: 40 | std::string msg_; 41 | }; 42 | 43 | } // namespace util 44 | 45 | #endif 46 | -------------------------------------------------------------------------------- /bf/bloom_filter/stable.hpp: -------------------------------------------------------------------------------- 1 | #ifndef BF_BLOOM_FILTER_STABLE_HPP 2 | #define BF_BLOOM_FILTER_STABLE_HPP 3 | 4 | #include 5 | #include 6 | 7 | namespace bf { 8 | 9 | /// A stable Bloom filter. 10 | class stable_bloom_filter : public counting_bloom_filter 11 | { 12 | public: 13 | /// Constructs a stable Bloom filter. 14 | /// @param h The hasher. 15 | /// @param cells The number of cells. 16 | /// @param width The number of bits per cell. 17 | /// @param d The number of cells to decrement before adding an element. 18 | /// @pre `d <= cells` 19 | stable_bloom_filter(hasher h, size_t cells, size_t width, size_t d); 20 | 21 | /// Adds an item to the stable Bloom filter. 22 | /// This involves first decrementing *d* distinct cells uniformly at random and 23 | /// then setting the counter of *o* to all 1s. 24 | /// @param o The object to add. 
25 | virtual void add(object const& o) override; 26 | 27 | using bloom_filter::add; 28 | using bloom_filter::lookup; 29 | 30 | private: 31 | size_t d_; 32 | std::mt19937 generator_; 33 | std::uniform_int_distribution<> unif_; 34 | }; 35 | 36 | } // namespace bf 37 | 38 | #endif 39 | -------------------------------------------------------------------------------- /src/bloom_filter/a2.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | 3 | #include 4 | 5 | namespace bf { 6 | 7 | size_t a2_bloom_filter::k(double fp) { 8 | return std::floor(-std::log(1 - std::sqrt(1 - fp)) / std::log(2)); 9 | } 10 | 11 | size_t a2_bloom_filter::capacity(double fp, size_t cells) { 12 | return std::floor(cells / (2 * k(fp)) * std::log(2)); 13 | } 14 | 15 | a2_bloom_filter::a2_bloom_filter(size_t k, size_t cells, size_t capacity, 16 | size_t seed1, size_t seed2) 17 | : first_(make_hasher(k, seed1), cells / 2), 18 | second_(make_hasher(k, seed2), cells / 2), 19 | capacity_(capacity) { 20 | assert(cells % 2 == 0); 21 | } 22 | 23 | void a2_bloom_filter::add(object const& o) { 24 | if (first_.lookup(o)) 25 | return; 26 | first_.add(o); // FIXME: do not hash object twice for better performance. 27 | if (++items_ <= capacity_) 28 | return; 29 | items_ = 1; 30 | second_.clear(); 31 | first_.swap(second_); 32 | first_.add(o); 33 | } 34 | 35 | size_t a2_bloom_filter::lookup(object const& o) const { 36 | auto r1 = first_.lookup(o); 37 | return r1 > 0 ? 
r1 : second_.lookup(o); 38 | } 39 | 40 | void a2_bloom_filter::clear() { 41 | first_.clear(); 42 | second_.clear(); 43 | } 44 | 45 | } // namespace bf 46 | -------------------------------------------------------------------------------- /.clang-format: -------------------------------------------------------------------------------- 1 | --- 2 | AccessModifierOffset: -2 3 | AlignEscapedNewlinesLeft: false 4 | AlignTrailingComments: true 5 | AllowAllParametersOfDeclarationOnNextLine: false 6 | AllowShortIfStatementsOnASingleLine: false 7 | AllowShortLoopsOnASingleLine: false 8 | AllowShortFunctionsOnASingleLine: false 9 | AlwaysBreakBeforeMultilineStrings: true 10 | AlwaysBreakTemplateDeclarations: true 11 | BinPackParameters: true 12 | BreakBeforeBinaryOperators: NonAssignment 13 | BreakBeforeBraces: Attach 14 | BreakBeforeTernaryOperators: false 15 | ColumnLimit: 80 16 | ConstructorInitializerAllOnOneLineOrOnePerLine: true 17 | ConstructorInitializerIndentWidth: 4 18 | ContinuationIndentWidth: 2 19 | Cpp11BracedListStyle: true 20 | IndentCaseLabels: true 21 | IndentWidth: 2 22 | MaxEmptyLinesToKeep: 1 23 | NamespaceIndentation: None 24 | 25 | # Force pointers to the type 26 | DerivePointerAlignment: false 27 | PointerAlignment: Left 28 | 29 | # Put space after = and after control statements 30 | SpaceBeforeAssignmentOperators: true 31 | SpaceBeforeParens: ControlStatements 32 | 33 | SpaceInEmptyParentheses: false 34 | SpacesBeforeTrailingComments: 1 35 | SpacesInAngles: false 36 | SpacesInCStyleCastParentheses: false 37 | SpacesInParentheses: false 38 | Standard: Cpp11 39 | UseTab: Never 40 | BreakConstructorInitializersBeforeComma: false 41 | ... 
42 | -------------------------------------------------------------------------------- /cmake/cmake_uninstall.cmake.in: -------------------------------------------------------------------------------- 1 | function(uninstall_manifest manifestPath) 2 | file(READ "${manifestPath}" files) 3 | string(REGEX REPLACE "\n" ";" files "${files}") 4 | foreach (file ${files}) 5 | set(fileName $ENV{DESTDIR}${file}) 6 | 7 | if (EXISTS "${fileName}" OR IS_SYMLINK "${fileName}") 8 | message(STATUS "Uninstalling: ${fileName}") 9 | 10 | execute_process( 11 | COMMAND "@CMAKE_COMMAND@" -E remove "${fileName}" 12 | OUTPUT_VARIABLE rm_out 13 | RESULT_VARIABLE rm_retval 14 | ) 15 | 16 | if (NOT ${rm_retval} EQUAL 0) 17 | message(FATAL_ERROR "Problem when removing: ${fileName}") 18 | endif () 19 | else () 20 | message(STATUS "Does not exist: ${fileName}") 21 | endif () 22 | 23 | endforeach () 24 | endfunction(uninstall_manifest) 25 | 26 | file(GLOB install_manifests @CMAKE_CURRENT_BINARY_DIR@/install_manifest*.txt) 27 | 28 | if (install_manifests) 29 | foreach (manifest ${install_manifests}) 30 | uninstall_manifest(${manifest}) 31 | endforeach () 32 | else () 33 | message(FATAL_ERROR "Cannot find any install manifests in: " 34 | "\"@CMAKE_CURRENT_BINARY_DIR@/install_manifest*.txt\"") 35 | endif () 36 | -------------------------------------------------------------------------------- /bf/bloom_filter.hpp: -------------------------------------------------------------------------------- 1 | #ifndef BF_BLOOM_FILTER_HPP 2 | #define BF_BLOOM_FILTER_HPP 3 | 4 | #include 5 | 6 | namespace bf { 7 | 8 | /// The abstract Bloom filter interface. 9 | class bloom_filter 10 | { 11 | bloom_filter(bloom_filter const&) = delete; 12 | bloom_filter& operator=(bloom_filter const&) = delete; 13 | 14 | public: 15 | bloom_filter() = default; 16 | virtual ~bloom_filter() = default; 17 | 18 | /// Adds an element to the Bloom filter. 19 | /// @tparam T The type of the element to insert. 
20 | /// @param x An instance of type `T`. 21 | template 22 | void add(T const& x) 23 | { 24 | add(wrap(x)); 25 | } 26 | 27 | /// Adds an element to the Bloom filter. 28 | /// @param o A wrapped object. 29 | virtual void add(object const& o) = 0; 30 | 31 | /// Retrieves the count of an element. 32 | /// @tparam T The type of the element to query. 33 | /// @param x An instance of type `T`. 34 | /// @return A frequency estimate for *x*. 35 | template 36 | size_t lookup(T const& x) const 37 | { 38 | return lookup(wrap(x)); 39 | } 40 | 41 | /// Retrieves the count of an element. 42 | /// @param o A wrapped object. 43 | /// @return A frequency estimate for *o*. 44 | virtual size_t lookup(object const& o) const = 0; 45 | 46 | /// Removes all items from the Bloom filter. 47 | virtual void clear() = 0; 48 | }; 49 | 50 | } // namespace bf 51 | 52 | #endif 53 | -------------------------------------------------------------------------------- /src/bloom_filter/bitwise.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | 3 | namespace bf { 4 | 5 | bitwise_bloom_filter::bitwise_bloom_filter(size_t k, size_t cells, size_t seed) 6 | : k_(k), cells_(cells), seed_(seed) { 7 | grow(); 8 | } 9 | 10 | void bitwise_bloom_filter::add(object const& o) { 11 | size_t l = 0; 12 | // FIXME: do not hash element more than once for better performance. 
13 | while (l < levels_.size()) 14 | if (levels_[l].lookup(o)) { 15 | levels_[l++].remove(o); 16 | } else { 17 | levels_[l].add(o); 18 | return; 19 | } 20 | 21 | grow(); 22 | levels_.back().add(o); 23 | } 24 | 25 | size_t bitwise_bloom_filter::lookup(object const& o) const { 26 | size_t result = 0; 27 | for (size_t l = 0; l < levels_.size(); ++l) 28 | result += levels_[l].lookup(o) << l; 29 | return result; 30 | } 31 | 32 | void bitwise_bloom_filter::clear() { 33 | levels_.clear(); 34 | grow(); 35 | } 36 | 37 | void bitwise_bloom_filter::grow() { 38 | auto l = levels_.size(); 39 | 40 | // TODO: come up with a reasonable growth scheme. 41 | static size_t const min_size = 128; 42 | auto cells = l == 0 ? min_size : cells_ / (2 * l); 43 | if (cells < min_size) 44 | cells = min_size; 45 | 46 | size_t seed = seed_; 47 | std::minstd_rand0 prng(seed); 48 | for (size_t i = 0; i < l; ++i) 49 | seed = prng(); 50 | 51 | levels_.emplace_back(make_hasher(k_, seed), cells); 52 | } 53 | 54 | } // namespace bf 55 | -------------------------------------------------------------------------------- /COPYING: -------------------------------------------------------------------------------- 1 | Copyright (c) 2016, Matthias Vallentin 2 | All rights reserved. 3 | 4 | Redistribution and use in source and binary forms, with or without 5 | modification, are permitted provided that the following conditions are met: 6 | 7 | 1. Redistributions of source code must retain the above copyright 8 | notice, this list of conditions and the following disclaimer. 9 | 10 | 2. Redistributions in binary form must reproduce the above copyright 11 | notice, this list of conditions and the following disclaimer in the 12 | documentation and/or other materials provided with the distribution. 13 | 14 | 3. Neither the name of the copyright holder nor the names of its 15 | contributors may be used to endorse or promote products derived from 16 | this software without specific prior written permission. 
17 | 18 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 19 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 20 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 21 | ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE 22 | LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 23 | CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 24 | SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 25 | INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 26 | CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 27 | ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 28 | POSSIBILITY OF SUCH DAMAGE. 29 | -------------------------------------------------------------------------------- /src/hash.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | 3 | #include 4 | 5 | #include 6 | 7 | namespace bf { 8 | 9 | default_hash_function::default_hash_function(size_t seed) : h3_(seed) { 10 | } 11 | 12 | size_t default_hash_function::operator()(object const& o) const { 13 | // FIXME: fall back to a generic universal hash function (e.g., HMAC/MD5) for 14 | // too large objects. 15 | if (o.size() > max_obj_size) 16 | throw std::runtime_error("object too large"); 17 | return o.size() == 0 ? 
0 : h3_(o.data(), o.size()); 18 | } 19 | 20 | default_hasher::default_hasher(std::vector fns) 21 | : fns_(std::move(fns)) { 22 | } 23 | 24 | std::vector default_hasher::operator()(object const& o) const { 25 | std::vector d(fns_.size()); 26 | for (size_t i = 0; i < fns_.size(); ++i) 27 | d[i] = fns_[i](o); 28 | return d; 29 | } 30 | 31 | double_hasher::double_hasher(size_t k, hash_function h1, hash_function h2) 32 | : k_(k), h1_(std::move(h1)), h2_(std::move(h2)) { 33 | } 34 | 35 | std::vector double_hasher::operator()(object const& o) const { 36 | auto d1 = h1_(o); 37 | auto d2 = h2_(o); 38 | std::vector d(k_); 39 | for (size_t i = 0; i < d.size(); ++i) 40 | d[i] = d1 + i * d2; 41 | return d; 42 | } 43 | 44 | hasher make_hasher(size_t k, size_t seed, bool double_hashing) { 45 | assert(k > 0); 46 | std::minstd_rand0 prng(seed); 47 | if (double_hashing) { 48 | auto h1 = default_hash_function(prng()); 49 | auto h2 = default_hash_function(prng()); 50 | return double_hasher(k, std::move(h1), std::move(h2)); 51 | } else { 52 | std::vector fns(k); 53 | for (size_t i = 0; i < k; ++i) 54 | fns[i] = default_hash_function(prng()); 55 | return default_hasher(std::move(fns)); 56 | } 57 | } 58 | 59 | } // namespace bf 60 | -------------------------------------------------------------------------------- /bf/bloom_filter/a2.hpp: -------------------------------------------------------------------------------- 1 | #ifndef BF_BLOOM_FILTER_A2_HPP 2 | #define BF_BLOOM_FILTER_A2_HPP 3 | 4 | #include 5 | 6 | namespace bf { 7 | 8 | class a2_bloom_filter : public bloom_filter 9 | { 10 | public: 11 | /// Computes the optimal number of hash functions based on a desired 12 | /// false-positive rate. 13 | /// 14 | /// @param fp The desired false-positive rate. 15 | /// 16 | /// @return The optimal number of hash functions for *fp*. 17 | static size_t k(double fp); 18 | 19 | /// @param fp The desired false-positive rate. 20 | /// 21 | /// @param cells The number of cells (bits) to use. 
22 | /// 23 | /// @return The optimal capacity for *fp* and *cells*. 24 | static size_t capacity(double fp, size_t cells); 25 | 26 | /// Constructs an @f$A^2@f$ Bloom filter. 27 | /// 28 | /// @param k The number of hash functions to use in each Bloom filter. 29 | /// 30 | /// @param cells The number of cells to use for both Bloom filters, i.e., each 31 | /// Bloom filter uses `cells / 2` cells. 32 | /// @param capacity The maximum number of items in the active Bloom filter. 33 | /// @param seed1 The initial seed for the first Bloom filter. 34 | /// 35 | /// @param seed2 The initial seed for the second Bloom filter. 36 | /// 37 | /// @pre `cells % 2 == 0` 38 | a2_bloom_filter(size_t k, size_t cells, size_t capacity, 39 | size_t seed1 = 0, size_t seed2 = 0); 40 | 41 | using bloom_filter::add; 42 | using bloom_filter::lookup; 43 | 44 | virtual void add(object const& o) override; 45 | virtual size_t lookup(object const& o) const override; 46 | virtual void clear() override; 47 | 48 | private: 49 | basic_bloom_filter first_; 50 | basic_bloom_filter second_; 51 | size_t items_ = 0; ///< Number of items in the active Bloom filter. 52 | size_t capacity_; ///< Maximum number of items in the active Bloom filter. 
53 | }; 54 | 55 | } // namespace bf 56 | 57 | #endif 58 | -------------------------------------------------------------------------------- /test/bf/configuration.cc: -------------------------------------------------------------------------------- 1 | #include "configuration.h" 2 | 3 | #include 4 | 5 | std::string config::banner() const { 6 | std::stringstream ss; 7 | ss << " __ ____\n" 8 | " / /_ / __/\n" 9 | " / __ \\/ /_\n" 10 | " / /_/ / __/\n" 11 | "/_.___/_/\n"; 12 | 13 | return ss.str(); 14 | } 15 | 16 | void config::initialize() { 17 | auto& general = create_block("general options"); 18 | general.add('i', "input", "input file").single(); 19 | general.add('q', "query", "query file").single(); 20 | general.add('h', "help", "display this help"); 21 | general.add('n', "numeric", "interpret input as numeric values"); 22 | 23 | auto& bloomfilter = create_block("bloom filter options"); 24 | bloomfilter 25 | .add('t', "type", "basic|counting|spectral-mi|spectral-rm|bitwise|stable") 26 | .single(); 27 | bloomfilter.add('f', "fp-rate", "desired false-positive rate").init(0); 28 | bloomfilter.add('c', "capacity", "max number of expected elements").init(0); 29 | bloomfilter.add('m', "cells", "number of cells").init(0); 30 | bloomfilter.add('w', "width", "bits per cells").init(1); 31 | bloomfilter.add('p', "partition", "enable partitioning"); 32 | bloomfilter.add('e', "evict", "number of cells to evict (stable)").init(0); 33 | bloomfilter.add('k', "hash-functions", "number of hash functions").init(0); 34 | bloomfilter.add('d', "double-hashing", "use double-hashing"); 35 | bloomfilter.add('s', "seed", "specify a custom seed").init(0); 36 | 37 | auto& second = create_block("second bloom filter options"); 38 | second.add('M', "cells-2nd", "number of cells").init(0); 39 | second.add('W', "width-2nd", "bits per cells").init(1); 40 | second.add('K', "hash-functions-2nd", "number of hash functions").init(0); 41 | second.add('D', "double-hashing-2nd", "use double-hashing"); 42 | 
second.add('S', "seed-2nd", "specify a custom seed").init(0); 43 | } 44 | -------------------------------------------------------------------------------- /bf/hash.hpp: -------------------------------------------------------------------------------- 1 | #ifndef BF_HASH_POLICY_HPP 2 | #define BF_HASH_POLICY_HPP 3 | 4 | #include 5 | #include 6 | #include 7 | 8 | namespace bf { 9 | 10 | /// The hash digest type. 11 | typedef size_t digest; 12 | 13 | /// The hash function type. 14 | typedef std::function hash_function; 15 | 16 | /// A function that hashes an object *k* times. 17 | typedef std::function(object const&)> hasher; 18 | 19 | class default_hash_function 20 | { 21 | public: 22 | constexpr static size_t max_obj_size = 36; 23 | 24 | default_hash_function(size_t seed); 25 | 26 | size_t operator()(object const& o) const; 27 | 28 | private: 29 | h3 h3_; 30 | }; 31 | 32 | /// A hasher which hashes an object *k* times. 33 | class default_hasher 34 | { 35 | public: 36 | default_hasher(std::vector fns); 37 | 38 | std::vector operator()(object const& o) const; 39 | 40 | private: 41 | std::vector fns_; 42 | }; 43 | 44 | /// A hasher which hashes an object two times and generates *k* digests through 45 | /// a linear combination of the two digests. 46 | class double_hasher 47 | { 48 | public: 49 | double_hasher(size_t k, hash_function h1, hash_function h2); 50 | 51 | std::vector operator()(object const& o) const; 52 | 53 | private: 54 | size_t k_; 55 | hash_function h1_; 56 | hash_function h2_; 57 | }; 58 | 59 | /// Creates a default or double hasher with the default hash function, using 60 | /// seeds from a linear congruential PRNG. 61 | /// 62 | /// @param k The number of hash functions to use. 63 | /// 64 | /// @param seed The initial seed of the PRNG. 65 | /// 66 | /// @param double_hashing If `true`, the function constructs a ::double_hasher 67 | /// and a ::default_hasher otherwise. 68 | /// 69 | /// @return A ::hasher with the *k* hash functions. 
70 | /// 71 | /// @pre `k > 0` 72 | hasher make_hasher(size_t k, size_t seed = 0, bool double_hashing = false); 73 | 74 | } // namespace bf 75 | 76 | #endif 77 | -------------------------------------------------------------------------------- /bf/h3.hpp: -------------------------------------------------------------------------------- 1 | #ifndef BF_H3_HPP 2 | #define BF_H3_HPP 3 | 4 | #include 5 | #include 6 | 7 | namespace bf { 8 | 9 | /// An implementation of the H3 hash function family. 10 | template 11 | class h3 12 | { 13 | static size_t const bits_per_byte = 14 | std::numeric_limits::digits; 15 | 16 | public: 17 | constexpr static size_t byte_range = 18 | std::numeric_limits::max() + 1; 19 | 20 | h3(T seed = 0) 21 | { 22 | T bits[N * bits_per_byte]; 23 | std::minstd_rand0 prng(seed); 24 | for (size_t bit = 0; bit < N * bits_per_byte; ++bit) 25 | { 26 | bits[bit] = 0; 27 | for (size_t i = 0; i < sizeof(T)/2; i++) 28 | bits[bit] = (bits[bit] << 16) | (prng() & 0xFFFF); 29 | } 30 | 31 | for (size_t byte = 0; byte < N; ++byte) 32 | for (size_t val = 0; val < byte_range; ++val) 33 | { 34 | bytes_[byte][val] = 0; 35 | for (size_t bit = 0; bit < bits_per_byte; ++bit) 36 | if (val & (1 << bit)) 37 | bytes_[byte][val] ^= bits[byte * bits_per_byte + bit]; 38 | } 39 | } 40 | 41 | T operator()(void const* data, size_t size, size_t offset = 0) const 42 | { 43 | auto *p = static_cast(data); 44 | T result = 0; 45 | // Duff's Device. 
46 | auto n = (size + 7) / 8; 47 | #pragma GCC diagnostic push 48 | #pragma GCC diagnostic ignored "-Wimplicit-fallthrough" 49 | switch (size % 8) 50 | { 51 | case 0: do { result ^= bytes_[offset++][*p++]; 52 | case 7: result ^= bytes_[offset++][*p++]; 53 | case 6: result ^= bytes_[offset++][*p++]; 54 | case 5: result ^= bytes_[offset++][*p++]; 55 | case 4: result ^= bytes_[offset++][*p++]; 56 | case 3: result ^= bytes_[offset++][*p++]; 57 | case 2: result ^= bytes_[offset++][*p++]; 58 | case 1: result ^= bytes_[offset++][*p++]; 59 | } while ( --n > 0 ); 60 | } 61 | #pragma GCC diagnostic pop 62 | return result; 63 | } 64 | 65 | private: 66 | T bytes_[N][byte_range]; 67 | }; 68 | 69 | } // namespace bf 70 | 71 | #endif 72 | -------------------------------------------------------------------------------- /src/bloom_filter/basic.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | 3 | #include 4 | #include 5 | 6 | namespace bf { 7 | 8 | size_t basic_bloom_filter::m(double fp, size_t capacity) { 9 | auto ln2 = std::log(2); 10 | return std::ceil(-(capacity * std::log(fp) / ln2 / ln2)); 11 | } 12 | 13 | size_t basic_bloom_filter::k(size_t cells, size_t capacity) { 14 | auto frac = static_cast(cells) / static_cast(capacity); 15 | return std::ceil(frac * std::log(2)); 16 | } 17 | 18 | basic_bloom_filter::basic_bloom_filter(hasher h, size_t cells, bool partition) 19 | : hasher_(std::move(h)), bits_(cells), partition_(partition) { 20 | } 21 | 22 | basic_bloom_filter::basic_bloom_filter(double fp, size_t capacity, size_t seed, 23 | bool double_hashing, bool partition) 24 | : partition_(partition) { 25 | auto required_cells = m(fp, capacity); 26 | auto optimal_k = k(required_cells, capacity); 27 | if (partition_) 28 | required_cells += optimal_k - required_cells % optimal_k; 29 | bits_.resize(required_cells); 30 | hasher_ = make_hasher(optimal_k, seed, double_hashing); 31 | } 32 | 33 | 
basic_bloom_filter::basic_bloom_filter(hasher h, bitvector b) 34 | : hasher_(std::move(h)), bits_(std::move(b)), partition_(false) { 35 | // This overload has no partition argument; default to unpartitioned rather 36 | // than leaving the flag indeterminate. 37 | } 38 | 39 | basic_bloom_filter::basic_bloom_filter(basic_bloom_filter&& other) 40 | : hasher_(std::move(other.hasher_)), bits_(std::move(other.bits_)), 41 | partition_(other.partition_) { 42 | } 43 | 44 | void basic_bloom_filter::add(object const& o) { 45 | auto digests = hasher_(o); 46 | if (partition_) { 47 | assert(bits_.size() % digests.size() == 0); 48 | auto parts = bits_.size() / digests.size(); 49 | for (size_t i = 0; i < digests.size(); ++i) 50 | bits_.set(i * parts + (digests[i] % parts)); 51 | } else { 52 | for (auto d : digests) 53 | bits_.set(d % bits_.size()); 54 | } 55 | } 56 | 57 | size_t basic_bloom_filter::lookup(object const& o) const { 58 | auto digests = hasher_(o); 59 | if (partition_) { 60 | assert(bits_.size() % digests.size() == 0); 61 | auto parts = bits_.size() / digests.size(); 62 | for (size_t i = 0; i < digests.size(); ++i) 63 | if (!bits_[i * parts + (digests[i] % parts)]) 64 | return 0; 65 | } else { 66 | for (auto d : digests) 67 | if (!bits_[d % bits_.size()]) 68 | return 0; 69 | } 70 | 71 | return 1; 72 | } 73 | 74 | void basic_bloom_filter::clear() { 75 | bits_.reset(); 76 | } 77 | 78 | void basic_bloom_filter::remove(object const& o) { 79 | // Clear exactly the cells that add() sets, honoring the partition layout. 80 | auto digests = hasher_(o); 81 | if (partition_) { 82 | assert(bits_.size() % digests.size() == 0); 83 | auto parts = bits_.size() / digests.size(); 84 | for (size_t i = 0; i < digests.size(); ++i) 85 | bits_.reset(i * parts + (digests[i] % parts)); 86 | } else { 87 | for (auto d : digests) 88 | bits_.reset(d % bits_.size()); 89 | } 90 | } 91 | 92 | void basic_bloom_filter::swap(basic_bloom_filter& other) { 93 | using std::swap; 94 | swap(hasher_, other.hasher_); 95 | swap(bits_, other.bits_); 96 | swap(partition_, other.partition_); 97 | } 98 | 99 | bitvector const& basic_bloom_filter::storage() const { 100 | return bits_; 101 | } 102 | hasher const& basic_bloom_filter::hasher_function() const { 103 | return hasher_; 104 | } 105 | 106 | } // namespace bf 107 | -------------------------------------------------------------------------------- /src/counter_vector.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | 3 | #include 4 | 5 | namespace bf { 6 | 7 |
counter_vector::counter_vector(size_t cells, size_t width) 8 | : bits_(cells * width), width_(width) { 9 | assert(cells > 0); 10 | assert(width > 0); 11 | } 12 | 13 | counter_vector& counter_vector::operator|=(counter_vector const& other) { 14 | assert(size() == other.size()); 15 | assert(width() == other.width()); 16 | for (size_t cell = 0; cell < size(); ++cell) { 17 | bool carry = false; 18 | size_t lsb = cell * width_; 19 | for (size_t i = 0; i < width_; ++i) { 20 | bool b1 = bits_[lsb + i]; 21 | bool b2 = other.bits_[lsb + i]; 22 | bits_[lsb + i] = b1 ^ b2 ^ carry; 23 | carry = (b1 && b2) || (carry && (b1 != b2)); 24 | } 25 | if (carry) 26 | for (size_t i = 0; i < width_; ++i) 27 | bits_.set(lsb + i); 28 | } 29 | return *this; 30 | } 31 | 32 | counter_vector operator|(counter_vector const& x, counter_vector const& y) { 33 | counter_vector cv(x); 34 | return cv |= y; 35 | } 36 | 37 | bool counter_vector::increment(size_t cell, size_t value) { 38 | assert(cell < size()); 39 | assert(value != 0); 40 | size_t lsb = cell * width_; 41 | bool carry = false; 42 | for (size_t i = 0; i < width_; ++i) { 43 | bool b1 = bits_[lsb + i]; 44 | bool b2 = value & (1 << i); 45 | bits_[lsb + i] = b1 ^ b2 ^ carry; 46 | carry = (b1 && b2) || (carry && (b1 != b2)); 47 | } 48 | if (carry) 49 | for (size_t i = 0; i < width_; ++i) 50 | bits_[lsb + i] = true; 51 | return !carry; 52 | } 53 | 54 | bool counter_vector::decrement(size_t cell, size_t value) { 55 | assert(cell < size()); 56 | assert(value != 0); 57 | value = ~value + 1; // A - B := A + ~B + 1 58 | bool carry = false; 59 | size_t lsb = cell * width_; 60 | for (size_t i = 0; i < width_; ++i) { 61 | bool b1 = bits_[lsb + i]; 62 | bool b2 = value & (1 << i); 63 | bits_[lsb + i] = b1 ^ b2 ^ carry; 64 | carry = (b1 && b2) || (carry && (b1 != b2)); 65 | } 66 | return carry; 67 | } 68 | 69 | size_t counter_vector::count(size_t cell) const { 70 | assert(cell < size()); 71 | size_t cnt = 0, order = 1; 72 | size_t lsb = cell * width_; 
73 | for (auto i = lsb; i < lsb + width_; ++i, order <<= 1) 74 | if (bits_[i]) 75 | cnt |= order; 76 | return cnt; 77 | } 78 | 79 | void counter_vector::set(size_t cell, size_t value) { 80 | assert(cell < size()); 81 | assert(value <= max()); 82 | bitvector bits(width_, value); 83 | auto lsb = cell * width_; 84 | for (size_t i = 0; i < width_; ++i) 85 | bits_[lsb + i] = bits[i]; 86 | } 87 | 88 | void counter_vector::clear() { 89 | bits_.reset(); 90 | } 91 | 92 | size_t counter_vector::size() const { 93 | return bits_.size() / width_; 94 | } 95 | 96 | size_t counter_vector::max() const { 97 | using limits = std::numeric_limits; 98 | return limits::max() >> (limits::digits - width()); 99 | } 100 | 101 | size_t counter_vector::width() const { 102 | return width_; 103 | } 104 | 105 | } // namespace bf 106 | -------------------------------------------------------------------------------- /bf/counter_vector.hpp: -------------------------------------------------------------------------------- 1 | #ifndef BF_COUNTER_VECTOR_HPP 2 | #define BF_COUNTER_VECTOR_HPP 3 | 4 | #include 5 | #include 6 | #include 7 | 8 | namespace bf { 9 | 10 | /// The *fixed width* storage policy implements a bit vector where each 11 | /// cell represents a counter having a fixed number of bits. 12 | class counter_vector 13 | { 14 | /// Generates a string representation of a counter vector. 15 | /// The arguments have the same meaning as in bf::bitvector. 16 | friend std::string to_string(counter_vector const& v, bool all = false, 17 | size_t cut_off = 0) 18 | { 19 | return to_string(v.bits_, false, all, cut_off); 20 | } 21 | 22 | friend counter_vector operator|(counter_vector const& x, 23 | counter_vector const& y); 24 | 25 | public: 26 | /// Construct a counter vector of size @f$O(mw)@f$ where *m is the number of 27 | /// cells and *w the number of bits per cell. 28 | /// 29 | /// @param cells The number of cells. 30 | /// 31 | /// @param width The number of bits per cell. 
32 | /// 33 | /// @pre `cells > 0 && width > 0` 34 | counter_vector(size_t cells, size_t width); 35 | 36 | /// Merges another counter vector into this one by cell-wise, saturating addition. 37 | /// @param other The other counter vector. 38 | /// @return A reference to `*this`. 39 | /// @pre `size() == other.size() && width() == other.width()` 40 | counter_vector& operator|=(counter_vector const& other); 41 | 42 | /// Increments a cell counter by a given value. If the addition overflows 43 | /// the cell, all bits of the cell are set to 1. 44 | /// 45 | /// @param cell The cell index. 46 | /// 47 | /// @param value The value to add; only its lowest width() bits are considered. 48 | /// 49 | /// @return `true` if the increment succeeded, `false` if the addition 50 | /// overflowed the cell. 51 | /// 52 | /// @pre `cell < size() && value != 0` 53 | bool increment(size_t cell, size_t value = 1); 54 | 55 | /// Decrements a cell counter by a given value. 56 | /// 57 | /// @param cell The cell index. 58 | /// @param value The value that is subtracted from the current cell value. 59 | /// @return `true` if decrementing succeeded, `false` if *value* exceeded the 60 | /// cell's counter, in which case the cell holds the wrapped-around result. 61 | /// 62 | /// @pre `cell < size() && value != 0` 63 | bool decrement(size_t cell, size_t value = 1); 64 | 65 | /// Retrieves the counter of a cell. 66 | /// 67 | /// @param cell The cell index. 68 | /// 69 | /// @return The counter associated with *cell*. 70 | /// 71 | /// @pre `cell < size()` 72 | size_t count(size_t cell) const; 73 | 74 | /// Sets a cell to a given value. 75 | /// @param cell The cell whose value changes. 76 | /// @param value The new value of the cell. 77 | /// @pre `cell < size() && value <= max()` 78 | void set(size_t cell, size_t value); 79 | 80 | /// Sets all counter values to 0. 81 | void clear(); 82 | 83 | /// Retrieves the number of cells. 84 | /// @return The number of cells in the counter vector. 85 | size_t size() const; 86 | 87 | /// Retrieves the maximum possible counter value. 88 | /// @return The maximum counter value constrained by the cell width. 89 | size_t max() const; 90 | 91 | /// Retrieves the counter width.
92 | /// @return The number of bits per cell. 93 | size_t width() const; 94 | 95 | private: 96 | bitvector bits_; 97 | size_t width_; 98 | }; 99 | 100 | 101 | } // namespace bf 102 | 103 | #endif 104 | -------------------------------------------------------------------------------- /CMakeLists.txt: -------------------------------------------------------------------------------- 1 | # -- Project Setup ------------------------------------------------------------ 2 | 3 | project(libbf CXX) 4 | 5 | include(CTest) 6 | 7 | cmake_minimum_required(VERSION 2.6 FATAL_ERROR) 8 | 9 | set(CMAKE_MODULE_PATH ${CMAKE_CURRENT_SOURCE_DIR}/cmake) 10 | 11 | # Support 'make uninstall'. 12 | if ("${PROJECT_SOURCE_DIR}" STREQUAL "${CMAKE_SOURCE_DIR}") 13 | configure_file("${CMAKE_CURRENT_SOURCE_DIR}/cmake/cmake_uninstall.cmake.in" 14 | "${CMAKE_CURRENT_BINARY_DIR}/cmake_uninstall.cmake" 15 | @ONLY) 16 | add_custom_target(uninstall COMMAND 17 | ${CMAKE_COMMAND} -P ${CMAKE_CURRENT_BINARY_DIR}/cmake_uninstall.cmake) 18 | endif () 19 | 20 | # Prohibit in-source builds. 21 | if ("${CMAKE_SOURCE_DIR}" STREQUAL "${CMAKE_BINARY_DIR}") 22 | message(FATAL_ERROR "In-source builds are not allowed. 
Please use " 23 | "./configure to choose a build directory and " 24 | "initialize the build configuration.") 25 | endif () 26 | 27 | # Silence warning CMP0042 28 | if (APPLE AND NOT DEFINED CMAKE_MACOSX_RPATH) 29 | set(CMAKE_MACOSX_RPATH true) 30 | endif() 31 | 32 | set(EXECUTABLE_OUTPUT_PATH ${CMAKE_CURRENT_BINARY_DIR}/bin 33 | CACHE PATH "Single directory for all executables") 34 | 35 | set (LIBRARY_OUTPUT_PATH ${CMAKE_CURRENT_BINARY_DIR}/lib 36 | CACHE PATH "Single directory for all libraries") 37 | 38 | # -- Dependencies ------------------------------------------------------------- 39 | 40 | find_package(Threads) 41 | if (NOT Threads_FOUND) 42 | message(FATAL_ERROR "Could not find system threading libraries") 43 | endif () 44 | 45 | set(CMAKE_CXX_FLAGS "-Wall -Wextra -std=c++11") 46 | set(CMAKE_CXX_FLAGS_DEBUG "-O0 -g") 47 | set(CMAKE_CXX_FLAGS_MINSIZEREL "-Os -DNDEBUG") 48 | set(CMAKE_CXX_FLAGS_RELEASE "-O3 -DNDEBUG") 49 | set(CMAKE_CXX_FLAGS_RELWITHDEBINFO "-O2 -g") 50 | 51 | if (ENABLE_DEBUG) 52 | set(CMAKE_BUILD_TYPE Debug) 53 | else () 54 | set(CMAKE_BUILD_TYPE Release) 55 | endif () 56 | 57 | if ("${CMAKE_CXX_COMPILER_ID}" MATCHES "GNU") 58 | execute_process( 59 | COMMAND ${CMAKE_CXX_COMPILER} -dumpversion OUTPUT_VARIABLE GCC_VERSION) 60 | if (NOT (GCC_VERSION VERSION_GREATER 4.7 OR GCC_VERSION VERSION_EQUAL 4.7)) 61 | message(FATAL_ERROR "${PROJECT_NAME} requires g++ 4.7 or greater.") 62 | endif () 63 | elseif ("${CMAKE_CXX_COMPILER_ID}" MATCHES "Clang") 64 | set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -stdlib=libc++") 65 | else () 66 | message(FATAL_ERROR "Your C++ compiler does not support C++11.") 67 | endif () 68 | 69 | # -- libbf ------------------------------------------------------------------- 70 | 71 | include_directories(${CMAKE_SOURCE_DIR}) 72 | 73 | set(libbf_sources 74 | src/bitvector.cpp 75 | src/counter_vector.cpp 76 | src/hash.cpp 77 | src/bloom_filter/a2.cpp 78 | src/bloom_filter/basic.cpp 79 | src/bloom_filter/bitwise.cpp 80 | 
src/bloom_filter/counting.cpp 81 | src/bloom_filter/stable.cpp 82 | ) 83 | 84 | add_library(libbf_static STATIC ${libbf_sources}) 85 | set_target_properties(libbf_static PROPERTIES OUTPUT_NAME "bf") 86 | set_target_properties(libbf_static PROPERTIES POSITION_INDEPENDENT_CODE TRUE) 87 | 88 | add_library(libbf_shared SHARED ${libbf_sources}) 89 | set_target_properties(libbf_shared PROPERTIES OUTPUT_NAME "bf") 90 | set_target_properties(libbf_shared PROPERTIES POSITION_INDEPENDENT_CODE TRUE) 91 | 92 | install(TARGETS libbf_static DESTINATION lib) 93 | install(TARGETS libbf_shared DESTINATION lib) 94 | install(DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}/bf DESTINATION include) 95 | 96 | add_subdirectory(test) 97 | -------------------------------------------------------------------------------- /bf/bloom_filter/basic.hpp: -------------------------------------------------------------------------------- 1 | #ifndef BF_BLOOM_FILTER_BASIC_HPP 2 | #define BF_BLOOM_FILTER_BASIC_HPP 3 | 4 | #include 5 | #include 6 | #include 7 | #include 8 | 9 | namespace bf { 10 | 11 | /// The basic Bloom filter. 12 | /// 13 | /// @note This Bloom filter does not use partitioning because it results in 14 | /// slightly worse performance because partitioned Bloom filters tend to have 15 | /// more 1s than non-partitioned filters. 16 | class basic_bloom_filter : public bloom_filter 17 | { 18 | public: 19 | /// Computes the number of cells based on a false-positive rate and capacity. 20 | /// 21 | /// @param fp The desired false-positive rate 22 | /// 23 | /// @param capacity The maximum number of items. 24 | /// 25 | /// @return The number of cells to use that guarantee *fp* for *capacity* 26 | /// elements. 27 | static size_t m(double fp, size_t capacity); 28 | 29 | /// Computes @f$k^*@f$, the optimal number of hash functions for a given 30 | /// Bloom filter size (in terms of cells) and capacity. 31 | /// 32 | /// @param cells The number of cells in the Bloom filter (aka. 
*m*) 33 | /// 34 | /// @param capacity The maximum number of elements that can guarantee *fp*. 35 | /// 36 | /// @return The optimal number of hash functions for *cells* and *capacity*. 37 | static size_t k(size_t cells, size_t capacity); 38 | 39 | /// Constructs a basic Bloom filter. 40 | /// @param h The hasher to use. 41 | /// @param cells The number of cells in the bit vector. 42 | /// @param partition Whether to partition the bit vector per hash function. 43 | basic_bloom_filter(hasher h, size_t cells, bool partition = false); 44 | 45 | /// Constructs a basic Bloom filter given a desired false-positive 46 | /// probability and an expected number of elements. The implementation 47 | /// computes the optimal number of hash functions and the required space. 48 | /// 49 | /// @param fp The desired false-positive probability. 50 | /// 51 | /// @param capacity The expected number of elements. 52 | /// 53 | /// @param seed The initial seed used to construct the hash functions. 54 | /// 55 | /// @param double_hashing Flag indicating whether to use default or double 56 | /// hashing. 57 | /// 58 | /// @param partition Whether to partition the bit vector per hash function. 59 | basic_bloom_filter(double fp, size_t capacity, size_t seed = 0, 60 | bool double_hashing = true, bool partition = true); 61 | 62 | /// Constructs a basic Bloom filter given a hasher and a bitvector. 63 | /// 64 | /// @param h The hasher to use. 65 | /// @param b The underlying bitvector of the Bloom filter. 66 | basic_bloom_filter(hasher h, bitvector b); 67 | 68 | basic_bloom_filter(basic_bloom_filter&&); 69 | 70 | using bloom_filter::add; 71 | using bloom_filter::lookup; 72 | 73 | virtual void add(object const& o) override; 74 | virtual size_t lookup(object const& o) const override; 75 | virtual void clear() override; 76 | 77 | /// Removes an object from the Bloom filter.
78 | /// May introduce false negatives because the bitvector indices of the object 79 | /// to remove may be shared with other objects. 80 | /// 81 | /// @param o The object to remove. 82 | void remove(object const& o); 83 | 84 | /// Swaps two basic Bloom filters. 85 | /// @param other The other basic Bloom filter. 86 | void swap(basic_bloom_filter& other); 87 | 88 | /// Returns the underlying storage of the Bloom filter. 89 | bitvector const& storage() const; 90 | 91 | /// Returns the hasher of the Bloom filter. 92 | hasher const& hasher_function() const; 93 | 94 | private: 95 | hasher hasher_; 96 | bitvector bits_; 97 | bool partition_; 98 | }; 99 | 100 | } // namespace bf 101 | 102 | #endif 103 | -------------------------------------------------------------------------------- /configure: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | # Convenience wrapper for easily viewing/setting options that 3 | # the project's CMake scripts will recognize 4 | 5 | command="$0 $*" 6 | sourcedir="$( cd "$( dirname "$0" )" && pwd )" 7 | type cmake > /dev/null 2>&1 || { 8 | echo "\ 9 | This package requires CMake, please install it first, then you may 10 | use this configure script to access CMake equivalent functionality.\ 11 | " >&2; 12 | exit 1; 13 | } 14 | 15 | usage="\ 16 | Usage: $0 [OPTION]... [VAR=VALUE]... 
17 | 18 | Build Options: 19 | --builddir=DIR place build files in directory [build] 20 | --generator=GENERATOR CMake generator to use (see cmake --help) 21 | 22 | Installation Directories: 23 | --prefix=PREFIX installation directory [/usr/local] 24 | 25 | Optional Features: 26 | --enable-debug compile in debugging mode 27 | --enable-perftools use Google perftools 28 | 29 | Required Packages in Non-Standard Locations: 30 | --with-boost=PATH path to Boost install root 31 | 32 | Influential Environment Variables (only on first invocation 33 | per build directory): 34 | CXX C++ compiler command 35 | CXXFLAGS C++ compiler flags 36 | " 37 | 38 | # Appends a CMake cache entry definition to the CMakeCacheEntries variable. 39 | # $1 is the cache entry variable name. 40 | # $2 is the cache entry variable type. 41 | # $3 is the cache entry variable value. 42 | append_cache_entry () { 43 | CMakeCacheEntries="$CMakeCacheEntries -D $1:$2=$3" 44 | } 45 | 46 | # Set defaults 47 | builddir=build 48 | CMakeCacheEntries="" 49 | append_cache_entry CMAKE_INSTALL_PREFIX PATH /usr/local 50 | append_cache_entry ENABLE_DEBUG BOOL false 51 | 52 | # parse arguments 53 | while [ $# -ne 0 ]; do 54 | case "$1" in 55 | -*=*) optarg=`echo "$1" | sed 's/[-_a-zA-Z0-9]*=//'` ;; 56 | *) optarg= ;; 57 | esac 58 | 59 | case "$1" in 60 | --help|-h) 61 | echo "${usage}" 1>&2 62 | exit 1 63 | ;; 64 | --builddir=*) 65 | builddir=$optarg 66 | ;; 67 | --generator=*) 68 | CMakeGenerator="$optarg" 69 | ;; 70 | --prefix=*) 71 | append_cache_entry CMAKE_INSTALL_PREFIX PATH $optarg 72 | ;; 73 | --enable-debug) 74 | append_cache_entry ENABLE_DEBUG BOOL true 75 | ;; 76 | --with-boost=*) 77 | append_cache_entry BOOST_ROOT PATH $optarg 78 | ;; 79 | *) 80 | echo "Invalid option '$1'. Try $0 --help to see available options." 
81 | exit 1 82 | ;; 83 | esac 84 | shift 85 | done 86 | 87 | if [ -d $builddir ]; then 88 | if [ -f $builddir/CMakeCache.txt ]; then 89 | rm -f $builddir/CMakeCache.txt 90 | fi 91 | else 92 | mkdir -p $builddir 93 | fi 94 | 95 | echo "Build Directory : $builddir" 96 | echo "Source Directory: $sourcedir" 97 | cd $builddir 98 | 99 | if [ -n "$CMakeGenerator" ]; then 100 | cmake -G "$CMakeGenerator" $CMakeCacheEntries $sourcedir 101 | else 102 | cmake $CMakeCacheEntries $sourcedir 103 | fi 104 | 105 | echo "# This is the command used to configure this build" > config.status 106 | if [ -n "$CC" ]; then 107 | printf "CC=%s" $CC >> config.status 108 | printf ' ' >> config.status 109 | fi 110 | if [ -n "$CXX" ]; then 111 | printf "CXX=%s" $CXX >> config.status 112 | printf ' ' >> config.status 113 | fi 114 | echo $command >> config.status 115 | chmod u+x config.status 116 | 117 | cd .. 118 | 119 | printf "DIRS := %s\n\n" $builddir > $sourcedir/Makefile 120 | makefile=$(cat <<'EOT' 121 | all: 122 | @for i in $(DIRS); do $(MAKE) -C $$i $@; done 123 | 124 | doc: 125 | $(MAKE) -C $@ 126 | 127 | test: 128 | @for i in $(DIRS); do $(MAKE) -C $$i $@; done 129 | 130 | install: 131 | @for i in $(DIRS); do $(MAKE) -C $$i $@; done 132 | 133 | uninstall: 134 | @for i in $(DIRS); do $(MAKE) -C $$i $@; done 135 | 136 | clean: 137 | @for i in $(DIRS); do $(MAKE) -C $$i $@; done 138 | 139 | distclean: 140 | rm -rf $(DIRS) Makefile 141 | 142 | .PHONY: all doc test install uninstall clean distclean 143 | EOT 144 | ) 145 | 146 | echo "$makefile" >> $sourcedir/Makefile 147 | -------------------------------------------------------------------------------- /test/bf/util/trial.h: -------------------------------------------------------------------------------- 1 | #ifndef UTIL_TRIAL_H 2 | #define UTIL_TRIAL_H 3 | 4 | #include "util/error.h" 5 | #include 6 | 7 | namespace util { 8 | 9 | /// Represents the result of a computation which can either complete 10 | /// successfully with an instance of 
type `T` or fail with an ::error. 11 | /// 12 | /// @tparam The type of the result. 13 | template 14 | class trial { 15 | public: 16 | /// Constructs a trial by forwarding arguments to the underlying type. 17 | /// @param args The arguments to pass to `T`'s constructor. 18 | /// @post The trial is *engaged*, i.e., `*this == true`. 19 | trial(T x) : engaged_{true} { 20 | new (&value_) T(std::move(x)); 21 | } 22 | 23 | /// Constructs a trial from an error. 24 | /// @param e The error. 25 | /// @post The trial is *disengaged*, i.e., `*this == false`. 26 | trial(error e) : engaged_{false} { 27 | new (&error_) error{std::move(e)}; 28 | } 29 | 30 | /// Copy-constructs a trial. 31 | /// @param other The other trial. 32 | trial(trial const& other) { 33 | construct(other); 34 | } 35 | 36 | /// Move-constructs a trial. 37 | /// @param other The other trial. 38 | trial(trial&& other) { 39 | construct(std::move(other)); 40 | } 41 | 42 | ~trial() { 43 | destroy(); 44 | } 45 | 46 | /// Assigns another trial to this instance. 47 | /// @param other The RHS of the assignment. 48 | /// @returns A reference to `*this`. 49 | trial& operator=(trial other) { 50 | construct(std::move(other)); 51 | return *this; 52 | } 53 | 54 | /// Assigns a trial by constructing an instance of `T` from the RHS 55 | /// expression. 56 | /// 57 | /// @param args The arguments to forward to `T`'s constructor. 58 | /// @returns A reference to `*this`. 59 | trial& operator=(T x) { 60 | destroy(); 61 | engaged_ = true; 62 | new (&value_) T(std::move(x)); 63 | return *this; 64 | } 65 | 66 | /// Assigns an error to this trial instance. 67 | /// @param e The error. 68 | /// @returns A reference to `*this`. 69 | /// @post The trial is *disengaged*, i.e., `*this == false`. 70 | trial& operator=(error e) { 71 | destroy(); 72 | engaged_ = false; 73 | new (&value_) error{std::move(e)}; 74 | return *this; 75 | } 76 | 77 | /// Checks whether the trial is engaged. 78 | /// @returns `true` iff the trial is engaged. 
79 | explicit operator bool() const { 80 | return engaged_; 81 | } 82 | 83 | /// Shorthand for ::value. 84 | T& operator*() { 85 | return value(); 86 | } 87 | 88 | /// Shorthand for ::value. 89 | T const& operator*() const { 90 | return value(); 91 | } 92 | 93 | /// Shorthand for ::value. 94 | T* operator->() { 95 | return &value(); 96 | } 97 | 98 | /// Shorthand for ::value. 99 | T const* operator->() const { 100 | return &value(); 101 | } 102 | 103 | /// Retrieves the value of the trial. 104 | /// @returns A mutable reference to the contained value. 105 | /// @pre `*this == true`. 106 | T& value() { 107 | assert(engaged_); 108 | return value_; 109 | } 110 | 111 | /// Retrieves the value of the trial. 112 | /// @returns The contained value. 113 | /// @pre `*this == true`. 114 | T const& value() const { 115 | assert(engaged_); 116 | return value_; 117 | } 118 | 119 | /// Retrieves the error of the trial. 120 | /// @returns The contained error. 121 | /// @pre `*this == false`. 122 | error const& failure() const { 123 | assert(!engaged_); 124 | return error_; 125 | } 126 | 127 | private: 128 | void construct(trial const& other) { 129 | if (other.engaged_) { 130 | engaged_ = true; 131 | new (&value_) T(other.value_); 132 | } else { 133 | engaged_ = false; 134 | new (&error_) error{other.error_}; 135 | } 136 | } 137 | 138 | void construct(trial&& other) { 139 | if (other.engaged_) { 140 | engaged_ = true; 141 | new (&value_) T(std::move(other.value_)); 142 | } else { 143 | engaged_ = false; 144 | new (&error_) error{std::move(other.error_)}; 145 | } 146 | } 147 | 148 | void destroy() { 149 | if (engaged_) 150 | value_.~T(); 151 | else 152 | error_.~error(); 153 | } 154 | 155 | union { 156 | T value_; 157 | error error_; 158 | }; 159 | 160 | bool engaged_; 161 | }; 162 | 163 | /// An empty struct that represents a `void` ::trial. The pattern 164 | /// `trial` shall be used for functions that may generate an error but 165 | /// would otherwise return `void`. 
166 | struct nothing {}; 167 | 168 | static constexpr auto nil = nothing{}; 169 | 170 | } // namespace util 171 | 172 | #endif 173 | -------------------------------------------------------------------------------- /bf/bloom_filter/counting.hpp: -------------------------------------------------------------------------------- 1 | #ifndef BF_BLOOM_FILTER_COUNTING_HPP 2 | #define BF_BLOOM_FILTER_COUNTING_HPP 3 | 4 | #include 5 | #include 6 | #include 7 | 8 | namespace bf { 9 | 10 | class spectral_mi_bloom_filter; 11 | class spectral_rm_bloom_filter; 12 | 13 | /// The counting Bloom filter. 14 | class counting_bloom_filter : public bloom_filter 15 | { 16 | friend spectral_mi_bloom_filter; 17 | friend spectral_rm_bloom_filter; 18 | 19 | public: 20 | /// Constructs a counting Bloom filter. 21 | /// @param h The hasher. 22 | /// @param cells The number of cells. 23 | /// @param width The number of bits per cell. 24 | /// @param partition Whether to partition the bit vector per hash function. 25 | counting_bloom_filter(hasher h, size_t cells, size_t width, 26 | bool partition = false); 27 | 28 | /// Move-constructs a counting Bloom filter. 29 | counting_bloom_filter(counting_bloom_filter&&) = default; 30 | 31 | using bloom_filter::add; 32 | using bloom_filter::lookup; 33 | 34 | virtual void add(object const& o) override; 35 | virtual size_t lookup(object const& o) const override; 36 | virtual void clear() override; 37 | 38 | /// Removes an element. 39 | /// @param o The object whose cells to decrement by 1. 40 | void remove(object const& o); 41 | 42 | template 43 | void remove(T const& x) 44 | { 45 | remove(wrap(x)); 46 | } 47 | 48 | protected: 49 | /// Maps an object to the indices in the underlying counter vector. 50 | /// @param o The object to map. 51 | /// @return The indices corresponding to the digests of *o*. 52 | std::vector find_indices(object const& o) const; 53 | 54 | /// Finds the minimum value in a list of arbitrary indices. 
55 | /// @param indices The indices over which to compute the minimum. 56 | /// @return The minimum counter value over *indices*. 57 | size_t find_minimum(std::vector const& indices) const; 58 | 59 | /// Finds one or more minimum indices for a list of arbitrary indices. 60 | /// @param indices The indices over which to compute the minimum. 61 | /// @return The indices corresponding to the minima in the counter vector. 62 | std::vector find_minima(std::vector const& indices) const; 63 | 64 | /// Increments a given set of indices in the underlying counter vector. 65 | /// @param indices The indices to increment. 66 | /// @return `true` iff no counter overflowed. 67 | bool increment(std::vector const& indices, size_t value = 1); 68 | 69 | /// Decrements a given set of indices in the underlying counter vector. 70 | /// @param indices The indices to decrement. 71 | /// @return `true` iff no counter underflowed. 72 | bool decrement(std::vector const& indices, size_t value = 1); 73 | 74 | /// Retrieves the counter for given cell index. 75 | /// @param index The index of the counter vector. 76 | /// @pre `index < cells.size()` 77 | size_t count(size_t index) const; 78 | 79 | hasher hasher_; 80 | counter_vector cells_; 81 | bool partition_; 82 | }; 83 | 84 | /// A spectral Bloom filter with minimum increase (MI) policy. 85 | class spectral_mi_bloom_filter : public counting_bloom_filter 86 | { 87 | public: 88 | /// Constructs a spectral MI Bloom filter. 89 | /// @param h The hasher. 90 | /// @param cells The number of cells. 91 | /// @param width The number of bits per cell. 92 | /// @param partition Whether to partition the bit vector per hash function. 
93 | spectral_mi_bloom_filter(hasher h, size_t cells, size_t width, 94 | bool partition = false); 95 | 96 | using bloom_filter::add; 97 | using bloom_filter::lookup; 98 | using counting_bloom_filter::remove; 99 | virtual void add(object const& o) override; 100 | }; 101 | 102 | /// A spectral Bloom filter with recurring minimum (RM) policy. 103 | class spectral_rm_bloom_filter : public bloom_filter 104 | { 105 | public: 106 | /// Constructs a spectral RM Bloom filter. 107 | /// @param h1 The first hasher. 108 | /// @param cells1 The number of cells in the first Bloom filter. 109 | /// @param width1 The number of bits per cell in the first Bloom filter. 110 | /// @param h2 The second hasher. 111 | /// @param cells2 The number of cells in the second Bloom filter. 112 | /// @param width2 The number of bits per cell in the second Bloom filter. 113 | /// @param partition Whether to partition the bit vector per hash function. 114 | spectral_rm_bloom_filter(hasher h1, size_t cells1, size_t width1, 115 | hasher h2, size_t cells2, size_t width2, 116 | bool partition = false); 117 | 118 | using bloom_filter::add; 119 | using bloom_filter::lookup; 120 | virtual void add(object const& o) override; 121 | virtual size_t lookup(object const& o) const override; 122 | virtual void clear() override; 123 | 124 | /// Removes an element. 125 | /// @param o The object whose cells to decrement by 1. 126 | void remove(object const& o); 127 | 128 | private: 129 | counting_bloom_filter first_; 130 | counting_bloom_filter second_; 131 | }; 132 | 133 | } // namespace bf 134 | 135 | #endif 136 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | ⚠️ **Maintainer needed**: unfortunately I lack the cycles to maintain this 2 | project. I would gladly support anyone who is willing to step in. 
3 | 4 | **libbf** is a C++11 library which implements [various Bloom 5 | filters][blog-post], including: 6 | 7 | - Basic 8 | - Counting 9 | - Spectral MI 10 | - Spectral RM 11 | - Bitwise 12 | - A^2 13 | - Stable 14 | 15 | [blog-post]: http://matthias.vallentin.net/blog/2011/06/a-garden-variety-of-bloom-filters/ 16 | 17 | Synopsis 18 | ======== 19 | 20 | #include <iostream> 21 | #include <bf/all.hpp> 22 | 23 | int main() 24 | { 25 | bf::basic_bloom_filter b(0.8, 100); 26 | 27 | // Add two elements. 28 | b.add("foo"); 29 | b.add(42); 30 | 31 | // Test set membership 32 | std::cout << b.lookup("foo") << std::endl; // 1 33 | std::cout << b.lookup("bar") << std::endl; // 0 34 | std::cout << b.lookup(42) << std::endl; // 1 35 | 36 | // Remove all elements. 37 | b.clear(); 38 | std::cout << b.lookup("foo") << std::endl; // 0 39 | std::cout << b.lookup(42) << std::endl; // 0 40 | 41 | return 0; 42 | } 43 | 44 | Requirements 45 | ============ 46 | 47 | - A C++11 compiler (GCC >= 4.7 or Clang >= 3.2) 48 | - CMake (>= 2.8) 49 | 50 | Installation 51 | ============ 52 | 53 | The build process uses CMake, wrapped in autotools-like scripts. The configure 54 | script honors the `CXX` environment variable to select a specific C++ compiler. 55 | For example, the following steps compile libbf with Clang and install it under 56 | `PREFIX`: 57 | 58 | CXX=clang++ ./configure --prefix=PREFIX 59 | make 60 | make test 61 | make install 62 | 63 | Documentation 64 | ============= 65 | 66 | The most recent version of the Doxygen API documentation exists at 67 | . Alternatively, you can build the 68 | documentation locally via `make doc` and then browse to 69 | `doc/gh-pages/api/index.html`. 70 | 71 | Usage 72 | ===== 73 | 74 | After having installed libbf, you can use it in your application by including 75 | the header file `bf/all.hpp` and linking against the library.
All data structures 76 | reside in the namespace `bf` and the following examples assume: 77 | 78 | using namespace bf; 79 | 80 | Each Bloom filter inherits from the abstract base class `bloom_filter`, which 81 | provides addition and lookup via the virtual functions `add` and `lookup`. 82 | These functions take an *object* as argument, which serves as a light-weight view 83 | over sequential data for hashing. 84 | 85 | For example, you can create a basic Bloom filter with a desired 86 | false-positive probability and capacity as follows: 87 | 88 | // Construction. 89 | bloom_filter* bf = new basic_bloom_filter(0.8, 100); 90 | 91 | // Addition. 92 | bf->add("foo"); 93 | bf->add(42); 94 | 95 | // Lookup. 96 | assert(bf->lookup("foo") == 1); 97 | assert(bf->lookup(42) == 1); 98 | 99 | // Remove all elements from the Bloom filter. 100 | bf->clear(); 101 | 102 | In this case, libbf computes the optimal number of hash functions needed to 103 | achieve the desired false-positive rate, which holds until the capacity has been 104 | reached (80% and 100 distinct elements, in the above example). Alternatively, 105 | you can construct a basic Bloom filter by specifying the number of hash 106 | functions and the number of cells in the underlying bit vector: 107 | 108 | bloom_filter* bf = new basic_bloom_filter(make_hasher(3), 1024); 109 | 110 | Since not all Bloom filter implementations come with closed-form solutions 111 | based on false-positive probabilities, most constructors use this latter form 112 | of explicit resource provisioning. 113 | 114 | In the above example, the free function `make_hasher` constructs a *hasher*, an 115 | abstraction for hashing objects *k* times. There currently exist two different 116 | hashers, a `default_hasher` and a 117 | [`double_hasher`](http://www.eecs.harvard.edu/~kirsch/pubs/bbbf/rsa.pdf). 
The 118 | latter uses a linear combination of two pairwise-independent, universal hash 119 | functions to produce the *k* digests, whereas the former merely hashes the 120 | object *k* times. 121 | 122 | Evaluation 123 | ---------- 124 | 125 | libbf also ships with a small Bloom filter tool `bf` in the test directory. 126 | This program supports evaluating the accuracy of the different Bloom filter 127 | flavors with respect to their false-positive and false-negative rates. Have a 128 | look at the console help (`-h` or `--help`) for detailed usage instructions. 129 | 130 | The tool operates in two phases: 131 | 132 | 1. Read input from a file and insert it into a Bloom filter 133 | 2. Query the Bloom filter and compare the result to the ground truth 134 | 135 | For example, consider the following input file: 136 | 137 | foo 138 | bar 139 | baz 140 | baz 141 | foo 142 | 143 | From this input file, you can generate the real ground truth file as follows: 144 | 145 | sort input.txt | uniq -c | tee query.txt 146 | 1 bar 147 | 2 baz 148 | 2 foo 149 | 150 | The tool `bf` will compute false-positive and false-negative counts for each 151 | element, based on the ground truth given. In the case of a simple counting 152 | Bloom filter, an invocation may look like this: 153 | 154 | bf -t counting -m 2 -k 3 -i input.txt -q query.txt | column -t 155 | 156 | Yielding the following output: 157 | 158 | TN TP FP FN G C E 159 | 0 1 0 0 1 1 bar 160 | 0 1 0 1 2 1 baz 161 | 0 1 0 2 2 1 foo 162 | 163 | The column headings denote true negatives (`TN`), true positives (`TP`), false 164 | positives (`FP`), false negatives (`FN`), ground truth count (`G`), actual 165 | count (`C`), and the queried element (`E`). The counts are cumulative to support 166 | incremental evaluation. 167 | 168 | Versioning 169 | ========== 170 | We follow [Semantic Versioning](http://semver.org/spec/v1.0.0.html). 
The version X.Y.Z indicates: 171 | 172 | * X is the major version (backward-incompatible), 173 | * Y is the minor version (backward-compatible), and 174 | * Z is the patch version (backward-compatible bug fix). 175 | 176 | License 177 | ======== 178 | 179 | libbf comes with a BSD-style license (see [COPYING](COPYING) for details). 180 | 181 | -------------------------------------------------------------------------------- /src/bloom_filter/counting.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | 3 | #include 4 | #include 5 | 6 | namespace bf { 7 | 8 | counting_bloom_filter::counting_bloom_filter(hasher h, size_t cells, 9 | size_t width, bool partition) 10 | : hasher_(std::move(h)), cells_(cells, width), partition_(partition) { 11 | } 12 | 13 | void counting_bloom_filter::add(object const& o) { 14 | increment(find_indices(o)); 15 | } 16 | 17 | size_t counting_bloom_filter::lookup(object const& o) const { 18 | auto min = cells_.max(); 19 | for (auto i : find_indices(o)) { 20 | auto cnt = cells_.count(i); 21 | if (cnt < min) 22 | min = cnt; 23 | } 24 | return min; 25 | } 26 | 27 | void counting_bloom_filter::clear() { 28 | cells_.clear(); 29 | } 30 | 31 | void counting_bloom_filter::remove(object const& o) { 32 | decrement(find_indices(o)); 33 | } 34 | 35 | std::vector counting_bloom_filter::find_indices(object const& o) const { 36 | auto digests = hasher_(o); 37 | std::vector indices(digests.size()); 38 | if (partition_) { 39 | assert(cells_.size() % digests.size() == 0); 40 | auto const parts = cells_.size() / digests.size(); 41 | for (size_t i = 0; i < indices.size(); ++i) 42 | indices[i] = (i * parts) + digests[i] % parts; 43 | } else { 44 | for (size_t i = 0; i < indices.size(); ++i) 45 | indices[i] = digests[i] % cells_.size(); 46 | } 47 | std::sort(indices.begin(), indices.end()); 48 | indices.erase(std::unique(indices.begin(), indices.end()), indices.end()); 49 | return indices; 50 | }; 51 | 52 | size_t 53 
| counting_bloom_filter::find_minimum(std::vector const& indices) const { 54 | auto min = cells_.max(); 55 | for (auto i : indices) { 56 | auto cnt = cells_.count(i); 57 | if (cnt < min) 58 | min = cnt; 59 | } 60 | return min; 61 | } 62 | 63 | std::vector 64 | counting_bloom_filter::find_minima(std::vector const& indices) const { 65 | auto min = cells_.max(); 66 | std::vector positions; 67 | for (auto i : indices) { 68 | auto cnt = cells_.count(i); 69 | if (cnt == min) { 70 | positions.push_back(i); 71 | } else if (cnt < min) { 72 | min = cnt; 73 | positions.clear(); 74 | positions.push_back(i); 75 | } 76 | } 77 | return positions; 78 | } 79 | 80 | bool counting_bloom_filter::increment(std::vector const& indices, 81 | size_t value) { 82 | auto status = true; 83 | for (auto i : indices) 84 | if (!cells_.increment(i, value)) 85 | status = false; 86 | return status; 87 | } 88 | 89 | bool counting_bloom_filter::decrement(std::vector const& indices, 90 | size_t value) { 91 | auto status = true; 92 | for (auto i : indices) 93 | if (!cells_.decrement(i, value)) 94 | status = false; 95 | return status; 96 | } 97 | 98 | size_t counting_bloom_filter::count(size_t index) const { 99 | return cells_.count(index); 100 | } 101 | 102 | spectral_mi_bloom_filter::spectral_mi_bloom_filter(hasher h, size_t cells, 103 | size_t width, bool partition) 104 | : counting_bloom_filter(std::move(h), cells, width, partition) { 105 | } 106 | 107 | void spectral_mi_bloom_filter::add(object const& o) { 108 | increment(find_minima(find_indices(o))); 109 | } 110 | 111 | spectral_rm_bloom_filter::spectral_rm_bloom_filter(hasher h1, size_t cells1, 112 | size_t width1, hasher h2, 113 | size_t cells2, size_t width2, 114 | bool partition) 115 | : first_(std::move(h1), cells1, width1, partition), 116 | second_(std::move(h2), cells2, width2, partition) { 117 | } 118 | 119 | // "When adding an item x, increase the counters of x in the primary SBF. Then 120 | // check if x has a recurring minimum. 
If so, continue normally. Otherwise (if 121 | // x has a single minimum), look for x in the secondary SBF. If found, increase 122 | // its counters, otherwise add x to the secondary SBF, with an initial value 123 | // that equals its minimal value from the primary SBF." 124 | void spectral_rm_bloom_filter::add(object const& o) { 125 | auto indices1 = first_.find_indices(o); 126 | first_.increment(indices1); 127 | auto mins1 = first_.find_minima(indices1); 128 | if (mins1.size() > 1) 129 | return; 130 | 131 | auto indices2 = second_.find_indices(o); 132 | auto min1 = first_.count(mins1[0]); 133 | auto min2 = second_.find_minimum(indices2); 134 | 135 | // Note: it's unclear to me whether "increase its counters" means increase 136 | // only the minima or all indices. I opted for the latter (same during 137 | // deletion). 138 | second_.increment(indices2, min2 > 0 ? 1 : min1); 139 | } 140 | 141 | // "When performing lookup for x, check if x has a recurring minimum in the 142 | // primary SBF. If so return the minimum. Otherwise, perform lookup for x in 143 | // secondary SBF. If [the] returned value is greater than 0, return it. 144 | // Otherwise, return minimum from primary SBF." 145 | size_t spectral_rm_bloom_filter::lookup(object const& o) const { 146 | auto mins1 = first_.find_minima(first_.find_indices(o)); 147 | auto min1 = first_.count(mins1[0]); 148 | if (mins1.size() > 1) 149 | return min1; 150 | auto min2 = second_.find_minimum(second_.find_indices(o)); 151 | return min2 > 0 ? min2 : min1; 152 | } 153 | 154 | void spectral_rm_bloom_filter::clear() { 155 | first_.clear(); 156 | second_.clear(); 157 | } 158 | 159 | // "First decrease its counters in the primary SBF, then if it has a single 160 | // minimum (or if it exists in Bf) decrease its counters in the secondary SBF, 161 | // unless at least one of them is 0." 
162 | void spectral_rm_bloom_filter::remove(object const& o) { 163 | auto indices1 = first_.find_indices(o); 164 | first_.decrement(indices1); 165 | auto mins1 = first_.find_minima(indices1); 166 | if (mins1.size() > 1) 167 | return; 168 | 169 | auto indices2 = second_.find_indices(o); 170 | if (second_.find_minimum(indices2) > 0) 171 | second_.decrement(indices2); 172 | } 173 | 174 | } // namespace bf 175 | -------------------------------------------------------------------------------- /test/bf/bf.cc: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | 8 | #include "configuration.h" 9 | 10 | #include "bf/all.hpp" 11 | 12 | using namespace util; 13 | using namespace bf; 14 | 15 | trial run(config const& cfg) { 16 | auto numeric = cfg.check("numeric"); 17 | auto k = *cfg.as("hash-functions"); 18 | auto cells = *cfg.as("cells"); 19 | auto seed = *cfg.as("seed"); 20 | auto fpr = *cfg.as("fp-rate"); 21 | auto capacity = *cfg.as("capacity"); 22 | auto width = *cfg.as("width"); 23 | auto part = cfg.check("partition"); 24 | auto double_hashing = cfg.check("double-hashing"); 25 | auto d = *cfg.as("evict"); 26 | 27 | auto k2 = *cfg.as("hash-functions-2nd"); 28 | auto cells2 = *cfg.as("cells-2nd"); 29 | auto seed2 = *cfg.as("seed-2nd"); 30 | auto width2 = *cfg.as("width-2nd"); 31 | auto double_hashing2 = cfg.check("double-hashing-2nd"); 32 | 33 | auto const& type = *cfg.as("type"); 34 | std::unique_ptr bf; 35 | 36 | if (type == "basic") { 37 | if (fpr == 0 || capacity == 0) { 38 | if (cells == 0) 39 | return error{"need non-zero cells"}; 40 | if (k == 0) 41 | return error{"need non-zero k"}; 42 | 43 | auto h = make_hasher(k, seed, double_hashing); 44 | bf.reset(new basic_bloom_filter(std::move(h), cells, part)); 45 | } else { 46 | assert(fpr != 0 && capacity != 0); 47 | bf.reset(new basic_bloom_filter(fpr, capacity, seed, part)); 48 | } 49 | } else if (type == 
"counting") { 50 | if (cells == 0) 51 | return error{"need non-zero cells"}; 52 | if (width == 0) 53 | return error{"need non-zero cell width"}; 54 | if (k == 0) 55 | return error{"need non-zero k"}; 56 | 57 | auto h = make_hasher(k, seed, double_hashing); 58 | bf.reset(new counting_bloom_filter(std::move(h), cells, width, part)); 59 | } else if (type == "spectral-mi") { 60 | if (cells == 0) 61 | return error{"need non-zero cells"}; 62 | if (width == 0) 63 | return error{"need non-zero cell width"}; 64 | if (k == 0) 65 | return error{"need non-zero k"}; 66 | 67 | auto h = make_hasher(k, seed, double_hashing); 68 | bf.reset(new spectral_mi_bloom_filter(std::move(h), cells, width, part)); 69 | } else if (type == "spectral-rm") { 70 | if (cells == 0) 71 | return error{"need non-zero cells"}; 72 | if (cells2 == 0) 73 | return error{"need non-zero cells for 2nd bloom filter"}; 74 | 75 | if (width == 0) 76 | return error{"need non-zero cell width"}; 77 | if (width2 == 0) 78 | return error{"need non-zero cell width for 2nd bloom filter"}; 79 | 80 | if (k == 0) 81 | return error{"need non-zero k"}; 82 | if (k2 == 0) 83 | return error{"need non-zero k for second bloom filter"}; 84 | 85 | auto h1 = make_hasher(k, seed, double_hashing); 86 | auto h2 = make_hasher(k2, seed2, double_hashing2); 87 | bf.reset(new spectral_rm_bloom_filter(std::move(h1), cells, width, 88 | std::move(h2), cells2, width2, part)); 89 | } else if (type == "bitwise") { 90 | if (cells == 0) 91 | return error{"need non-zero cells"}; 92 | if (k == 0) 93 | return error{"need non-zero k"}; 94 | 95 | bf.reset(new bitwise_bloom_filter(k, cells, seed)); 96 | } else if (type == "a2") { 97 | if (cells == 0) 98 | return error{"need non-zero cells"}; 99 | if (capacity == 0) 100 | return error{"need non-zero capacity"}; 101 | if (k == 0) 102 | return error{"need non-zero k"}; 103 | 104 | bf.reset(new a2_bloom_filter(k, cells, capacity, seed, seed2)); 105 | } else if (type == "stable") { 106 | if (cells == 0) 107 | 
return error{"need non-zero cells"}; 108 | if (k == 0) 109 | return error{"need non-zero k"}; 110 | 111 | auto h = make_hasher(k, seed, double_hashing); 112 | bf.reset(new stable_bloom_filter(std::move(h), cells, seed, d)); 113 | } else { 114 | return error{"invalid bloom filter type"}; 115 | } 116 | 117 | std::string line; 118 | auto input_file = *cfg.as("input"); 119 | std::ifstream in{input_file}; 120 | if (!in) 121 | return error{"cannot read " + input_file}; 122 | 123 | in >> std::noskipws; 124 | 125 | while (std::getline(in, line)) { 126 | if (line.empty()) 127 | continue; 128 | 129 | auto p = line.data(); 130 | while (*p) 131 | if (*p == ' ' || *p == '\t') 132 | return error{"whitespace in input not supported"}; 133 | else 134 | ++p; 135 | 136 | if (numeric) 137 | bf->add(std::strtod(line.c_str(), nullptr)); 138 | else 139 | bf->add(line); 140 | } 141 | 142 | size_t tn = 0, tp = 0, fp = 0, fn = 0; 143 | size_t ground_truth; 144 | std::string element; 145 | auto query_file = *cfg.as("query"); 146 | std::ifstream query{query_file}; 147 | if (!query) 148 | return error{"cannot read " + query_file}; 149 | 150 | std::cout << "TN TP FP FN G C E" << std::endl; 151 | while (query >> ground_truth >> element) // uniq -c 152 | { 153 | size_t count; 154 | if (numeric) 155 | count = bf->lookup(std::strtod(element.c_str(), nullptr)); 156 | else 157 | count = bf->lookup(element); 158 | 159 | if (!query) 160 | return error{"failed to parse element"}; 161 | 162 | if (count == 0 && ground_truth == 0) 163 | ++tn; 164 | else if (count == ground_truth) 165 | ++tp; 166 | else if (count > ground_truth) 167 | ++fp; 168 | else 169 | ++fn; 170 | 171 | std::cout << tn << ' ' << tp << ' ' << fp << ' ' << fn << ' ' 172 | << ground_truth << ' ' << count << ' '; 173 | 174 | if (numeric) 175 | std::cout << std::strtod(element.c_str(), nullptr); 176 | else 177 | std::cout << element; 178 | 179 | std::cout << std::endl; 180 | } 181 | 182 | return nil; 183 | } 184 | 185 | int main(int argc, 
char* argv[]) { 186 | auto cfg = config::parse(argc, argv); 187 | if (!cfg) { 188 | std::cerr << cfg.failure().msg() << ", try -h or --help" << std::endl; 189 | return 1; 190 | } 191 | 192 | if (argc < 2 || cfg->check("help") || cfg->check("advanced")) { 193 | cfg->usage(std::cerr, cfg->check("advanced")); 194 | return 0; 195 | } 196 | 197 | if (!cfg->check("type")) { 198 | std::cerr << "missing bloom filter type" << std::endl; 199 | return 1; 200 | } 201 | 202 | if (!cfg->check("input")) { 203 | std::cerr << "missing input file" << std::endl; 204 | return 1; 205 | } 206 | 207 | if (!cfg->check("query")) { 208 | std::cerr << "missing query file" << std::endl; 209 | return 1; 210 | } 211 | 212 | auto t = run(*cfg); 213 | if (!t) { 214 | std::cerr << t.failure().msg() << std::endl; 215 | return 1; 216 | } 217 | 218 | return 0; 219 | } 220 | -------------------------------------------------------------------------------- /bf/bitvector.hpp: -------------------------------------------------------------------------------- 1 | #ifndef BF_BITVECTOR_HPP 2 | #define BF_BITVECTOR_HPP 3 | 4 | #include 5 | #include 6 | #include 7 | #include 8 | 9 | namespace bf { 10 | 11 | /// A vector of bits. 12 | class bitvector 13 | { 14 | friend std::string to_string(bitvector const&, bool, bool, size_t); // same parameter list as the free function to_string(b, msb_to_lsb, all, cut_off) declared after the class 15 | 16 | public: 17 | typedef size_t block_type; 18 | typedef size_t size_type; 19 | static size_type constexpr npos = static_cast(-1); 20 | static block_type constexpr bits_per_block = 21 | std::numeric_limits::digits; 22 | 23 | public: 24 | /// An lvalue proxy for single bits. 25 | class reference 26 | { 27 | friend class bitvector; 28 | void operator&() = delete; 29 | 30 | /// Constructs a bit from a block. 31 | /// @param block The block to look at. 32 | /// @param i The bit position within *block*. 
33 | reference(block_type& block, block_type i); 34 | 35 | public: 36 | reference& flip(); 37 | operator bool() const; 38 | bool operator~() const; 39 | reference& operator=(bool x); 40 | reference& operator=(reference const& other); 41 | reference& operator|=(bool x); 42 | reference& operator&=(bool x); 43 | reference& operator^=(bool x); 44 | reference& operator-=(bool x); 45 | 46 | private: 47 | block_type& block_; 48 | block_type const mask_; 49 | }; 50 | 51 | /// Unlike the reference type, a const_reference does not need lvalue 52 | /// semantics and can thus represent simply a boolean (bit) value. 53 | typedef bool const_reference; 54 | 55 | /// Constructs an empty bit vector. 56 | bitvector(); 57 | 58 | /// Constructs a bit vector of a given size. 59 | /// @param size The number of bits. 60 | /// @param value The value for each bit. 61 | explicit bitvector(size_type size, bool value = false); 62 | 63 | /// Constructs a bit vector from a sequence of blocks. 64 | template 65 | bitvector(InputIterator first, InputIterator last) 66 | { 67 | bits_.insert(bits_.end(), first, last); 68 | num_bits_ = bits_.size() * bits_per_block; 69 | } 70 | 71 | /// Copy-constructs a bit vector. 72 | /// @param other The bit vector to copy. 73 | bitvector(bitvector const& other); 74 | 75 | /// Move-constructs a bit vector. 76 | /// @param other The bit vector to move. 77 | bitvector(bitvector&& other); 78 | 79 | /// Assigns another bit vector to this instance. 80 | /// @param other The RHS of the assignment. 81 | bitvector& operator=(bitvector other); 82 | 83 | /// Swaps two bit vectors. 
84 | friend void swap(bitvector& x, bitvector& y); 85 | 86 | // 87 | // Bitwise operations 88 | // 89 | bitvector operator~() const; 90 | bitvector operator<<(size_type n) const; 91 | bitvector operator>>(size_type n) const; 92 | bitvector& operator<<=(size_type n); 93 | bitvector& operator>>=(size_type n); 94 | bitvector& operator&=(bitvector const& other); 95 | bitvector& operator|=(bitvector const& other); 96 | bitvector& operator^=(bitvector const& other); 97 | bitvector& operator-=(bitvector const& other); 98 | friend bitvector operator&(bitvector const& x, bitvector const& y); 99 | friend bitvector operator|(bitvector const& x, bitvector const& y); 100 | friend bitvector operator^(bitvector const& x, bitvector const& y); 101 | friend bitvector operator-(bitvector const& x, bitvector const& y); 102 | 103 | // 104 | // Relational operators 105 | // 106 | friend bool operator==(bitvector const& x, bitvector const& y); 107 | friend bool operator!=(bitvector const& x, bitvector const& y); 108 | friend bool operator<(bitvector const& x, bitvector const& y); 109 | 110 | // 111 | // Basic operations 112 | // 113 | /// Appends the bits in a sequence of values. 114 | /// @tparam Iterator A forward iterator. 115 | /// @param first An iterator pointing to the first element of the sequence. 116 | /// @param last An iterator pointing to one past the last element of the 117 | /// sequence. 
118 | template < 119 | typename Iterator, 120 | typename std::enable_if< 121 | std::is_same< 122 | typename std::iterator_traits::iterator_category, 123 | std::forward_iterator_tag 124 | >::value, int // enable_if must yield a non-void type so that `= 0` is a valid non-type template parameter 125 | >::type = 0 126 | > 127 | void append(Iterator first, Iterator last) 128 | { 129 | if (first == last) 130 | return; 131 | 132 | auto excess = extra_bits(); 133 | auto delta = std::distance(first, last); 134 | bits_.reserve(blocks() + delta); 135 | if (excess != 0) // last block is only partially used: splice every incoming block across the block boundary 136 | { 137 | bits_.back() |= (*first << excess); 138 | do 139 | { 140 | auto b = *first++ >> (bits_per_block - excess); 141 | bits_.push_back(b | (first == last ? 0 : *first << excess)); 142 | } while (first != last); 143 | } 144 | else // block-aligned (or empty) vector: whole blocks can be appended verbatim 145 | { 146 | bits_.insert(bits_.end(), first, last); 147 | } 148 | 149 | num_bits_ += bits_per_block * delta; 150 | } 151 | 152 | /// Appends the bits in a given block. 153 | /// @param block The block containing bits to append. 154 | void append(block_type block); 155 | 156 | /// Appends a single bit to the end of the bit vector. 157 | /// @param bit The value of the bit. 158 | void push_back(bool bit); 159 | 160 | /// Clears all bits in the bitvector. 161 | void clear() noexcept; 162 | 163 | /// Resizes the bit vector to a new number of bits. 164 | /// @param n The new number of bits of the bit vector. 165 | /// @param value The bit value of new values, if the vector expands. 166 | void resize(size_type n, bool value = false); 167 | 168 | /// Sets a bit at a specific position to a given value. 169 | /// @param i The bit position. 170 | /// @param bit The value assigned to position *i*. 171 | /// @return A reference to the bit vector instance. 172 | bitvector& set(size_type i, bool bit = true); 173 | 174 | /// Sets all bits to 1. 175 | /// @return A reference to the bit vector instance. 176 | bitvector& set(); 177 | 178 | /// Resets a bit at a specific position, i.e., sets it to 0. 179 | /// @param i The bit position. 
180 | /// @return A reference to the bit vector instance. 181 | bitvector& reset(size_type i); 182 | 183 | /// Sets all bits to 0. 184 | /// @return A reference to the bit vector instance. 185 | bitvector& reset(); 186 | 187 | /// Toggles/flips a bit at a specific position. 188 | /// @param i The bit position. 189 | /// @return A reference to the bit vector instance. 190 | bitvector& flip(size_type i); 191 | 192 | /// Computes the complement. 193 | /// @return A reference to the bit vector instance. 194 | bitvector& flip(); 195 | 196 | /// Retrieves a single bit. 197 | /// @param i The bit position. 198 | /// @return A mutable reference to the bit at position *i*. 199 | reference operator[](size_type i); 200 | 201 | /// Retrieves a single bit. 202 | /// @param i The bit position. 203 | /// @return A const-reference to the bit at position *i*. 204 | const_reference operator[](size_type i) const; 205 | 206 | /// Counts the number of 1-bits in the bit vector. Also known as *population 207 | /// count* or *Hamming weight*. 208 | /// @return The number of bits set to 1. 209 | size_type count() const; 210 | 211 | /// Retrieves the number of blocks of the underlying storage. 212 | /// @return The number of blocks that represent `size()` bits. 213 | size_type blocks() const; 214 | 215 | /// Retrieves the number of bits the bitvector consists of. 216 | /// @return The length of the bit vector in bits. 217 | size_type size() const; 218 | 219 | /// Checks whether the bit vector is empty. 220 | /// @return `true` iff the bitvector has zero length. 221 | bool empty() const; 222 | 223 | /// Finds the bit position of the first 1-bit. 224 | /// 225 | /// @return The position of the first bit that equals 1 or `npos` if no 226 | /// such bit exists. 227 | size_type find_first() const; 228 | 229 | /// Finds the next 1-bit from a given starting position. 230 | /// 231 | /// @param i The index where to start looking. 
232 | /// 233 | /// @return The position of the first bit that equals 1 after position 234 | /// *i* or `npos` if no such bit exists. 235 | size_type find_next(size_type i) const; 236 | 237 | private: 238 | /// Computes the block index for a given bit position. 239 | static size_type constexpr block_index(size_type i) 240 | { 241 | return i / bits_per_block; 242 | } 243 | 244 | /// Computes the bit index within a given block for a given bit position. 245 | static block_type constexpr bit_index(size_type i) 246 | { 247 | return i % bits_per_block; 248 | } 249 | 250 | /// Computes the block bitmask to extract the bit at a given bit position. 251 | static block_type constexpr bit_mask(size_type i) 252 | { 253 | return block_type(1) << bit_index(i); 254 | } 255 | 256 | /// Computes the number of blocks needed to represent a given number of 257 | /// bits. 258 | /// @param bits The number of bits. 259 | /// @return The number of blocks to represent *bits* number of bits. 260 | static size_type constexpr bits_to_blocks(size_type bits) 261 | { 262 | return bits / bits_per_block 263 | + static_cast(bits % bits_per_block != 0); 264 | } 265 | 266 | /// Computes the bit position of the first 1-bit in a given block. 267 | /// @param block The block to inspect. 268 | /// @return The bit position where *block* has its first bit set to 1. 269 | static size_type lowest_bit(block_type block); 270 | 271 | /// Computes the number of excess/unused bits in the bit vector. 272 | block_type extra_bits() const; 273 | 274 | // If the number of bits in the vector is not a multiple of 275 | // bitvector::bits_per_block, then the last block exhibits unused bits which 276 | // this function resets. 277 | void zero_unused_bits(); 278 | 279 | /// Looks for the first 1-bit starting at a given position. 280 | /// 281 | /// @param i The block index to start looking. 282 | /// 283 | /// @return The block index of the first 1-bit starting from *i* or 284 | /// `bitvector::npos` if no 1-bit exists. 
285 | size_type find_from(size_type i) const; 286 | 287 | std::vector bits_; 288 | size_type num_bits_; 289 | }; 290 | 291 | /// Converts a bitvector to a `std::string`. 292 | /// 293 | /// @param b The bitvector to convert. 294 | /// 295 | /// @param msb_to_lsb The order of display. If `true`, display bits from MSB to 296 | /// LSB and in the reverse order otherwise. 297 | /// 298 | /// @param all Indicates whether to include also the unused bits of the last 299 | /// block if the number of `b.size()` is not a multiple of 300 | /// `bitvector::bits_per_block`. 301 | /// 302 | /// @param cut_off Specifies a maximum size on the output. If 0, no cutting 303 | /// occurs. 304 | /// 305 | /// @return A `std::string` representation of *b*. 306 | std::string to_string(bitvector const& b, 307 | bool msb_to_lsb = true, 308 | bool all = false, 309 | size_t cut_off = 0); 310 | 311 | } // namespace bf 312 | 313 | #endif 314 | -------------------------------------------------------------------------------- /test/tests.cpp: -------------------------------------------------------------------------------- 1 | #include "test.hpp" 2 | 3 | #include "bf/all.hpp" 4 | 5 | using namespace bf; 6 | 7 | TEST(counter_vector_incrementing_width2) { 8 | counter_vector v(3, 2); 9 | // Increment 1/3 10 | CHECK(v.increment(0)); 11 | CHECK_EQUAL(to_string(v), "100000"); 12 | CHECK_EQUAL(v.count(0), 1u); 13 | // Increment 2/3 14 | CHECK(v.increment(0)); 15 | CHECK_EQUAL(to_string(v), "010000"); 16 | CHECK_EQUAL(v.count(0), 2u); 17 | // Increment 3/3 18 | CHECK(v.increment(0)); 19 | CHECK_EQUAL(to_string(v), "110000"); 20 | CHECK_EQUAL(v.count(0), 3u); 21 | // Already reached maximum counter value 3 with 2 bits. 22 | CHECK(!v.increment(0)); 23 | CHECK_EQUAL(to_string(v), "110000"); 24 | CHECK_EQUAL(v.count(0), 3u); 25 | // Increment adjacent value. 26 | CHECK(v.increment(1)); 27 | CHECK_EQUAL(to_string(v), "111000"); 28 | CHECK_EQUAL(v.count(1), 1u); 29 | // And another random one. 
30 | CHECK(v.increment(2)); 31 | CHECK_EQUAL(to_string(v), "111010"); 32 | CHECK_EQUAL(v.count(2), 1u); 33 | } 34 | 35 | TEST(counter_vector_incrementing_width3) { 36 | counter_vector v(3, 3); 37 | // Increment 1/7 38 | CHECK(v.increment(1)); 39 | CHECK_EQUAL(to_string(v), "000100000"); 40 | CHECK_EQUAL(v.count(1), 1u); 41 | // Increment 2/7 42 | CHECK(v.increment(1)); 43 | CHECK_EQUAL(to_string(v), "000010000"); 44 | CHECK_EQUAL(v.count(1), 2u); 45 | // Increment 3/7 46 | CHECK(v.increment(1)); 47 | CHECK_EQUAL(to_string(v), "000110000"); 48 | CHECK_EQUAL(v.count(1), 3u); 49 | // Increment 4/7 50 | CHECK(v.increment(1)); 51 | CHECK_EQUAL(to_string(v), "000001000"); 52 | CHECK_EQUAL(v.count(1), 4u); 53 | // Increment += 3 to 7/7 54 | CHECK(v.increment(1, 3)); 55 | CHECK_EQUAL(to_string(v), "000111000"); 56 | CHECK_EQUAL(v.count(1), 7u); 57 | // Reset 58 | v.clear(); 59 | CHECK_EQUAL(to_string(v), "000000000"); 60 | CHECK(v.increment(1, 6)); 61 | CHECK_EQUAL(to_string(v), "000011000"); 62 | CHECK_EQUAL(v.count(1), 6u); 63 | } 64 | 65 | TEST(counter_vector_decrementing_width3) { 66 | counter_vector v(3, 3); 67 | // Increment to max value of 7. 68 | CHECK(v.increment(1, 7)); 69 | CHECK_EQUAL(v.count(1), 7u); 70 | CHECK_EQUAL(to_string(v), "000111000"); 71 | // Decrement by 1. 72 | CHECK(v.decrement(1)); 73 | CHECK_EQUAL(to_string(v), "000011000"); 74 | CHECK_EQUAL(v.count(1), 6u); 75 | // Decrement by 1, again. 76 | CHECK(v.decrement(1)); 77 | CHECK_EQUAL(to_string(v), "000101000"); 78 | CHECK_EQUAL(v.count(1), 5u); 79 | // Decrement by 1, again. 80 | CHECK(v.decrement(1)); 81 | CHECK_EQUAL(to_string(v), "000001000"); 82 | CHECK_EQUAL(v.count(1), 4u); 83 | // Increment by 1, then decrement by 3. 84 | CHECK(v.increment(1)); 85 | CHECK(v.decrement(1, 3)); 86 | CHECK_EQUAL(to_string(v), "000010000"); 87 | CHECK_EQUAL(v.count(1), 2u); 88 | } 89 | 90 | TEST(counter_vector_adding) { 91 | counter_vector v(2, 3); 92 | // Increment to 3. 
93 | CHECK(v.increment(0, 3)); 94 | CHECK_EQUAL(to_string(v), "110000"); 95 | CHECK_EQUAL(v.count(0), 3u); 96 | // Increment to 4. 97 | CHECK(v.increment(0, 1) == 1); 98 | CHECK_EQUAL(to_string(v), "001000"); 99 | CHECK_EQUAL(v.count(0), 4u); 100 | // Increment to 5. 101 | CHECK(v.increment(0, 1)); 102 | CHECK_EQUAL(v.count(0), 5u); 103 | CHECK_EQUAL(to_string(v), "101000"); 104 | // Increment to 7. 105 | CHECK(!v.increment(0, 3)); 106 | CHECK_EQUAL(to_string(v), "111000"); 107 | CHECK_EQUAL(v.count(0), 7u); 108 | // Go to 2nd cell, repeat. 109 | CHECK(v.increment(1, 4)); 110 | CHECK_EQUAL(to_string(v), "111001"); 111 | CHECK_EQUAL(v.count(1), 4u); 112 | // Fill it up. 113 | CHECK(v.increment(1, 3)); 114 | CHECK_EQUAL(to_string(v), "111111"); 115 | // Cannot increment at max cell value. 116 | CHECK(!v.increment(1)); 117 | CHECK(!v.increment(1, 42)); 118 | } 119 | 120 | TEST(counter_vector_adding_big) { 121 | counter_vector v(3, 32); 122 | auto max = std::numeric_limits::max(); 123 | REQUIRE_EQUAL(v.max(), max); 124 | std::string value; 125 | size_t step = 1 << 15; 126 | size_t last = 0; 127 | for (size_t i = 0; i < max && last <= i; i += step) { 128 | last = i; 129 | value = std::to_string(i); 130 | REQUIRE(v.count(0) == i); 131 | v.increment(0, step); 132 | } 133 | } 134 | 135 | TEST(counter_vector_mergin) { 136 | counter_vector a(5, 2); 137 | counter_vector b(5, 2); 138 | a.increment(0, 1); 139 | a.increment(1, 1); 140 | a.increment(2, 2); 141 | b.increment(1, 1); 142 | b.increment(2, 1); 143 | b.increment(3, 3); 144 | CHECK_EQUAL(to_string(a | b), "1001111100"); 145 | } 146 | 147 | TEST(bloom_filter_basic) { 148 | basic_bloom_filter bf(0.8, 10); 149 | bf.add("foo"); 150 | bf.add("bar"); 151 | bf.add("baz"); 152 | bf.add('c'); 153 | bf.add(4.2); 154 | bf.add(4711ULL); 155 | // True-positives 156 | CHECK_EQUAL(bf.lookup("foo"), 1u); 157 | CHECK_EQUAL(bf.lookup("bar"), 1u); 158 | CHECK_EQUAL(bf.lookup("baz"), 1u); 159 | CHECK_EQUAL(bf.lookup(4.2), 1u); 160 | 
CHECK_EQUAL(bf.lookup('c'), 1u); 161 | CHECK_EQUAL(bf.lookup(4711ULL), 1u); 162 | // True-negatives 163 | CHECK_EQUAL(bf.lookup("qux"), 0u); 164 | CHECK_EQUAL(bf.lookup("graunt"), 0u); 165 | CHECK_EQUAL(bf.lookup(3.1415), 0u); 166 | // False-positives 167 | CHECK_EQUAL(bf.lookup("corge"), 1u); 168 | CHECK_EQUAL(bf.lookup('a'), 1u); 169 | 170 | // another filter 171 | basic_bloom_filter obf(0.8, 10); 172 | obf.swap(bf); 173 | 174 | CHECK_EQUAL(obf.lookup("foo"), 1u); 175 | 176 | // Make bf using another filter's storage 177 | hasher h = obf.hasher_function(); 178 | bitvector b = obf.storage(); 179 | basic_bloom_filter obfc(h, b); 180 | CHECK_EQUAL(obfc.storage(), b); 181 | CHECK_EQUAL(obfc.lookup("foo"), 1u); 182 | } 183 | 184 | TEST(bloom_filter_counting) { 185 | counting_bloom_filter bf(make_hasher(3), 10, 2); 186 | for (size_t i = 0; i < 3; ++i) { 187 | bf.add("qux"); 188 | bf.add("corge"); 189 | bf.add("grault"); 190 | bf.add(3.14159265); 191 | } 192 | CHECK_EQUAL(bf.lookup("foo"), 0u); 193 | CHECK_EQUAL(bf.lookup("qux"), 3u); 194 | CHECK_EQUAL(bf.lookup("corge"), 3u); 195 | CHECK_EQUAL(bf.lookup("grault"), 3u); 196 | CHECK_EQUAL(bf.lookup(3.14159265), 3u); 197 | for (size_t i = 0; i < 3; ++i) 198 | bf.remove("grault"); 199 | CHECK_EQUAL(bf.lookup("corge"), 0u); 200 | } 201 | 202 | TEST(bloom_filter_spectral_mi) { 203 | spectral_mi_bloom_filter bf(make_hasher(3), 8, 2); 204 | bf.add("oh"); 205 | bf.add("oh"); 206 | bf.add("my"); 207 | bf.add("god"); 208 | bf.add("becky"); 209 | bf.add("look"); 210 | CHECK_EQUAL(bf.lookup("oh"), 2u); 211 | CHECK_EQUAL(bf.lookup("my"), 1u); 212 | CHECK_EQUAL(bf.lookup("god"), 1u); 213 | CHECK_EQUAL(bf.lookup("becky"), 1u); 214 | CHECK_EQUAL(bf.lookup("look"), 2u); // FP, same cells as "god". 
215 | } 216 | 217 | TEST(bloom_filter_spectral_rm) { 218 | auto h1 = make_hasher(3, 0); 219 | auto h2 = make_hasher(3, 1); 220 | spectral_rm_bloom_filter bf(std::move(h1), 5, 2, std::move(h2), 4, 2); 221 | bf.add("foo"); 222 | CHECK_EQUAL(bf.lookup("foo"), 1u); 223 | // TODO: port old unit tests and double-check the implementation. 224 | 225 | //// For "bar", all hash functions return the same position, so we have 226 | //// necessarily a recurring minimum (RM). Thus we do not look in the second 227 | //// core and return 2, although the correct count would be 1. 228 | // b.add("bar"); // 2 0 1 0 0 and 0 0 229 | // CHECK(b.count("bar") == 2); 230 | // CHECK(to_string(b) == "010000100000000\n0000"); 231 | //// For "foo", we encounter a unique minimum in the first core, but since 232 | //// all positions for "foo" are zero in the second core, we return the 233 | //// minimum of the first, which is 1. 234 | // CHECK(b.count("foo") == 1); 235 | //// After increasing the counters for "foo", we find that it (still) has a 236 | //// unique minimum in the first core. Hence we add its minimum to the 237 | //// second core. 238 | // b.add("foo"); // 3 0 2 0 0 and 2 2 239 | // CHECK(b.count("foo") == 2); 240 | // CHECK(to_string(b) == "110000010000000\n0101"); 241 | //// The "blue fish" causes some trouble: because its insertion yields a 242 | //// unique minimum, we go into the second bitvector. There, we find that it 243 | //// hashes to the same positions as foo, which has a counter of 2. Because it 244 | //// appears to exist there, we have to increment its counters. This falsely 245 | //// bumps up the counter of "blue fish" to 3. 246 | // b.add("blue fish"); // 3 0 3 0 1 and 3 3 247 | // CHECK(b.count("blue fish") == 3); 248 | // CHECK(to_string(b) == "110000110000100\n1111"); 249 | //// Since the "blue fish" has (still) a unique minimum after removing it one 250 | //// time, we look in the second core and find it to be present there.
251 | //// Hence we decrement the counters in the second core. 252 | // b.remove("blue fish"); // 3 0 2 0 0 and 2 2 253 | // CHECK(b.count("blue fish") == 2); 254 | // CHECK(to_string(b) == "110000010000000\n0101"); 255 | // b.remove("blue fish"); 256 | // CHECK(b.count("blue fish") == 1); // 3 0 1 0 0 and 1 1 257 | //// Let's look at "foo". This fellow has now a unique minimum. Since it has 258 | //// a unique minimum after the removal, we also decrement the counter in the 259 | //// second core. 260 | // b.remove("foo"); // 2 0 0 0 0 and 0 0 261 | // CHECK(b.count("foo") == 0); 262 | // CHECK(to_string(b) == "010000000000000\n0000"); 263 | //// Alas, we violated Claim 1 in Section 2.2 in the paper! The spectral 264 | //// Bloom filter returns a count of 0 for "foo", although it should be 1. 265 | //// Thus, the frequency estimate is no longer a lower bound. This occurs 266 | //// presumably due to the fact that we remove "blue fish" twice although we 267 | //// added it only once. 268 | } 269 | 270 | TEST(bloom_filter_bitwise) { 271 | bitwise_bloom_filter bf(3, 8); 272 | CHECK_EQUAL(bf.lookup("foo"), 0u); 273 | bf.add("foo"); 274 | CHECK_EQUAL(bf.lookup("foo"), 1u); 275 | bf.add("foo"); 276 | CHECK_EQUAL(bf.lookup("foo"), 2u); 277 | bf.add("foo"); 278 | CHECK_EQUAL(bf.lookup("foo"), 3u); 279 | // Other elements. 
280 | CHECK_EQUAL(bf.lookup("baz"), 0u); 281 | bf.add("baz"); 282 | CHECK_EQUAL(bf.lookup("baz"), 1u); 283 | CHECK_EQUAL(bf.lookup("foo"), 3u); 284 | bf.add("baz"); 285 | CHECK_EQUAL(bf.lookup("baz"), 2u); 286 | CHECK_EQUAL(bf.lookup("foo"), 3u); 287 | } 288 | 289 | TEST(bloom_filter_stable) { 290 | stable_bloom_filter bf(make_hasher(3), 11, 2, 2); 291 | bf.add("one fish"); 292 | bf.add("two fish"); 293 | bf.add("red fish"); 294 | bf.add("blue fish"); 295 | bf.add("green fish"); 296 | bf.add("cyan fish"); 297 | bf.add("yellow fish"); 298 | bf.add("orange fish"); 299 | bf.add("purple fish"); 300 | bf.add("pink fish"); 301 | bf.add("brown fish"); 302 | bf.add("white fish"); 303 | bf.add("black fish"); 304 | bf.add("grey fish"); 305 | bf.add("jelly fish"); 306 | CHECK_EQUAL(bf.lookup("one fish"), 0u); 307 | CHECK_EQUAL(bf.lookup("two fish"), 2u); 308 | CHECK_EQUAL(bf.lookup("red fish"), 3u); 309 | CHECK_EQUAL(bf.lookup("blue fish"), 3u); 310 | } 311 | 312 | TEST(bloom_filter_a2) { 313 | a2_bloom_filter bf(3, 32, 3); 314 | bf.add("foo"); 315 | bf.add("foo"); // Duplicate inserts have no effect. 316 | bf.add("bar"); 317 | bf.add("baz"); 318 | // Reaches capacity and causes swapping. 
319 | bf.add("qux"); 320 | CHECK_EQUAL(bf.lookup("foo"), 1u); 321 | CHECK_EQUAL(bf.lookup("bar"), 1u); 322 | CHECK_EQUAL(bf.lookup("baz"), 1u); 323 | CHECK_EQUAL(bf.lookup("qux"), 1u); 324 | } 325 | -------------------------------------------------------------------------------- /src/bitvector.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | 3 | #include 4 | #include 5 | 6 | namespace bf { 7 | 8 | typedef bitvector::size_type size_type; 9 | typedef bitvector::block_type block_type; 10 | 11 | namespace { 12 | 13 | uint8_t count_table[] = { 14 | 0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4, 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 15 | 3, 4, 3, 4, 4, 5, 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5, 2, 3, 3, 4, 16 | 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 17 | 4, 5, 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, 2, 3, 3, 4, 3, 4, 4, 5, 18 | 3, 4, 4, 5, 4, 5, 5, 6, 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7, 1, 2, 19 | 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5, 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 20 | 4, 5, 5, 6, 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, 3, 4, 4, 5, 4, 5, 21 | 5, 6, 4, 5, 5, 6, 5, 6, 6, 7, 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, 22 | 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7, 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 23 | 5, 6, 5, 6, 6, 7, 4, 5, 5, 6, 5, 6, 6, 7, 5, 6, 6, 7, 6, 7, 7, 8}; 24 | 25 | } // namespace 26 | 27 | bitvector::reference::reference(block_type& block, block_type i) 28 | : block_(block), mask_(block_type(1) << i) { 29 | assert(i < bits_per_block); 30 | } 31 | 32 | bitvector::reference& bitvector::reference::flip() { 33 | block_ ^= mask_; 34 | return *this; 35 | } 36 | 37 | bitvector::reference::operator bool() const { 38 | return (block_ & mask_) != 0; 39 | } 40 | 41 | bool bitvector::reference::operator~() const { 42 | return (block_ & mask_) == 0; 43 | } 44 | 45 | bitvector::reference& bitvector::reference::operator=(bool x) { 46 | x ? 
block_ |= mask_ : block_ &= ~mask_; 47 | return *this; 48 | } 49 | 50 | bitvector::reference& bitvector::reference::operator=(reference const& other) { 51 | other ? block_ |= mask_ : block_ &= ~mask_; 52 | return *this; 53 | } 54 | 55 | bitvector::reference& bitvector::reference::operator|=(bool x) { 56 | if (x) 57 | block_ |= mask_; 58 | return *this; 59 | } 60 | 61 | bitvector::reference& bitvector::reference::operator&=(bool x) { 62 | if (!x) 63 | block_ &= ~mask_; 64 | return *this; 65 | } 66 | 67 | bitvector::reference& bitvector::reference::operator^=(bool x) { 68 | if (x) 69 | block_ ^= mask_; 70 | return *this; 71 | } 72 | 73 | bitvector::reference& bitvector::reference::operator-=(bool x) { 74 | if (x) 75 | block_ &= ~mask_; 76 | return *this; 77 | } 78 | 79 | bitvector::bitvector() : num_bits_(0) { 80 | } 81 | 82 | bitvector::bitvector(size_type size, bool value) 83 | : bits_(bits_to_blocks(size), value ? ~block_type(0) : 0), num_bits_(size) { 84 | } 85 | 86 | bitvector::bitvector(bitvector const& other) 87 | : bits_(other.bits_), num_bits_(other.num_bits_) { 88 | } 89 | 90 | bitvector::bitvector(bitvector&& other) 91 | : bits_(std::move(other.bits_)), num_bits_(other.num_bits_) { 92 | other.num_bits_ = 0; 93 | } 94 | 95 | bitvector bitvector::operator~() const { 96 | bitvector b(*this); 97 | b.flip(); 98 | return b; 99 | } 100 | 101 | bitvector& bitvector::operator=(bitvector other) { 102 | swap(*this, other); 103 | return *this; 104 | } 105 | 106 | void swap(bitvector& x, bitvector& y) { 107 | using std::swap; 108 | swap(x.bits_, y.bits_); 109 | swap(x.num_bits_, y.num_bits_); 110 | } 111 | 112 | bitvector bitvector::operator<<(size_type n) const { 113 | bitvector b(*this); 114 | return b <<= n; 115 | } 116 | 117 | bitvector bitvector::operator>>(size_type n) const { 118 | bitvector b(*this); 119 | return b >>= n; 120 | } 121 | 122 | bitvector& bitvector::operator<<=(size_type n) { 123 | if (n >= num_bits_) 124 | return reset(); 125 | 126 | if (n > 0) { 
127 | auto last = blocks() - 1; 128 | auto div = n / bits_per_block; 129 | auto r = bit_index(n); 130 | auto b = &bits_[0]; 131 | assert(blocks() >= 1); 132 | assert(div <= last); 133 | 134 | if (r != 0) { 135 | for (size_type i = last - div; i > 0; --i) 136 | b[i + div] = (b[i] << r) | (b[i - 1] >> (bits_per_block - r)); 137 | b[div] = b[0] << r; 138 | } else { 139 | for (size_type i = last - div; i > 0; --i) 140 | b[i + div] = b[i]; 141 | b[div] = b[0]; 142 | } 143 | 144 | std::fill_n(b, div, block_type(0)); 145 | zero_unused_bits(); 146 | } 147 | 148 | return *this; 149 | } 150 | 151 | bitvector& bitvector::operator>>=(size_type n) { 152 | if (n >= num_bits_) 153 | return reset(); 154 | 155 | if (n > 0) { 156 | auto last = blocks() - 1; 157 | auto div = n / bits_per_block; 158 | auto r = bit_index(n); 159 | auto b = &bits_[0]; 160 | assert(blocks() >= 1); 161 | assert(div <= last); 162 | 163 | if (r != 0) { 164 | for (size_type i = div; i < last; ++i) /* ascending from div: every write lands at index i - div <= i, so b[i] and b[i + 1] are still unmodified when read, and i + 1 never exceeds last */ 165 | b[i - div] = (b[i] >> r) | (b[i + 1] << (bits_per_block - r)); 166 | b[last - div] = b[last] >> r; 167 | } else { 168 | for (size_type i = div; i <= last; ++i) 169 | b[i - div] = b[i]; 170 | } 171 | 172 | std::fill_n(b + (blocks() - div), div, block_type(0)); 173 | } 174 | 175 | return *this; 176 | } 177 | 178 | bitvector& bitvector::operator&=(bitvector const& other) { 179 | assert(size() >= other.size()); 180 | for (size_type i = 0; i < blocks(); ++i) 181 | bits_[i] &= other.bits_[i]; 182 | return *this; 183 | } 184 | 185 | bitvector& bitvector::operator|=(bitvector const& other) { 186 | assert(size() >= other.size()); 187 | for (size_type i = 0; i < blocks(); ++i) 188 | bits_[i] |= other.bits_[i]; 189 | return *this; 190 | } 191 | 192 | bitvector& bitvector::operator^=(bitvector const& other) { 193 | assert(size() >= other.size()); 194 | for (size_type i = 0; i < blocks(); ++i) 195 | bits_[i] ^= other.bits_[i]; 196 | return *this; 197 | } 198 | 199 | bitvector& bitvector::operator-=(bitvector const&
other) { 200 | assert(size() >= other.size()); 201 | for (size_type i = 0; i < blocks(); ++i) 202 | bits_[i] &= ~other.bits_[i]; 203 | return *this; 204 | } 205 | 206 | bitvector operator&(bitvector const& x, bitvector const& y) { 207 | bitvector b(x); 208 | return b &= y; 209 | } 210 | 211 | bitvector operator|(bitvector const& x, bitvector const& y) { 212 | bitvector b(x); 213 | return b |= y; 214 | } 215 | 216 | bitvector operator^(bitvector const& x, bitvector const& y) { 217 | bitvector b(x); 218 | return b ^= y; 219 | } 220 | 221 | bitvector operator-(bitvector const& x, bitvector const& y) { 222 | bitvector b(x); 223 | return b -= y; 224 | } 225 | 226 | bool operator==(bitvector const& x, bitvector const& y) { 227 | return x.num_bits_ == y.num_bits_ && x.bits_ == y.bits_; 228 | } 229 | 230 | bool operator!=(bitvector const& x, bitvector const& y) { 231 | return !(x == y); 232 | } 233 | 234 | bool operator<(bitvector const& x, bitvector const& y) { 235 | assert(x.size() == y.size()); 236 | for (size_type r = x.blocks(); r > 0; --r) { 237 | auto i = r - 1; 238 | if (x.bits_[i] < y.bits_[i]) 239 | return true; 240 | else if (x.bits_[i] > y.bits_[i]) 241 | return false; 242 | } 243 | return false; 244 | } 245 | 246 | void bitvector::resize(size_type n, bool value) { 247 | auto old = blocks(); 248 | auto required = bits_to_blocks(n); 249 | auto block_value = value ? 
~block_type(0) : block_type(0); 250 | 251 | if (required != old) 252 | bits_.resize(required, block_value); 253 | 254 | if (value && (n > num_bits_) && extra_bits()) 255 | bits_[old - 1] |= (block_value << extra_bits()); 256 | 257 | num_bits_ = n; 258 | zero_unused_bits(); 259 | } 260 | 261 | void bitvector::clear() noexcept { 262 | bits_.clear(); 263 | num_bits_ = 0; 264 | } 265 | 266 | void bitvector::push_back(bool bit) { 267 | auto s = size(); 268 | resize(s + 1); 269 | set(s, bit); 270 | } 271 | 272 | void bitvector::append(block_type block) { 273 | auto excess = extra_bits(); 274 | if (excess) { 275 | assert(!bits_.empty()); 276 | bits_.push_back(block >> (bits_per_block - excess)); 277 | bits_[bits_.size() - 2] |= (block << excess); 278 | } else { 279 | bits_.push_back(block); 280 | } 281 | num_bits_ += bits_per_block; 282 | } 283 | 284 | bitvector& bitvector::set(size_type i, bool bit) { 285 | assert(i < num_bits_); 286 | 287 | if (bit) 288 | bits_[block_index(i)] |= bit_mask(i); 289 | else 290 | reset(i); 291 | 292 | return *this; 293 | } 294 | 295 | bitvector& bitvector::set() { 296 | std::fill(bits_.begin(), bits_.end(), ~block_type(0)); 297 | zero_unused_bits(); 298 | return *this; 299 | } 300 | 301 | bitvector& bitvector::reset(size_type i) { 302 | assert(i < num_bits_); 303 | bits_[block_index(i)] &= ~bit_mask(i); 304 | return *this; 305 | } 306 | 307 | bitvector& bitvector::reset() { 308 | std::fill(bits_.begin(), bits_.end(), block_type(0)); 309 | return *this; 310 | } 311 | 312 | bitvector& bitvector::flip(size_type i) { 313 | assert(i < num_bits_); 314 | bits_[block_index(i)] ^= bit_mask(i); 315 | return *this; 316 | } 317 | 318 | bitvector& bitvector::flip() { 319 | for (size_type i = 0; i < blocks(); ++i) 320 | bits_[i] = ~bits_[i]; 321 | zero_unused_bits(); 322 | return *this; 323 | } 324 | 325 | bool bitvector::operator[](size_type i) const { 326 | assert(i < num_bits_); 327 | return (bits_[block_index(i)] & bit_mask(i)) != 0; 328 | } 329 | 
330 | bitvector::reference bitvector::operator[](size_type i) { 331 | assert(i < num_bits_); 332 | return {bits_[block_index(i)], bit_index(i)}; 333 | } 334 | 335 | size_type bitvector::count() const { 336 | auto first = bits_.begin(); 337 | size_t n = 0; 338 | auto length = blocks(); 339 | while (length) { 340 | auto block = *first; 341 | while (block) { 342 | // TODO: use __popcnt if available. 343 | n += count_table[block & ((1u << 8) - 1)]; 344 | block >>= 8; 345 | } 346 | ++first; 347 | --length; 348 | } 349 | return n; 350 | } 351 | 352 | size_type bitvector::blocks() const { 353 | return bits_.size(); 354 | } 355 | 356 | size_type bitvector::size() const { 357 | return num_bits_; 358 | } 359 | 360 | bool bitvector::empty() const { 361 | return bits_.empty(); 362 | } 363 | 364 | size_type bitvector::find_first() const { 365 | return find_from(0); 366 | } 367 | 368 | size_type bitvector::find_next(size_type i) const { 369 | if (i >= (size() - 1) || size() == 0) 370 | return npos; 371 | ++i; 372 | auto bi = block_index(i); 373 | auto block = bits_[bi] & (~block_type(0) << bit_index(i)); 374 | return block ? bi * bits_per_block + lowest_bit(block) : find_from(bi + 1); 375 | } 376 | 377 | size_type bitvector::lowest_bit(block_type block) { 378 | auto x = block - (block & (block - 1)); // Extract right-most 1-bit. 
379 | size_type log = 0; 380 | while (x >>= 1) 381 | ++log; 382 | return log; 383 | } 384 | 385 | block_type bitvector::extra_bits() const { 386 | return bit_index(size()); 387 | } 388 | 389 | void bitvector::zero_unused_bits() { 390 | if (extra_bits()) 391 | bits_.back() &= ~(~block_type(0) << extra_bits()); 392 | } 393 | 394 | size_type bitvector::find_from(size_type i) const { 395 | while (i < blocks() && bits_[i] == 0) 396 | ++i; 397 | if (i >= blocks()) 398 | return npos; 399 | return i * bits_per_block + lowest_bit(bits_[i]); 400 | } 401 | 402 | std::string to_string(bitvector const& b, bool msb_to_lsb, bool all, 403 | size_t cut_off) { 404 | std::string str; 405 | auto str_size = all ? bitvector::bits_per_block * b.blocks() : b.size(); 406 | if (cut_off == 0 || str_size <= cut_off) { 407 | str.assign(str_size, '0'); 408 | } else { 409 | str.assign(cut_off + 2, '0'); 410 | str[cut_off + 0] = '.'; 411 | str[cut_off + 1] = '.'; 412 | str_size = cut_off; 413 | } 414 | 415 | for (bitvector::size_type i = 0; i < std::min(str_size, b.size()); ++i) 416 | if (b[i]) 417 | str[msb_to_lsb ? str_size - i - 1 : i] = '1'; 418 | 419 | return str; 420 | } 421 | 422 | } // namespace bf 423 | -------------------------------------------------------------------------------- /test/bf/util/configuration.h: -------------------------------------------------------------------------------- 1 | #ifndef UTIL_CONFIGURATION_H 2 | #define UTIL_CONFIGURATION_H 3 | 4 | #include "util/trial.h" 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include 11 | #include 12 | #include 13 | #include 14 | 15 | namespace util { 16 | 17 | /// A command line parser and program option utility. 
18 | template 19 | class configuration { 20 | public: 21 | struct error : util::error { 22 | using util::error::error; 23 | 24 | error(std::string msg, char c) : util::error{msg + " (-" + c + ')'} { 25 | } 26 | error(std::string msg, std::string opt) 27 | : util::error{msg + " (--" + opt + ')'} { 28 | } 29 | }; 30 | 31 | /// Initializes the configuration from a configuration file. 32 | /// @param filename The name of the configuration file. 33 | /// @returns An engaged trial on success. Currently always an error, because the function is not yet implemented. 34 | static trial parse(std::string const& /* filename */) { 35 | return error{"function not yet implemented"}; 36 | } 37 | 38 | /// Initializes the configuration from command line parameters. 39 | /// @param argc The argc parameter from main. 40 | /// @param argv The argv parameter from main. 41 | static trial parse(int argc, char* argv[]) { 42 | Derived cfg; 43 | 44 | // Although we don't like to use exceptions, for the "configuration DSL" we 45 | // prefer a monadic style to declare our program and hence have to fall 46 | // back to exceptions. 47 | try { 48 | cfg.initialize(); 49 | } catch (std::logic_error const& e) { 50 | return error{e.what()}; 51 | } 52 | 53 | for (int i = 1; i < argc; ++i) { 54 | std::vector values; 55 | 56 | std::string arg{argv[i]}; 57 | auto val = cfg.optionize(arg); 58 | if (!val) 59 | return val.failure(); 60 | else if (!val->empty()) 61 | values.emplace_back(*val); 62 | 63 | auto o = cfg.find_option(arg); 64 | if (o) 65 | o->defaulted_ = false; 66 | else 67 | return error{"unknown option", arg}; 68 | 69 | // Consume everything until the next option.
70 | while (i + 1 < argc) { 71 | std::string next{argv[i + 1]}; 72 | if (cfg.optionize(next)) 73 | break; 74 | 75 | values.emplace_back(std::move(next)); 76 | ++i; 77 | } 78 | 79 | if (values.size() > o->max_vals_) 80 | return error{"too many values", arg}; 81 | 82 | if (o->max_vals_ == 1 && values.size() != 1) 83 | return error{"option value required", arg}; 84 | 85 | if (!values.empty()) 86 | o->values_ = std::move(values); 87 | } 88 | 89 | if (!cfg.verify()) 90 | return error{"configuration verification failed"}; 91 | 92 | return {std::move(cfg)}; 93 | } 94 | 95 | /// Checks whether the given option is set. 96 | /// @param opt Name of the option to check. 97 | /// @returns `true` if *opt* is set. 98 | bool check(std::string const& opt) const { 99 | auto o = find_option(opt); 100 | return o && !o->defaulted_; 101 | } 102 | 103 | /// Returns the value of the given option. 104 | /// @param opt The name of the option. 105 | /// @returns The option value. 106 | trial get(std::string const& opt) const { 107 | auto o = find_option(opt); 108 | if (!o) 109 | return error{"option does not exist"}; 110 | if (o->values_.empty()) 111 | return error{"option has no value"}; 112 | if (o->max_vals_ > 1) 113 | return error{"cannot get multi-value option"}; 114 | 115 | assert(o->values_.size() == 1); 116 | return o->values_.front(); 117 | } 118 | 119 | /// Retrieves an option as a specific type. 120 | /// @tparam T The type to convert the option to. 121 | /// @param opt The name of the option. 122 | /// @returns The converted option value. 123 | template 124 | trial as(std::string const& opt) const { 125 | auto o = find_option(opt); 126 | if (!o) 127 | return error{"unknown option", opt}; 128 | 129 | return dispatch(*o, std::is_same>()); 130 | } 131 | 132 | /// Prints the usage into a given output stream. 133 | /// @param sink The output stream to receive the configuration. 134 | /// @param show_all Whether to also print invisible options. 
135 | void usage(std::ostream& sink, bool show_all = false) { 136 | sink << derived()->banner() << "\n"; 137 | 138 | for (auto& b : blocks_) { 139 | if (!show_all && !b.visible_) 140 | continue; 141 | 142 | sink << "\n " << b.name_ << ":\n"; 143 | 144 | auto has_shortcut = 145 | std::any_of(b.options_.begin(), b.options_.end(), 146 | [](option const& o) { return o.shortcut_ != '\0'; }); 147 | 148 | auto max = std::max_element(b.options_.begin(), b.options_.end(), 149 | [](option const& o1, option const& o2) { 150 | return o1.name_.size() < o2.name_.size(); 151 | }); 152 | /* max_element returns end() for a block without options; do not dereference it in that case */ 153 | auto max_len = max == b.options_.end() ? 0 : max->name_.size(); 154 | for (auto& opt : b.options_) { 155 | sink << " --" << opt.name_; 156 | sink << std::string(max_len - opt.name_.size(), ' '); 157 | if (has_shortcut) 158 | sink << (opt.shortcut_ ? std::string(" | -") + opt.shortcut_ : 159 | " "); 160 | 161 | sink << " " << opt.description_ << "\n"; 162 | } 163 | } 164 | 165 | sink << std::endl; 166 | } 167 | 168 | protected: 169 | class option { 170 | friend configuration; 171 | 172 | public: 173 | option(std::string name, std::string desc, char shortcut = '\0') 174 | : name_{std::move(name)}, 175 | description_{std::move(desc)}, 176 | shortcut_{shortcut} { 177 | } 178 | 179 | template 180 | option& init(T const& x) { 181 | std::ostringstream ss; 182 | ss << x; 183 | values_.push_back(ss.str()); 184 | max_vals_ = (values_.size() == 1) ? 1 : -1; 185 | return *this; 186 | } 187 | 188 | template 189 | option& init(T const& head, Args...
tail) { 190 | init(head); 191 | init(tail...); 192 | return *this; 193 | } 194 | 195 | option& multi(size_t n = -1) { 196 | max_vals_ = n; 197 | return *this; 198 | } 199 | 200 | option& single() { 201 | return multi(1); 202 | } 203 | 204 | private: 205 | std::string name_; 206 | std::vector values_; 207 | std::string description_; 208 | size_t max_vals_ = 0; 209 | bool defaulted_ = true; 210 | char shortcut_ = '\0'; 211 | }; 212 | 213 | /// A proxy class to add options to the configuration. 214 | class block { 215 | friend class configuration; 216 | block(block const&) = delete; 217 | block& operator=(block other) = delete; 218 | 219 | public: 220 | /// Separates hierarchical options. 221 | static constexpr char const* separator = "."; 222 | 223 | /// Move-constructs a block. 224 | /// @param other The block to move. 225 | block(block&& other) 226 | : visible_{other.visible_}, 227 | name_{std::move(other.name_)}, 228 | prefix_{std::move(other.prefix_)}, 229 | options_{std::move(other.options_)}, 230 | config_{other.config_} { 231 | other.visible_ = true; 232 | other.config_ = nullptr; 233 | } 234 | 235 | /// Adds a new option. 236 | /// @param name The option name. 237 | /// @param desc The option description. 238 | option& add(std::string const& name, std::string desc) { 239 | std::string fqn = qualify(name); 240 | if (config_->find_option(fqn)) 241 | throw std::logic_error{"duplicate option"}; 242 | options_.emplace_back(std::move(fqn), std::move(desc)); 243 | return options_.back(); 244 | } 245 | 246 | /// Adds a new option with shortcut. 247 | /// @param shortcut The shortcut of the option (single character). 248 | /// @param name The option name. 249 | /// @param desc The option description. 
250 | option& add(char shortcut, std::string const& name, std::string desc) { 251 | if (config_->shortcuts_.count({shortcut})) 252 | throw std::logic_error{"duplicate shortcut"}; 253 | std::string fqn = qualify(name); 254 | config_->shortcuts_.insert({{shortcut}, fqn}); 255 | if (config_->find_option(fqn)) 256 | throw std::logic_error{"duplicate option"}; 257 | options_.emplace_back(std::move(fqn), std::move(desc), shortcut); 258 | return options_.back(); 259 | } 260 | 261 | /// Returns whether this block is visible when displaying the usage. 262 | bool visible() const { 263 | return visible_; 264 | } 265 | 266 | /// Sets the visibility of this block when displaying the usage. 267 | void visible(bool flag) { 268 | visible_ = flag; 269 | } 270 | 271 | private: 272 | block(std::string name, std::string prefix, configuration* config) 273 | : name_{std::move(name)}, prefix_{std::move(prefix)}, config_{config} { 274 | } 275 | 276 | std::string qualify(std::string const& name) const { 277 | return prefix_.empty() ? name : prefix_ + separator + name; 278 | } 279 | 280 | bool visible_ = true; 281 | std::string name_; 282 | std::string prefix_; 283 | std::vector