├── .clang-format
├── .gitignore
├── .travis.yml
├── LICENSE
├── Makefile
├── README.md
├── appveyor.yml
├── benchmarks
    ├── Makefile
    ├── basic.js
    ├── bulk-insert-and-query.cc
    ├── conext-figure5.cc
    ├── conext-table3.cc
    ├── random.h
    └── timing.h
├── binding.gyp
├── cuckoo.cc
├── cuckoo.h
├── example
    ├── test-chars.cc
    └── test.cc
├── index.js
├── package.json
├── src
    ├── bitsutil.h
    ├── cuckoofilter.h
    ├── debug.h
    ├── hashutil.cc
    ├── hashutil.h
    ├── packedtable.h
    ├── permencoding.h
    ├── printutil.cc
    ├── printutil.h
    ├── simd-block.h
    └── singletable.h
└── test.js


/.clang-format:
--------------------------------------------------------------------------------
 1 | ---
 2 | Language:        Cpp
 3 | # BasedOnStyle:  Google
 4 | AccessModifierOffset: -1
 5 | AlignAfterOpenBracket: Align
 6 | AlignConsecutiveAssignments: false
 7 | AlignConsecutiveDeclarations: false
 8 | AlignEscapedNewlinesLeft: true
 9 | AlignOperands:   true
10 | AlignTrailingComments: true
11 | AllowAllParametersOfDeclarationOnNextLine: true
12 | AllowShortBlocksOnASingleLine: false
13 | AllowShortCaseLabelsOnASingleLine: false
14 | AllowShortFunctionsOnASingleLine: All
15 | AllowShortIfStatementsOnASingleLine: true
16 | AllowShortLoopsOnASingleLine: true
17 | AlwaysBreakAfterDefinitionReturnType: None
18 | AlwaysBreakAfterReturnType: None
19 | AlwaysBreakBeforeMultilineStrings: true
20 | AlwaysBreakTemplateDeclarations: true
21 | BinPackArguments: true
22 | BinPackParameters: true
23 | BraceWrapping:   
24 |   AfterClass:      false
25 |   AfterControlStatement: false
26 |   AfterEnum:       false
27 |   AfterFunction:   false
28 |   AfterNamespace:  false
29 |   AfterObjCDeclaration: false
30 |   AfterStruct:     false
31 |   AfterUnion:      false
32 |   BeforeCatch:     false
33 |   BeforeElse:      false
34 |   IndentBraces:    false
35 | BreakBeforeBinaryOperators: None
36 | BreakBeforeBraces: Attach
37 | BreakBeforeTernaryOperators: true
38 | BreakConstructorInitializersBeforeComma: false
39 | BreakAfterJavaFieldAnnotations: false
40 | BreakStringLiterals: true
41 | ColumnLimit:     80
42 | CommentPragmas:  '^ IWYU pragma:'
43 | ConstructorInitializerAllOnOneLineOrOnePerLine: true
44 | ConstructorInitializerIndentWidth: 4
45 | ContinuationIndentWidth: 4
46 | Cpp11BracedListStyle: true
47 | DerivePointerAlignment: false
48 | DisableFormat:   false
49 | ExperimentalAutoDetectBinPacking: false
50 | ForEachMacros:   [ foreach, Q_FOREACH, BOOST_FOREACH ]
51 | IncludeCategories: 
52 |   - Regex:           '^<.*\.h>'
53 |     Priority:        1
54 |   - Regex:           '^<.*'
55 |     Priority:        2
56 |   - Regex:           '.*'
57 |     Priority:        3
58 | IncludeIsMainRegex: '([-_](test|unittest))?$'
59 | IndentCaseLabels: true
60 | IndentWidth:     2
61 | IndentWrappedFunctionNames: false
62 | JavaScriptQuotes: Leave
63 | JavaScriptWrapImports: true
64 | KeepEmptyLinesAtTheStartOfBlocks: false
65 | MacroBlockBegin: ''
66 | MacroBlockEnd:   ''
67 | MaxEmptyLinesToKeep: 1
68 | NamespaceIndentation: None
69 | ObjCBlockIndentWidth: 2
70 | ObjCSpaceAfterProperty: false
71 | ObjCSpaceBeforeProtocolList: false
72 | PenaltyBreakBeforeFirstCallParameter: 1
73 | PenaltyBreakComment: 300
74 | PenaltyBreakFirstLessLess: 120
75 | PenaltyBreakString: 1000
76 | PenaltyExcessCharacter: 1000000
77 | PenaltyReturnTypeOnItsOwnLine: 200
78 | PointerAlignment: Right
79 | ReflowComments:  true
80 | SortIncludes:    true
81 | SpaceAfterCStyleCast: false
82 | SpaceBeforeAssignmentOperators: true
83 | SpaceBeforeParens: ControlStatements
84 | SpaceInEmptyParentheses: false
85 | SpacesBeforeTrailingComments: 2
86 | SpacesInAngles:  false
87 | SpacesInContainerLiterals: true
88 | SpacesInCStyleCastParentheses: false
89 | SpacesInParentheses: false
90 | SpacesInSquareBrackets: false
91 | Standard:        Auto
92 | TabWidth:        8
93 | UseTab:          Never
94 | ...
95 | 
96 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
 1 | # Logs
 2 | logs
 3 | *.log
 4 | npm-debug.log*
 5 | 
 6 | # Runtime data
 7 | pids
 8 | *.pid
 9 | *.seed
10 | 
11 | # Directory for instrumented libs generated by jscoverage/JSCover
12 | lib-cov
13 | 
14 | # Coverage directory used by tools like istanbul
15 | coverage
16 | 
17 | # Grunt intermediate storage (http://gruntjs.com/creating-plugins#storing-task-files)
18 | .grunt
19 | 
20 | # node-waf configuration
21 | .lock-wscript
22 | 
23 | # Compiled binary addons (http://nodejs.org/api/addons.html)
24 | build/Release
25 | 
26 | # Dependency directory
27 | node_modules
28 | 
29 | # Optional npm cache directory
30 | .npm
31 | 
32 | # Optional REPL history
33 | .node_repl_history
34 | 
35 | # 0x
36 | .__browserify_string_empty.js
37 | profile-*
38 | 
39 | # tap --cov
40 | .nyc_output/
41 | 
42 | # JetBrains IntelliJ IDEA
43 | .idea
44 | *.iml
45 | 
46 | # VS Code
47 | .vscode/
48 | 
49 | # lock files
50 | yarn.lock
51 | package-lock.json
52 | 
53 | # build
54 | 
55 | build/
56 | 
57 | #*
58 | #*#
59 | *#
60 | *.*#
61 | *.class
62 | *.dSYM
63 | *.la
64 | *.lo
65 | *.o
66 | *.so
67 | test
68 | test-chars
69 | 


--------------------------------------------------------------------------------
/.travis.yml:
--------------------------------------------------------------------------------
 1 | sudo: false
 2 | language: node_js
 3 | node_js:
 4 | - node
 5 | addons:
 6 |   apt:
 7 |     sources:
 8 |       - ubuntu-toolchain-r-test
 9 |     packages:
10 |       - g++-4.8
11 |       - gcc-4.8-multilib
12 |       - g++-4.8-multilib
13 |       - gcc-multilib
14 |       - g++-multilib
15 | os:
16 | - osx
17 | - linux
18 | before_deploy:
19 | - ARCHIVE_NAME="${TRAVIS_TAG:-latest}-$TRAVIS_OS_NAME-`uname -m`.tar"
20 | - npm run prebuild
21 | - if [[ "$TRAVIS_OS_NAME" == "linux" ]]; then ARCH=ia32 npm run prebuild; fi
22 | - tar --create --verbose --file="$ARCHIVE_NAME" --directory "$TRAVIS_BUILD_DIR/prebuilds" .
23 | deploy:
24 |   provider: releases
25 |   draft: false
26 |   prerelease: true
27 |   api_key:
28 |     secure: SgSC/v3jqjZUsG/J//XE6N/1HqzieJIpT4rmy+I3YgMGsbQFShp7G/Nq33Z9tuBgDf6c8iqOvLwqwBacX55qq2KjhW6N84+4Nmr/u+ogRDZIDyPVftuSW26IoieicixEq9s5AC9JyWDjOVUZbeMcDf+8Z6CJWQTbgpyMBiKNCTxVnFZuacHMUhdOzRHMyBrMEiEcHYTrtYNpit/Yr89QnDDy1+xRqybMJZQANM6R39E5F6bpQzaTtfCO/YgbCkFerI3zQzrAK17dbSzrnt83jOBkAn8pu3apCQ7eUz8SOr3csUHuO1JLAZdtOR2LQ08BZaQfWIR6MdBpeanKhN0uUQsPpypSBTHsYZq25QXl9GOm8+9WPNSWGFvhJ2xxbzMa18alL3ZeowF5NbM2rS164wXnHUakDP+OKimC7O77Lg96FgdzMFFzS0RlrTbUCM11JFAZ6AJ7Qmz7jYgb4X81qBjnzbcE/kFSrgTye8iCwI8JIyXbSyRh5BbTj+p4f/92xeXo0neDpV08tciRyR3ButYOJNfTzBZGvuc9ujha7hzZm7B1mftKkh33lNd64KLRQk0SrV3U9/QE0U02gDoIjJOSybZTJo1BxkII1l1BzNVRvcg4qqsPcBq9ZS4qKgjVfaywpESyvkgoDi0qBejQ7AuWNtKi7nsbrk/OKZAVPFk=
29 |   file: "$ARCHIVE_NAME"
30 |   skip_cleanup: true
31 |   on:
32 |     tags: true
33 |     node: node
34 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | Copyright (C) 2018, Matteo Collina
 2 | Copyright (C) 2013, Carnegie Mellon University and Intel Corporation
 3 | 
 4 | Licensed under the Apache License, Version 2.0 (the "License");
 5 | you may not use this file except in compliance with the License.
 6 | You may obtain a copy of the License at
 7 | 
 8 |      http://www.apache.org/licenses/LICENSE-2.0
 9 | 
10 | Unless required by applicable law or agreed to in writing, software
11 | distributed under the License is distributed on an "AS IS" BASIS,
12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | See the License for the specific language governing permissions and
14 | limitations under the License.
15 | 


--------------------------------------------------------------------------------
/Makefile:
--------------------------------------------------------------------------------
 1 | CC = g++
 2 | AR = ar
 3 | PREFIX=/usr/local
 4 | 
 5 | # Uncomment one of the following to switch between debug and opt mode
 6 | #OPT = -O3 -DNDEBUG
 7 | OPT = -g -ggdb
 8 | 
 9 | CFLAGS += --std=c++11 -fno-strict-aliasing -Wall -c -I. -I./include -I/usr/include/ -I./src/ $(OPT)
10 | 
11 | LDFLAGS+= -Wall -lpthread
12 | 
13 | LIBOBJECTS = \
14 | 	./src/hashutil.o \
15 | 
16 | HEADERS = $(wildcard src/*.h)
17 | ALIB = libcuckoofilter.a
18 | 
19 | TEST = test
20 | 
21 | all: $(TEST)
22 | 
23 | clean:
24 | 	rm -f $(TEST) */*.o
25 | 
26 | test: example/test.o $(LIBOBJECTS)
27 | 	$(CC) example/test.o $(LIBOBJECTS) $(LDFLAGS) -o $@
28 | 
29 | test-chars: example/test-chars.o $(LIBOBJECTS)
30 | 	$(CC) example/test-chars.o $(LIBOBJECTS) $(LDFLAGS) -o $@
31 | 
32 | %.o: %.cc ${HEADERS} Makefile
33 | 	$(CC) $(CFLAGS) $< -o $@
34 | 
35 | $(ALIB): $(LIBOBJECTS)
36 | 	$(AR) rcs $@ $(LIBOBJECTS)
37 | 
38 | .PHONY: install
39 | install: $(ALIB)
40 | 	install -D -m 0755 $(HEADERS) -t $(DESTDIR)$(PREFIX)/include/cuckoofilter
41 | 	install -D -m 0755 $< -t $(DESTDIR)$(PREFIX)/lib
42 | 
43 | .PHONY: uninstall
44 | uninstall:
45 | 	rm -f $(DESTDIR)$(PREFIX)/lib/$(ALIB)
46 | 	rm -rf $(DESTDIR)$(PREFIX)/include/cuckoofilter
47 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | # Cuckoo Filter
 2 | 
 3 | A cuckoo filter is a Bloom filter replacement for approximate set-membership queries. While Bloom filters are well-known space-efficient data structures for answering queries like "is item x in the set?", they do not support deletion. Variants that enable deletion (like counting Bloom filters) usually require much more space.
 4 | 
 5 | Cuckoo filters provide the flexibility to add and remove items dynamically. A cuckoo filter is based on cuckoo hashing (hence the name). It is essentially a cuckoo hash table storing each key's fingerprint. Cuckoo hash tables can be highly compact, so a cuckoo filter can use less space than a conventional Bloom filter for applications that require low false positive rates (< 3%).
 6 | For details about the algorithm and citations please use:
 7 | 
 8 | ["Cuckoo Filter: Practically Better Than Bloom"](http://www.cs.cmu.edu/~binfan/papers/conext14_cuckoofilter.pdf) in proceedings of ACM CoNEXT 2014 by Bin Fan, Dave Andersen and Michael Kaminsky
 9 | 
10 | This is a Node.js binding for the amazing
11 | [cuckoofilter](https://github.com/efficient/cuckoofilter) C++ library.
12 | 
13 | ## Install
14 | 
15 | ```
16 | $ npm install cuckoofilter-native
17 | ```
18 | 
19 | ## Usage
20 | 
21 | Here is a simple example for the basic usage of cuckoo filter.
22 | 
23 | ```js
24 | const CuckooFilter = require('cuckoofilter-native')
25 | 
26 | const filter = new CuckooFilter(1024)
27 | 
28 | filter.add('hello world') // returns the filter
29 | filter.contain('hello world') // returns true
30 | filter.size // returns 1
31 | filter.delete('hello world') // returns the filter
32 | filter.size // now returns 0, as we have deleted
33 | ```
34 | 
35 | ## API
36 | 
37 | A cuckoo filter supports the following operations:
38 | 
39 | *  `add(item)`: insert an item into the filter
40 | *  `contain(item)`: return whether the item is in the filter. Note that, as with Bloom filters, this method may return false positives
41 | *  `delete(item)`: delete the given item from the filter. Note that to use this method, it must be ensured that this item is in the filter (e.g., based on records on external storage); otherwise, a false item may be deleted.
42 | *  `size`: return the total number of items currently in the filter
43 | *  `bytes`: return the filter size in bytes
44 | 
45 | `item` must be a string, but `Buffer` could be supported in the future.
46 | Feel free to send a PR.
47 | 
48 | ## Benchmarks
49 | 
50 | This library is fast:
51 | 
52 | ```
53 | cuckoofilter-native add x 1,097,031 ops/sec ±2.12% (82 runs sampled)
54 | cuckoofilter-native contain x 3,286,894 ops/sec ±2.08% (84 runs sampled)
55 | cuckoo-filter add x 2,267 ops/sec ±13.26% (16 runs sampled)
56 | cuckoo-filter contains x 31,680 ops/sec ±4.85% (70 runs sampled)
57 | bloomfilter add x 773,499 ops/sec ±2.38% (86 runs sampled)
58 | bloomfilter contains x 1,866,273 ops/sec ±2.20% (85 runs sampled)
59 | ```
60 | 
61 | ## License
62 | 
63 | Apache-2.0
64 | 


--------------------------------------------------------------------------------
/appveyor.yml:
--------------------------------------------------------------------------------
 1 | build: false
 2 | 
 3 | skip_branch_with_pr: true
 4 | 
 5 | environment:
 6 |   matrix:
 7 |     - nodejs_version: "Current"
 8 | 
 9 | configuration: Release
10 | platform:
11 |   - x86
12 |   - x64
13 | 
14 | install:
15 |   - SET PATH=C:\Program Files (x86)\Microsoft Visual Studio 14.0\VC\bin;%PATH%
16 |   - ps: Install-Product node $env:nodejs_version $env:platform
17 |   - npm install
18 | 
19 | test_script:
20 |   - node --version
21 |   - npm --version
22 |   - npm test
23 | 
24 | after_test:
25 |   - ps: If ($env:nodejs_version -eq "Current") { npm run prebuild }
26 | 
27 | artifacts:
28 |   - path: prebuilds
29 |     name: $(APPVEYOR_REPO_TAG_NAME)-win-$(PLATFORM)
30 |     type: zip
31 | 
32 | deploy:
33 |   - provider: GitHub
34 |     artifact: /.*\.zip/
35 |     draft: false
36 |     prerelease: true
37 |     auth_token:
38 |       secure: g3luGj79Aazd5zOSReTZ2yx4D93qv9vCwL7JFVo7zHfcj+cZIeEDy/KA6obqA6fA
39 |     on:
40 |       appveyor_repo_tag: true
41 |       nodejs_version: "Current"
42 | 


--------------------------------------------------------------------------------
/benchmarks/Makefile:
--------------------------------------------------------------------------------
 1 | # Uncomment one of the following to switch between debug and opt mode
 2 | OPT = -O3 -DNDEBUG
 3 | #OPT = -g -ggdb
 4 | 
 5 | CXXFLAGS += -fno-strict-aliasing -Wall -std=c++11 -I. -I../src/ $(OPT) -march=core-avx2
 6 | 
 7 | LDFLAGS+= -Wall -lpthread -lssl -lcrypto
 8 | 
 9 | HEADERS = $(wildcard ../src/*.h) *.h
10 | 
11 | SRC = ../src/hashutil.cc
12 | 
13 | .PHONY: all
14 | 
15 | BINS = conext-table3.exe conext-figure5.exe bulk-insert-and-query.exe
16 | 
17 | all: $(BINS)
18 | 
19 | clean:
20 | 	/bin/rm -f $(BINS)
21 | 
22 | %.exe: %.cc ${HEADERS} ${SRC} Makefile
23 | 	$(CXX) $(CXXFLAGS) $< -o $@ $(SRC) $(LDFLAGS)
24 | 


--------------------------------------------------------------------------------
/benchmarks/basic.js:
--------------------------------------------------------------------------------
 1 | 'use strict'
 2 | 
 3 | const benchmark = require('benchmark')
 4 | const suite = benchmark.Suite()
 5 | const Cuckoo = require('..')
 6 | const CuckooFilter = require('cuckoo-filter').CuckooFilter
 7 | const BloomFilter = require('bloomfilter').BloomFilter
 8 | 
 9 | const cuckoo = new Cuckoo(1024 * 8)
10 | const cuckoofilter = new CuckooFilter(200, 4, 2)
11 | 
12 | const bloom = new BloomFilter(
13 |   32 * 256,
14 |   16
15 | )
16 | 
17 | cuckoo.add('hello world')
18 | cuckoofilter.add('hello world')
19 | bloom.add('hello world')
20 | 
21 | suite.add('cuckoofilter-native add', function () {
22 |   cuckoo.add('hello world' + Math.random())
23 | })
24 | 
25 | suite.add('cuckoofilter-native contain', function () {
26 |   cuckoo.contain('hello world')
27 | })
28 | 
29 | suite.add('cuckoo-filter add', function () {
30 |   cuckoofilter.add('hello world' + Math.random())
31 | })
32 | 
33 | suite.add('cuckoo-filter contains', function () {
34 |   cuckoofilter.contains('hello world')
35 | })
36 | 
37 | suite.add('bloomfilter add', function () {
38 |   bloom.add('hello world' + Math.random())
39 | })
40 | 
41 | suite.add('bloomfilter contains', function () {
42 |   bloom.test('hello world')
43 | })
44 | 
45 | suite
46 |   .on('cycle', function (event) {
47 |     console.log(String(event.target))
48 |   })
49 |   .on('complete', function () {})
50 |   .run()
51 | 


--------------------------------------------------------------------------------
/benchmarks/bulk-insert-and-query.cc:
--------------------------------------------------------------------------------
  1 | // This benchmark reports on the bulk insert and bulk query rates. It is invoked as:
  2 | //
  3 | //     ./bulk-insert-and-query.exe 158000
  4 | //
  5 | // That invocation will test each probabilistic membership container type with 158000
  6 | // randomly generated items. It tests bulk Add() from empty to full and Contain() on
  7 | // filters with varying rates of expected success. For instance, at 75%, three out of
  8 | // every four values passed to Contain() were earlier Add()ed.
  9 | //
 10 | // Example output:
 11 | //
 12 | // $ for num in 55 75 85; do echo $num:; /usr/bin/time -f 'time: %e seconds' ./bulk-insert-and-query.exe ${num}00000; echo; done
 13 | // 55:
 14 | //                   Million    Find    Find    Find    Find    Find                       optimal  wasted
 15 | //                  adds/sec      0%     25%     50%     75%    100%       ε  bits/item  bits/item   space
 16 | //      Cuckoo12       23.78   37.24   35.04   37.17   37.35   36.35  0.131%      18.30       9.58   91.1%
 17 | //    SemiSort13       11.63   17.55   17.08   17.14   17.54   22.32  0.064%      18.30      10.62   72.4%
 18 | //       Cuckoo8       35.31   49.32   50.24   49.98   48.32   50.49  2.044%      12.20       5.61  117.4%
 19 | //     SemiSort9       13.99   22.23   22.78   22.13   23.16   24.06  1.207%      12.20       6.37   91.5%
 20 | //      Cuckoo16       27.06   36.94   37.12   35.31   36.81   35.10  0.009%      24.40      13.46   81.4%
 21 | //    SemiSort17       10.37   15.70   15.84   15.78   15.55   15.93  0.004%      24.40      14.72   65.8%
 22 | //    SimdBlock8       74.22   72.34   74.23   74.34   74.69   74.32  0.508%      12.20       7.62   60.1%
 23 | // time: 14.34 seconds
 24 | //
 25 | // 75:
 26 | //                   Million    Find    Find    Find    Find    Find                       optimal  wasted
 27 | //                  adds/sec      0%     25%     50%     75%    100%       ε  bits/item  bits/item   space
 28 | //      Cuckoo12       15.61   37.24   37.23   37.34   37.15   37.36  0.173%      13.42       9.18   46.2%
 29 | //    SemiSort13        8.77   17.11   15.70   17.34   17.73   18.86  0.087%      13.42      10.17   31.9%
 30 | //       Cuckoo8       23.46   48.81   48.14   39.48   49.28   49.65  2.806%       8.95       5.16   73.6%
 31 | //     SemiSort9       11.14   23.98   20.80   23.37   24.35   21.41  1.428%       8.95       6.13   46.0%
 32 | //      Cuckoo16       15.08   36.64   36.75   36.83   36.59   36.74  0.011%      17.90      13.11   36.5%
 33 | //    SemiSort17        8.02   15.63   15.66   15.87   15.67   15.88  0.006%      17.90      14.02   27.6%
 34 | //    SimdBlock8       73.26   74.41   74.28   70.86   72.02   70.69  2.071%       8.95       5.59   60.0%
 35 | // time: 18.06 seconds
 36 | //
 37 | // 85:
 38 | //                   Million    Find    Find    Find    Find    Find                       optimal  wasted
 39 | //                  adds/sec      0%     25%     50%     75%    100%       ε  bits/item  bits/item   space
 40 | //      Cuckoo12       22.74   32.49   32.69   32.58   32.85   32.71  0.102%      23.69       9.94  138.3%
 41 | //    SemiSort13        9.97   13.16   13.15   13.54   16.01   19.58  0.056%      23.69      10.80  119.4%
 42 | //       Cuckoo8       30.67   36.86   36.79   37.09   36.97   36.87  1.581%      15.79       5.98  163.9%
 43 | //     SemiSort9       10.96   15.49   15.37   15.40   15.18   15.63  1.047%      15.79       6.58  140.1%
 44 | //      Cuckoo16       27.84   33.74   33.72   33.69   33.75   33.62  0.007%      31.58      13.80  128.8%
 45 | //    SemiSort17        9.51   12.83   12.80   12.64   12.86   12.50  0.004%      31.58      14.65  115.6%
 46 | //    SimdBlock8       54.84   58.37   59.73   59.13   60.11   60.12  0.144%      15.79       9.44   67.3%
 47 | // time: 19.43 seconds
 48 | //
 49 | 
 50 | #include <climits>
 51 | #include <iomanip>
 52 | #include <map>
 53 | #include <stdexcept>
 54 | #include <vector>
 55 | 
 56 | #include "cuckoofilter.h"
 57 | #include "random.h"
 58 | #include "simd-block.h"
 59 | #include "timing.h"
 60 | 
 61 | using namespace std;
 62 | 
 63 | using namespace cuckoofilter;
 64 | 
 65 | // The number of items sampled when determining the lookup performance
 66 | const size_t SAMPLE_SIZE = 1000 * 1000;
 67 | 
 68 | // The statistics gathered for each table type by FilterBenchmark:
 69 | struct Statistics {
 70 |   double adds_per_nano;  // number of items added divided by the elapsed nanoseconds
 71 |   map<int, double> finds_per_nano; // The key is the percent of queries that were expected
 72 |                                    // to be positive
 73 |   double false_positive_probabilty;  // fraction of lookups found in the 0%-expected mix
 74 |   double bits_per_item;  // filter size in bits divided by the number of items added
 75 | };
 76 | 
 77 | // Builds the two-line header of the results table. type_width is the maximum number
 78 | // of characters of the description of any table type, and find_percent_count is the
 79 | // number of lookup columns. The expected-positive probabilities are assumed to be
 80 | // evenly distributed from 0% to 100%. With fewer than two columns the divisor is
 81 | // clamped to 1, avoiding the 0/0 (NaN converted to int) the spacing formula would hit.
 82 | string StatisticsTableHeader(int type_width, int find_percent_count) {
 83 |   ostringstream os;
 84 | 
 85 |   os << string(type_width, ' ');
 86 |   os << setw(12) << right << "Million";
 87 |   for (int i = 0; i < find_percent_count; ++i) {
 88 |     os << setw(8) << "Find";
 89 |   }
 90 |   os << setw(8) << "" << setw(11) << "" << setw(11)
 91 |      << "optimal" << setw(8) << "wasted" << endl;
 92 | 
 93 |   os << string(type_width, ' ');
 94 |   os << setw(12) << right << "adds/sec";
 95 |   const double steps = find_percent_count > 1 ? find_percent_count - 1 : 1;
 96 |   for (int i = 0; i < find_percent_count; ++i) {
 97 |     os << setw(7) << static_cast<int>(100 * i / steps) << '%';
 98 |   }
 99 |   os << setw(9) << "ε" << setw(11) << "bits/item" << setw(11)
100 |      << "bits/item" << setw(8) << "space";
101 |   return os.str();
102 | }
103 | 
104 | // Stream insertion for Statistics: writes one formatted row of the results table.
105 | template <class CharT, class Traits>
106 | basic_ostream<CharT, Traits>& operator<<(
107 |     basic_ostream<CharT, Traits>& os, const Statistics& stats) {
108 |   constexpr double NANOS_PER_MILLION = 1000;
109 |   const double fpp = stats.false_positive_probabilty;
110 |   const auto optimal_bits = log2(1 / fpp);  // value printed in the "optimal bits/item" column
111 |   os << fixed << setprecision(2) << setw(12) << right
112 |      << stats.adds_per_nano * NANOS_PER_MILLION;
113 |   for (const auto& rate : stats.finds_per_nano) {
114 |     os << setw(8) << rate.second * NANOS_PER_MILLION;
115 |   }
116 |   os << setw(7) << setprecision(3) << fpp * 100 << '%';
117 |   os << setw(11) << setprecision(2) << stats.bits_per_item << setw(11) << optimal_bits;
118 |   os << setw(7) << setprecision(1) << 100 * (stats.bits_per_item / optimal_bits - 1) << '%';
119 |   return os;
120 | }
121 | // Empty primary template; the specializations below supply ConstructFromAddCount, Add and Contain.
122 | template<typename Table>
123 | struct FilterAPI {};
124 | // FilterAPI for CuckooFilter: maps the filter's integer status results onto void/bool.
125 | template <typename ItemType, size_t bits_per_item, template <size_t> class TableType>
126 | struct FilterAPI<CuckooFilter<ItemType, bits_per_item, TableType>> {
127 |   using Table = CuckooFilter<ItemType, bits_per_item, TableType>;
128 |   static Table ConstructFromAddCount(size_t add_count) { return Table(add_count); }
129 |   static void Add(uint64_t key, Table * table) {
130 |     if (0 != table->Add(key)) {  // any non-zero result is reported as a failed insert
131 |       throw logic_error("The filter is too small to hold all of the elements");
132 |     }
133 |   }
134 |   static bool Contain(uint64_t key, const Table * table) {
135 |     return (0 == table->Contain(key));  // a result of 0 is treated as "found"
136 |   }
137 | };
138 | // FilterAPI for SimdBlockFilter: forwards to Add/Find; no result of Add is checked here.
139 | template <>
140 | struct FilterAPI<SimdBlockFilter<>> {
141 |   using Table = SimdBlockFilter<>;
142 |   static Table ConstructFromAddCount(size_t add_count) {
143 |     Table ans(ceil(log2(add_count * 8.0 / CHAR_BIT)));  // NOTE(review): presumably log2 of the byte size at 8 bits per item -- confirm in simd-block.h
144 |     return ans;
145 |   }
146 |   static void Add(uint64_t key, Table* table) {
147 |     table->Add(key);
148 |   }
149 |   static bool Contain(uint64_t key, const Table * table) {
150 |     return table->Find(key);
151 |   }
152 | };
153 | // Benchmarks one filter type: times add_count inserts, then SAMPLE_SIZE lookups per mix.
154 | template <typename Table>
155 | Statistics FilterBenchmark(
156 |     size_t add_count, const vector<uint64_t>& to_add, const vector<uint64_t>& to_lookup) {
157 |   if (add_count > to_add.size()) {
158 |     throw out_of_range("to_add must contain at least add_count values");
159 |   }
160 | 
161 |   if (SAMPLE_SIZE > to_lookup.size()) {
162 |     throw out_of_range("to_lookup must contain at least SAMPLE_SIZE values");
163 |   }
164 | 
165 |   Table filter = FilterAPI<Table>::ConstructFromAddCount(add_count);
166 |   Statistics result;
167 | 
168 |   // Insert the first add_count values of to_add, timing the whole loop:
169 |   auto start_time = NowNanos();
170 |   for (size_t added = 0; added < add_count; ++added) {
171 |     FilterAPI<Table>::Add(to_add[added], &filter);
172 |   }
173 |   result.adds_per_nano = add_count / static_cast<double>(NowNanos() - start_time);
174 |   result.bits_per_item = static_cast<double>(CHAR_BIT * filter.SizeInBytes()) / add_count;
175 |   // The ranges below use data() + n: &v[n] would index one past the last element.
176 |   size_t found_count = 0;
177 |   for (const double found_probability : {0.0, 0.25, 0.50, 0.75, 1.00}) {
178 |     const auto to_lookup_mixed = MixIn(to_lookup.data(), to_lookup.data() + SAMPLE_SIZE,
179 |         to_add.data(), to_add.data() + add_count, found_probability);
180 |     const auto lookup_start = NowNanos();
181 |     for (const auto v : to_lookup_mixed) {
182 |       found_count += FilterAPI<Table>::Contain(v, &filter);
183 |     }
184 |     const auto lookup_time = NowNanos() - lookup_start;
185 |     result.finds_per_nano[100 * found_probability] =
186 |         SAMPLE_SIZE / static_cast<double>(lookup_time);
187 |     if (0.0 == found_probability) {  // hits in the 0% mix are counted as false positives
188 |       result.false_positive_probabilty =
189 |           found_count / static_cast<double>(to_lookup_mixed.size());
190 |     }
191 |   }
192 |   return result;
193 | }
194 | // Usage: bulk-insert-and-query.exe NUMBER -- benchmarks each filter type with NUMBER keys.
195 | int main(int argc, char * argv[]) {
196 |   if (argc != 2) {
197 |     cerr << "Usage: " << argv[0] << " $NUMBER" << endl;
198 |     return 1;
199 |   }
200 |   stringstream input_string(argv[1]);
201 |   size_t add_count;
202 |   input_string >> add_count;
203 |   if (input_string.fail()) {
204 |     cerr << "Invalid number: " << argv[1] << endl;  // terminate the line, as the usage text does
205 |     return 2;
206 |   }
207 | 
208 |   const vector<uint64_t> to_add = GenerateRandom64(add_count);
209 |   const vector<uint64_t> to_lookup = GenerateRandom64(SAMPLE_SIZE);
210 | 
211 |   constexpr int NAME_WIDTH = 13;
212 |   // 5 is the number of lookup mixes (0% to 100%) that FilterBenchmark measures.
213 |   cout << StatisticsTableHeader(NAME_WIDTH, 5) << endl;
214 | 
215 |   auto cf = FilterBenchmark<
216 |       CuckooFilter<uint64_t, 12 /* bits per item */, SingleTable /* not semi-sorted*/>>(
217 |       add_count, to_add, to_lookup);
218 | 
219 |   cout << setw(NAME_WIDTH) << "Cuckoo12" << cf << endl;
220 | 
221 |   cf = FilterBenchmark<
222 |       CuckooFilter<uint64_t, 13 /* bits per item */, PackedTable /* semi-sorted*/>>(
223 |       add_count, to_add, to_lookup);
224 | 
225 |   cout << setw(NAME_WIDTH) << "SemiSort13" << cf << endl;
226 | 
227 |   cf = FilterBenchmark<
228 |       CuckooFilter<uint64_t, 8 /* bits per item */, SingleTable /* not semi-sorted*/>>(
229 |       add_count, to_add, to_lookup);
230 | 
231 |   cout << setw(NAME_WIDTH) << "Cuckoo8" << cf << endl;
232 | 
233 |   cf = FilterBenchmark<
234 |       CuckooFilter<uint64_t, 9 /* bits per item */, PackedTable /* semi-sorted*/>>(
235 |       add_count, to_add, to_lookup);
236 | 
237 |   cout << setw(NAME_WIDTH) << "SemiSort9" << cf << endl;
238 | 
239 |   cf = FilterBenchmark<
240 |       CuckooFilter<uint64_t, 16 /* bits per item */, SingleTable /* not semi-sorted*/>>(
241 |       add_count, to_add, to_lookup);
242 | 
243 |   cout << setw(NAME_WIDTH) << "Cuckoo16" << cf << endl;
244 | 
245 |   cf = FilterBenchmark<
246 |       CuckooFilter<uint64_t, 17 /* bits per item */, PackedTable /* semi-sorted*/>>(
247 |       add_count, to_add, to_lookup);
248 | 
249 |   cout << setw(NAME_WIDTH) << "SemiSort17" << cf << endl;
250 | 
251 |   cf = FilterBenchmark<SimdBlockFilter<>>(add_count, to_add, to_lookup);
252 | 
253 |   cout << setw(NAME_WIDTH) << "SimdBlock8" << cf << endl;
254 | 
255 | }
256 | 


--------------------------------------------------------------------------------
/benchmarks/conext-figure5.cc:
--------------------------------------------------------------------------------
 1 | // This benchmark reproduces the CoNEXT 2014 results found in "Figure 5: Lookup
 2 | // performance when a filter achieves its capacity." It takes about two minutes to run on
 3 | // an Intel(R) Core(TM) i7-4790 CPU @ 3.60GHz.
 4 | //
 5 | // Results:
 6 | // fraction of queries on existing items/lookup throughput (million OPS)
 7 | //                      CF     ss-CF
 8 | //         0.00%     24.79      9.37
 9 | //        25.00%     24.65      9.57
10 | //        50.00%     24.84      9.57
11 | //        75.00%     24.86      9.62
12 | //       100.00%     24.89      9.96
13 | 
14 | #include <climits>
15 | #include <iomanip>
16 | #include <vector>
17 | 
18 | #include "cuckoofilter.h"
19 | #include "random.h"
20 | #include "timing.h"
21 | 
22 | using namespace std;
23 | 
24 | using namespace cuckoofilter;
25 | 
26 | // The number of items sampled when determining the lookup performance
27 | const size_t SAMPLE_SIZE = 1000 * 1000;
28 | 
29 | // Returns the time (in seconds) to look up SAMPLE_SIZE keys when 0%, 25%, 50%, 75%, and
30 | // 100% of the keys looked up are expected to be found, in that order.
31 | template <typename Table>
32 | array<double, 5> CuckooBenchmark(
33 |     size_t add_count, const vector<uint64_t>& to_add, const vector<uint64_t>& to_lookup) {
34 |   Table cuckoo(add_count);
35 |   array<double, 5> result;
36 | 
37 |   // Add values until failure or until we run out of values to add:
38 |   size_t added = 0;
39 |   while (added < to_add.size() && 0 == cuckoo.Add(to_add[added])) ++added;
40 | 
41 |   size_t found_count = 0;  // tracked so the compiler cannot optimize out the lookups
42 |   for (const double found_percent : {0.0, 0.25, 0.50, 0.75, 1.00}) {
43 |     // data() + n forms the end pointers; &v[n] would index one past the last element.
44 |     const auto to_lookup_mixed = MixIn(to_lookup.data(), to_lookup.data() + SAMPLE_SIZE,
45 |         to_add.data(), to_add.data() + added, found_percent);
46 |     auto start_time = NowNanos();
47 |     for (const auto v : to_lookup_mixed) found_count += (0 == cuckoo.Contain(v));
48 |     auto lookup_time = NowNanos() - start_time;
49 |     result[found_percent * 4] = lookup_time / (1000.0 * 1000.0 * 1000.0);
50 |   }
51 |   if (6 * SAMPLE_SIZE == found_count) exit(1);  // reads found_count so it stays observable
52 |   return result;
53 | }
54 | 
55 | int main() {
56 |   // Capacity hint for the CuckooFilter constructor, which does not allow the caller
57 |   // to specify the space usage directly. How many distinct items are actually
58 |   // inserted depends on how many fit until an insert failure occurs.
59 |   const size_t requested_items = 127.78 * 1000 * 1000;
60 | 
61 |   // Generate twice as many random keys as requested so we don't run out of data:
62 |   const vector<uint64_t> to_add = GenerateRandom64(2 * requested_items);
63 |   const vector<uint64_t> to_lookup = GenerateRandom64(SAMPLE_SIZE);
64 | 
65 |   // Time the lookups for the plain and for the semi-sorted cuckoo filter:
66 |   using PlainFilter = CuckooFilter<uint64_t, 12 /* bits per item */, SingleTable>;
67 |   using SemiSortedFilter = CuckooFilter<uint64_t, 13 /* bits per item */, PackedTable>;
68 |   const auto plain_secs = CuckooBenchmark<PlainFilter>(requested_items, to_add, to_lookup);
69 |   const auto semi_sorted_secs =
70 |       CuckooBenchmark<SemiSortedFilter>(requested_items, to_add, to_lookup);
71 | 
72 |   cout << "fraction of queries on existing items/lookup throughput (million OPS) "
73 |        << endl;
74 |   cout << setw(10) << ""
75 |        << " " << setw(10) << right << "CF" << setw(10) << right << "ss-CF" << endl;
76 |   const size_t rows = plain_secs.size();
77 |   for (size_t row = 0; row < rows; ++row) {
78 |     const double fraction = row / static_cast<double>(rows - 1);
79 |     cout << fixed << setprecision(2) << setw(10) << right << 100 * fraction << "%";
80 |     cout << setw(10) << right << (SAMPLE_SIZE / plain_secs[row]) / (1000 * 1000);
81 |     cout << setw(10) << right << (SAMPLE_SIZE / semi_sorted_secs[row]) / (1000 * 1000);
82 |     cout << endl;
83 |   }
84 | }
85 | 


--------------------------------------------------------------------------------
/benchmarks/conext-table3.cc:
--------------------------------------------------------------------------------
 1 | // This benchmark reproduces the CoNEXT 2014 results found in "Table 3: Space efficiency
 2 | // and construction speed." It takes about two minutes to run on an Intel(R) Core(TM)
 3 | // i7-4790 CPU @ 3.60GHz.
 4 | //
 5 | // Results:
 6 | //
 7 | // metrics                                    CF     ss-CF
 8 | // # of items (million)                   127.82    127.90
 9 | // bits per item                           12.60     12.59
10 | // false positive rate                     0.18%     0.09%
11 | // constr. speed (million keys/sec)         5.86      4.10
12 | 
13 | #include <climits>
14 | #include <iomanip>
15 | #include <vector>
16 | 
17 | #include "cuckoofilter.h"
18 | #include "random.h"
19 | #include "timing.h"
20 | 
21 | using namespace std;
22 | 
23 | using namespace cuckoofilter;
24 | 
25 | // The number of items sampled when determining the false positive rate
26 | const size_t FPR_SAMPLE_SIZE = 1000 * 1000;
27 | 
// Measurements collected for one filter configuration; each field corresponds to one
// row of the table printed by main().
struct Metrics {
  double add_count;  // # of items (million)
  double space;      // bits per item
  double fpr;        // false positive rate (%)
  double speed;      // constr. speed (million keys/sec)
};
34 | 
35 | template<typename Table>
36 | Metrics CuckooBenchmark(size_t add_count, const vector<uint64_t>& input) {
37 |   Table cuckoo(add_count);
38 |   auto start_time = NowNanos();
39 | 
40 |   // Insert until failure:
41 |   size_t inserted = 0;
42 |   while (inserted < input.size() && 0 == cuckoo.Add(input[inserted])) ++inserted;
43 | 
44 |   auto constr_time = NowNanos() - start_time;
45 | 
46 |   // Count false positives:
47 |   size_t false_positive_count = 0;
48 |   size_t absent = 0;
49 |   for (; inserted + absent < input.size() && absent < FPR_SAMPLE_SIZE; ++absent) {
50 |     false_positive_count += (0 == cuckoo.Contain(input[inserted + absent]));
51 |   }
52 | 
53 |   // Calculate metrics:
54 |   const auto time = constr_time / static_cast<double>(1000 * 1000 * 1000);
55 |   Metrics result;
56 |   result.add_count = static_cast<double>(inserted) / (1000 * 1000);
57 |   result.space = static_cast<double>(CHAR_BIT * cuckoo.SizeInBytes()) / inserted;
58 |   result.fpr = (100.0 * false_positive_count) / absent;
59 |   result.speed = (inserted / time) / (1000 * 1000);
60 |   return result;
61 | }
62 | 
63 | int main() {
64 |   // Number of distinct values, used only for the constructor of CuckooFilter, which does
65 |   // not allow the caller to specify the space usage directly. The actual number of
66 |   // distinct items inserted depends on how many fit until an insert failure occurs.
67 |   const size_t add_count = 127.78 * 1000 * 1000;
68 | 
69 |   // Overestimate add_count so we don't run out of random data:
70 |   const size_t max_add_count = 2 * add_count;
71 |   const vector<uint64_t> input = GenerateRandom64(max_add_count + FPR_SAMPLE_SIZE);
72 | 
73 |   // Calculate metrics:
74 |   const auto cf = CuckooBenchmark<
75 |       CuckooFilter<uint64_t, 12 /* bits per item */, SingleTable /* not semi-sorted*/>>(
76 |       add_count, input);
77 |   const auto sscf = CuckooBenchmark<
78 |       CuckooFilter<uint64_t, 13 /* bits per item */, PackedTable /* semi-sorted*/>>(
79 |       add_count, input);
80 | 
81 |   cout << setw(35) << left << "metrics " << setw(10) << right << "CF" << setw(10)
82 |        << "ss-CF" << endl
83 |        << fixed << setprecision(2) << setw(35) << left << "# of items (million) "
84 |        << setw(10) << right << cf.add_count << setw(10) << sscf.add_count << endl
85 |        << setw(35) << left << "bits per item " << setw(10) << right << cf.space
86 |        << setw(10) << sscf.space << endl
87 |        << setw(35) << left << "false positive rate " << setw(9) << right << cf.fpr << "%"
88 |        << setw(9) << sscf.fpr << "%" << endl
89 |        << setw(35) << left << "constr. speed (million keys/sec) " << setw(10) << right
90 |        << cf.speed << setw(10) << sscf.speed << endl;
91 | }
92 | 


--------------------------------------------------------------------------------
/benchmarks/random.h:
--------------------------------------------------------------------------------
 1 | // Generating random data
 2 | 
 3 | #pragma once
 4 | 
 5 | #include <algorithm>
 6 | #include <cstdint>
 7 | #include <functional>
 8 | #include <random>
 9 | #include <stdexcept>
10 | #include <vector>
11 | 
12 | 
// Returns `count` random 64-bit values.
//
// Declared inline because this header defines the function; without it, including the
// header from more than one translation unit produces duplicate definitions.
inline ::std::vector<::std::uint64_t> GenerateRandom64(::std::size_t count) {
  ::std::vector<::std::uint64_t> result(count);
  ::std::random_device random;
  // To generate random keys to lookup, this uses ::std::random_device which is slower but
  // stronger than some other pseudo-random alternatives. The reason is that some of these
  // alternatives (like libstdc++'s ::std::default_random_engine, which is a linear
  // congruential generator) behave non-randomly under some hash families like
  // Dietzfelbinger's multiply-shift.
  for (auto& value : result) {
    // Two draws per value: one for the low half, one for the high half. Drawing them in
    // separate statements fixes the order in which the device is consulted.
    const ::std::uint64_t low = random();
    const ::std::uint64_t high = random();
    value = low + (high << 32);
  }
  return result;
}
27 | 
// Given two pointer ranges for sequences x and y, returns a shuffled copy of x in which
// a y_probability fraction of the elements has been replaced by elements drawn at random
// (with repetition) from y.
//
// Throws ::std::length_error if y holds more than 2^32 elements, and
// ::std::invalid_argument if y_probability exceeds 1 or if elements would have to be
// drawn from an empty y.
template <typename T>
::std::vector<T> MixIn(const T* x_begin, const T* x_end, const T* y_begin, const T* y_end,
    double y_probability) {
  const ::std::size_t x_size = x_end - x_begin;
  const ::std::size_t y_size = y_end - y_begin;
  if (y_size > (1ull << 32)) throw ::std::length_error("y is too long");
  // More than 100% would write past the end of the result.
  if (y_probability > 1.0) {
    throw ::std::invalid_argument("y_probability must not exceed 1");
  }
  // Upper bound (exclusive) on the indexes of x that get replaced.
  const double replaced = y_probability * x_size;
  // Drawing from an empty range would dereference y_end.
  if (replaced > 0 && y_size == 0) {
    throw ::std::invalid_argument("y is empty but y_probability is positive");
  }
  ::std::vector<T> result(x_begin, x_end);
  ::std::random_device random;
  for (::std::size_t i = 0; i < replaced; ++i) {
    // Scale a 32-bit draw into [0, y_size). The product is computed in 64 bits so it
    // cannot overflow where size_t is only 32 bits wide.
    const ::std::uint64_t scaled = static_cast<::std::uint64_t>(random()) * y_size;
    result[i] = *(y_begin + static_cast<::std::size_t>(scaled >> 32));
  }
  // The replacements were written to the front; shuffle to spread them out.
  ::std::shuffle(result.begin(), result.end(), random);
  return result;
}
46 | 


--------------------------------------------------------------------------------
/benchmarks/timing.h:
--------------------------------------------------------------------------------
 1 | // Timers for use in benchmarking.
 2 | 
 3 | #pragma once
 4 | 
 5 | #include <cstdint>
 6 | #include <chrono>
 7 | 
 8 | ::std::uint64_t NowNanos() {
 9 |   return ::std::chrono::duration_cast<::std::chrono::nanoseconds>(
10 |              ::std::chrono::steady_clock::now().time_since_epoch())
11 |       .count();
12 | }
13 | 


--------------------------------------------------------------------------------
/binding.gyp:
--------------------------------------------------------------------------------
 1 | {
 2 |   "targets": [
 3 |     {
 4 |       "target_name": "cuckoo",
 5 |       "cflags!": [ "--std=c++11 -fno-exceptions -fno-strict-aliasing" ],
 6 |       "cflags_cc!": [ "--std=c++11 -fno-exceptions -fno-strict-aliasing" ],
 7 |       "include_dirs": [
 8 |         "<!@(node -p \"require('node-addon-api').include\")",
 9 |         "src/"
10 |       ],
11 |       "dependencies": ["<!(node -p \"require('node-addon-api').gyp\")"],
12 |       'defines': [ 'NAPI_DISABLE_CPP_EXCEPTIONS' ],
13 |       "sources": [
14 |         "./src/hashutil.cc",
15 |         "./cuckoo.cc"
16 |       ],
17 |     }
18 |   ]
19 | }
20 | 


--------------------------------------------------------------------------------
/cuckoo.cc:
--------------------------------------------------------------------------------
  1 | #include "./cuckoo.h"
  2 | 
  3 | Napi::FunctionReference Cuckoo::constructor;
  4 | 
  5 | Napi::String Method(const Napi::CallbackInfo& info) {
  6 |   Napi::Env env = info.Env();
  7 |   return Napi::String::New(env, "world");
  8 | }
  9 | 
 10 | Napi::Object Cuckoo::Init(Napi::Env env, Napi::Object exports) {
 11 |   Napi::HandleScope scope(env);
 12 | 
 13 |   Napi::Function func = DefineClass(env, "CuckooFilter", {
 14 |     InstanceMethod("add", &Cuckoo::Add),
 15 |     InstanceMethod("contain", &Cuckoo::Contain),
 16 |     InstanceMethod("delete", &Cuckoo::Delete),
 17 |     InstanceAccessor("size", &Cuckoo::Size, NULL),
 18 |     InstanceAccessor("bytes", &Cuckoo::SizeInBytes, NULL)
 19 |   });
 20 | 
 21 |   constructor = Napi::Persistent(func);
 22 |   constructor.SuppressDestruct();
 23 | 
 24 |   exports.Set("CuckooFilter", func);
 25 |   return exports;
 26 | }
 27 | 
 28 | Cuckoo::Cuckoo(const Napi::CallbackInfo& info) : Napi::ObjectWrap<Cuckoo>(info)  {
 29 |   const Napi::Env env = info.Env();
 30 | 
 31 |   if (info.Length() != 1) {
 32 |     Napi::TypeError::New(env, "Wrong number of arguments").ThrowAsJavaScriptException();
 33 |     return;
 34 |   }
 35 | 
 36 |   if (!info[0].IsNumber()) {
 37 |     Napi::TypeError::New(env, "You must pass the filter size").ThrowAsJavaScriptException();
 38 |     return;
 39 |   }
 40 | 
 41 |   Napi::Number num = info[0].As<Napi::Number>();
 42 | 
 43 |   this->filter = new cuckoofilter::CuckooFilter<Napi::String, 12, cuckoofilter::SingleTable, NapiStringHash>(num.Uint32Value());
 44 | }
 45 | 
 46 | Cuckoo::~Cuckoo() {
 47 |   delete this->filter;
 48 | }
 49 | 
 50 | Napi::Value Cuckoo::Add(const Napi::CallbackInfo& info) {
 51 |   const Napi::Env env = info.Env();
 52 | 
 53 |   if (info.Length() != 1) {
 54 |     Napi::TypeError::New(env, "Wrong number of arguments").ThrowAsJavaScriptException();
 55 |     return env.Null();
 56 |   }
 57 | 
 58 |   if (!info[0].IsString()) {
 59 |     Napi::TypeError::New(env, "Only strings are supported").ThrowAsJavaScriptException();
 60 |     return env.Null();
 61 |   }
 62 | 
 63 |   Napi::String str = info[0].As<Napi::String>();
 64 | 
 65 |   // do not add twice, we cannot support unlimited adding
 66 |   // it will fail after ~10 add with the same parameter
 67 |   // otherwise
 68 |   if (this->filter->Contain(str) == cuckoofilter::Ok) {
 69 |     return info.This();
 70 |   }
 71 | 
 72 |   const int res = this->filter->Add(str);
 73 | 
 74 |   if (res == cuckoofilter::Ok) {
 75 |     return info.This();
 76 |   } else if (res == cuckoofilter::NotEnoughSpace) {
 77 |     Napi::Error::New(env, "Not enough space to add this key").ThrowAsJavaScriptException();
 78 |     return env.Null();
 79 |   } else {
 80 |     Napi::Error::New(env, "something went wrong during add").ThrowAsJavaScriptException();
 81 |     return env.Null();
 82 |   }
 83 | }
 84 | 
 85 | Napi::Value Cuckoo::Contain(const Napi::CallbackInfo& info) {
 86 |   const Napi::Env env = info.Env();
 87 | 
 88 |   if (info.Length() != 1) {
 89 |     Napi::TypeError::New(env, "Wrong number of arguments").ThrowAsJavaScriptException();
 90 |     return env.Null();
 91 |   }
 92 | 
 93 |   if (!info[0].IsString()) {
 94 |     Napi::TypeError::New(env, "Only strings are supported").ThrowAsJavaScriptException();
 95 |     return env.Null();
 96 |   }
 97 | 
 98 |   Napi::String str = info[0].As<Napi::String>();
 99 | 
100 |   if (this->filter->Contain(str) == cuckoofilter::Ok) {
101 |     return Napi::Boolean::New(env, true);
102 |   } else {
103 |     return Napi::Boolean::New(env, false);
104 |   }
105 | }
106 | 
107 | Napi::Value Cuckoo::Delete(const Napi::CallbackInfo& info) {
108 |   const Napi::Env env = info.Env();
109 | 
110 |   if (info.Length() != 1) {
111 |     Napi::TypeError::New(env, "Wrong number of arguments").ThrowAsJavaScriptException();
112 |     return env.Null();
113 |   }
114 | 
115 |   if (!info[0].IsString()) {
116 |     Napi::TypeError::New(env, "Only strings are supported").ThrowAsJavaScriptException();
117 |     return env.Null();
118 |   }
119 | 
120 |   Napi::String str = info[0].As<Napi::String>();
121 | 
122 |   const int res = this->filter->Delete(str);
123 | 
124 |   if (res == cuckoofilter::Ok) {
125 |     return info.This();
126 |   } else if (res == cuckoofilter::NotFound) {
127 |     // not found is ok too
128 |     return info.This();
129 |   } else {
130 |     Napi::Error::New(env, "something went wrong during delete").ThrowAsJavaScriptException();
131 |     return env.Null();
132 |   }
133 | }
134 | 
135 | Napi::Value Cuckoo::Size(const Napi::CallbackInfo& info) {
136 |   Napi::Env env = info.Env();
137 |   return Napi::Number::New(env, this->filter->Size());
138 | }
139 | 
140 | Napi::Value Cuckoo::SizeInBytes(const Napi::CallbackInfo& info) {
141 |   Napi::Env env = info.Env();
142 |   return Napi::Number::New(env, this->filter->SizeInBytes());
143 | }
144 | 
145 | Napi::Object Init(Napi::Env env, Napi::Object exports) {
146 |   return Cuckoo::Init(env, exports);
147 | }
148 | 
149 | NODE_API_MODULE(NODE_GYP_MODULE_NAME, Init)
150 | 
151 | 


--------------------------------------------------------------------------------
/cuckoo.h:
--------------------------------------------------------------------------------
 1 | #ifndef CUCKOO_H
 2 | #define CUCKOO_H
 3 | 
 4 | #include <napi.h>
 5 | #include "cuckoofilter.h"
 6 | 
 7 | class NapiStringHash {
 8 |   public:
 9 |     uint64_t operator()(Napi::String str) const {
10 |       return cuckoofilter::HashUtil::BobHash(str);
11 |     }
12 | };
13 | 
14 | class Cuckoo : public Napi::ObjectWrap<Cuckoo> {
15 |  public:
16 |   static Napi::Object Init(Napi::Env env, Napi::Object exports);
17 |   Cuckoo(const Napi::CallbackInfo& info);
18 |   ~Cuckoo();
19 | 
20 |  private:
21 |   Napi::Value Add(const Napi::CallbackInfo& info);
22 |   Napi::Value Contain(const Napi::CallbackInfo& info);
23 |   Napi::Value Delete(const Napi::CallbackInfo& info);
24 |   Napi::Value Size(const Napi::CallbackInfo& info);
25 |   Napi::Value SizeInBytes(const Napi::CallbackInfo& info);
26 | 
27 |   static Napi::FunctionReference constructor;
28 |   cuckoofilter::CuckooFilter<Napi::String, 12, cuckoofilter::SingleTable, NapiStringHash> *filter;
29 | };
30 | 
31 | #endif
32 | 


--------------------------------------------------------------------------------
/example/test-chars.cc:
--------------------------------------------------------------------------------
 1 | #include "cuckoofilter.h"
 2 | 
 3 | #include <assert.h>
 4 | #include <math.h>
 5 | 
 6 | #include <iostream>
 7 | #include <vector>
 8 | 
 9 | using cuckoofilter::CuckooFilter;
10 | 
// Minimal example item type: a wrapper around a string.
class MyObj {
  public:
    std::string str;

    // Takes the text by const reference and initializes the member directly, rather
    // than copying the argument and then copy-assigning it a second time.
    MyObj(const std::string& str) : str(str) {}
};
19 | 
20 | class MyHash {
21 |   public:
22 |     uint64_t operator()(MyObj* o) const {
23 |       return cuckoofilter::HashUtil::BobHash(o->str);
24 |     }
25 | };
26 | 
27 | int main(int argc, char **argv) {
28 |   size_t total_items = 1000000;
29 | 
30 |   CuckooFilter<MyObj*, 12, cuckoofilter::SingleTable, MyHash> filter(total_items);
31 | 
32 |   if (filter.Add(new MyObj("hello")) != cuckoofilter::Ok) {
33 |     std::cout << "unable to add";
34 |     return 0;
35 |   }
36 | 
37 |   if (filter.Contain(new MyObj("hello")) == cuckoofilter::Ok) {
38 |     std::cout << "ok";
39 |   } else {
40 |     std::cout << "not ok";
41 |   }
42 | 
43 |   if (filter.Contain(new MyObj("not hello")) == cuckoofilter::Ok) {
44 |     std::cout << "not ok";
45 |   } else {
46 |     std::cout << "ok";
47 |   }
48 | 
49 |   if (filter.Contain(new MyObj("muaaha")) == cuckoofilter::Ok) {
50 |     std::cout << "not ok";
51 |   } else {
52 |     std::cout << "ok";
53 |   }
54 | 
55 |   return 0;
56 | }
57 | 


--------------------------------------------------------------------------------
/example/test.cc:
--------------------------------------------------------------------------------
 1 | #include "cuckoofilter.h"
 2 | 
 3 | #include <assert.h>
 4 | #include <math.h>
 5 | 
 6 | #include <iostream>
 7 | #include <vector>
 8 | 
 9 | using cuckoofilter::CuckooFilter;
10 | 
11 | int main(int argc, char **argv) {
12 |   size_t total_items = 1000000;
13 | 
14 |   // Create a cuckoo filter where each item is of type size_t and
15 |   // use 12 bits for each item:
16 |   //    CuckooFilter<size_t, 12> filter(total_items);
17 |   // To enable semi-sorting, define the storage of cuckoo filter to be
18 |   // PackedTable, accepting keys of size_t type and making 13 bits
19 |   // for each key:
20 |   //   CuckooFilter<size_t, 13, cuckoofilter::PackedTable> filter(total_items);
21 |   CuckooFilter<size_t, 12> filter(total_items);
22 | 
23 |   // Insert items to this cuckoo filter
24 |   size_t num_inserted = 0;
25 |   for (size_t i = 0; i < total_items; i++, num_inserted++) {
26 |     if (filter.Add(i) != cuckoofilter::Ok) {
27 |       break;
28 |     }
29 |   }
30 | 
31 |   // Check if previously inserted items are in the filter, expected
32 |   // true for all items
33 |   for (size_t i = 0; i < num_inserted; i++) {
34 |     assert(filter.Contain(i) == cuckoofilter::Ok);
35 |   }
36 | 
37 |   // Check non-existing items, a few false positives expected
38 |   size_t total_queries = 0;
39 |   size_t false_queries = 0;
40 |   for (size_t i = total_items; i < 2 * total_items; i++) {
41 |     if (filter.Contain(i) == cuckoofilter::Ok) {
42 |       false_queries++;
43 |     }
44 |     total_queries++;
45 |   }
46 | 
47 |   // Output the measured false positive rate
48 |   std::cout << "false positive rate is "
49 |             << 100.0 * false_queries / total_queries << "%\n";
50 | 
51 |   return 0;
52 | }
53 | 


--------------------------------------------------------------------------------
/index.js:
--------------------------------------------------------------------------------
'use strict'

// Load the native binding for this package directory through node-gyp-build and expose
// its CuckooFilter class as the module's only export.
const { CuckooFilter } = require('node-gyp-build')(__dirname)

module.exports = CuckooFilter
5 | 


--------------------------------------------------------------------------------
/package.json:
--------------------------------------------------------------------------------
 1 | {
 2 |   "name": "cuckoofilter-native",
 3 |   "version": "0.1.1",
 4 |   "description": "A native implementation of a cuckoofilter",
 5 |   "main": "index.js",
 6 |   "directories": {
 7 |     "example": "example",
 8 |     "test": "test"
 9 |   },
10 |   "scripts": {
11 |     "install": "node-gyp-build",
12 |     "test": "standard | snazzy && tap test.js",
13 |     "prebuild": "prebuildify --napi"
14 |   },
15 |   "repository": {
16 |     "type": "git",
17 |     "url": "git+https://github.com/mcollina/cuckoofilter.git"
18 |   },
19 |   "keywords": [
20 |     "cuckoo",
21 |     "cuckoofilter",
22 |     "filter",
23 |     "bloom",
24 |     "bloomfilter"
25 |   ],
26 |   "author": "Matteo Collina <hello@matteocollina.com>",
27 |   "license": "Apache-2.0",
28 |   "bugs": {
29 |     "url": "https://github.com/mcollina/cuckoofilter/issues"
30 |   },
31 |   "homepage": "https://github.com/mcollina/cuckoofilter#readme",
32 |   "devDependencies": {
33 |     "benchmark": "^2.1.4",
34 |     "bloomfilter": "0.0.16",
35 |     "cuckoo-filter": "^1.0.4",
36 |     "pre-commit": "^1.2.2",
37 |     "prebuildify": "^2.6.0",
38 |     "snazzy": "^7.1.1",
39 |     "standard": "^11.0.1",
40 |     "tap": "^11.1.2"
41 |   },
42 |   "dependencies": {
43 |     "node-addon-api": "^1.2.0",
44 |     "node-gyp-build": "^3.3.0"
45 |   },
46 |   "gypfile": true
47 | }
48 | 


--------------------------------------------------------------------------------
/src/bitsutil.h:
--------------------------------------------------------------------------------
 1 | #ifndef CUCKOO_FILTER_BITS_H_
 2 | #define CUCKOO_FILTER_BITS_H_
 3 | 
 4 | namespace cuckoofilter {
 5 | 
// Word-at-a-time lane tests, inspired by
// http://www-graphics.stanford.edu/~seander/bithacks.html#ZeroInWord
//
// haszeroN(x) follows the cited trick with N-bit lanes: per that reference the result
// is nonzero when one of the lanes of x is zero. Each constant below spans four lanes
// (16, 32, 48 and 64 bits for N = 4, 8, 12 and 16).
// hasvalueN(x, n) XORs every lane with n first, so a lane equal to n becomes zero and
// is then detected by haszeroN.
// Both forms expand x more than once, so arguments must be free of side effects.
#define haszero4(x) (((x)-0x1111ULL) & (~(x)) & 0x8888ULL)
#define hasvalue4(x, n) (haszero4((x) ^ (0x1111ULL * (n))))

#define haszero8(x) (((x)-0x01010101ULL) & (~(x)) & 0x80808080ULL)
#define hasvalue8(x, n) (haszero8((x) ^ (0x01010101ULL * (n))))

#define haszero12(x) (((x)-0x001001001001ULL) & (~(x)) & 0x800800800800ULL)
#define hasvalue12(x, n) (haszero12((x) ^ (0x001001001001ULL * (n))))

#define haszero16(x) \
  (((x)-0x0001000100010001ULL) & (~(x)) & 0x8000800080008000ULL)
#define hasvalue16(x, n) (haszero16((x) ^ (0x0001000100010001ULL * (n))))
20 | 
// Rounds x up to the nearest power of two (a power of two maps to itself).
// For x == 0 the decrement wraps around and the result is 0.
inline uint64_t upperpower2(uint64_t x) {
  --x;
  // Smear the highest set bit into every lower position: shifts of 1, 2, 4, ..., 32.
  for (unsigned shift = 1; shift <= 32; shift <<= 1) {
    x |= x >> shift;
  }
  return x + 1;
}
32 | 
33 | }  // namespace cuckoofilter
34 | 
35 | #endif  // CUCKOO_FILTER_BITS_H_
36 | 


--------------------------------------------------------------------------------
/src/cuckoofilter.h:
--------------------------------------------------------------------------------
  1 | #ifndef CUCKOO_FILTER_CUCKOO_FILTER_H_
  2 | #define CUCKOO_FILTER_CUCKOO_FILTER_H_
  3 | 
  4 | #include <assert.h>
  5 | #include <algorithm>
  6 | 
  7 | #include "debug.h"
  8 | #include "hashutil.h"
  9 | #include "packedtable.h"
 10 | #include "printutil.h"
 11 | #include "singletable.h"
 12 | 
 13 | namespace cuckoofilter {
 14 | // status returned by a cuckoo filter operation
 15 | enum Status {
 16 |   Ok = 0,
 17 |   NotFound = 1,
 18 |   NotEnoughSpace = 2,
 19 |   NotSupported = 3,
 20 | };
 21 | 
 22 | // maximum number of cuckoo kicks before claiming failure
 23 | const size_t kMaxCuckooCount = 500;
 24 | 
 25 | // A cuckoo filter class exposes a Bloomier filter interface,
 26 | // providing methods of Add, Delete, Contain. It takes three
 27 | // template parameters:
 28 | //   ItemType:  the type of item you want to insert
 29 | //   bits_per_item: how many bits each item is hashed into
 30 | //   TableType: the storage of table, SingleTable by default, and
 31 | // PackedTable to enable semi-sorting
 32 | template <typename ItemType, size_t bits_per_item,
 33 |           template <size_t> class TableType = SingleTable,
 34 |           typename HashFamily = SimpleTabulation>
 35 | class CuckooFilter {
 36 |   // Storage of items
 37 |   TableType<bits_per_item> *table_;
 38 | 
 39 |   // Number of items stored
 40 |   size_t num_items_;
 41 | 
 42 |   typedef struct {
 43 |     size_t index;
 44 |     uint32_t tag;
 45 |     bool used;
 46 |   } VictimCache;
 47 | 
 48 |   VictimCache victim_;
 49 | 
 50 |   HashFamily hasher_;
 51 | 
 52 |   inline size_t IndexHash(uint32_t hv) const {
 53 |     // table_->num_buckets is always a power of two, so modulo can be replaced
 54 |     // with
 55 |     // bitwise-and:
 56 |     return hv & (table_->NumBuckets() - 1);
 57 |   }
 58 | 
 59 |   inline uint32_t TagHash(uint32_t hv) const {
 60 |     uint32_t tag;
 61 |     tag = hv & ((1ULL << bits_per_item) - 1);
 62 |     tag += (tag == 0);
 63 |     return tag;
 64 |   }
 65 | 
 66 |   inline void GenerateIndexTagHash(const ItemType& item, size_t* index,
 67 |                                    uint32_t* tag) const {
 68 |     const uint64_t hash = hasher_(item);
 69 |     *index = IndexHash(hash >> 32);
 70 |     *tag = TagHash(hash);
 71 |   }
 72 | 
 73 |   inline size_t AltIndex(const size_t index, const uint32_t tag) const {
 74 |     // NOTE(binfan): originally we use:
 75 |     // index ^ HashUtil::BobHash((const void*) (&tag), 4)) & table_->INDEXMASK;
 76 |     // now doing a quick-n-dirty way:
 77 |     // 0x5bd1e995 is the hash constant from MurmurHash2
 78 |     return IndexHash((uint32_t)(index ^ (tag * 0x5bd1e995)));
 79 |   }
 80 | 
 81 |   Status AddImpl(const size_t i, const uint32_t tag);
 82 | 
 83 |   // load factor is the fraction of occupancy
 84 |   double LoadFactor() const { return 1.0 * Size() / table_->SizeInTags(); }
 85 | 
 86 |   double BitsPerItem() const { return 8.0 * table_->SizeInBytes() / Size(); }
 87 | 
 88 |  public:
 89 |   explicit CuckooFilter(const size_t max_num_keys) : num_items_(0), victim_(), hasher_() {
 90 |     size_t assoc = 4;
 91 |     size_t num_buckets = upperpower2(std::max<uint64_t>(1, max_num_keys / assoc));
 92 |     double frac = (double)max_num_keys / num_buckets / assoc;
 93 |     if (frac > 0.96) {
 94 |       num_buckets <<= 1;
 95 |     }
 96 |     victim_.used = false;
 97 |     table_ = new TableType<bits_per_item>(num_buckets);
 98 |   }
 99 | 
100 |   ~CuckooFilter() { delete table_; }
101 | 
102 |   // Add an item to the filter.
103 |   Status Add(const ItemType &item);
104 | 
105 |   // Report if the item is inserted, with false positive rate.
106 |   Status Contain(const ItemType &item) const;
107 | 
108 |   // Delete an key from the filter
109 |   Status Delete(const ItemType &item);
110 | 
111 |   /* methods for providing stats  */
112 |   // summary infomation
113 |   std::string Info() const;
114 | 
115 |   // number of current inserted items;
116 |   size_t Size() const { return num_items_; }
117 | 
118 |   // size of the filter in bytes.
119 |   size_t SizeInBytes() const { return table_->SizeInBytes(); }
120 | };
121 | 
122 | template <typename ItemType, size_t bits_per_item,
123 |           template <size_t> class TableType, typename HashFamily>
124 | Status CuckooFilter<ItemType, bits_per_item, TableType, HashFamily>::Add(
125 |     const ItemType &item) {
126 |   size_t i;
127 |   uint32_t tag;
128 | 
129 |   if (victim_.used) {
130 |     return NotEnoughSpace;
131 |   }
132 | 
133 |   GenerateIndexTagHash(item, &i, &tag);
134 |   return AddImpl(i, tag);
135 | }
136 | 
137 | template <typename ItemType, size_t bits_per_item,
138 |           template <size_t> class TableType, typename HashFamily>
139 | Status CuckooFilter<ItemType, bits_per_item, TableType, HashFamily>::AddImpl(
140 |     const size_t i, const uint32_t tag) {
141 |   size_t curindex = i;
142 |   uint32_t curtag = tag;
143 |   uint32_t oldtag;
144 | 
145 |   for (uint32_t count = 0; count < kMaxCuckooCount; count++) {
146 |     bool kickout = count > 0;
147 |     oldtag = 0;
148 |     if (table_->InsertTagToBucket(curindex, curtag, kickout, oldtag)) {
149 |       num_items_++;
150 |       return Ok;
151 |     }
152 |     if (kickout) {
153 |       curtag = oldtag;
154 |     }
155 |     curindex = AltIndex(curindex, curtag);
156 |   }
157 | 
158 |   victim_.index = curindex;
159 |   victim_.tag = curtag;
160 |   victim_.used = true;
161 |   return Ok;
162 | }
163 | 
164 | template <typename ItemType, size_t bits_per_item,
165 |           template <size_t> class TableType, typename HashFamily>
166 | Status CuckooFilter<ItemType, bits_per_item, TableType, HashFamily>::Contain(
167 |     const ItemType &key) const {
168 |   bool found = false;
169 |   size_t i1, i2;
170 |   uint32_t tag;
171 | 
172 |   GenerateIndexTagHash(key, &i1, &tag);
173 |   i2 = AltIndex(i1, tag);
174 | 
175 |   assert(i1 == AltIndex(i2, tag));
176 | 
177 |   found = victim_.used && (tag == victim_.tag) &&
178 |           (i1 == victim_.index || i2 == victim_.index);
179 | 
180 |   if (found || table_->FindTagInBuckets(i1, i2, tag)) {
181 |     return Ok;
182 |   } else {
183 |     return NotFound;
184 |   }
185 | }
186 | 
187 | template <typename ItemType, size_t bits_per_item,
188 |           template <size_t> class TableType, typename HashFamily>
189 | Status CuckooFilter<ItemType, bits_per_item, TableType, HashFamily>::Delete(
190 |     const ItemType &key) {
191 |   size_t i1, i2;
192 |   uint32_t tag;
193 | 
194 |   GenerateIndexTagHash(key, &i1, &tag);
195 |   i2 = AltIndex(i1, tag);
196 | 
197 |   if (table_->DeleteTagFromBucket(i1, tag)) {
198 |     num_items_--;
199 |     goto TryEliminateVictim;
200 |   } else if (table_->DeleteTagFromBucket(i2, tag)) {
201 |     num_items_--;
202 |     goto TryEliminateVictim;
203 |   } else if (victim_.used && tag == victim_.tag &&
204 |              (i1 == victim_.index || i2 == victim_.index)) {
205 |     // num_items_--;
206 |     victim_.used = false;
207 |     return Ok;
208 |   } else {
209 |     return NotFound;
210 |   }
211 | TryEliminateVictim:
212 |   if (victim_.used) {
213 |     victim_.used = false;
214 |     size_t i = victim_.index;
215 |     uint32_t tag = victim_.tag;
216 |     AddImpl(i, tag);
217 |   }
218 |   return Ok;
219 | }
220 | 
221 | template <typename ItemType, size_t bits_per_item,
222 |           template <size_t> class TableType, typename HashFamily>
223 | std::string CuckooFilter<ItemType, bits_per_item, TableType, HashFamily>::Info() const {
224 |   std::stringstream ss;
225 |   ss << "CuckooFilter Status:\n"
226 |      << "\t\t" << table_->Info() << "\n"
227 |      << "\t\tKeys stored: " << Size() << "\n"
228 |      << "\t\tLoad factor: " << LoadFactor() << "\n"
229 |      << "\t\tHashtable size: " << (table_->SizeInBytes() >> 10) << " KB\n";
230 |   if (Size() > 0) {
231 |     ss << "\t\tbit/key:   " << BitsPerItem() << "\n";
232 |   } else {
233 |     ss << "\t\tbit/key:   N/A\n";
234 |   }
235 |   return ss.str();
236 | }
237 | }  // namespace cuckoofilter
238 | #endif  // CUCKOO_FILTER_CUCKOO_FILTER_H_
239 | 


--------------------------------------------------------------------------------
/src/debug.h:
--------------------------------------------------------------------------------
 1 | #ifndef CUCKOO_FILTER_DEBUG_H_
 2 | #define CUCKOO_FILTER_DEBUG_H_
 3 | 
 4 | #include <stdio.h>  // for perror
 5 | 
 6 | namespace cuckoofilter {
 7 | 
// Uncomment the define below (or define DEBUG on the compiler command line) to turn
// the debug macros into real output statements.
#ifndef DEBUG
//#define DEBUG
#endif

// Mask of enabled message categories. The DEBUG_* values are defined further down in
// this header; being a macro, this is only expanded where DPRINTF/DEBUG_PERROR are used.
#define debug_level (DEBUG_ERRS | DEBUG_CUCKOO)

#ifdef DEBUG
// extern unsigned int debug;

/*
 * a combination of DEBUG_ERRS, DEBUG_CUCKOO, DEBUG_TABLE, DEBUG_ENCODE
 */

// DPRINTF(level, fmt, ...): printf-style message written to stdout when `level`
// overlaps debug_level.
#define DPRINTF(level, ...)                                    \
  do {                                                         \
    if (debug_level & (level)) fprintf(stdout, ##__VA_ARGS__); \
  } while (0)
// DEBUG_PERROR(errmsg): calls perror(errmsg) when DEBUG_ERRS is enabled.
#define DEBUG_PERROR(errmsg)                      \
  do {                                            \
    if (debug_level & DEBUG_ERRS) perror(errmsg); \
  } while (0)

#else

// Without DEBUG both macros expand to nothing, so their arguments are never evaluated.
// NOTE(review): this DEBUG_PERROR accepts (level, ...) while the DEBUG variant takes a
// single errmsg; a one-argument call works with both.
#define DPRINTF(level, ...)
#define DEBUG_PERROR(level, ...)

#endif
36 | 
37 | /*
38 |  * The format of this should be obvious.  Please add some explanatory
39 |  * text if you add a debugging value.  This text will show up in
40 |  * -d list
41 |  */
42 | #define DEBUG_NONE 0x00    // DBTEXT:  No debugging
43 | #define DEBUG_ERRS 0x01    // DBTEXT:  Verbose error reporting
44 | #define DEBUG_CUCKOO 0x02  // DBTEXT:  Messages for cuckoo hashing
45 | #define DEBUG_TABLE 0x04   // DBTEXT:  Messages for table operations
46 | #define DEBUG_ENCODE 0x08  // DBTEXT:  Messages for encoding
47 | 
48 | #define DEBUG_ALL 0xffffffff
49 | 
50 | // int set_debug(char *arg);  /* Returns 0 on success, -1 on failure */
51 | 
52 | }  // namespace cuckoofilter
53 | 
54 | #endif  // CUCKOO_FILTER_DEBUG_H_
55 | 


--------------------------------------------------------------------------------
/src/hashutil.cc:
--------------------------------------------------------------------------------
  1 | // Pulled from lookup3.c by Bob Jenkins
  2 | #include "hashutil.h"
  3 | // rot: 32-bit left-rotate of x by k.  mix: scrambles the (a,b,c) state words.
  4 | #define rot(x, k) (((x) << (k)) | ((x) >> (32 - (k))))
  5 | #define mix(a,b,c)                              \
  6 |     {                                           \
  7 |         a -= c;  a ^= rot(c, 4);  c += b;       \
  8 |         b -= a;  b ^= rot(a, 6);  a += c;       \
  9 |         c -= b;  c ^= rot(b, 8);  b += a;       \
 10 |         a -= c;  a ^= rot(c,16);  c += b;       \
 11 |         b -= a;  b ^= rot(a,19);  a += c;       \
 12 |         c -= b;  c ^= rot(b, 4);  b += a;       \
 13 |     }
 14 | // final: last scrambling pass over (a,b,c) before results are read from c, b.
 15 | #define final(a,b,c)                            \
 16 |     {                                           \
 17 |         c ^= b; c -= rot(b,14);                 \
 18 |         a ^= c; a -= rot(c,11);                 \
 19 |         b ^= a; b -= rot(a,25);                 \
 20 |         c ^= b; c -= rot(b,16);                 \
 21 |         a ^= c; a -= rot(c,4);                  \
 22 |         b ^= a; b -= rot(a,14);                 \
 23 |         c ^= b; c -= rot(b,24);                 \
 24 |     }
 25 | // Assuming little endian: the BobHash alignment fast paths are always enabled
 26 | #define HASH_LITTLE_ENDIAN 1
 27 | // get16bits: reads a uint16_t straight from address d through a pointer cast.
 28 | #define get16bits(d) (*((const uint16_t *)(d)))
 29 | 
 30 | #if defined(_WIN32)
 31 | // Windows builds: define the u_intN_t names as aliases of <stdint.h> types.
 32 | #include <stdint.h>
 33 | 
 34 | typedef uint8_t u_int8_t;
 35 | typedef uint16_t u_int16_t;
 36 | typedef uint32_t u_int32_t;
 37 | 
 38 | #endif
 39 | 
 40 | namespace cuckoofilter {
 41 | /*
 42 |   hashlittle() -- hash a variable-length key into a 32-bit value
 43 |   k       : the key (the unaligned variable-length array of bytes)
 44 |   length  : the length of the key, counting by bytes
 45 |   initval : can be any 4-byte value
 46 |   Returns a 32-bit value.  Every bit of the key affects every bit of
 47 |   the return value.  Two keys differing by one or two bits will have
 48 |   totally different hash values.
 49 | 
 50 |   The best hash table sizes are powers of 2.  There is no need to do
 51 |   mod a prime (mod is sooo slow!).  If you need less than 32 bits,
 52 |   use a bitmask.  For example, if you need only 10 bits, do
 53 |   h = (h & hashmask(10));
 54 |   In which case, the hash table should have hashsize(10) elements.
 55 | 
 56 |   If you are hashing n strings (uint8_t **)k, do it like this:
 57 |   for (i=0, h=0; i<n; ++i) h = hashlittle( k[i], len[i], h);
 58 | 
 59 |   By Bob Jenkins, 2006.  bob_jenkins@burtleburtle.net.  You may use this
 60 |   code any way you wish, private, educational, or commercial.  It's free.
 61 | 
 62 |   Use for hash table lookup, or anything where one collision in 2^^32 is
 63 |   acceptable.  Do NOT use for cryptographic purposes.
 64 | */
 65 | 
 66 | uint32_t HashUtil::BobHash(const std::string &s, uint32_t seed) {
 67 |   return BobHash(s.data(), s.size(), seed);  // hash the raw bytes held by s
 68 | }
 69 | 
 70 | /*
 71 |  * Single-value Bob Jenkins hash (lookup3 hashlittle): hashes `length` bytes
 72 |  * at `buf`, seeded with `seed`, and returns a 32-bit value.
 73 |  *
 74 |  * This used to be a line-for-line copy of the two-value overload below,
 75 |  * differing only in how the state is seeded and in which word is returned.
 76 |  * Seeding that overload with (seed, 0) builds exactly the same initial state
 77 |  * (a = b = c = 0xdeadbeef + length + seed; adding a second seed of 0 leaves
 78 |  * c unchanged), and its first output is the c word this function returns.
 79 |  * Delegating therefore yields identical hash values, including for
 80 |  * zero-length keys, without keeping two copies of the algorithm in sync.
 81 |  */
 82 | uint32_t HashUtil::BobHash(const void *buf, size_t length, uint32_t seed) {
 83 |   uint32_t primary = seed; /* in: seed; out: the c word (the hash) */
 84 |   uint32_t secondary = 0;  /* in: neutral second seed; out: b word, unused */
 85 |   BobHash(buf, length, &primary, &secondary);
 86 |   return primary;
 87 | }
318 | 
 319 | /*
 320 |  * hashlittle2: hash `length` bytes at `buf` into two 32-bit values.
 321 |  *
 322 |  * Identical to the single-value BobHash (hashlittle) above, except that it
 323 |  * takes two seeds and returns two results: on entry *idx1 and *idx2 hold
 324 |  * the seeds; on return *idx1 is the c word and *idx2 the b word of the final
 325 |  * state.  Good enough for hash table lookup with 2^64 buckets, or for a
 326 |  * probably-unique 64-bit ID: "*idx1 + (((uint64_t)*idx2)<<32)".  *idx1 is
 327 |  * better mixed than *idx2, so use *idx1 first.
 328 |  */
 329 | void HashUtil::BobHash(const void *buf, size_t length, uint32_t *idx1,
 330 |                        uint32_t *idx2) {
 331 |   uint32_t a, b, c; /* internal state */
 332 |   union {
 333 |     const void *ptr;
 334 |     size_t i;
 335 |   } u; /* needed for Mac Powerbook G4 */
 336 |   /* u lets buf's address be inspected as an integer to test its alignment */
 337 |   /* Set up the internal state: *idx1 seeds a, b and c; *idx2 seeds only c */
 338 |   a = b = c = 0xdeadbeef + ((uint32_t)length) + *idx1;
 339 |   c += *idx2;
 340 | 
 341 |   u.ptr = buf;
 342 |   if (HASH_LITTLE_ENDIAN && ((u.i & 0x3) == 0)) { /* 4-byte aligned buf */
 343 |     const uint32_t *k = (const uint32_t *)buf; /* read 32-bit chunks */
 344 | #ifdef VALGRIND
 345 |     const uint8_t *k8;
 346 | #endif
 347 |     /*------ all but last block: aligned reads and affect 32 bits of (a,b,c) */
 348 |     while (length > 12) {
 349 |       a += k[0];
 350 |       b += k[1];
 351 |       c += k[2];
 352 |       mix(a, b, c);
 353 |       length -= 12;
 354 |       k += 3;
 355 |     }
 356 | 
 357 | /*----------------------------- handle the last (probably partial) block */
 358 | /*
 359 |  * "k[2]&0xffffff" actually reads beyond the end of the string, but
 360 |  * then masks off the part it's not allowed to read.  Because the
 361 |  * string is aligned, the masked-off tail is in the same word as the
 362 |  * rest of the string.  Every machine with memory protection I've seen
 363 |  * does it on word boundaries, so is OK with this.  But VALGRIND will
 364 |  * still catch it and complain.  The masking trick does make the hash
 365 |  * noticeably faster for short strings (like English words).
 366 |  */
 367 | #ifndef VALGRIND
 368 | 
 369 |     switch (length) {
 370 |       case 12:
 371 |         c += k[2];
 372 |         b += k[1];
 373 |         a += k[0];
 374 |         break;
 375 |       case 11:
 376 |         c += k[2] & 0xffffff;
 377 |         b += k[1];
 378 |         a += k[0];
 379 |         break;
 380 |       case 10:
 381 |         c += k[2] & 0xffff;
 382 |         b += k[1];
 383 |         a += k[0];
 384 |         break;
 385 |       case 9:
 386 |         c += k[2] & 0xff;
 387 |         b += k[1];
 388 |         a += k[0];
 389 |         break;
 390 |       case 8:
 391 |         b += k[1];
 392 |         a += k[0];
 393 |         break;
 394 |       case 7:
 395 |         b += k[1] & 0xffffff;
 396 |         a += k[0];
 397 |         break;
 398 |       case 6:
 399 |         b += k[1] & 0xffff;
 400 |         a += k[0];
 401 |         break;
 402 |       case 5:
 403 |         b += k[1] & 0xff;
 404 |         a += k[0];
 405 |         break;
 406 |       case 4:
 407 |         a += k[0];
 408 |         break;
 409 |       case 3:
 410 |         a += k[0] & 0xffffff;
 411 |         break;
 412 |       case 2:
 413 |         a += k[0] & 0xffff;
 414 |         break;
 415 |       case 1:
 416 |         a += k[0] & 0xff;
 417 |         break;
 418 |       case 0:
 419 |         *idx1 = c;
 420 |         *idx2 = b;
 421 |         return; /* zero length strings require no mixing */
 422 |     }
 423 | 
 424 | #else /* make valgrind happy */
 425 | 
 426 |     k8 = (const uint8_t *)k; /* byte view: the tail is read byte by byte */
 427 |     switch (length) {
 428 |       case 12:
 429 |         c += k[2];
 430 |         b += k[1];
 431 |         a += k[0];
 432 |         break;
 433 |       case 11:
 434 |         c += ((uint32_t)k8[10]) << 16; /* fall through */
 435 |       case 10:
 436 |         c += ((uint32_t)k8[9]) << 8; /* fall through */
 437 |       case 9:
 438 |         c += k8[8]; /* fall through */
 439 |       case 8:
 440 |         b += k[1];
 441 |         a += k[0];
 442 |         break;
 443 |       case 7:
 444 |         b += ((uint32_t)k8[6]) << 16; /* fall through */
 445 |       case 6:
 446 |         b += ((uint32_t)k8[5]) << 8; /* fall through */
 447 |       case 5:
 448 |         b += k8[4]; /* fall through */
 449 |       case 4:
 450 |         a += k[0];
 451 |         break;
 452 |       case 3:
 453 |         a += ((uint32_t)k8[2]) << 16; /* fall through */
 454 |       case 2:
 455 |         a += ((uint32_t)k8[1]) << 8; /* fall through */
 456 |       case 1:
 457 |         a += k8[0];
 458 |         break;
 459 |       case 0:
 460 |         *idx1 = c;
 461 |         *idx2 = b;
 462 |         return; /* zero length strings require no mixing */
 463 |     }
 464 | 
 465 | #endif /* !valgrind */
 466 | 
 467 |   } else if (HASH_LITTLE_ENDIAN && ((u.i & 0x1) == 0)) { /* 2-byte aligned */
 468 |     const uint16_t *k = (const uint16_t *)buf; /* read 16-bit chunks */
 469 |     const uint8_t *k8;
 470 | 
 471 |     /*--------------- all but last block: aligned reads and different mixing */
 472 |     while (length > 12) {
 473 |       a += k[0] + (((uint32_t)k[1]) << 16);
 474 |       b += k[2] + (((uint32_t)k[3]) << 16);
 475 |       c += k[4] + (((uint32_t)k[5]) << 16);
 476 |       mix(a, b, c);
 477 |       length -= 12;
 478 |       k += 6;
 479 |     }
 480 | 
 481 |     /*----------------------------- handle the last (probably partial) block */
 482 |     k8 = (const uint8_t *)k; /* byte view for an odd trailing byte */
 483 |     switch (length) {
 484 |       case 12:
 485 |         c += k[4] + (((uint32_t)k[5]) << 16);
 486 |         b += k[2] + (((uint32_t)k[3]) << 16);
 487 |         a += k[0] + (((uint32_t)k[1]) << 16);
 488 |         break;
 489 |       case 11:
 490 |         c += ((uint32_t)k8[10]) << 16; /* fall through */
 491 |       case 10:
 492 |         c += k[4];
 493 |         b += k[2] + (((uint32_t)k[3]) << 16);
 494 |         a += k[0] + (((uint32_t)k[1]) << 16);
 495 |         break;
 496 |       case 9:
 497 |         c += k8[8]; /* fall through */
 498 |       case 8:
 499 |         b += k[2] + (((uint32_t)k[3]) << 16);
 500 |         a += k[0] + (((uint32_t)k[1]) << 16);
 501 |         break;
 502 |       case 7:
 503 |         b += ((uint32_t)k8[6]) << 16; /* fall through */
 504 |       case 6:
 505 |         b += k[2];
 506 |         a += k[0] + (((uint32_t)k[1]) << 16);
 507 |         break;
 508 |       case 5:
 509 |         b += k8[4]; /* fall through */
 510 |       case 4:
 511 |         a += k[0] + (((uint32_t)k[1]) << 16);
 512 |         break;
 513 |       case 3:
 514 |         a += ((uint32_t)k8[2]) << 16; /* fall through */
 515 |       case 2:
 516 |         a += k[0];
 517 |         break;
 518 |       case 1:
 519 |         a += k8[0];
 520 |         break;
 521 |       case 0:
 522 |         *idx1 = c;
 523 |         *idx2 = b;
 524 |         return; /* zero length strings require no mixing */
 525 |     }
 526 | 
 527 |   } else { /* need to read the key one byte at a time */
 528 |     const uint8_t *k = (const uint8_t *)buf;
 529 | 
 530 |     /*--------------- all but the last block: affect some 32 bits of (a,b,c) */
 531 |     while (length > 12) {
 532 |       a += k[0];
 533 |       a += ((uint32_t)k[1]) << 8;
 534 |       a += ((uint32_t)k[2]) << 16;
 535 |       a += ((uint32_t)k[3]) << 24;
 536 |       b += k[4];
 537 |       b += ((uint32_t)k[5]) << 8;
 538 |       b += ((uint32_t)k[6]) << 16;
 539 |       b += ((uint32_t)k[7]) << 24;
 540 |       c += k[8];
 541 |       c += ((uint32_t)k[9]) << 8;
 542 |       c += ((uint32_t)k[10]) << 16;
 543 |       c += ((uint32_t)k[11]) << 24;
 544 |       mix(a, b, c);
 545 |       length -= 12;
 546 |       k += 12;
 547 |     }
 548 | 
 549 |     /*-------------------------------- last block: affect all 32 bits of (c) */
 550 |     switch (length) /* all the case statements fall through */
 551 |     {
 552 |       case 12:
 553 |         c += ((uint32_t)k[11]) << 24;
 554 |       case 11:
 555 |         c += ((uint32_t)k[10]) << 16;
 556 |       case 10:
 557 |         c += ((uint32_t)k[9]) << 8;
 558 |       case 9:
 559 |         c += k[8];
 560 |       case 8:
 561 |         b += ((uint32_t)k[7]) << 24;
 562 |       case 7:
 563 |         b += ((uint32_t)k[6]) << 16;
 564 |       case 6:
 565 |         b += ((uint32_t)k[5]) << 8;
 566 |       case 5:
 567 |         b += k[4];
 568 |       case 4:
 569 |         a += ((uint32_t)k[3]) << 24;
 570 |       case 3:
 571 |         a += ((uint32_t)k[2]) << 16;
 572 |       case 2:
 573 |         a += ((uint32_t)k[1]) << 8;
 574 |       case 1:
 575 |         a += k[0];
 576 |         break;
 577 |       case 0:
 578 |         *idx1 = c;
 579 |         *idx2 = b;
 580 |         return; /* zero length strings require no mixing */
 581 |     }
 582 |   }
 583 |   /* Only keys with at least one byte reach this final scrambling pass */
 584 |   final(a, b, c);
 585 |   *idx1 = c;
 586 |   *idx2 = b;
 587 | }
588 | 
589 | void HashUtil::BobHash(const std::string &s, uint32_t *idx1, uint32_t *idx2) {
590 |   return BobHash(s.data(), s.length(), idx1, idx2);
591 | }
592 | 
593 | //-----------------------------------------------------------------------------
594 | // MurmurHash2, by Austin Appleby
595 | // Note - This code makes a few assumptions about how your machine behaves -
596 | // 1. We can read a 4-byte value from any address without crashing
597 | // 2. sizeof(int) == 4
598 | // And it has a few limitations -
599 | // 1. It will not work incrementally.
600 | // 2. It will not produce the same results on little-endian and big-endian
601 | //    machines.
602 | // All code is released to the public domain. For business purposes,
603 | // Murmurhash is under the MIT license.
604 | 
605 | uint32_t HashUtil::MurmurHash(const void *buf, size_t len, uint32_t seed) {
606 |   // 'm' and 'r' are mixing constants generated offline.
607 |   // They're not really 'magic', they just happen to work well.
608 | 
609 |   const unsigned int m = 0x5bd1e995;
610 |   const int r = 24;
611 | 
612 |   // Initialize the hash to a 'random' value
613 |   uint32_t h = seed ^ len;
614 | 
615 |   // Mix 4 bytes at a time into the hash
616 |   const unsigned char *data = (const unsigned char *)buf;
617 | 
618 |   while (len >= 4) {
619 |     unsigned int k = *(unsigned int *)data;
620 | 
621 |     k *= m;
622 |     k ^= k >> r;
623 |     k *= m;
624 | 
625 |     h *= m;
626 |     h ^= k;
627 | 
628 |     data += 4;
629 |     len -= 4;
630 |   }
631 | 
632 |   // Handle the last few bytes of the input array
633 |   switch (len) {
634 |     case 3:
635 |       h ^= data[2] << 16;
636 |     case 2:
637 |       h ^= data[1] << 8;
638 |     case 1:
639 |       h ^= data[0];
640 |       h *= m;
641 |   };
642 | 
643 |   // Do a few final mixes of the hash to ensure the last few
644 |   // bytes are well-incorporated.
645 |   h ^= h >> 13;
646 |   h *= m;
647 |   h ^= h >> 15;
648 |   return h;
649 | }
650 | 
651 | uint32_t HashUtil::MurmurHash(const std::string &s, uint32_t seed) {
652 |   return MurmurHash(s.data(), s.length(), seed);
653 | }
654 | 
655 | // SuperFastHash aka Hsieh Hash, License: GPL 2.0
656 | uint32_t HashUtil::SuperFastHash(const void *buf, size_t len) {
657 |   const char *data = (const char *)buf;
658 |   uint32_t hash = len, tmp;
659 |   int rem;
660 | 
661 |   if (len == 0 || data == NULL) return 0;
662 | 
663 |   rem = len & 3;
664 |   len >>= 2;
665 | 
666 |   /* Main loop */
667 |   for (; len > 0; len--) {
668 |     hash += get16bits(data);
669 |     tmp = (get16bits(data + 2) << 11) ^ hash;
670 |     hash = (hash << 16) ^ tmp;
671 |     data += 2 * sizeof(uint16_t);
672 |     hash += hash >> 11;
673 |   }
674 | 
675 |   /* Handle end cases */
676 |   switch (rem) {
677 |     case 3:
678 |       hash += get16bits(data);
679 |       hash ^= hash << 16;
680 |       hash ^= data[sizeof(uint16_t)] << 18;
681 |       hash += hash >> 11;
682 |       break;
683 |     case 2:
684 |       hash += get16bits(data);
685 |       hash ^= hash << 11;
686 |       hash += hash >> 17;
687 |       break;
688 |     case 1:
689 |       hash += *data;
690 |       hash ^= hash << 10;
691 |       hash += hash >> 1;
692 |   }
693 | 
694 |   /* Force "avalanching" of final 127 bits */
695 |   hash ^= hash << 3;
696 |   hash += hash >> 5;
697 |   hash ^= hash << 4;
698 |   hash += hash >> 17;
699 |   hash ^= hash << 25;
700 |   hash += hash >> 6;
701 | 
702 |   return hash;
703 | }
704 | 
705 | uint32_t HashUtil::SuperFastHash(const std::string &s) {
706 |   return SuperFastHash(s.data(), s.length());
707 | }
708 | 
709 | uint32_t HashUtil::NullHash(const void *buf, size_t length,
710 |                             uint32_t shiftbytes) {
711 |   // Ensure that enough bits exist in buffer
712 |   if (length - shiftbytes < sizeof(uint32_t)) {
713 |     return 0;
714 |   }
715 |   char *data = (char *)buf;
716 |   return ((data[(length - shiftbytes - 4)] << 24) +
717 |           (data[(length - shiftbytes - 3)] << 16) +
718 |           (data[(length - shiftbytes - 2)] << 8) +
719 |           (data[(length - shiftbytes - 1)]));
720 | }
721 | 
722 | /*
723 |  * Compatibility layer for OpenSSL < 1.1.0.
724 |  * Implemented as proposed by https://wiki.openssl.org/index.php/OpenSSL_1.1.0_Changes
725 |  */
726 | // #if OPENSSL_VERSION_NUMBER < 0x10100000L
727 | // #include <string.h>
728 | // static void *OPENSSL_zalloc(size_t num)
729 | // {
730 | //   void *ret = OPENSSL_malloc(num);
731 | // 
732 | //   if (ret != NULL)
733 | //       memset(ret, 0, num);
734 | //   return ret;
735 | // }
736 | // 
737 | // EVP_MD_CTX *EVP_MD_CTX_new(void)
738 | // {
739 | //    return (EVP_MD_CTX *)OPENSSL_zalloc(sizeof(EVP_MD_CTX));
740 | // }
741 | // 
742 | // void EVP_MD_CTX_free(EVP_MD_CTX *ctx)
743 | // {
744 | //    EVP_MD_CTX_cleanup(ctx);
745 | //    OPENSSL_free(ctx);
746 | // }
747 | // #endif
748 | // 
749 | // std::string HashUtil::MD5Hash(const char *inbuf, size_t in_length) {
750 | //   EVP_MD_CTX *mdctx;
751 | //   unsigned char md_value[EVP_MAX_MD_SIZE];
752 | //   unsigned int md_len;
753 | // 
754 | //   mdctx = EVP_MD_CTX_new();
755 | //   EVP_DigestInit(mdctx, EVP_md5());
756 | //   EVP_DigestUpdate(mdctx, (const void *)inbuf, in_length);
757 | //   EVP_DigestFinal_ex(mdctx, md_value, &md_len);
758 | //   EVP_MD_CTX_free(mdctx);
759 | // 
760 | //   return std::string((char *)md_value, (size_t)md_len);
761 | // }
762 | 
763 | // std::string HashUtil::SHA1Hash(const char *inbuf, size_t in_length) {
764 | //   EVP_MD_CTX *mdctx;
765 | //   unsigned char md_value[EVP_MAX_MD_SIZE];
766 | //   unsigned int md_len;
767 | // 
768 | //   mdctx = EVP_MD_CTX_new();
769 | //   EVP_DigestInit(mdctx, EVP_sha1());
770 | //   EVP_DigestUpdate(mdctx, (const void *)inbuf, in_length);
771 | //   EVP_DigestFinal_ex(mdctx, md_value, &md_len);
772 | //   EVP_MD_CTX_free(mdctx);
773 | // 
774 | //   return std::string((char *)md_value, (size_t)md_len);
775 | // }
776 | }  // namespace cuckoofilter
777 | 


--------------------------------------------------------------------------------
/src/hashutil.h:
--------------------------------------------------------------------------------
 1 | #ifndef CUCKOO_FILTER_HASHUTIL_H_
 2 | #define CUCKOO_FILTER_HASHUTIL_H_
 3 | 
 4 | #include <stdint.h>
 5 | #include <stdlib.h>
 6 | #include <sys/types.h>
 7 | #include <limits.h>
 8 | 
 9 | #include <string>
10 | 
11 | // #include <openssl/evp.h>
12 | #include <random>
13 | 
14 | namespace cuckoofilter {
15 | 
16 | class HashUtil {
17 |  public:
18 |   // Bob Jenkins hash producing one 32-bit value, for a buffer or a string.
19 |   static uint32_t BobHash(const void *buf, size_t length, uint32_t seed = 0);
20 |   static uint32_t BobHash(const std::string &s, uint32_t seed = 0);
21 | 
22 |   // Bob Jenkins hash producing two 32-bit values per call, which suits
23 |   // cuckoo hashing, power of two choices, etc.  *idx1 and *idx2 must hold
24 |   // the seeds on entry and receive the hashes; use idx1 before idx2.
25 |   static void BobHash(const void *buf, size_t length, uint32_t *idx1,
26 |                       uint32_t *idx2);
27 |   static void BobHash(const std::string &s, uint32_t *idx1, uint32_t *idx2);
28 | 
29 |   // MurmurHash2, optionally seeded.
30 |   static uint32_t MurmurHash(const void *buf, size_t length, uint32_t seed = 0);
31 |   static uint32_t MurmurHash(const std::string &s, uint32_t seed = 0);
32 | 
33 |   // SuperFastHash; takes no seed.
34 |   static uint32_t SuperFastHash(const void *buf, size_t len);
35 |   static uint32_t SuperFastHash(const std::string &s);
36 | 
37 |   // Null hash (shift and mask): 4 key bytes taken `shiftbytes` from the end.
38 |   static uint32_t NullHash(const void *buf, size_t length, uint32_t shiftbytes);
39 | 
40 |   // Disabled: wrappers for MD5 and SHA1 hashing using EVP
41 |   // static std::string MD5Hash(const char *inbuf, size_t in_length);
42 |   // static std::string SHA1Hash(const char *inbuf, size_t in_length);
43 | 
44 |  private:
45 |   HashUtil();  // declared only: every member is static, no instances needed
46 | };
47 | 
48 | // See Martin Dietzfelbinger, "Universal hashing and k-wise independent random
49 | // variables via integer arithmetic without primes".
50 | //
51 | // Commented out because it does not build on windows
52 | // we are not using it with Node.js anyway
53 | //
54 | // class TwoIndependentMultiplyShift {
55 | //   unsigned __int128 multiply_, add_;
56 | //
57 | //  public:
58 | //   TwoIndependentMultiplyShift() {
59 | //     ::std::random_device random;
60 | //     for (auto v : {&multiply_, &add_}) {
61 | //       *v = random();
62 | //       for (int i = 1; i <= 4; ++i) {
63 | //         *v = *v << 32;
64 | //         *v |= random();
65 | //       }
66 | //     }
67 | //   }
68 | //
69 | //   uint64_t operator()(uint64_t key) const {
70 | //     return (add_ + multiply_ * static_cast<decltype(multiply_)>(key)) >> 64;
71 | //   }
72 | // };
73 | 
74 | // See Patrascu and Thorup's "The Power of Simple Tabulation Hashing"
75 | class SimpleTabulation {
76 |   // One table of (1 << CHAR_BIT) random 64-bit words per byte of the key.
77 |   uint64_t tables_[sizeof(uint64_t)][1 << CHAR_BIT];
78 |  public:
79 |   // Fills every entry from std::random_device: one draw OR-ed with a second
80 |   // draw shifted into the upper 32 bits.
81 |   SimpleTabulation() {
82 |     ::std::random_device random;
83 |     for (auto &table : tables_) {
84 |       for (uint64_t &entry : table) {
85 |         entry = random() | ((static_cast<uint64_t>(random())) << 32);
86 |       }
87 |     }
88 |   }
89 |   // Hash = XOR of the entries selected by each byte of `key`, in memory order.
90 |   uint64_t operator()(uint64_t key) const {
91 |     const uint8_t *bytes = reinterpret_cast<const uint8_t *>(&key);
92 |     uint64_t result = 0;
93 |     for (unsigned i = 0; i < sizeof(key); ++i) result ^= tables_[i][bytes[i]];
94 |     return result;
95 |   }
96 | };
96 | }
97 | 
98 | #endif  // CUCKOO_FILTER_HASHUTIL_H_
99 | 


--------------------------------------------------------------------------------
/src/packedtable.h:
--------------------------------------------------------------------------------
  1 | #ifndef CUCKOO_FILTER_PACKED_TABLE_H_
  2 | #define CUCKOO_FILTER_PACKED_TABLE_H_
  3 | 
  4 | #include <sstream>
  5 | #include <utility>
  6 | 
  7 | #include "debug.h"
  8 | #include "permencoding.h"
  9 | #include "printutil.h"
 10 | 
 11 | namespace cuckoofilter {
 12 | 
 13 | // Using Permutation encoding to save 1 bit per tag
 14 | template <size_t bits_per_tag>
 15 | class PackedTable {
 16 |   static const size_t kDirBitsPerTag = bits_per_tag - 4;
 17 |   static const size_t kBitsPerBucket = (3 + kDirBitsPerTag) * 4;
 18 |   static const size_t kBytesPerBucket = (kBitsPerBucket + 7) >> 3;
 19 |   static const uint32_t kDirBitsMask = ((1ULL << kDirBitsPerTag) - 1) << 4;
 20 | 
 21 |   // using a pointer adds one more indirection
 22 |   size_t len_;
 23 |   size_t num_buckets_;
 24 |   char *buckets_;
 25 |   PermEncoding perm_;
 26 | 
 27 |  public:
 28 |   explicit PackedTable(size_t num) : num_buckets_(num) {
 29 |     // NOTE(binfan): use 7 extra bytes to avoid overrun as we
 30 |     // always read a uint64
 31 |     len_ = kBytesPerBucket * num_buckets_ + 7;
 32 |     buckets_ = new char[len_];
 33 |     memset(buckets_, 0, len_); 
 34 |   }
 35 | 
 36 |   ~PackedTable() { 
 37 |     delete[] buckets_; 
 38 |   }
 39 | 
 40 |   size_t NumBuckets() const {
 41 |     return num_buckets_;
 42 |   }
 43 | 
 44 |   size_t SizeInTags() const { 
 45 |     return 4 * num_buckets_; 
 46 |   }
 47 | 
 48 |   size_t SizeInBytes() const { 
 49 |     return len_; 
 50 |   }
 51 | 
 52 |   std::string Info() const {
 53 |     std::stringstream ss;
 54 |     ss << "PackedHashtable with tag size: " << bits_per_tag << " bits";
 55 |     ss << "\t4 packed bits(3 bits after compression) and " << kDirBitsPerTag
 56 |        << " direct bits\n";
 57 |     ss << "\t\tAssociativity: 4\n";
 58 |     ss << "\t\tTotal # of rows: " << num_buckets_ << "\n";
 59 |     ss << "\t\ttotal # slots: " << SizeInTags() << "\n";
 60 |     return ss.str();
 61 |   }
 62 | 
 63 |   void PrintBucket(const size_t i) const {
 64 |     DPRINTF(DEBUG_TABLE, "PackedTable::PrintBucket %zu \n", i);
 65 |     const char *p = buckets_ + kBitsPerBucket * i / 8;
 66 |     std::cout << "\tbucketbits  ="
 67 |               << PrintUtil::bytes_to_hex((char *)p, kBytesPerBucket + 1)
 68 |               << std::endl;
 69 | 
 70 |     uint32_t tags[4];
 71 | 
 72 |     ReadBucket(i, tags);
 73 |     PrintTags(tags);
 74 |     DPRINTF(DEBUG_TABLE, "PackedTable::PrintBucket done \n");
 75 |   }
 76 | 
 77 |   void PrintTags(uint32_t tags[4]) const {
 78 |     DPRINTF(DEBUG_TABLE, "PackedTable::PrintTags \n");
 79 |     uint8_t lowbits[4];
 80 |     uint32_t dirbits[4];
 81 |     for (size_t j = 0; j < 4; j++) {
 82 |       lowbits[j] = tags[j] & 0x0f;
 83 |       dirbits[j] = (tags[j] & kDirBitsMask) >> 4;
 84 |     }
 85 |     uint16_t codeword = perm_.encode(lowbits);
 86 |     std::cout << "\tcodeword  ="
 87 |               << PrintUtil::bytes_to_hex((char *)&codeword, 2) << std::endl;
 88 |     for (size_t j = 0; j < 4; j++) {
 89 |       std::cout << "\ttag[" << j
 90 |                 << "]: " << PrintUtil::bytes_to_hex((char *)&tags[j], 4);
 91 |       std::cout << " lowbits="
 92 |                 << PrintUtil::bytes_to_hex((char *)&lowbits[j], 1)
 93 |                 << " dirbits="
 94 |                 << PrintUtil::bytes_to_hex((char *)&dirbits[j],
 95 |                                            kDirBitsPerTag / 8 + 1)
 96 |                 << std::endl;
 97 |     }
 98 |     DPRINTF(DEBUG_TABLE, "PackedTable::PrintTags done\n");
 99 |   }
100 | 
101 |   inline void SortPair(uint32_t &a, uint32_t &b) {
102 |     if ((a & 0x0f) > (b & 0x0f)) {
103 |       std::swap(a, b);
104 |     }
105 |   }
106 | 
107 |   inline void SortTags(uint32_t *tags) {
108 |     SortPair(tags[0], tags[2]);
109 |     SortPair(tags[1], tags[3]);
110 |     SortPair(tags[0], tags[1]);
111 |     SortPair(tags[2], tags[3]);
112 |     SortPair(tags[1], tags[2]);
113 |   }
114 | 
  /* Reads bucket i and decodes it into four tags, stored in `tags`.
   *
   * Bucket layout, starting at the lowest bit: a 12-bit codeword that encodes
   * the low 4 bits of all four tags, followed by the direct bits of tag 0,
   * tag 1, tag 2 and tag 3.
   *
   * Each branch below handles one supported tag width. The shifts move each
   * tag's direct bits down so that kDirBitsMask (defined outside this view)
   * can select them; the low nibble is filled in afterwards from the decoded
   * codeword.
   *
   * NOTE(review): the loads go through casted pointers at arbitrary byte
   * offsets and may read past the end of the bucket itself; presumably the
   * buffer is padded and the target is little-endian -- confirm.
   * NOTE(review): if bits_per_tag matches none of the branches, `codeword`
   * and `tags` are used without having been set.
   */
  inline void ReadBucket(const size_t i, uint32_t tags[4]) const {
    DPRINTF(DEBUG_TABLE, "PackedTable::ReadBucket %zu \n", i);
    DPRINTF(DEBUG_TABLE, "kdirbitsMask=%x\n", kDirBitsMask);

    const char *p;  // set per branch to the byte where bucket i starts
    uint16_t codeword;
    uint8_t lowbits[4];

    if (bits_per_tag == 5) {
      // 1 dirbits per tag, 16 bits per bucket
      p = buckets_ + (i * 2);
      uint16_t bucketbits = *((uint16_t *)p);
      codeword = bucketbits & 0x0fff;
      tags[0] = ((bucketbits >> 8) & kDirBitsMask);
      tags[1] = ((bucketbits >> 9) & kDirBitsMask);
      tags[2] = ((bucketbits >> 10) & kDirBitsMask);
      tags[3] = ((bucketbits >> 11) & kDirBitsMask);
    } else if (bits_per_tag == 6) {
      // 2 dirbits per tag, 20 bits per bucket
      // Odd buckets start half a byte into p, hence the extra shift of
      // ((i & 1) << 2) == 4 bits for odd i.
      p = buckets_ + ((20 * i) >> 3);
      uint32_t bucketbits = *((uint32_t *)p);
      codeword = (*((uint16_t *)p) >> ((i & 1) << 2)) & 0x0fff;
      tags[0] = (bucketbits >> (8 + ((i & 1) << 2))) & kDirBitsMask;
      tags[1] = (bucketbits >> (10 + ((i & 1) << 2))) & kDirBitsMask;
      tags[2] = (bucketbits >> (12 + ((i & 1) << 2))) & kDirBitsMask;
      tags[3] = (bucketbits >> (14 + ((i & 1) << 2))) & kDirBitsMask;
    } else if (bits_per_tag == 7) {
      // 3 dirbits per tag, 24 bits per bucket
      // (i << 1) + i == 3 * i bytes.
      p = buckets_ + (i << 1) + i;
      uint32_t bucketbits = *((uint32_t *)p);
      codeword = *((uint16_t *)p) & 0x0fff;
      tags[0] = (bucketbits >> 8) & kDirBitsMask;
      tags[1] = (bucketbits >> 11) & kDirBitsMask;
      tags[2] = (bucketbits >> 14) & kDirBitsMask;
      tags[3] = (bucketbits >> 17) & kDirBitsMask;
    } else if (bits_per_tag == 8) {
      // 4 dirbits per tag, 28 bits per bucket
      // Same half-byte offset for odd buckets as in the 20-bit case.
      p = buckets_ + ((28 * i) >> 3);
      uint32_t bucketbits = *((uint32_t *)p);
      codeword = (*((uint16_t *)p) >> ((i & 1) << 2)) & 0x0fff;
      tags[0] = (bucketbits >> (8 + ((i & 1) << 2))) & kDirBitsMask;
      tags[1] = (bucketbits >> (12 + ((i & 1) << 2))) & kDirBitsMask;
      tags[2] = (bucketbits >> (16 + ((i & 1) << 2))) & kDirBitsMask;
      tags[3] = (bucketbits >> (20 + ((i & 1) << 2))) & kDirBitsMask;
    } else if (bits_per_tag == 9) {
      // 5 dirbits per tag, 32 bits per bucket
      p = buckets_ + (i * 4);
      uint32_t bucketbits = *((uint32_t *)p);
      codeword = *((uint16_t *)p) & 0x0fff;
      tags[0] = (bucketbits >> 8) & kDirBitsMask;
      tags[1] = (bucketbits >> 13) & kDirBitsMask;
      tags[2] = (bucketbits >> 18) & kDirBitsMask;
      tags[3] = (bucketbits >> 23) & kDirBitsMask;
    } else if (bits_per_tag == 13) {
      // 9 dirbits per tag,  48 bits per bucket
      // A 64-bit load is used although only 48 bits belong to this bucket.
      p = buckets_ + (i * 6);
      uint64_t bucketbits = *((uint64_t *)p);
      codeword = *((uint16_t *)p) & 0x0fff;
      tags[0] = (bucketbits >> 8) & kDirBitsMask;
      tags[1] = (bucketbits >> 17) & kDirBitsMask;
      tags[2] = (bucketbits >> 26) & kDirBitsMask;
      tags[3] = (bucketbits >> 35) & kDirBitsMask;
    } else if (bits_per_tag == 17) {
      // 13 dirbits per tag, 64 bits per bucket
      p = buckets_ + (i << 3);
      uint64_t bucketbits = *((uint64_t *)p);
      codeword = *((uint16_t *)p) & 0x0fff;
      tags[0] = (bucketbits >> 8) & kDirBitsMask;
      tags[1] = (bucketbits >> 21) & kDirBitsMask;
      tags[2] = (bucketbits >> 34) & kDirBitsMask;
      tags[3] = (bucketbits >> 47) & kDirBitsMask;
    }

    /* codeword is the lowest 12 bits in the bucket */
    // The decode table yields a 16-bit value holding the four low nibbles;
    // nibble 1 of that value belongs to tag 2 and nibble 2 to tag 1, so the
    // 0, 2, 1, 3 assignment order below is intentional.
    uint16_t v = perm_.dec_table[codeword];
    lowbits[0] = (v & 0x000f);
    lowbits[2] = ((v >> 4) & 0x000f);
    lowbits[1] = ((v >> 8) & 0x000f);
    lowbits[3] = ((v >> 12) & 0x000f);

    // Combine direct bits (already in place) with the decoded low nibbles.
    tags[0] |= lowbits[0];
    tags[1] |= lowbits[1];
    tags[2] |= lowbits[2];
    tags[3] |= lowbits[3];

    if (debug_level & DEBUG_TABLE) {
      PrintTags(tags);
    }
    DPRINTF(DEBUG_TABLE, "PackedTable::ReadBucket done \n");
  }
208 | 
  /* Encodes the four tags and stores them into bucket i.
   *
   * Each tag is split into its 4 low bits and its remaining high (direct)
   * bits:   L L L L H H H H ...
   * The four low nibbles are compressed into one codeword via perm_.encode();
   * the codeword is written to the bottom of the bucket and each tag's high
   * bits are shifted into the positions that follow it.
   *
   * When `sort` is true (the default) `tags` is first reordered in place by
   * SortTags(), so the caller's array is modified.
   *
   * For bucket widths that are not a whole number of the store width, the
   * existing word is masked first so that bits outside this bucket (they
   * belong to neighbouring buckets) are preserved.
   *
   * NOTE(review): the stores go through casted pointers at arbitrary byte
   * offsets; presumably this relies on a little-endian target that tolerates
   * unaligned access -- confirm.
   */
  inline void WriteBucket(const size_t i, uint32_t tags[4], bool sort = true) {
    DPRINTF(DEBUG_TABLE, "PackedTable::WriteBucket %zu \n", i);
    /* first sort the tags in increasing order if arg sort = true */
    if (sort) {
      DPRINTF(DEBUG_TABLE, "Sort tags\n");
      SortTags(tags);
    }
    if (debug_level & DEBUG_TABLE) {
      PrintTags(tags);
    }

    /* split every tag into its low nibble and its direct (high) bits */

    uint8_t lowbits[4];
    uint32_t highbits[4];

    lowbits[0] = tags[0] & 0x0f;
    lowbits[1] = tags[1] & 0x0f;
    lowbits[2] = tags[2] & 0x0f;
    lowbits[3] = tags[3] & 0x0f;

    highbits[0] = tags[0] & 0xfffffff0;
    highbits[1] = tags[1] & 0xfffffff0;
    highbits[2] = tags[2] & 0xfffffff0;
    highbits[3] = tags[3] & 0xfffffff0;

    // note that :  tags[j] = lowbits[j] | highbits[j]

    uint16_t codeword = perm_.encode(lowbits);
    DPRINTF(DEBUG_TABLE, "codeword=%s\n",
            PrintUtil::bytes_to_hex((char *)&codeword, 2).c_str());

    /* write out the bucketbits to its place */
    // p is the byte in which bucket i starts; for the 20- and 28-bit layouts
    // odd buckets start half a byte into it.
    const char *p = buckets_ + ((kBitsPerBucket * i) >> 3);
    DPRINTF(DEBUG_TABLE, "original bucketbits=%s\n",
            PrintUtil::bytes_to_hex((char *)p, 8).c_str());

    if (kBitsPerBucket == 16) {
      // 1 dirbits per tag
      *((uint16_t *)p) = codeword | (highbits[0] << 8) | (highbits[1] << 9) |
                         (highbits[2] << 10) | (highbits[3] << 11);
    } else if (kBitsPerBucket == 20) {
      // 2 dirbits per tag
      if ((i & 0x0001) == 0) {
        // even bucket: occupies bits 0..19 of the word, keep bits 20..31
        *((uint32_t *)p) &= 0xfff00000;
        *((uint32_t *)p) |= codeword | (highbits[0] << 8) |
                            (highbits[1] << 10) | (highbits[2] << 12) |
                            (highbits[3] << 14);
      } else {
        // odd bucket: occupies bits 4..23, keep bits 0..3 and 24..31
        *((uint32_t *)p) &= 0xff00000f;
        *((uint32_t *)p) |= (codeword << 4) | (highbits[0] << 12) |
                            (highbits[1] << 14) | (highbits[2] << 16) |
                            (highbits[3] << 18);
      }
    } else if (kBitsPerBucket == 24) {
      // 3 dirbits per tag
      // keep the top byte, which lies outside this 24-bit bucket
      *((uint32_t *)p) &= 0xff000000;
      *((uint32_t *)p) |= codeword | (highbits[0] << 8) | (highbits[1] << 11) |
                          (highbits[2] << 14) | (highbits[3] << 17);
    } else if (kBitsPerBucket == 28) {
      // 4 dirbits per tag
      if ((i & 0x0001) == 0) {
        // even bucket: occupies bits 0..27, keep the top nibble
        *((uint32_t *)p) &= 0xf0000000;
        *((uint32_t *)p) |= codeword | (highbits[0] << 8) |
                            (highbits[1] << 12) | (highbits[2] << 16) |
                            (highbits[3] << 20);
      } else {
        // odd bucket: occupies bits 4..31, keep the bottom nibble
        *((uint32_t *)p) &= 0x0000000f;
        *((uint32_t *)p) |= (codeword << 4) | (highbits[0] << 12) |
                            (highbits[1] << 16) | (highbits[2] << 20) |
                            (highbits[3] << 24);
      }
    } else if (kBitsPerBucket == 32) {
      // 5 dirbits per tag
      // the bucket fills the whole word, so it is simply overwritten
      *((uint32_t *)p) = codeword | (highbits[0] << 8) | (highbits[1] << 13) |
                         (highbits[2] << 18) | (highbits[3] << 23);
      DPRINTF(DEBUG_TABLE, " new bucketbits=%s\n",
              PrintUtil::bytes_to_hex((char *)p, 4).c_str());
    } else if (kBitsPerBucket == 48) {
      // 9 dirbits per tag
      // keep the top 16 bits of the 64-bit word; they lie outside the bucket
      *((uint64_t *)p) &= 0xffff000000000000ULL;
      *((uint64_t *)p) |= codeword | ((uint64_t)highbits[0] << 8) |
                          ((uint64_t)highbits[1] << 17) |
                          ((uint64_t)highbits[2] << 26) |
                          ((uint64_t)highbits[3] << 35);
      DPRINTF(DEBUG_TABLE, " new bucketbits=%s\n",
              PrintUtil::bytes_to_hex((char *)p, 4).c_str());

    } else if (kBitsPerBucket == 64) {
      // 13 dirbits per tag
      // the bucket fills the whole 64-bit word, so it is simply overwritten
      *((uint64_t *)p) = codeword | ((uint64_t)highbits[0] << 8) |
                         ((uint64_t)highbits[1] << 21) |
                         ((uint64_t)highbits[2] << 34) |
                         ((uint64_t)highbits[3] << 47);
    }
    DPRINTF(DEBUG_TABLE, "PackedTable::WriteBucket done\n");
  }
309 | 
310 |   bool FindTagInBuckets(const size_t i1, const size_t i2,
311 |                         const uint32_t tag) const {
312 |     //            DPRINTF(DEBUG_TABLE, "PackedTable::FindTagInBucket %zu\n", i);
313 |     uint32_t tags1[4];
314 |     uint32_t tags2[4];
315 | 
316 |     // disable for now
317 |     // _mm_prefetch( buckets_ + (i1 * kBitsPerBucket) / 8,  _MM_HINT_NTA);
318 |     // _mm_prefetch( buckets_ + (i2 * kBitsPerBucket) / 8,  _MM_HINT_NTA);
319 | 
320 |     // ReadBucket(i1, tags1);
321 |     // ReadBucket(i2, tags2);
322 | 
323 |     uint16_t v;
324 |     uint64_t bucketbits1 = *((uint64_t *)(buckets_ + kBitsPerBucket * i1 / 8));
325 |     uint64_t bucketbits2 = *((uint64_t *)(buckets_ + kBitsPerBucket * i2 / 8));
326 | 
327 |     tags1[0] = (bucketbits1 >> 8) & kDirBitsMask;
328 |     tags1[1] = (bucketbits1 >> 17) & kDirBitsMask;
329 |     tags1[2] = (bucketbits1 >> 26) & kDirBitsMask;
330 |     tags1[3] = (bucketbits1 >> 35) & kDirBitsMask;
331 |     v = perm_.dec_table[(bucketbits1)&0x0fff];
332 |     // the order 0 2 1 3 is not a bug
333 |     tags1[0] |= (v & 0x000f);
334 |     tags1[2] |= ((v >> 4) & 0x000f);
335 |     tags1[1] |= ((v >> 8) & 0x000f);
336 |     tags1[3] |= ((v >> 12) & 0x000f);
337 | 
338 |     tags2[0] = (bucketbits2 >> 8) & kDirBitsMask;
339 |     tags2[1] = (bucketbits2 >> 17) & kDirBitsMask;
340 |     tags2[2] = (bucketbits2 >> 26) & kDirBitsMask;
341 |     tags2[3] = (bucketbits2 >> 35) & kDirBitsMask;
342 |     v = perm_.dec_table[(bucketbits2)&0x0fff];
343 |     tags2[0] |= (v & 0x000f);
344 |     tags2[2] |= ((v >> 4) & 0x000f);
345 |     tags2[1] |= ((v >> 8) & 0x000f);
346 |     tags2[3] |= ((v >> 12) & 0x000f);
347 | 
348 |     return (tags1[0] == tag) || (tags1[1] == tag) || (tags1[2] == tag) ||
349 |            (tags1[3] == tag) || (tags2[0] == tag) || (tags2[1] == tag) ||
350 |            (tags2[2] == tag) || (tags2[3] == tag);
351 |   }
352 | 
353 |   bool FindTagInBucket(const size_t i, const uint32_t tag) const {
354 |     DPRINTF(DEBUG_TABLE, "PackedTable::FindTagInBucket %zu\n", i);
355 |     uint32_t tags[4];
356 |     ReadBucket(i, tags);
357 |     if (debug_level & DEBUG_TABLE) {
358 |       PrintTags(tags);
359 |     }
360 | 
361 |     bool ret = ((tags[0] == tag) || (tags[1] == tag) || (tags[2] == tag) ||
362 |                 (tags[3] == tag));
363 |     DPRINTF(DEBUG_TABLE, "PackedTable::FindTagInBucket %d \n", ret);
364 |     return ret;
365 |   }
366 | 
367 |   bool DeleteTagFromBucket(const size_t i, const uint32_t tag) {
368 |     uint32_t tags[4];
369 |     ReadBucket(i, tags);
370 |     if (debug_level & DEBUG_TABLE) {
371 |       PrintTags(tags);
372 |     }
373 |     for (size_t j = 0; j < 4; j++) {
374 |       if (tags[j] == tag) {
375 |         tags[j] = 0;
376 |         WriteBucket(i, tags);
377 |         return true;
378 |       }
379 |     }
380 |     return false;
381 |   }  // DeleteTagFromBucket
382 | 
383 |   bool InsertTagToBucket(const size_t i, const uint32_t tag, const bool kickout,
384 |                          uint32_t &oldtag) {
385 |     DPRINTF(DEBUG_TABLE, "PackedTable::InsertTagToBucket %zu \n", i);
386 | 
387 |     uint32_t tags[4];
388 |     DPRINTF(DEBUG_TABLE,
389 |             "PackedTable::InsertTagToBucket read bucket to tags\n");
390 |     ReadBucket(i, tags);
391 |     if (debug_level & DEBUG_TABLE) {
392 |       PrintTags(tags);
393 |       PrintBucket(i);
394 |     }
395 |     for (size_t j = 0; j < 4; j++) {
396 |       if (tags[j] == 0) {
397 |         DPRINTF(DEBUG_TABLE,
398 |                 "PackedTable::InsertTagToBucket slot %zu is empty\n", j);
399 | 
400 |         tags[j] = tag;
401 |         WriteBucket(i, tags);
402 |         if (debug_level & DEBUG_TABLE) {
403 |           PrintBucket(i);
404 |           ReadBucket(i, tags);
405 |         }
406 |         DPRINTF(DEBUG_TABLE, "PackedTable::InsertTagToBucket Ok\n");
407 |         return true;
408 |       }
409 |     }
410 |     if (kickout) {
411 |       size_t r = rand() & 3;
412 |       DPRINTF(
413 |           DEBUG_TABLE,
414 |           "PackedTable::InsertTagToBucket, let's kick out a random slot %zu \n",
415 |           r);
416 |       // PrintBucket(i);
417 | 
418 |       oldtag = tags[r];
419 |       tags[r] = tag;
420 |       WriteBucket(i, tags);
421 |       if (debug_level & DEBUG_TABLE) {
422 |         PrintTags(tags);
423 |       }
424 |     }
425 |     DPRINTF(DEBUG_TABLE, "PackedTable::InsertTagToBucket, insert failed \n");
426 |     return false;
427 |   }
428 | 
429 |   // inline size_t NumTagsInBucket(const size_t i) {
430 |   //     size_t num = 0;
431 |   //     for (size_t j = 0; j < tags_per_bucket; j++ ){
432 |   //         if (ReadTag(i, j) != 0) {
433 |   //             num ++;
434 |   //         }
435 |   //     }
436 |   //     return num;
437 |   // } // NumTagsInBucket
438 | 
439 | };  // PackedTable
440 | }  // namespace cuckoofilter
441 | 
442 | #endif  // CUCKOO_FILTER_PACKED_TABLE_H_
443 | 


--------------------------------------------------------------------------------
/src/permencoding.h:
--------------------------------------------------------------------------------
 1 | #ifndef CUCKOO_FILTER_PERM_ENCODING_H_
 2 | #define CUCKOO_FILTER_PERM_ENCODING_H_
 3 | 
 4 | #include <stdint.h>
 5 | #include <stdio.h>
 6 | #include <stdlib.h>
 7 | #include <string.h>
 8 | 
 9 | #include <iostream>
10 | 
11 | #include "debug.h"
12 | 
13 | namespace cuckoofilter {
14 | 
15 | class PermEncoding {
16 |   /* unpack one 2-byte number to four 4-bit numbers */
17 |   // inline void unpack(const uint16_t in, const uint8_t out[4]) const {
18 |   //     (*(uint16_t *)out)      = in & 0x0f0f;
19 |   //     (*(uint16_t *)(out +2)) = (in >> 4) & 0x0f0f;
20 |   // }
21 | 
22 |   inline void unpack(uint16_t in, uint8_t out[4]) const {
23 |     out[0] = (in & 0x000f);
24 |     out[2] = ((in >> 4) & 0x000f);
25 |     out[1] = ((in >> 8) & 0x000f);
26 |     out[3] = ((in >> 12) & 0x000f);
27 |   }
28 | 
29 |   /* pack four 4-bit numbers to one 2-byte number */
30 |   inline uint16_t pack(const uint8_t in[4]) const {
31 |     uint16_t in1 = *((uint16_t *)(in)) & 0x0f0f;
32 |     uint16_t in2 = *((uint16_t *)(in + 2)) << 4;
33 |     return in1 | in2;
34 |   }
35 | 
36 |  public:
37 |   PermEncoding() {
38 |     uint8_t dst[4];
39 |     uint16_t idx = 0;
40 |     memset(dec_table, 0, sizeof(dec_table));
41 |     memset(enc_table, 0, sizeof(enc_table));
42 |     gen_tables(0, 0, dst, idx);
43 |   }
44 | 
45 |   ~PermEncoding() {}
46 | 
47 |   static const size_t N_ENTS = 3876;
48 | 
49 |   uint16_t dec_table[N_ENTS];
50 |   uint16_t enc_table[1 << 16];
51 | 
52 |   inline void decode(const uint16_t codeword, uint8_t lowbits[4]) const {
53 |     unpack(dec_table[codeword], lowbits);
54 |   }
55 | 
56 |   inline uint16_t encode(const uint8_t lowbits[4]) const {
57 |     if (DEBUG_ENCODE & debug_level) {
58 |       printf("Perm.encode\n");
59 |       for (int i = 0; i < 4; i++) {
60 |         printf("encode lowbits[%d]=%x\n", i, lowbits[i]);
61 |       }
62 |       printf("pack(lowbits) = %x\n", pack(lowbits));
63 |       printf("enc_table[%x]=%x\n", pack(lowbits), enc_table[pack(lowbits)]);
64 |     }
65 | 
66 |     return enc_table[pack(lowbits)];
67 |   }
68 | 
69 |   void gen_tables(int base, int k, uint8_t dst[4], uint16_t &idx) {
70 |     for (int i = base; i < 16; i++) {
71 |       /* for fast comparison in binary_search in little-endian machine */
72 |       dst[k] = i;
73 |       if (k + 1 < 4) {
74 |         gen_tables(i, k + 1, dst, idx);
75 |       } else {
76 |         dec_table[idx] = pack(dst);
77 |         enc_table[pack(dst)] = idx;
78 |         if (DEBUG_ENCODE & debug_level) {
79 |           printf("enc_table[%04x]=%04x\t%x %x %x %x\n", pack(dst), idx, dst[0],
80 |                  dst[1], dst[2], dst[3]);
81 |         }
82 |         idx++;
83 |       }
84 |     }
85 |   }
86 | };
87 | }  // namespace cuckoofilter
88 | #endif  // CUCKOO_FILTER_PERM_ENCODING_H_
89 | 


--------------------------------------------------------------------------------
/src/printutil.cc:
--------------------------------------------------------------------------------
 1 | #include "printutil.h"
 2 | 
 3 | #include <stdio.h>
 4 | 
 5 | #include <iostream>
 6 | 
 7 | namespace cuckoofilter {
 8 | 
 9 | std::string PrintUtil::bytes_to_hex(const char *data, size_t len) {
10 |   std::string hexstr = "";
11 |   static const char hexes[] = "0123456789ABCDEF ";
12 | 
13 |   for (size_t i = 0; i < len; i++) {
14 |     unsigned char c = data[i];
15 |     hexstr.push_back(hexes[c >> 4]);
16 |     hexstr.push_back(hexes[c & 0xf]);
17 |     hexstr.push_back(hexes[16]);
18 |   }
19 |   return hexstr;
20 | };
21 | 
22 | std::string PrintUtil::bytes_to_hex(const std::string &s) {
23 |   return bytes_to_hex((const char *)s.data(), s.size());
24 | };
25 | 
26 | }  // namespace cuckoofilter
27 | 


--------------------------------------------------------------------------------
/src/printutil.h:
--------------------------------------------------------------------------------
 1 | #ifndef CUCKOO_FILTER_PRINTUTIL_H_
 2 | #define CUCKOO_FILTER_PRINTUTIL_H_
 3 | 
 4 | #include <string>
 5 | 
 6 | namespace cuckoofilter {
 7 | class PrintUtil {
 8 |  public:
 9 |   static std::string bytes_to_hex(const char *data, size_t len) {
10 |     std::string hexstr = "";
11 |     static const char hexes[] = "0123456789ABCDEF ";
12 | 
13 |     for (size_t i = 0; i < len; i++) {
14 |       unsigned char c = data[i];
15 |       hexstr.push_back(hexes[c >> 4]);
16 |       hexstr.push_back(hexes[c & 0xf]);
17 |       hexstr.push_back(hexes[16]);
18 |     }
19 |     return hexstr;
20 |   }
21 | 
22 |   static std::string bytes_to_hex(const std::string &s) {
23 |     return bytes_to_hex((const char *)s.data(), s.size());
24 |   }
25 | 
26 |  private:
27 |   PrintUtil();
28 | };  // class PrintUtil
29 | 
30 | }  // namespace cuckoofilter
31 | 
32 | #endif  // CUCKOO_FILTER_PRINTUTIL_H_
33 | 


--------------------------------------------------------------------------------
/src/simd-block.h:
--------------------------------------------------------------------------------
  1 | // Copied from Apache Impala (incubating), usable under the terms in the Apache License,
  2 | // Version 2.0.
  3 | 
  4 | // This is a block Bloom filter (from Putze et al.'s "Cache-, Hash- and Space-Efficient
  5 | // Bloom Filters") with some twists:
  6 | //
  7 | // 1. Each block is a split Bloom filter - see Section 2.1 of Broder and Mitzenmacher's
  8 | // "Network Applications of Bloom Filters: A Survey".
  9 | //
// 2. The number of bits set per Add() is constant in order to take advantage of SIMD
 11 | // instructions.
 12 | 
 13 | #pragma once
 14 | 
 15 | #include <cstdint>
 16 | #include <cstdlib>
 17 | 
 18 | #include <algorithm>
 19 | #include <new>
 20 | 
 21 | #include <immintrin.h>
 22 | 
 23 | #include "hashutil.h"
 24 | 
 25 | using uint32_t = ::std::uint32_t;
 26 | using uint64_t = ::std::uint64_t;
 27 | 
 28 | template<typename HashFamily = ::cuckoofilter::TwoIndependentMultiplyShift>
 29 | class SimdBlockFilter {
 30 |  private:
 31 |   // The filter is divided up into Buckets:
 32 |   using Bucket = uint32_t[8];
 33 | 
 34 |   // log2(number of bytes in a bucket):
 35 |   static constexpr int LOG_BUCKET_BYTE_SIZE = 5;
 36 | 
 37 |   static_assert(
 38 |       (1 << LOG_BUCKET_BYTE_SIZE) == sizeof(Bucket) && sizeof(Bucket) == sizeof(__m256i),
 39 |       "Bucket sizing has gone awry.");
 40 | 
 41 |   // log_num_buckets_ is the log (base 2) of the number of buckets in the directory:
 42 |   const int log_num_buckets_;
 43 | 
 44 |   // directory_mask_ is (1 << log_num_buckets_) - 1. It is precomputed in the contructor
 45 |   // for efficiency reasons:
 46 |   const uint32_t directory_mask_;
 47 | 
 48 |   Bucket* directory_;
 49 | 
 50 |   HashFamily hasher_;
 51 | 
 52 |  public:
 53 |   // Consumes at most (1 << log_heap_space) bytes on the heap:
 54 |   explicit SimdBlockFilter(const int log_heap_space);
 55 |   SimdBlockFilter(SimdBlockFilter&& that)
 56 |     : log_num_buckets_(that.log_num_buckets_),
 57 |       directory_mask_(that.directory_mask_),
 58 |       directory_(that.directory_),
 59 |       hasher_(that.hasher_) {}
 60 |   ~SimdBlockFilter() noexcept;
 61 |   void Add(const uint64_t key) noexcept;
 62 |   bool Find(const uint64_t key) const noexcept;
 63 |   uint64_t SizeInBytes() const { return sizeof(Bucket) * (1ull << log_num_buckets_); }
 64 | 
 65 |  private:
 66 |   // A helper function for Insert()/Find(). Turns a 32-bit hash into a 256-bit Bucket
 67 |   // with 1 single 1-bit set in each 32-bit lane.
 68 |   static __m256i MakeMask(const uint32_t hash) noexcept;
 69 | 
 70 |   SimdBlockFilter(const SimdBlockFilter&) = delete;
 71 |   void operator=(const SimdBlockFilter&) = delete;
 72 | };
 73 | 
 74 | template<typename HashFamily>
 75 | SimdBlockFilter<HashFamily>::SimdBlockFilter(const int log_heap_space)
 76 |   :  // Since log_heap_space is in bytes, we need to convert it to the number of Buckets
 77 |      // we will use.
 78 |     log_num_buckets_(::std::max(1, log_heap_space - LOG_BUCKET_BYTE_SIZE)),
 79 |     // Don't use log_num_buckets_ if it will lead to undefined behavior by a shift that is
 80 |     // too large.
 81 |     directory_mask_((1ull << ::std::min(63, log_num_buckets_)) - 1),
 82 |     directory_(nullptr),
 83 |     hasher_() {
 84 |   if (!__builtin_cpu_supports("avx2")) {
 85 |     throw ::std::runtime_error("SimdBlockFilter does not work without AVX2 instructions");
 86 |   }
 87 |   const size_t alloc_size = 1ull << (log_num_buckets_ + LOG_BUCKET_BYTE_SIZE);
 88 |   const int malloc_failed =
 89 |       posix_memalign(reinterpret_cast<void**>(&directory_), 64, alloc_size);
 90 |   if (malloc_failed) throw ::std::bad_alloc();
 91 |   memset(directory_, 0, alloc_size);
 92 | }
 93 | 
 94 | template<typename HashFamily>
 95 | SimdBlockFilter<HashFamily>::~SimdBlockFilter() noexcept {
 96 |   free(directory_);
 97 |   directory_ = nullptr;
 98 | }
 99 | 
100 | // The SIMD reinterpret_casts technically violate C++'s strict aliasing rules. However, we
101 | // compile with -fno-strict-aliasing.
102 | template <typename HashFamily>
103 | [[gnu::always_inline]] inline __m256i
104 | SimdBlockFilter<HashFamily>::MakeMask(const uint32_t hash) noexcept {
105 |   const __m256i ones = _mm256_set1_epi32(1);
106 |   // Odd contants for hashing:
107 |   const __m256i rehash = _mm256_setr_epi32(0x47b6137bU, 0x44974d91U, 0x8824ad5bU,
108 |       0xa2b7289dU, 0x705495c7U, 0x2df1424bU, 0x9efc4947U, 0x5c6bfb31U);
109 |   // Load hash into a YMM register, repeated eight times
110 |   __m256i hash_data = _mm256_set1_epi32(hash);
111 |   // Multiply-shift hashing ala Dietzfelbinger et al.: multiply 'hash' by eight different
112 |   // odd constants, then keep the 5 most significant bits from each product.
113 |   hash_data = _mm256_mullo_epi32(rehash, hash_data);
114 |   hash_data = _mm256_srli_epi32(hash_data, 27);
115 |   // Use these 5 bits to shift a single bit to a location in each 32-bit lane
116 |   return _mm256_sllv_epi32(ones, hash_data);
117 | }
118 | 
119 | template <typename HashFamily>
120 | [[gnu::always_inline]] inline void
121 | SimdBlockFilter<HashFamily>::Add(const uint64_t key) noexcept {
122 |   const auto hash = hasher_(key);
123 |   const uint32_t bucket_idx = hash & directory_mask_;
124 |   const __m256i mask = MakeMask(hash >> log_num_buckets_);
125 |   __m256i* const bucket = &reinterpret_cast<__m256i*>(directory_)[bucket_idx];
126 |   _mm256_store_si256(bucket, _mm256_or_si256(*bucket, mask));
127 | }
128 | 
129 | template <typename HashFamily>
130 | [[gnu::always_inline]] inline bool
131 | SimdBlockFilter<HashFamily>::Find(const uint64_t key) const noexcept {
132 |   const auto hash = hasher_(key);
133 |   const uint32_t bucket_idx = hash & directory_mask_;
134 |   const __m256i mask = MakeMask(hash >> log_num_buckets_);
135 |   const __m256i bucket = reinterpret_cast<__m256i*>(directory_)[bucket_idx];
136 |   // We should return true if 'bucket' has a one wherever 'mask' does. _mm256_testc_si256
137 |   // takes the negation of its first argument and ands that with its second argument. In
138 |   // our case, the result is zero everywhere iff there is a one in 'bucket' wherever
139 |   // 'mask' is one. testc returns 1 if the result is 0 everywhere and returns 0 otherwise.
140 |   return _mm256_testc_si256(bucket, mask);
141 | }
142 | 


--------------------------------------------------------------------------------
/src/singletable.h:
--------------------------------------------------------------------------------
  1 | #ifndef CUCKOO_FILTER_SINGLE_TABLE_H_
  2 | #define CUCKOO_FILTER_SINGLE_TABLE_H_
  3 | 
  4 | #include <assert.h>
  5 | 
  6 | #include <sstream>
  7 | 
  8 | #include "bitsutil.h"
  9 | #include "debug.h"
 10 | #include "printutil.h"
 11 | 
 12 | namespace cuckoofilter {
 13 | 
 14 | // the most naive table implementation: one huge bit array
 15 | template <size_t bits_per_tag>
 16 | class SingleTable {
 17 |   static const size_t kTagsPerBucket = 4;
 18 |   static const size_t kBytesPerBucket =
 19 |       (bits_per_tag * kTagsPerBucket + 7) >> 3;
 20 |   static const uint32_t kTagMask = (1ULL << bits_per_tag) - 1;
  // NOTE: accommodate extra buckets if necessary to avoid overrun,
  // as we always read a uint64
 23 |   static const size_t kPaddingBuckets =
 24 |     ((((kBytesPerBucket + 7) / 8) * 8) - 1) / kBytesPerBucket;
 25 | 
 26 |   struct Bucket {
 27 |     char bits_[kBytesPerBucket];
 28 |   };
 29 | 
 30 |   // using a pointer adds one more indirection
 31 |   Bucket *buckets_;
 32 |   size_t num_buckets_;
 33 | 
 34 |  public:
 35 |   explicit SingleTable(const size_t num) : num_buckets_(num) {
 36 |     buckets_ = new Bucket[num_buckets_ + kPaddingBuckets];
 37 |     memset(buckets_, 0, kBytesPerBucket * (num_buckets_ + kPaddingBuckets));
 38 |   }
 39 | 
 40 |   ~SingleTable() { 
 41 |     delete[] buckets_;
 42 |   }
 43 | 
 44 |   size_t NumBuckets() const {
 45 |     return num_buckets_;
 46 |   }
 47 | 
 48 |   size_t SizeInBytes() const { 
 49 |     return kBytesPerBucket * num_buckets_; 
 50 |   }
 51 | 
 52 |   size_t SizeInTags() const { 
 53 |     return kTagsPerBucket * num_buckets_; 
 54 |   }
 55 | 
 56 |   std::string Info() const {
 57 |     std::stringstream ss;
 58 |     ss << "SingleHashtable with tag size: " << bits_per_tag << " bits \n";
 59 |     ss << "\t\tAssociativity: " << kTagsPerBucket << "\n";
 60 |     ss << "\t\tTotal # of rows: " << num_buckets_ << "\n";
 61 |     ss << "\t\tTotal # slots: " << SizeInTags() << "\n";
 62 |     return ss.str();
 63 |   }
 64 | 
 65 |   // read tag from pos(i,j)
 66 |   inline uint32_t ReadTag(const size_t i, const size_t j) const {
 67 |     const char *p = buckets_[i].bits_;
 68 |     uint32_t tag;
 69 |     /* following code only works for little-endian */
 70 |     if (bits_per_tag == 2) {
 71 |       tag = *((uint8_t *)p) >> (j * 2);
 72 |     } else if (bits_per_tag == 4) {
 73 |       p += (j >> 1);
 74 |       tag = *((uint8_t *)p) >> ((j & 1) << 2);
 75 |     } else if (bits_per_tag == 8) {
 76 |       p += j;
 77 |       tag = *((uint8_t *)p);
 78 |     } else if (bits_per_tag == 12) {
 79 |       p += j + (j >> 1);
 80 |       tag = *((uint16_t *)p) >> ((j & 1) << 2);
 81 |     } else if (bits_per_tag == 16) {
 82 |       p += (j << 1);
 83 |       tag = *((uint16_t *)p);
 84 |     } else if (bits_per_tag == 32) {
 85 |       tag = ((uint32_t *)p)[j];
 86 |     }
 87 |     return tag & kTagMask;
 88 |   }
 89 | 
 90 |   // write tag to pos(i,j)
 91 |   inline void WriteTag(const size_t i, const size_t j, const uint32_t t) {
 92 |     char *p = buckets_[i].bits_;
 93 |     uint32_t tag = t & kTagMask;
 94 |     /* following code only works for little-endian */
 95 |     if (bits_per_tag == 2) {
 96 |       *((uint8_t *)p) |= tag << (2 * j);
 97 |     } else if (bits_per_tag == 4) {
 98 |       p += (j >> 1);
 99 |       if ((j & 1) == 0) {
100 |         *((uint8_t *)p) &= 0xf0;
101 |         *((uint8_t *)p) |= tag;
102 |       } else {
103 |         *((uint8_t *)p) &= 0x0f;
104 |         *((uint8_t *)p) |= (tag << 4);
105 |       }
106 |     } else if (bits_per_tag == 8) {
107 |       ((uint8_t *)p)[j] = tag;
108 |     } else if (bits_per_tag == 12) {
109 |       p += (j + (j >> 1));
110 |       if ((j & 1) == 0) {
111 |         ((uint16_t *)p)[0] &= 0xf000;
112 |         ((uint16_t *)p)[0] |= tag;
113 |       } else {
114 |         ((uint16_t *)p)[0] &= 0x000f;
115 |         ((uint16_t *)p)[0] |= (tag << 4);
116 |       }
117 |     } else if (bits_per_tag == 16) {
118 |       ((uint16_t *)p)[j] = tag;
119 |     } else if (bits_per_tag == 32) {
120 |       ((uint32_t *)p)[j] = tag;
121 |     }
122 |   }
123 | 
124 |   inline bool FindTagInBuckets(const size_t i1, const size_t i2,
125 |                                const uint32_t tag) const {
126 |     const char *p1 = buckets_[i1].bits_;
127 |     const char *p2 = buckets_[i2].bits_;
128 | 
129 |     uint64_t v1 = *((uint64_t *)p1);
130 |     uint64_t v2 = *((uint64_t *)p2);
131 | 
132 |     // caution: unaligned access & assuming little endian
133 |     if (bits_per_tag == 4 && kTagsPerBucket == 4) {
134 |       return hasvalue4(v1, tag) || hasvalue4(v2, tag);
135 |     } else if (bits_per_tag == 8 && kTagsPerBucket == 4) {
136 |       return hasvalue8(v1, tag) || hasvalue8(v2, tag);
137 |     } else if (bits_per_tag == 12 && kTagsPerBucket == 4) {
138 |       return hasvalue12(v1, tag) || hasvalue12(v2, tag);
139 |     } else if (bits_per_tag == 16 && kTagsPerBucket == 4) {
140 |       return hasvalue16(v1, tag) || hasvalue16(v2, tag);
141 |     } else {
142 |       for (size_t j = 0; j < kTagsPerBucket; j++) {
143 |         if ((ReadTag(i1, j) == tag) || (ReadTag(i2, j) == tag)) {
144 |           return true;
145 |         }
146 |       }
147 |       return false;
148 |     }
149 |   }
150 | 
151 |   inline bool FindTagInBucket(const size_t i, const uint32_t tag) const {
152 |     // Reports whether bucket i holds tag. Four-slot buckets with 4-, 8-,
153 |     // 12- or 16-bit tags are matched against one 64-bit word.
154 |     if (kTagsPerBucket == 4 &&
155 |         (bits_per_tag == 4 || bits_per_tag == 8 || bits_per_tag == 12 ||
156 |          bits_per_tag == 16)) {
157 |       // caution: unaligned access & assuming little endian
158 |       const uint64_t word = *(uint64_t *)buckets_[i].bits_;
159 |       if (bits_per_tag == 4) {
160 |         return hasvalue4(word, tag);
161 |       }
162 |       if (bits_per_tag == 8) {
163 |         return hasvalue8(word, tag);
164 |       }
165 |       if (bits_per_tag == 12) {
166 |         return hasvalue12(word, tag);
167 |       }
168 |       return hasvalue16(word, tag);
169 |     }
170 | 
171 |     // Every other layout is scanned one slot at a time through ReadTag,
172 |     // without touching the bucket bytes directly.
173 |     for (size_t slot = 0; slot < kTagsPerBucket; ++slot) {
174 |       if (ReadTag(i, slot) == tag) return true;
175 |     }
176 |     return false;
177 |   }
178 | 
179 |   inline bool DeleteTagFromBucket(const size_t i, const uint32_t tag) {
180 |     // Clears the first slot of bucket i that holds tag; false if none does.
181 |     for (size_t slot = 0; slot < kTagsPerBucket; ++slot) {
182 |       if (ReadTag(i, slot) != tag) continue;
183 |       assert(FindTagInBucket(i, tag) == true);  // lookup must agree with scan
184 |       WriteTag(i, slot, 0);  // a zero tag is what the insert path reuses
185 |       return true;
186 |     }
187 |     return false;
188 |   }
189 | 
190 |   inline bool InsertTagToBucket(const size_t i, const uint32_t tag,
191 |                                 const bool kickout, uint32_t &oldtag) {
192 |     for (size_t slot = 0; slot < kTagsPerBucket; ++slot) {
193 |       if (ReadTag(i, slot) != 0) continue;
194 |       WriteTag(i, slot, tag);  // first zero-valued slot takes the tag
195 |       return true;
196 |     }
197 |     // No free slot. With kickout set, a randomly chosen slot's tag is moved
198 |     // into oldtag and replaced by tag; the call still reports false.
199 |     if (!kickout) return false;
200 |     const size_t victim = rand() % kTagsPerBucket;
201 |     oldtag = ReadTag(i, victim);
202 |     WriteTag(i, victim, tag);
203 |     return false;
204 |   }
205 | 
206 |   inline size_t NumTagsInBucket(const size_t i) const {
207 |     // Counts the slots of bucket i whose stored tag is nonzero; every one
208 |     // of the kTagsPerBucket slots is read through ReadTag.
209 |     size_t occupied = 0;
210 |     for (size_t slot = 0; slot < kTagsPerBucket; ++slot) {
211 |       occupied += (ReadTag(i, slot) != 0) ? 1 : 0;
212 |     }
213 |     return occupied;
214 |   }
215 | };
216 | }  // namespace cuckoofilter
217 | #endif  // CUCKOO_FILTER_SINGLE_TABLE_H_
218 | 


--------------------------------------------------------------------------------
/test.js:
--------------------------------------------------------------------------------
 1 | 'use strict'
 2 | 
 3 | const t = require('tap')
 4 | const CuckooFilter = require('.')
 5 | // one shared filter (constructed with 1024) is mutated by every step below, so statement order matters
 6 | const filter = new CuckooFilter(1024)
 7 | 
 8 | t.equal(filter.add('hello world'), filter)
 9 | t.equal(filter.contain('hello world'), true)
10 | t.equal(filter.size, 1)
11 | 
12 | // adding the same key again: add() still returns the filter and size stays 1
13 | t.equal(filter.add('hello world'), filter)
14 | t.equal(filter.contain('hello world'), true)
15 | t.equal(filter.size, 1)
16 | 
17 | // a second, distinct key raises size to 2
18 | t.equal(filter.add('hello matteo'), filter)
19 | t.equal(filter.contain('hello matteo'), true)
20 | t.equal(filter.size, 2)
21 | // the first key is still found, and a key that was never added is not
22 | t.equal(filter.contain('hello world'), true)
23 | t.equal(filter.contain('not present'), false)
24 | 
25 | // a single delete removes the key even though add() was called twice for it
26 | t.equal(filter.delete('hello world'), filter)
27 | t.equal(filter.contain('hello world'), false)
28 | t.equal(filter.size, 1)
29 | 
30 | // deleting the now-absent key again still returns the filter and leaves size at 1
31 | t.equal(filter.delete('hello world'), filter)
32 | t.equal(filter.contain('hello world'), false)
33 | t.equal(filter.size, 1)
34 | 
35 | // the key is still reported absent after the repeated delete
36 | t.equal(filter.contain('hello world'), false)
37 | 
38 | // deleting the key that was added once brings size back to 0
39 | t.equal(filter.delete('hello matteo'), filter)
40 | t.equal(filter.contain('hello matteo'), false)
41 | t.equal(filter.size, 0)
42 | 
43 | ;(function () {
44 |   var n = 10
45 |   var key = 'something repeated'
46 |   var i
47 | 
48 |   for (i = 0; i < n; i++) {
49 |     filter.add(key)
50 |   }
51 | 
52 |   t.equal(filter.size, 1)
53 | 
54 |   t.equal(filter.contain(key), true)
55 | 
56 |   for (i = 0; i < n; i++) {
57 |     filter.delete(key)
58 |     t.equal(filter.contain(key), false)
59 |   }
60 | 
61 |   t.equal(filter.size, 0)
62 | })()
63 | 
64 | // current size with the given parameters
65 | t.equal(filter.bytes, 3072)
66 | 


--------------------------------------------------------------------------------