(
69 | CuckooFilterConfig.newBuilder()
70 | .setSize(table.size())
71 | .setHashFunction(hashFunction)
72 | .setStrategy(strategy)
73 | .build(),
74 | table,
75 | funnel,
76 | random);
77 | }
78 |
/**
 * Creates a cuckoo filter over the given {@code table}.
 *
 * <p>NOTE(review): the element count always starts at 0, regardless of what {@code table}
 * already holds - confirm that callers passing a pre-populated table account for this.
 *
 * @param config size, hash function and strategy the filter operates with
 * @param table storage for the fingerprints
 * @param funnel funnel handed to the hash function together with each element
 * @param random source of randomness used by insertion
 */
private CuckooFilter(
    CuckooFilterConfig config,
    CuckooFilterTable table,
    Funnel<? super T> funnel,
    Random random) {
  this.config = config;
  this.table = table;
  this.funnel = funnel;
  this.random = random;
  count = 0;
}
87 |
88 | /**
89 | * Returns true if {@code element} is in the cuckoo filter.
90 | *
91 | * By the probabilistic nature of the cuckoo filter data structure, this method may return a
92 | * false positive result. In other words, this method may incorrectly return true for an element
93 | * that was actually never inserted. This probability can depend on various factors, including the
94 | * size of the cuckoo filter and the hash function used.
95 | *
96 | *
However, it is guaranteed that this method never returns a false negative result, as long as
97 | * {@code delete} method is called on an element that exists in the filter. Please see {@code
98 | * delete} method for more details.
99 | */
100 | public boolean contains(T element) {
101 | HashCode hash = config.hashFunction().hash(element, funnel);
102 | long fingerprint =
103 | config.strategy().computeFingerprint(hash, config.size().fingerprintLength());
104 | int bucketIndex = config.strategy().computeBucketIndex(hash, config.size().bucketCount());
105 | int otherBucketIndex =
106 | config
107 | .strategy()
108 | .computeOtherBucketIndex(
109 | fingerprint, bucketIndex, config.size().bucketCount(), config.hashFunction());
110 | return table.contains(bucketIndex, fingerprint)
111 | || table.contains(otherBucketIndex, fingerprint);
112 | }
113 |
114 | /**
115 | * Inserts {@code element} to the cuckoo filter, returning true if the element was inserted
116 | * successfully.
117 | *
118 | *
Insertion of {@code element} will fail if there is no room for {@code element}. Note that
119 | * even when the insertion of {@code element} fails, it is possible for another element to be
120 | * inserted successfully. Even then, the insertion failure should be a good indicator that the
121 | * filter is getting close to its maximum capacity.
122 | */
123 | public boolean insert(T element) {
124 | HashCode hash = config.hashFunction().hash(element, funnel);
125 | long fingerprint =
126 | config.strategy().computeFingerprint(hash, config.size().fingerprintLength());
127 | int bucketIndex = config.strategy().computeBucketIndex(hash, config.size().bucketCount());
128 | int otherBucketIndex =
129 | config
130 | .strategy()
131 | .computeOtherBucketIndex(
132 | fingerprint, bucketIndex, config.size().bucketCount(), config.hashFunction());
133 |
134 | // First attempt to insert the fingerprint to one of the two assigned buckets.
135 | if (attemptInsertion(fingerprint, bucketIndex, otherBucketIndex)) {
136 | count++;
137 | return true;
138 | }
139 |
140 | // If both buckets are full, execute insertion with repeated replacements algorithm.
141 | int startBucketIndex = (random.nextInt(2) == 0) ? bucketIndex : otherBucketIndex;
142 | boolean inserted = insertWithRepeatedReplacements(fingerprint, startBucketIndex);
143 | if (inserted) {
144 | count++;
145 | }
146 | return inserted;
147 | }
148 |
149 | /**
150 | * Deletes {@code element} from the cuckoo filter, returning true if the element was deleted
151 | * successfully.
152 | *
153 | *
It is critical for {@code delete} to be called on an already existing element. Otherwise,
154 | * the filter may incorrectly delete a wrong element. When this happens, it is possible for {@code
155 | * contains} method to return a false negative result.
156 | */
157 | public boolean delete(T element) {
158 | HashCode hash = config.hashFunction().hash(element, funnel);
159 | long fingerprint =
160 | config.strategy().computeFingerprint(hash, config.size().fingerprintLength());
161 | int bucketIndex = config.strategy().computeBucketIndex(hash, config.size().bucketCount());
162 | int otherBucketIndex =
163 | config
164 | .strategy()
165 | .computeOtherBucketIndex(
166 | fingerprint, bucketIndex, config.size().bucketCount(), config.hashFunction());
167 | boolean deleted =
168 | table.delete(bucketIndex, fingerprint) || table.delete(otherBucketIndex, fingerprint);
169 | if (deleted) {
170 | count--;
171 | }
172 | return deleted;
173 | }
174 |
175 | /** Returns the size of the cuckoo filter. */
176 | public CuckooFilterConfig.Size size() {
177 | return config.size();
178 | }
179 |
180 | /** Returns the count of the elements in the cuckoo filter. */
181 | public long count() {
182 | return count;
183 | }
184 |
185 | /**
186 | * Returns the ratio of the total number of elements in the cuckoo filter and the theoretical max
187 | * capacity.
188 | *
189 | *
The returned value is in range [0, 1].
190 | */
191 | public double load() {
192 | return count / ((double) config.size().bucketCount() * config.size().bucketCapacity());
193 | }
194 |
195 | /**
196 | * Serializes the state of the cuckoo filter table.
197 | *
198 | *
Note that this method does not serialize hash function, strategy, and funnel. When
199 | * instantiating a cuckoo filter from the returned {@link SerializedCuckooFilterTable}, it is up
200 | * to the user to supply appropriate hash function, strategy, and funnel that were used.
201 | */
202 | public SerializedCuckooFilterTable serializeTable() {
203 | return table.serialize();
204 | }
205 |
206 | /**
207 | * Attempts to insert {@code fingerprint} to one of the buckets with indices {@code bucketIndex}
208 | * and {@code otherBucketIndex}, returning true when successful. Returns false if both buckets are
209 | * full and the insertion failed.
210 | */
211 | private boolean attemptInsertion(long fingerprint, int bucketIndex, int otherBucketIndex) {
212 | if (!table.isFull(bucketIndex)) {
213 | table.insertWithReplacement(bucketIndex, fingerprint);
214 | return true;
215 | }
216 | if (!table.isFull(otherBucketIndex)) {
217 | table.insertWithReplacement(otherBucketIndex, fingerprint);
218 | return true;
219 | }
220 | return false;
221 | }
222 |
223 | /**
224 | * Randomly traverses the cuckoo graph to find an available bucket for insertion.
225 | *
226 | *
At a high level, this algorithm starts at vertex {@code bucketIndex} and performs a random
227 | * walk of length at most {@link CuckooFilterConfig.Strategy#maxReplacementCount}. If an available
228 | * bucket is found, the algorithm "pushes" all the fingerprints (edges) that are visited (note
229 | * that in the cuckoo graph, the edges are the fingerprints) to their alternate buckets, and make
230 | * room for {@code fingerprint} to be inserted.
231 | *
232 | *
If during the random walk an available bucket is not found, the insertion fails and the
233 | * method returns false.
234 | *
235 | *
Note that it is possible to deterministically find an available bucket by performing breadth
236 | * first search in the cuckoo graph, but this is usually slower and the extra chance of successful
237 | * insertion is negligibly small in practice.
238 | */
239 | private boolean insertWithRepeatedReplacements(long fingerprint, int bucketIndex) {
240 | List visitedBucketIndices = new ArrayList<>();
241 | List replacedFingerprints = new ArrayList<>();
242 |
243 | long currFingerprint = fingerprint;
244 | int currBucketIndex = bucketIndex;
245 | visitedBucketIndices.add(-1); // Just for index alignment purpose.
246 | replacedFingerprints.add(currFingerprint);
247 | for (int i = 0; i < config.strategy().maxReplacementCount(); i++) {
248 | Optional replacedFingerprint =
249 | table.insertWithReplacement(currBucketIndex, currFingerprint);
250 | // Found an available bucket, and the insertion is successful.
251 | if (replacedFingerprint.isEmpty()) {
252 | return true;
253 | }
254 |
255 | visitedBucketIndices.add(currBucketIndex);
256 | replacedFingerprints.add(replacedFingerprint.get());
257 |
258 | currFingerprint = replacedFingerprint.get();
259 | currBucketIndex =
260 | config
261 | .strategy()
262 | .computeOtherBucketIndex(
263 | currFingerprint,
264 | currBucketIndex,
265 | config.size().bucketCount(),
266 | config.hashFunction());
267 | }
268 |
269 | // Failed to find a bucket to insert. Reverse the replacements and declare that the insertion
270 | // failed.
271 | for (int i = visitedBucketIndices.size() - 1; i > 0; i--) {
272 | int previousBucketIndex = visitedBucketIndices.get(i);
273 | table.delete(previousBucketIndex, replacedFingerprints.get(i - 1));
274 | table.insertWithReplacement(previousBucketIndex, replacedFingerprints.get(i));
275 | }
276 | return false;
277 | }
278 | }
279 |
--------------------------------------------------------------------------------
/setfilters/src/com/google/setfilters/cuckoofilter/CuckooFilterArray.java:
--------------------------------------------------------------------------------
1 | // Copyright 2022 Google LLC
2 | //
3 | // Licensed under the Apache License, Version 2.0 (the "License");
4 | // you may not use this file except in compliance with the License.
5 | // You may obtain a copy of the License at
6 | //
7 | // https://www.apache.org/licenses/LICENSE-2.0
8 | //
9 | // Unless required by applicable law or agreed to in writing, software
10 | // distributed under the License is distributed on an "AS IS" BASIS,
11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | // See the License for the specific language governing permissions and
13 | // limitations under the License.
14 |
15 | package com.google.setfilters.cuckoofilter;
16 |
17 | import static com.google.common.base.Preconditions.checkArgument;
18 |
19 | import java.nio.ByteBuffer;
20 | import java.nio.ByteOrder;
21 |
22 | /**
23 | * Static array where each element is an integer of size {@code bitsPerElement} bits.
24 | *
25 | * Supports up to 64 bits per element. This will be used internally by cuckoo filter.
26 | */
27 | final class CuckooFilterArray {
28 | private final long length;
29 | private final int bitsPerElement;
30 | private final long[] bitArray;
31 |
32 | /**
33 | * Constructs a new cuckoo filter array with length {@code length}, with each element of length
34 | * {@code bitsPerElement} bits.
35 | *
36 | * @throws IllegalArgumentException if {@code length} <= 0 or {@code bitsPerElement} <= 0 or
37 | * {@code bitsPerElement} > 64.
38 | */
39 | public CuckooFilterArray(long length, int bitsPerElement) {
40 | checkLengthIsValid(length);
41 | checkBitsPerElementIsValid(bitsPerElement);
42 |
43 | this.length = length;
44 | this.bitsPerElement = bitsPerElement;
45 | long totalBits = length * bitsPerElement;
46 | // ceil(totalBits / 64) number of elements.
47 | long longArrayLength = (totalBits + Long.SIZE - 1) / Long.SIZE;
48 | checkArgument(
49 | longArrayLength < Integer.MAX_VALUE,
50 | "Too large: could not create CuckooFilterArray with length %s and bitsPerElement %s.",
51 | length,
52 | bitsPerElement);
53 | bitArray = new long[(int) longArrayLength];
54 | }
55 |
56 | /**
57 | * Constructs a cuckoo filter array with length {@code length}, with each element of length {@code
58 | * bitsPerElement}, from {@code byteArray}.
59 | */
60 | public CuckooFilterArray(long length, int bitsPerElement, byte[] byteArray) {
61 | this(length, bitsPerElement);
62 | ByteBuffer buffer = ByteBuffer.wrap(byteArray).order(ByteOrder.LITTLE_ENDIAN);
63 | for (int i = 0; i < bitArray.length; i++) {
64 | bitArray[i] = buffer.getLong();
65 | }
66 | }
67 |
68 | /** Returns the length of the array. */
69 | public long length() {
70 | return length;
71 | }
72 |
73 | /** Returns the number of bits per element. */
74 | public int bitsPerElement() {
75 | return bitsPerElement;
76 | }
77 |
78 | /**
79 | * Returns the element at the {@code index}th position as a long.
80 | *
81 | *
The lowest {@code bitsPerElement} bits will correspond to the value of the element.
82 | *
83 | * @throws IllegalArgumentException if {@code index} is out of bounds.
84 | */
85 | public long getAsLong(long index) {
86 | checkIndexOutOfBounds(index);
87 | long bitStart = index * bitsPerElement;
88 | long bitEnd = bitStart + bitsPerElement;
89 | int arrayIndex1 = (int) (bitStart / Long.SIZE);
90 | int arrayIndex2 = (int) ((bitEnd - 1) / Long.SIZE);
91 |
92 | int a = (int) (bitStart % Long.SIZE);
93 | // The element intersects the two array indices.
94 | if (arrayIndex1 < arrayIndex2) {
95 | int b = a + bitsPerElement - Long.SIZE;
96 | long value1 = bitArray[arrayIndex1] >>> a;
97 | long value2 = bitArray[arrayIndex2] & mask(b);
98 | return (value1 | (value2 << (Long.SIZE - a)));
99 | }
100 | // Element is contained in one array index.
101 | return (bitArray[arrayIndex1] >>> a) & mask(bitsPerElement);
102 | }
103 |
104 | /**
105 | * Sets the element at {@code index}th position as {@code value}, using the lowest {@code
106 | * bitsPerElement} bits as the value of the element.
107 | *
108 | * @throws IllegalArgumentException if {@code index} is out of bounds.
109 | */
110 | public void set(long index, long value) {
111 | checkIndexOutOfBounds(index);
112 | long bitStart = index * bitsPerElement;
113 | long bitEnd = bitStart + bitsPerElement;
114 | int arrayIndex1 = (int) (bitStart / Long.SIZE);
115 | int arrayIndex2 = (int) ((bitEnd - 1) / Long.SIZE);
116 |
117 | // Use the lowest bitsPerElement bits and clear all other bits.
118 | value &= mask(bitsPerElement);
119 |
120 | int a = (int) (bitStart % Long.SIZE);
121 | // The element intersects the two array indices.
122 | if (arrayIndex1 < arrayIndex2) {
123 | int b = a + bitsPerElement - Long.SIZE;
124 | bitArray[arrayIndex1] &= clearMask(Long.SIZE, a, Long.SIZE);
125 | bitArray[arrayIndex1] |= (value << a);
126 | bitArray[arrayIndex2] &= clearMask(Long.SIZE, 0, b);
127 | bitArray[arrayIndex2] |= (value >>> (Long.SIZE - a));
128 | } else {
129 | // Element is contained in one array index.
130 | int b = a + bitsPerElement;
131 | bitArray[arrayIndex1] &= clearMask(Long.SIZE, a, b);
132 | bitArray[arrayIndex1] |= (value << a);
133 | }
134 | }
135 |
136 | /** Returns byte array representation of the {@link CuckooFilterArray}. */
137 | public byte[] toByteArray() {
138 | byte[] byteArray = new byte[bitArray.length * Long.BYTES];
139 | for (int i = 0; i < bitArray.length; i++) {
140 | long value = bitArray[i];
141 | for (int j = 0; j < Long.BYTES; j++) {
142 | // Explicit conversion from long to byte will truncate to lowest 8 bits.
143 | byteArray[i * Long.BYTES + j] = (byte) value;
144 | value >>>= Byte.SIZE;
145 | }
146 | }
147 | return byteArray;
148 | }
149 |
150 | // Theoretical max size of a long array is Integer.MAX_VALUE. Assuming each element is 1 bit,
151 | // we can support up to Integer.MAX_VALUE * 64 number of elements.
152 | private void checkLengthIsValid(long length) {
153 | checkArgument(
154 | 0 < length && length < (long) Integer.MAX_VALUE * Long.SIZE,
155 | "length must be in range (0, %s).",
156 | (long) Integer.MAX_VALUE * Long.SIZE);
157 | }
158 |
159 | private void checkBitsPerElementIsValid(int bitsPerElement) {
160 | checkArgument(
161 | 0 < bitsPerElement && bitsPerElement <= 64, "bitsPerElement must be in range [1, 64].");
162 | }
163 |
164 | private void checkIndexOutOfBounds(long index) {
165 | checkArgument(0 <= index && index < length, "Index is out of bounds: %s.", index);
166 | }
167 |
168 | private static long mask(int length) {
169 | if (length == Long.SIZE) {
170 | // -1 in 2s complement is 0xFFFFFFFFFFFFFFFF.
171 | return -1;
172 | }
173 | return (1L << length) - 1;
174 | }
175 |
176 | // Mask for clearing bits in range [a, b).
177 | private static long clearMask(int length, int a, int b) {
178 | long mask1 = mask(length);
179 | long mask2 = mask(b - a);
180 | return mask1 ^ (mask2 << a);
181 | }
182 | }
183 |
--------------------------------------------------------------------------------
/setfilters/src/com/google/setfilters/cuckoofilter/CuckooFilterConfig.java:
--------------------------------------------------------------------------------
1 | // Copyright 2022 Google LLC
2 | //
3 | // Licensed under the Apache License, Version 2.0 (the "License");
4 | // you may not use this file except in compliance with the License.
5 | // You may obtain a copy of the License at
6 | //
7 | // https://www.apache.org/licenses/LICENSE-2.0
8 | //
9 | // Unless required by applicable law or agreed to in writing, software
10 | // distributed under the License is distributed on an "AS IS" BASIS,
11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | // See the License for the specific language governing permissions and
13 | // limitations under the License.
14 |
15 | package com.google.setfilters.cuckoofilter;
16 |
17 | import static com.google.common.base.Preconditions.checkArgument;
18 |
19 | import com.google.common.collect.ImmutableMap;
20 | import com.google.common.hash.Funnel;
21 | import com.google.common.hash.HashCode;
22 | import com.google.errorprone.annotations.CanIgnoreReturnValue;
23 | import java.util.Map;
24 |
25 | /**
26 | * Specification for the cuckoo filter.
27 | *
28 | *
This class is immutable.
29 | */
30 | // TODO: Handle serialization.
31 | public final class CuckooFilterConfig {
32 | private final Size size;
33 | private final HashFunction hashFunction;
34 | private final Strategy strategy;
35 | private final boolean useSpaceOptimization;
36 |
37 | private CuckooFilterConfig(
38 | Size size, HashFunction hashFunction, Strategy strategy, boolean useSpaceOptimization) {
39 | this.size = size;
40 | this.hashFunction = hashFunction;
41 | this.strategy = strategy;
42 | this.useSpaceOptimization = useSpaceOptimization;
43 | }
44 |
45 | public Size size() {
46 | return size;
47 | }
48 |
49 | public HashFunction hashFunction() {
50 | return hashFunction;
51 | }
52 |
53 | public Strategy strategy() {
54 | return strategy;
55 | }
56 |
57 | public boolean useSpaceOptimization() {
58 | return useSpaceOptimization;
59 | }
60 |
61 | public static Builder newBuilder() {
62 | return new Builder();
63 | }
64 |
65 | /** Builder for the {@link CuckooFilterConfig}. */
66 | public static class Builder {
67 | private Size size;
68 | private HashFunction hashFunction;
69 | private Strategy strategy;
70 | private boolean useSpaceOptimization;
71 |
72 | private Builder() {}
73 |
74 | @CanIgnoreReturnValue
75 | public Builder setSize(Size size) {
76 | this.size = size;
77 | return this;
78 | }
79 |
80 | @CanIgnoreReturnValue
81 | public Builder setHashFunction(HashFunction hashFunction) {
82 | this.hashFunction = hashFunction;
83 | return this;
84 | }
85 |
86 | @CanIgnoreReturnValue
87 | public Builder setStrategy(Strategy strategy) {
88 | this.strategy = strategy;
89 | return this;
90 | }
91 |
92 | /**
93 | * Whether to use space optimized filter representation (if possible).
94 | *
95 | *
Setting this field to {@code true} does not guarantee the optimization algorithm to always
96 | * apply - it is best effort.
97 | *
98 | *
In general, using this may result in slower filter operations, and incurs an additional
99 | * fixed space overhead. Thus, it is possible for the "optimized" version of the filter to
100 | * actually take more space than the non optimized one.
101 | */
102 | @CanIgnoreReturnValue
103 | public Builder setUseSpaceOptimization(boolean useSpaceOptimization) {
104 | this.useSpaceOptimization = useSpaceOptimization;
105 | return this;
106 | }
107 |
108 | /**
109 | * Builds {@link CuckooFilterConfig}.
110 | *
111 | * @throws IllegalArgumentException if the required parameters are not set.
112 | */
113 | public CuckooFilterConfig build() {
114 | checkArgument(size != null, "Size must be set.");
115 | checkArgument(hashFunction != null, "Hash function must be set.");
116 | checkArgument(strategy != null, "Strategy must be set.");
117 |
118 | return new CuckooFilterConfig(size, hashFunction, strategy, useSpaceOptimization);
119 | }
120 | }
121 |
122 | /**
123 | * Specification of the cuckoo filter size.
124 | *
125 | *
A cuckoo filter's size can be defined as a tuple (bucketCount, bucketCapacity,
126 | * fingeprintLength); this means that there are bucketCount number of buckets, where each bucket
127 | * can store up to bucketCapacity fingerprints, and each fingerprint is of length
128 | * fingerprintLength bits.
129 | *
130 | *
All fields are required and must be set explicitly.
131 | *
132 | *
This class is immutable.
133 | */
134 | public static class Size {
135 | private static final int MAX_BUCKET_CAPACITY = 128;
136 | private static final int MAX_FINGERPRINT_LENGTH = 64;
137 | /** Empirical load by the bucket capacity. */
138 | private static final ImmutableMap APPROX_LOAD_BY_BUCKET_CAPACITY =
139 | ImmutableMap.builder()
140 | .put(2, 0.85)
141 | .put(3, 0.91)
142 | .put(4, 0.95)
143 | .put(5, 0.96)
144 | .put(6, 0.97)
145 | .put(7, 0.98)
146 | .put(8, 0.98)
147 | .buildOrThrow();
148 |
149 | private final int bucketCount;
150 | private final int bucketCapacity;
151 | private final int fingerprintLength;
152 |
153 | private Size(int bucketCount, int bucketCapacity, int fingerprintLength) {
154 | this.bucketCount = bucketCount;
155 | this.bucketCapacity = bucketCapacity;
156 | this.fingerprintLength = fingerprintLength;
157 | }
158 |
159 | /**
160 | * Automatically computes a reasonably efficient cuckoo filter {@link Size} that ensures (with
161 | * high probability) storing up to {@code elementsCountUpperBound} elements (with high
162 | * probability) with the given {@code targetFalsePositiveRate}.
163 | *
164 | * @throws IllegalArgumentException if {@code targetFalsePositiveRate} is not in range [0, 1] or
165 | * {@code elementsCountUpperBound} is <= 0, or a suitable cuckoo filter size could not be
166 | * computed based on the given input.
167 | */
168 | public static Size computeEfficientSize(
169 | double targetFalsePositiveRate, long elementsCountUpperBound) {
170 | checkArgument(
171 | 0 < targetFalsePositiveRate && targetFalsePositiveRate < 1,
172 | "targetFalsePositiveRate must be in range (0, 1): %s given.",
173 | targetFalsePositiveRate);
174 | checkArgument(
175 | elementsCountUpperBound > 0,
176 | "elementsCountUpperBound must be > 0: %s given.",
177 | elementsCountUpperBound);
178 |
179 | long bestCuckooFilterSizeInBits = -1;
180 | int bestBucketCount = 0;
181 | int bestBucketCapacity = 0;
182 | int bestFingerprintLength = 0;
183 | for (Map.Entry entry : APPROX_LOAD_BY_BUCKET_CAPACITY.entrySet()) {
184 | int bucketCapacity = entry.getKey();
185 | double load = entry.getValue();
186 |
187 | int fingerprintLength =
188 | (int) Math.ceil(-log2(targetFalsePositiveRate) + log2(bucketCapacity) + 1);
189 | long bucketCount = (long) Math.ceil(elementsCountUpperBound / (bucketCapacity * load));
190 |
191 | // The computed size is invalid if fingerprint length is larger than max length or the
192 | // bucket count that is larger than max integer.
193 | if (fingerprintLength > MAX_FINGERPRINT_LENGTH || bucketCount >= Integer.MAX_VALUE) {
194 | continue;
195 | }
196 |
197 | long totalBits = bucketCount * bucketCapacity * fingerprintLength;
198 | if (bestCuckooFilterSizeInBits == -1 || bestCuckooFilterSizeInBits > totalBits) {
199 | bestCuckooFilterSizeInBits = totalBits;
200 | bestBucketCount = (int) bucketCount;
201 | bestBucketCapacity = bucketCapacity;
202 | bestFingerprintLength = fingerprintLength;
203 | }
204 | }
205 |
206 | checkArgument(
207 | bestCuckooFilterSizeInBits != -1,
208 | "Could not compute suitable cuckoo filter size based on the given input. Either the"
209 | + " target false positive rate is too low, or the computed size is too big.");
210 |
211 | return Size.newBuilder()
212 | .setBucketCount(bestBucketCount)
213 | .setBucketCapacity(bestBucketCapacity)
214 | .setFingerprintLength(bestFingerprintLength)
215 | .build();
216 | }
217 |
218 | public static Builder newBuilder() {
219 | return new Builder();
220 | }
221 |
222 | /** Returns the total number of buckets in the cuckoo filter. */
223 | public int bucketCount() {
224 | return bucketCount;
225 | }
226 |
227 | /** Returns the maximum number of fingerprints each bucket can hold. */
228 | public int bucketCapacity() {
229 | return bucketCapacity;
230 | }
231 |
232 | /** Returns the length of the fingerprint in bits. */
233 | public int fingerprintLength() {
234 | return fingerprintLength;
235 | }
236 |
237 | /** Builder for the {@link Size}. */
238 | public static class Builder {
239 | private int bucketCount;
240 | private int bucketCapacity;
241 | private int fingerprintLength;
242 |
243 | private Builder() {}
244 |
245 | /**
246 | * Sets the number of buckets in the cuckoo filter.
247 | *
248 | * {@code bucketCount} must be > 0.
249 | */
250 | @CanIgnoreReturnValue
251 | public Builder setBucketCount(int bucketCount) {
252 | this.bucketCount = bucketCount;
253 | return this;
254 | }
255 |
256 | /**
257 | * Sets the maximum number of fingerprints each bucket can hold.
258 | *
259 | *
{@code bucketCapacity} must be in range (0, {@value #MAX_BUCKET_CAPACITY}].
260 | */
261 | @CanIgnoreReturnValue
262 | public Builder setBucketCapacity(int bucketCapacity) {
263 | this.bucketCapacity = bucketCapacity;
264 | return this;
265 | }
266 |
267 | /**
268 | * Sets the length of each fingerprint in bits.
269 | *
270 | *
{@code fingerprintLength} must be in range (0, {@value #MAX_FINGERPRINT_LENGTH}].
271 | */
272 | @CanIgnoreReturnValue
273 | public Builder setFingerprintLength(int fingerprintLength) {
274 | this.fingerprintLength = fingerprintLength;
275 | return this;
276 | }
277 |
278 | /**
279 | * Builds {@link Size}.
280 | *
281 | * @throws IllegalArgumentException if the configured parameters are invalid.
282 | */
283 | public Size build() {
284 | checkArgument(bucketCount > 0, "bucketCount must be > 0: %s given instead.", bucketCount);
285 | checkArgument(
286 | 0 < bucketCapacity && bucketCapacity <= MAX_BUCKET_CAPACITY,
287 | "bucketCapacity must be in range (0, %s]: %s given instead.",
288 | MAX_BUCKET_CAPACITY,
289 | bucketCapacity);
290 | checkArgument(
291 | 0 < fingerprintLength && fingerprintLength <= MAX_FINGERPRINT_LENGTH,
292 | "fingerprintLength must be in range (0, %s]: %s given instead.",
293 | MAX_FINGERPRINT_LENGTH,
294 | fingerprintLength);
295 |
296 | return new Size(bucketCount, bucketCapacity, fingerprintLength);
297 | }
298 | }
299 |
300 | private static double log2(double x) {
301 | return Math.log(x) / Math.log(2);
302 | }
303 | }
304 |
305 | /** Hash function for transforming an arbitrary type element to a {@link HashCode}. */
306 | public interface HashFunction {
307 | /** Hashes given {@code element} to a {@link HashCode}, using the given {@code funnel}. */
308 | HashCode hash(T element, Funnel super T> funnel);
309 | }
310 |
311 | /**
312 | * Strategy for computing fingerprints and where these fingerprints belong in the cuckoo filter
313 | * table.
314 | */
315 | public interface Strategy {
316 |
317 | /**
318 | * Computes the fingerprint value given the element's {@code hash} output from {@link
319 | * HashFunction}.
320 | *
321 | * The returned value should be in range (0, 2^{@code fingerprintLength}). Otherwise, the
322 | * behavior of the cuckoo filter is undefined. Note that the interval is an open interval, so 0
323 | * and 2^{@code fingerprintLength} are not included.
324 | */
325 | long computeFingerprint(HashCode hash, int fingerprintLength);
326 |
327 | /**
328 | * Computes one of the bucket indices given the element's {@code hash} output from {@link
329 | * HashFunction} and {@code bucketCount} of the cuckoo filter.
330 | *
331 | *
The returned value should be in range [0, {@code bucketCount}). Otherwise, the behavior of
332 | * the cuckoo filter is undefined.
333 | */
334 | int computeBucketIndex(HashCode hash, int bucketCount);
335 |
336 | /**
337 | * Computes the element's other bucket index given the element's {@code fingerprint} value and
338 | * its initial {@code bucketIndex}.
339 | *
340 | *
{@code hashFunction} corresponds to the {@link HashFunction} that was supplied when the
341 | * config was constructed. Depending on the implementation, {@code hashFunction} may or may not
342 | * be used.
343 | *
344 | *
The returned value should be in range [0, {@code bucketCount}), and the method needs to be
345 | * an involution with respect to {@code bucketIndex}. That is, with other parameters fixed, the
346 | * method needs to satisfy bucketIndex =
347 | * computeOtherBucketIndex(computeOtherBucketIndex(bucketIndex)) for all valid
348 | * bucketIndex. Note that other parameters are omitted for brevity. If these properties
349 | * don't hold, the behavior of the cuckoo filter is undefined.
350 | */
351 | int computeOtherBucketIndex(
352 | long fingerprint, int bucketIndex, int bucketCount, HashFunction hashFunction);
353 |
354 | /**
355 | * Maximum number of replacements to be made during insertion, before declaring that the
356 | * insertion has failed.
357 | *
358 | *
If not overridden, set to 500 as a default.
359 | */
360 | default int maxReplacementCount() {
361 | return 500;
362 | }
363 | }
364 | }
365 |
--------------------------------------------------------------------------------
/setfilters/src/com/google/setfilters/cuckoofilter/CuckooFilterHashFunctions.java:
--------------------------------------------------------------------------------
1 | // Copyright 2022 Google LLC
2 | //
3 | // Licensed under the Apache License, Version 2.0 (the "License");
4 | // you may not use this file except in compliance with the License.
5 | // You may obtain a copy of the License at
6 | //
7 | // https://www.apache.org/licenses/LICENSE-2.0
8 | //
9 | // Unless required by applicable law or agreed to in writing, software
10 | // distributed under the License is distributed on an "AS IS" BASIS,
11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | // See the License for the specific language governing permissions and
13 | // limitations under the License.
14 |
15 | package com.google.setfilters.cuckoofilter;
16 |
17 | import com.google.common.hash.Funnel;
18 | import com.google.common.hash.HashCode;
19 | import com.google.common.hash.Hashing;
20 |
21 | /** A set of predefined {@link CuckooFilterConfig.HashFunction}s. */
22 | public enum CuckooFilterHashFunctions implements CuckooFilterConfig.HashFunction {
23 |
24 | /**
25 | * MurmurHash3 that yields 128 bit hash value.
26 | *
27 | *
Behavior of MurmurHash3 is fixed and should not change in the future.
28 | */
29 | MURMUR3_128() {
30 | @Override
31 | public HashCode hash(T element, Funnel super T> funnel) {
32 | return Hashing.murmur3_128().hashObject(element, funnel);
33 | }
34 | }
35 | }
36 |
--------------------------------------------------------------------------------
/setfilters/src/com/google/setfilters/cuckoofilter/CuckooFilterStrategies.java:
--------------------------------------------------------------------------------
1 | // Copyright 2022 Google LLC
2 | //
3 | // Licensed under the Apache License, Version 2.0 (the "License");
4 | // you may not use this file except in compliance with the License.
5 | // You may obtain a copy of the License at
6 | //
7 | // https://www.apache.org/licenses/LICENSE-2.0
8 | //
9 | // Unless required by applicable law or agreed to in writing, software
10 | // distributed under the License is distributed on an "AS IS" BASIS,
11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | // See the License for the specific language governing permissions and
13 | // limitations under the License.
14 |
15 | package com.google.setfilters.cuckoofilter;
16 |
17 | import com.google.common.hash.Funnels;
18 | import com.google.common.hash.HashCode;
19 |
20 | /** A set of predefined {@link CuckooFilterConfig.Strategy}s. */
21 | public enum CuckooFilterStrategies implements CuckooFilterConfig.Strategy {
22 |
23 | /**
24 | * A strategy that uses a mod operator to produce the desired outputs.
25 | *
26 | * The {@link HashCode} generated with the hash function should be at least 64 bits. This will
27 | * achieve good false positive rate when fingerprintLength <= 32.
28 | */
29 | SIMPLE_MOD() {
30 | @Override
31 | public long computeFingerprint(HashCode hash, int fingerprintLength) {
32 | // Use the most significant fingerprintLength bits. This is needed to get rid of the
33 | // correlation with the bucket index.
34 | long fingerprint = hash.asLong() >>> (Long.SIZE - fingerprintLength);
35 | // Value 0 is reserved, so instead map to 1. This means that the generated fingerprint value
36 | // is skewed (1 is twice as more likely to be generated than any other value). Note that, we
37 | // could have taken mod (2^fingerprintLength - 1) and added 1, which would produce a more
38 | // uniform distribution. However, for performance reason, we choose to take this approach
39 | // instead.
40 | if (fingerprint == 0) {
41 | return 1L;
42 | }
43 | return fingerprint;
44 | }
45 |
46 | @Override
47 | public int computeBucketIndex(HashCode hash, int bucketCount) {
48 | return Math.floorMod(hash.asLong(), bucketCount);
49 | }
50 |
51 | @Override
52 | public int computeOtherBucketIndex(
53 | long fingerprint,
54 | int bucketIndex,
55 | int bucketCount,
56 | CuckooFilterConfig.HashFunction hashFunction) {
57 | long fingerprintHash = hashFunction.hash(fingerprint, Funnels.longFunnel()).asLong();
58 | // Use (hash(fingerprint) - bucketIndex) mod bucketCount as the involution.
59 | return Math.floorMod(fingerprintHash - bucketIndex, bucketCount);
60 | }
61 | }
62 | }
63 |
--------------------------------------------------------------------------------
/setfilters/src/com/google/setfilters/cuckoofilter/CuckooFilterTable.java:
--------------------------------------------------------------------------------
1 | // Copyright 2022 Google LLC
2 | //
3 | // Licensed under the Apache License, Version 2.0 (the "License");
4 | // you may not use this file except in compliance with the License.
5 | // You may obtain a copy of the License at
6 | //
7 | // https://www.apache.org/licenses/LICENSE-2.0
8 | //
9 | // Unless required by applicable law or agreed to in writing, software
10 | // distributed under the License is distributed on an "AS IS" BASIS,
11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | // See the License for the specific language governing permissions and
13 | // limitations under the License.
14 |
15 | package com.google.setfilters.cuckoofilter;
16 |
17 | import java.nio.ByteBuffer;
18 | import java.util.Optional;
19 | import java.util.Random;
20 |
21 | /** An array of buckets where each bucket can store a fixed number of fingerprints. */
22 | interface CuckooFilterTable {
23 | /** Value of the empty "slot", which is reserved as 0. */
24 | public static long EMPTY_SLOT = 0L;
25 |
26 | /**
27 | * Creates an implementation of an empty cuckoo filter based on whether space optimization should
28 | * be used.
29 | *
30 | *
Space optimization is best effort, and is not guaranteed.
31 | */
32 | public static CuckooFilterTable create(
33 | CuckooFilterConfig.Size size, boolean useSpaceOptimization, Random random) {
34 | if (useSpaceOptimization && size.bucketCapacity() == 4 && size.fingerprintLength() >= 4) {
35 | return new SemiSortedCuckooFilterTable(size, random);
36 | }
37 | return new UncompressedCuckooFilterTable(size, random);
38 | }
39 |
40 | /** Creates an implementation of the cuckoo filter based on the serialization. */
41 | public static CuckooFilterTable createFromSerialization(
42 | SerializedCuckooFilterTable serializedTable, Random random) {
43 | ByteBuffer buffer = ByteBuffer.wrap(serializedTable.asByteArray());
44 |
45 | if (buffer.remaining() <= 16) {
46 | throw new IllegalArgumentException("Unable to parse the SerializedCuckooFilterTable.");
47 | }
48 |
49 | int tableType = buffer.getInt();
50 | int bucketCount = buffer.getInt();
51 | int bucketCapacity = buffer.getInt();
52 | int fingerprintLength = buffer.getInt();
53 | CuckooFilterConfig.Size size =
54 | CuckooFilterConfig.Size.newBuilder()
55 | .setBucketCount(bucketCount)
56 | .setBucketCapacity(bucketCapacity)
57 | .setFingerprintLength(fingerprintLength)
58 | .build();
59 |
60 | byte[] bitArray = new byte[buffer.remaining()];
61 | buffer.get(bitArray);
62 |
63 | if (tableType == UncompressedCuckooFilterTable.TABLE_TYPE) {
64 | return new UncompressedCuckooFilterTable(size, bitArray, random);
65 | } else if (tableType == SemiSortedCuckooFilterTable.TABLE_TYPE) {
66 | return new SemiSortedCuckooFilterTable(size, bitArray, random);
67 | } else {
68 | throw new IllegalArgumentException("Unable to parse the SerializedCuckooFilterTable.");
69 | }
70 | }
71 |
72 | /**
73 | * Inserts given {@code fingerprint} to the {@code bucketIndex}th bucket, replacing an arbitrary
74 | * fingerprint if the bucket is full.
75 | *
76 | *
How this arbitrary fingerprint is chosen depends on the implementation.
77 | *
78 | * @return the value of the replaced fingerprint if the bucket is full, and an empty {@link
79 | * Optional} otherwise.
80 | */
81 | Optional insertWithReplacement(int bucketIndex, long fingerprint);
82 |
83 | /** Returns whether {@code bucketIndex}th bucket contains {@code fingerprint}. */
84 | boolean contains(int bucketIndex, long fingerprint);
85 |
86 | /**
87 | * Deletes a {@code fingerprint} from {@code bucketIndex}th bucket.
88 | *
89 | * If a bucket contains multiple {@code fingerprint} values, this method only deletes one.
90 | *
91 | * @return {@code true} if {@code fingerprint} is in {@code bucketIndex}th bucket and is deleted,
92 | * and {@code false} otherwise.
93 | */
94 | boolean delete(int bucketIndex, long fingerprint);
95 |
96 | /** Returns whether {@code bucketIndex}th bucket is full. */
97 | boolean isFull(int bucketIndex);
98 |
99 | /** Returns the size of {@link CuckooFilterTable}. */
100 | CuckooFilterConfig.Size size();
101 |
102 | /** Returns serialization of {@link CuckooFilterTable}. */
103 | SerializedCuckooFilterTable serialize();
104 |
105 | // TODO: Add more methods as needed.
106 | }
107 |
--------------------------------------------------------------------------------
/setfilters/src/com/google/setfilters/cuckoofilter/SemiSortedCuckooFilterTable.java:
--------------------------------------------------------------------------------
1 | // Copyright 2022 Google LLC
2 | //
3 | // Licensed under the Apache License, Version 2.0 (the "License");
4 | // you may not use this file except in compliance with the License.
5 | // You may obtain a copy of the License at
6 | //
7 | // https://www.apache.org/licenses/LICENSE-2.0
8 | //
9 | // Unless required by applicable law or agreed to in writing, software
10 | // distributed under the License is distributed on an "AS IS" BASIS,
11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | // See the License for the specific language governing permissions and
13 | // limitations under the License.
14 |
15 | package com.google.setfilters.cuckoofilter;
16 |
17 | import static com.google.common.base.Preconditions.checkArgument;
18 | import static java.util.Comparator.comparingInt;
19 |
20 | import com.google.common.collect.ImmutableMap;
21 | import java.nio.ByteBuffer;
22 | import java.util.Arrays;
23 | import java.util.Optional;
24 | import java.util.Random;
25 |
26 | /**
27 | * Implementation of the {@link CuckooFilterTable} using the semi-sorting bucket compression scheme
28 | * in the original paper by Fan et al (https://www.cs.cmu.edu/~dga/papers/cuckoo-conext2014.pdf) -
29 | * see section 5.2.
30 | *
31 | *
The main idea behind the compression algorithm is that the order of the fingerprints in each
32 | * bucket is irrelevant - that is, the fingerprints in each bucket forms a multiset. For fingerprint
33 | * length f and bucket capacity b, the possible number of multisets of b fingerprints of f bits each
34 | * is given by C(2^f + b - 1, b), where C denotes binomial coefficient. In particular, we can encode
35 | * each bucket with ceil(log2(C(2^f + b - 1, b))) bits. On the other hand, naively encoding the
36 | * fingerprints will take b * f bits. Thus, it is theoretically possible to save b * f -
37 | * ceil(log2(C(2^f + b - 1, b))) bits per bucket (note that this is not information theoretically
38 | * tight because the distribution of the multisets is not uniform).
39 | *
40 | *
For performance reason, this only supports a table with bucket capacity of size 4 and
41 | * fingerprint length >= 4 - in many cases this is not a limitation because, for many practical
42 | * applications, bucket capacity of size 4 yields the optimal cuckoo filter size and fingerprint
43 | * length < 4 will never achieve good enough false positive rate.
44 | *
45 | *
Compared to the {@link UncompressedCuckooFilterTable}, this implementation can save 1 bit per
46 | * element, at the cost of slower filter operations by a constant factor (asymptotically, it is the
47 | * same as the uncompressed one). Note that for bucket capacity of size 4, saving 1 bit per element
48 | * is "optimal" up to rounding down, as the function 4 * f - ceil(log2(C(2^f + 3, 4))) < 5 for
49 | * reasonable values of f. However, this also incurs an additional fixed space overhead, so for
50 | * smaller filter the extra saving of 1 bit per element may not be worth it.
51 | */
52 | final class SemiSortedCuckooFilterTable implements CuckooFilterTable {
53 | // Implementation type of the table, to be encoded in the serialization.
54 | public static final int TABLE_TYPE = 1;
55 |
56 | // Table containing all sorted 4 bit partial fingerprints of length 4 (16 bits) by its index.
57 | private static final short[] SORTED_PARTIAL_FINGERPRINTS = computeSortedPartialFingerprints();
58 | // Inverse map of SORTED_PARTIAL_FINGERPRINTS.
59 | private static final ImmutableMap SORTED_PARTIAL_FINGERPRINTS_INDEX =
60 | computeSortedPartialFingerprintsIndex(SORTED_PARTIAL_FINGERPRINTS);
61 |
62 | private final CuckooFilterConfig.Size size;
63 | private final Random random;
64 | private final CuckooFilterArray cuckooFilterArray;
65 |
66 | /**
67 | * Creates a new uncompressed cuckoo filter table of the given size.
68 | *
69 | * Uses the given source of {@code random} to choose the replaced fingerprint in {@code
70 | * insertWithReplacement} method.
71 | */
72 | public SemiSortedCuckooFilterTable(CuckooFilterConfig.Size size, Random random) {
73 | this.size = size;
74 | checkArgument(
75 | size.bucketCapacity() == 4,
76 | "SemiSortedCuckooFilterTable only supports bucket capacity of 4.");
77 | checkArgument(
78 | size.fingerprintLength() >= 4,
79 | "SemiSortedCuckooFilterTable only supports fingerprint length >= 4.");
80 | this.random = random;
81 | // bucketCapacity == 4 and fingerprintLength <= 64, so we can assume that it will always fit
82 | // into a long.
83 | cuckooFilterArray =
84 | new CuckooFilterArray(
85 | (long) size.bucketCount() * size.bucketCapacity(), size.fingerprintLength() - 1);
86 | }
87 |
88 | /** Creates {@link SemiSortedCuckooFilterTable} from {@link SerializedCuckooFilterTable}. */
89 | public SemiSortedCuckooFilterTable(CuckooFilterConfig.Size size, byte[] bitArray, Random random) {
90 | this.size = size;
91 | this.random = random;
92 | cuckooFilterArray =
93 | new CuckooFilterArray(
94 | (long) size.bucketCount() * size.bucketCapacity(),
95 | size.fingerprintLength() - 1,
96 | bitArray);
97 | }
98 |
99 | @Override
100 | public Optional insertWithReplacement(int bucketIndex, long fingerprint) {
101 | long[] fingerprints = decodeBucket(bucketIndex);
102 | for (int i = 0; i < size.bucketCapacity(); i++) {
103 | if (fingerprints[i] == EMPTY_SLOT) {
104 | fingerprints[i] = fingerprint;
105 | encodeAndPut(bucketIndex, fingerprints);
106 | return Optional.empty();
107 | }
108 | }
109 |
110 | int replacedSlotIndex = random.nextInt(size.bucketCapacity());
111 | long replacedFingerprint = fingerprints[replacedSlotIndex];
112 | fingerprints[replacedSlotIndex] = fingerprint;
113 | encodeAndPut(bucketIndex, fingerprints);
114 | return Optional.of(replacedFingerprint);
115 | }
116 |
117 | @Override
118 | public boolean contains(int bucketIndex, long fingerprint) {
119 | long[] fingerprints = decodeBucket(bucketIndex);
120 | for (long fingerprintInBucket : fingerprints) {
121 | if (fingerprintInBucket == fingerprint) {
122 | return true;
123 | }
124 | }
125 | return false;
126 | }
127 |
128 | @Override
129 | public boolean delete(int bucketIndex, long fingerprint) {
130 | long[] fingerprints = decodeBucket(bucketIndex);
131 | for (int i = 0; i < fingerprints.length; i++) {
132 | if (fingerprints[i] == fingerprint) {
133 | fingerprints[i] = EMPTY_SLOT;
134 | encodeAndPut(bucketIndex, fingerprints);
135 | return true;
136 | }
137 | }
138 | return false;
139 | }
140 |
141 | @Override
142 | public boolean isFull(int bucketIndex) {
143 | return !contains(bucketIndex, CuckooFilterTable.EMPTY_SLOT);
144 | }
145 |
146 | @Override
147 | public CuckooFilterConfig.Size size() {
148 | return size;
149 | }
150 |
151 | @Override
152 | public SerializedCuckooFilterTable serialize() {
153 | byte[] serializedArray = cuckooFilterArray.toByteArray();
154 |
155 | // The first 16 bytes specifies the implementation type and the size of the table (defined by
156 | // tuple (type, bucketCount,
157 | // bucketCapacity, fingerprintLength)).
158 | // Rest is the bit array.
159 | ByteBuffer encoded = ByteBuffer.allocate(16 + serializedArray.length);
160 | return SerializedCuckooFilterTable.createFromByteArray(
161 | encoded
162 | .putInt(TABLE_TYPE)
163 | .putInt(size.bucketCount())
164 | .putInt(size.bucketCapacity())
165 | .putInt(size.fingerprintLength())
166 | .put(serializedArray)
167 | .array());
168 | }
169 |
170 | private long toArrayIndex(int bucketIndex, int slotIndex) {
171 | return (long) bucketIndex * size.bucketCapacity() + slotIndex;
172 | }
173 |
174 | // TODO: Check if encoding/decoding needs to be optimized.
175 |
176 | // Decodes fingerprints at bucketIndex.
177 | private long[] decodeBucket(int bucketIndex) {
178 | int encodedSortedPartialFingerintsIndex = 0;
179 | long[] fingerprintPrefixes = new long[size.bucketCapacity()];
180 | for (int i = 0; i < size.bucketCapacity(); i++) {
181 | long arrayIndex = toArrayIndex(bucketIndex, i);
182 | long n = cuckooFilterArray.getAsLong(arrayIndex);
183 | encodedSortedPartialFingerintsIndex <<= 3;
184 | encodedSortedPartialFingerintsIndex |= (int) (n & 0x7);
185 | fingerprintPrefixes[i] = n >>> 3;
186 | }
187 |
188 | int encodedSortedPartialFingerprints =
189 | SORTED_PARTIAL_FINGERPRINTS[encodedSortedPartialFingerintsIndex];
190 | long[] fingerprints = new long[size.bucketCapacity()];
191 | for (int i = size.bucketCapacity() - 1; i >= 0; i--) {
192 | fingerprints[i] = (fingerprintPrefixes[i] << 4) | (encodedSortedPartialFingerprints & 0xF);
193 | encodedSortedPartialFingerprints >>>= 4;
194 | }
195 | return fingerprints;
196 | }
197 |
198 | /**
199 | * Encode fingerprints and put them to bucketIndex.
200 | *
201 | * Encoding works as follows.
202 | *
203 | *
Suppose each fingerprint is logically f bits. First, sort the fingerprints by the least
204 | * significant 4 bits. Let's call the most significant f - 4 bits of the fingerprints as the
205 | * fingerprint prefixes. The least significant 4 bits of the fingerprints will be the partial
206 | * fingerprints, which will be encoded according to the SORTED_PARTIAL_FINGEPRRINTS_INDEX map as a
207 | * 12 bit value. Partition the encoded 12 bit value into four 3 bit chunks. Group each of the f -
208 | * 4 bit prefixes with each 3 bit chunk (f - 1 bits total) and insert it as a cuckoo filter array
209 | * element.
210 | */
211 | private void encodeAndPut(int bucketIndex, long[] fingerprints) {
212 | long[] fingerprintPrefixes = new long[size.bucketCapacity()];
213 | int[] partialFingerprints = new int[size.bucketCapacity()];
214 | for (int i = 0; i < size.bucketCapacity(); i++) {
215 | fingerprintPrefixes[i] = fingerprints[i] >>> 4;
216 | partialFingerprints[i] = (int) (fingerprints[i] & 0xF);
217 | }
218 | Integer[] indices = {0, 1, 2, 3};
219 | Arrays.sort(indices, comparingInt((Integer i) -> partialFingerprints[i]));
220 | short encodedSortedPartialFingerprints =
221 | (short)
222 | ((partialFingerprints[indices[0]] << 12)
223 | | (partialFingerprints[indices[1]] << 8)
224 | | (partialFingerprints[indices[2]] << 4)
225 | | partialFingerprints[indices[3]]);
226 | int encodedSortedPartialFingerprintsIndex =
227 | SORTED_PARTIAL_FINGERPRINTS_INDEX.get(encodedSortedPartialFingerprints);
228 | for (int i = size.bucketCapacity() - 1; i >= 0; i--) {
229 | long arrayIndex = toArrayIndex(bucketIndex, i);
230 | cuckooFilterArray.set(
231 | arrayIndex,
232 | (fingerprintPrefixes[indices[i]] << 3) | (encodedSortedPartialFingerprintsIndex & 0x7));
233 | encodedSortedPartialFingerprintsIndex >>>= 3;
234 | }
235 | }
236 |
237 | private static short[] computeSortedPartialFingerprints() {
238 | // (2^4 + 3 choose 4) = 3876 counts the number of multisets of size 4, with each element in
239 | // [0, 16).
240 | short[] sortedPartialFingerprints = new short[3876];
241 |
242 | final short fingerprintUpperBound = 16;
243 |
244 | int i = 0;
245 | for (short a = 0; a < fingerprintUpperBound; a++) {
246 | for (short b = a; b < fingerprintUpperBound; b++) {
247 | for (short c = b; c < fingerprintUpperBound; c++) {
248 | for (short d = c; d < fingerprintUpperBound; d++) {
249 | sortedPartialFingerprints[i] = (short) ((a << 12) | (b << 8) | (c << 4) | d);
250 | i++;
251 | }
252 | }
253 | }
254 | }
255 | return sortedPartialFingerprints;
256 | }
257 |
258 | private static ImmutableMap computeSortedPartialFingerprintsIndex(
259 | short[] sortedPartialFingerprints) {
260 | ImmutableMap.Builder map = ImmutableMap.builder();
261 | for (short i = 0; i < sortedPartialFingerprints.length; i++) {
262 | map.put(sortedPartialFingerprints[i], i);
263 | }
264 | return map.buildOrThrow();
265 | }
266 | }
267 |
--------------------------------------------------------------------------------
/setfilters/src/com/google/setfilters/cuckoofilter/SerializedCuckooFilterTable.java:
--------------------------------------------------------------------------------
1 | // Copyright 2022 Google LLC
2 | //
3 | // Licensed under the Apache License, Version 2.0 (the "License");
4 | // you may not use this file except in compliance with the License.
5 | // You may obtain a copy of the License at
6 | //
7 | // https://www.apache.org/licenses/LICENSE-2.0
8 | //
9 | // Unless required by applicable law or agreed to in writing, software
10 | // distributed under the License is distributed on an "AS IS" BASIS,
11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | // See the License for the specific language governing permissions and
13 | // limitations under the License.
14 |
15 | package com.google.setfilters.cuckoofilter;
16 |
17 | import java.util.Arrays;
18 |
/**
 * Immutable holder for the raw bytes of a serialized {@link CuckooFilterTable}.
 *
 * <p>The bytes are copied both when an instance is created and when they are read back, so neither
 * the caller's input array nor any returned array aliases the internal state.
 */
public final class SerializedCuckooFilterTable {
  private final byte[] bytes;

  private SerializedCuckooFilterTable(byte[] bytes) {
    this.bytes = bytes;
  }

  /** Creates serialization from raw byte array; the array is copied, not retained. */
  public static SerializedCuckooFilterTable createFromByteArray(byte[] byteArray) {
    byte[] privateCopy = byteArray.clone();
    return new SerializedCuckooFilterTable(privateCopy);
  }

  /** Returns the serialization as a freshly allocated byte array. */
  public byte[] asByteArray() {
    return bytes.clone();
  }

  // TODO: Add other methods like asJSON();
}
39 |
--------------------------------------------------------------------------------
/setfilters/src/com/google/setfilters/cuckoofilter/UncompressedCuckooFilterTable.java:
--------------------------------------------------------------------------------
1 | // Copyright 2022 Google LLC
2 | //
3 | // Licensed under the Apache License, Version 2.0 (the "License");
4 | // you may not use this file except in compliance with the License.
5 | // You may obtain a copy of the License at
6 | //
7 | // https://www.apache.org/licenses/LICENSE-2.0
8 | //
9 | // Unless required by applicable law or agreed to in writing, software
10 | // distributed under the License is distributed on an "AS IS" BASIS,
11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | // See the License for the specific language governing permissions and
13 | // limitations under the License.
14 |
15 | package com.google.setfilters.cuckoofilter;
16 |
17 | import java.nio.ByteBuffer;
18 | import java.util.Optional;
19 | import java.util.Random;
20 |
21 | /**
22 | * Implementation of the {@link CuckooFilterTable} that doesn't use the semi-sorting bucket
23 | * compression scheme in the original paper by Fan et al
24 | * (https://www.cs.cmu.edu/~dga/papers/cuckoo-conext2014.pdf) - see section 5.2 for what
25 | * semi-sorting bucket compression scheme is.
26 | *
27 | * Thus, if a bucket can hold up to bucketCapacity number of fingerprints and each fingerprint is
28 | * of length fingerprintLength bits, it takes bucketCapacity * fingerprintLength bits to represent
29 | * each bucket.
30 | */
31 | final class UncompressedCuckooFilterTable implements CuckooFilterTable {
32 | // Implementation type of the table, to be encoded in the serialization.
33 | public static final int TABLE_TYPE = 0;
34 |
35 | private final CuckooFilterConfig.Size size;
36 | private final Random random;
37 | private final CuckooFilterArray cuckooFilterArray;
38 |
39 | /**
40 | * Creates a new uncompressed cuckoo filter table of the given size.
41 | *
42 | *
Uses the given source of {@code random} to choose the replaced fingerprint in {@code
43 | * insertWithReplacement} method.
44 | */
45 | public UncompressedCuckooFilterTable(CuckooFilterConfig.Size size, Random random) {
46 | this.size = size;
47 | this.random = random;
48 | // bucketCapacity <= 128 and fingerprintLength <= 64, so we can assume that it will always fit
49 | // into a long.
50 | cuckooFilterArray =
51 | new CuckooFilterArray(
52 | (long) size.bucketCount() * size.bucketCapacity(), size.fingerprintLength());
53 | }
54 |
55 | /** Creates {@link UncompressedCuckooFilterTable} from {@link SerializedCuckooFilterTable}. */
56 | public UncompressedCuckooFilterTable(
57 | CuckooFilterConfig.Size size, byte[] bitArray, Random random) {
58 | this.size = size;
59 | this.random = random;
60 | cuckooFilterArray =
61 | new CuckooFilterArray(
62 | (long) size.bucketCount() * size.bucketCapacity(), size.fingerprintLength(), bitArray);
63 | }
64 |
65 | @Override
66 | public Optional insertWithReplacement(int bucketIndex, long fingerprint) {
67 | for (int slotIndex = 0; slotIndex < size.bucketCapacity(); slotIndex++) {
68 | long arrayIndex = toArrayIndex(bucketIndex, slotIndex);
69 | if (cuckooFilterArray.getAsLong(arrayIndex) == CuckooFilterTable.EMPTY_SLOT) {
70 | cuckooFilterArray.set(arrayIndex, fingerprint);
71 | return Optional.empty();
72 | }
73 | }
74 | int replacedSlotIndex = random.nextInt(size.bucketCapacity());
75 | long replacedArrayIndex = toArrayIndex(bucketIndex, replacedSlotIndex);
76 | long replacedFingerprint = cuckooFilterArray.getAsLong(replacedArrayIndex);
77 | cuckooFilterArray.set(replacedArrayIndex, fingerprint);
78 | return Optional.of(replacedFingerprint);
79 | }
80 |
81 | @Override
82 | public boolean contains(int bucketIndex, long fingerprint) {
83 | for (int slotIndex = 0; slotIndex < size.bucketCapacity(); slotIndex++) {
84 | long arrayIndex = toArrayIndex(bucketIndex, slotIndex);
85 | if (cuckooFilterArray.getAsLong(arrayIndex) == fingerprint) {
86 | return true;
87 | }
88 | }
89 | return false;
90 | }
91 |
92 | @Override
93 | public boolean delete(int bucketIndex, long fingerprint) {
94 | for (int slotIndex = 0; slotIndex < size.bucketCapacity(); slotIndex++) {
95 | long arrayIndex = toArrayIndex(bucketIndex, slotIndex);
96 | if (cuckooFilterArray.getAsLong(arrayIndex) == fingerprint) {
97 | cuckooFilterArray.set(arrayIndex, CuckooFilterTable.EMPTY_SLOT);
98 | return true;
99 | }
100 | }
101 | return false;
102 | }
103 |
104 | @Override
105 | public boolean isFull(int bucketIndex) {
106 | return !contains(bucketIndex, CuckooFilterTable.EMPTY_SLOT);
107 | }
108 |
109 | @Override
110 | public CuckooFilterConfig.Size size() {
111 | return size;
112 | }
113 |
114 | @Override
115 | public SerializedCuckooFilterTable serialize() {
116 | byte[] serializedArray = cuckooFilterArray.toByteArray();
117 |
118 | // The first 16 bytes specifies the implementation type and the size of the table (defined by
119 | // tuple (type, bucketCount,
120 | // bucketCapacity, fingerprintLength)).
121 | // Rest is the bit array.
122 | ByteBuffer encoded = ByteBuffer.allocate(16 + serializedArray.length);
123 | return SerializedCuckooFilterTable.createFromByteArray(
124 | encoded
125 | .putInt(TABLE_TYPE)
126 | .putInt(size.bucketCount())
127 | .putInt(size.bucketCapacity())
128 | .putInt(size.fingerprintLength())
129 | .put(serializedArray)
130 | .array());
131 | }
132 |
133 | private long toArrayIndex(int bucketIndex, int slotIndex) {
134 | return (long) bucketIndex * size.bucketCapacity() + slotIndex;
135 | }
136 | }
137 |
--------------------------------------------------------------------------------