├── src └── com │ └── ifesdjeen │ └── blomstre │ ├── Converters.java │ ├── ConcurrentBitSet.java │ ├── BloomFilter.java │ ├── BloomCalculations.java │ └── MurmurHash.java ├── README.md ├── pom.xml └── test └── com └── ifesdjeen └── blomstre └── BloomFilterTest.java /src/com/ifesdjeen/blomstre/Converters.java: -------------------------------------------------------------------------------- 1 | package com.ifesdjeen.blomstre; 2 | 3 | import java.nio.ByteBuffer; 4 | import java.util.function.Function; 5 | 6 | public class Converters { 7 | static class IntToByteBuffer implements Function { 8 | @Override 9 | public ByteBuffer apply(Integer data) { 10 | ByteBuffer bb = ByteBuffer.allocateDirect(4); 11 | bb.putInt(data); 12 | return bb; 13 | } 14 | } 15 | 16 | static class LongToByteBuffer implements Function { 17 | @Override 18 | public ByteBuffer apply(Long data) { 19 | ByteBuffer bb = ByteBuffer.allocateDirect(8); 20 | bb.putLong(data); 21 | return bb; 22 | } 23 | } 24 | 25 | static class StringToByteBuffer implements Function { 26 | @Override 27 | public ByteBuffer apply(String data) { 28 | byte[] bytes = data.getBytes(); 29 | ByteBuffer bb = ByteBuffer.allocateDirect(bytes.length); 30 | bb.put(bytes); 31 | return bb; 32 | } 33 | } 34 | 35 | public static IntToByteBuffer intToByteBufferConverter; 36 | public static LongToByteBuffer longToByteBufferConverter; 37 | public static StringToByteBuffer stringToByteBufferConverter; 38 | 39 | static { 40 | intToByteBufferConverter = new IntToByteBuffer(); 41 | longToByteBufferConverter = new LongToByteBuffer(); 42 | stringToByteBufferConverter = new StringToByteBuffer(); 43 | } 44 | } 45 | -------------------------------------------------------------------------------- /src/com/ifesdjeen/blomstre/ConcurrentBitSet.java: -------------------------------------------------------------------------------- 1 | package com.ifesdjeen.blomstre; 2 | 3 | import java.util.concurrent.atomic.AtomicInteger; 4 | import 
java.util.concurrent.atomic.AtomicLongArray; 5 | 6 | public class ConcurrentBitSet { 7 | 8 | /** 9 | * STATE 10 | */ 11 | 12 | private static final int BASE = 64; 13 | private static final long MAX_UNSIGNED_LONG = -1L; 14 | 15 | private final AtomicLongArray buckets; 16 | 17 | public ConcurrentBitSet(long bitsCount) { 18 | int bucketsCount = (int) bitsCount / BASE; 19 | this.buckets = new AtomicLongArray(bucketsCount); 20 | 21 | for (int i = 0; i < buckets.length(); i++) { 22 | this.buckets.set(i, 0); 23 | } 24 | } 25 | 26 | /** 27 | * API 28 | */ 29 | 30 | public void set(long idx) { 31 | final int bucketIdx = (int) idx / BASE; 32 | atomicSet(bucketIdx, (int) idx - (BASE * bucketIdx)); 33 | } 34 | 35 | public boolean get(long idx) { 36 | final int bucketIdx = (int) idx / BASE; 37 | return atomicGet(bucketIdx, (int) idx - (BASE * bucketIdx)); 38 | } 39 | 40 | public void clear() { 41 | throw new RuntimeException("not implemented"); 42 | } 43 | 44 | public long capacity() { 45 | return this.buckets.length() * 64; 46 | } 47 | 48 | /** 49 | * IMLEMENTATION 50 | */ 51 | 52 | private boolean atomicGet(int bucketIdx, int toGet) { 53 | final long l = buckets.get(bucketIdx); 54 | final long idxMask = mask(toGet); 55 | return (l & idxMask) == idxMask; 56 | } 57 | 58 | private void atomicSet(int bucketIdx, int toSet) { 59 | while (true) { 60 | final long l = buckets.get(bucketIdx); 61 | 62 | if (buckets.compareAndSet(bucketIdx, l, l | mask(toSet))) 63 | return; 64 | } 65 | } 66 | 67 | private static long mask(int id) { 68 | return 1L << id; 69 | } 70 | 71 | public String longToBinaryStr(long num) { 72 | StringBuilder stringBuilder = new StringBuilder(); 73 | for(int i = 0; i < BASE; i++) { 74 | final long idxMask = mask(i); 75 | stringBuilder.append( (num & idxMask) == idxMask ? 
"1" : "0" ); 76 | } 77 | 78 | return stringBuilder.toString(); 79 | } 80 | } 81 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Concurrent / Thread Safe BitSet and Bloom filter 2 | 3 | This is a Concurrent / Thread Safe implementation of the [Bloom Filter](https://en.wikipedia.org/wiki/Bloom_filter) 4 | data structure. Bloom Filter allows you to store an approximate set presence information with constant space 5 | guarantees. It can give a precise answer only to the question "which items are _not_ present in the set". 6 | Positive answer to the reverse question ("which items _are_ present in the set") is approximate, which 7 | means that even if you got the positive answer, the item still might be not in the set 8 | 9 | Bloom Filters are implemented very simply. You use multiple hash functions and a bitset. Results of each hash 10 | function are used to set the bits in the bloom filter. When checking the filter, same hash values are taken 11 | and bits at corresponding positions are collected. If all bits on positions yielded by hash functions 12 | were set, we say that element is possibly in the set. Otherwise, if at least one of the bits is unset, 13 | the answer is element is definitely not in the set. 14 | 15 | Hash collisions / intersections result into the false positives: if hash function yields same result for two 16 | values, there's no way to distinguish which exactly value was meant. Given enough collisions all bits will 17 | be set and all queries will return "probably in the set", so make sure you create a sufficiently large set. 18 | 19 | You can find a very good, illustrative description of the Bloom Filter [here](https://www.jasondavies.com/bloomfilter/). 
20 | 21 | To include the dependency from Maven Central, add: 22 | 23 | ```xml 24 | <dependency> 25 | <groupId>com.github.ifesdjeen</groupId> 26 | <artifactId>blomstre</artifactId> 27 | <version>1.0.0-RC1</version> 28 | </dependency> 29 | ``` 30 | 31 | You can also find it on [clojars](https://clojars.org/com.github.ifesdjeen/blomstre). 32 | 33 | # Usage 34 | 35 | ```java 36 | import com.ifesdjeen.blomstre.BloomFilter; 37 | 38 | // Make a filter that accepts Strings; the converter turns each item into a byte buffer for hashing 39 | // Maximum number of elements is 6000, max acceptable false positive probability is 0.0001 40 | BloomFilter<String> filter = BloomFilter.makeFilter(Converters.stringToByteBufferConverter, 6000, 0.0001); 41 | 42 | // Add an item to the filter 43 | filter.add("abcdef"); 44 | 45 | filter.isPresent("abcdef"); 46 | // => "probably" true 47 | ``` 48 | 49 | # Thread Safety 50 | 51 | Thread safety is achieved by using the concurrent `BitSet`, which is backed by `AtomicLongArray`. That 52 | means that write operations may sometimes retry, but they will never affect reads or interfere with 53 | other writes in an unpredictable way. 54 | 55 | # Copyright / License 56 | 57 | Licensed under Apache 2.0 License. 58 | 59 | Original `Bloom Filter` can be found [here](https://github.com/apache/cassandra/blob/trunk/src/java/org/apache/cassandra/utils/BloomFilter.java) 60 | and its license [here](https://github.com/apache/cassandra/blob/trunk/LICENSE.txt). 61 | -------------------------------------------------------------------------------- /src/com/ifesdjeen/blomstre/BloomFilter.java: -------------------------------------------------------------------------------- 1 | package com.ifesdjeen.blomstre; 2 | 3 | /* 4 | * !!! THIS SOURCE FILE WAS ORIGINALLY TAKEN FROM APACHE CASSANDRA SOURCE !!! 5 | * 6 | * Licensed to the Apache Software Foundation (ASF) under one 7 | * or more contributor license agreements. See the NOTICE file 8 | * distributed with this work for additional information 9 | * regarding copyright ownership. 
The ASF licenses this file 10 | * to you under the Apache License, Version 2.0 (the 11 | * "License"); you may not use this file except in compliance 12 | * with the License. You may obtain a copy of the License at 13 | * 14 | * http://www.apache.org/licenses/LICENSE-2.0 15 | * 16 | * Unless required by applicable law or agreed to in writing, software 17 | * distributed under the License is distributed on an "AS IS" BASIS, 18 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 19 | * See the License for the specific language governing permissions and 20 | * limitations under the License. 21 | */ 22 | 23 | import java.nio.ByteBuffer; 24 | import java.util.function.Function; 25 | 26 | public class BloomFilter { 27 | private static final long BITSET_EXCESS = 20; 28 | 29 | private final Function converter; 30 | public final ConcurrentBitSet bitset; 31 | public final int hashCount; 32 | 33 | BloomFilter(Function converter, 34 | int hashes, 35 | ConcurrentBitSet bitset 36 | ) { 37 | this.converter = converter; 38 | this.hashCount = hashes; 39 | this.bitset = bitset; 40 | } 41 | 42 | private long[] getHashBuckets(ByteBuffer key) { 43 | return getHashBuckets(key, hashCount, bitset.capacity()); 44 | } 45 | 46 | protected long[] hash(ByteBuffer b, 47 | long seed) { 48 | return MurmurHash.hash3_x64_128(b, 0, b.capacity(), seed); 49 | } 50 | 51 | // Murmur is faster than an SHA-based approach and provides as-good collision 52 | // resistance. The combinatorial generation approach described in 53 | // https://www.eecs.harvard.edu/~michaelm/postscripts/tr-02-05.pdf 54 | // does prove to work in actual tests, and is obviously faster 55 | // than performing further iterations of murmur. 
56 | long[] getHashBuckets(ByteBuffer b, 57 | int hashCount, 58 | long max) { 59 | final long[] result = new long[hashCount]; 60 | final long[] hash = this.hash(b, 0L); 61 | for (int i = 0; i < hashCount; ++i) { 62 | result[i] = Math.abs((hash[0] + (long) i * hash[1]) % max); 63 | } 64 | return result; 65 | } 66 | 67 | public void add(T key) { 68 | add(converter.apply(key)); 69 | } 70 | 71 | protected void add(ByteBuffer key) { 72 | for (long bucketIndex : getHashBuckets(key)) { 73 | bitset.set(bucketIndex); 74 | } 75 | } 76 | 77 | public boolean isPresent(T key) { 78 | return isPresent(converter.apply(key)); 79 | } 80 | 81 | protected boolean isPresent(ByteBuffer key) { 82 | for (long bucketIndex : getHashBuckets(key)) { 83 | if (!bitset.get(bucketIndex)) { 84 | return false; 85 | } 86 | } 87 | return true; 88 | } 89 | 90 | public void clear() { 91 | bitset.clear(); 92 | } 93 | 94 | public static BloomFilter makeFilter(Function converter, 95 | int numElements, 96 | double maxFalsePosProbability) { 97 | int maxBucketsPerElement = BloomCalculations.maxBucketsPerElement(numElements); 98 | BloomCalculations.BloomSpecification spec = BloomCalculations.computeBloomSpec(maxBucketsPerElement, 99 | maxFalsePosProbability); 100 | 101 | long numBits = (numElements * spec.bucketsPerElement) + BITSET_EXCESS; 102 | return new BloomFilter(converter, spec.K, new ConcurrentBitSet(numBits)); 103 | } 104 | } 105 | -------------------------------------------------------------------------------- /pom.xml: -------------------------------------------------------------------------------- 1 | 2 | 4 | 5 | 4.0.0 6 | 7 | com.github.ifesdjeen 8 | blomstre 9 | jar 10 | 1.0.0-RC1 11 | Concurrent Bloom Filter 12 | 13 | 14 | 15 | Apache Public License 16 | https://opensource.org/licenses/Apache-2.0 17 | 18 | 19 | 20 | 21 | 1.8 22 | ossrh 23 | https://oss.sonatype.org/content/repositories/snapshots 24 | ossrh 25 | https://oss.sonatype.org/content/repositories/releases 26 | 27 | 28 | 29 | src/ 30 | 
test/ 31 | 32 | 33 | resources 34 | 35 | 36 | 37 | 38 | dev-resources 39 | 40 | 41 | resources 42 | 43 | 44 | target 45 | target/classes 46 | 47 | 48 | 49 | 50 | org.apache.maven.plugins 51 | maven-compiler-plugin 52 | 3.1 53 | 54 | ${javac.target} 55 | ${javac.target} 56 | ${javac.target} 57 | 58 | 59 | 60 | 61 | org.apache.maven.plugins 62 | maven-surefire-plugin 63 | 2.18.1 64 | 65 | 66 | **/*Test*.java 67 | 68 | random 69 | 70 | 71 | 72 | 73 | 74 | 75 | 76 | 77 | ${snapshotsRepositoryId} 78 | ${snapshotsRepositoryUrl} 79 | 80 | 81 | ${releasesRepositoryId} 82 | ${releasesRepositoryUrl} 83 | 84 | 85 | 86 | 87 | scm:git:git@github.com:ifesdjeen/blomstre.git 88 | scm:git:git@github.com:ifesdjeen/blomstre.git 89 | git@github.com:ifesdjeen/blomstre.git 90 | HEAD 91 | 92 | 93 | 94 | 95 | junit 96 | junit 97 | 4.12 98 | test 99 | 100 | 101 | org.hamcrest 102 | hamcrest-all 103 | 1.3 104 | test 105 | 106 | 107 | 108 | 109 | 110 | clojars 111 | 112 | 113 | env 114 | clojars 115 | 116 | 117 | 118 | clojars 119 | https://clojars.org/repo 120 | clojars 121 | https://clojars.org/repo 122 | 123 | 124 | 125 | 126 | -------------------------------------------------------------------------------- /test/com/ifesdjeen/blomstre/BloomFilterTest.java: -------------------------------------------------------------------------------- 1 | package com.ifesdjeen.blomstre; 2 | 3 | import org.junit.Test; 4 | 5 | import java.util.concurrent.CountDownLatch; 6 | import java.util.concurrent.ExecutorService; 7 | import java.util.concurrent.Executors; 8 | 9 | import static org.junit.Assert.assertFalse; 10 | import static org.junit.Assert.assertTrue; 11 | 12 | public class BloomFilterTest { 13 | 14 | @Test 15 | public void bloomFilter0Test() { 16 | BloomFilter filter = BloomFilter.makeFilter(Converters.stringToByteBufferConverter, 6000, 0.0001); 17 | 18 | int i = 0; 19 | filter.add(String.format("%d.%d.%d.%d", 20 | i, 21 | i, 22 | i, 23 | i)); 24 | 25 | 
assertTrue(filter.isPresent(String.format("%d.%d.%d.%d", 26 | i, 27 | i, 28 | i, 29 | i))); 30 | 31 | 32 | } 33 | 34 | 35 | @Test 36 | public void bloomFilter1Test() { 37 | BloomFilter filter = BloomFilter.makeFilter(Converters.stringToByteBufferConverter, 6000, 0.0001); 38 | 39 | int i = 1; 40 | filter.add(String.format("%d.%d.%d.%d", 41 | i, 42 | i, 43 | i, 44 | i)); 45 | 46 | assertTrue(filter.isPresent(String.format("%d.%d.%d.%d", 47 | i, 48 | i, 49 | i, 50 | i))); 51 | 52 | 53 | } 54 | 55 | @Test 56 | public void bloomFilterTest() { 57 | BloomFilter filter = BloomFilter.makeFilter(Converters.stringToByteBufferConverter, 6000, 0.0001); 58 | 59 | for(int i = 0; i < 1000; i++) { 60 | filter.add(String.format("%d.%d.%d.%d", 61 | i, 62 | i, 63 | i, 64 | i)); 65 | } 66 | 67 | for(int i = 0; i < 1000; i++) { 68 | assertTrue(filter.isPresent(String.format("%d.%d.%d.%d", 69 | i, 70 | i, 71 | i, 72 | i))); 73 | } 74 | 75 | for(int i = 1000; i < 5000; i++) { 76 | assertFalse(filter.isPresent(String.format("%d.%d.%d.%d", 77 | i, 78 | i, 79 | i, 80 | i))); 81 | } 82 | } 83 | 84 | @Test 85 | public void concurrentBloomFilterTest() throws InterruptedException { 86 | final BloomFilter filter = BloomFilter.makeFilter(Converters.stringToByteBufferConverter, 6000, 0.01); 87 | 88 | final int threads = 10; 89 | final CountDownLatch latch = new CountDownLatch(threads); 90 | final CountDownLatch finishedLatch = new CountDownLatch(threads); 91 | 92 | for(int thread = 0; thread < threads; thread++) { 93 | final int finalThread = thread; 94 | new Thread(new Runnable() { 95 | @Override 96 | public void run() { 97 | final int localFinalThread = finalThread; 98 | latch.countDown(); 99 | try { 100 | latch.await(); 101 | Thread.sleep(1000); 102 | } catch (InterruptedException e) { 103 | e.printStackTrace(); 104 | } 105 | for (int i = 0; i < 100; i++) { 106 | final int finalI = localFinalThread * 100 + i; 107 | filter.add(String.format("%d.%d.%d.%d", 108 | finalI, 109 | finalI, 110 | finalI, 
111 | finalI)); 112 | } 113 | finishedLatch.countDown(); 114 | } 115 | }).start(); 116 | } 117 | 118 | finishedLatch.await(); 119 | 120 | for(int i = 0; i < 1000; i++) { 121 | assertTrue(filter.isPresent(String.format("%d.%d.%d.%d", 122 | i, 123 | i, 124 | i, 125 | i))); 126 | } 127 | 128 | 129 | } 130 | 131 | } 132 | -------------------------------------------------------------------------------- /src/com/ifesdjeen/blomstre/BloomCalculations.java: -------------------------------------------------------------------------------- 1 | package com.ifesdjeen.blomstre; 2 | 3 | /* 4 | * !!! THIS SOURCE FILE WAS TAKEN FROM APACHE CASSANDRA SOURCE !!! 5 | * 6 | * Licensed to the Apache Software Foundation (ASF) under one 7 | * or more contributor license agreements. See the NOTICE file 8 | * distributed with this work for additional information 9 | * regarding copyright ownership. The ASF licenses this file 10 | * to you under the Apache License, Version 2.0 (the 11 | * "License"); you may not use this file except in compliance 12 | * with the License. You may obtain a copy of the License at 13 | * 14 | * http://www.apache.org/licenses/LICENSE-2.0 15 | * 16 | * Unless required by applicable law or agreed to in writing, software 17 | * distributed under the License is distributed on an "AS IS" BASIS, 18 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 19 | * See the License for the specific language governing permissions and 20 | * limitations under the License. 21 | */ 22 | 23 | /** 24 | * The following calculations are taken from: 25 | * http://www.cs.wisc.edu/~cao/papers/summary-cache/node8.html 26 | * "Bloom Filters - the math" 27 | *

28 | * This class's static methods are meant to facilitate the use of the Bloom 29 | * Filter class by helping to choose correct values of 'bits per element' and 30 | * 'number of hash functions, k'. 31 | */ 32 | public class BloomCalculations { 33 | 34 | private static final int minBuckets = 2; 35 | private static final int minK = 1; 36 | 37 | private static final int EXCESS = 20; 38 | 39 | /** 40 | * In the following table, the row 'i' shows false positive rates if i buckets 41 | * per element are used. Column 'j' shows false positive rates if j hash 42 | * functions are used. The first row is 'i=0', the first column is 'j=0'. 43 | * Each cell (i,j) the false positive rate determined by using i buckets per 44 | * element and j hash functions. 45 | */ 46 | static final double[][] probs = new double[][]{ 47 | {1.0}, // dummy row representing 0 buckets per element 48 | {1.0, 1.0}, // dummy row representing 1 buckets per element 49 | {1.0, 0.393, 0.400}, 50 | {1.0, 0.283, 0.237, 0.253}, 51 | {1.0, 0.221, 0.155, 0.147, 0.160}, 52 | {1.0, 0.181, 0.109, 0.092, 0.092, 0.101}, // 5 53 | {1.0, 0.154, 0.0804, 0.0609, 0.0561, 0.0578, 0.0638}, 54 | {1.0, 0.133, 0.0618, 0.0423, 0.0359, 0.0347, 0.0364}, 55 | {1.0, 0.118, 0.0489, 0.0306, 0.024, 0.0217, 0.0216, 0.0229}, 56 | {1.0, 0.105, 0.0397, 0.0228, 0.0166, 0.0141, 0.0133, 0.0135, 0.0145}, 57 | {1.0, 0.0952, 0.0329, 0.0174, 0.0118, 0.00943, 0.00844, 0.00819, 0.00846}, // 10 58 | {1.0, 0.0869, 0.0276, 0.0136, 0.00864, 0.0065, 0.00552, 0.00513, 0.00509}, 59 | {1.0, 0.08, 0.0236, 0.0108, 0.00646, 0.00459, 0.00371, 0.00329, 0.00314}, 60 | {1.0, 0.074, 0.0203, 0.00875, 0.00492, 0.00332, 0.00255, 0.00217, 0.00199, 0.00194}, 61 | {1.0, 0.0689, 0.0177, 0.00718, 0.00381, 0.00244, 0.00179, 0.00146, 0.00129, 0.00121, 0.0012}, 62 | {1.0, 0.0645, 0.0156, 0.00596, 0.003, 0.00183, 0.00128, 0.001, 0.000852, 0.000775, 0.000744}, // 15 63 | {1.0, 0.0606, 0.0138, 0.005, 0.00239, 0.00139, 0.000935, 0.000702, 0.000574, 0.000505, 0.00047, 
0.000459}, 64 | {1.0, 0.0571, 0.0123, 0.00423, 0.00193, 0.00107, 0.000692, 0.000499, 0.000394, 0.000335, 0.000302, 0.000287, 0.000284}, 65 | {1.0, 0.054, 0.0111, 0.00362, 0.00158, 0.000839, 0.000519, 0.00036, 0.000275, 0.000226, 0.000198, 0.000183, 0.000176}, 66 | {1.0, 0.0513, 0.00998, 0.00312, 0.0013, 0.000663, 0.000394, 0.000264, 0.000194, 0.000155, 0.000132, 0.000118, 0.000111, 0.000109}, 67 | {1.0, 0.0488, 0.00906, 0.0027, 0.00108, 0.00053, 0.000303, 0.000196, 0.00014, 0.000108, 8.89e-05, 7.77e-05, 7.12e-05, 6.79e-05, 6.71e-05} // 20 68 | }; // the first column is a dummy column representing K=0. 69 | 70 | /** 71 | * The optimal number of hashes for a given number of bits per element. 72 | * These values are automatically calculated from the data above. 73 | */ 74 | private static final int[] optKPerBuckets = new int[probs.length]; 75 | 76 | static { 77 | for (int i = 0; i < probs.length; i++) { 78 | double min = Double.MAX_VALUE; 79 | double[] prob = probs[i]; 80 | for (int j = 0; j < prob.length; j++) { 81 | if (prob[j] < min) { 82 | min = prob[j]; 83 | optKPerBuckets[i] = Math.max(minK, j); 84 | } 85 | } 86 | } 87 | } 88 | 89 | /** 90 | * Given the number of buckets that can be used per element, return a 91 | * specification that minimizes the false positive rate. 92 | * 93 | * @param bucketsPerElement The number of buckets per element for the filter. 94 | * @return A spec that minimizes the false positive rate. 95 | */ 96 | public static BloomSpecification computeBloomSpec(int bucketsPerElement) { 97 | assert bucketsPerElement >= 1; 98 | assert bucketsPerElement <= probs.length - 1; 99 | return new BloomSpecification(optKPerBuckets[bucketsPerElement], bucketsPerElement); 100 | } 101 | 102 | /** 103 | * A wrapper class that holds two key parameters for a Bloom Filter: the 104 | * number of hash functions used, and the number of buckets per element used. 
105 | */ 106 | public static class BloomSpecification { 107 | public final int K; // number of hash functions. 108 | public final int bucketsPerElement; 109 | 110 | public BloomSpecification(int k, 111 | int bucketsPerElement) { 112 | K = k; 113 | this.bucketsPerElement = bucketsPerElement; 114 | } 115 | 116 | public String toString() { 117 | return String.format("BloomSpecification(K=%d, bucketsPerElement=%d)", K, bucketsPerElement); 118 | } 119 | } 120 | 121 | /** 122 | * Given a maximum tolerable false positive probability, compute a Bloom 123 | * specification which will give less than the specified false positive rate, 124 | * but minimize the number of buckets per element and the number of hash 125 | * functions used. Because bandwidth (and therefore total bitvector size) 126 | * is considered more expensive than computing power, preference is given 127 | * to minimizing buckets per element rather than number of hash functions. 128 | * 129 | * @param maxBucketsPerElement The maximum number of buckets available for the filter. 130 | * @param maxFalsePosProb The maximum tolerable false positive rate. 
131 | * @return A Bloom Specification which would result in a false positive rate 132 | * less than specified by the function call 133 | * @throws UnsupportedOperationException if a filter satisfying the parameters cannot be met 134 | */ 135 | public static BloomSpecification computeBloomSpec(int maxBucketsPerElement, 136 | double maxFalsePosProb) { 137 | assert maxBucketsPerElement >= 1; 138 | assert maxBucketsPerElement <= probs.length - 1; 139 | int maxK = probs[maxBucketsPerElement].length - 1; 140 | 141 | // Handle the trivial cases 142 | if (maxFalsePosProb >= probs[minBuckets][minK]) { 143 | return new BloomSpecification(2, optKPerBuckets[2]); 144 | } 145 | if (maxFalsePosProb < probs[maxBucketsPerElement][maxK]) { 146 | throw new UnsupportedOperationException(String.format("Unable to satisfy %s with %s buckets per element", 147 | maxFalsePosProb, maxBucketsPerElement)); 148 | } 149 | 150 | // First find the minimal required number of buckets: 151 | int bucketsPerElement = 2; 152 | int K = optKPerBuckets[2]; 153 | while (probs[bucketsPerElement][K] > maxFalsePosProb) { 154 | bucketsPerElement++; 155 | K = optKPerBuckets[bucketsPerElement]; 156 | } 157 | // Now that the number of buckets is sufficient, see if we can relax K 158 | // without losing too much precision. 159 | while (probs[bucketsPerElement][K - 1] <= maxFalsePosProb) { 160 | K--; 161 | } 162 | 163 | return new BloomSpecification(K, bucketsPerElement); 164 | } 165 | 166 | /** 167 | * Calculates the maximum number of buckets per element that this implementation 168 | * can support. Crucially, it will lower the bucket count if necessary to meet 169 | * BitSet's size restrictions. 
170 | */ 171 | public static int maxBucketsPerElement(long numElements) { 172 | numElements = Math.max(1, numElements); 173 | double v = (Long.MAX_VALUE - EXCESS) / (double) numElements; 174 | if (v < 1.0) { 175 | throw new UnsupportedOperationException("Cannot compute probabilities for " + numElements + " elements."); 176 | } 177 | return Math.min(BloomCalculations.probs.length - 1, (int) v); 178 | } 179 | } 180 | -------------------------------------------------------------------------------- /src/com/ifesdjeen/blomstre/MurmurHash.java: -------------------------------------------------------------------------------- 1 | package com.ifesdjeen.blomstre; 2 | 3 | /* 4 | * !!! THIS SOURCE FILE WAS TAKEN FROM APACHE CASSANDRA SOURCE !!! 5 | * 6 | * Licensed to the Apache Software Foundation (ASF) under one 7 | * or more contributor license agreements. See the NOTICE file 8 | * distributed with this work for additional information 9 | * regarding copyright ownership. The ASF licenses this file 10 | * to you under the Apache License, Version 2.0 (the 11 | * "License"); you may not use this file except in compliance 12 | * with the License. You may obtain a copy of the License at 13 | * 14 | * http://www.apache.org/licenses/LICENSE-2.0 15 | * 16 | * Unless required by applicable law or agreed to in writing, software 17 | * distributed under the License is distributed on an "AS IS" BASIS, 18 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 19 | * See the License for the specific language governing permissions and 20 | * limitations under the License. 21 | */ 22 | 23 | import java.nio.ByteBuffer; 24 | 25 | /** 26 | * This is a very fast, non-cryptographic hash suitable for general hash-based 27 | * lookup. See http://murmurhash.googlepages.com/ for more details. 28 | * 29 | * hash32() and hash64() are MurmurHash 2.0. 30 | * hash3_x64_128() is MurmurHash 3.0. 31 | * 32 | *

33 | * The C version of MurmurHash 2.0 found at that site was ported to Java by 34 | * Andrzej Bialecki (ab at getopt org). 35 | *

36 | */ 37 | public class MurmurHash 38 | { 39 | public static int[] multiSeedHash32(long data, int[] seeds) { 40 | int[] res = new int[seeds.length]; 41 | for(int i = 0; i < seeds.length; i++) { 42 | res[i] = hash32(data, seeds[i]); 43 | } 44 | return res; 45 | } 46 | 47 | public static int[] multiSeedHash32(int data, int[] seeds) { 48 | int[] res = new int[seeds.length]; 49 | for(int i = 0; i < seeds.length; i++) { 50 | res[i] = hash32(data, seeds[i]); 51 | } 52 | return res; 53 | } 54 | 55 | public static int[] multiSeedHash32(String data, int[] seeds) { 56 | int[] res = new int[seeds.length]; 57 | for(int i = 0; i < seeds.length; i++) { 58 | res[i] = hash32(data, seeds[i]); 59 | } 60 | return res; 61 | } 62 | 63 | public static int[] multiSeedHash32(ByteBuffer data, int offset, int length, int[] seeds) { 64 | int[] res = new int[seeds.length]; 65 | for(int i = 0; i < seeds.length; i++) { 66 | res[i] = hash32(data, offset, length, seeds[i]); 67 | } 68 | return res; 69 | } 70 | 71 | public static int hash32(long data, int seed){ 72 | ByteBuffer bb = ByteBuffer.allocateDirect(8); 73 | bb.putLong(data); 74 | 75 | return hash32(bb, 0, 8, seed); 76 | } 77 | 78 | public static int hash32(int data, int seed){ 79 | ByteBuffer bb = ByteBuffer.allocateDirect(4); 80 | bb.putInt(data); 81 | 82 | return hash32(bb, 0, 4, seed); 83 | } 84 | 85 | public static int hash32(String data, int seed){ 86 | byte[] bytes = data.getBytes(); 87 | ByteBuffer bb = ByteBuffer.allocateDirect(bytes.length); 88 | bb.put(bytes); 89 | 90 | return hash32(bb, 0, bytes.length, seed); 91 | } 92 | 93 | public static int hash32(ByteBuffer data, int offset, int length, int seed) 94 | { 95 | int m = 0x5bd1e995; 96 | int r = 24; 97 | 98 | int h = seed ^ length; 99 | 100 | int len_4 = length >> 2; 101 | 102 | for (int i = 0; i < len_4; i++) 103 | { 104 | int i_4 = i << 2; 105 | int k = data.get(offset + i_4 + 3); 106 | k = k << 8; 107 | k = k | (data.get(offset + i_4 + 2) & 0xff); 108 | k = k << 8; 109 | k 
= k | (data.get(offset + i_4 + 1) & 0xff); 110 | k = k << 8; 111 | k = k | (data.get(offset + i_4 + 0) & 0xff); 112 | k *= m; 113 | k ^= k >>> r; 114 | k *= m; 115 | h *= m; 116 | h ^= k; 117 | } 118 | 119 | // avoid calculating modulo 120 | int len_m = len_4 << 2; 121 | int left = length - len_m; 122 | 123 | if (left != 0) 124 | { 125 | if (left >= 3) 126 | { 127 | h ^= (int) data.get(offset + length - 3) << 16; 128 | } 129 | if (left >= 2) 130 | { 131 | h ^= (int) data.get(offset + length - 2) << 8; 132 | } 133 | if (left >= 1) 134 | { 135 | h ^= (int) data.get(offset + length - 1); 136 | } 137 | 138 | h *= m; 139 | } 140 | 141 | h ^= h >>> 13; 142 | h *= m; 143 | h ^= h >>> 15; 144 | 145 | return h; 146 | } 147 | 148 | public static long hash2_64(ByteBuffer key, int offset, int length, long seed) 149 | { 150 | long m64 = 0xc6a4a7935bd1e995L; 151 | int r64 = 47; 152 | 153 | long h64 = (seed & 0xffffffffL) ^ (m64 * length); 154 | 155 | int lenLongs = length >> 3; 156 | 157 | for (int i = 0; i < lenLongs; ++i) 158 | { 159 | int i_8 = i << 3; 160 | 161 | long k64 = ((long) key.get(offset+i_8+0) & 0xff) + (((long) key.get(offset+i_8+1) & 0xff)<<8) + 162 | (((long) key.get(offset+i_8+2) & 0xff)<<16) + (((long) key.get(offset+i_8+3) & 0xff)<<24) + 163 | (((long) key.get(offset+i_8+4) & 0xff)<<32) + (((long) key.get(offset+i_8+5) & 0xff)<<40) + 164 | (((long) key.get(offset+i_8+6) & 0xff)<<48) + (((long) key.get(offset+i_8+7) & 0xff)<<56); 165 | 166 | k64 *= m64; 167 | k64 ^= k64 >>> r64; 168 | k64 *= m64; 169 | 170 | h64 ^= k64; 171 | h64 *= m64; 172 | } 173 | 174 | int rem = length & 0x7; 175 | 176 | switch (rem) 177 | { 178 | case 0: 179 | break; 180 | case 7: 181 | h64 ^= (long) key.get(offset + length - rem + 6) << 48; 182 | case 6: 183 | h64 ^= (long) key.get(offset + length - rem + 5) << 40; 184 | case 5: 185 | h64 ^= (long) key.get(offset + length - rem + 4) << 32; 186 | case 4: 187 | h64 ^= (long) key.get(offset + length - rem + 3) << 24; 188 | case 3: 189 | 
h64 ^= (long) key.get(offset + length - rem + 2) << 16; 190 | case 2: 191 | h64 ^= (long) key.get(offset + length - rem + 1) << 8; 192 | case 1: 193 | h64 ^= (long) key.get(offset + length - rem); 194 | h64 *= m64; 195 | } 196 | 197 | h64 ^= h64 >>> r64; 198 | h64 *= m64; 199 | h64 ^= h64 >>> r64; 200 | 201 | return h64; 202 | } 203 | 204 | protected static long getblock(ByteBuffer key, int offset, int index) 205 | { 206 | int i_8 = index << 3; 207 | int blockOffset = offset + i_8; 208 | return ((long) key.get(blockOffset + 0) & 0xff) + (((long) key.get(blockOffset + 1) & 0xff) << 8) + 209 | (((long) key.get(blockOffset + 2) & 0xff) << 16) + (((long) key.get(blockOffset + 3) & 0xff) << 24) + 210 | (((long) key.get(blockOffset + 4) & 0xff) << 32) + (((long) key.get(blockOffset + 5) & 0xff) << 40) + 211 | (((long) key.get(blockOffset + 6) & 0xff) << 48) + (((long) key.get(blockOffset + 7) & 0xff) << 56); 212 | } 213 | 214 | protected static long rotl64(long v, int n) 215 | { 216 | return ((v << n) | (v >>> (64 - n))); 217 | } 218 | 219 | protected static long fmix(long k) 220 | { 221 | k ^= k >>> 33; 222 | k *= 0xff51afd7ed558ccdL; 223 | k ^= k >>> 33; 224 | k *= 0xc4ceb9fe1a85ec53L; 225 | k ^= k >>> 33; 226 | 227 | return k; 228 | } 229 | 230 | public static long[] hash3_x64_128(ByteBuffer key, int offset, int length, long seed) 231 | { 232 | final int nblocks = length >> 4; // Process as 128-bit blocks. 
233 | 234 | long h1 = seed; 235 | long h2 = seed; 236 | 237 | long c1 = 0x87c37b91114253d5L; 238 | long c2 = 0x4cf5ad432745937fL; 239 | 240 | //---------- 241 | // body 242 | 243 | for(int i = 0; i < nblocks; i++) 244 | { 245 | long k1 = getblock(key, offset, i*2+0); 246 | long k2 = getblock(key, offset, i*2+1); 247 | 248 | k1 *= c1; k1 = rotl64(k1,31); k1 *= c2; h1 ^= k1; 249 | 250 | h1 = rotl64(h1,27); h1 += h2; h1 = h1*5+0x52dce729; 251 | 252 | k2 *= c2; k2 = rotl64(k2,33); k2 *= c1; h2 ^= k2; 253 | 254 | h2 = rotl64(h2,31); h2 += h1; h2 = h2*5+0x38495ab5; 255 | } 256 | 257 | //---------- 258 | // tail 259 | 260 | // Advance offset to the unprocessed tail of the data. 261 | offset += nblocks * 16; 262 | 263 | long k1 = 0; 264 | long k2 = 0; 265 | 266 | switch(length & 15) 267 | { 268 | case 15: k2 ^= ((long) key.get(offset+14)) << 48; 269 | case 14: k2 ^= ((long) key.get(offset+13)) << 40; 270 | case 13: k2 ^= ((long) key.get(offset+12)) << 32; 271 | case 12: k2 ^= ((long) key.get(offset+11)) << 24; 272 | case 11: k2 ^= ((long) key.get(offset+10)) << 16; 273 | case 10: k2 ^= ((long) key.get(offset+9)) << 8; 274 | case 9: k2 ^= ((long) key.get(offset+8)) << 0; 275 | k2 *= c2; k2 = rotl64(k2,33); k2 *= c1; h2 ^= k2; 276 | 277 | case 8: k1 ^= ((long) key.get(offset+7)) << 56; 278 | case 7: k1 ^= ((long) key.get(offset+6)) << 48; 279 | case 6: k1 ^= ((long) key.get(offset+5)) << 40; 280 | case 5: k1 ^= ((long) key.get(offset+4)) << 32; 281 | case 4: k1 ^= ((long) key.get(offset+3)) << 24; 282 | case 3: k1 ^= ((long) key.get(offset+2)) << 16; 283 | case 2: k1 ^= ((long) key.get(offset+1)) << 8; 284 | case 1: k1 ^= ((long) key.get(offset)); 285 | k1 *= c1; k1 = rotl64(k1,31); k1 *= c2; h1 ^= k1; 286 | }; 287 | 288 | //---------- 289 | // finalization 290 | 291 | h1 ^= length; h2 ^= length; 292 | 293 | h1 += h2; 294 | h2 += h1; 295 | 296 | h1 = fmix(h1); 297 | h2 = fmix(h2); 298 | 299 | h1 += h2; 300 | h2 += h1; 301 | 302 | return(new long[] {h1, h2}); 303 | } 304 | } 
305 | --------------------------------------------------------------------------------