├── src
    └── com
    │   └── ifesdjeen
    │       └── blomstre
    │           ├── Converters.java
    │           ├── ConcurrentBitSet.java
    │           ├── BloomFilter.java
    │           ├── BloomCalculations.java
    │           └── MurmurHash.java
├── README.md
├── pom.xml
└── test
    └── com
        └── ifesdjeen
            └── blomstre
                └── BloomFilterTest.java


/src/com/ifesdjeen/blomstre/Converters.java:
--------------------------------------------------------------------------------
 1 | package com.ifesdjeen.blomstre;
 2 | 
 3 | import java.nio.ByteBuffer;
 4 | import java.util.function.Function;
 5 | 
 6 | public class Converters {
 7 |   static class IntToByteBuffer implements Function<Integer, ByteBuffer> {
 8 |     @Override
 9 |     public ByteBuffer apply(Integer data) {
10 |       ByteBuffer bb = ByteBuffer.allocateDirect(4);
11 |       bb.putInt(data);
12 |       return bb;
13 |     }
14 |   }
15 | 
16 |   static class LongToByteBuffer implements Function<Long, ByteBuffer> {
17 |     @Override
18 |     public ByteBuffer apply(Long data) {
19 |       ByteBuffer bb = ByteBuffer.allocateDirect(8);
20 |       bb.putLong(data);
21 |       return bb;
22 |     }
23 |   }
24 | 
25 |   static class StringToByteBuffer implements Function<String, ByteBuffer> {
26 |     @Override
27 |     public ByteBuffer apply(String data) {
28 |       byte[] bytes = data.getBytes();
29 |       ByteBuffer bb = ByteBuffer.allocateDirect(bytes.length);
30 |       bb.put(bytes);
31 |       return bb;
32 |     }
33 |   }
34 | 
35 |   public static IntToByteBuffer intToByteBufferConverter;
36 |   public static LongToByteBuffer longToByteBufferConverter;
37 |   public static StringToByteBuffer stringToByteBufferConverter;
38 | 
39 |   static {
40 |     intToByteBufferConverter = new IntToByteBuffer();
41 |     longToByteBufferConverter = new LongToByteBuffer();
42 |     stringToByteBufferConverter = new StringToByteBuffer();
43 |   }
44 | }
45 | 


--------------------------------------------------------------------------------
/src/com/ifesdjeen/blomstre/ConcurrentBitSet.java:
--------------------------------------------------------------------------------
 1 | package com.ifesdjeen.blomstre;
 2 | 
 3 | import java.util.concurrent.atomic.AtomicInteger;
 4 | import java.util.concurrent.atomic.AtomicLongArray;
 5 | 
 6 | public class ConcurrentBitSet {
 7 | 
 8 |   /**
 9 |    * STATE
10 |    */
11 | 
12 |   private static final int  BASE              = 64;
13 |   private static final long MAX_UNSIGNED_LONG = -1L;
14 | 
15 |   private final AtomicLongArray buckets;
16 | 
17 |   public ConcurrentBitSet(long bitsCount) {
18 |     int bucketsCount = (int) bitsCount / BASE;
19 |     this.buckets = new AtomicLongArray(bucketsCount);
20 | 
21 |     for (int i = 0; i < buckets.length(); i++) {
22 |       this.buckets.set(i, 0);
23 |     }
24 |   }
25 | 
26 |   /**
27 |    * API
28 |    */
29 | 
30 |   public void set(long idx) {
31 |     final int bucketIdx = (int) idx / BASE;
32 |     atomicSet(bucketIdx, (int) idx - (BASE * bucketIdx));
33 |   }
34 | 
35 |   public boolean get(long idx) {
36 |     final int bucketIdx = (int) idx / BASE;
37 |     return atomicGet(bucketIdx, (int) idx - (BASE * bucketIdx));
38 |   }
39 | 
40 |   public void clear() {
41 |     throw new RuntimeException("not implemented");
42 |   }
43 | 
44 |   public long capacity() {
45 |     return this.buckets.length() * 64;
46 |   }
47 | 
48 |   /**
49 |    * IMLEMENTATION
50 |    */
51 | 
52 |   private boolean atomicGet(int bucketIdx, int toGet) {
53 |     final long l = buckets.get(bucketIdx);
54 |     final long idxMask = mask(toGet);
55 |     return (l & idxMask) == idxMask;
56 |   }
57 | 
58 |   private void atomicSet(int bucketIdx, int toSet) {
59 |     while (true) {
60 |       final long l = buckets.get(bucketIdx);
61 | 
62 |       if (buckets.compareAndSet(bucketIdx, l, l | mask(toSet)))
63 |         return;
64 |     }
65 |   }
66 | 
67 |   private static long mask(int id) {
68 |     return 1L << id;
69 |   }
70 | 
71 |   public String longToBinaryStr(long num) {
72 |     StringBuilder stringBuilder = new StringBuilder();
73 |     for(int i = 0; i < BASE; i++) {
74 |       final long idxMask = mask(i);
75 |       stringBuilder.append( (num & idxMask) == idxMask ? "1" : "0" );
76 |     }
77 | 
78 |     return stringBuilder.toString();
79 |   }
80 | }
81 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | # Concurrent / Thread Safe BitSet and Bloom filter
 2 | 
 3 | This is a concurrent / thread-safe implementation of the [Bloom Filter](https://en.wikipedia.org/wiki/Bloom_filter)
 4 | data structure. A Bloom Filter lets you store approximate set-membership information in a fixed amount of
 5 | space. It can give a precise answer only to the question "which items are _not_ present in the set".
 6 | A positive answer to the reverse question ("which items _are_ present in the set") is approximate, which
 7 | means that even if you get a positive answer, the item still might not be in the set.
 8 | 
 9 | Bloom Filters are implemented very simply. You use multiple hash functions and a bitset. The result of each hash
10 | function is used to set a bit in the bloom filter. When checking the filter, the same hash values are computed
11 | and the bits at the corresponding positions are read. If all bits at the positions yielded by the hash functions
12 | are set, we say that the element is possibly in the set. Otherwise, if at least one of the bits is unset,
13 | the element is definitely not in the set.
14 | 
15 | Hash collisions / intersections result in false positives: if the hash functions yield the same results for two
16 | values, there's no way to tell which value was actually added. Given enough collisions, all bits will
17 | be set and all queries will return "probably in the set", so make sure you create a sufficiently large set.
18 | 
19 | You can find a very good, illustrative description of the Bloom Filter [here](https://www.jasondavies.com/bloomfilter/).
20 | 
21 | To add the dependency from Maven Central:
22 | 
23 | ```xml
24 | <dependency>
25 |   <groupId>com.github.ifesdjeen</groupId>
26 |   <artifactId>blomstre</artifactId>
27 |   <version>1.0.0-RC1</version>
28 | </dependency>
29 | ```
30 | 
31 | You can also find it on [clojars](https://clojars.org/com.github.ifesdjeen/blomstre).
32 | 
33 | # Usage
34 | 
35 | ```java
36 | import com.ifesdjeen.blomstre.BloomFilter;
37 | 
38 | // Make a filter that'd accept Strings, give a converter to byte buffer for hash function calculation
39 | // Maximum number of elements is 6000, max acceptable false positive probability is 0.0001
40 | BloomFilter<String> filter = BloomFilter.makeFilter(Converters.stringToByteBufferConverter, 6000, 0.0001);
41 | 
42 | // Add an item to the filter
43 | filter.add("abcdef");
44 | 
45 | filter.isPresent("abcdef");
46 | // => "probably" true
47 | ```
48 | 
49 | # Thread Safety
50 | 
51 | Thread safety is achieved by using the concurrent `BitSet`, which is backed by an `AtomicLongArray`. That
52 | means that write operations may sometimes retry, but they will never affect reads or interfere with
53 | other writes in an unpredictable way.
54 | 
55 | # Copyright / License
56 | 
57 | Licensed under Apache 2.0 License.
58 | 
59 | The original `Bloom Filter` can be found [here](https://github.com/apache/cassandra/blob/trunk/src/java/org/apache/cassandra/utils/BloomFilter.java)
60 | and its license [here](https://github.com/apache/cassandra/blob/trunk/LICENSE.txt).
61 | 


--------------------------------------------------------------------------------
/src/com/ifesdjeen/blomstre/BloomFilter.java:
--------------------------------------------------------------------------------
  1 | package com.ifesdjeen.blomstre;
  2 | 
  3 | /*
  4 |  * !!! THIS SOURCE FILE WAS ORIGINALLY TAKEN FROM APACHE CASSANDRA SOURCE !!!
  5 |  *
  6 |  * Licensed to the Apache Software Foundation (ASF) under one
  7 |  * or more contributor license agreements.  See the NOTICE file
  8 |  * distributed with this work for additional information
  9 |  * regarding copyright ownership.  The ASF licenses this file
 10 |  * to you under the Apache License, Version 2.0 (the
 11 |  * "License"); you may not use this file except in compliance
 12 |  * with the License.  You may obtain a copy of the License at
 13 |  *
 14 |  *     http://www.apache.org/licenses/LICENSE-2.0
 15 |  *
 16 |  * Unless required by applicable law or agreed to in writing, software
 17 |  * distributed under the License is distributed on an "AS IS" BASIS,
 18 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 19 |  * See the License for the specific language governing permissions and
 20 |  * limitations under the License.
 21 |  */
 22 | 
 23 | import java.nio.ByteBuffer;
 24 | import java.util.function.Function;
 25 | 
 26 | public class BloomFilter<T> {
 27 |   private static final long BITSET_EXCESS = 20;
 28 | 
 29 |   private final Function<T, ByteBuffer> converter;
 30 |   public final  ConcurrentBitSet        bitset;
 31 |   public final  int                     hashCount;
 32 | 
 33 |   BloomFilter(Function<T, ByteBuffer> converter,
 34 |               int hashes,
 35 |               ConcurrentBitSet bitset
 36 |              ) {
 37 |     this.converter = converter;
 38 |     this.hashCount = hashes;
 39 |     this.bitset = bitset;
 40 |   }
 41 | 
 42 |   private long[] getHashBuckets(ByteBuffer key) {
 43 |     return getHashBuckets(key, hashCount, bitset.capacity());
 44 |   }
 45 | 
 46 |   protected long[] hash(ByteBuffer b,
 47 |                         long seed) {
 48 |     return MurmurHash.hash3_x64_128(b, 0, b.capacity(), seed);
 49 |   }
 50 | 
 51 |   // Murmur is faster than an SHA-based approach and provides as-good collision
 52 |   // resistance.  The combinatorial generation approach described in
 53 |   // https://www.eecs.harvard.edu/~michaelm/postscripts/tr-02-05.pdf
 54 |   // does prove to work in actual tests, and is obviously faster
 55 |   // than performing further iterations of murmur.
 56 |   long[] getHashBuckets(ByteBuffer b,
 57 |                         int hashCount,
 58 |                         long max) {
 59 |     final long[] result = new long[hashCount];
 60 |     final long[] hash = this.hash(b, 0L);
 61 |     for (int i = 0; i < hashCount; ++i) {
 62 |       result[i] = Math.abs((hash[0] + (long) i * hash[1]) % max);
 63 |     }
 64 |     return result;
 65 |   }
 66 | 
 67 |   public void add(T key) {
 68 |     add(converter.apply(key));
 69 |   }
 70 | 
 71 |   protected void add(ByteBuffer key) {
 72 |     for (long bucketIndex : getHashBuckets(key)) {
 73 |       bitset.set(bucketIndex);
 74 |     }
 75 |   }
 76 | 
 77 |   public boolean isPresent(T key) {
 78 |     return isPresent(converter.apply(key));
 79 |   }
 80 | 
 81 |   protected boolean isPresent(ByteBuffer key) {
 82 |     for (long bucketIndex : getHashBuckets(key)) {
 83 |       if (!bitset.get(bucketIndex)) {
 84 |         return false;
 85 |       }
 86 |     }
 87 |     return true;
 88 |   }
 89 | 
 90 |   public void clear() {
 91 |     bitset.clear();
 92 |   }
 93 | 
 94 |   public static <T> BloomFilter<T> makeFilter(Function<T, ByteBuffer> converter,
 95 |                                               int numElements,
 96 |                                               double maxFalsePosProbability) {
 97 |     int maxBucketsPerElement = BloomCalculations.maxBucketsPerElement(numElements);
 98 |     BloomCalculations.BloomSpecification spec = BloomCalculations.computeBloomSpec(maxBucketsPerElement,
 99 |                                                                                    maxFalsePosProbability);
100 | 
101 |     long numBits = (numElements * spec.bucketsPerElement) + BITSET_EXCESS;
102 |     return new BloomFilter<T>(converter, spec.K, new ConcurrentBitSet(numBits));
103 |   }
104 | }
105 | 


--------------------------------------------------------------------------------
/pom.xml:
--------------------------------------------------------------------------------
  1 | <?xml version="1.0" encoding="UTF-8"?>
  2 | <project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
  3 |          xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/maven-v4_0_0.xsd">
  4 | 
  5 |   <modelVersion>4.0.0</modelVersion>
  6 | 
  7 |   <groupId>com.github.ifesdjeen</groupId>
  8 |   <artifactId>blomstre</artifactId>
  9 |   <packaging>jar</packaging>
 10 |   <version>1.0.0-RC1</version>
 11 |   <description>Concurrent Bloom Filter</description>
 12 | 
 13 |   <licenses>
 14 |     <license>
 15 |       <name>Apache Public License</name>
 16 |       <url>https://opensource.org/licenses/Apache-2.0</url>
 17 |     </license>
 18 |   </licenses>
 19 | 
 20 |   <properties>
 21 |     <javac.target>1.8</javac.target>
 22 |     <snapshotsRepositoryId>ossrh</snapshotsRepositoryId>
 23 |     <snapshotsRepositoryUrl>https://oss.sonatype.org/content/repositories/snapshots</snapshotsRepositoryUrl>
 24 |     <releasesRepositoryId>ossrh</releasesRepositoryId>
 25 |     <releasesRepositoryUrl>https://oss.sonatype.org/content/repositories/releases</releasesRepositoryUrl>
 26 |   </properties>
 27 | 
 28 |   <build>
 29 |     <sourceDirectory>src/</sourceDirectory>
 30 |     <testSourceDirectory>test/</testSourceDirectory>
 31 |     <resources>
 32 |       <resource>
 33 |         <directory>resources</directory>
 34 |       </resource>
 35 |     </resources>
 36 |     <testResources>
 37 |       <testResource>
 38 |         <directory>dev-resources</directory>
 39 |       </testResource>
 40 |       <testResource>
 41 |         <directory>resources</directory>
 42 |       </testResource>
 43 |     </testResources>
 44 |     <directory>target</directory>
 45 |     <outputDirectory>target/classes</outputDirectory>
 46 | 
 47 |     <plugins>
 48 | 
 49 |       <plugin>
 50 |         <groupId>org.apache.maven.plugins</groupId>
 51 |         <artifactId>maven-compiler-plugin</artifactId>
 52 |         <version>3.1</version>
 53 |         <configuration>
 54 |           <compilerVersion>${javac.target}</compilerVersion>
 55 |           <source>${javac.target}</source>
 56 |           <target>${javac.target}</target>
 57 |         </configuration>
 58 |       </plugin>
 59 | 
 60 |       <plugin>
 61 |         <groupId>org.apache.maven.plugins</groupId>
 62 |         <artifactId>maven-surefire-plugin</artifactId>
 63 |         <version>2.18.1</version>
 64 |         <configuration>
 65 |           <includes>
 66 |             <include>**/*Test*.java</include>
 67 |           </includes>
 68 |           <runOrder>random</runOrder>
 69 |         </configuration>
 70 |       </plugin>
 71 |     </plugins>
 72 |   </build>
 73 | 
 74 |   <distributionManagement>
 75 | 
 76 |     <snapshotRepository>
 77 |       <id>${snapshotsRepositoryId}</id>
 78 |       <url>${snapshotsRepositoryUrl}</url>
 79 |     </snapshotRepository>
 80 |     <repository>
 81 |       <id>${releasesRepositoryId}</id>
 82 |       <url>${releasesRepositoryUrl}</url>
 83 |     </repository>
 84 |   </distributionManagement>
 85 | 
 86 |   <scm>
 87 |     <connection>scm:git:git@github.com:ifesdjeen/blomstre.git</connection>
 88 |     <developerConnection>scm:git:git@github.com:ifesdjeen/blomstre.git</developerConnection>
 89 |     <url>git@github.com:ifesdjeen/blomstre.git</url>
 90 |     <tag>HEAD</tag>
 91 |   </scm>
 92 | 
 93 |   <dependencies>
 94 |     <dependency>
 95 |       <groupId>junit</groupId>
 96 |       <artifactId>junit</artifactId>
 97 |       <version>4.12</version>
 98 |       <scope>test</scope>
 99 |     </dependency>
100 |     <dependency>
101 |       <groupId>org.hamcrest</groupId>
102 |       <artifactId>hamcrest-all</artifactId>
103 |       <version>1.3</version>
104 |       <scope>test</scope>
105 |     </dependency>
106 |   </dependencies>
107 | 
108 |    <profiles>
109 |     <profile>
110 |       <id>clojars</id>
111 |       <activation>
112 |         <property>
113 |           <name>env</name>
114 |           <value>clojars</value>
115 |         </property>
116 |       </activation>
117 |       <properties>
118 |         <snapshotsRepositoryId>clojars</snapshotsRepositoryId>
119 |         <snapshotsRepositoryUrl>https://clojars.org/repo</snapshotsRepositoryUrl>
120 |         <releasesRepositoryId>clojars</releasesRepositoryId>
121 |         <releasesRepositoryUrl>https://clojars.org/repo</releasesRepositoryUrl>
122 |       </properties>
123 |     </profile>
124 |    </profiles>
125 | </project>
126 | 


--------------------------------------------------------------------------------
/test/com/ifesdjeen/blomstre/BloomFilterTest.java:
--------------------------------------------------------------------------------
  1 | package com.ifesdjeen.blomstre;
  2 | 
  3 | import org.junit.Test;
  4 | 
  5 | import java.util.concurrent.CountDownLatch;
  6 | import java.util.concurrent.ExecutorService;
  7 | import java.util.concurrent.Executors;
  8 | 
  9 | import static org.junit.Assert.assertFalse;
 10 | import static org.junit.Assert.assertTrue;
 11 | 
 12 | public class BloomFilterTest {
 13 | 
 14 |   @Test
 15 |   public void bloomFilter0Test() {
 16 |     BloomFilter<String> filter = BloomFilter.makeFilter(Converters.stringToByteBufferConverter, 6000, 0.0001);
 17 | 
 18 |     int i = 0;
 19 |     filter.add(String.format("%d.%d.%d.%d",
 20 |                              i,
 21 |                              i,
 22 |                              i,
 23 |                              i));
 24 | 
 25 |     assertTrue(filter.isPresent(String.format("%d.%d.%d.%d",
 26 |                                               i,
 27 |                                               i,
 28 |                                               i,
 29 |                                               i)));
 30 | 
 31 | 
 32 |   }
 33 | 
 34 | 
 35 |   @Test
 36 |   public void bloomFilter1Test() {
 37 |     BloomFilter<String> filter = BloomFilter.makeFilter(Converters.stringToByteBufferConverter, 6000, 0.0001);
 38 | 
 39 |     int i = 1;
 40 |     filter.add(String.format("%d.%d.%d.%d",
 41 |                              i,
 42 |                              i,
 43 |                              i,
 44 |                              i));
 45 | 
 46 |     assertTrue(filter.isPresent(String.format("%d.%d.%d.%d",
 47 |                                               i,
 48 |                                               i,
 49 |                                               i,
 50 |                                               i)));
 51 | 
 52 | 
 53 |   }
 54 | 
 55 |   @Test
 56 |   public void bloomFilterTest() {
 57 |     BloomFilter<String> filter = BloomFilter.makeFilter(Converters.stringToByteBufferConverter, 6000, 0.0001);
 58 | 
 59 |     for(int i = 0; i < 1000; i++) {
 60 |       filter.add(String.format("%d.%d.%d.%d",
 61 |                                i,
 62 |                                i,
 63 |                                i,
 64 |                                i));
 65 |     }
 66 | 
 67 |     for(int i = 0; i < 1000; i++) {
 68 |       assertTrue(filter.isPresent(String.format("%d.%d.%d.%d",
 69 |                                                 i,
 70 |                                                 i,
 71 |                                                 i,
 72 |                                                 i)));
 73 |     }
 74 | 
 75 |     for(int i = 1000; i < 5000; i++) {
 76 |       assertFalse(filter.isPresent(String.format("%d.%d.%d.%d",
 77 |                                                 i,
 78 |                                                 i,
 79 |                                                 i,
 80 |                                                 i)));
 81 |     }
 82 |   }
 83 | 
 84 |   @Test
 85 |   public void concurrentBloomFilterTest() throws InterruptedException {
 86 |     final BloomFilter<String> filter = BloomFilter.makeFilter(Converters.stringToByteBufferConverter, 6000, 0.01);
 87 | 
 88 |     final int threads = 10;
 89 |     final CountDownLatch latch = new CountDownLatch(threads);
 90 |     final CountDownLatch finishedLatch = new CountDownLatch(threads);
 91 | 
 92 |     for(int thread = 0; thread < threads; thread++) {
 93 |       final int finalThread = thread;
 94 |       new Thread(new Runnable() {
 95 |         @Override
 96 |         public void run() {
 97 |           final int localFinalThread = finalThread;
 98 |           latch.countDown();
 99 |           try {
100 |             latch.await();
101 |             Thread.sleep(1000);
102 |           } catch (InterruptedException e) {
103 |             e.printStackTrace();
104 |           }
105 |           for (int i = 0; i < 100; i++) {
106 |             final int finalI = localFinalThread * 100 + i;
107 |             filter.add(String.format("%d.%d.%d.%d",
108 |                                      finalI,
109 |                                      finalI,
110 |                                      finalI,
111 |                                      finalI));
112 |           }
113 |           finishedLatch.countDown();
114 |         }
115 |       }).start();
116 |     }
117 | 
118 |     finishedLatch.await();
119 | 
120 |     for(int i = 0; i < 1000; i++) {
121 |       assertTrue(filter.isPresent(String.format("%d.%d.%d.%d",
122 |                                                 i,
123 |                                                 i,
124 |                                                 i,
125 |                                                 i)));
126 |     }
127 | 
128 | 
129 |   }
130 | 
131 | }
132 | 


--------------------------------------------------------------------------------
/src/com/ifesdjeen/blomstre/BloomCalculations.java:
--------------------------------------------------------------------------------
  1 | package com.ifesdjeen.blomstre;
  2 | 
  3 | /*
  4 |  * !!! THIS SOURCE FILE WAS TAKEN FROM APACHE CASSANDRA SOURCE !!!
  5 |  *
  6 |  * Licensed to the Apache Software Foundation (ASF) under one
  7 |  * or more contributor license agreements.  See the NOTICE file
  8 |  * distributed with this work for additional information
  9 |  * regarding copyright ownership.  The ASF licenses this file
 10 |  * to you under the Apache License, Version 2.0 (the
 11 |  * "License"); you may not use this file except in compliance
 12 |  * with the License.  You may obtain a copy of the License at
 13 |  *
 14 |  *     http://www.apache.org/licenses/LICENSE-2.0
 15 |  *
 16 |  * Unless required by applicable law or agreed to in writing, software
 17 |  * distributed under the License is distributed on an "AS IS" BASIS,
 18 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 19 |  * See the License for the specific language governing permissions and
 20 |  * limitations under the License.
 21 |  */
 22 | 
 23 | /**
 24 |  * The following calculations are taken from:
 25 |  * http://www.cs.wisc.edu/~cao/papers/summary-cache/node8.html
 26 |  * "Bloom Filters - the math"
 27 |  * <p/>
 28 |  * This class's static methods are meant to facilitate the use of the Bloom
 29 |  * Filter class by helping to choose correct values of 'bits per element' and
 30 |  * 'number of hash functions, k'.
 31 |  */
 32 | public class BloomCalculations {
 33 | 
 34 |   private static final int minBuckets = 2;
 35 |   private static final int minK       = 1;
 36 | 
 37 |   private static final int EXCESS = 20;
 38 | 
 39 |   /**
 40 |    * In the following table, the row 'i' shows false positive rates if i buckets
 41 |    * per element are used.  Column 'j' shows false positive rates if j hash
 42 |    * functions are used.  The first row is 'i=0', the first column is 'j=0'.
 43 |    * Each cell (i,j) the false positive rate determined by using i buckets per
 44 |    * element and j hash functions.
 45 |    */
 46 |   static final double[][] probs = new double[][]{
 47 |                                                         {1.0}, // dummy row representing 0 buckets per element
 48 |                                                         {1.0, 1.0}, // dummy row representing 1 buckets per element
 49 |                                                         {1.0, 0.393, 0.400},
 50 |                                                         {1.0, 0.283, 0.237, 0.253},
 51 |                                                         {1.0, 0.221, 0.155, 0.147, 0.160},
 52 |                                                         {1.0, 0.181, 0.109, 0.092, 0.092, 0.101}, // 5
 53 |                                                         {1.0, 0.154, 0.0804, 0.0609, 0.0561, 0.0578, 0.0638},
 54 |                                                         {1.0, 0.133, 0.0618, 0.0423, 0.0359, 0.0347, 0.0364},
 55 |                                                         {1.0, 0.118, 0.0489, 0.0306, 0.024, 0.0217, 0.0216, 0.0229},
 56 |                                                         {1.0, 0.105, 0.0397, 0.0228, 0.0166, 0.0141, 0.0133, 0.0135, 0.0145},
 57 |                                                         {1.0, 0.0952, 0.0329, 0.0174, 0.0118, 0.00943, 0.00844, 0.00819, 0.00846}, // 10
 58 |                                                         {1.0, 0.0869, 0.0276, 0.0136, 0.00864, 0.0065, 0.00552, 0.00513, 0.00509},
 59 |                                                         {1.0, 0.08, 0.0236, 0.0108, 0.00646, 0.00459, 0.00371, 0.00329, 0.00314},
 60 |                                                         {1.0, 0.074, 0.0203, 0.00875, 0.00492, 0.00332, 0.00255, 0.00217, 0.00199, 0.00194},
 61 |                                                         {1.0, 0.0689, 0.0177, 0.00718, 0.00381, 0.00244, 0.00179, 0.00146, 0.00129, 0.00121, 0.0012},
 62 |                                                         {1.0, 0.0645, 0.0156, 0.00596, 0.003, 0.00183, 0.00128, 0.001, 0.000852, 0.000775, 0.000744}, // 15
 63 |                                                         {1.0, 0.0606, 0.0138, 0.005, 0.00239, 0.00139, 0.000935, 0.000702, 0.000574, 0.000505, 0.00047, 0.000459},
 64 |                                                         {1.0, 0.0571, 0.0123, 0.00423, 0.00193, 0.00107, 0.000692, 0.000499, 0.000394, 0.000335, 0.000302, 0.000287, 0.000284},
 65 |                                                         {1.0, 0.054, 0.0111, 0.00362, 0.00158, 0.000839, 0.000519, 0.00036, 0.000275, 0.000226, 0.000198, 0.000183, 0.000176},
 66 |                                                         {1.0, 0.0513, 0.00998, 0.00312, 0.0013, 0.000663, 0.000394, 0.000264, 0.000194, 0.000155, 0.000132, 0.000118, 0.000111, 0.000109},
 67 |                                                         {1.0, 0.0488, 0.00906, 0.0027, 0.00108, 0.00053, 0.000303, 0.000196, 0.00014, 0.000108, 8.89e-05, 7.77e-05, 7.12e-05, 6.79e-05, 6.71e-05} // 20
 68 |   };  // the first column is a dummy column representing K=0.
 69 | 
 70 |   /**
 71 |    * The optimal number of hashes for a given number of bits per element.
 72 |    * These values are automatically calculated from the data above.
 73 |    */
 74 |   private static final int[] optKPerBuckets = new int[probs.length];
 75 | 
 76 |   static {
 77 |     for (int i = 0; i < probs.length; i++) {
 78 |       double min = Double.MAX_VALUE;
 79 |       double[] prob = probs[i];
 80 |       for (int j = 0; j < prob.length; j++) {
 81 |         if (prob[j] < min) {
 82 |           min = prob[j];
 83 |           optKPerBuckets[i] = Math.max(minK, j);
 84 |         }
 85 |       }
 86 |     }
 87 |   }
 88 | 
 89 |   /**
 90 |    * Given the number of buckets that can be used per element, return a
 91 |    * specification that minimizes the false positive rate.
 92 |    *
 93 |    * @param bucketsPerElement The number of buckets per element for the filter.
 94 |    * @return A spec that minimizes the false positive rate.
 95 |    */
 96 |   public static BloomSpecification computeBloomSpec(int bucketsPerElement) {
 97 |     assert bucketsPerElement >= 1;
 98 |     assert bucketsPerElement <= probs.length - 1;
 99 |     return new BloomSpecification(optKPerBuckets[bucketsPerElement], bucketsPerElement);
100 |   }
101 | 
102 |   /**
103 |    * A wrapper class that holds two key parameters for a Bloom Filter: the
104 |    * number of hash functions used, and the number of buckets per element used.
105 |    */
106 |   public static class BloomSpecification {
107 |     public final int K; // number of hash functions.
108 |     public final int bucketsPerElement;
109 | 
110 |     public BloomSpecification(int k,
111 |                               int bucketsPerElement) {
112 |       K = k;
113 |       this.bucketsPerElement = bucketsPerElement;
114 |     }
115 | 
116 |     public String toString() {
117 |       return String.format("BloomSpecification(K=%d, bucketsPerElement=%d)", K, bucketsPerElement);
118 |     }
119 |   }
120 | 
121 |   /**
122 |    * Given a maximum tolerable false positive probability, compute a Bloom
123 |    * specification which will give less than the specified false positive rate,
124 |    * but minimize the number of buckets per element and the number of hash
125 |    * functions used.  Because bandwidth (and therefore total bitvector size)
126 |    * is considered more expensive than computing power, preference is given
127 |    * to minimizing buckets per element rather than number of hash functions.
128 |    *
129 |    * @param maxBucketsPerElement The maximum number of buckets available for the filter.
130 |    * @param maxFalsePosProb      The maximum tolerable false positive rate.
131 |    * @return A Bloom Specification which would result in a false positive rate
132 |    * less than specified by the function call
133 |    * @throws UnsupportedOperationException if a filter satisfying the parameters cannot be met
134 |    */
135 |   public static BloomSpecification computeBloomSpec(int maxBucketsPerElement,
136 |                                                     double maxFalsePosProb) {
137 |     assert maxBucketsPerElement >= 1;
138 |     assert maxBucketsPerElement <= probs.length - 1;
139 |     int maxK = probs[maxBucketsPerElement].length - 1;
140 | 
141 |     // Handle the trivial cases
142 |     if (maxFalsePosProb >= probs[minBuckets][minK]) {
143 |       return new BloomSpecification(2, optKPerBuckets[2]);
144 |     }
145 |     if (maxFalsePosProb < probs[maxBucketsPerElement][maxK]) {
146 |       throw new UnsupportedOperationException(String.format("Unable to satisfy %s with %s buckets per element",
147 |                                                             maxFalsePosProb, maxBucketsPerElement));
148 |     }
149 | 
150 |     // First find the minimal required number of buckets:
151 |     int bucketsPerElement = 2;
152 |     int K = optKPerBuckets[2];
153 |     while (probs[bucketsPerElement][K] > maxFalsePosProb) {
154 |       bucketsPerElement++;
155 |       K = optKPerBuckets[bucketsPerElement];
156 |     }
157 |     // Now that the number of buckets is sufficient, see if we can relax K
158 |     // without losing too much precision.
159 |     while (probs[bucketsPerElement][K - 1] <= maxFalsePosProb) {
160 |       K--;
161 |     }
162 | 
163 |     return new BloomSpecification(K, bucketsPerElement);
164 |   }
165 | 
166 |   /**
167 |    * Calculates the maximum number of buckets per element that this implementation
168 |    * can support.  Crucially, it will lower the bucket count if necessary to meet
169 |    * BitSet's size restrictions.
170 |    */
171 |   public static int maxBucketsPerElement(long numElements) {
172 |     numElements = Math.max(1, numElements);
173 |     double v = (Long.MAX_VALUE - EXCESS) / (double) numElements;
174 |     if (v < 1.0) {
175 |       throw new UnsupportedOperationException("Cannot compute probabilities for " + numElements + " elements.");
176 |     }
177 |     return Math.min(BloomCalculations.probs.length - 1, (int) v);
178 |   }
179 | }
180 | 


--------------------------------------------------------------------------------
/src/com/ifesdjeen/blomstre/MurmurHash.java:
--------------------------------------------------------------------------------
  1 | package com.ifesdjeen.blomstre;
  2 | 
  3 | /*
  4 |  * !!! THIS SOURCE FILE WAS TAKEN FROM APACHE CASSANDRA SOURCE !!!
  5 |  *
  6 |  * Licensed to the Apache Software Foundation (ASF) under one
  7 |  * or more contributor license agreements.  See the NOTICE file
  8 |  * distributed with this work for additional information
  9 |  * regarding copyright ownership.  The ASF licenses this file
 10 |  * to you under the Apache License, Version 2.0 (the
 11 |  * "License"); you may not use this file except in compliance
 12 |  * with the License.  You may obtain a copy of the License at
 13 |  *
 14 |  *     http://www.apache.org/licenses/LICENSE-2.0
 15 |  *
 16 |  * Unless required by applicable law or agreed to in writing, software
 17 |  * distributed under the License is distributed on an "AS IS" BASIS,
 18 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 19 |  * See the License for the specific language governing permissions and
 20 |  * limitations under the License.
 21 |  */
 22 | 
 23 | import java.nio.ByteBuffer;
 24 | 
 25 | /**
 26 |  * This is a very fast, non-cryptographic hash suitable for general hash-based
 27 |  * lookup. See http://murmurhash.googlepages.com/ for more details.
 28 |  *
 29 |  * hash32() and hash64() are MurmurHash 2.0.
 30 |  * hash3_x64_128() is MurmurHash 3.0.
 31 |  *
 32 |  * <p>
 33 |  * The C version of MurmurHash 2.0 found at that site was ported to Java by
 34 |  * Andrzej Bialecki (ab at getopt org).
 35 |  * </p>
 36 |  */
 37 | public class MurmurHash
 38 | {
 39 |   public static int[] multiSeedHash32(long data, int[] seeds) {
 40 |     int[] res = new int[seeds.length];
 41 |     for(int i = 0; i < seeds.length; i++) {
 42 |       res[i] = hash32(data, seeds[i]);
 43 |     }
 44 |     return res;
 45 |   }
 46 | 
 47 |   public static int[] multiSeedHash32(int data, int[] seeds) {
 48 |     int[] res = new int[seeds.length];
 49 |     for(int i = 0; i < seeds.length; i++) {
 50 |       res[i] = hash32(data, seeds[i]);
 51 |     }
 52 |     return res;
 53 |   }
 54 | 
 55 |   public static int[] multiSeedHash32(String data, int[] seeds) {
 56 |     int[] res = new int[seeds.length];
 57 |     for(int i = 0; i < seeds.length; i++) {
 58 |       res[i] = hash32(data, seeds[i]);
 59 |     }
 60 |     return res;
 61 |   }
 62 | 
 63 |   public static int[] multiSeedHash32(ByteBuffer data, int offset, int length, int[] seeds) {
 64 |     int[] res = new int[seeds.length];
 65 |     for(int i = 0; i < seeds.length; i++) {
 66 |       res[i] = hash32(data, offset, length, seeds[i]);
 67 |     }
 68 |     return res;
 69 |   }
 70 | 
 71 |   public static int hash32(long data, int seed){
 72 |     ByteBuffer bb = ByteBuffer.allocateDirect(8);
 73 |     bb.putLong(data);
 74 | 
 75 |     return hash32(bb, 0, 8, seed);
 76 |   }
 77 | 
 78 |   public static int hash32(int data, int seed){
 79 |     ByteBuffer bb = ByteBuffer.allocateDirect(4);
 80 |     bb.putInt(data);
 81 | 
 82 |     return  hash32(bb, 0, 4, seed);
 83 |   }
 84 | 
 85 |   public static int hash32(String data, int seed){
 86 |     byte[] bytes = data.getBytes();
 87 |     ByteBuffer bb = ByteBuffer.allocateDirect(bytes.length);
 88 |     bb.put(bytes);
 89 | 
 90 |     return  hash32(bb, 0, bytes.length, seed);
 91 |   }
 92 | 
 93 |   public static int hash32(ByteBuffer data, int offset, int length, int seed)
 94 |   {
 95 |     int m = 0x5bd1e995;
 96 |     int r = 24;
 97 | 
 98 |     int h = seed ^ length;
 99 | 
100 |     int len_4 = length >> 2;
101 | 
102 |     for (int i = 0; i < len_4; i++)
103 |     {
104 |       int i_4 = i << 2;
105 |       int k = data.get(offset + i_4 + 3);
106 |       k = k << 8;
107 |       k = k | (data.get(offset + i_4 + 2) & 0xff);
108 |       k = k << 8;
109 |       k = k | (data.get(offset + i_4 + 1) & 0xff);
110 |       k = k << 8;
111 |       k = k | (data.get(offset + i_4 + 0) & 0xff);
112 |       k *= m;
113 |       k ^= k >>> r;
114 |       k *= m;
115 |       h *= m;
116 |       h ^= k;
117 |     }
118 | 
119 |     // avoid calculating modulo
120 |     int len_m = len_4 << 2;
121 |     int left = length - len_m;
122 | 
123 |     if (left != 0)
124 |     {
125 |       if (left >= 3)
126 |       {
127 |         h ^= (int) data.get(offset + length - 3) << 16;
128 |       }
129 |       if (left >= 2)
130 |       {
131 |         h ^= (int) data.get(offset + length - 2) << 8;
132 |       }
133 |       if (left >= 1)
134 |       {
135 |         h ^= (int) data.get(offset + length - 1);
136 |       }
137 | 
138 |       h *= m;
139 |     }
140 | 
141 |     h ^= h >>> 13;
142 |     h *= m;
143 |     h ^= h >>> 15;
144 | 
145 |     return h;
146 |   }
147 | 
148 |   public static long hash2_64(ByteBuffer key, int offset, int length, long seed)
149 |   {
150 |     long m64 = 0xc6a4a7935bd1e995L;
151 |     int r64 = 47;
152 | 
153 |     long h64 = (seed & 0xffffffffL) ^ (m64 * length);
154 | 
155 |     int lenLongs = length >> 3;
156 | 
157 |     for (int i = 0; i < lenLongs; ++i)
158 |     {
159 |       int i_8 = i << 3;
160 | 
161 |       long k64 =  ((long)  key.get(offset+i_8+0) & 0xff)      + (((long) key.get(offset+i_8+1) & 0xff)<<8)  +
162 |                           (((long) key.get(offset+i_8+2) & 0xff)<<16) + (((long) key.get(offset+i_8+3) & 0xff)<<24) +
163 |                           (((long) key.get(offset+i_8+4) & 0xff)<<32) + (((long) key.get(offset+i_8+5) & 0xff)<<40) +
164 |                           (((long) key.get(offset+i_8+6) & 0xff)<<48) + (((long) key.get(offset+i_8+7) & 0xff)<<56);
165 | 
166 |       k64 *= m64;
167 |       k64 ^= k64 >>> r64;
168 |       k64 *= m64;
169 | 
170 |       h64 ^= k64;
171 |       h64 *= m64;
172 |     }
173 | 
174 |     int rem = length & 0x7;
175 | 
176 |     switch (rem)
177 |     {
178 |       case 0:
179 |         break;
180 |       case 7:
181 |         h64 ^= (long) key.get(offset + length - rem + 6) << 48;
182 |       case 6:
183 |         h64 ^= (long) key.get(offset + length - rem + 5) << 40;
184 |       case 5:
185 |         h64 ^= (long) key.get(offset + length - rem + 4) << 32;
186 |       case 4:
187 |         h64 ^= (long) key.get(offset + length - rem + 3) << 24;
188 |       case 3:
189 |         h64 ^= (long) key.get(offset + length - rem + 2) << 16;
190 |       case 2:
191 |         h64 ^= (long) key.get(offset + length - rem + 1) << 8;
192 |       case 1:
193 |         h64 ^= (long) key.get(offset + length - rem);
194 |         h64 *= m64;
195 |     }
196 | 
197 |     h64 ^= h64 >>> r64;
198 |     h64 *= m64;
199 |     h64 ^= h64 >>> r64;
200 | 
201 |     return h64;
202 |   }
203 | 
204 |   protected static long getblock(ByteBuffer key, int offset, int index)
205 |   {
206 |     int i_8 = index << 3;
207 |     int blockOffset = offset + i_8;
208 |     return ((long) key.get(blockOffset + 0) & 0xff) + (((long) key.get(blockOffset + 1) & 0xff) << 8) +
209 |                    (((long) key.get(blockOffset + 2) & 0xff) << 16) + (((long) key.get(blockOffset + 3) & 0xff) << 24) +
210 |                    (((long) key.get(blockOffset + 4) & 0xff) << 32) + (((long) key.get(blockOffset + 5) & 0xff) << 40) +
211 |                    (((long) key.get(blockOffset + 6) & 0xff) << 48) + (((long) key.get(blockOffset + 7) & 0xff) << 56);
212 |   }
213 | 
214 |   protected static long rotl64(long v, int n)
215 |   {
216 |     return ((v << n) | (v >>> (64 - n)));
217 |   }
218 | 
219 |   protected static long fmix(long k)
220 |   {
221 |     k ^= k >>> 33;
222 |     k *= 0xff51afd7ed558ccdL;
223 |     k ^= k >>> 33;
224 |     k *= 0xc4ceb9fe1a85ec53L;
225 |     k ^= k >>> 33;
226 | 
227 |     return k;
228 |   }
229 | 
230 |   public static long[] hash3_x64_128(ByteBuffer key, int offset, int length, long seed)
231 |   {
232 |     final int nblocks = length >> 4; // Process as 128-bit blocks.
233 | 
234 |     long h1 = seed;
235 |     long h2 = seed;
236 | 
237 |     long c1 = 0x87c37b91114253d5L;
238 |     long c2 = 0x4cf5ad432745937fL;
239 | 
240 |     //----------
241 |     // body
242 | 
243 |     for(int i = 0; i < nblocks; i++)
244 |     {
245 |       long k1 = getblock(key, offset, i*2+0);
246 |       long k2 = getblock(key, offset, i*2+1);
247 | 
248 |       k1 *= c1; k1 = rotl64(k1,31); k1 *= c2; h1 ^= k1;
249 | 
250 |       h1 = rotl64(h1,27); h1 += h2; h1 = h1*5+0x52dce729;
251 | 
252 |       k2 *= c2; k2  = rotl64(k2,33); k2 *= c1; h2 ^= k2;
253 | 
254 |       h2 = rotl64(h2,31); h2 += h1; h2 = h2*5+0x38495ab5;
255 |     }
256 | 
257 |     //----------
258 |     // tail
259 | 
260 |     // Advance offset to the unprocessed tail of the data.
261 |     offset += nblocks * 16;
262 | 
263 |     long k1 = 0;
264 |     long k2 = 0;
265 | 
266 |     switch(length & 15)
267 |     {
268 |       case 15: k2 ^= ((long) key.get(offset+14)) << 48;
269 |       case 14: k2 ^= ((long) key.get(offset+13)) << 40;
270 |       case 13: k2 ^= ((long) key.get(offset+12)) << 32;
271 |       case 12: k2 ^= ((long) key.get(offset+11)) << 24;
272 |       case 11: k2 ^= ((long) key.get(offset+10)) << 16;
273 |       case 10: k2 ^= ((long) key.get(offset+9)) << 8;
274 |       case  9: k2 ^= ((long) key.get(offset+8)) << 0;
275 |         k2 *= c2; k2  = rotl64(k2,33); k2 *= c1; h2 ^= k2;
276 | 
277 |       case  8: k1 ^= ((long) key.get(offset+7)) << 56;
278 |       case  7: k1 ^= ((long) key.get(offset+6)) << 48;
279 |       case  6: k1 ^= ((long) key.get(offset+5)) << 40;
280 |       case  5: k1 ^= ((long) key.get(offset+4)) << 32;
281 |       case  4: k1 ^= ((long) key.get(offset+3)) << 24;
282 |       case  3: k1 ^= ((long) key.get(offset+2)) << 16;
283 |       case  2: k1 ^= ((long) key.get(offset+1)) << 8;
284 |       case  1: k1 ^= ((long) key.get(offset));
285 |         k1 *= c1; k1  = rotl64(k1,31); k1 *= c2; h1 ^= k1;
286 |     };
287 | 
288 |     //----------
289 |     // finalization
290 | 
291 |     h1 ^= length; h2 ^= length;
292 | 
293 |     h1 += h2;
294 |     h2 += h1;
295 | 
296 |     h1 = fmix(h1);
297 |     h2 = fmix(h2);
298 | 
299 |     h1 += h2;
300 |     h2 += h1;
301 | 
302 |     return(new long[] {h1, h2});
303 |   }
304 | }
305 | 


--------------------------------------------------------------------------------