├── .gitattributes
├── project
    ├── build.properties
    ├── BuildKeys.scala
    ├── Configs.scala
    ├── plugins.sbt
    ├── Testing.scala
    ├── Dependencies.scala
    ├── Publishing.scala
    └── Settings.scala
├── pubring.gpg.enc
├── secring.gpg.enc
├── examples
    └── src
    │   └── main
    │       ├── scala
    │           └── Main.scala
    │       └── java
    │           └── MainJava.java
├── benchmarks
    └── src
    │   └── main
    │       └── scala
    │           ├── bloomfilter
    │               ├── mutable
    │               │   ├── AddLongItemBenchmark.scala
    │               │   ├── _128bit
    │               │   │   ├── AddLongItemBenchmark.scala
    │               │   │   ├── StringItemBenchmark.scala
    │               │   │   └── ArrayByteItemBenchmark.scala
    │               │   ├── StringItemBenchmark.scala
    │               │   ├── ArrayByteItemBenchmark.scala
    │               │   └── StringItemCuckooBenchmark.scala
    │               ├── hashing
    │               │   ├── MurmurHash3GenericBenchmark.scala
    │               │   └── MurmurHash3Benchmark.scala
    │               └── UnsafeBitArrayBenchmark.scala
    │           └── alternatives
    │               ├── breeze
    │                   ├── AddLongItemBenchmark.scala
    │                   ├── StringItemBenchmark.scala
    │                   └── ArrayByteItemBenchmark.scala
    │               ├── guava
    │                   ├── AddLongItemBenchmark.scala
    │                   ├── ArrayByteItemBenchmark.scala
    │                   └── StringItemBenchmark.scala
    │               ├── algebird
    │                   └── StringItemBenchmark.scala
    │               └── stream
    │                   └── StringItemBenchmark.scala
├── bloom-filter
    └── src
    │   └── main
    │       └── scala
    │           └── bloomfilter
    │               ├── util
    │                   └── Unsafe.scala
    │               ├── CanGetDataFrom.scala
    │               ├── CanGenerateHashFrom.scala
    │               ├── CanGenerate128HashFrom.scala
    │               ├── mutable
    │                   ├── _128bit
    │                   │   └── BloomFilter.scala
    │                   ├── UnsafeBitArray.scala
    │                   ├── CuckooFilter.scala
    │                   ├── BloomFilter.scala
    │                   └── UnsafeTable.scala
    │               └── hashing
    │                   └── MurmurHash3Generic.scala
├── tests
    └── src
    │   ├── endToEnd
    │       └── scala
    │       │   └── endToEnd
    │       │       └── bloomfilter
    │       │           └── mutable
    │       │               ├── SampleUsageSpec.scala
    │       │               └── _128bit
    │       │                   └── SampleUsageSpec.scala
    │   └── test
    │       └── scala
    │           └── tests
    │               └── bloomfilter
    │                   ├── hashing
    │                       └── MurmurHash3ScalaVsJavaSpec.scala
    │                   ├── CanGetDataFromSpec.scala
    │                   └── mutable
    │                       ├── _128bit
    │                           ├── BloomFilterSerializationSpec.scala
    │                           └── BloomFilterSpec.scala
    │                       ├── BloomFiltersSpec.scala
    │                       ├── UnsafeBitArraysSpec.scala
    │                       ├── BloomFilterSerializationSpec.scala
    │                       ├── BloomFilterSpec.scala
    │                       ├── UnsafeBitArraySpec.scala
    │                       ├── CuckooFilterSpec.scala
    │                       └── UnsafeTableSpec.scala
├── sandbox
    └── src
    │   └── main
    │       ├── scala
    │           └── sandbox
    │           │   ├── bloomfilter
    │           │       └── mutable
    │           │       │   ├── ChronicleBitArray.scala
    │           │       │   ├── BitArray.scala
    │           │       │   └── BloomFilter.scala
    │           │   └── hashing
    │           │       └── MurmurHash3.scala
    │       └── java
    │           └── sandbox
    │               └── hashing
    │                   ├── AlgebirdMurmurHash128.scala
    │                   ├── CassandraMurmurHash.java
    │                   └── YonikMurmurHash3.java
├── LICENSE
├── .travis.yml
├── .gitignore
├── CHANGELOG.md
├── sandboxApp
    └── src
    │   └── main
    │       └── scala
    │           └── SandboxApp.scala
└── README.md


/.gitattributes:
--------------------------------------------------------------------------------
1 | *.enc binary


--------------------------------------------------------------------------------
/project/build.properties:
--------------------------------------------------------------------------------
1 | sbt.version = 1.3.10


--------------------------------------------------------------------------------
/pubring.gpg.enc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/alexandrnikitin/bloom-filter-scala/HEAD/pubring.gpg.enc


--------------------------------------------------------------------------------
/secring.gpg.enc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/alexandrnikitin/bloom-filter-scala/HEAD/secring.gpg.enc


--------------------------------------------------------------------------------
/project/BuildKeys.scala:
--------------------------------------------------------------------------------
1 | import sbt.TaskKey
2 | 
3 | object BuildKeys {
4 |   lazy val testAll = TaskKey[Unit]("test-all")
5 | }
6 | 


--------------------------------------------------------------------------------
/project/Configs.scala:
--------------------------------------------------------------------------------
1 | import sbt._
2 | 
3 | object Configs {
4 |   val EndToEndTest = config("endToEnd") extend Runtime
5 |   val all = EndToEndTest
6 | }


--------------------------------------------------------------------------------
/examples/src/main/scala/Main.scala:
--------------------------------------------------------------------------------
 1 | 
 2 | import bloomfilter.mutable.BloomFilter
 3 | 
 4 | object Main extends App {
 5 |   val expectedElements = 1000
 6 |   val falsePositiveRate: Double = 0.1
 7 |   val bf = BloomFilter[String](expectedElements, falsePositiveRate)
 8 |   bf.add("some string")
 9 |   bf.mightContain("some string")
10 |   bf.dispose()
11 | }
12 | 


--------------------------------------------------------------------------------
/project/plugins.sbt:
--------------------------------------------------------------------------------
 1 | logLevel := Level.Warn
 2 | 
 3 | addSbtPlugin("org.scoverage" % "sbt-scoverage" % "1.6.1")
 4 | 
 5 | addSbtPlugin("pl.project13.scala" % "sbt-jmh" % "0.3.7")
 6 | 
 7 | addSbtPlugin("org.xerial.sbt" % "sbt-sonatype" % "3.9.2")
 8 | 
 9 | addSbtPlugin("io.crashbox" % "sbt-gpg" % "0.2.1")
10 | 
11 | addSbtPlugin("com.typesafe.sbt" % "sbt-native-packager" % "1.7.0")
12 | 
13 | addSbtPlugin("com.dwijnand" % "sbt-dynver" % "4.0.0")
14 | 


--------------------------------------------------------------------------------
/benchmarks/src/main/scala/bloomfilter/mutable/AddLongItemBenchmark.scala:
--------------------------------------------------------------------------------
 1 | package bloomfilter.mutable
 2 | 
 3 | import org.openjdk.jmh.annotations.{Benchmark, Scope, State}
 4 | 
 5 | @State(Scope.Benchmark)
 6 | class AddLongItemBenchmark {
 7 | 
 8 |   private val itemsExpected = 1000000L
 9 |   private val falsePositiveRate = 0.01
10 | 
11 |   private val bf = BloomFilter[Long](itemsExpected, falsePositiveRate)
12 | 
13 |   @Benchmark
14 |   def my() = {
15 |     bf.add(1L)
16 |   }
17 | 
18 | 
19 | }


--------------------------------------------------------------------------------
/benchmarks/src/main/scala/bloomfilter/mutable/_128bit/AddLongItemBenchmark.scala:
--------------------------------------------------------------------------------
 1 | package bloomfilter.mutable._128bit
 2 | 
 3 | import org.openjdk.jmh.annotations.{Benchmark, Scope, State}
 4 | 
 5 | @State(Scope.Benchmark)
 6 | class AddLongItemBenchmark {
 7 | 
 8 |   private val itemsExpected = 1000000L
 9 |   private val falsePositiveRate = 0.01
10 | 
11 |   private val bf = BloomFilter[Long](itemsExpected, falsePositiveRate)
12 | 
13 |   @Benchmark
14 |   def my() = {
15 |     bf.add(1L)
16 |   }
17 | 
18 | 
19 | }


--------------------------------------------------------------------------------
/benchmarks/src/main/scala/alternatives/breeze/AddLongItemBenchmark.scala:
--------------------------------------------------------------------------------
 1 | package alternatives.breeze
 2 | 
 3 | import breeze.util.BloomFilter
 4 | import org.openjdk.jmh.annotations.{Benchmark, Scope, State}
 5 | 
 6 | @State(Scope.Benchmark)
 7 | class AddLongItemBenchmark {
 8 | 
 9 |   private val itemsExpected = 1000000L
10 |   private val falsePositiveRate = 0.01
11 | 
12 |   private val bf = BloomFilter.optimallySized[Long](itemsExpected.toDouble, falsePositiveRate)
13 | 
14 |   @Benchmark
15 |   def breeze() = {
16 |     bf.+=(1L)
17 |   }
18 | 
19 | 
20 | }


--------------------------------------------------------------------------------
/benchmarks/src/main/scala/alternatives/guava/AddLongItemBenchmark.scala:
--------------------------------------------------------------------------------
 1 | package alternatives.guava
 2 | 
 3 | import com.google.common.hash.{BloomFilter, Funnels}
 4 | import org.openjdk.jmh.annotations.{Benchmark, Scope, State}
 5 | 
 6 | @State(Scope.Benchmark)
 7 | class AddLongItemBenchmark {
 8 | 
 9 |   private val itemsExpected = 1000000L
10 |   private val falsePositiveRate = 0.01
11 | 
12 |   private val bf = BloomFilter.create[java.lang.Long](Funnels.longFunnel(), itemsExpected, falsePositiveRate)
13 | 
14 |   @Benchmark
15 |   def guava() = {
16 |     bf.put(1L)
17 |   }
18 | 
19 | 
20 | }


--------------------------------------------------------------------------------
/benchmarks/src/main/scala/bloomfilter/hashing/MurmurHash3GenericBenchmark.scala:
--------------------------------------------------------------------------------
 1 | package bloomfilter.hashing
 2 | 
 3 | import org.openjdk.jmh.annotations.{Benchmark, Scope, State}
 4 | import sandbox.hashing.MurmurHash3
 5 | 
 6 | @State(Scope.Benchmark)
 7 | class MurmurHash3GenericBenchmark {
 8 | 
 9 |   val key = Range(0, 64).map(_.toByte).toArray
10 | 
11 |   @Benchmark
12 |   def scalaVersion() = {
13 |     MurmurHash3.murmurhash3_x64_128(key, 0, key.length, 0)
14 |   }
15 | 
16 |   @Benchmark
17 |   def genericVersion() = {
18 |     MurmurHash3Generic.murmurhash3_x64_128(key, 0, key.length, 0)
19 |   }
20 | }
21 | 


--------------------------------------------------------------------------------
/examples/src/main/java/MainJava.java:
--------------------------------------------------------------------------------
 1 | import bloomfilter.CanGenerateHashFrom;
 2 | import bloomfilter.mutable.BloomFilter;
 3 | 
 4 | public class MainJava {
 5 |     public static void main(String[] args) {
 6 |         long expectedElements = 10000000;
 7 |         double falsePositiveRate = 0.1;
 8 |         BloomFilter<byte[]> bf = BloomFilter.apply(
 9 |                 expectedElements,
10 |                 falsePositiveRate,
11 |                 CanGenerateHashFrom.CanGenerateHashFromByteArray$.MODULE$);
12 | 
13 |         byte[] element = new byte[100];
14 |         bf.add(element);
15 |         bf.mightContain(element);
16 |         bf.dispose();
17 |     }
18 | }
19 | 


--------------------------------------------------------------------------------
/bloom-filter/src/main/scala/bloomfilter/util/Unsafe.scala:
--------------------------------------------------------------------------------
 1 | package bloomfilter.util
 2 | 
 3 | import sun.misc.{Unsafe => JUnsafe}
 4 | 
 5 | import scala.language.postfixOps
 6 | import scala.util.Try
 7 | 
 8 | object Unsafe {
 9 |   val unsafe: JUnsafe = Try {
10 |     classOf[JUnsafe]
11 |       .getDeclaredFields
12 |       .find { field =>
13 |         field.getType == classOf[JUnsafe]
14 |       }
15 |       .map { field =>
16 |         field.setAccessible(true)
17 |         field.get(null).asInstanceOf[JUnsafe]
18 |       }
19 |       .getOrElse(throw new IllegalStateException("Can't find instance of sun.misc.Unsafe"))
20 |   } recover {
21 |     case th: Throwable => throw new ExceptionInInitializerError(th)
22 |   } get
23 | 
24 | }
25 | 


--------------------------------------------------------------------------------
/tests/src/endToEnd/scala/endToEnd/bloomfilter/mutable/SampleUsageSpec.scala:
--------------------------------------------------------------------------------
 1 | package endToEnd.bloomfilter.mutable
 2 | 
 3 | import bloomfilter.mutable.BloomFilter
 4 | import org.scalatest.{FreeSpec, Matchers}
 5 | 
 6 | class SampleUsageSpec extends FreeSpec with Matchers {
 7 |   "Create, put and check " in {
 8 |     val bloomFilter = BloomFilter[String](1000, 0.01)
 9 | 
10 |     bloomFilter.add("")
11 |     bloomFilter.add("Hello!")
12 |     bloomFilter.add("8f16c986824e40e7885a032ddd29a7d3")
13 | 
14 |     bloomFilter.mightContain("") shouldBe true
15 |     bloomFilter.mightContain("Hello!") shouldBe true
16 |     bloomFilter.mightContain("8f16c986824e40e7885a032ddd29a7d3") shouldBe true
17 | 
18 |     bloomFilter.dispose()
19 |   }
20 | }
21 | 


--------------------------------------------------------------------------------
/tests/src/endToEnd/scala/endToEnd/bloomfilter/mutable/_128bit/SampleUsageSpec.scala:
--------------------------------------------------------------------------------
 1 | package endToEnd.bloomfilter.mutable._128bit
 2 | 
 3 | import bloomfilter.mutable._128bit.BloomFilter
 4 | import org.scalatest.{FreeSpec, Matchers}
 5 | 
 6 | class SampleUsageSpec extends FreeSpec with Matchers {
 7 |   "Create, put and check " in {
 8 |     val bloomFilter = BloomFilter[String](1000, 0.01)
 9 | 
10 |     bloomFilter.add("")
11 |     bloomFilter.add("Hello!")
12 |     bloomFilter.add("8f16c986824e40e7885a032ddd29a7d3")
13 | 
14 |     bloomFilter.mightContain("") shouldBe true
15 |     bloomFilter.mightContain("Hello!") shouldBe true
16 |     bloomFilter.mightContain("8f16c986824e40e7885a032ddd29a7d3") shouldBe true
17 | 
18 |     bloomFilter.dispose()
19 |   }
20 | }
21 | 


--------------------------------------------------------------------------------
/tests/src/test/scala/tests/bloomfilter/hashing/MurmurHash3ScalaVsJavaSpec.scala:
--------------------------------------------------------------------------------
 1 | package tests.bloomfilter.hashing
 2 | 
 3 | import bloomfilter.hashing.MurmurHash3Generic
 4 | import org.scalacheck.Prop.forAll
 5 | import org.scalacheck.Properties
 6 | import sandbox.hashing.YonikMurmurHash3
 7 | import sandbox.hashing.YonikMurmurHash3.LongPair
 8 | 
 9 | object MurmurHash3ScalaVsJavaSpec extends Properties("MurmurHash3ScalaVsJavaSpec") {
10 | 
11 |   property("murmurhash3_x64_128") = forAll { (key: Array[Byte]) =>
12 |     val tuple = MurmurHash3Generic.murmurhash3_x64_128(key, 0, key.length, 0)
13 |     val pair = new LongPair
14 |     YonikMurmurHash3.murmurhash3_x64_128(key, 0, key.length, 0, pair)
15 |     pair.val1 == tuple._1 && pair.val2 == tuple._2
16 |   }
17 | 
18 | }
19 | 


--------------------------------------------------------------------------------
/sandbox/src/main/scala/sandbox/bloomfilter/mutable/ChronicleBitArray.scala:
--------------------------------------------------------------------------------
 1 | package sandbox.bloomfilter.mutable
 2 | 
 3 | import net.openhft.chronicle.bytes.NativeBytesStore
 4 | 
 5 | import bloomfilter.util.Unsafe.unsafe
 6 | 
 7 | class ChronicleBitArray(numberOfBits: Long) {
 8 |   private val indices = math.ceil(numberOfBits.toDouble / 64).toLong
 9 |   private val ptr = unsafe.allocateMemory(indices)
10 |   private val bytes = new NativeBytesStore(ptr, indices)
11 | 
12 |   def get(index: Long): Boolean = {
13 |     (bytes.readLong(index >>> 6) & (1L << index)) != 0
14 |   }
15 | 
16 |   def set(index: Long): Unit = {
17 |     val offset = index >>> 6
18 |     val long = bytes.readLong(offset)
19 |     val _ = bytes.writeLong(offset, long | (1L << index))
20 |   }
21 | 
22 |   def getBitCount: Long = 0
23 | }
24 | 


--------------------------------------------------------------------------------
/benchmarks/src/main/scala/bloomfilter/mutable/StringItemBenchmark.scala:
--------------------------------------------------------------------------------
 1 | package bloomfilter.mutable
 2 | 
 3 | import org.openjdk.jmh.annotations.{Benchmark, Param, Scope, State}
 4 | 
 5 | import scala.util.Random
 6 | 
 7 | @State(Scope.Benchmark)
 8 | class StringItemBenchmark {
 9 | 
10 |   private val itemsExpected = 100000000L
11 |   private val falsePositiveRate = 0.01
12 |   private val random = new Random()
13 | 
14 |   private val bf = BloomFilter[String](itemsExpected, falsePositiveRate)
15 | 
16 |   @Param(Array("1024"))
17 |   var length: Int = _
18 | 
19 |   private val item = random.nextString(length)
20 |   bf.add(item)
21 | 
22 |   @Benchmark
23 |   def myPut(): Unit = {
24 |     bf.add(item)
25 |   }
26 | 
27 |   @Benchmark
28 |   def myGet(): Unit = {
29 |     bf.mightContain(item)
30 |   }
31 | 
32 | }
33 | 


--------------------------------------------------------------------------------
/benchmarks/src/main/scala/bloomfilter/mutable/_128bit/StringItemBenchmark.scala:
--------------------------------------------------------------------------------
 1 | package bloomfilter.mutable._128bit
 2 | 
 3 | import org.openjdk.jmh.annotations.{Benchmark, Param, Scope, State}
 4 | 
 5 | import scala.util.Random
 6 | 
 7 | @State(Scope.Benchmark)
 8 | class StringItemBenchmark {
 9 | 
10 |   private val itemsExpected = 100000000L
11 |   private val falsePositiveRate = 0.01
12 |   private val random = new Random()
13 | 
14 |   private val bf = BloomFilter[String](itemsExpected, falsePositiveRate)
15 | 
16 |   @Param(Array("1024"))
17 |   var length: Int = _
18 | 
19 |   private val item = random.nextString(length)
20 |   bf.add(item)
21 | 
22 |   @Benchmark
23 |   def myPut(): Unit = {
24 |     bf.add(item)
25 |   }
26 | 
27 |   @Benchmark
28 |   def myGet(): Unit = {
29 |     bf.mightContain(item)
30 |   }
31 | 
32 | }
33 | 


--------------------------------------------------------------------------------
/benchmarks/src/main/scala/bloomfilter/mutable/ArrayByteItemBenchmark.scala:
--------------------------------------------------------------------------------
 1 | package bloomfilter.mutable
 2 | 
 3 | import org.openjdk.jmh.annotations.{Benchmark, Param, Scope, State}
 4 | 
 5 | import scala.util.Random
 6 | 
 7 | @State(Scope.Benchmark)
 8 | class ArrayByteItemBenchmark {
 9 | 
10 |   private val itemsExpected = 1000000L
11 |   private val falsePositiveRate = 0.01
12 |   private val random = new Random()
13 | 
14 |   private val bf = BloomFilter[Array[Byte]](itemsExpected, falsePositiveRate)
15 | 
16 |   @Param(Array("1024"))
17 |   var length: Int = _
18 | 
19 |   private val item = new Array[Byte](length)
20 |   random.nextBytes(item)
21 |   bf.add(item)
22 | 
23 |   @Benchmark
24 |   def myPut(): Unit = {
25 |     bf.add(item)
26 |   }
27 | 
28 |   @Benchmark
29 |   def myGet(): Unit = {
30 |     bf.mightContain(item)
31 |   }
32 | 
33 | }
34 | 


--------------------------------------------------------------------------------
/benchmarks/src/main/scala/bloomfilter/mutable/_128bit/ArrayByteItemBenchmark.scala:
--------------------------------------------------------------------------------
 1 | package bloomfilter.mutable._128bit
 2 | 
 3 | import org.openjdk.jmh.annotations.{Benchmark, Param, Scope, State}
 4 | 
 5 | import scala.util.Random
 6 | 
 7 | @State(Scope.Benchmark)
 8 | class ArrayByteItemBenchmark {
 9 | 
10 |   private val itemsExpected = 1000000L
11 |   private val falsePositiveRate = 0.01
12 |   private val random = new Random()
13 | 
14 |   private val bf = BloomFilter[Array[Byte]](itemsExpected, falsePositiveRate)
15 | 
16 |   @Param(Array("1024"))
17 |   var length: Int = _
18 | 
19 |   private val item = new Array[Byte](length)
20 |   random.nextBytes(item)
21 |   bf.add(item)
22 | 
23 |   @Benchmark
24 |   def myPut(): Unit = {
25 |     bf.add(item)
26 |   }
27 | 
28 |   @Benchmark
29 |   def myGet(): Unit = {
30 |     bf.mightContain(item)
31 |   }
32 | 
33 | }
34 | 


--------------------------------------------------------------------------------
/benchmarks/src/main/scala/alternatives/breeze/StringItemBenchmark.scala:
--------------------------------------------------------------------------------
 1 | package alternatives.breeze
 2 | 
 3 | import breeze.util.BloomFilter
 4 | import org.openjdk.jmh.annotations.{Benchmark, Param, Scope, State}
 5 | 
 6 | import scala.util.Random
 7 | 
 8 | @State(Scope.Benchmark)
 9 | class StringItemBenchmark {
10 | 
11 |   private val itemsExpected = 100000000L
12 |   private val falsePositiveRate = 0.01
13 |   private val random = new Random()
14 | 
15 |   private val bf = BloomFilter.optimallySized[String](itemsExpected.toDouble, falsePositiveRate)
16 | 
17 |   @Param(Array("1024"))
18 |   var length: Int = _
19 | 
20 |   private val item = random.nextString(length)
21 |   bf.+=(item)
22 | 
23 |   @Benchmark
24 |   def breezePut(): Unit = {
25 |     bf.+=(item)
26 |   }
27 | 
28 |   @Benchmark
29 |   def breezeGet(): Unit = {
30 |     bf.contains(item)
31 |   }
32 | 
33 | }
34 | 


--------------------------------------------------------------------------------
/benchmarks/src/main/scala/alternatives/algebird/StringItemBenchmark.scala:
--------------------------------------------------------------------------------
 1 | package alternatives.algebird
 2 | 
 3 | import com.twitter.algebird.BloomFilter
 4 | import org.openjdk.jmh.annotations.{Benchmark, Param, Scope, State}
 5 | 
 6 | import scala.util.Random
 7 | 
 8 | @State(Scope.Benchmark)
 9 | class StringItemBenchmark {
10 | 
11 |   private val itemsExpected = 100000000L
12 |   private val falsePositiveRate = 0.01
13 |   private val random = new Random()
14 | 
15 |   private var bf = BloomFilter(itemsExpected.toInt, falsePositiveRate, 0).create("")
16 | 
17 |   @Param(Array("1024"))
18 |   var length: Int = _
19 | 
20 |   private val item = random.nextString(length)
21 |   bf = bf.+(item)
22 | 
23 |   @Benchmark
24 |   def algebirdPut(): Unit = {
25 |     bf.+(item)
26 |   }
27 | 
28 |   @Benchmark
29 |   def algebirdGet(): Unit = {
30 |     bf.contains(item)
31 |   }
32 | 
33 | }
34 | 


--------------------------------------------------------------------------------
/benchmarks/src/main/scala/alternatives/breeze/ArrayByteItemBenchmark.scala:
--------------------------------------------------------------------------------
 1 | package alternatives.breeze
 2 | 
 3 | import breeze.util.BloomFilter
 4 | import org.openjdk.jmh.annotations.{Benchmark, Param, Scope, State}
 5 | 
 6 | import scala.util.Random
 7 | 
 8 | @State(Scope.Benchmark)
 9 | class ArrayByteItemBenchmark {
10 | 
11 |   private val itemsExpected = 1000000L
12 |   private val falsePositiveRate = 0.01
13 |   private val random = new Random()
14 | 
15 |   private val bf = BloomFilter.optimallySized[Array[Byte]](itemsExpected.toDouble, falsePositiveRate)
16 | 
17 |   @Param(Array("1024"))
18 |   var length: Int = _
19 | 
20 |   private val item = new Array[Byte](length)
21 |   random.nextBytes(item)
22 |   bf.+=(item)
23 | 
24 |   @Benchmark
25 |   def breezePut(): Unit = {
26 |     bf.+=(item)
27 |   }
28 | 
29 |   @Benchmark
30 |   def breezeGet(): Unit = {
31 |     bf.contains(item)
32 |   }
33 | 
34 | }
35 | 


--------------------------------------------------------------------------------
/benchmarks/src/main/scala/alternatives/guava/ArrayByteItemBenchmark.scala:
--------------------------------------------------------------------------------
 1 | package alternatives.guava
 2 | 
 3 | import com.google.common.hash.{BloomFilter, Funnels}
 4 | import org.openjdk.jmh.annotations.{Benchmark, Param, Scope, State}
 5 | 
 6 | import scala.util.Random
 7 | 
 8 | @State(Scope.Benchmark)
 9 | class ArrayByteItemBenchmark {
10 | 
11 |   private val itemsExpected = 1000000L
12 |   private val falsePositiveRate = 0.01
13 |   private val random = new Random()
14 | 
15 |   private val bf = BloomFilter.create[Array[Byte]](Funnels.byteArrayFunnel(), itemsExpected, falsePositiveRate)
16 | 
17 |   @Param(Array("1024"))
18 |   var length: Int = _
19 | 
20 |   private val item = new Array[Byte](length)
21 |   random.nextBytes(item)
22 |   bf.put(item)
23 | 
24 |   @Benchmark
25 |   def guavaPut(): Boolean = {
26 |     bf.put(item)
27 |   }
28 | 
29 |   @Benchmark
30 |   def guava(): Boolean = {
31 |     bf.mightContain(item)
32 |   }
33 | 
34 | }
35 | 


--------------------------------------------------------------------------------
/benchmarks/src/main/scala/alternatives/stream/StringItemBenchmark.scala:
--------------------------------------------------------------------------------
 1 | package alternatives.stream
 2 | 
 3 | import com.clearspring.analytics.stream.membership.BloomFilter
 4 | import org.openjdk.jmh.annotations.{Benchmark, Param, Scope, State}
 5 | import org.openjdk.jmh.infra.Blackhole
 6 | 
 7 | import scala.util.Random
 8 | 
 9 | @State(Scope.Benchmark)
10 | class StringItemBenchmark {
11 | 
12 |   private val itemsExpected = 100000000L
13 |   private val falsePositiveRate = 0.01
14 |   private val random = new Random()
15 | 
16 |   private val bf = new BloomFilter(itemsExpected.toInt, falsePositiveRate)
17 | 
18 |   @Param(Array("1024"))
19 |   var length: Int = _
20 | 
21 |   private val item = random.nextString(length)
22 |   bf.add(item)
23 | 
24 |   @Benchmark
25 |   def streamPut(): Unit = {
26 |     bf.add(item)
27 |   }
28 | 
29 |   @Benchmark
30 |   def streamGet(bh: Blackhole): Unit = {
31 |     bh.consume(bf.isPresent(item))
32 |   }
33 | 
34 | }
35 | 


--------------------------------------------------------------------------------
/benchmarks/src/main/scala/alternatives/guava/StringItemBenchmark.scala:
--------------------------------------------------------------------------------
 1 | package alternatives.guava
 2 | 
 3 | import java.nio.charset.Charset
 4 | 
 5 | import com.google.common.hash.{BloomFilter, Funnels}
 6 | import org.openjdk.jmh.annotations.{Benchmark, Param, Scope, State}
 7 | 
 8 | import scala.util.Random
 9 | 
10 | @State(Scope.Benchmark)
11 | class StringItemBenchmark {
12 | 
13 |   private val itemsExpected = 100000000L
14 |   private val falsePositiveRate = 0.01
15 |   private val random = new Random()
16 | 
17 |   private val bf = BloomFilter.create[String](Funnels.stringFunnel(Charset.forName("UTF-8")), itemsExpected, falsePositiveRate)
18 | 
19 |   @Param(Array("1024"))
20 |   var length: Int = _
21 | 
22 |   private val item = random.nextString(length)
23 |   bf.put(item)
24 | 
25 |   @Benchmark
26 |   def guavaPut(): Unit = {
27 |     bf.put(item)
28 |   }
29 | 
30 |   @Benchmark
31 |   def guavaGet(): Unit = {
32 |     bf.mightContain(item)
33 |   }
34 | 
35 | }
36 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | The MIT License (MIT)
 2 | 
 3 | Copyright (c) 2016 Alexandr Nikitin
 4 | 
 5 | Permission is hereby granted, free of charge, to any person obtaining a copy
 6 | of this software and associated documentation files (the "Software"), to deal
 7 | in the Software without restriction, including without limitation the rights
 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 


--------------------------------------------------------------------------------
/project/Testing.scala:
--------------------------------------------------------------------------------
 1 | import sbt.Keys._
 2 | import sbt._
 3 | import BuildKeys._
 4 | import scoverage.ScoverageKeys._
 5 | 
 6 | object Testing {
 7 | 
 8 |   import Configs._
 9 | 
10 |   private lazy val testSettings = Seq(
11 |     Test / fork := false,
12 |     Test / parallelExecution := false,
13 |     Test / testOptions += Tests.Argument(TestFrameworks.ScalaCheck, "-verbosity", "2")
14 |   )
15 | 
16 |   private lazy val e2eSettings = inConfig(EndToEndTest)(Defaults.testSettings) ++ Seq(
17 |     EndToEndTest / fork := false,
18 |     EndToEndTest / parallelExecution := false,
19 |     EndToEndTest / scalaSource := baseDirectory.value / "src/endToEnd/scala"
20 |   )
21 | 
22 |   private lazy val testAllSettings = Seq(
23 |     testAll :=(),
24 |     testAll := testAll.dependsOn(EndToEndTest / test),
25 |     testAll := testAll.dependsOn(Test / test)
26 |   )
27 | 
28 |   private lazy val scoverageSettings = Seq(
29 |     coverageMinimum := 60,
30 |     coverageFailOnMinimum := false,
31 |     coverageHighlighting := true,
32 |     coverageExcludedPackages := ".*Benchmark"
33 |   )
34 | 
35 |   lazy val settings = testSettings ++ e2eSettings ++ testAllSettings ++ scoverageSettings
36 | }
37 | 


--------------------------------------------------------------------------------
/tests/src/test/scala/tests/bloomfilter/CanGetDataFromSpec.scala:
--------------------------------------------------------------------------------
 1 | package tests.bloomfilter
 2 | 
 3 | import bloomfilter.CanGetDataFrom.CanGetDataFromArrayChar
 4 | import org.scalatest.{FreeSpec, Matchers}
 5 | 
 6 | class CanGetDataFromSpec extends FreeSpec with Matchers {
 7 |   "CanGetDataFromArrayChar" in {
 8 |     CanGetDataFromArrayChar.getByte(Array[Char]('a'), 0) shouldEqual 97.toByte
 9 |     CanGetDataFromArrayChar.getByte(Array[Char]('a'), 1) shouldEqual 0.toByte
10 | 
11 |     CanGetDataFromArrayChar.getByte(Array[Char]('a', 'b'), 0) shouldEqual 97.toByte
12 |     CanGetDataFromArrayChar.getByte(Array[Char]('a', 'b'), 1) shouldEqual 0.toByte
13 |     CanGetDataFromArrayChar.getByte(Array[Char]('a', 'b'), 2) shouldEqual 98.toByte
14 |     CanGetDataFromArrayChar.getByte(Array[Char]('a', 'b'), 3) shouldEqual 0.toByte
15 | 
16 |     CanGetDataFromArrayChar.getLong(Array[Char]('a', 'b', 'c', 'd'), 0) shouldEqual
17 |         (0.toLong << 56) |
18 |             (('d'.toByte & 0xffL) << 48) |
19 |             ((0 & 0xffL) << 40) |
20 |             (('c'.toByte & 0xffL) << 32) |
21 |             ((0 & 0xffL) << 24) |
22 |             (('b' & 0xffL) << 16) |
23 |             ((0 & 0xffL) << 8) |
24 |             'a' & 0xffL
25 | 
26 |   }
27 | }
28 | 


--------------------------------------------------------------------------------
/sandbox/src/main/scala/sandbox/bloomfilter/mutable/BitArray.scala:
--------------------------------------------------------------------------------
 1 | package sandbox.bloomfilter.mutable
 2 | 
 3 | class BitArray(val numberOfBits: Long) {
 4 |   // TODO check cast
 5 |   private val bits = new Array[Long](math.ceil(numberOfBits.toDouble / 64).toInt)
 6 | 
 7 |   def get(index: Long): Boolean = {
 8 |     (bits((index >>> 6).toInt) & (1L << index)) != 0
 9 |   }
10 | 
11 |   def set(index: Long): Unit = {
12 |     // TODO improve
13 |     if (!get(index)) {
14 |       bits((index >>> 6).toInt) |= (1L << index)
15 |     }
16 |   }
17 | 
18 |   def combine(that: BitArray, combiner: (Byte, Byte) => Byte): BitArray = {
19 |     val result = new BitArray(this.numberOfBits)
20 |     result
21 |   }
22 | 
23 |   def |(that: BitArray): BitArray = {
24 |     require(this.numberOfBits == that.numberOfBits, "Bitwise OR works only on arrays with the same number of bits")
25 | 
26 |     combine(that, (b1: Byte, b2: Byte) => (b1 | b2).toByte)
27 |   }
28 | 
29 |   def &(that: BitArray): BitArray = {
30 |     require(this.numberOfBits == that.numberOfBits, "Bitwise AND works only on arrays with the same number of bits")
31 | 
32 |     combine(that, (b1: Byte, b2: Byte) => (b1 & b2).toByte)
33 |   }
34 | 
35 |   def getBitCount: Long = {
36 |     throw new NotImplementedError("Not implemented yet")
37 |   }
38 | }
39 | 


--------------------------------------------------------------------------------
/benchmarks/src/main/scala/bloomfilter/UnsafeBitArrayBenchmark.scala:
--------------------------------------------------------------------------------
 1 | package bloomfilter
 2 | 
 3 | import java.util.BitSet
 4 | 
 5 | import bloomfilter.mutable.UnsafeBitArray
 6 | import org.openjdk.jmh.annotations.{Benchmark, Scope, State}
 7 | import sandbox.bloomfilter.mutable.ChronicleBitArray
 8 | 
 9 | @State(Scope.Benchmark)
10 | class UnsafeBitArrayBenchmark {
11 | 
12 |   private val numberOfBits = Int.MaxValue
13 | 
14 |   val unsafeBits = new UnsafeBitArray(numberOfBits.toLong)
15 |   val bitsSet = new BitSet(numberOfBits)
16 |   val chronicle = new ChronicleBitArray(numberOfBits.toLong)
17 | 
18 |   @Benchmark
19 |   def getUnsafe() = {
20 |     unsafeBits.get(1)
21 |     unsafeBits.get(10)
22 |     unsafeBits.get(100)
23 |     unsafeBits.get(1000)
24 |     unsafeBits.get(10000)
25 |     unsafeBits.get(100000)
26 |     unsafeBits.get(1000000)
27 |   }
28 | 
29 |   @Benchmark
30 |   def getBitSet() = {
31 |     bitsSet.get(1)
32 |     bitsSet.get(10)
33 |     bitsSet.get(100)
34 |     bitsSet.get(1000)
35 |     bitsSet.get(10000)
36 |     bitsSet.get(100000)
37 |     bitsSet.get(1000000)
38 |   }
39 | 
40 |   @Benchmark
41 |   def getChronicle() = {
42 |     chronicle.get(1)
43 |     chronicle.get(10)
44 |     chronicle.get(100)
45 |     chronicle.get(1000)
46 |     chronicle.get(10000)
47 |     chronicle.get(100000)
48 |     chronicle.get(1000000)
49 |   }
50 | 
51 | 
52 | }
53 | 


--------------------------------------------------------------------------------
/benchmarks/src/main/scala/bloomfilter/mutable/StringItemCuckooBenchmark.scala:
--------------------------------------------------------------------------------
 1 | package bloomfilter.mutable
 2 | 
 3 | import java.util.concurrent.TimeUnit
 4 | 
 5 | import org.openjdk.jmh.annotations.{BenchmarkMode, OperationsPerInvocation, OutputTimeUnit, _}
 6 | 
 7 | import scala.util.Random
 8 | 
 9 | @State(Scope.Benchmark)
10 | class StringItemCuckooBenchmark {
11 | 
12 |   private val itemsExpected = 100000000L
13 |   private val random = new Random()
14 | 
15 |   private var bf: CuckooFilter[String] = _
16 | 
17 |   @Param(Array("1024"))
18 |   var length: Int = _
19 | 
20 |   private val items = new Array[String](10000)
21 |   var i = 0
22 |   while (i < items.length) {
23 |     items(i) = random.nextString(length)
24 |     i += 1
25 |   }
26 | 
27 |   @Setup(Level.Iteration)
28 |   def setup(): Unit = {
29 |     bf = CuckooFilter[String](itemsExpected)
30 |   }
31 | 
32 |   @Benchmark
33 |   @BenchmarkMode(Array(Mode.SingleShotTime))
34 |   @OutputTimeUnit(TimeUnit.NANOSECONDS)
35 |   @OperationsPerInvocation(10000)
36 |   def myPut(): Unit = {
37 |     var i = 0
38 |     while (i < items.length) {
39 |       bf.add(items(i))
40 |       i += 1
41 |     }
42 |   }
43 | 
44 |   @Benchmark
45 |   @BenchmarkMode(Array(Mode.Throughput))
46 |   @OperationsPerInvocation(10000)
47 |   def myGet(): Unit = {
48 |     var i = 0
49 |     while (i < items.length) {
50 |       bf.mightContain(items(i))
51 |       i += 1
52 |     }
53 |   }
54 | 
55 | }
56 | 


--------------------------------------------------------------------------------
/bloom-filter/src/main/scala/bloomfilter/CanGetDataFrom.scala:
--------------------------------------------------------------------------------
 1 | package bloomfilter
 2 | 
 3 | import bloomfilter.util.Unsafe.unsafe
 4 | 
 5 | trait CanGetDataFrom[-From] {
 6 |   def getLong(from: From, offset: Int): Long
 7 |   def getByte(from: From, offset: Int): Byte
 8 | }
 9 | 
10 | object CanGetDataFrom {
11 | 
12 |   implicit case object CanGetDataFromByteArray extends CanGetDataFrom[Array[Byte]] {
13 | 
14 |     override def getLong(buf: Array[Byte], offset: Int): Long = {
15 |       (buf(offset + 7).toLong << 56) |
16 |           ((buf(offset + 6) & 0xffL) << 48) |
17 |           ((buf(offset + 5) & 0xffL) << 40) |
18 |           ((buf(offset + 4) & 0xffL) << 32) |
19 |           ((buf(offset + 3) & 0xffL) << 24) |
20 |           ((buf(offset + 2) & 0xffL) << 16) |
21 |           ((buf(offset + 1) & 0xffL) << 8) |
22 |           buf(offset) & 0xffL
23 |     }
24 | 
25 |     override def getByte(from: Array[Byte], offset: Int): Byte = {
26 |       from(offset)
27 |     }
28 |   }
29 | 
30 |   implicit case object CanGetDataFromArrayChar extends CanGetDataFrom[Array[Char]] {
31 |     private val arrayCharOffset = unsafe.arrayBaseOffset(classOf[Array[Char]])
32 | 
33 |     override def getLong(from: Array[Char], offset: Int): Long = {
34 |       unsafe.getLong(from, arrayCharOffset + offset.toLong)
35 |     }
36 | 
37 |     override def getByte(from: Array[Char], offset: Int): Byte = {
38 |       unsafe.getByte(from, arrayCharOffset + offset.toLong)
39 |     }
40 |   }
41 | }
42 | 


--------------------------------------------------------------------------------
/.travis.yml:
--------------------------------------------------------------------------------
 1 | language: scala
 2 | matrix:
 3 |   include:
 4 |   - scala: 2.10.7
 5 |     jdk: openjdk8
 6 |     dist: trusty
 7 |     env: PUBLISH_FROM_THIS_BUILD=true
 8 |   - scala: 2.11.12
 9 |     jdk: openjdk8
10 |     dist: trusty
11 |     env: PUBLISH_FROM_THIS_BUILD=true
12 |   - scala: 2.12.11
13 |     jdk: openjdk8
14 |     dist: xenial
15 |     env: PUBLISH_FROM_THIS_BUILD=true
16 |   - scala: 2.13.1
17 |     jdk: openjdk8
18 |     dist: xenial
19 |     env: PUBLISH_FROM_THIS_BUILD=true
20 |   - scala: 2.13.1
21 |     jdk: openjdk10
22 |     dist: xenial
23 |   - scala: 2.13.1
24 |     jdk: openjdk11
25 |     dist: xenial
26 | script:
27 | - sbt ++$TRAVIS_SCALA_VERSION clean test endToEnd:test package
28 | sudo: false
29 | cache:
30 |   directories:
31 |   - "$HOME/.m2"
32 |   - "$HOME/.ivy2/cache"
33 |   - "$HOME/.sbt"
34 | git:
35 |   depth: 1
36 | before_cache:
37 |   # Tricks to avoid unnecessary cache updates
38 |   - find $HOME/.ivy2 -name "ivydata-*.properties" -delete
39 |   - find $HOME/.sbt -name "*.lock" -delete
40 | before_install:
41 | - "if [[ $TRAVIS_TAG == v[0-9.]* ]]; then openssl aes-256-cbc -pass pass:$ENCRYPTION_PASSWORD -in secring.gpg.enc -out local.secring.gpg -d; fi"
42 | - "if [[ $TRAVIS_TAG == v[0-9.]* ]]; then openssl aes-256-cbc -pass pass:$ENCRYPTION_PASSWORD -in pubring.gpg.enc -out local.pubring.gpg -d; fi"
43 | after_success:
44 | - "[[ $TRAVIS_TAG == v[0-9.]* ]] && [[ $PUBLISH_FROM_THIS_BUILD == true ]] && { gpg --import local.secring.gpg && sbt ++$TRAVIS_SCALA_VERSION publish sonatypeBundleRelease; };"
45 | 


--------------------------------------------------------------------------------
/project/Dependencies.scala:
--------------------------------------------------------------------------------
 1 | import sbt.Keys._
 2 | import sbt._
 3 | 
 4 | object Dependencies {
 5 |   private val scalatest = "org.scalatest" %% "scalatest" % "3.1.1" % "test;endToEnd"
 6 |   private val scalacheck = "org.scalacheck" %% "scalacheck" % "1.14.3" % "test"
 7 |   private val googleGuava = "com.google.guava" % "guava" % "19.0"
 8 |   private val googleFindbugs = "com.google.code.findbugs" % "jsr305" % "2.0.3" // needed by guava
 9 |   private val breeze = "org.scalanlp" %% "breeze" % "1.0"
10 |   private val breezeNatives = "org.scalanlp" %% "breeze-natives" % "1.0"
11 |   private val algebird = "com.twitter" %% "algebird-core" % "0.13.6"
12 |   private val sketches = "com.yahoo.datasketches" % "sketches-core" % "0.3.2"
13 |   private val chronicleBytes = "net.openhft" % "chronicle-bytes" % "1.2.3"
14 |   private val allocationInstrumenter = "com.google.code.java-allocation-instrumenter" % "java-allocation-instrumenter" % "3.0.1"
15 |   private val stream = "com.clearspring.analytics" % "stream" % "2.7.0"
16 | 
17 |   private val common = dependencies()
18 | 
19 |   val bloomfilter = common
20 |   val sandbox = common ++ dependencies(chronicleBytes)
21 |   val sandboxApp = common ++ dependencies(allocationInstrumenter, algebird)
22 |   val tests = common ++ dependencies(scalatest, scalacheck)
23 |   val benchmarks = common ++ dependencies(googleGuava, googleFindbugs, breeze, breezeNatives, algebird, sketches, stream)
24 | 
25 |   private def dependencies(modules: ModuleID*): Seq[Setting[_]] = Seq(libraryDependencies ++= modules)
26 | }


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
 1 | 
 2 | # Created by https://www.gitignore.io/api/scala,intellij,sbt
 3 | 
 4 | ### Scala ###
 5 | *.class
 6 | *.log
 7 | 
 8 | # sbt specific
 9 | .cache
10 | .history
11 | .lib/
12 | dist/*
13 | target/
14 | lib_managed/
15 | src_managed/
16 | project/boot/
17 | project/plugins/project/
18 | 
19 | # Scala-IDE specific
20 | .scala_dependencies
21 | .worksheet
22 | 
23 | 
24 | 
25 | ### Intellij ###
26 | # Covers JetBrains IDEs: IntelliJ, RubyMine, PhpStorm, AppCode, PyCharm, CLion, Android Studio and Webstorm
27 | 
28 | *.iml
29 | 
30 | ## Directory-based project format:
31 | .idea/
32 | # if you remove the above rule, at least ignore the following:
33 | 
34 | # User-specific stuff:
35 | # .idea/workspace.xml
36 | # .idea/tasks.xml
37 | # .idea/dictionaries
38 | # .idea/shelf
39 | 
40 | # Sensitive or high-churn files:
41 | # .idea/dataSources.ids
42 | # .idea/dataSources.xml
43 | # .idea/sqlDataSources.xml
44 | # .idea/dynamic.xml
45 | # .idea/uiDesigner.xml
46 | 
47 | # Gradle:
48 | # .idea/gradle.xml
49 | # .idea/libraries
50 | 
51 | # Mongo Explorer plugin:
52 | # .idea/mongoSettings.xml
53 | 
54 | ## File-based project format:
55 | *.ipr
56 | *.iws
57 | 
58 | ## Plugin-specific files:
59 | 
60 | # IntelliJ
61 | /out/
62 | 
63 | # mpeltonen/sbt-idea plugin
64 | .idea_modules/
65 | 
66 | # JIRA plugin
67 | atlassian-ide-plugin.xml
68 | 
69 | # Crashlytics plugin (for Android Studio and IntelliJ)
70 | com_crashlytics_export_strings.xml
71 | crashlytics.properties
72 | crashlytics-build.properties
73 | fabric.properties
74 | 
75 | 
76 | local.*
77 | 


--------------------------------------------------------------------------------
/CHANGELOG.md:
--------------------------------------------------------------------------------
 1 | ### 0.13.1
 2 | - Fix deserialization backwards-compatibility with v0.12.0 (via \#52) Thanks to Sean Rohead @seanrohead
 3 | 
 4 | ### 0.13.0
 5 | - Add JDK 9+ support (via \#47) Thanks to Sean Rohead @seanrohead
 6 | - BREAKING: breaks java serialization/deserialization compatibility with v0.12.0. Use v0.13.1.
 7 | 
 8 | ### 0.12.0
 9 | - Add Scala 2.13 support (via \#45)
10 | 
11 | ### 0.11.0
12 | - BREAKING: Add approximateElementCount method that estimates number of added elements. Thanks to @SidWeng. It's a breaking change because it serializes one more field (via \#37 and \#38)
13 | 
14 | ### 0.10.1
15 | - Change the default long hash function to MurMurHash3 (via \#33)
16 | 
17 | ### 0.10.0
18 | 
19 | - Performance improvement: set a bit only if it's not set already (via \#28)
20 | - \#22 Scala 2.12.1 support (via \#31). Thanks to Fedor Lavrentyev @fediq.
21 | - \#29 Fix hashing of small strings (via \#32). 
22 | 
23 | ### 0.9.0
24 | 
25 | - \#23 Serialization support (via \#25). Thanks to Eyal Farago @eyalfa.
26 | 
27 | ### 0.8.0
28 | 
29 | - \#19 Cuckoo Filter (via \#20)
30 | 
31 | ### 0.7.0
32 | 
33 | - \#5 Add serialization support.
34 | 
35 | ### 0.6.0
36 | 
37 | - \#2 Scala 2.10 support.
38 | 
39 | ### 0.5.0
40 | 
41 | - \#4 Union and intersection of two Bloom filters (via \#6). Thanks to Mario Pastorelli @melrief.
42 | 
43 | ### 0.4.2
44 | 
45 | - Fix memory access in UnsafeBitArray. Must update. Thanks to @cmarxer (via e79ff243ac)
46 | 
47 | ### 0.4.1
48 | 
49 | - Fix memory allocation in UnsafeBitArray. Must update. Thanks to @cmarxer (via \#9)
50 | 
51 | ### 0.4.0
52 | 
53 | - Initial release
54 | 


--------------------------------------------------------------------------------
/tests/src/test/scala/tests/bloomfilter/mutable/_128bit/BloomFilterSerializationSpec.scala:
--------------------------------------------------------------------------------
 1 | package tests.bloomfilter.mutable._128bit
 2 | 
 3 | import java.io._
 4 | 
 5 | import bloomfilter.mutable._128bit.BloomFilter
 6 | import org.scalacheck.Prop.forAll
 7 | import org.scalacheck.{Gen, Properties}
 8 | import org.scalatest.Matchers
 9 | 
10 | class BloomFilterSerializationSpec extends Properties("BloomFilter") with Matchers {
11 |   def genListElems[A](max: Long)(implicit aGen: Gen[A]): Gen[List[A]] = {
12 |     Gen.posNum[Int].map(_ % max).flatMap(i => Gen.listOfN(math.min(i, Int.MaxValue).toInt, aGen))
13 |   }
14 | 
15 |   val gen = for {
16 |     size <- Gen.oneOf[Long](1, 1000/*, Int.MaxValue.toLong + 1*/)
17 |     indices <- genListElems[Long](size)(Gen.chooseNum(0, size))
18 |   } yield (size, indices)
19 | 
20 |   property("writeTo & readFrom") = forAll(gen) {
21 |     case (size: Long, indices: List[Long]) =>
22 |       val initial = BloomFilter[Long](size, 0.01)
23 |       indices.foreach(initial.add)
24 | 
25 |       val file = File.createTempFile("bloomFilterSerialized", ".tmp")
26 |       val out = new BufferedOutputStream(new FileOutputStream(file), 10 * 1000 * 1000)
27 |       initial.writeTo(out)
28 |       out.close()
29 |       val in = new BufferedInputStream(new FileInputStream(file), 10 * 1000 * 1000)
30 |       val sut = BloomFilter.readFrom[Long](in)
31 |       in.close()
32 | 
33 |       sut.approximateElementCount() shouldEqual initial.approximateElementCount()
34 | 
35 |       val result = indices.forall(sut.mightContain)
36 | 
37 |       file.delete()
38 |       initial.dispose()
39 |       sut.dispose()
40 | 
41 |       result
42 |   }
43 | 
44 | }
45 | 


--------------------------------------------------------------------------------
/benchmarks/src/main/scala/bloomfilter/hashing/MurmurHash3Benchmark.scala:
--------------------------------------------------------------------------------
 1 | package bloomfilter.hashing
 2 | 
 3 | import java.nio.ByteBuffer
 4 | 
 5 | import sandbox.hashing.{YonikMurmurHash3, CassandraMurmurHash, AlgebirdMurmurHash128}
 6 | import sandbox.hashing.YonikMurmurHash3.LongPair
 7 | import com.yahoo.sketches.hash.{MurmurHash3 => yMurmurHash3}
 8 | import com.google.common.hash.Hashing
 9 | import org.openjdk.jmh.annotations.{Benchmark, Scope, State}
10 | import scala.util.hashing.{MurmurHash3 => ScalaMurmurHash3}
11 | import com.clearspring.analytics.hash.{MurmurHash => StreamLibMurmurHash}
12 | 
13 | @State(Scope.Benchmark)
14 | class MurmurHash3Benchmark {
15 | 
16 |   val key = Range(0, 64).map(_.toByte).toArray
17 | 
18 |   @Benchmark
19 |   def javaVersion() = {
20 |     YonikMurmurHash3.murmurhash3_x64_128(key, 0, key.length, 0, new LongPair)
21 |   }
22 | 
23 |   @Benchmark
24 |   def scalaVersion() = {
25 |     MurmurHash3Generic.murmurhash3_x64_128(key, 0, key.length, 0)
26 |   }
27 | 
28 |   val guavaMurmur = Hashing.murmur3_128()
29 | 
30 |   @Benchmark
31 |   def guavaVersion() = {
32 |     guavaMurmur.hashBytes(key, 0, key.length)
33 |   }
34 | 
35 |   @Benchmark
36 |   def cassandraVersion() = {
37 |     CassandraMurmurHash.hash3_x64_128(ByteBuffer.wrap(key), 0, key.length, 0)
38 |   }
39 | 
40 |   val algebirdMurmur = AlgebirdMurmurHash128(0)
41 | 
42 |   @Benchmark
43 |   def algebirdVersion() = {
44 |     algebirdMurmur.apply(key)
45 |   }
46 | 
47 |   @Benchmark
48 |   def yahooVersion() = {
49 |     yMurmurHash3.hash(key, 0)
50 |   }
51 | 
52 |   @Benchmark
53 |   def scalaStdlibVersion() = {
54 |     ScalaMurmurHash3.arrayHash(key, 0)
55 |   }
56 | 
57 |   @Benchmark
58 |   def streamLibVersion() = {
59 |     StreamLibMurmurHash.hash(key)
60 |   }
61 | }
62 | 


--------------------------------------------------------------------------------
/project/Publishing.scala:
--------------------------------------------------------------------------------
 1 | import sbt._
 2 | import sbt.Keys._
 3 | import xerial.sbt.Sonatype.SonatypeKeys._
 4 | 
 5 | object Publishing {
 6 | 
 7 |   private lazy val credentialSettings = Seq(
 8 |     credentials ++= (for {
 9 |       username <- Option(System.getenv().get("SONATYPE_USERNAME"))
10 |       password <- Option(System.getenv().get("SONATYPE_PASSWORD"))
11 |     } yield Credentials("Sonatype Nexus Repository Manager", "oss.sonatype.org", username, password)).toSeq,
12 |     
13 |     credentials += Credentials(
14 |       "GnuPG Key ID",
15 |       "gpg",
16 |       "nikitin.alexandr.a@gmail.com", // key identifier
17 |       "ignored" // this field is ignored; passwords are supplied by pinentry
18 |     )
19 |   )
20 | 
21 |   private lazy val sharedSettings = Seq(
22 |     publishMavenStyle := true,
23 |     Test / publishArtifact := false,
24 |     pomIncludeRepository := Function.const(false),
25 |     publishTo := sonatypePublishToBundle.value,
26 |     sonatypeSessionName := "[sbt-sonatype] ${name.value}-${scalaBinaryVersion.value}-${version.value}"
27 |   )
28 | 
29 |   private lazy val generalSettings = Seq(
30 |     homepage := Some(url("https://github.com/alexandrnikitin/bloom-filter-scala")),
31 |     licenses := Seq("MIT" -> url("https://github.com/alexandrnikitin/bloom-filter-scala/blob/master/LICENSE")),
32 |     scmInfo := Some(ScmInfo(url("https://github.com/alexandrnikitin/bloom-filter-scala"), "scm:git:git@github.com:alexandrnikitin/bloom-filter-scala.git")),
33 |     developers := List(Developer("AlexandrNikitin", "Alexandr Nikitin", "nikitin.alexandr.a@gmail.com", url("https://github.com/alexandrnikitin/")))
34 |   )
35 | 
36 |   lazy val settings = generalSettings ++ sharedSettings ++ credentialSettings
37 | 
38 |   lazy val noPublishSettings = Seq(
39 |     publish / skip := true
40 |   )
41 | 
42 | }
43 | 


--------------------------------------------------------------------------------
/bloom-filter/src/main/scala/bloomfilter/CanGenerateHashFrom.scala:
--------------------------------------------------------------------------------
 1 | package bloomfilter
 2 | 
 3 | import bloomfilter.hashing.MurmurHash3Generic
 4 | 
 5 | import java.lang.reflect.Field
 6 | 
 7 | trait CanGenerateHashFrom[From] {
 8 |   def generateHash(from: From): Long
 9 | }
10 | 
11 | object CanGenerateHashFrom {
12 |   implicit case object CanGenerateHashFromLong extends CanGenerateHashFrom[Long] {
13 |     override def generateHash(from: Long): Long = MurmurHash3Generic.fmix64(from)
14 |   }
15 | 
16 |   implicit case object CanGenerateHashFromByteArray extends CanGenerateHashFrom[Array[Byte]] {
17 |     override def generateHash(from: Array[Byte]): Long =
18 |       MurmurHash3Generic.murmurhash3_x64_64(from, 0, from.length, 0)
19 |   }
20 | 
21 |   import bloomfilter.util.Unsafe.unsafe
22 | 
23 |   case object CanGenerateHashFromString extends CanGenerateHashFrom[String] {
24 |     private val valueOffset = unsafe.objectFieldOffset(stringValueField)
25 | 
26 |     override def generateHash(from: String): Long = {
27 |       val value = unsafe.getObject(from, valueOffset).asInstanceOf[Array[Char]]
28 |       MurmurHash3Generic.murmurhash3_x64_64(value, 0, from.length * 2, 0)
29 |     }
30 |   }
31 | 
32 |   case object CanGenerateHashFromStringByteArray extends CanGenerateHashFrom[String] {
33 |     private val valueOffset = unsafe.objectFieldOffset(stringValueField)
34 | 
35 |     override def generateHash(from: String): Long = {
36 |       val value = unsafe.getObject(from, valueOffset).asInstanceOf[Array[Byte]]
37 |       MurmurHash3Generic.murmurhash3_x64_64(value, 0, from.length, 0)
38 |     }
39 |   }
40 | 
41 |   private val stringValueField: Field = classOf[String].getDeclaredField("value")
42 |   implicit val canGenerateHashFromString: CanGenerateHashFrom[String] = {
43 |     if (stringValueField.getType.getComponentType == java.lang.Byte.TYPE) CanGenerateHashFromStringByteArray else CanGenerateHashFromString
44 |   }
45 | }
46 | 


--------------------------------------------------------------------------------
/sandbox/src/main/java/sandbox/hashing/AlgebirdMurmurHash128.scala:
--------------------------------------------------------------------------------
 1 | package sandbox.hashing
 2 | 
 3 | import java.nio.ByteBuffer
 4 | 
 5 | case class AlgebirdMurmurHash128(seed: Long) extends AnyVal {
 6 |   def apply(buffer: ByteBuffer, offset: Int, length: Int): (Long, Long) = {
 7 |     val longs = CassandraMurmurHash.hash3_x64_128(buffer, offset, length, seed)
 8 |     (longs(0), longs(1))
 9 |   }
10 | 
11 |   def apply(bytes: Array[Byte]): (Long, Long) = apply(ByteBuffer.wrap(bytes), 0, bytes.length)
12 |   def apply(maxBytes: Int, fn: ByteBuffer => Unit): (Long, Long) = {
13 |     val buffer = ByteBuffer.allocate(maxBytes)
14 |     fn(buffer)
15 |     apply(buffer, 0, maxBytes)
16 |   }
17 |   def apply(array: Array[Char]): (Long, Long) = apply(array.size * 2, { _.asCharBuffer.put(array) })
18 |   def apply(array: Array[Short]): (Long, Long) = apply(array.size * 2, { _.asShortBuffer.put(array) })
19 |   def apply(array: Array[Int]): (Long, Long) = apply(array.size * 4, { _.asIntBuffer.put(array) })
20 |   def apply(array: Array[Float]): (Long, Long) = apply(array.size * 4, { _.asFloatBuffer.put(array) })
21 |   def apply(array: Array[Long]): (Long, Long) = apply(array.size * 8, { _.asLongBuffer.put(array) })
22 |   def apply(array: Array[Double]): (Long, Long) = apply(array.size * 8, { _.asDoubleBuffer.put(array) })
23 | 
24 |   def apply(value: Char): (Long, Long) = apply(2, { _.asCharBuffer.put(value) })
25 |   def apply(value: Short): (Long, Long) = apply(2, { _.asShortBuffer.put(value) })
26 |   def apply(value: Int): (Long, Long) = apply(4, { _.asIntBuffer.put(value) })
27 |   def apply(value: Float): (Long, Long) = apply(4, { _.asFloatBuffer.put(value) })
28 |   def apply(value: Long): (Long, Long) = apply(8, { _.asLongBuffer.put(value) })
29 |   def apply(value: Double): (Long, Long) = apply(8, { _.asDoubleBuffer.put(value) })
30 | 
31 |   def apply(string: CharSequence): (Long, Long) = apply(string.length * 2, { buffer =>
32 |     val charBuffer = buffer.asCharBuffer
33 |     0.to(string.length - 1).foreach{ i => charBuffer.put(string.charAt(i)) }
34 |   })
35 | }
36 | 


--------------------------------------------------------------------------------
/bloom-filter/src/main/scala/bloomfilter/CanGenerate128HashFrom.scala:
--------------------------------------------------------------------------------
 1 | package bloomfilter
 2 | 
 3 | import bloomfilter.hashing.MurmurHash3Generic
 4 | 
 5 | import java.lang.reflect.Field
 6 | 
 7 | trait CanGenerate128HashFrom[From] {
 8 |   def generateHash(from: From): (Long, Long)
 9 | }
10 | 
11 | object CanGenerate128HashFrom {
12 |   implicit case object CanGenerate128HashFromLong extends CanGenerate128HashFrom[Long] {
13 |     override def generateHash(from: Long): (Long, Long) = {
14 |       val hash = MurmurHash3Generic.fmix64(from)
15 |       (hash, hash)
16 |     }
17 |   }
18 | 
19 |   implicit case object CanGenerate128HashFromByteArray extends CanGenerate128HashFrom[Array[Byte]] {
20 |     override def generateHash(from: Array[Byte]): (Long, Long) =
21 |       MurmurHash3Generic.murmurhash3_x64_128(from, 0, from.length, 0)
22 |   }
23 | 
24 |   import bloomfilter.util.Unsafe.unsafe
25 | 
26 |   case object CanGenerate128HashFromString extends CanGenerate128HashFrom[String] {
27 |     private val valueOffset = unsafe.objectFieldOffset(stringValueField)
28 | 
29 |     override def generateHash(from: String): (Long, Long) = {
30 |       val value = unsafe.getObject(from, valueOffset).asInstanceOf[Array[Char]]
31 |       MurmurHash3Generic.murmurhash3_x64_128(value, 0, from.length * 2, 0)
32 |     }
33 |   }
34 | 
35 |   case object CanGenerate128HashFromStringByteArray extends CanGenerate128HashFrom[String] {
36 |     private val valueOffset = unsafe.objectFieldOffset(stringValueField)
37 | 
38 |     override def generateHash(from: String): (Long, Long) = {
39 |       val value = unsafe.getObject(from, valueOffset).asInstanceOf[Array[Byte]]
40 |       MurmurHash3Generic.murmurhash3_x64_128(value, 0, from.length, 0)
41 |     }
42 |   }
43 | 
44 |   private val stringValueField: Field = classOf[String].getDeclaredField("value")
45 |   implicit val canGenerate128HashFromString: CanGenerate128HashFrom[String] = {
46 |     if (stringValueField.getType.getComponentType == java.lang.Byte.TYPE) CanGenerate128HashFromStringByteArray else CanGenerate128HashFromString
47 |   }
48 | }
49 | 


--------------------------------------------------------------------------------
/tests/src/test/scala/tests/bloomfilter/mutable/BloomFiltersSpec.scala:
--------------------------------------------------------------------------------
 1 | package tests.bloomfilter.mutable
 2 | 
 3 | import bloomfilter.CanGenerateHashFrom
 4 | import bloomfilter.mutable.BloomFilter
 5 | import org.scalacheck.Test.Parameters
 6 | import org.scalacheck.commands.Commands
 7 | import org.scalacheck.{Arbitrary, Gen, Prop, Properties}
 8 | import org.scalacheck.Arbitrary.arbitrary
 9 | import org.scalacheck.Prop.forAll
10 | 
11 | class BloomFiltersSpec extends Properties("BloomFilters") {
12 | 
13 |   val maxNumElems = 10
14 | 
15 |   def genListOfMaxTenElems[A](implicit aGen: Gen[A]): Gen[List[A]] =
16 |     Gen.posNum[Int] map (_ % maxNumElems) flatMap (i => Gen.listOfN(i, aGen))
17 | 
18 |   property("union") =
19 |     forAll(genListOfMaxTenElems(arbitrary[Long]), genListOfMaxTenElems(arbitrary[Long])) {
20 |       (leftElements: List[Long], rightElements: List[Long]) =>
21 |         val leftBloomFilter = BloomFilter[Long](maxNumElems, 0.01)
22 |         leftElements foreach leftBloomFilter.add
23 |         val rightBloomFilter = BloomFilter[Long](maxNumElems, 0.01)
24 |         rightElements foreach rightBloomFilter.add
25 |         val unionBloomFilter = leftBloomFilter union rightBloomFilter
26 |         val result = (leftElements ++ rightElements) forall unionBloomFilter.mightContain
27 |         leftBloomFilter.dispose()
28 |         rightBloomFilter.dispose()
29 |         unionBloomFilter.dispose()
30 |         result
31 |     }
32 | 
33 |   property("intersect") =
34 |     forAll(genListOfMaxTenElems(arbitrary[Long]), genListOfMaxTenElems(arbitrary[Long])) {
35 |       (leftElements: List[Long], rightElements: List[Long]) =>
36 |         val leftBloomFilter = BloomFilter[Long](maxNumElems, 0.01)
37 |         leftElements foreach leftBloomFilter.add
38 |         val rightBloomFilter = BloomFilter[Long](maxNumElems, 0.01)
39 |         rightElements foreach rightBloomFilter.add
40 |         val unionBloomFilter = leftBloomFilter intersect rightBloomFilter
41 |         val intersectElems = leftElements.toSet intersect rightElements.toSet
42 |         val result = intersectElems forall unionBloomFilter.mightContain
43 |         leftBloomFilter.dispose()
44 |         rightBloomFilter.dispose()
45 |         unionBloomFilter.dispose()
46 |         result
47 |     }
48 | }
49 | 


--------------------------------------------------------------------------------
/tests/src/test/scala/tests/bloomfilter/mutable/UnsafeBitArraysSpec.scala:
--------------------------------------------------------------------------------
 1 | package tests.bloomfilter.mutable
 2 | 
 3 | import bloomfilter.mutable.UnsafeBitArray
 4 | import org.scalacheck.Prop._
 5 | import org.scalacheck.{Gen, Properties}
 6 | 
 7 | class UnsafeBitArraysSpec extends Properties("UnsafeBitArray") {
 8 |   def genListElems[A](max: Long)(implicit aGen: Gen[A]): Gen[List[A]] = {
 9 |     Gen.posNum[Int].map(_ % max).flatMap(i => Gen.listOfN(math.min(i, Int.MaxValue).toInt, aGen))
10 |   }
11 | 
12 |   val genUnion = for {
13 |     size <- Gen.oneOf[Long](1, 1000, Int.MaxValue, Int.MaxValue * 2L)
14 |     indices <- genListElems[Long](size)(Gen.chooseNum(0, size))
15 |     thatIndices <- genListElems[Long](size)(Gen.chooseNum(0, size))
16 |   } yield (size, indices, thatIndices)
17 | 
18 |   val genIntersection = for {
19 |     size <- Gen.oneOf[Long](1, 1000, Int.MaxValue, Int.MaxValue * 2L)
20 |     indices <- genListElems[Long](size)(Gen.chooseNum(0, size))
21 |     thatIndices <- genListElems[Long](size)(Gen.chooseNum(0, size))
22 |     commonIndices <- genListElems[Long](size)(Gen.chooseNum(0, size))
23 |   } yield (size, indices, thatIndices, commonIndices)
24 | 
25 | 
26 |   property("|") = forAll(genUnion) {
27 |     case (size: Long, indices: List[Long], thatIndices: List[Long]) =>
28 |       val array = new UnsafeBitArray(size)
29 |       indices.foreach(array.set)
30 |       val thatArray = new UnsafeBitArray(size)
31 |       thatIndices.foreach(thatArray.set)
32 | 
33 |       val sut = array | thatArray
34 |       val result = (indices ++ thatIndices).forall(sut.get)
35 | 
36 |       array.dispose()
37 |       thatArray.dispose()
38 |       sut.dispose()
39 | 
40 |       result
41 |   }
42 | 
43 |   property("&") = forAll(genIntersection) {
44 |     case (size: Long, indices: List[Long], thatIndices: List[Long], commonIndices: List[Long]) =>
45 |       val array = new UnsafeBitArray(size)
46 |       indices.foreach(array.set)
47 |       val thatArray = new UnsafeBitArray(size)
48 |       thatIndices.foreach(thatArray.set)
49 |       commonIndices.foreach(x => { array.set(x); thatArray.set(x) })
50 | 
51 |       val sut = array & thatArray
52 |       val result = commonIndices.forall(sut.get)
53 | 
54 |       array.dispose()
55 |       thatArray.dispose()
56 |       sut.dispose()
57 | 
58 |       result
59 |   }
60 | 
61 | }
62 | 


--------------------------------------------------------------------------------
/project/Settings.scala:
--------------------------------------------------------------------------------
 1 | import sbt._
 2 | import sbt.Keys._
 3 | 
 4 | object Settings {
 5 | 
 6 |   private lazy val build = Seq(
 7 |     scalaVersion := "2.12.11",
 8 |     crossScalaVersions := Seq("2.10.7", "2.11.12", "2.12.11", "2.13.1"),
 9 | 
10 |     autoCompilerPlugins := true,
11 | 
12 |     scalacOptions ++= ScalacSettings.base ++ ScalacSettings.specificFor(scalaVersion.value),
13 |     javacOptions ++= JavacSettings.base ++ JavacSettings.specificFor(scalaVersion.value),
14 |     javaOptions += "-Xmx1G",
15 |     organization := "com.github.alexandrnikitin"
16 |   )
17 | 
18 |   lazy val root = build ++ Testing.settings ++ Publishing.noPublishSettings
19 |   lazy val bloomfilter = build ++ Testing.settings ++ Dependencies.bloomfilter ++ Publishing.settings ++
20 |       (scalacOptions ++= ScalacSettings.strictBase ++ ScalacSettings.strictSpecificFor(scalaVersion.value))
21 |   lazy val sandbox = build ++ Testing.settings ++ Dependencies.sandbox ++ Publishing.noPublishSettings
22 |   lazy val sandboxApp = build ++ Dependencies.sandboxApp ++ Publishing.noPublishSettings
23 |   lazy val tests = build ++ Testing.settings ++ Dependencies.tests ++ Publishing.noPublishSettings
24 |   lazy val benchmarks = build ++ Dependencies.benchmarks ++ Publishing.noPublishSettings
25 |   lazy val examples = build ++ Publishing.noPublishSettings
26 | 
27 |   object JavacSettings {
28 |     val base = Seq("-Xlint")
29 | 
30 |     def specificFor(scalaVersion: String) = CrossVersion.partialVersion(scalaVersion) match {
31 |       case Some((2, 13)) => Seq("-source", "1.8", "-target", "1.8")
32 |       case Some((2, 12)) => Seq("-source", "1.8", "-target", "1.8")
33 |       case Some((2, 11)) => Seq("-source", "1.8", "-target", "1.8")
34 |       case Some((2, 10)) => Seq("-source", "1.7", "-target", "1.7")
35 |       case _ => Nil
36 |     }
37 |   }
38 | 
39 |   object ScalacSettings {
40 |     val base = Seq(
41 |       "-deprecation",
42 |       "-encoding", "UTF-8",
43 |       "-feature",
44 |       "-unchecked"
45 |     )
46 | 
47 |     def specificFor(scalaVersion: String) = CrossVersion.partialVersion(scalaVersion) match {
48 |       case Some((2, 12)) => Seq("-target:jvm-1.8")
49 |       case Some((2, 11)) => Seq("-target:jvm-1.8", "-optimise")
50 |       case Some((2, 10)) => Seq("-target:jvm-1.7", "-optimise")
51 |       case _ => Nil
52 |     }
53 | 
54 | 
55 |     val strictBase = Seq(
56 |       "-Xfatal-warnings",
57 |       "-Xlint",
58 |       "-Ywarn-dead-code",
59 |       "-Ywarn-numeric-widen",
60 |       "-Ywarn-value-discard"
61 |     )
62 | 
63 |     def strictSpecificFor(scalaVersion: String) = CrossVersion.partialVersion(scalaVersion) match {
64 |       case Some((2, 12)) => Seq("-Ywarn-unused", "-Ywarn-unused-import")
65 |       case Some((2, 11)) => Seq("-Ywarn-unused", "-Ywarn-unused-import")
66 |       case _ => Nil
67 |     }
68 | 
69 |   }
70 | 
71 | }
72 | 


--------------------------------------------------------------------------------
/sandbox/src/main/scala/sandbox/bloomfilter/mutable/BloomFilter.scala:
--------------------------------------------------------------------------------
 1 | package sandbox.bloomfilter.mutable
 2 | 
 3 | import bloomfilter.CanGenerateHashFrom
 4 | 
 5 | class BloomFilter[T] private (val numberOfBits: Long, val numberOfHashes: Int, private val bits: BitArray)
 6 |     (implicit canGenerateHash: CanGenerateHashFrom[T]) {
 7 | 
 8 |   def this(numberOfBits: Long, numberOfHashes: Int)(implicit canGenerateHash: CanGenerateHashFrom[T]) {
 9 |     this(numberOfBits, numberOfHashes, new BitArray(numberOfBits))
10 |   }
11 | 
12 |   def add(x: T): Unit = {
13 |     val hash = canGenerateHash.generateHash(x)
14 |     val hash1 = hash >>> 32
15 |     val hash2 = (hash << 32) >> 32
16 | 
17 |     var i = 0
18 |     while (i < numberOfHashes) {
19 |       val computedHash = hash1 + i * hash2
20 |       bits.set((computedHash & Long.MaxValue) % numberOfBits)
21 |       i += 1
22 |     }
23 |   }
24 | 
25 |   def union(that: BloomFilter[T]): BloomFilter[T] = {
26 |     require(this.numberOfBits == that.numberOfBits && this.numberOfHashes == that.numberOfHashes,
27 |       s"Union works only on BloomFilters with the same number of hashes and of bits")
28 |     new BloomFilter[T](this.numberOfBits, this.numberOfHashes, this.bits | that.bits)
29 |   }
30 | 
31 |   def intersect(that: BloomFilter[T]): BloomFilter[T] = {
32 |     require(this.numberOfBits == that.numberOfBits && this.numberOfHashes == that.numberOfHashes,
33 |       s"Intersect works only on BloomFilters with the same number of hashes and of bits")
34 |     new BloomFilter[T](this.numberOfBits, this.numberOfHashes, this.bits & that.bits)
35 |   }
36 | 
37 |   def mightContain(x: T): Boolean = {
38 |     val hash = canGenerateHash.generateHash(x)
39 |     val hash1 = hash >>> 32
40 |     val hash2 = (hash << 32) >> 32
41 |     var i = 0
42 |     while (i < numberOfHashes) {
43 |       val computedHash = hash1 + i * hash2
44 |       if (!bits.get((computedHash & Long.MaxValue) % numberOfBits))
45 |         return false
46 |       i += 1
47 |     }
48 |     true
49 |   }
50 | 
51 |   def expectedFalsePositiveRate(): Double = {
52 |     math.pow(bits.getBitCount.toDouble / numberOfBits, numberOfHashes.toDouble)
53 |   }
54 | 
55 | }
56 | 
57 | object BloomFilter {
58 | 
59 |   def apply[T](numberOfItems: Long, falsePositiveRate: Double)
60 |       (implicit canGenerateHash: CanGenerateHashFrom[T]): BloomFilter[T] = {
61 | 
62 |     val nb = optimalNumberOfBits(numberOfItems, falsePositiveRate)
63 |     val nh = optimalNumberOfHashes(numberOfItems, nb)
64 |     new BloomFilter[T](nb, nh)
65 |   }
66 | 
67 |   def optimalNumberOfBits(numberOfItems: Long, falsePositiveRate: Double): Long = {
68 |     math.ceil(-1 * numberOfItems * math.log(falsePositiveRate) / math.log(2) / math.log(2)).toLong
69 |   }
70 | 
71 |   def optimalNumberOfHashes(numberOfItems: Long, numberOfBits: Long): Int = {
72 |     math.ceil(numberOfBits / numberOfItems * math.log(2)).toInt
73 |   }
74 | 
75 | }
76 | 


--------------------------------------------------------------------------------
/sandboxApp/src/main/scala/SandboxApp.scala:
--------------------------------------------------------------------------------
 1 | import java.text.NumberFormat
 2 | 
 3 | import bloomfilter.mutable.{CuckooFilter, UnsafeTable8Bit}
 4 | import com.google.monitoring.runtime.instrumentation.{AllocationRecorder, Sampler}
 5 | import com.twitter.algebird.{BloomFilter => AlgebirdBloomFilter}
 6 | 
 7 | import scala.util.Random
 8 | 
 9 | object SandboxApp {
10 | 
11 |   def checkMemory(): Unit = {
12 |     val runtime = Runtime.getRuntime
13 | 
14 |     val format = NumberFormat.getInstance()
15 | 
16 |     val sb = new StringBuilder()
17 |     val maxMemory = runtime.maxMemory()
18 |     val allocatedMemory = runtime.totalMemory()
19 |     val freeMemory = runtime.freeMemory()
20 | 
21 |     sb.append("free memory: " + format.format(freeMemory / 1024) + "\n")
22 |     sb.append("allocated memory: " + format.format(allocatedMemory / 1024) + "\n")
23 |     sb.append("max memory: " + format.format(maxMemory / 1024) + "\n")
24 |     sb.append("total free memory: " + format.format((freeMemory + (maxMemory - allocatedMemory)) / 1024) + "\n")
25 |     System.out.println(sb.toString())
26 |   }
27 | 
28 | 
29 |   def main(args: Array[String]): Unit = {
30 | 
31 |     val sut = CuckooFilter[Long](1000)
32 |     sut.add(8)
33 |     assert(sut.mightContain(8))
34 |     sut.add(10)
35 |     assert(sut.mightContain(10))
36 |     sut.add(8)
37 |     assert(sut.mightContain(8))
38 |     sut.add(10000)
39 |     assert(sut.mightContain(10000))
40 | 
41 |   }
42 | 
43 |   def compareAlgebirdFPR(): Unit = {
44 | 
45 |     val random: Random = new Random()
46 | 
47 |     val itemsExpected = 10000L
48 |     val falsePositiveRate = 0.1
49 |     var bf = AlgebirdBloomFilter(itemsExpected.toInt, falsePositiveRate, 0).create("")
50 |     val bf2 = bloomfilter.mutable.BloomFilter[String](itemsExpected, falsePositiveRate)
51 | 
52 |     var i = 0
53 |     while (i < itemsExpected) {
54 |       val str: String = random.nextString(1000)
55 |       bf = bf.+(str)
56 |       bf2.add(str)
57 |       i += 1
58 |     }
59 | 
60 |     i = 0
61 |     var in, in2 = 0
62 |     while (true) {
63 |       val str = random.nextString(1000)
64 |       if (bf.contains(str).isTrue) {
65 |         in += 1
66 |       }
67 |       if (bf2.mightContain(str)) {
68 |         in2 += 1
69 |       }
70 | 
71 |       if (i % 1000 == 0) {
72 |         println(s"in: $in; in2: $in2")
73 |       }
74 |     }
75 | 
76 | 
77 |   }
78 | 
79 |   def checkAllocations(): Unit = {
80 |     val sampler: Sampler = new Sampler() {
81 |       def sampleAllocation(count: Int, desc: String, newObj: Object, size: Long) {
82 |         System.out.println("I just allocated the object " + newObj +
83 |           " of type " + desc + " whose size is " + size)
84 |         if (count != -1) {
85 |           System.out.println("It's an array of size " + count)
86 |         }
87 |       }
88 |     }
89 | 
90 |     AllocationRecorder.addSampler(sampler)
91 | 
92 |     AllocationRecorder.removeSampler(sampler)
93 | 
94 |   }
95 | }


--------------------------------------------------------------------------------
/tests/src/test/scala/tests/bloomfilter/mutable/BloomFilterSerializationSpec.scala:
--------------------------------------------------------------------------------
 1 | package tests.bloomfilter.mutable
 2 | 
 3 | import java.io._
 4 | 
 5 | import bloomfilter.mutable.BloomFilter
 6 | import org.scalacheck.Prop.forAll
 7 | import org.scalacheck.{Gen, Properties}
 8 | import org.scalatest.Matchers
 9 | 
10 | class BloomFilterSerializationSpec extends Properties("BloomFilter") with Matchers {
11 |   def genListElems[A](max: Long)(implicit aGen: Gen[A]): Gen[List[A]] = {
12 |     Gen.posNum[Int].map(_ % max).flatMap(i => Gen.listOfN(math.min(i, Int.MaxValue).toInt, aGen))
13 |   }
14 | 
15 |   val gen = for {
16 |     size <- Gen.oneOf[Long](1, 1000 /*, Int.MaxValue.toLong + 1*/)
17 |     indices <- genListElems[Long](size)(Gen.chooseNum(0, size - 1))
18 |   } yield (size, indices)
19 | 
20 |   property("writeTo & readFrom") = forAll(gen) {
21 |     case (size: Long, indices: List[Long]) =>
22 |       val initial = BloomFilter[Long](size, 0.01)
23 |       indices.foreach(initial.add)
24 | 
25 |       val file = File.createTempFile("bloomFilterSerialized", ".tmp")
26 |       val out = new BufferedOutputStream(new FileOutputStream(file), 10 * 1000 * 1000)
27 |       initial.writeTo(out)
28 |       out.close()
29 |       val in = new BufferedInputStream(new FileInputStream(file), 10 * 1000 * 1000)
30 |       val sut = BloomFilter.readFrom[Long](in)
31 |       in.close()
32 | 
33 |       sut.approximateElementCount() shouldEqual initial.approximateElementCount()
34 | 
35 |       val result = indices.forall(sut.mightContain)
36 | 
37 |       file.delete()
38 |       initial.dispose()
39 |       sut.dispose()
40 | 
41 |       result
42 |   }
43 | 
44 |   property("supports java serialization") = {
45 |     forAll(gen) {
46 |       case (size, indices) =>
47 |         val initial = BloomFilter[Long](size, 0.01)
48 |         indices.foreach(initial.add)
49 |         val file = File.createTempFile("bloomFilterSerialized", ".tmp")
50 |         val out = new BufferedOutputStream(new FileOutputStream(file), 10 * 1000 * 1000)
51 |         val oos = new ObjectOutputStream(out)
52 |         oos.writeObject(initial)
53 |         oos.close()
54 |         out.close()
55 |         val in = new BufferedInputStream(new FileInputStream(file), 10 * 1000 * 1000)
56 |         val ois = new ObjectInputStream(in)
57 |         val desrialized = ois.readObject()
58 |         ois.close()
59 |         in.close()
60 | 
61 |         desrialized should not be null
62 |         desrialized should be(a[BloomFilter[Long]])
63 |         val sut = desrialized.asInstanceOf[BloomFilter[Long]]
64 | 
65 |         sut.numberOfBits shouldEqual initial.numberOfBits
66 |         sut.numberOfHashes shouldEqual initial.numberOfHashes
67 |         sut.approximateElementCount() shouldEqual initial.approximateElementCount()
68 | 
69 | 
70 |         val result = indices.forall(sut.mightContain)
71 | 
72 |         file.delete()
73 |         initial.dispose()
74 |         sut.dispose()
75 | 
76 |         result
77 |     }
78 |   }
79 | 
80 | }
81 | 


--------------------------------------------------------------------------------
/sandbox/src/main/scala/sandbox/hashing/MurmurHash3.scala:
--------------------------------------------------------------------------------
 1 | package sandbox.hashing
 2 | 
 3 | import java.lang.Long.rotateLeft
 4 | 
 5 | object MurmurHash3 {
 6 | 
 7 |   private val c1: Long = 0x87c37b91114253d5L
 8 |   private val c2: Long = 0x4cf5ad432745937fL
 9 | 
10 |   def getLongLittleEndian(buf: Array[Byte], offset: Int): Long = {
11 |     (buf(offset + 7).toLong << 56) |
12 |         ((buf(offset + 6) & 0xffL) << 48) |
13 |         ((buf(offset + 5) & 0xffL) << 40) |
14 |         ((buf(offset + 4) & 0xffL) << 32) |
15 |         ((buf(offset + 3) & 0xffL) << 24) |
16 |         ((buf(offset + 2) & 0xffL) << 16) |
17 |         ((buf(offset + 1) & 0xffL) << 8) |
18 |         buf(offset) & 0xffL
19 |   }
20 | 
21 |   def fmix64(l: Long): Long = {
22 |     var k = l
23 |     k ^= k >>> 33
24 |     k *= 0xff51afd7ed558ccdL
25 |     k ^= k >>> 33
26 |     k *= 0xc4ceb9fe1a85ec53L
27 |     k ^= k >>> 33
28 |     k
29 |   }
30 | 
31 |   def murmurhash3_x64_128(key: Array[Byte], offset: Int, len: Int, seed: Int): (Long, Long) = {
32 |     var h1: Long = seed & 0x00000000FFFFFFFFL
33 |     var h2: Long = seed & 0x00000000FFFFFFFFL
34 | 
35 |     val roundedEnd = offset + (len & 0xFFFFFFF0);  // round down to 16 byte block
36 | 
37 |     var i = offset
38 |     while (i < roundedEnd) {
39 |       var k1 = getLongLittleEndian(key, i)
40 |       var k2 = getLongLittleEndian(key, i + 8)
41 |       k1 *= c1; k1 = rotateLeft(k1, 31); k1 *= c2; h1 ^= k1
42 |       h1 = rotateLeft(h1, 27); h1 += h2; h1 = h1 * 5 + 0x52dce729
43 |       k2 *= c2; k2 = rotateLeft(k2, 33); k2 *= c1; h2 ^= k2
44 |       h2 = rotateLeft(h2, 31); h2 += h1; h2 = h2 * 5 + 0x38495ab5
45 | 
46 |       i += 16
47 |     }
48 | 
49 |     var k1: Long = 0
50 |     var k2: Long = 0
51 | 
52 |     val lenVar = len & 15
53 |     if (lenVar == 15) k2 = (key(roundedEnd + 14) & 0xffL) << 48
54 |     if (lenVar >= 14) k2 |= (key(roundedEnd + 13) & 0xffL) << 40
55 |     if (lenVar >= 13) k2 |= (key(roundedEnd + 12) & 0xffL) << 32
56 |     if (lenVar >= 12) k2 |= (key(roundedEnd + 11) & 0xffL) << 24
57 |     if (lenVar >= 11) k2 |= (key(roundedEnd + 10) & 0xffL) << 16
58 |     if (lenVar >= 10) k2 |= (key(roundedEnd + 9) & 0xffL) << 8
59 |     if (lenVar >= 9) {
60 |       k2 |= (key(roundedEnd + 8) & 0xffL)
61 |       k2 *= c2
62 |       k2 = rotateLeft(k2, 33)
63 |       k2 *= c1
64 |       h2 ^= k2
65 |     }
66 |     if (lenVar >= 8) k1 = key(roundedEnd + 7).toLong << 56
67 |     if (lenVar >= 7) k1 |= (key(roundedEnd + 6) & 0xffL) << 48
68 |     if (lenVar >= 6) k1 |= (key(roundedEnd + 5) & 0xffL) << 40
69 |     if (lenVar >= 5) k1 |= (key(roundedEnd + 4) & 0xffL) << 32
70 |     if (lenVar >= 4) k1 |= (key(roundedEnd + 3) & 0xffL) << 24
71 |     if (lenVar >= 3) k1 |= (key(roundedEnd + 2) & 0xffL) << 16
72 |     if (lenVar >= 2) k1 |= (key(roundedEnd + 1) & 0xffL) << 8
73 |     if (lenVar >= 1) {
74 |       k1 |= (key(roundedEnd) & 0xffL)
75 |       k1 *= c1
76 |       k1 = rotateLeft(k1, 31)
77 |       k1 *= c2
78 |       h1 ^= k1
79 |     }
80 | 
81 |     h1 ^= len; h2 ^= len
82 | 
83 |     h1 += h2
84 |     h2 += h1
85 | 
86 |     h1 = fmix64(h1)
87 |     h2 = fmix64(h2)
88 | 
89 |     h1 += h2
90 |     h2 += h1
91 | 
92 |     (h1, h2)
93 |   }
94 | }
95 | 


--------------------------------------------------------------------------------
/bloom-filter/src/main/scala/bloomfilter/mutable/_128bit/BloomFilter.scala:
--------------------------------------------------------------------------------
 1 | package bloomfilter.mutable._128bit
 2 | 
 3 | import java.io.{DataInputStream, DataOutputStream, InputStream, OutputStream}
 4 | 
 5 | import bloomfilter.CanGenerate128HashFrom
 6 | import bloomfilter.mutable.UnsafeBitArray
 7 | 
 8 | import scala.math._
 9 | 
10 | @SerialVersionUID(2L)
11 | class BloomFilter[T] private (val numberOfBits: Long, val numberOfHashes: Int, private val bits: UnsafeBitArray)
12 |     (implicit canGenerateHash: CanGenerate128HashFrom[T]) extends Serializable {
13 | 
14 |   def this(numberOfBits: Long, numberOfHashes: Int)(implicit canGenerateHash: CanGenerate128HashFrom[T]) {
15 |     this(numberOfBits, numberOfHashes, new UnsafeBitArray(numberOfBits))
16 |   }
17 | 
18 |   def add(x: T): Unit = {
19 |     val hash = canGenerateHash.generateHash(x)
20 | 
21 |     var i = 0
22 |     while (i < numberOfHashes) {
23 |       val computedHash = hash._1 + i * hash._2
24 |       bits.set((computedHash & Long.MaxValue) % numberOfBits)
25 |       i += 1
26 |     }
27 |   }
28 | 
29 |   def mightContain(x: T): Boolean = {
30 |     val hash = canGenerateHash.generateHash(x)
31 | 
32 |     var i = 0
33 |     while (i < numberOfHashes) {
34 |       val computedHash = hash._1 + i * hash._2
35 |       if (!bits.get((computedHash & Long.MaxValue) % numberOfBits))
36 |         return false
37 |       i += 1
38 |     }
39 |     true
40 |   }
41 | 
42 |   def expectedFalsePositiveRate(): Double = {
43 |     math.pow(bits.getBitCount.toDouble / numberOfBits, numberOfHashes.toDouble)
44 |   }
45 | 
46 |   def writeTo(out: OutputStream): Unit = {
47 |     val dout = new DataOutputStream(out)
48 |     dout.writeLong(numberOfBits)
49 |     dout.writeInt(numberOfHashes)
50 |     bits.writeTo(out)
51 |   }
52 | 
53 |   def approximateElementCount(): Long = {
54 |     val fractionOfBitsSet = bits.getBitCount.toDouble / numberOfBits
55 |     val x = -log1p(-fractionOfBitsSet) * numberOfBits / numberOfHashes
56 |     val z = rint(x)
57 |     if (abs(x - z) == 0.5) {
58 |       (x + Math.copySign(0.5, x)).toLong
59 |     } else {
60 |       z.toLong
61 |     }
62 |   }
63 | 
64 |   def dispose(): Unit = bits.dispose()
65 | 
66 | }
67 | 
68 | object BloomFilter {
69 | 
70 |   def apply[T](numberOfItems: Long, falsePositiveRate: Double)
71 |       (implicit canGenerateHash: CanGenerate128HashFrom[T]): BloomFilter[T] = {
72 | 
73 |     val nb = optimalNumberOfBits(numberOfItems, falsePositiveRate)
74 |     val nh = optimalNumberOfHashes(numberOfItems, nb)
75 |     new BloomFilter[T](nb, nh)
76 |   }
77 | 
78 |   def optimalNumberOfBits(numberOfItems: Long, falsePositiveRate: Double): Long = {
79 |     math.ceil(-1 * numberOfItems * math.log(falsePositiveRate) / math.log(2) / math.log(2)).toLong
80 |   }
81 | 
82 |   def optimalNumberOfHashes(numberOfItems: Long, numberOfBits: Long): Int = {
83 |     math.ceil(numberOfBits / numberOfItems * math.log(2)).toInt
84 |   }
85 | 
86 |   def readFrom[T](in: InputStream)(implicit canGenerateHash: CanGenerate128HashFrom[T]): BloomFilter[T] = {
87 |     val din = new DataInputStream(in)
88 |     val numberOfBits = din.readLong()
89 |     val numberOfHashes = din.readInt()
90 |     val bits = new UnsafeBitArray(numberOfBits)
91 |     bits.readFrom(in)
92 |     new BloomFilter[T](numberOfBits, numberOfHashes, bits)
93 |   }
94 | 
95 | }
96 | 


--------------------------------------------------------------------------------
/tests/src/test/scala/tests/bloomfilter/mutable/BloomFilterSpec.scala:
--------------------------------------------------------------------------------
 1 | package tests.bloomfilter.mutable
 2 | 
 3 | import bloomfilter.CanGenerateHashFrom
 4 | import bloomfilter.mutable.BloomFilter
 5 | import org.scalacheck.Arbitrary.arbitrary
 6 | import org.scalacheck.Prop.forAll
 7 | import org.scalacheck.Test.Parameters
 8 | import org.scalacheck.commands.Commands
 9 | import org.scalacheck.{Arbitrary, Gen, Prop, Properties}
10 | 
11 | class BloomFilterSpec extends Properties("BloomFilter") {
12 | 
13 |   property("for Long") = new BloomFilterCommands[Long].property()
14 |   property("for String") = new BloomFilterCommands[String].property()
15 |   property("for Array[Byte]") = new BloomFilterCommands[Array[Byte]].property()
16 | 
17 | 
18 |   override def overrideParameters(p: Parameters): Parameters = {
19 |     super.overrideParameters(p).withMinSuccessfulTests(100)
20 |   }
21 | 
22 |   class BloomFilterCommands[T: Arbitrary](implicit canGenerateHash: CanGenerateHashFrom[T]) extends Commands {
23 |     type Sut = BloomFilter[T]
24 | 
25 |     case class State(expectedItems: Long, addedItems: Long)
26 | 
27 |     override def canCreateNewSut(
28 |         newState: State,
29 |         initSuts: Traversable[State],
30 |         runningSuts: Traversable[Sut]): Boolean = {
31 |       initSuts.isEmpty && runningSuts.isEmpty ||
32 |           newState.addedItems > newState.expectedItems ||
33 |           newState.addedItems > 100
34 |     }
35 | 
36 |     override def destroySut(sut: Sut): Unit =
37 |       sut.dispose()
38 | 
39 |     override def genInitialState: Gen[State] =
40 |       Gen.chooseNum[Long](1, Int.MaxValue).map(State(_, 0))
41 | 
42 |     override def newSut(state: State): Sut =
43 |       BloomFilter[T](state.expectedItems, 0.01)
44 | 
45 |     def initialPreCondition(state: State): Boolean = true
46 | 
47 |     def genCommand(state: State): Gen[Command] =
48 |       for {
49 |         item <- Arbitrary.arbitrary[T]
50 |       } yield commandSequence(AddItem(item), CheckItem(item))
51 | 
52 |     case class AddItem(item: T) extends UnitCommand {
53 |       def run(sut: Sut): Unit = sut.synchronized(sut.add(item))
54 |       def nextState(state: State) = state.copy(addedItems = state.addedItems + 1)
55 |       def preCondition(state: State) = true
56 |       def postCondition(state: State, success: Boolean) = success
57 |     }
58 | 
59 |     case class CheckItem(item: T) extends SuccessCommand {
60 |       type Result = Boolean
61 |       def run(sut: Sut): Boolean = sut.synchronized(sut.mightContain(item))
62 |       def nextState(state: State) = state
63 |       def preCondition(state: State) = true
64 |       def postCondition(state: State, result: Boolean): Prop = result
65 |     }
66 | 
67 |   }
68 | 
69 |   private val elemsToAddGen = for {
70 |     numberOfElemsToAdd <- Gen.chooseNum[Int](1, 1000)
71 |     elemsToAdd <- Gen.listOfN(numberOfElemsToAdd, arbitrary[Long])
72 |   } yield elemsToAdd
73 | 
74 |   // TODO fix elemsToAddGen.filter() below, why Gen.listOfN above generates empty lists?
75 |   property("approximateElementCount") = forAll(elemsToAddGen.filter(x => x.size > 10 && x.toSet.size > 10)) { elemsToAdd: List[Long] =>
76 |     val bf = BloomFilter[Long](elemsToAdd.size * 10, 0.0001)
77 |     elemsToAdd.foreach(bf.add)
78 |     val numberOfUnique = elemsToAdd.toSet.size
79 |     math.abs(bf.approximateElementCount() - numberOfUnique) < numberOfUnique * 0.1
80 |   }
81 | 
82 | }
83 | 


--------------------------------------------------------------------------------
/bloom-filter/src/main/scala/bloomfilter/mutable/UnsafeBitArray.scala:
--------------------------------------------------------------------------------
  1 | package bloomfilter.mutable
  2 | 
  3 | import java.io._
  4 | 
  5 | import bloomfilter.util.Unsafe.unsafe
  6 | 
  7 | @SerialVersionUID(2L)
  8 | class UnsafeBitArray(val numberOfBits: Long) extends Serializable {
  9 |   private val indices = math.ceil(numberOfBits.toDouble / 64).toLong
 10 |   @transient
 11 |   private val ptr = unsafe.allocateMemory(8L * indices)
 12 |   unsafe.setMemory(ptr, 8L * indices, 0.toByte)
 13 |   private var bitCount = 0L
 14 | 
 15 |   def get(index: Long): Boolean = {
 16 |     (unsafe.getLong(ptr + (index >>> 6) * 8L) & (1L << index)) != 0
 17 |   }
 18 | 
 19 |   def set(index: Long): Unit = {
 20 |     val offset = ptr + (index >>> 6) * 8L
 21 |     val long = unsafe.getLong(offset)
 22 |     if ((long & (1L << index)) == 0) {
 23 |       unsafe.putLong(offset, long | (1L << index))
 24 |       bitCount += 1
 25 |     }
 26 |   }
 27 | 
 28 |   def combine(that: UnsafeBitArray, combiner: (Long, Long) => Long): UnsafeBitArray = {
 29 |     val result = new UnsafeBitArray(this.numberOfBits)
 30 |     var index = 0L
 31 |     while (index < numberOfBits) {
 32 |       val thisLong = unsafe.getLong(this.ptr + (index >>> 6) * 8L)
 33 |       val thatLong = unsafe.getLong(that.ptr + (index >>> 6) * 8L)
 34 |       val longAtIndex = combiner(thisLong, thatLong)
 35 |       unsafe.putLong(result.ptr + (index >>> 6) * 8L, longAtIndex)
 36 |       index += 64
 37 |     }
 38 |     result
 39 |   }
 40 | 
 41 |   def |(that: UnsafeBitArray): UnsafeBitArray = {
 42 |     require(this.numberOfBits == that.numberOfBits, "Bitwise OR works only on arrays with the same number of bits")
 43 | 
 44 |     combine(that, _ | _)
 45 |   }
 46 | 
 47 |   def &(that: UnsafeBitArray): UnsafeBitArray = {
 48 |     require(this.numberOfBits == that.numberOfBits, "Bitwise AND works only on arrays with the same number of bits")
 49 | 
 50 |     combine(that, _ & _)
 51 |   }
 52 | 
 53 |   def getBitCount: Long = {
 54 |     bitCount
 55 |   }
 56 | 
 57 |   def writeTo(out: OutputStream): Unit = {
 58 |     val dout = new DataOutputStream(out)
 59 |     dout.writeLong(bitCount)
 60 |     var index = 0L
 61 |     while (index < numberOfBits) {
 62 |       dout.writeLong(unsafe.getLong(this.ptr + (index >>> 6) * 8L))
 63 |       index += 64
 64 |     }
 65 |   }
 66 | 
 67 |   def readFrom(in: InputStream): Unit = {
 68 |     val din = new DataInputStream(in)
 69 |     bitCount = din.readLong()
 70 |     var index = 0L
 71 |     while (index < numberOfBits) {
 72 |       unsafe.putLong(this.ptr + (index >>> 6) * 8L, din.readLong())
 73 |       index += 64
 74 |     }
 75 |   }
 76 | 
 77 |   def dispose(): Unit = unsafe.freeMemory(ptr)
 78 | 
 79 |   @throws(classOf[java.io.ObjectStreamException])
 80 |   private def writeReplace: AnyRef = new UnsafeBitArray.SerializedForm(this)
 81 | 
 82 | }
 83 | 
 84 | object UnsafeBitArray {
 85 | 
 86 |   @SerialVersionUID(1L)
 87 |   private class SerializedForm(@transient var unsafeBitArray: UnsafeBitArray) extends Serializable {
 88 |     private def writeObject(oos: ObjectOutputStream): Unit = {
 89 |       oos.defaultWriteObject()
 90 |       oos.writeLong(unsafeBitArray.numberOfBits)
 91 |       unsafeBitArray.writeTo(oos)
 92 |     }
 93 | 
 94 |     private def readObject(ois: ObjectInputStream): Unit = {
 95 |       ois.defaultReadObject()
 96 |       val numberOfBits = ois.readLong()
 97 |       unsafeBitArray = new UnsafeBitArray(numberOfBits)
 98 |       unsafeBitArray.readFrom(ois)
 99 |     }
100 | 
101 |     @throws(classOf[java.io.ObjectStreamException])
102 |     private def readResolve: AnyRef = unsafeBitArray
103 |   }
104 | 
105 | }
106 | 


--------------------------------------------------------------------------------
/bloom-filter/src/main/scala/bloomfilter/mutable/CuckooFilter.scala:
--------------------------------------------------------------------------------
  1 | package bloomfilter.mutable
  2 | 
  3 | import bloomfilter.CanGenerateHashFrom
  4 | 
  5 | @SerialVersionUID(1L)
  6 | class CuckooFilter[T](numberOfBuckets: Long, numberOfBitsPerItem: Int, private val table: UnsafeTable)
  7 |     (implicit canGenerateHash: CanGenerateHashFrom[T]) extends Serializable {
  8 | 
  9 |   def this(numberOfBuckets: Long, numberOfBitsPerItem: Int)(implicit canGenerateHash: CanGenerateHashFrom[T]) {
 10 |     this(numberOfBuckets, numberOfBitsPerItem, new UnsafeTable16Bit(numberOfBuckets))
 11 |   }
 12 | 
 13 |   import CuckooFilter._
 14 | 
 15 |   def add(x: T): Unit = {
 16 |     val hash = canGenerateHash.generateHash(x)
 17 |     val index = indexHash(hash >> 32, numberOfBuckets)
 18 |     val tag = tagHash(hash, numberOfBitsPerItem)
 19 |     if (table.insert(index, tag)) {
 20 |       return
 21 |     }
 22 | 
 23 |     var curIndex = index
 24 |     var curTag = tag
 25 |     var i = 0
 26 |     while (i < MaxAddAttempts) {
 27 |       curIndex = altIndex(curIndex, curTag, numberOfBuckets)
 28 |       val swappedTag = table.swapAny(curIndex, curTag)
 29 |       if (swappedTag == 0) {
 30 |         return
 31 |       }
 32 |       curTag = swappedTag
 33 |       i += 1
 34 |     }
 35 |   }
 36 | 
 37 |   def remove(x: T): Unit = {
 38 |     val hash = canGenerateHash.generateHash(x)
 39 |     val index = indexHash(hash >> 32, numberOfBuckets)
 40 |     val tag = tagHash(hash, numberOfBitsPerItem)
 41 |     if (table.remove(index, tag)) return
 42 |     val index2 = altIndex(index, tag, numberOfBuckets)
 43 |     if (table.remove(index2, tag)) return
 44 |   }
 45 | 
 46 |   def mightContain(x: T): Boolean = {
 47 |     val hash = canGenerateHash.generateHash(x)
 48 |     val index = indexHash(hash >> 32, numberOfBuckets)
 49 |     val tag = tagHash(hash, numberOfBitsPerItem)
 50 |     if (table.find(index, tag)) return true
 51 |     val index2 = altIndex(index, tag, numberOfBuckets)
 52 |     if (table.find(index2, tag)) return true
 53 |     false
 54 |   }
 55 | 
 56 |   def dispose(): Unit = table.dispose()
 57 | }
 58 | 
 59 | object CuckooFilter {
 60 | 
 61 |   // TODO falsePositiveRate?
 62 |   def apply[T](numberOfItems: Long)(implicit canGenerateHash: CanGenerateHashFrom[T]): CuckooFilter[T] = {
 63 |     val nb = optimalNumberOfBuckets(numberOfItems)
 64 |     new CuckooFilter[T](nb, 16, new UnsafeTable16Bit(nb))
 65 |   }
 66 | 
 67 |   def optimalNumberOfBuckets(numberOfItems: Long): Long = {
 68 |     var numberOfBuckets = upperPowerOf2((numberOfItems + UnsafeTable16Bit.TagsPerBucket - 1) / UnsafeTable16Bit.TagsPerBucket)
 69 |     val frac = numberOfItems.toDouble / numberOfBuckets / UnsafeTable16Bit.TagsPerBucket
 70 |     if (frac > 0.96) numberOfBuckets = numberOfBuckets << 1
 71 |     numberOfBuckets
 72 |   }
 73 | 
 74 | 
 75 |   val MaxAddAttempts = 500
 76 | 
 77 |   @inline
 78 |   private def upperPowerOf2(l: Long): Long = {
 79 |     var x = l - 1
 80 |     x |= x >> 1
 81 |     x |= x >> 2
 82 |     x |= x >> 4
 83 |     x |= x >> 8
 84 |     x |= x >> 16
 85 |     x |= x >> 32
 86 |     x += 1
 87 |     x
 88 |   }
 89 | 
 90 |   @inline
 91 |   private def altIndex(index: Long, tag: Long, numberOfBuckets: Long): Long =
 92 |     indexHash(index ^ (tag * 0x5bd1e995), numberOfBuckets)
 93 | 
 94 |   @inline
 95 |   private def indexHash(hash: Long, numberOfBuckets: Long): Long = {
 96 |     hash & (numberOfBuckets - 1)
 97 |   }
 98 | 
 99 |   @inline
100 |   private def tagHash(hash: Long, numberOfBitsPerItem: Int): Long = {
101 |     var tag = hash & ((1L << numberOfBitsPerItem) - 1)
102 |     if (tag == 0) tag += 1
103 |     tag
104 |   }
105 | 
106 | 
107 | }
108 | 


--------------------------------------------------------------------------------
/tests/src/test/scala/tests/bloomfilter/mutable/UnsafeBitArraySpec.scala:
--------------------------------------------------------------------------------
  1 | package tests.bloomfilter.mutable
  2 | 
  3 | import java.io.{ByteArrayInputStream, ByteArrayOutputStream, ObjectInputStream, ObjectOutputStream}
  4 | 
  5 | import bloomfilter.mutable.UnsafeBitArray
  6 | import org.scalacheck.Test.Parameters
  7 | import org.scalacheck.commands.Commands
  8 | import org.scalacheck.{Gen, Prop, Properties}
  9 | import org.scalatest.{Inspectors, Matchers}
 10 | 
 11 | class UnsafeBitArraySpec extends Properties("UnsafeBitArray") with Matchers with Inspectors {
 12 | 
 13 |   property("set & get") = new UnsafeBitArrayCommands().property()
 14 |   property("serializable") = serializationProp
 15 | 
 16 |   override def overrideParameters(p: Parameters): Parameters = {
 17 |     super.overrideParameters(p).withMinSuccessfulTests(100)
 18 |   }
 19 | 
 20 |   class UnsafeBitArrayCommands extends Commands {
 21 |     type Sut = UnsafeBitArray
 22 | 
 23 |     case class State(size: Long)
 24 | 
 25 |     override def canCreateNewSut(
 26 |         newState: State,
 27 |         initSuts: Traversable[State],
 28 |         runningSuts: Traversable[Sut]): Boolean =
 29 |       initSuts.isEmpty && runningSuts.isEmpty
 30 | 
 31 |     override def destroySut(sut: Sut): Unit =
 32 |       sut.dispose()
 33 | 
 34 |     override def genInitialState: Gen[State] =
 35 |       Gen.chooseNum[Long](1, Int.MaxValue * 2L).map(State)
 36 | 
 37 |     override def newSut(state: State): Sut =
 38 |       new UnsafeBitArray(state.size)
 39 | 
 40 |     def initialPreCondition(state: State): Boolean = true
 41 | 
 42 |     def genCommand(state: State): Gen[Command] =
 43 |       for {
 44 |         i <- Gen.choose[Long](0, state.size)
 45 |       } yield commandSequence(SetItem(i), GetItem(i))
 46 | 
 47 |     case class SetItem(i: Long) extends UnitCommand {
 48 |       def run(sut: Sut): Unit = sut.synchronized(sut.set(i))
 49 |       def nextState(state: State): State = state
 50 |       def preCondition(state: State) = true
 51 |       def postCondition(state: State, success: Boolean): Prop = success
 52 |     }
 53 | 
 54 |     case class GetItem(i: Long) extends SuccessCommand {
 55 |       type Result = Boolean
 56 |       def run(sut: Sut): Boolean = sut.synchronized(sut.get(i))
 57 |       def nextState(state: State): State = state
 58 |       def preCondition(state: State) = true
 59 |       def postCondition(state: State, result: Boolean): Prop = result
 60 |     }
 61 | 
 62 |   }
 63 | 
 64 |   def serializationProp: Prop = {
 65 |     case class State(sz: Int, included: Set[Long])
 66 |     val genState = for {
 67 |       sz <- Gen.posNum[Int]
 68 |       included <- Gen.listOf(Gen.choose(0L, sz - 1))
 69 |     } yield {
 70 |       State(sz, included.toSet)
 71 |     }
 72 | 
 73 |     Prop.forAll(genState) {
 74 |       case State(sz, included) =>
 75 |         val bits = new UnsafeBitArray(sz)
 76 |         try {
 77 |           included.foreach(bits.set)
 78 | 
 79 |           val bos = new ByteArrayOutputStream()
 80 |           val oos = new ObjectOutputStream(bos)
 81 |           oos.writeObject(bits)
 82 |           oos.close()
 83 |           val bis = new ByteArrayInputStream(bos.toByteArray)
 84 |           val ois = new ObjectInputStream(bis)
 85 |           val deserialized = ois.readObject()
 86 |           ois.close()
 87 | 
 88 |           deserialized should not be null
 89 |           deserialized should be(a[UnsafeBitArray])
 90 |           val deserializedBits = deserialized.asInstanceOf[UnsafeBitArray]
 91 |           try {
 92 |             deserializedBits.numberOfBits should equal(bits.numberOfBits)
 93 |             forAll(0l until bits.numberOfBits) { idx =>
 94 |               bits.get(idx) should equal(deserializedBits.get(idx))
 95 |             }
 96 |           } finally {
 97 |             deserializedBits.dispose()
 98 |           }
 99 |         } finally bits.dispose()
100 |         Prop.passed
101 |     }
102 |   }
103 | }


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | ## Bloom filter for Scala
 2 | 
 3 | [![Build Status](https://travis-ci.org/alexandrnikitin/bloom-filter-scala.svg?branch=master)](https://travis-ci.org/alexandrnikitin/bloom-filter-scala)
 4 | [![Maven Central](https://maven-badges.herokuapp.com/maven-central/com.github.alexandrnikitin/bloom-filter_2.11/badge.svg)](https://maven-badges.herokuapp.com/maven-central/com.github.alexandrnikitin/bloom-filter_2.11)
 5 | 
 6 | ### Overview
 7 | 
 8 | >"A Bloom filter is a space-efficient probabilistic data structure that is used to test whether an element is a member of a set. False positive matches are possible, but false negatives are not. In other words, a query returns either "possibly in set" or "definitely not in set". Elements can be added to the set, but not removed," says [Wikipedia][wiki-bloom-filter].
 9 | 
10 | What's Bloom filter in a nutshell:
11 | 
12 | - Optimization for memory. It comes into play when you cannot put whole set into memory.
13 | - Solves the membership problem. It can answer one question: does an element belong to a set or not?
14 | - Probabilistic (lossy) data structure. It can answer that an element **probably belongs** to a set with some probability.
15 | 
16 | ### Getting Started
17 | 
18 | ```scala
19 | libraryDependencies += "com.github.alexandrnikitin" %% "bloom-filter" % "latest.release"
20 | ```
21 | 
22 | ```scala
23 | // Create a Bloom filter
24 | val expectedElements = 1000000
25 | val falsePositiveRate = 0.1
26 | val bf = BloomFilter[String](expectedElements, falsePositiveRate)
27 | 
28 | // Put an element
29 | bf.add(element)
30 | 
31 | // Check whether an element in a set
32 | bf.mightContain(element)
33 | 
34 | // Dispose the instance
35 | bf.dispose()
36 | ```
37 | 
38 | ### Motivation
39 | 
40 | You can read about this Bloom filter and motivation behind in [my blog post][post]
41 | 
42 | ### Benchmarks
43 | 
44 | Here's a benchmark for the `String` type and results for other types are very similar to these:
45 | 
46 | ```
47 | [info] Benchmark                                              (length)   Mode  Cnt          Score         Error  Units
48 | [info] alternatives.algebird.StringItemBenchmark.algebirdGet      1024  thrpt   20    1181080.172 ▒    9867.840  ops/s
49 | [info] alternatives.algebird.StringItemBenchmark.algebirdPut      1024  thrpt   20     157158.453 ▒     844.623  ops/s
50 | [info] alternatives.breeze.StringItemBenchmark.breezeGet          1024  thrpt   20    5113222.168 ▒   47005.466  ops/s
51 | [info] alternatives.breeze.StringItemBenchmark.breezePut          1024  thrpt   20    4482377.337 ▒   19971.209  ops/s
52 | [info] alternatives.guava.StringItemBenchmark.guavaGet            1024  thrpt   20    5712237.339 ▒  115453.495  ops/s
53 | [info] alternatives.guava.StringItemBenchmark.guavaPut            1024  thrpt   20    5621712.282 ▒  307133.297  ops/s
54 | 
55 | [info] bloomfilter.mutable.StringItemBenchmark.myGet              1024  thrpt   20   11483828.730 ▒  342980.166  ops/s
56 | [info] bloomfilter.mutable.StringItemBenchmark.myPut              1024  thrpt   20   11634399.272 ▒   45645.105  ops/s
57 | [info] bloomfilter.mutable._128bit.StringItemBenchmark.myGet      1024  thrpt   20   11119086.965 ▒   43696.519  ops/s
58 | [info] bloomfilter.mutable._128bit.StringItemBenchmark.myPut      1024  thrpt   20   11303765.075 ▒   52581.059  ops/s
59 | ```
60 | 
61 | Basically, this implementation is 2x faster than Google's Guava and 10-80x than Twitter's Algebird. Other benchmarks you can find in [the "benchmarks' module on github][github-benchmarks]
62 | 
63 | Warning: These are synthetic benchmarks in isolated environment. Usually the difference in throughput and latency is bigger in production system because it will stress the GC, lead to slow allocation paths and higher latencies, trigger the GC, etc.
64 | 
65 |   [wiki-bloom-filter]: https://en.wikipedia.org/wiki/Bloom_filter
66 |   [post]: https://alexandrnikitin.github.io/blog/bloom-filter-for-scala/
67 |   [github-benchmarks]: https://github.com/alexandrnikitin/bloom-filter-scala/tree/master/benchmarks/src/main/scala
68 | 


--------------------------------------------------------------------------------
/bloom-filter/src/main/scala/bloomfilter/mutable/BloomFilter.scala:
--------------------------------------------------------------------------------
  1 | package bloomfilter.mutable
  2 | 
  3 | import java.io.{DataInputStream, DataOutputStream, InputStream, OutputStream}
  4 | 
  5 | import bloomfilter.CanGenerateHashFrom
  6 | 
  7 | import scala.math._
  8 | 
  9 | @SerialVersionUID(2L)
 10 | class BloomFilter[T] private (val numberOfBits: Long, val numberOfHashes: Int, private val bits: UnsafeBitArray)
 11 |     (implicit canGenerateHash: CanGenerateHashFrom[T]) extends Serializable {
 12 | 
 13 |   def this(numberOfBits: Long, numberOfHashes: Int)(implicit canGenerateHash: CanGenerateHashFrom[T]) {
 14 |     this(numberOfBits, numberOfHashes, new UnsafeBitArray(numberOfBits))
 15 |   }
 16 | 
 17 |   def add(x: T): Unit = {
 18 |     val hash = canGenerateHash.generateHash(x)
 19 |     val hash1 = hash >>> 32
 20 |     val hash2 = (hash << 32) >> 32
 21 | 
 22 |     var i = 0
 23 |     while (i < numberOfHashes) {
 24 |       val computedHash = hash1 + i * hash2
 25 |       bits.set((computedHash & Long.MaxValue) % numberOfBits)
 26 |       i += 1
 27 |     }
 28 |   }
 29 | 
 30 |   def union(that: BloomFilter[T]): BloomFilter[T] = {
 31 |     require(this.numberOfBits == that.numberOfBits && this.numberOfHashes == that.numberOfHashes,
 32 |       s"Union works only on BloomFilters with the same number of hashes and of bits")
 33 |     new BloomFilter[T](this.numberOfBits, this.numberOfHashes, this.bits | that.bits)
 34 |   }
 35 | 
 36 |   def intersect(that: BloomFilter[T]): BloomFilter[T] = {
 37 |     require(this.numberOfBits == that.numberOfBits && this.numberOfHashes == that.numberOfHashes,
 38 |       s"Intersect works only on BloomFilters with the same number of hashes and of bits")
 39 |     new BloomFilter[T](this.numberOfBits, this.numberOfHashes, this.bits & that.bits)
 40 |   }
 41 | 
 42 |   def mightContain(x: T): Boolean = {
 43 |     val hash = canGenerateHash.generateHash(x)
 44 |     val hash1 = hash >>> 32
 45 |     val hash2 = (hash << 32) >> 32
 46 |     var i = 0
 47 |     while (i < numberOfHashes) {
 48 |       val computedHash = hash1 + i * hash2
 49 |       if (!bits.get((computedHash & Long.MaxValue) % numberOfBits))
 50 |         return false
 51 |       i += 1
 52 |     }
 53 |     true
 54 |   }
 55 | 
 56 |   def expectedFalsePositiveRate(): Double = {
 57 |     math.pow(bits.getBitCount.toDouble / numberOfBits, numberOfHashes.toDouble)
 58 |   }
 59 | 
 60 |   def writeTo(out: OutputStream): Unit = {
 61 |     val dout = new DataOutputStream(out)
 62 |     dout.writeLong(numberOfBits)
 63 |     dout.writeInt(numberOfHashes)
 64 |     bits.writeTo(out)
 65 |   }
 66 | 
 67 |   def approximateElementCount(): Long = {
 68 |     val fractionOfBitsSet = bits.getBitCount.toDouble / numberOfBits
 69 |     val x = -log1p(-fractionOfBitsSet) * numberOfBits / numberOfHashes
 70 |     val z = rint(x)
 71 |     if (abs(x - z) == 0.5) {
 72 |       (x + Math.copySign(0.5, x)).toLong
 73 |     } else {
 74 |       z.toLong
 75 |     }
 76 |   }
 77 | 
 78 |   def dispose(): Unit = bits.dispose()
 79 | 
 80 | }
 81 | 
 82 | object BloomFilter {
 83 | 
 84 |   def apply[T](numberOfItems: Long, falsePositiveRate: Double)
 85 |       (implicit canGenerateHash: CanGenerateHashFrom[T]): BloomFilter[T] = {
 86 | 
 87 |     val nb = optimalNumberOfBits(numberOfItems, falsePositiveRate)
 88 |     val nh = optimalNumberOfHashes(numberOfItems, nb)
 89 |     new BloomFilter[T](nb, nh)
 90 |   }
 91 | 
 92 |   def optimalNumberOfBits(numberOfItems: Long, falsePositiveRate: Double): Long = {
 93 |     math.ceil(-1 * numberOfItems * math.log(falsePositiveRate) / math.log(2) / math.log(2)).toLong
 94 |   }
 95 | 
 96 |   def optimalNumberOfHashes(numberOfItems: Long, numberOfBits: Long): Int = {
 97 |     math.ceil(numberOfBits / numberOfItems * math.log(2)).toInt
 98 |   }
 99 | 
100 |   def readFrom[T](in: InputStream)(implicit canGenerateHash: CanGenerateHashFrom[T]): BloomFilter[T] = {
101 |     val din = new DataInputStream(in)
102 |     val numberOfBits = din.readLong()
103 |     val numberOfHashes = din.readInt()
104 |     val bits = new UnsafeBitArray(numberOfBits)
105 |     bits.readFrom(in)
106 |     new BloomFilter[T](numberOfBits, numberOfHashes, bits)
107 |   }
108 | 
109 | }
110 | 


--------------------------------------------------------------------------------
/tests/src/test/scala/tests/bloomfilter/mutable/_128bit/BloomFilterSpec.scala:
--------------------------------------------------------------------------------
  1 | package tests.bloomfilter.mutable._128bit
  2 | 
  3 | import java.io.{ByteArrayInputStream, ByteArrayOutputStream, ObjectInputStream, ObjectOutputStream}
  4 | 
  5 | import bloomfilter.{CanGenerate128HashFrom, CanGenerateHashFrom}
  6 | import bloomfilter.mutable._128bit.BloomFilter
  7 | import org.scalacheck.Arbitrary.arbitrary
  8 | import org.scalacheck.Test.Parameters
  9 | import org.scalacheck.commands.Commands
 10 | import org.scalacheck.{Arbitrary, Gen, Prop, Properties}
 11 | import org.scalatest.{Inspectors, Matchers}
 12 | 
 13 | class BloomFilterSpec extends Properties("BloomFilter_128bit") with Matchers with Inspectors {
 14 | 
 15 |   property("for Long") = new BloomFilterCommands[Long].property()
 16 |   property("for String") = new BloomFilterCommands[String].property()
 17 |   property("for Array[Byte]") = new BloomFilterCommands[Array[Byte]].property()
 18 | 
 19 | 
 20 |   override def overrideParameters(p: Parameters): Parameters = {
 21 |     super.overrideParameters(p).withMinSuccessfulTests(100)
 22 |   }
 23 | 
 24 |   class BloomFilterCommands[T: Arbitrary](implicit canGenerateHash: CanGenerate128HashFrom[T]) extends Commands {
 25 |     type Sut = BloomFilter[T]
 26 | 
 27 |     case class State(expectedItems: Long, addedItems: Long)
 28 | 
 29 |     override def canCreateNewSut(
 30 |                                   newState: State,
 31 |                                   initSuts: Traversable[State],
 32 |                                   runningSuts: Traversable[Sut]): Boolean = {
 33 |       initSuts.isEmpty && runningSuts.isEmpty ||
 34 |         newState.addedItems > newState.expectedItems ||
 35 |         newState.addedItems > 100
 36 |     }
 37 | 
 38 |     override def destroySut(sut: Sut): Unit =
 39 |       sut.dispose()
 40 | 
 41 |     override def genInitialState: Gen[State] =
 42 |       Gen.chooseNum[Long](1, Int.MaxValue).map(State(_, 0))
 43 | 
 44 |     override def newSut(state: State): Sut =
 45 |       BloomFilter[T](state.expectedItems, 0.01)
 46 | 
 47 |     def initialPreCondition(state: State): Boolean = true
 48 | 
 49 |     def genCommand(state: State): Gen[Command] =
 50 |       for {
 51 |         item <- Arbitrary.arbitrary[T]
 52 |       } yield commandSequence(AddItem(item), CheckItem(item))
 53 | 
 54 |     case class AddItem(item: T) extends UnitCommand {
 55 |       def run(sut: Sut): Unit = sut.synchronized(sut.add(item))
 56 | 
 57 |       def nextState(state: State) = state.copy(addedItems = state.addedItems + 1)
 58 | 
 59 |       def preCondition(state: State) = true
 60 | 
 61 |       def postCondition(state: State, success: Boolean) = success
 62 |     }
 63 | 
 64 |     case class CheckItem(item: T) extends SuccessCommand {
 65 |       type Result = Boolean
 66 | 
 67 |       def run(sut: Sut): Boolean = sut.synchronized(sut.mightContain(item))
 68 | 
 69 |       def nextState(state: State) = state
 70 | 
 71 |       def preCondition(state: State) = true
 72 | 
 73 |       def postCondition(state: State, result: Boolean): Prop = result
 74 |     }
 75 | 
 76 |   }
 77 | 
 78 |   property("supports java serialization") = {
 79 |     val gen = Gen.listOf(Gen.posNum[Long])
 80 | 
 81 |     Prop.forAll(gen) { indices =>
 82 |       val sz = indices.size max 1
 83 |       val bf1 = BloomFilter[Long](sz, 0.01)
 84 |       try {
 85 |         indices foreach bf1.add
 86 |         val bos = new ByteArrayOutputStream
 87 |         val oos = new ObjectOutputStream(bos)
 88 |         oos.writeObject(bf1)
 89 |         oos.close()
 90 |         val bis = new ByteArrayInputStream(bos.toByteArray)
 91 |         val ois = new ObjectInputStream(bis)
 92 |         val deserialized = ois.readObject()
 93 |         deserialized should not be (null)
 94 |         deserialized should be (a[BloomFilter[Long]])
 95 |         val bf2 = deserialized.asInstanceOf[BloomFilter[Long]]
 96 |         try{
 97 |           bf2.numberOfBits shouldEqual bf1.numberOfBits
 98 |           bf1.numberOfHashes shouldEqual bf1.numberOfHashes
 99 | 
100 |           forAll(indices){ idx =>
101 |             bf2.mightContain(idx) shouldBe true
102 |           }
103 |           Prop.passed
104 |         } finally  bf2.dispose()
105 |       } finally bf1.dispose()
106 |     }
107 |   }
108 | 
109 |   private val elemsToAddGen = for {
110 |     numberOfElemsToAdd <- Gen.chooseNum[Int](1, 1000)
111 |     elemsToAdd <- Gen.listOfN(numberOfElemsToAdd, arbitrary[Long])
112 |   } yield elemsToAdd
113 | 
114 |   // TODO fix elemsToAddGen.filter() below, why Gen.listOfN above generates empty lists?
115 |   property("approximateElementCount") = Prop.forAll(elemsToAddGen.filter(x => x.size > 10 && x.toSet.size > 10)) { elemsToAdd: List[Long] =>
116 |     val bf = BloomFilter[Long](elemsToAdd.size * 10, 0.0001)
117 |     elemsToAdd.foreach(bf.add)
118 |     val numberOfUnique = elemsToAdd.toSet.size
119 |     math.abs(bf.approximateElementCount() - numberOfUnique) < numberOfUnique * 0.1
120 |   }
121 | 
122 | }
123 | 


--------------------------------------------------------------------------------
/tests/src/test/scala/tests/bloomfilter/mutable/CuckooFilterSpec.scala:
--------------------------------------------------------------------------------
  1 | package tests.bloomfilter.mutable
  2 | 
  3 | import java.io.{ByteArrayInputStream, ByteArrayOutputStream, ObjectInputStream, ObjectOutputStream}
  4 | 
  5 | import bloomfilter.CanGenerateHashFrom
  6 | import bloomfilter.mutable.CuckooFilter
  7 | import org.scalacheck.Test.Parameters
  8 | import org.scalacheck.commands.Commands
  9 | import org.scalacheck.{Arbitrary, Gen, Prop, Properties}
 10 | import org.scalatest.{Inspectors, Matchers}
 11 | 
 12 | class CuckooFilterSpec extends Properties("CuckooFilter") with Matchers with Inspectors {
 13 | 
 14 |   property("for Long") = new CuckooFilterCommands[Long].property()
 15 |   property("for String") = new CuckooFilterCommands[String].property()
 16 |   property("for Array[Byte]") = new CuckooFilterCommands[Array[Byte]].property()
 17 | 
 18 | 
 19 |   override def overrideParameters(p: Parameters): Parameters = {
 20 |     super.overrideParameters(p).withMinSuccessfulTests(100)
 21 |   }
 22 | 
 23 |   class CuckooFilterCommands[T: Arbitrary](implicit canGenerateHash: CanGenerateHashFrom[T]) extends Commands {
 24 |     type Sut = CuckooFilter[T]
 25 | 
 26 |     case class State(expectedItems: Long, addedItems: Long)
 27 | 
 28 |     override def canCreateNewSut(
 29 |         newState: State,
 30 |         initSuts: Traversable[State],
 31 |         runningSuts: Traversable[Sut]): Boolean = {
 32 |       initSuts.isEmpty && runningSuts.isEmpty
 33 |     }
 34 | 
 35 |     override def destroySut(sut: Sut): Unit =
 36 |       sut.dispose()
 37 | 
 38 |     override def genInitialState: Gen[State] =
 39 |       Gen.chooseNum[Long](1, 100000).map(State(_, 0))
 40 | 
 41 |     override def newSut(state: State): Sut =
 42 |       CuckooFilter[T](state.expectedItems)
 43 | 
 44 |     def initialPreCondition(state: State): Boolean = true
 45 | 
 46 |     def genCommand(state: State): Gen[Command] =
 47 |       for {
 48 |         item <- Arbitrary.arbitrary[T]
 49 |       } yield commandSequence(AddItem(item), CheckItem(item), RemoveItem(item))
 50 | 
 51 |     case class AddItem(item: T) extends UnitCommand {
 52 |       def run(sut: Sut): Unit = sut.synchronized(sut.add(item))
 53 |       def nextState(state: State): State = state.copy(addedItems = state.addedItems + 1)
 54 |       def preCondition(state: State): Boolean = state.addedItems < state.expectedItems
 55 |       def postCondition(state: State, success: Boolean): Prop = success
 56 |     }
 57 | 
 58 |     case class RemoveItem(item: T) extends SuccessCommand {
 59 |       type Result = Boolean
 60 | 
 61 |       def run(sut: Sut): Boolean = sut.synchronized {
 62 |         sut.remove(item)
 63 |         !sut.mightContain(item)
 64 |       }
 65 |       def nextState(state: State): State = state.copy(addedItems = state.addedItems - 1)
 66 |       def preCondition(state: State): Boolean = state.addedItems < state.expectedItems
 67 |       def postCondition(state: State, success: Boolean): Prop = success
 68 |     }
 69 | 
 70 |     case class CheckItem(item: T) extends SuccessCommand {
 71 |       type Result = Boolean
 72 |       def run(sut: Sut): Boolean = sut.synchronized(sut.mightContain(item))
 73 |       def nextState(state: State): State = state
 74 |       def preCondition(state: State): Boolean = state.addedItems < state.expectedItems
 75 |       def postCondition(state: State, result: Boolean): Prop = result
 76 |     }
 77 | 
 78 |   }
 79 | 
 80 |   property("strange case") = Prop {
 81 |     val lst = List(-1l, 0l)
 82 |     val cf = CuckooFilter[Long](lst.size)
 83 |     lst.foreach(cf.add)
 84 | 
 85 |     lst.forall(cf.mightContain)
 86 |   }
 87 | 
 88 |   property("strange case #2") = Prop {
 89 |     val lst = List(0l, 0, 0, 0, 0, 0, 0, 0, 4)
 90 |     //the x3 size factor here enables 4 to end up in a different bucket than the 3 0's, their bucket overflows after the first four inserts
 91 |     val cf = CuckooFilter[Long](lst.size * 3)
 92 |     lst.foreach(cf.add)
 93 | 
 94 |     lst.forall(cf.mightContain)
 95 |   }
 96 | 
 97 |   property("supports java serialization") = {
 98 |     val gen = Gen.listOf(Arbitrary.arbLong.arbitrary)
 99 |     Prop.forAll(gen) { lst =>
100 |       val sz = math.max(lst.size, 1)
101 |       //we add n x3 factor to reduce probability for buckets overflowing during inserts
102 |       val sut = CuckooFilter[Long](sz * 3)
103 |       try {
104 |         lst foreach sut.add
105 |         val bos = new ByteArrayOutputStream
106 |         val oos = new ObjectOutputStream(bos)
107 |         oos.writeObject(sut)
108 |         oos.close()
109 |         val bis = new ByteArrayInputStream(bos.toByteArray)
110 |         val ois = new ObjectInputStream(bis)
111 |         val deserialized = ois.readObject()
112 |         ois.close()
113 | 
114 |         deserialized should not be null
115 |         deserialized should be(a[CuckooFilter[Long]])
116 |         val sut2 = deserialized.asInstanceOf[CuckooFilter[Long]]
117 |         try {
118 |           forAll(lst) { k =>
119 |             withClue(k) {
120 |               //we use a realxed condition here,
121 |               //the reason for this is potential (and actual) buckets overflowing in the underlying UnsafeTable16.
122 |               //a different aproach might be generating the keys in a way that limits them according to number of buckets and number of tags in each bucket.
123 |               sut2.mightContain(k) shouldEqual sut.mightContain(k)
124 |             }
125 |           }
126 |           Prop.passed
127 |         } finally sut2.dispose()
128 |       } finally sut.dispose()
129 |     }
130 |   }
131 | 
132 | }
133 | 


--------------------------------------------------------------------------------
/bloom-filter/src/main/scala/bloomfilter/hashing/MurmurHash3Generic.scala:
--------------------------------------------------------------------------------
  1 | package bloomfilter.hashing
  2 | 
  3 | import java.lang.Long.rotateLeft
  4 | 
  5 | import bloomfilter.CanGetDataFrom
  6 | 
  7 | object MurmurHash3Generic {
  8 | 
  9 |   private val c1: Long = 0x87c37b91114253d5L
 10 |   private val c2: Long = 0x4cf5ad432745937fL
 11 | 
 12 |   def fmix64(l: Long): Long = {
 13 |     var k = l
 14 |     k ^= k >>> 33
 15 |     k *= 0xff51afd7ed558ccdL
 16 |     k ^= k >>> 33
 17 |     k *= 0xc4ceb9fe1a85ec53L
 18 |     k ^= k >>> 33
 19 |     k
 20 |   }
 21 | 
 22 |   def murmurhash3_x64_128[From](key: From, offset: Int, len: Int, seed: Int)(implicit cgdf: CanGetDataFrom[From]): (Long, Long) = {
 23 |     var h1: Long = seed & 0x00000000FFFFFFFFL
 24 |     var h2: Long = seed & 0x00000000FFFFFFFFL
 25 | 
 26 |     val roundedEnd = offset + (len & 0xFFFFFFF0);  // round down to 16 byte block
 27 | 
 28 |     var i = offset
 29 |     while (i < roundedEnd) {
 30 |       var k1 = cgdf.getLong(key, i)
 31 |       var k2 = cgdf.getLong(key, i + 8)
 32 |       k1 *= c1; k1 = rotateLeft(k1, 31); k1 *= c2; h1 ^= k1
 33 |       h1 = rotateLeft(h1, 27); h1 += h2; h1 = h1 * 5 + 0x52dce729
 34 |       k2 *= c2; k2 = rotateLeft(k2, 33); k2 *= c1; h2 ^= k2
 35 |       h2 = rotateLeft(h2, 31); h2 += h1; h2 = h2 * 5 + 0x38495ab5
 36 | 
 37 |       i += 16
 38 |     }
 39 | 
 40 |     var k1: Long = 0
 41 |     var k2: Long = 0
 42 | 
 43 |     val lenVar = len & 15
 44 |     if (lenVar == 15) k2 = (cgdf.getByte(key, roundedEnd + 14) & 0xffL) << 48
 45 |     if (lenVar >= 14) k2 |= (cgdf.getByte(key, roundedEnd + 13) & 0xffL) << 40
 46 |     if (lenVar >= 13) k2 |= (cgdf.getByte(key, roundedEnd + 12) & 0xffL) << 32
 47 |     if (lenVar >= 12) k2 |= (cgdf.getByte(key, roundedEnd + 11) & 0xffL) << 24
 48 |     if (lenVar >= 11) k2 |= (cgdf.getByte(key, roundedEnd + 10) & 0xffL) << 16
 49 |     if (lenVar >= 10) k2 |= (cgdf.getByte(key, roundedEnd + 9) & 0xffL) << 8
 50 |     if (lenVar >= 9) {
 51 |       k2 |= (cgdf.getByte(key, roundedEnd + 8) & 0xffL)
 52 |       k2 *= c2
 53 |       k2 = rotateLeft(k2, 33)
 54 |       k2 *= c1
 55 |       h2 ^= k2
 56 |     }
 57 |     if (lenVar >= 8) k1 = cgdf.getByte(key, roundedEnd + 7).toLong << 56
 58 |     if (lenVar >= 7) k1 |= (cgdf.getByte(key, roundedEnd + 6) & 0xffL) << 48
 59 |     if (lenVar >= 6) k1 |= (cgdf.getByte(key, roundedEnd + 5) & 0xffL) << 40
 60 |     if (lenVar >= 5) k1 |= (cgdf.getByte(key, roundedEnd + 4) & 0xffL) << 32
 61 |     if (lenVar >= 4) k1 |= (cgdf.getByte(key, roundedEnd + 3) & 0xffL) << 24
 62 |     if (lenVar >= 3) k1 |= (cgdf.getByte(key, roundedEnd + 2) & 0xffL) << 16
 63 |     if (lenVar >= 2) k1 |= (cgdf.getByte(key, roundedEnd + 1) & 0xffL) << 8
 64 |     if (lenVar >= 1) {
 65 |       k1 |= (cgdf.getByte(key, roundedEnd) & 0xffL)
 66 |       k1 *= c1
 67 |       k1 = rotateLeft(k1, 31)
 68 |       k1 *= c2
 69 |       h1 ^= k1
 70 |     }
 71 | 
 72 |     h1 ^= len; h2 ^= len
 73 | 
 74 |     h1 += h2
 75 |     h2 += h1
 76 | 
 77 |     h1 = fmix64(h1)
 78 |     h2 = fmix64(h2)
 79 | 
 80 |     h1 += h2
 81 |     h2 += h1
 82 | 
 83 |     (h1, h2)
 84 |   }
 85 | 
 86 |   def murmurhash3_x64_64[From](key: From, offset: Int, len: Int, seed: Int)(implicit cgdf: CanGetDataFrom[From]): Long = {
 87 |     var h1: Long = seed & 0x00000000FFFFFFFFL
 88 |     var h2: Long = seed & 0x00000000FFFFFFFFL
 89 | 
 90 |     val roundedEnd = offset + (len & 0xFFFFFFF0);  // round down to 16 byte block
 91 | 
 92 |     var i = offset
 93 |     while (i < roundedEnd) {
 94 |       var k1 = cgdf.getLong(key, i)
 95 |       var k2 = cgdf.getLong(key, i + 8)
 96 |       k1 *= c1; k1 = rotateLeft(k1, 31); k1 *= c2; h1 ^= k1
 97 |       h1 = rotateLeft(h1, 27); h1 += h2; h1 = h1 * 5 + 0x52dce729
 98 |       k2 *= c2; k2 = rotateLeft(k2, 33); k2 *= c1; h2 ^= k2
 99 |       h2 = rotateLeft(h2, 31); h2 += h1; h2 = h2 * 5 + 0x38495ab5
100 | 
101 |       i += 16
102 |     }
103 | 
104 |     var k1: Long = 0
105 |     var k2: Long = 0
106 | 
107 |     val lenVar = len & 15
108 |     if (lenVar == 15) k2 = (cgdf.getByte(key, roundedEnd + 14) & 0xffL) << 48
109 |     if (lenVar >= 14) k2 |= (cgdf.getByte(key, roundedEnd + 13) & 0xffL) << 40
110 |     if (lenVar >= 13) k2 |= (cgdf.getByte(key, roundedEnd + 12) & 0xffL) << 32
111 |     if (lenVar >= 12) k2 |= (cgdf.getByte(key, roundedEnd + 11) & 0xffL) << 24
112 |     if (lenVar >= 11) k2 |= (cgdf.getByte(key, roundedEnd + 10) & 0xffL) << 16
113 |     if (lenVar >= 10) k2 |= (cgdf.getByte(key, roundedEnd + 9) & 0xffL) << 8
114 |     if (lenVar >= 9) {
115 |       k2 |= (cgdf.getByte(key, roundedEnd + 8) & 0xffL)
116 |       k2 *= c2
117 |       k2 = rotateLeft(k2, 33)
118 |       k2 *= c1
119 |       h2 ^= k2
120 |     }
121 |     if (lenVar >= 8) k1 = cgdf.getByte(key, roundedEnd + 7).toLong << 56
122 |     if (lenVar >= 7) k1 |= (cgdf.getByte(key, roundedEnd + 6) & 0xffL) << 48
123 |     if (lenVar >= 6) k1 |= (cgdf.getByte(key, roundedEnd + 5) & 0xffL) << 40
124 |     if (lenVar >= 5) k1 |= (cgdf.getByte(key, roundedEnd + 4) & 0xffL) << 32
125 |     if (lenVar >= 4) k1 |= (cgdf.getByte(key, roundedEnd + 3) & 0xffL) << 24
126 |     if (lenVar >= 3) k1 |= (cgdf.getByte(key, roundedEnd + 2) & 0xffL) << 16
127 |     if (lenVar >= 2) k1 |= (cgdf.getByte(key, roundedEnd + 1) & 0xffL) << 8
128 |     if (lenVar >= 1) {
129 |       k1 |= (cgdf.getByte(key, roundedEnd) & 0xffL)
130 |       k1 *= c1
131 |       k1 = rotateLeft(k1, 31)
132 |       k1 *= c2
133 |       h1 ^= k1
134 |     }
135 | 
136 |     h1 ^= len; h2 ^= len
137 | 
138 |     h1 += h2
139 |     h2 += h1
140 | 
141 |     h1 = fmix64(h1)
142 |     h2 = fmix64(h2)
143 | 
144 |     h1 += h2
145 |     h2 += h1
146 | 
147 |     h1 + h2
148 |   }
149 | }
150 | 


--------------------------------------------------------------------------------
/sandbox/src/main/java/sandbox/hashing/CassandraMurmurHash.java:
--------------------------------------------------------------------------------
  1 | package sandbox.hashing;
  2 | 
  3 | import java.nio.ByteBuffer;
  4 | 
  5 | public class CassandraMurmurHash
  6 | {
  7 |     public static int hash32(ByteBuffer data, int offset, int length, int seed)
  8 |     {
  9 |         int m = 0x5bd1e995;
 10 |         int r = 24;
 11 | 
 12 |         int h = seed ^ length;
 13 | 
 14 |         int len_4 = length >> 2;
 15 | 
 16 |         for (int i = 0; i < len_4; i++)
 17 |         {
 18 |             int i_4 = i << 2;
 19 |             int k = data.get(offset + i_4 + 3);
 20 |             k = k << 8;
 21 |             k = k | (data.get(offset + i_4 + 2) & 0xff);
 22 |             k = k << 8;
 23 |             k = k | (data.get(offset + i_4 + 1) & 0xff);
 24 |             k = k << 8;
 25 |             k = k | (data.get(offset + i_4 + 0) & 0xff);
 26 |             k *= m;
 27 |             k ^= k >>> r;
 28 |             k *= m;
 29 |             h *= m;
 30 |             h ^= k;
 31 |         }
 32 | 
 33 |         // avoid calculating modulo
 34 |         int len_m = len_4 << 2;
 35 |         int left = length - len_m;
 36 | 
 37 |         if (left != 0)
 38 |         {
 39 |             if (left >= 3)
 40 |             {
 41 |                 h ^= (int) data.get(offset + length - 3) << 16;
 42 |             }
 43 |             if (left >= 2)
 44 |             {
 45 |                 h ^= (int) data.get(offset + length - 2) << 8;
 46 |             }
 47 |             if (left >= 1)
 48 |             {
 49 |                 h ^= (int) data.get(offset + length - 1);
 50 |             }
 51 | 
 52 |             h *= m;
 53 |         }
 54 | 
 55 |         h ^= h >>> 13;
 56 |         h *= m;
 57 |         h ^= h >>> 15;
 58 | 
 59 |         return h;
 60 |     }
 61 | 
 62 |     public static long hash2_64(ByteBuffer key, int offset, int length, long seed)
 63 |     {
 64 |         long m64 = 0xc6a4a7935bd1e995L;
 65 |         int r64 = 47;
 66 | 
 67 |         long h64 = (seed & 0xffffffffL) ^ (m64 * length);
 68 | 
 69 |         int lenLongs = length >> 3;
 70 | 
 71 |         for (int i = 0; i < lenLongs; ++i)
 72 |         {
 73 |             int i_8 = i << 3;
 74 | 
 75 |             long k64 =  ((long)  key.get(offset+i_8+0) & 0xff)      + (((long) key.get(offset+i_8+1) & 0xff)<<8)  +
 76 |                         (((long) key.get(offset+i_8+2) & 0xff)<<16) + (((long) key.get(offset+i_8+3) & 0xff)<<24) +
 77 |                         (((long) key.get(offset+i_8+4) & 0xff)<<32) + (((long) key.get(offset+i_8+5) & 0xff)<<40) +
 78 |                         (((long) key.get(offset+i_8+6) & 0xff)<<48) + (((long) key.get(offset+i_8+7) & 0xff)<<56);
 79 | 
 80 |             k64 *= m64;
 81 |             k64 ^= k64 >>> r64;
 82 |             k64 *= m64;
 83 | 
 84 |             h64 ^= k64;
 85 |             h64 *= m64;
 86 |         }
 87 | 
 88 |         int rem = length & 0x7;
 89 | 
 90 |         switch (rem)
 91 |         {
 92 |         case 0:
 93 |             break;
 94 |         case 7:
 95 |             h64 ^= (long) key.get(offset + length - rem + 6) << 48;
 96 |         case 6:
 97 |             h64 ^= (long) key.get(offset + length - rem + 5) << 40;
 98 |         case 5:
 99 |             h64 ^= (long) key.get(offset + length - rem + 4) << 32;
100 |         case 4:
101 |             h64 ^= (long) key.get(offset + length - rem + 3) << 24;
102 |         case 3:
103 |             h64 ^= (long) key.get(offset + length - rem + 2) << 16;
104 |         case 2:
105 |             h64 ^= (long) key.get(offset + length - rem + 1) << 8;
106 |         case 1:
107 |             h64 ^= (long) key.get(offset + length - rem);
108 |             h64 *= m64;
109 |         }
110 | 
111 |         h64 ^= h64 >>> r64;
112 |         h64 *= m64;
113 |         h64 ^= h64 >>> r64;
114 | 
115 |         return h64;
116 |     }
117 | 
118 |     protected static long getblock(ByteBuffer key, int offset, int index)
119 |     {
120 |         int i_8 = index << 3;
121 |         return ((long) key.get(offset + i_8 + 0) & 0xff) + (((long) key.get(offset + i_8 + 1) & 0xff) << 8) +
122 |                (((long) key.get(offset + i_8 + 2) & 0xff) << 16) + (((long) key.get(offset + i_8 + 3) & 0xff) << 24) +
123 |                (((long) key.get(offset + i_8 + 4) & 0xff) << 32) + (((long) key.get(offset + i_8 + 5) & 0xff) << 40) +
124 |                (((long) key.get(offset + i_8 + 6) & 0xff) << 48) + (((long) key.get(offset + i_8 + 7) & 0xff) << 56);
125 |     }
126 | 
127 |     protected static long rotl64(long v, int n)
128 |     {
129 |         return ((v << n) | (v >>> (64 - n)));
130 |     }
131 | 
132 |     protected static long fmix(long k)
133 |     {
134 |         k ^= k >>> 33;
135 |         k *= 0xff51afd7ed558ccdL;
136 |         k ^= k >>> 33;
137 |         k *= 0xc4ceb9fe1a85ec53L;
138 |         k ^= k >>> 33;
139 | 
140 |         return k;
141 |     }
142 | 
143 |     public static long[] hash3_x64_128(ByteBuffer key, int offset, int length, long seed)
144 |     {
145 |         final int nblocks = length >> 4; // Process as 128-bit blocks.
146 | 
147 |         long h1 = seed;
148 |         long h2 = seed;
149 | 
150 |         long c1 = 0x87c37b91114253d5L;
151 |         long c2 = 0x4cf5ad432745937fL;
152 | 
153 |         //----------
154 |         // body
155 | 
156 |         for(int i = 0; i < nblocks; i++)
157 |         {
158 |             long k1 = getblock(key, offset, i*2+0);
159 |             long k2 = getblock(key, offset, i*2+1);
160 | 
161 |             k1 *= c1; k1 = rotl64(k1,31); k1 *= c2; h1 ^= k1;
162 | 
163 |             h1 = rotl64(h1,27); h1 += h2; h1 = h1*5+0x52dce729;
164 | 
165 |             k2 *= c2; k2  = rotl64(k2,33); k2 *= c1; h2 ^= k2;
166 | 
167 |             h2 = rotl64(h2,31); h2 += h1; h2 = h2*5+0x38495ab5;
168 |         }
169 | 
170 |         //----------
171 |         // tail
172 | 
173 |         // Advance offset to the unprocessed tail of the data.
174 |         offset += nblocks * 16;
175 | 
176 |         long k1 = 0;
177 |         long k2 = 0;
178 | 
179 |         switch(length & 15)
180 |         {
181 |         case 15: k2 ^= ((long) key.get(offset+14)) << 48;
182 |         case 14: k2 ^= ((long) key.get(offset+13)) << 40;
183 |         case 13: k2 ^= ((long) key.get(offset+12)) << 32;
184 |         case 12: k2 ^= ((long) key.get(offset+11)) << 24;
185 |         case 11: k2 ^= ((long) key.get(offset+10)) << 16;
186 |         case 10: k2 ^= ((long) key.get(offset+9)) << 8;
187 |         case  9: k2 ^= ((long) key.get(offset+8)) << 0;
188 |             k2 *= c2; k2  = rotl64(k2,33); k2 *= c1; h2 ^= k2;
189 | 
190 |         case  8: k1 ^= ((long) key.get(offset+7)) << 56;
191 |         case  7: k1 ^= ((long) key.get(offset+6)) << 48;
192 |         case  6: k1 ^= ((long) key.get(offset+5)) << 40;
193 |         case  5: k1 ^= ((long) key.get(offset+4)) << 32;
194 |         case  4: k1 ^= ((long) key.get(offset+3)) << 24;
195 |         case  3: k1 ^= ((long) key.get(offset+2)) << 16;
196 |         case  2: k1 ^= ((long) key.get(offset+1)) << 8;
197 |         case  1: k1 ^= ((long) key.get(offset));
198 |             k1 *= c1; k1  = rotl64(k1,31); k1 *= c2; h1 ^= k1;
199 |         };
200 | 
201 |         //----------
202 |         // finalization
203 | 
204 |         h1 ^= length; h2 ^= length;
205 | 
206 |         h1 += h2;
207 |         h2 += h1;
208 | 
209 |         h1 = fmix(h1);
210 |         h2 = fmix(h2);
211 | 
212 |         h1 += h2;
213 |         h2 += h1;
214 | 
215 |         return(new long[] {h1, h2});
216 |     }
217 | }


--------------------------------------------------------------------------------
/tests/src/test/scala/tests/bloomfilter/mutable/UnsafeTableSpec.scala:
--------------------------------------------------------------------------------
  1 | package tests.bloomfilter.mutable
  2 | 
  3 | import java.io.{ByteArrayInputStream, ByteArrayOutputStream, ObjectInputStream, ObjectOutputStream}
  4 | 
  5 | import bloomfilter.mutable.{UnsafeTable, UnsafeTable16Bit, UnsafeTable8Bit}
  6 | import org.scalacheck.Test.Parameters
  7 | import org.scalacheck.commands.Commands
  8 | import org.scalacheck.{Arbitrary, Gen, Prop, Properties}
  9 | import org.scalatest.{Matchers, PrivateMethodTester}
 10 | 
 11 | class UnsafeTableSpec extends Properties("UnsafeTableSpec") with Matchers with PrivateMethodTester {
 12 | 
 13 |   property("writeTag & readTag") = new UnsafeTableCommands().property()
 14 | 
 15 |   // TODO Sometimes fails when trying to add 5 elements to one bucket. It fails correctly. It shouldn't add 5 elemns. Scalacheck issue? Investigate
 16 |   property("insert & find") = new UnsafeTableInsertFindCommands().property()
 17 | 
 18 |   override def overrideParameters(p: Parameters): Parameters = {
 19 |     super.overrideParameters(p).withMinSuccessfulTests(100)
 20 |   }
 21 | 
 22 |   class UnsafeTableCommands extends Commands {
 23 |     type Sut = UnsafeTable8Bit
 24 | 
 25 |     case class State(numberOfBuckets: Long, addedItems: Long)
 26 | 
 27 |     override def canCreateNewSut(
 28 |         newState: State,
 29 |         initSuts: Traversable[State],
 30 |         runningSuts: Traversable[Sut]): Boolean =
 31 |       (initSuts.isEmpty && runningSuts.isEmpty) ||
 32 |           newState.addedItems >= newState.numberOfBuckets || newState.addedItems >= 4
 33 | 
 34 | 
 35 |     override def destroySut(sut: Sut): Unit =
 36 |       sut.dispose()
 37 | 
 38 |     override def genInitialState: Gen[State] =
 39 |       Gen.chooseNum[Long](1, /*Int.MaxValue * 2L*/ 1000).map(State(_, 0))
 40 | 
 41 |     override def newSut(state: State): Sut =
 42 |       new UnsafeTable8Bit(state.numberOfBuckets)
 43 | 
 44 |     def initialPreCondition(state: State): Boolean = true
 45 | 
 46 |     def genCommand(state: State): Gen[Command] =
 47 |       for {
 48 |         index <- Gen.choose[Long](0, state.numberOfBuckets - 1)
 49 |         tagIndex <- Gen.choose[Int](0, 3)
 50 |         tag <- Gen.choose[Byte](0, Byte.MaxValue)
 51 |       } yield commandSequence(WriteTag(index, tagIndex, tag), ReadTag(index, tagIndex, tag))
 52 | 
 53 |     case class WriteTag(index: Long, tagIndex: Int, tag: Byte) extends UnitCommand {
 54 |       def run(sut: Sut): Unit = sut.synchronized(sut.writeTag(index, tagIndex, tag))
 55 | 
 56 |       def nextState(state: State): State = state.copy(addedItems = state.addedItems + 1)
 57 | 
 58 |       def preCondition(state: State): Boolean = state.addedItems < state.numberOfBuckets || state.addedItems < 4
 59 | 
 60 |       def postCondition(state: State, success: Boolean): Prop = success
 61 |     }
 62 | 
 63 |     case class ReadTag(index: Long, tagIndex: Int, tag: Byte) extends SuccessCommand {
 64 |       type Result = Boolean
 65 | 
 66 |       def run(sut: Sut): Boolean = sut.synchronized(sut.readTag(index, tagIndex) == tag)
 67 | 
 68 |       def nextState(state: State): State = state
 69 | 
 70 |       def preCondition(state: State): Boolean = state.addedItems < state.numberOfBuckets || state.addedItems < 4
 71 | 
 72 |       def postCondition(state: State, result: Boolean): Prop = result
 73 |     }
 74 | 
 75 |   }
 76 | 
 77 |   class UnsafeTableInsertFindCommands extends Commands {
 78 |     type Sut = UnsafeTable8Bit
 79 | 
 80 |     case class State(numberOfBuckets: Long, addedItems: Long, bucketsPopulation: Map[Long, Int])
 81 | 
 82 |     override def canCreateNewSut(
 83 |         newState: State,
 84 |         initSuts: Traversable[State],
 85 |         runningSuts: Traversable[Sut]): Boolean =
 86 |       (initSuts.isEmpty && runningSuts.isEmpty) ||
 87 |           newState.addedItems >= newState.numberOfBuckets || newState.addedItems >= 4
 88 | 
 89 |     override def destroySut(sut: Sut): Unit =
 90 |       sut.dispose()
 91 | 
 92 |     override def genInitialState: Gen[State] =
 93 |       Gen.chooseNum[Long](1, /*Int.MaxValue * 2L*/ 1000).map(State(_, 0, Map.empty))
 94 | 
 95 |     override def newSut(state: State): Sut =
 96 |       new UnsafeTable8Bit(state.numberOfBuckets)
 97 | 
 98 |     def initialPreCondition(state: State): Boolean = true
 99 | 
100 |     def genCommand(state: State): Gen[Command] =
101 |       for {
102 |         index <- Gen.choose[Long](0, state.numberOfBuckets - 1)
103 |         tag <- Gen.choose[Byte](0, Byte.MaxValue)
104 |       } yield commandSequence(Insert(index, tag), Find(index, tag))
105 | 
106 |     case class Insert(index: Long, tag: Byte) extends UnitCommand {
107 |       def run(sut: Sut): Unit = sut.synchronized(sut.insert(index, tag))
108 | 
109 |       def nextState(state: State): State = {
110 |         val nextBucketsPopulation = state.bucketsPopulation.updated(index, prevBucketPopulation(state) + 1)
111 |         state.copy(addedItems = state.addedItems + 1, bucketsPopulation = nextBucketsPopulation)
112 |       }
113 | 
114 |       def prevBucketPopulation(state: State): Int = state.bucketsPopulation.getOrElse(index, 0)
115 | 
116 |       def preCondition(state: State): Boolean =
117 |         (prevBucketPopulation(state) < UnsafeTable8Bit.TagsPerBucket) &&
118 |             (state.addedItems < state.numberOfBuckets || state.addedItems < 4)
119 | 
120 |       def postCondition(state: State, success: Boolean): Prop = success
121 |     }
122 | 
123 |     case class Find(index: Long, tag: Byte) extends SuccessCommand {
124 |       type Result = Boolean
125 |       def run(sut: Sut): Boolean = sut.synchronized(sut.find(index, tag))
126 |       def nextState(state: State): State = state
127 |       def preCondition(state: State): Boolean = state.addedItems < state.numberOfBuckets || state.addedItems < 4
128 |       def postCondition(state: State, result: Boolean): Prop = result
129 |     }
130 | 
131 |   }
132 | 
133 |   type UnsafeTableEx = UnsafeTable {
134 |     def readTag(bucketIndex: Long, tagIndex: Int): Long
135 |   }
136 | 
137 |   def serializationProp(mkTable: Long => UnsafeTableEx): Prop = {
138 |     val gen = for {
139 |       numBuckets <- Gen.posNum[Int]
140 |       numPopulated <- Gen.choose(0, numBuckets)
141 |       m <- Gen.mapOfN(numPopulated, Gen.zip(Gen.choose(0, numBuckets - 1), Arbitrary.arbByte.arbitrary))
142 |     } yield {
143 |       numBuckets -> m
144 |     }
145 |     val ptrAccessor = PrivateMethod[Long]('ptr)
146 | 
147 |     def ptrOf(unsaffeTable: UnsafeTable) = unsaffeTable invokePrivate ptrAccessor()
148 | 
149 |     Prop.forAllNoShrink(gen) { case (numBuckets, tags) =>
150 |       val sut = mkTable(numBuckets)
151 |       try {
152 |         tags.foreach { case (idx, tag) => sut.insert(idx, tag) }
153 | 
154 |         val bos = new ByteArrayOutputStream
155 |         val oos = new ObjectOutputStream(bos)
156 |         oos.writeObject(sut)
157 |         oos.close()
158 |         val bis = new ByteArrayInputStream(bos.toByteArray)
159 |         val ois = new ObjectInputStream(bis)
160 |         val deserialized = ois.readObject()
161 |         ois.close()
162 | 
163 |         deserialized should not be null
164 |         deserialized should be(a[UnsafeTable])
165 |         deserialized should have('class (sut.getClass))
166 |         val sut2 = deserialized.asInstanceOf[UnsafeTableEx]
167 |         ptrOf(sut2) should not be 0
168 |         ptrOf(sut2) should not equal ptrOf(sut)
169 |         try {
170 |           for {
171 |             idx <- 0 until numBuckets
172 |             tagIdx <- 0 until UnsafeTable8Bit.TagsPerBucket
173 |           } {
174 |             sut.readTag(idx, tagIdx) shouldEqual sut2.readTag(idx, tagIdx)
175 |           }
176 |           Prop.passed
177 |         } finally sut2.dispose()
178 |       } finally sut.dispose()
179 |     }
180 |   }
181 | 
182 |   property("UnsafeTable8Bit supports java serialization") = serializationProp(new UnsafeTable8Bit(_))
183 |   property("UnsafeTable16Bit supports java serialization") = serializationProp(new UnsafeTable16Bit(_))
184 | 
185 | }
186 | 


--------------------------------------------------------------------------------
/bloom-filter/src/main/scala/bloomfilter/mutable/UnsafeTable.scala:
--------------------------------------------------------------------------------
  1 | package bloomfilter.mutable
  2 | 
  3 | import java.io._
  4 | 
  5 | import bloomfilter.util.Unsafe.unsafe
  6 | 
  7 | 
  8 | // TODO macro for various bits?
  9 | trait UnsafeTable {
 10 |   def insert(index: Long, tag: Long): Boolean
 11 |   def swapAny(index: Long, tag: Long): Long
 12 |   def remove(index: Long, tag: Long): Boolean
 13 |   def find(index: Long, tag: Long): Boolean
 14 |   def dispose(): Unit
 15 | 
 16 |   protected def readPtrFrom(in: InputStream, ptr: Long, numBytes: Long): Unit = {
 17 |     val din = new DataInputStream(in)
 18 |     var n = 0L
 19 |     while (n + 8 <= numBytes) {
 20 |       val l = din.readLong()
 21 |       unsafe.putLong(ptr + n, l)
 22 |       n += 8
 23 |     }
 24 |     while (n < numBytes) {
 25 |       val b = din.readByte()
 26 |       unsafe.putByte(ptr + n, b)
 27 |       n += 1
 28 |     }
 29 |   }
 30 |   protected def writePtrTo(out: OutputStream, ptr: Long, numBytes: Long): Unit = {
 31 |     val dout = new DataOutputStream(out)
 32 |     var n = 0L
 33 |     while (n + 8 <= numBytes) {
 34 |       val l = unsafe.getLong(ptr + n)
 35 |       dout.writeLong(l)
 36 |       n += 8
 37 |     }
 38 |     while (n < numBytes) {
 39 |       val b = unsafe.getByte(ptr + n)
 40 |       dout.writeByte(b.toInt)
 41 |       n += 1
 42 |     }
 43 |   }
 44 | 
 45 |   protected def toSerializedForm(bytesPerBucket: Int, numberOfBuckets: Long): AnyRef = new UnsafeTable.SerializedForm(bytesPerBucket, numberOfBuckets, this)
 46 | 
 47 |   def writeTo(out: OutputStream): Unit
 48 |   def readFrom(in: InputStream): Unit
 49 | }
 50 | 
 51 | object UnsafeTable {
 52 | 
 53 |   @SerialVersionUID(1L)
 54 |   private class SerializedForm(bytesPerBucket: Int, numberOfBuckets: Long, @transient var unsafeTable: UnsafeTable) extends Serializable {
 55 |     private def writeObject(oos: ObjectOutputStream): Unit = {
 56 |       oos.defaultWriteObject()
 57 |       unsafeTable.writeTo(oos)
 58 |     }
 59 | 
 60 |     private def readObject(ois: ObjectInputStream): Unit = {
 61 |       ois.defaultReadObject()
 62 |       unsafeTable = bytesPerBucket match {
 63 |         case 8 => new UnsafeTable8Bit(numberOfBuckets)
 64 |         case 16 => new UnsafeTable16Bit(numberOfBuckets)
 65 |       }
 66 |       unsafeTable.readFrom(ois)
 67 |     }
 68 | 
 69 |     @throws(classOf[java.io.ObjectStreamException])
 70 |     private def readResolve: AnyRef = unsafeTable
 71 |   }
 72 | 
 73 | }
 74 | 
 75 | @SerialVersionUID(1L)
 76 | class UnsafeTable8Bit(val numberOfBuckets: Long) extends UnsafeTable with Serializable {
 77 | 
 78 |   import UnsafeTable8Bit._
 79 | 
 80 |   private val ptr = unsafe.allocateMemory(bytesPerBucket * numberOfBuckets)
 81 |   unsafe.setMemory(ptr, bytesPerBucket * numberOfBuckets, 0.toByte)
 82 | 
 83 |   def readTag(bucketIndex: Long, tagIndex: Int): Long = {
 84 |     val p = ptr + bucketIndex * bytesPerBucket + tagIndex
 85 |     val tag = unsafe.getByte(p)
 86 |     tag & tagMask
 87 |   }
 88 | 
 89 |   def writeTag(i: Long, j: Int, t: Long): Unit = {
 90 |     val p = ptr + i * bytesPerBucket
 91 |     val tag = t & tagMask
 92 |     unsafe.putByte(p + j, tag.toByte)
 93 |   }
 94 | 
 95 |   def insert(index: Long, tag: Long): Boolean = {
 96 |     var tagIndex = 0
 97 |     while (tagIndex < TagsPerBucket) {
 98 |       if (readTag(index, tagIndex) == EmptyTag) {
 99 |         writeTag(index, tagIndex, tag)
100 |         return true
101 |       }
102 |       tagIndex += 1
103 |     }
104 | 
105 |     false
106 |   }
107 | 
108 |   def swapAny(index: Long, tag: Long): Long = {
109 |     var tagIndex = 0
110 |     while (tagIndex < TagsPerBucket) {
111 |       if (readTag(index, tagIndex) == EmptyTag) {
112 |         writeTag(index, tagIndex, tag)
113 |         return EmptyTag
114 |       }
115 |       tagIndex += 1
116 |     }
117 | 
118 |     random += 1
119 |     val r = random & (TagsPerBucket - 1)
120 |     val tagToSwap = readTag(index, r)
121 |     writeTag(index, r, tag)
122 |     tagToSwap
123 |   }
124 | 
125 |   def remove(index: Long, tag: Long): Boolean = {
126 |     var tagIndex = 0
127 |     while (tagIndex < TagsPerBucket) {
128 |       if (readTag(index, tagIndex) == tag) {
129 |         writeTag(index, tagIndex, EmptyTag)
130 |         return true
131 |       }
132 |       tagIndex += 1
133 |     }
134 |     false
135 |   }
136 | 
137 |   def find(index: Long, tag: Long): Boolean = {
138 |     var i = 0
139 |     while (i < TagsPerBucket) {
140 |       val tag1 = readTag(index, i)
141 |       if (tag1 == tag) {
142 |         return true
143 |       }
144 |       i += 1
145 |     }
146 |     false
147 |   }
148 | 
149 |   def writeTo(out: OutputStream): Unit = {
150 |     writePtrTo(out, ptr, bytesPerBucket * numberOfBuckets)
151 |   }
152 | 
153 |   def readFrom(in: InputStream): Unit = {
154 |     readPtrFrom(in, ptr, bytesPerBucket * numberOfBuckets)
155 |   }
156 | 
157 |   def dispose(): Unit = unsafe.freeMemory(ptr)
158 | 
159 |   @throws(classOf[java.io.ObjectStreamException])
160 |   private def writeReplace: AnyRef = toSerializedForm(8, numberOfBuckets)
161 | }
162 | 
163 | object UnsafeTable8Bit {
164 |   val EmptyTag = 0L
165 |   val BitsPerItem = 8
166 |   val TagsPerBucket = 4
167 |   private var random = 0
168 |   private val bytesPerBucket = (BitsPerItem * TagsPerBucket + 7) >> 3
169 |   private val tagMask = (1L << BitsPerItem) - 1
170 | }
171 | 
172 | 
173 | @SerialVersionUID(1)
174 | class UnsafeTable16Bit(val numberOfBuckets: Long) extends UnsafeTable with Serializable {
175 | 
176 |   import UnsafeTable16Bit._
177 | 
178 |   private val ptr = unsafe.allocateMemory(bytesPerBucket * numberOfBuckets)
179 |   unsafe.setMemory(ptr, bytesPerBucket * numberOfBuckets, 0.toByte)
180 | 
181 |   def readTag(bucketIndex: Long, tagIndex: Int): Long = {
182 |     val p = ptr + bucketIndex * bytesPerBucket + (tagIndex << 1)
183 |     val tag = unsafe.getShort(p)
184 |     tag & tagMask
185 |   }
186 | 
187 |   def writeTag(bucketIndex: Long, tagIndex: Int, tag: Long): Unit = {
188 |     val p = ptr + bucketIndex * bytesPerBucket + (tagIndex << 1)
189 |     unsafe.putShort(p, (tag & tagMask).toShort)
190 |   }
191 | 
192 |   def insert(index: Long, tag: Long): Boolean = {
193 |     var tagIndex = 0
194 |     while (tagIndex < TagsPerBucket) {
195 |       if (readTag(index, tagIndex) == EmptyTag) {
196 |         writeTag(index, tagIndex, tag)
197 |         return true
198 |       }
199 |       tagIndex += 1
200 |     }
201 | 
202 |     false
203 |   }
204 | 
205 |   def swapAny(index: Long, tag: Long): Long = {
206 |     var tagIndex = 0
207 |     while (tagIndex < TagsPerBucket) {
208 |       if (readTag(index, tagIndex) == EmptyTag) {
209 |         writeTag(index, tagIndex, tag)
210 |         return EmptyTag
211 |       }
212 |       tagIndex += 1
213 |     }
214 | 
215 |     random += 1
216 |     val r = random & (TagsPerBucket - 1)
217 |     val tagToSwap = readTag(index, r)
218 |     writeTag(index, r, tag)
219 |     tagToSwap
220 |   }
221 | 
222 |   def remove(index: Long, tag: Long): Boolean = {
223 |     var tagIndex = 0
224 |     while (tagIndex < TagsPerBucket) {
225 |       if (readTag(index, tagIndex) == tag) {
226 |         writeTag(index, tagIndex, EmptyTag)
227 |         return true
228 |       }
229 |       tagIndex += 1
230 |     }
231 |     false
232 |   }
233 | 
234 |   def find(index: Long, tag: Long): Boolean = {
235 |     var i = 0
236 |     while (i < TagsPerBucket) {
237 |       val tag1 = readTag(index, i)
238 |       if (tag1 == tag) {
239 |         return true
240 |       }
241 |       i += 1
242 |     }
243 |     false
244 |   }
245 | 
246 |   def writeTo(out: OutputStream): Unit = {
247 |     writePtrTo(out, ptr, bytesPerBucket * numberOfBuckets)
248 |   }
249 | 
250 |   def readFrom(in: InputStream): Unit = {
251 |     readPtrFrom(in, ptr, bytesPerBucket * numberOfBuckets)
252 |   }
253 | 
254 |   def dispose(): Unit = unsafe.freeMemory(ptr)
255 | 
256 |   @throws(classOf[java.io.ObjectStreamException])
257 |   private def writeReplace: AnyRef = toSerializedForm(16, numberOfBuckets)
258 | }
259 | 
260 | 
261 | object UnsafeTable16Bit {
262 |   val EmptyTag = 0L
263 |   val BitsPerItem = 16
264 |   val TagsPerBucket = 4
265 |   private var random = 0
266 |   private val bytesPerBucket = (BitsPerItem * TagsPerBucket + 7) >> 3
267 |   private val tagMask = (1L << BitsPerItem) - 1
268 | 
269 | }
270 | 


--------------------------------------------------------------------------------
/sandbox/src/main/java/sandbox/hashing/YonikMurmurHash3.java:
--------------------------------------------------------------------------------
  1 | package sandbox.hashing;
  2 | 
  3 | /**
  4 |  *  The MurmurHash3 algorithm was created by Austin Appleby and placed in the public domain.
  5 |  *  This java port was authored by Yonik Seeley and also placed into the public domain.
  6 |  *  The author hereby disclaims copyright to this source code.
  7 |  *  <p>
  8 |  *  This produces exactly the same hash values as the final C++
  9 |  *  version of MurmurHash3 and is thus suitable for producing the same hash values across
 10 |  *  platforms.
 11 |  *  <p>
 12 |  *  The 32 bit x86 version of this hash should be the fastest variant for relatively short keys like ids.
 13 |  *  murmurhash3_x64_128 is a good choice for longer strings or if you need more than 32 bits of hash.
 14 |  *  <p>
 15 |  *  Note - The x86 and x64 versions do _not_ produce the same results, as the
 16 |  *  algorithms are optimized for their respective platforms.
 17 |  *  <p>
 18 |  *  See http://github.com/yonik/java_util for future updates to this file.
 19 |  */
 20 | public final class YonikMurmurHash3 {
 21 | 
 22 |     /** 128 bits of state */
 23 |     public static final class LongPair {
 24 |         public long val1;
 25 |         public long val2;
 26 |     }
 27 | 
 28 |     public static final int fmix32(int h) {
 29 |         h ^= h >>> 16;
 30 |         h *= 0x85ebca6b;
 31 |         h ^= h >>> 13;
 32 |         h *= 0xc2b2ae35;
 33 |         h ^= h >>> 16;
 34 |         return h;
 35 |     }
 36 | 
 37 |     public static final long fmix64(long k) {
 38 |         k ^= k >>> 33;
 39 |         k *= 0xff51afd7ed558ccdL;
 40 |         k ^= k >>> 33;
 41 |         k *= 0xc4ceb9fe1a85ec53L;
 42 |         k ^= k >>> 33;
 43 |         return k;
 44 |     }
 45 | 
 46 |     /** Gets a long from a byte buffer in little endian byte order. */
 47 |     public static final long getLongLittleEndian(byte[] buf, int offset) {
 48 |         return     ((long)buf[offset+7]    << 56)   // no mask needed
 49 |                    | ((buf[offset+6] & 0xffL) << 48)
 50 |                    | ((buf[offset+5] & 0xffL) << 40)
 51 |                    | ((buf[offset+4] & 0xffL) << 32)
 52 |                    | ((buf[offset+3] & 0xffL) << 24)
 53 |                    | ((buf[offset+2] & 0xffL) << 16)
 54 |                    | ((buf[offset+1] & 0xffL) << 8)
 55 |                    | ((buf[offset  ] & 0xffL));        // no shift needed
 56 |     }
 57 | 
 58 | 
 59 |     /** Returns the MurmurHash3_x86_32 hash. */
 60 |     public static int murmurhash3_x86_32(byte[] data, int offset, int len, int seed) {
 61 | 
 62 |         final int c1 = 0xcc9e2d51;
 63 |         final int c2 = 0x1b873593;
 64 | 
 65 |         int h1 = seed;
 66 |         int roundedEnd = offset + (len & 0xfffffffc);  // round down to 4 byte block
 67 | 
 68 |         for (int i=offset; i<roundedEnd; i+=4) {
 69 |             // little endian load order
 70 |             int k1 = (data[i] & 0xff) | ((data[i+1] & 0xff) << 8) | ((data[i+2] & 0xff) << 16) | (data[i+3] << 24);
 71 |             k1 *= c1;
 72 |             k1 = (k1 << 15) | (k1 >>> 17);  // ROTL32(k1,15);
 73 |             k1 *= c2;
 74 | 
 75 |             h1 ^= k1;
 76 |             h1 = (h1 << 13) | (h1 >>> 19);  // ROTL32(h1,13);
 77 |             h1 = h1*5+0xe6546b64;
 78 |         }
 79 | 
 80 |         // tail
 81 |         int k1 = 0;
 82 | 
 83 |         switch(len & 0x03) {
 84 |         case 3:
 85 |             k1 = (data[roundedEnd + 2] & 0xff) << 16;
 86 |             // fallthrough
 87 |         case 2:
 88 |             k1 |= (data[roundedEnd + 1] & 0xff) << 8;
 89 |             // fallthrough
 90 |         case 1:
 91 |             k1 |= (data[roundedEnd] & 0xff);
 92 |             k1 *= c1;
 93 |             k1 = (k1 << 15) | (k1 >>> 17);  // ROTL32(k1,15);
 94 |             k1 *= c2;
 95 |             h1 ^= k1;
 96 |         }
 97 | 
 98 |         // finalization
 99 |         h1 ^= len;
100 | 
101 |         // fmix(h1);
102 |         h1 ^= h1 >>> 16;
103 |         h1 *= 0x85ebca6b;
104 |         h1 ^= h1 >>> 13;
105 |         h1 *= 0xc2b2ae35;
106 |         h1 ^= h1 >>> 16;
107 | 
108 |         return h1;
109 |     }
110 | 
111 | 
112 |     /** Returns the MurmurHash3_x86_32 hash of the UTF-8 bytes of the String without actually encoding
113 |      * the string to a temporary buffer.  This is more than 2x faster than hashing the result
114 |      * of String.getBytes().
115 |      */
116 |     public static int murmurhash3_x86_32(CharSequence data, int offset, int len, int seed) {
117 | 
118 |         final int c1 = 0xcc9e2d51;
119 |         final int c2 = 0x1b873593;
120 | 
121 |         int h1 = seed;
122 | 
123 |         int pos = offset;
124 |         int end = offset + len;
125 |         int k1 = 0;
126 |         int k2 = 0;
127 |         int shift = 0;
128 |         int bits = 0;
129 |         int nBytes = 0;   // length in UTF8 bytes
130 | 
131 | 
132 |         while (pos < end) {
133 |             int code = data.charAt(pos++);
134 |             if (code < 0x80) {
135 |                 k2 = code;
136 |                 bits = 8;
137 | 
138 |                 /***
139 |                  // optimized ascii implementation (currently slower!!! code size?)
140 |                  if (shift == 24) {
141 |                  k1 = k1 | (code << 24);
142 | 
143 |                  k1 *= c1;
144 |                  k1 = (k1 << 15) | (k1 >>> 17);  // ROTL32(k1,15);
145 |                  k1 *= c2;
146 | 
147 |                  h1 ^= k1;
148 |                  h1 = (h1 << 13) | (h1 >>> 19);  // ROTL32(h1,13);
149 |                  h1 = h1*5+0xe6546b64;
150 | 
151 |                  shift = 0;
152 |                  nBytes += 4;
153 |                  k1 = 0;
154 |                  } else {
155 |                  k1 |= code << shift;
156 |                  shift += 8;
157 |                  }
158 |                  continue;
159 |                  ***/
160 | 
161 |             }
162 |             else if (code < 0x800) {
163 |                 k2 = (0xC0 | (code >> 6))
164 |                      | ((0x80 | (code & 0x3F)) << 8);
165 |                 bits = 16;
166 |             }
167 |             else if (code < 0xD800 || code > 0xDFFF || pos>=end) {
168 |                 // we check for pos>=end to encode an unpaired surrogate as 3 bytes.
169 |                 k2 = (0xE0 | (code >> 12))
170 |                      | ((0x80 | ((code >> 6) & 0x3F)) << 8)
171 |                      | ((0x80 | (code & 0x3F)) << 16);
172 |                 bits = 24;
173 |             } else {
174 |                 // surrogate pair
175 |                 // int utf32 = pos < end ? (int) data.charAt(pos++) : 0;
176 |                 int utf32 = (int) data.charAt(pos++);
177 |                 utf32 = ((code - 0xD7C0) << 10) + (utf32 & 0x3FF);
178 |                 k2 = (0xff & (0xF0 | (utf32 >> 18)))
179 |                      | ((0x80 | ((utf32 >> 12) & 0x3F))) << 8
180 |                      | ((0x80 | ((utf32 >> 6) & 0x3F))) << 16
181 |                      |  (0x80 | (utf32 & 0x3F)) << 24;
182 |                 bits = 32;
183 |             }
184 | 
185 | 
186 |             k1 |= k2 << shift;
187 | 
188 |             // int used_bits = 32 - shift;  // how many bits of k2 were used in k1.
189 |             // int unused_bits = bits - used_bits; //  (bits-(32-shift)) == bits+shift-32  == bits-newshift
190 | 
191 |             shift += bits;
192 |             if (shift >= 32) {
193 |                 // mix after we have a complete word
194 | 
195 |                 k1 *= c1;
196 |                 k1 = (k1 << 15) | (k1 >>> 17);  // ROTL32(k1,15);
197 |                 k1 *= c2;
198 | 
199 |                 h1 ^= k1;
200 |                 h1 = (h1 << 13) | (h1 >>> 19);  // ROTL32(h1,13);
201 |                 h1 = h1*5+0xe6546b64;
202 | 
203 |                 shift -= 32;
204 |                 // unfortunately, java won't let you shift 32 bits off, so we need to check for 0
205 |                 if (shift != 0) {
206 |                     k1 = k2 >>> (bits-shift);   // bits used == bits - newshift
207 |                 } else {
208 |                     k1 = 0;
209 |                 }
210 |                 nBytes += 4;
211 |             }
212 | 
213 |         } // inner
214 | 
215 |         // handle tail
216 |         if (shift > 0) {
217 |             nBytes += shift >> 3;
218 |             k1 *= c1;
219 |             k1 = (k1 << 15) | (k1 >>> 17);  // ROTL32(k1,15);
220 |             k1 *= c2;
221 |             h1 ^= k1;
222 |         }
223 | 
224 |         // finalization
225 |         h1 ^= nBytes;
226 | 
227 |         // fmix(h1);
228 |         h1 ^= h1 >>> 16;
229 |         h1 *= 0x85ebca6b;
230 |         h1 ^= h1 >>> 13;
231 |         h1 *= 0xc2b2ae35;
232 |         h1 ^= h1 >>> 16;
233 | 
234 |         return h1;
235 |     }
236 | 
237 | 
238 |     /** Returns the MurmurHash3_x64_128 hash, placing the result in "out". */
239 |     public static void murmurhash3_x64_128(byte[] key, int offset, int len, int seed, LongPair out) {
240 |         // The original algorithm does have a 32 bit unsigned seed.
241 |         // We have to mask to match the behavior of the unsigned types and prevent sign extension.
242 |         long h1 = seed & 0x00000000FFFFFFFFL;
243 |         long h2 = seed & 0x00000000FFFFFFFFL;
244 | 
245 |         final long c1 = 0x87c37b91114253d5L;
246 |         final long c2 = 0x4cf5ad432745937fL;
247 | 
248 |         int roundedEnd = offset + (len & 0xFFFFFFF0);  // round down to 16 byte block
249 |         for (int i=offset; i<roundedEnd; i+=16) {
250 |             long k1 = getLongLittleEndian(key, i);
251 |             long k2 = getLongLittleEndian(key, i+8);
252 |             k1 *= c1; k1  = Long.rotateLeft(k1,31); k1 *= c2; h1 ^= k1;
253 |             h1 = Long.rotateLeft(h1,27); h1 += h2; h1 = h1*5+0x52dce729;
254 |             k2 *= c2; k2  = Long.rotateLeft(k2,33); k2 *= c1; h2 ^= k2;
255 |             h2 = Long.rotateLeft(h2,31); h2 += h1; h2 = h2*5+0x38495ab5;
256 |         }
257 | 
258 |         long k1 = 0;
259 |         long k2 = 0;
260 | 
261 |         switch (len & 15) {
262 |         case 15: k2  = (key[roundedEnd+14] & 0xffL) << 48;
263 |         case 14: k2 |= (key[roundedEnd+13] & 0xffL) << 40;
264 |         case 13: k2 |= (key[roundedEnd+12] & 0xffL) << 32;
265 |         case 12: k2 |= (key[roundedEnd+11] & 0xffL) << 24;
266 |         case 11: k2 |= (key[roundedEnd+10] & 0xffL) << 16;
267 |         case 10: k2 |= (key[roundedEnd+ 9] & 0xffL) << 8;
268 |         case  9: k2 |= (key[roundedEnd+ 8] & 0xffL);
269 |             k2 *= c2; k2  = Long.rotateLeft(k2, 33); k2 *= c1; h2 ^= k2;
270 |         case  8: k1  = ((long)key[roundedEnd+7]) << 56;
271 |         case  7: k1 |= (key[roundedEnd+6] & 0xffL) << 48;
272 |         case  6: k1 |= (key[roundedEnd+5] & 0xffL) << 40;
273 |         case  5: k1 |= (key[roundedEnd+4] & 0xffL) << 32;
274 |         case  4: k1 |= (key[roundedEnd+3] & 0xffL) << 24;
275 |         case  3: k1 |= (key[roundedEnd+2] & 0xffL) << 16;
276 |         case  2: k1 |= (key[roundedEnd+1] & 0xffL) << 8;
277 |         case  1: k1 |= (key[roundedEnd  ] & 0xffL);
278 |             k1 *= c1; k1  = Long.rotateLeft(k1,31); k1 *= c2; h1 ^= k1;
279 |         }
280 | 
281 |         //----------
282 |         // finalization
283 | 
284 |         h1 ^= len; h2 ^= len;
285 | 
286 |         h1 += h2;
287 |         h2 += h1;
288 | 
289 |         h1 = fmix64(h1);
290 |         h2 = fmix64(h2);
291 | 
292 |         h1 += h2;
293 |         h2 += h1;
294 | 
295 |         out.val1 = h1;
296 |         out.val2 = h2;
297 |     }
298 | 
299 | }


--------------------------------------------------------------------------------