├── .gitattributes ├── project ├── build.properties ├── BuildKeys.scala ├── Configs.scala ├── plugins.sbt ├── Testing.scala ├── Dependencies.scala ├── Publishing.scala └── Settings.scala ├── pubring.gpg.enc ├── secring.gpg.enc ├── examples └── src │ └── main │ ├── scala │ └── Main.scala │ └── java │ └── MainJava.java ├── benchmarks └── src │ └── main │ └── scala │ ├── bloomfilter │ ├── mutable │ │ ├── AddLongItemBenchmark.scala │ │ ├── _128bit │ │ │ ├── AddLongItemBenchmark.scala │ │ │ ├── StringItemBenchmark.scala │ │ │ └── ArrayByteItemBenchmark.scala │ │ ├── StringItemBenchmark.scala │ │ ├── ArrayByteItemBenchmark.scala │ │ └── StringItemCuckooBenchmark.scala │ ├── hashing │ │ ├── MurmurHash3GenericBenchmark.scala │ │ └── MurmurHash3Benchmark.scala │ └── UnsafeBitArrayBenchmark.scala │ └── alternatives │ ├── breeze │ ├── AddLongItemBenchmark.scala │ ├── StringItemBenchmark.scala │ └── ArrayByteItemBenchmark.scala │ ├── guava │ ├── AddLongItemBenchmark.scala │ ├── ArrayByteItemBenchmark.scala │ └── StringItemBenchmark.scala │ ├── algebird │ └── StringItemBenchmark.scala │ └── stream │ └── StringItemBenchmark.scala ├── bloom-filter └── src │ └── main │ └── scala │ └── bloomfilter │ ├── util │ └── Unsafe.scala │ ├── CanGetDataFrom.scala │ ├── CanGenerateHashFrom.scala │ ├── CanGenerate128HashFrom.scala │ ├── mutable │ ├── _128bit │ │ └── BloomFilter.scala │ ├── UnsafeBitArray.scala │ ├── CuckooFilter.scala │ ├── BloomFilter.scala │ └── UnsafeTable.scala │ └── hashing │ └── MurmurHash3Generic.scala ├── tests └── src │ ├── endToEnd │ └── scala │ │ └── endToEnd │ │ └── bloomfilter │ │ └── mutable │ │ ├── SampleUsageSpec.scala │ │ └── _128bit │ │ └── SampleUsageSpec.scala │ └── test │ └── scala │ └── tests │ └── bloomfilter │ ├── hashing │ └── MurmurHash3ScalaVsJavaSpec.scala │ ├── CanGetDataFromSpec.scala │ └── mutable │ ├── _128bit │ ├── BloomFilterSerializationSpec.scala │ └── BloomFilterSpec.scala │ ├── BloomFiltersSpec.scala │ ├── UnsafeBitArraysSpec.scala │ ├── 
BloomFilterSerializationSpec.scala │ ├── BloomFilterSpec.scala │ ├── UnsafeBitArraySpec.scala │ ├── CuckooFilterSpec.scala │ └── UnsafeTableSpec.scala ├── sandbox └── src │ └── main │ ├── scala │ └── sandbox │ │ ├── bloomfilter │ │ └── mutable │ │ │ ├── ChronicleBitArray.scala │ │ │ ├── BitArray.scala │ │ │ └── BloomFilter.scala │ │ └── hashing │ │ └── MurmurHash3.scala │ └── java │ └── sandbox │ └── hashing │ ├── AlgebirdMurmurHash128.scala │ ├── CassandraMurmurHash.java │ └── YonikMurmurHash3.java ├── LICENSE ├── .travis.yml ├── .gitignore ├── CHANGELOG.md ├── sandboxApp └── src │ └── main │ └── scala │ └── SandboxApp.scala └── README.md /.gitattributes: -------------------------------------------------------------------------------- 1 | *.enc binary -------------------------------------------------------------------------------- /project/build.properties: -------------------------------------------------------------------------------- 1 | sbt.version = 1.3.10 -------------------------------------------------------------------------------- /pubring.gpg.enc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alexandrnikitin/bloom-filter-scala/HEAD/pubring.gpg.enc -------------------------------------------------------------------------------- /secring.gpg.enc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alexandrnikitin/bloom-filter-scala/HEAD/secring.gpg.enc -------------------------------------------------------------------------------- /project/BuildKeys.scala: -------------------------------------------------------------------------------- 1 | import sbt.TaskKey 2 | 3 | object BuildKeys { 4 | lazy val testAll = TaskKey[Unit]("test-all") 5 | } 6 | -------------------------------------------------------------------------------- /project/Configs.scala: -------------------------------------------------------------------------------- 
import sbt._

object Configs {
  // Dedicated sbt configuration for the end-to-end suite; it inherits the
  // Runtime classpath so e2e tests run against the packaged artifacts.
  val EndToEndTest = config("endToEnd") extend Runtime
  val all = EndToEndTest
}
--------------------------------------------------------------------------------
/examples/src/main/scala/Main.scala:
--------------------------------------------------------------------------------

import bloomfilter.mutable.BloomFilter

// Minimal usage example: build a filter, insert an item, query it, free memory.
object Main extends App {
  val expectedElements = 1000
  val falsePositiveRate: Double = 0.1

  val filter = BloomFilter[String](expectedElements, falsePositiveRate)
  filter.add("some string")
  filter.mightContain("some string")
  // The filter is backed by off-heap memory, so it must be disposed explicitly.
  filter.dispose()
}
--------------------------------------------------------------------------------
/project/plugins.sbt:
--------------------------------------------------------------------------------
logLevel := Level.Warn

// Coverage reporting
addSbtPlugin("org.scoverage" % "sbt-scoverage" % "1.6.1")

// JMH benchmark runner
addSbtPlugin("pl.project13.scala" % "sbt-jmh" % "0.3.7")

// Publishing to Sonatype with GPG signing
addSbtPlugin("org.xerial.sbt" % "sbt-sonatype" % "3.9.2")

addSbtPlugin("io.crashbox" % "sbt-gpg" % "0.2.1")

addSbtPlugin("com.typesafe.sbt" % "sbt-native-packager" % "1.7.0")

// Version derivation from git tags
addSbtPlugin("com.dwijnand" % "sbt-dynver" % "4.0.0")
--------------------------------------------------------------------------------
/benchmarks/src/main/scala/bloomfilter/mutable/AddLongItemBenchmark.scala:
--------------------------------------------------------------------------------
package bloomfilter.mutable

import org.openjdk.jmh.annotations.{Benchmark, Scope, State}

/** Measures the cost of inserting a constant Long into this library's filter. */
@State(Scope.Benchmark)
class AddLongItemBenchmark {

  private val itemsExpected = 1000000L
  private val falsePositiveRate = 0.01

  private val bf = BloomFilter[Long](itemsExpected, falsePositiveRate)

  @Benchmark
  def my() = bf.add(1L)

}
--------------------------------------------------------------------------------
/benchmarks/src/main/scala/bloomfilter/mutable/_128bit/AddLongItemBenchmark.scala: -------------------------------------------------------------------------------- 1 | package bloomfilter.mutable._128bit 2 | 3 | import org.openjdk.jmh.annotations.{Benchmark, Scope, State} 4 | 5 | @State(Scope.Benchmark) 6 | class AddLongItemBenchmark { 7 | 8 | private val itemsExpected = 1000000L 9 | private val falsePositiveRate = 0.01 10 | 11 | private val bf = BloomFilter[Long](itemsExpected, falsePositiveRate) 12 | 13 | @Benchmark 14 | def my() = { 15 | bf.add(1L) 16 | } 17 | 18 | 19 | } -------------------------------------------------------------------------------- /benchmarks/src/main/scala/alternatives/breeze/AddLongItemBenchmark.scala: -------------------------------------------------------------------------------- 1 | package alternatives.breeze 2 | 3 | import breeze.util.BloomFilter 4 | import org.openjdk.jmh.annotations.{Benchmark, Scope, State} 5 | 6 | @State(Scope.Benchmark) 7 | class AddLongItemBenchmark { 8 | 9 | private val itemsExpected = 1000000L 10 | private val falsePositiveRate = 0.01 11 | 12 | private val bf = BloomFilter.optimallySized[Long](itemsExpected.toDouble, falsePositiveRate) 13 | 14 | @Benchmark 15 | def breeze() = { 16 | bf.+=(1L) 17 | } 18 | 19 | 20 | } -------------------------------------------------------------------------------- /benchmarks/src/main/scala/alternatives/guava/AddLongItemBenchmark.scala: -------------------------------------------------------------------------------- 1 | package alternatives.guava 2 | 3 | import com.google.common.hash.{BloomFilter, Funnels} 4 | import org.openjdk.jmh.annotations.{Benchmark, Scope, State} 5 | 6 | @State(Scope.Benchmark) 7 | class AddLongItemBenchmark { 8 | 9 | private val itemsExpected = 1000000L 10 | private val falsePositiveRate = 0.01 11 | 12 | private val bf = BloomFilter.create[java.lang.Long](Funnels.longFunnel(), itemsExpected, falsePositiveRate) 13 | 14 | @Benchmark 15 | def guava() = { 16 
| bf.put(1L) 17 | } 18 | 19 | 20 | } -------------------------------------------------------------------------------- /benchmarks/src/main/scala/bloomfilter/hashing/MurmurHash3GenericBenchmark.scala: -------------------------------------------------------------------------------- 1 | package bloomfilter.hashing 2 | 3 | import org.openjdk.jmh.annotations.{Benchmark, Scope, State} 4 | import sandbox.hashing.MurmurHash3 5 | 6 | @State(Scope.Benchmark) 7 | class MurmurHash3GenericBenchmark { 8 | 9 | val key = Range(0, 64).map(_.toByte).toArray 10 | 11 | @Benchmark 12 | def scalaVersion() = { 13 | MurmurHash3.murmurhash3_x64_128(key, 0, key.length, 0) 14 | } 15 | 16 | @Benchmark 17 | def genericVersion() = { 18 | MurmurHash3Generic.murmurhash3_x64_128(key, 0, key.length, 0) 19 | } 20 | } 21 | -------------------------------------------------------------------------------- /examples/src/main/java/MainJava.java: -------------------------------------------------------------------------------- 1 | import bloomfilter.CanGenerateHashFrom; 2 | import bloomfilter.mutable.BloomFilter; 3 | 4 | public class MainJava { 5 | public static void main(String[] args) { 6 | long expectedElements = 10000000; 7 | double falsePositiveRate = 0.1; 8 | BloomFilter bf = BloomFilter.apply( 9 | expectedElements, 10 | falsePositiveRate, 11 | CanGenerateHashFrom.CanGenerateHashFromByteArray$.MODULE$); 12 | 13 | byte[] element = new byte[100]; 14 | bf.add(element); 15 | bf.mightContain(element); 16 | bf.dispose(); 17 | } 18 | } 19 | -------------------------------------------------------------------------------- /bloom-filter/src/main/scala/bloomfilter/util/Unsafe.scala: -------------------------------------------------------------------------------- 1 | package bloomfilter.util 2 | 3 | import sun.misc.{Unsafe => JUnsafe} 4 | 5 | import scala.language.postfixOps 6 | import scala.util.Try 7 | 8 | object Unsafe { 9 | val unsafe: JUnsafe = Try { 10 | classOf[JUnsafe] 11 | .getDeclaredFields 12 | .find { 
field => 13 | field.getType == classOf[JUnsafe] 14 | } 15 | .map { field => 16 | field.setAccessible(true) 17 | field.get(null).asInstanceOf[JUnsafe] 18 | } 19 | .getOrElse(throw new IllegalStateException("Can't find instance of sun.misc.Unsafe")) 20 | } recover { 21 | case th: Throwable => throw new ExceptionInInitializerError(th) 22 | } get 23 | 24 | } 25 | -------------------------------------------------------------------------------- /tests/src/endToEnd/scala/endToEnd/bloomfilter/mutable/SampleUsageSpec.scala: -------------------------------------------------------------------------------- 1 | package endToEnd.bloomfilter.mutable 2 | 3 | import bloomfilter.mutable.BloomFilter 4 | import org.scalatest.{FreeSpec, Matchers} 5 | 6 | class SampleUsageSpec extends FreeSpec with Matchers { 7 | "Create, put and check " in { 8 | val bloomFilter = BloomFilter[String](1000, 0.01) 9 | 10 | bloomFilter.add("") 11 | bloomFilter.add("Hello!") 12 | bloomFilter.add("8f16c986824e40e7885a032ddd29a7d3") 13 | 14 | bloomFilter.mightContain("") shouldBe true 15 | bloomFilter.mightContain("Hello!") shouldBe true 16 | bloomFilter.mightContain("8f16c986824e40e7885a032ddd29a7d3") shouldBe true 17 | 18 | bloomFilter.dispose() 19 | } 20 | } 21 | -------------------------------------------------------------------------------- /tests/src/endToEnd/scala/endToEnd/bloomfilter/mutable/_128bit/SampleUsageSpec.scala: -------------------------------------------------------------------------------- 1 | package endToEnd.bloomfilter.mutable._128bit 2 | 3 | import bloomfilter.mutable._128bit.BloomFilter 4 | import org.scalatest.{FreeSpec, Matchers} 5 | 6 | class SampleUsageSpec extends FreeSpec with Matchers { 7 | "Create, put and check " in { 8 | val bloomFilter = BloomFilter[String](1000, 0.01) 9 | 10 | bloomFilter.add("") 11 | bloomFilter.add("Hello!") 12 | bloomFilter.add("8f16c986824e40e7885a032ddd29a7d3") 13 | 14 | bloomFilter.mightContain("") shouldBe true 15 | 
bloomFilter.mightContain("Hello!") shouldBe true
    bloomFilter.mightContain("8f16c986824e40e7885a032ddd29a7d3") shouldBe true

    bloomFilter.dispose()
  }
}
--------------------------------------------------------------------------------
/tests/src/test/scala/tests/bloomfilter/hashing/MurmurHash3ScalaVsJavaSpec.scala:
--------------------------------------------------------------------------------
package tests.bloomfilter.hashing

import bloomfilter.hashing.MurmurHash3Generic
import org.scalacheck.Prop.forAll
import org.scalacheck.Properties
import sandbox.hashing.YonikMurmurHash3
import sandbox.hashing.YonikMurmurHash3.LongPair

object MurmurHash3ScalaVsJavaSpec extends Properties("MurmurHash3ScalaVsJavaSpec") {

  // Property: the Scala generic implementation must agree with Yonik's
  // reference Java implementation for every generated byte-array key.
  property("murmurhash3_x64_128") = forAll { (key: Array[Byte]) =>
    val tuple = MurmurHash3Generic.murmurhash3_x64_128(key, 0, key.length, 0)
    val pair = new LongPair
    YonikMurmurHash3.murmurhash3_x64_128(key, 0, key.length, 0, pair)
    pair.val1 == tuple._1 && pair.val2 == tuple._2
  }

}
--------------------------------------------------------------------------------
/sandbox/src/main/scala/sandbox/bloomfilter/mutable/ChronicleBitArray.scala:
--------------------------------------------------------------------------------
package sandbox.bloomfilter.mutable

import net.openhft.chronicle.bytes.NativeBytesStore

import bloomfilter.util.Unsafe.unsafe

/** Off-heap bit array backed by Chronicle Bytes (sandbox/experimental). */
class ChronicleBitArray(numberOfBits: Long) {
  // Number of 64-bit WORDS needed to hold `numberOfBits` bits.
  private val indices = math.ceil(numberOfBits.toDouble / 64).toLong
  // BUG FIX: allocateMemory takes a size in BYTES. The previous code allocated
  // only `indices` bytes (one byte per 64-bit word), an 8x under-allocation
  // that made every read/write past the first eighth of the array touch
  // unowned memory. Same fix applies to the NativeBytesStore capacity.
  private val ptr = unsafe.allocateMemory(indices * 8L)
  private val bytes = new NativeBytesStore(ptr, indices * 8L)

  def get(index: Long): Boolean = {
    // BUG FIX: readLong takes a BYTE offset; the word index (index >>> 6)
    // must be scaled by 8 bytes per word. The shift count of `1L << index`
    // relies on Java's implicit masking of the count to the low 6 bits.
    (bytes.readLong((index >>> 6) * 8L) & (1L << index)) != 0
  }

  def set(index: Long): Unit = {
    // BUG FIX: byte offset, as in get() above.
    val offset = (index >>> 6) * 8L
    val long = bytes.readLong(offset)
    val _ = bytes.writeLong(offset, long | (1L << index))
  }

  // Not implemented; kept for interface parity with the other bit arrays.
  def getBitCount: Long = 0
}
--------------------------------------------------------------------------------
/benchmarks/src/main/scala/bloomfilter/mutable/StringItemBenchmark.scala:
--------------------------------------------------------------------------------
package bloomfilter.mutable

import org.openjdk.jmh.annotations.{Benchmark, Param, Scope, Setup, State}

import scala.util.Random

@State(Scope.Benchmark)
class StringItemBenchmark {

  private val itemsExpected = 100000000L
  private val falsePositiveRate = 0.01
  private val random = new Random()

  private val bf = BloomFilter[String](itemsExpected, falsePositiveRate)

  @Param(Array("1024"))
  var length: Int = _

  // BUG FIX: `length` is a JMH @Param, which JMH injects AFTER the instance is
  // constructed. Initializing `item` in the class body (i.e. the constructor)
  // therefore used length == 0, so the benchmark measured empty strings.
  // Build the item in a @Setup method instead, where @Param values are valid.
  private var item: String = _

  @Setup
  def setup(): Unit = {
    item = random.nextString(length)
    bf.add(item)
  }

  @Benchmark
  def myPut(): Unit = {
    bf.add(item)
  }

  @Benchmark
  def myGet(): Unit = {
    bf.mightContain(item)
  }

}
--------------------------------------------------------------------------------
/benchmarks/src/main/scala/bloomfilter/mutable/_128bit/StringItemBenchmark.scala:
--------------------------------------------------------------------------------
package bloomfilter.mutable._128bit

import org.openjdk.jmh.annotations.{Benchmark, Param, Scope, Setup, State}

import scala.util.Random

@State(Scope.Benchmark)
class StringItemBenchmark {

  private val itemsExpected = 100000000L
  private val falsePositiveRate = 0.01
  private val random = new Random()

  private val bf = BloomFilter[String](itemsExpected, falsePositiveRate)

  @Param(Array("1024"))
  var length: Int = _

  // BUG FIX: see the 64-bit StringItemBenchmark — @Param values are injected
  // after construction, so the item must be created in @Setup, not in the
  // constructor where `length` is still 0.
  private var item: String = _

  @Setup
  def setup(): Unit = {
    item = random.nextString(length)
    bf.add(item)
  }

  @Benchmark
  def myPut(): Unit = {
    bf.add(item)
  }

  @Benchmark
  def myGet(): Unit = {
    bf.mightContain(item)
  }

}
--------------------------------------------------------------------------------
/benchmarks/src/main/scala/bloomfilter/mutable/ArrayByteItemBenchmark.scala:
--------------------------------------------------------------------------------
package bloomfilter.mutable

import org.openjdk.jmh.annotations.{Benchmark, Param, Scope, Setup, State}

import scala.util.Random

@State(Scope.Benchmark)
class ArrayByteItemBenchmark {

  private val itemsExpected = 1000000L
  private val falsePositiveRate = 0.01
  private val random = new Random()

  private val bf = BloomFilter[Array[Byte]](itemsExpected, falsePositiveRate)

  @Param(Array("1024"))
  var length: Int = _

  // BUG FIX: `length` is a JMH @Param and is injected only AFTER the instance
  // is constructed. Allocating `item` in the class body used length == 0, so
  // the benchmark measured zero-length arrays. Build the item in @Setup,
  // where @Param values are guaranteed to be populated.
  private var item: Array[Byte] = _

  @Setup
  def setup(): Unit = {
    item = new Array[Byte](length)
    random.nextBytes(item)
    bf.add(item)
  }

  @Benchmark
  def myPut(): Unit = {
    bf.add(item)
  }

  @Benchmark
  def myGet(): Unit = {
    bf.mightContain(item)
  }

}
--------------------------------------------------------------------------------
/benchmarks/src/main/scala/bloomfilter/mutable/_128bit/ArrayByteItemBenchmark.scala:
--------------------------------------------------------------------------------
package bloomfilter.mutable._128bit

import org.openjdk.jmh.annotations.{Benchmark, Param, Scope, Setup, State}

import scala.util.Random

@State(Scope.Benchmark)
class ArrayByteItemBenchmark {

  private val itemsExpected = 1000000L
  private val falsePositiveRate = 0.01
  private val random = new Random()

  private val bf = BloomFilter[Array[Byte]](itemsExpected, falsePositiveRate)

  @Param(Array("1024"))
  var length: Int = _

  // BUG FIX: same @Param-injection-order issue as the 64-bit variant — the
  // item must be created in @Setup, not in the constructor where `length`
  // is still 0.
  private var item: Array[Byte] = _

  @Setup
  def setup(): Unit = {
    item = new Array[Byte](length)
    random.nextBytes(item)
    bf.add(item)
  }

  @Benchmark
  def myPut(): Unit = {
    bf.add(item)
  }

  @Benchmark
  def myGet(): Unit = {
    bf.mightContain(item)
  }

}
-------------------------------------------------------------------------------- /benchmarks/src/main/scala/alternatives/breeze/StringItemBenchmark.scala: -------------------------------------------------------------------------------- 1 | package alternatives.breeze 2 | 3 | import breeze.util.BloomFilter 4 | import org.openjdk.jmh.annotations.{Benchmark, Param, Scope, State} 5 | 6 | import scala.util.Random 7 | 8 | @State(Scope.Benchmark) 9 | class StringItemBenchmark { 10 | 11 | private val itemsExpected = 100000000L 12 | private val falsePositiveRate = 0.01 13 | private val random = new Random() 14 | 15 | private val bf = BloomFilter.optimallySized[String](itemsExpected.toDouble, falsePositiveRate) 16 | 17 | @Param(Array("1024")) 18 | var length: Int = _ 19 | 20 | private val item = random.nextString(length) 21 | bf.+=(item) 22 | 23 | @Benchmark 24 | def breezePut(): Unit = { 25 | bf.+=(item) 26 | } 27 | 28 | @Benchmark 29 | def breezeGet(): Unit = { 30 | bf.contains(item) 31 | } 32 | 33 | } 34 | -------------------------------------------------------------------------------- /benchmarks/src/main/scala/alternatives/algebird/StringItemBenchmark.scala: -------------------------------------------------------------------------------- 1 | package alternatives.algebird 2 | 3 | import com.twitter.algebird.BloomFilter 4 | import org.openjdk.jmh.annotations.{Benchmark, Param, Scope, State} 5 | 6 | import scala.util.Random 7 | 8 | @State(Scope.Benchmark) 9 | class StringItemBenchmark { 10 | 11 | private val itemsExpected = 100000000L 12 | private val falsePositiveRate = 0.01 13 | private val random = new Random() 14 | 15 | private var bf = BloomFilter(itemsExpected.toInt, falsePositiveRate, 0).create("") 16 | 17 | @Param(Array("1024")) 18 | var length: Int = _ 19 | 20 | private val item = random.nextString(length) 21 | bf = bf.+(item) 22 | 23 | @Benchmark 24 | def algebirdPut(): Unit = { 25 | bf.+(item) 26 | } 27 | 28 | @Benchmark 29 | def algebirdGet(): Unit = { 30 | 
bf.contains(item) 31 | } 32 | 33 | } 34 | -------------------------------------------------------------------------------- /benchmarks/src/main/scala/alternatives/breeze/ArrayByteItemBenchmark.scala: -------------------------------------------------------------------------------- 1 | package alternatives.breeze 2 | 3 | import breeze.util.BloomFilter 4 | import org.openjdk.jmh.annotations.{Benchmark, Param, Scope, State} 5 | 6 | import scala.util.Random 7 | 8 | @State(Scope.Benchmark) 9 | class ArrayByteItemBenchmark { 10 | 11 | private val itemsExpected = 1000000L 12 | private val falsePositiveRate = 0.01 13 | private val random = new Random() 14 | 15 | private val bf = BloomFilter.optimallySized[Array[Byte]](itemsExpected.toDouble, falsePositiveRate) 16 | 17 | @Param(Array("1024")) 18 | var length: Int = _ 19 | 20 | private val item = new Array[Byte](length) 21 | random.nextBytes(item) 22 | bf.+=(item) 23 | 24 | @Benchmark 25 | def breezePut(): Unit = { 26 | bf.+=(item) 27 | } 28 | 29 | @Benchmark 30 | def breezeGet(): Unit = { 31 | bf.contains(item) 32 | } 33 | 34 | } 35 | -------------------------------------------------------------------------------- /benchmarks/src/main/scala/alternatives/guava/ArrayByteItemBenchmark.scala: -------------------------------------------------------------------------------- 1 | package alternatives.guava 2 | 3 | import com.google.common.hash.{BloomFilter, Funnels} 4 | import org.openjdk.jmh.annotations.{Benchmark, Param, Scope, State} 5 | 6 | import scala.util.Random 7 | 8 | @State(Scope.Benchmark) 9 | class ArrayByteItemBenchmark { 10 | 11 | private val itemsExpected = 1000000L 12 | private val falsePositiveRate = 0.01 13 | private val random = new Random() 14 | 15 | private val bf = BloomFilter.create[Array[Byte]](Funnels.byteArrayFunnel(), itemsExpected, falsePositiveRate) 16 | 17 | @Param(Array("1024")) 18 | var length: Int = _ 19 | 20 | private val item = new Array[Byte](length) 21 | random.nextBytes(item) 22 | bf.put(item) 
23 | 24 | @Benchmark 25 | def guavaPut(): Boolean = { 26 | bf.put(item) 27 | } 28 | 29 | @Benchmark 30 | def guava(): Boolean = { 31 | bf.mightContain(item) 32 | } 33 | 34 | } 35 | -------------------------------------------------------------------------------- /benchmarks/src/main/scala/alternatives/stream/StringItemBenchmark.scala: -------------------------------------------------------------------------------- 1 | package alternatives.stream 2 | 3 | import com.clearspring.analytics.stream.membership.BloomFilter 4 | import org.openjdk.jmh.annotations.{Benchmark, Param, Scope, State} 5 | import org.openjdk.jmh.infra.Blackhole 6 | 7 | import scala.util.Random 8 | 9 | @State(Scope.Benchmark) 10 | class StringItemBenchmark { 11 | 12 | private val itemsExpected = 100000000L 13 | private val falsePositiveRate = 0.01 14 | private val random = new Random() 15 | 16 | private val bf = new BloomFilter(itemsExpected.toInt, falsePositiveRate) 17 | 18 | @Param(Array("1024")) 19 | var length: Int = _ 20 | 21 | private val item = random.nextString(length) 22 | bf.add(item) 23 | 24 | @Benchmark 25 | def streamPut(): Unit = { 26 | bf.add(item) 27 | } 28 | 29 | @Benchmark 30 | def streamGet(bh: Blackhole): Unit = { 31 | bh.consume(bf.isPresent(item)) 32 | } 33 | 34 | } 35 | -------------------------------------------------------------------------------- /benchmarks/src/main/scala/alternatives/guava/StringItemBenchmark.scala: -------------------------------------------------------------------------------- 1 | package alternatives.guava 2 | 3 | import java.nio.charset.Charset 4 | 5 | import com.google.common.hash.{BloomFilter, Funnels} 6 | import org.openjdk.jmh.annotations.{Benchmark, Param, Scope, State} 7 | 8 | import scala.util.Random 9 | 10 | @State(Scope.Benchmark) 11 | class StringItemBenchmark { 12 | 13 | private val itemsExpected = 100000000L 14 | private val falsePositiveRate = 0.01 15 | private val random = new Random() 16 | 17 | private val bf = 
BloomFilter.create[String](Funnels.stringFunnel(Charset.forName("UTF-8")), itemsExpected, falsePositiveRate) 18 | 19 | @Param(Array("1024")) 20 | var length: Int = _ 21 | 22 | private val item = random.nextString(length) 23 | bf.put(item) 24 | 25 | @Benchmark 26 | def guavaPut(): Unit = { 27 | bf.put(item) 28 | } 29 | 30 | @Benchmark 31 | def guavaGet(): Unit = { 32 | bf.mightContain(item) 33 | } 34 | 35 | } 36 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | The MIT License (MIT) 2 | 3 | Copyright (c) 2016 Alexandr Nikitin 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /project/Testing.scala: -------------------------------------------------------------------------------- 1 | import sbt.Keys._ 2 | import sbt._ 3 | import BuildKeys._ 4 | import scoverage.ScoverageKeys._ 5 | 6 | object Testing { 7 | 8 | import Configs._ 9 | 10 | private lazy val testSettings = Seq( 11 | Test / fork := false, 12 | Test / parallelExecution := false, 13 | Test / testOptions += Tests.Argument(TestFrameworks.ScalaCheck, "-verbosity", "2") 14 | ) 15 | 16 | private lazy val e2eSettings = inConfig(EndToEndTest)(Defaults.testSettings) ++ Seq( 17 | EndToEndTest / fork := false, 18 | EndToEndTest / parallelExecution := false, 19 | EndToEndTest / scalaSource := baseDirectory.value / "src/endToEnd/scala" 20 | ) 21 | 22 | private lazy val testAllSettings = Seq( 23 | testAll :=(), 24 | testAll := testAll.dependsOn(EndToEndTest / test), 25 | testAll := testAll.dependsOn(Test / test) 26 | ) 27 | 28 | private lazy val scoverageSettings = Seq( 29 | coverageMinimum := 60, 30 | coverageFailOnMinimum := false, 31 | coverageHighlighting := true, 32 | coverageExcludedPackages := ".*Benchmark" 33 | ) 34 | 35 | lazy val settings = testSettings ++ e2eSettings ++ testAllSettings ++ scoverageSettings 36 | } 37 | -------------------------------------------------------------------------------- /tests/src/test/scala/tests/bloomfilter/CanGetDataFromSpec.scala: -------------------------------------------------------------------------------- 1 | package tests.bloomfilter 2 | 3 | import bloomfilter.CanGetDataFrom.CanGetDataFromArrayChar 4 | import org.scalatest.{FreeSpec, Matchers} 5 | 6 | class CanGetDataFromSpec extends FreeSpec with Matchers { 7 | "CanGetDataFromArrayChar" in { 8 | CanGetDataFromArrayChar.getByte(Array[Char]('a'), 0) shouldEqual 97.toByte 9 | CanGetDataFromArrayChar.getByte(Array[Char]('a'), 1) shouldEqual 0.toByte 10 | 11 | CanGetDataFromArrayChar.getByte(Array[Char]('a', 'b'), 
0) shouldEqual 97.toByte 12 | CanGetDataFromArrayChar.getByte(Array[Char]('a', 'b'), 1) shouldEqual 0.toByte 13 | CanGetDataFromArrayChar.getByte(Array[Char]('a', 'b'), 2) shouldEqual 98.toByte 14 | CanGetDataFromArrayChar.getByte(Array[Char]('a', 'b'), 3) shouldEqual 0.toByte 15 | 16 | CanGetDataFromArrayChar.getLong(Array[Char]('a', 'b', 'c', 'd'), 0) shouldEqual 17 | (0.toLong << 56) | 18 | (('d'.toByte & 0xffL) << 48) | 19 | ((0 & 0xffL) << 40) | 20 | (('c'.toByte & 0xffL) << 32) | 21 | ((0 & 0xffL) << 24) | 22 | (('b' & 0xffL) << 16) | 23 | ((0 & 0xffL) << 8) | 24 | 'a' & 0xffL 25 | 26 | } 27 | } 28 | -------------------------------------------------------------------------------- /sandbox/src/main/scala/sandbox/bloomfilter/mutable/BitArray.scala: -------------------------------------------------------------------------------- 1 | package sandbox.bloomfilter.mutable 2 | 3 | class BitArray(val numberOfBits: Long) { 4 | // TODO check cast 5 | private val bits = new Array[Long](math.ceil(numberOfBits.toDouble / 64).toInt) 6 | 7 | def get(index: Long): Boolean = { 8 | (bits((index >>> 6).toInt) & (1L << index)) != 0 9 | } 10 | 11 | def set(index: Long): Unit = { 12 | // TODO improve 13 | if (!get(index)) { 14 | bits((index >>> 6).toInt) |= (1L << index) 15 | } 16 | } 17 | 18 | def combine(that: BitArray, combiner: (Byte, Byte) => Byte): BitArray = { 19 | val result = new BitArray(this.numberOfBits) 20 | result 21 | } 22 | 23 | def |(that: BitArray): BitArray = { 24 | require(this.numberOfBits == that.numberOfBits, "Bitwise OR works only on arrays with the same number of bits") 25 | 26 | combine(that, (b1: Byte, b2: Byte) => (b1 | b2).toByte) 27 | } 28 | 29 | def &(that: BitArray): BitArray = { 30 | require(this.numberOfBits == that.numberOfBits, "Bitwise AND works only on arrays with the same number of bits") 31 | 32 | combine(that, (b1: Byte, b2: Byte) => (b1 & b2).toByte) 33 | } 34 | 35 | def getBitCount: Long = { 36 | throw new NotImplementedError("Not 
implemented yet") 37 | } 38 | } 39 | -------------------------------------------------------------------------------- /benchmarks/src/main/scala/bloomfilter/UnsafeBitArrayBenchmark.scala: -------------------------------------------------------------------------------- 1 | package bloomfilter 2 | 3 | import java.util.BitSet 4 | 5 | import bloomfilter.mutable.UnsafeBitArray 6 | import org.openjdk.jmh.annotations.{Benchmark, Scope, State} 7 | import sandbox.bloomfilter.mutable.ChronicleBitArray 8 | 9 | @State(Scope.Benchmark) 10 | class UnsafeBitArrayBenchmark { 11 | 12 | private val numberOfBits = Int.MaxValue 13 | 14 | val unsafeBits = new UnsafeBitArray(numberOfBits.toLong) 15 | val bitsSet = new BitSet(numberOfBits) 16 | val chronicle = new ChronicleBitArray(numberOfBits.toLong) 17 | 18 | @Benchmark 19 | def getUnsafe() = { 20 | unsafeBits.get(1) 21 | unsafeBits.get(10) 22 | unsafeBits.get(100) 23 | unsafeBits.get(1000) 24 | unsafeBits.get(10000) 25 | unsafeBits.get(100000) 26 | unsafeBits.get(1000000) 27 | } 28 | 29 | @Benchmark 30 | def getBitSet() = { 31 | bitsSet.get(1) 32 | bitsSet.get(10) 33 | bitsSet.get(100) 34 | bitsSet.get(1000) 35 | bitsSet.get(10000) 36 | bitsSet.get(100000) 37 | bitsSet.get(1000000) 38 | } 39 | 40 | @Benchmark 41 | def getChronicle() = { 42 | chronicle.get(1) 43 | chronicle.get(10) 44 | chronicle.get(100) 45 | chronicle.get(1000) 46 | chronicle.get(10000) 47 | chronicle.get(100000) 48 | chronicle.get(1000000) 49 | } 50 | 51 | 52 | } 53 | -------------------------------------------------------------------------------- /benchmarks/src/main/scala/bloomfilter/mutable/StringItemCuckooBenchmark.scala: -------------------------------------------------------------------------------- 1 | package bloomfilter.mutable 2 | 3 | import java.util.concurrent.TimeUnit 4 | 5 | import org.openjdk.jmh.annotations.{BenchmarkMode, OperationsPerInvocation, OutputTimeUnit, _} 6 | 7 | import scala.util.Random 8 | 9 | @State(Scope.Benchmark) 10 | class 
StringItemCuckooBenchmark { 11 | 12 | private val itemsExpected = 100000000L 13 | private val random = new Random() 14 | 15 | private var bf: CuckooFilter[String] = _ 16 | 17 | @Param(Array("1024")) 18 | var length: Int = _ 19 | 20 | private val items = new Array[String](10000) 21 | var i = 0 22 | while (i < items.length) { 23 | items(i) = random.nextString(length) 24 | i += 1 25 | } 26 | 27 | @Setup(Level.Iteration) 28 | def setup(): Unit = { 29 | bf = CuckooFilter[String](itemsExpected) 30 | } 31 | 32 | @Benchmark 33 | @BenchmarkMode(Array(Mode.SingleShotTime)) 34 | @OutputTimeUnit(TimeUnit.NANOSECONDS) 35 | @OperationsPerInvocation(10000) 36 | def myPut(): Unit = { 37 | var i = 0 38 | while (i < items.length) { 39 | bf.add(items(i)) 40 | i += 1 41 | } 42 | } 43 | 44 | @Benchmark 45 | @BenchmarkMode(Array(Mode.Throughput)) 46 | @OperationsPerInvocation(10000) 47 | def myGet(): Unit = { 48 | var i = 0 49 | while (i < items.length) { 50 | bf.mightContain(items(i)) 51 | i += 1 52 | } 53 | } 54 | 55 | } 56 | -------------------------------------------------------------------------------- /bloom-filter/src/main/scala/bloomfilter/CanGetDataFrom.scala: -------------------------------------------------------------------------------- 1 | package bloomfilter 2 | 3 | import bloomfilter.util.Unsafe.unsafe 4 | 5 | trait CanGetDataFrom[-From] { 6 | def getLong(from: From, offset: Int): Long 7 | def getByte(from: From, offset: Int): Byte 8 | } 9 | 10 | object CanGetDataFrom { 11 | 12 | implicit case object CanGetDataFromByteArray extends CanGetDataFrom[Array[Byte]] { 13 | 14 | override def getLong(buf: Array[Byte], offset: Int): Long = { 15 | (buf(offset + 7).toLong << 56) | 16 | ((buf(offset + 6) & 0xffL) << 48) | 17 | ((buf(offset + 5) & 0xffL) << 40) | 18 | ((buf(offset + 4) & 0xffL) << 32) | 19 | ((buf(offset + 3) & 0xffL) << 24) | 20 | ((buf(offset + 2) & 0xffL) << 16) | 21 | ((buf(offset + 1) & 0xffL) << 8) | 22 | buf(offset) & 0xffL 23 | } 24 | 25 | override def 
getByte(from: Array[Byte], offset: Int): Byte = { 26 | from(offset) 27 | } 28 | } 29 | 30 | implicit case object CanGetDataFromArrayChar extends CanGetDataFrom[Array[Char]] { 31 | private val arrayCharOffset = unsafe.arrayBaseOffset(classOf[Array[Char]]) 32 | 33 | override def getLong(from: Array[Char], offset: Int): Long = { 34 | unsafe.getLong(from, arrayCharOffset + offset.toLong) 35 | } 36 | 37 | override def getByte(from: Array[Char], offset: Int): Byte = { 38 | unsafe.getByte(from, arrayCharOffset + offset.toLong) 39 | } 40 | } 41 | } 42 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | language: scala 2 | matrix: 3 | include: 4 | - scala: 2.10.7 5 | jdk: openjdk8 6 | dist: trusty 7 | env: PUBLISH_FROM_THIS_BUILD=true 8 | - scala: 2.11.12 9 | jdk: openjdk8 10 | dist: trusty 11 | env: PUBLISH_FROM_THIS_BUILD=true 12 | - scala: 2.12.11 13 | jdk: openjdk8 14 | dist: xenial 15 | env: PUBLISH_FROM_THIS_BUILD=true 16 | - scala: 2.13.1 17 | jdk: openjdk8 18 | dist: xenial 19 | env: PUBLISH_FROM_THIS_BUILD=true 20 | - scala: 2.13.1 21 | jdk: openjdk10 22 | dist: xenial 23 | - scala: 2.13.1 24 | jdk: openjdk11 25 | dist: xenial 26 | script: 27 | - sbt ++$TRAVIS_SCALA_VERSION clean test endToEnd:test package 28 | sudo: false 29 | cache: 30 | directories: 31 | - "$HOME/.m2" 32 | - "$HOME/.ivy2/cache" 33 | - "$HOME/.sbt" 34 | git: 35 | depth: 1 36 | before_cache: 37 | # Tricks to avoid unnecessary cache updates 38 | - find $HOME/.ivy2 -name "ivydata-*.properties" -delete 39 | - find $HOME/.sbt -name "*.lock" -delete 40 | before_install: 41 | - "if [[ $TRAVIS_TAG == v[0-9.]* ]]; then openssl aes-256-cbc -pass pass:$ENCRYPTION_PASSWORD -in secring.gpg.enc -out local.secring.gpg -d; fi" 42 | - "if [[ $TRAVIS_TAG == v[0-9.]* ]]; then openssl aes-256-cbc -pass pass:$ENCRYPTION_PASSWORD -in pubring.gpg.enc -out local.pubring.gpg -d; fi" 43 | 
after_success: 44 | - "[[ $TRAVIS_TAG == v[0-9.]* ]] && [[ $PUBLISH_FROM_THIS_BUILD == true ]] && { gpg --import local.secring.gpg && sbt ++$TRAVIS_SCALA_VERSION publish sonatypeBundleRelease; };" 45 | -------------------------------------------------------------------------------- /project/Dependencies.scala: -------------------------------------------------------------------------------- 1 | import sbt.Keys._ 2 | import sbt._ 3 | 4 | object Dependencies { 5 | private val scalatest = "org.scalatest" %% "scalatest" % "3.1.1" % "test;endToEnd" 6 | private val scalacheck = "org.scalacheck" %% "scalacheck" % "1.14.3" % "test" 7 | private val googleGuava = "com.google.guava" % "guava" % "19.0" 8 | private val googleFindbugs = "com.google.code.findbugs" % "jsr305" % "2.0.3" // needed by guava 9 | private val breeze = "org.scalanlp" %% "breeze" % "1.0" 10 | private val breezeNatives = "org.scalanlp" %% "breeze-natives" % "1.0" 11 | private val algebird = "com.twitter" %% "algebird-core" % "0.13.6" 12 | private val sketches = "com.yahoo.datasketches" % "sketches-core" % "0.3.2" 13 | private val chronicleBytes = "net.openhft" % "chronicle-bytes" % "1.2.3" 14 | private val allocationInstrumenter = "com.google.code.java-allocation-instrumenter" % "java-allocation-instrumenter" % "3.0.1" 15 | private val stream = "com.clearspring.analytics" % "stream" % "2.7.0" 16 | 17 | private val common = dependencies() 18 | 19 | val bloomfilter = common 20 | val sandbox = common ++ dependencies(chronicleBytes) 21 | val sandboxApp = common ++ dependencies(allocationInstrumenter, algebird) 22 | val tests = common ++ dependencies(scalatest, scalacheck) 23 | val benchmarks = common ++ dependencies(googleGuava, googleFindbugs, breeze, breezeNatives, algebird, sketches, stream) 24 | 25 | private def dependencies(modules: ModuleID*): Seq[Setting[_]] = Seq(libraryDependencies ++= modules) 26 | } -------------------------------------------------------------------------------- /.gitignore: 
-------------------------------------------------------------------------------- 1 | 2 | # Created by https://www.gitignore.io/api/scala,intellij,sbt 3 | 4 | ### Scala ### 5 | *.class 6 | *.log 7 | 8 | # sbt specific 9 | .cache 10 | .history 11 | .lib/ 12 | dist/* 13 | target/ 14 | lib_managed/ 15 | src_managed/ 16 | project/boot/ 17 | project/plugins/project/ 18 | 19 | # Scala-IDE specific 20 | .scala_dependencies 21 | .worksheet 22 | 23 | 24 | 25 | ### Intellij ### 26 | # Covers JetBrains IDEs: IntelliJ, RubyMine, PhpStorm, AppCode, PyCharm, CLion, Android Studio and Webstorm 27 | 28 | *.iml 29 | 30 | ## Directory-based project format: 31 | .idea/ 32 | # if you remove the above rule, at least ignore the following: 33 | 34 | # User-specific stuff: 35 | # .idea/workspace.xml 36 | # .idea/tasks.xml 37 | # .idea/dictionaries 38 | # .idea/shelf 39 | 40 | # Sensitive or high-churn files: 41 | # .idea/dataSources.ids 42 | # .idea/dataSources.xml 43 | # .idea/sqlDataSources.xml 44 | # .idea/dynamic.xml 45 | # .idea/uiDesigner.xml 46 | 47 | # Gradle: 48 | # .idea/gradle.xml 49 | # .idea/libraries 50 | 51 | # Mongo Explorer plugin: 52 | # .idea/mongoSettings.xml 53 | 54 | ## File-based project format: 55 | *.ipr 56 | *.iws 57 | 58 | ## Plugin-specific files: 59 | 60 | # IntelliJ 61 | /out/ 62 | 63 | # mpeltonen/sbt-idea plugin 64 | .idea_modules/ 65 | 66 | # JIRA plugin 67 | atlassian-ide-plugin.xml 68 | 69 | # Crashlytics plugin (for Android Studio and IntelliJ) 70 | com_crashlytics_export_strings.xml 71 | crashlytics.properties 72 | crashlytics-build.properties 73 | fabric.properties 74 | 75 | 76 | local.* 77 | -------------------------------------------------------------------------------- /CHANGELOG.md: -------------------------------------------------------------------------------- 1 | ### 0.13.1 2 | - Fix deserialization backwards-compatibility with v0.12.0 (via \#52) Thanks to Sean Rohead @seanrohead 3 | 4 | ### 0.13.0 5 | - Add JDK 9+ support (via \#47) Thanks to 
Sean Rohead @seanrohead 6 | - BREAKING: breaks java serialization/deserialization compatibility with v0.12.0. Use v0.13.1. 7 | 8 | ### 0.12.0 9 | - Add Scala 2.13 support (via \#45) 10 | 11 | ### 0.11.0 12 | - BREAKING: Add approximateElementCount method that estimates number of added elements. Thanks to @SidWeng. It's a breaking change because it serializes one more field (via \#37 and \#38) 13 | 14 | ### 0.10.1 15 | - Change the default long hash function to MurmurHash3 (via \#33) 16 | 17 | ### 0.10.0 18 | 19 | - Performance improvement: set a bit only if it's not set already (via \#28) 20 | - \#22 Scala 2.12.1 support (via \#31). Thanks to Fedor Lavrentyev @fediq. 21 | - \#29 Fix hashing of small strings (via \#32). 22 | 23 | ### 0.9.0 24 | 25 | - \#23 Serialization support (via \#25). Thanks to Eyal Farago @eyalfa. 26 | 27 | ### 0.8.0 28 | 29 | - \#19 Cuckoo Filter (via \#20) 30 | 31 | ### 0.7.0 32 | 33 | - \#5 Add serialization support. 34 | 35 | ### 0.6.0 36 | 37 | - \#2 Scala 2.10 support. 38 | 39 | ### 0.5.0 40 | 41 | - \#4 Union and intersection of two Bloom filters (via \#6). Thanks to Mario Pastorelli @melrief. 42 | 43 | ### 0.4.2 44 | 45 | - Fix memory access in UnsafeBitArray. Must update. Thanks to @cmarxer (via e79ff243ac) 46 | 47 | ### 0.4.1 48 | 49 | - Fix memory allocation in UnsafeBitArray. Must update.
Thanks to @cmarxer (via \#9) 50 | 51 | ### 0.4.0 52 | 53 | - Initial release 54 | -------------------------------------------------------------------------------- /tests/src/test/scala/tests/bloomfilter/mutable/_128bit/BloomFilterSerializationSpec.scala: -------------------------------------------------------------------------------- 1 | package tests.bloomfilter.mutable._128bit 2 | 3 | import java.io._ 4 | 5 | import bloomfilter.mutable._128bit.BloomFilter 6 | import org.scalacheck.Prop.forAll 7 | import org.scalacheck.{Gen, Properties} 8 | import org.scalatest.Matchers 9 | 10 | class BloomFilterSerializationSpec extends Properties("BloomFilter") with Matchers { 11 | def genListElems[A](max: Long)(implicit aGen: Gen[A]): Gen[List[A]] = { 12 | Gen.posNum[Int].map(_ % max).flatMap(i => Gen.listOfN(math.min(i, Int.MaxValue).toInt, aGen)) 13 | } 14 | 15 | val gen = for { 16 | size <- Gen.oneOf[Long](1, 1000/*, Int.MaxValue.toLong + 1*/) 17 | indices <- genListElems[Long](size)(Gen.chooseNum(0, size)) 18 | } yield (size, indices) 19 | 20 | property("writeTo & readFrom") = forAll(gen) { 21 | case (size: Long, indices: List[Long]) => 22 | val initial = BloomFilter[Long](size, 0.01) 23 | indices.foreach(initial.add) 24 | 25 | val file = File.createTempFile("bloomFilterSerialized", ".tmp") 26 | val out = new BufferedOutputStream(new FileOutputStream(file), 10 * 1000 * 1000) 27 | initial.writeTo(out) 28 | out.close() 29 | val in = new BufferedInputStream(new FileInputStream(file), 10 * 1000 * 1000) 30 | val sut = BloomFilter.readFrom[Long](in) 31 | in.close() 32 | 33 | sut.approximateElementCount() shouldEqual initial.approximateElementCount() 34 | 35 | val result = indices.forall(sut.mightContain) 36 | 37 | file.delete() 38 | initial.dispose() 39 | sut.dispose() 40 | 41 | result 42 | } 43 | 44 | } 45 | -------------------------------------------------------------------------------- /benchmarks/src/main/scala/bloomfilter/hashing/MurmurHash3Benchmark.scala: 
-------------------------------------------------------------------------------- 1 | package bloomfilter.hashing 2 | 3 | import java.nio.ByteBuffer 4 | 5 | import sandbox.hashing.{YonikMurmurHash3, CassandraMurmurHash, AlgebirdMurmurHash128} 6 | import sandbox.hashing.YonikMurmurHash3.LongPair 7 | import com.yahoo.sketches.hash.{MurmurHash3 => yMurmurHash3} 8 | import com.google.common.hash.Hashing 9 | import org.openjdk.jmh.annotations.{Benchmark, Scope, State} 10 | import scala.util.hashing.{MurmurHash3 => ScalaMurmurHash3} 11 | import com.clearspring.analytics.hash.{MurmurHash => StreamLibMurmurHash} 12 | 13 | @State(Scope.Benchmark) 14 | class MurmurHash3Benchmark { 15 | 16 | val key = Range(0, 64).map(_.toByte).toArray 17 | 18 | @Benchmark 19 | def javaVersion() = { 20 | YonikMurmurHash3.murmurhash3_x64_128(key, 0, key.length, 0, new LongPair) 21 | } 22 | 23 | @Benchmark 24 | def scalaVersion() = { 25 | MurmurHash3Generic.murmurhash3_x64_128(key, 0, key.length, 0) 26 | } 27 | 28 | val guavaMurmur = Hashing.murmur3_128() 29 | 30 | @Benchmark 31 | def guavaVersion() = { 32 | guavaMurmur.hashBytes(key, 0, key.length) 33 | } 34 | 35 | @Benchmark 36 | def cassandraVersion() = { 37 | CassandraMurmurHash.hash3_x64_128(ByteBuffer.wrap(key), 0, key.length, 0) 38 | } 39 | 40 | val algebirdMurmur = AlgebirdMurmurHash128(0) 41 | 42 | @Benchmark 43 | def algebirdVersion() = { 44 | algebirdMurmur.apply(key) 45 | } 46 | 47 | @Benchmark 48 | def yahooVersion() = { 49 | yMurmurHash3.hash(key, 0) 50 | } 51 | 52 | @Benchmark 53 | def scalaStdlibVersion() = { 54 | ScalaMurmurHash3.arrayHash(key, 0) 55 | } 56 | 57 | @Benchmark 58 | def streamLibVersion() = { 59 | StreamLibMurmurHash.hash(key) 60 | } 61 | } 62 | -------------------------------------------------------------------------------- /project/Publishing.scala: -------------------------------------------------------------------------------- 1 | import sbt._ 2 | import sbt.Keys._ 3 | import xerial.sbt.Sonatype.SonatypeKeys._ 
4 | 5 | object Publishing { 6 | 7 | private lazy val credentialSettings = Seq( 8 | credentials ++= (for { 9 | username <- Option(System.getenv().get("SONATYPE_USERNAME")) 10 | password <- Option(System.getenv().get("SONATYPE_PASSWORD")) 11 | } yield Credentials("Sonatype Nexus Repository Manager", "oss.sonatype.org", username, password)).toSeq, 12 | 13 | credentials += Credentials( 14 | "GnuPG Key ID", 15 | "gpg", 16 | "nikitin.alexandr.a@gmail.com", // key identifier 17 | "ignored" // this field is ignored; passwords are supplied by pinentry 18 | ) 19 | ) 20 | 21 | private lazy val sharedSettings = Seq( 22 | publishMavenStyle := true, 23 | Test / publishArtifact := false, 24 | pomIncludeRepository := Function.const(false), 25 | publishTo := sonatypePublishToBundle.value, 26 | // s-interpolator is required: a plain string would name the session with the literal placeholders 27 | sonatypeSessionName := s"[sbt-sonatype] ${name.value}-${scalaBinaryVersion.value}-${version.value}" 28 | ) 29 | 30 | private lazy val generalSettings = Seq( 31 | homepage := Some(url("https://github.com/alexandrnikitin/bloom-filter-scala")), 32 | licenses := Seq("MIT" -> url("https://github.com/alexandrnikitin/bloom-filter-scala/blob/master/LICENSE")), 33 | scmInfo := Some(ScmInfo(url("https://github.com/alexandrnikitin/bloom-filter-scala"), "scm:git:git@github.com:alexandrnikitin/bloom-filter-scala.git")), 34 | developers := List(Developer("AlexandrNikitin", "Alexandr Nikitin", "nikitin.alexandr.a@gmail.com", url("https://github.com/alexandrnikitin/"))) 35 | ) 36 | 37 | lazy val settings = generalSettings ++ sharedSettings ++ credentialSettings 38 | 39 | lazy val noPublishSettings = Seq( 40 | publish / skip := true 41 | ) 42 | 43 | } 44 | -------------------------------------------------------------------------------- /bloom-filter/src/main/scala/bloomfilter/CanGenerateHashFrom.scala: -------------------------------------------------------------------------------- 1 | package bloomfilter 2 | 3 | import bloomfilter.hashing.MurmurHash3Generic 4 | 5 | import java.lang.reflect.Field 6 | 7 | trait
CanGenerateHashFrom[From] { 8 | def generateHash(from: From): Long 9 | } 10 | 11 | object CanGenerateHashFrom { 12 | implicit case object CanGenerateHashFromLong extends CanGenerateHashFrom[Long] { 13 | override def generateHash(from: Long): Long = MurmurHash3Generic.fmix64(from) 14 | } 15 | 16 | implicit case object CanGenerateHashFromByteArray extends CanGenerateHashFrom[Array[Byte]] { 17 | override def generateHash(from: Array[Byte]): Long = 18 | MurmurHash3Generic.murmurhash3_x64_64(from, 0, from.length, 0) 19 | } 20 | 21 | import bloomfilter.util.Unsafe.unsafe 22 | 23 | case object CanGenerateHashFromString extends CanGenerateHashFrom[String] { 24 | private val valueOffset = unsafe.objectFieldOffset(stringValueField) 25 | 26 | override def generateHash(from: String): Long = { 27 | val value = unsafe.getObject(from, valueOffset).asInstanceOf[Array[Char]] 28 | MurmurHash3Generic.murmurhash3_x64_64(value, 0, from.length * 2, 0) 29 | } 30 | } 31 | 32 | case object CanGenerateHashFromStringByteArray extends CanGenerateHashFrom[String] { 33 | private val valueOffset = unsafe.objectFieldOffset(stringValueField) 34 | 35 | override def generateHash(from: String): Long = { 36 | val value = unsafe.getObject(from, valueOffset).asInstanceOf[Array[Byte]] 37 | MurmurHash3Generic.murmurhash3_x64_64(value, 0, from.length, 0) 38 | } 39 | } 40 | 41 | private val stringValueField: Field = classOf[String].getDeclaredField("value") 42 | implicit val canGenerateHashFromString: CanGenerateHashFrom[String] = { 43 | if (stringValueField.getType.getComponentType == java.lang.Byte.TYPE) CanGenerateHashFromStringByteArray else CanGenerateHashFromString 44 | } 45 | } 46 | -------------------------------------------------------------------------------- /sandbox/src/main/java/sandbox/hashing/AlgebirdMurmurHash128.scala: -------------------------------------------------------------------------------- 1 | package sandbox.hashing 2 | 3 | import java.nio.ByteBuffer 4 | 5 | case class 
AlgebirdMurmurHash128(seed: Long) extends AnyVal { 6 | def apply(buffer: ByteBuffer, offset: Int, length: Int): (Long, Long) = { 7 | val longs = CassandraMurmurHash.hash3_x64_128(buffer, offset, length, seed) 8 | (longs(0), longs(1)) 9 | } 10 | 11 | def apply(bytes: Array[Byte]): (Long, Long) = apply(ByteBuffer.wrap(bytes), 0, bytes.length) 12 | def apply(maxBytes: Int, fn: ByteBuffer => Unit): (Long, Long) = { 13 | val buffer = ByteBuffer.allocate(maxBytes) 14 | fn(buffer) 15 | apply(buffer, 0, maxBytes) 16 | } 17 | def apply(array: Array[Char]): (Long, Long) = apply(array.size * 2, { _.asCharBuffer.put(array) }) 18 | def apply(array: Array[Short]): (Long, Long) = apply(array.size * 2, { _.asShortBuffer.put(array) }) 19 | def apply(array: Array[Int]): (Long, Long) = apply(array.size * 4, { _.asIntBuffer.put(array) }) 20 | def apply(array: Array[Float]): (Long, Long) = apply(array.size * 4, { _.asFloatBuffer.put(array) }) 21 | def apply(array: Array[Long]): (Long, Long) = apply(array.size * 8, { _.asLongBuffer.put(array) }) 22 | def apply(array: Array[Double]): (Long, Long) = apply(array.size * 8, { _.asDoubleBuffer.put(array) }) 23 | 24 | def apply(value: Char): (Long, Long) = apply(2, { _.asCharBuffer.put(value) }) 25 | def apply(value: Short): (Long, Long) = apply(2, { _.asShortBuffer.put(value) }) 26 | def apply(value: Int): (Long, Long) = apply(4, { _.asIntBuffer.put(value) }) 27 | def apply(value: Float): (Long, Long) = apply(4, { _.asFloatBuffer.put(value) }) 28 | def apply(value: Long): (Long, Long) = apply(8, { _.asLongBuffer.put(value) }) 29 | def apply(value: Double): (Long, Long) = apply(8, { _.asDoubleBuffer.put(value) }) 30 | 31 | def apply(string: CharSequence): (Long, Long) = apply(string.length * 2, { buffer => 32 | val charBuffer = buffer.asCharBuffer 33 | 0.to(string.length - 1).foreach{ i => charBuffer.put(string.charAt(i)) } 34 | }) 35 | } 36 | -------------------------------------------------------------------------------- 
/bloom-filter/src/main/scala/bloomfilter/CanGenerate128HashFrom.scala: -------------------------------------------------------------------------------- 1 | package bloomfilter 2 | 3 | import bloomfilter.hashing.MurmurHash3Generic 4 | 5 | import java.lang.reflect.Field 6 | 7 | trait CanGenerate128HashFrom[From] { 8 | def generateHash(from: From): (Long, Long) 9 | } 10 | 11 | object CanGenerate128HashFrom { 12 | implicit case object CanGenerate128HashFromLong extends CanGenerate128HashFrom[Long] { 13 | override def generateHash(from: Long): (Long, Long) = { 14 | val hash = MurmurHash3Generic.fmix64(from) 15 | (hash, hash) 16 | } 17 | } 18 | 19 | implicit case object CanGenerate128HashFromByteArray extends CanGenerate128HashFrom[Array[Byte]] { 20 | override def generateHash(from: Array[Byte]): (Long, Long) = 21 | MurmurHash3Generic.murmurhash3_x64_128(from, 0, from.length, 0) 22 | } 23 | 24 | import bloomfilter.util.Unsafe.unsafe 25 | 26 | case object CanGenerate128HashFromString extends CanGenerate128HashFrom[String] { 27 | private val valueOffset = unsafe.objectFieldOffset(stringValueField) 28 | 29 | override def generateHash(from: String): (Long, Long) = { 30 | val value = unsafe.getObject(from, valueOffset).asInstanceOf[Array[Char]] 31 | MurmurHash3Generic.murmurhash3_x64_128(value, 0, from.length * 2, 0) 32 | } 33 | } 34 | 35 | case object CanGenerate128HashFromStringByteArray extends CanGenerate128HashFrom[String] { 36 | private val valueOffset = unsafe.objectFieldOffset(stringValueField) 37 | 38 | override def generateHash(from: String): (Long, Long) = { 39 | val value = unsafe.getObject(from, valueOffset).asInstanceOf[Array[Byte]] 40 | MurmurHash3Generic.murmurhash3_x64_128(value, 0, from.length, 0) 41 | } 42 | } 43 | 44 | private val stringValueField: Field = classOf[String].getDeclaredField("value") 45 | implicit val canGenerate128HashFromString: CanGenerate128HashFrom[String] = { 46 | if (stringValueField.getType.getComponentType == java.lang.Byte.TYPE) 
CanGenerate128HashFromStringByteArray else CanGenerate128HashFromString 47 | } 48 | } 49 | -------------------------------------------------------------------------------- /tests/src/test/scala/tests/bloomfilter/mutable/BloomFiltersSpec.scala: -------------------------------------------------------------------------------- 1 | package tests.bloomfilter.mutable 2 | 3 | import bloomfilter.CanGenerateHashFrom 4 | import bloomfilter.mutable.BloomFilter 5 | import org.scalacheck.Test.Parameters 6 | import org.scalacheck.commands.Commands 7 | import org.scalacheck.{Arbitrary, Gen, Prop, Properties} 8 | import org.scalacheck.Arbitrary.arbitrary 9 | import org.scalacheck.Prop.forAll 10 | 11 | class BloomFiltersSpec extends Properties("BloomFilters") { 12 | 13 | val maxNumElems = 10 14 | 15 | def genListOfMaxTenElems[A](implicit aGen: Gen[A]): Gen[List[A]] = 16 | Gen.posNum[Int] map (_ % maxNumElems) flatMap (i => Gen.listOfN(i, aGen)) 17 | 18 | property("union") = 19 | forAll(genListOfMaxTenElems(arbitrary[Long]), genListOfMaxTenElems(arbitrary[Long])) { 20 | (leftElements: List[Long], rightElements: List[Long]) => 21 | val leftBloomFilter = BloomFilter[Long](maxNumElems, 0.01) 22 | leftElements foreach leftBloomFilter.add 23 | val rightBloomFilter = BloomFilter[Long](maxNumElems, 0.01) 24 | rightElements foreach rightBloomFilter.add 25 | val unionBloomFilter = leftBloomFilter union rightBloomFilter 26 | val result = (leftElements ++ rightElements) forall unionBloomFilter.mightContain 27 | leftBloomFilter.dispose() 28 | rightBloomFilter.dispose() 29 | unionBloomFilter.dispose() 30 | result 31 | } 32 | 33 | property("intersect") = 34 | forAll(genListOfMaxTenElems(arbitrary[Long]), genListOfMaxTenElems(arbitrary[Long])) { 35 | (leftElements: List[Long], rightElements: List[Long]) => 36 | val leftBloomFilter = BloomFilter[Long](maxNumElems, 0.01) 37 | leftElements foreach leftBloomFilter.add 38 | val rightBloomFilter = BloomFilter[Long](maxNumElems, 0.01) 39 | rightElements 
foreach rightBloomFilter.add 40 | val unionBloomFilter = leftBloomFilter intersect rightBloomFilter 41 | val intersectElems = leftElements.toSet intersect rightElements.toSet 42 | val result = intersectElems forall unionBloomFilter.mightContain 43 | leftBloomFilter.dispose() 44 | rightBloomFilter.dispose() 45 | unionBloomFilter.dispose() 46 | result 47 | } 48 | } 49 | -------------------------------------------------------------------------------- /tests/src/test/scala/tests/bloomfilter/mutable/UnsafeBitArraysSpec.scala: -------------------------------------------------------------------------------- 1 | package tests.bloomfilter.mutable 2 | 3 | import bloomfilter.mutable.UnsafeBitArray 4 | import org.scalacheck.Prop._ 5 | import org.scalacheck.{Gen, Properties} 6 | 7 | class UnsafeBitArraysSpec extends Properties("UnsafeBitArray") { 8 | def genListElems[A](max: Long)(implicit aGen: Gen[A]): Gen[List[A]] = { 9 | Gen.posNum[Int].map(_ % max).flatMap(i => Gen.listOfN(math.min(i, Int.MaxValue).toInt, aGen)) 10 | } 11 | 12 | val genUnion = for { 13 | size <- Gen.oneOf[Long](1, 1000, Int.MaxValue, Int.MaxValue * 2L) 14 | indices <- genListElems[Long](size)(Gen.chooseNum(0, size)) 15 | thatIndices <- genListElems[Long](size)(Gen.chooseNum(0, size)) 16 | } yield (size, indices, thatIndices) 17 | 18 | val genIntersection = for { 19 | size <- Gen.oneOf[Long](1, 1000, Int.MaxValue, Int.MaxValue * 2L) 20 | indices <- genListElems[Long](size)(Gen.chooseNum(0, size)) 21 | thatIndices <- genListElems[Long](size)(Gen.chooseNum(0, size)) 22 | commonIndices <- genListElems[Long](size)(Gen.chooseNum(0, size)) 23 | } yield (size, indices, thatIndices, commonIndices) 24 | 25 | 26 | property("|") = forAll(genUnion) { 27 | case (size: Long, indices: List[Long], thatIndices: List[Long]) => 28 | val array = new UnsafeBitArray(size) 29 | indices.foreach(array.set) 30 | val thatArray = new UnsafeBitArray(size) 31 | thatIndices.foreach(thatArray.set) 32 | 33 | val sut = array | thatArray 34 
| val result = (indices ++ thatIndices).forall(sut.get) 35 | 36 | array.dispose() 37 | thatArray.dispose() 38 | sut.dispose() 39 | 40 | result 41 | } 42 | 43 | property("&") = forAll(genIntersection) { 44 | case (size: Long, indices: List[Long], thatIndices: List[Long], commonIndices: List[Long]) => 45 | val array = new UnsafeBitArray(size) 46 | indices.foreach(array.set) 47 | val thatArray = new UnsafeBitArray(size) 48 | thatIndices.foreach(thatArray.set) 49 | commonIndices.foreach(x => { array.set(x); thatArray.set(x) }) 50 | 51 | val sut = array & thatArray 52 | val result = commonIndices.forall(sut.get) 53 | 54 | array.dispose() 55 | thatArray.dispose() 56 | sut.dispose() 57 | 58 | result 59 | } 60 | 61 | } 62 | -------------------------------------------------------------------------------- /project/Settings.scala: -------------------------------------------------------------------------------- 1 | import sbt._ 2 | import sbt.Keys._ 3 | 4 | object Settings { 5 | 6 | private lazy val build = Seq( 7 | scalaVersion := "2.12.11", 8 | crossScalaVersions := Seq("2.10.7", "2.11.12", "2.12.11", "2.13.1"), 9 | 10 | autoCompilerPlugins := true, 11 | 12 | scalacOptions ++= ScalacSettings.base ++ ScalacSettings.specificFor(scalaVersion.value), 13 | javacOptions ++= JavacSettings.base ++ JavacSettings.specificFor(scalaVersion.value), 14 | javaOptions += "-Xmx1G", 15 | organization := "com.github.alexandrnikitin" 16 | ) 17 | 18 | lazy val root = build ++ Testing.settings ++ Publishing.noPublishSettings 19 | lazy val bloomfilter = build ++ Testing.settings ++ Dependencies.bloomfilter ++ Publishing.settings ++ 20 | (scalacOptions ++= ScalacSettings.strictBase ++ ScalacSettings.strictSpecificFor(scalaVersion.value)) 21 | lazy val sandbox = build ++ Testing.settings ++ Dependencies.sandbox ++ Publishing.noPublishSettings 22 | lazy val sandboxApp = build ++ Dependencies.sandboxApp ++ Publishing.noPublishSettings 23 | lazy val tests = build ++ Testing.settings ++ 
Dependencies.tests ++ Publishing.noPublishSettings 24 | lazy val benchmarks = build ++ Dependencies.benchmarks ++ Publishing.noPublishSettings 25 | lazy val examples = build ++ Publishing.noPublishSettings 26 | 27 | object JavacSettings { 28 | val base = Seq("-Xlint") 29 | 30 | def specificFor(scalaVersion: String) = CrossVersion.partialVersion(scalaVersion) match { 31 | case Some((2, 13)) => Seq("-source", "1.8", "-target", "1.8") 32 | case Some((2, 12)) => Seq("-source", "1.8", "-target", "1.8") 33 | case Some((2, 11)) => Seq("-source", "1.8", "-target", "1.8") 34 | case Some((2, 10)) => Seq("-source", "1.7", "-target", "1.7") 35 | case _ => Nil 36 | } 37 | } 38 | 39 | object ScalacSettings { 40 | val base = Seq( 41 | "-deprecation", 42 | "-encoding", "UTF-8", 43 | "-feature", 44 | "-unchecked" 45 | ) 46 | 47 | def specificFor(scalaVersion: String) = CrossVersion.partialVersion(scalaVersion) match { 48 | case Some((2, 12)) => Seq("-target:jvm-1.8") 49 | case Some((2, 11)) => Seq("-target:jvm-1.8", "-optimise") 50 | case Some((2, 10)) => Seq("-target:jvm-1.7", "-optimise") 51 | case _ => Nil 52 | } 53 | 54 | 55 | val strictBase = Seq( 56 | "-Xfatal-warnings", 57 | "-Xlint", 58 | "-Ywarn-dead-code", 59 | "-Ywarn-numeric-widen", 60 | "-Ywarn-value-discard" 61 | ) 62 | 63 | def strictSpecificFor(scalaVersion: String) = CrossVersion.partialVersion(scalaVersion) match { 64 | case Some((2, 12)) => Seq("-Ywarn-unused", "-Ywarn-unused-import") 65 | case Some((2, 11)) => Seq("-Ywarn-unused", "-Ywarn-unused-import") 66 | case _ => Nil 67 | } 68 | 69 | } 70 | 71 | } 72 | -------------------------------------------------------------------------------- /sandbox/src/main/scala/sandbox/bloomfilter/mutable/BloomFilter.scala: -------------------------------------------------------------------------------- 1 | package sandbox.bloomfilter.mutable 2 | 3 | import bloomfilter.CanGenerateHashFrom 4 | 5 | class BloomFilter[T] private (val numberOfBits: Long, val numberOfHashes: Int, 
private val bits: BitArray) 6 | (implicit canGenerateHash: CanGenerateHashFrom[T]) { 7 | 8 | def this(numberOfBits: Long, numberOfHashes: Int)(implicit canGenerateHash: CanGenerateHashFrom[T]) { 9 | this(numberOfBits, numberOfHashes, new BitArray(numberOfBits)) 10 | } 11 | 12 | def add(x: T): Unit = { 13 | val hash = canGenerateHash.generateHash(x) 14 | val hash1 = hash >>> 32 15 | val hash2 = (hash << 32) >> 32 16 | 17 | var i = 0 18 | while (i < numberOfHashes) { 19 | val computedHash = hash1 + i * hash2 20 | bits.set((computedHash & Long.MaxValue) % numberOfBits) 21 | i += 1 22 | } 23 | } 24 | 25 | def union(that: BloomFilter[T]): BloomFilter[T] = { 26 | require(this.numberOfBits == that.numberOfBits && this.numberOfHashes == that.numberOfHashes, 27 | s"Union works only on BloomFilters with the same number of hashes and of bits") 28 | new BloomFilter[T](this.numberOfBits, this.numberOfHashes, this.bits | that.bits) 29 | } 30 | 31 | def intersect(that: BloomFilter[T]): BloomFilter[T] = { 32 | require(this.numberOfBits == that.numberOfBits && this.numberOfHashes == that.numberOfHashes, 33 | s"Intersect works only on BloomFilters with the same number of hashes and of bits") 34 | new BloomFilter[T](this.numberOfBits, this.numberOfHashes, this.bits & that.bits) 35 | } 36 | 37 | def mightContain(x: T): Boolean = { 38 | val hash = canGenerateHash.generateHash(x) 39 | val hash1 = hash >>> 32 40 | val hash2 = (hash << 32) >> 32 41 | var i = 0 42 | while (i < numberOfHashes) { 43 | val computedHash = hash1 + i * hash2 44 | if (!bits.get((computedHash & Long.MaxValue) % numberOfBits)) 45 | return false 46 | i += 1 47 | } 48 | true 49 | } 50 | 51 | def expectedFalsePositiveRate(): Double = { 52 | math.pow(bits.getBitCount.toDouble / numberOfBits, numberOfHashes.toDouble) 53 | } 54 | 55 | } 56 | 57 | object BloomFilter { 58 | 59 | def apply[T](numberOfItems: Long, falsePositiveRate: Double) 60 | (implicit canGenerateHash: CanGenerateHashFrom[T]): BloomFilter[T] = { 61 | 62 | 
/** Optimal number of hash functions k = (m / n) * ln(2), rounded up.
  *
  * @param numberOfItems expected number of items (n); must be > 0
  * @param numberOfBits  size of the bit array (m)
  * @return number of hash functions to use
  */
def optimalNumberOfHashes(numberOfItems: Long, numberOfBits: Long): Int = {
  // BUG FIX: `numberOfBits / numberOfItems` was Long/Long integer division,
  // which truncates the ratio before multiplying by ln(2) and can
  // underestimate the optimal hash count. Divide in floating point instead.
  math.ceil(numberOfBits.toDouble / numberOfItems * math.log(2)).toInt
}
/** Compares observed false-positive rates of Algebird's immutable Bloom
  * filter against this library's mutable one, built with identical
  * parameters. Runs indefinitely, printing both positive counts every
  * 1000 probes. Sandbox/diagnostic code: never returns normally.
  */
def compareAlgebirdFPR(): Unit = {

  val random: Random = new Random()

  val itemsExpected = 10000L
  val falsePositiveRate = 0.1
  var bf = AlgebirdBloomFilter(itemsExpected.toInt, falsePositiveRate, 0).create("")
  val bf2 = bloomfilter.mutable.BloomFilter[String](itemsExpected, falsePositiveRate)

  // Fill both filters with the same random strings.
  var i = 0
  while (i < itemsExpected) {
    val str: String = random.nextString(1000)
    bf = bf.+(str)
    bf2.add(str)
    i += 1
  }

  // Probe with fresh random strings; any hit is (almost surely) a false positive.
  i = 0
  var in, in2 = 0
  while (true) {
    val str = random.nextString(1000)
    if (bf.contains(str).isTrue) {
      in += 1
    }
    if (bf2.mightContain(str)) {
      in2 += 1
    }

    if (i % 1000 == 0) {
      println(s"in: $in; in2: $in2")
    }
    // BUG FIX: `i` was never incremented here, so `i % 1000 == 0` held on
    // every iteration and progress was printed once per probe rather than
    // once per 1000 probes.
    i += 1
  }

}
Gen[A]): Gen[List[A]] = { 12 | Gen.posNum[Int].map(_ % max).flatMap(i => Gen.listOfN(math.min(i, Int.MaxValue).toInt, aGen)) 13 | } 14 | 15 | val gen = for { 16 | size <- Gen.oneOf[Long](1, 1000 /*, Int.MaxValue.toLong + 1*/) 17 | indices <- genListElems[Long](size)(Gen.chooseNum(0, size - 1)) 18 | } yield (size, indices) 19 | 20 | property("writeTo & readFrom") = forAll(gen) { 21 | case (size: Long, indices: List[Long]) => 22 | val initial = BloomFilter[Long](size, 0.01) 23 | indices.foreach(initial.add) 24 | 25 | val file = File.createTempFile("bloomFilterSerialized", ".tmp") 26 | val out = new BufferedOutputStream(new FileOutputStream(file), 10 * 1000 * 1000) 27 | initial.writeTo(out) 28 | out.close() 29 | val in = new BufferedInputStream(new FileInputStream(file), 10 * 1000 * 1000) 30 | val sut = BloomFilter.readFrom[Long](in) 31 | in.close() 32 | 33 | sut.approximateElementCount() shouldEqual initial.approximateElementCount() 34 | 35 | val result = indices.forall(sut.mightContain) 36 | 37 | file.delete() 38 | initial.dispose() 39 | sut.dispose() 40 | 41 | result 42 | } 43 | 44 | property("supports java serialization") = { 45 | forAll(gen) { 46 | case (size, indices) => 47 | val initial = BloomFilter[Long](size, 0.01) 48 | indices.foreach(initial.add) 49 | val file = File.createTempFile("bloomFilterSerialized", ".tmp") 50 | val out = new BufferedOutputStream(new FileOutputStream(file), 10 * 1000 * 1000) 51 | val oos = new ObjectOutputStream(out) 52 | oos.writeObject(initial) 53 | oos.close() 54 | out.close() 55 | val in = new BufferedInputStream(new FileInputStream(file), 10 * 1000 * 1000) 56 | val ois = new ObjectInputStream(in) 57 | val desrialized = ois.readObject() 58 | ois.close() 59 | in.close() 60 | 61 | desrialized should not be null 62 | desrialized should be(a[BloomFilter[Long]]) 63 | val sut = desrialized.asInstanceOf[BloomFilter[Long]] 64 | 65 | sut.numberOfBits shouldEqual initial.numberOfBits 66 | sut.numberOfHashes shouldEqual 
initial.numberOfHashes 67 | sut.approximateElementCount() shouldEqual initial.approximateElementCount() 68 | 69 | 70 | val result = indices.forall(sut.mightContain) 71 | 72 | file.delete() 73 | initial.dispose() 74 | sut.dispose() 75 | 76 | result 77 | } 78 | } 79 | 80 | } 81 | -------------------------------------------------------------------------------- /sandbox/src/main/scala/sandbox/hashing/MurmurHash3.scala: -------------------------------------------------------------------------------- 1 | package sandbox.hashing 2 | 3 | import java.lang.Long.rotateLeft 4 | 5 | object MurmurHash3 { 6 | 7 | private val c1: Long = 0x87c37b91114253d5L 8 | private val c2: Long = 0x4cf5ad432745937fL 9 | 10 | def getLongLittleEndian(buf: Array[Byte], offset: Int): Long = { 11 | (buf(offset + 7).toLong << 56) | 12 | ((buf(offset + 6) & 0xffL) << 48) | 13 | ((buf(offset + 5) & 0xffL) << 40) | 14 | ((buf(offset + 4) & 0xffL) << 32) | 15 | ((buf(offset + 3) & 0xffL) << 24) | 16 | ((buf(offset + 2) & 0xffL) << 16) | 17 | ((buf(offset + 1) & 0xffL) << 8) | 18 | buf(offset) & 0xffL 19 | } 20 | 21 | def fmix64(l: Long): Long = { 22 | var k = l 23 | k ^= k >>> 33 24 | k *= 0xff51afd7ed558ccdL 25 | k ^= k >>> 33 26 | k *= 0xc4ceb9fe1a85ec53L 27 | k ^= k >>> 33 28 | k 29 | } 30 | 31 | def murmurhash3_x64_128(key: Array[Byte], offset: Int, len: Int, seed: Int): (Long, Long) = { 32 | var h1: Long = seed & 0x00000000FFFFFFFFL 33 | var h2: Long = seed & 0x00000000FFFFFFFFL 34 | 35 | val roundedEnd = offset + (len & 0xFFFFFFF0); // round down to 16 byte block 36 | 37 | var i = offset 38 | while (i < roundedEnd) { 39 | var k1 = getLongLittleEndian(key, i) 40 | var k2 = getLongLittleEndian(key, i + 8) 41 | k1 *= c1; k1 = rotateLeft(k1, 31); k1 *= c2; h1 ^= k1 42 | h1 = rotateLeft(h1, 27); h1 += h2; h1 = h1 * 5 + 0x52dce729 43 | k2 *= c2; k2 = rotateLeft(k2, 33); k2 *= c1; h2 ^= k2 44 | h2 = rotateLeft(h2, 31); h2 += h1; h2 = h2 * 5 + 0x38495ab5 45 | 46 | i += 16 47 | } 48 | 49 | var k1: Long = 0 
50 | var k2: Long = 0 51 | 52 | val lenVar = len & 15 53 | if (lenVar == 15) k2 = (key(roundedEnd + 14) & 0xffL) << 48 54 | if (lenVar >= 14) k2 |= (key(roundedEnd + 13) & 0xffL) << 40 55 | if (lenVar >= 13) k2 |= (key(roundedEnd + 12) & 0xffL) << 32 56 | if (lenVar >= 12) k2 |= (key(roundedEnd + 11) & 0xffL) << 24 57 | if (lenVar >= 11) k2 |= (key(roundedEnd + 10) & 0xffL) << 16 58 | if (lenVar >= 10) k2 |= (key(roundedEnd + 9) & 0xffL) << 8 59 | if (lenVar >= 9) { 60 | k2 |= (key(roundedEnd + 8) & 0xffL) 61 | k2 *= c2 62 | k2 = rotateLeft(k2, 33) 63 | k2 *= c1 64 | h2 ^= k2 65 | } 66 | if (lenVar >= 8) k1 = key(roundedEnd + 7).toLong << 56 67 | if (lenVar >= 7) k1 |= (key(roundedEnd + 6) & 0xffL) << 48 68 | if (lenVar >= 6) k1 |= (key(roundedEnd + 5) & 0xffL) << 40 69 | if (lenVar >= 5) k1 |= (key(roundedEnd + 4) & 0xffL) << 32 70 | if (lenVar >= 4) k1 |= (key(roundedEnd + 3) & 0xffL) << 24 71 | if (lenVar >= 3) k1 |= (key(roundedEnd + 2) & 0xffL) << 16 72 | if (lenVar >= 2) k1 |= (key(roundedEnd + 1) & 0xffL) << 8 73 | if (lenVar >= 1) { 74 | k1 |= (key(roundedEnd) & 0xffL) 75 | k1 *= c1 76 | k1 = rotateLeft(k1, 31) 77 | k1 *= c2 78 | h1 ^= k1 79 | } 80 | 81 | h1 ^= len; h2 ^= len 82 | 83 | h1 += h2 84 | h2 += h1 85 | 86 | h1 = fmix64(h1) 87 | h2 = fmix64(h2) 88 | 89 | h1 += h2 90 | h2 += h1 91 | 92 | (h1, h2) 93 | } 94 | } 95 | -------------------------------------------------------------------------------- /bloom-filter/src/main/scala/bloomfilter/mutable/_128bit/BloomFilter.scala: -------------------------------------------------------------------------------- 1 | package bloomfilter.mutable._128bit 2 | 3 | import java.io.{DataInputStream, DataOutputStream, InputStream, OutputStream} 4 | 5 | import bloomfilter.CanGenerate128HashFrom 6 | import bloomfilter.mutable.UnsafeBitArray 7 | 8 | import scala.math._ 9 | 10 | @SerialVersionUID(2L) 11 | class BloomFilter[T] private (val numberOfBits: Long, val numberOfHashes: Int, private val bits: UnsafeBitArray) 12 | 
(implicit canGenerateHash: CanGenerate128HashFrom[T]) extends Serializable { 13 | 14 | def this(numberOfBits: Long, numberOfHashes: Int)(implicit canGenerateHash: CanGenerate128HashFrom[T]) { 15 | this(numberOfBits, numberOfHashes, new UnsafeBitArray(numberOfBits)) 16 | } 17 | 18 | def add(x: T): Unit = { 19 | val hash = canGenerateHash.generateHash(x) 20 | 21 | var i = 0 22 | while (i < numberOfHashes) { 23 | val computedHash = hash._1 + i * hash._2 24 | bits.set((computedHash & Long.MaxValue) % numberOfBits) 25 | i += 1 26 | } 27 | } 28 | 29 | def mightContain(x: T): Boolean = { 30 | val hash = canGenerateHash.generateHash(x) 31 | 32 | var i = 0 33 | while (i < numberOfHashes) { 34 | val computedHash = hash._1 + i * hash._2 35 | if (!bits.get((computedHash & Long.MaxValue) % numberOfBits)) 36 | return false 37 | i += 1 38 | } 39 | true 40 | } 41 | 42 | def expectedFalsePositiveRate(): Double = { 43 | math.pow(bits.getBitCount.toDouble / numberOfBits, numberOfHashes.toDouble) 44 | } 45 | 46 | def writeTo(out: OutputStream): Unit = { 47 | val dout = new DataOutputStream(out) 48 | dout.writeLong(numberOfBits) 49 | dout.writeInt(numberOfHashes) 50 | bits.writeTo(out) 51 | } 52 | 53 | def approximateElementCount(): Long = { 54 | val fractionOfBitsSet = bits.getBitCount.toDouble / numberOfBits 55 | val x = -log1p(-fractionOfBitsSet) * numberOfBits / numberOfHashes 56 | val z = rint(x) 57 | if (abs(x - z) == 0.5) { 58 | (x + Math.copySign(0.5, x)).toLong 59 | } else { 60 | z.toLong 61 | } 62 | } 63 | 64 | def dispose(): Unit = bits.dispose() 65 | 66 | } 67 | 68 | object BloomFilter { 69 | 70 | def apply[T](numberOfItems: Long, falsePositiveRate: Double) 71 | (implicit canGenerateHash: CanGenerate128HashFrom[T]): BloomFilter[T] = { 72 | 73 | val nb = optimalNumberOfBits(numberOfItems, falsePositiveRate) 74 | val nh = optimalNumberOfHashes(numberOfItems, nb) 75 | new BloomFilter[T](nb, nh) 76 | } 77 | 78 | def optimalNumberOfBits(numberOfItems: Long, falsePositiveRate: 
/** Optimal number of hash functions k = (m / n) * ln(2), rounded up.
  *
  * @param numberOfItems expected number of items (n); must be > 0
  * @param numberOfBits  size of the bit array (m)
  * @return number of hash functions to use
  */
def optimalNumberOfHashes(numberOfItems: Long, numberOfBits: Long): Int = {
  // BUG FIX: `numberOfBits / numberOfItems` was Long/Long integer division,
  // which truncates the ratio before multiplying by ln(2) and can
  // underestimate the optimal hash count. Divide in floating point instead.
  math.ceil(numberOfBits.toDouble / numberOfItems * math.log(2)).toInt
}
initSuts: Traversable[State], 30 | runningSuts: Traversable[Sut]): Boolean = { 31 | initSuts.isEmpty && runningSuts.isEmpty || 32 | newState.addedItems > newState.expectedItems || 33 | newState.addedItems > 100 34 | } 35 | 36 | override def destroySut(sut: Sut): Unit = 37 | sut.dispose() 38 | 39 | override def genInitialState: Gen[State] = 40 | Gen.chooseNum[Long](1, Int.MaxValue).map(State(_, 0)) 41 | 42 | override def newSut(state: State): Sut = 43 | BloomFilter[T](state.expectedItems, 0.01) 44 | 45 | def initialPreCondition(state: State): Boolean = true 46 | 47 | def genCommand(state: State): Gen[Command] = 48 | for { 49 | item <- Arbitrary.arbitrary[T] 50 | } yield commandSequence(AddItem(item), CheckItem(item)) 51 | 52 | case class AddItem(item: T) extends UnitCommand { 53 | def run(sut: Sut): Unit = sut.synchronized(sut.add(item)) 54 | def nextState(state: State) = state.copy(addedItems = state.addedItems + 1) 55 | def preCondition(state: State) = true 56 | def postCondition(state: State, success: Boolean) = success 57 | } 58 | 59 | case class CheckItem(item: T) extends SuccessCommand { 60 | type Result = Boolean 61 | def run(sut: Sut): Boolean = sut.synchronized(sut.mightContain(item)) 62 | def nextState(state: State) = state 63 | def preCondition(state: State) = true 64 | def postCondition(state: State, result: Boolean): Prop = result 65 | } 66 | 67 | } 68 | 69 | private val elemsToAddGen = for { 70 | numberOfElemsToAdd <- Gen.chooseNum[Int](1, 1000) 71 | elemsToAdd <- Gen.listOfN(numberOfElemsToAdd, arbitrary[Long]) 72 | } yield elemsToAdd 73 | 74 | // TODO fix elemsToAddGen.filter() below, why Gen.listOfN above generates empty lists? 
75 | property("approximateElementCount") = forAll(elemsToAddGen.filter(x => x.size > 10 && x.toSet.size > 10)) { elemsToAdd: List[Long] => 76 | val bf = BloomFilter[Long](elemsToAdd.size * 10, 0.0001) 77 | elemsToAdd.foreach(bf.add) 78 | val numberOfUnique = elemsToAdd.toSet.size 79 | math.abs(bf.approximateElementCount() - numberOfUnique) < numberOfUnique * 0.1 80 | } 81 | 82 | } 83 | -------------------------------------------------------------------------------- /bloom-filter/src/main/scala/bloomfilter/mutable/UnsafeBitArray.scala: -------------------------------------------------------------------------------- 1 | package bloomfilter.mutable 2 | 3 | import java.io._ 4 | 5 | import bloomfilter.util.Unsafe.unsafe 6 | 7 | @SerialVersionUID(2L) 8 | class UnsafeBitArray(val numberOfBits: Long) extends Serializable { 9 | private val indices = math.ceil(numberOfBits.toDouble / 64).toLong 10 | @transient 11 | private val ptr = unsafe.allocateMemory(8L * indices) 12 | unsafe.setMemory(ptr, 8L * indices, 0.toByte) 13 | private var bitCount = 0L 14 | 15 | def get(index: Long): Boolean = { 16 | (unsafe.getLong(ptr + (index >>> 6) * 8L) & (1L << index)) != 0 17 | } 18 | 19 | def set(index: Long): Unit = { 20 | val offset = ptr + (index >>> 6) * 8L 21 | val long = unsafe.getLong(offset) 22 | if ((long & (1L << index)) == 0) { 23 | unsafe.putLong(offset, long | (1L << index)) 24 | bitCount += 1 25 | } 26 | } 27 | 28 | def combine(that: UnsafeBitArray, combiner: (Long, Long) => Long): UnsafeBitArray = { 29 | val result = new UnsafeBitArray(this.numberOfBits) 30 | var index = 0L 31 | while (index < numberOfBits) { 32 | val thisLong = unsafe.getLong(this.ptr + (index >>> 6) * 8L) 33 | val thatLong = unsafe.getLong(that.ptr + (index >>> 6) * 8L) 34 | val longAtIndex = combiner(thisLong, thatLong) 35 | unsafe.putLong(result.ptr + (index >>> 6) * 8L, longAtIndex) 36 | index += 64 37 | } 38 | result 39 | } 40 | 41 | def |(that: UnsafeBitArray): UnsafeBitArray = { 42 | 
require(this.numberOfBits == that.numberOfBits, "Bitwise OR works only on arrays with the same number of bits") 43 | 44 | combine(that, _ | _) 45 | } 46 | 47 | def &(that: UnsafeBitArray): UnsafeBitArray = { 48 | require(this.numberOfBits == that.numberOfBits, "Bitwise AND works only on arrays with the same number of bits") 49 | 50 | combine(that, _ & _) 51 | } 52 | 53 | def getBitCount: Long = { 54 | bitCount 55 | } 56 | 57 | def writeTo(out: OutputStream): Unit = { 58 | val dout = new DataOutputStream(out) 59 | dout.writeLong(bitCount) 60 | var index = 0L 61 | while (index < numberOfBits) { 62 | dout.writeLong(unsafe.getLong(this.ptr + (index >>> 6) * 8L)) 63 | index += 64 64 | } 65 | } 66 | 67 | def readFrom(in: InputStream): Unit = { 68 | val din = new DataInputStream(in) 69 | bitCount = din.readLong() 70 | var index = 0L 71 | while (index < numberOfBits) { 72 | unsafe.putLong(this.ptr + (index >>> 6) * 8L, din.readLong()) 73 | index += 64 74 | } 75 | } 76 | 77 | def dispose(): Unit = unsafe.freeMemory(ptr) 78 | 79 | @throws(classOf[java.io.ObjectStreamException]) 80 | private def writeReplace: AnyRef = new UnsafeBitArray.SerializedForm(this) 81 | 82 | } 83 | 84 | object UnsafeBitArray { 85 | 86 | @SerialVersionUID(1L) 87 | private class SerializedForm(@transient var unsafeBitArray: UnsafeBitArray) extends Serializable { 88 | private def writeObject(oos: ObjectOutputStream): Unit = { 89 | oos.defaultWriteObject() 90 | oos.writeLong(unsafeBitArray.numberOfBits) 91 | unsafeBitArray.writeTo(oos) 92 | } 93 | 94 | private def readObject(ois: ObjectInputStream): Unit = { 95 | ois.defaultReadObject() 96 | val numberOfBits = ois.readLong() 97 | unsafeBitArray = new UnsafeBitArray(numberOfBits) 98 | unsafeBitArray.readFrom(ois) 99 | } 100 | 101 | @throws(classOf[java.io.ObjectStreamException]) 102 | private def readResolve: AnyRef = unsafeBitArray 103 | } 104 | 105 | } 106 | -------------------------------------------------------------------------------- 
/bloom-filter/src/main/scala/bloomfilter/mutable/CuckooFilter.scala: -------------------------------------------------------------------------------- 1 | package bloomfilter.mutable 2 | 3 | import bloomfilter.CanGenerateHashFrom 4 | 5 | @SerialVersionUID(1L) 6 | class CuckooFilter[T](numberOfBuckets: Long, numberOfBitsPerItem: Int, private val table: UnsafeTable) 7 | (implicit canGenerateHash: CanGenerateHashFrom[T]) extends Serializable { 8 | 9 | def this(numberOfBuckets: Long, numberOfBitsPerItem: Int)(implicit canGenerateHash: CanGenerateHashFrom[T]) { 10 | this(numberOfBuckets, numberOfBitsPerItem, new UnsafeTable16Bit(numberOfBuckets)) 11 | } 12 | 13 | import CuckooFilter._ 14 | 15 | def add(x: T): Unit = { 16 | val hash = canGenerateHash.generateHash(x) 17 | val index = indexHash(hash >> 32, numberOfBuckets) 18 | val tag = tagHash(hash, numberOfBitsPerItem) 19 | if (table.insert(index, tag)) { 20 | return 21 | } 22 | 23 | var curIndex = index 24 | var curTag = tag 25 | var i = 0 26 | while (i < MaxAddAttempts) { 27 | curIndex = altIndex(curIndex, curTag, numberOfBuckets) 28 | val swappedTag = table.swapAny(curIndex, curTag) 29 | if (swappedTag == 0) { 30 | return 31 | } 32 | curTag = swappedTag 33 | i += 1 34 | } 35 | } 36 | 37 | def remove(x: T): Unit = { 38 | val hash = canGenerateHash.generateHash(x) 39 | val index = indexHash(hash >> 32, numberOfBuckets) 40 | val tag = tagHash(hash, numberOfBitsPerItem) 41 | if (table.remove(index, tag)) return 42 | val index2 = altIndex(index, tag, numberOfBuckets) 43 | if (table.remove(index2, tag)) return 44 | } 45 | 46 | def mightContain(x: T): Boolean = { 47 | val hash = canGenerateHash.generateHash(x) 48 | val index = indexHash(hash >> 32, numberOfBuckets) 49 | val tag = tagHash(hash, numberOfBitsPerItem) 50 | if (table.find(index, tag)) return true 51 | val index2 = altIndex(index, tag, numberOfBuckets) 52 | if (table.find(index2, tag)) return true 53 | false 54 | } 55 | 56 | def dispose(): Unit = table.dispose() 
57 | } 58 | 59 | object CuckooFilter { 60 | 61 | // TODO falsePositiveRate? 62 | def apply[T](numberOfItems: Long)(implicit canGenerateHash: CanGenerateHashFrom[T]): CuckooFilter[T] = { 63 | val nb = optimalNumberOfBuckets(numberOfItems) 64 | new CuckooFilter[T](nb, 16, new UnsafeTable16Bit(nb)) 65 | } 66 | 67 | def optimalNumberOfBuckets(numberOfItems: Long): Long = { 68 | var numberOfBuckets = upperPowerOf2((numberOfItems + UnsafeTable16Bit.TagsPerBucket - 1) / UnsafeTable16Bit.TagsPerBucket) 69 | val frac = numberOfItems.toDouble / numberOfBuckets / UnsafeTable16Bit.TagsPerBucket 70 | if (frac > 0.96) numberOfBuckets = numberOfBuckets << 1 71 | numberOfBuckets 72 | } 73 | 74 | 75 | val MaxAddAttempts = 500 76 | 77 | @inline 78 | private def upperPowerOf2(l: Long): Long = { 79 | var x = l - 1 80 | x |= x >> 1 81 | x |= x >> 2 82 | x |= x >> 4 83 | x |= x >> 8 84 | x |= x >> 16 85 | x |= x >> 32 86 | x += 1 87 | x 88 | } 89 | 90 | @inline 91 | private def altIndex(index: Long, tag: Long, numberOfBuckets: Long): Long = 92 | indexHash(index ^ (tag * 0x5bd1e995), numberOfBuckets) 93 | 94 | @inline 95 | private def indexHash(hash: Long, numberOfBuckets: Long): Long = { 96 | hash & (numberOfBuckets - 1) 97 | } 98 | 99 | @inline 100 | private def tagHash(hash: Long, numberOfBitsPerItem: Int): Long = { 101 | var tag = hash & ((1L << numberOfBitsPerItem) - 1) 102 | if (tag == 0) tag += 1 103 | tag 104 | } 105 | 106 | 107 | } 108 | -------------------------------------------------------------------------------- /tests/src/test/scala/tests/bloomfilter/mutable/UnsafeBitArraySpec.scala: -------------------------------------------------------------------------------- 1 | package tests.bloomfilter.mutable 2 | 3 | import java.io.{ByteArrayInputStream, ByteArrayOutputStream, ObjectInputStream, ObjectOutputStream} 4 | 5 | import bloomfilter.mutable.UnsafeBitArray 6 | import org.scalacheck.Test.Parameters 7 | import org.scalacheck.commands.Commands 8 | import org.scalacheck.{Gen, 
Prop, Properties} 9 | import org.scalatest.{Inspectors, Matchers} 10 | 11 | class UnsafeBitArraySpec extends Properties("UnsafeBitArray") with Matchers with Inspectors { 12 | 13 | property("set & get") = new UnsafeBitArrayCommands().property() 14 | property("serializable") = serializationProp 15 | 16 | override def overrideParameters(p: Parameters): Parameters = { 17 | super.overrideParameters(p).withMinSuccessfulTests(100) 18 | } 19 | 20 | class UnsafeBitArrayCommands extends Commands { 21 | type Sut = UnsafeBitArray 22 | 23 | case class State(size: Long) 24 | 25 | override def canCreateNewSut( 26 | newState: State, 27 | initSuts: Traversable[State], 28 | runningSuts: Traversable[Sut]): Boolean = 29 | initSuts.isEmpty && runningSuts.isEmpty 30 | 31 | override def destroySut(sut: Sut): Unit = 32 | sut.dispose() 33 | 34 | override def genInitialState: Gen[State] = 35 | Gen.chooseNum[Long](1, Int.MaxValue * 2L).map(State) 36 | 37 | override def newSut(state: State): Sut = 38 | new UnsafeBitArray(state.size) 39 | 40 | def initialPreCondition(state: State): Boolean = true 41 | 42 | def genCommand(state: State): Gen[Command] = 43 | for { 44 | i <- Gen.choose[Long](0, state.size) 45 | } yield commandSequence(SetItem(i), GetItem(i)) 46 | 47 | case class SetItem(i: Long) extends UnitCommand { 48 | def run(sut: Sut): Unit = sut.synchronized(sut.set(i)) 49 | def nextState(state: State): State = state 50 | def preCondition(state: State) = true 51 | def postCondition(state: State, success: Boolean): Prop = success 52 | } 53 | 54 | case class GetItem(i: Long) extends SuccessCommand { 55 | type Result = Boolean 56 | def run(sut: Sut): Boolean = sut.synchronized(sut.get(i)) 57 | def nextState(state: State): State = state 58 | def preCondition(state: State) = true 59 | def postCondition(state: State, result: Boolean): Prop = result 60 | } 61 | 62 | } 63 | 64 | def serializationProp: Prop = { 65 | case class State(sz: Int, included: Set[Long]) 66 | val genState = for { 67 | sz <- 
Gen.posNum[Int] 68 | included <- Gen.listOf(Gen.choose(0L, sz - 1)) 69 | } yield { 70 | State(sz, included.toSet) 71 | } 72 | 73 | Prop.forAll(genState) { 74 | case State(sz, included) => 75 | val bits = new UnsafeBitArray(sz) 76 | try { 77 | included.foreach(bits.set) 78 | 79 | val bos = new ByteArrayOutputStream() 80 | val oos = new ObjectOutputStream(bos) 81 | oos.writeObject(bits) 82 | oos.close() 83 | val bis = new ByteArrayInputStream(bos.toByteArray) 84 | val ois = new ObjectInputStream(bis) 85 | val deserialized = ois.readObject() 86 | ois.close() 87 | 88 | deserialized should not be null 89 | deserialized should be(a[UnsafeBitArray]) 90 | val deserializedBits = deserialized.asInstanceOf[UnsafeBitArray] 91 | try { 92 | deserializedBits.numberOfBits should equal(bits.numberOfBits) 93 | forAll(0l until bits.numberOfBits) { idx => 94 | bits.get(idx) should equal(deserializedBits.get(idx)) 95 | } 96 | } finally { 97 | deserializedBits.dispose() 98 | } 99 | } finally bits.dispose() 100 | Prop.passed 101 | } 102 | } 103 | } -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | ## Bloom filter for Scala 2 | 3 | [![Build Status](https://travis-ci.org/alexandrnikitin/bloom-filter-scala.svg?branch=master)](https://travis-ci.org/alexandrnikitin/bloom-filter-scala) 4 | [![Maven Central](https://maven-badges.herokuapp.com/maven-central/com.github.alexandrnikitin/bloom-filter_2.11/badge.svg)](https://maven-badges.herokuapp.com/maven-central/com.github.alexandrnikitin/bloom-filter_2.11) 5 | 6 | ### Overview 7 | 8 | >"A Bloom filter is a space-efficient probabilistic data structure that is used to test whether an element is a member of a set. False positive matches are possible, but false negatives are not. In other words, a query returns either "possibly in set" or "definitely not in set". 
Elements can be added to the set, but not removed," says [Wikipedia][wiki-bloom-filter]. 9 | 10 | What's Bloom filter in a nutshell: 11 | 12 | - Optimization for memory. It comes into play when you cannot put whole set into memory. 13 | - Solves the membership problem. It can answer one question: does an element belong to a set or not? 14 | - Probabilistic (lossy) data structure. It can answer that an element **probably belongs** to a set with some probability. 15 | 16 | ### Getting Started 17 | 18 | ```scala 19 | libraryDependencies += "com.github.alexandrnikitin" %% "bloom-filter" % "latest.release" 20 | ``` 21 | 22 | ```scala 23 | // Create a Bloom filter 24 | val expectedElements = 1000000 25 | val falsePositiveRate = 0.1 26 | val bf = BloomFilter[String](expectedElements, falsePositiveRate) 27 | 28 | // Put an element 29 | bf.add(element) 30 | 31 | // Check whether an element in a set 32 | bf.mightContain(element) 33 | 34 | // Dispose the instance 35 | bf.dispose() 36 | ``` 37 | 38 | ### Motivation 39 | 40 | You can read about this Bloom filter and motivation behind in [my blog post][post] 41 | 42 | ### Benchmarks 43 | 44 | Here's a benchmark for the `String` type and results for other types are very similar to these: 45 | 46 | ``` 47 | [info] Benchmark (length) Mode Cnt Score Error Units 48 | [info] alternatives.algebird.StringItemBenchmark.algebirdGet 1024 thrpt 20 1181080.172 ▒ 9867.840 ops/s 49 | [info] alternatives.algebird.StringItemBenchmark.algebirdPut 1024 thrpt 20 157158.453 ▒ 844.623 ops/s 50 | [info] alternatives.breeze.StringItemBenchmark.breezeGet 1024 thrpt 20 5113222.168 ▒ 47005.466 ops/s 51 | [info] alternatives.breeze.StringItemBenchmark.breezePut 1024 thrpt 20 4482377.337 ▒ 19971.209 ops/s 52 | [info] alternatives.guava.StringItemBenchmark.guavaGet 1024 thrpt 20 5712237.339 ▒ 115453.495 ops/s 53 | [info] alternatives.guava.StringItemBenchmark.guavaPut 1024 thrpt 20 5621712.282 ▒ 307133.297 ops/s 54 | 55 | [info] 
package bloomfilter.mutable

import java.io.{DataInputStream, DataOutputStream, InputStream, OutputStream}

import bloomfilter.CanGenerateHashFrom

import scala.math._

/** A mutable Bloom filter backed by an off-heap [[UnsafeBitArray]].
  *
  * A single 64-bit hash of the item is split into two 32-bit halves that are
  * combined as `hash1 + i * hash2` to derive the `numberOfHashes` bit indices
  * (double hashing, Kirsch & Mitzenmacher style).
  *
  * The backing bit array lives off-heap, so callers must invoke [[dispose]]
  * to release the native memory when the filter is no longer needed.
  */
@SerialVersionUID(2L)
class BloomFilter[T] private (val numberOfBits: Long, val numberOfHashes: Int, private val bits: UnsafeBitArray)
    (implicit canGenerateHash: CanGenerateHashFrom[T]) extends Serializable {

  def this(numberOfBits: Long, numberOfHashes: Int)(implicit canGenerateHash: CanGenerateHashFrom[T]) {
    this(numberOfBits, numberOfHashes, new UnsafeBitArray(numberOfBits))
  }

  /** Adds `x` to the filter by setting `numberOfHashes` derived bits. */
  def add(x: T): Unit = {
    val hash = canGenerateHash.generateHash(x)
    val hash1 = hash >>> 32        // upper 32 bits of the 64-bit hash
    val hash2 = (hash << 32) >> 32 // lower 32 bits, sign-extended

    var i = 0
    while (i < numberOfHashes) {
      val computedHash = hash1 + i * hash2
      // Mask the sign bit so the modulo result is always a valid index.
      bits.set((computedHash & Long.MaxValue) % numberOfBits)
      i += 1
    }
  }

  /** Bitwise OR of two filters with identical geometry; the result may contain
    * any item added to either operand.
    */
  def union(that: BloomFilter[T]): BloomFilter[T] = {
    require(this.numberOfBits == that.numberOfBits && this.numberOfHashes == that.numberOfHashes,
      s"Union works only on BloomFilters with the same number of hashes and of bits")
    new BloomFilter[T](this.numberOfBits, this.numberOfHashes, this.bits | that.bits)
  }

  /** Bitwise AND of two filters with identical geometry. */
  def intersect(that: BloomFilter[T]): BloomFilter[T] = {
    require(this.numberOfBits == that.numberOfBits && this.numberOfHashes == that.numberOfHashes,
      s"Intersect works only on BloomFilters with the same number of hashes and of bits")
    new BloomFilter[T](this.numberOfBits, this.numberOfHashes, this.bits & that.bits)
  }

  /** Returns `false` when `x` was definitely never added; `true` means
    * "possibly present" (subject to the false-positive rate).
    */
  def mightContain(x: T): Boolean = {
    val hash = canGenerateHash.generateHash(x)
    val hash1 = hash >>> 32
    val hash2 = (hash << 32) >> 32
    var i = 0
    while (i < numberOfHashes) {
      val computedHash = hash1 + i * hash2
      if (!bits.get((computedHash & Long.MaxValue) % numberOfBits))
        return false
      i += 1
    }
    true
  }

  /** Current false-positive estimate: (set bits / total bits) ^ k. */
  def expectedFalsePositiveRate(): Double = {
    math.pow(bits.getBitCount.toDouble / numberOfBits, numberOfHashes.toDouble)
  }

  /** Writes geometry followed by raw bit data; read back with [[BloomFilter.readFrom]]. */
  def writeTo(out: OutputStream): Unit = {
    val dout = new DataOutputStream(out)
    dout.writeLong(numberOfBits)
    dout.writeInt(numberOfHashes)
    bits.writeTo(out)
  }

  /** Estimates the number of distinct items added from the fraction of set
    * bits: n ~= -(m/k) * ln(1 - X/m), with half-away-from-zero rounding on
    * exact .5 ties (rint alone rounds half-to-even).
    */
  def approximateElementCount(): Long = {
    val fractionOfBitsSet = bits.getBitCount.toDouble / numberOfBits
    val x = -log1p(-fractionOfBitsSet) * numberOfBits / numberOfHashes
    val z = rint(x)
    if (abs(x - z) == 0.5) {
      (x + Math.copySign(0.5, x)).toLong
    } else {
      z.toLong
    }
  }

  /** Releases the off-heap memory held by the underlying bit array. */
  def dispose(): Unit = bits.dispose()

}

object BloomFilter {

  /** Creates a filter sized for `numberOfItems` at the requested `falsePositiveRate`. */
  def apply[T](numberOfItems: Long, falsePositiveRate: Double)
      (implicit canGenerateHash: CanGenerateHashFrom[T]): BloomFilter[T] = {

    val nb = optimalNumberOfBits(numberOfItems, falsePositiveRate)
    val nh = optimalNumberOfHashes(numberOfItems, nb)
    new BloomFilter[T](nb, nh)
  }

  /** m = -n * ln(p) / (ln 2)^2, rounded up. */
  def optimalNumberOfBits(numberOfItems: Long, falsePositiveRate: Double): Long = {
    math.ceil(-1 * numberOfItems * math.log(falsePositiveRate) / math.log(2) / math.log(2)).toLong
  }

  /** k = (m / n) * ln 2, rounded up.
    *
    * FIX: the division must be performed in floating point. The previous
    * `numberOfBits / numberOfItems` was Long integer division, which truncates
    * m/n before multiplying by ln 2 and underestimates the optimal hash count
    * (e.g. m = 15, n = 10 gave ceil(1 * ln 2) = 1 instead of ceil(1.5 * ln 2) = 2).
    * Serialized filters are unaffected: readFrom restores the stored hash count.
    */
  def optimalNumberOfHashes(numberOfItems: Long, numberOfBits: Long): Int = {
    math.ceil(numberOfBits.toDouble / numberOfItems * math.log(2)).toInt
  }

  /** Deserializes a filter previously written with `writeTo`. */
  def readFrom[T](in: InputStream)(implicit canGenerateHash: CanGenerateHashFrom[T]): BloomFilter[T] = {
    val din = new DataInputStream(in)
    val numberOfBits = din.readLong()
    val numberOfHashes = din.readInt()
    val bits = new UnsafeBitArray(numberOfBits)
    bits.readFrom(in)
    new BloomFilter[T](numberOfBits, numberOfHashes, bits)
  }

}
package tests.bloomfilter.mutable._128bit

import java.io.{ByteArrayInputStream, ByteArrayOutputStream, ObjectInputStream, ObjectOutputStream}

import bloomfilter.{CanGenerate128HashFrom, CanGenerateHashFrom}
import bloomfilter.mutable._128bit.BloomFilter
import org.scalacheck.Arbitrary.arbitrary
import org.scalacheck.Test.Parameters
import org.scalacheck.commands.Commands
import org.scalacheck.{Arbitrary, Gen, Prop, Properties}
import org.scalatest.{Inspectors, Matchers}

/** Property-based tests for the 128-bit-hash BloomFilter variant:
  * stateful add/mightContain commands, Java serialization round-trip,
  * and the approximateElementCount estimate.
  */
class BloomFilterSpec extends Properties("BloomFilter_128bit") with Matchers with Inspectors {

  property("for Long") = new BloomFilterCommands[Long].property()
  property("for String") = new BloomFilterCommands[String].property()
  property("for Array[Byte]") = new BloomFilterCommands[Array[Byte]].property()


  override def overrideParameters(p: Parameters): Parameters = {
    super.overrideParameters(p).withMinSuccessfulTests(100)
  }

  /** ScalaCheck stateful spec: every added item must subsequently be reported
    * as possibly contained (a Bloom filter has no false negatives).
    */
  class BloomFilterCommands[T: Arbitrary](implicit canGenerateHash: CanGenerate128HashFrom[T]) extends Commands {
    type Sut = BloomFilter[T]

    case class State(expectedItems: Long, addedItems: Long)

    override def canCreateNewSut(
        newState: State,
        initSuts: Traversable[State],
        runningSuts: Traversable[Sut]): Boolean = {
      initSuts.isEmpty && runningSuts.isEmpty ||
        newState.addedItems > newState.expectedItems ||
        newState.addedItems > 100
    }

    override def destroySut(sut: Sut): Unit =
      sut.dispose()

    override def genInitialState: Gen[State] =
      Gen.chooseNum[Long](1, Int.MaxValue).map(State(_, 0))

    override def newSut(state: State): Sut =
      BloomFilter[T](state.expectedItems, 0.01)

    def initialPreCondition(state: State): Boolean = true

    // Every generated item is immediately added and then checked.
    def genCommand(state: State): Gen[Command] =
      for {
        item <- Arbitrary.arbitrary[T]
      } yield commandSequence(AddItem(item), CheckItem(item))

    case class AddItem(item: T) extends UnitCommand {
      def run(sut: Sut): Unit = sut.synchronized(sut.add(item))

      def nextState(state: State) = state.copy(addedItems = state.addedItems + 1)

      def preCondition(state: State) = true

      def postCondition(state: State, success: Boolean) = success
    }

    case class CheckItem(item: T) extends SuccessCommand {
      type Result = Boolean

      def run(sut: Sut): Boolean = sut.synchronized(sut.mightContain(item))

      def nextState(state: State) = state

      def preCondition(state: State) = true

      def postCondition(state: State, result: Boolean): Prop = result
    }

  }

  property("supports java serialization") = {
    val gen = Gen.listOf(Gen.posNum[Long])

    Prop.forAll(gen) { indices =>
      val sz = indices.size max 1
      val bf1 = BloomFilter[Long](sz, 0.01)
      try {
        indices foreach bf1.add
        val bos = new ByteArrayOutputStream
        val oos = new ObjectOutputStream(bos)
        oos.writeObject(bf1)
        oos.close()
        val bis = new ByteArrayInputStream(bos.toByteArray)
        val ois = new ObjectInputStream(bis)
        val deserialized = ois.readObject()
        deserialized should not be (null)
        deserialized should be (a[BloomFilter[Long]])
        val bf2 = deserialized.asInstanceOf[BloomFilter[Long]]
        try {
          bf2.numberOfBits shouldEqual bf1.numberOfBits
          // FIX: this previously compared bf1.numberOfHashes against itself,
          // an assertion that could never fail; the deserialized filter (bf2)
          // must be checked against the original.
          bf2.numberOfHashes shouldEqual bf1.numberOfHashes

          forAll(indices) { idx =>
            bf2.mightContain(idx) shouldBe true
          }
          Prop.passed
        } finally bf2.dispose()
      } finally bf1.dispose()
    }
  }

  private val elemsToAddGen = for {
    numberOfElemsToAdd <- Gen.chooseNum[Int](1, 1000)
    elemsToAdd <- Gen.listOfN(numberOfElemsToAdd, arbitrary[Long])
  } yield elemsToAdd

  // TODO fix elemsToAddGen.filter() below, why Gen.listOfN above generates empty lists?
  property("approximateElementCount") = Prop.forAll(elemsToAddGen.filter(x => x.size > 10 && x.toSet.size > 10)) { elemsToAdd: List[Long] =>
    val bf = BloomFilter[Long](elemsToAdd.size * 10, 0.0001)
    elemsToAdd.foreach(bf.add)
    val numberOfUnique = elemsToAdd.toSet.size
    // Estimate must be within 10% of the true distinct count.
    math.abs(bf.approximateElementCount() - numberOfUnique) < numberOfUnique * 0.1
  }

}
package tests.bloomfilter.mutable

import java.io.{ByteArrayInputStream, ByteArrayOutputStream, ObjectInputStream, ObjectOutputStream}

import bloomfilter.CanGenerateHashFrom
import bloomfilter.mutable.CuckooFilter
import org.scalacheck.Test.Parameters
import org.scalacheck.commands.Commands
import org.scalacheck.{Arbitrary, Gen, Prop, Properties}
import org.scalatest.{Inspectors, Matchers}

/** Property-based tests for CuckooFilter: stateful add/check/remove commands,
  * two regression cases for bucket overflow, and a Java serialization round-trip.
  */
class CuckooFilterSpec extends Properties("CuckooFilter") with Matchers with Inspectors {

  property("for Long") = new CuckooFilterCommands[Long].property()
  property("for String") = new CuckooFilterCommands[String].property()
  property("for Array[Byte]") = new CuckooFilterCommands[Array[Byte]].property()


  override def overrideParameters(p: Parameters): Parameters = {
    super.overrideParameters(p).withMinSuccessfulTests(100)
  }

  /** ScalaCheck stateful spec: items are added, checked, then removed; all
    * preconditions keep addedItems below the filter's declared capacity.
    */
  class CuckooFilterCommands[T: Arbitrary](implicit canGenerateHash: CanGenerateHashFrom[T]) extends Commands {
    type Sut = CuckooFilter[T]

    case class State(expectedItems: Long, addedItems: Long)

    override def canCreateNewSut(
        newState: State,
        initSuts: Traversable[State],
        runningSuts: Traversable[Sut]): Boolean = {
      initSuts.isEmpty && runningSuts.isEmpty
    }

    // Filters allocate off-heap memory; release it when the SUT is retired.
    override def destroySut(sut: Sut): Unit =
      sut.dispose()

    override def genInitialState: Gen[State] =
      Gen.chooseNum[Long](1, 100000).map(State(_, 0))

    override def newSut(state: State): Sut =
      CuckooFilter[T](state.expectedItems)

    def initialPreCondition(state: State): Boolean = true

    // Each generated item is added, checked, and removed in sequence.
    def genCommand(state: State): Gen[Command] =
      for {
        item <- Arbitrary.arbitrary[T]
      } yield commandSequence(AddItem(item), CheckItem(item), RemoveItem(item))

    case class AddItem(item: T) extends UnitCommand {
      def run(sut: Sut): Unit = sut.synchronized(sut.add(item))
      def nextState(state: State): State = state.copy(addedItems = state.addedItems + 1)
      def preCondition(state: State): Boolean = state.addedItems < state.expectedItems
      def postCondition(state: State, success: Boolean): Prop = success
    }

    case class RemoveItem(item: T) extends SuccessCommand {
      type Result = Boolean

      // After removal the item must no longer be reported as contained.
      def run(sut: Sut): Boolean = sut.synchronized {
        sut.remove(item)
        !sut.mightContain(item)
      }
      def nextState(state: State): State = state.copy(addedItems = state.addedItems - 1)
      def preCondition(state: State): Boolean = state.addedItems < state.expectedItems
      def postCondition(state: State, success: Boolean): Prop = success
    }

    case class CheckItem(item: T) extends SuccessCommand {
      type Result = Boolean
      def run(sut: Sut): Boolean = sut.synchronized(sut.mightContain(item))
      def nextState(state: State): State = state
      def preCondition(state: State): Boolean = state.addedItems < state.expectedItems
      def postCondition(state: State, result: Boolean): Prop = result
    }

  }

  // Regression: duplicate-free minimal case that previously exposed a defect.
  property("strange case") = Prop {
    val lst = List(-1l, 0l)
    val cf = CuckooFilter[Long](lst.size)
    lst.foreach(cf.add)

    lst.forall(cf.mightContain)
  }

  property("strange case #2") = Prop {
    val lst = List(0l, 0, 0, 0, 0, 0, 0, 0, 4)
    //the x3 size factor here enables 4 to end up in a different bucket than the 3 0's, their bucket overflows after the first four inserts
    val cf = CuckooFilter[Long](lst.size * 3)
    lst.foreach(cf.add)

    lst.forall(cf.mightContain)
  }

  property("supports java serialization") = {
    val gen = Gen.listOf(Arbitrary.arbLong.arbitrary)
    Prop.forAll(gen) { lst =>
      val sz = math.max(lst.size, 1)
      //we add n x3 factor to reduce probability for buckets overflowing during inserts
      val sut = CuckooFilter[Long](sz * 3)
      try {
        lst foreach sut.add
        val bos = new ByteArrayOutputStream
        val oos = new ObjectOutputStream(bos)
        oos.writeObject(sut)
        oos.close()
        val bis = new ByteArrayInputStream(bos.toByteArray)
        val ois = new ObjectInputStream(bis)
        val deserialized = ois.readObject()
        ois.close()

        deserialized should not be null
        deserialized should be(a[CuckooFilter[Long]])
        val sut2 = deserialized.asInstanceOf[CuckooFilter[Long]]
        try {
          forAll(lst) { k =>
            withClue(k) {
              //we use a relaxed condition here,
              //the reason for this is potential (and actual) buckets overflowing in the underlying UnsafeTable16.
              //a different approach might be generating the keys in a way that limits them according to number of buckets and number of tags in each bucket.
              sut2.mightContain(k) shouldEqual sut.mightContain(k)
            }
          }
          Prop.passed
        } finally sut2.dispose()
      } finally sut.dispose()
    }
  }

}
package bloomfilter.hashing

import java.lang.Long.rotateLeft

import bloomfilter.CanGetDataFrom

/** MurmurHash3 x64 variants generic over the input source via [[CanGetDataFrom]].
  *
  * NOTE(review): this appears to be a port of the reference MurmurHash3
  * (Austin Appleby / the Yonik Java port); the tail is handled by a cascade of
  * `if`s that mirrors the reference implementation's fall-through `switch`,
  * so statement order is semantically significant — do not reorder.
  *
  * The 128-bit and 64-bit methods deliberately duplicate the loop instead of
  * one delegating to the other: returning a (Long, Long) tuple from the shared
  * path would allocate on every call in this hot path.
  */
object MurmurHash3Generic {

  private val c1: Long = 0x87c37b91114253d5L
  private val c2: Long = 0x4cf5ad432745937fL

  /** Final avalanche mix of a 64-bit lane. */
  def fmix64(l: Long): Long = {
    var k = l
    k ^= k >>> 33
    k *= 0xff51afd7ed558ccdL
    k ^= k >>> 33
    k *= 0xc4ceb9fe1a85ec53L
    k ^= k >>> 33
    k
  }

  /** 128-bit hash of `len` bytes of `key` starting at `offset`, as two Longs. */
  def murmurhash3_x64_128[From](key: From, offset: Int, len: Int, seed: Int)(implicit cgdf: CanGetDataFrom[From]): (Long, Long) = {
    var h1: Long = seed & 0x00000000FFFFFFFFL
    var h2: Long = seed & 0x00000000FFFFFFFFL

    val roundedEnd = offset + (len & 0xFFFFFFF0); // round down to 16 byte block

    // Body: consume full 16-byte blocks as two little-endian Longs.
    var i = offset
    while (i < roundedEnd) {
      var k1 = cgdf.getLong(key, i)
      var k2 = cgdf.getLong(key, i + 8)
      k1 *= c1; k1 = rotateLeft(k1, 31); k1 *= c2; h1 ^= k1
      h1 = rotateLeft(h1, 27); h1 += h2; h1 = h1 * 5 + 0x52dce729
      k2 *= c2; k2 = rotateLeft(k2, 33); k2 *= c1; h2 ^= k2
      h2 = rotateLeft(h2, 31); h2 += h1; h2 = h2 * 5 + 0x38495ab5

      i += 16
    }

    var k1: Long = 0
    var k2: Long = 0

    // Tail: the remaining 0..15 bytes; each `if` emulates one fall-through
    // case of the reference implementation's switch.
    val lenVar = len & 15
    if (lenVar == 15) k2 = (cgdf.getByte(key, roundedEnd + 14) & 0xffL) << 48
    if (lenVar >= 14) k2 |= (cgdf.getByte(key, roundedEnd + 13) & 0xffL) << 40
    if (lenVar >= 13) k2 |= (cgdf.getByte(key, roundedEnd + 12) & 0xffL) << 32
    if (lenVar >= 12) k2 |= (cgdf.getByte(key, roundedEnd + 11) & 0xffL) << 24
    if (lenVar >= 11) k2 |= (cgdf.getByte(key, roundedEnd + 10) & 0xffL) << 16
    if (lenVar >= 10) k2 |= (cgdf.getByte(key, roundedEnd + 9) & 0xffL) << 8
    if (lenVar >= 9) {
      k2 |= (cgdf.getByte(key, roundedEnd + 8) & 0xffL)
      k2 *= c2
      k2 = rotateLeft(k2, 33)
      k2 *= c1
      h2 ^= k2
    }
    // No mask needed on the top byte: << 56 discards any sign-extension bits.
    if (lenVar >= 8) k1 = cgdf.getByte(key, roundedEnd + 7).toLong << 56
    if (lenVar >= 7) k1 |= (cgdf.getByte(key, roundedEnd + 6) & 0xffL) << 48
    if (lenVar >= 6) k1 |= (cgdf.getByte(key, roundedEnd + 5) & 0xffL) << 40
    if (lenVar >= 5) k1 |= (cgdf.getByte(key, roundedEnd + 4) & 0xffL) << 32
    if (lenVar >= 4) k1 |= (cgdf.getByte(key, roundedEnd + 3) & 0xffL) << 24
    if (lenVar >= 3) k1 |= (cgdf.getByte(key, roundedEnd + 2) & 0xffL) << 16
    if (lenVar >= 2) k1 |= (cgdf.getByte(key, roundedEnd + 1) & 0xffL) << 8
    if (lenVar >= 1) {
      k1 |= (cgdf.getByte(key, roundedEnd) & 0xffL)
      k1 *= c1
      k1 = rotateLeft(k1, 31)
      k1 *= c2
      h1 ^= k1
    }

    // Finalization.
    h1 ^= len; h2 ^= len

    h1 += h2
    h2 += h1

    h1 = fmix64(h1)
    h2 = fmix64(h2)

    h1 += h2
    h2 += h1

    (h1, h2)
  }

  /** 64-bit hash: identical computation to the 128-bit variant, returning the
    * sum of the two final lanes instead of the pair (duplicated to avoid a
    * tuple allocation per call).
    */
  def murmurhash3_x64_64[From](key: From, offset: Int, len: Int, seed: Int)(implicit cgdf: CanGetDataFrom[From]): Long = {
    var h1: Long = seed & 0x00000000FFFFFFFFL
    var h2: Long = seed & 0x00000000FFFFFFFFL

    val roundedEnd = offset + (len & 0xFFFFFFF0); // round down to 16 byte block

    var i = offset
    while (i < roundedEnd) {
      var k1 = cgdf.getLong(key, i)
      var k2 = cgdf.getLong(key, i + 8)
      k1 *= c1; k1 = rotateLeft(k1, 31); k1 *= c2; h1 ^= k1
      h1 = rotateLeft(h1, 27); h1 += h2; h1 = h1 * 5 + 0x52dce729
      k2 *= c2; k2 = rotateLeft(k2, 33); k2 *= c1; h2 ^= k2
      h2 = rotateLeft(h2, 31); h2 += h1; h2 = h2 * 5 + 0x38495ab5

      i += 16
    }

    var k1: Long = 0
    var k2: Long = 0

    val lenVar = len & 15
    if (lenVar == 15) k2 = (cgdf.getByte(key, roundedEnd + 14) & 0xffL) << 48
    if (lenVar >= 14) k2 |= (cgdf.getByte(key, roundedEnd + 13) & 0xffL) << 40
    if (lenVar >= 13) k2 |= (cgdf.getByte(key, roundedEnd + 12) & 0xffL) << 32
    if (lenVar >= 12) k2 |= (cgdf.getByte(key, roundedEnd + 11) & 0xffL) << 24
    if (lenVar >= 11) k2 |= (cgdf.getByte(key, roundedEnd + 10) & 0xffL) << 16
    if (lenVar >= 10) k2 |= (cgdf.getByte(key, roundedEnd + 9) & 0xffL) << 8
    if (lenVar >= 9) {
      k2 |= (cgdf.getByte(key, roundedEnd + 8) & 0xffL)
      k2 *= c2
      k2 = rotateLeft(k2, 33)
      k2 *= c1
      h2 ^= k2
    }
    if (lenVar >= 8) k1 = cgdf.getByte(key, roundedEnd + 7).toLong << 56
    if (lenVar >= 7) k1 |= (cgdf.getByte(key, roundedEnd + 6) & 0xffL) << 48
    if (lenVar >= 6) k1 |= (cgdf.getByte(key, roundedEnd + 5) & 0xffL) << 40
    if (lenVar >= 5) k1 |= (cgdf.getByte(key, roundedEnd + 4) & 0xffL) << 32
    if (lenVar >= 4) k1 |= (cgdf.getByte(key, roundedEnd + 3) & 0xffL) << 24
    if (lenVar >= 3) k1 |= (cgdf.getByte(key, roundedEnd + 2) & 0xffL) << 16
    if (lenVar >= 2) k1 |= (cgdf.getByte(key, roundedEnd + 1) & 0xffL) << 8
    if (lenVar >= 1) {
      k1 |= (cgdf.getByte(key, roundedEnd) & 0xffL)
      k1 *= c1
      k1 = rotateLeft(k1, 31)
      k1 *= c2
      h1 ^= k1
    }

    h1 ^= len; h2 ^= len

    h1 += h2
    h2 += h1

    h1 = fmix64(h1)
    h2 = fmix64(h2)

    h1 += h2
    h2 += h1

    h1 + h2
  }
}
package sandbox.hashing;

import java.nio.ByteBuffer;

/**
 * Sandbox copy of MurmurHash implementations over a ByteBuffer.
 * NOTE(review): presumably copied from Apache Cassandra's MurmurHash utility
 * (hash32 = MurmurHash2 32-bit, hash2_64 = MurmurHash2 64-bit,
 * hash3_x64_128 = MurmurHash3 x64 128-bit) -- keep bit-identical to upstream.
 * The switch statements intentionally fall through; do not add breaks.
 */
public class CassandraMurmurHash
{
    /** MurmurHash2-style 32-bit hash of length bytes of data starting at offset. */
    public static int hash32(ByteBuffer data, int offset, int length, int seed)
    {
        int m = 0x5bd1e995;
        int r = 24;

        int h = seed ^ length;

        int len_4 = length >> 2;

        // Body: consume full 4-byte little-endian blocks.
        for (int i = 0; i < len_4; i++)
        {
            int i_4 = i << 2;
            int k = data.get(offset + i_4 + 3);
            k = k << 8;
            k = k | (data.get(offset + i_4 + 2) & 0xff);
            k = k << 8;
            k = k | (data.get(offset + i_4 + 1) & 0xff);
            k = k << 8;
            k = k | (data.get(offset + i_4 + 0) & 0xff);
            k *= m;
            k ^= k >>> r;
            k *= m;
            h *= m;
            h ^= k;
        }

        // avoid calculating modulo
        int len_m = len_4 << 2;
        int left = length - len_m;

        // Tail: mix in the remaining 0..3 bytes.
        if (left != 0)
        {
            if (left >= 3)
            {
                h ^= (int) data.get(offset + length - 3) << 16;
            }
            if (left >= 2)
            {
                h ^= (int) data.get(offset + length - 2) << 8;
            }
            if (left >= 1)
            {
                h ^= (int) data.get(offset + length - 1);
            }

            h *= m;
        }

        // Final avalanche.
        h ^= h >>> 13;
        h *= m;
        h ^= h >>> 15;

        return h;
    }

    /** MurmurHash2-style 64-bit hash of length bytes of key starting at offset. */
    public static long hash2_64(ByteBuffer key, int offset, int length, long seed)
    {
        long m64 = 0xc6a4a7935bd1e995L;
        int r64 = 47;

        long h64 = (seed & 0xffffffffL) ^ (m64 * length);

        int lenLongs = length >> 3;

        // Body: consume full 8-byte little-endian blocks.
        for (int i = 0; i < lenLongs; ++i)
        {
            int i_8 = i << 3;

            long k64 =  ((long)  key.get(offset+i_8+0) & 0xff)      + (((long) key.get(offset+i_8+1) & 0xff)<<8)  +
                        (((long) key.get(offset+i_8+2) & 0xff)<<16) + (((long) key.get(offset+i_8+3) & 0xff)<<24) +
                        (((long) key.get(offset+i_8+4) & 0xff)<<32) + (((long) key.get(offset+i_8+5) & 0xff)<<40) +
                        (((long) key.get(offset+i_8+6) & 0xff)<<48) + (((long) key.get(offset+i_8+7) & 0xff)<<56);

            k64 *= m64;
            k64 ^= k64 >>> r64;
            k64 *= m64;

            h64 ^= k64;
            h64 *= m64;
        }

        int rem = length & 0x7;

        // Tail: intentional fall-through mixes in the remaining 0..7 bytes.
        switch (rem)
        {
        case 0:
            break;
        case 7:
            h64 ^= (long) key.get(offset + length - rem + 6) << 48;
        case 6:
            h64 ^= (long) key.get(offset + length - rem + 5) << 40;
        case 5:
            h64 ^= (long) key.get(offset + length - rem + 4) << 32;
        case 4:
            h64 ^= (long) key.get(offset + length - rem + 3) << 24;
        case 3:
            h64 ^= (long) key.get(offset + length - rem + 2) << 16;
        case 2:
            h64 ^= (long) key.get(offset + length - rem + 1) << 8;
        case 1:
            h64 ^= (long) key.get(offset + length - rem);
            h64 *= m64;
        }

        // Final avalanche.
        h64 ^= h64 >>> r64;
        h64 *= m64;
        h64 ^= h64 >>> r64;

        return h64;
    }

    /** Reads the index-th 8-byte little-endian block relative to offset. */
    protected static long getblock(ByteBuffer key, int offset, int index)
    {
        int i_8 = index << 3;
        return ((long) key.get(offset + i_8 + 0) & 0xff) + (((long) key.get(offset + i_8 + 1) & 0xff) << 8) +
               (((long) key.get(offset + i_8 + 2) & 0xff) << 16) + (((long) key.get(offset + i_8 + 3) & 0xff) << 24) +
               (((long) key.get(offset + i_8 + 4) & 0xff) << 32) + (((long) key.get(offset + i_8 + 5) & 0xff) << 40) +
               (((long) key.get(offset + i_8 + 6) & 0xff) << 48) + (((long) key.get(offset + i_8 + 7) & 0xff) << 56);
    }

    /** 64-bit rotate left. */
    protected static long rotl64(long v, int n)
    {
        return ((v << n) | (v >>> (64 - n)));
    }

    /** MurmurHash3 64-bit finalization mix. */
    protected static long fmix(long k)
    {
        k ^= k >>> 33;
        k *= 0xff51afd7ed558ccdL;
        k ^= k >>> 33;
        k *= 0xc4ceb9fe1a85ec53L;
        k ^= k >>> 33;

        return k;
    }

    /** MurmurHash3 x64 128-bit hash; returns the two 64-bit halves. */
    public static long[] hash3_x64_128(ByteBuffer key, int offset, int length, long seed)
    {
        final int nblocks = length >> 4; // Process as 128-bit blocks.

        long h1 = seed;
        long h2 = seed;

        long c1 = 0x87c37b91114253d5L;
        long c2 = 0x4cf5ad432745937fL;

        //----------
        // body

        for(int i = 0; i < nblocks; i++)
        {
            long k1 = getblock(key, offset, i*2+0);
            long k2 = getblock(key, offset, i*2+1);

            k1 *= c1; k1 = rotl64(k1,31); k1 *= c2; h1 ^= k1;

            h1 = rotl64(h1,27); h1 += h2; h1 = h1*5+0x52dce729;

            k2 *= c2; k2 = rotl64(k2,33); k2 *= c1; h2 ^= k2;

            h2 = rotl64(h2,31); h2 += h1; h2 = h2*5+0x38495ab5;
        }

        //----------
        // tail

        // Advance offset to the unprocessed tail of the data.
        offset += nblocks * 16;

        long k1 = 0;
        long k2 = 0;

        // Intentional fall-through: each case accumulates one tail byte.
        switch(length & 15)
        {
        case 15: k2 ^= ((long) key.get(offset+14)) << 48;
        case 14: k2 ^= ((long) key.get(offset+13)) << 40;
        case 13: k2 ^= ((long) key.get(offset+12)) << 32;
        case 12: k2 ^= ((long) key.get(offset+11)) << 24;
        case 11: k2 ^= ((long) key.get(offset+10)) << 16;
        case 10: k2 ^= ((long) key.get(offset+9)) << 8;
        case  9: k2 ^= ((long) key.get(offset+8)) << 0;
            k2 *= c2; k2 = rotl64(k2,33); k2 *= c1; h2 ^= k2;

        case  8: k1 ^= ((long) key.get(offset+7)) << 56;
        case  7: k1 ^= ((long) key.get(offset+6)) << 48;
        case  6: k1 ^= ((long) key.get(offset+5)) << 40;
        case  5: k1 ^= ((long) key.get(offset+4)) << 32;
        case  4: k1 ^= ((long) key.get(offset+3)) << 24;
        case  3: k1 ^= ((long) key.get(offset+2)) << 16;
        case  2: k1 ^= ((long) key.get(offset+1)) << 8;
        case  1: k1 ^= ((long) key.get(offset));
            k1 *= c1; k1 = rotl64(k1,31); k1 *= c2; h1 ^= k1;
        };

        //----------
        // finalization

        h1 ^= length; h2 ^= length;

        h1 += h2;
        h2 += h1;

        h1 = fmix(h1);
        h2 = fmix(h2);

        h1 += h2;
        h2 += h1;

        return(new long[] {h1, h2});
    }
}
package tests.bloomfilter.mutable

import java.io.{ByteArrayInputStream, ByteArrayOutputStream, ObjectInputStream, ObjectOutputStream}

import bloomfilter.mutable.{UnsafeTable, UnsafeTable16Bit, UnsafeTable8Bit}
import org.scalacheck.Test.Parameters
import org.scalacheck.commands.Commands
import org.scalacheck.{Arbitrary, Gen, Prop, Properties}
import org.scalatest.{Matchers, PrivateMethodTester}

/** Property-based tests for the off-heap UnsafeTable implementations:
  * raw tag read/write round-trips, insert/find, and Java serialization
  * (including a reflective check that deserialization allocates fresh
  * native memory via the private `ptr` field).
  */
class UnsafeTableSpec extends Properties("UnsafeTableSpec") with Matchers with PrivateMethodTester {

  property("writeTag & readTag") = new UnsafeTableCommands().property()

  // TODO Sometimes fails when trying to add 5 elements to one bucket. It fails correctly. It shouldn't add 5 elemns. Scalacheck issue? Investigate
  property("insert & find") = new UnsafeTableInsertFindCommands().property()

  override def overrideParameters(p: Parameters): Parameters = {
    super.overrideParameters(p).withMinSuccessfulTests(100)
  }

  /** Stateful spec: a tag written at (bucket, slot) must read back identically. */
  class UnsafeTableCommands extends Commands {
    type Sut = UnsafeTable8Bit

    case class State(numberOfBuckets: Long, addedItems: Long)

    override def canCreateNewSut(
        newState: State,
        initSuts: Traversable[State],
        runningSuts: Traversable[Sut]): Boolean =
      (initSuts.isEmpty && runningSuts.isEmpty) ||
        newState.addedItems >= newState.numberOfBuckets || newState.addedItems >= 4


    // Tables allocate off-heap memory; release it when the SUT is retired.
    override def destroySut(sut: Sut): Unit =
      sut.dispose()

    override def genInitialState: Gen[State] =
      Gen.chooseNum[Long](1, /*Int.MaxValue * 2L*/ 1000).map(State(_, 0))

    override def newSut(state: State): Sut =
      new UnsafeTable8Bit(state.numberOfBuckets)

    def initialPreCondition(state: State): Boolean = true

    // tagIndex in 0..3: a bucket holds four tags (see TagsPerBucket usage below).
    def genCommand(state: State): Gen[Command] =
      for {
        index <- Gen.choose[Long](0, state.numberOfBuckets - 1)
        tagIndex <- Gen.choose[Int](0, 3)
        tag <- Gen.choose[Byte](0, Byte.MaxValue)
      } yield commandSequence(WriteTag(index, tagIndex, tag), ReadTag(index, tagIndex, tag))

    case class WriteTag(index: Long, tagIndex: Int, tag: Byte) extends UnitCommand {
      def run(sut: Sut): Unit = sut.synchronized(sut.writeTag(index, tagIndex, tag))

      def nextState(state: State): State = state.copy(addedItems = state.addedItems + 1)

      def preCondition(state: State): Boolean = state.addedItems < state.numberOfBuckets || state.addedItems < 4

      def postCondition(state: State, success: Boolean): Prop = success
    }

    case class ReadTag(index: Long, tagIndex: Int, tag: Byte) extends SuccessCommand {
      type Result = Boolean

      def run(sut: Sut): Boolean = sut.synchronized(sut.readTag(index, tagIndex) == tag)

      def nextState(state: State): State = state

      def preCondition(state: State): Boolean = state.addedItems < state.numberOfBuckets || state.addedItems < 4

      def postCondition(state: State, result: Boolean): Prop = result
    }

  }

  /** Stateful spec: an inserted tag must be findable; bucket population is
    * tracked in the model so inserts never exceed a bucket's capacity.
    */
  class UnsafeTableInsertFindCommands extends Commands {
    type Sut = UnsafeTable8Bit

    case class State(numberOfBuckets: Long, addedItems: Long, bucketsPopulation: Map[Long, Int])

    override def canCreateNewSut(
        newState: State,
        initSuts: Traversable[State],
        runningSuts: Traversable[Sut]): Boolean =
      (initSuts.isEmpty && runningSuts.isEmpty) ||
        newState.addedItems >= newState.numberOfBuckets || newState.addedItems >= 4

    override def destroySut(sut: Sut): Unit =
      sut.dispose()

    override def genInitialState: Gen[State] =
      Gen.chooseNum[Long](1, /*Int.MaxValue * 2L*/ 1000).map(State(_, 0, Map.empty))

    override def newSut(state: State): Sut =
      new UnsafeTable8Bit(state.numberOfBuckets)

    def initialPreCondition(state: State): Boolean = true

    def genCommand(state: State): Gen[Command] =
      for {
        index <- Gen.choose[Long](0, state.numberOfBuckets - 1)
        tag <- Gen.choose[Byte](0, Byte.MaxValue)
      } yield commandSequence(Insert(index, tag), Find(index, tag))

    case class Insert(index: Long, tag: Byte) extends UnitCommand {
      def run(sut: Sut): Unit = sut.synchronized(sut.insert(index, tag))

      def nextState(state: State): State = {
        val nextBucketsPopulation = state.bucketsPopulation.updated(index, prevBucketPopulation(state) + 1)
        state.copy(addedItems = state.addedItems + 1, bucketsPopulation = nextBucketsPopulation)
      }

      def prevBucketPopulation(state: State): Int = state.bucketsPopulation.getOrElse(index, 0)

      // Only insert while the target bucket has a free slot.
      def preCondition(state: State): Boolean =
        (prevBucketPopulation(state) < UnsafeTable8Bit.TagsPerBucket) &&
          (state.addedItems < state.numberOfBuckets || state.addedItems < 4)

      def postCondition(state: State, success: Boolean): Prop = success
    }

    case class Find(index: Long, tag: Byte) extends SuccessCommand {
      type Result = Boolean
      def run(sut: Sut): Boolean = sut.synchronized(sut.find(index, tag))
      def nextState(state: State): State = state
      def preCondition(state: State): Boolean = state.addedItems < state.numberOfBuckets || state.addedItems < 4
      def postCondition(state: State, result: Boolean): Prop = result
    }

  }

  // Structural refinement: either table class exposes readTag with this shape.
  type UnsafeTableEx = UnsafeTable {
    def readTag(bucketIndex: Long, tagIndex: Int): Long
  }

  /** Shared serialization property: round-trip through Java serialization must
    * allocate new native memory (ptr differs, non-zero) and preserve every tag.
    */
  def serializationProp(mkTable: Long => UnsafeTableEx): Prop = {
    val gen = for {
      numBuckets <- Gen.posNum[Int]
      numPopulated <- Gen.choose(0, numBuckets)
      m <- Gen.mapOfN(numPopulated, Gen.zip(Gen.choose(0, numBuckets - 1), Arbitrary.arbByte.arbitrary))
    } yield {
      numBuckets -> m
    }
    // Reflective accessor for the private off-heap pointer field.
    val ptrAccessor = PrivateMethod[Long]('ptr)

    // NOTE(review): parameter name "unsaffeTable" is a pre-existing typo; local only.
    def ptrOf(unsaffeTable: UnsafeTable) = unsaffeTable invokePrivate ptrAccessor()

    Prop.forAllNoShrink(gen) { case (numBuckets, tags) =>
      val sut = mkTable(numBuckets)
      try {
        tags.foreach { case (idx, tag) => sut.insert(idx, tag) }

        val bos = new ByteArrayOutputStream
        val oos = new ObjectOutputStream(bos)
        oos.writeObject(sut)
        oos.close()
        val bis = new ByteArrayInputStream(bos.toByteArray)
        val ois = new ObjectInputStream(bis)
        val deserialized = ois.readObject()
        ois.close()

        deserialized should not be null
        deserialized should be(a[UnsafeTable])
        deserialized should have('class (sut.getClass))
        val sut2 = deserialized.asInstanceOf[UnsafeTableEx]
        // Deserialization must allocate its own native buffer.
        ptrOf(sut2) should not be 0
        ptrOf(sut2) should not equal ptrOf(sut)
        try {
          // Every slot of every bucket must match the original table.
          for {
            idx <- 0 until numBuckets
            tagIdx <- 0 until UnsafeTable8Bit.TagsPerBucket
          } {
            sut.readTag(idx, tagIdx) shouldEqual sut2.readTag(idx, tagIdx)
          }
          Prop.passed
        } finally sut2.dispose()
      } finally sut.dispose()
    }
  }

  property("UnsafeTable8Bit supports java serialization") = serializationProp(new UnsafeTable8Bit(_))
  property("UnsafeTable16Bit supports java serialization") = serializationProp(new UnsafeTable16Bit(_))

}
UnsafeTable.SerializedForm(bytesPerBucket, numberOfBuckets, this) 46 | 47 | def writeTo(out: OutputStream): Unit 48 | def readFrom(in: InputStream): Unit 49 | } 50 | 51 | object UnsafeTable { 52 | 53 | @SerialVersionUID(1L) 54 | private class SerializedForm(bytesPerBucket: Int, numberOfBuckets: Long, @transient var unsafeTable: UnsafeTable) extends Serializable { 55 | private def writeObject(oos: ObjectOutputStream): Unit = { 56 | oos.defaultWriteObject() 57 | unsafeTable.writeTo(oos) 58 | } 59 | 60 | private def readObject(ois: ObjectInputStream): Unit = { 61 | ois.defaultReadObject() 62 | unsafeTable = bytesPerBucket match { 63 | case 8 => new UnsafeTable8Bit(numberOfBuckets) 64 | case 16 => new UnsafeTable16Bit(numberOfBuckets) 65 | } 66 | unsafeTable.readFrom(ois) 67 | } 68 | 69 | @throws(classOf[java.io.ObjectStreamException]) 70 | private def readResolve: AnyRef = unsafeTable 71 | } 72 | 73 | } 74 | 75 | @SerialVersionUID(1L) 76 | class UnsafeTable8Bit(val numberOfBuckets: Long) extends UnsafeTable with Serializable { 77 | 78 | import UnsafeTable8Bit._ 79 | 80 | private val ptr = unsafe.allocateMemory(bytesPerBucket * numberOfBuckets) 81 | unsafe.setMemory(ptr, bytesPerBucket * numberOfBuckets, 0.toByte) 82 | 83 | def readTag(bucketIndex: Long, tagIndex: Int): Long = { 84 | val p = ptr + bucketIndex * bytesPerBucket + tagIndex 85 | val tag = unsafe.getByte(p) 86 | tag & tagMask 87 | } 88 | 89 | def writeTag(i: Long, j: Int, t: Long): Unit = { 90 | val p = ptr + i * bytesPerBucket 91 | val tag = t & tagMask 92 | unsafe.putByte(p + j, tag.toByte) 93 | } 94 | 95 | def insert(index: Long, tag: Long): Boolean = { 96 | var tagIndex = 0 97 | while (tagIndex < TagsPerBucket) { 98 | if (readTag(index, tagIndex) == EmptyTag) { 99 | writeTag(index, tagIndex, tag) 100 | return true 101 | } 102 | tagIndex += 1 103 | } 104 | 105 | false 106 | } 107 | 108 | def swapAny(index: Long, tag: Long): Long = { 109 | var tagIndex = 0 110 | while (tagIndex < TagsPerBucket) { 111 | if 
(readTag(index, tagIndex) == EmptyTag) { 112 | writeTag(index, tagIndex, tag) 113 | return EmptyTag 114 | } 115 | tagIndex += 1 116 | } 117 | 118 | random += 1 119 | val r = random & (TagsPerBucket - 1) 120 | val tagToSwap = readTag(index, r) 121 | writeTag(index, r, tag) 122 | tagToSwap 123 | } 124 | 125 | def remove(index: Long, tag: Long): Boolean = { 126 | var tagIndex = 0 127 | while (tagIndex < TagsPerBucket) { 128 | if (readTag(index, tagIndex) == tag) { 129 | writeTag(index, tagIndex, EmptyTag) 130 | return true 131 | } 132 | tagIndex += 1 133 | } 134 | false 135 | } 136 | 137 | def find(index: Long, tag: Long): Boolean = { 138 | var i = 0 139 | while (i < TagsPerBucket) { 140 | val tag1 = readTag(index, i) 141 | if (tag1 == tag) { 142 | return true 143 | } 144 | i += 1 145 | } 146 | false 147 | } 148 | 149 | def writeTo(out: OutputStream): Unit = { 150 | writePtrTo(out, ptr, bytesPerBucket * numberOfBuckets) 151 | } 152 | 153 | def readFrom(in: InputStream): Unit = { 154 | readPtrFrom(in, ptr, bytesPerBucket * numberOfBuckets) 155 | } 156 | 157 | def dispose(): Unit = unsafe.freeMemory(ptr) 158 | 159 | @throws(classOf[java.io.ObjectStreamException]) 160 | private def writeReplace: AnyRef = toSerializedForm(8, numberOfBuckets) 161 | } 162 | 163 | object UnsafeTable8Bit { 164 | val EmptyTag = 0L 165 | val BitsPerItem = 8 166 | val TagsPerBucket = 4 167 | private var random = 0 168 | private val bytesPerBucket = (BitsPerItem * TagsPerBucket + 7) >> 3 169 | private val tagMask = (1L << BitsPerItem) - 1 170 | } 171 | 172 | 173 | @SerialVersionUID(1) 174 | class UnsafeTable16Bit(val numberOfBuckets: Long) extends UnsafeTable with Serializable { 175 | 176 | import UnsafeTable16Bit._ 177 | 178 | private val ptr = unsafe.allocateMemory(bytesPerBucket * numberOfBuckets) 179 | unsafe.setMemory(ptr, bytesPerBucket * numberOfBuckets, 0.toByte) 180 | 181 | def readTag(bucketIndex: Long, tagIndex: Int): Long = { 182 | val p = ptr + bucketIndex * bytesPerBucket + 
(tagIndex << 1) 183 | val tag = unsafe.getShort(p) 184 | tag & tagMask 185 | } 186 | 187 | def writeTag(bucketIndex: Long, tagIndex: Int, tag: Long): Unit = { 188 | val p = ptr + bucketIndex * bytesPerBucket + (tagIndex << 1) 189 | unsafe.putShort(p, (tag & tagMask).toShort) 190 | } 191 | 192 | def insert(index: Long, tag: Long): Boolean = { 193 | var tagIndex = 0 194 | while (tagIndex < TagsPerBucket) { 195 | if (readTag(index, tagIndex) == EmptyTag) { 196 | writeTag(index, tagIndex, tag) 197 | return true 198 | } 199 | tagIndex += 1 200 | } 201 | 202 | false 203 | } 204 | 205 | def swapAny(index: Long, tag: Long): Long = { 206 | var tagIndex = 0 207 | while (tagIndex < TagsPerBucket) { 208 | if (readTag(index, tagIndex) == EmptyTag) { 209 | writeTag(index, tagIndex, tag) 210 | return EmptyTag 211 | } 212 | tagIndex += 1 213 | } 214 | 215 | random += 1 216 | val r = random & (TagsPerBucket - 1) 217 | val tagToSwap = readTag(index, r) 218 | writeTag(index, r, tag) 219 | tagToSwap 220 | } 221 | 222 | def remove(index: Long, tag: Long): Boolean = { 223 | var tagIndex = 0 224 | while (tagIndex < TagsPerBucket) { 225 | if (readTag(index, tagIndex) == tag) { 226 | writeTag(index, tagIndex, EmptyTag) 227 | return true 228 | } 229 | tagIndex += 1 230 | } 231 | false 232 | } 233 | 234 | def find(index: Long, tag: Long): Boolean = { 235 | var i = 0 236 | while (i < TagsPerBucket) { 237 | val tag1 = readTag(index, i) 238 | if (tag1 == tag) { 239 | return true 240 | } 241 | i += 1 242 | } 243 | false 244 | } 245 | 246 | def writeTo(out: OutputStream): Unit = { 247 | writePtrTo(out, ptr, bytesPerBucket * numberOfBuckets) 248 | } 249 | 250 | def readFrom(in: InputStream): Unit = { 251 | readPtrFrom(in, ptr, bytesPerBucket * numberOfBuckets) 252 | } 253 | 254 | def dispose(): Unit = unsafe.freeMemory(ptr) 255 | 256 | @throws(classOf[java.io.ObjectStreamException]) 257 | private def writeReplace: AnyRef = toSerializedForm(16, numberOfBuckets) 258 | } 259 | 260 | 261 | object 
UnsafeTable16Bit { 262 | val EmptyTag = 0L 263 | val BitsPerItem = 16 264 | val TagsPerBucket = 4 265 | private var random = 0 266 | private val bytesPerBucket = (BitsPerItem * TagsPerBucket + 7) >> 3 267 | private val tagMask = (1L << BitsPerItem) - 1 268 | 269 | } 270 | -------------------------------------------------------------------------------- /sandbox/src/main/java/sandbox/hashing/YonikMurmurHash3.java: -------------------------------------------------------------------------------- 1 | package sandbox.hashing; 2 | 3 | /** 4 | * The MurmurHash3 algorithm was created by Austin Appleby and placed in the public domain. 5 | * This java port was authored by Yonik Seeley and also placed into the public domain. 6 | * The author hereby disclaims copyright to this source code. 7 | *

8 | * This produces exactly the same hash values as the final C++ 9 | * version of MurmurHash3 and is thus suitable for producing the same hash values across 10 | * platforms. 11 | *

12 | * The 32 bit x86 version of this hash should be the fastest variant for relatively short keys like ids. 13 | * murmurhash3_x64_128 is a good choice for longer strings or if you need more than 32 bits of hash. 14 | *

15 | * Note - The x86 and x64 versions do _not_ produce the same results, as the 16 | * algorithms are optimized for their respective platforms. 17 | *

18 | * See http://github.com/yonik/java_util for future updates to this file. 19 | */ 20 | public final class YonikMurmurHash3 { 21 | 22 | /** 128 bits of state */ 23 | public static final class LongPair { 24 | public long val1; 25 | public long val2; 26 | } 27 | 28 | public static final int fmix32(int h) { 29 | h ^= h >>> 16; 30 | h *= 0x85ebca6b; 31 | h ^= h >>> 13; 32 | h *= 0xc2b2ae35; 33 | h ^= h >>> 16; 34 | return h; 35 | } 36 | 37 | public static final long fmix64(long k) { 38 | k ^= k >>> 33; 39 | k *= 0xff51afd7ed558ccdL; 40 | k ^= k >>> 33; 41 | k *= 0xc4ceb9fe1a85ec53L; 42 | k ^= k >>> 33; 43 | return k; 44 | } 45 | 46 | /** Gets a long from a byte buffer in little endian byte order. */ 47 | public static final long getLongLittleEndian(byte[] buf, int offset) { 48 | return ((long)buf[offset+7] << 56) // no mask needed 49 | | ((buf[offset+6] & 0xffL) << 48) 50 | | ((buf[offset+5] & 0xffL) << 40) 51 | | ((buf[offset+4] & 0xffL) << 32) 52 | | ((buf[offset+3] & 0xffL) << 24) 53 | | ((buf[offset+2] & 0xffL) << 16) 54 | | ((buf[offset+1] & 0xffL) << 8) 55 | | ((buf[offset ] & 0xffL)); // no shift needed 56 | } 57 | 58 | 59 | /** Returns the MurmurHash3_x86_32 hash. 
*/ 60 | public static int murmurhash3_x86_32(byte[] data, int offset, int len, int seed) { 61 | 62 | final int c1 = 0xcc9e2d51; 63 | final int c2 = 0x1b873593; 64 | 65 | int h1 = seed; 66 | int roundedEnd = offset + (len & 0xfffffffc); // round down to 4 byte block 67 | 68 | for (int i=offset; i<roundedEnd; i+=4) { 69 | // little endian load order 70 | int k1 = (data[i] & 0xff) | ((data[i+1] & 0xff) << 8) | ((data[i+2] & 0xff) << 16) | (data[i+3] << 24); 71 | k1 *= c1; 72 | k1 = (k1 << 15) | (k1 >>> 17); // ROTL32(k1,15); 73 | k1 *= c2; 74 | 75 | h1 ^= k1; 76 | h1 = (h1 << 13) | (h1 >>> 19); // ROTL32(h1,13); 77 | h1 = h1*5+0xe6546b64; 78 | } 79 | 80 | // tail 81 | int k1 = 0; 82 | 83 | switch(len & 0x03) { 84 | case 3: 85 | k1 = (data[roundedEnd + 2] & 0xff) << 16; 86 | // fallthrough 87 | case 2: 88 | k1 |= (data[roundedEnd + 1] & 0xff) << 8; 89 | // fallthrough 90 | case 1: 91 | k1 |= (data[roundedEnd] & 0xff); 92 | k1 *= c1; 93 | k1 = (k1 << 15) | (k1 >>> 17); // ROTL32(k1,15); 94 | k1 *= c2; 95 | h1 ^= k1; 96 | } 97 | 98 | // finalization 99 | h1 ^= len; 100 | 101 | // fmix(h1); 102 | h1 ^= h1 >>> 16; 103 | h1 *= 0x85ebca6b; 104 | h1 ^= h1 >>> 13; 105 | h1 *= 0xc2b2ae35; 106 | h1 ^= h1 >>> 16; 107 | 108 | return h1; 109 | } 110 | 111 | 112 | /** Returns the MurmurHash3_x86_32 hash of the UTF-8 bytes of the String without actually encoding 113 | * the string to a temporary buffer. This is more than 2x faster than hashing the result 114 | * of String.getBytes(). 115 | */ 116 | public static int murmurhash3_x86_32(CharSequence data, int offset, int len, int seed) { 117 | 118 | final int c1 = 0xcc9e2d51; 119 | final int c2 = 0x1b873593; 120 | 121 | int h1 = seed; 122 | 123 | int pos = offset; 124 | int end = offset + len; 125 | int k1 = 0; 126 | int k2 = 0; 127 | int shift = 0; 128 | int bits = 0; 129 | int nBytes = 0; // length in UTF8 bytes 130 | 131 | 132 | while (pos < end) { 133 | int code = data.charAt(pos++); 134 | if (code < 0x80) { 135 | k2 = code; 136 | bits = 8; 137 | 138 | /*** 139 | // optimized ascii implementation (currently slower!!! code size?) 
140 | if (shift == 24) { 141 | k1 = k1 | (code << 24); 142 | 143 | k1 *= c1; 144 | k1 = (k1 << 15) | (k1 >>> 17); // ROTL32(k1,15); 145 | k1 *= c2; 146 | 147 | h1 ^= k1; 148 | h1 = (h1 << 13) | (h1 >>> 19); // ROTL32(h1,13); 149 | h1 = h1*5+0xe6546b64; 150 | 151 | shift = 0; 152 | nBytes += 4; 153 | k1 = 0; 154 | } else { 155 | k1 |= code << shift; 156 | shift += 8; 157 | } 158 | continue; 159 | ***/ 160 | 161 | } 162 | else if (code < 0x800) { 163 | k2 = (0xC0 | (code >> 6)) 164 | | ((0x80 | (code & 0x3F)) << 8); 165 | bits = 16; 166 | } 167 | else if (code < 0xD800 || code > 0xDFFF || pos>=end) { 168 | // we check for pos>=end to encode an unpaired surrogate as 3 bytes. 169 | k2 = (0xE0 | (code >> 12)) 170 | | ((0x80 | ((code >> 6) & 0x3F)) << 8) 171 | | ((0x80 | (code & 0x3F)) << 16); 172 | bits = 24; 173 | } else { 174 | // surrogate pair 175 | // int utf32 = pos < end ? (int) data.charAt(pos++) : 0; 176 | int utf32 = (int) data.charAt(pos++); 177 | utf32 = ((code - 0xD7C0) << 10) + (utf32 & 0x3FF); 178 | k2 = (0xff & (0xF0 | (utf32 >> 18))) 179 | | ((0x80 | ((utf32 >> 12) & 0x3F))) << 8 180 | | ((0x80 | ((utf32 >> 6) & 0x3F))) << 16 181 | | (0x80 | (utf32 & 0x3F)) << 24; 182 | bits = 32; 183 | } 184 | 185 | 186 | k1 |= k2 << shift; 187 | 188 | // int used_bits = 32 - shift; // how many bits of k2 were used in k1. 
189 | // int unused_bits = bits - used_bits; // (bits-(32-shift)) == bits+shift-32 == bits-newshift 190 | 191 | shift += bits; 192 | if (shift >= 32) { 193 | // mix after we have a complete word 194 | 195 | k1 *= c1; 196 | k1 = (k1 << 15) | (k1 >>> 17); // ROTL32(k1,15); 197 | k1 *= c2; 198 | 199 | h1 ^= k1; 200 | h1 = (h1 << 13) | (h1 >>> 19); // ROTL32(h1,13); 201 | h1 = h1*5+0xe6546b64; 202 | 203 | shift -= 32; 204 | // unfortunately, java won't let you shift 32 bits off, so we need to check for 0 205 | if (shift != 0) { 206 | k1 = k2 >>> (bits-shift); // bits used == bits - newshift 207 | } else { 208 | k1 = 0; 209 | } 210 | nBytes += 4; 211 | } 212 | 213 | } // inner 214 | 215 | // handle tail 216 | if (shift > 0) { 217 | nBytes += shift >> 3; 218 | k1 *= c1; 219 | k1 = (k1 << 15) | (k1 >>> 17); // ROTL32(k1,15); 220 | k1 *= c2; 221 | h1 ^= k1; 222 | } 223 | 224 | // finalization 225 | h1 ^= nBytes; 226 | 227 | // fmix(h1); 228 | h1 ^= h1 >>> 16; 229 | h1 *= 0x85ebca6b; 230 | h1 ^= h1 >>> 13; 231 | h1 *= 0xc2b2ae35; 232 | h1 ^= h1 >>> 16; 233 | 234 | return h1; 235 | } 236 | 237 | 238 | /** Returns the MurmurHash3_x64_128 hash, placing the result in "out". */ 239 | public static void murmurhash3_x64_128(byte[] key, int offset, int len, int seed, LongPair out) { 240 | // The original algorithm does have a 32 bit unsigned seed. 241 | // We have to mask to match the behavior of the unsigned types and prevent sign extension. 242 | long h1 = seed & 0x00000000FFFFFFFFL; 243 | long h2 = seed & 0x00000000FFFFFFFFL; 244 | 245 | final long c1 = 0x87c37b91114253d5L; 246 | final long c2 = 0x4cf5ad432745937fL; 247 | 248 | int roundedEnd = offset + (len & 0xFFFFFFF0); // round down to 16 byte block 249 | for (int i=offset; i