├── .github └── workflows │ ├── ci.yml │ └── clean.yml ├── .gitignore ├── CODE_OF_CONDUCT.md ├── LICENSE ├── NOTICE ├── README.md ├── build.sbt ├── core └── src │ ├── main │ └── scala │ │ └── io │ │ └── chrisdavenport │ │ └── probabilistic │ │ ├── BloomFilter.scala │ │ ├── CuckooFilter.scala │ │ ├── hashes │ │ └── Hashes.scala │ │ └── mutable │ │ ├── BloomFilter.scala │ │ ├── CuckooFilter.scala │ │ ├── CuckooTable.scala │ │ └── ThreadSafeBitSet.scala │ └── test │ └── scala │ └── io │ └── chrisdavenport │ └── probabilistic │ └── MainSpec.scala ├── examples └── src │ └── main │ └── scala │ ├── BloomExample.scala │ └── CuckooExample.scala ├── licenses └── apache ├── project ├── build.properties └── plugins.sbt └── site ├── Gemfile └── docs └── index.md /.github/workflows/ci.yml: -------------------------------------------------------------------------------- 1 | # This file was automatically generated by sbt-github-actions using the 2 | # githubWorkflowGenerate task. You should add and commit this file to 3 | # your git repository. It goes without saying that you shouldn't edit 4 | # this file by hand! Instead, if you wish to make changes, you should 5 | # change your sbt build configuration to revise the workflow description 6 | # to meet your needs, then regenerate this file. 
7 | 8 | name: Continuous Integration 9 | 10 | on: 11 | pull_request: 12 | branches: ['*'] 13 | push: 14 | branches: ['*'] 15 | tags: [v*] 16 | 17 | env: 18 | GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} 19 | 20 | jobs: 21 | build: 22 | name: Build and Test 23 | strategy: 24 | matrix: 25 | os: [ubuntu-latest] 26 | scala: [2.12.13, 2.13.3] 27 | java: [adopt@1.8] 28 | runs-on: ${{ matrix.os }} 29 | steps: 30 | - name: Checkout current branch (full) 31 | uses: actions/checkout@v2 32 | with: 33 | fetch-depth: 0 34 | 35 | - name: Setup Java and Scala 36 | uses: olafurpg/setup-scala@v10 37 | with: 38 | java-version: ${{ matrix.java }} 39 | 40 | - name: Cache sbt 41 | uses: actions/cache@v2 42 | with: 43 | path: | 44 | ~/.sbt 45 | ~/.ivy2/cache 46 | ~/.coursier/cache/v1 47 | ~/.cache/coursier/v1 48 | ~/AppData/Local/Coursier/Cache/v1 49 | ~/Library/Caches/Coursier/v1 50 | key: ${{ runner.os }}-sbt-cache-v2-${{ hashFiles('**/*.sbt') }}-${{ hashFiles('project/build.properties') }} 51 | 52 | - name: Setup Ruby 53 | if: matrix.scala == '2.13.3' 54 | uses: ruby/setup-ruby@v1 55 | with: 56 | ruby-version: 2.6.0 57 | 58 | - name: Install microsite dependencies 59 | if: matrix.scala == '2.13.3' 60 | run: | 61 | gem install sass 62 | gem install jekyll -v 3.2.1 63 | 64 | - name: Check that workflows are up to date 65 | run: sbt ++${{ matrix.scala }} githubWorkflowCheck 66 | 67 | - run: sbt ++${{ matrix.scala }} test mimaReportBinaryIssues 68 | 69 | - if: matrix.scala == '2.13.3' 70 | run: sbt ++${{ matrix.scala }} site/makeMicrosite 71 | 72 | publish: 73 | name: Publish Artifacts 74 | needs: [build] 75 | if: github.event_name != 'pull_request' && (startsWith(github.ref, 'refs/tags/v')) 76 | strategy: 77 | matrix: 78 | os: [ubuntu-latest] 79 | scala: [2.13.3] 80 | java: [adopt@1.8] 81 | runs-on: ${{ matrix.os }} 82 | steps: 83 | - name: Checkout current branch (full) 84 | uses: actions/checkout@v2 85 | with: 86 | fetch-depth: 0 87 | 88 | - name: Setup Java and Scala 89 | uses: 
olafurpg/setup-scala@v10 90 | with: 91 | java-version: ${{ matrix.java }} 92 | 93 | - name: Cache sbt 94 | uses: actions/cache@v2 95 | with: 96 | path: | 97 | ~/.sbt 98 | ~/.ivy2/cache 99 | ~/.coursier/cache/v1 100 | ~/.cache/coursier/v1 101 | ~/AppData/Local/Coursier/Cache/v1 102 | ~/Library/Caches/Coursier/v1 103 | key: ${{ runner.os }}-sbt-cache-v2-${{ hashFiles('**/*.sbt') }}-${{ hashFiles('project/build.properties') }} 104 | 105 | - uses: olafurpg/setup-gpg@v3 106 | 107 | - name: Setup Ruby 108 | uses: ruby/setup-ruby@v1 109 | with: 110 | ruby-version: 2.6.0 111 | 112 | - name: Install microsite dependencies 113 | run: | 114 | gem install sass 115 | gem install jekyll -v 3.2.1 116 | 117 | - name: Publish artifacts to Sonatype 118 | env: 119 | PGP_PASSPHRASE: ${{ secrets.PGP_PASSPHRASE }} 120 | PGP_SECRET: ${{ secrets.PGP_SECRET }} 121 | SONATYPE_PASSWORD: ${{ secrets.SONATYPE_PASSWORD }} 122 | SONATYPE_USERNAME: ${{ secrets.SONATYPE_USERNAME }} 123 | run: sbt ++${{ matrix.scala }} ci-release 124 | 125 | - name: Publish microsite 126 | run: sbt ++${{ matrix.scala }} site/publishMicrosite -------------------------------------------------------------------------------- /.github/workflows/clean.yml: -------------------------------------------------------------------------------- 1 | # This file was automatically generated by sbt-github-actions using the 2 | # githubWorkflowGenerate task. You should add and commit this file to 3 | # your git repository. It goes without saying that you shouldn't edit 4 | # this file by hand! Instead, if you wish to make changes, you should 5 | # change your sbt build configuration to revise the workflow description 6 | # to meet your needs, then regenerate this file. 
7 | 8 | name: Clean 9 | 10 | on: push 11 | 12 | jobs: 13 | delete-artifacts: 14 | name: Delete Artifacts 15 | runs-on: ubuntu-latest 16 | env: 17 | GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} 18 | steps: 19 | - name: Delete artifacts 20 | run: | 21 | # Customize those three lines with your repository and credentials: 22 | REPO=${GITHUB_API_URL}/repos/${{ github.repository }} 23 | 24 | # A shortcut to call GitHub API. 25 | ghapi() { curl --silent --location --user _:$GITHUB_TOKEN "$@"; } 26 | 27 | # A temporary file which receives HTTP response headers. 28 | TMPFILE=/tmp/tmp.$$ 29 | 30 | # An associative array, key: artifact name, value: number of artifacts of that name. 31 | declare -A ARTCOUNT 32 | 33 | # Process all artifacts on this repository, loop on returned "pages". 34 | URL=$REPO/actions/artifacts 35 | while [[ -n "$URL" ]]; do 36 | 37 | # Get current page, get response headers in a temporary file. 38 | JSON=$(ghapi --dump-header $TMPFILE "$URL") 39 | 40 | # Get URL of next page. Will be empty if we are at the last page. 41 | URL=$(grep '^Link:' "$TMPFILE" | tr ',' '\n' | grep 'rel="next"' | head -1 | sed -e 's/.*<//' -e 's/>.*//') 42 | rm -f $TMPFILE 43 | 44 | # Number of artifacts on this page: 45 | COUNT=$(( $(jq <<<$JSON -r '.artifacts | length') )) 46 | 47 | # Loop on all artifacts on this page. 48 | for ((i=0; $i < $COUNT; i++)); do 49 | 50 | # Get name of artifact and count instances of this name. 
51 | name=$(jq <<<$JSON -r ".artifacts[$i].name?") 52 | ARTCOUNT[$name]=$(( $(( ${ARTCOUNT[$name]} )) + 1)) 53 | 54 | id=$(jq <<<$JSON -r ".artifacts[$i].id?") 55 | size=$(( $(jq <<<$JSON -r ".artifacts[$i].size_in_bytes?") )) 56 | printf "Deleting '%s' #%d, %'d bytes\n" $name ${ARTCOUNT[$name]} $size 57 | ghapi -X DELETE $REPO/actions/artifacts/$id 58 | done 59 | done -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | target/ 2 | .idea/ 3 | # vim 4 | *.sw? 5 | 6 | # Ignore [ce]tags files 7 | tags 8 | 9 | .bloop 10 | .metals 11 | metals.sbt 12 | .vscode 13 | .bsp -------------------------------------------------------------------------------- /CODE_OF_CONDUCT.md: -------------------------------------------------------------------------------- 1 | # Code of Conduct 2 | 3 | We are committed to providing a friendly, safe and welcoming environment for all, regardless of level of experience, gender, gender identity and expression, sexual orientation, disability, personal appearance, body size, race, ethnicity, age, religion, nationality, or other such characteristics. 4 | 5 | Everyone is expected to follow the [Scala Code of Conduct] when discussing the project on the available communication channels. If you are being harassed, please contact us immediately so that we can support you. 6 | 7 | ## Moderation 8 | 9 | Any questions, concerns, or moderation requests please contact a member of the project. 
10 | 11 | - [Christopher Davenport](mailto:chris@christopherdavenport.tech) 12 | 13 | [Scala Code of Conduct]: https://www.scala-lang.org/conduct/ 14 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Copyright 2021 Christopher Davenport 2 | 3 | Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: 4 | 5 | The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. 6 | 7 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
-------------------------------------------------------------------------------- /NOTICE: -------------------------------------------------------------------------------- 1 | probabilistic 2 | Copyright 2021 Christopher Davenport 3 | Licensed under the MIT license (see LICENSE) 4 | 5 | This software contains portions of code derived from guava-probably 6 | https://github.com/bdupras/guava-probably 7 | Copyright (C) 2015 Brian Dupras 8 | Licensed under Apache License 2.0 (licenses/apache) 9 | 10 | This software contains portions of code derived from hollow 11 | https://github.com/Netflix/hollow 12 | Copyright (C) 2016-2019 Netflix, Inc. 13 | Licensed under Apache License 2.0 (licenses/apache) -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # probabilistic - Probabilistic Data Structures [![Build Status](https://travis-ci.com/ChristopherDavenport/probabilistic.svg?branch=master)](https://travis-ci.com/ChristopherDavenport/probabilistic) [![Maven Central](https://maven-badges.herokuapp.com/maven-central/io.chrisdavenport/probabilistic_2.12/badge.svg)](https://maven-badges.herokuapp.com/maven-central/io.chrisdavenport/probabilistic_2.12) ![Code of Conduct](https://img.shields.io/badge/Code%20of%20Conduct-Scala-blue.svg) 2 | 3 | ## [Head on over to the microsite](https://ChristopherDavenport.github.io/probabilistic) 4 | 5 | ## Quick Start 6 | 7 | To use probabilistic in an existing SBT project with Scala 2.12 or a later version, add the following dependencies to your 8 | `build.sbt` depending on your needs: 9 | 10 | ```scala 11 | libraryDependencies ++= Seq( 12 | "io.chrisdavenport" %% "probabilistic" % "<version>" 13 | ) 14 | ``` 15 | -------------------------------------------------------------------------------- /build.sbt: -------------------------------------------------------------------------------- 1 | import 
sbtcrossproject.CrossPlugin.autoImport.{crossProject, CrossType} 2 | 3 | val Scala213 = "2.13.3" 4 | 5 | ThisBuild / crossScalaVersions := Seq("2.12.13", Scala213) 6 | ThisBuild / scalaVersion := crossScalaVersions.value.last 7 | 8 | ThisBuild / githubWorkflowArtifactUpload := false 9 | 10 | val Scala213Cond = s"matrix.scala == '$Scala213'" 11 | 12 | def rubySetupSteps(cond: Option[String]) = Seq( 13 | WorkflowStep.Use( 14 | "ruby", "setup-ruby", "v1", 15 | name = Some("Setup Ruby"), 16 | params = Map("ruby-version" -> "2.6.0"), 17 | cond = cond), 18 | 19 | WorkflowStep.Run( 20 | List( 21 | "gem install sass", 22 | "gem install jekyll -v 3.2.1"), 23 | name = Some("Install microsite dependencies"), 24 | cond = cond)) 25 | 26 | ThisBuild / githubWorkflowBuildPreamble ++= 27 | rubySetupSteps(Some(Scala213Cond)) 28 | 29 | ThisBuild / githubWorkflowBuild := Seq( 30 | WorkflowStep.Sbt(List("test", "mimaReportBinaryIssues")), 31 | 32 | WorkflowStep.Sbt( 33 | List("site/makeMicrosite"), 34 | cond = Some(Scala213Cond))) 35 | 36 | ThisBuild / githubWorkflowTargetTags ++= Seq("v*") 37 | 38 | // currently only publishing tags 39 | ThisBuild / githubWorkflowPublishTargetBranches := 40 | Seq(RefPredicate.StartsWith(Ref.Tag("v"))) 41 | 42 | ThisBuild / githubWorkflowPublishPreamble ++= 43 | WorkflowStep.Use("olafurpg", "setup-gpg", "v3") +: rubySetupSteps(None) 44 | 45 | ThisBuild / githubWorkflowPublish := Seq( 46 | WorkflowStep.Sbt( 47 | List("ci-release"), 48 | name = Some("Publish artifacts to Sonatype"), 49 | env = Map( 50 | "PGP_PASSPHRASE" -> "${{ secrets.PGP_PASSPHRASE }}", 51 | "PGP_SECRET" -> "${{ secrets.PGP_SECRET }}", 52 | "SONATYPE_PASSWORD" -> "${{ secrets.SONATYPE_PASSWORD }}", 53 | "SONATYPE_USERNAME" -> "${{ secrets.SONATYPE_USERNAME }}")), 54 | 55 | WorkflowStep.Sbt( 56 | List("site/publishMicrosite"), 57 | name = Some("Publish microsite") 58 | ) 59 | ) 60 | 61 | 62 | val catsV = "2.3.1" 63 | val catsEffectV = "2.3.1" 64 | 65 | val munitCatsEffectV = "0.12.0" 
66 | 67 | val kindProjectorV = "0.11.3" 68 | val betterMonadicForV = "0.3.1" 69 | 70 | // Projects 71 | lazy val `probabilistic` = project.in(file(".")) 72 | .disablePlugins(MimaPlugin) 73 | .enablePlugins(NoPublishPlugin) 74 | .aggregate(core, examples) 75 | 76 | lazy val core = project.in(file("core")) 77 | .settings(commonSettings) 78 | .settings( 79 | name := "probabilistic" 80 | ) 81 | 82 | lazy val examples = project.in(file("examples")) 83 | .disablePlugins(MimaPlugin) 84 | .enablePlugins(NoPublishPlugin) 85 | .settings(commonSettings) 86 | .dependsOn(core) 87 | .settings( 88 | name := "probabilistic-examples" 89 | ) 90 | 91 | lazy val site = project.in(file("site")) 92 | .disablePlugins(MimaPlugin) 93 | .enablePlugins(MicrositesPlugin) 94 | .enablePlugins(MdocPlugin) 95 | .enablePlugins(NoPublishPlugin) 96 | .settings(commonSettings) 97 | .dependsOn(core) 98 | .settings{ 99 | import microsites._ 100 | Seq( 101 | micrositeName := "probabilistic", 102 | micrositeDescription := "Probabilistic Data Structures", 103 | micrositeAuthor := "Christopher Davenport", 104 | micrositeGithubOwner := "ChristopherDavenport", 105 | micrositeGithubRepo := "probabilistic", 106 | micrositeBaseUrl := "/probabilistic", 107 | micrositeDocumentationUrl := "https://www.javadoc.io/doc/io.chrisdavenport/probabilistic_2.13", 108 | micrositeGitterChannelUrl := "ChristopherDavenport/libraries", // Feel Free to Set To Something Else 109 | micrositeFooterText := None, 110 | micrositeHighlightTheme := "atom-one-light", 111 | micrositePalette := Map( 112 | "brand-primary" -> "#3e5b95", 113 | "brand-secondary" -> "#294066", 114 | "brand-tertiary" -> "#2d5799", 115 | "gray-dark" -> "#49494B", 116 | "gray" -> "#7B7B7E", 117 | "gray-light" -> "#E5E5E6", 118 | "gray-lighter" -> "#F4F3F4", 119 | "white-color" -> "#FFFFFF" 120 | ), 121 | micrositePushSiteWith := GitHub4s, 122 | micrositeGithubToken := sys.env.get("GITHUB_TOKEN"), 123 | micrositeExtraMdFiles := Map( 124 | file("CODE_OF_CONDUCT.md") 
-> ExtraMdFileConfig("code-of-conduct.md", "page", Map("title" -> "code of conduct", "section" -> "code of conduct", "position" -> "100")), 125 | file("LICENSE") -> ExtraMdFileConfig("license.md", "page", Map("title" -> "license", "section" -> "license", "position" -> "101")) 126 | ) 127 | ) 128 | } 129 | 130 | // General Settings 131 | lazy val commonSettings = Seq( 132 | testFrameworks += new TestFramework("munit.Framework"), 133 | libraryDependencies ++= { 134 | if (isDotty.value) Seq.empty 135 | else Seq( 136 | compilerPlugin("org.typelevel" % "kind-projector" % kindProjectorV cross CrossVersion.full), 137 | compilerPlugin("com.olegpy" %% "better-monadic-for" % betterMonadicForV), 138 | ) 139 | }, 140 | scalacOptions ++= { 141 | if (isDotty.value) Seq("-source:3.0-migration") 142 | else Seq() 143 | }, 144 | Compile / doc / sources := { 145 | val old = (Compile / doc / sources).value 146 | if (isDotty.value) 147 | Seq() 148 | else 149 | old 150 | }, 151 | 152 | libraryDependencies ++= Seq( 153 | "org.typelevel" %% "cats-core" % catsV, 154 | "org.typelevel" %% "cats-effect" % catsEffectV, 155 | 156 | "org.typelevel" %%% "munit-cats-effect-2" % munitCatsEffectV % Test, 157 | ) 158 | ) 159 | 160 | // General Settings 161 | inThisBuild(List( 162 | organization := "io.chrisdavenport", 163 | developers := List( 164 | Developer("ChristopherDavenport", "Christopher Davenport", "chris@christopherdavenport.tech", url("https://github.com/ChristopherDavenport")) 165 | ), 166 | 167 | homepage := Some(url("https://github.com/ChristopherDavenport/probabilistic")), 168 | licenses += ("MIT", url("http://opensource.org/licenses/MIT")), 169 | 170 | pomIncludeRepository := { _ => false}, 171 | scalacOptions in (Compile, doc) ++= Seq( 172 | "-groups", 173 | "-sourcepath", (baseDirectory in LocalRootProject).value.getAbsolutePath, 174 | "-doc-source-url", "https://github.com/ChristopherDavenport/probabilistic/blob/v" + version.value + "€{FILE_PATH}.scala" 175 | ) 176 | )) 177 | 
-------------------------------------------------------------------------------- /core/src/main/scala/io/chrisdavenport/probabilistic/BloomFilter.scala: -------------------------------------------------------------------------------- 1 | package io.chrisdavenport.probabilistic 2 | 3 | 4 | import cats._ 5 | import cats.syntax.all._ 6 | import cats.effect._ 7 | import cats.effect.concurrent._ 8 | import java.nio.charset.Charset 9 | import java.nio.charset.StandardCharsets 10 | import io.chrisdavenport.probabilistic.hashes.Hashes 11 | 12 | trait BloomFilter[F[_], A]{ 13 | def add(a: A): F[Unit] 14 | // False Positives a Reality 15 | // False Negatives are not a thing 16 | def mayContain(a: A): F[Boolean] 17 | } 18 | 19 | object BloomFilter { 20 | 21 | def string[F[_]: Sync](numberOfItems: Long, falsePositiveRate: Double)(implicit charset: Charset = Charset.defaultCharset()): F[BloomFilter[F, String]] = { 22 | Sync[F].delay(mutable.BloomFilter.string(numberOfItems, falsePositiveRate)(charset)) 23 | .map(new BloomFilterImpl[F, String](_)) 24 | } 25 | 26 | def array[F[_]: Sync](numberOfItems: Long, falsePositiveRate: Double): F[BloomFilter[F, Array[Byte]]] = { 27 | Sync[F].delay(mutable.BloomFilter.array(numberOfItems, falsePositiveRate)) 28 | .map(new BloomFilterImpl[F, Array[Byte]](_)) 29 | } 30 | 31 | def static[F[_]: Sync, A](initBitSize: Long, hashFunctions: A => cats.data.NonEmptyList[Long]): F[BloomFilter[F, A]] = 32 | for { 33 | bf <- Sync[F].delay(mutable.BloomFilter.static(initBitSize, hashFunctions)) 34 | } yield new BloomFilterImpl[F, A](bf) 35 | 36 | 37 | implicit def instances[F[_]]: Contravariant[({type X[A] = BloomFilter[F, A]})#X] = new Contravariant[({type X[A] = BloomFilter[F, A]})#X]{ 38 | def contramap[A, B](fa: BloomFilter[F,A])(f: B => A): BloomFilter[F,B] = new BloomFilter[F, B] { 39 | def add(a: B): F[Unit] = fa.add(f(a)) 40 | def mayContain(a: B): F[Boolean] = fa.mayContain(f(a)) 41 | } 42 | } 43 | 44 | private class BloomFilterImpl[F[_]: Sync, 
A]( 45 | underlying: mutable.BloomFilter[A] 46 | ) extends BloomFilter[F, A]{ 47 | def add(a: A): F[Unit] = Sync[F].delay{ 48 | underlying.add(a) 49 | } 50 | def mayContain(a: A): F[Boolean] = Sync[F].delay{ 51 | underlying.mayContain(a) 52 | } 53 | } 54 | } 55 | -------------------------------------------------------------------------------- /core/src/main/scala/io/chrisdavenport/probabilistic/CuckooFilter.scala: -------------------------------------------------------------------------------- 1 | package io.chrisdavenport.probabilistic 2 | 3 | import cats.Contravariant 4 | import cats.syntax.all._ 5 | import cats.effect._ 6 | import java.nio.charset.Charset 7 | 8 | trait CuckooFilter[F[_], A]{ 9 | def add(a: A): F[Boolean] 10 | def remove(a: A): F[Boolean] 11 | 12 | def mayContain(a: A): F[Boolean] 13 | } 14 | 15 | object CuckooFilter { 16 | 17 | def string[F[_]: Sync](numberOfItems: Long, falsePositiveRate: Double)(implicit charset: Charset = Charset.defaultCharset()): F[CuckooFilter[F, String]] = { 18 | Sync[F].delay(mutable.CuckooFilter.string(numberOfItems, falsePositiveRate)(charset)) 19 | .map(new CuckooFilterImpl[F, String](_)) 20 | } 21 | 22 | def array[F[_]: Sync](numberOfItems: Long, falsePositiveRate: Double): F[CuckooFilter[F, Array[Byte]]] = { 23 | Sync[F].delay(mutable.CuckooFilter.array(numberOfItems, falsePositiveRate)) 24 | .map(new CuckooFilterImpl[F, Array[Byte]](_)) 25 | } 26 | 27 | 28 | implicit def instances[F[_]]: Contravariant[({type X[A] = CuckooFilter[F, A]})#X] = new Contravariant[({type X[A] = CuckooFilter[F, A]})#X]{ 29 | def contramap[A, B](fa: CuckooFilter[F,A])(f: B => A): CuckooFilter[F,B] = new CuckooFilter[F, B] { 30 | def add(a: B): F[Boolean] = fa.add(f(a)) 31 | def remove(a: B): F[Boolean] = fa.remove(f(a)) 32 | def mayContain(a: B): F[Boolean] = fa.mayContain(f(a)) 33 | } 34 | } 35 | 36 | private class CuckooFilterImpl[F[_]: Sync, A](underlying: mutable.CuckooFilter[A]) extends CuckooFilter[F, A]{ 37 | def add(a: A): 
F[Boolean] = Sync[F].delay(underlying.add(a)) 38 | def remove(a: A): F[Boolean] = Sync[F].delay(underlying.remove(a)) 39 | def mayContain(a: A): F[Boolean] = Sync[F].delay(underlying.mayContain(a)) 40 | } 41 | } -------------------------------------------------------------------------------- /core/src/main/scala/io/chrisdavenport/probabilistic/hashes/Hashes.scala: -------------------------------------------------------------------------------- 1 | package io.chrisdavenport.probabilistic.hashes 2 | 3 | import scala.util.hashing.MurmurHash3 4 | import cats.data.NonEmptyList 5 | import java.nio.charset.Charset 6 | 7 | object Hashes { 8 | 9 | // TODO MORE HASHING!!! 10 | // Need 14 - 17 unique algorithms for extremely small probabilities of false positives 11 | val arrayHashes: NonEmptyList[Array[Byte] => Long] = NonEmptyList.of( 12 | XXHash.hash(_), 13 | FNV32.hash(_), 14 | Adler32.hash(_), 15 | Bernstein.hash(_), 16 | KernighanRitchie.hash(_), 17 | Murmur3.hash(_), 18 | CRC16.hash(_), 19 | CRC32.hash(_), 20 | ) 21 | 22 | object Adler32 { 23 | def hash(data: Array[Byte]): Long = { 24 | val x = new java.util.zip.Adler32() 25 | x.update(data) 26 | x.getValue() 27 | } 28 | 29 | } 30 | 31 | object FNV32 { 32 | private val FNV1_32_INIT = 0x811c9dc5 33 | private val FNV1_PRIME_32 = 16777619 34 | 35 | def hash(data: Array[Byte]): Int = { 36 | var mHash = FNV1_32_INIT 37 | for { b <- data}{ 38 | mHash ^= (b & 0xff) 39 | mHash *= FNV1_PRIME_32 40 | } 41 | mHash 42 | } 43 | } 44 | 45 | /** 46 | * XMODEM CRC 16 CRC16 - 16-bit Cyclic Redundancy Check (CRC16) 47 | * 48 | * Name : "XMODEM", also known as "ZMODEM", "CRC-16/ACORN" 49 | * Width : 16 bit 50 | * Poly : 1021 (That is actually x^16 + x^12 + x^5 + 1) 51 | * Initialization : 0000 52 | * Reflect Input byte : False 53 | * Reflect Output CRC : False 54 | * Xor constant to output CRC : 0000 55 | * Output for "123456789" : 31C3 56 | */ 57 | object CRC16 { 58 | 59 | def hash(data: Array[Byte]): Int = { 60 | var crc: Int = 0 61 | 
data.foreach{ b => 62 | crc = (crc << 8) ^ table(((crc >>> 8) ^ (b & 0xff)) & 0xff) 63 | } 64 | crc & 0xFFFF 65 | } 66 | 67 | private[CRC16] lazy val table : Array[Int] = Array( 68 | 0x0000,0x1021,0x2042,0x3063,0x4084,0x50a5,0x60c6,0x70e7, 69 | 0x8108,0x9129,0xa14a,0xb16b,0xc18c,0xd1ad,0xe1ce,0xf1ef, 70 | 0x1231,0x0210,0x3273,0x2252,0x52b5,0x4294,0x72f7,0x62d6, 71 | 0x9339,0x8318,0xb37b,0xa35a,0xd3bd,0xc39c,0xf3ff,0xe3de, 72 | 0x2462,0x3443,0x0420,0x1401,0x64e6,0x74c7,0x44a4,0x5485, 73 | 0xa56a,0xb54b,0x8528,0x9509,0xe5ee,0xf5cf,0xc5ac,0xd58d, 74 | 0x3653,0x2672,0x1611,0x0630,0x76d7,0x66f6,0x5695,0x46b4, 75 | 0xb75b,0xa77a,0x9719,0x8738,0xf7df,0xe7fe,0xd79d,0xc7bc, 76 | 0x48c4,0x58e5,0x6886,0x78a7,0x0840,0x1861,0x2802,0x3823, 77 | 0xc9cc,0xd9ed,0xe98e,0xf9af,0x8948,0x9969,0xa90a,0xb92b, 78 | 0x5af5,0x4ad4,0x7ab7,0x6a96,0x1a71,0x0a50,0x3a33,0x2a12, 79 | 0xdbfd,0xcbdc,0xfbbf,0xeb9e,0x9b79,0x8b58,0xbb3b,0xab1a, 80 | 0x6ca6,0x7c87,0x4ce4,0x5cc5,0x2c22,0x3c03,0x0c60,0x1c41, 81 | 0xedae,0xfd8f,0xcdec,0xddcd,0xad2a,0xbd0b,0x8d68,0x9d49, 82 | 0x7e97,0x6eb6,0x5ed5,0x4ef4,0x3e13,0x2e32,0x1e51,0x0e70, 83 | 0xff9f,0xefbe,0xdfdd,0xcffc,0xbf1b,0xaf3a,0x9f59,0x8f78, 84 | 0x9188,0x81a9,0xb1ca,0xa1eb,0xd10c,0xc12d,0xf14e,0xe16f, 85 | 0x1080,0x00a1,0x30c2,0x20e3,0x5004,0x4025,0x7046,0x6067, 86 | 0x83b9,0x9398,0xa3fb,0xb3da,0xc33d,0xd31c,0xe37f,0xf35e, 87 | 0x02b1,0x1290,0x22f3,0x32d2,0x4235,0x5214,0x6277,0x7256, 88 | 0xb5ea,0xa5cb,0x95a8,0x8589,0xf56e,0xe54f,0xd52c,0xc50d, 89 | 0x34e2,0x24c3,0x14a0,0x0481,0x7466,0x6447,0x5424,0x4405, 90 | 0xa7db,0xb7fa,0x8799,0x97b8,0xe75f,0xf77e,0xc71d,0xd73c, 91 | 0x26d3,0x36f2,0x0691,0x16b0,0x6657,0x7676,0x4615,0x5634, 92 | 0xd94c,0xc96d,0xf90e,0xe92f,0x99c8,0x89e9,0xb98a,0xa9ab, 93 | 0x5844,0x4865,0x7806,0x6827,0x18c0,0x08e1,0x3882,0x28a3, 94 | 0xcb7d,0xdb5c,0xeb3f,0xfb1e,0x8bf9,0x9bd8,0xabbb,0xbb9a, 95 | 0x4a75,0x5a54,0x6a37,0x7a16,0x0af1,0x1ad0,0x2ab3,0x3a92, 96 | 0xfd2e,0xed0f,0xdd6c,0xcd4d,0xbdaa,0xad8b,0x9de8,0x8dc9, 97 | 
0x7c26,0x6c07,0x5c64,0x4c45,0x3ca2,0x2c83,0x1ce0,0x0cc1, 98 | 0xef1f,0xff3e,0xcf5d,0xdf7c,0xaf9b,0xbfba,0x8fd9,0x9ff8, 99 | 0x6e17,0x7e36,0x4e55,0x5e74,0x2e93,0x3eb2,0x0ed1,0x1ef0 100 | ) 101 | } 102 | 103 | object CRC32 { 104 | def hash(data: Array[Byte]): Long = { 105 | val c = new java.util.zip.CRC32() 106 | c.update(data) 107 | c.getValue() 108 | } 109 | } 110 | 111 | 112 | object Bernstein { 113 | private val INITIAL = 5381 114 | private val M = 33 115 | def hash(data: Array[Byte]): Int = { 116 | var hash = INITIAL 117 | for { 118 | x <- data 119 | } { hash = M * hash + x } 120 | hash 121 | } 122 | } 123 | 124 | object KernighanRitchie { 125 | private val INITIAL = 0 126 | private val M = 31 127 | def hash(data: Array[Byte]): Int = { 128 | var hash = INITIAL 129 | for { 130 | x <- data 131 | } { hash = M * hash + x } 132 | hash 133 | } 134 | } 135 | 136 | object Murmur3 { 137 | def hash(data: Array[Byte]): Int = { 138 | MurmurHash3.bytesHash(data) 139 | } 140 | } 141 | 142 | object XXHash { 143 | private final val PRIME64_1 = 0x9E3779B185EBCA87L 144 | private final val PRIME64_2 = 0xC2B2AE3D27D4EB4FL 145 | private final val PRIME64_3 = 0x165667B19E3779F9L 146 | private final val PRIME64_4 = 0x85EBCA77C2b2AE63L 147 | private final val PRIME64_5 = 0x27D4EB2F165667C5L 148 | private final val DEFAULT_SEED = 0L 149 | 150 | def hash(data: Array[Byte]): Long = hash64(data) 151 | 152 | def hash64(data: Array[Byte], seed: Long = DEFAULT_SEED): Long = { 153 | val length = data.length 154 | var index = 0 155 | var hash: Long = -1 // Danger Will Robinson 156 | if (length >= 32) { 157 | var v1 = seed + PRIME64_1 + PRIME64_2 158 | var v2 = seed + PRIME64_2 159 | var v3 = seed + 0 160 | var v4 = seed - PRIME64_1 161 | var limit = length - 32 162 | while (index <= limit){ 163 | var k1 = (data(index).toLong & 0xff) | 164 | ((data(index + 1).toLong & 0xff) << 8) | 165 | ((data(index + 2) & 0xff) << 16) | 166 | ((data(index + 3) & 0xff) << 24) | 167 | ((data(index + 4) & 0xff) 
.toLong << 32) | 168 | ((data(index + 5).toLong & 0xff) << 40) | 169 | ((data(index + 6).toLong & 0xff) << 48) | 170 | ((data(index + 7).toLong & 0xff) << 56) 171 | v1 = mix(v1, k1) 172 | index += 8 173 | 174 | var k2 = (data(index).toLong & 0xff) | 175 | ((data(index + 1).toLong & 0xff) << 8) | 176 | ((data(index + 2).toLong & 0xff) << 16) | 177 | ((data(index + 3).toLong & 0xff) << 24) | 178 | ((data(index + 4).toLong & 0xff) << 32) | 179 | ((data(index + 5).toLong & 0xff) << 40) | 180 | ((data(index + 6).toLong & 0xff) << 48) | 181 | ((data(index + 7).toLong & 0xff) << 56) 182 | v2 = mix(v2, k2) 183 | index += 8 184 | 185 | var k3 = (data(index).toLong & 0xff) | 186 | ((data(index + 1).toLong & 0xff) << 8) | 187 | ((data(index + 2).toLong & 0xff) << 16) | 188 | ((data(index + 3).toLong & 0xff) << 24) | 189 | ((data(index + 4).toLong & 0xff) << 32) | 190 | ((data(index + 5).toLong & 0xff) << 40) | 191 | ((data(index + 6).toLong & 0xff) << 48) | 192 | ((data(index + 7).toLong & 0xff) << 56) 193 | v3 = mix(v3, k3) 194 | index += 8 195 | 196 | var k4 = (data(index).toLong & 0xff) | 197 | ((data(index + 1).toLong & 0xff) << 8) | 198 | ((data(index + 2).toLong & 0xff) << 16) | 199 | ((data(index + 3).toLong & 0xff) << 24) | 200 | ((data(index + 4).toLong & 0xff) << 32) | 201 | ((data(index + 5).toLong & 0xff) << 40) | 202 | ((data(index + 6).toLong & 0xff) << 48) | 203 | ((data(index + 7).toLong & 0xff) << 56) 204 | v4 = mix(v4, k4) 205 | index += 8 206 | } 207 | 208 | hash = java.lang.Long.rotateLeft(v1, 1) + 209 | java.lang.Long.rotateLeft(v2, 7) + 210 | java.lang.Long.rotateLeft(v3, 12) + 211 | java.lang.Long.rotateLeft(v4, 18) 212 | 213 | hash = update(hash, v1) 214 | hash = update(hash, v2) 215 | hash = update(hash, v3) 216 | hash = update(hash, v4) 217 | } else { 218 | hash = seed + PRIME64_5 219 | } 220 | 221 | hash += length 222 | 223 | // tail 224 | while (index <= length - 8) { 225 | var tailStart: Int = index 226 | var k: Long = 0 227 | var remaining: Int = length - index 228 | remaining = if (remaining > 8) 8 else remaining 229 | // OR in all `remaining` bytes, little-endian. (Scala `match` does not fall through like the C switch this was ported from, so the previous version only mixed one tail byte; Int shifts by >=32 were also no-ops.) 230 | var j = remaining - 1 231 | while (j >= 0) { 232 | k |= (data(tailStart + j).toLong & 0xff) << (8 * j) 233 | j -= 1
234 | } 235 | 236 | 237 | 238 | 239 | 240 | 241 | 242 | 243 | 244 | 245 | 246 | 247 | hash = updateTail(hash, k) 248 | index += 8 249 | } 250 | 251 | if (index <= length - 4) { 252 | var tailStart = index 253 | var k = 0 254 | var remaining = length - index 255 | remaining = if (remaining > 4) 4 else remaining 256 | // OR in all `remaining` bytes (same fallthrough-emulation fix as the 8-byte tail above). 257 | var j = remaining - 1 258 | while (j >= 0) { 259 | k |= (data(tailStart + j) & 0xff) << (8 * j) 260 | j -= 1 261 | } 262 | 263 | 264 | 265 | 266 | hash = updateTail(hash, k) 267 | index += 4 268 | } 269 | 270 | while (index < length) { 271 | hash = updateTail(hash, data(index)) 272 | index += 1 273 | } 274 | 275 | hash = finalShuffle(hash) 276 | 277 | hash 278 | } 279 | 280 | 281 | private def mix(current: Long, value: Long): Long = { 282 | java.lang.Long.rotateLeft(current + value * PRIME64_2, 31) * PRIME64_1 283 | } 284 | 285 | private def update(hash: Long, value: Long): Long ={ 286 | val temp = hash ^ mix(0, value) 287 | temp * PRIME64_1 + PRIME64_4 288 | } 289 | 290 | private def updateTail(hash: Long, value: Long): Long ={ 291 | val temp = hash ^ mix(0, value) 292 | java.lang.Long.rotateLeft(temp, 27) * PRIME64_1 + PRIME64_4 293 | } 294 | 295 | private def updateTail(hash: Long, value: Int): Long ={ 296 | val unsigned = value & 0xFFFFFFFFL 297 | val temp = hash ^ (unsigned * PRIME64_1) 298 | java.lang.Long.rotateLeft(temp, 23) * PRIME64_2 + PRIME64_3 299 | } 300 | 301 | private def updateTail(hash: Long, value: Byte): Long ={ 302 | var unsigned = value & 0xFF 303 | var temp = hash ^ (unsigned * PRIME64_5) 304
| java.lang.Long.rotateLeft(temp, 11) * PRIME64_1 305 | } 306 | 307 | private def finalShuffle(ihash: Long): Long = { 308 | var hash = ihash 309 | hash ^= hash >>> 33 310 | hash *= PRIME64_2 311 | hash ^= hash >>> 29 312 | hash *= PRIME64_3 313 | hash ^= hash >>> 32 314 | hash 315 | } 316 | } 317 | 318 | } -------------------------------------------------------------------------------- /core/src/main/scala/io/chrisdavenport/probabilistic/mutable/BloomFilter.scala: -------------------------------------------------------------------------------- 1 | package io.chrisdavenport.probabilistic.mutable 2 | 3 | import io.chrisdavenport.probabilistic.hashes.Hashes 4 | import java.nio.charset.Charset 5 | 6 | class BloomFilter[A] private ( 7 | private[mutable] val bitSet: ThreadSafeBitSet, 8 | initSize: Long, 9 | hashFunctions: A => cats.data.NonEmptyList[Long] 10 | ) extends io.chrisdavenport.probabilistic.BloomFilter[cats.Id, A]{ 11 | private def hashToPosition(l: Long): Long = { 12 | val modulus = (l % initSize).toInt 13 | if (modulus >= 0) modulus 14 | else initSize + modulus 15 | } 16 | 17 | private def positions(a: A): cats.data.NonEmptyList[Long] = { 18 | hashFunctions(a).map(hashToPosition) 19 | } 20 | 21 | def add(a: A): Unit = { 22 | positions(a).toList.foreach{i => 23 | bitSet.set(i) 24 | } 25 | } 26 | 27 | def mayContain(a: A): Boolean = positions(a).forall(bitSet.get(_)) 28 | } 29 | 30 | object BloomFilter { 31 | def static[A](initBitSize: Long, hashFunctions: A => cats.data.NonEmptyList[Long]): BloomFilter[A] = { 32 | val bits = ThreadSafeBitSet(ThreadSafeBitSet.DEFAULT_LOG2_SEGMENT_SIZE_IN_BITS, initBitSize) 33 | new BloomFilter[A](bits, initBitSize, hashFunctions) 34 | } 35 | 36 | def string(numberOfItems: Long, falsePositiveRate: Double)(implicit charset: Charset = Charset.defaultCharset()): BloomFilter[String] = { 37 | val bits = optimalNumberOfBits(numberOfItems, falsePositiveRate) 38 | val hashes = optimalNumberOfHashes(numberOfItems, bits) 39 | 
static[String]( 40 | bits, 41 | { 42 | (s: String) => 43 | val array = s.getBytes(charset) 44 | cats.data.NonEmptyList( 45 | s.hashCode(), 46 | Hashes.arrayHashes.toList.take(hashes - 1).map(f => f(array)) 47 | ) 48 | } 49 | ) 50 | } 51 | 52 | def array(numberOfItems: Long, falsePositiveRate: Double): BloomFilter[Array[Byte]] = { 53 | val bits = optimalNumberOfBits(numberOfItems, falsePositiveRate) 54 | val hashes = optimalNumberOfHashes(numberOfItems, bits) 55 | static[Array[Byte]]( 56 | bits, 57 | { 58 | (data: Array[Byte]) => 59 | cats.data.NonEmptyList( 60 | Hashes.arrayHashes.head(data), 61 | Hashes.arrayHashes.tail.take(hashes - 1).map(f => f(data)) 62 | ) 63 | } 64 | ) 65 | } 66 | 67 | def optimalNumberOfBits(numberOfItems: Long, falsePositiveRate: Double): Long = { 68 | val p = if (falsePositiveRate == 0) Double.MinValue else falsePositiveRate 69 | math.ceil(-1 * numberOfItems * math.log(p) / math.log(2) / math.log(2)).toLong 70 | } 71 | 72 | def optimalNumberOfHashes(numberOfItems: Long, numberOfBits: Long): Int = { 73 | math.ceil(numberOfBits / numberOfItems * math.log(2)).toInt 74 | } 75 | 76 | } -------------------------------------------------------------------------------- /core/src/main/scala/io/chrisdavenport/probabilistic/mutable/CuckooFilter.scala: -------------------------------------------------------------------------------- 1 | package io.chrisdavenport.probabilistic.mutable 2 | 3 | import scala.util.Random 4 | import cats.Id 5 | import java.nio.charset.Charset 6 | import io.chrisdavenport.probabilistic.hashes.Hashes 7 | import cats.Contravariant 8 | 9 | class CuckooFilter[A] private ( 10 | private[mutable] val table: CuckooTable, 11 | hash: Array[Byte] => Long, 12 | f: A => Array[Byte], 13 | random: Random, 14 | maxRelocationAttempts: Int, 15 | ) extends io.chrisdavenport.probabilistic.CuckooFilter[cats.Id, A]{ 16 | import CuckooFilter._ 17 | 18 | def add(a: A): Boolean = { 19 | val h = hash(f(a)) 20 | val h1 = hash1(h) 21 | val h2 = hash2(h) 
22 | val finger = fingerprint(h2) 23 | val i1 = index(h1) 24 | 25 | putEntry(finger, i1) || 26 | putEntry(finger, index2(i1, finger)) 27 | } 28 | 29 | def remove(a: A): Boolean = { 30 | val h = hash(f(a)) 31 | val h1 = hash1(h) 32 | val h2 = hash2(h) 33 | val finger = fingerprint(h2) 34 | val i1 = index(h1) 35 | val i2 = index2(i1, finger) 36 | table.swapAnyEntry(i1, CuckooTable.EMPTY_ENTRY, finger) || 37 | table.swapAnyEntry(i2, CuckooTable.EMPTY_ENTRY, finger) 38 | } 39 | 40 | def mayContain(a: A): Boolean = { 41 | val h = hash(f(a)) 42 | val h1 = hash1(h) 43 | val h2 = hash2(h) 44 | val finger = fingerprint(h2) 45 | val i1 = index(h1) 46 | val i2 = index2(i1, finger) 47 | table.findEntry(i1, finger).isDefined || 48 | table.findEntry(i2, finger).isDefined 49 | } 50 | 51 | private def putEntry(fingerprint: Int, index: Long): Boolean = { 52 | return table.swapAnyEntry(index, fingerprint, CuckooTable.EMPTY_ENTRY) || 53 | putEntry(fingerprint,index, 0); 54 | } 55 | 56 | 57 | private def putEntry(fingerprint: Int, index: Long, kick: Int): Boolean = { 58 | if (maxRelocationAttempts == kick) { 59 | return false; 60 | } else { 61 | 62 | val entry = random.nextInt(table.numEntriesPerBucket) 63 | val kicked = table.writeEntry(index, entry, fingerprint) 64 | 65 | if ((CuckooTable.EMPTY_ENTRY == kicked) 66 | || putEntry(kicked, index2(index, kicked), kick + 1)) { 67 | return true; 68 | } else { 69 | val kickedBack = table.writeEntry(index,entry, kicked) 70 | assert(kickedBack == fingerprint, "Uh oh - couldn't unroll failed attempts to putEntry()") 71 | return false; 72 | } 73 | } 74 | } 75 | 76 | private def hash1(hash: Long): Long = { 77 | hash 78 | } 79 | 80 | private def hash2(hash: Long): Long = { 81 | hash >>> 32 82 | } 83 | 84 | 85 | private def index(hash: Long): Long = { 86 | mod(hash, table.numBuckets).toInt 87 | } 88 | 89 | private def index2(index: Long, fingerprint: Int): Long = { 90 | mod(protectedSum(index, parsign(index) * odd(hash(intToArray(fingerprint))), 
table.numBuckets), table.numBuckets) 91 | } 92 | 93 | 94 | 95 | /** 96 | * Maps parity of i to a sign. 97 | * 98 | * @return 1 if i is even parity, -1 if i is odd parity 99 | */ 100 | private def parsign(i: Long): Long = { 101 | return ((i & 0x01L) * -2L) + 1L; 102 | } 103 | 104 | private def odd(i: Long): Long = { 105 | i | 0x01L 106 | } 107 | 108 | private def intToArray(data: Int): Array[Byte] = { 109 | BigInt(data).toByteArray 110 | } 111 | 112 | /** 113 | * Returns the sum of index and offset, reduced by a mod-consistent amount if necessary to 114 | * protect from numeric overflow. This method is intended to support a subsequent mod operation 115 | * on the return value. 116 | * 117 | * @param index Assumed to be >= 0L. 118 | * @param offset Any value. 119 | * @param mod Value used to reduce the result, 120 | * @return sum of index and offset, reduced by a mod-consistent amount if necessary to protect 121 | * from numeric overflow. 122 | */ 123 | private def protectedSum(index: Long, offset: Long, mod: Long): Long = { 124 | if (canSum(index, offset)) index + offset else protectedSum(index - mod, offset, mod); 125 | } 126 | 127 | private def canSum(a: Long, b: Long): Boolean = { 128 | (a ^ b) < 0 | (a ^ (a + b)) >= 0 129 | } 130 | 131 | /** 132 | * Returns an f-bit portion of the given hash. Iterating by f-bit segments from the least 133 | * significant side of the hash to the most significant, looks for a non-zero segment. If a 134 | * non-zero segment isn't found, 1 is returned to distinguish the fingerprint from a 135 | * non-entry. 
136 | * 137 | * @param hash 64-bit hash value 138 | * @param f number of bits to consider from the hash 139 | * @return first non-zero f-bit value from hash as an int, or 1 if no non-zero value is found 140 | */ 141 | private[mutable] def fingerprint(hash: Long): Int = { 142 | val f = table.numBitsPerEntry 143 | 144 | val mask = (0x80000000 >> (f - 1)) >>> (Integer.SIZE - f) 145 | var bit = 0 146 | var ret: Long = 0x1.toLong 147 | 148 | while (bit + f <= Integer.SIZE){ 149 | ret = (hash >> bit) & mask 150 | if (ret != 0) { 151 | bit = Integer.SIZE 152 | } else { 153 | bit += f 154 | } 155 | } 156 | ret.toInt 157 | } 158 | 159 | } 160 | 161 | object CuckooFilter { 162 | 163 | def string(numberOfItems: Long, falsePositiveRate: Double)(implicit charset: Charset = Charset.defaultCharset()): CuckooFilter[String] = { 164 | of(numberOfItems, falsePositiveRate, {s: String => s.getBytes(charset)}) 165 | } 166 | 167 | def array(numberOfItems: Long, falsePositiveRate: Double): CuckooFilter[Array[Byte]] = { 168 | of(numberOfItems, falsePositiveRate, identity) 169 | } 170 | 171 | def of[A](numberOfItems: Long, falsePositiveRate: Double, f: A => Array[Byte]): CuckooFilter[A] = { 172 | val numEntriesPerBucket = optimalEntriesPerBucket(falsePositiveRate) 173 | val numBuckets: Long = optimalNumberOfBuckets(numberOfItems, numEntriesPerBucket) 174 | val numBitsPerEntry = optimalBitsPerEntry(falsePositiveRate, numEntriesPerBucket) 175 | val random = new Random() 176 | val maxRelocationAttempts = 500 177 | 178 | new CuckooFilter[A]( 179 | CuckooTable(numBuckets, numEntriesPerBucket, numBitsPerEntry), 180 | Hashes.XXHash.hash(_), 181 | f, 182 | random, 183 | maxRelocationAttempts 184 | ) 185 | } 186 | 187 | 188 | private def mod(x: Long, m: Long): Long = { 189 | val result = x % m 190 | if (result >= 0) result else result + m 191 | } 192 | 193 | val MAX_ENTRIES_PER_BUCKET = 8 194 | val MIN_ENTRIES_PER_BUCKET = 2 195 | 196 | /** 197 | * Minimum false positive probability supported, 
8.67E-19. 198 | * 199 | * CuckooFilter § 5.1 Eq. (6), "f ≥ log2(2b/e) = [log2(1/e) + log2(2b)]" 200 | * (b) entries per bucket: 8 at e <= 0.00001 201 | * (f) bits per entry: 64-bits max 202 | * (e) false positive probability 203 | * 204 | * 64 = log2(16/e) = [log2(1/e) + log2(16)] 205 | * 64 = log2(1/e) + 4 206 | * 60 = log2(1/e) 207 | * 2^60 = 1/e 208 | * e = 1/2^60 209 | * e = 8.673617379884035E-19 210 | */ 211 | val MIN_FPP = 1.0D / Math.pow(2, 60) 212 | 213 | /** 214 | * Maximum false positive probability supported, 0.99. 215 | */ 216 | val MAX_FPP = 0.99D 217 | 218 | /* 219 | * Space optimization cheat sheet, per CuckooFilter § 5.1 : 220 | * 221 | * Given: 222 | * n: expected insertions 223 | * e: expected false positive probability (e.g. 0.03D for 3% fpp) 224 | * 225 | * Choose: 226 | * b: bucket size in entries (2, 4, 8) 227 | * a: load factor (proportional to b) 228 | * 229 | * Calculate: 230 | * f: fingerprint size in bits 231 | * m: table size in buckets 232 | * 233 | * 234 | * 1) Choose b = 8 | 4 | 2 235 | * when e : 0.00001 < e ≤ 0.002 236 | * ref: CuckooFilter § 5.1 ¶ 5, "Optimal bucket size" 237 | * 238 | * 2) Choose a = 50% | 84% | 95.5% | 98% 239 | * when b = 1 | 2 | 4 | 8 240 | * ref: CuckooFilter § 5.1 ¶ 2, "(1) Larger buckets improve table occupancy" 241 | * 242 | * 2) Optimal f = ceil( log2(2b/e) ) 243 | * ref: CuckooFilter § 5.1 Eq. (6), "f ≥ log2(2b/e) = [log2(1/e) + log2(2b)]" 244 | * 245 | * 3) Required m = evenCeil( ceiling( ceiling( n/a ) / b ) ) 246 | * Minimum entries (B) = n/a rounded up 247 | * Minimum buckets (m) = B/b rounded up to an even number 248 | */ 249 | 250 | /** 251 | * Returns the optimal number of entries per bucket, or bucket size, ({@code b}) given the 252 | * expected false positive probability ({@code e}). 
253 | * 254 | * CuckooFilter § 5.1 ¶ 5, "Optimal bucket size" 255 | * 256 | * @param e the desired false positive probability (must be positive and less than 1.0) 257 | * @return optimal number of entries per bucket 258 | */ 259 | def optimalEntriesPerBucket(e: Double) = { 260 | require(e > 0.0D, "e must be > 0.0"); 261 | if (e <= 0.00001) { 262 | MAX_ENTRIES_PER_BUCKET 263 | } else if (e <= 0.002) { 264 | MAX_ENTRIES_PER_BUCKET / 2 265 | } else { 266 | MIN_ENTRIES_PER_BUCKET; 267 | } 268 | } 269 | 270 | /** 271 | * Returns the optimal load factor ({@code a}) given the number of entries per bucket ({@code 272 | * b}). 273 | * 274 | * CuckooFilter § 5.1 ¶ 2, "(1) Larger buckets improve table occupancy" 275 | * 276 | * @param b number of entries per bucket 277 | * @return load factor, positive and less than 1.0 278 | */ 279 | def optimalLoadFactor(b: Int): Double = { 280 | require(b == 2 || b == 4 || b == 8, "b must be 2, 4, or 8"); 281 | if (b == 2) { 282 | 0.84D 283 | } else if (b == 4) { 284 | 0.955D 285 | } else { 286 | 0.98D 287 | } 288 | } 289 | 290 | private val log2 = (x: Double) => Math.log10(x)/ Math.log10(2.0) 291 | 292 | def optimalBitsPerEntry(e: Double, b: Int): Int = { 293 | require(e >= MIN_FPP, "Cannot create CuckooFilter with FPP[" + e + 294 | "] < CuckooFilter.MIN_FPP[" + CuckooFilter.MIN_FPP + "]"); 295 | val d = log2(2 * b / e) 296 | d.round.toInt 297 | } 298 | 299 | def optimalNumberOfBuckets(n: Long,b: Int): Long = { 300 | require(n > 0, "n must be > 0"); 301 | val x = Math.ceil((math.ceil(n / optimalLoadFactor(b)) / b)).toLong 302 | (x + 1) / 2 * 2 303 | } 304 | 305 | } -------------------------------------------------------------------------------- /core/src/main/scala/io/chrisdavenport/probabilistic/mutable/CuckooTable.scala: -------------------------------------------------------------------------------- 1 | package io.chrisdavenport.probabilistic.mutable 2 | 3 | import scala.util.control.Breaks 4 | 5 | class CuckooTable private ( 6 | 
private[mutable] val data: ThreadSafeBitSet, 7 | val numBuckets: Long, 8 | val numEntriesPerBucket: Int, 9 | val numBitsPerEntry: Int 10 | ){ 11 | import CuckooTable._ 12 | 13 | // 0 indexed 14 | private def bitOffSet(bucket: Long, entry: Int): Long = { 15 | ((bucket * numEntriesPerBucket) + entry) * numBitsPerEntry 16 | } 17 | 18 | def readEntry(bucket: Long, entry: Int): Int = { 19 | require(bucket <= numBuckets) 20 | require(entry <= numEntriesPerBucket) 21 | val offset = bitOffSet(bucket, entry) 22 | val positions = for { 23 | x <- (offset until offset + numBitsPerEntry).toList 24 | if (data.get(x)) 25 | } yield x - offset 26 | fromBitPositions(positions.map(_.toInt)) 27 | } 28 | 29 | def findEntry(bucket: Long, value: Int): Option[Int] = { 30 | val break = new Breaks 31 | var entry = Option.empty[Int] 32 | break.breakable{ 33 | for { 34 | i <- 0 until numEntriesPerBucket 35 | } { 36 | val x = readEntry(bucket, i) 37 | if (x == value) { 38 | entry = Some(i) 39 | break.break() 40 | } 41 | } 42 | } 43 | entry 44 | } 45 | 46 | // 0 indexed 47 | def writeEntry(bucket: Long, entry: Int, value: Int): Int = { 48 | require(bucket <= numBuckets) 49 | require(entry <= numEntriesPerBucket, "Entry Higher Than Allowed") 50 | // Expensive... 
But unsafe otherwise
    // 0-indexed bit positions: a value that fits in numBitsPerEntry bits has its
    // highest set bit at position numBitsPerEntry - 1, hence strict `<`.
    require(highestBitPosition(value) < numBitsPerEntry, "Bits of this value are too large")

    val x = readEntry(bucket, entry) // TODO Race Condition - Make atomic or keysemaphore on the combination
    val offset = bitOffSet(bucket, entry)
    val newEntrySet = bitPositions(value).map(offset + _).toSet
    val oldEntrySet = bitPositions(x).map(offset + _).toSet
    val entryBits = (offset until (offset + numBitsPerEntry)).toList
    for {
      bit <- entryBits
    } {
      if (newEntrySet.contains(bit)) {
        data.set(bit)
      }
      else if (oldEntrySet.contains(bit)){
        data.clear(bit)
      }
    }
    x
  }

  // Swaps the first entry in `bucket` holding `valueOut` for `valueIn`.
  // Returns false when no entry in the bucket holds `valueOut`.
  def swapAnyEntry(bucket: Long, valueIn: Int, valueOut: Int): Boolean = {
    findEntry(bucket, valueOut)
      .map(writeEntry(bucket, _, valueIn))
      .map{i =>
        val x = i == valueOut
        assert(x, s"Value Out Was Incorrect got $i expected $valueOut")
        true
      }.getOrElse(false)
  }
}

object CuckooTable {

  val EMPTY_ENTRY: Int = 0x00

  // Positions (0-indexed, least significant first) of the set bits of `int`.
  private[mutable] def bitPositions(int: Int): List[Int] = {
    val buffer = new scala.collection.mutable.ListBuffer[Int]()
    var number = int
    var position = 0
    while (number != 0){
      if ((number & 1) != 0) {
        // bug fix: `buffer :+ position` builds a NEW collection and discards it,
        // so this method always returned Nil; `+=` mutates the buffer in place.
        buffer += position
      }
      position += 1
      number = number >>> 1
    }
    buffer.toList
  }

  // Highest set bit position of `int`, or 0 when no bit is set.
  private[mutable] def highestBitPosition(int: Int): Int = {
    // bug fix: bitPositions emits positions least-significant first, so the
    // highest set bit is the LAST element, not the head.
    bitPositions(int).lastOption.getOrElse(0)
  }

  // Inverse of bitPositions: rebuild an Int from its set-bit positions.
  private[mutable] def fromBitPositions(l: List[Int]): Int = {
    var x = 0x00
    l.foreach{bitPosition =>
      val mask = 1 << bitPosition
      x = x | mask
    }
    x
  }

  // Int serves a fingerprint

  def apply(
    numBuckets: Long, // X
    numEntriesPerBucket: Int,
    numBitsPerEntry: Int
  ): CuckooTable = new CuckooTable(
    ThreadSafeBitSet(numBitsToPreallocate = numBuckets * numEntriesPerBucket * 
numBitsPerEntry), 121 | numBuckets, numEntriesPerBucket, numBitsPerEntry 122 | ) 123 | } -------------------------------------------------------------------------------- /core/src/main/scala/io/chrisdavenport/probabilistic/mutable/ThreadSafeBitSet.scala: -------------------------------------------------------------------------------- 1 | package io.chrisdavenport.probabilistic 2 | package mutable 3 | 4 | import java.util.concurrent.atomic.AtomicLongArray 5 | import java.util.concurrent.atomic.AtomicReference 6 | import scala.util.control.Breaks 7 | import scala.util.hashing.MurmurHash3 8 | import scala.collection.BitSet 9 | 10 | // More Like a BitVector that a bitset, but the name is what Scala calls this 11 | class ThreadSafeBitSet private ( 12 | private final val numLongsPerSegment: Int, 13 | private final val log2SegmentSize: Int, 14 | private final val segmentMask: Int, 15 | private final val segments: AtomicReference[ThreadSafeBitSet.ThreadSafeBitSegments] 16 | ) { // TODO extends scala.collection.mutable.BitSet 17 | 18 | /* 19 | * -------------------- 20 | * Modifications 21 | * -------------------- 22 | */ 23 | def set(position: Long): Unit = { 24 | val segmentPosition = position >>> log2SegmentSize // which segment -- div by num bits per segment 25 | val longPosition = (position >>> 6) & segmentMask // which long in the segment -- remainder of div by num bits per segment 26 | val bitPosition = position & 0x3F // which bit in the long -- remainder of div by num bits in long (64) -- positive bits 27 | val segment = getSegment(segmentPosition.toInt) 28 | val mask = 1L << bitPosition 29 | var retry = true 30 | while (retry){ 31 | val currentLongValue = segment.get(longPosition.toInt) 32 | val newLongValue = currentLongValue | mask 33 | if (segment.compareAndSet(longPosition.toInt, currentLongValue, newLongValue)){ 34 | retry = false 35 | } 36 | } 37 | } 38 | 39 | def clear(position: Long): Unit = { 40 | val segmentPosition = position >>> log2SegmentSize // 
which segment -- div by num bits per segment 41 | val longPosition = (position >>> 6) & segmentMask // which long in the segment -- remainder of div by num bits per segment 42 | val bitPosition = position & 0x3F /// which bit in the long -- remainder of div by num bits in long (64) 43 | val segment = getSegment(segmentPosition.toInt) 44 | val mask = ~(1L << bitPosition) 45 | var retry = true 46 | while (retry){ 47 | val currentLongValue = segment.get(longPosition.toInt) 48 | val newLongValue = currentLongValue & mask 49 | if (segment.compareAndSet(longPosition.toInt, currentLongValue, newLongValue)){ 50 | retry = false 51 | } 52 | } 53 | } 54 | def get(position: Long): Boolean = { 55 | val segmentPosition = position >>> log2SegmentSize // which segment -- div by num bits per segment 56 | val longPosition = (position >>> 6) & segmentMask // which long in the segment -- remainder of div by num bits per segment 57 | val bitPosition = position & 0x3F /// which bit in the long -- remainder of div by num bits in long (64) 58 | val segment = getSegment(segmentPosition.toInt) 59 | val mask = 1L << bitPosition 60 | (segment.get(longPosition.toInt) & mask) != 0 61 | } 62 | 63 | /** 64 | * Clear all bits to 0. 
65 | */ 66 | def clearAll(): Unit = { 67 | val visibleSegments = segments.get 68 | for { 69 | i <- 0 until visibleSegments.numSegments 70 | segment = visibleSegments.getSegment(i) 71 | j <- 0 until segment.length() 72 | } { 73 | segment.set(j, 0L) 74 | } 75 | } 76 | 77 | /* 78 | * -------------------- 79 | * Informational 80 | * -------------------- 81 | */ 82 | 83 | def maxSetBit: Long = { 84 | val breaks = new Breaks 85 | val viewableSegments = segments.get() 86 | var bitPosition = -1L 87 | breaks.breakable{ 88 | for { 89 | segmentIdx <- (viewableSegments.numSegments - 1) to 0 by -1 90 | segment = viewableSegments.getSegment(segmentIdx) 91 | longIdx <- (segment.length() - 1) to 0 by -1 92 | } { 93 | val l = segment.get(longIdx) 94 | if (l != 0) { 95 | bitPosition = (segmentIdx.toLong << log2SegmentSize) + (longIdx * 64) + (63 - java.lang.Long.numberOfLeadingZeros(l)) 96 | breaks.break() 97 | } 98 | } 99 | } 100 | bitPosition 101 | } 102 | 103 | def nextSetBit(fromIndex: Long): Long = { 104 | require(fromIndex >= 0, s"fromIndex must be >= 0: got $fromIndex") 105 | var segmentPosition = fromIndex >>> log2SegmentSize 106 | val viewableSegments = segments.get() 107 | if (segmentPosition >= viewableSegments.numSegments) -1 108 | else { 109 | var longPosition = (fromIndex >>> 6) & segmentMask // which long in the segment -- remainder of div by num bits per segment 110 | val bitPosition = fromIndex & 0x3F // which bit in the long -- remainder of div by num bits in long (64) 111 | var segment = viewableSegments.getSegment(segmentPosition.toInt) 112 | var word = segment.get(longPosition.toInt) & (0xffffffffffffffffL << bitPosition) 113 | var response = -1L 114 | var loop = true 115 | while (loop) { 116 | if (word != 0) { 117 | response = (segmentPosition << (log2SegmentSize)) + (longPosition << 6) + java.lang.Long.numberOfTrailingZeros(word) 118 | loop = false 119 | } else { 120 | longPosition += 1 121 | if (longPosition > segmentMask) { 122 | segmentPosition += 1 123 | 
if (segmentPosition >= viewableSegments.numSegments) { 124 | loop = false 125 | // No bits set, return - 126 | } else { 127 | segment = viewableSegments.getSegment(segmentPosition.toInt) 128 | longPosition = 0 129 | word = segment.get(longPosition.toInt) 130 | } 131 | } else { 132 | word = segment.get(longPosition.toInt) 133 | } 134 | } 135 | } 136 | 137 | response 138 | } 139 | } 140 | 141 | /** 142 | * The numbers of bits which are set in this bit set. 143 | **/ 144 | def cardinality: Long = { 145 | val viewableSegments = segments.get() 146 | var numSetBits = 0L 147 | for { 148 | i <- 0 until viewableSegments.numSegments 149 | segment = viewableSegments.getSegment(i) 150 | j <- 0 until segment.length() 151 | } { 152 | numSetBits += java.lang.Long.bitCount(segment.get(j)) 153 | } 154 | numSetBits 155 | } 156 | 157 | /** 158 | * The number of bits which are currently specified by this bit set. This 159 | * is the maximum number which you might need to iterate if you were to 160 | * iterate over all the bits in this set. 
161 | */ 162 | def currentCapacity: Int = 163 | segments.get().numSegments * (1 << log2SegmentSize) 164 | 165 | 166 | def eqv(other: ThreadSafeBitSet): Boolean = { 167 | require(other.log2SegmentSize == log2SegmentSize, "Segment sizes must be the same") 168 | val thisSegments = segments.get 169 | val otherSegments = other.segments.get 170 | var allEqual = true 171 | 172 | val breaks = new Breaks 173 | 174 | breaks.breakable{ 175 | // Check All of That Equal to All of This 176 | for { 177 | i <- 0 until thisSegments.numSegments 178 | thisArray = thisSegments.getSegment(i) 179 | otherArray = { 180 | if (i < otherSegments.numSegments) Some(otherSegments.getSegment(i)) 181 | else None 182 | } 183 | j <- 0 until thisArray.length() 184 | } { 185 | val thisLong = thisArray.get(j) 186 | val otherLong = otherArray.map(_.get(j)).getOrElse(0L) 187 | if (thisLong != otherLong) { 188 | allEqual = false 189 | breaks.break() 190 | } 191 | } 192 | // Check that anything left in that is equal to 0 193 | for { 194 | i <- thisSegments.numSegments until otherSegments.numSegments 195 | otherArray = otherSegments.getSegment(i) 196 | j <- 0 until otherArray.length 197 | } { 198 | val l = otherArray.get(j) 199 | if (l != 0) { 200 | allEqual = false 201 | breaks.break() 202 | } 203 | } 204 | } 205 | 206 | allEqual 207 | } 208 | 209 | /** 210 | * Return a new bit set which contains all bits which are contained in this bit set, and which are NOT contained in the `other` bit set. 211 | * 212 | * In other words, return a new bit set, which is a bitwise and with the bitwise not of the other bit set. 
213 | * 214 | */ 215 | def andNot(other: ThreadSafeBitSet): ThreadSafeBitSet = { 216 | require(other.log2SegmentSize == log2SegmentSize, "Segment sizes must be the same") 217 | val thisSegments = segments.get() 218 | val otherSegments = other.segments.get() 219 | val newSegments = ThreadSafeBitSet.ThreadSafeBitSegments(thisSegments.numSegments, numLongsPerSegment) 220 | for { 221 | i <- 0 until thisSegments.numSegments 222 | thisArray = thisSegments.getSegment(i) 223 | otherArray = { 224 | if (i < otherSegments.numSegments) Some(otherSegments.getSegment(i)) 225 | else None 226 | } 227 | newArray = newSegments.getSegment(i) 228 | j <- 0 until thisArray.length() 229 | } { 230 | val thisLong = thisArray.get(j) 231 | val otherLong = otherArray.fold(0L)(a => a.get(j)) 232 | newArray.set(j, thisLong & ~ otherLong) 233 | } 234 | val andNot = ThreadSafeBitSet(log2SegmentSize) 235 | andNot.segments.set(newSegments) 236 | andNot 237 | } 238 | 239 | // Get the segment at `segmentIndex`. If this segment does not yet exist, create it. 240 | def getSegment(segmentIndex: Int): AtomicLongArray = { 241 | var visibleSegments = segments.get 242 | 243 | while(visibleSegments.numSegments <= segmentIndex){ 244 | // Thread safety: newVisibleSegments contains all of the segments from the currently visible segments, plus extra. 245 | // all of the segments in the currently visible segments are canonical and will not change. 246 | val newVisibleSegments = ThreadSafeBitSet.ThreadSafeBitSegments(visibleSegments, segmentIndex + 1, numLongsPerSegment) 247 | 248 | // because we are using a compareAndSet, if this thread "wins the race" and successfully sets this variable, then the segments 249 | // which are newly defined in newVisibleSegments become canonical. 
if (segments.compareAndSet(visibleSegments, newVisibleSegments)) {
        visibleSegments = newVisibleSegments
      } else {
        // If we "lose the race" and are growing the ThreadSafeBitSet segments larger,
        // then we will gather the new canonical sets from the update which we missed on the next iteration of this loop.
        // Newly defined segments in newVisibleSegments will be discarded, they do not get to become canonical.
        visibleSegments = segments.get();
      }
    }

    visibleSegments.getSegment(segmentIndex)
  }

  override def equals(obj: Any): Boolean = obj match {
    case that: ThreadSafeBitSet => eqv(that)
    case _ => false
  }

  override def hashCode(): Int = {
    31 * log2SegmentSize +
      MurmurHash3.arrayHash(segments.get().segments)
  }

  // Only works if Int Capable values are there
  def toMutableBitSet: scala.collection.mutable.BitSet = {
    val resultSet = scala.collection.mutable.BitSet.empty
    var ordinal = nextSetBit(0)
    while(ordinal != -1){
      resultSet.add(ordinal.toInt)
      ordinal = nextSetBit(ordinal + 1)
    }
    resultSet
  }

  // Renders the positions of all set bits, e.g. "ThreadSafeBitSet(1,5,64)".
  override def toString(): String = {
    val longs = new scala.collection.mutable.ListBuffer[Long]()
    var ordinal = nextSetBit(0)
    while(ordinal != -1L){
      // bug fix: `longs :+ ordinal` builds a new collection and discards it, so
      // toString always rendered an empty list; `+=` appends in place.
      longs += ordinal
      ordinal = nextSetBit(ordinal + 1)
    }
    "ThreadSafeBitSet(" + longs.mkString(",") + ")"
  }
}

object ThreadSafeBitSet {
  val DEFAULT_LOG2_SEGMENT_SIZE_IN_BITS = 14

  // TODO: Overloads
  // def apply(): ThreadSafeBitSet = apply(DEFAULT_LOG2_SEGMENT_SIZE_IN_BITS)
  // def apply(log2SegmentSize: Int): ThreadSafeBitSet = apply(log2SegmentSize, 0)
  def apply(
    log2SegmentSizeInBits: Int = DEFAULT_LOG2_SEGMENT_SIZE_IN_BITS,
    numBitsToPreallocate: Long = 0L
  ): ThreadSafeBitSet = {
    require(log2SegmentSizeInBits > 6, "Cannot 
specify fewer than 64 bits in each segment!") 306 | val log2SegmentSize = log2SegmentSizeInBits 307 | val numLongsPerSegment = (1 << (log2SegmentSizeInBits - 6)) 308 | val segmentMask = numLongsPerSegment - 1 309 | val numBitsPerSegment = numLongsPerSegment * 64 310 | val numSegmentsToPreallocate = 311 | if (numBitsToPreallocate == 0) 1 312 | else ((numBitsToPreallocate - 1) / numBitsPerSegment) + 1 313 | val segments = new AtomicReference[ThreadSafeBitSegments]() 314 | segments.set(ThreadSafeBitSegments(numSegmentsToPreallocate.toInt, numLongsPerSegment)) 315 | 316 | new ThreadSafeBitSet(numLongsPerSegment, log2SegmentSize, segmentMask, segments) 317 | } 318 | 319 | def fromBitSet( 320 | bitSet: BitSet, 321 | log2SegmentSize: Int = DEFAULT_LOG2_SEGMENT_SIZE_IN_BITS 322 | ): ThreadSafeBitSet = { 323 | val tsb = apply(log2SegmentSize, bitSet.size.toLong) 324 | bitSet.foreach(i => 325 | tsb.set(i.toLong) 326 | ) 327 | tsb 328 | } 329 | 330 | // def orAll(bitSets: ThreadSafeBitSet*): ThreadSafeBitSet = { 331 | // ??? 
332 | // } 333 | 334 | private class ThreadSafeBitSegments private (private[ThreadSafeBitSet] final val segments: Array[AtomicLongArray]){ 335 | def numSegments = segments.length 336 | def getSegment(index: Int) = segments(index) 337 | } 338 | private object ThreadSafeBitSegments { 339 | def apply(numSegments: Int, segmentLength: Int) = { 340 | val segments = new Array[AtomicLongArray](numSegments) 341 | for(i <- 0 until numSegments) { 342 | segments.update(i, new AtomicLongArray(segmentLength)) 343 | } 344 | new ThreadSafeBitSegments(segments) 345 | } 346 | def apply(copyFrom: ThreadSafeBitSegments, numSegments: Int, segmentLength: Int) = { 347 | val segments = new Array[AtomicLongArray](numSegments) 348 | for(i <- 0 until numSegments) { 349 | val set = if (i < copyFrom.numSegments) copyFrom.getSegment(i) else new AtomicLongArray(segmentLength) 350 | segments.update(i, set) 351 | } 352 | new ThreadSafeBitSegments(segments) 353 | } 354 | } 355 | } -------------------------------------------------------------------------------- /core/src/test/scala/io/chrisdavenport/probabilistic/MainSpec.scala: -------------------------------------------------------------------------------- 1 | package io.chrisdavenport.probabilistic 2 | 3 | import munit.CatsEffectSuite 4 | import cats.effect._ 5 | 6 | class MainSpec extends CatsEffectSuite { 7 | 8 | test("Main should exit succesfully") { 9 | assertIO(IO(true), true) 10 | } 11 | 12 | } 13 | -------------------------------------------------------------------------------- /examples/src/main/scala/BloomExample.scala: -------------------------------------------------------------------------------- 1 | 2 | import cats.effect._ 3 | import io.chrisdavenport.probabilistic.BloomFilter 4 | 5 | object BloomExample extends IOApp { 6 | 7 | def run(args: List[String]): IO[ExitCode] = { 8 | val x = "Foo" 9 | for { 10 | bf <- BloomFilter.string[IO](numberOfItems = 10000, falsePositiveRate = 0.01) 11 | present1 <- bf.mayContain(x) // False - It 
hasn't been inserted yet 12 | _ <- IO(println(present1)) 13 | _ <- bf.add(x) 14 | present2 <- bf.mayContain(x) // True - It was inserted 15 | _ <- IO(println(present2)) 16 | } yield ExitCode.Success 17 | } 18 | 19 | } -------------------------------------------------------------------------------- /examples/src/main/scala/CuckooExample.scala: -------------------------------------------------------------------------------- 1 | import io.chrisdavenport.probabilistic.CuckooFilter 2 | import cats.effect._ 3 | 4 | object CuckooExample extends IOApp { 5 | 6 | def run(args: List[String]): IO[ExitCode] = { 7 | val x = "Foo" 8 | for { 9 | cf <- CuckooFilter.string[IO](numberOfItems = 10000, falsePositiveRate = 0.01) 10 | present1 <- cf.mayContain(x) // False - It hasn't been inserted yet 11 | _ <- IO(println(present1)) 12 | _ <- cf.add(x) 13 | present2 <- cf.mayContain(x) // True - It was inserted 14 | _ <- IO(println(present2)) 15 | _ <- cf.remove(x) 16 | present3 <- cf.mayContain(x) // False - It was removed again. Cool! 17 | _ <- IO(println(present3)) 18 | } yield ExitCode.Success 19 | } 20 | 21 | } -------------------------------------------------------------------------------- /licenses/apache: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. 
For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 
47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. 
Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. 
You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 
122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. 
In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "{}" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. 
We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright {yyyy} {name of copyright owner} 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | -------------------------------------------------------------------------------- /project/build.properties: -------------------------------------------------------------------------------- 1 | sbt.version=1.4.9 2 | -------------------------------------------------------------------------------- /project/plugins.sbt: -------------------------------------------------------------------------------- 1 | // addSbtPlugin("io.github.davidgregory084" % "sbt-tpolecat" % "0.1.16") 2 | addSbtPlugin("com.typesafe" % "sbt-mima-plugin" % "0.8.1") 3 | addSbtPlugin("io.chrisdavenport" % "sbt-mima-version-check" % "0.1.2") 4 | addSbtPlugin("io.chrisdavenport" % "sbt-no-publish" % "0.1.0") 5 | 6 | addSbtPlugin("org.portable-scala" % "sbt-scalajs-crossproject" % "1.0.0") 7 | addSbtPlugin("org.scala-js" % "sbt-scalajs" % "1.4.0") 8 | addSbtPlugin("com.github.cb372" % "sbt-explicit-dependencies" % "0.2.16") 9 | addSbtPlugin("com.geirsson" % "sbt-ci-release" % "1.5.5") 10 | addSbtPlugin("ch.epfl.lamp" % "sbt-dotty" % "0.5.1") 11 | addSbtPlugin("com.codecommit" % "sbt-github-actions" % "0.9.5") 12 | 13 | addSbtPlugin("org.scalameta" % "sbt-mdoc" 
% "2.2.16") 14 | addSbtPlugin("com.47deg" % "sbt-microsites" % "1.3.0") 15 | addSbtPlugin("com.typesafe.sbt" % "sbt-ghpages" % "0.6.3") -------------------------------------------------------------------------------- /site/Gemfile: -------------------------------------------------------------------------------- 1 | source 'https://rubygems.org' 2 | 3 | gem "jekyll", ">= 4.0.0" 4 | gem "jekyll-relative-links" 5 | gem "sass" -------------------------------------------------------------------------------- /site/docs/index.md: -------------------------------------------------------------------------------- 1 | --- 2 | layout: home 3 | 4 | --- 5 | 6 | # probabilistic - Probabilistic Data Structures [![Build Status](https://travis-ci.com/ChristopherDavenport/probabilistic.svg?branch=master)](https://travis-ci.com/ChristopherDavenport/probabilistic) [![Maven Central](https://maven-badges.herokuapp.com/maven-central/io.chrisdavenport/probabilistic_2.12/badge.svg)](https://maven-badges.herokuapp.com/maven-central/io.chrisdavenport/probabilistic_2.12) 7 | 8 | ## Quick Start 9 | 10 | To use probabilistic in an existing SBT project with Scala 2.12 or a later version, add the following dependencies to your 11 | `build.sbt` depending on your needs: 12 | 13 | ```scala 14 | libraryDependencies ++= Seq( 15 | "io.chrisdavenport" %% "probabilistic" % "" 16 | ) 17 | ``` --------------------------------------------------------------------------------