├── .github └── workflows │ ├── ci.yml │ └── clean.yml ├── .gitignore ├── CODE_OF_CONDUCT.md ├── LICENSE ├── NOTICE ├── README.md ├── build.sbt ├── core └── src │ ├── main │ └── scala │ │ └── io │ │ └── chrisdavenport │ │ └── probabilistic │ │ ├── BloomFilter.scala │ │ ├── CuckooFilter.scala │ │ ├── hashes │ │ └── Hashes.scala │ │ └── mutable │ │ ├── BloomFilter.scala │ │ ├── CuckooFilter.scala │ │ ├── CuckooTable.scala │ │ └── ThreadSafeBitSet.scala │ └── test │ └── scala │ └── io │ └── chrisdavenport │ └── probabilistic │ └── MainSpec.scala ├── examples └── src │ └── main │ └── scala │ ├── BloomExample.scala │ └── CuckooExample.scala ├── licenses └── apache ├── project ├── build.properties └── plugins.sbt └── site ├── Gemfile └── docs └── index.md /.github/workflows/ci.yml: -------------------------------------------------------------------------------- 1 | # This file was automatically generated by sbt-github-actions using the 2 | # githubWorkflowGenerate task. You should add and commit this file to 3 | # your git repository. It goes without saying that you shouldn't edit 4 | # this file by hand! Instead, if you wish to make changes, you should 5 | # change your sbt build configuration to revise the workflow description 6 | # to meet your needs, then regenerate this file. 
7 | 8 | name: Continuous Integration 9 | 10 | on: 11 | pull_request: 12 | branches: ['*'] 13 | push: 14 | branches: ['*'] 15 | tags: [v*] 16 | 17 | env: 18 | GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} 19 | 20 | jobs: 21 | build: 22 | name: Build and Test 23 | strategy: 24 | matrix: 25 | os: [ubuntu-latest] 26 | scala: [2.12.13, 2.13.3] 27 | java: [adopt@1.8] 28 | runs-on: ${{ matrix.os }} 29 | steps: 30 | - name: Checkout current branch (full) 31 | uses: actions/checkout@v2 32 | with: 33 | fetch-depth: 0 34 | 35 | - name: Setup Java and Scala 36 | uses: olafurpg/setup-scala@v10 37 | with: 38 | java-version: ${{ matrix.java }} 39 | 40 | - name: Cache sbt 41 | uses: actions/cache@v2 42 | with: 43 | path: | 44 | ~/.sbt 45 | ~/.ivy2/cache 46 | ~/.coursier/cache/v1 47 | ~/.cache/coursier/v1 48 | ~/AppData/Local/Coursier/Cache/v1 49 | ~/Library/Caches/Coursier/v1 50 | key: ${{ runner.os }}-sbt-cache-v2-${{ hashFiles('**/*.sbt') }}-${{ hashFiles('project/build.properties') }} 51 | 52 | - name: Setup Ruby 53 | if: matrix.scala == '2.13.3' 54 | uses: ruby/setup-ruby@v1 55 | with: 56 | ruby-version: 2.6.0 57 | 58 | - name: Install microsite dependencies 59 | if: matrix.scala == '2.13.3' 60 | run: | 61 | gem install sass 62 | gem install jekyll -v 3.2.1 63 | 64 | - name: Check that workflows are up to date 65 | run: sbt ++${{ matrix.scala }} githubWorkflowCheck 66 | 67 | - run: sbt ++${{ matrix.scala }} test mimaReportBinaryIssues 68 | 69 | - if: matrix.scala == '2.13.3' 70 | run: sbt ++${{ matrix.scala }} site/makeMicrosite 71 | 72 | publish: 73 | name: Publish Artifacts 74 | needs: [build] 75 | if: github.event_name != 'pull_request' && (startsWith(github.ref, 'refs/tags/v')) 76 | strategy: 77 | matrix: 78 | os: [ubuntu-latest] 79 | scala: [2.13.3] 80 | java: [adopt@1.8] 81 | runs-on: ${{ matrix.os }} 82 | steps: 83 | - name: Checkout current branch (full) 84 | uses: actions/checkout@v2 85 | with: 86 | fetch-depth: 0 87 | 88 | - name: Setup Java and Scala 89 | uses: 
olafurpg/setup-scala@v10 90 | with: 91 | java-version: ${{ matrix.java }} 92 | 93 | - name: Cache sbt 94 | uses: actions/cache@v2 95 | with: 96 | path: | 97 | ~/.sbt 98 | ~/.ivy2/cache 99 | ~/.coursier/cache/v1 100 | ~/.cache/coursier/v1 101 | ~/AppData/Local/Coursier/Cache/v1 102 | ~/Library/Caches/Coursier/v1 103 | key: ${{ runner.os }}-sbt-cache-v2-${{ hashFiles('**/*.sbt') }}-${{ hashFiles('project/build.properties') }} 104 | 105 | - uses: olafurpg/setup-gpg@v3 106 | 107 | - name: Setup Ruby 108 | uses: ruby/setup-ruby@v1 109 | with: 110 | ruby-version: 2.6.0 111 | 112 | - name: Install microsite dependencies 113 | run: | 114 | gem install sass 115 | gem install jekyll -v 3.2.1 116 | 117 | - name: Publish artifacts to Sonatype 118 | env: 119 | PGP_PASSPHRASE: ${{ secrets.PGP_PASSPHRASE }} 120 | PGP_SECRET: ${{ secrets.PGP_SECRET }} 121 | SONATYPE_PASSWORD: ${{ secrets.SONATYPE_PASSWORD }} 122 | SONATYPE_USERNAME: ${{ secrets.SONATYPE_USERNAME }} 123 | run: sbt ++${{ matrix.scala }} ci-release 124 | 125 | - name: Publish microsite 126 | run: sbt ++${{ matrix.scala }} site/publishMicrosite -------------------------------------------------------------------------------- /.github/workflows/clean.yml: -------------------------------------------------------------------------------- 1 | # This file was automatically generated by sbt-github-actions using the 2 | # githubWorkflowGenerate task. You should add and commit this file to 3 | # your git repository. It goes without saying that you shouldn't edit 4 | # this file by hand! Instead, if you wish to make changes, you should 5 | # change your sbt build configuration to revise the workflow description 6 | # to meet your needs, then regenerate this file. 
7 | 8 | name: Clean 9 | 10 | on: push 11 | 12 | jobs: 13 | delete-artifacts: 14 | name: Delete Artifacts 15 | runs-on: ubuntu-latest 16 | env: 17 | GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} 18 | steps: 19 | - name: Delete artifacts 20 | run: | 21 | # Customize those three lines with your repository and credentials: 22 | REPO=${GITHUB_API_URL}/repos/${{ github.repository }} 23 | 24 | # A shortcut to call GitHub API. 25 | ghapi() { curl --silent --location --user _:$GITHUB_TOKEN "$@"; } 26 | 27 | # A temporary file which receives HTTP response headers. 28 | TMPFILE=/tmp/tmp.$$ 29 | 30 | # An associative array, key: artifact name, value: number of artifacts of that name. 31 | declare -A ARTCOUNT 32 | 33 | # Process all artifacts on this repository, loop on returned "pages". 34 | URL=$REPO/actions/artifacts 35 | while [[ -n "$URL" ]]; do 36 | 37 | # Get current page, get response headers in a temporary file. 38 | JSON=$(ghapi --dump-header $TMPFILE "$URL") 39 | 40 | # Get URL of next page. Will be empty if we are at the last page. 41 | URL=$(grep '^Link:' "$TMPFILE" | tr ',' '\n' | grep 'rel="next"' | head -1 | sed -e 's/.*<//' -e 's/>.*//') 42 | rm -f $TMPFILE 43 | 44 | # Number of artifacts on this page: 45 | COUNT=$(( $(jq <<<$JSON -r '.artifacts | length') )) 46 | 47 | # Loop on all artifacts on this page. 48 | for ((i=0; $i < $COUNT; i++)); do 49 | 50 | # Get name of artifact and count instances of this name. 
51 | name=$(jq <<<$JSON -r ".artifacts[$i].name?") 52 | ARTCOUNT[$name]=$(( $(( ${ARTCOUNT[$name]} )) + 1)) 53 | 54 | id=$(jq <<<$JSON -r ".artifacts[$i].id?") 55 | size=$(( $(jq <<<$JSON -r ".artifacts[$i].size_in_bytes?") )) 56 | printf "Deleting '%s' #%d, %'d bytes\n" $name ${ARTCOUNT[$name]} $size 57 | ghapi -X DELETE $REPO/actions/artifacts/$id 58 | done 59 | done -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | target/ 2 | .idea/ 3 | # vim 4 | *.sw? 5 | 6 | # Ignore [ce]tags files 7 | tags 8 | 9 | .bloop 10 | .metals 11 | metals.sbt 12 | .vscode 13 | .bsp -------------------------------------------------------------------------------- /CODE_OF_CONDUCT.md: -------------------------------------------------------------------------------- 1 | # Code of Conduct 2 | 3 | We are committed to providing a friendly, safe and welcoming environment for all, regardless of level of experience, gender, gender identity and expression, sexual orientation, disability, personal appearance, body size, race, ethnicity, age, religion, nationality, or other such characteristics. 4 | 5 | Everyone is expected to follow the [Scala Code of Conduct] when discussing the project on the available communication channels. If you are being harassed, please contact us immediately so that we can support you. 6 | 7 | ## Moderation 8 | 9 | Any questions, concerns, or moderation requests please contact a member of the project. 
10 | 11 | - [Christopher Davenport](mailto:chris@christopherdavenport.tech) 12 | 13 | [Scala Code of Conduct]: https://www.scala-lang.org/conduct/ 14 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Copyright 2021 Christopher Davenport 2 | 3 | Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: 4 | 5 | The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. 6 | 7 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
-------------------------------------------------------------------------------- /NOTICE: -------------------------------------------------------------------------------- 1 | probabilistic 2 | Copyright 2021 Christopher Davenport 3 | Licensed under the MIT license (see LICENSE) 4 | 5 | This software contains portions of code derived from guava-probably 6 | https://github.com/bdupras/guava-probably 7 | Copyright (C) 2015 Brian Dupras 8 | Licensed under Apache License 2.0 (licenses/apache) 9 | 10 | This software contains portions of code derived from hollow 11 | https://github.com/Netflix/hollow 12 | Copyright (C) 2016-2019 Netflix, Inc. 13 | Licensed under Apache License 2.0 (licenses/apache) -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # probabilistic - Probabilistic Data Structures [![Build Status](https://travis-ci.com/ChristopherDavenport/probabilistic.svg?branch=master)](https://travis-ci.com/ChristopherDavenport/probabilistic) [![Maven Central](https://maven-badges.herokuapp.com/maven-central/io.chrisdavenport/probabilistic_2.12/badge.svg)](https://maven-badges.herokuapp.com/maven-central/io.chrisdavenport/probabilistic_2.12) ![Code of Conduct](https://img.shields.io/badge/Code%20of%20Conduct-Scala-blue.svg) 2 | 3 | ## [Head on over to the microsite](https://ChristopherDavenport.github.io/probabilistic) 4 | 5 | ## Quick Start 6 | 7 | To use probabilistic in an existing SBT project with Scala 2.12 or a later version, add the following dependencies to your 8 | `build.sbt` depending on your needs: 9 | 10 | ```scala 11 | libraryDependencies ++= Seq( 12 | "io.chrisdavenport" %% "probabilistic" % "<version>" 13 | ) 14 | ``` 15 | -------------------------------------------------------------------------------- /build.sbt: -------------------------------------------------------------------------------- 1 | import 
sbtcrossproject.CrossPlugin.autoImport.{crossProject, CrossType} 2 | 3 | val Scala213 = "2.13.3" 4 | 5 | ThisBuild / crossScalaVersions := Seq("2.12.13", Scala213) 6 | ThisBuild / scalaVersion := crossScalaVersions.value.last 7 | 8 | ThisBuild / githubWorkflowArtifactUpload := false 9 | 10 | val Scala213Cond = s"matrix.scala == '$Scala213'" 11 | 12 | def rubySetupSteps(cond: Option[String]) = Seq( 13 | WorkflowStep.Use( 14 | "ruby", "setup-ruby", "v1", 15 | name = Some("Setup Ruby"), 16 | params = Map("ruby-version" -> "2.6.0"), 17 | cond = cond), 18 | 19 | WorkflowStep.Run( 20 | List( 21 | "gem install sass", 22 | "gem install jekyll -v 3.2.1"), 23 | name = Some("Install microsite dependencies"), 24 | cond = cond)) 25 | 26 | ThisBuild / githubWorkflowBuildPreamble ++= 27 | rubySetupSteps(Some(Scala213Cond)) 28 | 29 | ThisBuild / githubWorkflowBuild := Seq( 30 | WorkflowStep.Sbt(List("test", "mimaReportBinaryIssues")), 31 | 32 | WorkflowStep.Sbt( 33 | List("site/makeMicrosite"), 34 | cond = Some(Scala213Cond))) 35 | 36 | ThisBuild / githubWorkflowTargetTags ++= Seq("v*") 37 | 38 | // currently only publishing tags 39 | ThisBuild / githubWorkflowPublishTargetBranches := 40 | Seq(RefPredicate.StartsWith(Ref.Tag("v"))) 41 | 42 | ThisBuild / githubWorkflowPublishPreamble ++= 43 | WorkflowStep.Use("olafurpg", "setup-gpg", "v3") +: rubySetupSteps(None) 44 | 45 | ThisBuild / githubWorkflowPublish := Seq( 46 | WorkflowStep.Sbt( 47 | List("ci-release"), 48 | name = Some("Publish artifacts to Sonatype"), 49 | env = Map( 50 | "PGP_PASSPHRASE" -> "${{ secrets.PGP_PASSPHRASE }}", 51 | "PGP_SECRET" -> "${{ secrets.PGP_SECRET }}", 52 | "SONATYPE_PASSWORD" -> "${{ secrets.SONATYPE_PASSWORD }}", 53 | "SONATYPE_USERNAME" -> "${{ secrets.SONATYPE_USERNAME }}")), 54 | 55 | WorkflowStep.Sbt( 56 | List("site/publishMicrosite"), 57 | name = Some("Publish microsite") 58 | ) 59 | ) 60 | 61 | 62 | val catsV = "2.3.1" 63 | val catsEffectV = "2.3.1" 64 | 65 | val munitCatsEffectV = "0.12.0" 
66 | 67 | val kindProjectorV = "0.11.3" 68 | val betterMonadicForV = "0.3.1" 69 | 70 | // Projects 71 | lazy val `probabilistic` = project.in(file(".")) 72 | .disablePlugins(MimaPlugin) 73 | .enablePlugins(NoPublishPlugin) 74 | .aggregate(core, examples) 75 | 76 | lazy val core = project.in(file("core")) 77 | .settings(commonSettings) 78 | .settings( 79 | name := "probabilistic" 80 | ) 81 | 82 | lazy val examples = project.in(file("examples")) 83 | .disablePlugins(MimaPlugin) 84 | .enablePlugins(NoPublishPlugin) 85 | .settings(commonSettings) 86 | .dependsOn(core) 87 | .settings( 88 | name := "probabilistic-examples" 89 | ) 90 | 91 | lazy val site = project.in(file("site")) 92 | .disablePlugins(MimaPlugin) 93 | .enablePlugins(MicrositesPlugin) 94 | .enablePlugins(MdocPlugin) 95 | .enablePlugins(NoPublishPlugin) 96 | .settings(commonSettings) 97 | .dependsOn(core) 98 | .settings{ 99 | import microsites._ 100 | Seq( 101 | micrositeName := "probabilistic", 102 | micrositeDescription := "Probabilistic Data Structures", 103 | micrositeAuthor := "Christopher Davenport", 104 | micrositeGithubOwner := "ChristopherDavenport", 105 | micrositeGithubRepo := "probabilistic", 106 | micrositeBaseUrl := "/probabilistic", 107 | micrositeDocumentationUrl := "https://www.javadoc.io/doc/io.chrisdavenport/probabilistic_2.13", 108 | micrositeGitterChannelUrl := "ChristopherDavenport/libraries", // Feel Free to Set To Something Else 109 | micrositeFooterText := None, 110 | micrositeHighlightTheme := "atom-one-light", 111 | micrositePalette := Map( 112 | "brand-primary" -> "#3e5b95", 113 | "brand-secondary" -> "#294066", 114 | "brand-tertiary" -> "#2d5799", 115 | "gray-dark" -> "#49494B", 116 | "gray" -> "#7B7B7E", 117 | "gray-light" -> "#E5E5E6", 118 | "gray-lighter" -> "#F4F3F4", 119 | "white-color" -> "#FFFFFF" 120 | ), 121 | micrositePushSiteWith := GitHub4s, 122 | micrositeGithubToken := sys.env.get("GITHUB_TOKEN"), 123 | micrositeExtraMdFiles := Map( 124 | file("CODE_OF_CONDUCT.md") 
-> ExtraMdFileConfig("code-of-conduct.md", "page", Map("title" -> "code of conduct", "section" -> "code of conduct", "position" -> "100")), 125 | file("LICENSE") -> ExtraMdFileConfig("license.md", "page", Map("title" -> "license", "section" -> "license", "position" -> "101")) 126 | ) 127 | ) 128 | } 129 | 130 | // General Settings 131 | lazy val commonSettings = Seq( 132 | testFrameworks += new TestFramework("munit.Framework"), 133 | libraryDependencies ++= { 134 | if (isDotty.value) Seq.empty 135 | else Seq( 136 | compilerPlugin("org.typelevel" % "kind-projector" % kindProjectorV cross CrossVersion.full), 137 | compilerPlugin("com.olegpy" %% "better-monadic-for" % betterMonadicForV), 138 | ) 139 | }, 140 | scalacOptions ++= { 141 | if (isDotty.value) Seq("-source:3.0-migration") 142 | else Seq() 143 | }, 144 | Compile / doc / sources := { 145 | val old = (Compile / doc / sources).value 146 | if (isDotty.value) 147 | Seq() 148 | else 149 | old 150 | }, 151 | 152 | libraryDependencies ++= Seq( 153 | "org.typelevel" %% "cats-core" % catsV, 154 | "org.typelevel" %% "cats-effect" % catsEffectV, 155 | 156 | "org.typelevel" %%% "munit-cats-effect-2" % munitCatsEffectV % Test, 157 | ) 158 | ) 159 | 160 | // General Settings 161 | inThisBuild(List( 162 | organization := "io.chrisdavenport", 163 | developers := List( 164 | Developer("ChristopherDavenport", "Christopher Davenport", "chris@christopherdavenport.tech", url("https://github.com/ChristopherDavenport")) 165 | ), 166 | 167 | homepage := Some(url("https://github.com/ChristopherDavenport/probabilistic")), 168 | licenses += ("MIT", url("http://opensource.org/licenses/MIT")), 169 | 170 | pomIncludeRepository := { _ => false}, 171 | scalacOptions in (Compile, doc) ++= Seq( 172 | "-groups", 173 | "-sourcepath", (baseDirectory in LocalRootProject).value.getAbsolutePath, 174 | "-doc-source-url", "https://github.com/ChristopherDavenport/probabilistic/blob/v" + version.value + "€{FILE_PATH}.scala" 175 | ) 176 | )) 177 | 
-------------------------------------------------------------------------------- /core/src/main/scala/io/chrisdavenport/probabilistic/BloomFilter.scala: -------------------------------------------------------------------------------- 1 | package io.chrisdavenport.probabilistic 2 | 3 | 4 | import cats._ 5 | import cats.syntax.all._ 6 | import cats.effect._ 7 | import cats.effect.concurrent._ 8 | import java.nio.charset.Charset 9 | import java.nio.charset.StandardCharsets 10 | import io.chrisdavenport.probabilistic.hashes.Hashes 11 | 12 | trait BloomFilter[F[_], A]{ 13 | def add(a: A): F[Unit] 14 | // False Positives a Reality 15 | // False Negatives are not a thing 16 | def mayContain(a: A): F[Boolean] 17 | } 18 | 19 | object BloomFilter { 20 | 21 | def string[F[_]: Sync](numberOfItems: Long, falsePositiveRate: Double)(implicit charset: Charset = Charset.defaultCharset()): F[BloomFilter[F, String]] = { 22 | Sync[F].delay(mutable.BloomFilter.string(numberOfItems, falsePositiveRate)(charset)) 23 | .map(new BloomFilterImpl[F, String](_)) 24 | } 25 | 26 | def array[F[_]: Sync](numberOfItems: Long, falsePositiveRate: Double): F[BloomFilter[F, Array[Byte]]] = { 27 | Sync[F].delay(mutable.BloomFilter.array(numberOfItems, falsePositiveRate)) 28 | .map(new BloomFilterImpl[F, Array[Byte]](_)) 29 | } 30 | 31 | def static[F[_]: Sync, A](initBitSize: Long, hashFunctions: A => cats.data.NonEmptyList[Long]): F[BloomFilter[F, A]] = 32 | for { 33 | bf <- Sync[F].delay(mutable.BloomFilter.static(initBitSize, hashFunctions)) 34 | } yield new BloomFilterImpl[F, A](bf) 35 | 36 | 37 | implicit def instances[F[_]]: Contravariant[({type X[A] = BloomFilter[F, A]})#X] = new Contravariant[({type X[A] = BloomFilter[F, A]})#X]{ 38 | def contramap[A, B](fa: BloomFilter[F,A])(f: B => A): BloomFilter[F,B] = new BloomFilter[F, B] { 39 | def add(a: B): F[Unit] = fa.add(f(a)) 40 | def mayContain(a: B): F[Boolean] = fa.mayContain(f(a)) 41 | } 42 | } 43 | 44 | private class BloomFilterImpl[F[_]: Sync, 
A]( 45 | underlying: mutable.BloomFilter[A] 46 | ) extends BloomFilter[F, A]{ 47 | def add(a: A): F[Unit] = Sync[F].delay{ 48 | underlying.add(a) 49 | } 50 | def mayContain(a: A): F[Boolean] = Sync[F].delay{ 51 | underlying.mayContain(a) 52 | } 53 | } 54 | } 55 | -------------------------------------------------------------------------------- /core/src/main/scala/io/chrisdavenport/probabilistic/CuckooFilter.scala: -------------------------------------------------------------------------------- 1 | package io.chrisdavenport.probabilistic 2 | 3 | import cats.Contravariant 4 | import cats.syntax.all._ 5 | import cats.effect._ 6 | import java.nio.charset.Charset 7 | 8 | trait CuckooFilter[F[_], A]{ 9 | def add(a: A): F[Boolean] 10 | def remove(a: A): F[Boolean] 11 | 12 | def mayContain(a: A): F[Boolean] 13 | } 14 | 15 | object CuckooFilter { 16 | 17 | def string[F[_]: Sync](numberOfItems: Long, falsePositiveRate: Double)(implicit charset: Charset = Charset.defaultCharset()): F[CuckooFilter[F, String]] = { 18 | Sync[F].delay(mutable.CuckooFilter.string(numberOfItems, falsePositiveRate)(charset)) 19 | .map(new CuckooFilterImpl[F, String](_)) 20 | } 21 | 22 | def array[F[_]: Sync](numberOfItems: Long, falsePositiveRate: Double): F[CuckooFilter[F, Array[Byte]]] = { 23 | Sync[F].delay(mutable.CuckooFilter.array(numberOfItems, falsePositiveRate)) 24 | .map(new CuckooFilterImpl[F, Array[Byte]](_)) 25 | } 26 | 27 | 28 | implicit def instances[F[_]]: Contravariant[({type X[A] = CuckooFilter[F, A]})#X] = new Contravariant[({type X[A] = CuckooFilter[F, A]})#X]{ 29 | def contramap[A, B](fa: CuckooFilter[F,A])(f: B => A): CuckooFilter[F,B] = new CuckooFilter[F, B] { 30 | def add(a: B): F[Boolean] = fa.add(f(a)) 31 | def remove(a: B): F[Boolean] = fa.remove(f(a)) 32 | def mayContain(a: B): F[Boolean] = fa.mayContain(f(a)) 33 | } 34 | } 35 | 36 | private class CuckooFilterImpl[F[_]: Sync, A](underlying: mutable.CuckooFilter[A]) extends CuckooFilter[F, A]{ 37 | def add(a: A): 
F[Boolean] = Sync[F].delay(underlying.add(a)) 38 | def remove(a: A): F[Boolean] = Sync[F].delay(underlying.remove(a)) 39 | def mayContain(a: A): F[Boolean] = Sync[F].delay(underlying.mayContain(a)) 40 | } 41 | } -------------------------------------------------------------------------------- /core/src/main/scala/io/chrisdavenport/probabilistic/hashes/Hashes.scala: -------------------------------------------------------------------------------- 1 | package io.chrisdavenport.probabilistic.hashes 2 | 3 | import scala.util.hashing.MurmurHash3 4 | import cats.data.NonEmptyList 5 | import java.nio.charset.Charset 6 | 7 | object Hashes { 8 | 9 | // TODO MORE HASHING!!! 10 | // Need 14 - 17 unique algorithms for extremely small probabilities of false positives 11 | val arrayHashes: NonEmptyList[Array[Byte] => Long] = NonEmptyList.of( 12 | XXHash.hash(_), 13 | FNV32.hash(_), 14 | Adler32.hash(_), 15 | Bernstein.hash(_), 16 | KernighanRitchie.hash(_), 17 | Murmur3.hash(_), 18 | CRC16.hash(_), 19 | CRC32.hash(_), 20 | ) 21 | 22 | object Adler32 { 23 | def hash(data: Array[Byte]): Long = { 24 | val x = new java.util.zip.Adler32() 25 | x.update(data) 26 | x.getValue() 27 | } 28 | 29 | } 30 | 31 | object FNV32 { 32 | private val FNV1_32_INIT = 0x811c9dc5 33 | private val FNV1_PRIME_32 = 16777619 34 | 35 | def hash(data: Array[Byte]): Int = { 36 | var mHash = FNV1_32_INIT 37 | for { b <- data}{ 38 | mHash ^= (b & 0xff) 39 | mHash *= FNV1_PRIME_32 40 | } 41 | mHash 42 | } 43 | } 44 | 45 | /** 46 | * XMODEM CRC 16 CRC16 - 16-bit Cyclic Redundancy Check (CRC16) 47 | * 48 | * Name : "XMODEM", also known as "ZMODEM", "CRC-16/ACORN" 49 | * Width : 16 bit 50 | * Poly : 1021 (That is actually x^16 + x^12 + x^5 + 1) 51 | * Initialization : 0000 52 | * Reflect Input byte : False 53 | * Reflect Output CRC : False 54 | * Xor constant to output CRC : 0000 55 | * Output for "123456789" : 31C3 56 | */ 57 | object CRC16 { 58 | 59 | def hash(data: Array[Byte]): Int = { 60 | var crc: Int = 0 61 | 
data.foreach{ b => 62 | crc = (crc << 8) ^ table(((crc >>> 8) ^ (b & 0xff)) & 0xff) 63 | } 64 | crc & 0xFFFF 65 | } 66 | 67 | private[CRC16] lazy val table : Array[Int] = Array( 68 | 0x0000,0x1021,0x2042,0x3063,0x4084,0x50a5,0x60c6,0x70e7, 69 | 0x8108,0x9129,0xa14a,0xb16b,0xc18c,0xd1ad,0xe1ce,0xf1ef, 70 | 0x1231,0x0210,0x3273,0x2252,0x52b5,0x4294,0x72f7,0x62d6, 71 | 0x9339,0x8318,0xb37b,0xa35a,0xd3bd,0xc39c,0xf3ff,0xe3de, 72 | 0x2462,0x3443,0x0420,0x1401,0x64e6,0x74c7,0x44a4,0x5485, 73 | 0xa56a,0xb54b,0x8528,0x9509,0xe5ee,0xf5cf,0xc5ac,0xd58d, 74 | 0x3653,0x2672,0x1611,0x0630,0x76d7,0x66f6,0x5695,0x46b4, 75 | 0xb75b,0xa77a,0x9719,0x8738,0xf7df,0xe7fe,0xd79d,0xc7bc, 76 | 0x48c4,0x58e5,0x6886,0x78a7,0x0840,0x1861,0x2802,0x3823, 77 | 0xc9cc,0xd9ed,0xe98e,0xf9af,0x8948,0x9969,0xa90a,0xb92b, 78 | 0x5af5,0x4ad4,0x7ab7,0x6a96,0x1a71,0x0a50,0x3a33,0x2a12, 79 | 0xdbfd,0xcbdc,0xfbbf,0xeb9e,0x9b79,0x8b58,0xbb3b,0xab1a, 80 | 0x6ca6,0x7c87,0x4ce4,0x5cc5,0x2c22,0x3c03,0x0c60,0x1c41, 81 | 0xedae,0xfd8f,0xcdec,0xddcd,0xad2a,0xbd0b,0x8d68,0x9d49, 82 | 0x7e97,0x6eb6,0x5ed5,0x4ef4,0x3e13,0x2e32,0x1e51,0x0e70, 83 | 0xff9f,0xefbe,0xdfdd,0xcffc,0xbf1b,0xaf3a,0x9f59,0x8f78, 84 | 0x9188,0x81a9,0xb1ca,0xa1eb,0xd10c,0xc12d,0xf14e,0xe16f, 85 | 0x1080,0x00a1,0x30c2,0x20e3,0x5004,0x4025,0x7046,0x6067, 86 | 0x83b9,0x9398,0xa3fb,0xb3da,0xc33d,0xd31c,0xe37f,0xf35e, 87 | 0x02b1,0x1290,0x22f3,0x32d2,0x4235,0x5214,0x6277,0x7256, 88 | 0xb5ea,0xa5cb,0x95a8,0x8589,0xf56e,0xe54f,0xd52c,0xc50d, 89 | 0x34e2,0x24c3,0x14a0,0x0481,0x7466,0x6447,0x5424,0x4405, 90 | 0xa7db,0xb7fa,0x8799,0x97b8,0xe75f,0xf77e,0xc71d,0xd73c, 91 | 0x26d3,0x36f2,0x0691,0x16b0,0x6657,0x7676,0x4615,0x5634, 92 | 0xd94c,0xc96d,0xf90e,0xe92f,0x99c8,0x89e9,0xb98a,0xa9ab, 93 | 0x5844,0x4865,0x7806,0x6827,0x18c0,0x08e1,0x3882,0x28a3, 94 | 0xcb7d,0xdb5c,0xeb3f,0xfb1e,0x8bf9,0x9bd8,0xabbb,0xbb9a, 95 | 0x4a75,0x5a54,0x6a37,0x7a16,0x0af1,0x1ad0,0x2ab3,0x3a92, 96 | 0xfd2e,0xed0f,0xdd6c,0xcd4d,0xbdaa,0xad8b,0x9de8,0x8dc9, 97 | 
0x7c26,0x6c07,0x5c64,0x4c45,0x3ca2,0x2c83,0x1ce0,0x0cc1, 98 | 0xef1f,0xff3e,0xcf5d,0xdf7c,0xaf9b,0xbfba,0x8fd9,0x9ff8, 99 | 0x6e17,0x7e36,0x4e55,0x5e74,0x2e93,0x3eb2,0x0ed1,0x1ef0 100 | ) 101 | } 102 | 103 | object CRC32 { 104 | def hash(data: Array[Byte]): Long = { 105 | val c = new java.util.zip.CRC32() 106 | c.update(data) 107 | c.getValue() 108 | } 109 | } 110 | 111 | 112 | object Bernstein { 113 | private val INITIAL = 5381 114 | private val M = 33 115 | def hash(data: Array[Byte]): Int = { 116 | var hash = INITIAL 117 | for { 118 | x <- data 119 | } { hash = M * hash + x } 120 | hash 121 | } 122 | } 123 | 124 | object KernighanRitchie { 125 | private val INITIAL = 0 126 | private val M = 31 127 | def hash(data: Array[Byte]): Int = { 128 | var hash = INITIAL 129 | for { 130 | x <- data 131 | } { hash = M * hash + x } 132 | hash 133 | } 134 | } 135 | 136 | object Murmur3 { 137 | def hash(data: Array[Byte]): Int = { 138 | MurmurHash3.bytesHash(data) 139 | } 140 | } 141 | 142 | object XXHash { 143 | private final val PRIME64_1 = 0x9E3779B185EBCA87L 144 | private final val PRIME64_2 = 0xC2B2AE3D27D4EB4FL 145 | private final val PRIME64_3 = 0x165667B19E3779F9L 146 | private final val PRIME64_4 = 0x85EBCA77C2b2AE63L 147 | private final val PRIME64_5 = 0x27D4EB2F165667C5L 148 | private final val DEFAULT_SEED = 0L 149 | 150 | def hash(data: Array[Byte]): Long = hash64(data) 151 | 152 | def hash64(data: Array[Byte], seed: Long = DEFAULT_SEED): Long = { 153 | val length = data.length 154 | var index = 0 155 | var hash: Long = -1 // Danger Will Robinson 156 | if (length >= 32) { 157 | var v1 = seed + PRIME64_1 + PRIME64_2 158 | var v2 = seed + PRIME64_2 159 | var v3 = seed + 0 160 | var v4 = seed - PRIME64_1 161 | var limit = length - 32 162 | while (index <= limit){ 163 | var k1 = (data(index).toLong & 0xff) | 164 | ((data(index + 1).toLong & 0xff) << 8) | 165 | ((data(index + 2) & 0xff) << 16) | 166 | ((data(index + 3) & 0xff) << 24) | 167 | ((data(index + 4) & 0xff) 
.toLong << 32) | 168 | ((data(index + 5).toLong & 0xff) << 40) | 169 | ((data(index + 6).toLong & 0xff) << 48) | 170 | ((data(index + 7).toLong & 0xff) << 56) 171 | v1 = mix(v1, k1) 172 | index += 8 173 | 174 | var k2 = (data(index).toLong & 0xff) | 175 | ((data(index + 1).toLong & 0xff) << 8) | 176 | ((data(index + 2).toLong & 0xff) << 16) | 177 | ((data(index + 3).toLong & 0xff) << 24) | 178 | ((data(index + 4).toLong & 0xff) << 32) | 179 | ((data(index + 5).toLong & 0xff) << 40) | 180 | ((data(index + 6).toLong & 0xff) << 48) | 181 | ((data(index + 7).toLong & 0xff) << 56) 182 | v2 = mix(v2, k2) 183 | index += 8 184 | 185 | var k3 = (data(index).toLong & 0xff) | 186 | ((data(index + 1).toLong & 0xff) << 8) | 187 | ((data(index + 2).toLong & 0xff) << 16) | 188 | ((data(index + 3).toLong & 0xff) << 24) | 189 | ((data(index + 4).toLong & 0xff) << 32) | 190 | ((data(index + 5).toLong & 0xff) << 40) | 191 | ((data(index + 6).toLong & 0xff) << 48) | 192 | ((data(index + 7).toLong & 0xff) << 56) 193 | v3 = mix(v3, k3) 194 | index += 8 195 | 196 | var k4 = (data(index).toLong & 0xff) | 197 | ((data(index + 1).toLong & 0xff) << 8) | 198 | ((data(index + 2).toLong & 0xff) << 16) | 199 | ((data(index + 3).toLong & 0xff) << 24) | 200 | ((data(index + 4).toLong & 0xff) << 32) | 201 | ((data(index + 5).toLong & 0xff) << 40) | 202 | ((data(index + 6).toLong & 0xff) << 48) | 203 | ((data(index + 7).toLong & 0xff) << 56) 204 | v4 = mix(v4, k4) 205 | index += 8 206 | } 207 | 208 | hash = java.lang.Long.rotateLeft(v1, 1) + 209 | java.lang.Long.rotateLeft(v2, 7) + 210 | java.lang.Long.rotateLeft(v3, 12) + 211 | java.lang.Long.rotateLeft(v4, 18) 212 | 213 | hash = update(hash, v1) 214 | hash = update(hash, v2) 215 | hash = update(hash, v3) 216 | hash = update(hash, v4) 217 | } else { 218 | hash = seed + PRIME64_5 219 | } 220 | 221 | hash += length 222 | 223 | // tail 224 | while (index <= length - 8) { 225 | var tailStart: Int = index 226 | var k: Long = 0 227 | var remaining: Int = length - index 228 | remaining = if (remaining > 8) 8 else remaining 229 | // OR in all `remaining` bytes, little-endian. (Scala `match` does not fall through like the C switch this was ported from, so the previous version only mixed one tail byte; Int shifts by >=32 were also no-ops.) 230 | var j = remaining - 1 231 | while (j >= 0) { 232 | k |= (data(tailStart + j).toLong & 0xff) << (8 * j) 233 | j -= 1
234 | } 235 | 236 | 237 | 238 | 239 | 240 | 241 | 242 | 243 | 244 | 245 | 246 | 247 | hash = updateTail(hash, k) 248 | index += 8 249 | } 250 | 251 | if (index <= length - 4) { 252 | var tailStart = index 253 | var k = 0 254 | var remaining = length - index 255 | remaining = if (remaining > 4) 4 else remaining 256 | // OR in all `remaining` bytes (same fallthrough-emulation fix as the 8-byte tail above). 257 | var j = remaining - 1 258 | while (j >= 0) { 259 | k |= (data(tailStart + j) & 0xff) << (8 * j) 260 | j -= 1 261 | } 262 | 263 | 264 | 265 | 266 | hash = updateTail(hash, k) 267 | index += 4 268 | } 269 | 270 | while (index < length) { 271 | hash = updateTail(hash, data(index)) 272 | index += 1 273 | } 274 | 275 | hash = finalShuffle(hash) 276 | 277 | hash 278 | } 279 | 280 | 281 | private def mix(current: Long, value: Long): Long = { 282 | java.lang.Long.rotateLeft(current + value * PRIME64_2, 31) * PRIME64_1 283 | } 284 | 285 | private def update(hash: Long, value: Long): Long ={ 286 | val temp = hash ^ mix(0, value) 287 | temp * PRIME64_1 + PRIME64_4 288 | } 289 | 290 | private def updateTail(hash: Long, value: Long): Long ={ 291 | val temp = hash ^ mix(0, value) 292 | java.lang.Long.rotateLeft(temp, 27) * PRIME64_1 + PRIME64_4 293 | } 294 | 295 | private def updateTail(hash: Long, value: Int): Long ={ 296 | val unsigned = value & 0xFFFFFFFFL 297 | val temp = hash ^ (unsigned * PRIME64_1) 298 | java.lang.Long.rotateLeft(temp, 23) * PRIME64_2 + PRIME64_3 299 | } 300 | 301 | private def updateTail(hash: Long, value: Byte): Long ={ 302 | var unsigned = value & 0xFF 303 | var temp = hash ^ (unsigned * PRIME64_5) 304
| java.lang.Long.rotateLeft(temp, 11) * PRIME64_1 305 | } 306 | 307 | private def finalShuffle(ihash: Long): Long = { 308 | var hash = ihash 309 | hash ^= hash >>> 33 310 | hash *= PRIME64_2 311 | hash ^= hash >>> 29 312 | hash *= PRIME64_3 313 | hash ^= hash >>> 32 314 | hash 315 | } 316 | } 317 | 318 | } -------------------------------------------------------------------------------- /core/src/main/scala/io/chrisdavenport/probabilistic/mutable/BloomFilter.scala: -------------------------------------------------------------------------------- 1 | package io.chrisdavenport.probabilistic.mutable 2 | 3 | import io.chrisdavenport.probabilistic.hashes.Hashes 4 | import java.nio.charset.Charset 5 | 6 | class BloomFilter[A] private ( 7 | private[mutable] val bitSet: ThreadSafeBitSet, 8 | initSize: Long, 9 | hashFunctions: A => cats.data.NonEmptyList[Long] 10 | ) extends io.chrisdavenport.probabilistic.BloomFilter[cats.Id, A]{ 11 | private def hashToPosition(l: Long): Long = { 12 | val modulus = (l % initSize).toInt 13 | if (modulus >= 0) modulus 14 | else initSize + modulus 15 | } 16 | 17 | private def positions(a: A): cats.data.NonEmptyList[Long] = { 18 | hashFunctions(a).map(hashToPosition) 19 | } 20 | 21 | def add(a: A): Unit = { 22 | positions(a).toList.foreach{i => 23 | bitSet.set(i) 24 | } 25 | } 26 | 27 | def mayContain(a: A): Boolean = positions(a).forall(bitSet.get(_)) 28 | } 29 | 30 | object BloomFilter { 31 | def static[A](initBitSize: Long, hashFunctions: A => cats.data.NonEmptyList[Long]): BloomFilter[A] = { 32 | val bits = ThreadSafeBitSet(ThreadSafeBitSet.DEFAULT_LOG2_SEGMENT_SIZE_IN_BITS, initBitSize) 33 | new BloomFilter[A](bits, initBitSize, hashFunctions) 34 | } 35 | 36 | def string(numberOfItems: Long, falsePositiveRate: Double)(implicit charset: Charset = Charset.defaultCharset()): BloomFilter[String] = { 37 | val bits = optimalNumberOfBits(numberOfItems, falsePositiveRate) 38 | val hashes = optimalNumberOfHashes(numberOfItems, bits) 39 | 
static[String]( 40 | bits, 41 | { 42 | (s: String) => 43 | val array = s.getBytes(charset) 44 | cats.data.NonEmptyList( 45 | s.hashCode(), 46 | Hashes.arrayHashes.toList.take(hashes - 1).map(f => f(array)) 47 | ) 48 | } 49 | ) 50 | } 51 | 52 | def array(numberOfItems: Long, falsePositiveRate: Double): BloomFilter[Array[Byte]] = { 53 | val bits = optimalNumberOfBits(numberOfItems, falsePositiveRate) 54 | val hashes = optimalNumberOfHashes(numberOfItems, bits) 55 | static[Array[Byte]]( 56 | bits, 57 | { 58 | (data: Array[Byte]) => 59 | cats.data.NonEmptyList( 60 | Hashes.arrayHashes.head(data), 61 | Hashes.arrayHashes.tail.take(hashes - 1).map(f => f(data)) 62 | ) 63 | } 64 | ) 65 | } 66 | 67 | def optimalNumberOfBits(numberOfItems: Long, falsePositiveRate: Double): Long = { 68 | val p = if (falsePositiveRate == 0) Double.MinValue else falsePositiveRate 69 | math.ceil(-1 * numberOfItems * math.log(p) / math.log(2) / math.log(2)).toLong 70 | } 71 | 72 | def optimalNumberOfHashes(numberOfItems: Long, numberOfBits: Long): Int = { 73 | math.ceil(numberOfBits / numberOfItems * math.log(2)).toInt 74 | } 75 | 76 | } -------------------------------------------------------------------------------- /core/src/main/scala/io/chrisdavenport/probabilistic/mutable/CuckooFilter.scala: -------------------------------------------------------------------------------- 1 | package io.chrisdavenport.probabilistic.mutable 2 | 3 | import scala.util.Random 4 | import cats.Id 5 | import java.nio.charset.Charset 6 | import io.chrisdavenport.probabilistic.hashes.Hashes 7 | import cats.Contravariant 8 | 9 | class CuckooFilter[A] private ( 10 | private[mutable] val table: CuckooTable, 11 | hash: Array[Byte] => Long, 12 | f: A => Array[Byte], 13 | random: Random, 14 | maxRelocationAttempts: Int, 15 | ) extends io.chrisdavenport.probabilistic.CuckooFilter[cats.Id, A]{ 16 | import CuckooFilter._ 17 | 18 | def add(a: A): Boolean = { 19 | val h = hash(f(a)) 20 | val h1 = hash1(h) 21 | val h2 = hash2(h) 
22 | val finger = fingerprint(h2) 23 | val i1 = index(h1) 24 | 25 | putEntry(finger, i1) || 26 | putEntry(finger, index2(i1, finger)) 27 | } 28 | 29 | def remove(a: A): Boolean = { 30 | val h = hash(f(a)) 31 | val h1 = hash1(h) 32 | val h2 = hash2(h) 33 | val finger = fingerprint(h2) 34 | val i1 = index(h1) 35 | val i2 = index2(i1, finger) 36 | table.swapAnyEntry(i1, CuckooTable.EMPTY_ENTRY, finger) || 37 | table.swapAnyEntry(i2, CuckooTable.EMPTY_ENTRY, finger) 38 | } 39 | 40 | def mayContain(a: A): Boolean = { 41 | val h = hash(f(a)) 42 | val h1 = hash1(h) 43 | val h2 = hash2(h) 44 | val finger = fingerprint(h2) 45 | val i1 = index(h1) 46 | val i2 = index2(i1, finger) 47 | table.findEntry(i1, finger).isDefined || 48 | table.findEntry(i2, finger).isDefined 49 | } 50 | 51 | private def putEntry(fingerprint: Int, index: Long): Boolean = { 52 | return table.swapAnyEntry(index, fingerprint, CuckooTable.EMPTY_ENTRY) || 53 | putEntry(fingerprint,index, 0); 54 | } 55 | 56 | 57 | private def putEntry(fingerprint: Int, index: Long, kick: Int): Boolean = { 58 | if (maxRelocationAttempts == kick) { 59 | return false; 60 | } else { 61 | 62 | val entry = random.nextInt(table.numEntriesPerBucket) 63 | val kicked = table.writeEntry(index, entry, fingerprint) 64 | 65 | if ((CuckooTable.EMPTY_ENTRY == kicked) 66 | || putEntry(kicked, index2(index, kicked), kick + 1)) { 67 | return true; 68 | } else { 69 | val kickedBack = table.writeEntry(index,entry, kicked) 70 | assert(kickedBack == fingerprint, "Uh oh - couldn't unroll failed attempts to putEntry()") 71 | return false; 72 | } 73 | } 74 | } 75 | 76 | private def hash1(hash: Long): Long = { 77 | hash 78 | } 79 | 80 | private def hash2(hash: Long): Long = { 81 | hash >>> 32 82 | } 83 | 84 | 85 | private def index(hash: Long): Long = { 86 | mod(hash, table.numBuckets).toInt 87 | } 88 | 89 | private def index2(index: Long, fingerprint: Int): Long = { 90 | mod(protectedSum(index, parsign(index) * odd(hash(intToArray(fingerprint))), 
table.numBuckets), table.numBuckets) 91 | } 92 | 93 | 94 | 95 | /** 96 | * Maps parity of i to a sign. 97 | * 98 | * @return 1 if i is even parity, -1 if i is odd parity 99 | */ 100 | private def parsign(i: Long): Long = { 101 | return ((i & 0x01L) * -2L) + 1L; 102 | } 103 | 104 | private def odd(i: Long): Long = { 105 | i | 0x01L 106 | } 107 | 108 | private def intToArray(data: Int): Array[Byte] = { 109 | BigInt(data).toByteArray 110 | } 111 | 112 | /** 113 | * Returns the sum of index and offset, reduced by a mod-consistent amount if necessary to 114 | * protect from numeric overflow. This method is intended to support a subsequent mod operation 115 | * on the return value. 116 | * 117 | * @param index Assumed to be >= 0L. 118 | * @param offset Any value. 119 | * @param mod Value used to reduce the result, 120 | * @return sum of index and offset, reduced by a mod-consistent amount if necessary to protect 121 | * from numeric overflow. 122 | */ 123 | private def protectedSum(index: Long, offset: Long, mod: Long): Long = { 124 | if (canSum(index, offset)) index + offset else protectedSum(index - mod, offset, mod); 125 | } 126 | 127 | private def canSum(a: Long, b: Long): Boolean = { 128 | (a ^ b) < 0 | (a ^ (a + b)) >= 0 129 | } 130 | 131 | /** 132 | * Returns an f-bit portion of the given hash. Iterating by f-bit segments from the least 133 | * significant side of the hash to the most significant, looks for a non-zero segment. If a 134 | * non-zero segment isn't found, 1 is returned to distinguish the fingerprint from a 135 | * non-entry. 
136 | * 137 | * @param hash 64-bit hash value 138 | * @param f number of bits to consider from the hash 139 | * @return first non-zero f-bit value from hash as an int, or 1 if no non-zero value is found 140 | */ 141 | private[mutable] def fingerprint(hash: Long): Int = { 142 | val f = table.numBitsPerEntry 143 | 144 | val mask = (0x80000000 >> (f - 1)) >>> (Integer.SIZE - f) 145 | var bit = 0 146 | var ret: Long = 0x1.toLong 147 | 148 | while (bit + f <= Integer.SIZE){ 149 | ret = (hash >> bit) & mask 150 | if (ret != 0) { 151 | bit = Integer.SIZE 152 | } else { 153 | bit += f 154 | } 155 | } 156 | ret.toInt 157 | } 158 | 159 | } 160 | 161 | object CuckooFilter { 162 | 163 | def string(numberOfItems: Long, falsePositiveRate: Double)(implicit charset: Charset = Charset.defaultCharset()): CuckooFilter[String] = { 164 | of(numberOfItems, falsePositiveRate, {s: String => s.getBytes(charset)}) 165 | } 166 | 167 | def array(numberOfItems: Long, falsePositiveRate: Double): CuckooFilter[Array[Byte]] = { 168 | of(numberOfItems, falsePositiveRate, identity) 169 | } 170 | 171 | def of[A](numberOfItems: Long, falsePositiveRate: Double, f: A => Array[Byte]): CuckooFilter[A] = { 172 | val numEntriesPerBucket = optimalEntriesPerBucket(falsePositiveRate) 173 | val numBuckets: Long = optimalNumberOfBuckets(numberOfItems, numEntriesPerBucket) 174 | val numBitsPerEntry = optimalBitsPerEntry(falsePositiveRate, numEntriesPerBucket) 175 | val random = new Random() 176 | val maxRelocationAttempts = 500 177 | 178 | new CuckooFilter[A]( 179 | CuckooTable(numBuckets, numEntriesPerBucket, numBitsPerEntry), 180 | Hashes.XXHash.hash(_), 181 | f, 182 | random, 183 | maxRelocationAttempts 184 | ) 185 | } 186 | 187 | 188 | private def mod(x: Long, m: Long): Long = { 189 | val result = x % m 190 | if (result >= 0) result else result + m 191 | } 192 | 193 | val MAX_ENTRIES_PER_BUCKET = 8 194 | val MIN_ENTRIES_PER_BUCKET = 2 195 | 196 | /** 197 | * Minimum false positive probability supported, 
8.67E-19. 198 | * 199 | * CuckooFilter § 5.1 Eq. (6), "f ≥ log2(2b/e) = [log2(1/e) + log2(2b)]" 200 | * (b) entries per bucket: 8 at e <= 0.00001 201 | * (f) bits per entry: 64-bits max 202 | * (e) false positive probability 203 | * 204 | * 64 = log2(16/e) = [log2(1/e) + log2(16)] 205 | * 64 = log2(1/e) + 4 206 | * 60 = log2(1/e) 207 | * 2^60 = 1/e 208 | * e = 1/2^60 209 | * e = 8.673617379884035E-19 210 | */ 211 | val MIN_FPP = 1.0D / Math.pow(2, 60) 212 | 213 | /** 214 | * Maximum false positive probability supported, 0.99. 215 | */ 216 | val MAX_FPP = 0.99D 217 | 218 | /* 219 | * Space optimization cheat sheet, per CuckooFilter § 5.1 : 220 | * 221 | * Given: 222 | * n: expected insertions 223 | * e: expected false positive probability (e.g. 0.03D for 3% fpp) 224 | * 225 | * Choose: 226 | * b: bucket size in entries (2, 4, 8) 227 | * a: load factor (proportional to b) 228 | * 229 | * Calculate: 230 | * f: fingerprint size in bits 231 | * m: table size in buckets 232 | * 233 | * 234 | * 1) Choose b = 8 | 4 | 2 235 | * when e : 0.00001 < e ≤ 0.002 236 | * ref: CuckooFilter § 5.1 ¶ 5, "Optimal bucket size" 237 | * 238 | * 2) Choose a = 50% | 84% | 95.5% | 98% 239 | * when b = 1 | 2 | 4 | 8 240 | * ref: CuckooFilter § 5.1 ¶ 2, "(1) Larger buckets improve table occupancy" 241 | * 242 | * 2) Optimal f = ceil( log2(2b/e) ) 243 | * ref: CuckooFilter § 5.1 Eq. (6), "f ≥ log2(2b/e) = [log2(1/e) + log2(2b)]" 244 | * 245 | * 3) Required m = evenCeil( ceiling( ceiling( n/a ) / b ) ) 246 | * Minimum entries (B) = n/a rounded up 247 | * Minimum buckets (m) = B/b rounded up to an even number 248 | */ 249 | 250 | /** 251 | * Returns the optimal number of entries per bucket, or bucket size, ({@code b}) given the 252 | * expected false positive probability ({@code e}). 
253 | * 254 | * CuckooFilter § 5.1 ¶ 5, "Optimal bucket size" 255 | * 256 | * @param e the desired false positive probability (must be positive and less than 1.0) 257 | * @return optimal number of entries per bucket 258 | */ 259 | def optimalEntriesPerBucket(e: Double) = { 260 | require(e > 0.0D, "e must be > 0.0"); 261 | if (e <= 0.00001) { 262 | MAX_ENTRIES_PER_BUCKET 263 | } else if (e <= 0.002) { 264 | MAX_ENTRIES_PER_BUCKET / 2 265 | } else { 266 | MIN_ENTRIES_PER_BUCKET; 267 | } 268 | } 269 | 270 | /** 271 | * Returns the optimal load factor ({@code a}) given the number of entries per bucket ({@code 272 | * b}). 273 | * 274 | * CuckooFilter § 5.1 ¶ 2, "(1) Larger buckets improve table occupancy" 275 | * 276 | * @param b number of entries per bucket 277 | * @return load factor, positive and less than 1.0 278 | */ 279 | def optimalLoadFactor(b: Int): Double = { 280 | require(b == 2 || b == 4 || b == 8, "b must be 2, 4, or 8"); 281 | if (b == 2) { 282 | 0.84D 283 | } else if (b == 4) { 284 | 0.955D 285 | } else { 286 | 0.98D 287 | } 288 | } 289 | 290 | private val log2 = (x: Double) => Math.log10(x)/ Math.log10(2.0) 291 | 292 | def optimalBitsPerEntry(e: Double, b: Int): Int = { 293 | require(e >= MIN_FPP, "Cannot create CuckooFilter with FPP[" + e + 294 | "] < CuckooFilter.MIN_FPP[" + CuckooFilter.MIN_FPP + "]"); 295 | val d = log2(2 * b / e) 296 | d.round.toInt 297 | } 298 | 299 | def optimalNumberOfBuckets(n: Long,b: Int): Long = { 300 | require(n > 0, "n must be > 0"); 301 | val x = Math.ceil((math.ceil(n / optimalLoadFactor(b)) / b)).toLong 302 | (x + 1) / 2 * 2 303 | } 304 | 305 | } -------------------------------------------------------------------------------- /core/src/main/scala/io/chrisdavenport/probabilistic/mutable/CuckooTable.scala: -------------------------------------------------------------------------------- 1 | package io.chrisdavenport.probabilistic.mutable 2 | 3 | import scala.util.control.Breaks 4 | 5 | class CuckooTable private ( 6 | 
private[mutable] val data: ThreadSafeBitSet, 7 | val numBuckets: Long, 8 | val numEntriesPerBucket: Int, 9 | val numBitsPerEntry: Int 10 | ){ 11 | import CuckooTable._ 12 | 13 | // 0 indexed 14 | private def bitOffSet(bucket: Long, entry: Int): Long = { 15 | ((bucket * numEntriesPerBucket) + entry) * numBitsPerEntry 16 | } 17 | 18 | def readEntry(bucket: Long, entry: Int): Int = { 19 | require(bucket <= numBuckets) 20 | require(entry <= numEntriesPerBucket) 21 | val offset = bitOffSet(bucket, entry) 22 | val positions = for { 23 | x <- (offset until offset + numBitsPerEntry).toList 24 | if (data.get(x)) 25 | } yield x - offset 26 | fromBitPositions(positions.map(_.toInt)) 27 | } 28 | 29 | def findEntry(bucket: Long, value: Int): Option[Int] = { 30 | val break = new Breaks 31 | var entry = Option.empty[Int] 32 | break.breakable{ 33 | for { 34 | i <- 0 until numEntriesPerBucket 35 | } { 36 | val x = readEntry(bucket, i) 37 | if (x == value) { 38 | entry = Some(i) 39 | break.break() 40 | } 41 | } 42 | } 43 | entry 44 | } 45 | 46 | // 0 indexed 47 | def writeEntry(bucket: Long, entry: Int, value: Int): Int = { 48 | require(bucket <= numBuckets) 49 | require(entry <= numEntriesPerBucket, "Entry Higher Than Allowed") 50 | // Expensive... 
But unsafe otherwise
    // 0-indexed bit positions: a value that fits in numBitsPerEntry bits has its
    // highest set bit at position numBitsPerEntry - 1, hence strict `<`.
    require(highestBitPosition(value) < numBitsPerEntry, "Bits of this value are too large")

    val x = readEntry(bucket, entry) // TODO Race Condition - Make atomic or keysemaphore on the combination
    val offset = bitOffSet(bucket, entry)
    val newEntrySet = bitPositions(value).map(offset + _).toSet
    val oldEntrySet = bitPositions(x).map(offset + _).toSet
    val entryBits = (offset until (offset + numBitsPerEntry)).toList
    for {
      bit <- entryBits
    } {
      if (newEntrySet.contains(bit)) {
        data.set(bit)
      }
      else if (oldEntrySet.contains(bit)){
        data.clear(bit)
      }
    }
    x
  }

  // Swaps the first entry in `bucket` holding `valueOut` for `valueIn`.
  // Returns false when no entry in the bucket holds `valueOut`.
  def swapAnyEntry(bucket: Long, valueIn: Int, valueOut: Int): Boolean = {
    findEntry(bucket, valueOut)
      .map(writeEntry(bucket, _, valueIn))
      .map{i =>
        val x = i == valueOut
        assert(x, s"Value Out Was Incorrect got $i expected $valueOut")
        true
      }.getOrElse(false)
  }
}

object CuckooTable {

  val EMPTY_ENTRY: Int = 0x00

  // Positions (0-indexed, least significant first) of the set bits of `int`.
  private[mutable] def bitPositions(int: Int): List[Int] = {
    val buffer = new scala.collection.mutable.ListBuffer[Int]()
    var number = int
    var position = 0
    while (number != 0){
      if ((number & 1) != 0) {
        // bug fix: `buffer :+ position` builds a NEW collection and discards it,
        // so this method always returned Nil; `+=` mutates the buffer in place.
        buffer += position
      }
      position += 1
      number = number >>> 1
    }
    buffer.toList
  }

  // Highest set bit position of `int`, or 0 when no bit is set.
  private[mutable] def highestBitPosition(int: Int): Int = {
    // bug fix: bitPositions emits positions least-significant first, so the
    // highest set bit is the LAST element, not the head.
    bitPositions(int).lastOption.getOrElse(0)
  }

  // Inverse of bitPositions: rebuild an Int from its set-bit positions.
  private[mutable] def fromBitPositions(l: List[Int]): Int = {
    var x = 0x00
    l.foreach{bitPosition =>
      val mask = 1 << bitPosition
      x = x | mask
    }
    x
  }

  // Int serves a fingerprint

  def apply(
    numBuckets: Long, // X
    numEntriesPerBucket: Int,
    numBitsPerEntry: Int
  ): CuckooTable = new CuckooTable(
    ThreadSafeBitSet(numBitsToPreallocate = numBuckets * numEntriesPerBucket * 
numBitsPerEntry), 121 | numBuckets, numEntriesPerBucket, numBitsPerEntry 122 | ) 123 | } -------------------------------------------------------------------------------- /core/src/main/scala/io/chrisdavenport/probabilistic/mutable/ThreadSafeBitSet.scala: -------------------------------------------------------------------------------- 1 | package io.chrisdavenport.probabilistic 2 | package mutable 3 | 4 | import java.util.concurrent.atomic.AtomicLongArray 5 | import java.util.concurrent.atomic.AtomicReference 6 | import scala.util.control.Breaks 7 | import scala.util.hashing.MurmurHash3 8 | import scala.collection.BitSet 9 | 10 | // More Like a BitVector that a bitset, but the name is what Scala calls this 11 | class ThreadSafeBitSet private ( 12 | private final val numLongsPerSegment: Int, 13 | private final val log2SegmentSize: Int, 14 | private final val segmentMask: Int, 15 | private final val segments: AtomicReference[ThreadSafeBitSet.ThreadSafeBitSegments] 16 | ) { // TODO extends scala.collection.mutable.BitSet 17 | 18 | /* 19 | * -------------------- 20 | * Modifications 21 | * -------------------- 22 | */ 23 | def set(position: Long): Unit = { 24 | val segmentPosition = position >>> log2SegmentSize // which segment -- div by num bits per segment 25 | val longPosition = (position >>> 6) & segmentMask // which long in the segment -- remainder of div by num bits per segment 26 | val bitPosition = position & 0x3F // which bit in the long -- remainder of div by num bits in long (64) -- positive bits 27 | val segment = getSegment(segmentPosition.toInt) 28 | val mask = 1L << bitPosition 29 | var retry = true 30 | while (retry){ 31 | val currentLongValue = segment.get(longPosition.toInt) 32 | val newLongValue = currentLongValue | mask 33 | if (segment.compareAndSet(longPosition.toInt, currentLongValue, newLongValue)){ 34 | retry = false 35 | } 36 | } 37 | } 38 | 39 | def clear(position: Long): Unit = { 40 | val segmentPosition = position >>> log2SegmentSize // 
which segment -- div by num bits per segment 41 | val longPosition = (position >>> 6) & segmentMask // which long in the segment -- remainder of div by num bits per segment 42 | val bitPosition = position & 0x3F /// which bit in the long -- remainder of div by num bits in long (64) 43 | val segment = getSegment(segmentPosition.toInt) 44 | val mask = ~(1L << bitPosition) 45 | var retry = true 46 | while (retry){ 47 | val currentLongValue = segment.get(longPosition.toInt) 48 | val newLongValue = currentLongValue & mask 49 | if (segment.compareAndSet(longPosition.toInt, currentLongValue, newLongValue)){ 50 | retry = false 51 | } 52 | } 53 | } 54 | def get(position: Long): Boolean = { 55 | val segmentPosition = position >>> log2SegmentSize // which segment -- div by num bits per segment 56 | val longPosition = (position >>> 6) & segmentMask // which long in the segment -- remainder of div by num bits per segment 57 | val bitPosition = position & 0x3F /// which bit in the long -- remainder of div by num bits in long (64) 58 | val segment = getSegment(segmentPosition.toInt) 59 | val mask = 1L << bitPosition 60 | (segment.get(longPosition.toInt) & mask) != 0 61 | } 62 | 63 | /** 64 | * Clear all bits to 0. 
65 | */ 66 | def clearAll(): Unit = { 67 | val visibleSegments = segments.get 68 | for { 69 | i <- 0 until visibleSegments.numSegments 70 | segment = visibleSegments.getSegment(i) 71 | j <- 0 until segment.length() 72 | } { 73 | segment.set(j, 0L) 74 | } 75 | } 76 | 77 | /* 78 | * -------------------- 79 | * Informational 80 | * -------------------- 81 | */ 82 | 83 | def maxSetBit: Long = { 84 | val breaks = new Breaks 85 | val viewableSegments = segments.get() 86 | var bitPosition = -1L 87 | breaks.breakable{ 88 | for { 89 | segmentIdx <- (viewableSegments.numSegments - 1) to 0 by -1 90 | segment = viewableSegments.getSegment(segmentIdx) 91 | longIdx <- (segment.length() - 1) to 0 by -1 92 | } { 93 | val l = segment.get(longIdx) 94 | if (l != 0) { 95 | bitPosition = (segmentIdx.toLong << log2SegmentSize) + (longIdx * 64) + (63 - java.lang.Long.numberOfLeadingZeros(l)) 96 | breaks.break() 97 | } 98 | } 99 | } 100 | bitPosition 101 | } 102 | 103 | def nextSetBit(fromIndex: Long): Long = { 104 | require(fromIndex >= 0, s"fromIndex must be >= 0: got $fromIndex") 105 | var segmentPosition = fromIndex >>> log2SegmentSize 106 | val viewableSegments = segments.get() 107 | if (segmentPosition >= viewableSegments.numSegments) -1 108 | else { 109 | var longPosition = (fromIndex >>> 6) & segmentMask // which long in the segment -- remainder of div by num bits per segment 110 | val bitPosition = fromIndex & 0x3F // which bit in the long -- remainder of div by num bits in long (64) 111 | var segment = viewableSegments.getSegment(segmentPosition.toInt) 112 | var word = segment.get(longPosition.toInt) & (0xffffffffffffffffL << bitPosition) 113 | var response = -1L 114 | var loop = true 115 | while (loop) { 116 | if (word != 0) { 117 | response = (segmentPosition << (log2SegmentSize)) + (longPosition << 6) + java.lang.Long.numberOfTrailingZeros(word) 118 | loop = false 119 | } else { 120 | longPosition += 1 121 | if (longPosition > segmentMask) { 122 | segmentPosition += 1 123 | 
if (segmentPosition >= viewableSegments.numSegments) { 124 | loop = false 125 | // No bits set, return - 126 | } else { 127 | segment = viewableSegments.getSegment(segmentPosition.toInt) 128 | longPosition = 0 129 | word = segment.get(longPosition.toInt) 130 | } 131 | } else { 132 | word = segment.get(longPosition.toInt) 133 | } 134 | } 135 | } 136 | 137 | response 138 | } 139 | } 140 | 141 | /** 142 | * The numbers of bits which are set in this bit set. 143 | **/ 144 | def cardinality: Long = { 145 | val viewableSegments = segments.get() 146 | var numSetBits = 0L 147 | for { 148 | i <- 0 until viewableSegments.numSegments 149 | segment = viewableSegments.getSegment(i) 150 | j <- 0 until segment.length() 151 | } { 152 | numSetBits += java.lang.Long.bitCount(segment.get(j)) 153 | } 154 | numSetBits 155 | } 156 | 157 | /** 158 | * The number of bits which are currently specified by this bit set. This 159 | * is the maximum number which you might need to iterate if you were to 160 | * iterate over all the bits in this set. 
161 | */ 162 | def currentCapacity: Int = 163 | segments.get().numSegments * (1 << log2SegmentSize) 164 | 165 | 166 | def eqv(other: ThreadSafeBitSet): Boolean = { 167 | require(other.log2SegmentSize == log2SegmentSize, "Segment sizes must be the same") 168 | val thisSegments = segments.get 169 | val otherSegments = other.segments.get 170 | var allEqual = true 171 | 172 | val breaks = new Breaks 173 | 174 | breaks.breakable{ 175 | // Check All of That Equal to All of This 176 | for { 177 | i <- 0 until thisSegments.numSegments 178 | thisArray = thisSegments.getSegment(i) 179 | otherArray = { 180 | if (i < otherSegments.numSegments) Some(otherSegments.getSegment(i)) 181 | else None 182 | } 183 | j <- 0 until thisArray.length() 184 | } { 185 | val thisLong = thisArray.get(j) 186 | val otherLong = otherArray.map(_.get(j)).getOrElse(0L) 187 | if (thisLong != otherLong) { 188 | allEqual = false 189 | breaks.break() 190 | } 191 | } 192 | // Check that anything left in that is equal to 0 193 | for { 194 | i <- thisSegments.numSegments until otherSegments.numSegments 195 | otherArray = otherSegments.getSegment(i) 196 | j <- 0 until otherArray.length 197 | } { 198 | val l = otherArray.get(j) 199 | if (l != 0) { 200 | allEqual = false 201 | breaks.break() 202 | } 203 | } 204 | } 205 | 206 | allEqual 207 | } 208 | 209 | /** 210 | * Return a new bit set which contains all bits which are contained in this bit set, and which are NOT contained in the `other` bit set. 211 | * 212 | * In other words, return a new bit set, which is a bitwise and with the bitwise not of the other bit set. 
213 | * 214 | */ 215 | def andNot(other: ThreadSafeBitSet): ThreadSafeBitSet = { 216 | require(other.log2SegmentSize == log2SegmentSize, "Segment sizes must be the same") 217 | val thisSegments = segments.get() 218 | val otherSegments = other.segments.get() 219 | val newSegments = ThreadSafeBitSet.ThreadSafeBitSegments(thisSegments.numSegments, numLongsPerSegment) 220 | for { 221 | i <- 0 until thisSegments.numSegments 222 | thisArray = thisSegments.getSegment(i) 223 | otherArray = { 224 | if (i < otherSegments.numSegments) Some(otherSegments.getSegment(i)) 225 | else None 226 | } 227 | newArray = newSegments.getSegment(i) 228 | j <- 0 until thisArray.length() 229 | } { 230 | val thisLong = thisArray.get(j) 231 | val otherLong = otherArray.fold(0L)(a => a.get(j)) 232 | newArray.set(j, thisLong & ~ otherLong) 233 | } 234 | val andNot = ThreadSafeBitSet(log2SegmentSize) 235 | andNot.segments.set(newSegments) 236 | andNot 237 | } 238 | 239 | // Get the segment at `segmentIndex`. If this segment does not yet exist, create it. 240 | def getSegment(segmentIndex: Int): AtomicLongArray = { 241 | var visibleSegments = segments.get 242 | 243 | while(visibleSegments.numSegments <= segmentIndex){ 244 | // Thread safety: newVisibleSegments contains all of the segments from the currently visible segments, plus extra. 245 | // all of the segments in the currently visible segments are canonical and will not change. 246 | val newVisibleSegments = ThreadSafeBitSet.ThreadSafeBitSegments(visibleSegments, segmentIndex + 1, numLongsPerSegment) 247 | 248 | // because we are using a compareAndSet, if this thread "wins the race" and successfully sets this variable, then the segments 249 | // which are newly defined in newVisibleSegments become canonical. 
if (segments.compareAndSet(visibleSegments, newVisibleSegments)) {
        visibleSegments = newVisibleSegments
      } else {
        // If we "lose the race" and are growing the ThreadSafeBitSet segments larger,
        // then we will gather the new canonical sets from the update which we missed on the next iteration of this loop.
        // Newly defined segments in newVisibleSegments will be discarded, they do not get to become canonical.
        visibleSegments = segments.get();
      }
    }

    visibleSegments.getSegment(segmentIndex)
  }

  override def equals(obj: Any): Boolean = obj match {
    case that: ThreadSafeBitSet => eqv(that)
    case _ => false
  }

  override def hashCode(): Int = {
    31 * log2SegmentSize +
      MurmurHash3.arrayHash(segments.get().segments)
  }

  // Only works if Int Capable values are there
  def toMutableBitSet: scala.collection.mutable.BitSet = {
    val resultSet = scala.collection.mutable.BitSet.empty
    var ordinal = nextSetBit(0)
    while(ordinal != -1){
      resultSet.add(ordinal.toInt)
      ordinal = nextSetBit(ordinal + 1)
    }
    resultSet
  }

  // Renders the positions of all set bits, e.g. "ThreadSafeBitSet(1,5,64)".
  override def toString(): String = {
    val longs = new scala.collection.mutable.ListBuffer[Long]()
    var ordinal = nextSetBit(0)
    while(ordinal != -1L){
      // bug fix: `longs :+ ordinal` builds a new collection and discards it, so
      // toString always rendered an empty list; `+=` appends in place.
      longs += ordinal
      ordinal = nextSetBit(ordinal + 1)
    }
    "ThreadSafeBitSet(" + longs.mkString(",") + ")"
  }
}

object ThreadSafeBitSet {
  val DEFAULT_LOG2_SEGMENT_SIZE_IN_BITS = 14

  // TODO: Overloads
  // def apply(): ThreadSafeBitSet = apply(DEFAULT_LOG2_SEGMENT_SIZE_IN_BITS)
  // def apply(log2SegmentSize: Int): ThreadSafeBitSet = apply(log2SegmentSize, 0)
  def apply(
    log2SegmentSizeInBits: Int = DEFAULT_LOG2_SEGMENT_SIZE_IN_BITS,
    numBitsToPreallocate: Long = 0L
  ): ThreadSafeBitSet = {
    require(log2SegmentSizeInBits > 6, "Cannot 
specify fewer than 64 bits in each segment!") 306 | val log2SegmentSize = log2SegmentSizeInBits 307 | val numLongsPerSegment = (1 << (log2SegmentSizeInBits - 6)) 308 | val segmentMask = numLongsPerSegment - 1 309 | val numBitsPerSegment = numLongsPerSegment * 64 310 | val numSegmentsToPreallocate = 311 | if (numBitsToPreallocate == 0) 1 312 | else ((numBitsToPreallocate - 1) / numBitsPerSegment) + 1 313 | val segments = new AtomicReference[ThreadSafeBitSegments]() 314 | segments.set(ThreadSafeBitSegments(numSegmentsToPreallocate.toInt, numLongsPerSegment)) 315 | 316 | new ThreadSafeBitSet(numLongsPerSegment, log2SegmentSize, segmentMask, segments) 317 | } 318 | 319 | def fromBitSet( 320 | bitSet: BitSet, 321 | log2SegmentSize: Int = DEFAULT_LOG2_SEGMENT_SIZE_IN_BITS 322 | ): ThreadSafeBitSet = { 323 | val tsb = apply(log2SegmentSize, bitSet.size.toLong) 324 | bitSet.foreach(i => 325 | tsb.set(i.toLong) 326 | ) 327 | tsb 328 | } 329 | 330 | // def orAll(bitSets: ThreadSafeBitSet*): ThreadSafeBitSet = { 331 | // ??? 
332 | // } 333 | 334 | private class ThreadSafeBitSegments private (private[ThreadSafeBitSet] final val segments: Array[AtomicLongArray]){ 335 | def numSegments = segments.length 336 | def getSegment(index: Int) = segments(index) 337 | } 338 | private object ThreadSafeBitSegments { 339 | def apply(numSegments: Int, segmentLength: Int) = { 340 | val segments = new Array[AtomicLongArray](numSegments) 341 | for(i <- 0 until numSegments) { 342 | segments.update(i, new AtomicLongArray(segmentLength)) 343 | } 344 | new ThreadSafeBitSegments(segments) 345 | } 346 | def apply(copyFrom: ThreadSafeBitSegments, numSegments: Int, segmentLength: Int) = { 347 | val segments = new Array[AtomicLongArray](numSegments) 348 | for(i <- 0 until numSegments) { 349 | val set = if (i < copyFrom.numSegments) copyFrom.getSegment(i) else new AtomicLongArray(segmentLength) 350 | segments.update(i, set) 351 | } 352 | new ThreadSafeBitSegments(segments) 353 | } 354 | } 355 | } -------------------------------------------------------------------------------- /core/src/test/scala/io/chrisdavenport/probabilistic/MainSpec.scala: -------------------------------------------------------------------------------- 1 | package io.chrisdavenport.probabilistic 2 | 3 | import munit.CatsEffectSuite 4 | import cats.effect._ 5 | 6 | class MainSpec extends CatsEffectSuite { 7 | 8 | test("Main should exit succesfully") { 9 | assertIO(IO(true), true) 10 | } 11 | 12 | } 13 | -------------------------------------------------------------------------------- /examples/src/main/scala/BloomExample.scala: -------------------------------------------------------------------------------- 1 | 2 | import cats.effect._ 3 | import io.chrisdavenport.probabilistic.BloomFilter 4 | 5 | object BloomExample extends IOApp { 6 | 7 | def run(args: List[String]): IO[ExitCode] = { 8 | val x = "Foo" 9 | for { 10 | bf <- BloomFilter.string[IO](numberOfItems = 10000, falsePositiveRate = 0.01) 11 | present1 <- bf.mayContain(x) // False - It 
hasn't been inserted yet 12 | _ <- IO(println(present1)) 13 | _ <- bf.add(x) 14 | present2 <- bf.mayContain(x) // True - It was inserted 15 | _ <- IO(println(present2)) 16 | } yield ExitCode.Success 17 | } 18 | 19 | } -------------------------------------------------------------------------------- /examples/src/main/scala/CuckooExample.scala: -------------------------------------------------------------------------------- 1 | import io.chrisdavenport.probabilistic.CuckooFilter 2 | import cats.effect._ 3 | 4 | object CuckooExample extends IOApp { 5 | 6 | def run(args: List[String]): IO[ExitCode] = { 7 | val x = "Foo" 8 | for { 9 | cf <- CuckooFilter.string[IO](numberOfItems = 10000, falsePositiveRate = 0.01) 10 | present1 <- cf.mayContain(x) // False - It hasn't been inserted yet 11 | _ <- IO(println(present1)) 12 | _ <- cf.add(x) 13 | present2 <- cf.mayContain(x) // True - It was inserted 14 | _ <- IO(println(present2)) 15 | _ <- cf.remove(x) 16 | present3 <- cf.mayContain(x) // False - It was removed again. Cool! 17 | _ <- IO(println(present3)) 18 | } yield ExitCode.Success 19 | } 20 | 21 | } -------------------------------------------------------------------------------- /licenses/apache: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. 
For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 
47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. 
Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. 
You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 
122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. 
In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "{}" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. 
We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright {yyyy} {name of copyright owner} 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | -------------------------------------------------------------------------------- /project/build.properties: -------------------------------------------------------------------------------- 1 | sbt.version=1.4.9 2 | -------------------------------------------------------------------------------- /project/plugins.sbt: -------------------------------------------------------------------------------- 1 | // addSbtPlugin("io.github.davidgregory084" % "sbt-tpolecat" % "0.1.16") 2 | addSbtPlugin("com.typesafe" % "sbt-mima-plugin" % "0.8.1") 3 | addSbtPlugin("io.chrisdavenport" % "sbt-mima-version-check" % "0.1.2") 4 | addSbtPlugin("io.chrisdavenport" % "sbt-no-publish" % "0.1.0") 5 | 6 | addSbtPlugin("org.portable-scala" % "sbt-scalajs-crossproject" % "1.0.0") 7 | addSbtPlugin("org.scala-js" % "sbt-scalajs" % "1.4.0") 8 | addSbtPlugin("com.github.cb372" % "sbt-explicit-dependencies" % "0.2.16") 9 | addSbtPlugin("com.geirsson" % "sbt-ci-release" % "1.5.5") 10 | addSbtPlugin("ch.epfl.lamp" % "sbt-dotty" % "0.5.1") 11 | addSbtPlugin("com.codecommit" % "sbt-github-actions" % "0.9.5") 12 | 13 | addSbtPlugin("org.scalameta" % "sbt-mdoc" 
% "2.2.16") 14 | addSbtPlugin("com.47deg" % "sbt-microsites" % "1.3.0") 15 | addSbtPlugin("com.typesafe.sbt" % "sbt-ghpages" % "0.6.3") -------------------------------------------------------------------------------- /site/Gemfile: -------------------------------------------------------------------------------- 1 | source 'https://rubygems.org' 2 | 3 | gem "jekyll", ">= 4.0.0" 4 | gem "jekyll-relative-links" 5 | gem "sass" -------------------------------------------------------------------------------- /site/docs/index.md: -------------------------------------------------------------------------------- 1 | --- 2 | layout: home 3 | 4 | --- 5 | 6 | # probabilistic - Probabilistic Data Structures [![Build Status](https://travis-ci.com/ChristopherDavenport/probabilistic.svg?branch=master)](https://travis-ci.com/ChristopherDavenport/probabilistic) [![Maven Central](https://maven-badges.herokuapp.com/maven-central/io.chrisdavenport/probabilistic_2.12/badge.svg)](https://maven-badges.herokuapp.com/maven-central/io.chrisdavenport/probabilistic_2.12) 7 | 8 | ## Quick Start 9 | 10 | To use probabilistic in an existing SBT project with Scala 2.12 or a later version, add the following dependencies to your 11 | `build.sbt` depending on your needs: 12 | 13 | ```scala 14 | libraryDependencies ++= Seq( 15 | "io.chrisdavenport" %% "probabilistic" % "" 16 | ) 17 | ``` --------------------------------------------------------------------------------