├── .editorconfig ├── .github ├── dependabot.yml └── workflows │ ├── publish.yml │ ├── python-package.yml │ └── ruff.yml ├── .gitignore ├── .pre-commit-config.yaml ├── .pylintrc ├── .readthedocs.yaml ├── CHANGELOG.md ├── CITATION.cff ├── LICENSE ├── README.rst ├── codecov.yml ├── docs ├── Makefile ├── requirements.txt └── source │ ├── _static │ └── custom.css │ ├── code.rst │ ├── conf.py │ ├── index.rst │ └── quickstart.rst ├── probables ├── __init__.py ├── blooms │ ├── __init__.py │ ├── bloom.py │ ├── countingbloom.py │ ├── expandingbloom.py │ └── py.typed ├── constants.py ├── countminsketch │ ├── __init__.py │ ├── countminsketch.py │ └── py.typed ├── cuckoo │ ├── __init__.py │ ├── countingcuckoo.py │ ├── cuckoo.py │ └── py.typed ├── exceptions.py ├── hashes.py ├── py.typed ├── quotientfilter │ ├── __init__.py │ ├── py.typed │ └── quotientfilter.py └── utilities.py ├── pyproject.toml ├── scripts └── version_bump.py └── tests ├── __init__.py ├── bloom_test.py ├── countingbloom_test.py ├── countingcuckoo_test.py ├── countminsketch_test.py ├── cuckoo_test.py ├── expandingbloom_test.py ├── hashes_test.py ├── quotientfilter_test.py ├── test_utilities.py └── utilities.py /.editorconfig: -------------------------------------------------------------------------------- 1 | root = true 2 | 3 | [*] 4 | charset = utf-8 5 | indent_style = space 6 | indent_size = 4 7 | insert_final_newline = true 8 | end_of_line = lf 9 | 10 | [*.{yml,yaml}] 11 | indent_style = space 12 | indent_size = 2 13 | -------------------------------------------------------------------------------- /.github/dependabot.yml: -------------------------------------------------------------------------------- 1 | # To get started with Dependabot version updates, you'll need to specify which 2 | # package ecosystems to update and where the package manifests are located. 
3 | # Please see the documentation for all configuration options: 4 | # https://docs.github.com/code-security/dependabot/dependabot-version-updates/configuration-options-for-the-dependabot.yml-file 5 | 6 | version: 2 7 | updates: 8 | 9 | - package-ecosystem: "github-actions" 10 | directory: "/" 11 | schedule: 12 | # Check for updates to GitHub Actions every week 13 | interval: "weekly" 14 | -------------------------------------------------------------------------------- /.github/workflows/publish.yml: -------------------------------------------------------------------------------- 1 | # This workflow will upload a Python Package using Twine when a release is created 2 | # For more information see: https://help.github.com/en/actions/language-and-framework-guides/using-python-with-github-actions#publishing-to-package-registries 3 | 4 | name: Upload Python Package 5 | 6 | on: 7 | release: 8 | types: [created] 9 | 10 | jobs: 11 | deploy: 12 | 13 | runs-on: ubuntu-latest 14 | 15 | steps: 16 | - uses: actions/checkout@v4 17 | - name: Set up Python 18 | uses: actions/setup-python@v5 19 | with: 20 | python-version: '3.x' 21 | - name: Install dependencies 22 | run: | 23 | python -m pip install --upgrade pip 24 | if [ -f requirements.txt ]; then pip install -r requirements.txt; fi 25 | pip install twine build 26 | - name: Build and publish 27 | env: 28 | TWINE_USERNAME: __token__ 29 | TWINE_PASSWORD: ${{ secrets.PYPI_PASSWORD }} 30 | run: | 31 | python -m build 32 | twine upload dist/* 33 | -------------------------------------------------------------------------------- /.github/workflows/python-package.yml: -------------------------------------------------------------------------------- 1 | # This workflow will install Python dependencies, run tests, and lint with a variety of Python versions 2 | # For more information see: https://help.github.com/actions/language-and-framework-guides/using-python-with-github-actions 3 | 4 | name: Python package 5 | 6 | on: [push, pull_request] 7 | 8 | jobs: 9 | build: 10 | 11 | runs-on: ubuntu-latest 12 | strategy: 13 | matrix: 14 | python-version: ['3.9', '3.10', '3.11', '3.12', '3.13'] 15 | 16 | steps: 17 | - uses: actions/checkout@v4 18 | - name: Set up Python ${{ matrix.python-version }} 19 | uses: actions/setup-python@v5 20 | with: 21 | python-version: ${{ matrix.python-version }} 22 | - name: Install dependencies 23 | run: | 24 | python -m pip install --upgrade pip 25 | python -m pip install flake8 pytest pytest-cov 26 | if [ -f requirements.txt ]; then pip install -r requirements.txt; fi 27 | - name: Lint with flake8 28 | run: | 29 | # stop the build if there are Python syntax errors or undefined names 30 | flake8 probables/ --count --select=E9,F63,F7,F82 --show-source --statistics 31 | # exit-zero treats all errors as warnings.
The GitHub editor is 127 chars wide 32 | flake8 probables/ --count --exit-zero --max-complexity=11 --max-line-length=127 --statistics 33 | - name: Test with pytest 34 | run: | 35 | # Run tests while also generating coverage statistics 36 | pytest --cov=./ --cov-report=xml 37 | - name: Upload coverage to Codecov 38 | uses: codecov/codecov-action@v5 39 | with: 40 | files: ./coverage.xml 41 | fail_ci_if_error: false 42 | 43 | build-verification: 44 | 45 | runs-on: ubuntu-latest 46 | steps: 47 | - uses: actions/checkout@v4 48 | - uses: actions/setup-python@v5 49 | with: 50 | python-version: '3.x' 51 | - name: Build and check twine 52 | run: | 53 | python -m pip install --upgrade pip 54 | python -m pip install build twine 55 | python -m build 56 | twine check dist/* 57 | 58 | Lint-black: 59 | runs-on: ubuntu-latest 60 | steps: 61 | - uses: actions/checkout@v4 62 | - uses: psf/black@stable 63 | with: 64 | # src: "./probables" 65 | version: "22.8.0" 66 | -------------------------------------------------------------------------------- /.github/workflows/ruff.yml: -------------------------------------------------------------------------------- 1 | name: Ruff 2 | on: [workflow_dispatch, pull_request] 3 | jobs: 4 | build: 5 | runs-on: ubuntu-latest 6 | steps: 7 | - uses: actions/checkout@v4 8 | - name: Install Python 9 | uses: actions/setup-python@v5 10 | with: 11 | python-version: "3.13" 12 | - uses: astral-sh/ruff-action@v3 13 | with: 14 | args: "check --fix" 15 | continue-on-error: false -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | ############################################### 2 | # Project Specific 3 | ############################################### 4 | *.blm 5 | *.cms 6 | *.dat 7 | 8 | ############################################### 9 | # Python 10 | ############################################### 11 | 12 | # Byte-compiled / optimized / DLL files 13 | __pycache__/ 14 | *.py[cod] 15 | *$py.class 16 | 17 | # C extensions 18 | *.so 19 | 20 | # Distribution / packaging 21 | .Python 22 | env/ 23 | build/ 24 | develop-eggs/ 25 | dist/ 26 | downloads/ 27 | eggs/ 28 | .eggs/ 29 | lib/ 30 | lib64/ 31 | parts/ 32 | sdist/ 33 | var/ 34 | wheels/ 35 | *.egg-info/ 36 | .installed.cfg 37 | *.egg 38 | 39 | # PyInstaller 40 | # Usually these files are written by a python script from a template 41 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
42 | *.manifest 43 | *.spec 44 | 45 | # Installer logs 46 | pip-log.txt 47 | pip-delete-this-directory.txt 48 | 49 | # Unit test / coverage reports 50 | htmlcov/ 51 | .tox/ 52 | .coverage 53 | .coverage.* 54 | .cache 55 | nosetests.xml 56 | coverage.xml 57 | *.cover 58 | .hypothesis/ 59 | .pytest_cache/ 60 | 61 | # Translations 62 | *.mo 63 | *.pot 64 | 65 | # Django stuff: 66 | *.log 67 | local_settings.py 68 | 69 | # Flask stuff: 70 | instance/ 71 | .webassets-cache 72 | 73 | # Scrapy stuff: 74 | .scrapy 75 | 76 | # Sphinx documentation 77 | docs/_build/ 78 | 79 | # PyBuilder 80 | target/ 81 | 82 | # Jupyter Notebook 83 | .ipynb_checkpoints 84 | 85 | # pyenv 86 | .python-version 87 | 88 | # celery beat schedule file 89 | celerybeat-schedule 90 | 91 | # SageMath parsed files 92 | *.sage.py 93 | 94 | # dotenv 95 | .env 96 | 97 | # virtualenv 98 | .venv 99 | venv/ 100 | ENV/ 101 | Pipfile* 102 | 103 | # Spyder project settings 104 | .spyderproject 105 | .spyproject 106 | 107 | # Rope project settings 108 | .ropeproject 109 | 110 | # mkdocs documentation 111 | /site 112 | 113 | # mypy 114 | .mypy_cache/ 115 | 116 | 117 | ############################################### 118 | # Operating Systems 119 | ############################################### 120 | # Windows thumbnail cache files 121 | Thumbs.db 122 | Thumbs.db:encryptable 123 | ehthumbs.db 124 | ehthumbs_vista.db 125 | 126 | # Dump file 127 | *.stackdump 128 | 129 | # Folder config file 130 | [Dd]esktop.ini 131 | 132 | # Recycle Bin used on file shares 133 | $RECYCLE.BIN/ 134 | 135 | # Windows Installer files 136 | *.cab 137 | *.msi 138 | *.msix 139 | *.msm 140 | *.msp 141 | 142 | # Windows shortcuts 143 | *.lnk 144 | 145 | # 146 | # MacOS 147 | # 148 | 149 | # General 150 | .DS_Store 151 | .AppleDouble 152 | .LSOverride 153 | 154 | # Icon must end with two \r 155 | Icon 156 | 157 | 158 | # Thumbnails 159 | ._* 160 | 161 | # Files that might appear in the root of a volume 162 | .DocumentRevisions-V100 163 | .fseventsd 164 | .Spotlight-V100 165 | .TemporaryItems 166 | .Trashes 167 | .VolumeIcon.icns 168 | .com.apple.timemachine.donotpresent 169 | 170 | # Directories potentially created on remote AFP share 171 | .AppleDB 172 | .AppleDesktop 173 | Network Trash Folder 174 | Temporary Items 175 | .apdisk 176 | 177 | # 178 | # Linux 179 | # 180 | *~ 181 | 182 | # temporary files which can be created if a process still has a handle open of a deleted file 183 | .fuse_hidden* 184 | 185 | # KDE directory preferences 186 | .directory 187 | 188 | # Linux trash folder which might appear on any partition or disk 189 | .Trash-* 190 | 191 | # .nfs files are created when an open file is removed but is still being accessed 192 | .nfs* 193 | -------------------------------------------------------------------------------- /.pre-commit-config.yaml: -------------------------------------------------------------------------------- 1 | repos: 2 | - repo: https://github.com/pre-commit/pre-commit-hooks 3 | rev: v3.2.0 4 | hooks: 5 | - id: check-yaml 6 | - id: check-toml 7 | - id: check-json 8 | - id: end-of-file-fixer 9 | - id: trailing-whitespace 10 | - id: debug-statements 11 | 12 | - repo: https://github.com/psf/black 13 | rev: 20.8b1 14 | hooks: 15 | - id: black 16 | exclude: ^docs/ 17 | - repo: https://github.com/pycqa/isort 18 | rev: 5.6.3 19 | hooks: 20 | - id: isort 21 | exclude: ^docs/ 22 | -------------------------------------------------------------------------------- /.readthedocs.yaml: 
-------------------------------------------------------------------------------- 1 | # .readthedocs.yaml 2 | # Read the Docs configuration file 3 | # See https://docs.readthedocs.io/en/stable/config-file/v2.html for details 4 | 5 | # Required 6 | version: 2 7 | 8 | # Set the version of Python and other tools you might need 9 | build: 10 | os: ubuntu-22.04 11 | tools: 12 | python: "3.11" 13 | 14 | # Build documentation in the docs/ directory with Sphinx 15 | sphinx: 16 | configuration: docs/source/conf.py 17 | 18 | # We recommend specifying your dependencies to enable reproducible builds: 19 | # https://docs.readthedocs.io/en/stable/guides/reproducible-builds.html 20 | python: 21 | install: 22 | - requirements: docs/requirements.txt -------------------------------------------------------------------------------- /CHANGELOG.md: -------------------------------------------------------------------------------- 1 | # PyProbables Changelog 2 | 3 | ### Version 0.6.1 4 | 5 | * Quotient Filter: 6 | * Add ability to get hashes from the filter, either as a list or as a generator 7 | * Add quotient filter expand capability, auto and on request 8 | * Add QuotientFilterError exception 9 | * Add merge functionality 10 | * Add retrieve hashes from the filter 11 | * Add resize filter, automatically or programmatically 12 | * Add merging two filters into one 13 | * Add removal of an element from the filter 14 | * Count-Min Sketch: 15 | * Fix bug in elements added calculation when joining Count-Min Sketches; see [PR #119](https://github.com/barrust/pyprobables/pull/119); Thanks [@cunla](https://github.com/cunla) 16 | 17 | ### Version 0.6.0 18 | 19 | * Add `QuotientFilter` implementation; [see issue #37](https://github.com/barrust/pyprobables/issues/37) 20 | * Add `bitarray` implementation 21 | * Bitwise operations in lieu of modulo calculations 22 | 23 | ### Version 0.5.9 24 | 25 | * Add `py.typed` files so that mypy will find type annotations 26 | * Drop support for python `3.6` and `3.7` 27 | 28 | ### Version 0.5.8 29 | 30 | * Make the `mmap` utility class Windows compatible; see [PR #106](https://github.com/barrust/pyprobables/pull/106); Thanks [@leonhma](https://github.com/leonhma) 31 | 32 | ### Version 0.5.7 33 | 34 | * Update Build System and update project metadata 35 | * Better support for `resolve_path` in passed filenames 36 | * Remove Python 3.5 support 37 | * Pylint inspired updates 38 | 39 | ### Version 0.5.6 40 | 41 | * Bloom Filters: 42 | * Fix for `ValueError` exception when using `estimate_elements()` when all bits are set 43 | * Add Citation file 44 | 45 | ### Version 0.5.5 46 | 47 | * Bloom Filters: 48 | * Re-implemented the entire Bloom Filter data structure to reduce complexity and code duplication 49 | * Removed unused imports 50 | * Removed unnecessary casts 51 | * Pylint Requested Style Changes: 52 | * Use python 3 `super()` 53 | * Use python 3 classes 54 | * Remove use of temporary variables if possible and still clear 55 | 56 | ### Version 0.5.4 57 | 58 | * All Probabilistic Data Structures: 59 | * Added ability to load each `frombytes()` 60 | * Updated underlying data structures of number based lists to be more space and time efficient; see [Issue #60](https://github.com/barrust/pyprobables/issues/60) 61 | * Cuckoo Filters: 62 | * Added `fingerprint_size_bits` property 63 | * Added `error_rate` property 64 | * Added ability to initialize based on error rate 65 | * Simplified typing 66 | * Ensure all `filepaths` can be `str` or `Path` 67 | 68 | ### Version 0.5.3 69 | 70 | * 
Additional type hinting 71 | * Improved format parsing and serialization; [see PR #81](https://github.com/barrust/pyprobables/pull/81). Thanks [@KOLANICH](https://github.com/KOLANICH) 72 | * Bloom Filters 73 | * Added `export_to_hex` functionality for Bloom Filters on Disk 74 | * Export as C header (**\*.h**) for Bloom Filters on Disk and Counting Bloom Filters 75 | * Added support for more input types for exporting and loading of saved files 76 | 77 | ### Version 0.5.2 78 | 79 | * Add ability to hash bytes along with strings 80 | * Make all test files individually executable from the CLI. Thanks [@KOLANICH](https://github.com/KOLANICH) 81 | * Added type hints 82 | 83 | ### Version 0.5.1 84 | 85 | * Bloom Filter: 86 | * Export as a C header (**\*.h**) 87 | * Count-Min Sketch 88 | * Add join/merge functionality 89 | * Moved testing to use `NamedTemporaryFile` for file based tests 90 | 91 | ### Version 0.5.0 92 | 93 | * ***BACKWARD INCOMPATIBLE CHANGES*** 94 | * **NOTE:** Breaks backwards compatibility with previously exported blooms, counting-blooms, cuckoo filter, or count-min-sketch files using the default hash! 95 | * Update to the FNV_1a hash function 96 | * Simplified the default hash to use a seed value 97 | * Ensure passing of depth to hashing function when using `hash_with_depth_int` or `hash_with_depth_bytes` 98 | 99 | ## Version 0.4.1 100 | 101 | * Resolve [issue 57](https://github.com/barrust/pyprobables/issues/57) where the false positive rate was not stored / used consistently in some instances 102 | 103 | ## Version 0.4.0 104 | 105 | * Remove **Python 2.7** support 106 | 107 | ### Version 0.3.2 108 | 109 | * Fix `RotatingBloomFilter` to keep information on number of elements inserted when exported and loaded. [see PR #50](https://github.com/barrust/pyprobables/pull/50) Thanks [@volker48](https://github.com/volker48) 110 | 111 | ### Version 0.3.1 112 | 113 | * Add additional **slots** 114 | * Very minor improvement to the hashing algorithm 115 | 116 | ### Version 0.3.0 117 | 118 | * Bloom Filters: 119 | * Import/Export of Expanding and Rotating Bloom Filters 120 | * Fix for importing standard Bloom Filters 121 | 122 | ### Version 0.2.6 123 | 124 | * Bloom Filters: 125 | * Addition of a Rotating Bloom Filter 126 | 127 | ### Version 0.2.5 128 | 129 | * Bloom Filters: 130 | * Addition of an Expanding Bloom Filter 131 | 132 | ### Version 0.2.0 133 | 134 | * Use **slots** 135 | 136 | ### Version 0.1.4 137 | 138 | * Drop support for python 3.3 139 | * Ensure passing parameters correctly to parent classes 140 | 141 | ### Version 0.1.3 142 | 143 | * Better parameter validation 144 | * Cuckoo Filters: 145 | * Support passing a different hash function 146 | * Support for different fingerprint size 147 | * Utility to help generate valid hashing strategies using decorators 148 | * hash_with_depth_bytes 149 | * hash_with_depth_int 150 | * Updated documentation 151 | 152 | ### Version 0.1.2 153 | 154 | * Counting Cuckoo Filter 155 | * Basic functionality: add, remove, check 156 | * Expand 157 | * Import / Export 158 | * Fix and tests for utility functions 159 | * Fix package build 160 | 161 | ### Version 0.1.1 162 | 163 | * CuckooFilter 164 | * Import / Export functionality 165 | * Enforce single insertion per key 166 | * Auto expand on insertion failure OR when called to do so (settable) 167 | 168 | ### Version 0.1.0 169 | 170 | * Cuckoo Filter 171 | * Added basic Cuckoo Filter code 172 | 173 | ### Version 0.0.8 174 | 175 | * Counting Bloom Filter 176 | * Estimate unique elements added 177 | * Union 
178 | * Intersection 179 | * Jaccard Index 180 | 181 | ### Version 0.0.7 182 | 183 | * Counting Bloom Filter 184 | * Fix counting bloom hex export / import 185 | * Fix for overflow issue in counting bloom export 186 | * Added ability to remove from counting bloom 187 | * Count-Min Sketch 188 | * Fix for not recording large numbers of inserts and deletions correctly 189 | 190 | ### Version 0.0.6 191 | 192 | * Probabilistic data structures added: 193 | * Counting Bloom Filter 194 | * Minor code clean-up 195 | * Re-factored Bloom Filters 196 | 197 | ### Version 0.0.5 198 | 199 | * Better on-line documentation 200 | * Changed access to some public functions 201 | 202 | ### Version 0.0.4 203 | 204 | * Probabilistic data structures: 205 | * Bloom Filter 206 | * Bloom Filter (on disk) 207 | * Count-Min Sketch 208 | * Count-Mean Sketch 209 | * Count-Mean-Min Sketch 210 | * Heavy Hitters 211 | * Stream Threshold 212 | * Import and export of each 213 | -------------------------------------------------------------------------------- /CITATION.cff: -------------------------------------------------------------------------------- 1 | # This CITATION.cff file was generated with cffinit. 2 | # Visit https://bit.ly/cffinit to generate yours today! 3 | 4 | cff-version: 1.2.0 5 | title: PyProbables 6 | message: >- 7 | If you use this software, please cite it using the 8 | metadata from this file. 9 | type: software 10 | authors: 11 | - given-names: Tyler 12 | family-names: Barrus 13 | email: barrust@gmail.com 14 | orcid: 'https://orcid.org/0000-0002-6691-0360' 15 | repository-code: 'https://github.com/barrust/pyprobables' 16 | abstract: >- 17 | A set of probabilistic data structures written in 18 | Python 19 | keywords: 20 | - Probabilistic 21 | - Data Structures 22 | - Bloom Filter 23 | - Count-Min Sketch 24 | - Cuckoo Filter 25 | - Counting Bloom Filter 26 | - Count-Mean-Min Sketch 27 | - Count-Mean Sketch 28 | - Heavy Hitters 29 | - Stream Threshold 30 | - Rolling Bloom Filter 31 | - Expanding Bloom Filter 32 | - Counting Cuckoo Filter 33 | - Quotient Filter 34 | license: MIT 35 | version: 0.6.0 36 | date-released: '2024-01-10' -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2017-2021 Tyler Barrus 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /README.rst: -------------------------------------------------------------------------------- 1 | PyProbables 2 | =========== 3 | 4 | .. image:: https://img.shields.io/badge/license-MIT-blue.svg 5 | :target: https://opensource.org/licenses/MIT/ 6 | :alt: License 7 | .. image:: https://img.shields.io/github/release/barrust/pyprobables.svg 8 | :target: https://github.com/barrust/pyprobables/releases 9 | :alt: GitHub release 10 | .. image:: https://github.com/barrust/pyprobables/workflows/Python%20package/badge.svg 11 | :target: https://github.com/barrust/pyprobables/actions?query=workflow%3A%22Python+package%22 12 | :alt: Build Status 13 | .. image:: https://codecov.io/gh/barrust/pyprobables/branch/master/graph/badge.svg?token=OdETiNgz9k 14 | :target: https://codecov.io/gh/barrust/pyprobables 15 | :alt: Test Coverage 16 | .. image:: https://readthedocs.org/projects/pyprobables/badge/?version=latest 17 | :target: http://pyprobables.readthedocs.io/en/latest/?badge=latest 18 | :alt: Documentation Status 19 | .. image:: https://badge.fury.io/py/pyprobables.svg 20 | :target: https://pypi.org/project/pyprobables/ 21 | :alt: Pypi Release 22 | .. image:: https://pepy.tech/badge/pyprobables 23 | :target: https://pepy.tech/project/pyprobables 24 | :alt: Downloads 25 | 26 | **pyprobables** is a pure-python library for probabilistic data structures. 27 | The goal is to provide the developer with a pure-python implementation of 28 | common probabilistic data-structures to use in their work. 29 | 30 | To achieve better raw performance, it is recommended to supply an alternative 31 | hashing algorithm that has been compiled in C. This could include using the 32 | md5 and sha512 algorithms provided or installing a third party package and 33 | writing your own hashing strategy. Some options include the murmur hash 34 | `mmh3 <https://github.com/hajimes/mmh3>`__ or those from the 35 | `pyhash <https://github.com/flier/pyfasthash>`__ library. Each data object in 36 | **pyprobables** makes it easy to pass in a custom hashing function. 37 | 38 | Read more about how to use `Supplying a pre-defined, alternative hashing strategy`_ 39 | or `Defining a hashing function using the provided decorators`_. 40 | 41 | Installation 42 | ------------------ 43 | 44 | Pip Installation: 45 | 46 | :: 47 | 48 | $ pip install pyprobables 49 | 50 | To install from source: 51 | 52 | Clone the `repository on GitHub <https://github.com/barrust/pyprobables> 53 | `__, then run the following from the project folder: 54 | 55 | :: 56 | 57 | $ pip install . 58 | 59 | `pyprobables` supports python 3.9 - 3.13 60 | 61 | For *python 2.7* support, install `release 0.3.2 `__ 62 | 63 | :: 64 | 65 | $ pip install pyprobables==0.3.2 66 | 67 | 68 | API Documentation 69 | --------------------- 70 | 71 | The documentation is hosted on 72 | `readthedocs.io <https://pyprobables.readthedocs.io/en/latest/>`__ 73 | 74 | You can build the documentation locally by running: 75 | 76 | :: 77 | 78 | $ pip install -r docs/requirements.txt 79 | $ cd docs/ 80 | $ make html 81 | 82 | 83 | 84 | Automated Tests 85 | ------------------ 86 | 87 | To run the automated tests, install `pytest` and run the following command 88 | from the project folder: 89 | 90 | :: 91 | 92 | $ pytest 93 | 94 | 95 | 96 | Quickstart 97 | ------------------ 98 | 99 | Import pyprobables and setup a Bloom Filter 100 | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ 101 | 102 | .. 
code:: python 103 | 104 | from probables import BloomFilter 105 | blm = BloomFilter(est_elements=1000, false_positive_rate=0.05) 106 | blm.add('google.com') 107 | blm.check('facebook.com') # should return False 108 | blm.check('google.com') # should return True 109 | 110 | 111 | Import pyprobables and setup a Count-Min Sketch 112 | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ 113 | 114 | .. code:: python 115 | 116 | from probables import CountMinSketch 117 | cms = CountMinSketch(width=1000, depth=5) 118 | cms.add('google.com') # should return 1 119 | cms.add('facebook.com', 25) # insert 25 at once; should return 25 120 | 121 | 122 | Import pyprobables and setup a Cuckoo Filter 123 | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ 124 | 125 | .. code:: python 126 | 127 | from probables import CuckooFilter 128 | cko = CuckooFilter(capacity=100, max_swaps=10) 129 | cko.add('google.com') 130 | cko.check('facebook.com') # should return False 131 | cko.check('google.com') # should return True 132 | 133 | 134 | Import pyprobables and setup a Quotient Filter 135 | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ 136 | 137 | .. code:: python 138 | 139 | from probables import QuotientFilter 140 | qf = QuotientFilter(quotient=24) 141 | qf.add('google.com') 142 | qf.check('facebook.com') # should return False 143 | qf.check('google.com') # should return True 144 | 145 | 146 | Supplying a pre-defined, alternative hashing strategy 147 | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ 148 | 149 | .. code:: python 150 | 151 | from probables import BloomFilter 152 | from probables.hashes import default_sha256 153 | blm = BloomFilter(est_elements=1000, false_positive_rate=0.05, 154 | hash_function=default_sha256) 155 | blm.add('google.com') 156 | blm.check('facebook.com') # should return False 157 | blm.check('google.com') # should return True 158 | 159 | 160 | .. _use-custom-hashing-strategies: 161 | 162 | Defining a hashing function using the provided decorators 163 | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ 164 | 165 | .. code:: python 166 | 167 | import mmh3 # murmur hash 3 implementation (pip install mmh3) 168 | from probables.hashes import hash_with_depth_bytes 169 | from probables import BloomFilter 170 | 171 | @hash_with_depth_bytes 172 | def my_hash(key, depth): 173 | return mmh3.hash_bytes(key, seed=depth) 174 | 175 | blm = BloomFilter(est_elements=1000, false_positive_rate=0.05, hash_function=my_hash) 176 | 177 | .. code:: python 178 | 179 | import hashlib 180 | from probables.hashes import hash_with_depth_int 181 | from probables.constants import UINT64_T_MAX 182 | from probables import BloomFilter 183 | 184 | @hash_with_depth_int 185 | def my_hash(key, seed=0, encoding="utf-8"): 186 | max64mod = UINT64_T_MAX + 1 187 | val = int(hashlib.sha512(key.encode(encoding)).hexdigest(), 16) 188 | val += seed # not a good example, but uses the seed value 189 | return val % max64mod 190 | 191 | blm = BloomFilter(est_elements=1000, false_positive_rate=0.05, hash_function=my_hash) 192 | 193 | 194 | See the `API documentation <https://pyprobables.readthedocs.io/en/latest/code.html>`__ 195 | for other data structures available and the 196 | `quickstart page <https://pyprobables.readthedocs.io/en/latest/quickstart.html>`__ 197 | for more examples! 198 | 199 | 200 | Changelog 201 | ------------------ 202 | 203 | Please see the `changelog 204 | <https://github.com/barrust/pyprobables/blob/master/CHANGELOG.md>`__ for a list 205 | of all changes. 
206 | 207 | 208 | Backward Compatible Changes 209 | --------------------------- 210 | 211 | If you are using previously exported probabilistic data structures (v0.4.1 or below) 212 | and used the default hashing strategy, you will want to use the following code 213 | to mimic the original default hashing algorithm. 214 | 215 | .. code:: python 216 | 217 | from probables import BloomFilter 218 | from probables.hashes import hash_with_depth_int 219 | from probables.constants import UINT64_T_MAX 220 | @hash_with_depth_int 221 | def old_fnv1a(key, depth=1): 222 | return tmp_fnv_1a(key) 223 | 224 | def tmp_fnv_1a(key): 225 | max64mod = UINT64_T_MAX + 1 226 | hval = 14695981039346656073 227 | fnv_64_prime = 1099511628211 228 | tmp = map(ord, key) 229 | for t_str in tmp: 230 | hval ^= t_str 231 | hval *= fnv_64_prime 232 | hval %= max64mod 233 | return hval 234 | 235 | blm = BloomFilter(filepath="old-file-path.blm", hash_function=old_fnv1a) 236 | -------------------------------------------------------------------------------- /codecov.yml: -------------------------------------------------------------------------------- 1 | codecov: 2 | require_ci_to_pass: yes 3 | 4 | coverage: 5 | precision: 2 6 | round: down 7 | range: "70...100" 8 | 9 | status: 10 | project: 11 | default: 12 | # basic settings 13 | target: "85%" 14 | base: auto 15 | threshold: 15 16 | patch: 17 | default: 18 | target: "50%" 19 | changes: no 20 | 21 | parsers: 22 | gcov: 23 | branch_detection: 24 | conditional: yes 25 | loop: yes 26 | method: no 27 | macro: no 28 | 29 | comment: 30 | layout: "reach,diff,flags,tree" 31 | behavior: default 32 | require_changes: no 33 | 34 | ignore: 35 | - "./tests/" 36 | - "setup.py" 37 | -------------------------------------------------------------------------------- /docs/Makefile: -------------------------------------------------------------------------------- 1 | # Minimal makefile for Sphinx documentation 2 | # 3 | 4 | # You can set these variables from the command line. 5 | SPHINXOPTS = 6 | SPHINXBUILD = sphinx-build 7 | SPHINXPROJ = pyprobables 8 | SOURCEDIR = source 9 | BUILDDIR = build 10 | 11 | # Put it first so that "make" without argument is like "make help". 12 | help: 13 | @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 14 | 15 | .PHONY: help Makefile 16 | 17 | # Catch-all target: route all unknown targets to Sphinx using the new 18 | # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). 19 | %: Makefile 20 | @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 21 | -------------------------------------------------------------------------------- /docs/requirements.txt: -------------------------------------------------------------------------------- 1 | sphinx>=3.0 2 | sphinx-rtd-theme 3 | -------------------------------------------------------------------------------- /docs/source/_static/custom.css: -------------------------------------------------------------------------------- 1 | /* Set the properties to be full width */ 2 | dl.py.property { 3 | display: block !important; 4 | } -------------------------------------------------------------------------------- /docs/source/code.rst: -------------------------------------------------------------------------------- 1 | .. _api: 2 | 3 | pyprobables API 4 | ==================== 5 | 6 | Here you can find the full developer API for the pyprobables project. 7 | pyprobables provides a suite of probabilistic data-structures to be used 8 | in data analytics and data science projects. 
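To give a flavor of the shared API before diving into the individual classes, here is a minimal usage sketch (the parameter values are illustrative only):

.. code:: python

    from probables import BloomFilter

    # size the filter for ~1,000 expected elements at a 5% false positive rate
    blm = BloomFilter(est_elements=1000, false_positive_rate=0.05)
    blm.add('google.com')
    blm.check('google.com')    # True
    blm.check('facebook.com')  # False (with high probability)

The structures below follow this same add/check pattern, and most also support serialization via `export()` and `frombytes()`.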
9 | 10 | 11 | Data Structures and Classes 12 | ============================ 13 | 14 | Bloom Filters 15 | ------------- 16 | 17 | Bloom Filters are a class of probabilistic data structures used for set 18 | operations. Bloom Filters guarantee a zero percent false negative rate 19 | and a predetermined false positive rate. Once the number of elements inserted 20 | exceeds the estimated elements, the false positive rate will increase over the 21 | desired amount. 22 | 23 | `Further Reading `__ 24 | 25 | 26 | .. _BloomFilterAnchor: 27 | 28 | BloomFilter 29 | +++++++++++++++++++++++++++++++ 30 | 31 | .. autoclass:: probables.BloomFilter 32 | :members: 33 | :inherited-members: 34 | 35 | 36 | BloomFilterOnDisk 37 | +++++++++++++++++++++++++++++++ 38 | 39 | .. autoclass:: probables.BloomFilterOnDisk 40 | :members: 41 | 42 | For more information on all methods and properties, see `BloomFilter`_. 43 | 44 | ExpandingBloomFilter 45 | +++++++++++++++++++++++++++++++ 46 | 47 | .. autoclass:: probables.ExpandingBloomFilter 48 | :members: 49 | 50 | RotatingBloomFilter 51 | +++++++++++++++++++++++++++++++ 52 | 53 | .. autoclass:: probables.RotatingBloomFilter 54 | :members: 55 | :inherited-members: 56 | 57 | CountingBloomFilter 58 | +++++++++++++++++++++++++++++++ 59 | 60 | .. autoclass:: probables.CountingBloomFilter 61 | :members: 62 | :inherited-members: 63 | 64 | 65 | Cuckoo Filters 66 | -------------- 67 | 68 | Cuckoo filters are a space-efficient data structure that supports set 69 | membership testing. Cuckoo filters support insertion, deletion, and lookup of 70 | elements with low overhead and few false positive results. The name is derived 71 | from the `cuckoo hashing <https://en.wikipedia.org/wiki/Cuckoo_hashing>`__ 72 | strategy used to resolve conflicts. 73 | 74 | `Further Reading `__ 75 | 76 | CuckooFilter 77 | +++++++++++++++++++++++++++++++ 78 | .. autoclass:: probables.CuckooFilter 79 | :members: 80 | 81 | CountingCuckooFilter 82 | +++++++++++++++++++++++++++++++ 83 | .. autoclass:: probables.CountingCuckooFilter 84 | :members: 85 | :inherited-members: 86 | 87 | 88 | Count-Min Sketches 89 | ------------------ 90 | 91 | Count-Min Sketches, and their derivatives, are good for estimating the number of 92 | occurrences of an element in streaming data while not needing to retain all the 93 | data elements. The result is a probabilistic count of elements inserted into 94 | the data structure. It will always provide the **maximum** number of times a 95 | data element was encountered. Notice that the result may be **more** than the 96 | true number of times it was inserted, but never fewer. 97 | 98 | `Further Reading `__ 99 | 100 | 101 | CountMinSketch 102 | +++++++++++++++++++++++++++++++ 103 | 104 | .. autoclass:: probables.CountMinSketch 105 | :members: 106 | 107 | 108 | CountMeanSketch 109 | +++++++++++++++++++++++++++++++ 110 | 111 | .. autoclass:: probables.CountMeanSketch 112 | :members: 113 | 114 | For more information on all methods and properties, see `CountMinSketch`_. 115 | 116 | 117 | CountMeanMinSketch 118 | +++++++++++++++++++++++++++++++ 119 | 120 | .. autoclass:: probables.CountMeanMinSketch 121 | :members: 122 | 123 | For more information on all methods and properties, see `CountMinSketch`_. 124 | 125 | 126 | HeavyHitters 127 | +++++++++++++++++++++++++++++++ 128 | 129 | .. autoclass:: probables.HeavyHitters 130 | :members: 131 | 132 | For more information on all methods and properties, see `CountMinSketch`_. 133 | 134 | 135 | StreamThreshold 136 | +++++++++++++++++++++++++++++++ 137 | 138 | .. 
autoclass:: probables.StreamThreshold 139 | :members: 140 | 141 | For more information on all methods and properties, see `CountMinSketch`_. 142 | 143 | QuotientFilter 144 | ------------------ 145 | 146 | Quotient filters are an approximate membership query (AMQ) filter that is both 147 | space efficient and returns a zero false negative rate and a probabilistic false 148 | positive rate. Unlike Bloom filters, the quotient filter only requires a single 149 | hash of the element to insert. The upper **q** bits denote the location within the 150 | filter while the lower **r** bits are stored in the filter. 151 | 152 | Quotient filters provide some useful benefits over Bloom filters including: 153 | 154 | * Merging of two filters (not union) 155 | * Resizing of the filter 156 | * Ability to remove elements 157 | 158 | `Further Reading `__ 159 | 160 | QuotientFilter 161 | +++++++++++++++++++++++++++++++ 162 | 163 | .. autoclass:: probables.QuotientFilter 164 | :members: 165 | 166 | 167 | Utilities 168 | ------------------ 169 | 170 | Bitarray 171 | +++++++++++++++++++++++++++++++ 172 | 173 | .. autoclass:: probables.utilities.Bitarray 174 | :members: 175 | 176 | Exceptions 177 | ============================ 178 | 179 | .. automodule:: probables.exceptions 180 | :members: 181 | 182 | 183 | Hashing Functions 184 | ============================ 185 | 186 | .. automodule:: probables.hashes 187 | :members: 188 | 189 | 190 | Indices and Tables 191 | ============================ 192 | 193 | * :ref:`home` 194 | * :ref:`quickstart` 195 | * :ref:`genindex` 196 | * :ref:`modindex` 197 | * :ref:`search` 198 | -------------------------------------------------------------------------------- /docs/source/conf.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # 3 | # pyprobables documentation build configuration file, created by 4 | # sphinx-quickstart on Thu Jul 13 22:20:03 2017. 5 | # 6 | # This file is execfile()d with the current directory set to its 7 | # containing dir. 8 | # 9 | # Note that not all possible configuration values are present in this 10 | # autogenerated file. 11 | # 12 | # All configuration values have a default; values that are commented out 13 | # serve to show the default. 14 | 15 | # If extensions (or modules to document with autodoc) are in another directory, 16 | # add these directories to sys.path here. If the directory is relative to the 17 | # documentation root, use os.path.abspath to make it absolute, like shown here. 18 | # 19 | import os 20 | import sys 21 | 22 | sys.path.insert(0, os.path.abspath("../../")) 23 | import probables 24 | 25 | # -- General configuration ------------------------------------------------ 26 | 27 | # If your documentation needs a minimal Sphinx version, state it here. 28 | # 29 | # needs_sphinx = '1.0' 30 | 31 | # Add any Sphinx extension module names here, as strings. They can be 32 | # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom 33 | # ones. 34 | extensions = [ 35 | "sphinx.ext.autodoc", 36 | "sphinx.ext.napoleon", 37 | "sphinx.ext.doctest", 38 | "sphinx.ext.coverage", 39 | "sphinx.ext.viewcode", 40 | "sphinx.ext.githubpages", 41 | "sphinx.ext.todo", 42 | ] 43 | 44 | # Turn off typehints in description 45 | autodoc_typehints = "description" 46 | 47 | # Add any paths that contain templates here, relative to this directory. 48 | templates_path = ["_templates"] 49 | 50 | # The suffix(es) of source filenames. 
51 | # You can specify multiple suffix as a list of string: 52 | # 53 | # source_suffix = ['.rst', '.md'] 54 | source_suffix = ".rst" 55 | 56 | # The master toctree document. 57 | master_doc = "index" 58 | 59 | # General information about the project. 60 | project = "probables" 61 | copyright = "2017, Tyler Barrus" 62 | author = probables.__author__ 63 | 64 | # The version info for the project you're documenting, acts as replacement for 65 | # |version| and |release|, also used in various other places throughout the 66 | # built documents. 67 | # 68 | # The short X.Y version. 69 | version = probables.__version__ 70 | # The full version, including alpha/beta/rc tags. 71 | release = probables.__version__ 72 | 73 | # The language for content autogenerated by Sphinx. Refer to documentation 74 | # for a list of supported languages. 75 | # 76 | # This is also used if you do content translation via gettext catalogs. 77 | # Usually you set "language" from the command line for these cases. 78 | language = "en" 79 | 80 | # List of patterns, relative to source directory, that match files and 81 | # directories to ignore when looking for source files. 82 | # This patterns also effect to html_static_path and html_extra_path 83 | exclude_patterns = [] 84 | 85 | # The name of the Pygments (syntax highlighting) style to use. 86 | pygments_style = "sphinx" 87 | 88 | # If true, `todo` and `todoList` produce output, else they produce nothing. 89 | todo_include_todos = True 90 | 91 | 92 | # -- Options for HTML output ---------------------------------------------- 93 | 94 | # The theme to use for HTML and HTML Help pages. See the documentation for 95 | # a list of builtin themes. 96 | # 97 | html_theme = "sphinx_rtd_theme" 98 | # html_theme = 'alabaster' 99 | # html_theme = "custom_theme" 100 | # html_theme_path = ["_themes"] 101 | 102 | # Theme options are theme-specific and customize the look and feel of a theme 103 | # further. For a list of options available for each theme, see the 104 | # documentation. 105 | # 106 | 107 | html_theme_options = { 108 | # "collapse_navigation": True, 109 | # "sticky_navigation": True, 110 | # "navigation_depth": 4, 111 | # "includehidden": True, 112 | # "titles_only": False, 113 | } 114 | 115 | # Add any paths that contain custom static files (such as style sheets) here, 116 | # relative to this directory. They are copied after the builtin static files, 117 | # so a file named "default.css" will overwrite the builtin "default.css". 118 | html_static_path = ["_static"] 119 | 120 | # These paths are either relative to html_static_path 121 | # or fully qualified paths (eg. https://...) 122 | html_css_files = ["custom.css"] 123 | 124 | # -- Options for HTMLHelp output ------------------------------------------ 125 | 126 | # Output file base name for HTML help builder. 127 | htmlhelp_basename = "pyprobablesdoc" 128 | 129 | 130 | # -- Options for LaTeX output --------------------------------------------- 131 | 132 | latex_elements = { 133 | # The paper size ('letterpaper' or 'a4paper'). 134 | # 135 | # 'papersize': 'letterpaper', 136 | # The font size ('10pt', '11pt' or '12pt'). 137 | # 138 | # 'pointsize': '10pt', 139 | # Additional stuff for the LaTeX preamble. 140 | # 141 | # 'preamble': '', 142 | # Latex figure (float) alignment 143 | # 144 | # 'figure_align': 'htbp', 145 | } 146 | 147 | # Grouping the document tree into LaTeX files. List of tuples 148 | # (source start file, target name, title, 149 | # author, documentclass [howto, manual, or own class]). 
150 | latex_documents = [ 151 | ( 152 | master_doc, 153 | "pyprobables.tex", 154 | "pyprobables Documentation", 155 | "Tyler Barrus", 156 | "manual", 157 | ), 158 | ] 159 | 160 | 161 | # -- Options for manual page output --------------------------------------- 162 | 163 | # One entry per manual page. List of tuples 164 | # (source start file, name, description, authors, manual section). 165 | man_pages = [(master_doc, "pyprobables", "pyprobables Documentation", [author], 1)] 166 | 167 | 168 | # -- Options for Texinfo output ------------------------------------------- 169 | 170 | # Grouping the document tree into Texinfo files. List of tuples 171 | # (source start file, target name, title, author, 172 | # dir menu entry, description, category) 173 | texinfo_documents = [ 174 | ( 175 | master_doc, 176 | "pyprobables", 177 | "pyprobables Documentation", 178 | author, 179 | "pyprobables", 180 | "One line description of project.", 181 | "Miscellaneous", 182 | ), 183 | ] 184 | -------------------------------------------------------------------------------- /docs/source/index.rst: -------------------------------------------------------------------------------- 1 | .. _home: 2 | .. include:: ../../README.rst 3 | 4 | 5 | .. toctree:: 6 | 7 | code 8 | quickstart 9 | 10 | 11 | Read More 12 | ================== 13 | 14 | * :ref:`api` 15 | * :ref:`quickstart` 16 | * :ref:`genindex` 17 | * :ref:`modindex` 18 | * :ref:`search` 19 | -------------------------------------------------------------------------------- /docs/source/quickstart.rst: -------------------------------------------------------------------------------- 1 | .. _quickstart: 2 | 3 | Quickstart 4 | ========================== 5 | 6 | 7 | Install 8 | +++++++++++++++++++++++++++++++ 9 | 10 | The easiest method of installing pyprobables is by using the pip package 11 | manager: 12 | 13 | Pip Installation: 14 | 15 | :: 16 | 17 | $ pip install pyprobables 18 | 19 | 20 | API Documentation 21 | +++++++++++++++++++++++++++++++ 22 | 23 | The full API documentation for the pyprobables package: :ref:`api` 24 | 25 | Example Usage 26 | +++++++++++++++++++++++++++++++ 27 | 28 | Bloom Filters 29 | ------------- 30 | 31 | Bloom Filters provide set operations of large datasets while being small in 32 | memory footprint. They provide a zero percent false negative rate and a 33 | predetermined, or desired, false positive rate. 34 | `more information `__ 35 | 36 | 37 | Import, Initialize, and Train 38 | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ 39 | .. code:: python 40 | 41 | >>> from probables import (BloomFilter) 42 | >>> blm = BloomFilter(est_elements=1000000, false_positive_rate=0.05) 43 | >>> with open('war_and_peace.txt', 'r') as fp: 44 | >>> for line in fp: 45 | >>> for word in line.split(): 46 | >>> blm.add(word.lower()) # add each word to the bloom filter! 47 | >>> # end reading in the file 48 | 49 | 50 | Query the Bloom Filter 51 | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ 52 | .. code:: python 53 | 54 | >>> words_to_check = ['step', 'borzoi', 'diametrically', 'fleches', 'rain'] 55 | >>> for word in words_to_check: 56 | >>> blm.check(word) 57 | 58 | 59 | Export the Bloom Filter 60 | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ 61 | .. code:: python 62 | 63 | >>> blm.export('war_and_peace_bloom.blm') 64 | 65 | 66 | Import a Bloom Filter 67 | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ 68 | .. 
code:: python 69 | 70 | >>> blm2 = BloomFilter(filepath='war_and_peace_bloom.blm') 71 | >>> print(blm2.check('sutler')) 72 | 73 | 74 | Other Bloom Filters 75 | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ 76 | 77 | Bloom Filter on Disk 78 | """"""""""""""""""""""""""""""""""""""""""""""" 79 | 80 | The **Bloom Filter on Disk** is a specialized version of the standard 81 | Bloom Filter that is run directly off of disk instead of in memory. This 82 | can be useful for very large Bloom Filters or when needing to access many 83 | Blooms that are exported to file. 84 | 85 | 86 | Expanding Bloom Filter 87 | """"""""""""""""""""""""""""""""""""""""""""""" 88 | 89 | The **Expanding Bloom Filter** is a specialized version of the standard 90 | Bloom Filter that automatically grows to ensure that the desired false positive 91 | rate is not exceeded. This is ideal for situations where the number of 92 | elements that will be added can only be roughly guessed. 93 | 94 | 95 | Rotating Bloom Filter 96 | """"""""""""""""""""""""""""""""""""""""""""""" 97 | 98 | The **Rotating Bloom Filter** is a specialized version of the standard 99 | Bloom Filter that rolls earlier entries out of the filter as they become more 100 | stale. The popping of the queue can be done either programmatically or 101 | automatically. 102 | 103 | 104 | Counting Bloom Filter 105 | """"""""""""""""""""""""""""""""""""""""""""""" 106 | 107 | **Counting Bloom Filters** are another specialized version of the standard 108 | Bloom Filter. Instead of using a bit array to track added elements, a 109 | Counting Bloom uses integers to track the number of times the element has 110 | been added. 111 | 112 | 113 | Count-Min Sketch 114 | ----------------- 115 | 116 | Count-Min Sketches, and their derivatives, are good for counting the number of 117 | occurrences of an element in streaming data while not needing to retain all the 118 | data elements. The result is a probabilistic count of elements inserted into 119 | the data structure. It will always provide the **maximum** number of times 120 | encountered. Notice that the result may be **more** than the true number 121 | of times it was inserted, but never fewer. 122 | `more information `__ 123 | 124 | 125 | Import, Initialize, and Train 126 | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ 127 | .. code:: python 128 | 129 | >>> from probables import (CountMinSketch) 130 | >>> cms = CountMinSketch(width=100000, depth=5) 131 | >>> with open('war_and_peace.txt', 'r') as fp: 132 | >>> for line in fp: 133 | >>> for word in line.split(): 134 | >>> cms.add(word.lower()) # add each to the count-min sketch! 135 | 136 | 137 | Query the Count-Min Sketch 138 | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ 139 | .. code:: python 140 | 141 | >>> words_to_check = ['step', 'borzoi', 'diametrically', 'fleches', 'rain'] 142 | >>> for word in words_to_check: 143 | >>> print(cms.check(word)) # prints: 80, 17, 1, 20, 25 144 | 145 | 146 | Export Count-Min Sketch 147 | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ 148 | .. code:: python 149 | 150 | >>> cms.export('war_and_peace.cms') 151 | 152 | 153 | Import a Count-Min Sketch 154 | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ 155 | .. 
code:: python 156 | 157 | >>> cms2 = CountMinSketch(filepath='war_and_peace.cms') 158 | >>> print(cms2.check('fleches')) # prints 20 159 | 160 | 161 | Other Count-Min Sketches 162 | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ 163 | 164 | Count-Mean Sketch and Count-Mean-Min Sketch 165 | """"""""""""""""""""""""""""""""""""""""""""""" 166 | 167 | **Count-Mean Sketch** and **Count-Mean-Min Sketch** are identical to the 168 | Count-Min Sketch for the data structure but both differ in the method of 169 | calculating the number of times an element has been inserted. These are 170 | currently supported by specifying at query time which method is desired 171 | or by initializing to the desired class: CountMeanSketch or CountMeanMinSketch. 172 | 173 | 174 | Heavy Hitters 175 | """"""""""""""""""""""""""""""""""""""""""""""" 176 | 177 | **Heavy Hitters** is a version of the Count-Min Sketch that tracks those 178 | elements that are seen most often. Beyond the normal initialization parameters 179 | one only needs to specify the number of heavy hitters to track. 180 | 181 | 182 | Stream Threshold 183 | """"""""""""""""""""""""""""""""""""""""""""""" 184 | 185 | **Stream Threshold** is another version of the Count-Min Sketch similar to the 186 | Heavy Hitters. The main difference is that there is a threshold for 187 | including an element to be tracked instead of tracking a certain number of 188 | elements. 189 | 190 | 191 | Cuckoo Filters 192 | ---------------------------------- 193 | 194 | Cuckoo Filters are a memory-efficient method to approximate set membership. 195 | They allow for the ability to add, remove, and look up elements from the set. 196 | They get the name cuckoo filter from the use of the 197 | `cuckoo hashing <https://en.wikipedia.org/wiki/Cuckoo_hashing>`__ strategy. 198 | 199 | Import, Initialize, and Train 200 | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ 201 | .. code:: python3 202 | 203 | >>> from probables import (CuckooFilter) 204 | >>> ccf = CuckooFilter(capacity=100000, bucket_size=4, max_swaps=100) 205 | >>> with open('war_and_peace.txt', 'r') as fp: 206 | >>> for line in fp: 207 | >>> for word in line.split(): 208 | >>> ccf.add(word.lower()) # add each to the cuckoo filter! 209 | 210 | 211 | Query the Cuckoo Filter 212 | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ 213 | .. code:: python3 214 | 215 | >>> words_to_check = ['borzoi', 'diametrically', 'fleches', 'rain', 'foo'] 216 | >>> for word in words_to_check: 217 | >>> print(ccf.check(word)) # prints: True, True, True, True, False 218 | 219 | 220 | Export the Cuckoo Filter 221 | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ 222 | .. code:: python3 223 | 224 | >>> ccf.export('war_and_peace.cko') 225 | 226 | 227 | Import a Cuckoo Filter 228 | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ 229 | .. code:: python3 230 | 231 | >>> ccf2 = CuckooFilter(filepath='war_and_peace.cko') 232 | >>> print(ccf2.check('fleches')) # prints True 233 | 234 | Cuckoo Filters based on Error Rate 235 | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ 236 | To use error rate to initialize a Cuckoo Filter, there are class methods that can be used. 237 | `init_error_rate()` can be used to initialize a Cuckoo Filter that has not been exported, and 238 | `load_error_rate()` can be used to load in a previously exported Cuckoo Filter that used error rate 239 | to determine the parameters. 240 | 241 | .. 
code:: python3 242 | 243 | >>> cko = CuckooFilter.init_error_rate(0.00001) 244 | >>> cko.export('war_and_peace.cko') 245 | >>> ckf = CuckooFilter.load_error_rate(0.00001) 246 | 247 | Other Cuckoo Filters 248 | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ 249 | 250 | Counting Cuckoo Filter 251 | """"""""""""""""""""""""""""""""""""""""""""""" 252 | The counting cuckoo filter is similar to the standard filter except that it 253 | tracks the number of times a fingerprint has been added to the filter. 254 | 255 | 256 | Quotient Filters 257 | ---------------- 258 | 259 | Quotient Filters provide set operations on large datasets while maintaining a 260 | relatively small memory footprint. They provide a zero percent false negative rate and a 261 | small false positive rate. 262 | `more information `__ 263 | 264 | 265 | Import, Initialize, and Train 266 | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ 267 | .. code:: python3 268 | 269 | >>> from probables import (QuotientFilter); qf = QuotientFilter(quotient=22) 270 | >>> with open('war_and_peace.txt', 'r') as fp: 271 | >>> for line in fp: 272 | >>> for word in line.split(): 273 | >>> qf.add(word.lower()) # add each word to the quotient filter! 274 | 275 | 276 | Query the Quotient Filter 277 | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ 278 | .. code:: python3 279 | 280 | >>> words_to_check = ['borzoi', 'diametrically', 'fleches', 'rain', 'foo'] 281 | >>> for word in words_to_check: 282 | >>> print(qf.check(word)) # prints: True, True, True, True, False 283 | 284 | Custom Hashing Functions 285 | ---------------------------------- 286 | In many instances, to get the best raw performance out of the data structures, 287 | it is wise to use a hashing algorithm that is not pure python. It is recommended 288 | to use one that is compiled, such as `mmh3 <https://github.com/hajimes/mmh3>`__ 289 | or `pyhash <https://github.com/flier/pyfasthash>`__, or even the built in 290 | cryptographic hashes. 291 | 292 | Some pre-defined hashing strategies are provided that use built in 293 | cryptographic hashes. 294 | 295 | To use a pre-defined alternative hashing strategy: 296 | 297 | .. code:: python3 298 | 299 | >>> from probables import (BloomFilter) 300 | >>> from probables.hashes import (default_sha256, default_md5) 301 | >>> blm = BloomFilter(est_elements=1000, false_positive_rate=0.05, 302 | hash_function=default_sha256) 303 | >>> blm.add('google.com') 304 | >>> blm.check('facebook.com') # should return False 305 | >>> blm.check('google.com') # should return True 306 | 307 | Decorators are provided to help make generating hashing strategies easier. 308 | 309 | Defining a hashing function using the provided decorators: 310 | 311 | .. code:: python3 312 | 313 | >>> import mmh3 # murmur hash 3 implementation (pip install mmh3) 314 | >>> from probables.hashes import (hash_with_depth_bytes) 315 | >>> from probables import (BloomFilter) 316 | >>> 317 | >>> @hash_with_depth_bytes 318 | >>> def my_hash(key, depth): 319 | >>> return mmh3.hash_bytes(key, seed=depth) 320 | >>> 321 | >>> blm = BloomFilter(est_elements=1000, false_positive_rate=0.05, hash_function=my_hash) 322 | 323 | .. 
code:: python3 324 | 325 | >>> import hashlib 326 | >>> from probables.hashes import (hash_with_depth_int) 327 | >>> from probables import (BloomFilter) 328 | >>> from probables.constants import (UINT64_T_MAX) 329 | >>> @hash_with_depth_int 330 | >>> def my_hash(key, seed=0, encoding='utf-8'): 331 | >>> max64mod = UINT64_T_MAX + 1 332 | >>> val = int(hashlib.sha512(key.encode(encoding)).hexdigest(), 16) 333 | >>> return val % max64mod 334 | >>> 335 | >>> blm = BloomFilter(est_elements=1000, false_positive_rate=0.05, hash_function=my_hash) 336 | 337 | Generate a completely different hashing strategy 338 | 339 | .. code:: python3 340 | 341 | >>> import hashlib 342 | >>> from probables.constants import (UINT64_T_MAX) 343 | >>> def my_hash(key, depth, encoding='utf-8'): 344 | >>> max64mod = UINT64_T_MAX + 1 345 | >>> results = list() 346 | >>> for i in range(0, depth): 347 | >>> tmp = key[i:] + key[:i] 348 | >>> val = int(hashlib.sha512(tmp.encode(encoding)).hexdigest(), 16) 349 | >>> results.append(val % max64mod) 350 | >>> return results 351 | 352 | 353 | Indices and Tables 354 | ================== 355 | 356 | * :ref:`home` 357 | * :ref:`api` 358 | * :ref:`genindex` 359 | * :ref:`modindex` 360 | * :ref:`search` 361 | -------------------------------------------------------------------------------- /probables/__init__.py: -------------------------------------------------------------------------------- 1 | """pyprobables module""" 2 | 3 | from probables.blooms import ( 4 | BloomFilter, 5 | BloomFilterOnDisk, 6 | CountingBloomFilter, 7 | ExpandingBloomFilter, 8 | RotatingBloomFilter, 9 | ) 10 | from probables.countminsketch import CountMeanMinSketch, CountMeanSketch, CountMinSketch, HeavyHitters, StreamThreshold 11 | from probables.cuckoo import CountingCuckooFilter, CuckooFilter 12 | from probables.exceptions import ( 13 | CuckooFilterFullError, 14 | InitializationError, 15 | NotSupportedError, 16 | ProbablesBaseException, 17 | RotatingBloomFilterError, 18 | ) 19 | from probables.quotientfilter import QuotientFilter 20 | from probables.utilities import Bitarray 21 | 22 | __author__ = "Tyler Barrus" 23 | __maintainer__ = "Tyler Barrus" 24 | __email__ = "barrust@gmail.com" 25 | __license__ = "MIT" 26 | __version__ = "0.6.1" 27 | __credits__: list[str] = [] 28 | __url__ = "https://github.com/barrust/pyprobables" 29 | __bugtrack_url__ = "https://github.com/barrust/pyprobables/issues" 30 | 31 | __all__ = [ 32 | "BloomFilter", 33 | "BloomFilterOnDisk", 34 | "CountingBloomFilter", 35 | "CountMinSketch", 36 | "CountMeanSketch", 37 | "CountMeanMinSketch", 38 | "HeavyHitters", 39 | "StreamThreshold", 40 | "CuckooFilter", 41 | "CountingCuckooFilter", 42 | "InitializationError", 43 | "NotSupportedError", 44 | "ProbablesBaseException", 45 | "CuckooFilterFullError", 46 | "ExpandingBloomFilter", 47 | "RotatingBloomFilter", 48 | "RotatingBloomFilterError", 49 | "QuotientFilter", 50 | "Bitarray", 51 | ] 52 | -------------------------------------------------------------------------------- /probables/blooms/__init__.py: -------------------------------------------------------------------------------- 1 | """Bloom Filters""" 2 | 3 | from probables.blooms.bloom import BloomFilter, BloomFilterOnDisk 4 | from probables.blooms.countingbloom import CountingBloomFilter 5 | from probables.blooms.expandingbloom import ExpandingBloomFilter, RotatingBloomFilter 6 | 7 | __all__ = [ 8 | "BloomFilter", 9 | "BloomFilterOnDisk", 10 | "CountingBloomFilter", 11 | "ExpandingBloomFilter", 12 | "RotatingBloomFilter", 13 | ] 14 | 
--------------------------------------------------------------------------------
/probables/blooms/countingbloom.py:
--------------------------------------------------------------------------------
1 | """CountingBloomFilter, python implementation
2 | License: MIT
3 | Author: Tyler Barrus (barrust@gmail.com)
4 | URL: https://github.com/barrust/counting_bloom
5 | """
6 | 
7 | from array import array
8 | from collections.abc import ByteString
9 | from pathlib import Path
10 | from struct import Struct
11 | from typing import Union
12 | 
13 | from probables.blooms.bloom import BloomFilter
14 | from probables.constants import UINT32_T_MAX, UINT64_T_MAX
15 | from probables.exceptions import InitializationError
16 | from probables.hashes import HashFuncT, HashResultsT, KeyT
17 | from probables.utilities import is_hex_string, is_valid_file, resolve_path
18 | 
19 | MISMATCH_MSG = "The parameter second must be of type CountingBloomFilter"
20 | 
21 | 
22 | def _verify_not_type_mismatch(second: "CountingBloomFilter") -> bool:
23 |     """verify that there is not a type mismatch"""
24 |     return isinstance(second, (CountingBloomFilter))
25 | 
26 | 
27 | class CountingBloomFilter(BloomFilter):
28 |     """Simple Counting Bloom Filter implementation for use in python;
29 |     It can read and write the same format as the c version
30 |     (https://github.com/barrust/counting_bloom)
31 | 
32 |     Args:
33 |         est_elements (int): The number of estimated elements to be added
34 |         false_positive_rate (float): The desired false positive rate
35 |         filepath (str): Path to file to load
36 |         hex_string (str): Hex based representation to be loaded
37 |         hash_function (function): Hashing strategy function to use `hf(key, number)`
38 |     Returns:
39 |         CountingBloomFilter: A Counting Bloom Filter object
40 | 
41 |     Note:
42 |         Initialization order of operations:
43 |             1) From file
44 |             2) From Hex String
45 |             3) From params"""
46 | 
47 |     __slots__ = ("_filepath",)
48 | 
49 |     def __init__(
50 |         self,
51 |         est_elements: Union[int, None] = None,
52 |         false_positive_rate: Union[float, None] = None,
53 |         filepath: Union[str, Path, None] = None,
54 |         hex_string: Union[str, None] = None,
55 |         hash_function: Union[HashFuncT, None] = None,
56 |     ) -> None:
57 |         """setup the basic values needed"""
58 |         self._filepath = None
59 |         super().__init__(est_elements, false_positive_rate, filepath, hex_string, hash_function)
60 | 
61 |     def _load_init(self, filepath, hash_function, hex_string, est_elements, false_positive_rate):
62 |         """Handle setting params and loading everything as needed"""
63 |         self._bits_per_elm = 1.0
64 |         self._type = "counting"
65 |         self._typecode = "I"
66 | 
67 |         if is_valid_file(filepath):
68 |             self._filepath = resolve_path(filepath)
69 |             self._load(self._filepath, hash_function)
70 |         elif is_hex_string(hex_string):
71 |             self._load_hex(hex_string, hash_function)
72 |         else:
73 |             if est_elements is None or false_positive_rate is None:
74 |                 raise InitializationError("Insufficient parameters to set up the Counting Bloom Filter")
75 |             # calc values
76 |             fpr, n_hashes, n_bits = self._get_optimized_params(est_elements, false_positive_rate)
77 |             self._set_values(est_elements, fpr, n_hashes, n_bits, hash_function)
78 |             self._bloom_length = n_bits
79 |             self._bloom = array(self._typecode, [0]) * self._bloom_length
80 | 
81 |     _IMPT_STRUCT = Struct("I")
82 | 
83 |     @classmethod
84 |     def frombytes(cls, b: ByteString, hash_function: Union[HashFuncT, None] = None) -> "CountingBloomFilter":
85 |         """
86 |         Args:
87 |             b (ByteString): the bytes to load as a Counting Bloom
Filter 88 | hash_function (function): Hashing strategy function to use `hf(key, number)` 89 | Returns: 90 | CountingBloomFilter: A Counting Bloom Filter object 91 | """ 92 | offset = cls._FOOTER_STRUCT.size 93 | est_els, els_added, fpr, n_hashes, n_bits = cls._parse_footer(cls._FOOTER_STRUCT, bytes(b[-1 * offset :])) 94 | blm = CountingBloomFilter(est_elements=est_els, false_positive_rate=fpr, hash_function=hash_function) 95 | blm._set_values(est_els, fpr, n_hashes, n_bits, hash_function) 96 | blm._els_added = els_added 97 | blm._parse_bloom_array(b, cls._IMPT_STRUCT.size * blm.bloom_length) 98 | return blm 99 | 100 | def __str__(self) -> str: 101 | """string representation of the counting bloom filter""" 102 | on_disk = "no" if self.is_on_disk is False else "yes" 103 | 104 | cnt = sum(x for x in self._bloom if x > 0) 105 | total = sum(self._bloom) 106 | largest = max(self._bloom) 107 | largest_idx = (self._bloom).index(largest) 108 | fullness = cnt / self.number_bits 109 | els_added = total // self.number_hashes 110 | 111 | return ( 112 | "CountingBloom:\n" 113 | f"\tbits: {self.number_bits}\n" 114 | f"\testimated elements: {self.estimated_elements}\n" 115 | f"\tnumber hashes: {self.number_hashes}\n" 116 | f"\tmax false positive rate: {self.false_positive_rate:.6f}\n" 117 | f"\telements added: {self.elements_added}\n" 118 | f"\tcurrent false positive rate: {self.current_false_positive_rate():.6f}\n" 119 | f"\tis on disk: {on_disk}\n" 120 | f"\tindex fullness: {fullness:.6}\n" 121 | f"\tmax index usage: {largest}\n" 122 | f"\tmax index id: {largest_idx}\n" 123 | f"\tcalculated elements: {els_added}\n" 124 | ) 125 | 126 | def add(self, key: KeyT, num_els: int = 1) -> int: # type: ignore 127 | """Add the key to the Counting Bloom Filter 128 | 129 | Args: 130 | key (str): The element to be inserted 131 | num_els (int): Number of times to insert the element 132 | Returns: 133 | int: Maximum number of insertions""" 134 | return self.add_alt(self.hashes(key), num_els) 135 | 136 | def add_alt(self, hashes: HashResultsT, num_els: int = 1) -> int: # type: ignore 137 | """Add the element represented by hashes into the Counting Bloom Filter 138 | 139 | Args: 140 | hashes (list): A list of integers representing the key to insert 141 | num_els (int): Number of times to insert the element 142 | Returns: 143 | int: Maximum number of insertions""" 144 | # NOTE: this will increment indices each time it is viewed. 
Not sure if that is "correct"
145 |         # if not then we will need to update this and the C version
146 |         indices = [hashes[i] % self._bloom_length for i in range(self._number_hashes)]
147 |         vals = [self._bloom[k] + num_els for k in indices]
148 |         for i, v in enumerate(vals):
149 |             k = indices[i]
150 |             if v > UINT32_T_MAX:
151 |                 self._bloom[k] = UINT32_T_MAX
152 |                 vals[i] = UINT32_T_MAX
153 |             else:
154 |                 self._bloom[k] += num_els  # This keeps the original methodology
155 |         self.elements_added = min(self.elements_added + num_els, UINT64_T_MAX)
156 |         return min(vals)
157 | 
158 |     def check(self, key: KeyT) -> int:  # type: ignore
159 |         """Check if the key is likely in the Counting Bloom Filter
160 | 
161 |         Args:
162 |             key (str): The element to be checked
163 |         Returns:
164 |             int: Maximum number of insertions"""
165 |         return self.check_alt(self.hashes(key))
166 | 
167 |     def check_alt(self, hashes: HashResultsT) -> int:  # type: ignore
168 |         """Check if the element represented by hashes is in the Counting
169 |         Bloom Filter
170 | 
171 |         Args:
172 |             hashes (list): A list of integers representing the key to check
173 |         Returns:
174 |             int: Maximum number of insertions"""
175 |         return min(self._bloom[x % self.number_bits] for x in hashes)
176 | 
177 |     def remove(self, key: KeyT, num_els: int = 1) -> int:
178 |         """Remove the element from the counting bloom
179 | 
180 |         Args:
181 |             key (str): The element to be removed
182 |             num_els (int): Number of times to remove the element
183 |         Returns:
184 |             int: Maximum number of insertions after the removal"""
185 |         return self.remove_alt(self.hashes(key), num_els)
186 | 
187 |     def remove_alt(self, hashes: HashResultsT, num_els: int = 1) -> int:
188 |         """Remove the element represented by hashes from the Counting Bloom Filter
189 | 
190 |         Args:
191 |             hashes (list): A list of integers representing the key to remove
192 |             num_els (int): Number of times to remove the element
193 |         Returns:
194 |             int: Maximum number of insertions after the removal"""
195 | 
196 |         indices = [hashes[i] % self._bloom_length for i in range(self._number_hashes)]
197 |         vals = [self._bloom[k] for k in indices]
198 |         min_val = min(vals)
199 |         if min_val == UINT32_T_MAX:  # cannot remove if we have hit the max
200 |             return UINT32_T_MAX
201 |         if min_val == 0:
202 |             return 0
203 | 
204 |         to_remove = num_els if min_val > num_els else min_val
205 |         for k in indices:
206 |             if self._bloom[k] < UINT32_T_MAX:  # only remove if less than UINT32_T_MAX
207 |                 self._bloom[k] -= to_remove
208 |         self.elements_added -= to_remove
209 |         return min_val - to_remove
210 | 
211 |     def intersection(self, second: "CountingBloomFilter") -> Union["CountingBloomFilter", None]:  # type: ignore
212 |         """Take the intersection of two Counting Bloom Filters
213 | 
214 |         Args:
215 |             second (CountingBloomFilter): The Bloom Filter with which to take the intersection
216 |         Returns:
217 |             CountingBloomFilter: The new Counting Bloom Filter containing the intersection
218 |         Raises:
219 |             TypeError: When second is not a :class:`CountingBloomFilter`
220 |         Note:
221 |             The elements_added property will be set to the estimated number of unique elements \
222 |             added as found in estimate_elements()
223 |         Note:
224 |             If `second` is not of the same size (false_positive_rate and est_elements) then \
225 |             this will return `None`"""
226 |         if not _verify_not_type_mismatch(second):
227 |             raise TypeError(MISMATCH_MSG)
228 | 
229 |         if self._verify_bloom_similarity(second) is False:
230 |             return None
231 |         res = CountingBloomFilter(
232 |             est_elements=self.estimated_elements,
233 |             false_positive_rate=self.false_positive_rate,
234 |             hash_function=self.hash_function,
235 |         )
236 | 
237 |         for i in range(self.bloom_length):
238 |             if self._bloom[i] > 0 and second._bloom[i] > 0:
239 |                 tmp = self._bloom[i] + second._bloom[i]
240 |                 res.bloom[i] = tmp
241 |         res.elements_added = res.estimate_elements()
242 |         return res
243 | 
244 |     def jaccard_index(self, second: "CountingBloomFilter") -> Union[float, None]:  # type:ignore
245 |         """Take the Jaccard Index of two Counting Bloom Filters
246 | 
247 |         Args:
248 |             second (CountingBloomFilter): The Bloom Filter with which to take the jaccard index
249 |         Returns:
250 |             float: A numeric value between 0 and 1 where 1 is identical and 0 means completely different
251 |         Raises:
252 |             TypeError: When second is not a :class:`CountingBloomFilter`
253 |         Note:
254 |             The Jaccard Index is based on the unique set of elements added and not the number of each element added
255 |         Note:
256 |             If `second` is not of the same size (false_positive_rate and est_elements) then this will return `None`"""
257 |         if not _verify_not_type_mismatch(second):
258 |             raise TypeError(MISMATCH_MSG)
259 | 
260 |         if self._verify_bloom_similarity(second) is False:
261 |             return None
262 | 
263 |         count_union = 0
264 |         count_inter = 0
265 |         for i in range(self.bloom_length):
266 |             if self._bloom[i] > 0 or second._bloom[i] > 0:
267 |                 count_union += 1
268 |             if self._bloom[i] > 0 and second._bloom[i] > 0:
269 |                 count_inter += 1
270 |         if count_union == 0:
271 |             return 1.0
272 |         return count_inter / count_union
273 | 
274 |     def union(self, second: "CountingBloomFilter") -> Union["CountingBloomFilter", None]:  # type:ignore
275 |         """Return a new Counting Bloom Filter that contains the union of
276 |         the two
277 | 
278 |         Args:
279 |             second (CountingBloomFilter): The Counting Bloom Filter with which to calculate the union
280 |         Returns:
281 |             CountingBloomFilter: The new Counting Bloom Filter containing the union
282 |         Raises:
283 |             TypeError: When second is not a :class:`CountingBloomFilter`
284 |         Note:
285 |             The elements_added property will be set to the estimated number of unique elements added as \
286 |             found in estimate_elements()
287 |         Note:
288 |             If `second` is not of the same size (false_positive_rate and est_elements) then this will return `None`"""
289 |         if not _verify_not_type_mismatch(second):
290 |             raise TypeError(MISMATCH_MSG)
291 | 
292 |         if self._verify_bloom_similarity(second) is False:
293 |             return None
294 |         res = CountingBloomFilter(
295 |             est_elements=self.estimated_elements,
296 |             false_positive_rate=self.false_positive_rate,
297 |             hash_function=self.hash_function,
298 |         )
299 |         for i in range(self.bloom_length):
300 |             tmp = self._bloom[i] + second._bloom[i]
301 |             res._bloom[i] = tmp
302 |         res.elements_added = res.estimate_elements()
303 |         return res
304 | 
305 |     def _cnt_number_bits_set(self) -> int:
306 |         """calculate the total number of set bits in the bloom"""
307 |         return sum(1 for x in self._bloom if x > 0)
308 | 
--------------------------------------------------------------------------------
/probables/blooms/expandingbloom.py:
--------------------------------------------------------------------------------
1 | """Expanding and Rotating BloomFilter, python implementations
2 | License: MIT
3 | Author: Tyler Barrus (barrust@gmail.com)
4 | URL: https://github.com/barrust/pyprobables
5 | """
6 | 
7 | from array import array
8 | from collections.abc import ByteString
9 | from io import BytesIO, IOBase
10 | from mmap import mmap
11 | from pathlib import Path
12 | from struct import Struct
13 | from typing import Union
14 | 
15 | from probables.blooms.bloom import BloomFilter
16 | from probables.exceptions import RotatingBloomFilterError
17 | from probables.hashes import HashFuncT, HashResultsT, KeyT, default_fnv_1a
18 | from probables.utilities import MMap, is_valid_file, resolve_path
19 | 
20 | 
21 | class ExpandingBloomFilter:
22 |     """Simple expanding Bloom Filter implementation for use in python; the
23 |     Bloom Filter will automatically expand, or grow, if the false
24 |     positive rate is about to become greater than the desired false
25 |     positive rate.
26 | 
27 |     Args:
28 |         est_elements (int): The number of estimated elements to be added
29 |         false_positive_rate (float): The desired false positive rate
30 |         filepath (str): Path to file to load
31 |         hash_function (function): Hashing strategy function to use `hf(key, number)`
32 |     Returns:
33 |         ExpandingBloomFilter: An expanding Bloom Filter object
34 |     Note:
35 |         Initialization order of operations:
36 |             1) Filepath
37 |             2) est_elements and false_positive_rate"""
38 | 
39 |     __slots__ = (
40 |         "_blooms",
41 |         "__fpr",
42 |         "__est_elements",
43 |         "__hash_func",
44 |         "_added_elements",
45 |     )
46 | 
47 |     def __init__(
48 |         self,
49 |         est_elements: Union[int, None] = None,
50 |         false_positive_rate: Union[float, None] = None,
51 |         filepath: Union[str, Path, None] = None,
52 |         hash_function: Union[HashFuncT, None] = None,
53 |     ):
54 |         """initialize"""
55 |         self._blooms = []  # type: ignore
56 |         self.__fpr = false_positive_rate if false_positive_rate is not None else 0.0
57 |         self.__est_elements = est_elements if est_elements is not None else 100
58 |         self.__hash_func: HashFuncT
59 |         self._added_elements = 0  # total added...
60 | 
61 |         if hash_function is not None:
62 |             self.__hash_func = hash_function
63 |         else:
64 |             self.__hash_func = default_fnv_1a
65 | 
66 |         if filepath is not None and is_valid_file(filepath):
67 |             self.__load(filepath)
68 |         else:
69 |             # add in the initial bloom filter!
70 |             self.__add_bloom_filter()
71 | 
72 |     __FOOTER_STRUCT = Struct("QQQf")
73 |     __S_INT64_STRUCT = Struct("Q")
74 |     _BLOOM_ELEMENT_SIZE = Struct("B").size
75 | 
76 |     @classmethod
77 |     def frombytes(cls, b: ByteString, hash_function: Union[HashFuncT, None] = None) -> "ExpandingBloomFilter":
78 |         """
79 |         Args:
80 |             b (ByteString): The bytes to load as an Expanding Bloom Filter
81 |             hash_function (function): Hashing strategy function to use `hf(key, number)`
82 |         Returns:
83 |             ExpandingBloomFilter: An Expanding Bloom Filter object
84 |         """
85 |         size, est_els, added_els, fpr = cls._parse_footer(b)
86 |         blm = ExpandingBloomFilter(est_elements=est_els, false_positive_rate=fpr, hash_function=hash_function)
87 |         blm._parse_blooms(b, size)
88 |         blm._added_elements = added_els
89 |         return blm
90 | 
91 |     def __contains__(self, key: KeyT) -> bool:
92 |         """setup the `in` functionality"""
93 |         return self.check(key)
94 | 
95 |     def __bytes__(self) -> bytes:
96 |         """Export bloom filter to `bytes`"""
97 | 
98 |         with BytesIO() as f:
99 |             self.export(f)
100 |             return f.getvalue()
101 | 
102 |     @property
103 |     def expansions(self) -> int:
104 |         """int: The number of expansions"""
105 |         return len(self._blooms) - 1
106 | 
107 |     @property
108 |     def false_positive_rate(self) -> float:
109 |         """float: The desired false positive rate of the expanding Bloom Filter"""
110 |         return self.__fpr
111 | 
112 |     @property
113 |     def estimated_elements(self) -> int:
114 |         """int: The original number of elements estimated to be in the Bloom Filter"""
115 |         return self.__est_elements
116 | 
117 |     @property
118 |     def elements_added(self) -> int:
119 |         """int: The total number of elements added"""
120 |         return self._added_elements
121 | 
122 |     @property
123 |     def hash_function(self) -> HashFuncT:
124 |         """HashFuncT: The hashing strategy function used by the filter"""
125 |         return self.__hash_func
126 | 
127 |     def push(self) -> None:
128 |         """Push a new expansion onto the Bloom Filter"""
129 |         self.__add_bloom_filter()
130 | 
131 |     def check(self, key: KeyT) -> bool:
132 |         """Check to see if the key is in the Bloom Filter
133 | 
134 |         Args:
135 |             key (str): The key to check for in the Bloom Filter
136 |         Returns:
137 |             bool: `True` if the element is likely present; `False` if definitely not present"""
138 |         hashes = self._blooms[0].hashes(key)
139 |         return self.check_alt(hashes)
140 | 
141 |     def check_alt(self, hashes: HashResultsT) -> bool:
142 |         """Check to see if the hashes are in the Bloom Filter
143 | 
144 |         Args:
145 |             hashes (list): The hash representation to check for in the Bloom Filter
146 |         Returns:
147 |             bool: `True` if the element is likely present; `False` if definitely not present"""
148 |         return any(blm.check_alt(hashes) for blm in self._blooms)
149 | 
150 |     def add(self, key: KeyT, force: bool = False) -> None:
151 |         """Add the key to the Bloom Filter
152 | 
153 |         Args:
154 |             key (str): The element to be inserted
155 |             force (bool): `True` will force it to be inserted, even if it likely has been inserted \
156 |                 before; `False` will only insert if not found in the Bloom Filter"""
157 |         hashes = self._blooms[0].hashes(key)
158 |         self.add_alt(hashes, force)
159 | 
160 |     def add_alt(self, hashes: HashResultsT, force: bool = False) -> None:
161 |         """Add the element represented by hashes into the Bloom Filter
162 | 
163 |         Args:
164 |             hashes (list): A list of integers representing the key to insert
165 |             force (bool): `True` will force it to be inserted, even if it likely has been inserted \
166 |                 before; `False` will only insert if not found in the Bloom Filter"""
167 |         self._added_elements += 1
168 |         if force or not self.check_alt(hashes):
169 |             self.__check_for_growth()
170 |             self._blooms[-1].add_alt(hashes)
171 | 
172 |     def __add_bloom_filter(self):
173 |         """build a new bloom and add it on!"""
174 |         blm = BloomFilter(
175 |             est_elements=self.__est_elements,
176 |             false_positive_rate=self.__fpr,
177 |             hash_function=self.__hash_func,
178 |         )
179 |         self._blooms.append(blm)
180 | 
181 |     def __check_for_growth(self):
182 |         """determine if the bloom filter should automatically grow"""
183 |         if self._blooms[-1].elements_added >= self.__est_elements:
184 |             self.__add_bloom_filter()
185 | 
186 |     def export(self, file: Union[Path, str, IOBase, mmap]) -> None:
187 |         """Export an expanding Bloom Filter, or subclass, to disk
188 | 
189 |         Args:
190 |             file (str): The path to the file to export"""
191 |         if not isinstance(file, (IOBase, mmap)):
192 |             file = resolve_path(file)
193 |             with open(file, "wb") as filepointer:
194 |                 self.export(filepointer)  # type:ignore
195 |         else:
196 |             filepointer = file  # type:ignore
197 |             # add all the different Bloom bit arrays...
198 |             for blm in self._blooms:
199 |                 filepointer.write(self.__S_INT64_STRUCT.pack(blm.elements_added))
200 |                 blm.bloom.tofile(filepointer)
201 |             filepointer.write(
202 |                 self.__FOOTER_STRUCT.pack(
203 |                     len(self._blooms),
204 |                     self.estimated_elements,
205 |                     self.elements_added,
206 |                     self.false_positive_rate,
207 |                 )
208 |             )
209 | 
210 |     def __load(self, file: Union[Path, str, IOBase, mmap]):
211 |         """load a file"""
212 |         if not isinstance(file, (IOBase, mmap)):
213 |             file = resolve_path(file)
214 |             with MMap(file) as filepointer:
215 |                 self.__load(filepointer)
216 |         else:
217 |             size, est_els, els_added, fpr = self._parse_footer(file)  # type: ignore
218 |             self._blooms = []
219 |             self._added_elements = els_added
220 |             self.__fpr = fpr
221 |             self.__est_elements = est_els
222 |             self._parse_blooms(file, size)  # type:ignore
223 | 
224 |     @classmethod
225 |     def _parse_footer(cls, b: ByteString) -> tuple[int, int, int, float]:
226 |         offset = cls.__FOOTER_STRUCT.size
227 |         size, est_els, els_added, fpr = cls.__FOOTER_STRUCT.unpack(bytes(b[-1 * offset :]))
228 |         return int(size), int(est_els), int(els_added), float(fpr)
229 | 
230 |     def _parse_blooms(self, b: ByteString, size: int) -> None:
231 |         # reset the bloom list
232 |         self._blooms = []
233 |         blm_size = 0
234 |         start = 0
235 |         end = 0
236 |         for _ in range(size):
237 |             blm = BloomFilter(
238 |                 est_elements=self.__est_elements,
239 |                 false_positive_rate=self.__fpr,
240 |                 hash_function=self.__hash_func,
241 |             )
242 |             if blm_size == 0:
243 |                 blm_size = self._BLOOM_ELEMENT_SIZE * blm.bloom_length
244 |             end = start + self.__S_INT64_STRUCT.size + blm_size
245 |             blm._els_added = int(self.__S_INT64_STRUCT.unpack(bytes(b[start : start + self.__S_INT64_STRUCT.size]))[0])
246 |             blm._bloom = array("B", bytes(b[start + self.__S_INT64_STRUCT.size : end]))
247 |             self._blooms.append(blm)
248 |             start = end
249 | 
250 | 
251 | class RotatingBloomFilter(ExpandingBloomFilter):
252 |     """Simple Rotating Bloom Filter implementation that allows for the "older"
253 |     elements added to be removed, in chunks. As the queue fills up, those
254 |     elements inserted earlier will be bulk removed. This also provides the
255 |     user with the opportunity to force the removal instead of it being time
256 |     based.
257 | 
258 |     Args:
259 |         est_elements (int): The number of estimated elements to be added
260 |         false_positive_rate (float): The desired false positive rate
261 |         max_queue_size (int): The number used to determine the maximum number of Bloom Filters. \
262 |             Total elements added is based on `max_queue_size * est_elements`
263 |         filepath (str): Path to file to load
264 |         hash_function (function): Hashing strategy function to use `hf(key, number)`
265 |     Note:
266 |         Initialization order of operations:
267 |             1) Filepath
268 |             2) est_elements and false_positive_rate
269 |     """
270 | 
271 |     __slots__ = ("_queue_size",)
272 | 
273 |     def __init__(
274 |         self,
275 |         est_elements: Union[int, None] = None,
276 |         false_positive_rate: Union[float, None] = None,
277 |         max_queue_size: int = 10,
278 |         filepath: Union[str, Path, None] = None,
279 |         hash_function: Union[HashFuncT, None] = None,
280 |     ) -> None:
281 |         """initialize"""
282 |         super().__init__(
283 |             est_elements=est_elements,
284 |             false_positive_rate=false_positive_rate,
285 |             filepath=filepath,
286 |             hash_function=hash_function,
287 |         )
288 |         self._queue_size = max_queue_size
289 | 
290 |     @classmethod
291 |     def frombytes(  # type:ignore
292 |         cls, b: ByteString, max_queue_size: int, hash_function: Union[HashFuncT, None] = None
293 |     ) -> "RotatingBloomFilter":
294 |         """
295 |         Args:
296 |             b (ByteString): The bytes to load as a Rotating Bloom Filter
297 |             max_queue_size (int): The number used to determine the maximum number \
298 |                 of Bloom Filters. Total elements added is based on `max_queue_size * est_elements`
299 |             hash_function (function): Hashing strategy function to use `hf(key, number)`
300 |         Returns:
301 |             RotatingBloomFilter: A Rotating Bloom Filter object
302 |         """
303 |         size, est_els, added_els, fpr = cls._parse_footer(b)
304 |         blm = RotatingBloomFilter(
305 |             est_elements=est_els, false_positive_rate=fpr, max_queue_size=max_queue_size, hash_function=hash_function
306 |         )
307 |         blm._parse_blooms(b, size)
308 |         blm._added_elements = added_els
309 |         return blm
310 | 
311 |     @property
312 |     def max_queue_size(self) -> int:
313 |         """int: The maximum size for the queue"""
314 |         return self._queue_size
315 | 
316 |     @property
317 |     def current_queue_size(self) -> int:
318 |         """int: The current size of the queue"""
319 |         return len(self._blooms)
320 | 
321 |     def add_alt(self, hashes: HashResultsT, force: bool = False) -> None:
322 |         """Add the element represented by hashes into the Bloom Filter
323 | 
324 |         Args:
325 |             hashes (list): A list of integers representing the key to insert
326 |             force (bool): `True` will force it to be inserted, even if it likely has been inserted \
327 |                 before; `False` will only insert if not found in the Bloom Filter"""
328 |         self._added_elements += 1
329 |         if force or not self.check_alt(hashes):
330 |             self.__rotate_bloom_filter()
331 |             self._blooms[-1].add_alt(hashes)
332 | 
333 |     def pop(self) -> None:
334 |         """Pop the oldest Bloom Filter off of the queue without pushing a new
335 |         Bloom Filter onto the queue
336 | 
337 |         Raises:
338 |             RotatingBloomFilterError: Unable to rotate the Bloom Filter"""
339 |         if self.current_queue_size == 1:
340 |             msg = "Popping a Bloom Filter will result in an unusable system!"
341 |             raise RotatingBloomFilterError(msg)
342 |         self._blooms.pop(0)
343 | 
344 |     def push(self) -> None:
345 |         """Push a new bloom filter onto the queue and rotate if necessary"""
346 |         self.__rotate_bloom_filter(force=True)
347 | 
348 |     def __rotate_bloom_filter(self, force: bool = False):
349 |         """handle determining if/when the Bloom Filter queue needs to be rotated"""
350 |         blm = self._blooms[-1]
351 |         ready_to_rotate = blm.elements_added == blm.estimated_elements
352 |         no_need_to_pop = self.current_queue_size < self._queue_size
353 |         if force and no_need_to_pop:
354 |             self.__add_bloom_filter()
355 |         elif force:  # must need to be pop'd first!
356 |             blm = self._blooms.pop(0)
357 |             self.__add_bloom_filter()
358 |         elif ready_to_rotate and no_need_to_pop:
359 |             self.__add_bloom_filter()
360 |         elif ready_to_rotate:
361 |             blm = self._blooms.pop(0)
362 |             self.__add_bloom_filter()
363 | 
364 |     def __add_bloom_filter(self):
365 |         """build a new bloom and add it on!"""
366 |         blm = BloomFilter(
367 |             est_elements=self.estimated_elements,
368 |             false_positive_rate=self.false_positive_rate,
369 |             hash_function=self.hash_function,
370 |         )
371 |         self._blooms.append(blm)
372 | 
--------------------------------------------------------------------------------
/probables/blooms/py.typed:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/barrust/pyprobables/e5d44dac65033421d455681178344eeca9961bd8/probables/blooms/py.typed
--------------------------------------------------------------------------------
/probables/constants.py:
--------------------------------------------------------------------------------
1 | """Project Constants (or basic numerical constants...)"""
2 | 
3 | INT32_T_MIN = -2147483648
4 | INT32_T_MAX = 2147483647
5 | INT64_T_MIN = -9223372036854775808
6 | INT64_T_MAX = 9223372036854775807
7 | UINT32_T_MAX = 2**32 - 1
8 | UINT64_T_MAX = 2**64 - 1
9 | 
--------------------------------------------------------------------------------
/probables/countminsketch/__init__.py:
--------------------------------------------------------------------------------
1 | """Count-Min Sketches"""
2 | 
3 | from probables.countminsketch.countminsketch import (
4 |     CountMeanMinSketch,
5 |     CountMeanSketch,
6 |     CountMinSketch,
7 |     HeavyHitters,
8 |     StreamThreshold,
9 | )
10 | 
11 | __all__ = ["CountMinSketch", "HeavyHitters", "StreamThreshold", "CountMeanSketch", "CountMeanMinSketch"]
12 | 
--------------------------------------------------------------------------------
/probables/countminsketch/py.typed:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/barrust/pyprobables/e5d44dac65033421d455681178344eeca9961bd8/probables/countminsketch/py.typed
--------------------------------------------------------------------------------
/probables/cuckoo/__init__.py:
--------------------------------------------------------------------------------
1 | """Cuckoo Filters"""
2 | 
3 | from probables.cuckoo.countingcuckoo import CountingCuckooFilter
4 | from probables.cuckoo.cuckoo import CuckooFilter
5 | 
6 | __all__ = ["CuckooFilter", "CountingCuckooFilter"]
7 | 
--------------------------------------------------------------------------------
/probables/cuckoo/countingcuckoo.py:
--------------------------------------------------------------------------------
1 | """Counting Cuckoo Filter, python implementation
2 | License: MIT
3 | Author: Tyler Barrus (barrust@gmail.com)
4 | """
5 | 
6 | import random
7 | from array import array
8 | from collections.abc import ByteString
9 | from io import IOBase
10 | from mmap import mmap
11 | from pathlib import Path
12 | from struct import Struct
13 | from typing import Union
14 | 
15 | from probables.cuckoo.cuckoo import CuckooFilter
16 | from probables.exceptions import CuckooFilterFullError
17 | from probables.hashes import KeyT, SimpleHashT
18 | from probables.utilities import MMap, resolve_path
19 | 
20 | 
21 | class CountingCuckooFilter(CuckooFilter):
22 |     """Simple Counting Cuckoo Filter implementation
23 | 
24 |     Args:
25 |         capacity (int): The number of bins
26 |         bucket_size (int): The number of buckets per bin
27 |         max_swaps (int): The number of cuckoo swaps before stopping
28 |         expansion_rate (int): The rate at which to expand
29 |         auto_expand (bool): If the filter should automatically expand
30 |         finger_size (int): The size of the fingerprint to use in bytes \
31 |             (between 1 and 4); exported as 4 bytes; up to the user to \
32 |             reset the size correctly on import
33 |         filepath (str): The path to the file to load or None if no file
34 |     Returns:
35 |         CountingCuckooFilter: A Counting Cuckoo Filter object"""
36 | 
37 |     __slots__ = ("__unique_elements",)
38 | 
39 |     def __init__(
40 |         self,
41 |         capacity: int = 10000,
42 |         bucket_size: int = 4,
43 |         max_swaps: int = 500,
44 |         expansion_rate: int = 2,
45 |         auto_expand: bool = True,
46 |         finger_size: int = 4,
47 |         filepath: Union[str, Path, None] = None,
48 |         hash_function: Union[SimpleHashT, None] = None,
49 |     ) -> None:
50 |         """setup the data structure"""
51 |         self.__unique_elements = 0
52 |         super().__init__(
53 |             capacity,
54 |             bucket_size,
55 |             max_swaps,
56 |             expansion_rate,
57 |             auto_expand,
58 |             finger_size,
59 |             filepath,
60 |             hash_function,
61 |         )
62 | 
63 |     __COUNTING_CUCKOO_FOOTER_STRUCT = Struct("II")
64 |     __BIN_STRUCT = Struct("II")
65 | 
66 |     @classmethod
67 |     def init_error_rate(
68 |         cls,
69 |         error_rate: float,
70 |         capacity: int = 10000,
71 |         bucket_size: int = 4,
72 |         max_swaps: int = 500,
73 |         expansion_rate: int = 2,
74 |         auto_expand: bool = True,
75 |         hash_function: Union[SimpleHashT, None] = None,
76 |     ):
77 |         """Initialize a simple Cuckoo Filter based on error rate
78 | 
79 |         Args:
80 |             error_rate (float): The desired error rate
81 |             capacity (int): The number of bins
82 |             bucket_size (int): The number of buckets per bin
83 |             max_swaps (int): The number of cuckoo swaps before stopping
84 |             expansion_rate (int): The rate at which to expand
85 |             auto_expand (bool): If the filter should automatically expand
86 |             hash_function (function): Hashing strategy function to use `hf(key)`
87 |         Returns:
88 |             CuckooFilter: A Cuckoo Filter object"""
89 |         cku = CountingCuckooFilter(
90 |             capacity=capacity,
91 |             bucket_size=bucket_size,
92 |             auto_expand=auto_expand,
93 |             max_swaps=max_swaps,
94 |             expansion_rate=expansion_rate,
95 |             hash_function=hash_function,
96 |         )
97 |         cku._set_error_rate(error_rate)
98 |         return cku
99 | 
100 |     @classmethod
101 |     def load_error_rate(
102 |         cls, error_rate: float, filepath: Union[str, Path], hash_function: Union[SimpleHashT, None] = None
103 |     ):
104 |         """Initialize a previously exported Cuckoo Filter based on error rate
105 | 
106 |         Args:
107 |             error_rate (float): The desired error rate
108 |             filepath (str): The path to the file to load or None if no file
109 |             hash_function (function): Hashing strategy function to use \
110 |                 `hf(key)`
111 |         Returns:
112 |             CuckooFilter: A Cuckoo Filter object
113 |         """
114 |         filepath = resolve_path(filepath)
115 |         cku = CountingCuckooFilter(filepath=filepath, hash_function=hash_function)
116 |         cku._set_error_rate(error_rate)
117 |         return cku
118 | 
119 |     @classmethod
120 |     def frombytes(
121 |         cls, b: ByteString, error_rate: Union[float, None] = None, hash_function: Union[SimpleHashT, None] = None
122 |     ) -> "CountingCuckooFilter":
123 |         """
124 |         Args:
125 |             b (ByteString): The bytes to load as a Counting Cuckoo Filter
126 |             error_rate (float): The error rate of the cuckoo filter, if used to generate the original filter
127 |             hash_function (function): Hashing strategy function to use `hf(key, number)`
128 |         Returns:
129 |             CountingCuckooFilter: A Counting Cuckoo Filter object"""
130 |         cku = CountingCuckooFilter(hash_function=hash_function)
131 |         cku._load(b)
132 | 
133 |         # if error rate is provided, use it
134 |         cku._set_error_rate(error_rate)
135 |         return cku
136 | 
137 |     def __contains__(self, val: KeyT) -> bool:
138 |         """setup the `in` keyword"""
139 |         return self.check(val) > 0
140 | 
141 |     @property
142 |     def unique_elements(self) -> int:
143 |         """int: unique number of elements inserted"""
144 |         return self.__unique_elements
145 | 
146 |     @property
147 |     def buckets(self) -> list[list["CountingCuckooBin"]]:  # type: ignore
148 |         """list(list): The buckets holding the fingerprints
149 | 
150 |         Note:
151 |             Not settable"""
152 |         return self._buckets
153 | 
154 |     def load_factor(self) -> float:
155 |         """float: How full the Cuckoo Filter is currently"""
156 |         return self.unique_elements / (self.capacity * self.bucket_size)
157 | 
158 |     def add(self, key: KeyT) -> None:
159 |         """Add element key to the filter
160 | 
161 |         Args:
162 |             key (str): The element to add
163 |         Raises:
164 |             CuckooFilterFullError: When element not inserted after maximum number of swaps or 'kicks'"""
165 |         idx_1, idx_2, fingerprint = self._generate_fingerprint_info(key)
166 | 
167 |         is_present = self._check_if_present(idx_1, idx_2, fingerprint)
168 |         if is_present is not None:
169 |             for bucket in self.buckets[is_present]:
170 |                 if fingerprint in bucket:
171 |                     bucket.increment()
172 |                     self._inserted_elements += 1
173 |                     return
174 |         finger = self._insert_fingerprint_alt(fingerprint, idx_1, idx_2)
175 |         self._deal_with_insertion(finger)
176 | 
177 |     def check(self, key: KeyT) -> int:  # type: ignore
178 |         """Check if an element is in the filter
179 | 
180 |         Args:
181 |             key (str): Element to check
182 |         Returns:
183 |             int: The number of times inserted into the filter"""
184 |         idx_1, idx_2, fingerprint = self._generate_fingerprint_info(key)
185 |         is_present = self._check_if_present(idx_1, idx_2, fingerprint)
186 |         val = 0
187 |         if is_present is not None:
188 |             # get the count out!
189 |             for bucket in self.buckets[is_present]:
190 |                 if fingerprint in bucket:
191 |                     val = bucket.count
192 |                     break
193 |         return val
194 | 
195 |     def remove(self, key: KeyT) -> bool:
196 |         """Remove an element from the filter
197 | 
198 |         Args:
199 |             key (str): Element to remove"""
200 |         idx_1, idx_2, fingerprint = self._generate_fingerprint_info(key)
201 |         idx = self._check_if_present(idx_1, idx_2, fingerprint)
202 |         if idx is None:
203 |             return False
204 |         for bucket in self.buckets[idx]:
205 |             if fingerprint in bucket:
206 |                 bucket.decrement()
207 |                 self._inserted_elements -= 1
208 |                 if bucket.count == 0:
209 |                     self.buckets[idx].remove(bucket)
210 |                     self.__unique_elements -= 1
211 |                 return True
212 |         return False  # catch this...
213 | 214 | def expand(self): 215 | """Expand the cuckoo filter""" 216 | self._expand_logic(None) 217 | 218 | def export(self, file: Union[Path, str, IOBase, mmap]) -> None: 219 | """Export cuckoo filter to file 220 | 221 | Args: 222 | file (str): Path to file to export""" 223 | if not isinstance(file, (IOBase, mmap)): 224 | file = resolve_path(file) 225 | with open(file, "wb") as filepointer: 226 | self.export(filepointer) # type:ignore 227 | else: 228 | self.__bucket_decomposition(self.buckets, self.bucket_size).tofile(file) 229 | # now put out the required information at the end 230 | file.write(self.__COUNTING_CUCKOO_FOOTER_STRUCT.pack(self.bucket_size, self.max_swaps)) 231 | 232 | def _insert_fingerprint_alt( 233 | self, fingerprint: int, idx_1: int, idx_2: int, count: int = 1 234 | ) -> Union["CountingCuckooBin", None]: 235 | """insert a fingerprint, but with a count parameter!""" 236 | if self.__insert_element(fingerprint, idx_1, count): 237 | self._inserted_elements += 1 238 | self.__unique_elements += 1 239 | return None 240 | if self.__insert_element(fingerprint, idx_2, count): 241 | self._inserted_elements += 1 242 | self.__unique_elements += 1 243 | return None 244 | 245 | # we didn't insert, so now we need to randomly select one index to use 246 | # and move things around to the other index, if possible, until we 247 | # either move everything around or hit the maximum number of swaps 248 | idx = random.choice([idx_1, idx_2]) 249 | prv_bin = CountingCuckooBin(fingerprint, 1) 250 | for _ in range(self.max_swaps): 251 | # select one element to be swapped out... 252 | swap_elm = random.randint(0, self.bucket_size - 1) 253 | swap_finger = self.buckets[idx][swap_elm] 254 | prv_bin, self.buckets[idx][swap_elm] = swap_finger, prv_bin 255 | 256 | # now find another place to put this fingerprint 257 | index_1, index_2 = self._indicies_from_fingerprint(prv_bin.finger) 258 | 259 | idx = index_2 if idx == index_1 else index_1 260 | 261 | if self.__insert_element(prv_bin.finger, idx, prv_bin.count): 262 | self._inserted_elements += 1 263 | self.__unique_elements += 1 264 | return None 265 | 266 | # if we got here we have an error... 
we might need to know what is left
267 |         return prv_bin
268 | 
269 |     def _check_if_present(self, idx_1: int, idx_2: int, fingerprint: int) -> Union[int, None]:
270 |         """wrapper for checking if fingerprint is already inserted"""
271 |         if fingerprint in [x.finger for x in self.buckets[idx_1]]:
272 |             return idx_1
273 |         if fingerprint in [x.finger for x in self.buckets[idx_2]]:
274 |             return idx_2
275 |         return None
276 | 
277 |     def _load(self, file: Union[Path, str, IOBase, mmap, bytes, ByteString]) -> None:
278 |         """load a cuckoo filter from file"""
279 |         if not isinstance(file, (IOBase, mmap, bytes, bytearray, memoryview)):
280 |             file = resolve_path(file)
281 |             with MMap(file) as filepointer:
282 |                 self._load(filepointer)
283 |         else:
284 |             self._parse_footer(file, self.__COUNTING_CUCKOO_FOOTER_STRUCT)  # type: ignore
285 |             self._inserted_elements = 0
286 |             self._parse_buckets(file)  # type: ignore
287 | 
288 |     def _parse_buckets(self, d: ByteString) -> None:
289 |         """Parse bytes to pull out and set the buckets"""
290 |         bin_size = self.__BIN_STRUCT.size
291 |         self._cuckoo_capacity = (len(bytes(d)) - bin_size) // bin_size // self.bucket_size
292 |         start = 0
293 |         end = bin_size
294 |         self._buckets = []
295 |         for i in range(self.capacity):
296 |             self.buckets.append([])
297 |             for _ in range(self.bucket_size):
298 |                 finger, count = self.__BIN_STRUCT.unpack(bytes(d[start:end]))
299 |                 if finger > 0:
300 |                     ccb = CountingCuckooBin(finger, count)
301 |                     self.buckets[i].append(ccb)
302 |                     self._inserted_elements += count
303 |                     self.__unique_elements += 1
304 |                 start = end
305 |                 end += bin_size
306 | 
307 |     def _expand_logic(self, extra_fingerprint: "CountingCuckooBin") -> None:
308 |         """the logic to actually expand the cuckoo filter"""
309 |         # get all the fingerprints
310 |         fingerprints = self._setup_expand(extra_fingerprint)
311 |         self.__unique_elements = 0  # this needs to be reset!
312 | 313 | for elm in fingerprints: 314 | idx_1, idx_2 = self._indicies_from_fingerprint(elm.finger) 315 | res = self._insert_fingerprint_alt(elm.finger, idx_1, idx_2, elm.count) 316 | if res is not None: # again, this *shouldn't* happen 317 | msg = "The CountingCuckooFilter failed to expand" 318 | raise CuckooFilterFullError(msg) 319 | 320 | def __insert_element(self, fingerprint, idx, count=1) -> bool: 321 | """insert an element""" 322 | if len(self.buckets[idx]) < self.bucket_size: 323 | self.buckets[idx].append(CountingCuckooBin(fingerprint, count)) 324 | return True 325 | return False 326 | 327 | @staticmethod 328 | def __bucket_decomposition(buckets, bucket_size: int) -> array: 329 | """convert a list of buckets into a single array for export""" 330 | arr = array("I") 331 | for bucket in buckets: 332 | for buck in bucket: 333 | arr.extend(buck.get_array()) 334 | leftover = bucket_size - len(bucket) 335 | arr.fromlist([0 for _ in range(leftover * 2)]) 336 | return arr 337 | 338 | 339 | class CountingCuckooBin: 340 | """A container class for the counting cuckoo filter""" 341 | 342 | # keep it lightweight 343 | __slots__ = ["__bin"] 344 | 345 | def __init__(self, fingerprint: int, count: int) -> None: 346 | """init""" 347 | self.__bin = array("I", [fingerprint, count]) 348 | 349 | def __contains__(self, val: int) -> bool: 350 | """setup the `in` construct""" 351 | return self.__bin[0] == val 352 | 353 | def get_array(self): 354 | """return the array implementation""" 355 | return self.__bin 356 | 357 | @property 358 | def finger(self) -> int: 359 | """fingerprint property""" 360 | return self.__bin[0] 361 | 362 | @property 363 | def count(self) -> int: 364 | """count property""" 365 | return self.__bin[1] 366 | 367 | def __repr__(self) -> str: 368 | """how do we represent this?""" 369 | return self.__str__() 370 | 371 | def __str__(self) -> str: 372 | """convert it into a string""" 373 | return f"(fingerprint:{self.__bin[0]} count:{self.__bin[1]})" 374 | 375 | def increment(self) -> int: 376 | """increment""" 377 | self.__bin[1] += 1 378 | return self.__bin[1] 379 | 380 | def decrement(self) -> int: 381 | """decrement""" 382 | self.__bin[1] -= 1 383 | return self.__bin[1] 384 | -------------------------------------------------------------------------------- /probables/cuckoo/cuckoo.py: -------------------------------------------------------------------------------- 1 | """Cuckoo Filter, python implementation 2 | License: MIT 3 | Author: Tyler Barrus (barrust@gmail.com) 4 | """ 5 | 6 | import math 7 | import random 8 | from array import array 9 | from collections.abc import ByteString 10 | from io import BytesIO, IOBase 11 | from mmap import mmap 12 | from numbers import Number 13 | from pathlib import Path 14 | from struct import Struct 15 | from typing import Union 16 | 17 | from probables.exceptions import CuckooFilterFullError, InitializationError 18 | from probables.hashes import KeyT, SimpleHashT, fnv_1a 19 | from probables.utilities import MMap, get_x_bits, is_valid_file, resolve_path 20 | 21 | 22 | class CuckooFilter: 23 | """Simple Cuckoo Filter implementation 24 | 25 | Args: 26 | capacity (int): The number of bins 27 | bucket_size (int): The number of buckets per bin 28 | max_swaps (int): The number of cuckoo swaps before stopping 29 | expansion_rate (int): The rate at which to expand 30 | auto_expand (bool): If the filter should automatically expand 31 | finger_size (int): The size of the fingerprint to use in bytes \ 32 | (between 1 and 4); exported as 4 bytes; up to the 
user to \ 33 | reset the size correctly on import 34 | filepath (str): The path to the file to load or None if no file 35 | hash_function (function): Hashing strategy function to use `hf(key)` 36 | Returns: 37 | CuckooFilter: A Cuckoo Filter object""" 38 | 39 | __slots__ = ( 40 | "_bucket_size", 41 | "_cuckoo_capacity", 42 | "__max_cuckoo_swaps", 43 | "__expansion_rate", 44 | "__auto_expand", 45 | "_fingerprint_size", 46 | "__hash_func", 47 | "_inserted_elements", 48 | "_buckets", 49 | "_error_rate", 50 | ) 51 | 52 | def __init__( 53 | self, 54 | capacity: int = 10000, 55 | bucket_size: int = 4, 56 | max_swaps: int = 500, 57 | expansion_rate: int = 2, 58 | auto_expand: bool = True, 59 | finger_size: int = 4, 60 | filepath: Union[str, Path, None] = None, 61 | hash_function: Union[SimpleHashT, None] = None, 62 | ): 63 | """setup the data structure""" 64 | valid_prms = ( 65 | isinstance(capacity, Number) 66 | and capacity >= 1 67 | and isinstance(bucket_size, Number) 68 | and bucket_size >= 1 69 | and isinstance(max_swaps, Number) 70 | and max_swaps >= 1 71 | ) 72 | if not valid_prms: 73 | msg = "CuckooFilter: capacity, bucket_size, and max_swaps must be an integer greater than 0" 74 | raise InitializationError(msg) 75 | self._bucket_size = int(bucket_size) 76 | self._cuckoo_capacity = int(capacity) 77 | self.__max_cuckoo_swaps = int(max_swaps) 78 | self.__expansion_rate = 2 79 | self.expansion_rate = expansion_rate 80 | self.__auto_expand = True 81 | self.auto_expand = auto_expand 82 | self._fingerprint_size = 32 83 | self.fingerprint_size = finger_size 84 | 85 | if hash_function is None: 86 | self.__hash_func = fnv_1a 87 | else: 88 | self.__hash_func = hash_function # type: ignore 89 | self._inserted_elements = 0 90 | if filepath is None: 91 | self._buckets = [] # type: ignore 92 | for _ in range(self.capacity): 93 | self.buckets.append([]) 94 | elif is_valid_file(filepath): 95 | filepath = resolve_path(filepath) 96 | self._load(filepath) 97 | else: 98 | msg = "CuckooFilter: failed to load provided file" 99 | raise InitializationError(msg) 100 | 101 | self._error_rate = float(self._calc_error_rate()) 102 | 103 | @classmethod 104 | def init_error_rate( 105 | cls, 106 | error_rate: float, 107 | capacity: int = 10000, 108 | bucket_size: int = 4, 109 | max_swaps: int = 500, 110 | expansion_rate: int = 2, 111 | auto_expand: bool = True, 112 | hash_function: Union[SimpleHashT, None] = None, 113 | ): 114 | """Initialize a simple Cuckoo Filter based on error rate 115 | 116 | Args: 117 | error_rate (float): 118 | capacity (int): The number of bins 119 | bucket_size (int): The number of buckets per bin 120 | max_swaps (int): The number of cuckoo swaps before stopping 121 | expansion_rate (int): The rate at which to expand 122 | auto_expand (bool): If the filter should automatically expand 123 | hash_function (function): Hashing strategy function to use \ 124 | `hf(key)` 125 | Returns: 126 | CuckooFilter: A Cuckoo Filter object""" 127 | cku = CuckooFilter( 128 | capacity=capacity, 129 | bucket_size=bucket_size, 130 | auto_expand=auto_expand, 131 | max_swaps=max_swaps, 132 | expansion_rate=expansion_rate, 133 | hash_function=hash_function, 134 | ) 135 | cku._set_error_rate(error_rate) 136 | return cku 137 | 138 | @classmethod 139 | def load_error_rate( 140 | cls, 141 | error_rate: float, 142 | filepath: Union[str, Path], 143 | hash_function: Union[SimpleHashT, None] = None, 144 | ): 145 | """Initialize a previously exported Cuckoo Filter based on error rate 146 | 147 | Args: 148 | error_rate (float): 
149 |             filepath (str): The path to the file to load or None if no file
150 |             hash_function (function): Hashing strategy function to use `hf(key)`
151 |         Returns:
152 |             CuckooFilter: A Cuckoo Filter object"""
153 |         filepath = resolve_path(filepath)
154 |         cku = CuckooFilter(filepath=filepath, hash_function=hash_function)
155 |         cku._set_error_rate(error_rate)
156 |         return cku
157 | 
158 |     @classmethod
159 |     def frombytes(
160 |         cls,
161 |         b: ByteString,
162 |         error_rate: Union[float, None] = None,
163 |         hash_function: Union[SimpleHashT, None] = None,
164 |     ) -> "CuckooFilter":
165 |         """
166 |         Args:
167 |             b (ByteString): The bytes to load as a Cuckoo Filter
168 |             error_rate (float): The error rate of the cuckoo filter, if used to generate the original filter
169 |             hash_function (function): Hashing strategy function to use `hf(key, number)`
170 |         Returns:
171 |             CuckooFilter: A Cuckoo Filter object
172 |         """
173 |         cku = CuckooFilter(hash_function=hash_function)
174 |         cku._load(b)  # type: ignore
175 | 
176 |         # if error rate is provided, use it
177 |         cku._set_error_rate(error_rate)
178 |         return cku
179 | 
180 |     def __contains__(self, key: KeyT) -> bool:
181 |         """setup the `in` keyword"""
182 |         return self.check(key)
183 | 
184 |     def __str__(self):
185 |         """setup what it will print"""
186 |         return (
187 |             f"{self.__class__.__name__}:\n"
188 |             f"\tCapacity: {self.capacity}\n"
189 |             f"\tTotal Bins: {self.capacity * self.bucket_size}\n"
190 |             f"\tLoad Factor: {self.load_factor() * 100}%\n"
191 |             f"\tInserted Elements: {self.elements_added}\n"
192 |             f"\tMax Swaps: {self.max_swaps}\n"
193 |             f"\tExpansion Rate: {self.expansion_rate}\n"
194 |             f"\tAuto Expand: {self.auto_expand}"
195 |         )
196 | 
197 |     @property
198 |     def elements_added(self) -> int:
199 |         """int: The number of elements added
200 | 
201 |         Note:
202 |             Not settable"""
203 |         return self._inserted_elements
204 | 
205 |     @property
206 |     def capacity(self) -> int:
207 |         """int: The number of bins
208 | 
209 |         Note:
210 |             Not settable"""
211 |         return self._cuckoo_capacity
212 | 
213 |     @property
214 |     def max_swaps(self) -> int:
215 |         """int: The maximum number of swaps to perform
216 | 
217 |         Note:
218 |             Not settable"""
219 |         return self.__max_cuckoo_swaps
220 | 
221 |     @property
222 |     def bucket_size(self) -> int:
223 |         """int: The number of buckets per bin
224 | 
225 |         Note:
226 |             Not settable"""
227 |         return self._bucket_size
228 | 
229 |     @property
230 |     def buckets(self) -> list[list[int]]:
231 |         """list(list): The buckets holding the fingerprints
232 | 
233 |         Note:
234 |             Not settable"""
235 |         return self._buckets
236 | 
237 |     @property
238 |     def expansion_rate(self) -> int:
239 |         """int: The rate of expansion when the filter grows"""
240 |         return self.__expansion_rate
241 | 
242 |     @expansion_rate.setter
243 |     def expansion_rate(self, val: int):
244 |         """set the self expand value"""
245 |         self.__expansion_rate = val
246 | 
247 |     @property
248 |     def error_rate(self) -> float:
249 |         """float: The error rate of the cuckoo filter"""
250 |         return self._error_rate
251 | 
252 |     @property
253 |     def auto_expand(self) -> bool:
254 |         """bool: True if the cuckoo filter will expand automatically"""
255 |         return self.__auto_expand
256 | 
257 |     @auto_expand.setter
258 |     def auto_expand(self, val: bool):
259 |         """set the self expand value"""
260 |         self.__auto_expand = bool(val)
261 | 
262 |     @property
263 |     def fingerprint_size_bits(self) -> int:
264 |         """int: The size in bits of the fingerprint"""
265 |         return self._fingerprint_size
266 | 
267 |     @property
268 |     def fingerprint_size(self) -> int:
269 |         """int: The size in bytes of the fingerprint
270 | 
271 |         Raises:
272 |             ValueError: If the size is not between 1 and 4
273 |         Note:
274 |             The size of the fingerprint must be between 1 and 4"""
275 |         return math.ceil(self.fingerprint_size_bits / 8)
276 | 
277 |     @fingerprint_size.setter
278 |     def fingerprint_size(self, val: int):
279 |         """set the fingerprint size"""
280 |         tmp = val
281 |         if not 1 <= tmp <= 4:
282 |             msg = f"{self.__class__.__name__}: fingerprint size must be between 1 and 4"
283 |             raise ValueError(msg)
284 |         # bytes to bits
285 |         self._fingerprint_size = tmp * 8
286 |         self._calc_error_rate()  # if updating fingerprint size then error rate may change
287 | 
288 |     def load_factor(self) -> float:
289 |         """float: How full the Cuckoo Filter is currently"""
290 |         return self.elements_added / (self.capacity * self.bucket_size)
291 | 
292 |     def add(self, key: KeyT):
293 |         """Add element key to the filter
294 | 
295 |         Args:
296 |             key (str): The element to add
297 |         Raises:
298 |             CuckooFilterFullError: When element not inserted after maximum number of swaps or 'kicks'"""
299 |         idx_1, idx_2, fingerprint = self._generate_fingerprint_info(key)
300 | 
301 |         is_present = self._check_if_present(idx_1, idx_2, fingerprint)
302 |         if is_present is not None:  # already there, nothing to do
303 |             return
304 |         finger = self._insert_fingerprint(fingerprint, idx_1, idx_2)
305 |         self._deal_with_insertion(finger)
306 | 
307 |     def check(self, key: KeyT) -> bool:
308 |         """Check if an element is in the filter
309 | 
310 |         Args:
311 |             key (str): Element to check
312 |         Returns:
313 |             bool: True if likely present, False if definitely not"""
314 |         idx_1, idx_2, fingerprint = self._generate_fingerprint_info(key)
315 |         is_present = self._check_if_present(idx_1, idx_2, fingerprint)
316 |         return is_present is not None
317 | 
318 |     def remove(self, key: KeyT) -> bool:
319 |         """Remove an element from the filter
320 | 
321 |         Args:
322 |             key (str): Element to remove
323 |         Returns:
324 |             bool: True if removed, False if not present"""
325 |         idx_1, idx_2, fingerprint = self._generate_fingerprint_info(key)
326 |         idx = self._check_if_present(idx_1, idx_2, fingerprint)
327 |         if idx is None:
328 |             return False
329 |         self.buckets[idx].remove(fingerprint)
330 |         self._inserted_elements -= 1
331 |         return True
332 | 
333 |     def export(self, file: Union[Path, str, IOBase, mmap]) -> None:
334 |         """Export cuckoo filter to file
335 | 
336 |         Args:
337 |             file: Path to file to export"""
338 | 
339 |         if not isinstance(file, (IOBase, mmap)):
340 |             file = resolve_path(file)
341 |             with open(file, "wb") as filepointer:
342 |                 self.export(filepointer)  # type:ignore
343 |         else:
344 |             filepointer = file  # type:ignore
345 |             for _, val in enumerate(self.buckets):
346 |                 bucket = array(self._CUCKOO_SINGLE_INT_C, val)
347 |                 bucket.extend([0] * (self.bucket_size - len(bucket)))
348 |                 bucket.tofile(filepointer)
349 |             # now put out the required information at the end
350 |             filepointer.write(self._CUCKOO_FOOTER_STRUCT.pack(self.bucket_size, self.max_swaps))
351 | 
352 |     def __bytes__(self) -> bytes:
353 |         """Export cuckoo filter to `bytes`"""
354 |         with BytesIO() as f:
355 |             self.export(f)
356 |             return f.getvalue()
357 | 
358 |     def expand(self):
359 |         """Expand the cuckoo filter"""
360 |         self._expand_logic(None)
361 | 
362 |     def _insert_fingerprint(self, fingerprint, idx_1, idx_2):
363 |         """insert a fingerprint"""
364 |         if self.__insert_element(fingerprint, idx_1):
365 |             self._inserted_elements += 1
366 |             return None
367 |         if
self.__insert_element(fingerprint, idx_2): 368 | self._inserted_elements += 1 369 | return None 370 | 371 | # we didn't insert, so now we need to randomly select one index to use 372 | # and move things around to the other index, if possible, until we 373 | # either move everything around or hit the maximum number of swaps 374 | idx = random.choice([idx_1, idx_2]) 375 | 376 | for _ in range(self.max_swaps): 377 | # select one element to be swapped out... 378 | swap_elm = random.randint(0, self.bucket_size - 1) 379 | 380 | swb = self.buckets[idx][swap_elm] 381 | fingerprint, self.buckets[idx][swap_elm] = swb, fingerprint 382 | 383 | # now find another place to put this fingerprint 384 | index_1, index_2 = self._indicies_from_fingerprint(fingerprint) 385 | 386 | idx = index_2 if idx == index_1 else index_1 387 | 388 | if self.__insert_element(fingerprint, idx): 389 | self._inserted_elements += 1 390 | return None 391 | 392 | # if we got here we have an error... we might need to know what is left 393 | return fingerprint 394 | 395 | def _load(self, file: Union[Path, str, IOBase, mmap, bytes]) -> None: 396 | """load a cuckoo filter from file""" 397 | if not isinstance(file, (IOBase, mmap, bytes)): 398 | file = resolve_path(file) 399 | with MMap(file) as filepointer: 400 | self._load(filepointer) 401 | else: 402 | self._parse_footer(file, self._CUCKOO_FOOTER_STRUCT) # type: ignore 403 | self._inserted_elements = 0 404 | # now pull everything in! 405 | self._parse_buckets(file) # type: ignore 406 | 407 | _CUCKOO_SINGLE_INT_C = "I" 408 | _CUCKOO_SINGLE_INT_SIZE = Struct(_CUCKOO_SINGLE_INT_C).size 409 | _CUCKOO_FOOTER_STRUCT = Struct("II") 410 | 411 | def _parse_footer(self, d: ByteString, stct: Struct) -> None: 412 | """parse bytes and set footer information""" 413 | list_size = len(d) - stct.size 414 | self._bucket_size, self.__max_cuckoo_swaps = stct.unpack(d[list_size:]) # type:ignore 415 | self._cuckoo_capacity = list_size // self._CUCKOO_SINGLE_INT_SIZE // self.bucket_size 416 | 417 | def _parse_buckets(self, d: ByteString) -> None: 418 | """parse bytes and set buckets""" 419 | self._buckets = [] 420 | bucket_byte_size = self.bucket_size * self._CUCKOO_SINGLE_INT_SIZE 421 | offs = 0 422 | for _ in range(self.capacity): 423 | next_offs = offs + bucket_byte_size 424 | self.buckets.append(self._parse_bucket(d[offs:next_offs])) # type: ignore 425 | offs = next_offs 426 | 427 | def _parse_bucket(self, d: ByteString) -> array: 428 | """parse a single bucket""" 429 | bucket = array(self._CUCKOO_SINGLE_INT_C, bytes(d)) 430 | bucket = array(self._CUCKOO_SINGLE_INT_C, [el for el in bucket if el]) 431 | self._inserted_elements += len(bucket) 432 | return bucket 433 | 434 | def _set_error_rate(self, error_rate: Union[float, None]) -> None: 435 | """set error rate correctly""" 436 | # if error rate is provided, use it 437 | if error_rate is not None: 438 | self._error_rate = error_rate 439 | self._fingerprint_size = self._calc_fingerprint_size() 440 | 441 | def _check_if_present(self, idx_1, idx_2, fingerprint): 442 | """wrapper for checking if fingerprint is already inserted""" 443 | if fingerprint in self.buckets[idx_1]: 444 | return idx_1 445 | if fingerprint in self.buckets[idx_2]: 446 | return idx_2 447 | return None 448 | 449 | def __insert_element(self, fingerprint, idx) -> bool: 450 | """insert element wrapper""" 451 | if len(self.buckets[idx]) < self.bucket_size: 452 | self.buckets[idx].append(fingerprint) 453 | return True 454 | return False 455 | 456 | def _expand_logic(self, 
extra_fingerprint):
457 |         """the logic to actually expand the cuckoo filter"""
458 |         # get all the fingerprints
459 |         fingerprints = self._setup_expand(extra_fingerprint)
460 | 
461 |         for finger in fingerprints:
462 |             idx_1, idx_2 = self._indicies_from_fingerprint(finger)
463 |             res = self._insert_fingerprint(finger, idx_1, idx_2)
464 |             if res is not None:  # again, this *shouldn't* happen
465 |                 msg = "The CuckooFilter failed to expand"
466 |                 raise CuckooFilterFullError(msg)
467 | 
468 |     def _setup_expand(self, extra_fingerprint):
469 |         """gather all current fingerprints, then grow and reset the bucket list"""
470 |         fingerprints = []
471 |         if extra_fingerprint is not None:
472 |             fingerprints.append(extra_fingerprint)
473 |         for idx in range(self.capacity):
474 |             fingerprints.extend(self.buckets[idx])
475 | 
476 |         self._cuckoo_capacity = self.capacity * self.expansion_rate
477 |         self._buckets = []
478 |         self._inserted_elements = 0
479 |         for _ in range(self.capacity):
480 |             self.buckets.append([])
481 | 
482 |         return fingerprints
483 | 
484 |     def _indicies_from_fingerprint(self, fingerprint):
485 |         """Generate the possible insertion indices from a fingerprint
486 | 
487 |         Args:
488 |             fingerprint (int): The fingerprint to use for generating indices"""
489 |         idx_1 = fingerprint % self.capacity
490 |         idx_2 = self.__hash_func(str(fingerprint)) % self.capacity
491 |         return idx_1, idx_2
492 | 
493 |     def _generate_fingerprint_info(self, key: KeyT) -> tuple[int, int, int]:
494 |         """Generate the fingerprint and indices using the provided key
495 | 
496 |         Args:
497 |             key (str): The element for which information is to be generated
498 |         """
499 |         # generate the fingerprint along with the two possible indices
500 |         hash_val = self.__hash_func(key)
501 |         fingerprint = get_x_bits(hash_val, 64, self.fingerprint_size_bits, True)
502 |         idx_1, idx_2 = self._indicies_from_fingerprint(fingerprint)
503 | 
504 |         # NOTE: This should never happen...
505 |         if idx_1 > self.capacity or idx_2 > self.capacity:
506 |             raise ValueError(f"Either idx_1 {idx_1} or idx_2 {idx_2} is greater than {self.capacity}")
507 |         return idx_1, idx_2, fingerprint
508 | 
509 |     def _deal_with_insertion(self, finger):
510 |         """handle the result of an insertion attempt in one place"""
511 |         if finger is None:
512 |             return
513 |         if self.auto_expand:
514 |             self._expand_logic(finger)
515 |         else:
516 |             msg = f"The {self.__class__.__name__} is currently full"
517 |             raise CuckooFilterFullError(msg)
518 | 
519 |     def _calc_error_rate(self):
520 |         """calculate error rate based on fingerprint size (bits) and bucket size"""
521 |         return float(1 / (2 ** (self.fingerprint_size_bits - (math.log2(self.bucket_size) + 1))))
522 | 
523 |     def _calc_fingerprint_size(self) -> int:
524 |         """calculate fingerprint size (bits) based on error rate and bucket size"""
525 |         return int(math.ceil(math.log2(1.0 / self.error_rate) + math.log2(self.bucket_size) + 1))
526 | 
--------------------------------------------------------------------------------
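A brief, non-authoritative usage sketch of the filter methods shown above; it assumes the package-level CuckooFilter export and the same constructor arguments the counting-variant tests later in this document use:

from probables import CuckooFilter

cko = CuckooFilter(capacity=1000, bucket_size=4, max_swaps=500)  # illustrative sizes
cko.add("this is a test")
assert cko.check("this is a test")     # True: probably present
assert not cko.check("never added")    # False: absent, barring a false positive
assert cko.remove("this is a test")    # True when the fingerprint was found
print(cko.load_factor())               # elements_added / (capacity * bucket_size)

As a sanity check on _calc_fingerprint_size above: for error_rate=0.00001 and bucket_size=4, ceil(log2(1 / 0.00001) + log2(4) + 1) = ceil(16.61 + 3) = 20 bits, which matches the fingerprint_size_bits value the error-rate tests later in this document assert.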
/probables/cuckoo/py.typed:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/barrust/pyprobables/e5d44dac65033421d455681178344eeca9961bd8/probables/cuckoo/py.typed
--------------------------------------------------------------------------------
/probables/exceptions.py:
--------------------------------------------------------------------------------
1 | """PyProbables Exceptions"""
2 | 
3 | 
4 | class ProbablesBaseException(Exception):
5 |     """Base ProbablesBaseException
6 | 
7 |     Args:
8 |         message (str): The error message to be reported"""
9 | 
10 |     def __init__(self, message: str) -> None:
11 |         self.message = message
12 |         super().__init__(self.message)
13 | 
14 |     def __str__(self) -> str:
15 |         return self.message
16 | 
17 | 
18 | class InitializationError(ProbablesBaseException):
19 |     """Initialization Exception
20 | 
21 |     Args:
22 |         message (str): The initialization error message"""
23 | 
24 |     def __init__(self, message: str) -> None:
25 |         self.message = message
26 |         super().__init__(self.message)
27 | 
28 | 
29 | class NotSupportedError(ProbablesBaseException):
30 |     """Not Supported Functionality Exception
31 | 
32 |     Args:
33 |         message (str): The error message to be reported"""
34 | 
35 |     def __init__(self, message: str) -> None:
36 |         self.message = message
37 |         super().__init__(self.message)
38 | 
39 | 
40 | class CuckooFilterFullError(ProbablesBaseException):
41 |     """Cuckoo Filter Full Exception
42 | 
43 |     Args:
44 |         message (str): The error message to be reported"""
45 | 
46 |     def __init__(self, message: str) -> None:
47 |         self.message = message
48 |         super().__init__(self.message)
49 | 
50 | 
51 | class RotatingBloomFilterError(ProbablesBaseException):
52 |     """RotatingBloomFilter unable to rotate Blooms Exception
53 | 
54 |     Args:
55 |         message (str): The error message to be reported"""
56 | 
57 |     def __init__(self, message: str) -> None:
58 |         self.message = message
59 |         super().__init__(self.message)
60 | 
61 | 
62 | class CountMinSketchError(ProbablesBaseException):
63 |     """CountMinSketch Exception
64 | 
65 |     Args:
66 |         message (str): The error message to be reported"""
67 | 
68 |     def __init__(self, message: str) -> None:
69 |         self.message = message
70 |         super().__init__(self.message)
71 | 
72 | 
73 | class QuotientFilterError(ProbablesBaseException):
74 |     """Quotient Filter Exception
75 | 
76 |     Args:
77 |         message (str): The error message to be reported"""
78 | 
79 |     def __init__(self, message: str) -> None:
80 |         self.message = message
81 |         super().__init__(self.message)
82 | 
--------------------------------------------------------------------------------
/probables/hashes.py:
--------------------------------------------------------------------------------
1 | """Probables Hashing Utilities"""
2 | 
3 | from functools import wraps
4 | from hashlib import md5, sha256
5 | from struct import unpack
6 | from typing import Callable, Union
7 | 
8 | from probables.constants import UINT32_T_MAX, UINT64_T_MAX
9 | 
10 | KeyT = Union[str, bytes]
11 | SimpleHashT = Callable[[KeyT, int], int]
12 | HashResultsT = list[int]
13 | HashFuncT = Callable[[KeyT, int], HashResultsT]
14 | HashFuncBytesT = Callable[[KeyT, int], bytes]
15 | 
16 | 
17 | def hash_with_depth_bytes(func: HashFuncBytesT) -> HashFuncT:
18 |     """Decorator to turn a function that takes a single key and hashes it to
19 |     bytes. Wraps functions to be used in Bloom filters and Count-Min sketch
20 |     data structures.
21 | 
22 |     Args:
23 |         key (str): The element to be hashed
24 |         depth (int): The number of hash permutations to compute
25 |     Returns:
26 |         list(int): 64-bit hashed representation of key
27 |     Note:
28 |         Arguments shown are as they appear after decoration"""
29 | 
30 |     @wraps(func)
31 |     def hashing_func(key, depth=1):
32 |         """wrapper function"""
33 |         res = []
34 |         tmp = key if not isinstance(key, str) else key.encode("utf-8")
35 |         for idx in range(depth):
36 |             tmp = func(tmp, idx)
37 |             res.append(unpack("Q", tmp[:8])[0])  # turn into 64 bit number
38 |         return res
39 | 
40 |     return hashing_func
41 | 
42 | 
43 | def hash_with_depth_int(func: HashFuncT) -> HashFuncT:
44 |     """Decorator to turn a function that takes a single key and hashes it to
45 |     an int. Wraps functions to be used in Bloom filters and Count-Min
46 |     sketch data structures.
47 | 
48 |     Args:
49 |         key (str): The element to be hashed
50 |         depth (int): The number of hash permutations to compute
51 |     Returns:
52 |         list(int): 64-bit hashed representation of key
53 |     Note:
54 |         Arguments shown are as they appear after decoration"""
55 | 
56 |     @wraps(func)
57 |     def hashing_func(key, depth=1):
58 |         """wrapper function"""
59 |         res = []
60 |         tmp = func(key, 0)
61 |         res.append(tmp)
62 |         for idx in range(1, depth):
63 |             tmp = func(f"{tmp:x}", idx)
64 |             res.append(tmp)
65 |         return res
66 | 
67 |     return hashing_func
68 | 
69 | 
70 | def default_fnv_1a(key: KeyT, depth: int = 1) -> list[int]:
71 |     """The default fnv-1a hashing routine
72 | 
73 |     Args:
74 |         key (str): The element to be hashed
75 |         depth (int): The number of hash permutations to compute
76 |     Returns:
77 |         list(int): List of size depth hashes"""
78 | 
79 |     res = []
80 |     for idx in range(depth):
81 |         res.append(fnv_1a(key, idx))
82 |     return res
83 | 
84 | 
85 | def fnv_1a(key: KeyT, seed: int = 0) -> int:
86 |     """Pure python implementation of the 64 bit fnv-1a hash
87 | 
88 |     Args:
89 |         key (str): The element to be hashed
90 |         seed (int): Add a seed to the initial starting point (0 means no seed)
91 |     Returns:
92 |         int: 64-bit hashed representation of key
93 |     Note:
94 |         Uses the lower 64 bits when overflows occur"""
95 |     hval = (14695981039346656037 + (31 * seed)) & UINT64_T_MAX
96 |     fnv_64_prime = 1099511628211
97 |     tmp = list(key) if not isinstance(key, str) else list(map(ord, key))
98 |     for t_str in tmp:
99 |         hval ^= t_str
100 |         hval *= fnv_64_prime
101 |         hval &= UINT64_T_MAX
102 |     return hval
103 | 
104 | 
105 | def fnv_1a_32(key: KeyT, seed: int = 0) -> int:
106 |     """Pure python implementation of the 32 bit fnv-1a hash
107 |     Args:
108 |         key (str): The element to be hashed
109 |         seed (int): Add a seed to the initial starting point (0 means no seed)
110 |     Returns:
111 |         int: 32-bit hashed representation of key
112 |     Note:
113 |         Uses the lower 32 bits when overflows occur"""
114 |     hval = (0x811C9DC5 + (31 * seed)) & UINT32_T_MAX
115 |     fnv_32_prime = 0x01000193
116 |     tmp = list(key) if not isinstance(key, str) else list(map(ord, key))
117 |     for t_str in tmp:
118 |         hval ^= t_str
119 |         hval *= fnv_32_prime
120 |         hval &= UINT32_T_MAX
121 |     return hval
122 | 
123 | 
124 | @hash_with_depth_bytes
125 | def default_md5(key: KeyT, *args, **kwargs) -> bytes:
126 |     """The default md5 hashing routine
127 | 
128 |     Args:
129 |         key (str): The element to be hashed
130 |         depth (int): The number of hash permutations to compute
131 |     Returns:
132 |         list(int): List of 64-bit hashes of the key
133 |     Note:
134 |         Returns the upper-most 64 bits"""
135 |     return md5(key).digest()  # type: ignore
136 | 
137 | 
138 | @hash_with_depth_bytes
139 | def default_sha256(key: KeyT, *args, **kwargs) -> bytes:
140 |     """The default sha256 hashing routine
141 | 
142 |     Args:
143 |         key (str): The element to be hashed
144 |         depth (int): The number of hash permutations to compute
145 |     Returns:
146 |         list(int): List of 64-bit hashes of the key
147 |     Note:
148 |         Returns the upper-most 64 bits"""
149 |     return sha256(key).digest()  # type: ignore
150 | 
--------------------------------------------------------------------------------
/probables/py.typed:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/barrust/pyprobables/e5d44dac65033421d455681178344eeca9961bd8/probables/py.typed
--------------------------------------------------------------------------------
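The decorators defined in probables/hashes.py above make registering a custom hashing strategy a one-liner. A minimal sketch that mirrors the pattern used by the test suite later in this document (the function name is illustrative):

from hashlib import sha512

from probables.hashes import hash_with_depth_bytes

@hash_with_depth_bytes
def my_hash(key, depth):
    # the wrapper feeds in bytes plus the permutation index, then folds the
    # first 8 bytes of each digest into a 64-bit integer per depth
    return sha512(key).digest()

hashes = my_hash("this is a test", 5)  # a list of five 64-bit ints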
/probables/quotientfilter/__init__.py:
--------------------------------------------------------------------------------
1 | """Quotient Filters"""
2 | 
3 | from probables.quotientfilter.quotientfilter import QuotientFilter
4 | 
5 | __all__ = ["QuotientFilter"]
6 | 
--------------------------------------------------------------------------------
/probables/quotientfilter/py.typed:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/barrust/pyprobables/e5d44dac65033421d455681178344eeca9961bd8/probables/quotientfilter/py.typed
--------------------------------------------------------------------------------
/probables/utilities.py:
--------------------------------------------------------------------------------
1 | """Utility Functions"""
2 | 
3 | import math
4 | import mmap
5 | import string
6 | from array import array
7 | from pathlib import Path
8 | from typing import Union
9 | 
10 | 
11 | def is_hex_string(hex_string: Union[str, None]) -> bool:
12 |     """check if the passed in string is really hex"""
13 |     if hex_string is None:
14 |         return False
15 |     return all(c in string.hexdigits for c in hex_string)
16 | 
17 | 
18 | def is_valid_file(filepath: Union[str, Path, None]) -> bool:
19 |     """check if the passed filepath points to a real file"""
20 |     if filepath is None:
21 |         return False
22 |     return Path(filepath).exists()
23 | 
24 | 
25 | def resolve_path(filepath: Union[str, Path]) -> Path:
26 |     """fully resolve the path by expanding user and resolving"""
27 |     return Path(filepath).expanduser().resolve()
28 | 
29 | 
30 | def get_x_bits(num: int, max_bits: int, num_bits: int, right_bits: bool = True) -> int:
31 |     """ensure the correct bits are pulled from num"""
32 |     if right_bits:
33 |         return num & (2**num_bits - 1)
34 |     return ((1 << num_bits) - 1) & (num >> (max_bits - num_bits))
35 | 
36 | 
37 | class MMap:
38 |     """Simplified mmap.mmap class"""
39 | 
40 |     __slots__ = ("__p", "__f", "__m", "_closed")
41 | 
42 |     def __init__(self, path: Union[Path, str]):
43 |         self.__p = Path(path)
44 |         self.__f = self.path.open("rb")  # noqa: SIM115
45 |         self.__m = mmap.mmap(self.__f.fileno(), 0, access=mmap.ACCESS_READ)
46 |         self._closed = False
47 | 
48 |     def __enter__(self) -> mmap.mmap:
49 |         return self.__m
50 | 
51 |     def __exit__(self, *args, **kwargs) -> None:
52 |         if self.__m and not self.map.closed:
53 |             self.map.close()
54 |         if self.__f:
55 |             self.__f.close()
56 |         self._closed = True
57 | 
58 |     @property
59 |     def closed(self) -> bool:
60 |         """Is the MMap closed"""
61 |         return self._closed
62 | 
63 |     @property
64 |     def map(self) -> mmap.mmap:
65 |         """Return a pointer to the mmap"""
66 |         return self.__m
67 | 
68 |     @property
69 |     def path(self) -> Path:
70 |         """Return the path to the mmap'd file"""
71 |         return self.__p
72 | 
73 |     def close(self) -> None:
74 |         """Close the MMap class, including cleaning up open files, etc."""
75 |         self.__exit__()
76 | 
77 |     def seek(self, pos: int, whence: int) -> None:
78 |         """Implement a method to seek on top of the MMap class"""
79 |         self.__m.seek(pos, whence)
80 | 
81 |     def read(self, n: int = -1) -> bytes:
82 |         """Implement a method to read from the file on top of the MMap class"""
83 |         return self.__m.read(n)
84 | 
85 | 
86 | class Bitarray:
87 |     """Simplified, pure python bitarray implementation using as little memory as possible
88 | 
89 |     Args:
90 |         size (int): The number of bits in the bitarray
91 |     Returns:
92 |         Bitarray: A bitarray
93 |     Raises:
94 |         TypeError: If size is not an int
95 |         ValueError: If size is not a positive int"""
96 | 
97 |     def __init__(self, size: int):
98 |         if not isinstance(size, int):
99 |             raise TypeError(f"Bitarray size must be an int; {type(size)} was provided")
100 |         if size <= 0:
101 |             raise ValueError(f"Bitarray size must be larger than 0; {size} was provided")
102 |         self._size_bytes = math.ceil(size / 8)
103 |         self._bitarray = array("B", [0]) * self._size_bytes
104 |         self._size = size
105 | 
106 |     @property
107 |     def size_bytes(self) -> int:
108 |         """The size of the bitarray in bytes"""
109 |         return self._size_bytes
110 | 
111 |     @property
112 |     def size(self) -> int:
113 |         """The number of bits in the bitarray"""
114 |         return self._size
115 | 
116 |     @property
117 |     def bitarray(self) -> array:
118 |         """The bitarray"""
119 |         return self._bitarray
120 | 
121 |     def __getitem__(self, key: int) -> int:
122 |         return self.check_bit(key)
123 | 
124 |     def __setitem__(self, idx: int, val: int):
125 |         if val < 0 or val > 1:
126 |             raise ValueError("Invalid bit setting; must be 0 or 1")
127 |         if idx < 0 or idx >= self._size:
128 |             raise IndexError(f"Bitarray index outside of range; index {idx} was provided")
129 |         b = idx // 8
130 |         if val == 1:
131 |             self._bitarray[b] = self._bitarray[b] | (1 << (idx % 8))
132 |         else:
133 |             self._bitarray[b] = self._bitarray[b] & ~(1 << (idx % 8))
134 | 
135 |     def check_bit(self, idx: int) -> int:
136 |         """Check if the bit idx is set
137 | 
138 |         Args:
139 |             idx (int): The index to check
140 |         Returns:
141 |             int: The status of the bit, either 0 or 1"""
142 |         if idx < 0 or idx >= self._size:
143 |             raise IndexError(f"Bitarray index outside of range; index {idx} was provided")
144 |         return 0 if (self._bitarray[idx // 8] & (1 << (idx % 8))) == 0 else 1
145 | 
146 |     def is_bit_set(self, idx: int) -> bool:
147 |         """Check if the bit idx is set
148 | 
149 |         Args:
150 |             idx (int): The index to check
151 |         Returns:
152 |             bool: True if the bit is set, False otherwise"""
153 |         return bool(self.check_bit(idx))
154 | 
155 |     def set_bit(self, idx: int) -> None:
156 |         """Set the bit at idx to 1
157 | 
158 |         Args:
159 |             idx (int): The index to set"""
160 |         if idx < 0 or idx >= self._size:
161 |             raise IndexError(f"Bitarray index outside of range; index {idx} was provided")
162 |         b = idx // 8
163 |         self._bitarray[b] = self._bitarray[b] | (1 << (idx % 8))
164 | 
165 |     def clear_bit(self, idx: int) -> None:
166 |         """Set the bit at idx to 0
167 | 
168 |         Args:
169 |             idx (int): The index to clear"""
170 |         if idx < 0 or idx >= self._size:
171 |             raise IndexError(f"Bitarray index outside of range; index {idx} was provided")
172 |         b = idx // 8
173 |         self._bitarray[b] = self._bitarray[b] & ~(1 << (idx % 8))
174 | 
175 |     def clear(self):
176 |         """Clear all bits in the bitarray"""
177 |         for i in range(self._size_bytes):
178 |             self._bitarray[i] = 0
179 | 
180 |     def as_string(self):
181 |         """String representation of the bitarray
182 | 
183 |         Returns:
184 |             str: Bitarray representation as a string"""
185 |         return "".join(str(self.check_bit(x)) for x in range(self._size))
186 | 
187 |     def num_bits_set(self) -> int:
188 |         """Number of bits set in the bitarray
189 | 
190 |         Returns:
191 |             int: Number of bits set"""
192 |         return sum(self.check_bit(x) for x in range(self._size))
193 | 
--------------------------------------------------------------------------------
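A short, illustrative exercise of the Bitarray helper above (values chosen arbitrarily):

from probables.utilities import Bitarray

ba = Bitarray(16)         # 16 bits backed by 2 bytes
ba.set_bit(9)
ba[3] = 1                 # __setitem__ accepts only 0 or 1
assert ba.is_bit_set(3) and ba.check_bit(9) == 1
ba.clear_bit(3)
print(ba.as_string())     # 0000000001000000
print(ba.num_bits_set())  # 1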
/pyproject.toml:
--------------------------------------------------------------------------------
1 | [project]
2 | name = "pyprobables"
3 | dynamic = ["version"]
4 | authors = [{ name = "Tyler Barrus", email = "barrust@gmail.com" }]
5 | license = "MIT"
6 | description = "Probabilistic data structures in python"
7 | keywords = [
8 |     "python",
9 |     "probabilistic",
10 |     "data-structure",
11 |     "bloom",
12 |     "filter",
13 |     "count-min",
14 |     "sketch",
15 |     "bloom-filter",
16 |     "count-min-sketch",
17 |     "cuckoo-filter",
18 |     "quotient-filter",
19 | ]
20 | readme = "README.rst"
21 | classifiers = [
22 |     "Development Status :: 5 - Production/Stable",
23 |     "Intended Audience :: Developers",
24 |     "Intended Audience :: Information Technology",
25 |     "Intended Audience :: Science/Research",
26 |     "Topic :: Software Development :: Libraries",
27 |     "Topic :: Utilities",
28 |     "Programming Language :: Python",
29 |     "Programming Language :: Python :: 3",
30 |     "Programming Language :: Python :: 3.9",
31 |     "Programming Language :: Python :: 3.10",
32 |     "Programming Language :: Python :: 3.11",
33 |     "Programming Language :: Python :: 3.12",
34 |     "Programming Language :: Python :: 3.13",
35 | ]
36 | requires-python = ">=3.9"
37 | 
38 | [tool.setuptools.dynamic]
39 | version = { attr = "probables.__version__" }
40 | 
41 | [project.urls]
42 | Homepage = "https://github.com/barrust/pyprobables"
43 | Bug-tracker = "https://github.com/barrust/pyprobables/issues"
44 | Documentation = "https://pyprobables.readthedocs.io/"
45 | 
46 | [tool.setuptools.packages.find]
47 | include = ["probables", "probables.*"]
48 | 
49 | [tool.flit.module]
50 | name = "probables"
51 | 
52 | [tool.pep8]
53 | max-line-length = 120
54 | 
55 | [tool.pycodestyle]
56 | max-line-length = 120
57 | 
58 | [tool.flake8]
59 | max-line-length = 120
60 | 
61 | [tool.isort]
62 | profile = "black"
63 | 
64 | [tool.black]
65 | line-length = 120
66 | target-version = ['py39']
67 | include = '\.pyi?$'
68 | 
69 | [tool.ruff]
70 | include = ["pyproject.toml", "probables/**/*.py", "probables/*.py"]
71 | exclude = [
72 |     ".bzr",
73 |     ".direnv",
74 |     ".eggs",
75 |     ".git",
76 |     ".git-rewrite",
77 |     ".hg",
78 |     ".ipynb_checkpoints",
79 |     ".mypy_cache",
80 |     ".nox",
81 |     ".pants.d",
82 |     ".pyenv",
83 |     ".pytest_cache",
84 |     ".pytype",
85 |     ".ruff_cache",
86 |     ".svn",
87 |     ".tox",
88 |     ".venv",
89 |     ".vscode",
90 |     "__pypackages__",
91 |     "_build",
92 |     "buck-out",
93 |     "build",
94 |     "dist",
95 |     "node_modules",
96 |     "site-packages",
97 |     "venv",
98 | ]
99 | 
100 | # Same as Black.
101 | line-length = 120
102 | indent-width = 4
103 | target-version = "py39"
104 | 
105 | [tool.ruff.lint]
106 | # Enable Pyflakes (`F`) and a subset of the pycodestyle (`E`) codes by default.
107 | # Unlike Flake8, Ruff doesn't enable pycodestyle warnings (`W`) or
108 | # McCabe complexity (`C901`) by default.
109 | select = [
110 |     # pycodestyle
111 |     "E",
112 |     # Pyflakes
113 |     "F",
114 |     # pyupgrade
115 |     "UP",
116 |     # flake8-bugbear
117 |     "B",
118 |     # flake8-simplify
119 |     "SIM",
120 |     # isort
121 |     "I",
122 | ]
123 | ignore = []
124 | 
125 | # Allow fix for all enabled rules (when `--fix` is provided).
126 | fixable = ["ALL"]
127 | unfixable = []
128 | 
129 | # Allow unused variables when underscore-prefixed.
130 | dummy-variable-rgx = "^(_+|(_+[a-zA-Z0-9_]*[a-zA-Z0-9]+?))$"
131 | 
132 | [tool.ruff.format]
133 | # Like Black, use double quotes for strings.
134 | quote-style = "double"
135 | 
136 | # Like Black, indent with spaces, rather than tabs.
137 | indent-style = "space"
138 | 
139 | # Like Black, respect magic trailing commas.
140 | skip-magic-trailing-comma = false
141 | 
142 | # Like Black, automatically detect the appropriate line ending.
143 | line-ending = "auto"
144 | 
145 | # Enable auto-formatting of code examples in docstrings. Markdown,
146 | # reStructuredText code/literal blocks and doctests are all supported.
147 | #
148 | # This is currently disabled by default, but it is planned for this
149 | # to be opt-out in the future.
150 | docstring-code-format = false
151 | 
152 | # Set the line length limit used when formatting code snippets in
153 | # docstrings.
154 | #
155 | # This only has an effect when the `docstring-code-format` setting is
156 | # enabled.
157 | docstring-code-line-length = "dynamic"
158 | 
159 | [build-system]
160 | requires = ["setuptools>=77.0.0", "wheel"]
161 | build-backend = "setuptools.build_meta"
162 | 
--------------------------------------------------------------------------------
/scripts/version_bump.py:
--------------------------------------------------------------------------------
1 | """ Update all the different version variables
2 | """
3 | import os
4 | from datetime import datetime
5 | from functools import wraps
6 | 
7 | 
8 | def read_and_write(func):
9 |     @wraps(func)
10 |     def wrapper(**kwargs):
11 |         path = kwargs["path"]
12 | 
13 |         with open(path) as fobj:
14 |             data = fobj.readlines()
15 | 
16 |         func(data, **kwargs)
17 | 
18 |         with open(path, "w") as fobj:
19 |             fobj.writelines(data)
20 | 
21 |     return wrapper
22 | 
23 | 
24 | @read_and_write
25 | def update_file(data, **kwargs):
26 |     """Parse a file based on the key (k) and update its value with the provided value (v)
27 | 
28 |     Args:
29 |         path (str):
30 |         k (str):
31 |         v (str):
32 |     """
33 |     for i, line in enumerate(data):
34 |         if line.startswith(kwargs["k"]):
35 |             data[i] = """{} = "{}"\n""".format(kwargs["k"], kwargs["v"])
36 | 
37 | 
38 | @read_and_write
39 | def update_citation_file(data, **kwargs):
40 |     """Parse the citation file and update its values with the provided version
41 | 
42 |     Args:
43 |         path (str):
44 |         v (str):
45 |     """
46 |     for i, line in enumerate(data):
47 |         if line.startswith("version:"):
48 |             data[i] = "version: {}\n".format(kwargs["v"])
49 |         if line.startswith("date-released:"):
50 |             data[i] = "date-released: '{}'".format(datetime.today().strftime("%Y-%m-%d"))
51 | 
52 | 
53 | def _parse_args():
54 |     import argparse
55 | 
56 |     parser = argparse.ArgumentParser(description="Automate the version bump of the pyprobables project")
57 |     parser.add_argument("new_version", help="The new version of the package")
58 | 
59 |     return parser.parse_args()
60 | 
61 | 
62 | if __name__ == "__main__":
63 |     args = _parse_args()
64 | 
65 |     # get current path to find where the script is currently
66 |     script_path = os.path.dirname(os.path.abspath(__file__))
67 | 
68 |     module_path = os.path.abspath(f"{script_path}/../")
69 | 
70 |     # update the package __init__ file
71 |     init_file = f"{module_path}/probables/__init__.py"
72 |     update_file(path=init_file, k="__version__", v=args.new_version)
73 | 
74 |     # update the citation file
75 |     citation_file = f"{module_path}/CITATION.cff"
76 |     update_citation_file(path=citation_file, v=args.new_version)
77 | 
--------------------------------------------------------------------------------
/tests/__init__.py:
--------------------------------------------------------------------------------
1 | """Testing Module"""
2 | 
--------------------------------------------------------------------------------
/tests/countingcuckoo_test.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | """Unittest class"""
3 | 
4 | import hashlib
5 | import os
6 | import sys
7 | import unittest
8 | from pathlib import Path
9 | from tempfile import NamedTemporaryFile
10 | 
11 |
this_dir = Path(__file__).parent 12 | sys.path.insert(0, str(this_dir)) 13 | sys.path.insert(0, str(this_dir.parent)) 14 | 15 | from probables import CountingCuckooFilter, CuckooFilterFullError # noqa: E402 16 | from tests.utilities import calc_file_md5 # noqa: E402 17 | 18 | DELETE_TEMP_FILES = True 19 | 20 | 21 | class TestCountingCuckooFilter(unittest.TestCase): 22 | """base Cuckoo Filter test""" 23 | 24 | def test_c_cuckoo_filter_default(self): 25 | """test counting cuckoo filter default properties""" 26 | cko = CountingCuckooFilter() 27 | self.assertEqual(10000, cko.capacity) 28 | self.assertEqual(4, cko.bucket_size) 29 | self.assertEqual(500, cko.max_swaps) 30 | self.assertEqual(2, cko.expansion_rate) 31 | self.assertEqual(True, cko.auto_expand) 32 | 33 | def test_c_cuckoo_filter_diff(self): 34 | """test counting cuckoo filter non-standard properties""" 35 | cko = CountingCuckooFilter( 36 | capacity=100, 37 | bucket_size=2, 38 | max_swaps=5, 39 | expansion_rate=4, 40 | auto_expand=False, 41 | ) 42 | self.assertEqual(100, cko.capacity) 43 | self.assertEqual(2, cko.bucket_size) 44 | self.assertEqual(5, cko.max_swaps) 45 | self.assertEqual(4, cko.expansion_rate) 46 | self.assertEqual(False, cko.auto_expand) 47 | 48 | def test_c_cuckoo_filter_add(self): 49 | """test adding to the counting cuckoo filter""" 50 | cko = CountingCuckooFilter() 51 | cko.add("this is a test") 52 | self.assertEqual(cko.elements_added, 1) 53 | cko.add("this is another test") 54 | self.assertEqual(cko.elements_added, 2) 55 | cko.add("this is yet another test") 56 | self.assertEqual(cko.elements_added, 3) 57 | 58 | def test_c_cuckoo_filter_remove(self): 59 | """test removing from the counting cuckoo filter""" 60 | cko = CountingCuckooFilter() 61 | cko.add("this is a test") 62 | self.assertEqual(cko.elements_added, 1) 63 | cko.add("this is another test") 64 | self.assertEqual(cko.elements_added, 2) 65 | cko.add("this is yet another test") 66 | self.assertEqual(cko.elements_added, 3) 67 | self.assertEqual(cko.unique_elements, 3) 68 | cko.add("this is a test") 69 | cko.add("this is a test") 70 | cko.add("this is a test") 71 | self.assertEqual(cko.elements_added, 6) 72 | self.assertEqual(cko.unique_elements, 3) 73 | 74 | res = cko.remove("this is another test") 75 | self.assertTrue(res) 76 | self.assertEqual(cko.elements_added, 5) 77 | self.assertEqual(cko.unique_elements, 2) 78 | 79 | self.assertTrue(cko.check("this is a test")) 80 | self.assertFalse(cko.check("this is another test")) 81 | self.assertTrue(cko.check("this is yet another test")) 82 | 83 | def test_c_cuckoo_filter_rmv_miss(self): 84 | """test removing from the counting cuckoo filter when not present""" 85 | cko = CountingCuckooFilter() 86 | cko.add("this is a test") 87 | self.assertEqual(cko.elements_added, 1) 88 | cko.add("this is another test") 89 | self.assertEqual(cko.elements_added, 2) 90 | cko.add("this is yet another test") 91 | self.assertEqual(cko.elements_added, 3) 92 | 93 | res = cko.remove("this is still a test") 94 | self.assertFalse(res) 95 | self.assertEqual(cko.elements_added, 3) 96 | self.assertTrue(cko.check("this is a test")) 97 | self.assertTrue(cko.check("this is another test")) 98 | self.assertTrue(cko.check("this is yet another test")) 99 | 100 | def test_c_cuckoo_filter_lots(self): 101 | """test inserting lots into the counting cuckoo filter""" 102 | cko = CountingCuckooFilter(capacity=100, bucket_size=2, max_swaps=100) 103 | for i in range(125): 104 | cko.add(str(i)) 105 | self.assertEqual(cko.elements_added, 125) 106 | 107 | def 
test_c_cuckoo_filter_full(self): 108 | """test inserting until counting cuckoo filter is full""" 109 | 110 | def runner(): 111 | """runner""" 112 | cko = CountingCuckooFilter(capacity=100, bucket_size=2, max_swaps=100, auto_expand=False) 113 | for i in range(175): 114 | cko.add(str(i)) 115 | 116 | self.assertRaises(CuckooFilterFullError, runner) 117 | 118 | def test_c_cuckoo_full_msg(self): 119 | """test exception message for full counting cuckoo filter""" 120 | try: 121 | cko = CountingCuckooFilter(capacity=100, bucket_size=2, max_swaps=100, auto_expand=False) 122 | for i in range(175): 123 | cko.add(str(i)) 124 | except CuckooFilterFullError as ex: 125 | msg = "The CountingCuckooFilter is currently full" 126 | self.assertEqual(str(ex), msg) 127 | else: 128 | self.assertEqual(True, False) 129 | 130 | def test_c_cuckoo_idx(self): 131 | """test that the indexing works correctly for counting cuckoo filter 132 | swap""" 133 | cko = CountingCuckooFilter(capacity=100, bucket_size=2, max_swaps=5) 134 | txt = "this is a test" 135 | idx_1, idx_2, fingerprint = cko._generate_fingerprint_info(txt) 136 | index_1, index_2 = cko._indicies_from_fingerprint(fingerprint) 137 | self.assertEqual(idx_1, index_1) 138 | self.assertEqual(idx_2, index_2) 139 | 140 | def test_c_cuckoo_filter_check(self): 141 | """test checking if element in counting cuckoo filter""" 142 | cko = CountingCuckooFilter() 143 | cko.add("this is a test") 144 | cko.add("this is another test") 145 | cko.add("this is yet another test") 146 | self.assertEqual(cko.check("this is a test"), True) 147 | self.assertEqual(cko.check("this is another test"), True) 148 | self.assertEqual(cko.check("this is yet another test"), True) 149 | self.assertEqual(cko.check("this is not another test"), False) 150 | self.assertEqual(cko.check("this is not a test"), False) 151 | 152 | def test_c_cuckoo_filter_in(self): 153 | """test checking using 'in' counting cuckoo filter""" 154 | cko = CountingCuckooFilter() 155 | cko.add("this is a test") 156 | cko.add("this is another test") 157 | cko.add("this is yet another test") 158 | self.assertEqual("this is a test" in cko, True) 159 | self.assertEqual("this is another test" in cko, True) 160 | self.assertEqual("this is yet another test" in cko, True) 161 | self.assertEqual("this is not another test" in cko, False) 162 | self.assertEqual("this is not a test" in cko, False) 163 | 164 | def test_c_cuckoo_filter_dup_add(self): 165 | """test adding same item multiple times counting cuckoo filter""" 166 | cko = CountingCuckooFilter() 167 | cko.add("this is a test") 168 | cko.add("this is another test") 169 | cko.add("this is yet another test") 170 | self.assertEqual(cko.elements_added, 3) 171 | cko.add("this is a test") 172 | cko.add("this is another test") 173 | cko.add("this is yet another test") 174 | self.assertEqual(cko.elements_added, 6) 175 | self.assertEqual(cko.unique_elements, 3) 176 | 177 | def test_c_cuckoo_filter_l_fact(self): 178 | """test the load factor of the counting cuckoo filter""" 179 | cko = CountingCuckooFilter(capacity=100, bucket_size=2, max_swaps=10) 180 | self.assertEqual(cko.load_factor(), 0.0) 181 | for i in range(50): 182 | cko.add(str(i)) 183 | self.assertEqual(cko.load_factor(), 0.25) 184 | for i in range(50): 185 | cko.add(str(i + 50)) 186 | 187 | if cko.capacity == 200: # self expanded 188 | self.assertEqual(cko.load_factor(), 0.25) 189 | else: 190 | self.assertEqual(cko.load_factor(), 0.50) 191 | 192 | for i in range(100): 193 | cko.add(str(i)) 194 | if cko.capacity == 200: # self 
expanded 195 | self.assertEqual(cko.load_factor(), 0.25) 196 | else: 197 | self.assertEqual(cko.load_factor(), 0.50) 198 | 199 | def test_c_cuckoo_filter_export(self): 200 | """test exporting a counting cuckoo filter""" 201 | md5sum = "6a98c2df1ec9fbb4f75f8e6392696b9b" 202 | with NamedTemporaryFile(dir=os.getcwd(), suffix=".cck", delete=DELETE_TEMP_FILES) as fobj: 203 | cko = CountingCuckooFilter(capacity=1000, bucket_size=2, auto_expand=False) 204 | for i in range(100): 205 | cko.add(str(i)) 206 | 207 | cko.export(fobj.name) 208 | md5_out = calc_file_md5(fobj.name) 209 | self.assertEqual(md5sum, md5_out) 210 | 211 | def test_c_cuckoo_filter_bytes(self): 212 | """test exporting a counting cuckoo filter""" 213 | md5sum = "6a98c2df1ec9fbb4f75f8e6392696b9b" 214 | cko = CountingCuckooFilter(capacity=1000, bucket_size=2, auto_expand=False) 215 | for i in range(100): 216 | cko.add(str(i)) 217 | md5_out = hashlib.md5(bytes(cko)).hexdigest() 218 | self.assertEqual(md5sum, md5_out) 219 | 220 | def test_c_cuckoo_filter_frombytes(self): 221 | """test initializing a counting cuckoo filter frombytes""" 222 | cko = CountingCuckooFilter(capacity=1000, bucket_size=2, auto_expand=False) 223 | for i in range(100): 224 | cko.add(str(i)) 225 | bytes_out = bytes(cko) 226 | 227 | cko2 = CountingCuckooFilter.frombytes(bytes_out) 228 | 229 | self.assertEqual(bytes_out, bytes(cko2)) 230 | for i in range(100): 231 | self.assertTrue(cko2.check(str(i))) 232 | self.assertFalse(cko2.check("999")) 233 | 234 | def test_c_cuckoo_filter_load(self): 235 | """test loading a saved counting cuckoo filter""" 236 | md5sum = "6a98c2df1ec9fbb4f75f8e6392696b9b" 237 | with NamedTemporaryFile(dir=os.getcwd(), suffix=".cck", delete=DELETE_TEMP_FILES) as fobj: 238 | cko = CountingCuckooFilter(capacity=1000, bucket_size=2, auto_expand=False) 239 | for i in range(100): 240 | cko.add(str(i)) 241 | 242 | cko.export(fobj.name) 243 | md5_out = calc_file_md5(fobj.name) 244 | self.assertEqual(md5sum, md5_out) 245 | 246 | ckf = CountingCuckooFilter(filepath=fobj.name) 247 | for i in range(100): 248 | self.assertEqual(ckf.check(str(i)), 1) 249 | 250 | self.assertEqual(1000, ckf.capacity) 251 | self.assertEqual(2, ckf.bucket_size) 252 | self.assertEqual(500, ckf.max_swaps) 253 | self.assertEqual(0.05, ckf.load_factor()) 254 | 255 | def test_c_cuckoo_filter_expand_els(self): 256 | """test out the expansion of the counting cuckoo filter""" 257 | cko = CountingCuckooFilter() 258 | for i in range(200): 259 | cko.add(str(i)) 260 | cko.expand() 261 | for i in range(200): 262 | self.assertGreater(cko.check(str(i)), 0) 263 | self.assertEqual(20000, cko.capacity) 264 | 265 | def test_c_cuckoo_filter_auto_exp(self): 266 | """test inserting until counting cuckoo filter is full""" 267 | cko = CountingCuckooFilter(capacity=100, bucket_size=2, max_swaps=100) 268 | for i in range(375): # this would fail if it doesn't expand 269 | cko.add(str(i)) 270 | self.assertEqual(400, cko.capacity) 271 | self.assertEqual(375, cko.elements_added) 272 | for i in range(375): 273 | self.assertGreater(cko.check(str(i)), 0) 274 | 275 | def test_c_cuckoo_filter_bin(self): 276 | """test the cuckoo bin repr""" 277 | cko = CountingCuckooFilter(capacity=1, bucket_size=2, max_swaps=100) 278 | cko.add("this is a test") 279 | self.assertEqual("[(fingerprint:4280557824 count:1)]", str(cko.buckets[0])) 280 | 281 | def test_c_cuckoo_filter_str(self): 282 | """test the str representation of the counting cuckoo filter""" 283 | cko = CountingCuckooFilter(capacity=100, bucket_size=2, 
max_swaps=100) 284 | for i in range(75): 285 | cko.add(str(i)) 286 | msg = ( 287 | "CountingCuckooFilter:\n" 288 | "\tCapacity: 100\n" 289 | "\tTotal Bins: 200\n" 290 | "\tLoad Factor: 37.5%\n" 291 | "\tInserted Elements: 75\n" 292 | "\tMax Swaps: 100\n" 293 | "\tExpansion Rate: 2\n" 294 | "\tAuto Expand: True" 295 | ) 296 | self.assertEqual(str(cko), msg) 297 | 298 | 299 | class TestCuckooFilterErrorRate(unittest.TestCase): 300 | """Test CountingCuckooFilter using Error Rate""" 301 | 302 | def test_c_cuckoo_filter_er_default(self): 303 | """test cuckoo filter default properties""" 304 | cko = CountingCuckooFilter.init_error_rate(0.00001) 305 | self.assertEqual(10000, cko.capacity) 306 | self.assertEqual(4, cko.bucket_size) 307 | self.assertEqual(500, cko.max_swaps) 308 | self.assertEqual(2, cko.expansion_rate) 309 | self.assertEqual(True, cko.auto_expand) 310 | self.assertEqual(3, cko.fingerprint_size) 311 | self.assertEqual(20, cko.fingerprint_size_bits) 312 | self.assertEqual(0.00001, cko.error_rate) 313 | 314 | def test_c_cuckoo_filter_er_add_check(self): 315 | """test adding to the cuckoo filter""" 316 | cko = CountingCuckooFilter.init_error_rate(0.00001) 317 | cko.add("this is a test") 318 | self.assertEqual(cko.elements_added, 1) 319 | cko.add("this is another test") 320 | self.assertEqual(cko.elements_added, 2) 321 | cko.add("this is yet another test") 322 | self.assertEqual(cko.elements_added, 3) 323 | 324 | # check 325 | self.assertEqual(cko.check("this is a test"), True) 326 | self.assertEqual(cko.check("this is another test"), True) 327 | self.assertEqual(cko.check("this is yet another test"), True) 328 | self.assertEqual(cko.check("this is not another test"), False) 329 | self.assertEqual(cko.check("this is not a test"), False) 330 | 331 | # use of `in` 332 | self.assertEqual("this is a test" in cko, True) 333 | self.assertEqual("this is another test" in cko, True) 334 | self.assertEqual("this is yet another test" in cko, True) 335 | self.assertEqual("this is not another test" in cko, False) 336 | self.assertEqual("this is not a test" in cko, False) 337 | 338 | def test_c_cuckoo_filter_er_export(self): 339 | """test exporting a cuckoo filter""" 340 | md5sum = "f68767bd97b21426f5d2315fb38961ad" 341 | with NamedTemporaryFile(dir=os.getcwd(), suffix=".cko", delete=DELETE_TEMP_FILES) as fobj: 342 | cko = CountingCuckooFilter.init_error_rate(0.00001) 343 | for i in range(1000): 344 | cko.add(str(i)) 345 | cko.export(fobj.name) 346 | md5_out = calc_file_md5(fobj.name) 347 | self.assertEqual(md5sum, md5_out) 348 | 349 | def test_c_cuckoo_filter_load(self): 350 | """test loading a saved cuckoo filter""" 351 | md5sum = "88bc3a08bfc967f9ba60e9d57c21207f" 352 | with NamedTemporaryFile(dir=os.getcwd(), suffix=".cko", delete=DELETE_TEMP_FILES) as fobj: 353 | cko = CountingCuckooFilter.init_error_rate(0.00001) 354 | for i in range(1000): 355 | cko.add(str(i)) 356 | if i % 2 == 1: 357 | cko.add(str(i)) 358 | cko.export(fobj.name) 359 | md5_out = calc_file_md5(fobj.name) 360 | self.assertEqual(md5sum, md5_out) 361 | 362 | ckf = CountingCuckooFilter.load_error_rate(error_rate=0.00001, filepath=fobj.name) 363 | for i in range(1000): 364 | self.assertEqual(ckf.check(str(i)), (i % 2) + 1) 365 | 366 | self.assertEqual(10000, ckf.capacity) 367 | self.assertEqual(4, ckf.bucket_size) 368 | self.assertEqual(500, ckf.max_swaps) 369 | self.assertEqual(2, ckf.expansion_rate) 370 | self.assertEqual(True, ckf.auto_expand) 371 | self.assertEqual(20, ckf.fingerprint_size_bits) 372 | self.assertEqual(3, 
ckf.fingerprint_size)
373 |         self.assertEqual(0.00001, ckf.error_rate)
374 |         self.assertEqual(0.025, ckf.load_factor())
375 | 
376 |     def test_c_cuckoo_filter_er_bytes(self):
377 |         """test exporting a cuckoo filter to bytes"""
378 |         md5sum = "f68767bd97b21426f5d2315fb38961ad"
379 |         cko = CountingCuckooFilter.init_error_rate(0.00001)
380 |         for i in range(1000):
381 |             cko.add(str(i))
382 |         md5_out = hashlib.md5(bytes(cko)).hexdigest()
383 |         self.assertEqual(md5sum, md5_out)
384 | 
385 |     def test_c_cuckoo_filter_er_frombytes(self):
386 |         """test initializing a counting cuckoo filter from bytes"""
387 |         cko = CountingCuckooFilter.init_error_rate(0.00001, capacity=3000)
388 |         for i in range(1000):
389 |             cko.add(str(i))
390 |         bytes_out = bytes(cko)
391 | 
392 |         cko2 = CountingCuckooFilter.frombytes(bytes_out, error_rate=0.00001)
393 | 
394 |         self.assertEqual(bytes_out, bytes(cko2))
395 |         for i in range(1000):
396 |             self.assertTrue(cko2.check(str(i)))
397 |         self.assertFalse(cko2.check("9999"))
398 |         self.assertEqual(cko2.capacity, 3000)
399 | 
400 |     def test_c_cuckoo_filter_er_remove(self):
401 |         """test removing from the counting cuckoo filter"""
402 |         cko = CountingCuckooFilter.init_error_rate(0.00001)
403 |         cko.add("this is a test")
404 |         self.assertEqual(cko.elements_added, 1)
405 |         cko.add("this is another test")
406 |         self.assertEqual(cko.elements_added, 2)
407 |         cko.add("this is yet another test")
408 |         self.assertEqual(cko.elements_added, 3)
409 |         self.assertEqual(cko.unique_elements, 3)
410 |         cko.add("this is a test")
411 |         cko.add("this is a test")
412 |         cko.add("this is a test")
413 |         self.assertEqual(cko.elements_added, 6)
414 |         self.assertEqual(cko.unique_elements, 3)
415 | 
416 |         res = cko.remove("this is another test")
417 |         self.assertTrue(res)
418 |         self.assertEqual(cko.elements_added, 5)
419 |         self.assertEqual(cko.unique_elements, 2)
420 | 
421 |         self.assertTrue(cko.check("this is a test"))
422 |         self.assertFalse(cko.check("this is another test"))
423 |         self.assertTrue(cko.check("this is yet another test"))
424 | 
425 | 
426 | if __name__ == "__main__":
427 |     unittest.main()
428 | 
--------------------------------------------------------------------------------
/tests/expandingbloom_test.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | """Unittest class"""
3 | 
4 | import hashlib
5 | import os
6 | import sys
7 | import unittest
8 | from pathlib import Path
9 | from tempfile import NamedTemporaryFile
10 | 
11 | this_dir = Path(__file__).parent
12 | sys.path.insert(0, str(this_dir))
13 | sys.path.insert(0, str(this_dir.parent))
14 | 
15 | from probables import ExpandingBloomFilter, RotatingBloomFilter  # noqa: E402
16 | from probables.exceptions import RotatingBloomFilterError  # noqa: E402
17 | from tests.utilities import calc_file_md5, different_hash  # noqa: E402
18 | 
19 | DELETE_TEMP_FILES = True
20 | 
21 | 
22 | class TestExpandingBloomFilter(unittest.TestCase):
23 |     """Test ExpandingBloomFilter"""
24 | 
25 |     def test_ebf_init(self):
26 |         """test the initialization of an expanding bloom filter"""
27 |         blm = ExpandingBloomFilter(est_elements=10, false_positive_rate=0.05)
28 |         self.assertEqual(blm.expansions, 0)
29 |         self.assertEqual(blm.false_positive_rate, 0.05)
30 |         self.assertEqual(blm.estimated_elements, 10)
31 |         self.assertEqual(blm.elements_added, 0)
32 | 
33 |     def test_ebf_add_lots(self):
34 |         """test adding "lots" of elements to force the expansion"""
35 |         blm = ExpandingBloomFilter(est_elements=10, false_positive_rate=0.05)
36 |         for i in range(100):
37 |             blm.add(f"{i}", True)
38 |         self.assertEqual(blm.expansions, 9)
39 | 
40 |     def test_ebf_add_lots_diff_hash(self):
41 |         """test adding "lots" of elements to force the expansion using a different hash"""
42 |         blm = ExpandingBloomFilter(est_elements=10, false_positive_rate=0.05, hash_function=different_hash)
43 |         for i in range(100):
44 |             blm.add(f"{i}", True)
45 |         self.assertEqual(blm.expansions, 9)
46 | 
47 |     def test_ebf_add_lots_without_force(self):
48 |         """test adding "lots" of elements without forcing insertion of duplicates"""
49 |         blm = ExpandingBloomFilter(est_elements=10, false_positive_rate=0.05)
50 |         # apparent false positives mean a few additions do not grow the filter
51 |         for i in range(120):
52 |             blm.add(f"{i}")
53 |         self.assertEqual(blm.expansions, 8)
54 |         self.assertEqual(blm.elements_added, 120)
55 | 
56 |     def test_ebf_check(self):
57 |         """ensure that checking the expanding bloom filter works"""
58 |         blm = ExpandingBloomFilter(est_elements=30, false_positive_rate=0.05)
59 |         # expand it out some first!
60 |         for i in range(100):
61 |             blm.add(f"{i}")
62 |         blm.add("this is a test")
63 |         blm.add("this is another test")
64 |         self.assertGreater(blm.expansions, 1)
65 |         self.assertEqual(blm.check("this is a test"), True)
66 |         self.assertEqual(blm.check("this is another test"), True)
67 |         self.assertEqual(blm.check("this is yet another test!"), False)
68 |         self.assertEqual(blm.check("this is not another test"), False)
69 |         self.assertEqual(blm.elements_added, 102)
70 | 
71 |     def test_ebf_contains(self):
72 |         """ensure that "in" functionality for the expanding bloom filter works"""
73 |         blm = ExpandingBloomFilter(est_elements=30, false_positive_rate=0.05)
74 |         # expand it out some first!
75 |         for i in range(100):
76 |             blm.add(f"{i}")
77 |         blm.add("this is a test")
78 |         blm.add("this is another test")
79 |         self.assertGreater(blm.expansions, 1)
80 |         self.assertEqual("this is a test" in blm, True)
81 |         self.assertEqual("this is another test" in blm, True)
82 |         self.assertEqual("this is yet another test!"
in blm, False) 83 | self.assertEqual("this is not another test" in blm, False) 84 | 85 | def test_ebf_push(self): 86 | """ensure that we are able to push new Bloom Filters""" 87 | blm = ExpandingBloomFilter(est_elements=25, false_positive_rate=0.05) 88 | self.assertEqual(blm.expansions, 0) 89 | blm.push() 90 | self.assertEqual(blm.expansions, 1) 91 | self.assertEqual(blm.elements_added, 0) 92 | blm.push() 93 | self.assertEqual(blm.expansions, 2) 94 | self.assertEqual(blm.elements_added, 0) 95 | blm.push() 96 | self.assertEqual(blm.expansions, 3) 97 | self.assertEqual(blm.elements_added, 0) 98 | 99 | def test_ebf_export(self): 100 | """basic expanding Bloom Filter export test""" 101 | with NamedTemporaryFile(dir=os.getcwd(), suffix=".ebf", delete=DELETE_TEMP_FILES) as fobj: 102 | blm = ExpandingBloomFilter(est_elements=25, false_positive_rate=0.05) 103 | blm.export(fobj.name) 104 | self.assertEqual(calc_file_md5(fobj.name), "eb5769ae9babdf7b37d6ce64d58812bc") 105 | 106 | def test_ebf_bytes(self): 107 | """basic expanding Bloom Filter export bytes test""" 108 | blm = ExpandingBloomFilter(est_elements=25, false_positive_rate=0.05) 109 | self.assertEqual(hashlib.md5(bytes(blm)).hexdigest(), "eb5769ae9babdf7b37d6ce64d58812bc") 110 | 111 | def test_ebf_frombytes(self): 112 | """expanding Bloom Filter load bytes test""" 113 | blm = ExpandingBloomFilter(est_elements=25, false_positive_rate=0.05) 114 | for i in range(105): 115 | blm.add(str(i)) 116 | bytes_out = bytes(blm) 117 | 118 | blm2 = ExpandingBloomFilter.frombytes(bytes_out) 119 | self.assertEqual(blm2.expansions, 3) 120 | self.assertEqual(blm2.false_positive_rate, 0.05000000074505806) 121 | self.assertEqual(blm2.estimated_elements, 25) 122 | self.assertEqual(blm2.elements_added, 105) 123 | self.assertEqual(bytes(blm2), bytes(blm)) 124 | 125 | for i in range(105): 126 | self.assertTrue(blm.check(str(i))) 127 | 128 | def test_ebf_import_empty(self): 129 | """test that expanding Bloom Filter is correct on import""" 130 | with NamedTemporaryFile(dir=os.getcwd(), suffix=".ebf", delete=DELETE_TEMP_FILES) as fobj: 131 | blm = ExpandingBloomFilter(est_elements=25, false_positive_rate=0.05) 132 | blm.export(fobj.name) 133 | self.assertEqual(calc_file_md5(fobj.name), "eb5769ae9babdf7b37d6ce64d58812bc") 134 | 135 | blm2 = ExpandingBloomFilter(filepath=fobj.name) 136 | for bloom in blm2._blooms: 137 | self.assertEqual(bloom.elements_added, 0) 138 | 139 | def test_ebf_import_non_empty(self): 140 | """test expanding Bloom Filter import when non-empty""" 141 | with NamedTemporaryFile(dir=os.getcwd(), suffix=".ebf", delete=DELETE_TEMP_FILES) as fobj: 142 | blm = ExpandingBloomFilter(est_elements=25, false_positive_rate=0.05) 143 | for i in range(15): 144 | blm.add(f"{i}") 145 | blm.push() 146 | 147 | blm.export(fobj.name) 148 | 149 | blm2 = ExpandingBloomFilter(filepath=fobj.name) 150 | self.assertEqual(blm2.expansions, 15) 151 | for i in range(15): 152 | self.assertEqual(f"{i}" in blm2, True) 153 | 154 | # check for things that are not there! 
155 | for i in range(99, 125): 156 | self.assertEqual(f"{i}" in blm2, False) 157 | 158 | 159 | class TestRotatingBloomFilter(unittest.TestCase): 160 | """Test RotatingBloomFilter""" 161 | 162 | def test_rbf_init(self): 163 | """test the initialization of an rotating bloom filter""" 164 | blm = RotatingBloomFilter(est_elements=10, false_positive_rate=0.05, max_queue_size=10) 165 | self.assertEqual(blm.expansions, 0) 166 | self.assertEqual(blm.max_queue_size, 10) 167 | 168 | def test_rbf_rotate(self): 169 | """test that the bloom filter rotates the first bloom off the stack""" 170 | blm = RotatingBloomFilter(est_elements=10, false_positive_rate=0.05, max_queue_size=5) 171 | self.assertEqual(blm.expansions, 0) 172 | blm.add("test") 173 | self.assertEqual(blm.expansions, 0) 174 | for i in range(10): 175 | blm.add(f"{i}", force=True) 176 | self.assertEqual(blm.expansions, 1) 177 | self.assertEqual(blm.current_queue_size, 2) 178 | self.assertEqual(blm.check("test"), True) 179 | 180 | for i in range(10, 20): 181 | blm.add(f"{i}", force=True) 182 | self.assertEqual(blm.check("test"), True) 183 | self.assertEqual(blm.current_queue_size, 3) 184 | 185 | for i in range(20, 30): 186 | blm.add(f"{i}", force=True) 187 | self.assertEqual(blm.check("test"), True) 188 | self.assertEqual(blm.current_queue_size, 4) 189 | 190 | for i in range(30, 40): 191 | blm.add(f"{i}", force=True) 192 | self.assertEqual(blm.check("test"), True) 193 | self.assertEqual(blm.current_queue_size, 5) 194 | 195 | for i in range(40, 50): 196 | blm.add(f"{i}", force=True) 197 | self.assertEqual(blm.check("test"), False) # it should roll off 198 | self.assertEqual(blm.current_queue_size, 5) 199 | 200 | self.assertEqual(blm.elements_added, 51) 201 | 202 | def test_rbf_push_pop(self): 203 | """test forcing push and pop""" 204 | blm = RotatingBloomFilter(est_elements=10, false_positive_rate=0.05, max_queue_size=5) 205 | self.assertEqual(blm.current_queue_size, 1) 206 | blm.add("test") 207 | blm.push() 208 | self.assertEqual(blm.current_queue_size, 2) 209 | self.assertEqual("test" in blm, True) 210 | blm.push() 211 | self.assertEqual(blm.current_queue_size, 3) 212 | self.assertEqual("test" in blm, True) 213 | blm.push() 214 | self.assertEqual(blm.current_queue_size, 4) 215 | self.assertEqual("test" in blm, True) 216 | blm.push() 217 | self.assertEqual(blm.current_queue_size, 5) 218 | self.assertEqual("test" in blm, True) 219 | blm.push() 220 | self.assertEqual(blm.current_queue_size, 5) 221 | self.assertEqual("test" in blm, False) 222 | 223 | # test popping 224 | blm.add("that") 225 | blm.pop() 226 | self.assertEqual(blm.current_queue_size, 4) 227 | self.assertEqual("that" in blm, True) 228 | blm.pop() 229 | self.assertEqual(blm.current_queue_size, 3) 230 | self.assertEqual("that" in blm, True) 231 | blm.pop() 232 | self.assertEqual(blm.current_queue_size, 2) 233 | self.assertEqual("that" in blm, True) 234 | blm.pop() 235 | self.assertEqual(blm.current_queue_size, 1) 236 | self.assertEqual("that" in blm, True) 237 | 238 | def test_rbf_pop_exception(self): 239 | """ensure the correct exception is thrown""" 240 | blm = RotatingBloomFilter(est_elements=10, false_positive_rate=0.05, max_queue_size=5) 241 | self.assertRaises(RotatingBloomFilterError, lambda: blm.pop()) 242 | 243 | def test_rbf_pop_exception_msg(self): 244 | """rotating bloom filter error: check the resulting error message""" 245 | blm = RotatingBloomFilter(est_elements=10, false_positive_rate=0.05, max_queue_size=5) 246 | try: 247 | blm.pop() 248 | except 
RotatingBloomFilterError as ex:
249 |             msg = "Popping a Bloom Filter will result in an unusable system!"
250 |             self.assertEqual(str(ex), msg)
251 |         except:  # noqa: E722
252 |             self.assertEqual(True, False)
253 | 
254 |     def test_rfb_basic_export(self):
255 |         """basic rotating Bloom Filter export test"""
256 |         with NamedTemporaryFile(dir=os.getcwd(), suffix=".rbf", delete=DELETE_TEMP_FILES) as fobj:
257 |             blm = RotatingBloomFilter(est_elements=25, false_positive_rate=0.05)
258 |             blm.export(fobj.name)
259 |             self.assertEqual(calc_file_md5(fobj.name), "eb5769ae9babdf7b37d6ce64d58812bc")
260 | 
261 |     def test_rfb_basic_bytes(self):
262 |         """basic rotating Bloom Filter export bytes test"""
263 |         blm = RotatingBloomFilter(est_elements=25, false_positive_rate=0.05)
264 |         self.assertEqual(hashlib.md5(bytes(blm)).hexdigest(), "eb5769ae9babdf7b37d6ce64d58812bc")
265 | 
266 |     def test_rfb_from_bytes(self):
267 |         """basic rotating Bloom Filter frombytes round-trip test"""
268 |         blm = RotatingBloomFilter(est_elements=25, false_positive_rate=0.05, max_queue_size=3)
269 |         for i in range(105):
270 |             blm.add(str(i))
271 |         bytes_out = bytes(blm)
272 | 
273 |         blm2 = RotatingBloomFilter.frombytes(bytes_out, max_queue_size=3)
274 |         self.assertEqual(blm2.expansions, 2)
275 |         self.assertEqual(blm2.false_positive_rate, 0.05000000074505806)
276 |         self.assertEqual(blm2.estimated_elements, 25)
277 |         self.assertEqual(blm2.elements_added, 105)
278 |         self.assertEqual(blm2.current_queue_size, 3)
279 |         self.assertEqual(bytes(blm2), bytes(blm))
280 |         for i in range(105):
281 |             self.assertEqual(blm.check(str(i)), blm2.check(str(i)))
282 | 
283 |     def test_rbf_import_empty(self):
284 |         """test that rotating Bloom Filter is correct on import"""
285 |         with NamedTemporaryFile(dir=os.getcwd(), suffix=".rbf", delete=DELETE_TEMP_FILES) as fobj:
286 |             blm = RotatingBloomFilter(est_elements=25, false_positive_rate=0.05)
287 |             blm.export(fobj.name)
288 |             self.assertEqual(calc_file_md5(fobj.name), "eb5769ae9babdf7b37d6ce64d58812bc")
289 | 
290 |             blm2 = ExpandingBloomFilter(filepath=fobj.name)
291 |             for bloom in blm2._blooms:
292 |                 self.assertEqual(bloom.elements_added, 0)
293 | 
294 |     def test_rbf_non_basic_import(self):
295 |         """test that the imported rotating Bloom filter is correct"""
296 |         with NamedTemporaryFile(dir=os.getcwd(), suffix=".rbf", delete=DELETE_TEMP_FILES) as fobj:
297 |             blm = RotatingBloomFilter(est_elements=25, false_positive_rate=0.05)
298 |             for i in range(15):
299 |                 blm.add(f"{i}")
300 |                 blm.push()
301 |             blm.export(fobj.name)
302 | 
303 |             blm2 = RotatingBloomFilter(filepath=fobj.name)
304 |             # test those that should be popped off...
305 | for i in range(5): 306 | self.assertEqual(f"{i}" in blm2, False) 307 | # test things that would not be popped 308 | for i in range(6, 15): 309 | self.assertEqual(f"{i}" in blm2, True) 310 | self.assertEqual(blm2.current_queue_size, 10) 311 | self.assertEqual(blm2.expansions, 9) 312 | self.assertEqual(blm2.elements_added, 15) 313 | 314 | 315 | if __name__ == "__main__": 316 | unittest.main() 317 | -------------------------------------------------------------------------------- /tests/hashes_test.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | """Unittest class""" 3 | 4 | import hashlib 5 | import sys 6 | import unittest 7 | from pathlib import Path 8 | 9 | this_dir = Path(__file__).parent 10 | sys.path.insert(0, str(this_dir)) 11 | sys.path.insert(0, str(this_dir.parent)) 12 | 13 | from probables.constants import UINT64_T_MAX # noqa: E402 14 | from probables.hashes import ( # noqa: E402 15 | default_fnv_1a, 16 | default_md5, 17 | default_sha256, 18 | fnv_1a_32, 19 | hash_with_depth_bytes, 20 | hash_with_depth_int, 21 | ) 22 | 23 | 24 | class TestHashes(unittest.TestCase): 25 | """Test the different hash algorithms""" 26 | 27 | def test_default_fnv_1a(self): 28 | """test default fnv-1a algorithm""" 29 | this_is_a_test = [ 30 | 4040040117721899264, 31 | 3916497180155386777, 32 | 468410530588793106, 33 | 13781401791305604595, 34 | 321382271269641900, 35 | ] 36 | this_is_also = [ 37 | 7925790280716546811, 38 | 13347851945403505568, 39 | 17775584719969392601, 40 | 10279404995231728046, 41 | 13802534855964835503, 42 | ] 43 | hashes = default_fnv_1a("this is a test", 5) 44 | self.assertEqual(hashes, this_is_a_test) 45 | hashes = default_fnv_1a("this is also a test", 5) 46 | self.assertEqual(hashes, this_is_also) 47 | 48 | def test_default_hash_colision(self): 49 | """test when different strings start with the same hash value (issue 62)""" 50 | h1 = default_fnv_1a("gMPflVXtwGDXbIhP73TX", 5) 51 | h2 = default_fnv_1a("LtHf1prlU1bCeYZEdqWf", 5) 52 | 53 | self.assertEqual(h1[0], h2[0]) # these should match 54 | for i in range(1, 5): 55 | self.assertNotEqual(h1[i], h2[i]) 56 | 57 | def test_fnv_1a_32(self): 58 | """test fnv_1a 32 bit hash""" 59 | hash = fnv_1a_32("this is a test", 0) 60 | self.assertEqual(hash, 2139996864) 61 | hash = fnv_1a_32("this is also a test", 0) 62 | self.assertEqual(hash, 1462718619) 63 | 64 | def test_default_md5(self): 65 | """test default md5 algorithm""" 66 | this_is_a_test = [ 67 | 12174049463882854484, 68 | 10455450501617390806, 69 | 3838261292881602234, 70 | 12102952520950148619, 71 | 12126605867972429202, 72 | ] 73 | this_is_also = [ 74 | 8938037604889355346, 75 | 9361632593818981393, 76 | 15781121455678786382, 77 | 5600686735535066561, 78 | 1353473153840687523, 79 | ] 80 | hashes = default_md5("this is a test", 5) 81 | self.assertEqual(hashes, this_is_a_test) 82 | hashes = default_md5("this is also a test", 5) 83 | self.assertEqual(hashes, this_is_also) 84 | 85 | def test_default_sha256(self): 86 | """test default sha256 algorithm""" 87 | this_is_a_test = [ 88 | 10244166640140130606, 89 | 5650905005272240665, 90 | 14215057275609328422, 91 | 5952353080197385534, 92 | 4990779931033217093, 93 | ] 94 | this_is_also = [ 95 | 4140421647067018332, 96 | 9306548247555387104, 97 | 5672713771950536751, 98 | 8501641957786831066, 99 | 15146689942378126332, 100 | ] 101 | hashes = default_sha256("this is a test", 5) 102 | self.assertEqual(hashes, this_is_a_test) 103 | hashes = default_sha256("this is also a 
test", 5) 104 | self.assertEqual(hashes, this_is_also) 105 | 106 | def test_hash_bytes_decorator(self): 107 | """test making bytes hashing strategy with decorator""" 108 | results = [ 109 | 1164302962920061, 110 | 16735493734761467723, 111 | 18150279091576190542, 112 | 9861778148718857663, 113 | 14008040072978383620, 114 | ] 115 | 116 | @hash_with_depth_bytes 117 | def my_hash(key, depth=1): 118 | """my hash function""" 119 | return hashlib.sha512(key).digest() 120 | 121 | self.assertEqual(my_hash("this is a test", 5), results) 122 | res = my_hash("this is a test", 1) 123 | self.assertEqual(len(res), 1) 124 | self.assertEqual(res[0], results[0]) 125 | 126 | def test_hash_ints_decorator(self): 127 | """test making int hashing strategy with decorator""" 128 | results = [ 129 | 14409285476674975580, 130 | 6203976290780191624, 131 | 5074829385518853901, 132 | 3953072760750514173, 133 | 11782747630324011555, 134 | ] 135 | 136 | @hash_with_depth_int 137 | def my_hash(key, depth=1, encoding="utf-8"): 138 | """my hash function""" 139 | max64mod = UINT64_T_MAX + 1 140 | val = int(hashlib.sha512(key.encode(encoding)).hexdigest(), 16) 141 | return val % max64mod 142 | 143 | self.assertEqual(my_hash("this is a test", 5), results) 144 | res = my_hash("this is a test", 1) 145 | self.assertEqual(len(res), 1) 146 | self.assertEqual(res[0], results[0]) 147 | 148 | def test_default_fnv_1a_bytes(self): 149 | """test default fnv-1a algorithm""" 150 | this_is_a_test = [ 151 | 4040040117721899264, 152 | 3916497180155386777, 153 | 468410530588793106, 154 | 13781401791305604595, 155 | 321382271269641900, 156 | ] 157 | this_is_also = [ 158 | 7925790280716546811, 159 | 13347851945403505568, 160 | 17775584719969392601, 161 | 10279404995231728046, 162 | 13802534855964835503, 163 | ] 164 | hashes = default_fnv_1a(b"this is a test", 5) 165 | self.assertEqual(hashes, this_is_a_test) 166 | hashes = default_fnv_1a(b"this is also a test", 5) 167 | self.assertEqual(hashes, this_is_also) 168 | 169 | def test_default_md5_bytes(self): 170 | """test default md5 algorithm using bytes""" 171 | this_is_a_test = [ 172 | 12174049463882854484, 173 | 10455450501617390806, 174 | 3838261292881602234, 175 | 12102952520950148619, 176 | 12126605867972429202, 177 | ] 178 | this_is_also = [ 179 | 8938037604889355346, 180 | 9361632593818981393, 181 | 15781121455678786382, 182 | 5600686735535066561, 183 | 1353473153840687523, 184 | ] 185 | hashes = default_md5(b"this is a test", 5) 186 | self.assertEqual(hashes, this_is_a_test) 187 | hashes = default_md5(b"this is also a test", 5) 188 | self.assertEqual(hashes, this_is_also) 189 | 190 | def test_default_sha256_bytes(self): 191 | """test default sha256 algorithm using bytes""" 192 | this_is_a_test = [ 193 | 10244166640140130606, 194 | 5650905005272240665, 195 | 14215057275609328422, 196 | 5952353080197385534, 197 | 4990779931033217093, 198 | ] 199 | this_is_also = [ 200 | 4140421647067018332, 201 | 9306548247555387104, 202 | 5672713771950536751, 203 | 8501641957786831066, 204 | 15146689942378126332, 205 | ] 206 | hashes = default_sha256(b"this is a test", 5) 207 | self.assertEqual(hashes, this_is_a_test) 208 | hashes = default_sha256(b"this is also a test", 5) 209 | self.assertEqual(hashes, this_is_also) 210 | 211 | 212 | if __name__ == "__main__": 213 | unittest.main() 214 | -------------------------------------------------------------------------------- /tests/quotientfilter_test.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 
| """Unittest class""" 3 | 4 | import os 5 | import random 6 | import sys 7 | import unittest 8 | from pathlib import Path 9 | from tempfile import NamedTemporaryFile 10 | 11 | from probables.exceptions import QuotientFilterError 12 | 13 | this_dir = Path(__file__).parent 14 | sys.path.insert(0, str(this_dir)) 15 | sys.path.insert(0, str(this_dir.parent)) 16 | from probables import QuotientFilter # noqa: E402 17 | 18 | DELETE_TEMP_FILES = True 19 | 20 | 21 | class TestQuotientFilter(unittest.TestCase): 22 | """Test the default quotient filter implementation""" 23 | 24 | def test_qf_init(self): 25 | "test initializing a blank quotient filter" 26 | qf = QuotientFilter() 27 | 28 | self.assertEqual(qf.bits_per_elm, 16) 29 | self.assertEqual(qf.quotient, 20) 30 | self.assertEqual(qf.remainder, 12) 31 | self.assertEqual(qf.elements_added, 0) 32 | self.assertEqual(qf.num_elements, 1048576) # 2**qf.quotient 33 | 34 | qf = QuotientFilter(quotient=8) 35 | 36 | self.assertEqual(qf.bits_per_elm, 32) 37 | self.assertEqual(qf.quotient, 8) 38 | self.assertEqual(qf.remainder, 24) 39 | self.assertEqual(qf.elements_added, 0) 40 | self.assertEqual(qf.num_elements, 256) # 2**qf.quotient 41 | self.assertTrue(qf.auto_expand) 42 | 43 | qf = QuotientFilter(quotient=24, auto_expand=False) 44 | 45 | self.assertEqual(qf.bits_per_elm, 8) 46 | self.assertEqual(qf.quotient, 24) 47 | self.assertEqual(qf.remainder, 8) 48 | self.assertEqual(qf.elements_added, 0) 49 | self.assertEqual(qf.num_elements, 16777216) # 2**qf.quotient 50 | self.assertFalse(qf.auto_expand) 51 | 52 | # reset auto_expand 53 | qf.auto_expand = True 54 | self.assertTrue(qf.auto_expand) 55 | 56 | def test_qf_add_check(self): 57 | "test that the qf is able to add and check elements" 58 | qf = QuotientFilter(quotient=8) 59 | 60 | for i in range(0, 200, 2): 61 | qf.add(str(i)) 62 | self.assertEqual(qf.elements_added, 100) 63 | self.assertEqual(qf.load_factor, 100 / qf.size) 64 | found_no = False 65 | for i in range(0, 200, 2): 66 | if not qf.check(str(i)): 67 | found_no = True 68 | self.assertFalse(found_no) 69 | 70 | for i in range(1, 200, 2): 71 | print(i) 72 | self.assertFalse(qf.check(str(i))) 73 | 74 | self.assertEqual(qf.elements_added, 100) 75 | 76 | def test_qf_add_check_in(self): 77 | "test that the qf is able to add and check elements using `in`" 78 | qf = QuotientFilter(quotient=8) 79 | 80 | for i in range(0, 200, 2): 81 | qf.add(str(i)) 82 | self.assertEqual(qf.elements_added, 100) 83 | 84 | found_no = False 85 | for i in range(0, 200, 2): 86 | if str(i) not in qf: 87 | found_no = True 88 | self.assertFalse(found_no) 89 | 90 | for i in range(1, 200, 2): 91 | print(i) 92 | self.assertFalse(str(i) in qf) 93 | 94 | self.assertEqual(qf.elements_added, 100) 95 | 96 | def test_qf_init_errors(self): 97 | """test quotient filter initialization errors""" 98 | self.assertRaises(QuotientFilterError, lambda: QuotientFilter(quotient=2)) 99 | self.assertRaises(QuotientFilterError, lambda: QuotientFilter(quotient=32)) 100 | 101 | def test_qf_retrieve_hashes(self): 102 | """test retrieving hashes back from the quotient filter""" 103 | qf = QuotientFilter(quotient=8, auto_expand=False) 104 | hashes = [] 105 | for i in range(255): 106 | hashes.append(qf._hash_func(str(i), 0)) # use the private function here.. 
107 | qf.add(str(i)) 108 | self.assertEqual(qf.size, 256) 109 | self.assertEqual(qf.load_factor, 255 / qf.size) 110 | out_hashes = qf.get_hashes() 111 | self.assertEqual(qf.elements_added, len(out_hashes)) 112 | self.assertEqual(set(hashes), set(out_hashes)) 113 | 114 | def test_qf_resize(self): 115 | """test resizing the quotient filter""" 116 | qf = QuotientFilter(quotient=8, auto_expand=False) 117 | for i in range(200): 118 | qf.add(str(i)) 119 | 120 | self.assertEqual(qf.elements_added, 200) 121 | self.assertEqual(qf.load_factor, 200 / qf.size) 122 | self.assertEqual(qf.quotient, 8) 123 | self.assertEqual(qf.remainder, 24) 124 | self.assertEqual(qf.bits_per_elm, 32) 125 | self.assertFalse(qf.auto_expand) 126 | 127 | self.assertRaises(QuotientFilterError, lambda: qf.resize(7))  # should be too small to fit 128 | 129 | qf.resize(17) 130 | self.assertEqual(qf.elements_added, 200) 131 | self.assertEqual(qf.load_factor, 200 / qf.size) 132 | self.assertEqual(qf.quotient, 17) 133 | self.assertEqual(qf.remainder, 15) 134 | self.assertEqual(qf.bits_per_elm, 16) 135 | # ensure everything is still accessible 136 | for i in range(200): 137 | self.assertTrue(qf.check(str(i))) 138 | 139 | def test_qf_auto_resize(self): 140 | """test resizing the quotient filter automatically""" 141 | qf = QuotientFilter(quotient=8, auto_expand=True) 142 | self.assertEqual(qf.max_load_factor, 0.85) 143 | self.assertEqual(qf.elements_added, 0) 144 | self.assertEqual(qf.load_factor, 0 / qf.size) 145 | self.assertEqual(qf.quotient, 8) 146 | self.assertEqual(qf.remainder, 24) 147 | self.assertEqual(qf.bits_per_elm, 32) 148 | self.assertTrue(qf.auto_expand) 149 | 150 | for i in range(220): 151 | qf.add(str(i)) 152 | 153 | self.assertEqual(qf.max_load_factor, 0.85) 154 | self.assertEqual(qf.elements_added, 220) 155 | self.assertEqual(qf.load_factor, 220 / qf.size) 156 | self.assertEqual(qf.quotient, 9) 157 | self.assertEqual(qf.remainder, 23) 158 | self.assertEqual(qf.bits_per_elm, 32) 159 | 160 | def test_qf_auto_resize_changed_max_load_factor(self): 161 | """test resizing the quotient filter with a different load factor""" 162 | qf = QuotientFilter(quotient=8, auto_expand=True) 163 | self.assertEqual(qf.max_load_factor, 0.85) 164 | self.assertTrue(qf.auto_expand) 165 | qf.max_load_factor = 0.65 166 | self.assertEqual(qf.max_load_factor, 0.65) 167 | 168 | self.assertEqual(qf.elements_added, 0) 169 | self.assertEqual(qf.load_factor, 0 / qf.size) 170 | self.assertEqual(qf.quotient, 8) 171 | self.assertEqual(qf.remainder, 24) 172 | self.assertEqual(qf.bits_per_elm, 32) 173 | self.assertTrue(qf.auto_expand) 174 | 175 | for i in range(200): 176 | qf.add(str(i)) 177 | 178 | self.assertEqual(qf.max_load_factor, 0.85) 179 | self.assertEqual(qf.elements_added, 200) 180 | self.assertEqual(qf.load_factor, 200 / qf.size) 181 | self.assertEqual(qf.quotient, 9) 182 | self.assertEqual(qf.remainder, 23) 183 | self.assertEqual(qf.bits_per_elm, 32) 184 | 185 | def test_qf_resize_errors(self): 186 | """test resizing errors""" 187 | 188 | qf = QuotientFilter(quotient=8, auto_expand=True) 189 | for i in range(200): 190 | qf.add(str(i)) 191 | 192 | self.assertRaises(QuotientFilterError, lambda: qf.resize(quotient=2)) 193 | self.assertRaises(QuotientFilterError, lambda: qf.resize(quotient=32)) 194 | self.assertRaises(QuotientFilterError, lambda: qf.resize(quotient=6)) 195 | 196 | def test_qf_merge(self): 197 | """test merging two quotient filters together""" 198 | qf = QuotientFilter(quotient=8, auto_expand=True) 199 | for i in range(200): 200 | 
qf.add(str(i)) 201 | 202 | fq = QuotientFilter(quotient=8) 203 | for i in range(300, 500): 204 | fq.add(str(i)) 205 | 206 | qf.merge(fq) 207 | 208 | for i in range(200): 209 | self.assertTrue(qf.check(str(i))) 210 | for i in range(200, 300): 211 | self.assertFalse(qf.check(str(i))) 212 | for i in range(300, 500): 213 | self.assertTrue(qf.check(str(i))) 214 | 215 | self.assertEqual(qf.elements_added, 400) 216 | 217 | def test_qf_merge_error(self): 218 | """test unable to merge due to inability to grow""" 219 | qf = QuotientFilter(quotient=8, auto_expand=False) 220 | for i in range(200): 221 | qf.add(str(i)) 222 | 223 | fq = QuotientFilter(quotient=8) 224 | for i in range(300, 400): 225 | fq.add(str(i)) 226 | 227 | self.assertRaises(QuotientFilterError, lambda: qf.merge(fq)) 228 | 229 | # test mismatch hashes 230 | def useless_hash(key, seed) -> int: 231 | return 99999999 232 | 233 | qq = QuotientFilter(quotient=8, hash_function=useless_hash) 234 | qq.add("999") 235 | 236 | self.assertRaises(QuotientFilterError, lambda: fq.merge(qq)) 237 | 238 | def test_qf_remove_missing_elm(self): 239 | """test removing a missing element""" 240 | alpha = [a for a in "abcd.efghij;klm-nopqrs=tuvwxyz"] 241 | qf = QuotientFilter(quotient=7) 242 | for a in alpha: 243 | qf.add(a) 244 | 245 | qf.remove("~") 246 | 247 | missing_vals = [] 248 | for a in alpha: 249 | if not qf.check(a): 250 | missing_vals.append(a) 251 | self.assertListEqual(missing_vals, []) 252 | self.assertTrue(qf.validate_metadata()) 253 | 254 | def test_qf_remove_cluster_start(self): 255 | """test removing a cluster start followed by empty""" 256 | alpha = [a for a in "abcd.efghij;klm-nopqrs=tuvwxyz"] 257 | qf = QuotientFilter(quotient=7) 258 | for a in alpha: 259 | qf.add(a) 260 | 261 | qf.remove(".") 262 | 263 | missing_vals = [] 264 | for a in alpha: 265 | if not qf.check(a): 266 | missing_vals.append(a) 267 | self.assertListEqual(missing_vals, ["."]) 268 | self.assertTrue(qf.validate_metadata()) 269 | 270 | def test_qf_remove_cluster_start_cluster(self): 271 | """test removing a cluster start followed by cluster start""" 272 | alpha = [a for a in "abcd.efghij;klm-nopqrs=tuvwxyz"] 273 | qf = QuotientFilter(quotient=7) 274 | for a in alpha: 275 | qf.add(a) 276 | 277 | qf.remove("-") 278 | 279 | missing_vals = [] 280 | for a in alpha: 281 | if not qf.check(a): 282 | missing_vals.append(a) 283 | self.assertListEqual(missing_vals, ["-"]) 284 | self.assertTrue(qf.validate_metadata()) 285 | 286 | def test_qf_remove_shifted_run_start_followed_by_empty(self): 287 | """test removing a shifted run start followed by empty""" 288 | alpha = [a for a in "abcd.efghij;klm-nopqrs=tuvwxyz"] 289 | qf = QuotientFilter(quotient=7) 290 | for a in alpha: 291 | qf.add(a) 292 | 293 | qf.remove("z") 294 | 295 | missing_vals = [] 296 | for a in alpha: 297 | if not qf.check(a): 298 | missing_vals.append(a) 299 | self.assertListEqual(missing_vals, ["z"]) 300 | self.assertTrue(qf.validate_metadata()) 301 | 302 | def test_qf_remove_shifted_run_start_followed_continuation(self): 303 | """test removing a shifted run start followed by continuation""" 304 | alpha = [a for a in "abcd.efghij;klm-nopqrs=tuvwxyz"] 305 | qf = QuotientFilter(quotient=7) 306 | for a in alpha: 307 | qf.add(a) 308 | 309 | qf.remove("y") 310 | 311 | missing_vals = [] 312 | for a in alpha: 313 | if not qf.check(a): 314 | missing_vals.append(a) 315 | self.assertListEqual(missing_vals, ["y"]) 316 | self.assertTrue(qf.validate_metadata()) 317 | 318 | def 
test_qf_remove_shifted_continuation_followed_run_start(self): 319 | """test removing a shifted continuation followed by run start""" 320 | alpha = [a for a in "abcd.efghij;klm-nopqrs=tuvwxyz"] 321 | qf = QuotientFilter(quotient=7) 322 | for a in alpha: 323 | qf.add(a) 324 | 325 | qf.remove("x") 326 | 327 | missing_vals = [] 328 | for a in alpha: 329 | if not qf.check(a): 330 | missing_vals.append(a) 331 | self.assertListEqual(missing_vals, ["x"]) 332 | self.assertTrue(qf.validate_metadata()) 333 | 334 | def test_qf_remove_shifted_run_start_followed_run_start(self): 335 | """test removing a shifted run start followed by run start""" 336 | alpha = [a for a in "abcd.efghij;klm-nopqrs=tuvwxyz"] 337 | qf = QuotientFilter(quotient=7) 338 | for a in alpha: 339 | qf.add(a) 340 | 341 | qf.remove("a") 342 | 343 | missing_vals = [] 344 | for a in alpha: 345 | if not qf.check(a): 346 | missing_vals.append(a) 347 | self.assertListEqual(missing_vals, ["a"]) 348 | self.assertTrue(qf.validate_metadata()) 349 | 350 | def test_qf_remove_cluster_start_followed_continuation_follow_run_start(self): 351 | """test removing a cluster start followed by continuation putting a run start into a cluster start position""" 352 | alpha = [a for a in "abcd.efghij;klm-nopqrs=tuvwxyz"] 353 | qf = QuotientFilter(quotient=7) 354 | for a in alpha: 355 | qf.add(a) 356 | 357 | qf.remove("d") 358 | 359 | missing_vals = [] 360 | for a in alpha: 361 | if not qf.check(a): 362 | missing_vals.append(a) 363 | self.assertListEqual(missing_vals, ["d"]) 364 | self.assertTrue(qf.validate_metadata()) 365 | 366 | def test_qf_remove_full(self): 367 | """Test removing all elements, but find each one after each removal""" 368 | alpha = [a for a in "abcd.efghij;klm-nopqrs=tuvwxyz"] 369 | qf = QuotientFilter(quotient=7) 370 | for a in alpha: 371 | # each key hashes to a quotient (hash >> r) and a remainder (hash & ((1 << r) - 1)), 372 | # which together determine the slot it occupies in the filter 373 | qf.add(a) 374 | 375 | for a in alpha: 376 | self.assertTrue(qf.check(a), "failed to insert") 377 | 378 | while alpha: 379 | # after each removal, every remaining element must still be found 380 | val = alpha.pop(0) 381 | qf.remove(val) 382 | missing_vals = [] 383 | for a in alpha: 384 | if not qf.check(a): 385 | missing_vals.append(a) 386 | self.assertListEqual(missing_vals, []) 387 | self.assertTrue(qf.validate_metadata()) 388 | 389 | def test_qf_remove_full_random(self): 390 | """Test removing all elements, but in a random order""" 391 | alpha = [a for a in "abcd.efghij;klm-nopqrs=tuvwxyz"] 392 | qf = QuotientFilter(quotient=7) 393 | for a in alpha: 394 | qf.add(a) 395 | 396 | for a in alpha: 397 | self.assertTrue(qf.check(a), "failed to insert") 398 | self.assertTrue(qf.validate_metadata()) 399 | 400 | while alpha: 401 | # remove in random order; every remaining element must still be found 402 | idx = random.randrange(len(alpha)) 403 | val = alpha.pop(idx) 404 | qf.remove(val) 405 | missing_vals = [] 406 | for a in alpha: 407 | if not qf.check(a): 408 | missing_vals.append(a) 409 | self.assertListEqual(missing_vals, []) 410 | self.assertTrue(qf.validate_metadata()) 411 | 412 | def test_qf_remove_full_random_take_2(self): 413 | """Test removing all elements, but in a random order - take 2""" 414 | alpha = [a for a in "abcd.efghij;klm-nopqrs=tuvwxyz"] 415 | qf = QuotientFilter(quotient=7) 416 | for a in alpha: 417 | qf.add(a) 418 | 419 | for a in alpha: 420 | self.assertTrue(qf.check(a), "failed to insert") 421 | 422 | while alpha: 423 | # remove in another random order; every remaining element must still be found 424 | idx = random.randrange(len(alpha)) 425 | val = alpha.pop(idx) 426 | qf.remove(val) 427 | missing_vals = [] 428 | for a in alpha: 429 | if not 
qf.check(a): 430 | missing_vals.append(a) 431 | self.assertListEqual(missing_vals, []) 432 | self.assertTrue(qf.validate_metadata()) 433 | 434 | def test_quotient_filter_print_empty(self): 435 | """Test printing a human-readable dump of an empty quotient filter""" 436 | qf = QuotientFilter(quotient=7) 437 | with NamedTemporaryFile(dir=os.getcwd(), suffix=".txt", delete=DELETE_TEMP_FILES, mode="wt") as fobj: 438 | qf.print(file=fobj.file) 439 | fobj.flush() 440 | 441 | with open(fobj.name) as fobj: 442 | data = fobj.readlines() 443 | data = [x.strip() for x in data] 444 | self.assertEqual(data[0], "idx\t--\tO-C-S\tStatus") 445 | for i in range(2, len(data)): 446 | self.assertEqual(data[i], f"{i-2}\t--\t0-0-0\tEmpty") 447 | 448 | def test_quotient_filter_print(self): 449 | """Test printing a human-readable dump of a populated quotient filter""" 450 | alpha = [a for a in "abcd.efghij;klm-nopqrs=tuvwxyz"] 451 | qf = QuotientFilter(quotient=7) 452 | for a in alpha: 453 | qf.add(a) 454 | 455 | with NamedTemporaryFile(dir=os.getcwd(), suffix=".txt", delete=DELETE_TEMP_FILES, mode="wt") as fobj: 456 | qf.print(file=fobj.file) 457 | fobj.flush() 458 | 459 | with open(fobj.name) as fobj: 460 | data = fobj.readlines() 461 | data = [x.strip() for x in data] 462 | self.assertEqual(data[0], "idx\t--\tO-C-S\tStatus") 463 | self.assertEqual(data[22], "20\t--\t1-0-0\tCluster Start") 464 | self.assertEqual(data[23], "21\t--\t1-0-0\tCluster Start") 465 | 466 | self.assertEqual(data[114], "112\t--\t1-0-0\tCluster Start") 467 | self.assertEqual(data[115], "113\t--\t1-1-1\tContinuation") 468 | self.assertEqual(data[116], "114\t--\t1-0-1\tRun Start") 469 | self.assertEqual(data[10], "8\t--\t0-1-1\tContinuation") 470 | self.assertEqual(data[11], "9\t--\t0-0-1\tRun Start") 471 | -------------------------------------------------------------------------------- /tests/test_utilities.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | """probables utilities tests""" 3 | 4 | import os 5 | import sys 6 | import unittest 7 | from pathlib import Path 8 | from tempfile import NamedTemporaryFile 9 | 10 | this_dir = Path(__file__).parent 11 | sys.path.insert(0, str(this_dir)) 12 | sys.path.insert(0, str(this_dir.parent)) 13 | 14 | from probables.utilities import Bitarray, MMap, get_x_bits, is_hex_string, is_valid_file, resolve_path  # noqa: E402 15 | from tests.utilities import different_hash  # noqa: E402 16 | 17 | DELETE_TEMP_FILES = True 18 | 19 | 20 | class TestProbablesUtilities(unittest.TestCase): 21 | """test the utilities for pyprobables""" 22 | 23 | def test_is_hex(self): 24 | """test the is valid hex function""" 25 | self.assertTrue(is_hex_string("123467890abcdef")) 26 | self.assertTrue(is_hex_string("123467890ABCDEF")) 27 | self.assertFalse(is_hex_string("123467890abcdfq")) 28 | self.assertFalse(is_hex_string("123467890ABCDEFQ")) 29 | 30 | def test_is_valid_file(self): 31 | """test the is valid file function""" 32 | with NamedTemporaryFile(dir=os.getcwd(), suffix=".rbf", delete=DELETE_TEMP_FILES) as fobj: 33 | self.assertFalse(is_valid_file(None)) 34 | self.assertFalse(is_valid_file("./file_doesnt_exist.txt")) 35 | with open(fobj.name, "w"): 36 | pass 37 | self.assertTrue(is_valid_file(fobj.name)) 38 | 39 | def test_get_x_bits(self): 40 | """test the get x bits function""" 41 | for i in range(8): 42 | res = get_x_bits(i, 4, 2, True) 43 | self.assertEqual(res, i % 4) 44 | for i in range(8): 45 | res = get_x_bits(i, 4, 2, 
False) 46 | if i < 4: 47 | self.assertEqual(res, 0) 48 | else: 49 | self.assertEqual(res, 1) 50 | 51 | def test_get_x_bits_large(self): 52 | """test it on much larger numbers""" 53 | res = different_hash("this is a test", 1)[0] 54 | # 1010100101011011100100010101010011110000001010011010000101001011 55 | tmp1 = get_x_bits(res, 64, 32, True) 56 | tmp2 = get_x_bits(res, 64, 32, False) 57 | self.assertEqual(4029260107, tmp1) 58 | self.assertEqual(2841350484, tmp2) 59 | 60 | tmp1 = get_x_bits(res, 64, 16, True) 61 | tmp2 = get_x_bits(res, 64, 16, False) 62 | self.assertEqual(41291, tmp1) 63 | self.assertEqual(43355, tmp2) 64 | 65 | tmp1 = get_x_bits(res, 64, 8, True) 66 | tmp2 = get_x_bits(res, 64, 8, False) 67 | self.assertEqual(75, tmp1) 68 | self.assertEqual(169, tmp2) 69 | 70 | tmp1 = get_x_bits(res, 64, 4, True) 71 | tmp2 = get_x_bits(res, 64, 4, False) 72 | self.assertEqual(11, tmp1) 73 | self.assertEqual(10, tmp2) 74 | 75 | tmp1 = get_x_bits(res, 64, 2, True) 76 | tmp2 = get_x_bits(res, 64, 2, False) 77 | self.assertEqual(3, tmp1) 78 | self.assertEqual(2, tmp2) 79 | 80 | tmp1 = get_x_bits(res, 64, 1, True) 81 | tmp2 = get_x_bits(res, 64, 1, False) 82 | self.assertEqual(1, tmp1) 83 | self.assertEqual(1, tmp2) 84 | 85 | def test_mmap_functionality(self): 86 | """test some of the MMap class functionality""" 87 | data = b"this is a test of the MMap system!" 88 | with NamedTemporaryFile(dir=os.getcwd(), suffix=".rbf", delete=DELETE_TEMP_FILES) as fobj: 89 | with open(fobj.name, "wb") as fobj: 90 | fobj.write(data) 91 | m = MMap(fobj.name) 92 | self.assertFalse(m.closed) 93 | self.assertEqual(data, m.read()) 94 | m.seek(0, os.SEEK_SET) 95 | self.assertEqual(data[:5], m.read(5)) 96 | self.assertEqual(data[5:], m.read()) 97 | m.close() 98 | self.assertTrue(m.closed) 99 | 100 | def test_resolve_path(self): 101 | """test that resolve_path returns an absolute path""" 102 | p = resolve_path("~") 103 | self.assertTrue(p.is_absolute()) 104 | 105 | with NamedTemporaryFile(dir=os.getcwd(), suffix=".rbf", delete=DELETE_TEMP_FILES) as fobj: 106 | with open(fobj.name, "w"): 107 | pass 108 | p2 = resolve_path(f"./{fobj.name}") 109 | self.assertTrue(p2.is_absolute()) 110 | 111 | def test_bitarray(self): 112 | """test bit array basic operations""" 113 | ba = Bitarray(100) 114 | 115 | self.assertEqual(ba.size, 100) 116 | self.assertEqual(ba.size_bytes, 13) 117 | for i in range(ba.size_bytes): 118 | self.assertEqual(0, ba.bitarray[i]) 119 | 120 | # test setting bits 121 | for i in range(33): 122 | ba.set_bit(i * 3) 123 | 124 | self.assertEqual( 125 | ba.as_string(), 126 | "1001001001001001001001001001001001001001001001001001001001001001001001001001001001001001001001001000", 127 | ) 128 | self.assertEqual(ba.num_bits_set(), 33) 129 | self.assertTrue(ba.is_bit_set(3)) 130 | self.assertFalse(ba.is_bit_set(4)) 131 | self.assertEqual(ba[0], 1) 132 | self.assertEqual(ba[1], 0) 133 | 134 | # test clearing bits 135 | for i in range(33): 136 | ba.clear_bit(i * 3) 137 | 138 | self.assertEqual( 139 | ba.as_string(), 140 | "0000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000", 141 | ) 142 | 143 | for i in range(33): 144 | ba.set_bit(i * 3) 145 | self.assertEqual( 146 | ba.as_string(), 147 | "1001001001001001001001001001001001001001001001001001001001001001001001001001001001001001001001001000", 148 | ) 149 | 150 | self.assertEqual(ba[2], 0) 151 | ba[2] = 1 152 | self.assertEqual(ba[2], 1) 153 | ba[2] = 0 154 | self.assertEqual(ba[2], 0) 155 | 156 | ba.clear() 157 | self.assertEqual( 
158 | ba.as_string(), 159 | "0000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000", 160 | ) 161 | 162 | def test_bitarray_invalid_idx(self): 163 | """test bit array operations with invalid indexes, types, and values""" 164 | self.assertRaises(TypeError, lambda: Bitarray("100")) 165 | self.assertRaises(ValueError, lambda: Bitarray(-100)) 166 | ba = Bitarray(10) 167 | self.assertRaises(IndexError, lambda: ba.set_bit(12)) 168 | self.assertRaises(IndexError, lambda: ba.set_bit(-1)) 169 | self.assertRaises(IndexError, lambda: ba.check_bit(-1)) 170 | self.assertRaises(IndexError, lambda: ba.check_bit(12)) 171 | self.assertRaises(IndexError, lambda: ba.clear_bit(-1)) 172 | self.assertRaises(IndexError, lambda: ba.clear_bit(12)) 173 | 174 | self.assertRaises(IndexError, lambda: ba[-1]) 175 | self.assertRaises(IndexError, lambda: ba[12]) 176 | 177 | def test_set(idx, val): 178 | ba[idx] = val 179 | 180 | self.assertRaises(IndexError, lambda: test_set(-1, 0)) 181 | self.assertRaises(IndexError, lambda: test_set(12, 0)) 182 | # set a non-valid bit value 183 | self.assertRaises(ValueError, lambda: test_set(1, 5)) 184 | self.assertRaises(ValueError, lambda: test_set(12, -1)) 185 | 186 | 187 | if __name__ == "__main__": 188 | unittest.main() 189 | -------------------------------------------------------------------------------- /tests/utilities.py: -------------------------------------------------------------------------------- 1 | """utility functions""" 2 | 3 | from hashlib import md5 4 | from pathlib import Path 5 | from typing import Union 6 | 7 | from probables.constants import UINT64_T_MAX 8 | from probables.hashes import KeyT 9 | 10 | 11 | def calc_file_md5(filename: Union[str, Path]) -> str: 12 | """calc the md5 of a file""" 13 | with open(filename, "rb") as filepointer: 14 | res = filepointer.read() 15 | return md5(res).hexdigest() 16 | 17 | 18 | def different_hash(key: KeyT, depth: int) -> list[int]: 19 | """the default fnv-1a hashing routine, but different""" 20 | 21 | def __fnv_1a(key: KeyT) -> int: 22 | """64 bit fnv-1a hash""" 23 | hval = 14695981039346656074  # offset basis intentionally differs from the standard FNV-1a value 24 | fnv_64_prime = 1099511628211 25 | tmp = list(key) if not isinstance(key, str) else list(map(ord, key)) 26 | for t_str in tmp: 27 | hval ^= t_str 28 | hval *= fnv_64_prime 29 | hval &= UINT64_T_MAX 30 | return hval 31 | 32 | res = [] 33 | for _ in range(depth): 34 | res.append(__fnv_1a(key)) 35 | return res 36 | --------------------------------------------------------------------------------