├── .editorconfig ├── .github ├── dependabot.yml └── workflows │ ├── publish.yml │ ├── python-package.yml │ └── ruff.yml ├── .gitignore ├── .pre-commit-config.yaml ├── .pylintrc ├── .readthedocs.yaml ├── CHANGELOG.md ├── CITATION.cff ├── LICENSE ├── README.rst ├── codecov.yml ├── docs ├── Makefile ├── requirements.txt └── source │ ├── _static │ └── custom.css │ ├── code.rst │ ├── conf.py │ ├── index.rst │ └── quickstart.rst ├── probables ├── __init__.py ├── blooms │ ├── __init__.py │ ├── bloom.py │ ├── countingbloom.py │ ├── expandingbloom.py │ └── py.typed ├── constants.py ├── countminsketch │ ├── __init__.py │ ├── countminsketch.py │ └── py.typed ├── cuckoo │ ├── __init__.py │ ├── countingcuckoo.py │ ├── cuckoo.py │ └── py.typed ├── exceptions.py ├── hashes.py ├── py.typed ├── quotientfilter │ ├── __init__.py │ ├── py.typed │ └── quotientfilter.py └── utilities.py ├── pyproject.toml ├── scripts └── version_bump.py └── tests ├── __init__.py ├── bloom_test.py ├── countingbloom_test.py ├── countingcuckoo_test.py ├── countminsketch_test.py ├── cuckoo_test.py ├── expandingbloom_test.py ├── hashes_test.py ├── quotientfilter_test.py ├── test_utilities.py └── utilities.py /.editorconfig: -------------------------------------------------------------------------------- 1 | root = true 2 | 3 | [*] 4 | charset = utf-8 5 | indent_style = space 6 | indent_size = 4 7 | insert_final_newline = true 8 | end_of_line = lf 9 | 10 | [*.{yml,yaml}] 11 | indent_style = space 12 | indent_size = 2 13 | -------------------------------------------------------------------------------- /.github/dependabot.yml: -------------------------------------------------------------------------------- 1 | # To get started with Dependabot version updates, you'll need to specify which 2 | # package ecosystems to update and where the package manifests are located. 
3 | # Please see the documentation for all configuration options: 4 | # https://docs.github.com/code-security/dependabot/dependabot-version-updates/configuration-options-for-the-dependabot.yml-file 5 | 6 | version: 2 7 | updates: 8 | 9 | - package-ecosystem: "github-actions" 10 | directory: "/" 11 | schedule: 12 | # Check for updates to GitHub Actions every week 13 | interval: "weekly" 14 | -------------------------------------------------------------------------------- /.github/workflows/publish.yml: -------------------------------------------------------------------------------- 1 | # This workflow will upload a Python Package using Twine when a release is created 2 | # For more information see: https://help.github.com/en/actions/language-and-framework-guides/using-python-with-github-actions#publishing-to-package-registries 3 | 4 | name: Upload Python Package 5 | 6 | on: 7 | release: 8 | types: [created] 9 | 10 | jobs: 11 | deploy: 12 | 13 | runs-on: ubuntu-latest 14 | 15 | steps: 16 | - uses: actions/checkout@v4 17 | - name: Set up Python 18 | uses: actions/setup-python@v5 19 | with: 20 | python-version: '3.x' 21 | - name: Install dependencies 22 | run: | 23 | python -m pip install --upgrade pip 24 | if [ -f requirements.txt ]; then pip install -r requirements.txt; fi 25 | pip install twine build 26 | - name: Build and publish 27 | env: 28 | TWINE_USERNAME: __token__ 29 | TWINE_PASSWORD: ${{ secrets.PYPI_PASSWORD }} 30 | run: | 31 | python -m build 32 | twine upload dist/* 33 | -------------------------------------------------------------------------------- /.github/workflows/python-package.yml: -------------------------------------------------------------------------------- 1 | # This workflow will install Python dependencies, run tests, and lint with a variety of Python versions 2 | # For more information see: https://help.github.com/actions/language-and-framework-guides/using-python-with-github-actions 3 | 4 | name: Python package 5 | 6 | on: [push, pull_request] 7 | 8 | jobs: 9 | build: 10 | 11 | runs-on: ubuntu-latest 12 | strategy: 13 | matrix: 14 | python-version: ['3.9', '3.10', '3.11', '3.12', '3.13'] 15 | 16 | steps: 17 | - uses: actions/checkout@v4 18 | - name: Set up Python ${{ matrix.python-version }} 19 | uses: actions/setup-python@v5 20 | with: 21 | python-version: ${{ matrix.python-version }} 22 | - name: Install dependencies 23 | run: | 24 | python -m pip install --upgrade pip 25 | python -m pip install flake8 pytest pytest-cov 26 | if [ -f requirements.txt ]; then pip install -r requirements.txt; fi 27 | - name: Lint with flake8 28 | run: | 29 | # stop the build if there are Python syntax errors or undefined names 30 | flake8 probables/ --count --select=E9,F63,F7,F82 --show-source --statistics 31 | # exit-zero treats all errors as warnings.
The GitHub editor is 127 chars wide 32 | flake8 probables/ --count --exit-zero --max-complexity=11 --max-line-length=127 --statistics 33 | - name: Test with pytest 34 | run: | 35 | # Run tests while also generating coverage statistics 36 | pytest --cov=./ --cov-report=xml 37 | - name: Upload coverage to Codecov 38 | uses: codecov/codecov-action@v5 39 | with: 40 | files: ./coverage.xml 41 | fail_ci_if_error: false 42 | 43 | build-verification: 44 | 45 | runs-on: ubuntu-latest 46 | steps: 47 | - uses: actions/checkout@v4 48 | - uses: actions/setup-python@v5 49 | with: 50 | python-version: '3.x' 51 | - name: Build and check twine 52 | run: | 53 | python -m pip install --upgrade pip 54 | python -m pip install build twine 55 | python -m build 56 | twine check dist/* 57 | 58 | Lint-black: 59 | runs-on: ubuntu-latest 60 | steps: 61 | - uses: actions/checkout@v4 62 | - uses: psf/black@stable 63 | with: 64 | # src: "./probables" 65 | version: "22.8.0" 66 | -------------------------------------------------------------------------------- /.github/workflows/ruff.yml: -------------------------------------------------------------------------------- 1 | name: Ruff 2 | on: [workflow_dispatch, pull_request] 3 | jobs: 4 | build: 5 | runs-on: ubuntu-latest 6 | steps: 7 | - uses: actions/checkout@v4 8 | - name: Install Python 9 | uses: actions/setup-python@v5 10 | with: 11 | python-version: "3.13" 12 | - uses: astral-sh/ruff-action@v3 13 | with: 14 | args: "check --fix" 15 | continue-on-error: false -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | ############################################### 2 | # Project Specific 3 | ############################################### 4 | *.blm 5 | *.cms 6 | *.dat 7 | 8 | ############################################### 9 | # Python 10 | ############################################### 11 | 12 | # Byte-compiled / optimized / DLL files 13 | __pycache__/ 14 | *.py[cod] 15 | *$py.class 16 | 17 | # C extensions 18 | *.so 19 | 20 | # Distribution / packaging 21 | .Python 22 | env/ 23 | build/ 24 | develop-eggs/ 25 | dist/ 26 | downloads/ 27 | eggs/ 28 | .eggs/ 29 | lib/ 30 | lib64/ 31 | parts/ 32 | sdist/ 33 | var/ 34 | wheels/ 35 | *.egg-info/ 36 | .installed.cfg 37 | *.egg 38 | 39 | # PyInstaller 40 | # Usually these files are written by a python script from a template 41 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
42 | *.manifest 43 | *.spec 44 | 45 | # Installer logs 46 | pip-log.txt 47 | pip-delete-this-directory.txt 48 | 49 | # Unit test / coverage reports 50 | htmlcov/ 51 | .tox/ 52 | .coverage 53 | .coverage.* 54 | .cache 55 | nosetests.xml 56 | coverage.xml 57 | *.cover 58 | .hypothesis/ 59 | .pytest_cache/ 60 | 61 | # Translations 62 | *.mo 63 | *.pot 64 | 65 | # Django stuff: 66 | *.log 67 | local_settings.py 68 | 69 | # Flask stuff: 70 | instance/ 71 | .webassets-cache 72 | 73 | # Scrapy stuff: 74 | .scrapy 75 | 76 | # Sphinx documentation 77 | docs/_build/ 78 | 79 | # PyBuilder 80 | target/ 81 | 82 | # Jupyter Notebook 83 | .ipynb_checkpoints 84 | 85 | # pyenv 86 | .python-version 87 | 88 | # celery beat schedule file 89 | celerybeat-schedule 90 | 91 | # SageMath parsed files 92 | *.sage.py 93 | 94 | # dotenv 95 | .env 96 | 97 | # virtualenv 98 | .venv 99 | venv/ 100 | ENV/ 101 | Pipfile* 102 | 103 | # Spyder project settings 104 | .spyderproject 105 | .spyproject 106 | 107 | # Rope project settings 108 | .ropeproject 109 | 110 | # mkdocs documentation 111 | /site 112 | 113 | # mypy 114 | .mypy_cache/ 115 | 116 | 117 | ############################################### 118 | # Operating Systems 119 | ############################################### 120 | # Windows thumbnail cache files 121 | Thumbs.db 122 | Thumbs.db:encryptable 123 | ehthumbs.db 124 | ehthumbs_vista.db 125 | 126 | # Dump file 127 | *.stackdump 128 | 129 | # Folder config file 130 | [Dd]esktop.ini 131 | 132 | # Recycle Bin used on file shares 133 | $RECYCLE.BIN/ 134 | 135 | # Windows Installer files 136 | *.cab 137 | *.msi 138 | *.msix 139 | *.msm 140 | *.msp 141 | 142 | # Windows shortcuts 143 | *.lnk 144 | 145 | # 146 | # MacOS 147 | # 148 | 149 | # General 150 | .DS_Store 151 | .AppleDouble 152 | .LSOverride 153 | 154 | # Icon must end with two \r 155 | Icon 156 | 157 | 158 | # Thumbnails 159 | ._* 160 | 161 | # Files that might appear in the root of a volume 162 | .DocumentRevisions-V100 163 | .fseventsd 164 | .Spotlight-V100 165 | .TemporaryItems 166 | .Trashes 167 | .VolumeIcon.icns 168 | .com.apple.timemachine.donotpresent 169 | 170 | # Directories potentially created on remote AFP share 171 | .AppleDB 172 | .AppleDesktop 173 | Network Trash Folder 174 | Temporary Items 175 | .apdisk 176 | 177 | # 178 | # Linux 179 | # 180 | *~ 181 | 182 | # temporary files which can be created if a process still has a handle open of a deleted file 183 | .fuse_hidden* 184 | 185 | # KDE directory preferences 186 | .directory 187 | 188 | # Linux trash folder which might appear on any partition or disk 189 | .Trash-* 190 | 191 | # .nfs files are created when an open file is removed but is still being accessed 192 | .nfs* 193 | -------------------------------------------------------------------------------- /.pre-commit-config.yaml: -------------------------------------------------------------------------------- 1 | repos: 2 | - repo: https://github.com/pre-commit/pre-commit-hooks 3 | rev: v3.2.0 4 | hooks: 5 | - id: check-yaml 6 | - id: check-toml 7 | - id: check-json 8 | - id: end-of-file-fixer 9 | - id: trailing-whitespace 10 | - id: debug-statements 11 | 12 | - repo: https://github.com/psf/black 13 | rev: 20.8b1 14 | hooks: 15 | - id: black 16 | exclude: ^docs/ 17 | - repo: https://github.com/pycqa/isort 18 | rev: 5.6.3 19 | hooks: 20 | - id: isort 21 | exclude: ^docs/ 22 | -------------------------------------------------------------------------------- /.readthedocs.yaml: 
-------------------------------------------------------------------------------- 1 | # .readthedocs.yaml 2 | # Read the Docs configuration file 3 | # See https://docs.readthedocs.io/en/stable/config-file/v2.html for details 4 | 5 | # Required 6 | version: 2 7 | 8 | # Set the version of Python and other tools you might need 9 | build: 10 | os: ubuntu-22.04 11 | tools: 12 | python: "3.11" 13 | 14 | # Build documentation in the docs/ directory with Sphinx 15 | sphinx: 16 | configuration: docs/source/conf.py 17 | 18 | # We recommend specifying your dependencies to enable reproducible builds: 19 | # https://docs.readthedocs.io/en/stable/guides/reproducible-builds.html 20 | python: 21 | install: 22 | - requirements: docs/requirements.txt -------------------------------------------------------------------------------- /CHANGELOG.md: -------------------------------------------------------------------------------- 1 | # PyProbables Changelog 2 | 3 | ### Version 0.6.1 4 | 5 | * Quotient Filter: 6 | * Add ability to get hashes from the filter, either as a list or as a generator 7 | * Add quotient filter expand capability, auto and on request 8 | * Add QuotientFilterError exception 9 | * Add merge functionality 10 | * Add retrieve hashes from the filter 11 | * Add resize filter, automatically or programmatically 12 | * Add merging two filters into one 13 | * Add removal of an element from the filter 14 | * Count-Min Sketch: 15 | * Fix bug in elements added calculation when joining Count-Min Sketches; see [PR #119](https://github.com/barrust/pyprobables/pull/119); Thanks [@cunla](https://github.com/cunla) 16 | 17 | ### Version 0.6.0 18 | 19 | * Add `QuotientFilter` implementation; [see issue #37](https://github.com/barrust/pyprobables/issues/37) 20 | * Add `bitarray` implementation 21 | * Bitwise operations in lieu of modulo calculations 22 | 23 | ### Version 0.5.9 24 | 25 | * Add `py.typed` files so that mypy will find type annotations 26 | * Drop support for python `3.6` and `3.7` 27 | 28 | ### Version 0.5.8 29 | 30 | * Make the `mmap` utility class Windows compatible; see [PR #106](https://github.com/barrust/pyprobables/pull/106); Thanks [@leonhma](https://github.com/leonhma) 31 | 32 | ### Version 0.5.7 33 | 34 | * Update Build System and update project metadata 35 | * Better support for `resolve_path` in passed filenames 36 | * Remove Python 3.5 support 37 | * Pylint inspired updates 38 | 39 | ### Version 0.5.6 40 | 41 | * Bloom Filters: 42 | * Fix for `ValueError` exception when using `estimate_elements()` when all bits are set 43 | * Add Citation file 44 | 45 | ### Version 0.5.5 46 | 47 | * Bloom Filters: 48 | * Re-implemented the entire Bloom Filter data structure to reduce complexity and code duplication 49 | * Removed unused imports 50 | * Removed unnecessary casts 51 | * Pylint Requested Style Changes: 52 | * Use python 3 `super()` 53 | * Use python 3 classes 54 | * Remove use of temporary variables if possible and still clear 55 | 56 | ### Version 0.5.4 57 | 58 | * All Probabilistic Data Structures: 59 | * Added ability to load each `frombytes()` 60 | * Updated underlying data structures of number based lists to be more space and time efficient; see [Issue #60](https://github.com/barrust/pyprobables/issues/60) 61 | * Cuckoo Filters: 62 | * Added `fingerprint_size_bits` property 63 | * Added `error_rate` property 64 | * Added ability to initialize based on error rate 65 | * Simplified typing 66 | * Ensure all `filepaths` can be `str` or `Path` 67 | 68 | ### Version 0.5.3 69 | 70 | * 
Additional type hinting 71 | * Improved format parsing and serialization; [see PR #81](https://github.com/barrust/pyprobables/pull/81). Thanks [@KOLANICH](https://github.com/KOLANICH) 72 | * Bloom Filters 73 | * Added `export_to_hex` functionality for Bloom Filters on Disk 74 | * Export as C header (**\*.h**) for Bloom Filters on Disk and Counting Bloom Filters 75 | * Added support for more input types for exporting and loading of saved files 76 | 77 | ### Version 0.5.2 78 | 79 | * Add ability to hash bytes along with strings 80 | * Make all test files individually executable from the CLI. Thanks [@KOLANICH](https://github.com/KOLANICH) 81 | * Added type hints 82 | 83 | ### Version 0.5.1 84 | 85 | * Bloom Filter: 86 | * Export as a C header (**\*.h**) 87 | * Count-Min Sketch 88 | * Add join/merge functionality 89 | * Moved testing to use `NamedTemporaryFile` for file based tests 90 | 91 | ### Version 0.5.0 92 | 93 | * ***BACKWARD INCOMPATIBLE CHANGES*** 94 | * **NOTE:** Breaks backwards compatibility with previously exported blooms, counting-blooms, cuckoo filter, or count-min-sketch files using the default hash! 95 | * Update to the FNV_1a hash function 96 | * Simplified the default hash to use a seed value 97 | * Ensure passing of depth to hashing function when using `hash_with_depth_int` or `hash_with_depth_bytes` 98 | 99 | ## Version 0.4.1 100 | 101 | * Resolve [issue 57](https://github.com/barrust/pyprobables/issues/57) where the false positive rate was not stored / used consistently in some instances 102 | 103 | ## Version 0.4.0 104 | 105 | * Remove **Python 2.7** support 106 | 107 | ### Version 0.3.2 108 | 109 | * Fix `RotatingBloomFilter` to keep information on number of elements inserted when exported and loaded. [see PR #50](https://github.com/barrust/pyprobables/pull/50) Thanks [@volker48](https://github.com/volker48) 110 | 111 | ### Version 0.3.1 112 | 113 | * Add additional **slots** 114 | * Very minor improvement to the hashing algorithm 115 | 116 | ### Version 0.3.0 117 | 118 | * Bloom Filters: 119 | * Import/Export of Expanding and Rotating Bloom Filters 120 | * Fix for importing standard Bloom Filters 121 | 122 | ### Version 0.2.6 123 | 124 | * Bloom Filters: 125 | * Addition of a Rotating Bloom Filter 126 | 127 | ### Version 0.2.5 128 | 129 | * Bloom Filters: 130 | * Addition of an Expanding Bloom Filter 131 | 132 | ### Version 0.2.0 133 | 134 | * Use **slots** 135 | 136 | ### Version 0.1.4 137 | 138 | * Drop support for python 3.3 139 | * Ensure passing parameters correctly to parent classes 140 | 141 | ### Version 0.1.3 142 | 143 | * Better parameter validation 144 | * Cuckoo Filters: 145 | * Support passing a different hash function 146 | * Support for different fingerprint size 147 | * Utility to help generate valid hashing strategies using decorators 148 | * hash_with_depth_bytes 149 | * hash_with_depth_int 150 | * Updated documentation 151 | 152 | ### Version 0.1.2 153 | 154 | * Counting Cuckoo Filter 155 | * Basic functionality: add, remove, check 156 | * Expand 157 | * Import / Export 158 | * Fix and tests for utility functions 159 | * Fix package build 160 | 161 | ### Version 0.1.1 162 | 163 | * CuckooFilter 164 | * Import / Export functionality 165 | * Enforce single insertion per key 166 | * Auto expand on insertion failure OR when called to do so (settable) 167 | 168 | ### Version 0.1.0 169 | 170 | * Cuckoo Filter 171 | * Added basic Cuckoo Filter code 172 | 173 | ### Version 0.0.8 174 | 175 | * Counting Bloom Filter 176 | * Estimate unique elements added 177 | * Union 
178 | * Intersection 179 | * Jaccard Index 180 | 181 | ### Version 0.0.7 182 | 183 | * Counting Bloom Filter 184 | * Fix counting bloom hex export / import 185 | * Fix for overflow issue in counting bloom export 186 | * Added ability to remove from counting bloom 187 | * Count-Min Sketch 188 | * Fix for not recording large numbers of inserts and deletions correctly 189 | 190 | ### Version 0.0.6 191 | 192 | * Probabilistic data structures added: 193 | * Counting Bloom Filter 194 | * Minor code clean-up 195 | * Re-factored Bloom Filters 196 | 197 | ### Version 0.0.5 198 | 199 | * Better on-line documentation 200 | * Changed access to some public functions 201 | 202 | ### Version 0.0.4 203 | 204 | * Probabilistic data structures: 205 | * Bloom Filter 206 | * Bloom Filter (on disk) 207 | * Count-Min Sketch 208 | * Count-Mean Sketch 209 | * Count-Mean-Min Sketch 210 | * Heavy Hitters 211 | * Stream Threshold 212 | * Import and export of each 213 | -------------------------------------------------------------------------------- /CITATION.cff: -------------------------------------------------------------------------------- 1 | # This CITATION.cff file was generated with cffinit. 2 | # Visit https://bit.ly/cffinit to generate yours today! 3 | 4 | cff-version: 1.2.0 5 | title: PyProbables 6 | message: >- 7 | If you use this software, please cite it using the 8 | metadata from this file. 9 | type: software 10 | authors: 11 | - given-names: Tyler 12 | family-names: Barrus 13 | email: barrust@gmail.com 14 | orcid: 'https://orcid.org/0000-0002-6691-0360' 15 | repository-code: 'https://github.com/barrust/pyprobables' 16 | abstract: >- 17 | A set of probabilistic data structures written in 18 | Python 19 | keywords: 20 | - Probabilistic 21 | - Data Structures 22 | - Bloom Filter 23 | - Count-Min Sketch 24 | - Cuckoo Filter 25 | - Counting Bloom Filter 26 | - Count-Mean-Min Sketch 27 | - Count-Mean Sketch 28 | - Heavy Hitters 29 | - Stream Threshold 30 | - Rolling Bloom Filter 31 | - Expanding Bloom Filter 32 | - Counting Cuckoo Filter 33 | - Quotient Filter 34 | license: MIT 35 | version: 0.6.0 36 | date-released: '2024-01-10' -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2017-2021 Tyler Barrus 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /README.rst: -------------------------------------------------------------------------------- 1 | PyProbables 2 | =========== 3 | 4 | .. image:: https://img.shields.io/badge/license-MIT-blue.svg 5 | :target: https://opensource.org/licenses/MIT/ 6 | :alt: License 7 | .. image:: https://img.shields.io/github/release/barrust/pyprobables.svg 8 | :target: https://github.com/barrust/pyprobables/releases 9 | :alt: GitHub release 10 | .. image:: https://github.com/barrust/pyprobables/workflows/Python%20package/badge.svg 11 | :target: https://github.com/barrust/pyprobables/actions?query=workflow%3A%22Python+package%22 12 | :alt: Build Status 13 | .. image:: https://codecov.io/gh/barrust/pyprobables/branch/master/graph/badge.svg?token=OdETiNgz9k 14 | :target: https://codecov.io/gh/barrust/pyprobables 15 | :alt: Test Coverage 16 | .. image:: https://readthedocs.org/projects/pyprobables/badge/?version=latest 17 | :target: http://pyprobables.readthedocs.io/en/latest/?badge=latest 18 | :alt: Documentation Status 19 | .. image:: https://badge.fury.io/py/pyprobables.svg 20 | :target: https://pypi.org/project/pyprobables/ 21 | :alt: Pypi Release 22 | .. image:: https://pepy.tech/badge/pyprobables 23 | :target: https://pepy.tech/project/pyprobables 24 | :alt: Downloads 25 | 26 | **pyprobables** is a pure-python library for probabilistic data structures. 27 | The goal is to provide the developer with a pure-python implementation of 28 | common probabilistic data-structures to use in their work. 29 | 30 | To achieve better raw performance, it is recommended to supply an alternative 31 | hashing algorithm that has been compiled in C. This could include using the 32 | md5 and sha512 algorithms provided or installing a third party package and 33 | writing your own hashing strategy. Some options include the murmur hash 34 | `mmh3 <https://github.com/hajimes/mmh3>`__ or those from the 35 | `pyhash <https://github.com/flier/pyfasthash>`__ library. Each data object in 36 | **pyprobables** makes it easy to pass in a custom hashing function. 37 | 38 | Read more about how to use `Supplying a pre-defined, alternative hashing strategy`_ 39 | or `Defining a hashing function using the provided decorators`_. 40 | 41 | Installation 42 | ------------------ 43 | 44 | Pip Installation: 45 | 46 | :: 47 | 48 | $ pip install pyprobables 49 | 50 | To install from source: 51 | 52 | Clone the `repository on GitHub <https://github.com/barrust/pyprobables> 53 | `__, then run the following from the project folder: 54 | 55 | :: 56 | 57 | $ pip install . 58 | 59 | `pyprobables` supports python 3.9 - 3.13 60 | 61 | For *python 2.7* support, install `release 0.3.2 `__ 62 | 63 | :: 64 | 65 | $ pip install pyprobables==0.3.2 66 | 67 | 68 | API Documentation 69 | --------------------- 70 | 71 | The documentation is hosted on 72 | `readthedocs.io <https://pyprobables.readthedocs.io/en/latest/>`__ 73 | 74 | You can build the documentation locally by running: 75 | 76 | :: 77 | 78 | $ pip install -r docs/requirements.txt 79 | $ cd docs/ 80 | $ make html 81 | 82 | 83 | 84 | Automated Tests 85 | ------------------ 86 | 87 | To run the automated tests, install `pytest` and run the following command 88 | from the project folder: 89 | 90 | :: 91 | 92 | $ pytest 93 | 94 | 95 | 96 | Quickstart 97 | ------------------ 98 | 99 | Import pyprobables and setup a Bloom Filter 100 | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ 101 | 102 | .. 
code:: python 103 | 104 | from probables import BloomFilter 105 | blm = BloomFilter(est_elements=1000, false_positive_rate=0.05) 106 | blm.add('google.com') 107 | blm.check('facebook.com') # should return False 108 | blm.check('google.com') # should return True 109 | 110 | 111 | Import pyprobables and setup a Count-Min Sketch 112 | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ 113 | 114 | .. code:: python 115 | 116 | from probables import CountMinSketch 117 | cms = CountMinSketch(width=1000, depth=5) 118 | cms.add('google.com') # should return 1 119 | cms.add('facebook.com', 25) # insert 25 at once; should return 25 120 | 121 | 122 | Import pyprobables and setup a Cuckoo Filter 123 | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ 124 | 125 | .. code:: python 126 | 127 | from probables import CuckooFilter 128 | cko = CuckooFilter(capacity=100, max_swaps=10) 129 | cko.add('google.com') 130 | cko.check('facebook.com') # should return False 131 | cko.check('google.com') # should return True 132 | 133 | 134 | Import pyprobables and setup a Quotient Filter 135 | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ 136 | 137 | .. code:: python 138 | 139 | from probables import QuotientFilter 140 | qf = QuotientFilter(quotient=24) 141 | qf.add('google.com') 142 | qf.check('facebook.com') # should return False 143 | qf.check('google.com') # should return True 144 | 145 | 146 | Supplying a pre-defined, alternative hashing strategy 147 | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ 148 | 149 | .. code:: python 150 | 151 | from probables import BloomFilter 152 | from probables.hashes import default_sha256 153 | blm = BloomFilter(est_elements=1000, false_positive_rate=0.05, 154 | hash_function=default_sha256) 155 | blm.add('google.com') 156 | blm.check('facebook.com') # should return False 157 | blm.check('google.com') # should return True 158 | 159 | 160 | .. _use-custom-hashing-strategies: 161 | 162 | Defining a hashing function using the provided decorators 163 | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ 164 | 165 | .. code:: python 166 | 167 | import mmh3 # murmur hash 3 implementation (pip install mmh3) 168 | from probables.hashes import hash_with_depth_bytes 169 | from probables import BloomFilter 170 | 171 | @hash_with_depth_bytes 172 | def my_hash(key, depth): 173 | return mmh3.hash_bytes(key, seed=depth) 174 | 175 | blm = BloomFilter(est_elements=1000, false_positive_rate=0.05, hash_function=my_hash) 176 | 177 | .. code:: python 178 | 179 | import hashlib 180 | from probables.hashes import hash_with_depth_int 181 | from probables.constants import UINT64_T_MAX 182 | from probables import BloomFilter 183 | 184 | @hash_with_depth_int 185 | def my_hash(key, seed=0, encoding="utf-8"): 186 | max64mod = UINT64_T_MAX + 1 187 | val = int(hashlib.sha512(key.encode(encoding)).hexdigest(), 16) 188 | val += seed # not a good example, but uses the seed value 189 | return val % max64mod 190 | 191 | blm = BloomFilter(est_elements=1000, false_positive_rate=0.05, hash_function=my_hash) 192 | 193 | 194 | See the `API documentation <https://pyprobables.readthedocs.io/en/latest/code.html>`__ 195 | for other data structures available and the 196 | `quickstart page <https://pyprobables.readthedocs.io/en/latest/quickstart.html>`__ 197 | for more examples! 198 | 199 | 200 | Changelog 201 | ------------------ 202 | 203 | Please see the `changelog 204 | <https://github.com/barrust/pyprobables/blob/master/CHANGELOG.md>`__ for a list 205 | of all changes. 
206 | 207 | 208 | Backward Compatible Changes 209 | --------------------------- 210 | 211 | If you are using previously exported probabilistic data structures (v0.4.1 or below) 212 | and used the default hashing strategy, you will want to use the following code 213 | to mimic the original default hashing algorithm. 214 | 215 | .. code:: python 216 | 217 | from probables import BloomFilter 218 | from probables.hashes import hash_with_depth_int 219 | from probables.constants import UINT64_T_MAX 220 | @hash_with_depth_int 221 | def old_fnv1a(key, depth=1): 222 | return tmp_fnv_1a(key) 223 | 224 | def tmp_fnv_1a(key): 225 | max64mod = UINT64_T_MAX + 1 226 | hval = 14695981039346656073 227 | fnv_64_prime = 1099511628211 228 | tmp = map(ord, key) 229 | for t_str in tmp: 230 | hval ^= t_str 231 | hval *= fnv_64_prime 232 | hval %= max64mod 233 | return hval 234 | 235 | blm = BloomFilter(filepath="old-file-path.blm", hash_function=old_fnv1a) 236 | -------------------------------------------------------------------------------- /codecov.yml: -------------------------------------------------------------------------------- 1 | codecov: 2 | require_ci_to_pass: yes 3 | 4 | coverage: 5 | precision: 2 6 | round: down 7 | range: "70...100" 8 | 9 | status: 10 | project: 11 | default: 12 | # basic settings 13 | target: "85%" 14 | base: auto 15 | threshold: 15 16 | patch: 17 | default: 18 | target: "50%" 19 | changes: no 20 | 21 | parsers: 22 | gcov: 23 | branch_detection: 24 | conditional: yes 25 | loop: yes 26 | method: no 27 | macro: no 28 | 29 | comment: 30 | layout: "reach,diff,flags,tree" 31 | behavior: default 32 | require_changes: no 33 | 34 | ignore: 35 | - "./tests/" 36 | - "setup.py" 37 | -------------------------------------------------------------------------------- /docs/Makefile: -------------------------------------------------------------------------------- 1 | # Minimal makefile for Sphinx documentation 2 | # 3 | 4 | # You can set these variables from the command line. 5 | SPHINXOPTS = 6 | SPHINXBUILD = sphinx-build 7 | SPHINXPROJ = pyprobables 8 | SOURCEDIR = source 9 | BUILDDIR = build 10 | 11 | # Put it first so that "make" without argument is like "make help". 12 | help: 13 | @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 14 | 15 | .PHONY: help Makefile 16 | 17 | # Catch-all target: route all unknown targets to Sphinx using the new 18 | # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). 19 | %: Makefile 20 | @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 21 | -------------------------------------------------------------------------------- /docs/requirements.txt: -------------------------------------------------------------------------------- 1 | sphinx>=3.0 2 | sphinx-rtd-theme 3 | -------------------------------------------------------------------------------- /docs/source/_static/custom.css: -------------------------------------------------------------------------------- 1 | /* Set the properties to be full width */ 2 | dl.py.property { 3 | display: block !important; 4 | } -------------------------------------------------------------------------------- /docs/source/code.rst: -------------------------------------------------------------------------------- 1 | .. _api: 2 | 3 | pyprobables API 4 | ==================== 5 | 6 | Here you can find the full developer API for the pyprobables project. 7 | pyprobables provides a suite of probabilistic data-structures to be used 8 | in data analytics and data science projects. 
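To give a flavor of the shared API before diving into the individual classes, here is a minimal usage sketch (the parameter values are illustrative only):

.. code:: python

    from probables import BloomFilter

    # size the filter for ~1,000 expected elements at a 5% false positive rate
    blm = BloomFilter(est_elements=1000, false_positive_rate=0.05)
    blm.add('google.com')
    blm.check('google.com')    # True
    blm.check('facebook.com')  # False (with high probability)

The structures below follow this same add/check pattern, and most also support serialization via `export()` and `frombytes()`.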
9 | 10 | 11 | Data Structures and Classes 12 | ============================ 13 | 14 | Bloom Filters 15 | ------------- 16 | 17 | Bloom Filters are a class of probabilistic data structures used for set 18 | operations. Bloom Filters guarantee a zero percent false negative rate 19 | and a predetermined false positive rate. Once the number of elements inserted 20 | exceeds the estimated elements, the false positive rate will increase over the 21 | desired amount. 22 | 23 | `Further Reading `__ 24 | 25 | 26 | .. _BloomFilterAnchor: 27 | 28 | BloomFilter 29 | +++++++++++++++++++++++++++++++ 30 | 31 | .. autoclass:: probables.BloomFilter 32 | :members: 33 | :inherited-members: 34 | 35 | 36 | BloomFilterOnDisk 37 | +++++++++++++++++++++++++++++++ 38 | 39 | .. autoclass:: probables.BloomFilterOnDisk 40 | :members: 41 | 42 | For more information on all methods and properties, see `BloomFilter`_. 43 | 44 | ExpandingBloomFilter 45 | +++++++++++++++++++++++++++++++ 46 | 47 | .. autoclass:: probables.ExpandingBloomFilter 48 | :members: 49 | 50 | RotatingBloomFilter 51 | +++++++++++++++++++++++++++++++ 52 | 53 | .. autoclass:: probables.RotatingBloomFilter 54 | :members: 55 | :inherited-members: 56 | 57 | CountingBloomFilter 58 | +++++++++++++++++++++++++++++++ 59 | 60 | .. autoclass:: probables.CountingBloomFilter 61 | :members: 62 | :inherited-members: 63 | 64 | 65 | Cuckoo Filters 66 | -------------- 67 | 68 | Cuckoo filters are a space-efficient data structure that supports set 69 | membership testing. Cuckoo filters support insertion, deletion, and lookup of 70 | elements with low overhead and few false positive results. The name is derived 71 | from the `cuckoo hashing <https://en.wikipedia.org/wiki/Cuckoo_hashing>`__ 72 | strategy used to resolve conflicts. 73 | 74 | `Further Reading `__ 75 | 76 | CuckooFilter 77 | +++++++++++++++++++++++++++++++ 78 | .. autoclass:: probables.CuckooFilter 79 | :members: 80 | 81 | CountingCuckooFilter 82 | +++++++++++++++++++++++++++++++ 83 | .. autoclass:: probables.CountingCuckooFilter 84 | :members: 85 | :inherited-members: 86 | 87 | 88 | Count-Min Sketches 89 | ------------------ 90 | 91 | Count-Min Sketches, and their derivatives, are good for estimating the number of 92 | occurrences of an element in streaming data while not needing to retain all the 93 | data elements. The result is a probabilistic count of elements inserted into 94 | the data structure. It will always provide the **maximum** number of times a 95 | data element was encountered. Notice that the result may be **more** than the 96 | true number of times it was inserted, but never fewer. 97 | 98 | `Further Reading `__ 99 | 100 | 101 | CountMinSketch 102 | +++++++++++++++++++++++++++++++ 103 | 104 | .. autoclass:: probables.CountMinSketch 105 | :members: 106 | 107 | 108 | CountMeanSketch 109 | +++++++++++++++++++++++++++++++ 110 | 111 | .. autoclass:: probables.CountMeanSketch 112 | :members: 113 | 114 | For more information on all methods and properties, see `CountMinSketch`_. 115 | 116 | 117 | CountMeanMinSketch 118 | +++++++++++++++++++++++++++++++ 119 | 120 | .. autoclass:: probables.CountMeanMinSketch 121 | :members: 122 | 123 | For more information on all methods and properties, see `CountMinSketch`_. 124 | 125 | 126 | HeavyHitters 127 | +++++++++++++++++++++++++++++++ 128 | 129 | .. autoclass:: probables.HeavyHitters 130 | :members: 131 | 132 | For more information on all methods and properties, see `CountMinSketch`_. 133 | 134 | 135 | StreamThreshold 136 | +++++++++++++++++++++++++++++++ 137 | 138 | .. 
autoclass:: probables.StreamThreshold 139 | :members: 140 | 141 | For more information on all methods and properties, see `CountMinSketch`_. 142 | 143 | QuotientFilter 144 | ------------------ 145 | 146 | Quotient filters are an approximate membership query (AMQ) filter that is both 147 | space efficient and returns a zero false negative rate and a probabilistic false 148 | positive rate. Unlike Bloom filters, the quotient filter only requires a single 149 | hash of the element to insert. The upper **q** bits denote the location within the 150 | filter while the lower **r** bits are stored in the filter. 151 | 152 | Quotient filters provide some useful benefits over Bloom filters including: 153 | 154 | * Merging of two filters (not union) 155 | * Resizing of the filter 156 | * Ability to remove elements 157 | 158 | `Further Reading `__ 159 | 160 | QuotientFilter 161 | +++++++++++++++++++++++++++++++ 162 | 163 | .. autoclass:: probables.QuotientFilter 164 | :members: 165 | 166 | 167 | Utilities 168 | ------------------ 169 | 170 | Bitarray 171 | +++++++++++++++++++++++++++++++ 172 | 173 | .. autoclass:: probables.utilities.Bitarray 174 | :members: 175 | 176 | Exceptions 177 | ============================ 178 | 179 | .. automodule:: probables.exceptions 180 | :members: 181 | 182 | 183 | Hashing Functions 184 | ============================ 185 | 186 | .. automodule:: probables.hashes 187 | :members: 188 | 189 | 190 | Indices and Tables 191 | ============================ 192 | 193 | * :ref:`home` 194 | * :ref:`quickstart` 195 | * :ref:`genindex` 196 | * :ref:`modindex` 197 | * :ref:`search` 198 | -------------------------------------------------------------------------------- /docs/source/conf.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # 3 | # pyprobables documentation build configuration file, created by 4 | # sphinx-quickstart on Thu Jul 13 22:20:03 2017. 5 | # 6 | # This file is execfile()d with the current directory set to its 7 | # containing dir. 8 | # 9 | # Note that not all possible configuration values are present in this 10 | # autogenerated file. 11 | # 12 | # All configuration values have a default; values that are commented out 13 | # serve to show the default. 14 | 15 | # If extensions (or modules to document with autodoc) are in another directory, 16 | # add these directories to sys.path here. If the directory is relative to the 17 | # documentation root, use os.path.abspath to make it absolute, like shown here. 18 | # 19 | import os 20 | import sys 21 | 22 | sys.path.insert(0, os.path.abspath("../../")) 23 | import probables 24 | 25 | # -- General configuration ------------------------------------------------ 26 | 27 | # If your documentation needs a minimal Sphinx version, state it here. 28 | # 29 | # needs_sphinx = '1.0' 30 | 31 | # Add any Sphinx extension module names here, as strings. They can be 32 | # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom 33 | # ones. 34 | extensions = [ 35 | "sphinx.ext.autodoc", 36 | "sphinx.ext.napoleon", 37 | "sphinx.ext.doctest", 38 | "sphinx.ext.coverage", 39 | "sphinx.ext.viewcode", 40 | "sphinx.ext.githubpages", 41 | "sphinx.ext.todo", 42 | ] 43 | 44 | # Turn off typehints in description 45 | autodoc_typehints = "description" 46 | 47 | # Add any paths that contain templates here, relative to this directory. 48 | templates_path = ["_templates"] 49 | 50 | # The suffix(es) of source filenames. 
51 | # You can specify multiple suffix as a list of string: 52 | # 53 | # source_suffix = ['.rst', '.md'] 54 | source_suffix = ".rst" 55 | 56 | # The master toctree document. 57 | master_doc = "index" 58 | 59 | # General information about the project. 60 | project = "probables" 61 | copyright = "2017, Tyler Barrus" 62 | author = probables.__author__ 63 | 64 | # The version info for the project you're documenting, acts as replacement for 65 | # |version| and |release|, also used in various other places throughout the 66 | # built documents. 67 | # 68 | # The short X.Y version. 69 | version = probables.__version__ 70 | # The full version, including alpha/beta/rc tags. 71 | release = probables.__version__ 72 | 73 | # The language for content autogenerated by Sphinx. Refer to documentation 74 | # for a list of supported languages. 75 | # 76 | # This is also used if you do content translation via gettext catalogs. 77 | # Usually you set "language" from the command line for these cases. 78 | language = "en" 79 | 80 | # List of patterns, relative to source directory, that match files and 81 | # directories to ignore when looking for source files. 82 | # This patterns also effect to html_static_path and html_extra_path 83 | exclude_patterns = [] 84 | 85 | # The name of the Pygments (syntax highlighting) style to use. 86 | pygments_style = "sphinx" 87 | 88 | # If true, `todo` and `todoList` produce output, else they produce nothing. 89 | todo_include_todos = True 90 | 91 | 92 | # -- Options for HTML output ---------------------------------------------- 93 | 94 | # The theme to use for HTML and HTML Help pages. See the documentation for 95 | # a list of builtin themes. 96 | # 97 | html_theme = "sphinx_rtd_theme" 98 | # html_theme = 'alabaster' 99 | # html_theme = "custom_theme" 100 | # html_theme_path = ["_themes"] 101 | 102 | # Theme options are theme-specific and customize the look and feel of a theme 103 | # further. For a list of options available for each theme, see the 104 | # documentation. 105 | # 106 | 107 | html_theme_options = { 108 | # "collapse_navigation": True, 109 | # "sticky_navigation": True, 110 | # "navigation_depth": 4, 111 | # "includehidden": True, 112 | # "titles_only": False, 113 | } 114 | 115 | # Add any paths that contain custom static files (such as style sheets) here, 116 | # relative to this directory. They are copied after the builtin static files, 117 | # so a file named "default.css" will overwrite the builtin "default.css". 118 | html_static_path = ["_static"] 119 | 120 | # These paths are either relative to html_static_path 121 | # or fully qualified paths (eg. https://...) 122 | html_css_files = ["custom.css"] 123 | 124 | # -- Options for HTMLHelp output ------------------------------------------ 125 | 126 | # Output file base name for HTML help builder. 127 | htmlhelp_basename = "pyprobablesdoc" 128 | 129 | 130 | # -- Options for LaTeX output --------------------------------------------- 131 | 132 | latex_elements = { 133 | # The paper size ('letterpaper' or 'a4paper'). 134 | # 135 | # 'papersize': 'letterpaper', 136 | # The font size ('10pt', '11pt' or '12pt'). 137 | # 138 | # 'pointsize': '10pt', 139 | # Additional stuff for the LaTeX preamble. 140 | # 141 | # 'preamble': '', 142 | # Latex figure (float) alignment 143 | # 144 | # 'figure_align': 'htbp', 145 | } 146 | 147 | # Grouping the document tree into LaTeX files. List of tuples 148 | # (source start file, target name, title, 149 | # author, documentclass [howto, manual, or own class]). 
150 | latex_documents = [ 151 | ( 152 | master_doc, 153 | "pyprobables.tex", 154 | "pyprobables Documentation", 155 | "Tyler Barrus", 156 | "manual", 157 | ), 158 | ] 159 | 160 | 161 | # -- Options for manual page output --------------------------------------- 162 | 163 | # One entry per manual page. List of tuples 164 | # (source start file, name, description, authors, manual section). 165 | man_pages = [(master_doc, "pyprobables", "pyprobables Documentation", [author], 1)] 166 | 167 | 168 | # -- Options for Texinfo output ------------------------------------------- 169 | 170 | # Grouping the document tree into Texinfo files. List of tuples 171 | # (source start file, target name, title, author, 172 | # dir menu entry, description, category) 173 | texinfo_documents = [ 174 | ( 175 | master_doc, 176 | "pyprobables", 177 | "pyprobables Documentation", 178 | author, 179 | "pyprobables", 180 | "One line description of project.", 181 | "Miscellaneous", 182 | ), 183 | ] 184 | -------------------------------------------------------------------------------- /docs/source/index.rst: -------------------------------------------------------------------------------- 1 | .. _home: 2 | .. include:: ../../README.rst 3 | 4 | 5 | .. toctree:: 6 | 7 | code 8 | quickstart 9 | 10 | 11 | Read More 12 | ================== 13 | 14 | * :ref:`api` 15 | * :ref:`quickstart` 16 | * :ref:`genindex` 17 | * :ref:`modindex` 18 | * :ref:`search` 19 | -------------------------------------------------------------------------------- /docs/source/quickstart.rst: -------------------------------------------------------------------------------- 1 | .. _quickstart: 2 | 3 | Quickstart 4 | ========================== 5 | 6 | 7 | Install 8 | +++++++++++++++++++++++++++++++ 9 | 10 | The easiest method of installing pyprobables is by using the pip package 11 | manager: 12 | 13 | Pip Installation: 14 | 15 | :: 16 | 17 | $ pip install pyprobables 18 | 19 | 20 | API Documentation 21 | +++++++++++++++++++++++++++++++ 22 | 23 | The full API documentation for the pyprobables package: :ref:`api` 24 | 25 | Example Usage 26 | +++++++++++++++++++++++++++++++ 27 | 28 | Bloom Filters 29 | ------------- 30 | 31 | Bloom Filters provide set operations of large datasets while being small in 32 | memory footprint. They provide a zero percent false negative rate and a 33 | predetermined, or desired, false positive rate. 34 | `more information `__ 35 | 36 | 37 | Import, Initialize, and Train 38 | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ 39 | .. code:: python 40 | 41 | >>> from probables import (BloomFilter) 42 | >>> blm = BloomFilter(est_elements=1000000, false_positive_rate=0.05) 43 | >>> with open('war_and_peace.txt', 'r') as fp: 44 | >>> for line in fp: 45 | >>> for word in line.split(): 46 | >>> blm.add(word.lower()) # add each word to the bloom filter! 47 | >>> # end reading in the file 48 | 49 | 50 | Query the Bloom Filter 51 | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ 52 | .. code:: python 53 | 54 | >>> words_to_check = ['step', 'borzoi', 'diametrically', 'fleches', 'rain'] 55 | >>> for word in words_to_check: 56 | >>> blm.check(word) 57 | 58 | 59 | Export the Bloom Filter 60 | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ 61 | .. code:: python 62 | 63 | >>> blm.export('war_and_peace_bloom.blm') 64 | 65 | 66 | Import a Bloom Filter 67 | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ 68 | .. 
code:: python 69 | 70 | >>> blm2 = BloomFilter(filepath='war_and_peace_bloom.blm') 71 | >>> print(blm2.check('sutler')) 72 | 73 | 74 | Other Bloom Filters 75 | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ 76 | 77 | Bloom Filter on Disk 78 | """"""""""""""""""""""""""""""""""""""""""""""" 79 | 80 | The **Bloom Filter on Disk** is a specialized version of the standard 81 | Bloom Filter that is run directly off of disk instead of in memory. This 82 | can be useful for very large Bloom Filters or when needing to access many 83 | Blooms that are exported to file. 84 | 85 | 86 | Expanding Bloom Filter 87 | """"""""""""""""""""""""""""""""""""""""""""""" 88 | 89 | The **Expanding Bloom Filter** is a specialized version of the standard 90 | Bloom Filter that automatically grows to ensure that the desired false positive 91 | rate is not exceeded. This is ideal for situations where the number of 92 | elements that will be added can only be roughly guessed. 93 | 94 | 95 | Rotating Bloom Filter 96 | """"""""""""""""""""""""""""""""""""""""""""""" 97 | 98 | The **Rotating Bloom Filter** is a specialized version of the standard 99 | Bloom Filter that rolls earlier entries out of the filter as they become more 100 | stale. The popping of the queue can be done either programmatically or 101 | automatically. 102 | 103 | 104 | Counting Bloom Filter 105 | """"""""""""""""""""""""""""""""""""""""""""""" 106 | 107 | **Counting Bloom Filters** are another specialized version of the standard 108 | Bloom Filter. Instead of using a bit array to track added elements, a 109 | Counting Bloom uses integers to track the number of times the element has 110 | been added. 111 | 112 | 113 | Count-Min Sketch 114 | ----------------- 115 | 116 | Count-Min Sketches, and their derivatives, are good for counting the number of 117 | occurrences of an element in streaming data while not needing to retain all the 118 | data elements. The result is a probabilistic count of elements inserted into 119 | the data structure. It will always provide the **maximum** number of times 120 | encountered. Notice that the result may be **more** than the true number 121 | of times it was inserted, but never fewer. 122 | `more information `__ 123 | 124 | 125 | Import, Initialize, and Train 126 | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ 127 | .. code:: python 128 | 129 | >>> from probables import (CountMinSketch) 130 | >>> cms = CountMinSketch(width=100000, depth=5) 131 | >>> with open('war_and_peace.txt', 'r') as fp: 132 | >>> for line in fp: 133 | >>> for word in line.split(): 134 | >>> cms.add(word.lower()) # add each to the count-min sketch! 135 | 136 | 137 | Query the Count-Min Sketch 138 | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ 139 | .. code:: python 140 | 141 | >>> words_to_check = ['step', 'borzoi', 'diametrically', 'fleches', 'rain'] 142 | >>> for word in words_to_check: 143 | >>> print(cms.check(word)) # prints: 80, 17, 1, 20, 25 144 | 145 | 146 | Export Count-Min Sketch 147 | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ 148 | .. code:: python 149 | 150 | >>> cms.export('war_and_peace.cms') 151 | 152 | 153 | Import a Count-Min Sketch 154 | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ 155 | .. 
code:: python 156 | 157 | >>> cms2 = CountMinSketch(filepath='war_and_peace.cms') 158 | >>> print(cms2.check('fleches')) # prints 20 159 | 160 | 161 | Other Count-Min Sketches 162 | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ 163 | 164 | Count-Mean Sketch and Count-Mean-Min Sketch 165 | """"""""""""""""""""""""""""""""""""""""""""""" 166 | 167 | **Count-Mean Sketch** and **Count-Mean-Min Sketch** are identical to the 168 | Count-Min Sketch for the data structure but both differ in the method of 169 | calculating the number of times an element has been inserted. These are 170 | currently supported by specifying at query time which method is desired 171 | or by initializing to the desired class: CountMeanSketch or CountMeanMinSketch. 172 | 173 | 174 | Heavy Hitters 175 | """"""""""""""""""""""""""""""""""""""""""""""" 176 | 177 | **Heavy Hitters** is a version of the Count-Min Sketch that tracks those 178 | elements that are seen most often. Beyond the normal initialization parameters 179 | one only needs to specify the number of heavy hitters to track. 180 | 181 | 182 | Stream Threshold 183 | """"""""""""""""""""""""""""""""""""""""""""""" 184 | 185 | **Stream Threshold** is another version of the Count-Min Sketch similar to the 186 | Heavy Hitters. The main difference is that there is a threshold for 187 | including an element to be tracked instead of tracking a certain number of 188 | elements. 189 | 190 | 191 | Cuckoo Filters 192 | ---------------------------------- 193 | 194 | Cuckoo Filters are a memory-efficient method to approximate set membership. 195 | They allow for the ability to add, remove, and look up elements from the set. 196 | They get the name cuckoo filter from the use of the 197 | `cuckoo hashing <https://en.wikipedia.org/wiki/Cuckoo_hashing>`__ strategy. 198 | 199 | Import, Initialize, and Train 200 | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ 201 | .. code:: python3 202 | 203 | >>> from probables import (CuckooFilter) 204 | >>> ccf = CuckooFilter(capacity=100000, bucket_size=4, max_swaps=100) 205 | >>> with open('war_and_peace.txt', 'r') as fp: 206 | >>> for line in fp: 207 | >>> for word in line.split(): 208 | >>> ccf.add(word.lower()) # add each to the cuckoo filter! 209 | 210 | 211 | Query the Cuckoo Filter 212 | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ 213 | .. code:: python3 214 | 215 | >>> words_to_check = ['borzoi', 'diametrically', 'fleches', 'rain', 'foo'] 216 | >>> for word in words_to_check: 217 | >>> print(ccf.check(word)) # prints: True, True, True, True, False 218 | 219 | 220 | Export the Cuckoo Filter 221 | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ 222 | .. code:: python3 223 | 224 | >>> ccf.export('war_and_peace.cko') 225 | 226 | 227 | Import a Cuckoo Filter 228 | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ 229 | .. code:: python3 230 | 231 | >>> ccf2 = CuckooFilter(filepath='war_and_peace.cko') 232 | >>> print(ccf2.check('fleches')) # prints True 233 | 234 | Cuckoo Filters based on Error Rate 235 | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ 236 | To use error rate to initialize a Cuckoo Filter, there are class methods that can be used. 237 | `init_error_rate()` can be used to initialize a Cuckoo Filter that has not been exported, and 238 | `load_error_rate()` can be used to load in a previously exported Cuckoo Filter that used error rate 239 | to determine the parameters. 240 | 241 | .. 
code:: python3 242 | 243 | >>> cko = CuckooFilter.init_error_rate(0.00001) 244 | >>> cko.export('war_and_peace.cko') 245 | >>> ckf = CuckooFilter.load_error_rate(0.00001) 246 | 247 | Other Cuckoo Filters 248 | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ 249 | 250 | Counting Cuckoo Filter 251 | """"""""""""""""""""""""""""""""""""""""""""""" 252 | The counting cuckoo filter is similar to the standard filter except that it 253 | tracks the number of times a fingerprint has been added to the filter. 254 | 255 | 256 | Quotient Filters 257 | ---------------- 258 | 259 | Quotient Filters provide set operations on large datasets while maintaining a 260 | relatively small memory footprint. They provide a zero percent false negative rate and a 261 | small false positive rate. 262 | `more information `__ 263 | 264 | 265 | Import, Initialize, and Train 266 | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ 267 | .. code:: python3 268 | 269 | >>> from probables import (QuotientFilter); qf = QuotientFilter(quotient=22) 270 | >>> with open('war_and_peace.txt', 'r') as fp: 271 | >>> for line in fp: 272 | >>> for word in line.split(): 273 | >>> qf.add(word.lower()) # add each word to the quotient filter! 274 | 275 | 276 | Query the Quotient Filter 277 | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ 278 | .. code:: python3 279 | 280 | >>> words_to_check = ['borzoi', 'diametrically', 'fleches', 'rain', 'foo'] 281 | >>> for word in words_to_check: 282 | >>> print(qf.check(word)) # prints: True, True, True, True, False 283 | 284 | Custom Hashing Functions 285 | ---------------------------------- 286 | In many instances, to get the best raw performance out of the data structures, 287 | it is wise to use a hashing algorithm that is not pure python. It is recommended 288 | to use one that is compiled, such as `mmh3 <https://github.com/hajimes/mmh3>`__ 289 | or `pyhash <https://github.com/flier/pyfasthash>`__, or even the built in 290 | cryptographic hashes. 291 | 292 | Some pre-defined hashing strategies are provided that use built in 293 | cryptographic hashes. 294 | 295 | To use a pre-defined alternative hashing strategy: 296 | 297 | .. code:: python3 298 | 299 | >>> from probables import (BloomFilter) 300 | >>> from probables.hashes import (default_sha256, default_md5) 301 | >>> blm = BloomFilter(est_elements=1000, false_positive_rate=0.05, 302 | hash_function=default_sha256) 303 | >>> blm.add('google.com') 304 | >>> blm.check('facebook.com') # should return False 305 | >>> blm.check('google.com') # should return True 306 | 307 | Decorators are provided to help make generating hashing strategies easier. 308 | 309 | Defining a hashing function using the provided decorators: 310 | 311 | .. code:: python3 312 | 313 | >>> import mmh3 # murmur hash 3 implementation (pip install mmh3) 314 | >>> from probables.hashes import (hash_with_depth_bytes) 315 | >>> from probables import (BloomFilter) 316 | >>> 317 | >>> @hash_with_depth_bytes 318 | >>> def my_hash(key, depth): 319 | >>> return mmh3.hash_bytes(key, seed=depth) 320 | >>> 321 | >>> blm = BloomFilter(est_elements=1000, false_positive_rate=0.05, hash_function=my_hash) 322 | 323 | .. 
code:: python3 324 | 325 | >>> import hashlib 326 | >>> from probables.hashes import (hash_with_depth_int) 327 | >>> from probables import (BloomFilter) 328 | >>> from probables.constants import (UINT64_T_MAX) 329 | >>> @hash_with_depth_int 330 | >>> def my_hash(key, seed=0, encoding='utf-8'): 331 | >>> max64mod = UINT64_T_MAX + 1 332 | >>> val = int(hashlib.sha512(key.encode(encoding)).hexdigest(), 16) 333 | >>> return val % max64mod 334 | >>> 335 | >>> blm = BloomFilter(est_elements=1000, false_positive_rate=0.05, hash_function=my_hash) 336 | 337 | Generate a completely different hashing strategy 338 | 339 | .. code:: python3 340 | 341 | >>> import hashlib 342 | >>> from probables.constants import (UINT64_T_MAX) 343 | >>> def my_hash(key, depth, encoding='utf-8'): 344 | >>> max64mod = UINT64_T_MAX + 1 345 | >>> results = list() 346 | >>> for i in range(0, depth): 347 | >>> tmp = key[i:] + key[:i] 348 | >>> val = int(hashlib.sha512(tmp.encode(encoding)).hexdigest(), 16) 349 | >>> results.append(val % max64mod) 350 | >>> return results 351 | 352 | 353 | Indices and Tables 354 | ================== 355 | 356 | * :ref:`home` 357 | * :ref:`api` 358 | * :ref:`genindex` 359 | * :ref:`modindex` 360 | * :ref:`search` 361 | -------------------------------------------------------------------------------- /probables/__init__.py: -------------------------------------------------------------------------------- 1 | """pyprobables module""" 2 | 3 | from probables.blooms import ( 4 | BloomFilter, 5 | BloomFilterOnDisk, 6 | CountingBloomFilter, 7 | ExpandingBloomFilter, 8 | RotatingBloomFilter, 9 | ) 10 | from probables.countminsketch import CountMeanMinSketch, CountMeanSketch, CountMinSketch, HeavyHitters, StreamThreshold 11 | from probables.cuckoo import CountingCuckooFilter, CuckooFilter 12 | from probables.exceptions import ( 13 | CuckooFilterFullError, 14 | InitializationError, 15 | NotSupportedError, 16 | ProbablesBaseException, 17 | RotatingBloomFilterError, 18 | ) 19 | from probables.quotientfilter import QuotientFilter 20 | from probables.utilities import Bitarray 21 | 22 | __author__ = "Tyler Barrus" 23 | __maintainer__ = "Tyler Barrus" 24 | __email__ = "barrust@gmail.com" 25 | __license__ = "MIT" 26 | __version__ = "0.6.1" 27 | __credits__: list[str] = [] 28 | __url__ = "https://github.com/barrust/pyprobables" 29 | __bugtrack_url__ = "https://github.com/barrust/pyprobables/issues" 30 | 31 | __all__ = [ 32 | "BloomFilter", 33 | "BloomFilterOnDisk", 34 | "CountingBloomFilter", 35 | "CountMinSketch", 36 | "CountMeanSketch", 37 | "CountMeanMinSketch", 38 | "HeavyHitters", 39 | "StreamThreshold", 40 | "CuckooFilter", 41 | "CountingCuckooFilter", 42 | "InitializationError", 43 | "NotSupportedError", 44 | "ProbablesBaseException", 45 | "CuckooFilterFullError", 46 | "ExpandingBloomFilter", 47 | "RotatingBloomFilter", 48 | "RotatingBloomFilterError", 49 | "QuotientFilter", 50 | "Bitarray", 51 | ] 52 | -------------------------------------------------------------------------------- /probables/blooms/__init__.py: -------------------------------------------------------------------------------- 1 | """Bloom Filters""" 2 | 3 | from probables.blooms.bloom import BloomFilter, BloomFilterOnDisk 4 | from probables.blooms.countingbloom import CountingBloomFilter 5 | from probables.blooms.expandingbloom import ExpandingBloomFilter, RotatingBloomFilter 6 | 7 | __all__ = [ 8 | "BloomFilter", 9 | "BloomFilterOnDisk", 10 | "CountingBloomFilter", 11 | "ExpandingBloomFilter", 12 | "RotatingBloomFilter", 13 | ] 14 | 
--------------------------------------------------------------------------------
/probables/blooms/countingbloom.py:
--------------------------------------------------------------------------------
1 | """CountingBloomFilter, python implementation
2 | License: MIT
3 | Author: Tyler Barrus (barrust@gmail.com)
4 | URL: https://github.com/barrust/counting_bloom
5 | """
6 | 
7 | from array import array
8 | from collections.abc import ByteString
9 | from pathlib import Path
10 | from struct import Struct
11 | from typing import Union
12 | 
13 | from probables.blooms.bloom import BloomFilter
14 | from probables.constants import UINT32_T_MAX, UINT64_T_MAX
15 | from probables.exceptions import InitializationError
16 | from probables.hashes import HashFuncT, HashResultsT, KeyT
17 | from probables.utilities import is_hex_string, is_valid_file, resolve_path
18 | 
19 | MISMATCH_MSG = "The parameter second must be of type CountingBloomFilter"
20 | 
21 | 
22 | def _verify_not_type_mismatch(second: "CountingBloomFilter") -> bool:
23 |     """verify that there is not a type mismatch"""
24 |     return isinstance(second, (CountingBloomFilter))
25 | 
26 | 
27 | class CountingBloomFilter(BloomFilter):
28 |     """Simple Counting Bloom Filter implementation for use in python;
29 |     It can read and write the same format as the c version
30 |     (https://github.com/barrust/counting_bloom)
31 | 
32 |     Args:
33 |         est_elements (int): The number of estimated elements to be added
34 |         false_positive_rate (float): The desired false positive rate
35 |         filepath (str): Path to file to load
36 |         hex_string (str): Hex based representation to be loaded
37 |         hash_function (function): Hashing strategy function to use `hf(key, number)`
38 |     Returns:
39 |         CountingBloomFilter: A Counting Bloom Filter object
40 | 
41 |     Note:
42 |         Initialization order of operations:
43 |             1) From file
44 |             2) From Hex String
45 |             3) From params"""
46 | 
47 |     __slots__ = ("_filepath",)
48 | 
49 |     def __init__(
50 |         self,
51 |         est_elements: Union[int, None] = None,
52 |         false_positive_rate: Union[float, None] = None,
53 |         filepath: Union[str, Path, None] = None,
54 |         hex_string: Union[str, None] = None,
55 |         hash_function: Union[HashFuncT, None] = None,
56 |     ) -> None:
57 |         """setup the basic values needed"""
58 |         self._filepath = None
59 |         super().__init__(est_elements, false_positive_rate, filepath, hex_string, hash_function)
60 | 
61 |     def _load_init(self, filepath, hash_function, hex_string, est_elements, false_positive_rate):
62 |         """Handle setting params and loading everything as needed"""
63 |         self._bits_per_elm = 1.0
64 |         self._type = "counting"
65 |         self._typecode = "I"
66 | 
67 |         if is_valid_file(filepath):
68 |             self._filepath = resolve_path(filepath)
69 |             self._load(self._filepath, hash_function)
70 |         elif is_hex_string(hex_string):
71 |             self._load_hex(hex_string, hash_function)
72 |         else:
73 |             if est_elements is None or false_positive_rate is None:
74 |                 raise InitializationError("Insufficient parameters to set up the Counting Bloom Filter")
75 |             # calc values
76 |             fpr, n_hashes, n_bits = self._get_optimized_params(est_elements, false_positive_rate)
77 |             self._set_values(est_elements, fpr, n_hashes, n_bits, hash_function)
78 |             self._bloom_length = n_bits
79 |             self._bloom = array(self._typecode, [0]) * self._bloom_length
80 | 
81 |     _IMPT_STRUCT = Struct("I")
82 | 
83 |     @classmethod
84 |     def frombytes(cls, b: ByteString, hash_function: Union[HashFuncT, None] = None) -> "CountingBloomFilter":
85 |         """
86 |         Args:
87 |             b (ByteString): the bytes to load as a Counting Bloom
Filter 88 | hash_function (function): Hashing strategy function to use `hf(key, number)` 89 | Returns: 90 | CountingBloomFilter: A Counting Bloom Filter object 91 | """ 92 | offset = cls._FOOTER_STRUCT.size 93 | est_els, els_added, fpr, n_hashes, n_bits = cls._parse_footer(cls._FOOTER_STRUCT, bytes(b[-1 * offset :])) 94 | blm = CountingBloomFilter(est_elements=est_els, false_positive_rate=fpr, hash_function=hash_function) 95 | blm._set_values(est_els, fpr, n_hashes, n_bits, hash_function) 96 | blm._els_added = els_added 97 | blm._parse_bloom_array(b, cls._IMPT_STRUCT.size * blm.bloom_length) 98 | return blm 99 | 100 | def __str__(self) -> str: 101 | """string representation of the counting bloom filter""" 102 | on_disk = "no" if self.is_on_disk is False else "yes" 103 | 104 | cnt = sum(x for x in self._bloom if x > 0) 105 | total = sum(self._bloom) 106 | largest = max(self._bloom) 107 | largest_idx = (self._bloom).index(largest) 108 | fullness = cnt / self.number_bits 109 | els_added = total // self.number_hashes 110 | 111 | return ( 112 | "CountingBloom:\n" 113 | f"\tbits: {self.number_bits}\n" 114 | f"\testimated elements: {self.estimated_elements}\n" 115 | f"\tnumber hashes: {self.number_hashes}\n" 116 | f"\tmax false positive rate: {self.false_positive_rate:.6f}\n" 117 | f"\telements added: {self.elements_added}\n" 118 | f"\tcurrent false positive rate: {self.current_false_positive_rate():.6f}\n" 119 | f"\tis on disk: {on_disk}\n" 120 | f"\tindex fullness: {fullness:.6}\n" 121 | f"\tmax index usage: {largest}\n" 122 | f"\tmax index id: {largest_idx}\n" 123 | f"\tcalculated elements: {els_added}\n" 124 | ) 125 | 126 | def add(self, key: KeyT, num_els: int = 1) -> int: # type: ignore 127 | """Add the key to the Counting Bloom Filter 128 | 129 | Args: 130 | key (str): The element to be inserted 131 | num_els (int): Number of times to insert the element 132 | Returns: 133 | int: Maximum number of insertions""" 134 | return self.add_alt(self.hashes(key), num_els) 135 | 136 | def add_alt(self, hashes: HashResultsT, num_els: int = 1) -> int: # type: ignore 137 | """Add the element represented by hashes into the Counting Bloom Filter 138 | 139 | Args: 140 | hashes (list): A list of integers representing the key to insert 141 | num_els (int): Number of times to insert the element 142 | Returns: 143 | int: Maximum number of insertions""" 144 | # NOTE: this will increment indices each time it is viewed. 
Not sure if that is "correct"
145 |         # if not then we will need to update this and the C version
146 |         indices = [hashes[i] % self._bloom_length for i in range(self._number_hashes)]
147 |         vals = [self._bloom[k] + num_els for k in indices]
148 |         for i, v in enumerate(vals):
149 |             k = indices[i]
150 |             if v > UINT32_T_MAX:
151 |                 self._bloom[k] = UINT32_T_MAX
152 |                 vals[i] = UINT32_T_MAX
153 |             else:
154 |                 self._bloom[k] += num_els  # This keeps the original methodology
155 |         self.elements_added = min(self.elements_added + num_els, UINT64_T_MAX)
156 |         return min(vals)
157 | 
158 |     def check(self, key: KeyT) -> int:  # type: ignore
159 |         """Check if the key is likely in the Counting Bloom Filter
160 | 
161 |         Args:
162 |             key (str): The element to be checked
163 |         Returns:
164 |             int: Maximum number of insertions"""
165 |         return self.check_alt(self.hashes(key))
166 | 
167 |     def check_alt(self, hashes: HashResultsT) -> int:  # type: ignore
168 |         """Check if the element represented by hashes is in the Counting
169 |         Bloom Filter
170 | 
171 |         Args:
172 |             hashes (list): A list of integers representing the key to check
173 |         Returns:
174 |             int: Maximum number of insertions"""
175 |         return min(self._bloom[x % self.number_bits] for x in hashes)
176 | 
177 |     def remove(self, key: KeyT, num_els: int = 1) -> int:
178 |         """Remove the element from the counting bloom
179 | 
180 |         Args:
181 |             key (str): The element to be removed
182 |             num_els (int): Number of times to remove the element
183 |         Returns:
184 |             int: Maximum number of insertions after the removal"""
185 |         return self.remove_alt(self.hashes(key), num_els)
186 | 
187 |     def remove_alt(self, hashes: HashResultsT, num_els: int = 1) -> int:
188 |         """Remove the element represented by hashes from the Counting Bloom Filter
189 | 
190 |         Args:
191 |             hashes (list): A list of integers representing the key to remove
192 |             num_els (int): Number of times to remove the element
193 |         Returns:
194 |             int: Maximum number of insertions after the removal"""
195 | 
196 |         indices = [hashes[i] % self._bloom_length for i in range(self._number_hashes)]
197 |         vals = [self._bloom[k] for k in indices]
198 |         min_val = min(vals)
199 |         if min_val == UINT32_T_MAX:  # cannot remove if we have hit the max
200 |             return UINT32_T_MAX
201 |         if min_val == 0:
202 |             return 0
203 | 
204 |         to_remove = num_els if min_val > num_els else min_val
205 |         for k in indices:
206 |             if self._bloom[k] < UINT32_T_MAX:  # only remove if less than UINT32_T_MAX
207 |                 self._bloom[k] -= to_remove
208 |         self.elements_added -= to_remove
209 |         return min_val - to_remove
210 | 
211 |     def intersection(self, second: "CountingBloomFilter") -> Union["CountingBloomFilter", None]:  # type: ignore
212 |         """Take the intersection of two Counting Bloom Filters
213 | 
214 |         Args:
215 |             second (CountingBloomFilter): The Bloom Filter with which to take the intersection
216 |         Returns:
217 |             CountingBloomFilter: The new Counting Bloom Filter containing the intersection
218 |         Raises:
219 |             TypeError: When second is not a :class:`CountingBloomFilter`
220 |         Note:
221 |             The elements_added property will be set to the estimated number of unique elements \
222 |             added as found in estimate_elements()
223 |         Note:
224 |             If `second` is not of the same size (false_positive_rate and est_elements) then \
225 |             this will return `None`"""
226 |         if not _verify_not_type_mismatch(second):
227 |             raise TypeError(MISMATCH_MSG)
228 | 
229 |         if self._verify_bloom_similarity(second) is False:
230 |             return None
231 |         res = CountingBloomFilter(
232 |             est_elements=self.estimated_elements,
233 |             false_positive_rate=self.false_positive_rate,
234 |             hash_function=self.hash_function,
235 |         )
236 | 
237 |         for i in range(self.bloom_length):
238 |             if self._bloom[i] > 0 and second._bloom[i] > 0:
239 |                 tmp = self._bloom[i] + second._bloom[i]
240 |                 res.bloom[i] = tmp
241 |         res.elements_added = res.estimate_elements()
242 |         return res
243 | 
244 |     def jaccard_index(self, second: "CountingBloomFilter") -> Union[float, None]:  # type:ignore
245 |         """Take the Jaccard Index of two Counting Bloom Filters
246 | 
247 |         Args:
248 |             second (CountingBloomFilter): The Bloom Filter with which to take the jaccard index
249 |         Returns:
250 |             float: A numeric value between 0 and 1 where 1 is identical and 0 means completely different
251 |         Raises:
252 |             TypeError: When second is not a :class:`CountingBloomFilter`
253 |         Note:
254 |             The Jaccard Index is based on the unique set of elements added and not the number of each element added
255 |         Note:
256 |             If `second` is not of the same size (false_positive_rate and est_elements) then this will return `None`"""
257 |         if not _verify_not_type_mismatch(second):
258 |             raise TypeError(MISMATCH_MSG)
259 | 
260 |         if self._verify_bloom_similarity(second) is False:
261 |             return None
262 | 
263 |         count_union = 0
264 |         count_inter = 0
265 |         for i in range(self.bloom_length):
266 |             if self._bloom[i] > 0 or second._bloom[i] > 0:
267 |                 count_union += 1
268 |             if self._bloom[i] > 0 and second._bloom[i] > 0:
269 |                 count_inter += 1
270 |         if count_union == 0:
271 |             return 1.0
272 |         return count_inter / count_union
273 | 
274 |     def union(self, second: "CountingBloomFilter") -> Union["CountingBloomFilter", None]:  # type:ignore
275 |         """Return a new Counting Bloom Filter that contains the union of
276 |         the two
277 | 
278 |         Args:
279 |             second (CountingBloomFilter): The Counting Bloom Filter with which to calculate the union
280 |         Returns:
281 |             CountingBloomFilter: The new Counting Bloom Filter containing the union
282 |         Raises:
283 |             TypeError: When second is not a :class:`CountingBloomFilter`
284 |         Note:
285 |             The elements_added property will be set to the estimated number of unique elements added as \
286 |             found in estimate_elements()
287 |         Note:
288 |             If `second` is not of the same size (false_positive_rate and est_elements) then this will return `None`"""
289 |         if not _verify_not_type_mismatch(second):
290 |             raise TypeError(MISMATCH_MSG)
291 | 
292 |         if self._verify_bloom_similarity(second) is False:
293 |             return None
294 |         res = CountingBloomFilter(
295 |             est_elements=self.estimated_elements,
296 |             false_positive_rate=self.false_positive_rate,
297 |             hash_function=self.hash_function,
298 |         )
299 |         for i in range(self.bloom_length):
300 |             tmp = self._bloom[i] + second._bloom[i]
301 |             res._bloom[i] = tmp
302 |         res.elements_added = res.estimate_elements()
303 |         return res
304 | 
305 |     def _cnt_number_bits_set(self) -> int:
306 |         """calculate the total number of set bits in the bloom"""
307 |         return sum(1 for x in self._bloom if x > 0)
308 | 
--------------------------------------------------------------------------------
/probables/blooms/expandingbloom.py:
--------------------------------------------------------------------------------
1 | """Expanding and Rotating BloomFilter, python implementations
2 | License: MIT
3 | Author: Tyler Barrus (barrust@gmail.com)
4 | URL: https://github.com/barrust/pyprobables
5 | """
6 | 
7 | from array import array
8 | from collections.abc import ByteString
9 | from io import BytesIO, IOBase
10 | from mmap import mmap
11 | from pathlib import Path
12 | from struct import Struct
13 | from typing import Union
14 | 
15 | from probables.blooms.bloom import BloomFilter
16 | from probables.exceptions import RotatingBloomFilterError
17 | from probables.hashes import HashFuncT, HashResultsT, KeyT, default_fnv_1a
18 | from probables.utilities import MMap, is_valid_file, resolve_path
19 | 
20 | 
21 | class ExpandingBloomFilter:
22 |     """Simple expanding Bloom Filter implementation for use in python; the
23 |     Bloom Filter will automatically expand, or grow, if the false
24 |     positive rate is about to become greater than the desired false
25 |     positive rate.
26 | 
27 |     Args:
28 |         est_elements (int): The number of estimated elements to be added
29 |         false_positive_rate (float): The desired false positive rate
30 |         filepath (str): Path to file to load
31 |         hash_function (function): Hashing strategy function to use `hf(key, number)`
32 |     Returns:
33 |         ExpandingBloomFilter: An expanding Bloom Filter object
34 |     Note:
35 |         Initialization order of operations:
36 |             1) Filepath
37 |             2) est_elements and false_positive_rate"""
38 | 
39 |     __slots__ = (
40 |         "_blooms",
41 |         "__fpr",
42 |         "__est_elements",
43 |         "__hash_func",
44 |         "_added_elements",
45 |     )
46 | 
47 |     def __init__(
48 |         self,
49 |         est_elements: Union[int, None] = None,
50 |         false_positive_rate: Union[float, None] = None,
51 |         filepath: Union[str, Path, None] = None,
52 |         hash_function: Union[HashFuncT, None] = None,
53 |     ):
54 |         """initialize"""
55 |         self._blooms = []  # type: ignore
56 |         self.__fpr = false_positive_rate if false_positive_rate is not None else 0.0
57 |         self.__est_elements = est_elements if est_elements is not None else 100
58 |         self.__hash_func: HashFuncT
59 |         self._added_elements = 0  # total added...
60 | 
61 |         if hash_function is not None:
62 |             self.__hash_func = hash_function
63 |         else:
64 |             self.__hash_func = default_fnv_1a
65 | 
66 |         if filepath is not None and is_valid_file(filepath):
67 |             self.__load(filepath)
68 |         else:
69 |             # add in the initial bloom filter!
70 |             self.__add_bloom_filter()
71 | 
72 |     __FOOTER_STRUCT = Struct("QQQf")
73 |     __S_INT64_STRUCT = Struct("Q")
74 |     _BLOOM_ELEMENT_SIZE = Struct("B").size
75 | 
76 |     @classmethod
77 |     def frombytes(cls, b: ByteString, hash_function: Union[HashFuncT, None] = None) -> "ExpandingBloomFilter":
78 |         """
79 |         Args:
80 |             b (ByteString): The bytes to load as an Expanding Bloom Filter
81 |             hash_function (function): Hashing strategy function to use `hf(key, number)`
82 |         Returns:
83 |             ExpandingBloomFilter: An Expanding Bloom Filter object
84 |         """
85 |         size, est_els, added_els, fpr = cls._parse_footer(b)
86 |         blm = ExpandingBloomFilter(est_elements=est_els, false_positive_rate=fpr, hash_function=hash_function)
87 |         blm._parse_blooms(b, size)
88 |         blm._added_elements = added_els
89 |         return blm
90 | 
91 |     def __contains__(self, key: KeyT) -> bool:
92 |         """setup the `in` functionality"""
93 |         return self.check(key)
94 | 
95 |     def __bytes__(self) -> bytes:
96 |         """Export bloom filter to `bytes`"""
97 | 
98 |         with BytesIO() as f:
99 |             self.export(f)
100 |             return f.getvalue()
101 | 
102 |     @property
103 |     def expansions(self) -> int:
104 |         """int: The number of expansions"""
105 |         return len(self._blooms) - 1
106 | 
107 |     @property
108 |     def false_positive_rate(self) -> float:
109 |         """float: The desired false positive rate of the expanding Bloom Filter"""
110 |         return self.__fpr
111 | 
112 |     @property
113 |     def estimated_elements(self) -> int:
114 |         """int: The original number of elements estimated to be in the Bloom Filter"""
115 |         return self.__est_elements
116 | 
117 |     @property
118 |     def elements_added(self) -> int:
119 |         """int: The total number of elements added"""
120 |         return self._added_elements
121 | 
122 |     @property
123 |     def hash_function(self) -> HashFuncT:
124 |         """HashFuncT: The hashing strategy function used by the filter"""
125 |         return self.__hash_func
126 | 
127 |     def push(self) -> None:
128 |         """Push a new expansion onto the Bloom Filter"""
129 |         self.__add_bloom_filter()
130 | 
131 |     def check(self, key: KeyT) -> bool:
132 |         """Check to see if the key is in the Bloom Filter
133 | 
134 |         Args:
135 |             key (str): The key to check for in the Bloom Filter
136 |         Returns:
137 |             bool: `True` if the element is likely present; `False` if definitely not present"""
138 |         hashes = self._blooms[0].hashes(key)
139 |         return self.check_alt(hashes)
140 | 
141 |     def check_alt(self, hashes: HashResultsT) -> bool:
142 |         """Check to see if the hashes are in the Bloom Filter
143 | 
144 |         Args:
145 |             hashes (list): The hash representation to check for in the Bloom Filter
146 |         Returns:
147 |             bool: `True` if the element is likely present; `False` if definitely not present"""
148 |         return any(blm.check_alt(hashes) for blm in self._blooms)
149 | 
150 |     def add(self, key: KeyT, force: bool = False) -> None:
151 |         """Add the key to the Bloom Filter
152 | 
153 |         Args:
154 |             key (str): The element to be inserted
155 |             force (bool): `True` will force it to be inserted, even if it likely has been inserted \
156 |                 before; `False` will only insert if not found in the Bloom Filter"""
157 |         hashes = self._blooms[0].hashes(key)
158 |         self.add_alt(hashes, force)
159 | 
160 |     def add_alt(self, hashes: HashResultsT, force: bool = False) -> None:
161 |         """Add the element represented by hashes into the Bloom Filter
162 | 
163 |         Args:
164 |             hashes (list): A list of integers representing the key to insert
165 |             force (bool): `True` will force it to be inserted, even if it likely has been inserted \
166 |                 before; `False` will only insert if not found in the Bloom Filter"""
167 |         self._added_elements += 1
168 |         if force or not self.check_alt(hashes):
169 |             self.__check_for_growth()
170 |             self._blooms[-1].add_alt(hashes)
171 | 
172 |     def __add_bloom_filter(self):
173 |         """build a new bloom and add it on!"""
174 |         blm = BloomFilter(
175 |             est_elements=self.__est_elements,
176 |             false_positive_rate=self.__fpr,
177 |             hash_function=self.__hash_func,
178 |         )
179 |         self._blooms.append(blm)
180 | 
181 |     def __check_for_growth(self):
182 |         """determine if the bloom filter should automatically grow"""
183 |         if self._blooms[-1].elements_added >= self.__est_elements:
184 |             self.__add_bloom_filter()
185 | 
186 |     def export(self, file: Union[Path, str, IOBase, mmap]) -> None:
187 |         """Export an expanding Bloom Filter, or subclass, to disk
188 | 
189 |         Args:
190 |             file (str): The path to the file to export"""
191 |         if not isinstance(file, (IOBase, mmap)):
192 |             file = resolve_path(file)
193 |             with open(file, "wb") as filepointer:
194 |                 self.export(filepointer)  # type:ignore
195 |         else:
196 |             filepointer = file  # type:ignore
197 |             # add all the different Bloom bit arrays...
198 |             for blm in self._blooms:
199 |                 filepointer.write(self.__S_INT64_STRUCT.pack(blm.elements_added))
200 |                 blm.bloom.tofile(filepointer)
201 |             filepointer.write(
202 |                 self.__FOOTER_STRUCT.pack(
203 |                     len(self._blooms),
204 |                     self.estimated_elements,
205 |                     self.elements_added,
206 |                     self.false_positive_rate,
207 |                 )
208 |             )
209 | 
210 |     def __load(self, file: Union[Path, str, IOBase, mmap]):
211 |         """load a file"""
212 |         if not isinstance(file, (IOBase, mmap)):
213 |             file = resolve_path(file)
214 |             with MMap(file) as filepointer:
215 |                 self.__load(filepointer)
216 |         else:
217 |             size, est_els, els_added, fpr = self._parse_footer(file)  # type: ignore
218 |             self._blooms = []
219 |             self._added_elements = els_added
220 |             self.__fpr = fpr
221 |             self.__est_elements = est_els
222 |             self._parse_blooms(file, size)  # type:ignore
223 | 
224 |     @classmethod
225 |     def _parse_footer(cls, b: ByteString) -> tuple[int, int, int, float]:
226 |         offset = cls.__FOOTER_STRUCT.size
227 |         size, est_els, els_added, fpr = cls.__FOOTER_STRUCT.unpack(bytes(b[-1 * offset :]))
228 |         return int(size), int(est_els), int(els_added), float(fpr)
229 | 
230 |     def _parse_blooms(self, b: ByteString, size: int) -> None:
231 |         # reset the bloom list
232 |         self._blooms = []
233 |         blm_size = 0
234 |         start = 0
235 |         end = 0
236 |         for _ in range(size):
237 |             blm = BloomFilter(
238 |                 est_elements=self.__est_elements,
239 |                 false_positive_rate=self.__fpr,
240 |                 hash_function=self.__hash_func,
241 |             )
242 |             if blm_size == 0:
243 |                 blm_size = self._BLOOM_ELEMENT_SIZE * blm.bloom_length
244 |             end = start + self.__S_INT64_STRUCT.size + blm_size
245 |             blm._els_added = int(self.__S_INT64_STRUCT.unpack(bytes(b[start : start + self.__S_INT64_STRUCT.size]))[0])
246 |             blm._bloom = array("B", bytes(b[start + self.__S_INT64_STRUCT.size : end]))
247 |             self._blooms.append(blm)
248 |             start = end
249 | 
250 | 
251 | class RotatingBloomFilter(ExpandingBloomFilter):
252 |     """Simple Rotating Bloom Filter implementation that allows for the "older"
253 |     elements added to be removed, in chunks. As the queue fills up, those
254 |     elements inserted earlier will be bulk removed. This also provides the
255 |     user with the opportunity to force the removal instead of it being time
256 |     based.
257 | 
258 |     Args:
259 |         est_elements (int): The number of estimated elements to be added
260 |         false_positive_rate (float): The desired false positive rate
261 |         max_queue_size (int): The number used to determine the maximum number of Bloom Filters. \
262 |             Total elements added is based on `max_queue_size * est_elements`
263 |         filepath (str): Path to file to load
264 |         hash_function (function): Hashing strategy function to use `hf(key, number)`
265 |     Note:
266 |         Initialization order of operations:
267 |             1) Filepath
268 |             2) est_elements and false_positive_rate
269 |     """
270 | 
271 |     __slots__ = ("_queue_size",)
272 | 
273 |     def __init__(
274 |         self,
275 |         est_elements: Union[int, None] = None,
276 |         false_positive_rate: Union[float, None] = None,
277 |         max_queue_size: int = 10,
278 |         filepath: Union[str, Path, None] = None,
279 |         hash_function: Union[HashFuncT, None] = None,
280 |     ) -> None:
281 |         """initialize"""
282 |         super().__init__(
283 |             est_elements=est_elements,
284 |             false_positive_rate=false_positive_rate,
285 |             filepath=filepath,
286 |             hash_function=hash_function,
287 |         )
288 |         self._queue_size = max_queue_size
289 | 
290 |     @classmethod
291 |     def frombytes(  # type:ignore
292 |         cls, b: ByteString, max_queue_size: int, hash_function: Union[HashFuncT, None] = None
293 |     ) -> "RotatingBloomFilter":
294 |         """
295 |         Args:
296 |             b (ByteString): The bytes to load as a Rotating Bloom Filter
297 |             max_queue_size (int): The number used to determine the maximum number \
298 |                 of Bloom Filters. Total elements added is based on `max_queue_size * est_elements`
299 |             hash_function (function): Hashing strategy function to use `hf(key, number)`
300 |         Returns:
301 |             RotatingBloomFilter: A Rotating Bloom Filter object
302 |         """
303 |         size, est_els, added_els, fpr = cls._parse_footer(b)
304 |         blm = RotatingBloomFilter(
305 |             est_elements=est_els, false_positive_rate=fpr, max_queue_size=max_queue_size, hash_function=hash_function
306 |         )
307 |         blm._parse_blooms(b, size)
308 |         blm._added_elements = added_els
309 |         return blm
310 | 
311 |     @property
312 |     def max_queue_size(self) -> int:
313 |         """int: The maximum size for the queue"""
314 |         return self._queue_size
315 | 
316 |     @property
317 |     def current_queue_size(self) -> int:
318 |         """int: The current size of the queue"""
319 |         return len(self._blooms)
320 | 
321 |     def add_alt(self, hashes: HashResultsT, force: bool = False) -> None:
322 |         """Add the element represented by hashes into the Bloom Filter
323 | 
324 |         Args:
325 |             hashes (list): A list of integers representing the key to insert
326 |             force (bool): `True` will force it to be inserted, even if it likely has been inserted \
327 |                 before; `False` will only insert if not found in the Bloom Filter"""
328 |         self._added_elements += 1
329 |         if force or not self.check_alt(hashes):
330 |             self.__rotate_bloom_filter()
331 |             self._blooms[-1].add_alt(hashes)
332 | 
333 |     def pop(self) -> None:
334 |         """Pop the oldest Bloom Filter off of the queue without pushing a new
335 |         Bloom Filter onto the queue
336 | 
337 |         Raises:
338 |             RotatingBloomFilterError: Unable to rotate the Bloom Filter"""
339 |         if self.current_queue_size == 1:
340 |             msg = "Popping a Bloom Filter will result in an unusable system!"
341 |             raise RotatingBloomFilterError(msg)
342 |         self._blooms.pop(0)
343 | 
344 |     def push(self) -> None:
345 |         """Push a new bloom filter onto the queue and rotate if necessary"""
346 |         self.__rotate_bloom_filter(force=True)
347 | 
348 |     def __rotate_bloom_filter(self, force: bool = False):
349 |         """handle determining if/when the Bloom Filter queue needs to be rotated"""
350 |         blm = self._blooms[-1]
351 |         ready_to_rotate = blm.elements_added == blm.estimated_elements
352 |         no_need_to_pop = self.current_queue_size < self._queue_size
353 |         if force and no_need_to_pop:
354 |             self.__add_bloom_filter()
355 |         elif force:  # must need to be pop'd first!
356 |             blm = self._blooms.pop(0)
357 |             self.__add_bloom_filter()
358 |         elif ready_to_rotate and no_need_to_pop:
359 |             self.__add_bloom_filter()
360 |         elif ready_to_rotate:
361 |             blm = self._blooms.pop(0)
362 |             self.__add_bloom_filter()
363 | 
364 |     def __add_bloom_filter(self):
365 |         """build a new bloom and add it on!"""
366 |         blm = BloomFilter(
367 |             est_elements=self.estimated_elements,
368 |             false_positive_rate=self.false_positive_rate,
369 |             hash_function=self.hash_function,
370 |         )
371 |         self._blooms.append(blm)
372 | 
--------------------------------------------------------------------------------
/probables/blooms/py.typed:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/barrust/pyprobables/e5d44dac65033421d455681178344eeca9961bd8/probables/blooms/py.typed
--------------------------------------------------------------------------------
/probables/constants.py:
--------------------------------------------------------------------------------
1 | """Project Constants (or basic numerical constants...)"""
2 | 
3 | INT32_T_MIN = -2147483648
4 | INT32_T_MAX = 2147483647
5 | INT64_T_MIN = -9223372036854775808
6 | INT64_T_MAX = 9223372036854775807
7 | UINT32_T_MAX = 2**32 - 1
8 | UINT64_T_MAX = 2**64 - 1
9 | 
--------------------------------------------------------------------------------
/probables/countminsketch/__init__.py:
--------------------------------------------------------------------------------
1 | """Count-Min Sketches"""
2 | 
3 | from probables.countminsketch.countminsketch import (
4 |     CountMeanMinSketch,
5 |     CountMeanSketch,
6 |     CountMinSketch,
7 |     HeavyHitters,
8 |     StreamThreshold,
9 | )
10 | 
11 | __all__ = ["CountMinSketch", "HeavyHitters", "StreamThreshold", "CountMeanSketch", "CountMeanMinSketch"]
12 | 
--------------------------------------------------------------------------------
/probables/countminsketch/py.typed:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/barrust/pyprobables/e5d44dac65033421d455681178344eeca9961bd8/probables/countminsketch/py.typed
--------------------------------------------------------------------------------
/probables/cuckoo/__init__.py:
--------------------------------------------------------------------------------
1 | """Cuckoo Filters"""
2 | 
3 | from probables.cuckoo.countingcuckoo import CountingCuckooFilter
4 | from probables.cuckoo.cuckoo import CuckooFilter
5 | 
6 | __all__ = ["CuckooFilter", "CountingCuckooFilter"]
7 | 
--------------------------------------------------------------------------------
/probables/cuckoo/countingcuckoo.py:
--------------------------------------------------------------------------------
1 | """Counting Cuckoo Filter, python implementation
2 | License: MIT
3 | Author: Tyler Barrus (barrust@gmail.com)
4 | """
5 | 
6 | import random
7 | from array import array
8 | from collections.abc import ByteString
9 | from io import IOBase
10 | from mmap import mmap
11 | from pathlib import Path
12 | from struct import Struct
13 | from typing import Union
14 | 
15 | from probables.cuckoo.cuckoo import CuckooFilter
16 | from probables.exceptions import CuckooFilterFullError
17 | from probables.hashes import KeyT, SimpleHashT
18 | from probables.utilities import MMap, resolve_path
19 | 
20 | 
21 | class CountingCuckooFilter(CuckooFilter):
22 |     """Simple Counting Cuckoo Filter implementation
23 | 
24 |     Args:
25 |         capacity (int): The number of bins
26 |         bucket_size (int): The number of buckets per bin
27 |         max_swaps (int): The number of cuckoo swaps before stopping
28 |         expansion_rate (int): The rate at which to expand
29 |         auto_expand (bool): If the filter should automatically expand
30 |         finger_size (int): The size of the fingerprint to use in bytes \
31 |             (between 1 and 4); exported as 4 bytes; up to the user to \
32 |             reset the size correctly on import
33 |         filepath (str): The path to the file to load or None if no file
34 |     Returns:
35 |         CountingCuckooFilter: A Counting Cuckoo Filter object"""
36 | 
37 |     __slots__ = ("__unique_elements",)
38 | 
39 |     def __init__(
40 |         self,
41 |         capacity: int = 10000,
42 |         bucket_size: int = 4,
43 |         max_swaps: int = 500,
44 |         expansion_rate: int = 2,
45 |         auto_expand: bool = True,
46 |         finger_size: int = 4,
47 |         filepath: Union[str, Path, None] = None,
48 |         hash_function: Union[SimpleHashT, None] = None,
49 |     ) -> None:
50 |         """setup the data structure"""
51 |         self.__unique_elements = 0
52 |         super().__init__(
53 |             capacity,
54 |             bucket_size,
55 |             max_swaps,
56 |             expansion_rate,
57 |             auto_expand,
58 |             finger_size,
59 |             filepath,
60 |             hash_function,
61 |         )
62 | 
63 |     __COUNTING_CUCKOO_FOOTER_STRUCT = Struct("II")
64 |     __BIN_STRUCT = Struct("II")
65 | 
66 |     @classmethod
67 |     def init_error_rate(
68 |         cls,
69 |         error_rate: float,
70 |         capacity: int = 10000,
71 |         bucket_size: int = 4,
72 |         max_swaps: int = 500,
73 |         expansion_rate: int = 2,
74 |         auto_expand: bool = True,
75 |         hash_function: Union[SimpleHashT, None] = None,
76 |     ):
77 |         """Initialize a simple Cuckoo Filter based on error rate
78 | 
79 |         Args:
80 |             error_rate (float): The desired error rate
81 |             capacity (int): The number of bins
82 |             bucket_size (int): The number of buckets per bin
83 |             max_swaps (int): The number of cuckoo swaps before stopping
84 |             expansion_rate (int): The rate at which to expand
85 |             auto_expand (bool): If the filter should automatically expand
86 |             hash_function (function): Hashing strategy function to use `hf(key)`
87 |         Returns:
88 |             CuckooFilter: A Cuckoo Filter object"""
89 |         cku = CountingCuckooFilter(
90 |             capacity=capacity,
91 |             bucket_size=bucket_size,
92 |             auto_expand=auto_expand,
93 |             max_swaps=max_swaps,
94 |             expansion_rate=expansion_rate,
95 |             hash_function=hash_function,
96 |         )
97 |         cku._set_error_rate(error_rate)
98 |         return cku
99 | 
100 |     @classmethod
101 |     def load_error_rate(
102 |         cls, error_rate: float, filepath: Union[str, Path], hash_function: Union[SimpleHashT, None] = None
103 |     ):
104 |         """Initialize a previously exported Cuckoo Filter based on error rate
105 | 
106 |         Args:
107 |             error_rate (float): The desired error rate
108 |             filepath (str): The path to the file to load or None if no file
109 |             hash_function (function): Hashing strategy function to use \
110 |                 `hf(key)`
111 |         Returns:
112 |             CuckooFilter: A Cuckoo Filter object
113 |         """
114 |         filepath = resolve_path(filepath)
115 |         cku = CountingCuckooFilter(filepath=filepath, hash_function=hash_function)
116 |         cku._set_error_rate(error_rate)
117 |         return cku
118 | 
119 |     @classmethod
120 |     def frombytes(
121 |         cls, b: ByteString, error_rate: Union[float, None] = None, hash_function: Union[SimpleHashT, None] = None
122 |     ) -> "CountingCuckooFilter":
123 |         """
124 |         Args:
125 |             b (ByteString): The bytes to load as a Counting Cuckoo Filter
126 |             error_rate (float): The error rate of the cuckoo filter, if used to generate the original filter
127 |             hash_function (function): Hashing strategy function to use `hf(key, number)`
128 |         Returns:
129 |             CountingCuckooFilter: A Counting Cuckoo Filter object"""
130 |         cku = CountingCuckooFilter(hash_function=hash_function)
131 |         cku._load(b)
132 | 
133 |         # if error rate is provided, use it
134 |         cku._set_error_rate(error_rate)
135 |         return cku
136 | 
137 |     def __contains__(self, val: KeyT) -> bool:
138 |         """setup the `in` keyword"""
139 |         return self.check(val) > 0
140 | 
141 |     @property
142 |     def unique_elements(self) -> int:
143 |         """int: unique number of elements inserted"""
144 |         return self.__unique_elements
145 | 
146 |     @property
147 |     def buckets(self) -> list[list["CountingCuckooBin"]]:  # type: ignore
148 |         """list(list): The buckets holding the fingerprints
149 | 
150 |         Note:
151 |             Not settable"""
152 |         return self._buckets
153 | 
154 |     def load_factor(self) -> float:
155 |         """float: How full the Cuckoo Filter is currently"""
156 |         return self.unique_elements / (self.capacity * self.bucket_size)
157 | 
158 |     def add(self, key: KeyT) -> None:
159 |         """Add element key to the filter
160 | 
161 |         Args:
162 |             key (str): The element to add
163 |         Raises:
164 |             CuckooFilterFullError: When element not inserted after maximum number of swaps or 'kicks'"""
165 |         idx_1, idx_2, fingerprint = self._generate_fingerprint_info(key)
166 | 
167 |         is_present = self._check_if_present(idx_1, idx_2, fingerprint)
168 |         if is_present is not None:
169 |             for bucket in self.buckets[is_present]:
170 |                 if fingerprint in bucket:
171 |                     bucket.increment()
172 |                     self._inserted_elements += 1
173 |                     return
174 |         finger = self._insert_fingerprint_alt(fingerprint, idx_1, idx_2)
175 |         self._deal_with_insertion(finger)
176 | 
177 |     def check(self, key: KeyT) -> int:  # type: ignore
178 |         """Check if an element is in the filter
179 | 
180 |         Args:
181 |             key (str): Element to check
182 |         Returns:
183 |             int: The number of times inserted into the filter"""
184 |         idx_1, idx_2, fingerprint = self._generate_fingerprint_info(key)
185 |         is_present = self._check_if_present(idx_1, idx_2, fingerprint)
186 |         val = 0
187 |         if is_present is not None:
188 |             # get the count out!
189 |             for bucket in self.buckets[is_present]:
190 |                 if fingerprint in bucket:
191 |                     val = bucket.count
192 |                     break
193 |         return val
194 | 
195 |     def remove(self, key: KeyT) -> bool:
196 |         """Remove an element from the filter
197 | 
198 |         Args:
199 |             key (str): Element to remove"""
200 |         idx_1, idx_2, fingerprint = self._generate_fingerprint_info(key)
201 |         idx = self._check_if_present(idx_1, idx_2, fingerprint)
202 |         if idx is None:
203 |             return False
204 |         for bucket in self.buckets[idx]:
205 |             if fingerprint in bucket:
206 |                 bucket.decrement()
207 |                 self._inserted_elements -= 1
208 |                 if bucket.count == 0:
209 |                     self.buckets[idx].remove(bucket)
210 |                     self.__unique_elements -= 1
211 |                 return True
212 |         return False  # catch this...
213 | 214 | def expand(self): 215 | """Expand the cuckoo filter""" 216 | self._expand_logic(None) 217 | 218 | def export(self, file: Union[Path, str, IOBase, mmap]) -> None: 219 | """Export cuckoo filter to file 220 | 221 | Args: 222 | file (str): Path to file to export""" 223 | if not isinstance(file, (IOBase, mmap)): 224 | file = resolve_path(file) 225 | with open(file, "wb") as filepointer: 226 | self.export(filepointer) # type:ignore 227 | else: 228 | self.__bucket_decomposition(self.buckets, self.bucket_size).tofile(file) 229 | # now put out the required information at the end 230 | file.write(self.__COUNTING_CUCKOO_FOOTER_STRUCT.pack(self.bucket_size, self.max_swaps)) 231 | 232 | def _insert_fingerprint_alt( 233 | self, fingerprint: int, idx_1: int, idx_2: int, count: int = 1 234 | ) -> Union["CountingCuckooBin", None]: 235 | """insert a fingerprint, but with a count parameter!""" 236 | if self.__insert_element(fingerprint, idx_1, count): 237 | self._inserted_elements += 1 238 | self.__unique_elements += 1 239 | return None 240 | if self.__insert_element(fingerprint, idx_2, count): 241 | self._inserted_elements += 1 242 | self.__unique_elements += 1 243 | return None 244 | 245 | # we didn't insert, so now we need to randomly select one index to use 246 | # and move things around to the other index, if possible, until we 247 | # either move everything around or hit the maximum number of swaps 248 | idx = random.choice([idx_1, idx_2]) 249 | prv_bin = CountingCuckooBin(fingerprint, 1) 250 | for _ in range(self.max_swaps): 251 | # select one element to be swapped out... 252 | swap_elm = random.randint(0, self.bucket_size - 1) 253 | swap_finger = self.buckets[idx][swap_elm] 254 | prv_bin, self.buckets[idx][swap_elm] = swap_finger, prv_bin 255 | 256 | # now find another place to put this fingerprint 257 | index_1, index_2 = self._indicies_from_fingerprint(prv_bin.finger) 258 | 259 | idx = index_2 if idx == index_1 else index_1 260 | 261 | if self.__insert_element(prv_bin.finger, idx, prv_bin.count): 262 | self._inserted_elements += 1 263 | self.__unique_elements += 1 264 | return None 265 | 266 | # if we got here we have an error... 
we might need to know what is left
267 |         return prv_bin
268 | 
269 |     def _check_if_present(self, idx_1: int, idx_2: int, fingerprint: int) -> Union[int, None]:
270 |         """wrapper for checking if fingerprint is already inserted"""
271 |         if fingerprint in [x.finger for x in self.buckets[idx_1]]:
272 |             return idx_1
273 |         if fingerprint in [x.finger for x in self.buckets[idx_2]]:
274 |             return idx_2
275 |         return None
276 | 
277 |     def _load(self, file: Union[Path, str, IOBase, mmap, bytes, ByteString]) -> None:
278 |         """load a cuckoo filter from file"""
279 |         if not isinstance(file, (IOBase, mmap, bytes, bytearray, memoryview)):
280 |             file = resolve_path(file)
281 |             with MMap(file) as filepointer:
282 |                 self._load(filepointer)
283 |         else:
284 |             self._parse_footer(file, self.__COUNTING_CUCKOO_FOOTER_STRUCT)  # type: ignore
285 |             self._inserted_elements = 0
286 |             self._parse_buckets(file)  # type: ignore
287 | 
288 |     def _parse_buckets(self, d: ByteString) -> None:
289 |         """Parse bytes to pull out and set the buckets"""
290 |         bin_size = self.__BIN_STRUCT.size
291 |         self._cuckoo_capacity = (len(bytes(d)) - bin_size) // bin_size // self.bucket_size
292 |         start = 0
293 |         end = bin_size
294 |         self._buckets = []
295 |         for i in range(self.capacity):
296 |             self.buckets.append([])
297 |             for _ in range(self.bucket_size):
298 |                 finger, count = self.__BIN_STRUCT.unpack(bytes(d[start:end]))
299 |                 if finger > 0:
300 |                     ccb = CountingCuckooBin(finger, count)
301 |                     self.buckets[i].append(ccb)
302 |                     self._inserted_elements += count
303 |                     self.__unique_elements += 1
304 |                 start = end
305 |                 end += bin_size
306 | 
307 |     def _expand_logic(self, extra_fingerprint: "CountingCuckooBin") -> None:
308 |         """the logic to actually expand the cuckoo filter"""
309 |         # get all the fingerprints
310 |         fingerprints = self._setup_expand(extra_fingerprint)
311 |         self.__unique_elements = 0  # this needs to be reset!
312 | 313 | for elm in fingerprints: 314 | idx_1, idx_2 = self._indicies_from_fingerprint(elm.finger) 315 | res = self._insert_fingerprint_alt(elm.finger, idx_1, idx_2, elm.count) 316 | if res is not None: # again, this *shouldn't* happen 317 | msg = "The CountingCuckooFilter failed to expand" 318 | raise CuckooFilterFullError(msg) 319 | 320 | def __insert_element(self, fingerprint, idx, count=1) -> bool: 321 | """insert an element""" 322 | if len(self.buckets[idx]) < self.bucket_size: 323 | self.buckets[idx].append(CountingCuckooBin(fingerprint, count)) 324 | return True 325 | return False 326 | 327 | @staticmethod 328 | def __bucket_decomposition(buckets, bucket_size: int) -> array: 329 | """convert a list of buckets into a single array for export""" 330 | arr = array("I") 331 | for bucket in buckets: 332 | for buck in bucket: 333 | arr.extend(buck.get_array()) 334 | leftover = bucket_size - len(bucket) 335 | arr.fromlist([0 for _ in range(leftover * 2)]) 336 | return arr 337 | 338 | 339 | class CountingCuckooBin: 340 | """A container class for the counting cuckoo filter""" 341 | 342 | # keep it lightweight 343 | __slots__ = ["__bin"] 344 | 345 | def __init__(self, fingerprint: int, count: int) -> None: 346 | """init""" 347 | self.__bin = array("I", [fingerprint, count]) 348 | 349 | def __contains__(self, val: int) -> bool: 350 | """setup the `in` construct""" 351 | return self.__bin[0] == val 352 | 353 | def get_array(self): 354 | """return the array implementation""" 355 | return self.__bin 356 | 357 | @property 358 | def finger(self) -> int: 359 | """fingerprint property""" 360 | return self.__bin[0] 361 | 362 | @property 363 | def count(self) -> int: 364 | """count property""" 365 | return self.__bin[1] 366 | 367 | def __repr__(self) -> str: 368 | """how do we represent this?""" 369 | return self.__str__() 370 | 371 | def __str__(self) -> str: 372 | """convert it into a string""" 373 | return f"(fingerprint:{self.__bin[0]} count:{self.__bin[1]})" 374 | 375 | def increment(self) -> int: 376 | """increment""" 377 | self.__bin[1] += 1 378 | return self.__bin[1] 379 | 380 | def decrement(self) -> int: 381 | """decrement""" 382 | self.__bin[1] -= 1 383 | return self.__bin[1] 384 | -------------------------------------------------------------------------------- /probables/cuckoo/cuckoo.py: -------------------------------------------------------------------------------- 1 | """Cuckoo Filter, python implementation 2 | License: MIT 3 | Author: Tyler Barrus (barrust@gmail.com) 4 | """ 5 | 6 | import math 7 | import random 8 | from array import array 9 | from collections.abc import ByteString 10 | from io import BytesIO, IOBase 11 | from mmap import mmap 12 | from numbers import Number 13 | from pathlib import Path 14 | from struct import Struct 15 | from typing import Union 16 | 17 | from probables.exceptions import CuckooFilterFullError, InitializationError 18 | from probables.hashes import KeyT, SimpleHashT, fnv_1a 19 | from probables.utilities import MMap, get_x_bits, is_valid_file, resolve_path 20 | 21 | 22 | class CuckooFilter: 23 | """Simple Cuckoo Filter implementation 24 | 25 | Args: 26 | capacity (int): The number of bins 27 | bucket_size (int): The number of buckets per bin 28 | max_swaps (int): The number of cuckoo swaps before stopping 29 | expansion_rate (int): The rate at which to expand 30 | auto_expand (bool): If the filter should automatically expand 31 | finger_size (int): The size of the fingerprint to use in bytes \ 32 | (between 1 and 4); exported as 4 bytes; up to the 
user to \ 33 | reset the size correctly on import 34 | filepath (str): The path to the file to load or None if no file 35 | hash_function (function): Hashing strategy function to use `hf(key)` 36 | Returns: 37 | CuckooFilter: A Cuckoo Filter object""" 38 | 39 | __slots__ = ( 40 | "_bucket_size", 41 | "_cuckoo_capacity", 42 | "__max_cuckoo_swaps", 43 | "__expansion_rate", 44 | "__auto_expand", 45 | "_fingerprint_size", 46 | "__hash_func", 47 | "_inserted_elements", 48 | "_buckets", 49 | "_error_rate", 50 | ) 51 | 52 | def __init__( 53 | self, 54 | capacity: int = 10000, 55 | bucket_size: int = 4, 56 | max_swaps: int = 500, 57 | expansion_rate: int = 2, 58 | auto_expand: bool = True, 59 | finger_size: int = 4, 60 | filepath: Union[str, Path, None] = None, 61 | hash_function: Union[SimpleHashT, None] = None, 62 | ): 63 | """setup the data structure""" 64 | valid_prms = ( 65 | isinstance(capacity, Number) 66 | and capacity >= 1 67 | and isinstance(bucket_size, Number) 68 | and bucket_size >= 1 69 | and isinstance(max_swaps, Number) 70 | and max_swaps >= 1 71 | ) 72 | if not valid_prms: 73 | msg = "CuckooFilter: capacity, bucket_size, and max_swaps must be an integer greater than 0" 74 | raise InitializationError(msg) 75 | self._bucket_size = int(bucket_size) 76 | self._cuckoo_capacity = int(capacity) 77 | self.__max_cuckoo_swaps = int(max_swaps) 78 | self.__expansion_rate = 2 79 | self.expansion_rate = expansion_rate 80 | self.__auto_expand = True 81 | self.auto_expand = auto_expand 82 | self._fingerprint_size = 32 83 | self.fingerprint_size = finger_size 84 | 85 | if hash_function is None: 86 | self.__hash_func = fnv_1a 87 | else: 88 | self.__hash_func = hash_function # type: ignore 89 | self._inserted_elements = 0 90 | if filepath is None: 91 | self._buckets = [] # type: ignore 92 | for _ in range(self.capacity): 93 | self.buckets.append([]) 94 | elif is_valid_file(filepath): 95 | filepath = resolve_path(filepath) 96 | self._load(filepath) 97 | else: 98 | msg = "CuckooFilter: failed to load provided file" 99 | raise InitializationError(msg) 100 | 101 | self._error_rate = float(self._calc_error_rate()) 102 | 103 | @classmethod 104 | def init_error_rate( 105 | cls, 106 | error_rate: float, 107 | capacity: int = 10000, 108 | bucket_size: int = 4, 109 | max_swaps: int = 500, 110 | expansion_rate: int = 2, 111 | auto_expand: bool = True, 112 | hash_function: Union[SimpleHashT, None] = None, 113 | ): 114 | """Initialize a simple Cuckoo Filter based on error rate 115 | 116 | Args: 117 | error_rate (float): 118 | capacity (int): The number of bins 119 | bucket_size (int): The number of buckets per bin 120 | max_swaps (int): The number of cuckoo swaps before stopping 121 | expansion_rate (int): The rate at which to expand 122 | auto_expand (bool): If the filter should automatically expand 123 | hash_function (function): Hashing strategy function to use \ 124 | `hf(key)` 125 | Returns: 126 | CuckooFilter: A Cuckoo Filter object""" 127 | cku = CuckooFilter( 128 | capacity=capacity, 129 | bucket_size=bucket_size, 130 | auto_expand=auto_expand, 131 | max_swaps=max_swaps, 132 | expansion_rate=expansion_rate, 133 | hash_function=hash_function, 134 | ) 135 | cku._set_error_rate(error_rate) 136 | return cku 137 | 138 | @classmethod 139 | def load_error_rate( 140 | cls, 141 | error_rate: float, 142 | filepath: Union[str, Path], 143 | hash_function: Union[SimpleHashT, None] = None, 144 | ): 145 | """Initialize a previously exported Cuckoo Filter based on error rate 146 | 147 | Args: 148 | error_rate (float): 
149 |             filepath (str): The path to the file to load or None if no file
150 |             hash_function (function): Hashing strategy function to use `hf(key)`
151 |         Returns:
152 |             CuckooFilter: A Cuckoo Filter object"""
153 |         filepath = resolve_path(filepath)
154 |         cku = CuckooFilter(filepath=filepath, hash_function=hash_function)
155 |         cku._set_error_rate(error_rate)
156 |         return cku
157 | 
158 |     @classmethod
159 |     def frombytes(
160 |         cls,
161 |         b: ByteString,
162 |         error_rate: Union[float, None] = None,
163 |         hash_function: Union[SimpleHashT, None] = None,
164 |     ) -> "CuckooFilter":
165 |         """
166 |         Args:
167 |             b (ByteString): The bytes to load as a Cuckoo Filter
168 |             error_rate (float): The error rate of the cuckoo filter, if used to generate the original filter
169 |             hash_function (function): Hashing strategy function to use `hf(key, number)`
170 |         Returns:
171 |             CuckooFilter: A Cuckoo Filter object
172 |         """
173 |         cku = CuckooFilter(hash_function=hash_function)
174 |         cku._load(b)  # type: ignore
175 | 
176 |         # if error rate is provided, use it
177 |         cku._set_error_rate(error_rate)
178 |         return cku
179 | 
180 |     def __contains__(self, key: KeyT) -> bool:
181 |         """setup the `in` keyword"""
182 |         return self.check(key)
183 | 
184 |     def __str__(self):
185 |         """setup what it will print"""
186 |         return (
187 |             f"{self.__class__.__name__}:\n"
188 |             f"\tCapacity: {self.capacity}\n"
189 |             f"\tTotal Bins: {self.capacity * self.bucket_size}\n"
190 |             f"\tLoad Factor: {self.load_factor() * 100}%\n"
191 |             f"\tInserted Elements: {self.elements_added}\n"
192 |             f"\tMax Swaps: {self.max_swaps}\n"
193 |             f"\tExpansion Rate: {self.expansion_rate}\n"
194 |             f"\tAuto Expand: {self.auto_expand}"
195 |         )
196 | 
197 |     @property
198 |     def elements_added(self) -> int:
199 |         """int: The number of elements added
200 | 
201 |         Note:
202 |             Not settable"""
203 |         return self._inserted_elements
204 | 
205 |     @property
206 |     def capacity(self) -> int:
207 |         """int: The number of bins
208 | 
209 |         Note:
210 |             Not settable"""
211 |         return self._cuckoo_capacity
212 | 
213 |     @property
214 |     def max_swaps(self) -> int:
215 |         """int: The maximum number of swaps to perform
216 | 
217 |         Note:
218 |             Not settable"""
219 |         return self.__max_cuckoo_swaps
220 | 
221 |     @property
222 |     def bucket_size(self) -> int:
223 |         """int: The number of buckets per bin
224 | 
225 |         Note:
226 |             Not settable"""
227 |         return self._bucket_size
228 | 
229 |     @property
230 |     def buckets(self) -> list[list[int]]:
231 |         """list(list): The buckets holding the fingerprints
232 | 
233 |         Note:
234 |             Not settable"""
235 |         return self._buckets
236 | 
237 |     @property
238 |     def expansion_rate(self) -> int:
239 |         """int: The rate of expansion when the filter grows"""
240 |         return self.__expansion_rate
241 | 
242 |     @expansion_rate.setter
243 |     def expansion_rate(self, val: int):
244 |         """set the self expand value"""
245 |         self.__expansion_rate = val
246 | 
247 |     @property
248 |     def error_rate(self) -> float:
249 |         """float: The error rate of the cuckoo filter"""
250 |         return self._error_rate
251 | 
252 |     @property
253 |     def auto_expand(self) -> bool:
254 |         """bool: True if the cuckoo filter will expand automatically"""
255 |         return self.__auto_expand
256 | 
257 |     @auto_expand.setter
258 |     def auto_expand(self, val: bool):
259 |         """set the self expand value"""
260 |         self.__auto_expand = bool(val)
261 | 
262 |     @property
263 |     def fingerprint_size_bits(self) -> int:
264 |         """int: The size in bits of the fingerprint"""
265 |         return self._fingerprint_size
266 | 
267 |     @property
268 |     def fingerprint_size(self) -> int:
269 |         """int: The size in bytes of the fingerprint
270 | 
271 |         Raises:
272 |             ValueError: If the size is not between 1 and 4
273 |         Note:
274 |             The size of the fingerprint must be between 1 and 4"""
275 |         return math.ceil(self.fingerprint_size_bits / 8)
276 | 
277 |     @fingerprint_size.setter
278 |     def fingerprint_size(self, val: int):
279 |         """set the fingerprint size"""
280 |         tmp = val
281 |         if not 1 <= tmp <= 4:
282 |             msg = f"{self.__class__.__name__}: fingerprint size must be between 1 and 4"
283 |             raise ValueError(msg)
284 |         # bytes to bits
285 |         self._fingerprint_size = tmp * 8
286 |         self._calc_error_rate()  # if updating fingerprint size then error rate may change
287 | 
288 |     def load_factor(self) -> float:
289 |         """float: How full the Cuckoo Filter is currently"""
290 |         return self.elements_added / (self.capacity * self.bucket_size)
291 | 
292 |     def add(self, key: KeyT):
293 |         """Add element key to the filter
294 | 
295 |         Args:
296 |             key (str): The element to add
297 |         Raises:
298 |             CuckooFilterFullError: When element not inserted after maximum number of swaps or 'kicks'"""
299 |         idx_1, idx_2, fingerprint = self._generate_fingerprint_info(key)
300 | 
301 |         is_present = self._check_if_present(idx_1, idx_2, fingerprint)
302 |         if is_present is not None:  # already there, nothing to do
303 |             return
304 |         finger = self._insert_fingerprint(fingerprint, idx_1, idx_2)
305 |         self._deal_with_insertion(finger)
306 | 
307 |     def check(self, key: KeyT) -> bool:
308 |         """Check if an element is in the filter
309 | 
310 |         Args:
311 |             key (str): Element to check
312 |         Returns:
313 |             bool: True if likely present, False if definitely not"""
314 |         idx_1, idx_2, fingerprint = self._generate_fingerprint_info(key)
315 |         is_present = self._check_if_present(idx_1, idx_2, fingerprint)
316 |         return is_present is not None
317 | 
318 |     def remove(self, key: KeyT) -> bool:
319 |         """Remove an element from the filter
320 | 
321 |         Args:
322 |             key (str): Element to remove
323 |         Returns:
324 |             bool: True if removed, False if not present"""
325 |         idx_1, idx_2, fingerprint = self._generate_fingerprint_info(key)
326 |         idx = self._check_if_present(idx_1, idx_2, fingerprint)
327 |         if idx is None:
328 |             return False
329 |         self.buckets[idx].remove(fingerprint)
330 |         self._inserted_elements -= 1
331 |         return True
332 | 
333 |     def export(self, file: Union[Path, str, IOBase, mmap]) -> None:
334 |         """Export cuckoo filter to file
335 | 
336 |         Args:
337 |             file: Path to file to export"""
338 | 
339 |         if not isinstance(file, (IOBase, mmap)):
340 |             file = resolve_path(file)
341 |             with open(file, "wb") as filepointer:
342 |                 self.export(filepointer)  # type:ignore
343 |         else:
344 |             filepointer = file  # type:ignore
345 |             for _, val in enumerate(self.buckets):
346 |                 bucket = array(self._CUCKOO_SINGLE_INT_C, val)
347 |                 bucket.extend([0] * (self.bucket_size - len(bucket)))
348 |                 bucket.tofile(filepointer)
349 |             # now put out the required information at the end
350 |             filepointer.write(self._CUCKOO_FOOTER_STRUCT.pack(self.bucket_size, self.max_swaps))
351 | 
352 |     def __bytes__(self) -> bytes:
353 |         """Export cuckoo filter to `bytes`"""
354 |         with BytesIO() as f:
355 |             self.export(f)
356 |             return f.getvalue()
357 | 
358 |     def expand(self):
359 |         """Expand the cuckoo filter"""
360 |         self._expand_logic(None)
361 | 
362 |     def _insert_fingerprint(self, fingerprint, idx_1, idx_2):
363 |         """insert a fingerprint"""
364 |         if self.__insert_element(fingerprint, idx_1):
365 |             self._inserted_elements += 1
366 |             return None
367 |         if
self.__insert_element(fingerprint, idx_2): 368 | self._inserted_elements += 1 369 | return None 370 | 371 | # we didn't insert, so now we need to randomly select one index to use 372 | # and move things around to the other index, if possible, until we 373 | # either move everything around or hit the maximum number of swaps 374 | idx = random.choice([idx_1, idx_2]) 375 | 376 | for _ in range(self.max_swaps): 377 | # select one element to be swapped out... 378 | swap_elm = random.randint(0, self.bucket_size - 1) 379 | 380 | swb = self.buckets[idx][swap_elm] 381 | fingerprint, self.buckets[idx][swap_elm] = swb, fingerprint 382 | 383 | # now find another place to put this fingerprint 384 | index_1, index_2 = self._indicies_from_fingerprint(fingerprint) 385 | 386 | idx = index_2 if idx == index_1 else index_1 387 | 388 | if self.__insert_element(fingerprint, idx): 389 | self._inserted_elements += 1 390 | return None 391 | 392 | # if we got here we have an error... we might need to know what is left 393 | return fingerprint 394 | 395 | def _load(self, file: Union[Path, str, IOBase, mmap, bytes]) -> None: 396 | """load a cuckoo filter from file""" 397 | if not isinstance(file, (IOBase, mmap, bytes)): 398 | file = resolve_path(file) 399 | with MMap(file) as filepointer: 400 | self._load(filepointer) 401 | else: 402 | self._parse_footer(file, self._CUCKOO_FOOTER_STRUCT) # type: ignore 403 | self._inserted_elements = 0 404 | # now pull everything in! 405 | self._parse_buckets(file) # type: ignore 406 | 407 | _CUCKOO_SINGLE_INT_C = "I" 408 | _CUCKOO_SINGLE_INT_SIZE = Struct(_CUCKOO_SINGLE_INT_C).size 409 | _CUCKOO_FOOTER_STRUCT = Struct("II") 410 | 411 | def _parse_footer(self, d: ByteString, stct: Struct) -> None: 412 | """parse bytes and set footer information""" 413 | list_size = len(d) - stct.size 414 | self._bucket_size, self.__max_cuckoo_swaps = stct.unpack(d[list_size:]) # type:ignore 415 | self._cuckoo_capacity = list_size // self._CUCKOO_SINGLE_INT_SIZE // self.bucket_size 416 | 417 | def _parse_buckets(self, d: ByteString) -> None: 418 | """parse bytes and set buckets""" 419 | self._buckets = [] 420 | bucket_byte_size = self.bucket_size * self._CUCKOO_SINGLE_INT_SIZE 421 | offs = 0 422 | for _ in range(self.capacity): 423 | next_offs = offs + bucket_byte_size 424 | self.buckets.append(self._parse_bucket(d[offs:next_offs])) # type: ignore 425 | offs = next_offs 426 | 427 | def _parse_bucket(self, d: ByteString) -> array: 428 | """parse a single bucket""" 429 | bucket = array(self._CUCKOO_SINGLE_INT_C, bytes(d)) 430 | bucket = array(self._CUCKOO_SINGLE_INT_C, [el for el in bucket if el]) 431 | self._inserted_elements += len(bucket) 432 | return bucket 433 | 434 | def _set_error_rate(self, error_rate: Union[float, None]) -> None: 435 | """set error rate correctly""" 436 | # if error rate is provided, use it 437 | if error_rate is not None: 438 | self._error_rate = error_rate 439 | self._fingerprint_size = self._calc_fingerprint_size() 440 | 441 | def _check_if_present(self, idx_1, idx_2, fingerprint): 442 | """wrapper for checking if fingerprint is already inserted""" 443 | if fingerprint in self.buckets[idx_1]: 444 | return idx_1 445 | if fingerprint in self.buckets[idx_2]: 446 | return idx_2 447 | return None 448 | 449 | def __insert_element(self, fingerprint, idx) -> bool: 450 | """insert element wrapper""" 451 | if len(self.buckets[idx]) < self.bucket_size: 452 | self.buckets[idx].append(fingerprint) 453 | return True 454 | return False 455 | 456 | def _expand_logic(self, 
extra_fingerprint):
457 |         """the logic to actually expand the cuckoo filter"""
458 |         # get all the fingerprints
459 |         fingerprints = self._setup_expand(extra_fingerprint)
460 | 
461 |         for finger in fingerprints:
462 |             idx_1, idx_2 = self._indicies_from_fingerprint(finger)
463 |             res = self._insert_fingerprint(finger, idx_1, idx_2)
464 |             if res is not None:  # again, this *shouldn't* happen
465 |                 msg = "The CuckooFilter failed to expand"
466 |                 raise CuckooFilterFullError(msg)
467 | 
468 |     def _setup_expand(self, extra_fingerprint):
469 |         """gather all current fingerprints, then grow and reset the bucket list"""
470 |         fingerprints = []
471 |         if extra_fingerprint is not None:
472 |             fingerprints.append(extra_fingerprint)
473 |         for idx in range(self.capacity):
474 |             fingerprints.extend(self.buckets[idx])
475 | 
476 |         self._cuckoo_capacity = self.capacity * self.expansion_rate
477 |         self._buckets = []
478 |         self._inserted_elements = 0
479 |         for _ in range(self.capacity):
480 |             self.buckets.append([])
481 | 
482 |         return fingerprints
483 | 
484 |     def _indicies_from_fingerprint(self, fingerprint):
485 |         """Generate the possible insertion indices from a fingerprint
486 | 
487 |         Args:
488 |             fingerprint (int): The fingerprint to use for generating indices"""
489 |         idx_1 = fingerprint % self.capacity
490 |         idx_2 = self.__hash_func(str(fingerprint)) % self.capacity
491 |         return idx_1, idx_2
492 | 
493 |     def _generate_fingerprint_info(self, key: KeyT) -> tuple[int, int, int]:
494 |         """Generate the fingerprint and indices using the provided key
495 | 
496 |         Args:
497 |             key (str): The element for which information is to be generated
498 |         """
499 |         # generate the fingerprint along with the two possible indices
500 |         hash_val = self.__hash_func(key)
501 |         fingerprint = get_x_bits(hash_val, 64, self.fingerprint_size_bits, True)
502 |         idx_1, idx_2 = self._indicies_from_fingerprint(fingerprint)
503 | 
504 |         # NOTE: This should never happen...
505 |         if idx_1 > self.capacity or idx_2 > self.capacity:
506 |             raise ValueError(f"Either idx_1 {idx_1} or idx_2 {idx_2} is greater than {self.capacity}")
507 |         return idx_1, idx_2, fingerprint
508 | 
509 |     def _deal_with_insertion(self, finger):
510 |         """handle the result of an insertion attempt in one place"""
511 |         if finger is None:
512 |             return
513 |         if self.auto_expand:
514 |             self._expand_logic(finger)
515 |         else:
516 |             msg = f"The {self.__class__.__name__} is currently full"
517 |             raise CuckooFilterFullError(msg)
518 | 
519 |     def _calc_error_rate(self):
520 |         """calculate error rate based on fingerprint size (bits) and bucket size"""
521 |         return float(1 / (2 ** (self.fingerprint_size_bits - (math.log2(self.bucket_size) + 1))))
522 | 
523 |     def _calc_fingerprint_size(self) -> int:
524 |         """calculate fingerprint size (bits) based on error rate and bucket size"""
525 |         return int(math.ceil(math.log2(1.0 / self.error_rate) + math.log2(self.bucket_size) + 1))
526 | 
--------------------------------------------------------------------------------
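A brief, non-authoritative usage sketch of the filter methods shown above; it assumes the package-level CuckooFilter export and the same constructor arguments the counting-variant tests later in this document use:

from probables import CuckooFilter

cko = CuckooFilter(capacity=1000, bucket_size=4, max_swaps=500)  # illustrative sizes
cko.add("this is a test")
assert cko.check("this is a test")     # True: probably present
assert not cko.check("never added")    # False: absent, barring a false positive
assert cko.remove("this is a test")    # True when the fingerprint was found
print(cko.load_factor())               # elements_added / (capacity * bucket_size)

As a sanity check on _calc_fingerprint_size above: for error_rate=0.00001 and bucket_size=4, ceil(log2(1 / 0.00001) + log2(4) + 1) = ceil(16.61 + 3) = 20 bits, which matches the fingerprint_size_bits value the error-rate tests later in this document assert.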
/probables/cuckoo/py.typed:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/barrust/pyprobables/e5d44dac65033421d455681178344eeca9961bd8/probables/cuckoo/py.typed
--------------------------------------------------------------------------------
/probables/exceptions.py:
--------------------------------------------------------------------------------
1 | """PyProbables Exceptions"""
2 | 
3 | 
4 | class ProbablesBaseException(Exception):
5 |     """Base ProbablesBaseException
6 | 
7 |     Args:
8 |         message (str): The error message to be reported"""
9 | 
10 |     def __init__(self, message: str) -> None:
11 |         self.message = message
12 |         super().__init__(self.message)
13 | 
14 |     def __str__(self) -> str:
15 |         return self.message
16 | 
17 | 
18 | class InitializationError(ProbablesBaseException):
19 |     """Initialization Exception
20 | 
21 |     Args:
22 |         message (str): The initialization error message"""
23 | 
24 |     def __init__(self, message: str) -> None:
25 |         self.message = message
26 |         super().__init__(self.message)
27 | 
28 | 
29 | class NotSupportedError(ProbablesBaseException):
30 |     """Not Supported Functionality Exception
31 | 
32 |     Args:
33 |         message (str): The error message to be reported"""
34 | 
35 |     def __init__(self, message: str) -> None:
36 |         self.message = message
37 |         super().__init__(self.message)
38 | 
39 | 
40 | class CuckooFilterFullError(ProbablesBaseException):
41 |     """Cuckoo Filter Full Exception
42 | 
43 |     Args:
44 |         message (str): The error message to be reported"""
45 | 
46 |     def __init__(self, message: str) -> None:
47 |         self.message = message
48 |         super().__init__(self.message)
49 | 
50 | 
51 | class RotatingBloomFilterError(ProbablesBaseException):
52 |     """RotatingBloomFilter unable to rotate Blooms Exception
53 | 
54 |     Args:
55 |         message (str): The error message to be reported"""
56 | 
57 |     def __init__(self, message: str) -> None:
58 |         self.message = message
59 |         super().__init__(self.message)
60 | 
61 | 
62 | class CountMinSketchError(ProbablesBaseException):
63 |     """CountMinSketch Exception
64 | 
65 |     Args:
66 |         message (str): The error message to be reported"""
67 | 
68 |     def __init__(self, message: str) -> None:
69 |         self.message = message
70 |         super().__init__(self.message)
71 | 
72 | 
73 | class QuotientFilterError(ProbablesBaseException):
74 |     """Quotient Filter Exception
75 | 
76 |     Args:
77 |         message (str): The error message to be reported"""
78 | 
79 |     def __init__(self, message: str) -> None:
80 |         self.message = message
81 |         super().__init__(self.message)
82 | 
--------------------------------------------------------------------------------
/probables/hashes.py:
--------------------------------------------------------------------------------
1 | """Probables Hashing Utilities"""
2 | 
3 | from functools import wraps
4 | from hashlib import md5, sha256
5 | from struct import unpack
6 | from typing import Callable, Union
7 | 
8 | from probables.constants import UINT32_T_MAX, UINT64_T_MAX
9 | 
10 | KeyT = Union[str, bytes]
11 | SimpleHashT = Callable[[KeyT, int], int]
12 | HashResultsT = list[int]
13 | HashFuncT = Callable[[KeyT, int], HashResultsT]
14 | HashFuncBytesT = Callable[[KeyT, int], bytes]
15 | 
16 | 
17 | def hash_with_depth_bytes(func: HashFuncBytesT) -> HashFuncT:
18 |     """Decorator to turn a function that takes a single key and hashes it to
19 |     bytes. Wraps functions to be used in Bloom filters and Count-Min sketch
20 |     data structures.
21 | 
22 |     Args:
23 |         key (str): The element to be hashed
24 |         depth (int): The number of hash permutations to compute
25 |     Returns:
26 |         list(int): 64-bit hashed representation of key
27 |     Note:
28 |         Arguments shown are as they appear after decoration"""
29 | 
30 |     @wraps(func)
31 |     def hashing_func(key, depth=1):
32 |         """wrapper function"""
33 |         res = []
34 |         tmp = key if not isinstance(key, str) else key.encode("utf-8")
35 |         for idx in range(depth):
36 |             tmp = func(tmp, idx)
37 |             res.append(unpack("Q", tmp[:8])[0])  # turn into 64 bit number
38 |         return res
39 | 
40 |     return hashing_func
41 | 
42 | 
43 | def hash_with_depth_int(func: HashFuncT) -> HashFuncT:
44 |     """Decorator to turn a function that takes a single key and hashes it to
45 |     an int. Wraps functions to be used in Bloom filters and Count-Min
46 |     sketch data structures.
47 | 
48 |     Args:
49 |         key (str): The element to be hashed
50 |         depth (int): The number of hash permutations to compute
51 |     Returns:
52 |         list(int): 64-bit hashed representation of key
53 |     Note:
54 |         Arguments shown are as they appear after decoration"""
55 | 
56 |     @wraps(func)
57 |     def hashing_func(key, depth=1):
58 |         """wrapper function"""
59 |         res = []
60 |         tmp = func(key, 0)
61 |         res.append(tmp)
62 |         for idx in range(1, depth):
63 |             tmp = func(f"{tmp:x}", idx)
64 |             res.append(tmp)
65 |         return res
66 | 
67 |     return hashing_func
68 | 
69 | 
70 | def default_fnv_1a(key: KeyT, depth: int = 1) -> list[int]:
71 |     """The default fnv-1a hashing routine
72 | 
73 |     Args:
74 |         key (str): The element to be hashed
75 |         depth (int): The number of hash permutations to compute
76 |     Returns:
77 |         list(int): List of size depth hashes"""
78 | 
79 |     res = []
80 |     for idx in range(depth):
81 |         res.append(fnv_1a(key, idx))
82 |     return res
83 | 
84 | 
85 | def fnv_1a(key: KeyT, seed: int = 0) -> int:
86 |     """Pure python implementation of the 64 bit fnv-1a hash
87 | 
88 |     Args:
89 |         key (str): The element to be hashed
90 |         seed (int): Add a seed to the initial starting point (0 means no seed)
91 |     Returns:
92 |         int: 64-bit hashed representation of key
93 |     Note:
94 |         Uses the lower 64 bits when overflows occur"""
95 |     hval = (14695981039346656037 + (31 * seed)) & UINT64_T_MAX
96 |     fnv_64_prime = 1099511628211
97 |     tmp = list(key) if not isinstance(key, str) else list(map(ord, key))
98 |     for t_str in tmp:
99 |         hval ^= t_str
100 |         hval *= fnv_64_prime
101 |         hval &= UINT64_T_MAX
102 |     return hval
103 | 
104 | 
105 | def fnv_1a_32(key: KeyT, seed: int = 0) -> int:
106 |     """Pure python implementation of the 32 bit fnv-1a hash
107 |     Args:
108 |         key (str): The element to be hashed
109 |         seed (int): Add a seed to the initial starting point (0 means no seed)
110 |     Returns:
111 |         int: 32-bit hashed representation of key
112 |     Note:
113 |         Uses the lower 32 bits when overflows occur"""
114 |     hval = (0x811C9DC5 + (31 * seed)) & UINT32_T_MAX
115 |     fnv_32_prime = 0x01000193
116 |     tmp = list(key) if not isinstance(key, str) else list(map(ord, key))
117 |     for t_str in tmp:
118 |         hval ^= t_str
119 |         hval *= fnv_32_prime
120 |         hval &= UINT32_T_MAX
121 |     return hval
122 | 
123 | 
124 | @hash_with_depth_bytes
125 | def default_md5(key: KeyT, *args, **kwargs) -> bytes:
126 |     """The default md5 hashing routine
127 | 
128 |     Args:
129 |         key (str): The element to be hashed
130 |         depth (int): The number of hash permutations to compute
131 |     Returns:
132 |         list(int): List of 64-bit hashes of the key
133 |     Note:
134 |         Returns the upper-most 64 bits"""
135 |     return md5(key).digest()  # type: ignore
136 | 
137 | 
138 | @hash_with_depth_bytes
139 | def default_sha256(key: KeyT, *args, **kwargs) -> bytes:
140 |     """The default sha256 hashing routine
141 | 
142 |     Args:
143 |         key (str): The element to be hashed
144 |         depth (int): The number of hash permutations to compute
145 |     Returns:
146 |         list(int): List of 64-bit hashes of the key
147 |     Note:
148 |         Returns the upper-most 64 bits"""
149 |     return sha256(key).digest()  # type: ignore
150 | 
--------------------------------------------------------------------------------
/probables/py.typed:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/barrust/pyprobables/e5d44dac65033421d455681178344eeca9961bd8/probables/py.typed
--------------------------------------------------------------------------------
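The decorators defined in probables/hashes.py above make registering a custom hashing strategy a one-liner. A minimal sketch that mirrors the pattern used by the test suite later in this document (the function name is illustrative):

from hashlib import sha512

from probables.hashes import hash_with_depth_bytes

@hash_with_depth_bytes
def my_hash(key, depth):
    # the wrapper feeds in bytes plus the permutation index, then folds the
    # first 8 bytes of each digest into a 64-bit integer per depth
    return sha512(key).digest()

hashes = my_hash("this is a test", 5)  # a list of five 64-bit ints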
/probables/quotientfilter/__init__.py:
--------------------------------------------------------------------------------
1 | """Quotient Filters"""
2 | 
3 | from probables.quotientfilter.quotientfilter import QuotientFilter
4 | 
5 | __all__ = ["QuotientFilter"]
6 | 
--------------------------------------------------------------------------------
/probables/quotientfilter/py.typed:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/barrust/pyprobables/e5d44dac65033421d455681178344eeca9961bd8/probables/quotientfilter/py.typed
--------------------------------------------------------------------------------
/probables/utilities.py:
--------------------------------------------------------------------------------
1 | """Utility Functions"""
2 | 
3 | import math
4 | import mmap
5 | import string
6 | from array import array
7 | from pathlib import Path
8 | from typing import Union
9 | 
10 | 
11 | def is_hex_string(hex_string: Union[str, None]) -> bool:
12 |     """check if the passed in string is really hex"""
13 |     if hex_string is None:
14 |         return False
15 |     return all(c in string.hexdigits for c in hex_string)
16 | 
17 | 
18 | def is_valid_file(filepath: Union[str, Path, None]) -> bool:
19 |     """check if the passed filepath points to a real file"""
20 |     if filepath is None:
21 |         return False
22 |     return Path(filepath).exists()
23 | 
24 | 
25 | def resolve_path(filepath: Union[str, Path]) -> Path:
26 |     """fully resolve the path by expanding user and resolving"""
27 |     return Path(filepath).expanduser().resolve()
28 | 
29 | 
30 | def get_x_bits(num: int, max_bits: int, num_bits: int, right_bits: bool = True) -> int:
31 |     """ensure the correct bits are pulled from num"""
32 |     if right_bits:
33 |         return num & (2**num_bits - 1)
34 |     return ((1 << num_bits) - 1) & (num >> (max_bits - num_bits))
35 | 
36 | 
37 | class MMap:
38 |     """Simplified mmap.mmap class"""
39 | 
40 |     __slots__ = ("__p", "__f", "__m", "_closed")
41 | 
42 |     def __init__(self, path: Union[Path, str]):
43 |         self.__p = Path(path)
44 |         self.__f = self.path.open("rb")  # noqa: SIM115
45 |         self.__m = mmap.mmap(self.__f.fileno(), 0, access=mmap.ACCESS_READ)
46 |         self._closed = False
47 | 
48 |     def __enter__(self) -> mmap.mmap:
49 |         return self.__m
50 | 
51 |     def __exit__(self, *args, **kwargs) -> None:
52 |         if self.__m and not self.map.closed:
53 |             self.map.close()
54 |         if self.__f:
55 |             self.__f.close()
56 |         self._closed = True
57 | 
58 |     @property
59 |     def closed(self) -> bool:
60 |         """Is the MMap closed"""
61 |         return self._closed
62 | 
63 |     @property
64 |     def map(self) -> mmap.mmap:
65 |         """Return a pointer to the mmap"""
66 |         return self.__m
67 | 
68 |     @property
69 |     def path(self) -> Path:
70 |         """Return the path to the mmap'd file"""
71 |         return self.__p
72 | 
73 |     def close(self) -> None:
74 |         """Close the MMap class, including cleaning up open files, etc."""
75 |         self.__exit__()
76 | 
77 |     def seek(self, pos: int, whence: int) -> None:
78 |         """Implement a method to seek on top of the MMap class"""
79 |         self.__m.seek(pos, whence)
80 | 
81 |     def read(self, n: int = -1) -> bytes:
82 |         """Implement a method to read from the file on top of the MMap class"""
83 |         return self.__m.read(n)
84 | 
85 | 
86 | class Bitarray:
87 |     """Simplified, pure python bitarray implementation using as little memory as possible
88 | 
89 |     Args:
90 |         size (int): The number of bits in the bitarray
91 |     Returns:
92 |         Bitarray: A bitarray
93 |     Raises:
94 |         TypeError: If size is not an int
95 |         ValueError: If size is not a positive int"""
96 | 
97 |     def __init__(self, size: int):
98 |         if not isinstance(size, int):
99 |             raise TypeError(f"Bitarray size must be an int; {type(size)} was provided")
100 |         if size <= 0:
101 |             raise ValueError(f"Bitarray size must be larger than 0; {size} was provided")
102 |         self._size_bytes = math.ceil(size / 8)
103 |         self._bitarray = array("B", [0]) * self._size_bytes
104 |         self._size = size
105 | 
106 |     @property
107 |     def size_bytes(self) -> int:
108 |         """The size of the bitarray in bytes"""
109 |         return self._size_bytes
110 | 
111 |     @property
112 |     def size(self) -> int:
113 |         """The number of bits in the bitarray"""
114 |         return self._size
115 | 
116 |     @property
117 |     def bitarray(self) -> array:
118 |         """The bitarray"""
119 |         return self._bitarray
120 | 
121 |     def __getitem__(self, key: int) -> int:
122 |         return self.check_bit(key)
123 | 
124 |     def __setitem__(self, idx: int, val: int):
125 |         if val < 0 or val > 1:
126 |             raise ValueError("Invalid bit setting; must be 0 or 1")
127 |         if idx < 0 or idx >= self._size:
128 |             raise IndexError(f"Bitarray index outside of range; index {idx} was provided")
129 |         b = idx // 8
130 |         if val == 1:
131 |             self._bitarray[b] = self._bitarray[b] | (1 << (idx % 8))
132 |         else:
133 |             self._bitarray[b] = self._bitarray[b] & ~(1 << (idx % 8))
134 | 
135 |     def check_bit(self, idx: int) -> int:
136 |         """Check if the bit idx is set
137 | 
138 |         Args:
139 |             idx (int): The index to check
140 |         Returns:
141 |             int: The status of the bit, either 0 or 1"""
142 |         if idx < 0 or idx >= self._size:
143 |             raise IndexError(f"Bitarray index outside of range; index {idx} was provided")
144 |         return 0 if (self._bitarray[idx // 8] & (1 << (idx % 8))) == 0 else 1
145 | 
146 |     def is_bit_set(self, idx: int) -> bool:
147 |         """Check if the bit idx is set
148 | 
149 |         Args:
150 |             idx (int): The index to check
151 |         Returns:
152 |             bool: True if the bit is set, False otherwise"""
153 |         return bool(self.check_bit(idx))
154 | 
155 |     def set_bit(self, idx: int) -> None:
156 |         """Set the bit at idx to 1
157 | 
158 |         Args:
159 |             idx (int): The index to set"""
160 |         if idx < 0 or idx >= self._size:
161 |             raise IndexError(f"Bitarray index outside of range; index {idx} was provided")
162 |         b = idx // 8
163 |         self._bitarray[b] = self._bitarray[b] | (1 << (idx % 8))
164 | 
165 |     def clear_bit(self, idx: int) -> None:
166 |         """Set the bit at idx to 0
167 | 
168 |         Args:
169 |             idx (int): The index to clear"""
170 |         if idx < 0 or idx >= self._size:
171 |             raise IndexError(f"Bitarray index outside of range; index {idx} was provided")
172 |         b = idx // 8
173 |         self._bitarray[b] = self._bitarray[b] & ~(1 << (idx % 8))
174 | 
175 |     def clear(self):
176 |         """Clear all bits in the bitarray"""
177 |         for i in range(self._size_bytes):
178 |             self._bitarray[i] = 0
179 | 
180 |     def as_string(self):
181 |         """String representation of the bitarray
182 | 
183 |         Returns:
184 |             str: Bitarray representation as a string"""
185 |         return "".join(str(self.check_bit(x)) for x in range(self._size))
186 | 
187 |     def num_bits_set(self) -> int:
188 |         """Number of bits set in the bitarray
189 | 
190 |         Returns:
191 |             int: Number of bits set"""
192 |         return sum(self.check_bit(x) for x in range(self._size))
193 | 
--------------------------------------------------------------------------------
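A short, illustrative exercise of the Bitarray helper above (values chosen arbitrarily):

from probables.utilities import Bitarray

ba = Bitarray(16)         # 16 bits backed by 2 bytes
ba.set_bit(9)
ba[3] = 1                 # __setitem__ accepts only 0 or 1
assert ba.is_bit_set(3) and ba.check_bit(9) == 1
ba.clear_bit(3)
print(ba.as_string())     # 0000000001000000
print(ba.num_bits_set())  # 1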
/pyproject.toml:
--------------------------------------------------------------------------------
1 | [project]
2 | name = "pyprobables"
3 | dynamic = ["version"]
4 | authors = [{ name = "Tyler Barrus", email = "barrust@gmail.com" }]
5 | license = "MIT"
6 | description = "Probabilistic data structures in python"
7 | keywords = [
8 |     "python",
9 |     "probabilistic",
10 |     "data-structure",
11 |     "bloom",
12 |     "filter",
13 |     "count-min",
14 |     "sketch",
15 |     "bloom-filter",
16 |     "count-min-sketch",
17 |     "cuckoo-filter",
18 |     "quotient-filter",
19 | ]
20 | readme = "README.rst"
21 | classifiers = [
22 |     "Development Status :: 5 - Production/Stable",
23 |     "Intended Audience :: Developers",
24 |     "Intended Audience :: Information Technology",
25 |     "Intended Audience :: Science/Research",
26 |     "Topic :: Software Development :: Libraries",
27 |     "Topic :: Utilities",
28 |     "Programming Language :: Python",
29 |     "Programming Language :: Python :: 3",
30 |     "Programming Language :: Python :: 3.9",
31 |     "Programming Language :: Python :: 3.10",
32 |     "Programming Language :: Python :: 3.11",
33 |     "Programming Language :: Python :: 3.12",
34 |     "Programming Language :: Python :: 3.13",
35 | ]
36 | requires-python = ">=3.9"
37 | 
38 | [tool.setuptools.dynamic]
39 | version = { attr = "probables.__version__" }
40 | 
41 | [project.urls]
42 | Homepage = "https://github.com/barrust/pyprobables"
43 | Bug-tracker = "https://github.com/barrust/pyprobables/issues"
44 | Documentation = "https://pyprobables.readthedocs.io/"
45 | 
46 | [tool.setuptools.packages.find]
47 | include = ["probables", "probables.*"]
48 | 
49 | [tool.flit.module]
50 | name = "probables"
51 | 
52 | [tool.pep8]
53 | max-line-length = 120
54 | 
55 | [tool.pycodestyle]
56 | max-line-length = 120
57 | 
58 | [tool.flake8]
59 | max-line-length = 120
60 | 
61 | [tool.isort]
62 | profile = "black"
63 | 
64 | [tool.black]
65 | line-length = 120
66 | target-version = ['py39']
67 | include = '\.pyi?$'
68 | 
69 | [tool.ruff]
70 | include = ["pyproject.toml", "probables/**/*.py", "probables/*.py"]
71 | exclude = [
72 |     ".bzr",
73 |     ".direnv",
74 |     ".eggs",
75 |     ".git",
76 |     ".git-rewrite",
77 |     ".hg",
78 |     ".ipynb_checkpoints",
79 |     ".mypy_cache",
80 |     ".nox",
81 |     ".pants.d",
82 |     ".pyenv",
83 |     ".pytest_cache",
84 |     ".pytype",
85 |     ".ruff_cache",
86 |     ".svn",
87 |     ".tox",
88 |     ".venv",
89 |     ".vscode",
90 |     "__pypackages__",
91 |     "_build",
92 |     "buck-out",
93 |     "build",
94 |     "dist",
95 |     "node_modules",
96 |     "site-packages",
97 |     "venv",
98 | ]
99 | 
100 | # Same as Black.
101 | line-length = 120
102 | indent-width = 4
103 | target-version = "py39"
104 | 
105 | [tool.ruff.lint]
106 | # Enable Pyflakes (`F`) and a subset of the pycodestyle (`E`) codes by default.
107 | # Unlike Flake8, Ruff doesn't enable pycodestyle warnings (`W`) or
108 | # McCabe complexity (`C901`) by default.
109 | select = [
110 |     # pycodestyle
111 |     "E",
112 |     # Pyflakes
113 |     "F",
114 |     # pyupgrade
115 |     "UP",
116 |     # flake8-bugbear
117 |     "B",
118 |     # flake8-simplify
119 |     "SIM",
120 |     # isort
121 |     "I",
122 | ]
123 | ignore = []
124 | 
125 | # Allow fix for all enabled rules (when `--fix` is provided).
126 | fixable = ["ALL"]
127 | unfixable = []
128 | 
129 | # Allow unused variables when underscore-prefixed.
130 | dummy-variable-rgx = "^(_+|(_+[a-zA-Z0-9_]*[a-zA-Z0-9]+?))$"
131 | 
132 | [tool.ruff.format]
133 | # Like Black, use double quotes for strings.
134 | quote-style = "double"
135 | 
136 | # Like Black, indent with spaces, rather than tabs.
137 | indent-style = "space"
138 | 
139 | # Like Black, respect magic trailing commas.
140 | skip-magic-trailing-comma = false
141 | 
142 | # Like Black, automatically detect the appropriate line ending.
143 | line-ending = "auto"
144 | 
145 | # Enable auto-formatting of code examples in docstrings. Markdown,
146 | # reStructuredText code/literal blocks and doctests are all supported.
147 | #
148 | # This is currently disabled by default, but it is planned for this
149 | # to be opt-out in the future.
150 | docstring-code-format = false
151 | 
152 | # Set the line length limit used when formatting code snippets in
153 | # docstrings.
154 | #
155 | # This only has an effect when the `docstring-code-format` setting is
156 | # enabled.
157 | docstring-code-line-length = "dynamic"
158 | 
159 | [build-system]
160 | requires = ["setuptools>=77.0.0", "wheel"]
161 | build-backend = "setuptools.build_meta"
162 | 
--------------------------------------------------------------------------------
/scripts/version_bump.py:
--------------------------------------------------------------------------------
1 | """ Update all the different version variables
2 | """
3 | import os
4 | from datetime import datetime
5 | from functools import wraps
6 | 
7 | 
8 | def read_and_write(func):
9 |     @wraps(func)
10 |     def wrapper(**kwargs):
11 |         path = kwargs["path"]
12 | 
13 |         with open(path) as fobj:
14 |             data = fobj.readlines()
15 | 
16 |         func(data, **kwargs)
17 | 
18 |         with open(path, "w") as fobj:
19 |             fobj.writelines(data)
20 | 
21 |     return wrapper
22 | 
23 | 
24 | @read_and_write
25 | def update_file(data, **kwargs):
26 |     """Parse a file based on the key (k) and update its value with the provided value (v)
27 | 
28 |     Args:
29 |         path (str):
30 |         k (str):
31 |         v (str):
32 |     """
33 |     for i, line in enumerate(data):
34 |         if line.startswith(kwargs["k"]):
35 |             data[i] = """{} = "{}"\n""".format(kwargs["k"], kwargs["v"])
36 | 
37 | 
38 | @read_and_write
39 | def update_citation_file(data, **kwargs):
40 |     """Parse the citation file and update its values with the provided version
41 | 
42 |     Args:
43 |         path (str):
44 |         v (str):
45 |     """
46 |     for i, line in enumerate(data):
47 |         if line.startswith("version:"):
48 |             data[i] = "version: {}\n".format(kwargs["v"])
49 |         if line.startswith("date-released:"):
50 |             data[i] = "date-released: '{}'".format(datetime.today().strftime("%Y-%m-%d"))
51 | 
52 | 
53 | def _parse_args():
54 |     import argparse
55 | 
56 |     parser = argparse.ArgumentParser(description="Automate the version bump of the pyprobables project")
57 |     parser.add_argument("new_version", help="The new version of the package")
58 | 
59 |     return parser.parse_args()
60 | 
61 | 
62 | if __name__ == "__main__":
63 |     args = _parse_args()
64 | 
65 |     # get current path to find where the script is currently
66 |     script_path = os.path.dirname(os.path.abspath(__file__))
67 | 
68 |     module_path = os.path.abspath(f"{script_path}/../")
69 | 
70 |     # update the package __init__ file
71 |     init_file = f"{module_path}/probables/__init__.py"
72 |     update_file(path=init_file, k="__version__", v=args.new_version)
73 | 
74 |     # update the citation file
75 |     citation_file = f"{module_path}/CITATION.cff"
76 |     update_citation_file(path=citation_file, v=args.new_version)
77 | 
--------------------------------------------------------------------------------
/tests/__init__.py:
--------------------------------------------------------------------------------
1 | """Testing Module"""
2 | 
--------------------------------------------------------------------------------
/tests/countingcuckoo_test.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | """Unittest class"""
3 | 
4 | import hashlib
5 | import os
6 | import sys
7 | import unittest
8 | from pathlib import Path
9 | from tempfile import NamedTemporaryFile
10 | 
11 |
this_dir = Path(__file__).parent 12 | sys.path.insert(0, str(this_dir)) 13 | sys.path.insert(0, str(this_dir.parent)) 14 | 15 | from probables import CountingCuckooFilter, CuckooFilterFullError # noqa: E402 16 | from tests.utilities import calc_file_md5 # noqa: E402 17 | 18 | DELETE_TEMP_FILES = True 19 | 20 | 21 | class TestCountingCuckooFilter(unittest.TestCase): 22 | """base Cuckoo Filter test""" 23 | 24 | def test_c_cuckoo_filter_default(self): 25 | """test counting cuckoo filter default properties""" 26 | cko = CountingCuckooFilter() 27 | self.assertEqual(10000, cko.capacity) 28 | self.assertEqual(4, cko.bucket_size) 29 | self.assertEqual(500, cko.max_swaps) 30 | self.assertEqual(2, cko.expansion_rate) 31 | self.assertEqual(True, cko.auto_expand) 32 | 33 | def test_c_cuckoo_filter_diff(self): 34 | """test counting cuckoo filter non-standard properties""" 35 | cko = CountingCuckooFilter( 36 | capacity=100, 37 | bucket_size=2, 38 | max_swaps=5, 39 | expansion_rate=4, 40 | auto_expand=False, 41 | ) 42 | self.assertEqual(100, cko.capacity) 43 | self.assertEqual(2, cko.bucket_size) 44 | self.assertEqual(5, cko.max_swaps) 45 | self.assertEqual(4, cko.expansion_rate) 46 | self.assertEqual(False, cko.auto_expand) 47 | 48 | def test_c_cuckoo_filter_add(self): 49 | """test adding to the counting cuckoo filter""" 50 | cko = CountingCuckooFilter() 51 | cko.add("this is a test") 52 | self.assertEqual(cko.elements_added, 1) 53 | cko.add("this is another test") 54 | self.assertEqual(cko.elements_added, 2) 55 | cko.add("this is yet another test") 56 | self.assertEqual(cko.elements_added, 3) 57 | 58 | def test_c_cuckoo_filter_remove(self): 59 | """test removing from the counting cuckoo filter""" 60 | cko = CountingCuckooFilter() 61 | cko.add("this is a test") 62 | self.assertEqual(cko.elements_added, 1) 63 | cko.add("this is another test") 64 | self.assertEqual(cko.elements_added, 2) 65 | cko.add("this is yet another test") 66 | self.assertEqual(cko.elements_added, 3) 67 | self.assertEqual(cko.unique_elements, 3) 68 | cko.add("this is a test") 69 | cko.add("this is a test") 70 | cko.add("this is a test") 71 | self.assertEqual(cko.elements_added, 6) 72 | self.assertEqual(cko.unique_elements, 3) 73 | 74 | res = cko.remove("this is another test") 75 | self.assertTrue(res) 76 | self.assertEqual(cko.elements_added, 5) 77 | self.assertEqual(cko.unique_elements, 2) 78 | 79 | self.assertTrue(cko.check("this is a test")) 80 | self.assertFalse(cko.check("this is another test")) 81 | self.assertTrue(cko.check("this is yet another test")) 82 | 83 | def test_c_cuckoo_filter_rmv_miss(self): 84 | """test removing from the counting cuckoo filter when not present""" 85 | cko = CountingCuckooFilter() 86 | cko.add("this is a test") 87 | self.assertEqual(cko.elements_added, 1) 88 | cko.add("this is another test") 89 | self.assertEqual(cko.elements_added, 2) 90 | cko.add("this is yet another test") 91 | self.assertEqual(cko.elements_added, 3) 92 | 93 | res = cko.remove("this is still a test") 94 | self.assertFalse(res) 95 | self.assertEqual(cko.elements_added, 3) 96 | self.assertTrue(cko.check("this is a test")) 97 | self.assertTrue(cko.check("this is another test")) 98 | self.assertTrue(cko.check("this is yet another test")) 99 | 100 | def test_c_cuckoo_filter_lots(self): 101 | """test inserting lots into the counting cuckoo filter""" 102 | cko = CountingCuckooFilter(capacity=100, bucket_size=2, max_swaps=100) 103 | for i in range(125): 104 | cko.add(str(i)) 105 | self.assertEqual(cko.elements_added, 125) 106 | 107 | def 
test_c_cuckoo_filter_full(self): 108 | """test inserting until counting cuckoo filter is full""" 109 | 110 | def runner(): 111 | """runner""" 112 | cko = CountingCuckooFilter(capacity=100, bucket_size=2, max_swaps=100, auto_expand=False) 113 | for i in range(175): 114 | cko.add(str(i)) 115 | 116 | self.assertRaises(CuckooFilterFullError, runner) 117 | 118 | def test_c_cuckoo_full_msg(self): 119 | """test exception message for full counting cuckoo filter""" 120 | try: 121 | cko = CountingCuckooFilter(capacity=100, bucket_size=2, max_swaps=100, auto_expand=False) 122 | for i in range(175): 123 | cko.add(str(i)) 124 | except CuckooFilterFullError as ex: 125 | msg = "The CountingCuckooFilter is currently full" 126 | self.assertEqual(str(ex), msg) 127 | else: 128 | self.assertEqual(True, False) 129 | 130 | def test_c_cuckoo_idx(self): 131 | """test that the indexing works correctly for counting cuckoo filter 132 | swap""" 133 | cko = CountingCuckooFilter(capacity=100, bucket_size=2, max_swaps=5) 134 | txt = "this is a test" 135 | idx_1, idx_2, fingerprint = cko._generate_fingerprint_info(txt) 136 | index_1, index_2 = cko._indicies_from_fingerprint(fingerprint) 137 | self.assertEqual(idx_1, index_1) 138 | self.assertEqual(idx_2, index_2) 139 | 140 | def test_c_cuckoo_filter_check(self): 141 | """test checking if element in counting cuckoo filter""" 142 | cko = CountingCuckooFilter() 143 | cko.add("this is a test") 144 | cko.add("this is another test") 145 | cko.add("this is yet another test") 146 | self.assertEqual(cko.check("this is a test"), True) 147 | self.assertEqual(cko.check("this is another test"), True) 148 | self.assertEqual(cko.check("this is yet another test"), True) 149 | self.assertEqual(cko.check("this is not another test"), False) 150 | self.assertEqual(cko.check("this is not a test"), False) 151 | 152 | def test_c_cuckoo_filter_in(self): 153 | """test checking using 'in' counting cuckoo filter""" 154 | cko = CountingCuckooFilter() 155 | cko.add("this is a test") 156 | cko.add("this is another test") 157 | cko.add("this is yet another test") 158 | self.assertEqual("this is a test" in cko, True) 159 | self.assertEqual("this is another test" in cko, True) 160 | self.assertEqual("this is yet another test" in cko, True) 161 | self.assertEqual("this is not another test" in cko, False) 162 | self.assertEqual("this is not a test" in cko, False) 163 | 164 | def test_c_cuckoo_filter_dup_add(self): 165 | """test adding same item multiple times counting cuckoo filter""" 166 | cko = CountingCuckooFilter() 167 | cko.add("this is a test") 168 | cko.add("this is another test") 169 | cko.add("this is yet another test") 170 | self.assertEqual(cko.elements_added, 3) 171 | cko.add("this is a test") 172 | cko.add("this is another test") 173 | cko.add("this is yet another test") 174 | self.assertEqual(cko.elements_added, 6) 175 | self.assertEqual(cko.unique_elements, 3) 176 | 177 | def test_c_cuckoo_filter_l_fact(self): 178 | """test the load factor of the counting cuckoo filter""" 179 | cko = CountingCuckooFilter(capacity=100, bucket_size=2, max_swaps=10) 180 | self.assertEqual(cko.load_factor(), 0.0) 181 | for i in range(50): 182 | cko.add(str(i)) 183 | self.assertEqual(cko.load_factor(), 0.25) 184 | for i in range(50): 185 | cko.add(str(i + 50)) 186 | 187 | if cko.capacity == 200: # self expanded 188 | self.assertEqual(cko.load_factor(), 0.25) 189 | else: 190 | self.assertEqual(cko.load_factor(), 0.50) 191 | 192 | for i in range(100): 193 | cko.add(str(i)) 194 | if cko.capacity == 200: # self 
expanded 195 | self.assertEqual(cko.load_factor(), 0.25) 196 | else: 197 | self.assertEqual(cko.load_factor(), 0.50) 198 | 199 | def test_c_cuckoo_filter_export(self): 200 | """test exporting a counting cuckoo filter""" 201 | md5sum = "6a98c2df1ec9fbb4f75f8e6392696b9b" 202 | with NamedTemporaryFile(dir=os.getcwd(), suffix=".cck", delete=DELETE_TEMP_FILES) as fobj: 203 | cko = CountingCuckooFilter(capacity=1000, bucket_size=2, auto_expand=False) 204 | for i in range(100): 205 | cko.add(str(i)) 206 | 207 | cko.export(fobj.name) 208 | md5_out = calc_file_md5(fobj.name) 209 | self.assertEqual(md5sum, md5_out) 210 | 211 | def test_c_cuckoo_filter_bytes(self): 212 | """test exporting a counting cuckoo filter""" 213 | md5sum = "6a98c2df1ec9fbb4f75f8e6392696b9b" 214 | cko = CountingCuckooFilter(capacity=1000, bucket_size=2, auto_expand=False) 215 | for i in range(100): 216 | cko.add(str(i)) 217 | md5_out = hashlib.md5(bytes(cko)).hexdigest() 218 | self.assertEqual(md5sum, md5_out) 219 | 220 | def test_c_cuckoo_filter_frombytes(self): 221 | """test initializing a counting cuckoo filter frombytes""" 222 | cko = CountingCuckooFilter(capacity=1000, bucket_size=2, auto_expand=False) 223 | for i in range(100): 224 | cko.add(str(i)) 225 | bytes_out = bytes(cko) 226 | 227 | cko2 = CountingCuckooFilter.frombytes(bytes_out) 228 | 229 | self.assertEqual(bytes_out, bytes(cko2)) 230 | for i in range(100): 231 | self.assertTrue(cko2.check(str(i))) 232 | self.assertFalse(cko2.check("999")) 233 | 234 | def test_c_cuckoo_filter_load(self): 235 | """test loading a saved counting cuckoo filter""" 236 | md5sum = "6a98c2df1ec9fbb4f75f8e6392696b9b" 237 | with NamedTemporaryFile(dir=os.getcwd(), suffix=".cck", delete=DELETE_TEMP_FILES) as fobj: 238 | cko = CountingCuckooFilter(capacity=1000, bucket_size=2, auto_expand=False) 239 | for i in range(100): 240 | cko.add(str(i)) 241 | 242 | cko.export(fobj.name) 243 | md5_out = calc_file_md5(fobj.name) 244 | self.assertEqual(md5sum, md5_out) 245 | 246 | ckf = CountingCuckooFilter(filepath=fobj.name) 247 | for i in range(100): 248 | self.assertEqual(ckf.check(str(i)), 1) 249 | 250 | self.assertEqual(1000, ckf.capacity) 251 | self.assertEqual(2, ckf.bucket_size) 252 | self.assertEqual(500, ckf.max_swaps) 253 | self.assertEqual(0.05, ckf.load_factor()) 254 | 255 | def test_c_cuckoo_filter_expand_els(self): 256 | """test out the expansion of the counting cuckoo filter""" 257 | cko = CountingCuckooFilter() 258 | for i in range(200): 259 | cko.add(str(i)) 260 | cko.expand() 261 | for i in range(200): 262 | self.assertGreater(cko.check(str(i)), 0) 263 | self.assertEqual(20000, cko.capacity) 264 | 265 | def test_c_cuckoo_filter_auto_exp(self): 266 | """test inserting until counting cuckoo filter is full""" 267 | cko = CountingCuckooFilter(capacity=100, bucket_size=2, max_swaps=100) 268 | for i in range(375): # this would fail if it doesn't expand 269 | cko.add(str(i)) 270 | self.assertEqual(400, cko.capacity) 271 | self.assertEqual(375, cko.elements_added) 272 | for i in range(375): 273 | self.assertGreater(cko.check(str(i)), 0) 274 | 275 | def test_c_cuckoo_filter_bin(self): 276 | """test the cuckoo bin repr""" 277 | cko = CountingCuckooFilter(capacity=1, bucket_size=2, max_swaps=100) 278 | cko.add("this is a test") 279 | self.assertEqual("[(fingerprint:4280557824 count:1)]", str(cko.buckets[0])) 280 | 281 | def test_c_cuckoo_filter_str(self): 282 | """test the str representation of the counting cuckoo filter""" 283 | cko = CountingCuckooFilter(capacity=100, bucket_size=2, 
max_swaps=100) 284 | for i in range(75): 285 | cko.add(str(i)) 286 | msg = ( 287 | "CountingCuckooFilter:\n" 288 | "\tCapacity: 100\n" 289 | "\tTotal Bins: 200\n" 290 | "\tLoad Factor: 37.5%\n" 291 | "\tInserted Elements: 75\n" 292 | "\tMax Swaps: 100\n" 293 | "\tExpansion Rate: 2\n" 294 | "\tAuto Expand: True" 295 | ) 296 | self.assertEqual(str(cko), msg) 297 | 298 | 299 | class TestCuckooFilterErrorRate(unittest.TestCase): 300 | """Test CountingCuckooFilter using Error Rate""" 301 | 302 | def test_c_cuckoo_filter_er_default(self): 303 | """test cuckoo filter default properties""" 304 | cko = CountingCuckooFilter.init_error_rate(0.00001) 305 | self.assertEqual(10000, cko.capacity) 306 | self.assertEqual(4, cko.bucket_size) 307 | self.assertEqual(500, cko.max_swaps) 308 | self.assertEqual(2, cko.expansion_rate) 309 | self.assertEqual(True, cko.auto_expand) 310 | self.assertEqual(3, cko.fingerprint_size) 311 | self.assertEqual(20, cko.fingerprint_size_bits) 312 | self.assertEqual(0.00001, cko.error_rate) 313 | 314 | def test_c_cuckoo_filter_er_add_check(self): 315 | """test adding to the cuckoo filter""" 316 | cko = CountingCuckooFilter.init_error_rate(0.00001) 317 | cko.add("this is a test") 318 | self.assertEqual(cko.elements_added, 1) 319 | cko.add("this is another test") 320 | self.assertEqual(cko.elements_added, 2) 321 | cko.add("this is yet another test") 322 | self.assertEqual(cko.elements_added, 3) 323 | 324 | # check 325 | self.assertEqual(cko.check("this is a test"), True) 326 | self.assertEqual(cko.check("this is another test"), True) 327 | self.assertEqual(cko.check("this is yet another test"), True) 328 | self.assertEqual(cko.check("this is not another test"), False) 329 | self.assertEqual(cko.check("this is not a test"), False) 330 | 331 | # use of `in` 332 | self.assertEqual("this is a test" in cko, True) 333 | self.assertEqual("this is another test" in cko, True) 334 | self.assertEqual("this is yet another test" in cko, True) 335 | self.assertEqual("this is not another test" in cko, False) 336 | self.assertEqual("this is not a test" in cko, False) 337 | 338 | def test_c_cuckoo_filter_er_export(self): 339 | """test exporting a cuckoo filter""" 340 | md5sum = "f68767bd97b21426f5d2315fb38961ad" 341 | with NamedTemporaryFile(dir=os.getcwd(), suffix=".cko", delete=DELETE_TEMP_FILES) as fobj: 342 | cko = CountingCuckooFilter.init_error_rate(0.00001) 343 | for i in range(1000): 344 | cko.add(str(i)) 345 | cko.export(fobj.name) 346 | md5_out = calc_file_md5(fobj.name) 347 | self.assertEqual(md5sum, md5_out) 348 | 349 | def test_c_cuckoo_filter_load(self): 350 | """test loading a saved cuckoo filter""" 351 | md5sum = "88bc3a08bfc967f9ba60e9d57c21207f" 352 | with NamedTemporaryFile(dir=os.getcwd(), suffix=".cko", delete=DELETE_TEMP_FILES) as fobj: 353 | cko = CountingCuckooFilter.init_error_rate(0.00001) 354 | for i in range(1000): 355 | cko.add(str(i)) 356 | if i % 2 == 1: 357 | cko.add(str(i)) 358 | cko.export(fobj.name) 359 | md5_out = calc_file_md5(fobj.name) 360 | self.assertEqual(md5sum, md5_out) 361 | 362 | ckf = CountingCuckooFilter.load_error_rate(error_rate=0.00001, filepath=fobj.name) 363 | for i in range(1000): 364 | self.assertEqual(ckf.check(str(i)), (i % 2) + 1) 365 | 366 | self.assertEqual(10000, ckf.capacity) 367 | self.assertEqual(4, ckf.bucket_size) 368 | self.assertEqual(500, ckf.max_swaps) 369 | self.assertEqual(2, ckf.expansion_rate) 370 | self.assertEqual(True, ckf.auto_expand) 371 | self.assertEqual(20, ckf.fingerprint_size_bits) 372 | self.assertEqual(3, 
ckf.fingerprint_size)
373 |         self.assertEqual(0.00001, ckf.error_rate)
374 |         self.assertEqual(0.025, ckf.load_factor())
375 | 
376 |     def test_c_cuckoo_filter_er_bytes(self):
377 |         """test exporting a cuckoo filter to bytes"""
378 |         md5sum = "f68767bd97b21426f5d2315fb38961ad"
379 |         cko = CountingCuckooFilter.init_error_rate(0.00001)
380 |         for i in range(1000):
381 |             cko.add(str(i))
382 |         md5_out = hashlib.md5(bytes(cko)).hexdigest()
383 |         self.assertEqual(md5sum, md5_out)
384 | 
385 |     def test_c_cuckoo_filter_er_frombytes(self):
386 |         """test initializing a counting cuckoo filter from bytes"""
387 |         cko = CountingCuckooFilter.init_error_rate(0.00001, capacity=3000)
388 |         for i in range(1000):
389 |             cko.add(str(i))
390 |         bytes_out = bytes(cko)
391 | 
392 |         cko2 = CountingCuckooFilter.frombytes(bytes_out, error_rate=0.00001)
393 | 
394 |         self.assertEqual(bytes_out, bytes(cko2))
395 |         for i in range(1000):
396 |             self.assertTrue(cko2.check(str(i)))
397 |         self.assertFalse(cko2.check("9999"))
398 |         self.assertEqual(cko2.capacity, 3000)
399 | 
400 |     def test_c_cuckoo_filter_er_remove(self):
401 |         """test removing from the counting cuckoo filter"""
402 |         cko = CountingCuckooFilter.init_error_rate(0.00001)
403 |         cko.add("this is a test")
404 |         self.assertEqual(cko.elements_added, 1)
405 |         cko.add("this is another test")
406 |         self.assertEqual(cko.elements_added, 2)
407 |         cko.add("this is yet another test")
408 |         self.assertEqual(cko.elements_added, 3)
409 |         self.assertEqual(cko.unique_elements, 3)
410 |         cko.add("this is a test")
411 |         cko.add("this is a test")
412 |         cko.add("this is a test")
413 |         self.assertEqual(cko.elements_added, 6)
414 |         self.assertEqual(cko.unique_elements, 3)
415 | 
416 |         res = cko.remove("this is another test")
417 |         self.assertTrue(res)
418 |         self.assertEqual(cko.elements_added, 5)
419 |         self.assertEqual(cko.unique_elements, 2)
420 | 
421 |         self.assertTrue(cko.check("this is a test"))
422 |         self.assertFalse(cko.check("this is another test"))
423 |         self.assertTrue(cko.check("this is yet another test"))
424 | 
425 | 
426 | if __name__ == "__main__":
427 |     unittest.main()
428 | 
--------------------------------------------------------------------------------
/tests/expandingbloom_test.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | """Unittest class"""
3 | 
4 | import hashlib
5 | import os
6 | import sys
7 | import unittest
8 | from pathlib import Path
9 | from tempfile import NamedTemporaryFile
10 | 
11 | this_dir = Path(__file__).parent
12 | sys.path.insert(0, str(this_dir))
13 | sys.path.insert(0, str(this_dir.parent))
14 | 
15 | from probables import ExpandingBloomFilter, RotatingBloomFilter  # noqa: E402
16 | from probables.exceptions import RotatingBloomFilterError  # noqa: E402
17 | from tests.utilities import calc_file_md5, different_hash  # noqa: E402
18 | 
19 | DELETE_TEMP_FILES = True
20 | 
21 | 
22 | class TestExpandingBloomFilter(unittest.TestCase):
23 |     """Test ExpandingBloomFilter"""
24 | 
25 |     def test_ebf_init(self):
26 |         """test the initialization of an expanding bloom filter"""
27 |         blm = ExpandingBloomFilter(est_elements=10, false_positive_rate=0.05)
28 |         self.assertEqual(blm.expansions, 0)
29 |         self.assertEqual(blm.false_positive_rate, 0.05)
30 |         self.assertEqual(blm.estimated_elements, 10)
31 |         self.assertEqual(blm.elements_added, 0)
32 | 
33 |     def test_ebf_add_lots(self):
34 |         """test adding "lots" of elements to force the expansion"""
35 |         blm = ExpandingBloomFilter(est_elements=10, false_positive_rate=0.05)
36 |         for i in range(100):
37 |             blm.add(f"{i}", True)
38 |         self.assertEqual(blm.expansions, 9)
39 | 
40 |     def test_ebf_add_lots_diff_hash(self):
41 |         """test adding "lots" of elements to force the expansion using a different hash"""
42 |         blm = ExpandingBloomFilter(est_elements=10, false_positive_rate=0.05, hash_function=different_hash)
43 |         for i in range(100):
44 |             blm.add(f"{i}", True)
45 |         self.assertEqual(blm.expansions, 9)
46 | 
47 |     def test_ebf_add_lots_without_force(self):
48 |         """test adding "lots" of elements without forcing insertion of duplicates"""
49 |         blm = ExpandingBloomFilter(est_elements=10, false_positive_rate=0.05)
50 |         # apparent false positives mean a few additions do not grow the filter
51 |         for i in range(120):
52 |             blm.add(f"{i}")
53 |         self.assertEqual(blm.expansions, 8)
54 |         self.assertEqual(blm.elements_added, 120)
55 | 
56 |     def test_ebf_check(self):
57 |         """ensure that checking the expanding bloom filter works"""
58 |         blm = ExpandingBloomFilter(est_elements=30, false_positive_rate=0.05)
59 |         # expand it out some first!
60 |         for i in range(100):
61 |             blm.add(f"{i}")
62 |         blm.add("this is a test")
63 |         blm.add("this is another test")
64 |         self.assertGreater(blm.expansions, 1)
65 |         self.assertEqual(blm.check("this is a test"), True)
66 |         self.assertEqual(blm.check("this is another test"), True)
67 |         self.assertEqual(blm.check("this is yet another test!"), False)
68 |         self.assertEqual(blm.check("this is not another test"), False)
69 |         self.assertEqual(blm.elements_added, 102)
70 | 
71 |     def test_ebf_contains(self):
72 |         """ensure that "in" functionality for the expanding bloom filter works"""
73 |         blm = ExpandingBloomFilter(est_elements=30, false_positive_rate=0.05)
74 |         # expand it out some first!
75 |         for i in range(100):
76 |             blm.add(f"{i}")
77 |         blm.add("this is a test")
78 |         blm.add("this is another test")
79 |         self.assertGreater(blm.expansions, 1)
80 |         self.assertEqual("this is a test" in blm, True)
81 |         self.assertEqual("this is another test" in blm, True)
82 |         self.assertEqual("this is yet another test!"
in blm, False) 83 | self.assertEqual("this is not another test" in blm, False) 84 | 85 | def test_ebf_push(self): 86 | """ensure that we are able to push new Bloom Filters""" 87 | blm = ExpandingBloomFilter(est_elements=25, false_positive_rate=0.05) 88 | self.assertEqual(blm.expansions, 0) 89 | blm.push() 90 | self.assertEqual(blm.expansions, 1) 91 | self.assertEqual(blm.elements_added, 0) 92 | blm.push() 93 | self.assertEqual(blm.expansions, 2) 94 | self.assertEqual(blm.elements_added, 0) 95 | blm.push() 96 | self.assertEqual(blm.expansions, 3) 97 | self.assertEqual(blm.elements_added, 0) 98 | 99 | def test_ebf_export(self): 100 | """basic expanding Bloom Filter export test""" 101 | with NamedTemporaryFile(dir=os.getcwd(), suffix=".ebf", delete=DELETE_TEMP_FILES) as fobj: 102 | blm = ExpandingBloomFilter(est_elements=25, false_positive_rate=0.05) 103 | blm.export(fobj.name) 104 | self.assertEqual(calc_file_md5(fobj.name), "eb5769ae9babdf7b37d6ce64d58812bc") 105 | 106 | def test_ebf_bytes(self): 107 | """basic expanding Bloom Filter export bytes test""" 108 | blm = ExpandingBloomFilter(est_elements=25, false_positive_rate=0.05) 109 | self.assertEqual(hashlib.md5(bytes(blm)).hexdigest(), "eb5769ae9babdf7b37d6ce64d58812bc") 110 | 111 | def test_ebf_frombytes(self): 112 | """expanding Bloom Filter load bytes test""" 113 | blm = ExpandingBloomFilter(est_elements=25, false_positive_rate=0.05) 114 | for i in range(105): 115 | blm.add(str(i)) 116 | bytes_out = bytes(blm) 117 | 118 | blm2 = ExpandingBloomFilter.frombytes(bytes_out) 119 | self.assertEqual(blm2.expansions, 3) 120 | self.assertEqual(blm2.false_positive_rate, 0.05000000074505806) 121 | self.assertEqual(blm2.estimated_elements, 25) 122 | self.assertEqual(blm2.elements_added, 105) 123 | self.assertEqual(bytes(blm2), bytes(blm)) 124 | 125 | for i in range(105): 126 | self.assertTrue(blm.check(str(i))) 127 | 128 | def test_ebf_import_empty(self): 129 | """test that expanding Bloom Filter is correct on import""" 130 | with NamedTemporaryFile(dir=os.getcwd(), suffix=".ebf", delete=DELETE_TEMP_FILES) as fobj: 131 | blm = ExpandingBloomFilter(est_elements=25, false_positive_rate=0.05) 132 | blm.export(fobj.name) 133 | self.assertEqual(calc_file_md5(fobj.name), "eb5769ae9babdf7b37d6ce64d58812bc") 134 | 135 | blm2 = ExpandingBloomFilter(filepath=fobj.name) 136 | for bloom in blm2._blooms: 137 | self.assertEqual(bloom.elements_added, 0) 138 | 139 | def test_ebf_import_non_empty(self): 140 | """test expanding Bloom Filter import when non-empty""" 141 | with NamedTemporaryFile(dir=os.getcwd(), suffix=".ebf", delete=DELETE_TEMP_FILES) as fobj: 142 | blm = ExpandingBloomFilter(est_elements=25, false_positive_rate=0.05) 143 | for i in range(15): 144 | blm.add(f"{i}") 145 | blm.push() 146 | 147 | blm.export(fobj.name) 148 | 149 | blm2 = ExpandingBloomFilter(filepath=fobj.name) 150 | self.assertEqual(blm2.expansions, 15) 151 | for i in range(15): 152 | self.assertEqual(f"{i}" in blm2, True) 153 | 154 | # check for things that are not there! 
155 | for i in range(99, 125): 156 | self.assertEqual(f"{i}" in blm2, False) 157 | 158 | 159 | class TestRotatingBloomFilter(unittest.TestCase): 160 | """Test RotatingBloomFilter""" 161 | 162 | def test_rbf_init(self): 163 | """test the initialization of an rotating bloom filter""" 164 | blm = RotatingBloomFilter(est_elements=10, false_positive_rate=0.05, max_queue_size=10) 165 | self.assertEqual(blm.expansions, 0) 166 | self.assertEqual(blm.max_queue_size, 10) 167 | 168 | def test_rbf_rotate(self): 169 | """test that the bloom filter rotates the first bloom off the stack""" 170 | blm = RotatingBloomFilter(est_elements=10, false_positive_rate=0.05, max_queue_size=5) 171 | self.assertEqual(blm.expansions, 0) 172 | blm.add("test") 173 | self.assertEqual(blm.expansions, 0) 174 | for i in range(10): 175 | blm.add(f"{i}", force=True) 176 | self.assertEqual(blm.expansions, 1) 177 | self.assertEqual(blm.current_queue_size, 2) 178 | self.assertEqual(blm.check("test"), True) 179 | 180 | for i in range(10, 20): 181 | blm.add(f"{i}", force=True) 182 | self.assertEqual(blm.check("test"), True) 183 | self.assertEqual(blm.current_queue_size, 3) 184 | 185 | for i in range(20, 30): 186 | blm.add(f"{i}", force=True) 187 | self.assertEqual(blm.check("test"), True) 188 | self.assertEqual(blm.current_queue_size, 4) 189 | 190 | for i in range(30, 40): 191 | blm.add(f"{i}", force=True) 192 | self.assertEqual(blm.check("test"), True) 193 | self.assertEqual(blm.current_queue_size, 5) 194 | 195 | for i in range(40, 50): 196 | blm.add(f"{i}", force=True) 197 | self.assertEqual(blm.check("test"), False) # it should roll off 198 | self.assertEqual(blm.current_queue_size, 5) 199 | 200 | self.assertEqual(blm.elements_added, 51) 201 | 202 | def test_rbf_push_pop(self): 203 | """test forcing push and pop""" 204 | blm = RotatingBloomFilter(est_elements=10, false_positive_rate=0.05, max_queue_size=5) 205 | self.assertEqual(blm.current_queue_size, 1) 206 | blm.add("test") 207 | blm.push() 208 | self.assertEqual(blm.current_queue_size, 2) 209 | self.assertEqual("test" in blm, True) 210 | blm.push() 211 | self.assertEqual(blm.current_queue_size, 3) 212 | self.assertEqual("test" in blm, True) 213 | blm.push() 214 | self.assertEqual(blm.current_queue_size, 4) 215 | self.assertEqual("test" in blm, True) 216 | blm.push() 217 | self.assertEqual(blm.current_queue_size, 5) 218 | self.assertEqual("test" in blm, True) 219 | blm.push() 220 | self.assertEqual(blm.current_queue_size, 5) 221 | self.assertEqual("test" in blm, False) 222 | 223 | # test popping 224 | blm.add("that") 225 | blm.pop() 226 | self.assertEqual(blm.current_queue_size, 4) 227 | self.assertEqual("that" in blm, True) 228 | blm.pop() 229 | self.assertEqual(blm.current_queue_size, 3) 230 | self.assertEqual("that" in blm, True) 231 | blm.pop() 232 | self.assertEqual(blm.current_queue_size, 2) 233 | self.assertEqual("that" in blm, True) 234 | blm.pop() 235 | self.assertEqual(blm.current_queue_size, 1) 236 | self.assertEqual("that" in blm, True) 237 | 238 | def test_rbf_pop_exception(self): 239 | """ensure the correct exception is thrown""" 240 | blm = RotatingBloomFilter(est_elements=10, false_positive_rate=0.05, max_queue_size=5) 241 | self.assertRaises(RotatingBloomFilterError, lambda: blm.pop()) 242 | 243 | def test_rbf_pop_exception_msg(self): 244 | """rotating bloom filter error: check the resulting error message""" 245 | blm = RotatingBloomFilter(est_elements=10, false_positive_rate=0.05, max_queue_size=5) 246 | try: 247 | blm.pop() 248 | except 
RotatingBloomFilterError as ex:
249 |             msg = "Popping a Bloom Filter will result in an unusable system!"
250 |             self.assertEqual(str(ex), msg)
251 |         except:  # noqa: E722
252 |             self.assertEqual(True, False)
253 | 
254 |     def test_rfb_basic_export(self):
255 |         """basic rotating Bloom Filter export test"""
256 |         with NamedTemporaryFile(dir=os.getcwd(), suffix=".rbf", delete=DELETE_TEMP_FILES) as fobj:
257 |             blm = RotatingBloomFilter(est_elements=25, false_positive_rate=0.05)
258 |             blm.export(fobj.name)
259 |             self.assertEqual(calc_file_md5(fobj.name), "eb5769ae9babdf7b37d6ce64d58812bc")
260 | 
261 |     def test_rfb_basic_bytes(self):
262 |         """basic rotating Bloom Filter export bytes test"""
263 |         blm = RotatingBloomFilter(est_elements=25, false_positive_rate=0.05)
264 |         self.assertEqual(hashlib.md5(bytes(blm)).hexdigest(), "eb5769ae9babdf7b37d6ce64d58812bc")
265 | 
266 |     def test_rfb_from_bytes(self):
267 |         """basic rotating Bloom Filter frombytes round-trip test"""
268 |         blm = RotatingBloomFilter(est_elements=25, false_positive_rate=0.05, max_queue_size=3)
269 |         for i in range(105):
270 |             blm.add(str(i))
271 |         bytes_out = bytes(blm)
272 | 
273 |         blm2 = RotatingBloomFilter.frombytes(bytes_out, max_queue_size=3)
274 |         self.assertEqual(blm2.expansions, 2)
275 |         self.assertEqual(blm2.false_positive_rate, 0.05000000074505806)
276 |         self.assertEqual(blm2.estimated_elements, 25)
277 |         self.assertEqual(blm2.elements_added, 105)
278 |         self.assertEqual(blm2.current_queue_size, 3)
279 |         self.assertEqual(bytes(blm2), bytes(blm))
280 |         for i in range(105):
281 |             self.assertEqual(blm.check(str(i)), blm2.check(str(i)))
282 | 
283 |     def test_rbf_import_empty(self):
284 |         """test that rotating Bloom Filter is correct on import"""
285 |         with NamedTemporaryFile(dir=os.getcwd(), suffix=".rbf", delete=DELETE_TEMP_FILES) as fobj:
286 |             blm = RotatingBloomFilter(est_elements=25, false_positive_rate=0.05)
287 |             blm.export(fobj.name)
288 |             self.assertEqual(calc_file_md5(fobj.name), "eb5769ae9babdf7b37d6ce64d58812bc")
289 | 
290 |             blm2 = ExpandingBloomFilter(filepath=fobj.name)
291 |             for bloom in blm2._blooms:
292 |                 self.assertEqual(bloom.elements_added, 0)
293 | 
294 |     def test_rbf_non_basic_import(self):
295 |         """test that the imported rotating Bloom filter is correct"""
296 |         with NamedTemporaryFile(dir=os.getcwd(), suffix=".rbf", delete=DELETE_TEMP_FILES) as fobj:
297 |             blm = RotatingBloomFilter(est_elements=25, false_positive_rate=0.05)
298 |             for i in range(15):
299 |                 blm.add(f"{i}")
300 |                 blm.push()
301 |             blm.export(fobj.name)
302 | 
303 |             blm2 = RotatingBloomFilter(filepath=fobj.name)
304 |             # test those that should be popped off...
305 | for i in range(5): 306 | self.assertEqual(f"{i}" in blm2, False) 307 | # test things that would not be popped 308 | for i in range(6, 15): 309 | self.assertEqual(f"{i}" in blm2, True) 310 | self.assertEqual(blm2.current_queue_size, 10) 311 | self.assertEqual(blm2.expansions, 9) 312 | self.assertEqual(blm2.elements_added, 15) 313 | 314 | 315 | if __name__ == "__main__": 316 | unittest.main() 317 | -------------------------------------------------------------------------------- /tests/hashes_test.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | """Unittest class""" 3 | 4 | import hashlib 5 | import sys 6 | import unittest 7 | from pathlib import Path 8 | 9 | this_dir = Path(__file__).parent 10 | sys.path.insert(0, str(this_dir)) 11 | sys.path.insert(0, str(this_dir.parent)) 12 | 13 | from probables.constants import UINT64_T_MAX # noqa: E402 14 | from probables.hashes import ( # noqa: E402 15 | default_fnv_1a, 16 | default_md5, 17 | default_sha256, 18 | fnv_1a_32, 19 | hash_with_depth_bytes, 20 | hash_with_depth_int, 21 | ) 22 | 23 | 24 | class TestHashes(unittest.TestCase): 25 | """Test the different hash algorithms""" 26 | 27 | def test_default_fnv_1a(self): 28 | """test default fnv-1a algorithm""" 29 | this_is_a_test = [ 30 | 4040040117721899264, 31 | 3916497180155386777, 32 | 468410530588793106, 33 | 13781401791305604595, 34 | 321382271269641900, 35 | ] 36 | this_is_also = [ 37 | 7925790280716546811, 38 | 13347851945403505568, 39 | 17775584719969392601, 40 | 10279404995231728046, 41 | 13802534855964835503, 42 | ] 43 | hashes = default_fnv_1a("this is a test", 5) 44 | self.assertEqual(hashes, this_is_a_test) 45 | hashes = default_fnv_1a("this is also a test", 5) 46 | self.assertEqual(hashes, this_is_also) 47 | 48 | def test_default_hash_colision(self): 49 | """test when different strings start with the same hash value (issue 62)""" 50 | h1 = default_fnv_1a("gMPflVXtwGDXbIhP73TX", 5) 51 | h2 = default_fnv_1a("LtHf1prlU1bCeYZEdqWf", 5) 52 | 53 | self.assertEqual(h1[0], h2[0]) # these should match 54 | for i in range(1, 5): 55 | self.assertNotEqual(h1[i], h2[i]) 56 | 57 | def test_fnv_1a_32(self): 58 | """test fnv_1a 32 bit hash""" 59 | hash = fnv_1a_32("this is a test", 0) 60 | self.assertEqual(hash, 2139996864) 61 | hash = fnv_1a_32("this is also a test", 0) 62 | self.assertEqual(hash, 1462718619) 63 | 64 | def test_default_md5(self): 65 | """test default md5 algorithm""" 66 | this_is_a_test = [ 67 | 12174049463882854484, 68 | 10455450501617390806, 69 | 3838261292881602234, 70 | 12102952520950148619, 71 | 12126605867972429202, 72 | ] 73 | this_is_also = [ 74 | 8938037604889355346, 75 | 9361632593818981393, 76 | 15781121455678786382, 77 | 5600686735535066561, 78 | 1353473153840687523, 79 | ] 80 | hashes = default_md5("this is a test", 5) 81 | self.assertEqual(hashes, this_is_a_test) 82 | hashes = default_md5("this is also a test", 5) 83 | self.assertEqual(hashes, this_is_also) 84 | 85 | def test_default_sha256(self): 86 | """test default sha256 algorithm""" 87 | this_is_a_test = [ 88 | 10244166640140130606, 89 | 5650905005272240665, 90 | 14215057275609328422, 91 | 5952353080197385534, 92 | 4990779931033217093, 93 | ] 94 | this_is_also = [ 95 | 4140421647067018332, 96 | 9306548247555387104, 97 | 5672713771950536751, 98 | 8501641957786831066, 99 | 15146689942378126332, 100 | ] 101 | hashes = default_sha256("this is a test", 5) 102 | self.assertEqual(hashes, this_is_a_test) 103 | hashes = default_sha256("this is also a 
test", 5) 104 | self.assertEqual(hashes, this_is_also) 105 | 106 | def test_hash_bytes_decorator(self): 107 | """test making bytes hashing strategy with decorator""" 108 | results = [ 109 | 1164302962920061, 110 | 16735493734761467723, 111 | 18150279091576190542, 112 | 9861778148718857663, 113 | 14008040072978383620, 114 | ] 115 | 116 | @hash_with_depth_bytes 117 | def my_hash(key, depth=1): 118 | """my hash function""" 119 | return hashlib.sha512(key).digest() 120 | 121 | self.assertEqual(my_hash("this is a test", 5), results) 122 | res = my_hash("this is a test", 1) 123 | self.assertEqual(len(res), 1) 124 | self.assertEqual(res[0], results[0]) 125 | 126 | def test_hash_ints_decorator(self): 127 | """test making int hashing strategy with decorator""" 128 | results = [ 129 | 14409285476674975580, 130 | 6203976290780191624, 131 | 5074829385518853901, 132 | 3953072760750514173, 133 | 11782747630324011555, 134 | ] 135 | 136 | @hash_with_depth_int 137 | def my_hash(key, depth=1, encoding="utf-8"): 138 | """my hash function""" 139 | max64mod = UINT64_T_MAX + 1 140 | val = int(hashlib.sha512(key.encode(encoding)).hexdigest(), 16) 141 | return val % max64mod 142 | 143 | self.assertEqual(my_hash("this is a test", 5), results) 144 | res = my_hash("this is a test", 1) 145 | self.assertEqual(len(res), 1) 146 | self.assertEqual(res[0], results[0]) 147 | 148 | def test_default_fnv_1a_bytes(self): 149 | """test default fnv-1a algorithm""" 150 | this_is_a_test = [ 151 | 4040040117721899264, 152 | 3916497180155386777, 153 | 468410530588793106, 154 | 13781401791305604595, 155 | 321382271269641900, 156 | ] 157 | this_is_also = [ 158 | 7925790280716546811, 159 | 13347851945403505568, 160 | 17775584719969392601, 161 | 10279404995231728046, 162 | 13802534855964835503, 163 | ] 164 | hashes = default_fnv_1a(b"this is a test", 5) 165 | self.assertEqual(hashes, this_is_a_test) 166 | hashes = default_fnv_1a(b"this is also a test", 5) 167 | self.assertEqual(hashes, this_is_also) 168 | 169 | def test_default_md5_bytes(self): 170 | """test default md5 algorithm using bytes""" 171 | this_is_a_test = [ 172 | 12174049463882854484, 173 | 10455450501617390806, 174 | 3838261292881602234, 175 | 12102952520950148619, 176 | 12126605867972429202, 177 | ] 178 | this_is_also = [ 179 | 8938037604889355346, 180 | 9361632593818981393, 181 | 15781121455678786382, 182 | 5600686735535066561, 183 | 1353473153840687523, 184 | ] 185 | hashes = default_md5(b"this is a test", 5) 186 | self.assertEqual(hashes, this_is_a_test) 187 | hashes = default_md5(b"this is also a test", 5) 188 | self.assertEqual(hashes, this_is_also) 189 | 190 | def test_default_sha256_bytes(self): 191 | """test default sha256 algorithm using bytes""" 192 | this_is_a_test = [ 193 | 10244166640140130606, 194 | 5650905005272240665, 195 | 14215057275609328422, 196 | 5952353080197385534, 197 | 4990779931033217093, 198 | ] 199 | this_is_also = [ 200 | 4140421647067018332, 201 | 9306548247555387104, 202 | 5672713771950536751, 203 | 8501641957786831066, 204 | 15146689942378126332, 205 | ] 206 | hashes = default_sha256(b"this is a test", 5) 207 | self.assertEqual(hashes, this_is_a_test) 208 | hashes = default_sha256(b"this is also a test", 5) 209 | self.assertEqual(hashes, this_is_also) 210 | 211 | 212 | if __name__ == "__main__": 213 | unittest.main() 214 | -------------------------------------------------------------------------------- /tests/quotientfilter_test.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 
| """Unittest class""" 3 | 4 | import os 5 | import random 6 | import sys 7 | import unittest 8 | from pathlib import Path 9 | from tempfile import NamedTemporaryFile 10 | 11 | from probables.exceptions import QuotientFilterError 12 | 13 | this_dir = Path(__file__).parent 14 | sys.path.insert(0, str(this_dir)) 15 | sys.path.insert(0, str(this_dir.parent)) 16 | from probables import QuotientFilter # noqa: E402 17 | 18 | DELETE_TEMP_FILES = True 19 | 20 | 21 | class TestQuotientFilter(unittest.TestCase): 22 | """Test the default quotient filter implementation""" 23 | 24 | def test_qf_init(self): 25 | "test initializing a blank quotient filter" 26 | qf = QuotientFilter() 27 | 28 | self.assertEqual(qf.bits_per_elm, 16) 29 | self.assertEqual(qf.quotient, 20) 30 | self.assertEqual(qf.remainder, 12) 31 | self.assertEqual(qf.elements_added, 0) 32 | self.assertEqual(qf.num_elements, 1048576) # 2**qf.quotient 33 | 34 | qf = QuotientFilter(quotient=8) 35 | 36 | self.assertEqual(qf.bits_per_elm, 32) 37 | self.assertEqual(qf.quotient, 8) 38 | self.assertEqual(qf.remainder, 24) 39 | self.assertEqual(qf.elements_added, 0) 40 | self.assertEqual(qf.num_elements, 256) # 2**qf.quotient 41 | self.assertTrue(qf.auto_expand) 42 | 43 | qf = QuotientFilter(quotient=24, auto_expand=False) 44 | 45 | self.assertEqual(qf.bits_per_elm, 8) 46 | self.assertEqual(qf.quotient, 24) 47 | self.assertEqual(qf.remainder, 8) 48 | self.assertEqual(qf.elements_added, 0) 49 | self.assertEqual(qf.num_elements, 16777216) # 2**qf.quotient 50 | self.assertFalse(qf.auto_expand) 51 | 52 | # reset auto_expand 53 | qf.auto_expand = True 54 | self.assertTrue(qf.auto_expand) 55 | 56 | def test_qf_add_check(self): 57 | "test that the qf is able to add and check elements" 58 | qf = QuotientFilter(quotient=8) 59 | 60 | for i in range(0, 200, 2): 61 | qf.add(str(i)) 62 | self.assertEqual(qf.elements_added, 100) 63 | self.assertEqual(qf.load_factor, 100 / qf.size) 64 | found_no = False 65 | for i in range(0, 200, 2): 66 | if not qf.check(str(i)): 67 | found_no = True 68 | self.assertFalse(found_no) 69 | 70 | for i in range(1, 200, 2): 71 | print(i) 72 | self.assertFalse(qf.check(str(i))) 73 | 74 | self.assertEqual(qf.elements_added, 100) 75 | 76 | def test_qf_add_check_in(self): 77 | "test that the qf is able to add and check elements using `in`" 78 | qf = QuotientFilter(quotient=8) 79 | 80 | for i in range(0, 200, 2): 81 | qf.add(str(i)) 82 | self.assertEqual(qf.elements_added, 100) 83 | 84 | found_no = False 85 | for i in range(0, 200, 2): 86 | if str(i) not in qf: 87 | found_no = True 88 | self.assertFalse(found_no) 89 | 90 | for i in range(1, 200, 2): 91 | print(i) 92 | self.assertFalse(str(i) in qf) 93 | 94 | self.assertEqual(qf.elements_added, 100) 95 | 96 | def test_qf_init_errors(self): 97 | """test quotient filter initialization errors""" 98 | self.assertRaises(QuotientFilterError, lambda: QuotientFilter(quotient=2)) 99 | self.assertRaises(QuotientFilterError, lambda: QuotientFilter(quotient=32)) 100 | 101 | def test_qf_retrieve_hashes(self): 102 | """test retrieving hashes back from the quotient filter""" 103 | qf = QuotientFilter(quotient=8, auto_expand=False) 104 | hashes = [] 105 | for i in range(255): 106 | hashes.append(qf._hash_func(str(i), 0)) # use the private function here.. 
107 | qf.add(str(i)) 108 | self.assertEqual(qf.size, 256) 109 | self.assertEqual(qf.load_factor, 255 / qf.size) 110 | out_hashes = qf.get_hashes() 111 | self.assertEqual(qf.elements_added, len(out_hashes)) 112 | self.assertEqual(set(hashes), set(out_hashes)) 113 | 114 | def test_qf_resize(self): 115 | """test resizing the quotient filter""" 116 | qf = QuotientFilter(quotient=8, auto_expand=False) 117 | for i in range(200): 118 | qf.add(str(i)) 119 | 120 | self.assertEqual(qf.elements_added, 200) 121 | self.assertEqual(qf.load_factor, 200 / qf.size) 122 | self.assertEqual(qf.quotient, 8) 123 | self.assertEqual(qf.remainder, 24) 124 | self.assertEqual(qf.bits_per_elm, 32) 125 | self.assertFalse(qf.auto_expand) 126 | 127 | self.assertRaises(QuotientFilterError, lambda: qf.resize(7))  # should be too small to fit 128 | 129 | qf.resize(17) 130 | self.assertEqual(qf.elements_added, 200) 131 | self.assertEqual(qf.load_factor, 200 / qf.size) 132 | self.assertEqual(qf.quotient, 17) 133 | self.assertEqual(qf.remainder, 15) 134 | self.assertEqual(qf.bits_per_elm, 16) 135 | # ensure everything is still accessible 136 | for i in range(200): 137 | self.assertTrue(qf.check(str(i))) 138 | 139 | def test_qf_auto_resize(self): 140 | """test resizing the quotient filter automatically""" 141 | qf = QuotientFilter(quotient=8, auto_expand=True) 142 | self.assertEqual(qf.max_load_factor, 0.85) 143 | self.assertEqual(qf.elements_added, 0) 144 | self.assertEqual(qf.load_factor, 0 / qf.size) 145 | self.assertEqual(qf.quotient, 8) 146 | self.assertEqual(qf.remainder, 24) 147 | self.assertEqual(qf.bits_per_elm, 32) 148 | self.assertTrue(qf.auto_expand) 149 | 150 | for i in range(220): 151 | qf.add(str(i)) 152 | 153 | self.assertEqual(qf.max_load_factor, 0.85) 154 | self.assertEqual(qf.elements_added, 220) 155 | self.assertEqual(qf.load_factor, 220 / qf.size) 156 | self.assertEqual(qf.quotient, 9) 157 | self.assertEqual(qf.remainder, 23) 158 | self.assertEqual(qf.bits_per_elm, 32) 159 | 160 | def test_qf_auto_resize_changed_max_load_factor(self): 161 | """test resizing the quotient filter with a different load factor""" 162 | qf = QuotientFilter(quotient=8, auto_expand=True) 163 | self.assertEqual(qf.max_load_factor, 0.85) 164 | self.assertTrue(qf.auto_expand) 165 | qf.max_load_factor = 0.65 166 | self.assertEqual(qf.max_load_factor, 0.65) 167 | 168 | self.assertEqual(qf.elements_added, 0) 169 | self.assertEqual(qf.load_factor, 0 / qf.size) 170 | self.assertEqual(qf.quotient, 8) 171 | self.assertEqual(qf.remainder, 24) 172 | self.assertEqual(qf.bits_per_elm, 32) 173 | self.assertTrue(qf.auto_expand) 174 | 175 | for i in range(200): 176 | qf.add(str(i)) 177 | 178 | self.assertEqual(qf.max_load_factor, 0.85) 179 | self.assertEqual(qf.elements_added, 200) 180 | self.assertEqual(qf.load_factor, 200 / qf.size) 181 | self.assertEqual(qf.quotient, 9) 182 | self.assertEqual(qf.remainder, 23) 183 | self.assertEqual(qf.bits_per_elm, 32) 184 | 185 | def test_qf_resize_errors(self): 186 | """test resizing errors""" 187 | 188 | qf = QuotientFilter(quotient=8, auto_expand=True) 189 | for i in range(200): 190 | qf.add(str(i)) 191 | 192 | self.assertRaises(QuotientFilterError, lambda: qf.resize(quotient=2)) 193 | self.assertRaises(QuotientFilterError, lambda: qf.resize(quotient=32)) 194 | self.assertRaises(QuotientFilterError, lambda: qf.resize(quotient=6)) 195 | 196 | def test_qf_merge(self): 197 | """test merging two quotient filters together""" 198 | qf = QuotientFilter(quotient=8, auto_expand=True) 199 | for i in range(200): 200 | 
qf.add(str(i)) 201 | 202 | fq = QuotientFilter(quotient=8) 203 | for i in range(300, 500): 204 | fq.add(str(i)) 205 | 206 | qf.merge(fq) 207 | 208 | for i in range(200): 209 | self.assertTrue(qf.check(str(i))) 210 | for i in range(200, 300): 211 | self.assertFalse(qf.check(str(i))) 212 | for i in range(300, 500): 213 | self.assertTrue(qf.check(str(i))) 214 | 215 | self.assertEqual(qf.elements_added, 400) 216 | 217 | def test_qf_merge_error(self): 218 | """test unable to merge due to inability to grow""" 219 | qf = QuotientFilter(quotient=8, auto_expand=False) 220 | for i in range(200): 221 | qf.add(str(i)) 222 | 223 | fq = QuotientFilter(quotient=8) 224 | for i in range(300, 400): 225 | fq.add(str(i)) 226 | 227 | self.assertRaises(QuotientFilterError, lambda: qf.merge(fq)) 228 | 229 | # test mismatch hashes 230 | def useless_hash(key, seed) -> int: 231 | return 99999999 232 | 233 | qq = QuotientFilter(quotient=8, hash_function=useless_hash) 234 | qq.add("999") 235 | 236 | self.assertRaises(QuotientFilterError, lambda: fq.merge(qq)) 237 | 238 | def test_qf_remove_missing_elm(self): 239 | """test removing a missing element""" 240 | alpha = [a for a in "abcd.efghij;klm-nopqrs=tuvwxyz"] 241 | qf = QuotientFilter(quotient=7) 242 | for a in alpha: 243 | qf.add(a) 244 | 245 | qf.remove("~") 246 | 247 | missing_vals = [] 248 | for a in alpha: 249 | if not qf.check(a): 250 | missing_vals.append(a) 251 | self.assertListEqual(missing_vals, []) 252 | self.assertTrue(qf.validate_metadata()) 253 | 254 | def test_qf_remove_cluster_start(self): 255 | """test removing a cluster start followed by empty""" 256 | alpha = [a for a in "abcd.efghij;klm-nopqrs=tuvwxyz"] 257 | qf = QuotientFilter(quotient=7) 258 | for a in alpha: 259 | qf.add(a) 260 | 261 | qf.remove(".") 262 | 263 | missing_vals = [] 264 | for a in alpha: 265 | if not qf.check(a): 266 | missing_vals.append(a) 267 | self.assertListEqual(missing_vals, ["."]) 268 | self.assertTrue(qf.validate_metadata()) 269 | 270 | def test_qf_remove_cluster_start_cluster(self): 271 | """test removing a cluster start followed by cluster start""" 272 | alpha = [a for a in "abcd.efghij;klm-nopqrs=tuvwxyz"] 273 | qf = QuotientFilter(quotient=7) 274 | for a in alpha: 275 | qf.add(a) 276 | 277 | qf.remove("-") 278 | 279 | missing_vals = [] 280 | for a in alpha: 281 | if not qf.check(a): 282 | missing_vals.append(a) 283 | self.assertListEqual(missing_vals, ["-"]) 284 | self.assertTrue(qf.validate_metadata()) 285 | 286 | def test_qf_remove_shifted_run_start_followed_by_empty(self): 287 | """test removing a shifted run start followed by empty""" 288 | alpha = [a for a in "abcd.efghij;klm-nopqrs=tuvwxyz"] 289 | qf = QuotientFilter(quotient=7) 290 | for a in alpha: 291 | qf.add(a) 292 | 293 | qf.remove("z") 294 | 295 | missing_vals = [] 296 | for a in alpha: 297 | if not qf.check(a): 298 | missing_vals.append(a) 299 | self.assertListEqual(missing_vals, ["z"]) 300 | self.assertTrue(qf.validate_metadata()) 301 | 302 | def test_qf_remove_shifted_run_start_followed_continuation(self): 303 | """test removing a shifted run start followed by continuation""" 304 | alpha = [a for a in "abcd.efghij;klm-nopqrs=tuvwxyz"] 305 | qf = QuotientFilter(quotient=7) 306 | for a in alpha: 307 | qf.add(a) 308 | 309 | qf.remove("y") 310 | 311 | missing_vals = [] 312 | for a in alpha: 313 | if not qf.check(a): 314 | missing_vals.append(a) 315 | self.assertListEqual(missing_vals, ["y"]) 316 | self.assertTrue(qf.validate_metadata()) 317 | 318 | def 
test_qf_remove_shifted_continuation_followed_run_start(self): 319 | """test removing a shifted continuation followed by run start""" 320 | alpha = [a for a in "abcd.efghij;klm-nopqrs=tuvwxyz"] 321 | qf = QuotientFilter(quotient=7) 322 | for a in alpha: 323 | qf.add(a) 324 | 325 | qf.remove("x") 326 | 327 | missing_vals = [] 328 | for a in alpha: 329 | if not qf.check(a): 330 | missing_vals.append(a) 331 | self.assertListEqual(missing_vals, ["x"]) 332 | self.assertTrue(qf.validate_metadata()) 333 | 334 | def test_qf_remove_shifted_run_start_followed_run_start(self): 335 | """test removing a shifted run start followed by run start""" 336 | alpha = [a for a in "abcd.efghij;klm-nopqrs=tuvwxyz"] 337 | qf = QuotientFilter(quotient=7) 338 | for a in alpha: 339 | qf.add(a) 340 | 341 | qf.remove("a") 342 | 343 | missing_vals = [] 344 | for a in alpha: 345 | if not qf.check(a): 346 | missing_vals.append(a) 347 | self.assertListEqual(missing_vals, ["a"]) 348 | self.assertTrue(qf.validate_metadata()) 349 | 350 | def test_qf_remove_cluster_start_followed_continuation_follow_run_start(self): 351 | """test removing a cluster start followed by continuation putting a run start into a cluster start position""" 352 | alpha = [a for a in "abcd.efghij;klm-nopqrs=tuvwxyz"] 353 | qf = QuotientFilter(quotient=7) 354 | for a in alpha: 355 | qf.add(a) 356 | 357 | qf.remove("d") 358 | 359 | missing_vals = [] 360 | for a in alpha: 361 | if not qf.check(a): 362 | missing_vals.append(a) 363 | self.assertListEqual(missing_vals, ["d"]) 364 | self.assertTrue(qf.validate_metadata()) 365 | 366 | def test_qf_remove_full(self): 367 | """Test removing all elements, but find each one after each removal""" 368 | alpha = [a for a in "abcd.efghij;klm-nopqrs=tuvwxyz"] 369 | qf = QuotientFilter(quotient=7) 370 | for a in alpha: 371 | # each key hashes to a quotient (hash >> r) and a remainder (hash & ((1 << r) - 1)), 372 | # which together determine the slot it occupies in the filter 373 | qf.add(a) 374 | 375 | for a in alpha: 376 | self.assertTrue(qf.check(a), "failed to insert") 377 | 378 | while alpha: 379 | # after each removal, every remaining element must still be found 380 | val = alpha.pop(0) 381 | qf.remove(val) 382 | missing_vals = [] 383 | for a in alpha: 384 | if not qf.check(a): 385 | missing_vals.append(a) 386 | self.assertListEqual(missing_vals, []) 387 | self.assertTrue(qf.validate_metadata()) 388 | 389 | def test_qf_remove_full_random(self): 390 | """Test removing all elements, but in a random order""" 391 | alpha = [a for a in "abcd.efghij;klm-nopqrs=tuvwxyz"] 392 | qf = QuotientFilter(quotient=7) 393 | for a in alpha: 394 | qf.add(a) 395 | 396 | for a in alpha: 397 | self.assertTrue(qf.check(a), "failed to insert") 398 | self.assertTrue(qf.validate_metadata()) 399 | 400 | while alpha: 401 | # remove in random order; every remaining element must still be found 402 | idx = random.randrange(len(alpha)) 403 | val = alpha.pop(idx) 404 | qf.remove(val) 405 | missing_vals = [] 406 | for a in alpha: 407 | if not qf.check(a): 408 | missing_vals.append(a) 409 | self.assertListEqual(missing_vals, []) 410 | self.assertTrue(qf.validate_metadata()) 411 | 412 | def test_qf_remove_full_random_take_2(self): 413 | """Test removing all elements, but in a random order - take 2""" 414 | alpha = [a for a in "abcd.efghij;klm-nopqrs=tuvwxyz"] 415 | qf = QuotientFilter(quotient=7) 416 | for a in alpha: 417 | qf.add(a) 418 | 419 | for a in alpha: 420 | self.assertTrue(qf.check(a), "failed to insert") 421 | 422 | while alpha: 423 | # remove in another random order; every remaining element must still be found 424 | idx = random.randrange(len(alpha)) 425 | val = alpha.pop(idx) 426 | qf.remove(val) 427 | missing_vals = [] 428 | for a in alpha: 429 | if not 
qf.check(a): 430 | missing_vals.append(a) 431 | self.assertListEqual(missing_vals, []) 432 | self.assertTrue(qf.validate_metadata()) 433 | 434 | def test_quotient_filter_print_empty(self): 435 | """Test printing a human-readable dump of an empty quotient filter""" 436 | qf = QuotientFilter(quotient=7) 437 | with NamedTemporaryFile(dir=os.getcwd(), suffix=".txt", delete=DELETE_TEMP_FILES, mode="wt") as fobj: 438 | qf.print(file=fobj.file) 439 | fobj.flush() 440 | 441 | with open(fobj.name) as fobj: 442 | data = fobj.readlines() 443 | data = [x.strip() for x in data] 444 | self.assertEqual(data[0], "idx\t--\tO-C-S\tStatus") 445 | for i in range(2, len(data)): 446 | self.assertEqual(data[i], f"{i-2}\t--\t0-0-0\tEmpty") 447 | 448 | def test_quotient_filter_print(self): 449 | """Test printing a human-readable dump of a populated quotient filter""" 450 | alpha = [a for a in "abcd.efghij;klm-nopqrs=tuvwxyz"] 451 | qf = QuotientFilter(quotient=7) 452 | for a in alpha: 453 | qf.add(a) 454 | 455 | with NamedTemporaryFile(dir=os.getcwd(), suffix=".txt", delete=DELETE_TEMP_FILES, mode="wt") as fobj: 456 | qf.print(file=fobj.file) 457 | fobj.flush() 458 | 459 | with open(fobj.name) as fobj: 460 | data = fobj.readlines() 461 | data = [x.strip() for x in data] 462 | self.assertEqual(data[0], "idx\t--\tO-C-S\tStatus") 463 | self.assertEqual(data[22], "20\t--\t1-0-0\tCluster Start") 464 | self.assertEqual(data[23], "21\t--\t1-0-0\tCluster Start") 465 | 466 | self.assertEqual(data[114], "112\t--\t1-0-0\tCluster Start") 467 | self.assertEqual(data[115], "113\t--\t1-1-1\tContinuation") 468 | self.assertEqual(data[116], "114\t--\t1-0-1\tRun Start") 469 | self.assertEqual(data[10], "8\t--\t0-1-1\tContinuation") 470 | self.assertEqual(data[11], "9\t--\t0-0-1\tRun Start") 471 | -------------------------------------------------------------------------------- /tests/test_utilities.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | """probables utilities tests""" 3 | 4 | import os 5 | import sys 6 | import unittest 7 | from pathlib import Path 8 | from tempfile import NamedTemporaryFile 9 | 10 | this_dir = Path(__file__).parent 11 | sys.path.insert(0, str(this_dir)) 12 | sys.path.insert(0, str(this_dir.parent)) 13 | 14 | from probables.utilities import Bitarray, MMap, get_x_bits, is_hex_string, is_valid_file, resolve_path  # noqa: E402 15 | from tests.utilities import different_hash  # noqa: E402 16 | 17 | DELETE_TEMP_FILES = True 18 | 19 | 20 | class TestProbablesUtilities(unittest.TestCase): 21 | """test the utilities for pyprobables""" 22 | 23 | def test_is_hex(self): 24 | """test the is valid hex function""" 25 | self.assertTrue(is_hex_string("123467890abcdef")) 26 | self.assertTrue(is_hex_string("123467890ABCDEF")) 27 | self.assertFalse(is_hex_string("123467890abcdfq")) 28 | self.assertFalse(is_hex_string("123467890ABCDEFQ")) 29 | 30 | def test_is_valid_file(self): 31 | """test the is valid file function""" 32 | with NamedTemporaryFile(dir=os.getcwd(), suffix=".rbf", delete=DELETE_TEMP_FILES) as fobj: 33 | self.assertFalse(is_valid_file(None)) 34 | self.assertFalse(is_valid_file("./file_doesnt_exist.txt")) 35 | with open(fobj.name, "w"): 36 | pass 37 | self.assertTrue(is_valid_file(fobj.name)) 38 | 39 | def test_get_x_bits(self): 40 | """test the get x bits function""" 41 | for i in range(8): 42 | res = get_x_bits(i, 4, 2, True) 43 | self.assertEqual(res, i % 4) 44 | for i in range(8): 45 | res = get_x_bits(i, 4, 2, 
False) 46 | if i < 4: 47 | self.assertEqual(res, 0) 48 | else: 49 | self.assertEqual(res, 1) 50 | 51 | def test_get_x_bits_large(self): 52 | """test it on much larger numbers""" 53 | res = different_hash("this is a test", 1)[0] 54 | # 1010100101011011100100010101010011110000001010011010000101001011 55 | tmp1 = get_x_bits(res, 64, 32, True) 56 | tmp2 = get_x_bits(res, 64, 32, False) 57 | self.assertEqual(4029260107, tmp1) 58 | self.assertEqual(2841350484, tmp2) 59 | 60 | tmp1 = get_x_bits(res, 64, 16, True) 61 | tmp2 = get_x_bits(res, 64, 16, False) 62 | self.assertEqual(41291, tmp1) 63 | self.assertEqual(43355, tmp2) 64 | 65 | tmp1 = get_x_bits(res, 64, 8, True) 66 | tmp2 = get_x_bits(res, 64, 8, False) 67 | self.assertEqual(75, tmp1) 68 | self.assertEqual(169, tmp2) 69 | 70 | tmp1 = get_x_bits(res, 64, 4, True) 71 | tmp2 = get_x_bits(res, 64, 4, False) 72 | self.assertEqual(11, tmp1) 73 | self.assertEqual(10, tmp2) 74 | 75 | tmp1 = get_x_bits(res, 64, 2, True) 76 | tmp2 = get_x_bits(res, 64, 2, False) 77 | self.assertEqual(3, tmp1) 78 | self.assertEqual(2, tmp2) 79 | 80 | tmp1 = get_x_bits(res, 64, 1, True) 81 | tmp2 = get_x_bits(res, 64, 1, False) 82 | self.assertEqual(1, tmp1) 83 | self.assertEqual(1, tmp2) 84 | 85 | def test_mmap_functionality(self): 86 | """test some of the MMap class functionality""" 87 | data = b"this is a test of the MMap system!" 88 | with NamedTemporaryFile(dir=os.getcwd(), suffix=".rbf", delete=DELETE_TEMP_FILES) as fobj: 89 | with open(fobj.name, "wb") as fobj: 90 | fobj.write(data) 91 | m = MMap(fobj.name) 92 | self.assertFalse(m.closed) 93 | self.assertEqual(data, m.read()) 94 | m.seek(0, os.SEEK_SET) 95 | self.assertEqual(data[:5], m.read(5)) 96 | self.assertEqual(data[5:], m.read()) 97 | m.close() 98 | self.assertTrue(m.closed) 99 | 100 | def test_resolve_path(self): 101 | """test that resolve_path returns an absolute path""" 102 | p = resolve_path("~") 103 | self.assertTrue(p.is_absolute()) 104 | 105 | with NamedTemporaryFile(dir=os.getcwd(), suffix=".rbf", delete=DELETE_TEMP_FILES) as fobj: 106 | with open(fobj.name, "w"): 107 | pass 108 | p2 = resolve_path(f"./{fobj.name}") 109 | self.assertTrue(p2.is_absolute()) 110 | 111 | def test_bitarray(self): 112 | """test bit array basic operations""" 113 | ba = Bitarray(100) 114 | 115 | self.assertEqual(ba.size, 100) 116 | self.assertEqual(ba.size_bytes, 13) 117 | for i in range(ba.size_bytes): 118 | self.assertEqual(0, ba.bitarray[i]) 119 | 120 | # test setting bits 121 | for i in range(33): 122 | ba.set_bit(i * 3) 123 | 124 | self.assertEqual( 125 | ba.as_string(), 126 | "1001001001001001001001001001001001001001001001001001001001001001001001001001001001001001001001001000", 127 | ) 128 | self.assertEqual(ba.num_bits_set(), 33) 129 | self.assertTrue(ba.is_bit_set(3)) 130 | self.assertFalse(ba.is_bit_set(4)) 131 | self.assertEqual(ba[0], 1) 132 | self.assertEqual(ba[1], 0) 133 | 134 | # test clearing bits 135 | for i in range(33): 136 | ba.clear_bit(i * 3) 137 | 138 | self.assertEqual( 139 | ba.as_string(), 140 | "0000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000", 141 | ) 142 | 143 | for i in range(33): 144 | ba.set_bit(i * 3) 145 | self.assertEqual( 146 | ba.as_string(), 147 | "1001001001001001001001001001001001001001001001001001001001001001001001001001001001001001001001001000", 148 | ) 149 | 150 | self.assertEqual(ba[2], 0) 151 | ba[2] = 1 152 | self.assertEqual(ba[2], 1) 153 | ba[2] = 0 154 | self.assertEqual(ba[2], 0) 155 | 156 | ba.clear() 157 | self.assertEqual( 
158 | ba.as_string(), 159 | "0000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000", 160 | ) 161 | 162 | def test_bitarray_invalid_idx(self): 163 | """test bit array operations with invalid indexes, types, and values""" 164 | self.assertRaises(TypeError, lambda: Bitarray("100")) 165 | self.assertRaises(ValueError, lambda: Bitarray(-100)) 166 | ba = Bitarray(10) 167 | self.assertRaises(IndexError, lambda: ba.set_bit(12)) 168 | self.assertRaises(IndexError, lambda: ba.set_bit(-1)) 169 | self.assertRaises(IndexError, lambda: ba.check_bit(-1)) 170 | self.assertRaises(IndexError, lambda: ba.check_bit(12)) 171 | self.assertRaises(IndexError, lambda: ba.clear_bit(-1)) 172 | self.assertRaises(IndexError, lambda: ba.clear_bit(12)) 173 | 174 | self.assertRaises(IndexError, lambda: ba[-1]) 175 | self.assertRaises(IndexError, lambda: ba[12]) 176 | 177 | def test_set(idx, val): 178 | ba[idx] = val 179 | 180 | self.assertRaises(IndexError, lambda: test_set(-1, 0)) 181 | self.assertRaises(IndexError, lambda: test_set(12, 0)) 182 | # set a non-valid bit value 183 | self.assertRaises(ValueError, lambda: test_set(1, 5)) 184 | self.assertRaises(ValueError, lambda: test_set(12, -1)) 185 | 186 | 187 | if __name__ == "__main__": 188 | unittest.main() 189 | -------------------------------------------------------------------------------- /tests/utilities.py: -------------------------------------------------------------------------------- 1 | """utility functions""" 2 | 3 | from hashlib import md5 4 | from pathlib import Path 5 | from typing import Union 6 | 7 | from probables.constants import UINT64_T_MAX 8 | from probables.hashes import KeyT 9 | 10 | 11 | def calc_file_md5(filename: Union[str, Path]) -> str: 12 | """calc the md5 of a file""" 13 | with open(filename, "rb") as filepointer: 14 | res = filepointer.read() 15 | return md5(res).hexdigest() 16 | 17 | 18 | def different_hash(key: KeyT, depth: int) -> list[int]: 19 | """the default fnv-1a hashing routine, but different""" 20 | 21 | def __fnv_1a(key: KeyT) -> int: 22 | """64 bit fnv-1a hash""" 23 | hval = 14695981039346656074  # offset basis intentionally differs from the standard FNV-1a value 24 | fnv_64_prime = 1099511628211 25 | tmp = list(key) if not isinstance(key, str) else list(map(ord, key)) 26 | for t_str in tmp: 27 | hval ^= t_str 28 | hval *= fnv_64_prime 29 | hval &= UINT64_T_MAX 30 | return hval 31 | 32 | res = [] 33 | for _ in range(depth): 34 | res.append(__fnv_1a(key)) 35 | return res 36 | --------------------------------------------------------------------------------