├── .flake8 ├── .gitignore ├── AUTHORS.rst ├── CONTRIBUTING.rst ├── HISTORY.rst ├── LICENSE ├── MANIFEST.in ├── Makefile ├── README.rst ├── conftest.py ├── docs ├── Makefile ├── authors.rst ├── conf.py ├── contributing.rst ├── history.rst ├── index.rst ├── installation.rst ├── make.bat └── readme.rst ├── python_hll ├── __init__.py ├── hll.py ├── hlltype.py ├── hllutil.py ├── serialization.py └── util.py ├── requirements_dev.txt ├── setup.cfg ├── setup.py ├── tests ├── data │ ├── README.txt │ ├── cumulative_add_cardinality_correction.csv │ ├── cumulative_add_comprehensive_promotion.csv │ ├── cumulative_add_sparse_edge.csv │ ├── cumulative_add_sparse_random.csv │ ├── cumulative_add_sparse_step.csv │ ├── cumulative_union_comprehensive.csv │ ├── cumulative_union_explicit_explicit.csv │ ├── cumulative_union_explicit_promotion.csv │ ├── cumulative_union_probabilistic_probabilistic.csv │ ├── cumulative_union_sparse_full_representation.csv │ ├── cumulative_union_sparse_promotion.csv │ └── cumulative_union_sparse_sparse.csv ├── probabilistic_test_util.py ├── test_big_endian_ascending_word_deserializer.py ├── test_big_endian_ascending_word_serializer.py ├── test_bit_util.py ├── test_bit_vector.py ├── test_explicit_hll.py ├── test_full_hll.py ├── test_hll_serialization.py ├── test_hll_util.py ├── test_integration.py └── test_sparse_hll.py └── tox.ini /.flake8: -------------------------------------------------------------------------------- 1 | [flake8] 2 | max-line-length = 400 -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .idea 2 | .eggs 3 | python_hll.egg-info 4 | .pytest_cache 5 | .tox 6 | __pycache__ 7 | *.pyc 8 | _build 9 | docs/modules.rst 10 | docs/python_hll.rst 11 | /dist 12 | /build -------------------------------------------------------------------------------- /AUTHORS.rst: 
-------------------------------------------------------------------------------- 1 | ======= 2 | Credits 3 | ======= 4 | 5 | Contributors 6 | ------------ 7 | 8 | * Jon Aquino 9 | * Kushagra Verma 10 | * Alex Leu 11 | * Michael Tran 12 | * Rodrigo Westrupp 13 | * Sridharan Subramanian 14 | * Piyush Srivastava 15 | 16 | -------------------------------------------------------------------------------- /CONTRIBUTING.rst: -------------------------------------------------------------------------------- 1 | .. highlight:: shell 2 | 3 | ============ 4 | Contributing 5 | ============ 6 | 7 | Contributions are welcome, and they are greatly appreciated! Every little bit 8 | helps, and credit will always be given. 9 | 10 | You can contribute in many ways: 11 | 12 | Types of Contributions 13 | ---------------------- 14 | 15 | Report Bugs 16 | ~~~~~~~~~~~ 17 | 18 | Report bugs at https://github.com/AdRoll/python-hll/issues. 19 | 20 | If you are reporting a bug, please include: 21 | 22 | * Your operating system name and version. 23 | * Any details about your local setup that might be helpful in troubleshooting. 24 | * Detailed steps to reproduce the bug. 25 | 26 | Fix Bugs 27 | ~~~~~~~~ 28 | 29 | Look through the GitHub issues for bugs. Anything tagged with "bug" and "help 30 | wanted" is open to whoever wants to implement it. 31 | 32 | Implement Features 33 | ~~~~~~~~~~~~~~~~~~ 34 | 35 | Look through the GitHub issues for features. Anything tagged with "enhancement" 36 | and "help wanted" is open to whoever wants to implement it. 37 | 38 | Write Documentation 39 | ~~~~~~~~~~~~~~~~~~~ 40 | 41 | python-hll could always use more documentation, whether as part of the 42 | official python-hll docs, in docstrings, or even on the web in blog posts, 43 | articles, and such. 44 | 45 | Submit Feedback 46 | ~~~~~~~~~~~~~~~ 47 | 48 | The best way to send feedback is to file an issue at https://github.com/AdRoll/python-hll/issues. 
49 | 50 | If you are proposing a feature: 51 | 52 | * Explain in detail how it would work. 53 | * Keep the scope as narrow as possible, to make it easier to implement. 54 | * Remember that this is a volunteer-driven project, and that contributions 55 | are welcome :) 56 | 57 | Get Started! 58 | ------------ 59 | 60 | Ready to contribute? Here's how to set up `python-hll` for local development. 61 | 62 | 1. Fork the `python-hll` repo on GitHub. 63 | 2. Clone your fork locally:: 64 | 65 | $ git clone git@github.com:your_name_here/python-hll.git 66 | 67 | 3. Install your local copy into a virtualenv. Assuming you have virtualenvwrapper installed, this is how you set up your fork for local development:: 68 | 69 | $ cd python-hll/ 70 | $ mkvirtualenv python_hll 71 | $ python setup.py develop 72 | $ pip install -r requirements_dev.txt 73 | 74 | 4. Create a branch for local development:: 75 | 76 | $ git checkout -b name-of-your-bugfix-or-feature 77 | 78 | Now you can make your changes locally. 79 | 80 | 5. When you're done making changes, check that your changes pass flake8 and the 81 | tests, including testing other Python versions with tox:: 82 | 83 | $ make lint 84 | $ make test-fast 85 | 86 | To run one test file or one test:: 87 | 88 | $ py.test --capture=no tests/test_sparse_hll.py 89 | $ py.test --capture=no tests/test_sparse_hll.py::test_add 90 | 91 | To run slow tests:: 92 | 93 | $ make test 94 | 95 | 6. Commit your changes and push your branch to GitHub:: 96 | 97 | $ git add . 98 | $ git commit -m "Your detailed description of your changes." 99 | $ git push origin name-of-your-bugfix-or-feature 100 | 101 | 7. Submit a pull request through the GitHub website. 102 | 103 | Pull Request Guidelines 104 | ----------------------- 105 | 106 | Before you submit a pull request, check that it meets these guidelines: 107 | 108 | 1. The pull request should include tests. 109 | 2. If the pull request adds functionality, the docs should be updated. 
Put 110 | your new functionality into a function with a docstring, and add the 111 | feature to the list in README.rst. 112 | 3. The pull request should work for Python 2.7, 3.4, 3.5 and 3.6, and for PyPy. 113 | Make sure that the tests pass for all supported Python versions. 114 | 115 | Deploying 116 | --------- 117 | 118 | A reminder for the maintainers on how to deploy. 119 | Make sure all your changes are committed (including an entry in HISTORY.rst). 120 | Then run:: 121 | 122 | $ # Run bumpversion patch, or bumpversion minor, or bumpversion major. 123 | $ # This will tag the code and increment/commit new version numbers. 124 | $ bumpversion patch 125 | $ git push 126 | $ git push --tags 127 | $ make release # use your pypi credentials 128 | $ # Log in to https://python-hll.readthedocs.io/ and publish the latest docs -------------------------------------------------------------------------------- /HISTORY.rst: -------------------------------------------------------------------------------- 1 | ======= 2 | History 3 | ======= 4 | 5 | 0.0.0 (2019-06-14) 6 | ------------------ 7 | 8 | * Submitted to AdRoll HackWeek. 9 | 10 | 0.1.0 (2019-09-12) 11 | ------------------ 12 | 13 | * First release on PyPI. 14 | 15 | 0.1.1 (2019-09-12) 16 | ------------------ 17 | 18 | * Add missing install_requires: numpy 19 | 20 | 0.1.2 (2019-12-12) 21 | ------------------ 22 | 23 | * Fix alpha_m_squared for m=32: https://github.com/AdRoll/python-hll/pull/2 24 | 25 | 0.1.3 (2021-01-22) 26 | ------------------ 27 | 28 | * Fix AttributeError: 'HLL' object has no attribute '_sparse_probabilistic_storage': 29 | https://github.com/AdRoll/python-hll/pull/4 -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | The MIT License (MIT) 2 | 3 | Copyright (c) 2019 AdRoll, Inc. 
4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a 6 | copy of this software and associated documentation files (the 7 | "Software"), to deal in the Software without restriction, including 8 | without limitation the rights to use, copy, modify, merge, publish, 9 | distribute, sublicense, and/or sell copies of the Software, and to 10 | permit persons to whom the Software is furnished to do so, subject to 11 | the following conditions: 12 | 13 | The above copyright notice and this permission notice shall be 14 | included in all copies or substantial portions of the Software. 15 | 16 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 17 | EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 18 | MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 19 | IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY 20 | CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, 21 | TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE 22 | SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
23 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include AUTHORS.rst 2 | include CONTRIBUTING.rst 3 | include HISTORY.rst 4 | include LICENSE 5 | include README.rst 6 | 7 | recursive-include tests * 8 | recursive-exclude * __pycache__ 9 | recursive-exclude * *.py[co] 10 | 11 | recursive-include docs *.rst conf.py Makefile make.bat *.jpg *.png *.gif 12 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | .PHONY: clean clean-test clean-pyc clean-build docs help 2 | .DEFAULT_GOAL := help 3 | 4 | define BROWSER_PYSCRIPT 5 | import os, webbrowser, sys 6 | 7 | try: 8 | from urllib import pathname2url 9 | except: 10 | from urllib.request import pathname2url 11 | 12 | webbrowser.open("file://" + pathname2url(os.path.abspath(sys.argv[1]))) 13 | endef 14 | export BROWSER_PYSCRIPT 15 | 16 | define PRINT_HELP_PYSCRIPT 17 | import re, sys 18 | 19 | for line in sys.stdin: 20 | match = re.match(r'^([a-zA-Z_-]+):.*?## (.*)$$', line) 21 | if match: 22 | target, help = match.groups() 23 | print("%-20s %s" % (target, help)) 24 | endef 25 | export PRINT_HELP_PYSCRIPT 26 | 27 | BROWSER := python -c "$$BROWSER_PYSCRIPT" 28 | 29 | help: 30 | @python -c "$$PRINT_HELP_PYSCRIPT" < $(MAKEFILE_LIST) 31 | 32 | clean: clean-build clean-pyc clean-test ## remove all build, test, coverage and Python artifacts 33 | 34 | clean-build: ## remove build artifacts 35 | rm -fr build/ 36 | rm -fr dist/ 37 | rm -fr .eggs/ 38 | find . -name '*.egg-info' -exec rm -fr {} + 39 | find . -name '*.egg' -exec rm -f {} + 40 | 41 | clean-pyc: ## remove Python file artifacts 42 | find . -name '*.pyc' -exec rm -f {} + 43 | find . -name '*.pyo' -exec rm -f {} + 44 | find . -name '*~' -exec rm -f {} + 45 | find . 
-name '__pycache__' -exec rm -fr {} + 46 | 47 | clean-test: ## remove test and coverage artifacts 48 | rm -fr .tox/ 49 | rm -f .coverage 50 | rm -fr htmlcov/ 51 | rm -fr .pytest_cache 52 | 53 | lint: ## check style with flake8 54 | flake8 python_hll tests 55 | 56 | test: ## run slow and fast tests 57 | @echo "\033[0;32mUse 'make test-fast' to run fast tests only\033[0m" 58 | py.test --capture=no 59 | 60 | test-fast: ## run fast tests 61 | py.test --capture=no --fast-only 62 | 63 | test-all: ## run tests on every Python version with tox 64 | tox 65 | 66 | coverage: ## check code coverage quickly with the default Python 67 | coverage run --source python_hll -m pytest 68 | coverage report -m 69 | coverage html 70 | $(BROWSER) htmlcov/index.html 71 | 72 | docs: ## generate Sphinx HTML documentation, including API docs 73 | rm -f docs/python_hll.rst 74 | rm -f docs/modules.rst 75 | sphinx-apidoc -o docs/ python_hll 76 | $(MAKE) -C docs clean 77 | $(MAKE) -C docs html 78 | $(BROWSER) docs/_build/html/index.html 79 | 80 | servedocs: docs ## compile the docs watching for changes 81 | watchmedo shell-command -p '*.rst' -c '$(MAKE) -C docs html' -R -D . 82 | 83 | release: dist ## package and upload a release 84 | twine upload dist/* 85 | 86 | dist: clean ## builds source and wheel package 87 | python setup.py sdist 88 | python setup.py bdist_wheel 89 | ls -l dist 90 | 91 | install: clean ## install the package to the active Python's site-packages 92 | python setup.py install 93 | -------------------------------------------------------------------------------- /README.rst: -------------------------------------------------------------------------------- 1 | ========== 2 | python-hll 3 | ========== 4 | 5 | 6 | .. image:: https://img.shields.io/pypi/v/python_hll.svg 7 | :target: https://pypi.python.org/pypi/python_hll 8 | 9 | .. 
image:: https://readthedocs.org/projects/python-hll/badge/?version=latest 10 | :target: https://python-hll.readthedocs.io/en/latest/?badge=latest 11 | :alt: Documentation Status 12 | 13 | .. image:: https://img.shields.io/badge/github-python--hll-yellow 14 | :target: https://github.com/AdRoll/python-hll 15 | 16 | A Python implementation of `HyperLogLog `_ 17 | whose goal is to be `storage compatible `_ 18 | with `java-hll `_, `js-hll `_ 19 | and `postgresql-hll `_. 20 | 21 | **NOTE:** This is a fairly literal translation/port of `java-hll `_ 22 | to Python. Internally, bytes are represented as Java-style bytes (-128 to 127) rather than Python-style bytes (0 to 255). 23 | Also this implementation is quite slow: for example, in Java ``HLLSerializationTest`` takes 12 seconds to run 24 | while in Python ``test_hll_serialization`` takes 1.5 hours to run (about 400x slower). 25 | 26 | * Runs on: Python 2.7 and 3 27 | * Free software: MIT license 28 | * Documentation: https://python-hll.readthedocs.io 29 | * GitHub: https://github.com/AdRoll/python-hll 30 | 31 | Overview 32 | --------------- 33 | See `java-hll `_ for an overview of what HLLs are and how they work. 34 | 35 | Usage 36 | --------------- 37 | 38 | Hashing and adding a value to a new HLL:: 39 | 40 | from python_hll.hll import HLL 41 | import mmh3 42 | value_to_hash = 'foo' 43 | hashed_value = mmh3.hash(value_to_hash) 44 | 45 | hll = HLL(13, 5) # log2m=13, regwidth=5 46 | hll.add_raw(hashed_value) 47 | 48 | Retrieving the cardinality of an HLL:: 49 | 50 | cardinality = hll.cardinality() 51 | 52 | Unioning two HLLs together (and retrieving the resulting cardinality):: 53 | 54 | hll1 = HLL(13, 5) # log2m=13, regwidth=5 55 | hll2 = HLL(13, 5) # log2m=13, regwidth=5 56 | 57 | # ... (add values to both sets) ... 
# Living at the repository root, this conftest.py puts the project root on
# sys.path so that running a single test file does not hit import errors.
# See https://stackoverflow.com/a/50610630/378457
#
# It additionally declares the --fast-only command-line switch and a fixture
# that exposes the switch's value to tests.

import pytest


def pytest_addoption(parser):
    """Register the ``--fast-only`` command-line flag with pytest.

    :param parser: the option parser object handed to this hook by pytest;
        only its ``addoption`` method is used.
    """
    flag_settings = {"action": "store_true", "help": "Run fast tests only"}
    parser.addoption("--fast-only", **flag_settings)


@pytest.fixture
def fastonly(request):
    """Fixture returning the value of the ``--fast-only`` option.

    :param request: the fixture request object; its ``config`` attribute is
        queried for the option.
    :returns: whatever ``config.getoption("--fast-only")`` yields.
    """
    run_config = request.config
    return run_config.getoption("--fast-only")
9 | BUILDDIR = _build 10 | 11 | # Put it first so that "make" without argument is like "make help". 12 | help: 13 | @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 14 | 15 | .PHONY: help Makefile 16 | 17 | # Catch-all target: route all unknown targets to Sphinx using the new 18 | # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). 19 | %: Makefile 20 | @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 21 | -------------------------------------------------------------------------------- /docs/authors.rst: -------------------------------------------------------------------------------- 1 | .. include:: ../AUTHORS.rst 2 | -------------------------------------------------------------------------------- /docs/conf.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | # 4 | # python_hll documentation build configuration file, created by 5 | # sphinx-quickstart on Fri Jun 9 13:47:02 2017. 6 | # 7 | # This file is execfile()d with the current directory set to its 8 | # containing dir. 9 | # 10 | # Note that not all possible configuration values are present in this 11 | # autogenerated file. 12 | # 13 | # All configuration values have a default; values that are commented out 14 | # serve to show the default. 15 | 16 | # If extensions (or modules to document with autodoc) are in another 17 | # directory, add these directories to sys.path here. If the directory is 18 | # relative to the documentation root, use os.path.abspath to make it 19 | # absolute, like shown here. 20 | # 21 | import os 22 | import sys 23 | sys.path.insert(0, os.path.abspath('..')) 24 | 25 | import python_hll 26 | 27 | # -- General configuration --------------------------------------------- 28 | 29 | # If your documentation needs a minimal Sphinx version, state it here. 
30 | # 31 | # needs_sphinx = '1.0' 32 | 33 | # Add any Sphinx extension module names here, as strings. They can be 34 | # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom ones. 35 | extensions = ['sphinx.ext.autodoc', 'sphinx.ext.viewcode'] 36 | 37 | # Add any paths that contain templates here, relative to this directory. 38 | templates_path = ['_templates'] 39 | 40 | # The suffix(es) of source filenames. 41 | # You can specify multiple suffix as a list of string: 42 | # 43 | # source_suffix = ['.rst', '.md'] 44 | source_suffix = '.rst' 45 | 46 | # The master toctree document. 47 | master_doc = 'index' 48 | 49 | # General information about the project. 50 | project = u'python-hll' 51 | copyright = u"2019, Jon Aquino" 52 | author = u"Jon Aquino" 53 | 54 | # The version info for the project you're documenting, acts as replacement 55 | # for |version| and |release|, also used in various other places throughout 56 | # the built documents. 57 | # 58 | # The short X.Y version. 59 | version = python_hll.__version__ 60 | # The full version, including alpha/beta/rc tags. 61 | release = python_hll.__version__ 62 | 63 | # The language for content autogenerated by Sphinx. Refer to documentation 64 | # for a list of supported languages. 65 | # 66 | # This is also used if you do content translation via gettext catalogs. 67 | # Usually you set "language" from the command line for these cases. 68 | language = None 69 | 70 | # List of patterns, relative to source directory, that match files and 71 | # directories to ignore when looking for source files. 72 | # This patterns also effect to html_static_path and html_extra_path 73 | exclude_patterns = ['_build', 'Thumbs.db', '.DS_Store'] 74 | 75 | # The name of the Pygments (syntax highlighting) style to use. 76 | pygments_style = 'sphinx' 77 | 78 | # If true, `todo` and `todoList` produce output, else they produce nothing. 
79 | todo_include_todos = False 80 | 81 | 82 | # -- Options for HTML output ------------------------------------------- 83 | 84 | # The theme to use for HTML and HTML Help pages. See the documentation for 85 | # a list of builtin themes. 86 | # 87 | html_theme = 'alabaster' 88 | 89 | # Theme options are theme-specific and customize the look and feel of a 90 | # theme further. For a list of options available for each theme, see the 91 | # documentation. 92 | # 93 | # html_theme_options = {} 94 | 95 | # Add any paths that contain custom static files (such as style sheets) here, 96 | # relative to this directory. They are copied after the builtin static files, 97 | # so a file named "default.css" will overwrite the builtin "default.css". 98 | html_static_path = ['_static'] 99 | 100 | 101 | # -- Options for HTMLHelp output --------------------------------------- 102 | 103 | # Output file base name for HTML help builder. 104 | htmlhelp_basename = 'python_hlldoc' 105 | 106 | 107 | # -- Options for LaTeX output ------------------------------------------ 108 | 109 | latex_elements = { 110 | # The paper size ('letterpaper' or 'a4paper'). 111 | # 112 | # 'papersize': 'letterpaper', 113 | 114 | # The font size ('10pt', '11pt' or '12pt'). 115 | # 116 | # 'pointsize': '10pt', 117 | 118 | # Additional stuff for the LaTeX preamble. 119 | # 120 | # 'preamble': '', 121 | 122 | # Latex figure (float) alignment 123 | # 124 | # 'figure_align': 'htbp', 125 | } 126 | 127 | # Grouping the document tree into LaTeX files. List of tuples 128 | # (source start file, target name, title, author, documentclass 129 | # [howto, manual, or own class]). 130 | latex_documents = [ 131 | (master_doc, 'python_hll.tex', 132 | u'python-hll Documentation', 133 | u'Jon Aquino', 'manual'), 134 | ] 135 | 136 | 137 | # -- Options for manual page output ------------------------------------ 138 | 139 | # One entry per manual page. 
# -- Make ReadTheDocs generate API doc ----------------------------------------

# See https://github.com/isogeo/isogeo-api-py-minsdk/commit/df45262dae266035946839009e02e6c5e068a05f
# The hook is only installed when the READTHEDOCS environment variable is 'True'.
on_rtd = os.environ.get('READTHEDOCS', None) == 'True'
if on_rtd:
    def run_apidoc(_):
        """Run sphinx-apidoc over the ``python_hll`` package.

        Connected to Sphinx's ``builder-inited`` event by ``setup()``; the
        single positional argument supplied by the event is ignored.
        """
        try:
            # Sphinx >= 1.7: apidoc lives in sphinx.ext.apidoc and main()
            # takes the option list only (no program name at index 0).
            from sphinx.ext.apidoc import main as apidoc_main
            argv_prefix = []
        except ImportError:
            # Older Sphinx: main() receives a sys.argv-style list, so a
            # placeholder must occupy the program-name slot.
            from sphinx.apidoc import main as apidoc_main
            argv_prefix = [None]

        cur_dir = os.path.abspath(os.path.dirname(__file__))
        output_path = os.path.join(cur_dir, 'docs')
        modules = os.path.join(cur_dir, os.path.normpath(r"../python_hll"))
        exclusions = []
        # -e: one page per module, -f: overwrite existing files, -o: output directory
        apidoc_main(argv_prefix + ['-e', '-f', '-o', output_path, modules] + exclusions)

    def setup(app):
        """Sphinx extension entry point: run apidoc once the builder is initialised."""
        app.connect('builder-inited', run_apidoc)
include:: ../HISTORY.rst 2 | -------------------------------------------------------------------------------- /docs/index.rst: -------------------------------------------------------------------------------- 1 | Welcome to python-hll's documentation! 2 | ====================================== 3 | 4 | .. toctree:: 5 | :maxdepth: 2 6 | :caption: Contents: 7 | 8 | readme 9 | installation 10 | API 11 | contributing 12 | authors 13 | history 14 | 15 | Indices and tables 16 | ================== 17 | * :ref:`genindex` 18 | * :ref:`modindex` 19 | * :ref:`search` 20 | -------------------------------------------------------------------------------- /docs/installation.rst: -------------------------------------------------------------------------------- 1 | .. highlight:: shell 2 | 3 | ============ 4 | Installation 5 | ============ 6 | 7 | 8 | Stable release 9 | -------------- 10 | 11 | To install python-hll, run this command in your terminal: 12 | 13 | .. code-block:: console 14 | 15 | $ pip install python_hll 16 | 17 | This is the preferred method to install python-hll, as it will always install the most recent stable release. 18 | 19 | If you don't have `pip`_ installed, this `Python installation guide`_ can guide 20 | you through the process. 21 | 22 | .. _pip: https://pip.pypa.io 23 | .. _Python installation guide: http://docs.python-guide.org/en/latest/starting/installation/ 24 | 25 | 26 | From sources 27 | ------------ 28 | 29 | The sources for python-hll can be downloaded from the `Github repo`_. 30 | 31 | You can either clone the public repository: 32 | 33 | .. code-block:: console 34 | 35 | $ git clone git://github.com/AdRoll/python-hll 36 | 37 | Or download the `tarball`_: 38 | 39 | .. code-block:: console 40 | 41 | $ curl -OL https://github.com/AdRoll/python-hll/tarball/master 42 | 43 | Once you have a copy of the source, you can install it with: 44 | 45 | .. code-block:: console 46 | 47 | $ python setup.py install 48 | 49 | 50 | .. 
_Github repo: https://github.com/AdRoll/python-hll 51 | .. _tarball: https://github.com/AdRoll/python-hll/tarball/master 52 | -------------------------------------------------------------------------------- /docs/make.bat: -------------------------------------------------------------------------------- 1 | @ECHO OFF 2 | 3 | pushd %~dp0 4 | 5 | REM Command file for Sphinx documentation 6 | 7 | if "%SPHINXBUILD%" == "" ( 8 | set SPHINXBUILD=python -msphinx 9 | ) 10 | set SOURCEDIR=. 11 | set BUILDDIR=_build 12 | set SPHINXPROJ=python_hll 13 | 14 | if "%1" == "" goto help 15 | 16 | %SPHINXBUILD% >NUL 2>NUL 17 | if errorlevel 9009 ( 18 | echo. 19 | echo.The Sphinx module was not found. Make sure you have Sphinx installed, 20 | echo.then set the SPHINXBUILD environment variable to point to the full 21 | echo.path of the 'sphinx-build' executable. Alternatively you may add the 22 | echo.Sphinx directory to PATH. 23 | echo. 24 | echo.If you don't have Sphinx installed, grab it from 25 | echo.http://sphinx-doc.org/ 26 | exit /b 1 27 | ) 28 | 29 | %SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% 30 | goto end 31 | 32 | :help 33 | %SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% 34 | 35 | :end 36 | popd 37 | -------------------------------------------------------------------------------- /docs/readme.rst: -------------------------------------------------------------------------------- 1 | .. 
include:: ../README.rst 2 | -------------------------------------------------------------------------------- /python_hll/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | """Top-level package for python-hll.""" 4 | 5 | __author__ = """Jon Aquino""" 6 | __email__ = 'jonathan.aquino@adroll.com' 7 | __version__ = '0.1.3' 8 | -------------------------------------------------------------------------------- /python_hll/hll.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | from __future__ import division 4 | from copy import deepcopy 5 | from math import ceil, floor 6 | 7 | from python_hll.hlltype import HLLType 8 | from python_hll.serialization import SerializationUtil, HLLMetadata 9 | from python_hll.util import NumberUtil, BitVector, BitUtil 10 | 11 | 12 | class HLL: 13 | """ 14 | A probabilistic set of hashed ``long`` elements. Useful for computing 15 | the approximate cardinality of a stream of data in very small storage. 16 | 17 | A modified version of the `'HyperLogLog' data structure and algorithm 18 | `_ is used, 19 | which combines both probabilistic and non-probabilistic techniques to 20 | improve the accuracy and storage requirements of the original algorithm. 21 | 22 | More specifically, initializing and storing a new HLL will 23 | allocate a sentinel value symbolizing the empty set (HLLType.EMPTY). 24 | After adding the first few values, a sorted list of unique integers is 25 | stored in a HLLType.EXPLICIT hash set. When configured, accuracy can 26 | be sacrificed for memory footprint: the values in the sorted list are 27 | "promoted" to a "HLLType.SPARSE" map-based HyperLogLog structure. 28 | Finally, when enough registers are set, the map-based HLL will be converted 29 | to a bit-packed "HLLType.FULL" HyperLogLog structure. 
30 | 31 | This data structure is interoperable with the implementations found at: 32 | 33 | * `postgresql-hll `_ 34 | * `js-hll `_ 35 | 36 | when `properly serialized `_. 37 | """ 38 | 39 | # minimum and maximum values for the log-base-2 of the number of registers 40 | # in the HLL 41 | MINIMUM_LOG2M_PARAM = 4 42 | MAXIMUM_LOG2M_PARAM = 30 43 | 44 | # minimum and maximum values for the register width of the HLL 45 | MINIMUM_REGWIDTH_PARAM = 1 46 | MAXIMUM_REGWIDTH_PARAM = 8 47 | 48 | # minimum and maximum values for the 'expthresh' parameter of the 49 | # constructor that is meant to match the PostgreSQL implementation's 50 | # constructor and parameter names 51 | MINIMUM_EXPTHRESH_PARAM = -1 52 | MAXIMUM_EXPTHRESH_PARAM = 18 53 | MAXIMUM_EXPLICIT_THRESHOLD = BitUtil.left_shift_int(1, (MAXIMUM_EXPTHRESH_PARAM - 1)) # per storage spec 54 | 55 | # ------------------------------------------------------------ 56 | # STORAGE 57 | # :var set _explicit_storage: storage used when ``type`` is EXPLICIT, None otherwise 58 | # :var dict _sparse_probabilistic_storage: storage used when ``type`` is SPARSE, None otherwise 59 | # :var BitVector _probabilistic_storage: storage used when ``type`` is FULL, None otherwise 60 | # :var HLLType type: current type of this HLL instance, if this changes then so should the storage used (see above) 61 | 62 | # ------------------------------------------------------------ 63 | # CHARACTERISTIC PARAMETERS 64 | # NOTE: These members are named to match the PostgreSQL implementation's parameters. 65 | # :var int _log2m: log2(the number of probabilistic HLL registers) 66 | # :var int _regwidth: the size (width) each register in bits 67 | 68 | # ------------------------------------------------------------ 69 | # COMPUTED CONSTANTS 70 | # ............................................................ 
71 | # EXPLICIT-specific constants 72 | # :var boolean _explicit_off: flag indicating if the EXPLICIT representation should NOT be used 73 | # :var boolean _explicit_auto: flag indicating that the promotion threshold from EXPLICIT should be 74 | # computed automatically. NOTE: this only has meaning when '_explicit_off' is false. 75 | # :var int _explicit_threshold: threshold (in element count) at which a EXPLICIT HLL is converted to a 76 | # SPARSE or FULL HLL, always greater than or equal to zero and always a power of two OR simply zero 77 | # NOTE: this only has meaning when '_explicit_off' is false 78 | # ............................................................ 79 | # SPARSE-specific constants 80 | # :var int _short_word_length: the computed width of the short words 81 | # :var boolean _sparse_off: flag indicating if the SPARSE representation should not be used 82 | # :var int _sparse_threshold: threshold (in register count) at which a SPARSE HLL is converted to a 83 | # FULL HLL, always greater than zero 84 | # ............................................................ 
# Probabilistic algorithm constants
# :var int _m: the number of registers, will always be a power of 2
# :var int _m_bits_mask: a mask of the log2m bits set to one and the rest to zero
# :var int _value_mask: a mask as wide as a register (see ``from_bytes()``)
# :var long _pw_max_mask: mask used to ensure that p(w) does not overflow register (see ``__init__()`` and ``add_raw()``)
# :var float _alpha_m_squared: alpha * m^2 (the constant in the "'raw' HyperLogLog estimator")
# :var float _small_estimator_cutoff: the cutoff value of the estimator for using the "small" range cardinality correction formula
# :var float _large_estimator_cutoff: the cutoff value of the estimator for using the "large" range cardinality correction formula

def __init__(self, log2m, regwidth, expthresh=-1, sparseon=True, type=HLLType.EMPTY):
    """
    Validate the parameters, derive the algorithm constants and allocate
    the storage for the requested starting type.

    NOTE: Arguments here are named and structured identically to those in the
    PostgreSQL implementation.

    :param log2m: log-base-2 of the number of registers used in the HyperLogLog
           algorithm. Must be at least 4 and at most 30.
    :type log2m: int
    :param regwidth: number of bits used per register in the HyperLogLog
           algorithm. Must be at least 1 and at most 8.
    :type regwidth: int
    :param expthresh: tunes when the ``HLLType.EXPLICIT`` to
           ``HLLType.SPARSE`` promotion occurs,
           based on the set's cardinality. Must be at least -1 and at most 18.

           +-----------+------------------------------------------------------------------+
           | expthresh | Meaning                                                          |
           +===========+==================================================================+
           | -1        | Promote at whatever cutoff makes sense for optimal memory usage. |
           |           | ('auto' mode)                                                    |
           +-----------+------------------------------------------------------------------+
           | 0         | Skip ``EXPLICIT`` representation in hierarchy.                   |
           +-----------+------------------------------------------------------------------+
           | 1-18      | Promote at 2:sup:`expthresh - 1` cardinality                     |
           +-----------+------------------------------------------------------------------+
    :type expthresh: int
    :param sparseon: Flag indicating if the ``HLLType.SPARSE``
           representation should be used.
    :type sparseon: boolean
    :param type: the type in the promotion hierarchy which this instance should
           start at. This cannot be ``None``.
    :type type: HLLType
    :raises Exception: if ``log2m``, ``regwidth`` or ``expthresh`` is outside
            its documented range.
    """
    from python_hll.hllutil import HLLUtil

    self._log2m = log2m
    # The upper bound is MAXIMUM_LOG2M_PARAM (the value the error message
    # reports), not the unrelated MAXIMUM_EXPLICIT_THRESHOLD.
    if log2m < HLL.MINIMUM_LOG2M_PARAM or log2m > HLL.MAXIMUM_LOG2M_PARAM:
        raise Exception("'log2m' must be at least " + str(HLL.MINIMUM_LOG2M_PARAM) + " and at most " + str(HLL.MAXIMUM_LOG2M_PARAM) + " (was: " + str(log2m) + ")")

    self._regwidth = regwidth
    if regwidth < HLL.MINIMUM_REGWIDTH_PARAM or regwidth > HLL.MAXIMUM_REGWIDTH_PARAM:
        raise Exception("'regwidth' must be at least " + str(HLL.MINIMUM_REGWIDTH_PARAM) + " and at most " + str(HLL.MAXIMUM_REGWIDTH_PARAM) + " (was: " + str(regwidth) + ")")

    # Constants of the probabilistic algorithm, derived once per instance.
    self._m = BitUtil.left_shift_int(1, log2m)
    self._m_bits_mask = self._m - 1
    self._value_mask = BitUtil.left_shift_int(1, regwidth) - 1
    self._pw_max_mask = HLLUtil.pw_max_mask(regwidth)
    self._alpha_m_squared = HLLUtil.alpha_m_squared(self._m)
    self._small_estimator_cutoff = HLLUtil.small_estimator_cutoff(self._m)
    self._large_estimator_cutoff = HLLUtil.large_estimator_cutoff(log2m, regwidth)

    if expthresh == -1:
        self._explicit_auto = True
        self._explicit_off = False

        # NOTE: This math matches the size calculation in the PostgreSQL impl.
        # Integer arithmetic keeps the results exact ints.
        full_representation_size = (self._regwidth * self._m + 7) // 8  # round up to next whole byte
        num_longs = full_representation_size // 8  # integer division to round down

        self._explicit_threshold = min(num_longs, HLL.MAXIMUM_EXPLICIT_THRESHOLD)
    elif expthresh == 0:
        self._explicit_auto = False
        self._explicit_off = True
        self._explicit_threshold = 0
    elif 0 < expthresh <= HLL.MAXIMUM_EXPTHRESH_PARAM:
        self._explicit_auto = False
        self._explicit_off = False
        self._explicit_threshold = BitUtil.left_shift_int(1, (expthresh - 1))
    else:
        raise Exception("'expthresh' must be at least " + str(HLL.MINIMUM_EXPTHRESH_PARAM) + " and at most " + str(HLL.MAXIMUM_EXPTHRESH_PARAM) + " (was: " + str(expthresh) + ")")

    self._short_word_length = regwidth + log2m
    self._sparse_off = not sparseon
    if self._sparse_off:
        self._sparse_threshold = 0
    else:
        # TODO improve this cutoff to include the cost overhead of members/objects
        largest_pow_2_less_than_cutoff = int(NumberUtil.log2((self._m * self._regwidth) / self._short_word_length))
        self._sparse_threshold = BitUtil.left_shift_int(1, largest_pow_2_less_than_cutoff)

    self._initialize_storage(type)
@classmethod
def create_for_testing(cls, log2m, regwidth, explicit_threshold, sparse_threshold, type):
    """
    Build an HLL with hand-picked promotion thresholds, for tests. Both the
    ``HLLType.EXPLICIT`` and ``HLLType.SPARSE`` representations are enabled.

    :param log2m: log-base-2 of the number of registers used in the HyperLogLog
           algorithm. Must be at least 4 and at most 30.
    :type log2m: int
    :param regwidth: number of bits used per register in the HyperLogLog
           algorithm. Must be at least 1 and at most 8.
    :type regwidth: int
    :param explicit_threshold: cardinality threshold at which the ``HLLType.EXPLICIT``
           representation should be promoted to ``HLLType.SPARSE``.
           This must be greater than zero and less than or equal to ``MAXIMUM_EXPLICIT_THRESHOLD``.
    :type explicit_threshold: int
    :param sparse_threshold: register count threshold at which the ``HLLType.SPARSE``
           representation should be promoted to ``HLLType.FULL``.
           This must be greater than zero (not checked here).
    :type sparse_threshold: int
    :param type: the type in the promotion hierarchy which this instance should
           start at. This cannot be ``None``.
    :type type: HLLType
    :raises Exception: if ``explicit_threshold`` is out of range.
    :rtype: HLL
    """
    # Construct first so that constructor validation errors take precedence.
    instance = HLL(log2m=log2m, regwidth=regwidth, expthresh=-1, sparseon=True, type=type)

    upper_bound = cls.MAXIMUM_EXPLICIT_THRESHOLD
    if explicit_threshold < 1 or explicit_threshold > upper_bound:
        raise Exception("'explicit_threshold' must be at least 1 and at most " + str(cls.MAXIMUM_EXPLICIT_THRESHOLD) + " (was: " + str(explicit_threshold) + ")")

    # Override the thresholds the constructor computed in 'auto' mode.
    instance._explicit_auto = False
    instance._explicit_off = False
    instance._explicit_threshold = explicit_threshold
    instance._sparse_off = False
    instance._sparse_threshold = sparse_threshold
    return instance
def get_type(self):
    """
    Report where this instance currently sits in the promotion hierarchy.

    :returns: the current type; never ``None``.
    :rtype: HLLType
    """
    return self._type


def add_raw(self, raw_value):
    """
    Adds ``raw_value`` directly to the HLL, promoting the representation
    when a threshold is crossed.

    :param long raw_value: the value to be added. It is very important that this
           value already be hashed with a strong (but not
           necessarily cryptographic) hash function, for instance MurmurHash3.
    :raises Exception: if the instance holds an unsupported type.
    :rtype: void
    """
    current_type = self._type

    if current_type == HLLType.EMPTY:
        # An EMPTY HLL is always promoted on its first add.
        if self._explicit_threshold > 0:
            self._initialize_storage(HLLType.EXPLICIT)
            self._explicit_storage.add(raw_value)
        elif not self._sparse_off:
            self._initialize_storage(HLLType.SPARSE)
            self._add_raw_sparse_probabilistic(raw_value)
        else:
            self._initialize_storage(HLLType.FULL)
            self._add_raw_probabilistic(raw_value)
        return

    if current_type == HLLType.EXPLICIT:
        self._explicit_storage.add(raw_value)
        if len(self._explicit_storage) <= self._explicit_threshold:
            return

        # Threshold exceeded: replay every explicit value into the next
        # enabled representation, then drop the explicit set.
        if self._sparse_off:
            self._initialize_storage(HLLType.FULL)
            replay = self._add_raw_probabilistic
        else:
            self._initialize_storage(HLLType.SPARSE)
            replay = self._add_raw_sparse_probabilistic
        for stored_value in self._explicit_storage:
            replay(stored_value)
        self._explicit_storage = None
        return

    if current_type == HLLType.SPARSE:
        self._add_raw_sparse_probabilistic(raw_value)

        # Too many populated registers: copy them into a full register vector.
        if len(self._sparse_probabilistic_storage) > self._sparse_threshold:
            self._initialize_storage(HLLType.FULL)
            for register_index, register_value in self._sparse_probabilistic_storage.items():
                self._probabilistic_storage.set_max_register(register_index, register_value)
            self._sparse_probabilistic_storage = None
        return

    if current_type == HLLType.FULL:
        self._add_raw_probabilistic(raw_value)
        return

    raise Exception("Unsupported HLL type: {}".format(self._type))
def _add_raw_sparse_probabilistic(self, raw_value):
    """
    Adds the raw value to the sparse register map. ``type`` must be
    ``HLLType.SPARSE``.

    :param long raw_value: the raw value to add to the sparse storage.
    :rtype: void
    """
    # The bits above the register index form the sub-stream whose
    # least-significant set bit determines p(w).
    sub_stream_value = BitUtil.unsigned_right_shift_long(raw_value, self._log2m)
    if sub_stream_value == 0:
        # The paper does not cover p(0x0); it is treated as 0, the initial
        # register value, so the value is simply ignored.
        return

    # p(w): position of the least significant set bit (one-indexed).
    # OR-ing with pw_max_mask caps the position so that p(w) never exceeds
    # the largest value a register can hold.
    p_w = BitUtil.to_signed_byte(1 + BitUtil.least_significant_bit(sub_stream_value | self._pw_max_mask))
    if p_w == 0:
        # A zero register means "unset", and unset registers are never
        # stored in the sparse map.
        return

    # NOTE: no +1 as in paper since 0-based indexing
    register_index = int(raw_value & self._m_bits_mask)

    registers = self._sparse_probabilistic_storage
    if p_w > registers.get(register_index, 0):
        registers[register_index] = p_w
def _add_raw_probabilistic(self, raw_value):
    """
    Adds the raw value to the full register vector. ``type`` must be
    ``HLLType.FULL``.

    :param long raw_value: the raw value to add to the full probabilistic storage.
    :rtype: void
    """
    # The bits above the register index form the sub-stream whose
    # least-significant set bit determines p(w).
    sub_stream_value = BitUtil.unsigned_right_shift_long(raw_value, self._log2m)
    if sub_stream_value == 0:
        # The paper does not cover p(0x0); it is treated as 0, the initial
        # register value, so the value is simply ignored.
        return

    # p(w): position of the least significant set bit (one-indexed).
    # OR-ing with pw_max_mask caps the position so that p(w) never exceeds
    # the largest value a register can hold.
    p_w = BitUtil.to_signed_byte(1 + BitUtil.least_significant_bit(sub_stream_value | self._pw_max_mask))
    if p_w == 0:
        # Zero is the registers' initial value, so there is nothing to record.
        return

    # NOTE: no +1 as in paper since 0-based indexing
    register_index = int(raw_value & self._m_bits_mask)
    self._probabilistic_storage.set_max_register(register_index, p_w)
def _initialize_storage(self, type):
    """
    Switches the instance to ``type`` and allocates fresh storage for it.

    :param HLLType type: the ``HLLType`` to initialize storage for. This cannot be
           ``None`` and must be an instantiable type. (For instance,
           it cannot be ``HLLType.UNDEFINED``.)
    :raises Exception: if ``type`` is not instantiable; the instance's type
            has already been overwritten at that point.
    :rtype: void
    """
    self._type = type

    if type == HLLType.EMPTY:
        # An empty HLL needs no storage.
        return
    if type == HLLType.EXPLICIT:
        self._explicit_storage = set()
        return
    if type == HLLType.SPARSE:
        self._sparse_probabilistic_storage = {}
        return
    if type == HLLType.FULL:
        self._probabilistic_storage = BitVector(self._regwidth, self._m)
        return

    raise Exception("Unsupported HLL type: {}".format(self._type))
def cardinality(self):
    """
    Computes the cardinality of the HLL.

    :returns: the cardinality of HLL. This will never be negative.
    :raises Exception: if the instance holds an unsupported type.
    :rtype: long
    """
    current_type = self._type
    if current_type == HLLType.EMPTY:
        return 0  # by definition
    if current_type == HLLType.EXPLICIT:
        # The explicit set holds the raw values themselves, so its size is exact.
        return len(self._explicit_storage)
    if current_type == HLLType.SPARSE:
        return ceil(self._sparse_probabilistic_algorithm_cardinality())
    if current_type == HLLType.FULL:
        return ceil(self._full_probabilistic_algorithm_cardinality())
    raise Exception("Unsupported HLL type: {}".format(self._type))


def _sparse_probabilistic_algorithm_cardinality(self):
    """
    Computes the unrounded estimate of the HLL algorithm for a
    ``HLLType.SPARSE`` HLL. Kept separate from ``cardinality()`` for testing
    purposes. ``type`` must be ``HLLType.SPARSE``.

    :returns: the exact, unrounded cardinality given by the HLL algorithm
    :rtype: float
    """
    from python_hll.hllutil import HLLUtil

    register_count = self._m
    registers = self._sparse_probabilistic_storage

    # The "indicator function" is sum(2^(-M[j])) over all registers; a
    # register missing from the sparse map counts as zero.
    inverse_power_sum = 0.0
    empty_registers = 0  # "V" in the paper
    for register_index in range(register_count):
        register_value = registers.get(register_index, 0)
        inverse_power_sum += 1.0 / BitUtil.left_shift_long(1, register_value)
        if register_value == 0:
            empty_registers += 1

    # Raw estimate, then the small/large range corrections.
    raw_estimate = self._alpha_m_squared / inverse_power_sum
    if empty_registers != 0 and raw_estimate < self._small_estimator_cutoff:
        return HLLUtil.small_estimator(register_count, empty_registers)
    if raw_estimate <= self._large_estimator_cutoff:
        return raw_estimate
    return HLLUtil.large_estimator(self._log2m, self._regwidth, raw_estimate)
def _full_probabilistic_algorithm_cardinality(self):
    """
    Computes the unrounded estimate of the HLL algorithm for a
    ``HLLType.FULL`` HLL. Kept separate from ``cardinality()`` for testing
    purposes. ``type`` must be ``HLLType.FULL``.

    :returns: the exact, unrounded cardinality given by the HLL algorithm
    :rtype: float
    """
    from python_hll.hllutil import HLLUtil

    register_count = self._m

    # The "indicator function" is sum(2^(-M[j])) over all registers.
    inverse_power_sum = 0
    empty_registers = 0  # "V" in the paper
    for register_value in self._probabilistic_storage.register_iterator():
        inverse_power_sum += 1.0 / BitUtil.left_shift_long(1, register_value)
        if register_value == 0:
            empty_registers += 1

    # Raw estimate, then the small/large range corrections.
    raw_estimate = self._alpha_m_squared / inverse_power_sum
    if empty_registers != 0 and raw_estimate < self._small_estimator_cutoff:
        return HLLUtil.small_estimator(register_count, empty_registers)
    if raw_estimate <= self._large_estimator_cutoff:
        return raw_estimate
    return HLLUtil.large_estimator(self._log2m, self._regwidth, raw_estimate)
def clear(self):
    """
    Clears the HLL. The HLL will have cardinality zero and will act as if no
    elements have been added.

    NOTE: Unlike ``add_raw()``, ``clear`` does NOT handle transitions between
    ``HLLType``'s - a probabilistic type will remain probabilistic after
    being cleared.

    :raises Exception: if the instance holds an unsupported type.
    :rtype: void
    """
    current_type = self._type
    if current_type == HLLType.EMPTY:
        return  # nothing stored, nothing to clear
    if current_type == HLLType.EXPLICIT:
        self._explicit_storage.clear()
        return
    if current_type == HLLType.SPARSE:
        self._sparse_probabilistic_storage.clear()
        return
    if current_type == HLLType.FULL:
        # Reset every register while keeping the allocated vector.
        self._probabilistic_storage.fill(0)
        return
    raise Exception('Unsupported HLL type: {}'.format(self._type))


def union(self, other):
    """
    Computes the union of HLLs and stores the result in this instance.

    :param HLL other: the other ``HLL`` instance to union into this one. This
           cannot be ``None``.
    :rtype: void
    """
    # TODO: verify HLL compatibility
    same_representation = self._type == other.get_type()
    if same_representation:
        self._homogeneous_union(other)
        return
    self._heterogenous_union(other)
def _heterogeneous_union_for_empty_hll(self, other):
    """
    Unions a non-empty ``other`` into this EMPTY instance. The result is a
    copy of ``other``, stored in whichever representation this instance's
    own thresholds and flags allow.

    :param HLL other: the non-empty source HLL.
    :rtype: void
    """
    source_type = other.get_type()

    if source_type == HLLType.EXPLICIT:
        # src: EXPLICIT, dest: EMPTY
        if len(other._explicit_storage) <= self._explicit_threshold:
            self._type = HLLType.EXPLICIT
            self._explicit_storage = deepcopy(other._explicit_storage)
            return

        # Too many values for this instance's explicit threshold: start
        # from the next enabled representation and replay the raw values.
        self._initialize_storage(HLLType.FULL if self._sparse_off else HLLType.SPARSE)
        for raw_value in other._explicit_storage:
            self.add_raw(raw_value)
        return

    if source_type == HLLType.SPARSE:
        # src: SPARSE, dest: EMPTY
        if self._sparse_off:
            self._initialize_storage(HLLType.FULL)
            for register_index, register_value in other._sparse_probabilistic_storage.items():
                self._probabilistic_storage.set_max_register(register_index, register_value)
        else:
            self._type = HLLType.SPARSE
            self._sparse_probabilistic_storage = deepcopy(other._sparse_probabilistic_storage)
        return

    # src: FULL, dest: EMPTY
    self._type = HLLType.FULL
    self._probabilistic_storage = deepcopy(other._probabilistic_storage)


def _heterogeneous_union_for_non_empty_hll(self, other):
    """
    Unions ``other`` into this instance when both are non-empty and their
    types differ.

    :param HLL other: the non-empty source HLL of a different type.
    :rtype: void
    """
    destination_type = self._type

    if destination_type == HLLType.EXPLICIT:
        # src: FULL/SPARSE, dest: EXPLICIT
        # The destination cannot hold the source (it has smaller capacity),
        # so the source is cloned and the destination's raw values are then
        # inserted into the clone.
        if other.get_type() == HLLType.SPARSE:
            if self._sparse_off:
                self._initialize_storage(HLLType.FULL)
                for register_index, register_value in other._sparse_probabilistic_storage.items():
                    self._probabilistic_storage.set_max_register(register_index, register_value)
            else:
                self._type = HLLType.SPARSE
                self._sparse_probabilistic_storage = deepcopy(other._sparse_probabilistic_storage)
        else:
            # source is HLLType.FULL
            self._type = HLLType.FULL
            self._probabilistic_storage = deepcopy(other._probabilistic_storage)

        # NOTE: add_raw may promote a SPARSE clone further, to FULL.
        for raw_value in self._explicit_storage:
            self.add_raw(raw_value)
        self._explicit_storage = None
        return

    if destination_type == HLLType.SPARSE:
        if other.get_type() == HLLType.EXPLICIT:
            # src: EXPLICIT, dest: SPARSE
            # NOTE: add_raw will handle promotion cleanup
            for raw_value in other._explicit_storage:
                self.add_raw(raw_value)
            return

        # src: FULL, dest: SPARSE
        # Clone the source and merge this instance's registers into it.
        self._type = HLLType.FULL
        self._probabilistic_storage = deepcopy(other._probabilistic_storage)
        for register_index, register_value in self._sparse_probabilistic_storage.items():
            self._probabilistic_storage.set_max_register(register_index, register_value)
        self._sparse_probabilistic_storage = None
        return

    # destination is HLLType.FULL; promotion is not possible from here.
    if other._type == HLLType.EXPLICIT:
        # src: EXPLICIT, dest: FULL
        for raw_value in other._explicit_storage:
            self.add_raw(raw_value)
    else:
        # src: SPARSE, dest: FULL
        for register_index, register_value in other._sparse_probabilistic_storage.items():
            self._probabilistic_storage.set_max_register(register_index, register_value)
def _heterogenous_union(self, other):
    """
    Unions ``other`` into this instance when the two types differ.

    Two cases cover every heterogeneous union: unions involving an EMPTY
    HLL, and unions between two different types among EXPLICIT/SPARSE/FULL.

    :type other: HLL
    :rtype: void
    """
    if self._type == HLLType.EMPTY:
        # EMPTY destination: the result is a copy of the source.
        self._heterogeneous_union_for_empty_hll(other)
    elif other.get_type() != HLLType.EMPTY:
        # Both sides are non-empty. (An EMPTY source would leave the
        # destination unchanged, so that case does nothing.)
        self._heterogeneous_union_for_non_empty_hll(other)
def _homogeneous_union(self, other):
    """
    Computes the union of two HLLs of the same type, and stores the
    result in this instance.

    :param HLL other: the other ``HLL`` instance to union into this one. This
           cannot be ``None``.
    :raises Exception: if the instance holds an unsupported type.
    :rtype: void
    """
    current_type = self._type

    if current_type == HLLType.EMPTY:
        # union of empty and empty is empty
        return

    if current_type == HLLType.EXPLICIT:
        # Note: add_raw() will handle promotion, if necessary
        for raw_value in other._explicit_storage:
            self.add_raw(raw_value)
        return

    if current_type == HLLType.SPARSE:
        own_registers = self._sparse_probabilistic_storage
        # Keep the larger value for every register the source has populated.
        for register_index, register_value in other._sparse_probabilistic_storage.items():
            if register_value > own_registers.get(register_index, 0):
                own_registers[register_index] = register_value

        # promotion, if necessary
        if len(own_registers) > self._sparse_threshold:
            self._initialize_storage(HLLType.FULL)
            for register_index, register_value in own_registers.items():
                self._probabilistic_storage.set_max_register(register_index, register_value)
            self._sparse_probabilistic_storage = None
        return

    if current_type == HLLType.FULL:
        source_registers = other._probabilistic_storage
        for register_index in range(self._m):
            self._probabilistic_storage.set_max_register(register_index, source_registers.get_register(register_index))
        return

    raise Exception('Unsupported HLL type: {}'.format(self._type))
def to_bytes(self, schema_version=SerializationUtil.DEFAULT_SCHEMA_VERSION):
    """
    Serializes the HLL to an array of bytes in the format dictated by
    ``schema_version`` (by default ``SerializationUtil.DEFAULT_SCHEMA_VERSION``).

    :param SchemaVersion schema_version: the schema version dictating the serialization format
    :returns: the array of bytes representing the HLL. This will never be
              ``None`` or empty.
    :raises Exception: if the instance holds an unsupported type.
    :rtype: list
    """
    from python_hll.hllutil import HLLUtil

    current_type = self._type

    if current_type == HLLType.EMPTY:
        # Only room for the metadata that is written below.
        payload = [0] * schema_version.padding_bytes(current_type)

    elif current_type == HLLType.EXPLICIT:
        writer = schema_version.get_serializer(current_type, HLLUtil.LONG_BIT_LENGTH, len(self._explicit_storage))
        # Values are written in ascending order.
        for raw_value in sorted(self._explicit_storage):
            writer.write_word(raw_value)
        payload = writer.get_bytes()

    elif current_type == HLLType.SPARSE:
        registers = self._sparse_probabilistic_storage
        writer = schema_version.get_serializer(current_type, self._short_word_length, len(registers))
        for register_index in sorted(registers):
            # pack index and value into "short word"
            short_word = BitUtil.left_shift_int(register_index, self._regwidth) | registers[register_index]
            writer.write_word(short_word)
        payload = writer.get_bytes()

    elif current_type == HLLType.FULL:
        writer = schema_version.get_serializer(current_type, self._regwidth, self._m)
        self._probabilistic_storage.get_register_contents(writer)
        payload = writer.get_bytes()

    else:
        raise Exception('Unsupported HLL type: {}'.format(self._type))

    # The explicit cutoff is only meaningful when it was given explicitly,
    # i.e. neither 'auto' nor 'off'; otherwise zero is recorded.
    if self._explicit_auto or self._explicit_off:
        log2_explicit_threshold = 0
    else:
        log2_explicit_threshold = int(NumberUtil.log2(self._explicit_threshold))

    metadata = HLLMetadata(
        schema_version.schema_version_number(),
        current_type,
        self._log2m,
        self._regwidth,
        log2_explicit_threshold,
        self._explicit_off,
        self._explicit_auto,
        not self._sparse_off
    )
    schema_version.write_metadata(payload, metadata)

    return payload
@classmethod
def from_bytes(cls, bytes):
    """
    Deserializes the HLL (in ``to_bytes()`` format) serialized into ``bytes``.

    :param list bytes: the serialized bytes of new HLL
    :returns: the deserialized HLL. This will never be ``None``.
    :raises Exception: if the encoded type is not supported.
    :rtype: HLL
    """
    from python_hll.hllutil import HLLUtil

    schema_version = SerializationUtil.get_schema_version(bytes)
    metadata = schema_version.read_metadata(bytes)

    hll_type = metadata.hll_type()
    reg_width = metadata.register_width()
    log_2m = metadata.register_count_log2()
    sparseon = metadata.sparse_enabled()

    if metadata.explicit_auto():
        expthresh = -1
    elif metadata.explicit_off():
        expthresh = 0
    else:
        # NOTE: take into account that the postgres-compatible constructor
        #       subtracts one before taking a power of two.
        expthresh = metadata.log2_explicit_cutoff() + 1

    hll = HLL(log_2m, reg_width, expthresh, sparseon, hll_type)

    # Short-circuit on empty, which needs no other deserialization.
    if hll_type == HLLType.EMPTY:
        return hll

    # Width of one encoded word depends on the representation.
    if hll_type == HLLType.EXPLICIT:
        word_length = HLLUtil.LONG_BIT_LENGTH  # 64 for both java and python
    elif hll_type == HLLType.SPARSE:
        word_length = hll._short_word_length
    elif hll_type == HLLType.FULL:
        word_length = hll._regwidth
    else:
        raise Exception('Unsupported HLL type: {}'.format(hll_type))

    deserializer = schema_version.get_deserializer(hll_type, word_length, bytes)

    if hll_type == HLLType.EXPLICIT:
        # NOTE: This should not exceed expthresh and this will always
        #       be exactly the number of words that were encoded,
        #       because the word length is at least a byte wide.
        # SEE: BigEndianAscendingWordDeserializer.total_word_count()
        for _ in range(deserializer.total_word_count()):
            hll._explicit_storage.add(deserializer.read_word())

    elif hll_type == HLLType.SPARSE:
        # NOTE: Padding arithmetic may yield extra trailing words; they are
        #       all zeroes, and zero registers are skipped below.
        for _ in range(deserializer.total_word_count()):
            short_word = deserializer.read_word()
            register_value = BitUtil.to_signed_byte(short_word & hll._value_mask)
            # Only set non-zero registers.
            if register_value == 0:
                continue
            register_key = int(BitUtil.unsigned_right_shift_long(short_word, hll._regwidth))
            hll._sparse_probabilistic_storage[register_key] = register_value

    else:
        # FULL (every other type returned or raised above).
        # NOTE: Iteration is done using m (register count) and NOT
        #       deserializer.total_word_count() because regwidth may be
        #       less than 8 and as such the padding on the 'last' byte
        #       may be larger than regwidth, causing an extra register
        #       to be read.
        # SEE: BigEndianAscendingWordDeserializer.total_word_count()
        for register_index in range(hll._m):
            hll._probabilistic_storage.set_register(register_index, deserializer.read_word())

    return hll
class HLLType:
    """
    The representations (algorithm/data structure) an HLL can use, as plain
    integer constants numbered 1 through 5.
    """
    # UNDEFINED is used by the PostgreSQL implementation to indicate
    # legacy/corrupt/incompatible/unknown formats.
    EMPTY, EXPLICIT, SPARSE, FULL, UNDEFINED = range(1, 6)
class HLLUtil:
    """
    Static helpers that compute the constants and parameters used by the HLL
    algorithm.
    """

    # Bit width of a long value in two's complement binary form.
    LONG_BIT_LENGTH = 64

    # Lookup table behind ``pw_max_mask()``, indexed by
    # ``register_size_in_bits``. Entries were precomputed with this formula::
    #
    #     int max_register_value = (1 << register_size_in_bits) - 1;
    #     // Mask with all bits set except for (max_register_value - 1) least significant bits (see add_raw())
    #     return ~((1L << (max_register_value - 1)) - 1);
    PW_MASK = [
        -9223372036854775808,  # register_size_in_bits == 0
        -1,                    # register_size_in_bits == 1
        -4,                    # register_size_in_bits == 2
        -64,                   # register_size_in_bits == 3
        -16384,                # register_size_in_bits == 4
        -1073741824,           # register_size_in_bits == 5
        -4611686018427387904,  # register_size_in_bits == 6
        -4611686018427387904,  # register_size_in_bits == 7 (same value as 6)
        -4611686018427387904,  # register_size_in_bits == 8 (same value as 6)
    ]

    # Stride between consecutive register widths inside ``TWO_TO_L``.
    REG_WIDTH_INDEX_MULTIPLIER = HLL.MAXIMUM_LOG2M_PARAM + 1

    @classmethod
    def _two_to_l(cls, log2m, register_size_in_bits):
        """
        Looks up the precomputed ``TWO_TO_L`` entry for the given parameters.

        :param int log2m: log-base-2 of the number of registers in the HLL.
        :param int register_size_in_bits: the size of the HLL registers, in bits.
        :returns: the table entry at ``(REG_WIDTH_INDEX_MULTIPLIER * register_size_in_bits) + log2m``.
        """
        return TWO_TO_L[(cls.REG_WIDTH_INDEX_MULTIPLIER * register_size_in_bits) + log2m]

    @classmethod
    def register_bit_size(cls, expected_unique_elements):
        """
        Computes the register bit-width needed to estimate a set of the given
        cardinality, never going below ``HLL.MINIMUM_REGWIDTH_PARAM``.

        :param long expected_unique_elements: an upper bound on the number of
               unique elements that are expected. This must be greater than zero.
        :returns: a register size in bits (i.e. ``log2(log2(n))``)
        :rtype: int
        """
        width = NumberUtil.log2(NumberUtil.log2(expected_unique_elements))
        return max(HLL.MINIMUM_REGWIDTH_PARAM, width)

    @classmethod
    def alpha_m_squared(cls, m):
        """
        Computes the 'alpha-m-squared' constant used by the HyperLogLog algorithm.

        :param int m: the register count. This must be a power of two, cannot
               be less than 16 (2:sup:`4`), and cannot be greater than 65536
               (2:sup:`16`). Only the lower bound is checked here.
        :returns: gamma times ``m`` squared, where gamma depends on ``m``.
        :rtype: float
        :raises Exception: if ``m`` is less than 16.
        """
        if m < 16:
            raise Exception("'m' cannot be less than 16 ({m} < 16).".format(m=m))

        # Fixed gamma values for the three smallest register counts; every
        # other count uses the closed-form expression.
        gamma = {16: 0.673, 32: 0.697, 64: 0.709}.get(m)
        if gamma is None:
            gamma = 0.7213 / (1.0 + 1.079 / m)
        return gamma * m * m

    @classmethod
    def pw_max_mask(cls, register_size_in_bits):
        """
        Returns the mask that prevents overflow of HyperLogLog registers.

        :param int register_size_in_bits: the size of the HLL registers, in bits.
        :returns: the ``PW_MASK`` entry for that register size.
        :rtype: long
        """
        return cls.PW_MASK[register_size_in_bits]

    @classmethod
    def small_estimator_cutoff(cls, m):
        """
        The cutoff for using the "small range correction" formula, in the
        HyperLogLog algorithm.

        :param int m: the number of registers in the HLL. m in the paper.
        :returns: the cutoff for the small range correction, ``(5/2) * m``.
        :rtype: float
        """
        return (float(m) * 5) / 2

    @classmethod
    def small_estimator(cls, m, number_of_zeroes):
        """
        The "small range correction" formula from the HyperLogLog algorithm.
        Only appropriate if both the estimator is smaller than ``(5/2) * m``
        and there are still registers that have the zero value.

        :param int m: the number of registers in the HLL. m in the paper.
        :param int number_of_zeroes: the number of registers with value zero.
               ``V`` in the paper.
        :returns: a corrected cardinality estimate.
        :rtype: float
        """
        zero_ratio = float(m) / number_of_zeroes
        return m * log(zero_ratio)

    @classmethod
    def large_estimator_cutoff(cls, log2m, register_size_in_bits):
        """
        The cutoff for using the "large range correction" formula, from the
        HyperLogLog algorithm, adapted for 64 bit hashes.

        :param int log2m: log-base-2 of the number of registers in the HLL. b in the paper.
        :param int register_size_in_bits: the size of the HLL registers, in bits.
        :returns: the cutoff for the large range correction (the ``TWO_TO_L``
                  entry divided by 30).
        :rtype: float
        """
        return cls._two_to_l(log2m, register_size_in_bits) / 30.0

    @classmethod
    def large_estimator(cls, log2m, register_size_in_bits, estimator):
        """
        The "large range correction" formula from the HyperLogLog algorithm,
        adapted for 64 bit hashes. Only appropriate for estimators whose value
        exceeds the return of ``large_estimator_cutoff()``.

        :param int log2m: log-base-2 of the number of registers in the HLL. b in the paper.
        :param int register_size_in_bits: the size of the HLL registers, in bits.
        :param float estimator: the original estimator ("E" in the paper).
        :returns: a corrected cardinality estimate, or ``0`` when the
                  logarithm is undefined for the given estimator.
        :rtype: float
        """
        two_to_l = cls._two_to_l(log2m, register_size_in_bits)
        try:
            return -1 * two_to_l * log(1.0 - (estimator/two_to_l))
        except ValueError:
            # math.log() rejected its argument; report zero instead.
            return 0


# Precomputed ``twoToL`` values indexed by a linear combination of
# ``regwidth`` and ``log2m``.
#
# The array is one-dimensional and can be accessed by using index
# ``(REG_WIDTH_INDEX_MULTIPLIER * regwidth) + log2m``
# for ``regwidth`` and ``log2m`` between the specified
# ``HLL.{MINIMUM,MAXIMUM}_{REGWIDTH,LOG2M}_PARAM`` constants.
# Slots outside those ranges are never assigned and keep their initial 0.0.
#
# This runs at import time and must stay below the ``HLLUtil`` class, because
# it reads ``HLLUtil.REG_WIDTH_INDEX_MULTIPLIER``.
#
# See ``large_estimator()``.
# See ``large_estimator_cutoff()``.
# See the blog post section on 2^L.
TWO_TO_L = [0.0] * (HLL.MAXIMUM_REGWIDTH_PARAM + 1) * (HLL.MAXIMUM_LOG2M_PARAM + 1)
for reg_width in range(HLL.MINIMUM_REGWIDTH_PARAM, HLL.MAXIMUM_REGWIDTH_PARAM+1):
    for log2m in range(HLL.MINIMUM_LOG2M_PARAM, HLL.MAXIMUM_LOG2M_PARAM+1):
        # Largest value a register of this width can hold.
        max_register_value = BitUtil.left_shift_int(1, reg_width) - 1

        # Since 1 is added to p(w) in the insertion algorithm, only
        # (max_register_value - 1) bits are inspected hence the hash
        # space is one power of two smaller.
        pw_bits = max_register_value - 1
        total_bits = pw_bits + log2m
        two_to_l = 2**total_bits
        TWO_TO_L[(HLLUtil.REG_WIDTH_INDEX_MULTIPLIER * reg_width) + log2m] = two_to_l
class BigEndianAscendingWordDeserializer:
    """
    A corresponding deserializer for BigEndianAscendingWordSerializer.

    Reads fixed bit-width words, highest bits first, from a list of byte
    values, skipping a fixed number of leading padding bytes.
    """

    # The number of bits per byte.
    BITS_PER_BYTE = 8

    # long mask for the maximum value stored in a byte
    BYTE_MASK = BitUtil.left_shift_long(1, BITS_PER_BYTE) - 1

    # :var int _word_length: The length in bits of the words to be read.
    # :var list _bytes: The byte array to which the words are serialized.
    # :var int _byte_padding: The number of leading padding bytes in 'bytes' to be ignored.
    # :var int word_count: The number of words that the byte array contains.
    # :var int current_word_index: The current read state.

    def __init__(self, word_length, byte_padding, bytes):
        """
        :param int word_length: the length in bits of the words to be deserialized. Must
               be less than or equal to 64 and greater than or equal to 1.
        :param int byte_padding: the number of leading bytes that pad the serialized words.
        :param list bytes: the byte array containing the serialized words. Cannot be ``None``.
        :raises ValueError: if ``word_length`` or ``byte_padding`` is out of range.
        """
        if not 1 <= word_length <= 64:
            raise ValueError("Word length must be >= 1 and <= 64. (was: {word_length})".format(word_length=word_length))

        if byte_padding < 0:
            raise ValueError("Byte padding must be >= zero. (was: {byte_padding})".format(byte_padding=byte_padding))

        self._word_length = word_length
        self._bytes = bytes
        self._byte_padding = byte_padding

        self.data_bytes = (len(bytes) - byte_padding)
        self.data_bits = self.data_bytes * self.BITS_PER_BYTE

        # Integer division keeps the count exact (no float round-trip). The
        # clamp keeps the count non-negative when the padding is longer than
        # the backing array.
        self.word_count = max(self.data_bits, 0) // self._word_length

        self.current_word_index = 0

    def read_word(self):
        """
        Return the next word in the sequence. Should not be called more than ``total_word_count`` times.

        :rtype: long
        :raises ValueError: if the next word lies outside the backing array.
        """
        word = self._read_word(self.current_word_index)
        self.current_word_index += 1
        return word

    def _read_word(self, position):
        """
        Reads the word at the specific sequence position (zero-indexed).

        :param int position: the zero-indexed position of the word to be read. This must be greater
               than or equal to zero.
        :returns: the value of the serialized word at the specific position.
        :rtype: long
        :raises ValueError: if ``position`` is negative or the word extends
                past the end of the backing array.
        """
        if position < 0:
            raise ValueError("Array index out of bounds for {position}".format(position=position))

        bits_per_byte = self.BITS_PER_BYTE

        # First bit of the word: which byte it lives in, and how many bits of
        # that byte belong to the previous word.
        first_bit_index = position * self._word_length
        first_byte_offset, first_byte_skip_bits = divmod(first_bit_index, bits_per_byte)
        first_byte_index = self._byte_padding + first_byte_offset

        # Last bit of the word
        last_bit_index = first_bit_index + self._word_length - 1
        last_byte_index = self._byte_padding + last_bit_index // bits_per_byte

        bits_after_byte_boundary = (last_bit_index + 1) % bits_per_byte

        # If the word terminates at the end of the last byte, consume the whole
        # last byte.
        if bits_after_byte_boundary == 0:
            last_byte_bits_to_consume = bits_per_byte
        else:
            # Otherwise, only consume what is necessary.
            last_byte_bits_to_consume = bits_after_byte_boundary

        if last_byte_index >= len(self._bytes):
            raise ValueError("Word out of bounds of backing array, {} >= {}".format(last_byte_index, len(self._bytes)))

        # Accumulator
        value = 0

        # -------------------------------------------------------------------
        # First byte
        bits_remaining_in_first_byte = bits_per_byte - first_byte_skip_bits
        bits_to_consume_in_first_byte = min(bits_remaining_in_first_byte, self._word_length)
        first_byte = self._bytes[first_byte_index]

        # Mask off the bits to skip in the first byte.
        first_byte_mask = BitUtil.left_shift_long(1, bits_remaining_in_first_byte) - 1
        first_byte &= first_byte_mask

        # Right-align relevant bits of first byte.
        first_byte = BitUtil.unsigned_right_shift_long(
            first_byte,
            bits_remaining_in_first_byte - bits_to_consume_in_first_byte
        )

        value |= first_byte

        # If the first byte contains the whole word, short-circuit.
        if first_byte_index == last_byte_index:
            return value

        # -------------------------------------------------------------
        # Middle bytes
        middle_byte_count = last_byte_index - first_byte_index - 1
        for i in range(middle_byte_count):
            middle_byte = self._bytes[first_byte_index + i + 1] & self.BYTE_MASK
            # Push middle byte onto accumulator.
            value = BitUtil.left_shift_long(value, bits_per_byte)
            value |= middle_byte

        # --------------------------------------------------
        # Last byte
        last_byte = (self._bytes[last_byte_index] & self.BYTE_MASK)
        last_byte >>= bits_per_byte - last_byte_bits_to_consume
        value = BitUtil.left_shift_long(value, last_byte_bits_to_consume)
        value |= last_byte
        return value

    def total_word_count(self):
        """
        Returns the number of words that could be encoded in the sequence.

        NOTE: the sequence that was encoded may be shorter than the value this
              method returns due to padding issues within bytes. This guarantees
              only an upper bound on the number of times ``read_word()``
              can be called.

        :returns: the maximum number of words that could be read from the sequence.
        :rtype: int
        """
        return self.word_count
class BigEndianAscendingWordSerializer:
    """
    A serializer that writes a sequence of fixed bit-width 'words' to a byte array.
    Bitwise OR is used to write words into bytes, so a low bit in a word is also
    a low bit in a byte. However, a high byte in a word is written at a lower index
    in the array than a low byte in a word. The first word is written at the lowest
    array index. Each serializer is one time use and returns its backing byte
    array.

    This encoding was chosen so that when reading bytes as octets in the typical
    first-octet-is-the-high-nibble fashion, an octet-to-binary conversion
    would yield a high-to-low, left-to-right view of the "short words".

    Example:

    Say short words are 5 bits wide. Our word sequence is the values
    ``[31, 1, 5]``. In big-endian binary format, the values are
    ``[0b11111, 0b00001, 0b00101]``. We use 15 of 16 bits in two bytes
    and pad the last (lowest) bit of the last byte with a zero::

        [0b11111000, 0b01001010] = [0xF8, 0x4A]
    """

    # The number of bits per byte.
    BITS_PER_BYTE = 8

    # :var int _bits_left_in_byte: Number of bits that remain writable in the current byte.
    # :var int _byte_index: Index of byte currently being written to.
    # :var int _words_written: Number of words written.

    def __init__(self, word_length, word_count, byte_padding):
        """
        :param int word_length: the length in bits of the words to be serialized. Must
               be greater than or equal to 1 and less than or equal to 64.
        :param int word_count: the number of words to be serialized. Must be greater than
               or equal to zero.
        :param int byte_padding: the number of leading bytes that should pad the
               serialized words. Must be greater than or equal to zero.
        :raises ValueError: if any argument is outside its allowed range.
        """
        if (word_length < 1) or (word_length > 64):
            raise ValueError('Word length must be >= 1 and <= 64. (was: {})'.format(word_length))
        if word_count < 0:
            raise ValueError('Word count must be >= 0. (was: {})'.format(word_count))
        if byte_padding < 0:
            raise ValueError('Byte padding must be must be >= 0. (was: {})'.format(byte_padding))

        self._word_length = word_length
        self._word_count = word_count

        # Round the payload up to whole bytes using integer arithmetic only,
        # so the size is exact regardless of magnitude.
        bits_required = word_length * word_count
        data_bytes_required = (bits_required + self.BITS_PER_BYTE - 1) // self.BITS_PER_BYTE
        self._bytes = [0] * (data_bytes_required + byte_padding)

        self._bits_left_in_byte = self.BITS_PER_BYTE
        self._byte_index = byte_padding
        self._words_written = 0

    def write_word(self, word):
        """
        Writes the word to the backing array.

        :param long word: the word to write.
        :rtype: void
        :raises ValueError: if ``word_count`` words have already been written.
        """
        if self._words_written == self._word_count:
            raise ValueError('Cannot write more words, backing array full!')

        bits_left_in_word = self._word_length

        while bits_left_in_word > 0:
            # Move to the next byte if the current one is fully packed.
            if self._bits_left_in_byte == 0:
                self._byte_index += 1
                self._bits_left_in_byte = self.BITS_PER_BYTE

            # Mask selecting the not-yet-written low bits of the word; a full
            # 64-bit word keeps every bit.
            consumed_mask = ~0 if bits_left_in_word == 64 else (BitUtil.left_shift_long(1, bits_left_in_word) - 1)

            # Fix how many bits will be written in this cycle. Choose the
            # smaller of the remaining bits in the word or byte.
            number_of_bits_to_write = min(self._bits_left_in_byte, bits_left_in_word)
            bits_in_byte_remaining_after_write = self._bits_left_in_byte - number_of_bits_to_write

            # In general, we write the highest bits of the word first, so we
            # strip the highest bits that were consumed in previous cycles.
            remaining_bits_of_word_to_write = (word & consumed_mask)

            if bits_left_in_word > number_of_bits_to_write:
                # If there is more left in the word than can be written to this
                # byte, shift off the bits that can't be written off the bottom.
                bits_that_the_byte_can_accept = BitUtil.unsigned_right_shift_long(remaining_bits_of_word_to_write, bits_left_in_word - self._bits_left_in_byte)
            else:
                # If the byte can accept all remaining bits, there is no need
                # to shift off the bits that won't be written in this cycle.
                bits_that_the_byte_can_accept = remaining_bits_of_word_to_write

            # Align the word bits to write up against the byte bits that have
            # already been written. This shift may do nothing if the remainder
            # of the byte is being consumed in this cycle.
            aligned_bits = BitUtil.left_shift_long(bits_that_the_byte_can_accept, bits_in_byte_remaining_after_write)

            # Update the byte with the aligned bits.
            self._bytes[self._byte_index] |= BitUtil.to_signed_byte(aligned_bits)

            # Update state with bit count written.
            bits_left_in_word -= number_of_bits_to_write
            self._bits_left_in_byte = bits_in_byte_remaining_after_write

        self._words_written += 1

    def get_bytes(self):
        """
        Returns the backing array of ``byte``'s that contain the serialized words.

        :returns: the serialized words as a list of bytes.
        :rtype: list
        :raises ValueError: if fewer than ``word_count`` words have been written.
        """
        if self._words_written < self._word_count:
            raise ValueError('Not all words have been written! ({}/{})'.format(self._words_written, self._word_count))
        return self._bytes
class HLLMetadata:
    """
    The metadata and parameters associated with a HLL.

    A plain value holder: the constructor stores its arguments unchanged and
    each accessor returns the corresponding stored value.
    """

    def __init__(self, schema_version, type, register_count_log2, register_width, log2_explicit_cutoff, explicit_off, explicit_auto, sparse_enabled):
        """
        :param int schema_version: the schema version number of the HLL. This must
               be greater than or equal to zero.
        :param HLLType type: the type of the HLL. This cannot be ``None``.
        :param int register_count_log2: the log-base-2 register count parameter for
               probabilistic HLLs. This must be greater than or equal to zero.
        :param int register_width: the register width parameter for probabilistic
               HLLs. This must be greater than or equal to zero.
        :param int log2_explicit_cutoff: the log-base-2 of the explicit cardinality cutoff,
               if it is explicitly defined. (If ``explicit_off`` or ``explicit_auto`` is True then
               this has no meaning.)
        :param boolean explicit_off: the flag for 'explicit off'-mode, where the
               ``HLLType.EXPLICIT`` representation is not used. Both this and
               ``explicit_auto`` cannot be True at the same time.
        :param boolean explicit_auto: the flag for 'explicit auto'-mode, where the
               ``HLLType.EXPLICIT`` representation's promotion cutoff is
               determined based on in-memory size automatically. Both this and
               ``explicit_off`` cannot be True at the same time.
        :param boolean sparse_enabled: the flag for 'sparse-enabled'-mode, where the
               ``HLLType.SPARSE`` representation is used.
        """
        # None of the constraints described above are validated here.
        self._schema_version = schema_version
        self._type = type
        self._register_count_log2 = register_count_log2
        self._register_width = register_width
        self._log2_explicit_cutoff = log2_explicit_cutoff
        self._explicit_off = explicit_off
        self._explicit_auto = explicit_auto
        self._sparse_enabled = sparse_enabled

    def schema_version(self):
        """
        :returns: the schema version of the HLL.
        :rtype: int
        """
        return self._schema_version

    def hll_type(self):
        """
        :returns: the type of the HLL.
        :rtype: HLLType
        """
        return self._type

    def register_count_log2(self):
        """
        :returns: the log-base-2 of the register count parameter of the HLL.
        :rtype: int
        """
        return self._register_count_log2

    def register_width(self):
        """
        :returns: the register width parameter of the HLL.
        :rtype: int
        """
        return self._register_width

    def log2_explicit_cutoff(self):
        """
        :returns: the log-base-2 of the explicit cutoff cardinality. Has no
                  meaning when ``explicit_off()`` or ``explicit_auto()`` is True.
        :rtype: int
        """
        return self._log2_explicit_cutoff

    def explicit_off(self):
        """
        :returns: True if the ``HLLType.EXPLICIT`` representation
                  has been disabled. False otherwise.
        :rtype: boolean
        """
        return self._explicit_off

    def explicit_auto(self):
        """
        :returns: True if the ``HLLType.EXPLICIT`` representation
                  cutoff cardinality is set to be automatically chosen,
                  False otherwise.
        :rtype: boolean
        """
        return self._explicit_auto

    def sparse_enabled(self):
        """
        :returns: True if the HLLType.SPARSE representation is enabled.
        :rtype: boolean
        """
        return self._sparse_enabled

    def __str__(self):
        """
        :returns: a one-line, human-readable summary of the metadata fields.
        :rtype: str
        """
        # One placeholder per argument: an empty format string combined with
        # a non-empty argument tuple raises TypeError.
        template = (
            "<HLLMetadata schema_version: %s, type: %s, register_count_log2: %s, "
            "register_width: %s, log2_explicit_cutoff: %s, explicit_off: %s, explicit_auto: %s>"
        )
        return template % (self._schema_version, self._type, self._register_count_log2, self._register_width, self._log2_explicit_cutoff, self._explicit_off, self._explicit_auto)
class SchemaVersionOne:
    """
    Serialization schema, version one, for HLLs: packs HLL metadata into the
    header bytes of a serialized HLL and unpacks it again.
    """

    # Version number written into (and expected in) the version byte.
    SCHEMA_VERSION = 1

    # HLL types in ordinal order: a type's list position is its ordinal.
    TYPE_ORDINALS = [
        HLLType.UNDEFINED,
        HLLType.EMPTY,
        HLLType.EXPLICIT,
        HLLType.SPARSE,
        HLLType.FULL
    ]

    # Every HLL type uses the same number of header bytes.
    HEADER_BYTE_COUNT = 3

    # Spec sentinels stored in the cutoff byte for 'explicit off' and 'auto'.
    EXPLICIT_OFF = 0
    EXPLICIT_AUTO = 63

    def padding_bytes(self, type):
        """
        The number of metadata bytes required for a serialized HLL of the
        specified type.

        :param HLLType type: the type of the serialized HLL (not consulted;
               the header size is the same for every type)
        :returns: the number of padding bytes needed in order to fully
                  accommodate the needed metadata.
        :rtype: int
        """
        return self.HEADER_BYTE_COUNT

    def write_metadata(self, bytes, metadata):
        """
        Writes the three metadata header bytes into a serialized HLL.

        :param list bytes: the padded data bytes of the HLL; indices 0-2 are overwritten
        :param HLLMetadata metadata: the metadata to write to the padding bytes
        :rtype: void
        """
        ordinal = self._get_ordinal(metadata.hll_type())

        # An explicit cutoff of 2^n is stored as n + 1; the sentinels below
        # take its place in 'off' and 'auto' modes.
        cutoff_code = metadata.log2_explicit_cutoff() + 1
        if metadata.explicit_off():
            cutoff_code = self.EXPLICIT_OFF
        elif metadata.explicit_auto():
            cutoff_code = self.EXPLICIT_AUTO

        width = metadata.register_width()
        count_log2 = metadata.register_count_log2()

        bytes[0] = SerializationUtil.pack_version_byte(self.SCHEMA_VERSION, ordinal)
        bytes[1] = SerializationUtil.pack_parameters_byte(width, count_log2)
        bytes[2] = SerializationUtil.pack_cutoff_byte(cutoff_code, metadata.sparse_enabled())

    def read_metadata(self, bytes):
        """
        Reads the metadata bytes of the serialized HLL.

        :param list bytes: the serialized HLL; indices 0-2 are read
        :returns: the HLL metadata
        :rtype: HLLMetadata
        """
        version_byte, parameters_byte, cutoff_byte = bytes[0], bytes[1], bytes[2]

        ordinal = SerializationUtil.type_ordinal(version_byte)
        cutoff_code = SerializationUtil.explicit_cutoff(cutoff_byte)
        is_off = (cutoff_code == self.EXPLICIT_OFF)
        is_auto = (cutoff_code == self.EXPLICIT_AUTO)

        # A sentinel carries no cutoff; -1 marks it as not applicable.
        if is_off or is_auto:
            log2_cutoff = -1
        else:
            log2_cutoff = cutoff_code - 1

        return HLLMetadata(
            SchemaVersionOne.SCHEMA_VERSION,
            self._get_type(ordinal),
            SerializationUtil.register_count_log2(parameters_byte),
            SerializationUtil.register_width(parameters_byte),
            log2_cutoff,
            is_off,
            is_auto,
            SerializationUtil.sparse_enabled(cutoff_byte),
        )

    def get_serializer(self, type, word_length, word_count):
        """
        Builds an HLL serializer that matches this schema version.

        :param HLLType type: the HLL type that will be serialized. This cannot be ``None``.
        :param int word_length: the length, in bits, of the 'words' that
               comprise the data of the HLL.
        :param int word_count: the number of 'words' in the HLL's data.

        :returns: a byte array serializer used to serialize a HLL according
                  to this schema version's specification.
        :rtype: BigEndianAscendingWordSerializer
        """
        header_bytes = self.padding_bytes(type)
        return BigEndianAscendingWordSerializer(word_length, word_count, header_bytes)

    def get_deserializer(self, type, word_length, bytes):
        """
        Builds an HLL deserializer that matches this schema version.

        :param HLLType type: the HLL type that will be deserialized. This cannot be ``None``.
        :param int word_length: the length, in bits, of the 'words' that
               comprise the data of the serialized HLL.
        :param list bytes: the serialized HLL to deserialize. This cannot be ``None``.
        :returns: a byte array deserializer used to deserialize a HLL serialized
                  according to this schema version's specification.
        :rtype: BigEndianAscendingWordDeserializer
        """
        header_bytes = self.padding_bytes(type)
        return BigEndianAscendingWordDeserializer(word_length, header_bytes, bytes)

    def schema_version_number(self):
        """
        :returns: the schema version number
        :rtype: int
        """
        return self.SCHEMA_VERSION

    @classmethod
    def _get_ordinal(cls, type):
        """
        Gets the ordinal for the specified ``HLLType``.

        :param HLLType type: the type whose ordinal is desired
        :returns: the ordinal for the specified type, to be used in the version byte.
                  This will always be non-negative.
        :rtype: int
        :raises ValueError: if ``type`` is not listed in ``TYPE_ORDINALS``.
        """
        return cls.TYPE_ORDINALS.index(type)

    @classmethod
    def _get_type(cls, ordinal):
        """
        Gets the ``HLLType`` for the specified ordinal.

        :param int ordinal: the ordinal whose type is desired
        :returns: the type for the specified ordinal.
        :rtype: HLLType
        :raises ValueError: if ``ordinal`` is not a valid index into ``TYPE_ORDINALS``.
        """
        known_types = cls.TYPE_ORDINALS
        if not 0 <= ordinal < len(known_types):
            raise ValueError('Invalid type ordinal {}. Only 0-{} inclusive allowed'.format(
                ordinal, (len(known_types) - 1)))
        return known_types[ordinal]
558 | NIBBLE_MASK = BitUtil.left_shift_int(1, NIBBLE_BITS) - 1 559 | 560 | # ************************************************************************ 561 | # Serialization utilities 562 | 563 | # Schema version one (v1). 564 | VERSION_ONE = SchemaVersionOne() 565 | 566 | # The default schema version for serializing HLLs. 567 | DEFAULT_SCHEMA_VERSION = VERSION_ONE 568 | 569 | # List of registered schema versions, indexed by their version numbers. If 570 | # an entry is ``None``, then no such schema version is registered. 571 | # Similarly, registering a new schema version simply entails assigning an 572 | # SchemaVersion instance to the appropriate index of this array. 573 | # 574 | # By default, only SchemaVersionOne is registered. Note that version 575 | # zero will always be reserved for internal (e.g. proprietary, legacy) schema 576 | # specifications/implementations and will never be assigned to in by this 577 | # library. 578 | REGISTERED_SCHEMA_VERSIONS = [None, VERSION_ONE] 579 | 580 | @classmethod 581 | def get_schema_version_from_number(cls, schema_version_number): 582 | """ 583 | :param int schema_version_number: the version number of the ``SchemaVersion`` 584 | desired. This must be a registered schema version number. 585 | 586 | :returns: The ``SchemaVersion`` for the given number. This will never be ``None``. 587 | :rtype: SchemaVersion 588 | """ 589 | if schema_version_number >= len(cls.REGISTERED_SCHEMA_VERSIONS) or schema_version_number < 0: 590 | raise ValueError('Invalid schema version number {}'.format(schema_version_number)) 591 | schema_version = cls.REGISTERED_SCHEMA_VERSIONS[schema_version_number] 592 | 593 | if schema_version is None: 594 | raise ValueError('Unknown schema version number {}'.format(schema_version_number)) 595 | return schema_version 596 | 597 | @classmethod 598 | def get_schema_version(cls, bytes): 599 | """ 600 | Get the appropriate ``SchemaVersion`` for the specified 601 | serialized HLL. 
602 | 603 | :param list bytes: the serialized HLL whose schema version is desired. 604 | 605 | :returns the schema version for the specified HLL. This will never be ``None``. 606 | :rtype: SchemaVersion 607 | """ 608 | version_byte = bytes[0] 609 | schema_version_number = cls.schema_version(version_byte) 610 | 611 | return cls.get_schema_version_from_number(schema_version_number) 612 | 613 | @classmethod 614 | def pack_version_byte(cls, schema_version, type_ordinal): 615 | """ 616 | Generates a byte that encodes the schema version and the type ordinal of the HLL. 617 | 618 | The top nibble is the schema version and the bottom nibble is the type ordinal. 619 | 620 | :param int schema_version: the schema version to encode. 621 | :param int type_ordinal: the type ordinal of the HLL to encode. 622 | :returns: the packed version byte 623 | :rtype: byte 624 | """ 625 | return BitUtil.to_signed_byte(BitUtil.left_shift_int(cls.NIBBLE_MASK & schema_version, cls.NIBBLE_BITS) | (cls.NIBBLE_MASK & type_ordinal)) 626 | 627 | @classmethod 628 | def pack_cutoff_byte(cls, explicit_cutoff, sparse_enabled): 629 | """ 630 | Generates a byte that encodes the log-base-2 of the explicit cutoff or sentinel values for 631 | 'explicit-disabled' or 'auto', as well as the boolean indicating whether to use ``HLLType.SPARSE`` 632 | in the promotion hierarchy. 633 | 634 | The top bit is always padding, the second highest bit indicates the 635 | 'sparse-enabled' boolean, and the lowest six bits encode the explicit 636 | cutoff value. 637 | 638 | :param int explicit_cutoff: the explicit cutoff value to encode. 639 | * If 'explicit-disabled' is chosen, this value should be ``0``. 640 | * If a cutoff of 2:sup:`n` is desired, for``0 <= n < 31``, this value should be ``n + 1``. 641 | :param boolean sparse_enabled: whether ``HLLType.SPARSE`` 642 | should be used in the promotion hierarchy to improve HLL 643 | storage. 
644 | :rtype: byte 645 | """ 646 | sparse_bit = BitUtil.left_shift_int(1, cls.EXPLICIT_CUTOFF_BITS) if sparse_enabled else 0 647 | return BitUtil.to_signed_byte(sparse_bit | (cls.EXPLICIT_CUTOFF_MASK & explicit_cutoff)) 648 | 649 | @classmethod 650 | def pack_parameters_byte(cls, register_width, register_count_log2): 651 | """ 652 | Generates a byte that encodes the parameters of a ``HLLType.FULL`` or ``HLLType.SPARSE`` HLL. 653 | 654 | The top 3 bits are used to encode ``register_width - 1`` 655 | (range of ``register_width`` is thus 1-8) and the bottom 5 656 | bits are used to encode ``register_count_log2`` 657 | (range of ``register_count_log2`` is thus 0-31). 658 | 659 | :param int register_width: the register width (must be at least 1 and at 660 | most 8) 661 | :param int register_count_log2: the log-base-2 of the register count (must 662 | be at least 0 and at most 31) 663 | :returns: the packed parameters byte 664 | :rtype: byte 665 | """ 666 | width_bits = (register_width - 1) & cls.REGISTER_WIDTH_MASK 667 | count_bits = register_count_log2 & cls.LOG2_REGISTER_COUNT_MASK 668 | return BitUtil.to_signed_byte(BitUtil.to_signed_byte(BitUtil.left_shift_int(width_bits, cls.LOG2_REGISTER_COUNT_BITS) | count_bits)) 669 | 670 | @classmethod 671 | def sparse_enabled(cls, cutoff_byte): 672 | """ 673 | Extracts the 'sparse-enabled' boolean from the cutoff byte of a serialized HLL. 674 | 675 | :param byte cutoff_byte: the cutoff byte of the serialized HLL 676 | :returns: the 'sparse-enabled' boolean 677 | :rtype: boolean 678 | """ 679 | return (BitUtil.unsigned_right_shift_byte(cutoff_byte, cls.EXPLICIT_CUTOFF_BITS) & 1) == 1 680 | 681 | @classmethod 682 | def explicit_cutoff(cls, cutoff_byte): 683 | """ 684 | Extracts the explicit cutoff value from the cutoff byte of a serialized HLL.
685 | 686 | :param byte cutoff_byte: the cutoff byte of the serialized HLL 687 | :returns: the explicit cutoff value 688 | :rtype: int 689 | """ 690 | return cutoff_byte & cls.EXPLICIT_CUTOFF_MASK 691 | 692 | @classmethod 693 | def schema_version(cls, version_byte): 694 | """ 695 | Extracts the schema version from the version byte of a serialized HLL. 696 | 697 | :param byte version_byte: the version byte of the serialized HLL 698 | :returns: the schema version of the serialized HLL 699 | :rtype: int 700 | """ 701 | return cls.NIBBLE_MASK & BitUtil.unsigned_right_shift_byte(version_byte, cls.NIBBLE_BITS) 702 | 703 | @classmethod 704 | def type_ordinal(cls, version_byte): 705 | """ 706 | Extracts the type ordinal from the version byte of a serialized HLL. 707 | 708 | :param byte version_byte: the version byte of the serialized HLL 709 | :returns: the type ordinal of the serialized HLL 710 | :rtype: int 711 | """ 712 | return version_byte & cls.NIBBLE_MASK 713 | 714 | @classmethod 715 | def register_width(cls, parameters_byte): 716 | """ 717 | Extracts the register width from the parameters byte of a serialized ``HLLType.FULL`` HLL. 718 | 719 | :param byte parameters_byte: the parameters byte of the serialized HLL 720 | :returns: the register width of the serialized HLL 721 | :rtype: int 722 | """ 723 | return (BitUtil.unsigned_right_shift_byte(parameters_byte, cls.LOG2_REGISTER_COUNT_BITS) & cls.REGISTER_WIDTH_MASK) + 1 724 | 725 | @classmethod 726 | def register_count_log2(cls, parameters_byte): 727 | """ 728 | Extracts the log2(register_count) from the parameters byte of a serialized ``HLLType.FULL`` HLL. 
729 | 730 | :param byte parameters_byte: the parameters byte of the serialized HLL 731 | :returns: log2(registerCount) of the serialized HLL 732 | :rtype: int 733 | """ 734 | return parameters_byte & cls.LOG2_REGISTER_COUNT_MASK 735 | -------------------------------------------------------------------------------- /python_hll/util.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | from math import log 4 | import numpy as np 5 | 6 | 7 | class BitUtil: 8 | """ 9 | A collection of bit utilities. 10 | """ 11 | 12 | # The set of least-significant bits for a given ``byte``. ``-1`` 13 | # is used if no bits are set (so as to not be confused with "index of zero" 14 | # meaning that the least significant bit is the 0th (1st) bit). 15 | LEAST_SIGNIFICANT_BIT = [ 16 | -1, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 17 | 4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 18 | 5, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 19 | 4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 20 | 6, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 21 | 4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 22 | 5, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 23 | 4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 24 | 7, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 25 | 4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 26 | 5, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 27 | 4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 28 | 6, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 29 | 4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 30 | 5, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 31 | 4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0 32 | ] 33 | 34 | @classmethod 35 | def least_significant_bit(cls, value): 36 | """ 37 | Computes the least-significant bit of the specified ``long`` 38 | that is set to ``1``. Zero-indexed. 39 | 40 | See 41 | and . 42 | 43 | :param long value: the ``long`` whose least-significant bit is desired. 
44 | :returns: the least-significant bit of the specified ``long``. 45 | ``-1`` is returned if there are no bits set. 46 | :rtype: int 47 | """ 48 | 49 | if value == 0: 50 | # by contract 51 | return -1 52 | 53 | elif value & 0xFF != 0: 54 | index = int(cls.unsigned_right_shift_long(value, 0) & 0xFF) 55 | return cls.LEAST_SIGNIFICANT_BIT[index] + 0 56 | 57 | elif value & 0xFFFF != 0: 58 | index = int(cls.unsigned_right_shift_long(value, 8) & 0xFF) 59 | return cls.LEAST_SIGNIFICANT_BIT[index] + 8 60 | 61 | elif value & 0xFFFFFF != 0: 62 | index = int(cls.unsigned_right_shift_long(value, 16) & 0xFF) 63 | return cls.LEAST_SIGNIFICANT_BIT[index] + 16 64 | 65 | elif value & 0xFFFFFFFF != 0: 66 | index = int(cls.unsigned_right_shift_long(value, 24) & 0xFF) 67 | return cls.LEAST_SIGNIFICANT_BIT[index] + 24 68 | 69 | elif value & 0xFFFFFFFFFF != 0: 70 | index = int(cls.unsigned_right_shift_long(value, 32) & 0xFF) 71 | return cls.LEAST_SIGNIFICANT_BIT[index] + 32 72 | 73 | elif value & 0xFFFFFFFFFFFF != 0: 74 | index = int(cls.unsigned_right_shift_long(value, 40) & 0xFF) 75 | return cls.LEAST_SIGNIFICANT_BIT[index] + 40 76 | 77 | elif value & 0xFFFFFFFFFFFFFF != 0: 78 | index = int(cls.unsigned_right_shift_long(value, 48) & 0xFF) 79 | return cls.LEAST_SIGNIFICANT_BIT[index] + 48 80 | 81 | else: 82 | index = int(cls.unsigned_right_shift_long(value, 56) & 0xFF) 83 | return cls.LEAST_SIGNIFICANT_BIT[index] + 56 84 | 85 | @classmethod 86 | def unsigned_right_shift_long(cls, val, n): 87 | """ 88 | Equivalent to Java >>> on a long value 89 | """ 90 | return val if n == 0 else int(np.uint64(val) >> np.uint64(n)) 91 | 92 | @classmethod 93 | def unsigned_right_shift_int(cls, val, n): 94 | """ 95 | Equivalent to Java >>> on an int value 96 | """ 97 | return val if n == 0 else int(np.uint32(val) >> np.uint32(n)) 98 | 99 | @classmethod 100 | def unsigned_right_shift_byte(cls, val, n): 101 | """ 102 | Equivalent to Java >>> on a byte value 103 | """ 104 | return val if n == 0 else 
int(np.uint32(val) >> np.uint32(n)) 105 | 106 | @classmethod 107 | def to_signed_byte(cls, i): 108 | """ 109 | Converts a Python byte (unsigned integer from 0 to 255) to a Java byte 110 | (signed two's complement integer from -128 to 127). 111 | :type i: byte 112 | :rtype: byte 113 | """ 114 | return i if i <= 127 else i - 256 115 | 116 | @classmethod 117 | def left_shift_long(cls, long_x, int_y): 118 | """ 119 | Simulates a Java << for a long. 120 | 121 | :param long_x: expected long value in python code 122 | :param int_y: expected int value in python 123 | :returns: left shift result for, x << y 124 | :rtype: long 125 | """ 126 | x = np.int64(long_x) 127 | y = np.int(int_y) 128 | z = np.left_shift(x, y) 129 | 130 | return np.int64(z.item()) 131 | 132 | @classmethod 133 | def left_shift_int(cls, int_x, int_y): 134 | """ 135 | Simulates a Java << for an integer. 136 | 137 | :param int_x: expected int value in python code 138 | :param int_y: expected int value in python 139 | :returns: left shift result for, x << y 140 | :rtype: int 141 | """ 142 | x = np.int32(int_x) 143 | y = np.int(int_y) 144 | z = np.left_shift(x, y) 145 | 146 | return z.item() 147 | 148 | @classmethod 149 | def left_shift_byte(cls, byte_x, int_y): 150 | """ 151 | Simulates a Java << for a byte. 152 | 153 | :param byte_x: expected byte value in python code 154 | :param int_y: expected int value in python 155 | :returns: left shift result for, x << y 156 | :rtype: int 157 | """ 158 | x = np.int8(byte_x) # converts to signed byte, since byte is signed in java 159 | y = np.int(int_y) 160 | z = np.left_shift(x, y) 161 | 162 | # In Java, (byte)128 << 3 produces an int. 163 | return z.item() 164 | 165 | 166 | class LongIterator: 167 | """ 168 | A ``long``-based iterator. 
169 | """ 170 | 171 | LOG2_BITS_PER_WORD = 6 172 | BITS_PER_WORD = BitUtil.left_shift_int(1, LOG2_BITS_PER_WORD) 173 | 174 | def __init__(self, register_width, words, register_mask, count): 175 | self._register_width = register_width 176 | self._words = words 177 | self._register_mask = register_mask 178 | self._count = count 179 | 180 | # register setup 181 | self._register_index = 0 182 | self._word_index = 0 183 | self._remaining_word_bits = self.BITS_PER_WORD 184 | self._word = self._words[self._word_index] 185 | 186 | def __iter__(self): 187 | return self 188 | 189 | def __next__(self): 190 | # Python 3 compatibility 191 | return self.next() 192 | 193 | def next(self): 194 | if self._register_index >= self._count: 195 | raise StopIteration 196 | 197 | if self._remaining_word_bits >= self._register_width: 198 | register = self._word & self._register_mask 199 | 200 | # shift to the next register 201 | self._word = BitUtil.unsigned_right_shift_long(self._word, self._register_width) 202 | self._remaining_word_bits -= self._register_width 203 | else: # insufficient bits remaining in current word 204 | self._word_index += 1 # move to the next word 205 | 206 | register = (self._word | BitUtil.left_shift_long(self._words[self._word_index], self._remaining_word_bits)) & self._register_mask 207 | 208 | # shift to the next partial register (word) 209 | self._word = BitUtil.unsigned_right_shift_long(self._words[self._word_index], self._register_width - self._remaining_word_bits) 210 | self._remaining_word_bits += self.BITS_PER_WORD - self._register_width 211 | 212 | self._register_index += 1 213 | return register 214 | 215 | 216 | class BitVector: 217 | """ 218 | A vector (array) of bits that is accessed in units ("registers") of ``width`` 219 | bits which are stored as 64bit "words" (``long``'s). In this context 220 | a register is at most 64bits. 
221 | """ 222 | 223 | # NOTE: in this context, a word is 64bits 224 | 225 | # rather than doing division to determine how a bit index fits into 64bit 226 | # words (i.e. longs), bit shifting is used 227 | LOG2_BITS_PER_WORD = 6 # =>64bits 228 | BITS_PER_WORD = BitUtil.left_shift_int(1, LOG2_BITS_PER_WORD) 229 | BITS_PER_WORD_MASK = BITS_PER_WORD - 1 230 | 231 | # ditto from above but for bytes (for output) 232 | LOG2_BITS_PER_BYTE = 3 # =>8bits 233 | BITS_PER_BYTE = BitUtil.left_shift_int(1, LOG2_BITS_PER_BYTE) 234 | 235 | BYTES_PER_WORD = 8 # 8 bytes in a long 236 | 237 | def __init__(self, width, count): 238 | """ 239 | :param int width: the width of each register. This cannot be negative or 240 | zero or greater than 63 (the signed word size). 241 | :param long count: the number of registers. This cannot be negative or zero 242 | """ 243 | # 64bit words 244 | # ceil((width * count)/BITS_PER_WORD) 245 | self._words = [0] * BitUtil.unsigned_right_shift_long((width * count) + self.BITS_PER_WORD_MASK, self.LOG2_BITS_PER_WORD) 246 | # the width of a register in bits (this cannot be more than 64 (the word size)) 247 | self._register_width = width 248 | self._count = count 249 | self._register_mask = BitUtil.left_shift_long(1, width) - 1 250 | 251 | def get_register(self, register_index): 252 | """ 253 | :param long register_index: the index of the register whose value is to be 254 | retrieved. This cannot be negative. 
255 | :returns: the value at the specified register index 256 | :rtype: long 257 | """ 258 | # NOTE: if this changes then setMaxRegister() must change 259 | bit_index = register_index * self._register_width 260 | first_word_index = BitUtil.unsigned_right_shift_long(bit_index, self.LOG2_BITS_PER_WORD) # aka (bitIndex / BITS_PER_WORD) 261 | second_word_index = BitUtil.unsigned_right_shift_long(bit_index + self._register_width - 1, self.LOG2_BITS_PER_WORD) # see above 262 | bit_remainder = bit_index & self.BITS_PER_WORD_MASK # aka (bitIndex % BITS_PER_WORD) 263 | 264 | if first_word_index == second_word_index: 265 | return BitUtil.unsigned_right_shift_long(self._words[first_word_index], bit_remainder) & self._register_mask 266 | # else -- register spans words 267 | return BitUtil.unsigned_right_shift_long(self._words[first_word_index], bit_remainder) | \ 268 | BitUtil.left_shift_long(self._words[second_word_index], self.BITS_PER_WORD - bit_remainder) & self._register_mask 269 | 270 | def set_register(self, register_index, value): 271 | """ 272 | :param long register_index: the index of the register whose value is to be set. 
273 | This cannot be negative 274 | :param long value: the value to set in the register 275 | :rtype: long 276 | """ 277 | # NOTE: if this changes then setMaxRegister() must change 278 | bit_index = register_index * self._register_width 279 | first_word_index = BitUtil.unsigned_right_shift_long(bit_index, self.LOG2_BITS_PER_WORD) # aka (bitIndex / BITS_PER_WORD) 280 | second_word_index = BitUtil.unsigned_right_shift_long(bit_index + self._register_width - 1, self.LOG2_BITS_PER_WORD) # see above 281 | bit_remainder = bit_index & self.BITS_PER_WORD_MASK # aka (bitIndex % BITS_PER_WORD) 282 | 283 | if first_word_index == second_word_index: 284 | # clear then set 285 | self._words[first_word_index] &= ~BitUtil.left_shift_long(self._register_mask, bit_remainder) 286 | self._words[first_word_index] |= BitUtil.left_shift_long(value, bit_remainder) 287 | else: # register spans words 288 | # clear then set each partial word 289 | self._words[first_word_index] &= BitUtil.left_shift_long(1, bit_remainder) - 1 290 | self._words[first_word_index] |= BitUtil.left_shift_long(value, bit_remainder) 291 | 292 | self._words[second_word_index] &= ~BitUtil.unsigned_right_shift_long(self._register_mask, self.BITS_PER_WORD - bit_remainder) 293 | self._words[second_word_index] |= BitUtil.unsigned_right_shift_long(value, self.BITS_PER_WORD - bit_remainder) 294 | 295 | def register_iterator(self): 296 | """ 297 | :returns: a ``LongIterator`` for iterating starting at the register 298 | with index zero. This will never be ``None``. 299 | :rtype: LongIterator 300 | """ 301 | return LongIterator(self._register_width, self._words, self._register_mask, self._count) 302 | 303 | def set_max_register(self, register_index, value): 304 | """ 305 | Sets the value of the specified index register if and only if the specified 306 | value is greater than the current value in the register. 
This is equivalent 307 | to but much more performant than 308 | 309 | ``vector.setRegister(index, Math.max(vector.getRegister(index), value));`` 310 | 311 | :param long register_index: the index of the register whose value is to be set. 312 | This cannot be negative 313 | :param long value: the value to set in the register if and only if this value 314 | is greater than the current value in the register 315 | :returns: True if and only if the specified value is greater 316 | than or equal to the current register value. False 317 | otherwise. 318 | :rtype: boolean 319 | """ 320 | # NOTE: if this changes then setRegister() must change 321 | bit_index = register_index * self._register_width 322 | first_word_index = BitUtil.unsigned_right_shift_long(bit_index, self.LOG2_BITS_PER_WORD) # aka (bitIndex / BITS_PER_WORD) 323 | second_word_index = BitUtil.unsigned_right_shift_long(bit_index + self._register_width - 1, self.LOG2_BITS_PER_WORD) # see above 324 | bit_remainder = bit_index & self.BITS_PER_WORD_MASK # aka (bitIndex % BITS_PER_WORD) 325 | 326 | if first_word_index == second_word_index: 327 | register_value = BitUtil.unsigned_right_shift_long(self._words[first_word_index], bit_remainder) & self._register_mask 328 | else: # register spans words 329 | # # no need to mask since at top of word 330 | register_value = BitUtil.unsigned_right_shift_long(self._words[first_word_index], bit_remainder) | \ 331 | BitUtil.left_shift_long(self._words[second_word_index], self.BITS_PER_WORD - bit_remainder) & self._register_mask 332 | 333 | # determine which is the larger and update as necessary 334 | if value > register_value: 335 | # NOTE: matches setRegister() 336 | if first_word_index == second_word_index: 337 | # clear then set 338 | self._words[first_word_index] &= ~BitUtil.left_shift_long(self._register_mask, bit_remainder) 339 | self._words[first_word_index] |= BitUtil.left_shift_long(value, bit_remainder) 340 | else: # register spans words 341 | # clear then set each 
partial word 342 | self._words[first_word_index] &= BitUtil.left_shift_long(1, bit_remainder) - 1 343 | self._words[first_word_index] |= BitUtil.left_shift_long(value, bit_remainder) 344 | 345 | self._words[second_word_index] &= ~BitUtil.unsigned_right_shift_long(self._register_mask, self.BITS_PER_WORD - bit_remainder) 346 | self._words[second_word_index] |= BitUtil.unsigned_right_shift_long(value, self.BITS_PER_WORD - bit_remainder) 347 | # else -- the register value is greater (or equal) so nothing needs to be done 348 | 349 | return value >= register_value 350 | 351 | def fill(self, value): 352 | """ 353 | Sets every register of this bit vector to the specified value. This can be used to 354 | clear the vector by specifying ``0``. 355 | 356 | :param long value: the value to store in each register (written via ``set_register``) 357 | :rtype: void 358 | """ 359 | for i in range(self._count): 360 | self.set_register(i, value) 361 | 362 | def get_register_contents(self, serializer): 363 | """ 364 | Serializes the registers of the vector using the specified serializer. 365 | 366 | :param BigEndianAscendingWordSerializer serializer: the serializer to use. This cannot be ``None``. 367 | :rtype: void 368 | """ 369 | iterator = self.register_iterator() 370 | 371 | for itr in iterator: 372 | serializer.write_word(itr) 373 | 374 | 375 | class NumberUtil: 376 | """ 377 | A collection of utilities to work with numbers. 378 | """ 379 | 380 | # loge(2) (log-base e of 2) 381 | LOGE_2 = 0.6931471805599453 382 | 383 | # the hex characters 384 | HEX = ['0', '1', '2', '3', '4', '5', '6', '7', '8', '9', 'A', 'B', 'C', 'D', 'E', 'F'] 385 | 386 | @classmethod 387 | def log2(cls, value): 388 | """ 389 | Computes the ``log2`` (log-base-two) of the specified value. 390 | 391 | :param float value: the ``float`` for which the ``log2`` is 392 | desired.
393 | :returns: the ``log2`` of the specified value 394 | :rtype: float 395 | """ 396 | # REF: http://en.wikipedia.org/wiki/Logarithmic_scale (conversion of bases) 397 | return log(value) / cls.LOGE_2 398 | 399 | @classmethod 400 | def to_hex(cls, bytes, offset, count): 401 | """ 402 | Converts the specified array of ``byte``'s into a string of 403 | hex characters (low ``byte`` first). 404 | 405 | :param list bytes: the array of ``byte``'s that are to be converted. 406 | This cannot be ``None`` though it may be empty. 407 | :param int offset: the offset in ``bytes`` at which the bytes will 408 | be taken. This cannot be negative and must be less than 409 | ``bytes.length - 1``. 410 | :param int count: the number of bytes to be retrieved from the specified array. 411 | This cannot be negative. If greater than ``bytes.length - offset`` 412 | then that value is used. 413 | :returns: a string of at most ``count`` characters that represents 414 | the specified byte array in hex. This will never be ``None`` 415 | though it may be empty if ``bytes`` is empty or ``count`` 416 | is zero. 417 | :rtype: string 418 | """ 419 | if offset >= len(bytes): # by contract 420 | raise Exception("Offset is greater than the length, {offset} >= {byte_array_length}" 421 | .format(offset=offset, byte_array_length=len(bytes))) 422 | byte_count = min(len(bytes) - offset, count) 423 | upper_bound = byte_count + offset 424 | 425 | chars = [None] * (byte_count * 2) # two chars per byte 426 | char_index = 0 427 | for i in range(offset, upper_bound): 428 | value = bytes[i] 429 | chars[char_index] = cls.HEX[(BitUtil.unsigned_right_shift_byte(value, 4)) & 0x0F] 430 | char_index += 1 431 | chars[char_index] = cls.HEX[value & 0x0F] 432 | char_index += 1 433 | 434 | return ''.join(chars) 435 | 436 | @classmethod 437 | def from_hex(cls, string, offset, count): 438 | """ 439 | Converts the specified array of hex characters into an array of ``byte``'s 440 | (low ``byte`` first). 
441 | 442 | :param string string: the string of hex characters to be converted into ``byte``'s. 443 | This cannot be ``None`` though it may be blank. 444 | :param int offset: the offset in the string at which the characters will be 445 | taken. This cannot be negative and must be less than ``string.length() - 1``. 446 | :param int count: the number of characters to be retrieved from the specified 447 | string. This cannot be negative and must be divisible by two 448 | (since there are two characters per ``byte``). 449 | :returns: the array of ``byte``'s that were converted from the 450 | specified string (in the specified range). This will never be 451 | ``None`` though it may be empty if ``string`` 452 | is empty or ``count`` is zero. 453 | :rtype: list 454 | """ 455 | 456 | if offset >= len(string): # by contract 457 | raise Exception("Offset is greater than the length, {offset} >= {string_length}" 458 | .format(offset=offset, string_length=len(string))) 459 | if (count & 0x01) != 0: # by contract 460 | raise Exception("Count is not divisible by two, ({})".format(count)) 461 | 462 | char_count = min(len(string) - offset, count) 463 | upper_bound = offset + char_count 464 | 465 | byte_array = [0] * (BitUtil.unsigned_right_shift_int(char_count, 1)) # aka /2 466 | byte_index = 0 # beginning 467 | for i in range(0, upper_bound, 2): 468 | p1 = BitUtil.left_shift_int(cls._digit(string[i]), 4) 469 | p2 = cls._digit(string[i+1]) 470 | p = (p1 | p2) & 0xFF 471 | 472 | byte_array[byte_index] = BitUtil.to_signed_byte(p) 473 | byte_index += 1 474 | return byte_array 475 | 476 | @classmethod 477 | def _digit(cls, character): 478 | """ 479 | :param string character: a hex character to be converted to a ``byte``. 480 | This cannot be a character other than [a-fA-F0-9]. 481 | :returns: the value of the specified character. This will be a value ``0`` 482 | through ``15``. 
483 | :rtype: int 484 | """ 485 | if character == '0': 486 | return 0 487 | elif character == '1': 488 | return 1 489 | elif character == '2': 490 | return 2 491 | elif character == '3': 492 | return 3 493 | elif character == '4': 494 | return 4 495 | elif character == '5': 496 | return 5 497 | elif character == '6': 498 | return 6 499 | elif character == '7': 500 | return 7 501 | elif character == '8': 502 | return 8 503 | elif character == '9': 504 | return 9 505 | elif character in ['a', 'A']: 506 | return 10 507 | elif character in ['b', 'B']: 508 | return 11 509 | elif character in ['c', 'C']: 510 | return 12 511 | elif character in ['d', 'D']: 512 | return 13 513 | elif character in ['e', 'E']: 514 | return 14 515 | elif character in ['f', 'F']: 516 | return 15 517 | else: 518 | raise Exception("Character is not in [a-fA-F0-9]: ({})".format(character)) 519 | -------------------------------------------------------------------------------- /requirements_dev.txt: -------------------------------------------------------------------------------- 1 | pip==18.1 2 | bumpversion==0.5.3 3 | wheel==0.32.1 4 | watchdog==0.9.0 5 | flake8==3.5.0 6 | tox==3.5.2 7 | coverage==4.5.1 8 | Sphinx==1.8.1 9 | twine==1.12.1 10 | numpy==1.16.4 11 | 12 | pytest==3.8.2 13 | pytest-runner==4.2 14 | -------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- 1 | [bumpversion] 2 | current_version = 0.1.3 3 | commit = True 4 | tag = True 5 | 6 | [bumpversion:file:setup.py] 7 | search = version='{current_version}' 8 | replace = version='{new_version}' 9 | 10 | [bumpversion:file:python_hll/__init__.py] 11 | search = __version__ = '{current_version}' 12 | replace = __version__ = '{new_version}' 13 | 14 | [bdist_wheel] 15 | universal = 1 16 | 17 | [flake8] 18 | exclude = docs 19 | 20 | [aliases] 21 | test = pytest 22 | 23 | [tool:pytest] 24 | collect_ignore = ['setup.py'] 25 | 26 | 
-------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | """The setup script.""" 5 | 6 | from setuptools import setup, find_packages 7 | 8 | with open('README.rst') as readme_file: 9 | readme = readme_file.read() 10 | 11 | with open('HISTORY.rst') as history_file: 12 | history = history_file.read() 13 | 14 | requirements = ['numpy'] 15 | 16 | setup_requirements = ['pytest-runner', ] 17 | 18 | test_requirements = ['pytest', ] 19 | 20 | setup( 21 | author="Jon Aquino", 22 | author_email='jonathan.aquino@adroll.com', 23 | classifiers=[ 24 | 'Development Status :: 2 - Pre-Alpha', 25 | 'Intended Audience :: Developers', 26 | 'License :: OSI Approved :: MIT License', 27 | 'Natural Language :: English', 28 | "Programming Language :: Python :: 2", 29 | 'Programming Language :: Python :: 2.7', 30 | 'Programming Language :: Python :: 3', 31 | 'Programming Language :: Python :: 3.4', 32 | 'Programming Language :: Python :: 3.5', 33 | 'Programming Language :: Python :: 3.6', 34 | 'Programming Language :: Python :: 3.7', 35 | ], 36 | description="Python library for the HyperLogLog algorithm", 37 | install_requires=requirements, 38 | license="MIT license", 39 | long_description=readme + '\n\n' + history, 40 | include_package_data=True, 41 | keywords='python_hll', 42 | name='python_hll', 43 | packages=find_packages(include=['python_hll']), 44 | setup_requires=setup_requirements, 45 | test_suite='tests', 46 | tests_require=test_requirements, 47 | url='https://github.com/AdRoll/python-hll', 48 | version='0.1.3', 49 | zip_safe=False, 50 | ) 51 | -------------------------------------------------------------------------------- /tests/data/README.txt: -------------------------------------------------------------------------------- 1 | This test data comes from 
https://github.com/citusdata/postgresql-hll/tree/master/sql/data 2 | 3 | If the filename starts with a "cumulative_union" prefix, then it's the 4 | standard cumulative union format we've been using (cardinality, 5 | multiset, union_cardinality, union_multiset) in which the 6 | union_multiset is an accumulator over the subsequent lines. 7 | 8 | If the filename starts with a "cumulative_add" prefix, then it's a new 9 | format (cardinality, raw_value, multiset) in which the "raw_value" is 10 | added to the accumulator "multiset". 11 | 12 | The cutoffs I'm assuming in this file are 256 for explicit to sparse, 13 | and 850 for sparse to full. Log2m=11, registerWidth=5, as usual. 14 | 15 | A brief summary of what each file tries to accomplish follows: 16 | 17 | cumulative_add_comprehensive_promotion.csv 18 | 19 | Cumulatively adds random values to an EMPTY multiset. 20 | 21 | Format: cumulative add 22 | Tests: 23 | - EMPTY, EXPLICIT, SPARSE_PROBABILISTIC, PROBABILISTIC addition 24 | - EMPTY to EXPLICIT promotion 25 | - EXPLICIT to SPARSE_PROBABILISTIC promotion 26 | - SPARSE_PROBABILISTIC to PROBABILISTIC promotion 27 | 28 | cumulative_add_sparse_step.csv 29 | 30 | Cumulatively sets successive registers to: 31 | 32 | (registerIndex % probabilisticRegisterMaxValue) + 1 33 | 34 | by adding specifically constructed values to a SPARSE_PROBABILISTIC multiset. 35 | Does not induce promotion. 36 | 37 | Format: cumulative add 38 | Tests: 39 | - SPARSE_PROBABILISTIC addition (predictable) 40 | 41 | cumulative_add_sparse_random.csv 42 | 43 | Cumulatively sets random registers of a SPARSE_PROBABILISTIC multiset to 44 | random values by adding random values. Does not induce promotion. 45 | 46 | Format: cumulative add 47 | Tests: 48 | - SPARSE_PROBABILISTIC addition (random) 49 | 50 | cumulative_union_explicit_promotion.csv 51 | 52 | Unions an EMPTY accumulator with EXPLICIT multisets, each containing a 53 | single random value.
54 | 55 | Format: cumulative union 56 | Tests: 57 | - EMPTY U EXPLICIT 58 | - EXPLICIT U EXPLICIT 59 | - EXPLICIT to SPARSE_PROBABILISTIC promotion 60 | - SPARSE_PROBABILISTIC U EXPLICIT 61 | 62 | cumulative_union_sparse_promotion.csv 63 | 64 | Unions an EMPTY accumulator with SPARSE_PROBABILISTIC multisets, each 65 | having one register set. 66 | 67 | Format: cumulative union 68 | Tests: 69 | - EMPTY U SPARSE_PROBABILISTIC 70 | - SPARSE_PROBABILISTIC U SPARSE_PROBABILISTIC 71 | - SPARSE_PROBABILISTIC promotion 72 | - SPARSE_PROBABILISTIC U PROBABILISTIC 73 | 74 | cumulative_union_explicit_explicit.csv 75 | 76 | Unions an EMPTY accumulator with EXPLICIT multisets, each having a single 77 | random value, twice in a row to verify that the set properties are 78 | satisfied. 79 | 80 | Format: cumulative union 81 | Tests: 82 | - EMPTY U EXPLICIT 83 | - EXPLICIT U EXPLICIT 84 | 85 | cumulative_union_sparse_sparse.csv 86 | 87 | Unions an EMPTY accumulator with SPARSE_PROBABILISTIC multisets, each 88 | having a single register set, twice in a row to verify that the set 89 | properties are satisfied. 90 | 91 | Format: cumulative union 92 | Tests: 93 | - EMPTY U SPARSE_PROBABILISTIC 94 | - SPARSE_PROBABILISTIC U SPARSE_PROBABILISTIC 95 | 96 | cumulative_union_probabilistic_probabilistic.csv 97 | 98 | Unions an EMPTY accumulator with PROBABILISTIC multisets, each having 99 | many registers set, twice in a row to verify that the set properties are 100 | satisfied. 101 | 102 | Format: cumulative union 103 | Tests: 104 | - EMPTY U PROBABILISTIC 105 | - PROBABILISTIC U PROBABILISTIC 106 | 107 | cumulative_union_comprehensive.csv 108 | 109 | Unions an EMPTY accumulator with random multisets. 
110 | 111 | Format: cumulative union 112 | Tests: 113 | - hopefully all union possibilities 114 | -------------------------------------------------------------------------------- /tests/data/cumulative_union_sparse_full_representation.csv: -------------------------------------------------------------------------------- 1 | cardinality,HLL,union_cardinality,union_HLL 2 | 0,\x118B49,0,\x118B49 3 | 1.0002442201269182,\x148B4908000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000
00000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000,1.0002442201269182,\x138B490001 4 | 
1.0002442201269182,\x148B4900400000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000
00000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000,2.000977198748901,\x138B4900010021 5 | 1096.4497021580987,\x148B4900021084210842108421084210842108421084210842108421084210842108421084210842108421084210842108421084210842108421084210842108421084210842108421084210842108421084210842108421084210842108421084210842108421084210842108421084210842108421084210842108421084210842108421084210842108421084210842108421084210842108421084210842108421084210842108421084210842108421084210842108421084210842108421084210842108421084210842108421084210842108421084210842108421084210842108421084210842108421084210842108421084210842108421084210842108421084210842108421084210842108421084210842108421084210842108421084210842108421084210842108421084210842108421084210842108421084210842108421084210842108421084210842108421084210842108421084210842108421084210842108421084210842108421084210842108421084210842108421084210842108421084210842108421084210842108421084210842108421084210842108421084210842108421084210842108421084210842108421084210842108421084210842108421084210842108421084210842108421084210842108421084210842108421084210842108421084210842108421084210842108421084210842108421084210842108421084210842000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000
00000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000,1099.8687346717188,\x148B4908421084210842108421084210842108421084210842108421084210842108421084210842108421084210842108421084210842108421084210842108421084210842108421084210842108421084210842108421084210842108421084210842108421084210842108421084210842108421084210842108421084210842108421084210842108421084210842108421084210842108421084210842108421084210842108421084210842108421084210842108421084210842108421084210842108421084210842108421084210842108421084210842108421084210842108421084210842108421084210842108421084210842108421084210842108421084210842108421084210842108421084210842108421084210842108421084210842108421084210842108421084210842108421084210842108421084210842108421084210842108421084210842108421084210842108421084210842108421084210842108421084210842108421084210842108421084
21084210842108421084210842108421084210842108421084210842108421084210842108421084210842108421084210842108421084210842108421084210842108421084210842108421084210842108421084210842108421084210842108421084210842108421084210842108421084210842108421084210842108421084210842108421084210842108421084210842108421084200000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000 -------------------------------------------------------------------------------- /tests/probabilistic_test_util.py: -------------------------------------------------------------------------------- 
1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | from __future__ import division 4 | from python_hll.util import BitUtil 5 | from math import ceil 6 | 7 | 8 | def construct_hll_value(log2m, register_index, register_value): 9 | """ 10 | Constructs a value that when added raw to a HLL will set the register at 11 | ``register_index`` to ``register_value``. 12 | 13 | :param log2m: The log-base-2 of the number of registers in the HLL 14 | :type log2m: int 15 | :param register_index: The index of the register to set 16 | :type register_index: int 17 | :param register_value: the value to set the register to 18 | :type register_value: int 19 | :rtype: int 20 | """ 21 | partition = register_index 22 | substream_value = BitUtil.left_shift_long(1, register_value - 1) 23 | return BitUtil.left_shift_long(substream_value, log2m) | partition 24 | 25 | 26 | def get_register_index(raw_value, log2m): 27 | """ 28 | Extracts the HLL register index from a raw value. 29 | """ 30 | m_bits_mask = BitUtil.left_shift_int(1, log2m) - 1 31 | j = raw_value & m_bits_mask 32 | return j 33 | 34 | 35 | def get_register_value(raw_value, log2m): 36 | """ 37 | Extracts the HLL register value from a raw value. 38 | """ 39 | substream_value = BitUtil.unsigned_right_shift_long(raw_value, log2m) 40 | if substream_value == 0: 41 | # The paper does not cover p(0x0), so the special value 0 is used. 42 | # 0 is the original initialization value of the registers, so by 43 | # doing this the HLL simply ignores it. This is acceptable 44 | # because the probability is 1/(2^(2^register_size_in_bits)). 45 | p_w = 0 46 | else: 47 | p_w = BitUtil.to_signed_byte(min(1 + BitUtil.least_significant_bit(substream_value), 31)) 48 | return p_w 49 | 50 | 51 | def get_required_bytes(short_word_length, register_count): 52 | """ 53 | Returns the number of bytes required to pack ``register_count`` 54 | registers of width ``short_word_length``. 
55 | """ 56 | return ceil((register_count * short_word_length) / 8) 57 | -------------------------------------------------------------------------------- /tests/test_big_endian_ascending_word_deserializer.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | """Unit tests for BigEndianAscendingWordDeserializer """ 5 | 6 | from sys import maxsize 7 | import random 8 | from python_hll.serialization import BigEndianAscendingWordDeserializer, BigEndianAscendingWordSerializer 9 | from python_hll.util import BitUtil 10 | 11 | 12 | def test_constructor_error(): 13 | """ 14 | Error checking tests for constructor. 15 | """ 16 | 17 | # word length too small 18 | try: 19 | BigEndianAscendingWordDeserializer(0, 0, [0]) 20 | assert False, "Should complain about too-short words." 21 | except ValueError as e: 22 | assert "Word length must be" in str(e) 23 | 24 | # word length too large 25 | try: 26 | BigEndianAscendingWordDeserializer(65, 0, [0]) 27 | assert False, "Should complain about too-long words." 28 | except ValueError as e: 29 | assert "Word length must be" in str(e) 30 | 31 | # byte padding negative 32 | try: 33 | BigEndianAscendingWordDeserializer(5, -1, [0]) 34 | except ValueError as e: 35 | assert "Byte padding must be" in str(e) 36 | 37 | 38 | def test_smoke_64_bit_word(): 39 | serializer = BigEndianAscendingWordSerializer(64, 5, 0) 40 | 41 | # Check that the sign bit is being preserved. 
42 | serializer.write_word(-1) 43 | serializer.write_word(-112894714) 44 | 45 | # Check "special values" 46 | serializer.write_word(0) 47 | serializer.write_word(maxsize) 48 | serializer.write_word(-maxsize - 1) 49 | 50 | bytes_ = serializer.get_bytes() 51 | 52 | deserializer = BigEndianAscendingWordDeserializer(64, 0, bytes_) 53 | assert deserializer.total_word_count() == 5 54 | 55 | assert deserializer.read_word() == -1 56 | assert deserializer.read_word() == -112894714 57 | assert deserializer.read_word() == 0 58 | assert deserializer.read_word() == maxsize 59 | assert deserializer.read_word() == -maxsize - 1 60 | 61 | 62 | def test_ascending_smoke(fastonly): 63 | """ 64 | A smoke/fuzz test for ascending (from zero) word values. 65 | """ 66 | word_length = 5 67 | while word_length < 65: 68 | run_ascending_test(word_length, 3, 1000 if fastonly else 100000) 69 | word_length += 1 70 | 71 | 72 | def test_random_smoke(fastonly): 73 | """ 74 | A smoke/fuzz test for random word values. 75 | """ 76 | word_length = 5 77 | while word_length < 65: 78 | run_random_test(word_length, 3, 1000 if fastonly else 100000, word_length) 79 | word_length += 1 80 | 81 | 82 | def run_random_test(word_length, byte_padding, word_count, seed): 83 | """ 84 | Runs a test which serializes and deserializes random word values.
85 | """ 86 | random.seed(seed) 87 | 88 | word_mask = ~0 if word_length == 64 else BitUtil.left_shift_long(1, word_length) - 1 89 | 90 | serializer = BigEndianAscendingWordSerializer(word_length, word_count, byte_padding) 91 | 92 | for _ in range(word_count): 93 | value = random.randint(0, maxsize) & word_mask 94 | serializer.write_word(value) 95 | 96 | bytes_ = serializer.get_bytes() 97 | 98 | deserializer = BigEndianAscendingWordDeserializer(word_length, byte_padding, bytes_) 99 | 100 | assert deserializer.total_word_count() == word_count 101 | 102 | # verification random 103 | random.seed(seed) 104 | for _ in range(word_count): 105 | assert deserializer.read_word() == (random.randint(0, maxsize) & word_mask) 106 | 107 | 108 | def run_ascending_test(word_length, byte_padding, word_count): 109 | """ 110 | Runs a test which serializes and deserializes ascending (from zero) word values. 111 | """ 112 | word_mask = ~0 if word_length == 64 else BitUtil.left_shift_long(1, word_length) - 1 113 | 114 | serializer = BigEndianAscendingWordSerializer(word_length, word_count, byte_padding) 115 | 116 | for i in range(word_count): 117 | serializer.write_word(i & word_mask) 118 | 119 | bytes_ = serializer.get_bytes() 120 | 121 | deserializer = BigEndianAscendingWordDeserializer(word_length, byte_padding, bytes_) 122 | 123 | assert deserializer.total_word_count() == word_count 124 | 125 | for i in range(word_count): 126 | assert deserializer.read_word() == (i & word_mask) 127 | -------------------------------------------------------------------------------- /tests/test_big_endian_ascending_word_serializer.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | """Unit tests for BigEndianAscendingWordSerializer """ 5 | 6 | from python_hll.serialization import BigEndianAscendingWordSerializer 7 | 8 | 9 | def test_constructor_error(): 10 | """ 11 | Error checking tests for the constructor. 12 | """ 13 | 14 | # 
Word length is too short 15 | try: 16 | BigEndianAscendingWordSerializer(0, 1, 0) 17 | assert False, "Should complain about too-short words." 18 | except ValueError as e: 19 | assert 'Word length must be >= 1 and <= 64. (was: 0)' == str(e) 20 | 21 | # Word length is too long 22 | try: 23 | BigEndianAscendingWordSerializer(65, 1, 0) 24 | assert False, "Should complain about too-long words." 25 | except ValueError as e: 26 | assert "Word length must be" in str(e) 27 | 28 | # Word Count is negative 29 | try: 30 | BigEndianAscendingWordSerializer(5, -1, 0) 31 | assert False, "Should complain about negative word count." 32 | except ValueError as e: 33 | assert "Word count must be" in str(e) 34 | 35 | # Byte padding is negative 36 | try: 37 | BigEndianAscendingWordSerializer(5, 1, -1) 38 | assert False, "Should complain about negative byte padding." 39 | except ValueError as e: 40 | assert "Byte padding must be" in str(e) 41 | 42 | 43 | def test_early_get_bytes(): 44 | """ 45 | Tests runtime exception thrown at premature call 46 | """ 47 | 48 | serializer = BigEndianAscendingWordSerializer(5, 1, 0) 49 | try: 50 | serializer.get_bytes() 51 | assert False, "Should throw." 52 | except ValueError as r: 53 | assert "Not all words" in str(r) 54 | 55 | 56 | def test_smoke_explicit_params(): 57 | """ 58 | Smoke test for typical parameters 59 | """ 60 | short_word_length = 64 61 | 62 | # Should work on empty sequence with no padding 63 | serializer = BigEndianAscendingWordSerializer(short_word_length, 0, 0) 64 | assert serializer.get_bytes() == [] 65 | 66 | # Should work on byte-divisible sequence with no padding 67 | serializer = BigEndianAscendingWordSerializer(short_word_length, 2, 0) 68 | serializer.write_word(-4995993186629670228) # 0xBAAAAAAAAAAAAAACL 69 | serializer.write_word(-8070450532247928847) # 0x8FFFFFFFFFFFFFF1L 70 | 71 | # Bytes: 72 | # ====== 73 | # 0xBA 0xAA 0xAA 0xAA 0xAA 0xAA 0xAA 0xAC 74 | # 0x8F 0xFF 0xFF 0xFF 0xFF 0xFF 0xFF 0xF1 75 | # -70 -86 ... 
-84 76 | # -113 -1 ... -15 77 | 78 | all_bytes = serializer.get_bytes() 79 | expected_bytes = [-70, -86, -86, -86, -86, -86, -86, -84, -113, -1, -1, -1, -1, -1, -1, -15] 80 | assert all_bytes == expected_bytes 81 | 82 | # Should pad the array correctly. 83 | serializer = BigEndianAscendingWordSerializer(short_word_length, 1, 1) 84 | serializer.write_word(1) 85 | all_bytes = serializer.get_bytes() 86 | expected_bytes = [0, 0, 0, 0, 0, 0, 0, 0, 1] 87 | assert all_bytes == expected_bytes 88 | 89 | 90 | def test_smoke_probabilistic_params(): 91 | """ 92 | Smoke Test for typical parameters used in practice. 93 | """ 94 | short_word_length = 5 95 | 96 | # Should work on an empty sequence with no padding. 97 | serializer = BigEndianAscendingWordSerializer(short_word_length, 0, 0) 98 | assert serializer.get_bytes() == [] 99 | 100 | # Should work on a non-byte-divisible sequence with no padding. 101 | serializer = BigEndianAscendingWordSerializer(short_word_length, 3, 0) 102 | serializer.write_word(9) 103 | serializer.write_word(31) 104 | serializer.write_word(1) 105 | 106 | # The values: 107 | # ----------- 108 | # 9 |31 |1 |padding 109 | 110 | # Corresponding bits: 111 | # ------------------ 112 | # 0100 1|111 11|00 001|0 113 | 114 | # And the hex/decimal (the expected bytes are signed values): 115 | # ----------------------------------------------------- 116 | # 0100 1111 -> 0x4F -> 79 117 | # 1100 0010 -> 0xC2 -> -62 118 | 119 | all_bytes = serializer.get_bytes() 120 | expected_bytes = [79, -62] 121 | assert all_bytes == expected_bytes 122 | 123 | # Should work on a byte-divisible sequence with no padding 124 | serializer = BigEndianAscendingWordSerializer(short_word_length, 8, 0) 125 | 126 | for i in range(1, 9): 127 | serializer.write_word(i) 128 | 129 | # Values: 1-8 130 | # Corresponding bits: 131 | # ------------------ 132 | # 00001 133 | # 00010 134 | # 00011 135 | # 00100 136 | # 00101 137 | # 00110 138 | # 00111 139 | # 01000 140 | 141 | # And the hex: 142 | # 
------------ 143 | # 0000 1000 => 0x08 => 8 144 | # 1000 0110 => 0x86 => -122 145 | # 0100 0010 => 0x42 => 66 146 | # 1001 1000 => 0x98 => -104 147 | # 1110 1000 => 0xE8 => -24 148 | 149 | all_bytes = serializer.get_bytes() 150 | expected_bytes = [8, -122, 66, -104, -24] 151 | assert all_bytes == expected_bytes 152 | 153 | # Should pad the array correctly 154 | serializer = BigEndianAscendingWordSerializer(short_word_length, 1, 1) 155 | serializer.write_word(1) 156 | 157 | # 1 byte leading padding | value 1 | trailing padding 158 | # 0000 0000 | 0000 1|000 159 | all_bytes = serializer.get_bytes() 160 | expected_bytes = [0, 8] 161 | assert all_bytes == expected_bytes 162 | 163 | 164 | def test_smoke_sparse_params(): 165 | """ 166 | Smoke test for typical parameters used in practice. 167 | """ 168 | short_word_length = 17 169 | 170 | # Should work on an empty sequence with no padding 171 | serializer = BigEndianAscendingWordSerializer(short_word_length, 0, 0) 172 | assert serializer.get_bytes() == [] 173 | 174 | # Should work on a non-byte-divisible sequence with no padding 175 | serializer = BigEndianAscendingWordSerializer(short_word_length, 3, 0) 176 | serializer.write_word(9) 177 | serializer.write_word(42) 178 | serializer.write_word(75) 179 | # The values: 180 | # ----------- 181 | # 9 |42 |75 |padding 182 | 183 | # Corresponding bits: 184 | # ------------------ 185 | # 0000 0000 0000 0100 1|000 0000 0000 1010 10|00 0000 0000 1001 011|0 0000 186 | 187 | # And the hex/decimal (remember Java bytes are signed): 188 | # ----------------------------------------------------- 189 | # 0000 0000 -> 0x00 -> 0 190 | # 0000 0100 -> 0x04 -> 4 191 | # 1000 0000 -> 0x80 -> -128 192 | # 0000 1010 -> 0x0A -> 10 193 | # 1000 0000 -> 0x80 -> -128 194 | # 0000 1001 -> 0x09 -> 9 195 | # 0110 0000 -> 0x60 -> 96 196 | 197 | all_bytes = serializer.get_bytes() 198 | expected_bytes = [0, 4, -128, 10, -128, 9, 96] 199 | assert all_bytes == expected_bytes 200 | 201 | # Should work on a 
byte-divisible sequence with no padding 202 | serializer = BigEndianAscendingWordSerializer(short_word_length, 8, 0) 203 | 204 | for i in range(1, 9): 205 | serializer.write_word(i) 206 | 207 | # Values: 1-8 208 | # Corresponding bits: 209 | # ------------------ 210 | # 0000 0000 0000 0000 1 211 | # 000 0000 0000 0000 10 212 | # 00 0000 0000 0000 011 213 | # 0 0000 0000 0000 0100 214 | 215 | # 0000 0000 0000 0010 1 216 | # 000 0000 0000 0001 10 217 | # 00 0000 0000 0000 111 218 | # 0 0000 0000 0000 1000 219 | 220 | # And the hex: 221 | # ------------ 222 | # 0000 0000 -> 0x00 -> 0 223 | # 0000 0000 -> 0x00 -> 0 224 | # 1000 0000 -> 0x80 -> -128 225 | # 0000 0000 -> 0x00 -> 0 226 | # 1000 0000 -> 0x80 -> -128 227 | # 0000 0000 -> 0x00 -> 0 228 | # 0110 0000 -> 0x60 -> 96 229 | # 0000 0000 -> 0x00 -> 0 230 | # 0100 0000 -> 0x40 -> 64 231 | # 0000 0000 -> 0x00 -> 0 232 | # 0010 1000 -> 0x28 -> 40 233 | # 0000 0000 -> 0x00 -> 0 234 | # 0001 1000 -> 0x18 -> 24 235 | # 0000 0000 -> 0x00 -> 0 236 | # 0000 1110 -> 0x0E -> 14 237 | # 0000 0000 -> 0x00 -> 0 238 | # 0000 1000 -> 0x08 -> 8 239 | 240 | all_bytes = serializer.get_bytes() 241 | expected_bytes = [0, 0, -128, 0, -128, 0, 96, 0, 64, 0, 40, 0, 24, 0, 14, 0, 8] 242 | assert all_bytes == expected_bytes 243 | 244 | # Should pad the array correctly 245 | serializer = BigEndianAscendingWordSerializer(short_word_length, 1, 1) 246 | serializer.write_word(1) 247 | 248 | all_bytes = serializer.get_bytes() 249 | expected_bytes = [0, 0, 0, -128] 250 | assert all_bytes == expected_bytes 251 | -------------------------------------------------------------------------------- /tests/test_bit_util.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | from python_hll.util import BitUtil 5 | 6 | UNSIGNED_TO_SIGNED_INTEGERS = { 7 | 0: 0, 8 | 1: 1, 9 | 2: 2, 10 | 3: 3, 11 | 4: 4, 12 | 5: 5, 13 | 6: 6, 14 | 7: 7, 15 | 8: 8, 16 | 9: 9, 17 | 10: 10, 
18 | 11: 11, 19 | 12: 12, 20 | 13: 13, 21 | 14: 14, 22 | 15: 15, 23 | 16: 16, 24 | 17: 17, 25 | 18: 18, 26 | 19: 19, 27 | 20: 20, 28 | 21: 21, 29 | 22: 22, 30 | 23: 23, 31 | 24: 24, 32 | 25: 25, 33 | 26: 26, 34 | 27: 27, 35 | 28: 28, 36 | 29: 29, 37 | 30: 30, 38 | 31: 31, 39 | 32: 32, 40 | 33: 33, 41 | 34: 34, 42 | 35: 35, 43 | 36: 36, 44 | 37: 37, 45 | 38: 38, 46 | 39: 39, 47 | 40: 40, 48 | 41: 41, 49 | 42: 42, 50 | 43: 43, 51 | 44: 44, 52 | 45: 45, 53 | 46: 46, 54 | 47: 47, 55 | 48: 48, 56 | 49: 49, 57 | 50: 50, 58 | 51: 51, 59 | 52: 52, 60 | 53: 53, 61 | 54: 54, 62 | 55: 55, 63 | 56: 56, 64 | 57: 57, 65 | 58: 58, 66 | 59: 59, 67 | 60: 60, 68 | 61: 61, 69 | 62: 62, 70 | 63: 63, 71 | 64: 64, 72 | 65: 65, 73 | 66: 66, 74 | 67: 67, 75 | 68: 68, 76 | 69: 69, 77 | 70: 70, 78 | 71: 71, 79 | 72: 72, 80 | 73: 73, 81 | 74: 74, 82 | 75: 75, 83 | 76: 76, 84 | 77: 77, 85 | 78: 78, 86 | 79: 79, 87 | 80: 80, 88 | 81: 81, 89 | 82: 82, 90 | 83: 83, 91 | 84: 84, 92 | 85: 85, 93 | 86: 86, 94 | 87: 87, 95 | 88: 88, 96 | 89: 89, 97 | 90: 90, 98 | 91: 91, 99 | 92: 92, 100 | 93: 93, 101 | 94: 94, 102 | 95: 95, 103 | 96: 96, 104 | 97: 97, 105 | 98: 98, 106 | 99: 99, 107 | 100: 100, 108 | 101: 101, 109 | 102: 102, 110 | 103: 103, 111 | 104: 104, 112 | 105: 105, 113 | 106: 106, 114 | 107: 107, 115 | 108: 108, 116 | 109: 109, 117 | 110: 110, 118 | 111: 111, 119 | 112: 112, 120 | 113: 113, 121 | 114: 114, 122 | 115: 115, 123 | 116: 116, 124 | 117: 117, 125 | 118: 118, 126 | 119: 119, 127 | 120: 120, 128 | 121: 121, 129 | 122: 122, 130 | 123: 123, 131 | 124: 124, 132 | 125: 125, 133 | 126: 126, 134 | 127: 127, 135 | 128: -128, 136 | 129: -127, 137 | 130: -126, 138 | 131: -125, 139 | 132: -124, 140 | 133: -123, 141 | 134: -122, 142 | 135: -121, 143 | 136: -120, 144 | 137: -119, 145 | 138: -118, 146 | 139: -117, 147 | 140: -116, 148 | 141: -115, 149 | 142: -114, 150 | 143: -113, 151 | 144: -112, 152 | 145: -111, 153 | 146: -110, 154 | 147: -109, 155 | 148: -108, 156 | 149: -107, 157 | 150: 
-106, 158 | 151: -105, 159 | 152: -104, 160 | 153: -103, 161 | 154: -102, 162 | 155: -101, 163 | 156: -100, 164 | 157: -99, 165 | 158: -98, 166 | 159: -97, 167 | 160: -96, 168 | 161: -95, 169 | 162: -94, 170 | 163: -93, 171 | 164: -92, 172 | 165: -91, 173 | 166: -90, 174 | 167: -89, 175 | 168: -88, 176 | 169: -87, 177 | 170: -86, 178 | 171: -85, 179 | 172: -84, 180 | 173: -83, 181 | 174: -82, 182 | 175: -81, 183 | 176: -80, 184 | 177: -79, 185 | 178: -78, 186 | 179: -77, 187 | 180: -76, 188 | 181: -75, 189 | 182: -74, 190 | 183: -73, 191 | 184: -72, 192 | 185: -71, 193 | 186: -70, 194 | 187: -69, 195 | 188: -68, 196 | 189: -67, 197 | 190: -66, 198 | 191: -65, 199 | 192: -64, 200 | 193: -63, 201 | 194: -62, 202 | 195: -61, 203 | 196: -60, 204 | 197: -59, 205 | 198: -58, 206 | 199: -57, 207 | 200: -56, 208 | 201: -55, 209 | 202: -54, 210 | 203: -53, 211 | 204: -52, 212 | 205: -51, 213 | 206: -50, 214 | 207: -49, 215 | 208: -48, 216 | 209: -47, 217 | 210: -46, 218 | 211: -45, 219 | 212: -44, 220 | 213: -43, 221 | 214: -42, 222 | 215: -41, 223 | 216: -40, 224 | 217: -39, 225 | 218: -38, 226 | 219: -37, 227 | 220: -36, 228 | 221: -35, 229 | 222: -34, 230 | 223: -33, 231 | 224: -32, 232 | 225: -31, 233 | 226: -30, 234 | 227: -29, 235 | 228: -28, 236 | 229: -27, 237 | 230: -26, 238 | 231: -25, 239 | 232: -24, 240 | 233: -23, 241 | 234: -22, 242 | 235: -21, 243 | 236: -20, 244 | 237: -19, 245 | 238: -18, 246 | 239: -17, 247 | 240: -16, 248 | 241: -15, 249 | 242: -14, 250 | 243: -13, 251 | 244: -12, 252 | 245: -11, 253 | 246: -10, 254 | 247: -9, 255 | 248: -8, 256 | 249: -7, 257 | 250: -6, 258 | 251: -5, 259 | 252: -4, 260 | 253: -3, 261 | 254: -2, 262 | 255: -1, 263 | } 264 | 265 | 266 | def test_to_signed_byte(): 267 | for unsigned_int, signed_int in UNSIGNED_TO_SIGNED_INTEGERS.items(): 268 | assert signed_int == BitUtil.to_signed_byte(unsigned_int) 269 | 270 | 271 | def test_unsigned_right_shift_int(): 272 | assert BitUtil.unsigned_right_shift_int(-100, 1) == 2147483598 
273 | 274 | 275 | def test_unsigned_right_shift_int2(): 276 | assert BitUtil.unsigned_right_shift_int(-1, 0) == -1 277 | 278 | 279 | def test_unsigned_right_shift_byte(): 280 | assert BitUtil.unsigned_right_shift_byte(-100, 1) == 2147483598 281 | 282 | 283 | def test_unsigned_right_shift_byte2(): 284 | assert BitUtil.unsigned_right_shift_byte(-1, 0) == -1 285 | 286 | 287 | def test_unsigned_right_shift_long(): 288 | assert BitUtil.unsigned_right_shift_long(-100, 1) == 9223372036854775758 289 | 290 | 291 | def test_unsigned_right_shift_long2(): 292 | assert BitUtil.unsigned_right_shift_long(-1, 0) == -1 293 | 294 | 295 | def test_left_shift_long_1(): 296 | assert BitUtil.left_shift_long(72057594037927935, 8) == -256 297 | 298 | 299 | def test_left_shift_long_2(): 300 | assert BitUtil.left_shift_long(214748364, 8) == 54975581184 301 | 302 | 303 | def test_left_shift_long_3(): 304 | assert BitUtil.left_shift_long(128, 3) == 1024 305 | 306 | 307 | def test_left_shift_int(): 308 | assert BitUtil.left_shift_int(128, 3) == 1024 309 | 310 | 311 | def test_left_shift_byte(): 312 | assert BitUtil.left_shift_byte(128, 3) == -1024 313 | -------------------------------------------------------------------------------- /tests/test_bit_vector.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | from python_hll.util import BitVector 5 | 6 | """Unit tests for BitVector.""" 7 | 8 | 9 | def test_get_set_register(): 10 | """ 11 | Tests ``BitVector.get_register()`` and ``BitVector.set_register()``. 
12 | """ 13 | # NOTE: registers are only 5 bits wide 14 | vector1 = BitVector(5, 2**7) # width=5, count=2^7 15 | vector2 = BitVector(5, 2**7) 16 | vector3 = BitVector(5, 2**7) 17 | vector4 = BitVector(5, 2**7) 18 | for i in range(0, 2**7): 19 | vector1.set_register(i, 0x1F) 20 | vector2.set_register(i, (i & 0x1F)) 21 | vector3.set_register(i, ((127 - i) & 0x1F)) 22 | vector4.set_register(i, 0x15) 23 | 24 | for i in range(0, 2 ** 7): 25 | assert vector1.get_register(i) == 0x1F 26 | assert vector2.get_register(i) == i & 0x1F 27 | assert vector3.get_register(i) == (127 - i) & 0x1F 28 | assert vector4.get_register(i) == 0x15 29 | -------------------------------------------------------------------------------- /tests/test_explicit_hll.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | import random 5 | 6 | from python_hll.hlltype import HLLType 7 | from python_hll.hll import HLL 8 | from python_hll.serialization import SerializationUtil 9 | 10 | """Unit tests for the explicit representation of HLL.""" 11 | 12 | 13 | def test_add_basic(): 14 | """ 15 | Tests basic set semantics of ``HLL.add_raw()``. 16 | """ 17 | # Adding a single positive value to an empty set should work. 18 | hll = new_hll(128) # arbitrary 19 | hll.add_raw(1) # positive 20 | assert hll.cardinality() == 1 21 | 22 | # Adding a single negative value to an empty set should work. 23 | hll = new_hll(128) # arbitrary 24 | hll.add_raw(-1) # negative 25 | assert hll.cardinality() == 1 26 | 27 | # Adding a duplicate value to a set should be a no-op. 28 | hll = new_hll(128) # arbitrary 29 | hll.add_raw(1) # positive 30 | hll.add_raw(1) # dupe 31 | assert hll.cardinality() == 1 32 | 33 | 34 | def test_union(): 35 | """ 36 | Tests ``HLL.union()``.
37 | """ 38 | # Unioning two distinct sets should work 39 | hll_a = new_hll(128) # arbitrary 40 | hll_b = new_hll(128) # arbitrary 41 | hll_a.add_raw(1) 42 | hll_a.add_raw(2) 43 | hll_b.add_raw(3) 44 | 45 | hll_a.union(hll_b) 46 | assert hll_a.cardinality() == 3 47 | 48 | # Unioning two sets whose union doesn't exceed the cardinality cap should not promote 49 | hll_a = new_hll(128) # arbitrary 50 | hll_b = new_hll(128) # arbitrary 51 | hll_a.add_raw(1) 52 | hll_a.add_raw(2) 53 | hll_b.add_raw(1) 54 | 55 | hll_a.union(hll_b) 56 | assert hll_a.cardinality() == 2 57 | assert hll_a.get_type() == HLLType.EXPLICIT 58 | 59 | # Unioning two sets whose union exceeds the cardinality cap should promote 60 | hll_a = new_hll(128) # arbitrary 61 | hll_b = new_hll(128) # arbitrary 62 | for i in range(0, 128): 63 | hll_a.add_raw(i) 64 | hll_b.add_raw(i+128) 65 | 66 | hll_a.union(hll_b) 67 | assert hll_a.get_type() == HLLType.SPARSE 68 | 69 | 70 | def test_clear(): 71 | """ 72 | Tests ``HLL.clear()``. 73 | """ 74 | hll = new_hll(128) # arbitrary 75 | hll.add_raw(1) 76 | hll.clear() 77 | assert hll.cardinality() == 0 78 | 79 | 80 | def test_to_from_bytes(): 81 | """ 82 | Tests ``HLL.to_bytes()`` and ``HLL.from_bytes()``.
83 | """ 84 | schema_version = SerializationUtil.DEFAULT_SCHEMA_VERSION 85 | type = HLLType.EXPLICIT 86 | padding = schema_version.padding_bytes(type) 87 | bytes_per_word = 8 88 | 89 | # Should work on an empty set 90 | hll = new_hll(128) 91 | bytes = hll.to_bytes(schema_version) 92 | assert len(bytes) == padding # no elements, just padding 93 | 94 | in_hll = HLL.from_bytes(bytes) 95 | assert_elements_equal(hll, in_hll) 96 | 97 | # Should work on a partially filled set 98 | hll = new_hll(128) 99 | for i in range(0, 3): 100 | hll.add_raw(i) 101 | 102 | bytes = hll.to_bytes(schema_version) 103 | assert len(bytes) == padding + bytes_per_word * 3 104 | 105 | in_hll = HLL.from_bytes(bytes) 106 | assert_elements_equal(hll, in_hll) 107 | 108 | # Should work on a full set 109 | explicit_threshold = 128 110 | hll = new_hll(explicit_threshold) 111 | 112 | for i in range(0, explicit_threshold): 113 | hll.add_raw(27 + i) 114 | 115 | bytes = hll.to_bytes(schema_version) 116 | assert len(bytes) == padding + bytes_per_word * explicit_threshold 117 | 118 | in_hll = HLL.from_bytes(bytes) 119 | assert_elements_equal(hll, in_hll) 120 | 121 | 122 | def test_random_values(): 123 | """ 124 | Tests correctness against `set()`. 125 | """ 126 | explicit_threshold = 4096 127 | canonical = set() 128 | hll = new_hll(explicit_threshold) 129 | 130 | seed = 1 # constant so results are reproducible 131 | random.seed(seed) 132 | max_java_long = 9223372036854775807 133 | for i in range(0, explicit_threshold): 134 | random_long = random.randint(1, max_java_long) 135 | canonical.add(random_long) 136 | hll.add_raw(random_long) 137 | canonical_cardinality = len(canonical) 138 | assert hll.cardinality() == canonical_cardinality 139 | 140 | 141 | def test_promotion(): 142 | """ 143 | Tests promotion to ``HLLType.SPARSE`` and ``HLLType.FULL``. 
144 | """ 145 | explicit_threshold = 128 146 | hll = HLL.create_for_testing(11, 5, explicit_threshold, 256, HLLType.EXPLICIT) 147 | for i in range(0, explicit_threshold + 1): 148 | hll.add_raw(i) 149 | assert hll.get_type() == HLLType.SPARSE 150 | 151 | hll = HLL(11, 5, 4, False, HLLType.EXPLICIT) # expthresh=4 => explicit_threshold=8 152 | for i in range(0, 9): 153 | hll.add_raw(i) 154 | assert hll.get_type() == HLLType.FULL 155 | 156 | 157 | # ------------------------------------------------------------ 158 | # assertion helpers 159 | 160 | 161 | def assert_elements_equal(hll_a, hll_b): 162 | """ 163 | Asserts that values in both sets are exactly equal. 164 | """ 165 | assert hll_a._explicit_storage == hll_b._explicit_storage 166 | 167 | 168 | def new_hll(explicit_threshold): 169 | """ 170 | Builds a ``HLLType.EXPLICIT`` ``HLL`` instance with the specified 171 | explicit threshold. 172 | 173 | :param explicit_threshold: explicit threshold to use for the constructed 174 | ``HLL``. This must be greater than zero. 175 | :type explicit_threshold: int 176 | :returns: A default-sized ``HLLType.EXPLICIT`` empty ``HLL`` instance. This 177 | will never be ``None``. 
178 | :rtype: HLL 179 | """ 180 | return HLL.create_for_testing(11, 5, explicit_threshold, 256, HLLType.EXPLICIT) 181 | -------------------------------------------------------------------------------- /tests/test_full_hll.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | from __future__ import division 4 | import pytest 5 | from math import ceil, log 6 | from python_hll.hlltype import HLLType 7 | from python_hll.hll import HLL 8 | from python_hll.hllutil import HLLUtil 9 | from python_hll.serialization import SerializationUtil 10 | from python_hll.util import BitUtil 11 | import probabilistic_test_util 12 | 13 | """Tests ``HLL`` of type ``HLLType.FULL``.""" 14 | 15 | 16 | def test_small_range_smoke(): 17 | """ 18 | Smoke test for HLL.cardinality() and the proper use of the 19 | small range correction. 20 | """ 21 | log2m = 11 22 | m = BitUtil.left_shift_int(1, log2m) 23 | regwidth = 5 24 | 25 | # only one register set 26 | hll = HLL.create_for_testing(log2m, regwidth, 128, 256, HLLType.FULL) 27 | hll.add_raw(probabilistic_test_util.construct_hll_value(log2m, 0, 1)) 28 | cardinality = hll.cardinality() 29 | 30 | # Trivially true that small correction conditions hold: one register 31 | # set implies zeroes exist, and estimator trivially smaller than 5m/2. 
32 | # Small range correction: m * log(m/V) 33 | expected = ceil(m * log(m / (m - 1))) # # of zeroes 34 | assert cardinality == expected 35 | 36 | # all but one register set 37 | hll = HLL.create_for_testing(log2m, regwidth, 128, 256, HLLType.FULL) 38 | for i in range(0, m - 1): 39 | hll.add_raw(probabilistic_test_util.construct_hll_value(log2m, i, 1)) 40 | 41 | # Trivially true that small correction conditions hold: all but 42 | # one register set implies a zero exists, and estimator trivially 43 | # smaller than 5m/2 since it's alpha / ((m-1)/2) 44 | cardinality = hll.cardinality() 45 | 46 | # Small range correction: m * log(m/V) 47 | expected = ceil(m * log(m / 1)) # # of zeroes 48 | assert cardinality == expected 49 | 50 | 51 | def test_normal_range_smoke(): 52 | """ 53 | Smoke test for ``HLL.cardinality()`` and the proper use of the 54 | uncorrected estimator. 55 | """ 56 | log2m = 11 57 | regwidth = 5 58 | 59 | # regwidth = 5, so hash space is 60 | # log2m + (2^5 - 1 - 1), so L = log2m + 30 61 | L = log2m + 30 62 | m = BitUtil.left_shift_int(1, log2m) 63 | hll = HLL.create_for_testing(log2m, regwidth, 128, 256, HLLType.FULL) 64 | 65 | # all registers at 'medium' value 66 | register_value = 7 # chosen to ensure neither correction kicks in 67 | for i in range(0, m): 68 | hll.add_raw(probabilistic_test_util.construct_hll_value(log2m, i, register_value)) 69 | 70 | cardinality = hll.cardinality() 71 | 72 | # Simplified estimator when all registers take same value: alpha / (m/2^val) 73 | estimator = HLLUtil.alpha_m_squared(m) / (m / (2**register_value)) 74 | 75 | assert estimator <= (2**L)/30 76 | assert estimator > (5 * m / 2) 77 | 78 | expected = ceil(estimator) 79 | assert cardinality == expected 80 | 81 | 82 | def test_large_range_smoke(): 83 | """ 84 | Smoke test for ``HLL.cardinality()`` and the proper use of the large 85 | range correction. 
86 | """ 87 | log2m = 12 88 | regwidth = 5 89 | # regwidth = 5, so hash space is 90 | # log2m + (2^5 - 1 - 1), so L = log2m + 30 91 | L = log2m + 30 92 | m = BitUtil.left_shift_int(1, log2m) 93 | hll = HLL.create_for_testing(log2m, regwidth, 128, 256, HLLType.FULL) 94 | 95 | register_value = 31 # chosen to ensure large correction kicks in 96 | for i in range(0, m): 97 | hll.add_raw(probabilistic_test_util.construct_hll_value(log2m, i, register_value)) 98 | 99 | cardinality = hll.cardinality() 100 | 101 | # Simplified estimator when all registers take same value: alpha / (m/2^val) 102 | estimator = HLLUtil.alpha_m_squared(m) / (m / (2**register_value)) 103 | 104 | # Assert conditions for large range 105 | 106 | assert estimator > (2**L) / 30 107 | 108 | # Large range correction: -2^L * log(1 - E/2^L) 109 | try: 110 | expected = ceil(-1.0 * (2 ** L) * log(1.0 - estimator / (2 ** L))) 111 | except ValueError: 112 | expected = 0 113 | assert cardinality == expected 114 | 115 | 116 | def test_register_value(): 117 | """ 118 | Tests the bounds on a register's value for a given raw input value. 
119 | """ 120 | log2m = 4 # small enough to make testing easy (add_raw() shifts by one byte) 121 | 122 | # register width 4 (the minimum size) 123 | regwidth = 4 124 | hll = HLL.create_for_testing(log2m, regwidth, 128, 256, HLLType.FULL) 125 | bit_vector = hll._probabilistic_storage 126 | 127 | # lower-bounds of the register 128 | hll.add_raw(0x000000000000001) # 'j'=1 129 | assert bit_vector.get_register(1) == 0 130 | 131 | hll.add_raw(0x0000000000000012) # 'j'=2 132 | assert bit_vector.get_register(2) == 1 133 | 134 | hll.add_raw(0x0000000000000023) # 'j'=3 135 | assert bit_vector.get_register(3) == 2 136 | 137 | hll.add_raw(0x0000000000000044) # 'j'=4 138 | assert bit_vector.get_register(4) == 3 139 | 140 | hll.add_raw(0x0000000000000085) # 'j'=5 141 | assert bit_vector.get_register(5) == 4 142 | 143 | # upper-bounds of the register 144 | # NOTE: bear in mind that BitVector itself does ensure that 145 | # overflow of a register is prevented 146 | hll.add_raw(0x0000000000010006) # 'j'=6 147 | assert bit_vector.get_register(6) == 13 148 | 149 | hll.add_raw(0x0000000000020007) # 'j'=7 150 | assert bit_vector.get_register(7) == 14 151 | 152 | hll.add_raw(0x0000000000040008) # 'j'=8 153 | assert bit_vector.get_register(8) == 15 154 | 155 | hll.add_raw(0x0000000000080009) # 'j'=9 156 | assert bit_vector.get_register(9) == 15 # overflow 157 | 158 | # sanity checks to ensure that no other bits above the lowest-set 159 | # bit matters 160 | # NOTE: same as case 'j = 6' above 161 | hll.add_raw(0x000000000003000A) # 'j'=10 162 | assert bit_vector.get_register(10) == 13 163 | 164 | hll.add_raw(0x000000000011000B) # 'j'=11 165 | assert bit_vector.get_register(11) == 13 166 | 167 | # ------------------------------------------------------------ 168 | # register width 5 169 | 170 | regwidth = 5 171 | hll = HLL.create_for_testing(log2m, regwidth, 128, 256, HLLType.FULL) 172 | bit_vector = hll._probabilistic_storage 173 | 174 | # lower-bounds of the register 175 | 
hll.add_raw(0x0000000000000001) # 'j'=1 176 | assert bit_vector.get_register(1) == 0 177 | 178 | hll.add_raw(0x0000000000000012) # 'j'=2 179 | assert bit_vector.get_register(2) == 1 180 | 181 | hll.add_raw(0x0000000000000023) # 'j'=3 182 | assert bit_vector.get_register(3) == 2 183 | 184 | hll.add_raw(0x0000000000000044) # 'j'=4 185 | assert bit_vector.get_register(4) == 3 186 | 187 | hll.add_raw(0x0000000000000085) # 'j'=5 188 | assert bit_vector.get_register(5) == 4 189 | 190 | # upper-bounds of the register 191 | # NOTE: bear in mind that BitVector itself does ensure that 192 | # overflow of a register is prevented 193 | hll.add_raw(0x0000000100000006) # 'j'=6 194 | assert bit_vector.get_register(6) == 29 195 | 196 | hll.add_raw(0x0000000200000007) # 'j'=7 197 | assert bit_vector.get_register(7) == 30 198 | 199 | hll.add_raw(0x0000000400000008) # 'j'=8 200 | assert bit_vector.get_register(8) == 31 201 | 202 | hll.add_raw(0x0000000800000009) # 'j'=9 203 | assert bit_vector.get_register(9) == 31 # overflow 204 | 205 | 206 | def test_clear(): 207 | """ 208 | Tests HLL.clear(). 
209 | """ 210 | regwidth = 5 211 | log2m = 4 # 16 registers per counter 212 | m = BitUtil.left_shift_int(1, log2m) 213 | 214 | hll = HLL.create_for_testing(log2m, regwidth, 128, 256, HLLType.FULL) 215 | bit_vector = hll._probabilistic_storage 216 | for i in range(0, m): 217 | bit_vector.set_register(i, i) 218 | 219 | hll.clear() 220 | for i in range(0, m): 221 | assert bit_vector.get_register(i) == 0 # default value of register 222 | 223 | 224 | # ------------------------------------------------------------ 225 | # Serialization 226 | 227 | 228 | def test_to_from_bytes(): 229 | log2m = 11 # arbitrary 230 | regwidth = 5 231 | 232 | schema_version = SerializationUtil.DEFAULT_SCHEMA_VERSION 233 | type = HLLType.FULL 234 | padding = schema_version.padding_bytes(type) 235 | data_byte_count = probabilistic_test_util.get_required_bytes(regwidth, BitUtil.left_shift_int(1, log2m)) # aka 2^log2m = m 236 | expected_byte_count = padding + data_byte_count 237 | 238 | # Should work on an empty element 239 | hll = HLL.create_for_testing(log2m, regwidth, 128, 256, HLLType.FULL) 240 | bytes = hll.to_bytes(schema_version) 241 | 242 | # assert output length is correct 243 | assert len(bytes) == expected_byte_count 244 | 245 | in_hll = HLL.from_bytes(bytes) 246 | assert_elements_equal(hll, in_hll) 247 | 248 | # Should work on a partially filled element 249 | hll = HLL.create_for_testing(log2m, regwidth, 128, 256, HLLType.FULL) 250 | 251 | for i in range(0, 3): 252 | raw_value = probabilistic_test_util.construct_hll_value(log2m, i, (i+9)) 253 | hll.add_raw(raw_value) 254 | 255 | bytes = hll.to_bytes(schema_version) 256 | 257 | assert len(bytes) == expected_byte_count 258 | 259 | in_hll = HLL.from_bytes(bytes) 260 | 261 | # assert register values correct 262 | assert_elements_equal(hll, in_hll) 263 | 264 | # Should work on a full set 265 | hll = HLL.create_for_testing(log2m, regwidth, 128, 256, HLLType.FULL) 266 | 267 | for i in range(0, BitUtil.left_shift_int(1, log2m)): 268 | 
raw_value = probabilistic_test_util.construct_hll_value(log2m, i, (i % 9) + 1) 269 | hll.add_raw(raw_value) 270 | 271 | bytes = hll.to_bytes(schema_version) 272 | 273 | # assert output length is correct 274 | assert len(bytes) == expected_byte_count 275 | 276 | in_hll = HLL.from_bytes(bytes) 277 | 278 | # assert register values correct 279 | assert_elements_equal(hll, in_hll) 280 | 281 | 282 | # ------------------------------------------------------------ 283 | # Assertion Helpers 284 | 285 | 286 | def assert_elements_equal(hll_a, hll_b): 287 | bit_vector_a = hll_a._probabilistic_storage 288 | bit_vector_b = hll_b._probabilistic_storage 289 | 290 | iter_a = bit_vector_a.register_iterator() 291 | iter_b = bit_vector_b.register_iterator() 292 | 293 | try: 294 | while True: 295 | assert iter_a.next() == iter_b.next() 296 | except StopIteration: 297 | pass 298 | 299 | try: 300 | iter_a.next() 301 | pytest.fail() 302 | except StopIteration: 303 | pass 304 | 305 | try: 306 | iter_b.next() 307 | pytest.fail() 308 | except StopIteration: 309 | pass 310 | -------------------------------------------------------------------------------- /tests/test_hll_serialization.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | """Serialization smoke-tests.""" 5 | 6 | import random 7 | import sys 8 | from copy import deepcopy 9 | from python_hll.hlltype import HLLType 10 | from python_hll.hll import HLL 11 | 12 | # A fixed random seed so that this test is reproducible. 13 | RANDOM_SEED = 1 14 | 15 | 16 | def test_serialization_smoke(fastonly): 17 | """ 18 | A smoke-test that covers serialization/deserialization of an HLL 19 | under all possible parameters. 
20 | """ 21 | random.seed(RANDOM_SEED) 22 | random_count = 250 23 | max_java_long = 9223372036854775807 24 | randoms = [random.randint(1, max_java_long) for i in range(0, random_count)] 25 | assert_cardinality(HLLType.EMPTY, randoms, fastonly) 26 | assert_cardinality(HLLType.EXPLICIT, randoms, fastonly) 27 | assert_cardinality(HLLType.SPARSE, randoms, fastonly) 28 | assert_cardinality(HLLType.FULL, randoms, fastonly) 29 | 30 | 31 | def assert_cardinality(hll_type, items, fastonly): 32 | # NOTE: log2m<=16 was chosen as the max log2m parameter so that the test 33 | # completes in a reasonable amount of time. Not much is gained by 34 | # testing larger values - there are no more known serialization 35 | # related edge cases that appear as log2m gets even larger. 36 | log2m_range = range(HLL.MINIMUM_LOG2M_PARAM, 16 + 1) 37 | regw_range = range(HLL.MINIMUM_REGWIDTH_PARAM, HLL.MAXIMUM_REGWIDTH_PARAM + 1) 38 | expthr_range = range(HLL.MINIMUM_EXPTHRESH_PARAM, HLL.MAXIMUM_EXPTHRESH_PARAM + 1) 39 | if fastonly: 40 | log2m_range = (HLL.MINIMUM_LOG2M_PARAM, 16) 41 | regw_range = (HLL.MINIMUM_REGWIDTH_PARAM, HLL.MAXIMUM_REGWIDTH_PARAM) 42 | expthr_range = (HLL.MINIMUM_EXPTHRESH_PARAM, HLL.MAXIMUM_EXPTHRESH_PARAM) 43 | for log2m in log2m_range: 44 | for regw in regw_range: 45 | for expthr in expthr_range: 46 | for sparse in [True, False]: 47 | hll = HLL(log2m, regw, expthr, sparse, hll_type) 48 | for item in items: 49 | hll.add_raw(item) 50 | copy = HLL.from_bytes(hll.to_bytes()) 51 | assert copy.cardinality() == hll.cardinality() 52 | assert copy.get_type() == hll.get_type() 53 | assert copy.to_bytes() == hll.to_bytes() 54 | 55 | clone = deepcopy(hll) 56 | assert clone.cardinality() == hll.cardinality() 57 | assert clone.get_type() == hll.get_type() 58 | assert clone.to_bytes() == hll.to_bytes() 59 | 60 | sys.stdout.write('.') 61 | sys.stdout.flush() 62 | -------------------------------------------------------------------------------- /tests/test_hll_util.py: 
-------------------------------------------------------------------------------- 1 | """Tests ``HLLUtil`` static methods.""" 2 | 3 | from python_hll.hll import HLL 4 | from python_hll.hllutil import HLLUtil 5 | 6 | 7 | def test_large_estimator_cutoff(): 8 | """ 9 | Tests that ``HLLUtil.largeEstimatorCutoff()`` is the same 10 | as a trivial implementation. 11 | """ 12 | for log2m in range(HLL.MINIMUM_LOG2M_PARAM + 1, HLL.MAXIMUM_LOG2M_PARAM + 1): 13 | for regWidth in range(HLL.MINIMUM_REGWIDTH_PARAM + 1, HLL.MINIMUM_REGWIDTH_PARAM + 1): 14 | cutoff = HLLUtil.large_estimator_cutoff(log2m, regWidth) 15 | """ 16 | See blog post (http://research.neustar.biz/2013/01/24/hyperloglog-googles-take-on-engineering-hll/) 17 | and original paper (Fig. 3) for information on 2^L and 18 | large range correction cutoff. 19 | """ 20 | expected = (regWidth ** regWidth) - (2 + log2m) / 30.0 21 | assert cutoff == expected 22 | -------------------------------------------------------------------------------- /tests/test_integration.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | import csv 5 | import pytest 6 | import sys 7 | from python_hll.util import NumberUtil 8 | from python_hll.hll import HLL 9 | from python_hll.hlltype import HLLType 10 | import probabilistic_test_util 11 | 12 | """ 13 | Compares the HLLs to the files in the data directory. See README.txt for 14 | more information about this test data. 
15 | """ 16 | 17 | LOG2M = 11 18 | REGWIDTH = 5 19 | EXPLICIT_THRESHOLD = 256 20 | SPARSE_THRESHOLD = 850 21 | 22 | 23 | def test_cumulative_add_cardinality_correction(fastonly): 24 | do_test_add('cumulative_add_cardinality_correction.csv', fastonly) 25 | 26 | 27 | def test_cumulative_add_comprehensive_promotion(fastonly): 28 | do_test_add('cumulative_add_comprehensive_promotion.csv', fastonly) 29 | 30 | 31 | def test_cumulative_add_sparse_edge(fastonly): 32 | do_test_add('cumulative_add_sparse_edge.csv', fastonly) 33 | 34 | 35 | def test_cumulative_add_sparse_random(fastonly): 36 | do_test_add('cumulative_add_sparse_random.csv', fastonly) 37 | 38 | 39 | def test_cumulative_add_sparse_step(fastonly): 40 | do_test_add('cumulative_add_sparse_step.csv', fastonly) 41 | 42 | 43 | def test_cumulative_union_comprehensive(fastonly): 44 | do_test_union('cumulative_union_comprehensive.csv', fastonly) 45 | 46 | 47 | def test_cumulative_union_explicit_explicit(fastonly): 48 | do_test_union('cumulative_union_explicit_explicit.csv', fastonly) 49 | 50 | 51 | def test_cumulative_union_explicit_promotion(fastonly): 52 | do_test_union('cumulative_union_explicit_promotion.csv', fastonly) 53 | 54 | 55 | def test_cumulative_union_probabilistic_probabilistic(fastonly): 56 | do_test_union('cumulative_union_probabilistic_probabilistic.csv', fastonly) 57 | 58 | 59 | def test_cumulative_union_sparse_promotion(fastonly): 60 | do_test_union('cumulative_union_sparse_promotion.csv', fastonly) 61 | 62 | 63 | def test_cumulative_union_sparse_sparse(fastonly): 64 | do_test_union('cumulative_union_sparse_sparse.csv', fastonly) 65 | 66 | 67 | def test_cumulative_union_sparse_full_representation(): 68 | # I'm not exactly sure how this test is supposed to work - it's different 69 | # from the other union tests. For now I will just construct the HLLs in the 70 | # same way as Java's sparseFullRepresentationTest() and compare the output.
71 | 72 | # The file is generated from IntegrationTestGenerator.java. 73 | filename = 'cumulative_union_sparse_full_representation.csv' 74 | with open('tests/data/%s' % filename, mode='r') as csv_file: 75 | csv_reader = csv.DictReader(csv_file) 76 | rows = [row for row in csv_reader] 77 | print('') 78 | print('test_integration: %s: %s rows:' % (filename, len(rows))) 79 | 80 | empty_hll_1 = new_hll(HLLType.EMPTY) 81 | empty_hll_2 = new_hll(HLLType.EMPTY) 82 | assert_sparse_full_row_equals(empty_hll_1, empty_hll_2, rows[0], filename, 1) 83 | 84 | full_hll = new_hll(HLLType.FULL) 85 | full_hll.add_raw(probabilistic_test_util.construct_hll_value(LOG2M, 0, 1)) 86 | sparse_hll = new_hll(HLLType.SPARSE) 87 | sparse_hll.add_raw(probabilistic_test_util.construct_hll_value(LOG2M, 0, 1)) 88 | assert_sparse_full_row_equals(full_hll, sparse_hll, rows[1], filename, 2) 89 | 90 | full_hll_2 = new_hll(HLLType.FULL) 91 | full_hll_2.add_raw(probabilistic_test_util.construct_hll_value(LOG2M, 1, 1)) 92 | sparse_hll.add_raw(probabilistic_test_util.construct_hll_value(LOG2M, 1, 1)) 93 | assert_sparse_full_row_equals(full_hll_2, sparse_hll, rows[2], filename, 3) 94 | 95 | full_hll_3 = new_hll(HLLType.FULL) 96 | for i in range(2, SPARSE_THRESHOLD + 1): 97 | full_hll_3.add_raw(probabilistic_test_util.construct_hll_value(LOG2M, i, 1)) 98 | sparse_hll.add_raw(probabilistic_test_util.construct_hll_value(LOG2M, i, 1)) 99 | assert_sparse_full_row_equals(full_hll_3, sparse_hll, rows[3], filename, 4) 100 | 101 | 102 | def assert_sparse_full_row_equals(hll, union_hll, row, filename, line): 103 | """ 104 | Asserts that the given HLLs match the row in cumulative_union_sparse_full_representation.csv. 
105 | """ 106 | assert float_cardinality(hll) == pytest.approx(float(row['cardinality'])), '%s:%s' % (filename, line) 107 | assert hll_to_string(hll) == row['HLL'], '%s:%s' % (filename, line) 108 | assert float_cardinality(union_hll) == pytest.approx(float(row['union_cardinality'])), '%s:%s' % (filename, line) 109 | assert hll_to_string(union_hll) == row['union_HLL'], '%s:%s' % (filename, line) 110 | 111 | 112 | def new_hll(type): 113 | """ 114 | Shortcut for testing constructor, which uses the constants defined at 115 | the top of the file as default parameters. 116 | 117 | :returns: a new ``HLL`` of specified type, which uses the parameters 118 | ``LOG2M`` ``REGWIDTH``, ``EXPLICIT_THRESHOLD`` and ``SPARSE_THRESHOLD`` specified above. 119 | """ 120 | return HLL.create_for_testing(LOG2M, REGWIDTH, EXPLICIT_THRESHOLD, SPARSE_THRESHOLD, type) 121 | 122 | 123 | def do_test_add(filename, fastonly): 124 | """ 125 | Tests an "add"-style test file. 126 | """ 127 | with open('tests/data/%s' % filename, mode='r') as csv_file: 128 | csv_reader = csv.DictReader(csv_file) 129 | line = 1 130 | rows = [row for row in csv_reader] 131 | if fastonly: 132 | rows = rows[0:500] 133 | print('') 134 | print('test_integration: %s: %s rows: (each . = 100 rows)' % (filename, len(rows))) 135 | for row in rows: 136 | if line == 1: 137 | hll = string_to_hll(row['multiset']) 138 | line += 1 139 | continue 140 | hll.add_raw(int(row['raw_value'])) 141 | assert float_cardinality(hll) == pytest.approx(float(row['cardinality'])), '%s:%s' % (filename, line) 142 | assert hll_to_string(hll) == row['multiset'], '%s:%s' % (filename, line) 143 | hll = string_to_hll(row['multiset']) 144 | line += 1 145 | if line % 100 == 0: 146 | sys.stdout.write('.') 147 | sys.stdout.flush() 148 | 149 | 150 | def do_test_union(filename, fastonly): 151 | """ 152 | Tests an "union"-style test file. 
153 | """ 154 | with open('tests/data/%s' % filename, mode='r') as csv_file: 155 | csv_reader = csv.DictReader(csv_file) 156 | line = 1 157 | rows = [row for row in csv_reader] 158 | if fastonly: 159 | rows = rows[0:500] 160 | print('') 161 | print('test_integration: %s: %s rows: (each . = 100 rows)' % (filename, len(rows))) 162 | for row in rows: 163 | if line == 1: 164 | hll = string_to_hll(row['union_multiset']) 165 | line += 1 166 | continue 167 | other_hll = string_to_hll(row['multiset']) 168 | assert float_cardinality(other_hll) == pytest.approx(float(row['cardinality'])), '%s:%s:multiset' % (filename, line) 169 | hll.union(other_hll) 170 | assert float_cardinality(hll) == pytest.approx(float(row['union_cardinality'])), '%s:%s' % (filename, line) 171 | assert hll_to_string(hll) == row['union_multiset'], '%s:%s' % (filename, line) 172 | hll = string_to_hll(row['union_multiset']) 173 | line += 1 174 | if line % 100 == 0: 175 | sys.stdout.write('.') 176 | sys.stdout.flush() 177 | 178 | 179 | def float_cardinality(hll): 180 | """ 181 | Returns the algorithm-specific cardinality of the specified ``HLL`` 182 | ``String`` appropriate for comparison with the algorithm-specific 183 | cardinality provided by the PostgreSQL implementation. 184 | :param HLL hll: The HLL whose algorithm-specific cardinality is to be printed. 185 | This cannot be ``None``. 186 | :returns: the algorithm-specific cardinality of the instance as a PostgreSQL- 187 | compatible String. 
This will never be ``None`` 188 | :rtype: float 189 | """ 190 | if hll.get_type() == HLLType.EMPTY: 191 | return 0 192 | elif hll.get_type() == HLLType.EXPLICIT: # promotion has not yet occurred 193 | return hll.cardinality() 194 | elif hll.get_type() == HLLType.SPARSE: 195 | return hll._sparse_probabilistic_algorithm_cardinality() 196 | elif hll.get_type() == HLLType.FULL: 197 | return hll._full_probabilistic_algorithm_cardinality() 198 | else: 199 | raise Exception('Unknown HLL type ' + str(hll.get_type())) 200 | 201 | 202 | def string_to_hll(s): 203 | """ 204 | Converts a string (with \\x) to an HLL. 205 | """ 206 | s = s[2:] 207 | return HLL.from_bytes(NumberUtil.from_hex(s, 0, len(s))) 208 | 209 | 210 | def hll_to_string(hll): 211 | """ 212 | Converts an HLL to a string (with \\x) 213 | """ 214 | bytes = hll.to_bytes() 215 | return '\\x' + NumberUtil.to_hex(bytes, 0, len(bytes)) 216 | -------------------------------------------------------------------------------- /tests/test_sparse_hll.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | from __future__ import division 4 | from math import ceil, log 5 | import random 6 | from python_hll.hlltype import HLLType 7 | from python_hll.hll import HLL 8 | from python_hll.hllutil import HLLUtil 9 | from python_hll.serialization import SerializationUtil 10 | from python_hll.util import BitUtil 11 | import probabilistic_test_util 12 | 13 | """Tests ``HLL`` of type ``HLLType.SPARSE``.""" 14 | 15 | log2m = 11 16 | 17 | 18 | def test_add(): 19 | """ 20 | Tests ``HLL.add_raw()``. 
21 | """ 22 | # ------------------------------------------------------------ 23 | # insert an element with register value 1 (minimum set value) 24 | register_index = 0 25 | register_value = 1 26 | raw_value = probabilistic_test_util.construct_hll_value(log2m, register_index, register_value) 27 | 28 | hll = HLL.create_for_testing(log2m, 5, 128, 256, HLLType.SPARSE) 29 | hll.add_raw(raw_value) 30 | 31 | assert_one_register_set(hll, register_index, BitUtil.to_signed_byte(register_value)) 32 | 33 | # ------------------------------------------------------------ 34 | # insert an element with register value 31 (maximum set value) 35 | register_index = 0 36 | register_value = 31 37 | raw_value = probabilistic_test_util.construct_hll_value(log2m, register_index, register_value) 38 | 39 | hll = HLL.create_for_testing(log2m, 5, 128, 256, HLLType.SPARSE) 40 | hll.add_raw(raw_value) 41 | 42 | assert_one_register_set(hll, register_index, BitUtil.to_signed_byte(register_value)) 43 | 44 | # ------------------------------------------------------------ 45 | # insert an element that could overflow the register (past 31) 46 | register_index = 0 47 | register_value = 36 48 | raw_value = probabilistic_test_util.construct_hll_value(log2m, register_index, register_value) 49 | 50 | hll = HLL.create_for_testing(log2m, 5, 128, 256, HLLType.SPARSE) 51 | hll.add_raw(raw_value) 52 | 53 | assert_one_register_set(hll, register_index, BitUtil.to_signed_byte(31)) # register max 54 | 55 | # ------------------------------------------------------------ 56 | # insert duplicate elements, observe no change 57 | register_index = 0 58 | register_value = 1 59 | raw_value = probabilistic_test_util.construct_hll_value(log2m, register_index, register_value) 60 | 61 | hll = HLL.create_for_testing(log2m, 5, 128, 256, HLLType.SPARSE) 62 | hll.add_raw(raw_value) 63 | hll.add_raw(raw_value) 64 | 65 | assert_one_register_set(hll, register_index, BitUtil.to_signed_byte(register_value)) # register max 66 | 67 | # 
------------------------------------------------------------ 68 | # insert elements that increase a register's value 69 | register_index = 0 70 | register_value = 1 71 | raw_value = probabilistic_test_util.construct_hll_value(log2m, register_index, register_value) 72 | 73 | hll = HLL.create_for_testing(log2m, 5, 128, 256, HLLType.SPARSE) 74 | hll.add_raw(raw_value) 75 | 76 | register_value_2 = 2 77 | raw_value_2 = probabilistic_test_util.construct_hll_value(log2m, register_index, register_value_2) 78 | hll.add_raw(raw_value_2) 79 | 80 | assert_one_register_set(hll, register_index, BitUtil.to_signed_byte(register_value_2)) 81 | 82 | # ------------------------------------------------------------ 83 | # insert elements that have lower register values, observe no change 84 | register_index = 0 85 | register_value = 2 86 | raw_value = probabilistic_test_util.construct_hll_value(log2m, register_index, register_value) 87 | 88 | hll = HLL.create_for_testing(log2m, 5, 128, 256, HLLType.SPARSE) 89 | hll.add_raw(raw_value) 90 | 91 | register_value_2 = 1 92 | raw_value_2 = probabilistic_test_util.construct_hll_value(log2m, register_index, register_value_2) 93 | hll.add_raw(raw_value_2) 94 | 95 | assert_one_register_set(hll, register_index, BitUtil.to_signed_byte(register_value)) 96 | 97 | 98 | def test_small_range_smoke(): 99 | """ 100 | Smoke test for ``HLL.cardinality()`` and the proper use of the small 101 | range correction. 102 | """ 103 | log2m = 11 104 | m = BitUtil.left_shift_int(1, log2m) 105 | regwidth = 5 106 | 107 | # ------------------------------------------------------------ 108 | # only one register set 109 | hll = HLL.create_for_testing(log2m, regwidth, 128, 256, HLLType.SPARSE) 110 | hll.add_raw(probabilistic_test_util.construct_hll_value(log2m, 0, 1)) 111 | 112 | cardinality = hll.cardinality() 113 | 114 | # Trivially true that small correction conditions hold: one register 115 | # set implies zeroes exist, and estimator trivially smaller than 5m/2. 
116 | # Small range correction: m * log(m/V) 117 | expected = ceil(m * log(m / (m - 1))) # # of zeroes 118 | assert cardinality == expected 119 | 120 | # ------------------------------------------------------------ 121 | # all but one register set 122 | hll = HLL.create_for_testing(log2m, regwidth, 128, 256, HLLType.SPARSE) 123 | for i in range(0, m - 1): 124 | hll.add_raw(probabilistic_test_util.construct_hll_value(log2m, i, 1)) 125 | 126 | # Trivially true that small correction conditions hold: all but 127 | # one register set implies a zero exists, and estimator trivially 128 | # smaller than 5m/2 since it's alpha / ((m-1)/2) 129 | cardinality = hll.cardinality() 130 | 131 | # Small range correction: m * log(m/V) 132 | expected = ceil(m * log(m / 1)) # # of zeroes 133 | assert cardinality == expected 134 | 135 | 136 | def test_normal_range_smoke(): 137 | """ 138 | Smoke test for HLL.cardinality() and the proper use of the 139 | uncorrected estimator. 140 | """ 141 | log2m = 11 142 | m = BitUtil.left_shift_int(1, log2m) 143 | regwidth = 5 144 | # regwidth = 5, so hash space is 145 | # log2m + (2^5 - 1 - 1), so L = log2m + 30 146 | L = log2m + 30 147 | 148 | # all registers at 'medium' value 149 | hll = HLL.create_for_testing(log2m, regwidth, 128, m, HLLType.SPARSE) 150 | 151 | register_value = 7 # chosen to ensure neither correction kicks in 152 | for i in range(0, m): 153 | hll.add_raw(probabilistic_test_util.construct_hll_value(log2m, i, register_value)) 154 | 155 | cardinality = hll.cardinality() 156 | 157 | # Simplified estimator when all registers take same value: alpha / (m/2^val) 158 | estimator = HLLUtil.alpha_m_squared(m) / (m / (2 ** register_value)) 159 | 160 | # Assert conditions for uncorrected range 161 | assert estimator <= (2 ** L) / 30 162 | assert estimator > (5 * m / 2) 163 | 164 | expected = ceil(estimator) 165 | assert cardinality == expected 166 | 167 | 168 | def test_large_range_smoke(): 169 | """ 170 | Smoke test for ``HLL.cardinality()`` 
and the proper use of the large 171 | range correction. 172 | """ 173 | log2m = 11 174 | m = BitUtil.left_shift_int(1, log2m) 175 | regwidth = 5 176 | # regwidth = 5, so hash space is 177 | # log2m + (2^5 - 1 - 1), so L = log2m + 30 178 | L = log2m + 30 179 | 180 | # all registers at large value 181 | hll = HLL.create_for_testing(log2m, regwidth, 128, m, HLLType.SPARSE) 182 | 183 | register_value = 31 # chosen to ensure large correction kicks in 184 | for i in range(0, m): 185 | hll.add_raw(probabilistic_test_util.construct_hll_value(log2m, i, register_value)) 186 | 187 | cardinality = hll.cardinality() 188 | 189 | # Simplified estimator E when all registers take same value: alpha_m_squared(m) / (m / 2^val) 190 | estimator = HLLUtil.alpha_m_squared(m) / (m / (2 ** register_value)) 191 | 192 | # Assert conditions for large range: E > 2^L / 30 193 | assert estimator > (2**L) / 30 194 | 195 | # Large range correction: -2^L * log(1 - E/2^L) 196 | try: 197 | expected = ceil(-1.0 * (2**L) * log(1.0 - estimator / (2**L))) 198 | except ValueError: 199 | expected = 0 # presumably log() rejecting a non-positive argument (E >= 2^L); expect 0 then 200 | assert cardinality == expected 201 | 202 | 203 | def test_union(): 204 | """ 205 | Tests ``HLL.union()``.
206 | """ 207 | log2m = 11 # arbitrary 208 | sparse_threshold = 256 # arbitrary 209 | 210 | # ------------------------------------------------------------ 211 | # two empty multisets should union to an empty set 212 | hll_a = HLL.create_for_testing(log2m, 5, 128, sparse_threshold, HLLType.SPARSE) 213 | hll_b = HLL.create_for_testing(log2m, 5, 128, sparse_threshold, HLLType.SPARSE) 214 | 215 | hll_a.union(hll_b) 216 | 217 | assert hll_a.get_type() == HLLType.SPARSE 218 | assert hll_a.cardinality() == 0 219 | 220 | # ------------------------------------------------------------ 221 | # two disjoint multisets should union properly 222 | hll_a = HLL.create_for_testing(log2m, 5, 128, sparse_threshold, HLLType.SPARSE) 223 | hll_a.add_raw(probabilistic_test_util.construct_hll_value(log2m, 1, 1)) 224 | hll_b = HLL.create_for_testing(log2m, 5, 128, sparse_threshold, HLLType.SPARSE) 225 | hll_a.add_raw(probabilistic_test_util.construct_hll_value(log2m, 2, 1)) 226 | 227 | hll_a.union(hll_b) 228 | 229 | assert hll_a.get_type() == HLLType.SPARSE # unchanged 230 | assert hll_a.cardinality() == 3 # precomputed 231 | assert_register_present(hll_a, 1, BitUtil.to_signed_byte(1)) 232 | assert_register_present(hll_a, 2, BitUtil.to_signed_byte(1)) 233 | 234 | # ------------------------------------------------------------ 235 | # two exactly overlapping multisets should union properly 236 | hll_a = HLL.create_for_testing(log2m, 5, 128, sparse_threshold, HLLType.SPARSE) 237 | hll_a.add_raw(probabilistic_test_util.construct_hll_value(log2m, 1, 10)) 238 | hll_b = HLL.create_for_testing(log2m, 5, 128, sparse_threshold, HLLType.SPARSE) 239 | hll_a.add_raw(probabilistic_test_util.construct_hll_value(log2m, 1, 13)) 240 | 241 | hll_a.union(hll_b) 242 | 243 | assert hll_a.get_type() == HLLType.SPARSE # unchanged 244 | assert hll_a.cardinality() == 2 # precomputed 245 | assert_one_register_set(hll_a, 1, BitUtil.to_signed_byte(13)) # max(10,13) 246 | 247 | # 
------------------------------------------------------------ 248 | # overlapping multisets should union properly 249 | hll_a = HLL.create_for_testing(log2m, 5, 128, sparse_threshold, HLLType.SPARSE) 250 | hll_b = HLL.create_for_testing(log2m, 5, 128, sparse_threshold, HLLType.SPARSE) 251 | # register index = 3 252 | raw_value_a = probabilistic_test_util.construct_hll_value(log2m, 3, 11) 253 | 254 | # register index = 4 255 | raw_value_b = probabilistic_test_util.construct_hll_value(log2m, 4, 13) 256 | raw_value_b_prime = probabilistic_test_util.construct_hll_value(log2m, 4, 21) 257 | 258 | # register index = 5 259 | raw_value_c = probabilistic_test_util.construct_hll_value(log2m, 5, 14) 260 | 261 | hll_a.add_raw(raw_value_a) 262 | hll_a.add_raw(raw_value_b) 263 | 264 | hll_b.add_raw(raw_value_b_prime) 265 | hll_b.add_raw(raw_value_c) 266 | 267 | hll_a.union(hll_b) 268 | # union should have three registers set, with partition B set to the 269 | # max of the two registers 270 | assert_register_present(hll_a, 3, BitUtil.to_signed_byte(11)) 271 | assert_register_present(hll_a, 4, BitUtil.to_signed_byte(21)) # max(21,13) 272 | assert_register_present(hll_a, 5, BitUtil.to_signed_byte(14)) 273 | 274 | # ------------------------------------------------------------ 275 | # too-large unions should promote 276 | hll_a = HLL.create_for_testing(log2m, 5, 128, sparse_threshold, HLLType.SPARSE) 277 | hll_b = HLL.create_for_testing(log2m, 5, 128, sparse_threshold, HLLType.SPARSE) 278 | 279 | # fill up sets to maxCapacity 280 | for i in range(0, sparse_threshold): 281 | hll_a.add_raw(probabilistic_test_util.construct_hll_value(log2m, i, 1)) 282 | hll_b.add_raw(probabilistic_test_util.construct_hll_value(log2m, i + sparse_threshold, 1)) # non-overlapping 283 | 284 | hll_a.union(hll_b) 285 | 286 | assert hll_a.get_type() == HLLType.FULL 287 | 288 | 289 | def test_clear(): 290 | """ 291 | Tests ``HLL.clear()``. 
292 | """ 293 | hll = HLL.create_for_testing(log2m, 5, 128, 256, HLLType.SPARSE) 294 | hll.add_raw(1) 295 | hll.clear() 296 | assert hll.cardinality() == 0 297 | 298 | 299 | def test_to_from_bytes(): 300 | """ 301 | Tests ``HLL.to_bytes()`` and ``HLL.from_bytes()``. 302 | """ 303 | log2m = 11 # arbitrary 304 | regwidth = 5 # arbitrary 305 | sparse_threshold = 256 # arbitrary 306 | short_word_length = 16 # log2m + regwidth = 11 + 5 307 | 308 | schema_version = SerializationUtil.DEFAULT_SCHEMA_VERSION 309 | type = HLLType.SPARSE 310 | padding = schema_version.padding_bytes(type) 311 | 312 | # ------------------------------------------------------------ 313 | # Should work on an empty element 314 | hll = HLL.create_for_testing(log2m, regwidth, 128, sparse_threshold, HLLType.SPARSE) 315 | bytes = hll.to_bytes(schema_version) 316 | 317 | # output should just be padding since no registers are used 318 | assert len(bytes) == padding 319 | 320 | in_hll = HLL.from_bytes(bytes) 321 | 322 | # assert register values correct 323 | assert_elements_equal(hll, in_hll) 324 | 325 | # ------------------------------------------------------------ 326 | # Should work on a partially filled element 327 | hll = HLL.create_for_testing(log2m, regwidth, 128, sparse_threshold, HLLType.SPARSE) 328 | 329 | for i in range(0, 3): 330 | raw_value = probabilistic_test_util.construct_hll_value(log2m, i, (i + 9)) 331 | hll.add_raw(raw_value) 332 | 333 | bytes = hll.to_bytes(schema_version) 334 | 335 | assert len(bytes) == padding + probabilistic_test_util.get_required_bytes(short_word_length, 3) # register_count 336 | 337 | in_hll = HLL.from_bytes(bytes) 338 | 339 | # assert register values correct 340 | assert_elements_equal(hll, in_hll) 341 | 342 | # ------------------------------------------------------------ 343 | # Should work on a full set 344 | hll = HLL.create_for_testing(log2m, regwidth, 128, sparse_threshold, HLLType.SPARSE) 345 | 346 | for i in range(0, sparse_threshold): 347 | raw_value = 
probabilistic_test_util.construct_hll_value(log2m, i, (i % 9) + 1) 348 | hll.add_raw(raw_value) 349 | 350 | bytes = hll.to_bytes(schema_version) 351 | 352 | # 'short words' should be 11 bits + 5 bits = 16 bits long (log2m + regwidth) 353 | assert len(bytes) == padding + probabilistic_test_util.get_required_bytes(short_word_length, sparse_threshold) 354 | 355 | in_hll = HLL.from_bytes(bytes) 356 | 357 | # assert register values correct 358 | assert_elements_equal(hll, in_hll) 359 | 360 | 361 | def test_random_values(): # adds seeded random raw values and checks every touched register holds the largest value seen for it 362 | log2m = 11 # arbitrary 363 | regwidth = 5 # arbitrary 364 | sparse_threshold = 256 # arbitrary 365 | 366 | seed = 1 367 | random.seed(seed) 368 | max_java_long = 9223372036854775807 # 2**63 - 1 369 | 370 | for run in range(0, 100): 371 | hll = HLL.create_for_testing(log2m, regwidth, 128, sparse_threshold, HLLType.SPARSE) 372 | 373 | map = {} # expected state: register index -> largest register value seen so far 374 | 375 | for i in range(0, sparse_threshold): 376 | raw_value = random.randint(1, max_java_long) 377 | 378 | register_index = probabilistic_test_util.get_register_index(raw_value, log2m) 379 | register_value = probabilistic_test_util.get_register_value(raw_value, log2m) 380 | if map.get(register_index, 0) < register_value: 381 | map[register_index] = register_value 382 | 383 | hll.add_raw(raw_value) 384 | 385 | for key in map.keys(): 386 | expected_register_value = map.get(key, 0) 387 | assert_register_present(hll, key, expected_register_value) 388 | 389 | # ------------------------------------------------------------ 390 | # assertion helpers 391 | 392 | 393 | def assert_register_present(hll, register_index, register_value): 394 | """ 395 | Asserts that the register at the specified index is set to the specified value.
396 | """ 397 | sparse_probabilistic_storage = hll._sparse_probabilistic_storage 398 | assert sparse_probabilistic_storage.get(register_index, 0) == register_value # a register missing from storage reads as 0 399 | 400 | 401 | def assert_one_register_set(hll, register_index, register_value): 402 | """ 403 | Asserts that only the specified register is set and has the specified value. 404 | """ 405 | sparse_probabilistic_storage = hll._sparse_probabilistic_storage 406 | assert len(sparse_probabilistic_storage) == 1 407 | assert sparse_probabilistic_storage.get(register_index, 0) == register_value 408 | 409 | 410 | def assert_elements_equal(hll_a, hll_b): # asserts both HLLs store the same number of sparse registers and that every register of hll_a has the same value in hll_b 411 | sparse_probabilistic_storage_a = hll_a._sparse_probabilistic_storage 412 | sparse_probabilistic_storage_b = hll_b._sparse_probabilistic_storage 413 | assert len(sparse_probabilistic_storage_a) == len(sparse_probabilistic_storage_b) 414 | for index in sparse_probabilistic_storage_a.keys(): 415 | assert sparse_probabilistic_storage_a.get(index) == sparse_probabilistic_storage_b.get(index) 416 | -------------------------------------------------------------------------------- /tox.ini: -------------------------------------------------------------------------------- 1 | [tox] 2 | envlist = py27, py34, py35, py36, flake8 3 | 4 | [testenv:flake8] 5 | basepython = python 6 | deps = flake8 7 | commands = flake8 python_hll 8 | 9 | [testenv] 10 | setenv = 11 | PYTHONPATH = {toxinidir} 12 | deps = 13 | -r{toxinidir}/requirements_dev.txt 14 | ; If you want to make tox run the tests with the same versions, create a 15 | ; requirements.txt with the pinned versions and uncomment the following line: 16 | ; -r{toxinidir}/requirements.txt 17 | commands = 18 | pip install -U pip 19 | py.test --capture=no --fast-only --basetemp={envtmpdir} 20 | 21 | 22 | --------------------------------------------------------------------------------