├── .flake8
├── .gitignore
├── AUTHORS.rst
├── CONTRIBUTING.rst
├── HISTORY.rst
├── LICENSE
├── MANIFEST.in
├── Makefile
├── README.rst
├── conftest.py
├── docs
    ├── Makefile
    ├── authors.rst
    ├── conf.py
    ├── contributing.rst
    ├── history.rst
    ├── index.rst
    ├── installation.rst
    ├── make.bat
    └── readme.rst
├── python_hll
    ├── __init__.py
    ├── hll.py
    ├── hlltype.py
    ├── hllutil.py
    ├── serialization.py
    └── util.py
├── requirements_dev.txt
├── setup.cfg
├── setup.py
├── tests
    ├── data
    │   ├── README.txt
    │   ├── cumulative_add_cardinality_correction.csv
    │   ├── cumulative_add_comprehensive_promotion.csv
    │   ├── cumulative_add_sparse_edge.csv
    │   ├── cumulative_add_sparse_random.csv
    │   ├── cumulative_add_sparse_step.csv
    │   ├── cumulative_union_comprehensive.csv
    │   ├── cumulative_union_explicit_explicit.csv
    │   ├── cumulative_union_explicit_promotion.csv
    │   ├── cumulative_union_probabilistic_probabilistic.csv
    │   ├── cumulative_union_sparse_full_representation.csv
    │   ├── cumulative_union_sparse_promotion.csv
    │   └── cumulative_union_sparse_sparse.csv
    ├── probabilistic_test_util.py
    ├── test_big_endian_ascending_word_deserializer.py
    ├── test_big_endian_ascending_word_serializer.py
    ├── test_bit_util.py
    ├── test_bit_vector.py
    ├── test_explicit_hll.py
    ├── test_full_hll.py
    ├── test_hll_serialization.py
    ├── test_hll_util.py
    ├── test_integration.py
    └── test_sparse_hll.py
└── tox.ini


/.flake8:
--------------------------------------------------------------------------------
1 | [flake8]
2 | max-line-length = 400


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
 1 | .idea
 2 | .eggs
 3 | python_hll.egg-info
 4 | .pytest_cache
 5 | .tox
 6 | __pycache__
 7 | *.pyc
 8 | _build
 9 | docs/modules.rst
10 | docs/python_hll.rst
11 | /dist
12 | /build


--------------------------------------------------------------------------------
/AUTHORS.rst:
--------------------------------------------------------------------------------
 1 | =======
 2 | Credits
 3 | =======
 4 | 
 5 | Contributors
 6 | ------------
 7 | 
 8 | * Jon Aquino <jonathan.aquino@adroll.com>
 9 | * Kushagra Verma <kushagra.verma@adroll.com>
10 | * Alex Leu <alex.leu@adroll.com>
11 | * Michael Tran <michael.tran@adroll.com>
12 | * Rodrigo Westrupp <rodrigo.westrupp@adroll.com>
13 | * Sridharan Subramanian <sridharan.subramanian@adroll.com>
14 | * Piyush Srivastava <piyush.srivastava@adroll.com>
15 | 
16 | 


--------------------------------------------------------------------------------
/CONTRIBUTING.rst:
--------------------------------------------------------------------------------
  1 | .. highlight:: shell
  2 | 
  3 | ============
  4 | Contributing
  5 | ============
  6 | 
  7 | Contributions are welcome, and they are greatly appreciated! Every little bit
  8 | helps, and credit will always be given.
  9 | 
 10 | You can contribute in many ways:
 11 | 
 12 | Types of Contributions
 13 | ----------------------
 14 | 
 15 | Report Bugs
 16 | ~~~~~~~~~~~
 17 | 
 18 | Report bugs at https://github.com/AdRoll/python-hll/issues.
 19 | 
 20 | If you are reporting a bug, please include:
 21 | 
 22 | * Your operating system name and version.
 23 | * Any details about your local setup that might be helpful in troubleshooting.
 24 | * Detailed steps to reproduce the bug.
 25 | 
 26 | Fix Bugs
 27 | ~~~~~~~~
 28 | 
 29 | Look through the GitHub issues for bugs. Anything tagged with "bug" and "help
 30 | wanted" is open to whoever wants to implement it.
 31 | 
 32 | Implement Features
 33 | ~~~~~~~~~~~~~~~~~~
 34 | 
 35 | Look through the GitHub issues for features. Anything tagged with "enhancement"
 36 | and "help wanted" is open to whoever wants to implement it.
 37 | 
 38 | Write Documentation
 39 | ~~~~~~~~~~~~~~~~~~~
 40 | 
 41 | python-hll could always use more documentation, whether as part of the
 42 | official python-hll docs, in docstrings, or even on the web in blog posts,
 43 | articles, and such.
 44 | 
 45 | Submit Feedback
 46 | ~~~~~~~~~~~~~~~
 47 | 
 48 | The best way to send feedback is to file an issue at https://github.com/AdRoll/python-hll/issues.
 49 | 
 50 | If you are proposing a feature:
 51 | 
 52 | * Explain in detail how it would work.
 53 | * Keep the scope as narrow as possible, to make it easier to implement.
 54 | * Remember that this is a volunteer-driven project, and that contributions
 55 |   are welcome :)
 56 | 
 57 | Get Started!
 58 | ------------
 59 | 
 60 | Ready to contribute? Here's how to set up `python-hll` for local development.
 61 | 
 62 | 1. Fork the `python-hll` repo on GitHub.
 63 | 2. Clone your fork locally::
 64 | 
 65 |     $ git clone git@github.com:your_name_here/python-hll.git
 66 | 
 67 | 3. Install your local copy into a virtualenv. Assuming you have virtualenvwrapper installed, this is how you set up your fork for local development::
 68 | 
 69 |     $ cd python-hll/
 70 |     $ mkvirtualenv python_hll
 71 |     $ python setup.py develop
 72 |     $ pip install -r requirements_dev.txt
 73 | 
 74 | 4. Create a branch for local development::
 75 | 
 76 |     $ git checkout -b name-of-your-bugfix-or-feature
 77 | 
 78 |    Now you can make your changes locally.
 79 | 
 80 | 5. When you're done making changes, check that your changes pass flake8 and the
 81 |    tests, including testing other Python versions with tox::
 82 | 
 83 |     $ make lint
 84 |     $ make test-fast
 85 | 
 86 |    To run one test file or one test::
 87 | 
 88 |     $ py.test --capture=no tests/test_sparse_hll.py
 89 |     $ py.test --capture=no tests/test_sparse_hll.py::test_add
 90 | 
 91 |    To run slow tests::
 92 | 
 93 |     $ make test
 94 | 
 95 | 6. Commit your changes and push your branch to GitHub::
 96 | 
 97 |     $ git add .
 98 |     $ git commit -m "Your detailed description of your changes."
 99 |     $ git push origin name-of-your-bugfix-or-feature
100 | 
101 | 7. Submit a pull request through the GitHub website.
102 | 
103 | Pull Request Guidelines
104 | -----------------------
105 | 
106 | Before you submit a pull request, check that it meets these guidelines:
107 | 
108 | 1. The pull request should include tests.
109 | 2. If the pull request adds functionality, the docs should be updated. Put
110 |    your new functionality into a function with a docstring, and add the
111 |    feature to the list in README.rst.
112 | 3. The pull request should work for Python 2.7, 3.4, 3.5 and 3.6, and for PyPy.
113 |    Make sure that the tests pass for all supported Python versions.
114 | 
115 | Deploying
116 | ---------
117 | 
118 | A reminder for the maintainers on how to deploy.
119 | Make sure all your changes are committed (including an entry in HISTORY.rst).
120 | Then run::
121 | 
122 |     $ # Run bumpversion patch, or bumpversion minor, or bumpversion major.
123 |     $ # This will tag the code and increment/commit new version numbers.
124 |     $ bumpversion patch
125 |     $ git push
126 |     $ git push --tags
127 |     $ make release  # use your pypi credentials
128 |     $ # Log in to https://python-hll.readthedocs.io/ and publish the latest docs


--------------------------------------------------------------------------------
/HISTORY.rst:
--------------------------------------------------------------------------------
 1 | =======
 2 | History
 3 | =======
 4 | 
 5 | 0.0.0 (2019-06-14)
 6 | ------------------
 7 | 
 8 | * Submitted to AdRoll HackWeek.
 9 | 
10 | 0.1.0 (2019-09-12)
11 | ------------------
12 | 
13 | * First release on PyPI.
14 | 
15 | 0.1.1 (2019-09-12)
16 | ------------------
17 | 
18 | * Add missing install_requires: numpy
19 | 
20 | 0.1.2 (2019-12-12)
21 | ------------------
22 | 
23 | * Fix alpha_m_squared for m=32: https://github.com/AdRoll/python-hll/pull/2
24 | 
25 | 0.1.3 (2021-01-22)
26 | ------------------
27 | 
28 | * Fix AttributeError: 'HLL' object has no attribute '_sparse_probabilistic_storage':
29 |   https://github.com/AdRoll/python-hll/pull/4


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | The MIT License (MIT)
 2 | 
 3 | Copyright (c) 2019 AdRoll, Inc.
 4 | 
 5 | Permission is hereby granted, free of charge, to any person obtaining a
 6 | copy of this software and associated documentation files (the
 7 | "Software"), to deal in the Software without restriction, including
 8 | without limitation the rights to use, copy, modify, merge, publish,
 9 | distribute, sublicense, and/or sell copies of the Software, and to
10 | permit persons to whom the Software is furnished to do so, subject to
11 | the following conditions:
12 | 
13 | The above copyright notice and this permission notice shall be
14 | included in all copies or substantial portions of the Software.
15 | 
16 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
17 | EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
18 | MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
19 | IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
20 | CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
21 | TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
22 | SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
23 | 


--------------------------------------------------------------------------------
/MANIFEST.in:
--------------------------------------------------------------------------------
 1 | include AUTHORS.rst
 2 | include CONTRIBUTING.rst
 3 | include HISTORY.rst
 4 | include LICENSE
 5 | include README.rst
 6 | 
 7 | recursive-include tests *
 8 | recursive-exclude * __pycache__
 9 | recursive-exclude * *.py[co]
10 | 
11 | recursive-include docs *.rst conf.py Makefile make.bat *.jpg *.png *.gif
12 | 


--------------------------------------------------------------------------------
/Makefile:
--------------------------------------------------------------------------------
 1 | .PHONY: clean clean-test clean-pyc clean-build docs help
 2 | .DEFAULT_GOAL := help
 3 | 
 4 | define BROWSER_PYSCRIPT
 5 | import os, webbrowser, sys
 6 | 
 7 | try:
 8 | 	from urllib import pathname2url
 9 | except:
10 | 	from urllib.request import pathname2url
11 | 
12 | webbrowser.open("file://" + pathname2url(os.path.abspath(sys.argv[1])))
13 | endef
14 | export BROWSER_PYSCRIPT
15 | 
16 | define PRINT_HELP_PYSCRIPT
17 | import re, sys
18 | 
19 | for line in sys.stdin:
20 | 	match = re.match(r'^([a-zA-Z_-]+):.*?## (.*)$$', line)
21 | 	if match:
22 | 		target, help = match.groups()
23 | 		print("%-20s %s" % (target, help))
24 | endef
25 | export PRINT_HELP_PYSCRIPT
26 | 
27 | BROWSER := python -c "$$BROWSER_PYSCRIPT"
28 | 
29 | help:
30 | 	@python -c "$$PRINT_HELP_PYSCRIPT" < $(MAKEFILE_LIST)
31 | 
32 | clean: clean-build clean-pyc clean-test ## remove all build, test, coverage and Python artifacts
33 | 
34 | clean-build: ## remove build artifacts
35 | 	rm -fr build/
36 | 	rm -fr dist/
37 | 	rm -fr .eggs/
38 | 	find . -name '*.egg-info' -exec rm -fr {} +
39 | 	find . -name '*.egg' -exec rm -f {} +
40 | 
41 | clean-pyc: ## remove Python file artifacts
42 | 	find . -name '*.pyc' -exec rm -f {} +
43 | 	find . -name '*.pyo' -exec rm -f {} +
44 | 	find . -name '*~' -exec rm -f {} +
45 | 	find . -name '__pycache__' -exec rm -fr {} +
46 | 
47 | clean-test: ## remove test and coverage artifacts
48 | 	rm -fr .tox/
49 | 	rm -f .coverage
50 | 	rm -fr htmlcov/
51 | 	rm -fr .pytest_cache
52 | 
53 | lint: ## check style with flake8
54 | 	flake8 python_hll tests
55 | 
56 | test: ## run slow and fast tests
57 | 	@echo "\033[0;32mUse 'make test-fast' to run fast tests only\033[0m"
58 | 	py.test --capture=no
59 | 
60 | test-fast: ## run fast tests
61 | 	py.test --capture=no --fast-only
62 | 
63 | test-all: ## run tests on every Python version with tox
64 | 	tox
65 | 
66 | coverage: ## check code coverage quickly with the default Python
67 | 	coverage run --source python_hll -m pytest
68 | 	coverage report -m
69 | 	coverage html
70 | 	$(BROWSER) htmlcov/index.html
71 | 
72 | docs: ## generate Sphinx HTML documentation, including API docs
73 | 	rm -f docs/python_hll.rst
74 | 	rm -f docs/modules.rst
75 | 	sphinx-apidoc -o docs/ python_hll
76 | 	$(MAKE) -C docs clean
77 | 	$(MAKE) -C docs html
78 | 	$(BROWSER) docs/_build/html/index.html
79 | 
80 | servedocs: docs ## compile the docs watching for changes
81 | 	watchmedo shell-command -p '*.rst' -c '$(MAKE) -C docs html' -R -D .
82 | 
83 | release: dist ## package and upload a release
84 | 	twine upload dist/*
85 | 
86 | dist: clean ## builds source and wheel package
87 | 	python setup.py sdist
88 | 	python setup.py bdist_wheel
89 | 	ls -l dist
90 | 
91 | install: clean ## install the package to the active Python's site-packages
92 | 	python setup.py install
93 | 


--------------------------------------------------------------------------------
/README.rst:
--------------------------------------------------------------------------------
 1 | ==========
 2 | python-hll
 3 | ==========
 4 | 
 5 | 
 6 | .. image:: https://img.shields.io/pypi/v/python_hll.svg
 7 |         :target: https://pypi.python.org/pypi/python_hll
 8 | 
 9 | .. image:: https://readthedocs.org/projects/python-hll/badge/?version=latest
10 |         :target: https://python-hll.readthedocs.io/en/latest/?badge=latest
11 |         :alt: Documentation Status
12 | 
13 | .. image:: https://img.shields.io/badge/github-python--hll-yellow
14 |         :target: https://github.com/AdRoll/python-hll
15 | 
16 | A Python implementation of `HyperLogLog <http://algo.inria.fr/flajolet/Publications/FlFuGaMe07.pdf>`_
17 | whose goal is to be `storage compatible <https://github.com/aggregateknowledge/hll-storage-spec>`_
18 | with `java-hll <https://github.com/aggregateknowledge/java-hll>`_, `js-hll <https://github.com/aggregateknowledge/js-hll>`_
19 | and `postgresql-hll <https://github.com/citusdata/postgresql-hll>`_.
20 | 
21 | **NOTE:** This is a fairly literal translation/port of `java-hll <https://github.com/aggregateknowledge/java-hll>`_
22 | to Python. Internally, bytes are represented as Java-style bytes (-128 to 127) rather than Python-style bytes (0 to 255).
23 | Also this implementation is quite slow: for example, in Java ``HLLSerializationTest`` takes 12 seconds to run
24 | while in Python ``test_hll_serialization`` takes 1.5 hours to run (about 400x slower).
25 | 
26 | * Runs on: Python 2.7 and 3
27 | * Free software: MIT license
28 | * Documentation: https://python-hll.readthedocs.io
29 | * GitHub: https://github.com/AdRoll/python-hll
30 | 
31 | Overview
32 | ---------------
33 | See `java-hll <https://github.com/aggregateknowledge/java-hll>`_ for an overview of what HLLs are and how they work.
34 | 
35 | Usage
36 | ---------------
37 | 
38 | Hashing and adding a value to a new HLL::
39 | 
40 |     from python_hll.hll import HLL
41 |     import mmh3
42 |     value_to_hash = 'foo'
43 |     hashed_value = mmh3.hash(value_to_hash)
44 | 
45 |     hll = HLL(13, 5) # log2m=13, regwidth=5
46 |     hll.add_raw(hashed_value)
47 | 
48 | Retrieving the cardinality of an HLL::
49 | 
50 |     cardinality = hll.cardinality()
51 | 
52 | Unioning two HLLs together (and retrieving the resulting cardinality)::
53 | 
54 |     hll1 = HLL(13, 5) # log2m=13, regwidth=5
55 |     hll2 = HLL(13, 5) # log2m=13, regwidth=5
56 | 
57 |     # ... (add values to both sets) ...
58 | 
59 |     hll1.union(hll2) # modifies hll1 to contain the union
60 |     cardinalityUnion = hll1.cardinality()
61 | 
62 | Reading an HLL from a hex representation of
63 | `storage specification, v1.0.0 <https://github.com/aggregateknowledge/hll-storage-spec/blob/v1.0.0/STORAGE.md>`_
64 | (for example, retrieved from a `PostgreSQL database <https://github.com/aggregateknowledge/postgresql-hll>`_)::
65 | 
66 |     from python_hll.util import NumberUtil
67 |     input = '\\x128D7FFFFFFFFFF6A5C420'
68 |     hex_string = input[2:]
69 |     hll = HLL.from_bytes(NumberUtil.from_hex(hex_string, 0, len(hex_string)))
70 | 
71 | Writing an HLL to its hex representation of
72 | `storage specification, v1.0.0 <https://github.com/aggregateknowledge/hll-storage-spec/blob/v1.0.0/STORAGE.md>`_
73 | (for example, to be inserted into a `PostgreSQL database <https://github.com/aggregateknowledge/postgresql-hll>`_)::
74 | 
75 |     bytes = hll.to_bytes()
76 |     output = "\\x" + NumberUtil.to_hex(bytes, 0, len(bytes))
77 | 
78 | Also see the `API documentation <https://python-hll.readthedocs.io/en/latest/docs/python_hll.html>`_.
79 | 
80 | Development
81 | ---------------
82 | See `Contributing <https://python-hll.readthedocs.io/en/latest/contributing.html>`_ for how to get started building, testing, and deploying the code.


--------------------------------------------------------------------------------
/conftest.py:
--------------------------------------------------------------------------------
 1 | # This file is here to add the project root to the sys.path to prevent
 2 | # import errors when running a single test. See https://stackoverflow.com/a/50610630/378457
 3 | 
 4 | # It also defines the --fast-only command-line option below.
 5 | 
 6 | import pytest
 7 | 
 8 | 
 9 | def pytest_addoption(parser):
10 |     parser.addoption("--fast-only", action="store_true", help="Run fast tests only")
11 | 
12 | 
13 | @pytest.fixture
14 | def fastonly(request):
15 |     return request.config.getoption("--fast-only")
16 | 


--------------------------------------------------------------------------------
/docs/Makefile:
--------------------------------------------------------------------------------
 1 | # Minimal makefile for Sphinx documentation
 2 | #
 3 | 
 4 | # You can set these variables from the command line.
 5 | SPHINXOPTS    =
 6 | SPHINXBUILD   = python -msphinx
 7 | SPHINXPROJ    = python_hll
 8 | SOURCEDIR     = .
 9 | BUILDDIR      = _build
10 | 
11 | # Put it first so that "make" without argument is like "make help".
12 | help:
13 | 	@$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
14 | 
15 | .PHONY: help Makefile
16 | 
17 | # Catch-all target: route all unknown targets to Sphinx using the new
18 | # "make mode" option.  $(O) is meant as a shortcut for $(SPHINXOPTS).
19 | %: Makefile
20 | 	@$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
21 | 


--------------------------------------------------------------------------------
/docs/authors.rst:
--------------------------------------------------------------------------------
1 | .. include:: ../AUTHORS.rst
2 | 


--------------------------------------------------------------------------------
/docs/conf.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python
  2 | # -*- coding: utf-8 -*-
  3 | #
  4 | # python_hll documentation build configuration file, created by
  5 | # sphinx-quickstart on Fri Jun  9 13:47:02 2017.
  6 | #
  7 | # This file is execfile()d with the current directory set to its
  8 | # containing dir.
  9 | #
 10 | # Note that not all possible configuration values are present in this
 11 | # autogenerated file.
 12 | #
 13 | # All configuration values have a default; values that are commented out
 14 | # serve to show the default.
 15 | 
 16 | # If extensions (or modules to document with autodoc) are in another
 17 | # directory, add these directories to sys.path here. If the directory is
 18 | # relative to the documentation root, use os.path.abspath to make it
 19 | # absolute, like shown here.
 20 | #
 21 | import os
 22 | import sys
 23 | sys.path.insert(0, os.path.abspath('..'))
 24 | 
 25 | import python_hll
 26 | 
 27 | # -- General configuration ---------------------------------------------
 28 | 
 29 | # If your documentation needs a minimal Sphinx version, state it here.
 30 | #
 31 | # needs_sphinx = '1.0'
 32 | 
 33 | # Add any Sphinx extension module names here, as strings. They can be
 34 | # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom ones.
 35 | extensions = ['sphinx.ext.autodoc', 'sphinx.ext.viewcode']
 36 | 
 37 | # Add any paths that contain templates here, relative to this directory.
 38 | templates_path = ['_templates']
 39 | 
 40 | # The suffix(es) of source filenames.
 41 | # You can specify multiple suffixes as a list of strings:
 42 | #
 43 | # source_suffix = ['.rst', '.md']
 44 | source_suffix = '.rst'
 45 | 
 46 | # The master toctree document.
 47 | master_doc = 'index'
 48 | 
 49 | # General information about the project.
 50 | project = u'python-hll'
 51 | copyright = u"2019, Jon Aquino"
 52 | author = u"Jon Aquino"
 53 | 
 54 | # The version info for the project you're documenting, acts as replacement
 55 | # for |version| and |release|, also used in various other places throughout
 56 | # the built documents.
 57 | #
 58 | # The short X.Y version.
 59 | version = python_hll.__version__
 60 | # The full version, including alpha/beta/rc tags.
 61 | release = python_hll.__version__
 62 | 
 63 | # The language for content autogenerated by Sphinx. Refer to documentation
 64 | # for a list of supported languages.
 65 | #
 66 | # This is also used if you do content translation via gettext catalogs.
 67 | # Usually you set "language" from the command line for these cases.
 68 | language = None
 69 | 
 70 | # List of patterns, relative to source directory, that match files and
 71 | # directories to ignore when looking for source files.
 72 | # These patterns also affect html_static_path and html_extra_path
 73 | exclude_patterns = ['_build', 'Thumbs.db', '.DS_Store']
 74 | 
 75 | # The name of the Pygments (syntax highlighting) style to use.
 76 | pygments_style = 'sphinx'
 77 | 
 78 | # If true, `todo` and `todoList` produce output, else they produce nothing.
 79 | todo_include_todos = False
 80 | 
 81 | 
 82 | # -- Options for HTML output -------------------------------------------
 83 | 
 84 | # The theme to use for HTML and HTML Help pages.  See the documentation for
 85 | # a list of builtin themes.
 86 | #
 87 | html_theme = 'alabaster'
 88 | 
 89 | # Theme options are theme-specific and customize the look and feel of a
 90 | # theme further.  For a list of options available for each theme, see the
 91 | # documentation.
 92 | #
 93 | # html_theme_options = {}
 94 | 
 95 | # Add any paths that contain custom static files (such as style sheets) here,
 96 | # relative to this directory. They are copied after the builtin static files,
 97 | # so a file named "default.css" will overwrite the builtin "default.css".
 98 | html_static_path = ['_static']
 99 | 
100 | 
101 | # -- Options for HTMLHelp output ---------------------------------------
102 | 
103 | # Output file base name for HTML help builder.
104 | htmlhelp_basename = 'python_hlldoc'
105 | 
106 | 
107 | # -- Options for LaTeX output ------------------------------------------
108 | 
109 | latex_elements = {
110 |     # The paper size ('letterpaper' or 'a4paper').
111 |     #
112 |     # 'papersize': 'letterpaper',
113 | 
114 |     # The font size ('10pt', '11pt' or '12pt').
115 |     #
116 |     # 'pointsize': '10pt',
117 | 
118 |     # Additional stuff for the LaTeX preamble.
119 |     #
120 |     # 'preamble': '',
121 | 
122 |     # Latex figure (float) alignment
123 |     #
124 |     # 'figure_align': 'htbp',
125 | }
126 | 
127 | # Grouping the document tree into LaTeX files. List of tuples
128 | # (source start file, target name, title, author, documentclass
129 | # [howto, manual, or own class]).
130 | latex_documents = [
131 |     (master_doc, 'python_hll.tex',
132 |      u'python-hll Documentation',
133 |      u'Jon Aquino', 'manual'),
134 | ]
135 | 
136 | 
137 | # -- Options for manual page output ------------------------------------
138 | 
139 | # One entry per manual page. List of tuples
140 | # (source start file, name, description, authors, manual section).
141 | man_pages = [
142 |     (master_doc, 'python_hll',
143 |      u'python-hll Documentation',
144 |      [author], 1)
145 | ]
146 | 
147 | 
148 | # -- Options for Texinfo output ----------------------------------------
149 | 
150 | # Grouping the document tree into Texinfo files. List of tuples
151 | # (source start file, target name, title, author,
152 | #  dir menu entry, description, category)
153 | texinfo_documents = [
154 |     (master_doc, 'python_hll',
155 |      u'python-hll Documentation',
156 |      author,
157 |      'python_hll',
158 |      'One line description of project.',
159 |      'Miscellaneous'),
160 | ]
161 | 
162 | # -- Make ReadTheDocs generate API doc ----------------------------------------
163 | 
164 | # See https://github.com/isogeo/isogeo-api-py-minsdk/commit/df45262dae266035946839009e02e6c5e068a05f
165 | on_rtd = os.environ.get('READTHEDOCS', None) == 'True'
166 | if on_rtd:
167 |     def run_apidoc(_):
168 |         from sphinx.apidoc import main as apidoc_main
169 | 
170 |         cur_dir = os.path.abspath(os.path.dirname(__file__))
171 |         output_path = os.path.join(cur_dir, 'docs')
172 |         modules = os.path.join(cur_dir, os.path.normpath(r"../python_hll"))
173 |         exclusions = []
174 |         apidoc_main([None, '-e', '-f', '-o', output_path, modules] + exclusions)
175 | 
176 |     def setup(app):
177 |         app.connect('builder-inited', run_apidoc)
178 | 


--------------------------------------------------------------------------------
/docs/contributing.rst:
--------------------------------------------------------------------------------
1 | .. include:: ../CONTRIBUTING.rst
2 | 


--------------------------------------------------------------------------------
/docs/history.rst:
--------------------------------------------------------------------------------
1 | .. include:: ../HISTORY.rst
2 | 


--------------------------------------------------------------------------------
/docs/index.rst:
--------------------------------------------------------------------------------
 1 | Welcome to python-hll's documentation!
 2 | ======================================
 3 | 
 4 | .. toctree::
 5 |    :maxdepth: 2
 6 |    :caption: Contents:
 7 | 
 8 |    readme
 9 |    installation
10 |    API <https://python-hll.readthedocs.io/en/latest/docs/python_hll.html>
11 |    contributing
12 |    authors
13 |    history
14 | 
15 | Indices and tables
16 | ==================
17 | * :ref:`genindex`
18 | * :ref:`modindex`
19 | * :ref:`search`
20 | 


--------------------------------------------------------------------------------
/docs/installation.rst:
--------------------------------------------------------------------------------
 1 | .. highlight:: shell
 2 | 
 3 | ============
 4 | Installation
 5 | ============
 6 | 
 7 | 
 8 | Stable release
 9 | --------------
10 | 
11 | To install python-hll, run this command in your terminal:
12 | 
13 | .. code-block:: console
14 | 
15 |     $ pip install python_hll
16 | 
17 | This is the preferred method to install python-hll, as it will always install the most recent stable release.
18 | 
19 | If you don't have `pip`_ installed, this `Python installation guide`_ can guide
20 | you through the process.
21 | 
22 | .. _pip: https://pip.pypa.io
23 | .. _Python installation guide: http://docs.python-guide.org/en/latest/starting/installation/
24 | 
25 | 
26 | From sources
27 | ------------
28 | 
29 | The sources for python-hll can be downloaded from the `Github repo`_.
30 | 
31 | You can either clone the public repository:
32 | 
33 | .. code-block:: console
34 | 
35 |     $ git clone https://github.com/AdRoll/python-hll
36 | 
37 | Or download the `tarball`_:
38 | 
39 | .. code-block:: console
40 | 
41 |     $ curl  -OL https://github.com/AdRoll/python-hll/tarball/master
42 | 
43 | Once you have a copy of the source, you can install it with:
44 | 
45 | .. code-block:: console
46 | 
47 |     $ python setup.py install
48 | 
49 | 
50 | .. _Github repo: https://github.com/AdRoll/python-hll
51 | .. _tarball: https://github.com/AdRoll/python-hll/tarball/master
52 | 


--------------------------------------------------------------------------------
/docs/make.bat:
--------------------------------------------------------------------------------
 1 | @ECHO OFF
 2 | 
 3 | pushd %~dp0
 4 | 
 5 | REM Command file for Sphinx documentation
 6 | 
 7 | if "%SPHINXBUILD%" == "" (
 8 | 	set SPHINXBUILD=python -msphinx
 9 | )
10 | set SOURCEDIR=.
11 | set BUILDDIR=_build
12 | set SPHINXPROJ=python_hll
13 | 
14 | if "%1" == "" goto help
15 | 
16 | %SPHINXBUILD% >NUL 2>NUL
17 | if errorlevel 9009 (
18 | 	echo.
19 | 	echo.The Sphinx module was not found. Make sure you have Sphinx installed,
20 | 	echo.then set the SPHINXBUILD environment variable to point to the full
21 | 	echo.path of the 'sphinx-build' executable. Alternatively you may add the
22 | 	echo.Sphinx directory to PATH.
23 | 	echo.
24 | 	echo.If you don't have Sphinx installed, grab it from
25 | 	echo.http://sphinx-doc.org/
26 | 	exit /b 1
27 | )
28 | 
29 | %SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS%
30 | goto end
31 | 
32 | :help
33 | %SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS%
34 | 
35 | :end
36 | popd
37 | 


--------------------------------------------------------------------------------
/docs/readme.rst:
--------------------------------------------------------------------------------
1 | .. include:: ../README.rst
2 | 


--------------------------------------------------------------------------------
/python_hll/__init__.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | 
3 | """Top-level package for python-hll."""
4 | 
5 | __author__ = """Jon Aquino"""
6 | __email__ = 'jonathan.aquino@adroll.com'
7 | __version__ = '0.1.3'
8 | 


--------------------------------------------------------------------------------
/python_hll/hll.py:
--------------------------------------------------------------------------------
  1 | # -*- coding: utf-8 -*-
  2 | 
  3 | from __future__ import division
  4 | from copy import deepcopy
  5 | from math import ceil, floor
  6 | 
  7 | from python_hll.hlltype import HLLType
  8 | from python_hll.serialization import SerializationUtil, HLLMetadata
  9 | from python_hll.util import NumberUtil, BitVector, BitUtil
 10 | 
 11 | 
 12 | class HLL:
 13 |     """
 14 |     A probabilistic set of hashed ``long`` elements. Useful for computing
 15 |     the approximate cardinality of a stream of data in very small storage.
 16 | 
 17 |     A modified version of the `'HyperLogLog' data structure and algorithm
 18 |     <http://algo.inria.fr/flajolet/Publications/FlFuGaMe07.pdf>`_ is used,
 19 |     which combines both probabilistic and non-probabilistic techniques to
 20 |     improve the accuracy and storage requirements of the original algorithm.
 21 | 
 22 |     More specifically, initializing and storing a new HLL will
 23 |     allocate a sentinel value symbolizing the empty set (HLLType.EMPTY).
 24 |     After adding the first few values, a sorted list of unique integers is
 25 |     stored in a HLLType.EXPLICIT hash set. When configured, accuracy can
 26 |     be sacrificed for memory footprint: the values in the sorted list are
 27 |     "promoted" to a "HLLType.SPARSE" map-based HyperLogLog structure.
 28 |     Finally, when enough registers are set, the map-based HLL will be converted
 29 |     to a bit-packed "HLLType.FULL" HyperLogLog structure.
 30 | 
 31 |     This data structure is interoperable with the implementations found at:
 32 | 
 33 |     * `postgresql-hll <https://github.com/aggregateknowledge/postgresql-hll>`_
 34 |     * `js-hll <https://github.com/aggregateknowledge/js-hll>`_
 35 | 
 36 |     when `properly serialized <https://github.com/aggregateknowledge/postgresql-hll/blob/master/STORAGE.markdown>`_.
 37 |     """
 38 | 
 39 |     # minimum and maximum values for the log-base-2 of the number of registers
 40 |     # in the HLL
 41 |     MINIMUM_LOG2M_PARAM = 4
 42 |     MAXIMUM_LOG2M_PARAM = 30
 43 | 
 44 |     # minimum and maximum values for the register width of the HLL
 45 |     MINIMUM_REGWIDTH_PARAM = 1
 46 |     MAXIMUM_REGWIDTH_PARAM = 8
 47 | 
 48 |     # minimum and maximum values for the 'expthresh' parameter of the
 49 |     # constructor that is meant to match the PostgreSQL implementation's
 50 |     # constructor and parameter names
 51 |     MINIMUM_EXPTHRESH_PARAM = -1
 52 |     MAXIMUM_EXPTHRESH_PARAM = 18
 53 |     MAXIMUM_EXPLICIT_THRESHOLD = BitUtil.left_shift_int(1, (MAXIMUM_EXPTHRESH_PARAM - 1))  # per storage spec
 54 | 
 55 |     # ------------------------------------------------------------
 56 |     # STORAGE
 57 |     # :var set _explicit_storage: storage used when ``type`` is EXPLICIT, None otherwise
 58 |     # :var dict _sparse_probabilistic_storage: storage used when ``type`` is SPARSE, None otherwise
 59 |     # :var BitVector _probabilistic_storage: storage used when ``type`` is FULL, None otherwise
 60 |     # :var HLLType type: current type of this HLL instance, if this changes then so should the storage used (see above)
 61 | 
 62 |     # ------------------------------------------------------------
 63 |     # CHARACTERISTIC PARAMETERS
 64 |     # NOTE:  These members are named to match the PostgreSQL implementation's parameters.
 65 |     # :var int _log2m: log2(the number of probabilistic HLL registers)
 66 |     # :var int _regwidth: the size (width) each register in bits
 67 | 
 68 |     # ------------------------------------------------------------
 69 |     # COMPUTED CONSTANTS
 70 |     # ............................................................
 71 |     # EXPLICIT-specific constants
 72 |     # :var boolean _explicit_off: flag indicating if the EXPLICIT representation should NOT be used
 73 |     # :var boolean _explicit_auto: flag indicating that the promotion threshold from EXPLICIT should be
 74 |     #              computed automatically. NOTE:  this only has meaning when '_explicit_off' is false.
 75 |     # :var int _explicit_threshold: threshold (in element count) at which a EXPLICIT HLL is converted to a
 76 |     #           SPARSE or FULL HLL, always greater than or equal to zero and always a power of two OR simply zero
 77 |     #           NOTE:  this only has meaning when '_explicit_off' is false
 78 |     # ............................................................
 79 |     # SPARSE-specific constants
 80 |     # :var int _short_word_length: the computed width of the short words
 81 |     # :var boolean _sparse_off: flag indicating if the SPARSE representation should not be used
 82 |     # :var int _sparse_threshold: threshold (in register count) at which a SPARSE HLL is converted to a
 83 |     #          FULL HLL, always greater than zero
 84 |     # ............................................................
 85 |     # Probabilistic algorithm constants
 86 |     # :var int _m: the number of registers, will always be a power of 2
 87 |     # :var int _m_bits_mask: a mask of the log2m bits set to one and the rest to zero
 88 |     # :var int _value_mask: a mask as wide as a register (see ``from_bytes()``)
    # :var long _pw_max_mask: mask used to ensure that p(w) does not overflow register (see ``__init__()`` and ``add_raw()``)
    # :var float _alpha_m_squared: alpha * m^2 (the constant in the "'raw' HyperLogLog estimator")
 91 |     # :var float _small_estimator_cutoff: the cutoff value of the estimator for using the "small" range cardinality correction formula
 92 |     # :var float _large_estimator_cutoff: the cutoff value of the estimator for using the "large" range cardinality correction formula
 93 | 
 94 |     def __init__(self, log2m, regwidth, expthresh=-1, sparseon=True, type=HLLType.EMPTY):
 95 |         """
 96 |         NOTE: Arguments here are named and structured identically to those in the
 97 |               PostgreSQL implementation, which can be found
 98 |               `here <https://github.com/aggregateknowledge/postgresql-hll/blob/master/README.markdown#explanation-of-parameters-and-tuning>`_.
 99 | 
100 |         :param log2m: log-base-2 of the number of registers used in the HyperLogLog
101 |                algorithm. Must be at least 4 and at most 30.
102 |         :type log2m: int
103 |         :param regwidth: number of bits used per register in the HyperLogLog
104 |                algorithm. Must be at least 1 and at most 8.
105 |         :type regwidth: int
106 |         :param expthresh: tunes when the ``HLLType.EXPLICIT`` to
107 |                ``HLLType.SPARSE`` promotion occurs,
108 |                based on the set's cardinality. Must be at least -1 and at most 18.
109 |                +-----------+--------------------------------------------------------------------------------+
110 |                | expthresh | Meaning                                                                        |
111 |                +===========+================================================================================+
112 |                | -1        | Promote at whatever cutoff makes sense for optimal memory usage. ('auto' mode) |
113 |                +-----------+--------------------------------------------------------------------------------+
114 |                | 0         | Skip ``EXPLICIT`` representation in hierarchy.                                 |
115 |                +-----------+--------------------------------------------------------------------------------+
116 |                | 1-18      | Promote at 2:sup:`expthresh - 1` cardinality                                   |
117 |                +-----------+--------------------------------------------------------------------------------+
118 |         :type expthresh: int
119 |         :param sparseon: Flag indicating if the ``HLLType.SPARSE``
120 |                representation should be used.
121 |         :type sparseon: boolean
122 |         :param type: the type in the promotion hierarchy which this instance should
123 |                start at. This cannot be ``None``.
124 |         :type type: HLLType
125 |         """
126 |         from python_hll.hllutil import HLLUtil
127 | 
128 |         self._log2m = log2m
129 |         if log2m < HLL.MINIMUM_LOG2M_PARAM or log2m > HLL.MAXIMUM_EXPLICIT_THRESHOLD:
130 |             raise Exception("'log2m' must be at least " + str(HLL.MINIMUM_LOG2M_PARAM) + " and at most " + str(HLL.MAXIMUM_LOG2M_PARAM) + " (was: " + str(log2m) + ")")
131 | 
132 |         self._regwidth = regwidth
133 |         if regwidth < HLL.MINIMUM_REGWIDTH_PARAM or regwidth > HLL.MAXIMUM_REGWIDTH_PARAM:
134 |             raise Exception("'regwidth' must be at least " + str(HLL.MINIMUM_REGWIDTH_PARAM) + " and at most " + str(HLL.MAXIMUM_REGWIDTH_PARAM) + " (was: " + str(regwidth) + ")")
135 | 
136 |         self._m = BitUtil.left_shift_int(1, log2m)
137 |         self._m_bits_mask = self._m - 1
138 |         self._value_mask = BitUtil.left_shift_int(1, regwidth) - 1
139 |         self._pw_max_mask = HLLUtil.pw_max_mask(regwidth)
140 |         self._alpha_m_squared = HLLUtil.alpha_m_squared(self._m)
141 |         self._small_estimator_cutoff = HLLUtil.small_estimator_cutoff(self._m)
142 |         self._large_estimator_cutoff = HLLUtil.large_estimator_cutoff(log2m, regwidth)
143 | 
144 |         if expthresh == -1:
145 |             self._explicit_auto = True
146 |             self._explicit_off = False
147 | 
148 |             # NOTE:  This math matches the size calculation in the PostgreSQL impl.
149 |             full_representation_size = floor((self._regwidth * self._m + 7) / 8)  # round up to next whole byte
150 |             num_longs = floor(full_representation_size / 8)  # integer division to round down
151 | 
152 |             if num_longs > HLL.MAXIMUM_EXPLICIT_THRESHOLD:
153 |                 self._explicit_threshold = HLL.MAXIMUM_EXPLICIT_THRESHOLD
154 |             else:
155 |                 self._explicit_threshold = num_longs
156 |         elif expthresh == 0:
157 |             self._explicit_auto = False
158 |             self._explicit_off = True
159 |             self._explicit_threshold = 0
160 |         elif 0 < expthresh <= HLL.MAXIMUM_EXPTHRESH_PARAM:
161 |             self._explicit_auto = False
162 |             self._explicit_off = False
163 |             self._explicit_threshold = BitUtil.left_shift_int(1, (expthresh - 1))
164 |         else:
165 |             raise Exception("'expthresh' must be at least " + str(HLL.MINIMUM_EXPTHRESH_PARAM) + " and at most " + str(HLL.MAXIMUM_EXPTHRESH_PARAM) + " (was: " + str(expthresh) + ")")
166 | 
167 |         self._short_word_length = regwidth + log2m
168 |         self._sparse_off = not sparseon
169 |         if self._sparse_off:
170 |             self._sparse_threshold = 0
171 |         else:
172 |             # TODO improve this cutoff to include the cost overhead of members/objects
173 |             largest_pow_2_less_than_cutoff = int(NumberUtil.log2((self._m * self._regwidth) / self._short_word_length))
174 |             self._sparse_threshold = BitUtil.left_shift_int(1, largest_pow_2_less_than_cutoff)
175 | 
176 |         self._initialize_storage(type)
177 | 
178 |     @classmethod
179 |     def create_for_testing(cls, log2m, regwidth, explicit_threshold, sparse_threshold, type):
180 |         """
181 |         Convenience constructor for testing. Assumes that both ``HLLType.EXPLICIT``
182 |         and ``HLLType.SPARSE`` representations should be enabled.
183 | 
184 |         :param log2m: log-base-2 of the number of registers used in the HyperLogLog
185 |                algorithm. Must be at least 4 and at most 30.
186 |         :type log2m: int
187 |         :param regwidth: number of bits used per register in the HyperLogLog
188 |                algorithm. Must be at least 1 and at most 8.
189 |         :type regwidth: int
190 |         :param explicit_threshold: cardinality threshold at which the ``HLLType.EXPLICIT``
191 |                representation should be promoted to ``HLLType.SPARSE``.
192 |                This must be greater than zero and less than or equal to ``MAXIMUM_EXPLICIT_THRESHOLD``.
193 |         :type explicit_threshold: int
194 |         :param sparse_threshold: register count threshold at which the ``HLLType.SPARSE``
195 |                representation should be promoted to ``HLLType.FULL``.
196 |                This must be greater than zero.
197 |         :type sparse_threshold: int
198 |         :param type: the type in the promotion hierarchy which this instance should
199 |                start at. This cannot be ``None``.
200 |         :type type: HLLType
201 |         :rtype: HLL
202 |         """
203 |         hll = HLL(log2m=log2m, regwidth=regwidth, expthresh=-1, sparseon=True, type=type)
204 |         hll._explicit_auto = False
205 |         hll._explicit_off = False
206 |         hll._explicit_threshold = explicit_threshold
207 |         if explicit_threshold < 1 or explicit_threshold > cls.MAXIMUM_EXPLICIT_THRESHOLD:
208 |             raise Exception("'explicit_threshold' must be at least 1 and at most " + str(cls.MAXIMUM_EXPLICIT_THRESHOLD) + " (was: " + str(explicit_threshold) + ")")
209 |         hll._sparse_off = False
210 |         hll._sparse_threshold = sparse_threshold
211 |         return hll
212 | 
213 |     def get_type(self):
214 |         """
215 |         Returns the type in the promotion hierarchy of this instance. This will
216 |         never be ``None``.
217 | 
218 |         :rtype: HLLType
219 |         """
220 |         return self._type
221 | 
222 |     def add_raw(self, raw_value):
223 |         """
224 |         Adds ``rawValue`` directly to the HLL.
225 | 
226 |         :param long raw_value: the value to be added. It is very important that this
227 |                value already be hashed with a strong (but not
228 |                necessarily cryptographic) hash function. For instance, the
229 |                `MurmurHash3 implementation <https://pypi.org/project/mmh3/>`_
230 |                is an excellent hash function for this purpose.
231 |         :rtype: void
232 |         """
233 | 
234 |         if self._type == HLLType.EMPTY:
235 |             # Note: EMPTY type is always promoted on add_raw()
236 |             if self._explicit_threshold > 0:
237 |                 self._initialize_storage(HLLType.EXPLICIT)
238 |                 self._explicit_storage.add(raw_value)
239 |             elif not self._sparse_off:
240 |                 self._initialize_storage(HLLType.SPARSE)
241 |                 self._add_raw_sparse_probabilistic(raw_value)
242 |             else:
243 |                 self._initialize_storage(HLLType.FULL)
244 |                 self._add_raw_probabilistic(raw_value)
245 |             return
246 | 
247 |         elif self._type == HLLType.EXPLICIT:
248 |             self._explicit_storage.add(raw_value)
249 | 
250 |             # promotion, if necessary
251 |             if len(self._explicit_storage) > self._explicit_threshold:
252 |                 if not self._sparse_off:
253 |                     self._initialize_storage(HLLType.SPARSE)
254 |                     for value in self._explicit_storage:
255 |                         self._add_raw_sparse_probabilistic(value)
256 |                 else:
257 |                     self._initialize_storage(HLLType.FULL)
258 |                     for value in self._explicit_storage:
259 |                         self._add_raw_probabilistic(value)
260 |                 self._explicit_storage = None
261 |             return
262 | 
263 |         elif self._type == HLLType.SPARSE:
264 |             self._add_raw_sparse_probabilistic(raw_value)
265 | 
266 |             # promotion, if necessary
267 |             if len(self._sparse_probabilistic_storage) > self._sparse_threshold:
268 |                 self._initialize_storage(HLLType.FULL)
269 |                 for register_index in self._sparse_probabilistic_storage.keys():
270 |                     register_value = self._sparse_probabilistic_storage.get(register_index, 0)
271 |                     self._probabilistic_storage.set_max_register(register_index, register_value)
272 |                 self._sparse_probabilistic_storage = None
273 |             return
274 | 
275 |         elif self._type == HLLType.FULL:
276 |             self._add_raw_probabilistic(raw_value)
277 |             return
278 | 
279 |         else:
280 |             raise Exception("Unsupported HLL type: {}".format(self._type))
281 | 
282 |     def _add_raw_sparse_probabilistic(self, raw_value):
283 |         """
284 |         Adds the raw value to the ``sparseProbabilisticStorage``.
285 |         ``type`` ``HLLType.SPARSE``.
286 | 
287 |         :param long raw_value: the raw value to add to the sparse storage.
288 |         :rtype: void
289 |         """
290 | 
291 |         # p(w): position of the least significant set bit (one-indexed)
292 |         # By contract: p(w) <= 2^(register_value_in_bits) - 1 (the max register value)
293 |         #
294 |         # By construction of pw_max_mask (see constructor),
295 |         #      lsb(pw_max_mask) = 2^(register_value_in_bits) - 2,
296 |         # thus lsb(any_long | pw_max_mask) <= 2^(register_value_in_bits) - 2,
297 |         # thus 1 + lsb(any_long | pw_max_mask) <= 2^(register_value_in_bits) -1.
298 |         sub_stream_value = BitUtil.unsigned_right_shift_long(raw_value, self._log2m)
299 |         p_w = None
300 | 
301 |         if sub_stream_value == 0:
302 |             # The paper does not cover p(0x0), so the special value 0 is used.
303 |             # 0 is the original initialization value of the registers, so by
304 |             # doing this the multiset simply ignores it. This is acceptable
305 |             # because the probability is 1/(2^(2^register_size_in_bits)).
306 |             p_w = 0
307 |         else:
308 |             p_w = BitUtil.to_signed_byte(1 + BitUtil.least_significant_bit(sub_stream_value | self._pw_max_mask))
309 | 
310 |         # Short-circuit if the register is being set to zero, since algorithmically
311 |         # this corresponds to an "unset" register, and "unset" registers aren't
312 |         # stored to save memory. (The very reason this sparse implementation
313 |         # exists.) If a register is set to zero it will break the algorithm_cardinality
314 |         # code.
315 |         if p_w == 0:
316 |             return
317 | 
318 |         # NOTE:  no +1 as in paper since 0-based indexing
319 |         j = int(raw_value & self._m_bits_mask)
320 | 
321 |         current_value = self._sparse_probabilistic_storage.get(j, 0)
322 |         if p_w > current_value:
323 |             self._sparse_probabilistic_storage[j] = p_w
324 | 
325 |     def _add_raw_probabilistic(self, raw_value):
326 |         """
327 |         Adds the raw value to the ``probabilisticStorage``.
328 |         ``type`` must be ``HLLType.FULL``.
329 | 
330 |         :param long raw_value: the raw value to add to the full probabilistic storage.
331 |         :rtype: void
332 |         """
333 |         # p(w): position of the least significant set bit (one-indexed)
334 |         # By contract: p(w) <= 2^(register_value_in_bits) - 1 (the max register value)
335 |         #
336 |         # By construction of pw_max_mask (see constructor),
337 |         #      lsb(pw_max_mask) = 2^(register_value_in_bits) - 2,
338 |         # thus lsb(any_long | pw_max_mask) <= 2^(register_value_in_bits) - 2,
339 |         # thus 1 + lsb(any_long | pw_max_mask) <= 2^(register_value_in_bits) -1.
340 |         sub_stream_value = BitUtil.unsigned_right_shift_long(raw_value, self._log2m)
341 |         p_w = None
342 | 
343 |         if sub_stream_value == 0:
344 |             # The paper does not cover p(0x0), so the special value 0 is used.
345 |             # 0 is the original initialization value of the registers, so by
346 |             # doing this the multiset simply ignores it. This is acceptable
347 |             # because the probability is 1/(2^(2^register_size_in_bits)).
348 |             p_w = 0
349 |         else:
350 |             p_w = BitUtil.to_signed_byte(1 + BitUtil.least_significant_bit(sub_stream_value | self._pw_max_mask))
351 | 
352 |         # Short-circuit if the register is being set to zero, since algorithmically
353 |         # this corresponds to an "unset" register, and "unset" registers aren't
354 |         # stored to save memory. (The very reason this sparse implementation
355 |         # exists.) If a register is set to zero it will break the algorithm_cardinality
356 |         # code.
357 |         if p_w == 0:
358 |             return
359 | 
360 |         # NOTE:  no +1 as in paper since 0-based indexing
361 |         j = int(raw_value & self._m_bits_mask)
362 | 
363 |         self._probabilistic_storage.set_max_register(j, p_w)
364 | 
365 |     def _initialize_storage(self, type):
366 |         """
367 |         Initializes storage for the specified ``HLLType`` and changes the
368 |         instance's ``type``.
369 | 
370 |         :param HLLType type: the ``HLLType`` to initialize storage for. This cannot be
371 |                ``None`` and must be an instantiable type. (For instance,
372 |                it cannot be ``HLLType.UNDEFINED``.)
373 |         :rtype: void
374 |         """
375 |         self._type = type
376 |         if type == HLLType.EMPTY:
377 |             # nothing to be done
378 |             pass
379 |         elif type == HLLType.EXPLICIT:
380 |             self._explicit_storage = set()
381 |         elif type == HLLType.SPARSE:
382 |             self._sparse_probabilistic_storage = dict()
383 |         elif type == HLLType.FULL:
384 |             self._probabilistic_storage = BitVector(self._regwidth, self._m)
385 |         else:
386 |             raise Exception("Unsupported HLL type: {}".format(self._type))
387 | 
388 |     def cardinality(self):
389 |         """
390 |         Computes the cardinality of the HLL.
391 | 
392 |         :returns: the cardinality of HLL. This will never be negative.
393 |         :rtype: long
394 |         """
395 |         if self._type == HLLType.EMPTY:
396 |             return 0  # by definition
397 |         elif self._type == HLLType.EXPLICIT:
398 |             return len(self._explicit_storage)
399 |         elif self._type == HLLType.SPARSE:
400 |             return ceil(self._sparse_probabilistic_algorithm_cardinality())
401 |         elif self._type == HLLType.FULL:
402 |             return ceil(self._full_probabilistic_algorithm_cardinality())
403 |         else:
404 |             raise Exception("Unsupported HLL type: {}".format(self._type))
405 | 
406 |     def _sparse_probabilistic_algorithm_cardinality(self):
407 |         """
408 |         Computes the exact cardinality value returned by the HLL algorithm when
409 |         represented as a ``HLLType.SPARSE`` HLL. Kept
410 |         separate from ``cardinality()`` for testing purposes. ``type``
411 |         must be ``HLLType.SPARSE``.
412 | 
413 |         :returns: the exact, unrounded cardinality given by the HLL algorithm
414 |         :rtype: float
415 |         """
416 |         from python_hll.hllutil import HLLUtil
417 |         m = self._m
418 | 
419 |         # compute the "indicator function" -- sum(2^(-M[j])) where M[j] is the
420 |         # 'j'th register value
421 |         indicator_function = 0.0
422 |         number_of_zeroes = 0  # "V" in the paper
423 |         for j in range(m):
424 |             register = self._sparse_probabilistic_storage.get(j, 0)
425 | 
426 |             indicator_function += 1.0 / BitUtil.left_shift_long(1, register)
427 |             if register == 0:
428 |                 number_of_zeroes += 1
429 | 
430 |         # apply the estimate and correction to the indicator function
431 |         estimator = self._alpha_m_squared / indicator_function
432 |         if number_of_zeroes != 0 and estimator < self._small_estimator_cutoff:
433 |             return HLLUtil.small_estimator(m, number_of_zeroes)
434 |         elif estimator <= self._large_estimator_cutoff:
435 |             return estimator
436 |         else:
437 |             return HLLUtil.large_estimator(self._log2m, self._regwidth, estimator)
438 | 
439 |     def _full_probabilistic_algorithm_cardinality(self):
440 |         """
441 |         Computes the exact cardinality value returned by the HLL algorithm when
442 |         represented as a ``HLLType.FULL`` HLL. Kept separate from ``cardinality()`` for testing purposes.
443 |         type must be ``HLLType.FULL``.
444 | 
445 |         :rtype: float
446 |         """
447 |         from python_hll.hllutil import HLLUtil
448 |         # for performance
449 |         m = self._m
450 |         # compute the "indicator function" -- sum(2^(-M[j])) where M[j] is the
451 |         # 'j'th register value
452 |         sum = 0
453 |         number_of_zeroes = 0  # "V" in the paper
454 |         iterator = self._probabilistic_storage.register_iterator()
455 |         for register in iterator:
456 |             sum += 1.0 / BitUtil.left_shift_long(1, register)
457 |             if register == 0:
458 |                 number_of_zeroes += 1
459 |         # apply the estimate and correction to the indicator function
460 |         estimator = self._alpha_m_squared / sum
461 |         if number_of_zeroes != 0 and (estimator < self._small_estimator_cutoff):
462 |             return HLLUtil.small_estimator(m, number_of_zeroes)
463 |         elif estimator <= self._large_estimator_cutoff:
464 |             return estimator
465 |         else:
466 |             return HLLUtil.large_estimator(self._log2m, self._regwidth, estimator)
467 | 
468 |     def clear(self):
469 |         """
470 |         Clears the HLL. The HLL will have cardinality zero and will act as if no
471 |         elements have been added.
472 | 
473 |         NOTE: Unlike ``addRaw(long)``, ``clear`` does NOT handle
474 |         transitions between ``HLLType``'s - a probabilistic type will remain
475 |         probabilistic after being cleared.
476 | 
477 |         :rtype: void
478 |         """
479 |         if self._type == HLLType.EMPTY:
480 |             return  # do nothing
481 |         elif self._type == HLLType.EXPLICIT:
482 |             return self._explicit_storage.clear()
483 |         elif self._type == HLLType.SPARSE:
484 |             return self._sparse_probabilistic_storage.clear()
485 |         elif self._type == HLLType.FULL:
486 |             self._probabilistic_storage.fill(0)
487 |             return
488 |         else:
489 |             raise Exception('Unsupported HLL type: {}'.format(self._type))
490 | 
491 |     def union(self, other):
492 |         """
493 |         Computes the union of HLLs and stores the result in this instance.
494 | 
495 |         :param HLL other: the other ``HLL`` instance to union into this one. This
496 |                cannot be ``None``.
497 |         :rtype: void
498 |         """
499 |         # TODO: verify HLL compatibility
500 |         other_type = other.get_type()
501 | 
502 |         if self._type == other_type:
503 |             self._homogeneous_union(other)
504 |         else:
505 |             self._heterogenous_union(other)
506 | 
507 |     def _heterogeneous_union_for_empty_hll(self, other):
508 |         # The union of empty with non-empty HLL is just a clone of the non-empty.
509 | 
510 |         if other.get_type() == HLLType.EXPLICIT:
511 |             # src: EXPLICIT
512 |             # dest: EMPTY
513 | 
514 |             if len(other._explicit_storage) <= self._explicit_threshold:
515 |                 self._type = HLLType.EXPLICIT
516 |                 self._explicit_storage = deepcopy(other._explicit_storage)
517 |             else:
518 |                 if not self._sparse_off:
519 |                     self._initialize_storage(HLLType.SPARSE)
520 |                 else:
521 |                     self._initialize_storage(HLLType.FULL)
522 | 
523 |                 for value in other._explicit_storage:
524 |                     self.add_raw(value)
525 | 
526 |         elif other.get_type() == HLLType.SPARSE:
527 |             # src: SPARSE
528 |             # dest: EMPTY
529 | 
530 |             if not self._sparse_off:
531 |                 self._type = HLLType.SPARSE
532 |                 self._sparse_probabilistic_storage = deepcopy(other._sparse_probabilistic_storage)
533 |             else:
534 |                 self._initialize_storage(HLLType.FULL)
535 |                 for register_index in other._sparse_probabilistic_storage.keys():
536 |                     register_value = other._sparse_probabilistic_storage.get(register_index)
537 |                     self._probabilistic_storage.set_max_register(register_index, register_value)
538 |             return
539 | 
540 |         else:  # case FULL
541 |             # src: FULL
542 |             # dest: EMPTY
543 |             self._type = HLLType.FULL
544 |             self._probabilistic_storage = deepcopy(other._probabilistic_storage)
545 |             return
546 | 
547 |     def _heterogeneous_union_for_non_empty_hll(self, other):
548 |         if self._type == HLLType.EXPLICIT:
549 |             # src:  FULL/SPARSE
550 |             # dest: EXPLICIT
551 |             # "Storing into destination" cannot be done (since destination
552 |             # is by definition of smaller capacity than source), so a clone
553 |             # of source is made and values from destination are inserted
554 |             # into that.
555 | 
556 |             # Determine source and destination storage.
557 |             # NOTE:  destination storage may change through promotion if
558 |             #        source is SPARSE.
559 | 
560 |             if other.get_type() == HLLType.SPARSE:
561 |                 if not self._sparse_off:
562 |                     self._type = HLLType.SPARSE
563 |                     self._sparse_probabilistic_storage = deepcopy(other._sparse_probabilistic_storage)
564 |                 else:
565 |                     self._initialize_storage(HLLType.FULL)
566 |                     for register_index in other._sparse_probabilistic_storage.keys():
567 |                         register_value = other._sparse_probabilistic_storage.get(register_index)
568 |                         self._probabilistic_storage.set_max_register(register_index, register_value)
569 | 
570 |             else:  # source is HLLType.FULL
571 |                 self._type = HLLType.FULL
572 |                 self._probabilistic_storage = deepcopy(other._probabilistic_storage)
573 | 
574 |             for value in self._explicit_storage:
575 |                 self.add_raw(value)
576 |             self._explicit_storage = None
577 |             return
578 | 
579 |         elif self._type == HLLType.SPARSE:
580 |             if other.get_type() == HLLType.EXPLICIT:
581 |                 # src: EXPLICIT
582 |                 # dest: SPARSE
583 |                 # Add the raw values from the source to the destination.
584 | 
585 |                 for value in other._explicit_storage:
586 |                     # NOTE: add_raw will handle promotion cleanup
587 |                     self.add_raw(value)
588 | 
589 |             else:  # source is HLLType.FULL
590 |                 # src:  FULL
591 |                 # dest: SPARSE
592 |                 # "Storing into destination" cannot be done (since destination
593 |                 # is by definition of smaller capacity than source), so a
594 |                 # clone of source is made and registers from the destination
595 |                 # are merged into the clone.
596 | 
597 |                 self._type = HLLType.FULL
598 |                 self._probabilistic_storage = deepcopy(other._probabilistic_storage)
599 |                 for register_index in self._sparse_probabilistic_storage.keys():
600 |                     register_value = self._sparse_probabilistic_storage.get(register_index, 0)
601 |                     self._probabilistic_storage.set_max_register(register_index, register_value)
602 |                 self._sparse_probabilistic_storage = None
603 | 
604 |         else:  # destination is HLLType.FULL
605 |             if other._type == HLLType.EXPLICIT:
606 |                 # src: EXPLICIT
607 |                 # dest: FULL
608 |                 # Add the raw values from the source to the destination.
609 |                 # Promotion is not possible, so don't bother checking.
610 | 
611 |                 for value in other._explicit_storage:
612 |                     self.add_raw(value)
613 | 
614 |             else:  # source is HLLType.SPARSE
615 |                 # src: SPARSE
616 |                 # dest: FULL
617 |                 # Merge the registers from the source into the destination.
618 |                 # Promotion is not possible, so don't bother checking.
619 | 
620 |                 for register_index in other._sparse_probabilistic_storage.keys():
621 |                     register_value = other._sparse_probabilistic_storage.get(register_index)
622 |                     self._probabilistic_storage.set_max_register(register_index, register_value)
623 | 
624 |     def _heterogenous_union(self, other):
625 |         """
626 |         The logic here is divided into two sections: unions with an EMPTY
627 |         HLL, and unions between EXPLICIT/SPARSE/FULL HLL.
628 | 
629 |         Between those two sections, all possible heterogeneous unions are
630 |         covered. Should another type be added to HLLType whose unions
631 |         are not easily reduced (say, as EMPTY's are below) this may be more
632 |         easily implemented as Strategies. However, that is unnecessary as it
633 |         stands.
634 |         :type other: HLL
635 |         :rtype: void
636 |         """
637 | 
638 |         # Union with an EMPTY
639 |         if self._type == HLLType.EMPTY:
640 |             self._heterogeneous_union_for_empty_hll(other)
641 |             return
642 |         elif other.get_type() == HLLType.EMPTY:
643 |             # source is empty, so just return destination since it is unchanged
644 |             return
645 | 
646 |         # else -- both of the sets are not empty
647 |         self._heterogeneous_union_for_non_empty_hll(other)
648 | 
649 |     def _homogeneous_union(self, other):
650 |         """
651 |         Computes the union of two HLLs of the same type, and stores the
652 |         result in this instance.
653 | 
654 |         :param HLL other: the other ``HLL`` instance to union into this one. This
655 |                cannot be ``None``.
656 |         :rtype: void
657 |         """
658 |         if self._type == HLLType.EMPTY:
659 |             # union of empty and empty is empty
660 |             return
661 | 
662 |         elif self._type == HLLType.EXPLICIT:
663 |             for value in other._explicit_storage:
664 |                 # Note: add_raw() will handle promotion, if necessary
665 |                 self.add_raw(value)
666 | 
667 |         elif self._type == HLLType.SPARSE:
668 | 
669 |             for register_index in other._sparse_probabilistic_storage.keys():
670 |                 register_value = other._sparse_probabilistic_storage.get(register_index)
671 |                 current_register_value = self._sparse_probabilistic_storage.get(register_index, 0)
672 |                 if register_value > current_register_value:
673 |                     self._sparse_probabilistic_storage[register_index] = register_value
674 | 
675 |             # promotion, if necessary
676 |             if len(self._sparse_probabilistic_storage) > self._sparse_threshold:
677 |                 self._initialize_storage(HLLType.FULL)
678 |                 for register_index in self._sparse_probabilistic_storage.keys():
679 |                     register_value = self._sparse_probabilistic_storage.get(register_index, 0)
680 |                     self._probabilistic_storage.set_max_register(register_index, register_value)
681 | 
682 |                 self._sparse_probabilistic_storage = None
683 | 
684 |         elif self._type == HLLType.FULL:
685 |             for i in range(self._m):
686 |                 register_value = other._probabilistic_storage.get_register(i)
687 |                 self._probabilistic_storage.set_max_register(i, register_value)
688 |             return
689 | 
690 |         else:
691 |             raise Exception('Unsupported HLL type: {}'.format(self._type))
692 | 
693 |     def to_bytes(self, schema_version=SerializationUtil.DEFAULT_SCHEMA_VERSION):
694 |         """
695 |         Serializes the HLL to an array of bytes in correspondence with the format
696 |         of the default schema version, ``SerializationUtil.DEFAULT_SCHEMA_VERSION``.
697 | 
698 |         :param SchemaVersion schema_version: the schema version dictating the serialization format
699 |         :returns: the array of bytes representing the HLL. This will never be
700 |                   ``None`` or empty.
701 |         :rtype: list
702 |         """
703 |         from python_hll.hllutil import HLLUtil
704 |         if self._type == HLLType.EMPTY:
705 |             byte_array_length = schema_version.padding_bytes(self._type)
706 |             byte_array = [0] * byte_array_length
707 | 
708 |         elif self._type == HLLType.EXPLICIT:
709 |             serializer = schema_version.get_serializer(
710 |                 self._type,
711 |                 HLLUtil.LONG_BIT_LENGTH,
712 |                 len(self._explicit_storage)
713 |             )
714 | 
715 |             values = list(self._explicit_storage)
716 |             values = sorted(values)
717 |             for value in values:
718 |                 serializer.write_word(value)
719 | 
720 |             byte_array = serializer.get_bytes()
721 | 
722 |         elif self._type == HLLType.SPARSE:
723 |             serializer = schema_version.get_serializer(
724 |                 self._type,
725 |                 self._short_word_length,
726 |                 len(self._sparse_probabilistic_storage)
727 |             )
728 | 
729 |             indices = self._sparse_probabilistic_storage.keys()
730 |             indices = sorted(indices)
731 | 
732 |             for register_index in indices:
733 |                 register_value = self._sparse_probabilistic_storage.get(register_index, 0)
734 | 
735 |                 # pack index and value into "short word"
736 |                 short_word = BitUtil.left_shift_int(register_index, self._regwidth) | register_value
737 |                 serializer.write_word(short_word)
738 | 
739 |             byte_array = serializer.get_bytes()
740 | 
741 |         elif self._type == HLLType.FULL:
742 |             serializer = schema_version.get_serializer(self._type, self._regwidth, self._m)
743 |             self._probabilistic_storage.get_register_contents(serializer)
744 | 
745 |             byte_array = serializer.get_bytes()
746 | 
747 |         else:
748 |             raise Exception('Unsupported HLL type: {}'.format(self._type))
749 | 
750 |         # no use of it if any _explicit_off or _explicit_auto is true
751 |         log2_explicit_threshold = 0
752 |         if not self._explicit_auto | self._explicit_off:
753 |             log2_explicit_threshold = int(NumberUtil.log2(self._explicit_threshold))
754 | 
755 |         metadata = HLLMetadata(
756 |             schema_version.schema_version_number(),
757 |             self._type,
758 |             self._log2m,
759 |             self._regwidth,
760 |             log2_explicit_threshold,
761 |             self._explicit_off,
762 |             self._explicit_auto,
763 |             not self._sparse_off
764 |         )
765 |         schema_version.write_metadata(byte_array, metadata)
766 | 
767 |         return byte_array
768 | 
769 |     @classmethod
770 |     def from_bytes(cls, bytes):
771 |         """
772 |         Deserializes the HLL (in ``toBytes()`` format) serialized
773 |         into ``bytes``.
774 | 
775 |         :param list bytes: the serialized bytes of new HLL
776 |         :returns: the deserialized HLL. This will never be ``None``.
777 |         :rtype: HLL
778 |         """
779 |         from python_hll.hllutil import HLLUtil
780 |         schema_version = SerializationUtil.get_schema_version(bytes)
781 |         metadata = schema_version.read_metadata(bytes)
782 | 
783 |         type = metadata.hll_type()
784 |         reg_width = metadata.register_width()
785 |         log_2m = metadata.register_count_log2()
786 |         sparseon = metadata.sparse_enabled()
787 | 
788 |         expthresh = 0
789 |         if metadata.explicit_auto():
790 |             expthresh = -1
791 |         elif metadata.explicit_off():
792 |             expthresh = 0
793 |         else:
794 |             # NOTE: take into account that the postgres-compatible constructor
795 |             # subtracts one before taking a power of two.
796 |             expthresh = metadata.log2_explicit_cutoff() + 1
797 | 
798 |         hll = HLL(log_2m, reg_width, expthresh, sparseon, type)
799 | 
800 |         # Short-circuit on empty, which needs no other deserialization.
801 |         if type == HLLType.EMPTY:
802 |             return hll
803 | 
804 |         word_length = 0
805 |         if type == HLLType.EXPLICIT:
806 |             word_length = HLLUtil.LONG_BIT_LENGTH  # 64 for both java and python
807 | 
808 |         elif type == HLLType.SPARSE:
809 |             word_length = hll._short_word_length
810 | 
811 |         elif type == HLLType.FULL:
812 |             word_length = hll._regwidth
813 | 
814 |         else:
815 |             raise Exception('Unsupported HLL type: {}'.format(type))
816 | 
817 |         deserializer = schema_version.get_deserializer(type, word_length, bytes)
818 |         if type == HLLType.EXPLICIT:
819 |             # NOTE:  This should not exceed expthresh and this will always
820 |             #        be exactly the number of words that were encoded,
821 |             #        because the word length is at least a byte wide.
822 |             # SEE:   BigEndianAscendingWordDeserializer.total_word_count()
823 |             for i in range(deserializer.total_word_count()):
824 |                 hll._explicit_storage.add(deserializer.read_word())
825 | 
826 |         elif type == HLLType.SPARSE:
827 |             # NOTE:  If the short_word_length were smaller than 8 bits
828 |             #        (1 byte) there would be a possibility (because of
829 |             #        padding arithmetic) of having one or more extra
830 |             #        registers read. However, this is not relevant as the
831 |             #        extra registers will be all zeroes, which are ignored
832 |             #        in the sparse representation.
833 |             for i in range(deserializer.total_word_count()):
834 |                 short_word = deserializer.read_word()
835 | 
836 |                 register_value = BitUtil.to_signed_byte(short_word & hll._value_mask)
837 |                 # Only set non-zero registers.
838 |                 if register_value != 0:
839 |                     register_key = int(BitUtil.unsigned_right_shift_long(short_word, hll._regwidth))
840 |                     hll._sparse_probabilistic_storage[register_key] = register_value
841 | 
842 |         elif type == HLLType.FULL:
843 |             # NOTE:  Iteration is done using m (register count) and NOT
844 |             #        deserializer.total_word_count() because regwidth may be
845 |             #        less than 8 and as such the padding on the 'last' byte
846 |             #        may be larger than regwidth, causing an extra register
847 |             #        to be read.
848 |             # SEE: BigEndianAscendingWordDeserializer.total_word_count()
849 |             for i in range(hll._m):
850 |                 hll._probabilistic_storage.set_register(i, deserializer.read_word())
851 | 
852 |         else:
853 |             raise Exception('Unsupported HLL type: {}'.format(type))
854 | 
855 |         return hll
856 | 


--------------------------------------------------------------------------------
/python_hll/hlltype.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | 
 3 | 
 4 | class HLLType:
 5 |     """
 6 |     The types of algorithm/data structure that HLL can utilize. For more
 7 |     information, see the Javadoc for HLL.
 8 |     """
 9 |     EMPTY = 1
10 |     EXPLICIT = 2
11 |     SPARSE = 3
12 |     FULL = 4
13 |     UNDEFINED = 5  # used by the PostgreSQL implementation to indicate legacy/corrupt/incompatible/unknown formats
14 | 


--------------------------------------------------------------------------------
/python_hll/hllutil.py:
--------------------------------------------------------------------------------
  1 | # -*- coding: utf-8 -*-
  2 | from math import log
  3 | from python_hll.hll import HLL
  4 | from python_hll.util import NumberUtil
  5 | from python_hll.util import BitUtil
  6 | 
  7 | 
  8 | class HLLUtil:
  9 |     """
 10 |     Static functions for computing constants and parameters used in the HLL
 11 |     algorithm.
 12 |     """
 13 | 
 14 |     # The number of bits used to represent a long value in two's complement binary form
 15 |     LONG_BIT_LENGTH = 64
 16 | 
 17 |     # Precomputed ``pw_max_mask`` values indexed by ``register_size_in_bits``.
 18 |     # Calculated with this formula::
 19 |     #
 20 |     #     int max_register_value = (1 << register_size_in_bits) - 1;
 21 |     #     // Mask with all bits set except for (max_register_value - 1) least significant bits (see add_raw())
 22 |     #     return ~((1L << (max_register_value - 1)) - 1);
 23 |     #
 24 |     # See ``pw_max_mask()``
 25 | 
 26 |     PW_MASK = [
 27 |         -9223372036854775808,  # ~((1 << (((1 << 0) - 1) - 1)) - 1)
 28 |         -1,                    # ~((1 << (((1 << 1) - 1) - 1)) - 1)
 29 |         -4,                    # ~((1 << (((1 << 2) - 1) - 1)) - 1)
 30 |         -64,                   # ~((1 << (((1 << 3) - 1) - 1)) - 1)
 31 |         -16384,                # ~((1 << (((1 << 4) - 1) - 1)) - 1)
 32 |         -1073741824,           # ~((1 << (((1 << 5) - 1) - 1)) - 1)
 33 |         -4611686018427387904,  # ~((1 << (((1 << 6) - 1) - 1)) - 1)
 34 |         -4611686018427387904,  # ~((1 << (((1 << 7) - 1) - 1)) - 1)
 35 |         -4611686018427387904,  # ~((1 << (((1 << 8) - 1) - 1)) - 1)
 36 |     ]
 37 | 
 38 |     # Spacing constant used to compute offsets into ``TWO_TO_L``.
 39 |     REG_WIDTH_INDEX_MULTIPLIER = HLL.MAXIMUM_LOG2M_PARAM + 1
 40 | 
 41 |     @classmethod
 42 |     def register_bit_size(cls, expected_unique_elements):
 43 |         """
 44 |         Computes the bit-width of HLL registers necessary to estimate a set of
 45 |         the specified cardinality.
 46 | 
 47 |         :param long expected_unique_elements: an upper bound on the number of unique
 48 |                elements that are expected.  This must be greater than zero.
 49 |         :returns: a register size in bits (i.e. ``log2(log2(n))``)
 50 |         :rtype: int
 51 |         """
 52 |         return max(
 53 |             HLL.MINIMUM_REGWIDTH_PARAM,
 54 |             NumberUtil.log2(NumberUtil.log2(expected_unique_elements))
 55 |         )
 56 | 
 57 |     @classmethod
 58 |     def alpha_m_squared(cls, m):
 59 |         """
 60 |         Computes the 'alpha-m-squared' constant used by the HyperLogLog algorithm.
 61 | 
 62 |         :param int m: this must be a power of two, cannot be less than
 63 |                16 (2:sup:`4`), and cannot be greater than 65536 (2:sup:`16`).
 64 |         :returns: gamma times ``registerCount`` squared where gamma is
 65 |                   based on the value of ``registerCount``.
 66 |         :rtype: float
 67 |         """
 68 | 
 69 |         if m < 16:
 70 |             raise Exception("'m' cannot be less than 16 ({m} < 16).".format(m=m))
 71 | 
 72 |         elif m == 16:
 73 |             return 0.673 * m * m
 74 | 
 75 |         elif m == 32:
 76 |             return 0.697 * m * m
 77 | 
 78 |         elif m == 64:
 79 |             return 0.709 * m * m
 80 | 
 81 |         else:
 82 |             return (0.7213 / (1.0 + 1.079 / m)) * m * m
 83 | 
 84 |     @classmethod
 85 |     def pw_max_mask(cls, register_size_in_bits):
 86 |         """
 87 |         Computes a mask that prevents overflow of HyperLogLog registers.
 88 | 
 89 |         :param int register_size_in_bits: the size of the HLL registers, in bits.
 90 |         :returns: mask a ``long`` mask to prevent overflow of the registers
 91 |         :rtype: long
 92 |         """
 93 |         return cls.PW_MASK[register_size_in_bits]
 94 | 
 95 |     @classmethod
 96 |     def small_estimator_cutoff(cls, m):
 97 |         """
 98 |         The cutoff for using the "small range correction" formula, in the
 99 |         HyperLogLog algorithm.
100 | 
101 |         :param int m: the number of registers in the HLL. <em>m<em> in the paper.
102 |         :returns: the cutoff for the small range correction.
103 |         :rtype: float
104 |         """
105 |         return (float(m) * 5) / 2
106 | 
107 |     @classmethod
108 |     def small_estimator(cls, m, number_of_zeroes):
109 |         """
110 |         The "small range correction" formula from the HyperLogLog algorithm. Only
111 |         appropriate if both the estimator is smaller than ``(5/2) * m`` and
112 |         there are still registers that have the zero value.
113 | 
114 |         :param int m: the number of registers in the HLL. <em>m<em> in the paper.
115 |         :param int number_of_zeroes: the number of registers with value zero. ``V``
116 |                in the paper.
117 |         :returns: a corrected cardinality estimate.
118 |         :rtype: float
119 |         """
120 |         return m * log(float(m) / number_of_zeroes)
121 | 
122 |     @classmethod
123 |     def large_estimator_cutoff(cls, log2m, register_size_in_bits):
124 |         """
125 |         The cutoff for using the "large range correction" formula, from the
126 |         HyperLogLog algorithm, adapted for 64 bit hashes.
127 | 
128 |         See `Blog post with section on 64 bit hashes and "large range correction" cutoff <http://research.neustar.biz/2013/01/24/hyperloglog-googles-take-on-engineering-hll/>`_.
129 | 
130 |         :param int log2m: log-base-2 of the number of registers in the HLL. <em>b<em> in the paper.
131 |         :param int register_size_in_bits: the size of the HLL registers, in bits.
132 |         :returns: the cutoff for the large range correction.
133 |         :rtype: float
134 |         """
135 |         return TWO_TO_L[
136 |             (cls.REG_WIDTH_INDEX_MULTIPLIER * register_size_in_bits) + log2m
137 |         ] / 30.0
138 | 
139 |     @classmethod
140 |     def large_estimator(cls, log2m, register_size_in_bits, estimator):
141 |         """
142 |         The "large range correction" formula from the HyperLogLog algorithm, adapted
143 |         for 64 bit hashes. Only appropriate for estimators whose value exceeds
144 |         the return of ``largeEstimatorCutoff()``.
145 | 
146 |         See `Blog post with section on 64 bit hashes and "large range correction" cutoff <http://research.neustar.biz/2013/01/24/hyperloglog-googles-take-on-engineering-hll/>`_.
147 | 
148 |         :param int log2m: log-base-2 of the number of registers in the HLL. <em>b<em> in the paper.
149 |         :param int register_size_in_bits: the size of the HLL registers, in bits.
150 |         :param float estimator: the original estimator ("E" in the paper).
151 |         :returns: a corrected cardinality estimate.
152 |         :rtype: float
153 |         """
154 |         two_to_l = TWO_TO_L[(cls.REG_WIDTH_INDEX_MULTIPLIER * register_size_in_bits) + log2m]
155 |         try:
156 |             return -1 * two_to_l * log(1.0 - (estimator/two_to_l))
157 |         except ValueError:
158 |             return 0
159 | 
160 | 
# Lookup table of precomputed ``twoToL`` values.
#
# The table is flat (one-dimensional); the entry for a given ``regwidth`` and
# ``log2m`` lives at index
# ``(REG_WIDTH_INDEX_MULTIPLIER * regwidth) + log2m``.
# Only combinations within the
# ``HLL.{MINIMUM,MAXIMUM}_{REGWIDTH,LOG2M}_PARAM`` bounds are filled in; all
# other slots keep their initial ``0.0``.
#
# See ``large_estimator()``.
# See ``large_estimator_cutoff()``.
# See `Blog post with section on 2^L
# <http://research.neustar.biz/2013/01/24/hyperloglog-googles-take-on-engineering-hll/>`_
TWO_TO_L = [0.0] * (HLL.MAXIMUM_REGWIDTH_PARAM + 1) * (HLL.MAXIMUM_LOG2M_PARAM + 1)
for reg_width in range(HLL.MINIMUM_REGWIDTH_PARAM, HLL.MAXIMUM_REGWIDTH_PARAM+1):
    # Since 1 is added to p(w) in the insertion algorithm, only
    # (maxRegisterValue - 1) bits are inspected hence the hash
    # space is one power of two smaller.
    pw_bits = (BitUtil.left_shift_int(1, reg_width) - 1) - 1
    for log2m in range(HLL.MINIMUM_LOG2M_PARAM, HLL.MAXIMUM_LOG2M_PARAM+1):
        TWO_TO_L[(HLLUtil.REG_WIDTH_INDEX_MULTIPLIER * reg_width) + log2m] = 2**(pw_bits + log2m)
185 | 


--------------------------------------------------------------------------------
/python_hll/serialization.py:
--------------------------------------------------------------------------------
  1 | # -*- coding: utf-8 -*-
  2 | from __future__ import division
  3 | from python_hll.hlltype import HLLType
  4 | from python_hll.util import BitUtil
  5 | 
  6 | 
  7 | class BigEndianAscendingWordDeserializer:
  8 |     """
  9 |     A corresponding deserializer for BigEndianAscendingWordSerializer.
 10 |     """
 11 | 
 12 |     # The number of bits per byte.
 13 |     BITS_PER_BYTE = 8
 14 | 
 15 |     # long mask for the maximum value stored in a byte
 16 |     BYTE_MASK = BitUtil.left_shift_long(1, BITS_PER_BYTE) - 1
 17 | 
 18 |     # :var int word_length: The length in bits of the words to be read.
 19 |     # :var list bytes: The byte array to which the words are serialized.
 20 |     # :var int byte_padding: The number of leading padding bytes in 'bytes' to be ignored.
 21 |     # :var int word_count: The number of words that the byte array contains.
 22 |     # :var int current_word_index: The current read state.
 23 | 
 24 |     def __init__(self, word_length, byte_padding, bytes):
 25 |         """
 26 |         :param int word_length: the length in bits of the words to be deserialized. Must
 27 |             be less than or equal to 64 and greater than or equal to 1.
 28 |         :param int byte_padding: the number of leading bytes that pad the serialized words.
 29 |         :param list bytes: the byte array containing the serialized words. Cannot be ``None``.
 30 |         """
 31 |         if not 1 <= word_length <= 64:
 32 |             raise ValueError("Word length must be >= 1 and <= 64. (was: {word_length})".format(word_length=word_length))
 33 | 
 34 |         if byte_padding < 0:
 35 |             raise ValueError("Byte padding must be >= zero. (was: {byte_padding})".format(byte_padding=byte_padding))
 36 | 
 37 |         self._word_length = word_length
 38 |         self._bytes = bytes
 39 |         self._byte_padding = byte_padding
 40 | 
 41 |         self.data_bytes = (len(bytes) - byte_padding)
 42 |         self.data_bits = self.data_bytes * self.BITS_PER_BYTE
 43 | 
 44 |         self.word_count = int(self.data_bits/self._word_length)
 45 | 
 46 |         self.current_word_index = 0
 47 | 
 48 |     def read_word(self):
 49 |         """
 50 |         Return the next word in the sequence. Should not be called more than ``total_word_count`` times.
 51 | 
 52 |         :rtype: long
 53 |         """
 54 |         word = self._read_word(self.current_word_index)
 55 |         self.current_word_index += 1
 56 |         return word
 57 | 
 58 |     def _read_word(self, position):
 59 |         """
 60 |         Reads the word at the specific sequence position (zero-indexed).
 61 | 
 62 |         :param int position: the zero-indexed position of the word to be read. This must be greater
 63 |             than or equal to zero.
 64 |         :returns: the value of the serialized word at the specific position.
 65 |         :rtype: long
 66 |         """
 67 |         if position < 0:
 68 |             raise ValueError("Array index out of bounds for {position}".format(position=position))
 69 | 
 70 |         # First bit of the word
 71 |         first_bit_index = (position * self._word_length)
 72 |         first_byte_index = (self._byte_padding + int(first_bit_index / self.BITS_PER_BYTE))
 73 |         first_byte_skip_bits = int(first_bit_index % self.BITS_PER_BYTE)
 74 | 
 75 |         # Last bit of the word
 76 |         last_bit_index = (first_bit_index + self._word_length - 1)
 77 |         last_byte_index = (self._byte_padding + int(last_bit_index / self.BITS_PER_BYTE))
 78 | 
 79 |         bits_after_byte_boundary = int((last_bit_index + 1) % self.BITS_PER_BYTE)
 80 | 
 81 |         # If the word terminates at the end of the last byte,, consume the whole
 82 |         # last byte.
 83 |         if bits_after_byte_boundary == 0:
 84 |             last_byte_bits_to_consume = self.BITS_PER_BYTE
 85 |         else:
 86 |             # Otherwise, only consume what is necessary.
 87 |             last_byte_bits_to_consume = bits_after_byte_boundary
 88 | 
 89 |         if last_byte_index >= len(self._bytes):
 90 |             raise ValueError("Word out of bounds of backing array, {} >= {}".format(last_byte_index, len(self._bytes)))
 91 | 
 92 |         # Accumulator
 93 |         value = 0
 94 | 
 95 |         # -------------------------------------------------------------------
 96 |         # First byte
 97 |         bits_remaining_in_first_byte = (self.BITS_PER_BYTE - first_byte_skip_bits)
 98 |         bits_to_consume_in_first_byte = min(bits_remaining_in_first_byte, self._word_length)
 99 |         first_byte = self._bytes[first_byte_index]
100 | 
101 |         # Mask off the bits to skip in the first byte.
102 |         first_byte_mask = (BitUtil.left_shift_long(1, bits_remaining_in_first_byte) - 1)
103 |         first_byte &= first_byte_mask
104 | 
105 |         # Right-align relevant bits of first byte.
106 |         first_byte = BitUtil.unsigned_right_shift_long(
107 |             first_byte,
108 |             bits_remaining_in_first_byte - bits_to_consume_in_first_byte
109 |         )
110 | 
111 |         value |= first_byte
112 | 
113 |         # If the first byte contains the whold word, short-circuit.
114 |         if first_byte_index == last_byte_index:
115 |             return value
116 | 
117 |         # -------------------------------------------------------------
118 |         # Middle bytes
119 |         middle_byte_count = int(last_byte_index - first_byte_index - 1)
120 |         for i in range(middle_byte_count):
121 |             middle_byte = self._bytes[first_byte_index + i + 1] & self.BYTE_MASK
122 |             # Push middle byte onto accumulator.
123 |             value = BitUtil.left_shift_long(value, self.BITS_PER_BYTE)
124 |             value |= middle_byte
125 | 
126 |         # --------------------------------------------------
127 |         # Last byte
128 |         last_byte = (self._bytes[last_byte_index] & self.BYTE_MASK)
129 |         last_byte >>= self.BITS_PER_BYTE - last_byte_bits_to_consume
130 |         value = BitUtil.left_shift_long(value, last_byte_bits_to_consume)
131 |         value |= last_byte
132 |         return value
133 | 
134 |     def total_word_count(self):
135 |         """
136 |         Returns the number of words that could be encoded in the sequence.
137 | 
138 |         NOTE:  the sequence that was encoded may be shorter than the value this
139 |                method returns due to padding issues within bytes. This guarantees
140 |                only an upper bound on the number of times ``readWord()``
141 |                can be called.
142 | 
143 |         :returns: the maximum number of words that could be read from the sequence.
144 |         :rtype: int
145 |         """
146 |         return self.word_count
147 | 
148 | 
class BigEndianAscendingWordSerializer:
    """
    A serializer that writes a sequence of fixed bit-width 'words' to a byte array.
    Bitwise OR is used to write words into bytes, so a low bit in a word is also
    a low bit in a byte. However, a high byte in a word is written at a lower index
    in the array than a low byte in a word. The first word is written at the lowest
    array index. Each serializer is one time use and returns its backing byte
    array.

    This encoding was chosen so that when reading bytes as octets in the typical
    first-octet-is-the-high-nibble fashion, an octet-to-binary conversion
    would yield a high-to-low, left-to-right view of the "short words".

    Example:

    Say short words are 5 bits wide. Our word sequence is the values
    ``[31, 1, 5]``. In big-endian binary format, the values are
    ``[0b11111, 0b00001, 0b00101]``. We use 15 of 16 bits in two bytes
    and pad the last (lowest) bit of the last byte with a zero::

        [0b11111000, 0b01001010] = [0xF8, 0x4A]
    """

    # The number of bits per byte.
    BITS_PER_BYTE = 8

    # :var int _bits_left_in_byte: Number of bits that remain writable in the current byte.
    # :var int _byte_index: Index of byte currently being written to.
    # :var int _words_written: Number of words written.

    def __init__(self, word_length, word_count, byte_padding):
        """
        :param int word_length: the length in bits of the words to be serialized. Must
               be greater than or equal to 1 and less than or equal to 64.
        :param int word_count: the number of words to be serialized. Must be greater than
               or equal to zero.
        :param int byte_padding: the number of leading bytes that should pad the
               serialized words. Must be greater than or equal to zero.
        :raises ValueError: if any argument is outside its permitted range.
        """
        if (word_length < 1) or (word_length > 64):
            raise ValueError('Word length must be >= 1 and <= 64. (was: {})'.format(word_length))
        if word_count < 0:
            raise ValueError('Word count must be >= 0. (was: {})'.format(word_count))
        if byte_padding < 0:
            raise ValueError('Byte padding must be >= 0. (was: {})'.format(byte_padding))

        self._word_length = word_length
        self._word_count = word_count

        # Round the payload up to a whole number of bytes (integer ceiling
        # division), then prepend the padding bytes.
        bits_required = word_length * word_count
        data_bytes_required = (bits_required + self.BITS_PER_BYTE - 1) // self.BITS_PER_BYTE
        self._bytes = [0] * int(data_bytes_required + byte_padding)

        # Writing starts in the first byte after the padding.
        self._bits_left_in_byte = self.BITS_PER_BYTE
        self._byte_index = byte_padding
        self._words_written = 0

    def write_word(self, word):
        """
        Writes the word to the backing array.

        :param long word: the word to write.
        :rtype: void
        :raises ValueError: if ``word_count`` words have already been written.
        """
        if self._words_written == self._word_count:
            raise ValueError('Cannot write more words, backing array full!')

        bits_left_in_word = self._word_length

        while bits_left_in_word > 0:
            # Move to the next byte if the current one is fully packed.
            if self._bits_left_in_byte == 0:
                self._byte_index += 1
                self._bits_left_in_byte = self.BITS_PER_BYTE

            # Mask selecting the bits of the word not yet written. A full
            # 64-bit word needs all bits set, which a 64-bit shift-and-
            # subtract is not used to produce.
            consumed_mask = ~0 if bits_left_in_word == 64 else (BitUtil.left_shift_long(1, bits_left_in_word) - 1)

            # Fix how many bits will be written in this cycle. Choose the
            # smaller of the remaining bits in the word or byte.
            number_of_bits_to_write = min(self._bits_left_in_byte, bits_left_in_word)
            bits_in_byte_remaining_after_write = self._bits_left_in_byte - number_of_bits_to_write

            # In general, we write the highest bits of the word first, so we
            # strip the highest bits that were consumed in previous cycles.
            remaining_bits_of_word_to_write = (word & consumed_mask)

            if bits_left_in_word > number_of_bits_to_write:
                # More is left in the word than fits into this byte: shift
                # the bits that can't be written now off the bottom. In this
                # branch number_of_bits_to_write equals the bits left in the
                # byte.
                bits_that_the_byte_can_accept = BitUtil.unsigned_right_shift_long(
                    remaining_bits_of_word_to_write,
                    bits_left_in_word - number_of_bits_to_write
                )
            else:
                # The byte can accept all remaining bits; nothing to shift off.
                bits_that_the_byte_can_accept = remaining_bits_of_word_to_write

            # Align the word bits to write up against the byte bits that have
            # already been written. This shift may do nothing if the remainder
            # of the byte is being consumed in this cycle.
            aligned_bits = BitUtil.left_shift_long(bits_that_the_byte_can_accept, bits_in_byte_remaining_after_write)

            # Update the byte with the aligned bits.
            self._bytes[self._byte_index] |= BitUtil.to_signed_byte(aligned_bits)

            # Update state with bit count written.
            bits_left_in_word -= number_of_bits_to_write
            self._bits_left_in_byte = bits_in_byte_remaining_after_write

        self._words_written += 1

    def get_bytes(self):
        """
        Returns the backing array of ``byte``'s that contain the serialized words.

        :returns: the serialized words as a list of bytes.
        :rtype: list
        :raises ValueError: if fewer than ``word_count`` words have been written.
        """
        if self._words_written < self._word_count:
            raise ValueError('Not all words have been written! ({}/{})'.format(self._words_written, self._word_count))
        return self._bytes
276 | 
277 | 
class HLLMetadata:
    """
    Value holder for the metadata and parameters that describe a HLL.

    The constructor stores its arguments as given (no validation is performed
    here); each accessor method returns the corresponding stored value.
    """

    def __init__(self, schema_version, type, register_count_log2, register_width, log2_explicit_cutoff, explicit_off, explicit_auto, sparse_enabled):
        """
        :param int schema_version: the schema version number of the HLL.
        :param HLLType type: the type of the HLL.
        :param int register_count_log2: the log-base-2 of the register count
            used by probabilistic HLLs.
        :param int register_width: the register width used by probabilistic
            HLLs.
        :param int log2_explicit_cutoff: the log-base-2 of the explicit
            cardinality cutoff, when one is explicitly defined. (It carries no
            meaning if ``explicit_off`` or ``explicit_auto`` is True.)
        :param boolean explicit_off: flag for 'explicit off'-mode, in which the
            ``HLLType.EXPLICIT`` representation is not used. Should not be True
            together with ``explicit_auto``.
        :param boolean explicit_auto: flag for 'explicit auto'-mode, in which
            the promotion cutoff of the ``HLLType.EXPLICIT`` representation is
            chosen automatically. Should not be True together with
            ``explicit_off``.
        :param boolean sparse_enabled: flag for 'sparse-enabled'-mode, in which
            the ``HLLType.SPARSE`` representation is used.
        """
        # Identity of the serialized form.
        self._schema_version, self._type = schema_version, type
        # Parameters of the probabilistic representations.
        self._register_count_log2, self._register_width = register_count_log2, register_width
        # Explicit-representation configuration.
        self._log2_explicit_cutoff = log2_explicit_cutoff
        self._explicit_off, self._explicit_auto = explicit_off, explicit_auto
        # Sparse-representation configuration.
        self._sparse_enabled = sparse_enabled

    def schema_version(self):
        """
        :returns: the schema version of the HLL.
        :rtype: int
        """
        return self._schema_version

    def hll_type(self):
        """
        :returns: the type of the HLL.
        :rtype: HLLType
        """
        return self._type

    def register_count_log2(self):
        """
        :returns: the log-base-2 of the register count parameter of the HLL.
        :rtype: int
        """
        return self._register_count_log2

    def register_width(self):
        """
        :returns: the register width parameter of the HLL.
        :rtype: int
        """
        return self._register_width

    def log2_explicit_cutoff(self):
        """
        :returns: the log-base-2 of the explicit cutoff cardinality.
        :rtype: int
        """
        return self._log2_explicit_cutoff

    def explicit_off(self):
        """
        :returns: True if the ``HLLType.EXPLICIT`` representation has been
                  disabled, False otherwise.
        :rtype: boolean
        """
        return self._explicit_off

    def explicit_auto(self):
        """
        :returns: True if the cutoff cardinality of the ``HLLType.EXPLICIT``
                  representation is chosen automatically, False otherwise.
        :rtype: boolean
        """
        return self._explicit_auto

    def sparse_enabled(self):
        """
        :returns: True if the ``HLLType.SPARSE`` representation is enabled.
        :rtype: boolean
        """
        return self._sparse_enabled

    def __str__(self):
        # Label/value pairs in the order they appear in the rendered text.
        # The sparse flag is not part of the string representation.
        labelled_fields = (
            ('schema_version', self._schema_version),
            ('type', self._type),
            ('register_count_log2', self._register_count_log2),
            ('register_width', self._register_width),
            ('log2_explicit_cutoff', self._log2_explicit_cutoff),
            ('explicit_off', self._explicit_off),
            ('explicit_auto', self._explicit_auto),
        )
        rendered = ', '.join('%s: %s' % pair for pair in labelled_fields)
        return '<HLLMetadata ' + rendered + '>'
379 | 
380 | 
class SchemaVersionOne:
    """
    Version one of the HLL serialization schema. Converts HLL metadata to and
    from the three header bytes of a serialized HLL and builds the matching
    word serializer/deserializer.
    """

    # The schema version number for this instance.
    SCHEMA_VERSION = 1

    # Version-specific ordinals for the HLL types: the position of a type in
    # this list is the ordinal stored in the version byte.
    TYPE_ORDINALS = [
        HLLType.UNDEFINED,
        HLLType.EMPTY,
        HLLType.EXPLICIT,
        HLLType.SPARSE,
        HLLType.FULL
    ]

    # Number of header bytes, identical for all HLL types.
    HEADER_BYTE_COUNT = 3

    # Sentinel cutoff values for 'explicit off' and 'explicit auto'.
    EXPLICIT_OFF = 0
    EXPLICIT_AUTO = 63

    def padding_bytes(self, type):
        """
        The number of metadata bytes required for a serialized HLL of the
        specified type. The same header size is returned for every type.

        :param HLLType type: the type of the serialized HLL
        :returns: the number of padding bytes needed to hold the metadata.
        :rtype: int
        """
        return self.HEADER_BYTE_COUNT

    def write_metadata(self, bytes, metadata):
        """
        Writes the three metadata bytes into the front of a serialized HLL.

        :param list bytes: the padded data bytes of the HLL; indexes 0, 1 and
               2 are overwritten.
        :param HLLMetadata metadata: the metadata to write to the padding bytes
        :rtype: void
        """
        ordinal = self._get_ordinal(metadata.hll_type())

        # Real cutoffs are stored shifted up by one so that zero stays free
        # for the 'explicit off' sentinel.
        cutoff_code = metadata.log2_explicit_cutoff() + 1
        if metadata.explicit_off():
            cutoff_code = self.EXPLICIT_OFF
        elif metadata.explicit_auto():
            cutoff_code = self.EXPLICIT_AUTO

        # Byte 0: schema version + type ordinal.
        bytes[0] = SerializationUtil.pack_version_byte(self.SCHEMA_VERSION, ordinal)

        # Byte 1: register width + log2(register count).
        width = metadata.register_width()
        count_log2 = metadata.register_count_log2()
        bytes[1] = SerializationUtil.pack_parameters_byte(width, count_log2)

        # Byte 2: sparse flag + explicit cutoff code.
        bytes[2] = SerializationUtil.pack_cutoff_byte(cutoff_code, metadata.sparse_enabled())

    def read_metadata(self, bytes):
        """
        Decodes the three metadata bytes at the front of a serialized HLL.

        When the stored cutoff is one of the 'off'/'auto' sentinels, the
        returned metadata carries ``-1`` as its ``log2_explicit_cutoff``.

        :param list bytes: the serialized HLL
        :returns: the HLL metadata
        :rtype: HLLMetadata
        """
        version_byte, parameters_byte, cutoff_byte = bytes[0], bytes[1], bytes[2]

        ordinal = SerializationUtil.type_ordinal(version_byte)
        cutoff_code = SerializationUtil.explicit_cutoff(cutoff_byte)

        is_off = (cutoff_code == self.EXPLICIT_OFF)
        is_auto = (cutoff_code == self.EXPLICIT_AUTO)
        if is_off or is_auto:
            log2_cutoff = -1
        else:
            # undo the +1 applied when the cutoff was written
            log2_cutoff = cutoff_code - 1

        hll_type = self._get_type(ordinal)
        count_log2 = SerializationUtil.register_count_log2(parameters_byte)
        width = SerializationUtil.register_width(parameters_byte)
        sparse = SerializationUtil.sparse_enabled(cutoff_byte)

        return HLLMetadata(SchemaVersionOne.SCHEMA_VERSION, hll_type, count_log2, width,
                           log2_cutoff, is_off, is_auto, sparse)

    def get_serializer(self, type, word_length, word_count):
        """
        Builds an HLL serializer that matches this schema version.

        :param HLLType type: the HLL type that will be serialized.
        :param int word_length: the length, in bits, of the 'words' that make
               up the data of the HLL.
        :param int word_count: the number of 'words' in the HLL's data.

        :returns: a serializer whose output leaves room for this schema's
                  header bytes.
        :rtype: BigEndianAscendingWordSerializer
        """
        header_size = self.padding_bytes(type)
        return BigEndianAscendingWordSerializer(word_length, word_count, header_size)

    def get_deserializer(self, type, word_length, bytes):
        """
        Builds an HLL deserializer that matches this schema version.

        :param HLLType type: the HLL type that will be deserialized.
        :param int word_length: the length, in bits, of the 'words' that make
               up the data of the serialized HLL.
        :param list bytes: the serialized HLL to deserialize.
        :returns: a deserializer configured with this schema's header size.
        :rtype: BigEndianAscendingWordDeserializer
        """
        header_size = self.padding_bytes(type)
        return BigEndianAscendingWordDeserializer(word_length, header_size, bytes)

    def schema_version_number(self):
        """
        :returns: the schema version number
        :rtype: int
        """
        return self.SCHEMA_VERSION

    @classmethod
    def _get_ordinal(cls, type):
        """
        Looks up the ordinal of the specified ``HLLType``.

        :param HLLType type: the type whose ordinal is desired
        :returns: the position of the type in ``TYPE_ORDINALS``, to be used
                  in the version byte.
        :rtype: int
        :raises ValueError: if the type is not listed in ``TYPE_ORDINALS``.
        """
        known_types = cls.TYPE_ORDINALS
        return known_types.index(type)

    @classmethod
    def _get_type(cls, ordinal):
        """
        Looks up the ``HLLType`` for the specified ordinal.

        :param int ordinal: the ordinal whose type is desired
        :returns: the type stored at that position of ``TYPE_ORDINALS``.
        :rtype: HLLType
        :raises ValueError: if the ordinal is outside the list's index range.
        """
        known_types = cls.TYPE_ORDINALS
        if not 0 <= ordinal < len(known_types):
            raise ValueError('Invalid type ordinal {}. Only 0-{} inclusive allowed'.format(
                ordinal, (len(known_types) - 1)))
        return known_types[ordinal]
524 | 
525 | 
class SerializationUtil:
    """
    Constants and helper routines used when serializing and deserializing
    HLLs: the bit layout of the three header bytes and the registry of schema
    versions.
    """

    # Bits of the parameters byte that hold the register width.
    REGISTER_WIDTH_BITS = 3

    # Mask selecting exactly REGISTER_WIDTH_BITS low bits.
    REGISTER_WIDTH_MASK = BitUtil.left_shift_int(1, REGISTER_WIDTH_BITS) - 1

    # Bits of the parameters byte that hold ``log2(register_count)``.
    LOG2_REGISTER_COUNT_BITS = 5

    # Mask selecting exactly LOG2_REGISTER_COUNT_BITS low bits.
    LOG2_REGISTER_COUNT_MASK = BitUtil.left_shift_int(1, LOG2_REGISTER_COUNT_BITS) - 1

    # Bits of the cutoff byte that hold the explicit cutoff code (the
    # log-base-2 of the cutoff, or a sentinel for 'explicit-disabled'/'auto').
    EXPLICIT_CUTOFF_BITS = 6

    # Mask selecting exactly EXPLICIT_CUTOFF_BITS low bits.
    EXPLICIT_CUTOFF_MASK = BitUtil.left_shift_int(1, EXPLICIT_CUTOFF_BITS) - 1

    # Number of bits in a nibble.
    NIBBLE_BITS = 4

    # Mask selecting one nibble.
    NIBBLE_MASK = BitUtil.left_shift_int(1, NIBBLE_BITS) - 1

    # ************************************************************************
    # Serialization utilities

    # Schema version one (v1).
    VERSION_ONE = SchemaVersionOne()

    # The default schema version for serializing HLLs.
    DEFAULT_SCHEMA_VERSION = VERSION_ONE

    # Registered schema versions, indexed by version number. A ``None`` entry
    # means that no schema is registered under that number; registering a new
    # schema version means placing its instance at the matching index.
    #
    # Only SchemaVersionOne is registered by default. Version zero is reserved
    # for internal (e.g. proprietary, legacy) schemas and is never assigned by
    # this library.
    REGISTERED_SCHEMA_VERSIONS = [None, VERSION_ONE]

    @classmethod
    def get_schema_version_from_number(cls, schema_version_number):
        """
        Looks up a registered schema version by its number.

        :param int schema_version_number: the version number of the desired
               ``SchemaVersion``. This must be a registered version number.

        :returns: the ``SchemaVersion`` registered under the given number.
        :rtype: SchemaVersion
        :raises ValueError: if the number is outside the registry's index range
                 or no schema is registered under it.
        """
        registry = cls.REGISTERED_SCHEMA_VERSIONS
        if not 0 <= schema_version_number < len(registry):
            raise ValueError('Invalid schema version number {}'.format(schema_version_number))

        found = registry[schema_version_number]
        if found is not None:
            return found
        raise ValueError('Unknown schema version number {}'.format(schema_version_number))

    @classmethod
    def get_schema_version(cls, bytes):
        """
        Determines the ``SchemaVersion`` of a serialized HLL from its first
        byte.

        :param list bytes: the serialized HLL whose schema version is desired.

        :returns: the schema version for the specified HLL.
        :rtype: SchemaVersion
        """
        number = cls.schema_version(bytes[0])
        return cls.get_schema_version_from_number(number)

    @classmethod
    def pack_version_byte(cls, schema_version, type_ordinal):
        """
        Builds the byte that encodes the schema version and the type ordinal
        of the HLL.

        The schema version goes in the top nibble and the type ordinal in the
        bottom nibble.

        :param int schema_version: the schema version to encode.
        :param int type_ordinal: the type ordinal of the HLL to encode.
        :returns: the packed version byte
        :rtype: byte
        """
        version_nibble = BitUtil.left_shift_int(cls.NIBBLE_MASK & schema_version, cls.NIBBLE_BITS)
        ordinal_nibble = cls.NIBBLE_MASK & type_ordinal
        return BitUtil.to_signed_byte(version_nibble | ordinal_nibble)

    @classmethod
    def pack_cutoff_byte(cls, explicit_cutoff, sparse_enabled):
        """
        Builds the byte that encodes the explicit cutoff code (log-base-2 of
        the cutoff, or a sentinel for 'explicit-disabled' or 'auto') together
        with the flag saying whether ``HLLType.SPARSE`` takes part in the
        promotion hierarchy.

        The top bit is left unset as padding, the second highest bit carries
        the 'sparse-enabled' flag, and the lowest six bits carry the explicit
        cutoff code.

        :param int explicit_cutoff: the explicit cutoff value to encode.
               * If 'explicit-disabled' is chosen, this value should be ``0``.
               * If a cutoff of 2:sup:`n` is desired, for ``0 <= n < 31``, this value should be ``n + 1``.
        :param boolean sparse_enabled: whether ``HLLType.SPARSE`` should be
               used in the promotion hierarchy to improve HLL storage.
        :returns: the packed cutoff byte
        :rtype: byte
        """
        if sparse_enabled:
            sparse_flag = BitUtil.left_shift_int(1, cls.EXPLICIT_CUTOFF_BITS)
        else:
            sparse_flag = 0
        cutoff_field = cls.EXPLICIT_CUTOFF_MASK & explicit_cutoff
        return BitUtil.to_signed_byte(sparse_flag | cutoff_field)

    @classmethod
    def pack_parameters_byte(cls, register_width, register_count_log2):
        """
        Builds the byte that encodes the parameters of a ``HLLType.FULL`` or
        ``HLLType.SPARSE`` HLL.

        The top 3 bits hold ``register_width - 1`` (so widths 1-8 survive a
        round trip) and the bottom 5 bits hold ``register_count_log2`` (values
        0-31).

        :param int register_width: the register width
        :param int register_count_log2: the log-base-2 of the register count
        :returns: the packed parameters byte
        :rtype: byte
        """
        width_field = (register_width - 1) & cls.REGISTER_WIDTH_MASK
        count_field = register_count_log2 & cls.LOG2_REGISTER_COUNT_MASK
        packed = BitUtil.left_shift_int(width_field, cls.LOG2_REGISTER_COUNT_BITS) | count_field
        # Both fields are masked, so ``packed`` never exceeds 255 and a single
        # conversion to a signed byte is sufficient.
        return BitUtil.to_signed_byte(packed)

    @classmethod
    def sparse_enabled(cls, cutoff_byte):
        """
        Reads the 'sparse-enabled' flag out of the cutoff byte of a serialized
        HLL.

        :param byte cutoff_byte: the cutoff byte of the serialized HLL
        :returns: the 'sparse-enabled' boolean
        :rtype: boolean
        """
        above_cutoff_bits = BitUtil.unsigned_right_shift_byte(cutoff_byte, cls.EXPLICIT_CUTOFF_BITS)
        return (above_cutoff_bits & 1) == 1

    @classmethod
    def explicit_cutoff(cls, cutoff_byte):
        """
        Reads the explicit cutoff code out of the cutoff byte of a serialized
        HLL.

        :param byte cutoff_byte: the cutoff byte of the serialized HLL
        :returns: the explicit cutoff value
        :rtype: int
        """
        return cls.EXPLICIT_CUTOFF_MASK & cutoff_byte

    @classmethod
    def schema_version(cls, version_byte):
        """
        Reads the schema version out of the version byte of a serialized HLL.

        :param byte version_byte: the version byte of the serialized HLL
        :returns: the schema version of the serialized HLL
        :rtype: int
        """
        top_nibble = BitUtil.unsigned_right_shift_byte(version_byte, cls.NIBBLE_BITS)
        return cls.NIBBLE_MASK & top_nibble

    @classmethod
    def type_ordinal(cls, version_byte):
        """
        Reads the type ordinal out of the version byte of a serialized HLL.

        :param byte version_byte: the version byte of the serialized HLL
        :returns: the type ordinal of the serialized HLL
        :rtype: int
        """
        return cls.NIBBLE_MASK & version_byte

    @classmethod
    def register_width(cls, parameters_byte):
        """
        Reads the register width out of the parameters byte of a serialized
        ``HLLType.FULL`` HLL.

        :param byte parameters_byte: the parameters byte of the serialized HLL
        :returns: the register width of the serialized HLL
        :rtype: int
        """
        width_field = BitUtil.unsigned_right_shift_byte(parameters_byte, cls.LOG2_REGISTER_COUNT_BITS)
        # The stored value is ``width - 1``.
        return (width_field & cls.REGISTER_WIDTH_MASK) + 1

    @classmethod
    def register_count_log2(cls, parameters_byte):
        """
        Reads log2(register_count) out of the parameters byte of a serialized
        ``HLLType.FULL`` HLL.

        :param byte parameters_byte: the parameters byte of the serialized HLL
        :returns: log2(register_count) of the serialized HLL
        :rtype: int
        """
        return cls.LOG2_REGISTER_COUNT_MASK & parameters_byte
735 | 


--------------------------------------------------------------------------------
/python_hll/util.py:
--------------------------------------------------------------------------------
  1 | # -*- coding: utf-8 -*-
  2 | 
  3 | from math import log
  4 | import numpy as np
  5 | 
  6 | 
  7 | class BitUtil:
  8 |     """
  9 |     A collection of bit utilities.
 10 |     """
 11 | 
 12 |     # The set of least-significant bits for a given ``byte``. ``-1``
 13 |     # is used if no bits are set (so as to not be confused with "index of zero"
 14 |     # meaning that the least significant bit is the 0th (1st) bit).
 15 |     LEAST_SIGNIFICANT_BIT = [
 16 |         -1, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
 17 |         4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
 18 |         5, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
 19 |         4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
 20 |         6, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
 21 |         4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
 22 |         5, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
 23 |         4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
 24 |         7, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
 25 |         4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
 26 |         5, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
 27 |         4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
 28 |         6, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
 29 |         4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
 30 |         5, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
 31 |         4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0
 32 |     ]
 33 | 
 34 |     @classmethod
 35 |     def least_significant_bit(cls, value):
 36 |         """
 37 |         Computes the least-significant bit of the specified ``long``
 38 |         that is set to ``1``. Zero-indexed.
 39 | 
 40 |         See <http://stackoverflow.com/questions/757059/position-of-least-significant-bit-that-is-set>
 41 |         and <http://www-graphics.stanford.edu/~seander/bithacks.html>.
 42 | 
 43 |         :param long value: the ``long`` whose least-significant bit is desired.
 44 |         :returns: the least-significant bit of the specified ``long``.
 45 |                   ``-1`` is returned if there are no bits set.
 46 |         :rtype: int
 47 |         """
 48 | 
 49 |         if value == 0:
 50 |             # by contract
 51 |             return -1
 52 | 
 53 |         elif value & 0xFF != 0:
 54 |             index = int(cls.unsigned_right_shift_long(value, 0) & 0xFF)
 55 |             return cls.LEAST_SIGNIFICANT_BIT[index] + 0
 56 | 
 57 |         elif value & 0xFFFF != 0:
 58 |             index = int(cls.unsigned_right_shift_long(value, 8) & 0xFF)
 59 |             return cls.LEAST_SIGNIFICANT_BIT[index] + 8
 60 | 
 61 |         elif value & 0xFFFFFF != 0:
 62 |             index = int(cls.unsigned_right_shift_long(value, 16) & 0xFF)
 63 |             return cls.LEAST_SIGNIFICANT_BIT[index] + 16
 64 | 
 65 |         elif value & 0xFFFFFFFF != 0:
 66 |             index = int(cls.unsigned_right_shift_long(value, 24) & 0xFF)
 67 |             return cls.LEAST_SIGNIFICANT_BIT[index] + 24
 68 | 
 69 |         elif value & 0xFFFFFFFFFF != 0:
 70 |             index = int(cls.unsigned_right_shift_long(value, 32) & 0xFF)
 71 |             return cls.LEAST_SIGNIFICANT_BIT[index] + 32
 72 | 
 73 |         elif value & 0xFFFFFFFFFFFF != 0:
 74 |             index = int(cls.unsigned_right_shift_long(value, 40) & 0xFF)
 75 |             return cls.LEAST_SIGNIFICANT_BIT[index] + 40
 76 | 
 77 |         elif value & 0xFFFFFFFFFFFFFF != 0:
 78 |             index = int(cls.unsigned_right_shift_long(value, 48) & 0xFF)
 79 |             return cls.LEAST_SIGNIFICANT_BIT[index] + 48
 80 | 
 81 |         else:
 82 |             index = int(cls.unsigned_right_shift_long(value, 56) & 0xFF)
 83 |             return cls.LEAST_SIGNIFICANT_BIT[index] + 56
 84 | 
 85 |     @classmethod
 86 |     def unsigned_right_shift_long(cls, val, n):
 87 |         """
 88 |         Equivalent to Java >>> on a long value
 89 |         """
 90 |         return val if n == 0 else int(np.uint64(val) >> np.uint64(n))
 91 | 
 92 |     @classmethod
 93 |     def unsigned_right_shift_int(cls, val, n):
 94 |         """
 95 |         Equivalent to Java >>> on an int value
 96 |         """
 97 |         return val if n == 0 else int(np.uint32(val) >> np.uint32(n))
 98 | 
 99 |     @classmethod
100 |     def unsigned_right_shift_byte(cls, val, n):
101 |         """
102 |         Equivalent to Java >>> on a byte value
103 |         """
104 |         return val if n == 0 else int(np.uint32(val) >> np.uint32(n))
105 | 
106 |     @classmethod
107 |     def to_signed_byte(cls, i):
108 |         """
109 |         Converts a Python byte (unsigned integer from 0 to 255) to a Java byte
110 |         (signed two's complement integer from -128 to 127).
111 |         :type i: byte
112 |         :rtype: byte
113 |         """
114 |         return i if i <= 127 else i - 256
115 | 
116 |     @classmethod
117 |     def left_shift_long(cls, long_x, int_y):
118 |         """
119 |         Simulates a Java << for a long.
120 | 
121 |         :param long_x: expected long value in python code
122 |         :param int_y: expected int value in python
123 |         :returns: left shift result for, x << y
124 |         :rtype: long
125 |         """
126 |         x = np.int64(long_x)
127 |         y = np.int(int_y)
128 |         z = np.left_shift(x, y)
129 | 
130 |         return np.int64(z.item())
131 | 
132 |     @classmethod
133 |     def left_shift_int(cls, int_x, int_y):
134 |         """
135 |         Simulates a Java << for an integer.
136 | 
137 |         :param int_x: expected int value in python code
138 |         :param int_y: expected int value in python
139 |         :returns: left shift result for, x << y
140 |         :rtype: int
141 |         """
142 |         x = np.int32(int_x)
143 |         y = np.int(int_y)
144 |         z = np.left_shift(x, y)
145 | 
146 |         return z.item()
147 | 
148 |     @classmethod
149 |     def left_shift_byte(cls, byte_x, int_y):
150 |         """
151 |         Simulates a Java << for a byte.
152 | 
153 |         :param byte_x: expected byte value in python code
154 |         :param int_y: expected int value in python
155 |         :returns: left shift result for, x << y
156 |         :rtype: int
157 |         """
158 |         x = np.int8(byte_x)  # converts to signed byte, since byte is signed in java
159 |         y = np.int(int_y)
160 |         z = np.left_shift(x, y)
161 | 
162 |         # In Java, (byte)128 << 3 produces an int.
163 |         return z.item()
164 | 
165 | 
class LongIterator:
    """
    A ``long``-based iterator that walks the registers packed into a sequence
    of 64-bit words, starting from the low bits of the first word.
    """

    LOG2_BITS_PER_WORD = 6
    BITS_PER_WORD = BitUtil.left_shift_int(1, LOG2_BITS_PER_WORD)

    def __init__(self, register_width, words, register_mask, count):
        """
        :param int register_width: the width, in bits, of one register.
        :param list words: the words holding the packed registers. The first
               word is read immediately.
        :param long register_mask: the mask applied to extract one register.
        :param long count: the number of registers to yield.
        """
        # fixed configuration
        self._register_width, self._register_mask = register_width, register_mask
        self._words, self._count = words, count

        # iteration state: position, and the not-yet-consumed part of the
        # current word
        self._register_index = self._word_index = 0
        self._remaining_word_bits = self.BITS_PER_WORD
        self._word = self._words[self._word_index]

    def __iter__(self):
        return self

    def __next__(self):
        # Python 3 compatibility
        return self.next()

    def next(self):
        """
        :returns: the next register value.
        :rtype: long
        :raises StopIteration: once ``count`` registers have been returned.
        """
        if self._register_index >= self._count:
            raise StopIteration

        width = self._register_width
        available = self._remaining_word_bits

        if available < width:
            # The register straddles two words: advance to the next word and
            # combine the leftover low part with the bits it contributes.
            self._word_index += 1
            following = self._words[self._word_index]

            register = (self._word | BitUtil.left_shift_long(following, available)) & self._register_mask

            # drop the bits of the new word that were just consumed
            self._word = BitUtil.unsigned_right_shift_long(following, width - available)
            self._remaining_word_bits = available + (self.BITS_PER_WORD - width)
        else:
            # The register lies entirely within the current word.
            register = self._word & self._register_mask

            # shift to the next register
            self._word = BitUtil.unsigned_right_shift_long(self._word, width)
            self._remaining_word_bits = available - width

        self._register_index += 1
        return register
214 | 
215 | 
class BitVector:
    """
    A fixed-length sequence of equally sized bit fields ("registers") packed
    back to back into a list of 64bit "words".  Registers are addressed by
    index, and a single register may straddle two neighbouring words.
    """

    # Word geometry.  Shifts and masks stand in for division / modulo by the
    # word size when a bit index is mapped onto a word.
    LOG2_BITS_PER_WORD = 6  # =>64bits
    BITS_PER_WORD = BitUtil.left_shift_int(1, LOG2_BITS_PER_WORD)
    BITS_PER_WORD_MASK = BITS_PER_WORD - 1

    # Byte geometry (for output).
    LOG2_BITS_PER_BYTE = 3  # =>8bits
    BITS_PER_BYTE = BitUtil.left_shift_int(1, LOG2_BITS_PER_BYTE)

    BYTES_PER_WORD = 8  # 8 bytes in a long

    def __init__(self, width, count):
        """
        Allocates a zeroed vector.  The constraints below are the documented
        contract; they are not checked here.

        :param int width: the width of each register in bits.  This cannot be
               negative or zero or greater than 63 (the signed word size).
        :param long count: the number of registers.  This cannot be negative or zero.
        """
        total_bits = width * count
        # number of 64bit words needed: ceil(total_bits / BITS_PER_WORD)
        word_count = BitUtil.unsigned_right_shift_long(total_bits + self.BITS_PER_WORD_MASK, self.LOG2_BITS_PER_WORD)
        self._words = [0] * word_count
        self._register_width = width
        self._count = count
        # ``width`` low bits set
        self._register_mask = BitUtil.left_shift_long(1, width) - 1

    def _locate(self, register_index):
        """
        Maps a register index onto the word list.

        :param long register_index: the index of the register to locate
        :returns: ``(first_word, last_word, bit_offset)`` -- the indexes of the
                  words holding the register's first and last bit (equal when
                  the register does not straddle words) and the offset of the
                  register's first bit inside the first word
        :rtype: tuple
        """
        start_bit = register_index * self._register_width
        first_word = BitUtil.unsigned_right_shift_long(start_bit, self.LOG2_BITS_PER_WORD)  # aka (start_bit / BITS_PER_WORD)
        last_word = BitUtil.unsigned_right_shift_long(start_bit + self._register_width - 1, self.LOG2_BITS_PER_WORD)
        bit_offset = start_bit & self.BITS_PER_WORD_MASK  # aka (start_bit % BITS_PER_WORD)
        return first_word, last_word, bit_offset

    def _store(self, first_word, last_word, bit_offset, value):
        """
        Writes ``value`` at a location previously computed by ``_locate()``,
        clearing the old bits before setting the new ones.
        """
        words = self._words
        if first_word == last_word:
            # clear then set
            words[first_word] &= ~BitUtil.left_shift_long(self._register_mask, bit_offset)
            words[first_word] |= BitUtil.left_shift_long(value, bit_offset)
            return

        # register spans words: clear then set each partial word
        words[first_word] &= BitUtil.left_shift_long(1, bit_offset) - 1
        words[first_word] |= BitUtil.left_shift_long(value, bit_offset)

        spill = self.BITS_PER_WORD - bit_offset  # bits that stayed in the first word
        words[last_word] &= ~BitUtil.unsigned_right_shift_long(self._register_mask, spill)
        words[last_word] |= BitUtil.unsigned_right_shift_long(value, spill)

    def get_register(self, register_index):
        """
        :param long register_index: the index of the register whose value is to be
               retrieved.  This cannot be negative.
        :returns: the value at the specified register index
        :rtype: long
        """
        first_word, last_word, bit_offset = self._locate(register_index)
        low_part = BitUtil.unsigned_right_shift_long(self._words[first_word], bit_offset)

        if first_word == last_word:
            return low_part & self._register_mask

        # register spans words; the low part needs no mask since it sits at
        # the top of its word
        high_part = BitUtil.left_shift_long(self._words[last_word], self.BITS_PER_WORD - bit_offset) & self._register_mask
        return low_part | high_part

    def set_register(self, register_index, value):
        """
        Overwrites one register.  Nothing is returned.

        :param long register_index: the index of the register whose value is to be set.
               This cannot be negative
        :param long value: the value to set in the register
        """
        first_word, last_word, bit_offset = self._locate(register_index)
        self._store(first_word, last_word, bit_offset, value)

    def register_iterator(self):
        """
        :returns: a ``LongIterator`` for iterating starting at the register
                  with index zero. This will never be ``None``.
        :rtype: LongIterator
        """
        return LongIterator(self._register_width, self._words, self._register_mask, self._count)

    def set_max_register(self, register_index, value):
        """
        Stores ``value`` in the register only when it is strictly greater than
        what the register currently holds.  Same outcome as

        ``vector.set_register(index, max(vector.get_register(index), value))``

        but the register is located only once.

        :param long register_index: the index of the register whose value is to be set.
               This cannot be negative
        :param long value: the value to set in the register if and only if this value
               is greater than the current value in the register
        :returns: True if and only if the specified value is greater
                  than or equal to the current register value. False
                  otherwise.
        :rtype: boolean
        """
        first_word, last_word, bit_offset = self._locate(register_index)

        low_part = BitUtil.unsigned_right_shift_long(self._words[first_word], bit_offset)
        if first_word == last_word:
            current = low_part & self._register_mask
        else:
            # register spans words; no need to mask the low part since it is
            # at the top of its word
            current = low_part | \
                BitUtil.left_shift_long(self._words[last_word], self.BITS_PER_WORD - bit_offset) & self._register_mask

        if value > current:
            self._store(first_word, last_word, bit_offset, value)
        # else -- the register value is greater (or equal) so nothing needs to be done

        return value >= current

    def fill(self, value):
        """
        Sets every register of this vector to the specified value.  This can
        be used to clear the vector by specifying ``0``.

        :param long value: the value stored in each register
        :rtype: void
        """
        for register_index in range(self._count):
            self.set_register(register_index, value)

    def get_register_contents(self, serializer):
        """
        Feeds every register, in index order, to the specified serializer.

        :param BigEndianAscendingWordSerializer serializer: the serializer to use. This cannot be ``None``.
        :rtype: void
        """
        for register in self.register_iterator():
            serializer.write_word(register)
373 | 
374 | 
class NumberUtil:
    """
    A collection of utilities to work with numbers.
    """

    # loge(2) (log-base e of 2)
    LOGE_2 = 0.6931471805599453

    # the hex characters
    HEX = ['0', '1', '2', '3', '4', '5', '6', '7', '8', '9', 'A', 'B', 'C', 'D', 'E', 'F']

    # hex character -> numeric value; letters are accepted in either case
    _HEX_VALUES = {char: value for value, char in enumerate('0123456789ABCDEF')}
    _HEX_VALUES.update({char: value for value, char in enumerate('abcdef', 10)})

    @classmethod
    def log2(cls, value):
        """
        Computes the ``log2`` (log-base-two) of the specified value.

        :param float value: the ``float`` for which the ``log2`` is
               desired.
        :returns: the ``log2`` of the specified value
        :rtype: float
        """
        # REF:  http://en.wikipedia.org/wiki/Logarithmic_scale (conversion of bases)
        return log(value) / cls.LOGE_2

    @classmethod
    def to_hex(cls, bytes, offset, count):
        """
        Converts the specified array of ``byte``'s into a string of
        hex characters (low ``byte`` first).

        :param list bytes: the array of ``byte``'s that are to be converted.
               This must contain at least one element past ``offset``.
        :param int offset: the offset in ``bytes`` at which the bytes will
               be taken.  This cannot be negative and must be less than
               ``len(bytes)``.
        :param int count: the number of bytes to be retrieved from the specified array.
               This cannot be negative.  If greater than ``len(bytes) - offset``
               then that value is used.
        :returns: a string of two upper-case hex characters per converted
                  byte.  This will never be ``None`` though it is empty when
                  ``count`` is zero.
        :rtype: string
        :raises Exception: if ``offset`` is not less than ``len(bytes)``
        """
        if offset >= len(bytes):  # by contract
            raise Exception("Offset is greater than the length, {offset} >= {byte_array_length}"
                            .format(offset=offset, byte_array_length=len(bytes)))
        byte_count = min(len(bytes) - offset, count)
        upper_bound = byte_count + offset

        chars = [None] * (byte_count * 2)   # two chars per byte
        char_index = 0
        for i in range(offset, upper_bound):
            value = bytes[i]
            # high nibble first, then low nibble
            chars[char_index] = cls.HEX[(BitUtil.unsigned_right_shift_byte(value, 4)) & 0x0F]
            char_index += 1
            chars[char_index] = cls.HEX[value & 0x0F]
            char_index += 1

        return ''.join(chars)

    @classmethod
    def from_hex(cls, string, offset, count):
        """
        Converts the specified array of hex characters into an array of ``byte``'s
        (low ``byte`` first).

        :param string string: the string of hex characters to be converted into ``byte``'s.
               This must contain at least one character past ``offset``.
        :param int offset: the offset in the string at which the characters will be
               taken.  This cannot be negative and must be less than ``len(string)``.
        :param int count: the number of characters to be retrieved from the specified
               string.  This cannot be negative and must be divisible by two
               (since there are two characters per ``byte``).  If greater than
               ``len(string) - offset`` then that value is used.
        :returns: the array of ``byte``'s that were converted from the
                  specified string (in the specified range).  This will never be
                  ``None`` though it is empty when ``count`` is zero.
        :rtype: list
        :raises Exception: if ``offset`` is not less than ``len(string)``, if
                ``count`` is odd, or if a non-hex character is encountered
        """

        if offset >= len(string):  # by contract
            raise Exception("Offset is greater than the length, {offset} >= {string_length}"
                            .format(offset=offset, string_length=len(string)))
        if (count & 0x01) != 0:  # by contract
            raise Exception("Count is not divisible by two, ({})".format(count))

        char_count = min(len(string) - offset, count)
        upper_bound = offset + char_count

        byte_array = [0] * (BitUtil.unsigned_right_shift_int(char_count, 1))  # aka /2
        byte_index = 0  # beginning
        # Walk the requested window only.  Starting at ``offset`` (not 0) keeps
        # the number of character pairs equal to ``len(byte_array)`` and reads
        # the characters the caller actually asked for.
        # NOTE: if the string ends on an odd number of characters inside the
        # window, the last pair is incomplete and indexing raises IndexError.
        for i in range(offset, upper_bound, 2):
            p1 = BitUtil.left_shift_int(cls._digit(string[i]), 4)
            p2 = cls._digit(string[i+1])
            p = (p1 | p2) & 0xFF

            byte_array[byte_index] = BitUtil.to_signed_byte(p)
            byte_index += 1
        return byte_array

    @classmethod
    def _digit(cls, character):
        """
        :param string character: a hex character to be converted to a ``byte``.
               This cannot be a character other than [a-fA-F0-9].
        :returns: the value of the specified character.  This will be a value ``0``
                  through ``15``.
        :rtype: int
        :raises Exception: if ``character`` is not a single hex character
        """
        try:
            value = cls._HEX_VALUES.get(character)
        except TypeError:  # unhashable input can never be a hex character
            value = None
        if value is None:
            raise Exception("Character is not in [a-fA-F0-9]: ({})".format(character))
        return value
519 | 


--------------------------------------------------------------------------------
/requirements_dev.txt:
--------------------------------------------------------------------------------
 1 | pip==18.1
 2 | bumpversion==0.5.3
 3 | wheel==0.32.1
 4 | watchdog==0.9.0
 5 | flake8==3.5.0
 6 | tox==3.5.2
 7 | coverage==4.5.1
 8 | Sphinx==1.8.1
 9 | twine==1.12.1
10 | numpy==1.16.4
11 | 
12 | pytest==3.8.2
13 | pytest-runner==4.2
14 | 


--------------------------------------------------------------------------------
/setup.cfg:
--------------------------------------------------------------------------------
 1 | [bumpversion]
 2 | current_version = 0.1.3
 3 | commit = True
 4 | tag = True
 5 | 
 6 | [bumpversion:file:setup.py]
 7 | search = version='{current_version}'
 8 | replace = version='{new_version}'
 9 | 
10 | [bumpversion:file:python_hll/__init__.py]
11 | search = __version__ = '{current_version}'
12 | replace = __version__ = '{new_version}'
13 | 
14 | [bdist_wheel]
15 | universal = 1
16 | 
17 | [flake8]
18 | exclude = docs
19 | 
20 | [aliases]
21 | test = pytest
22 | 
23 | [tool:pytest]
24 | collect_ignore = ['setup.py']
25 | 
26 | 


--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | # -*- coding: utf-8 -*-
 3 | 
 4 | """The setup script."""
 5 | 
 6 | from setuptools import setup, find_packages
 7 | 
 8 | with open('README.rst') as readme_file:
 9 |     readme = readme_file.read()
10 | 
11 | with open('HISTORY.rst') as history_file:
12 |     history = history_file.read()
13 | 
14 | requirements = ['numpy']
15 | 
16 | setup_requirements = ['pytest-runner', ]
17 | 
18 | test_requirements = ['pytest', ]
19 | 
20 | setup(
21 |     author="Jon Aquino",
22 |     author_email='jonathan.aquino@adroll.com',
23 |     classifiers=[
24 |         'Development Status :: 2 - Pre-Alpha',
25 |         'Intended Audience :: Developers',
26 |         'License :: OSI Approved :: MIT License',
27 |         'Natural Language :: English',
28 |         "Programming Language :: Python :: 2",
29 |         'Programming Language :: Python :: 2.7',
30 |         'Programming Language :: Python :: 3',
31 |         'Programming Language :: Python :: 3.4',
32 |         'Programming Language :: Python :: 3.5',
33 |         'Programming Language :: Python :: 3.6',
34 |         'Programming Language :: Python :: 3.7',
35 |     ],
36 |     description="Python library for the HyperLogLog algorithm",
37 |     install_requires=requirements,
38 |     license="MIT license",
39 |     long_description=readme + '\n\n' + history,
40 |     include_package_data=True,
41 |     keywords='python_hll',
42 |     name='python_hll',
43 |     packages=find_packages(include=['python_hll']),
44 |     setup_requires=setup_requirements,
45 |     test_suite='tests',
46 |     tests_require=test_requirements,
47 |     url='https://github.com/AdRoll/python-hll',
48 |     version='0.1.3',
49 |     zip_safe=False,
50 | )
51 | 


--------------------------------------------------------------------------------
/tests/data/README.txt:
--------------------------------------------------------------------------------
  1 | This test data comes from https://github.com/citusdata/postgresql-hll/tree/master/sql/data
  2 | 
  3 | If the filename starts with a "cumulative_union" prefix, then it's the
  4 | standard cumulative union format we've been using (cardinality,
  5 | multiset, union_cardinality, union_multiset) in which the
  6 | union_multiset is an accumulator over the subsequent lines.
  7 | 
  8 | If the filename starts with a "cumulative_add" prefix, then it's a new
  9 | format (cardinality, raw_value, multiset) in which the "raw_value" is
 10 | added to the accumulator "multiset".
 11 | 
 12 | The cutoffs I'm assuming in this file are 256 for explicit to sparse,
 13 | and 850 for sparse to full. Log2m=11, registerWidth=5, as usual.
 14 | 
 15 | A brief summary of what each file tries to accomplish follows:
 16 | 
 17 | cumulative_add_comprehensive_promotion.csv
 18 | 
 19 | Cumulatively adds random values to an EMPTY multiset.
 20 | 
 21 | Format: cumulative add
 22 | Tests:
 23 | - EMPTY, EXPLICIT, SPARSE_PROBABILISTIC, PROBABILISTIC addition
 24 | - EMPTY to EXPLICIT promotion
 25 | - EXPLICIT to SPARSE_PROBABILISTIC promotion
 26 | - SPARSE_PROBABILISTIC to PROBABILISTIC promotion
 27 | 
 28 | cumulative_add_sparse_step.csv
 29 | 
 30 | Cumulatively sets successive registers to:
 31 | 
 32 |     <code>(registerIndex % probabilisticRegisterMaxValue) + 1</code>
 33 | 
 34 | by adding specifically constructed values to a SPARSE_PROBABILISTIC multiset.
 35 | Does not induce promotion.
 36 | 
 37 | Format: cumulative add
 38 | Tests:
 39 | - SPARSE_PROBABILISTIC addition (predictable)
 40 | 
 41 | cumulative_add_sparse_random.csv
 42 | 
 43 | Cumulatively sets random registers of a SPARSE_PROBABILISTIC multiset to
 44 | random values by adding random values. Does not induce promotion.
 45 | 
 46 | Format: cumulative add
 47 | Tests:
 48 | - SPARSE_PROBABILISTIC addition (random)
 49 | 
 50 | cumulative_union_explicit_promotion.csv
 51 | 
 52 | Unions an EMPTY accumulator with EXPLICIT multisets, each containing a
 53 | single random value.
 54 | 
 55 | Format: cumulative union
 56 | Tests:
 57 | - EMPTY U EXPLICIT
 58 | - EXPLICIT U EXPLICIT
 59 | - EXPLICIT to SPARSE_PROBABILISTIC promotion
 60 | - SPARSE_PROBABILISTIC U EXPLICIT
 61 | 
 62 | cumulative_union_sparse_promotion.csv
 63 | 
 64 | Unions an EMPTY accumulator with SPARSE_PROBABILISTIC multisets, each
 65 | having one register set.
 66 | 
 67 | Format: cumulative union
 68 | Tests:
 69 | - EMPTY U SPARSE_PROBABILISTIC
 70 | - SPARSE_PROBABILISTIC U SPARSE_PROBABILISTIC
 71 | - SPARSE_PROBABILISTIC promotion
 72 | - SPARSE_PROBABILISTIC U PROBABILISTIC
 73 | 
 74 | cumulative_union_explicit_explicit.csv
 75 | 
 76 | Unions an EMPTY accumulator with EXPLICIT multisets, each having a single
 77 | random value, twice in a row to verify that the set properties are
 78 | satisfied.
 79 | 
 80 | Format: cumulative union
 81 | Tests:
 82 | - EMPTY U EXPLICIT
 83 | - EXPLICIT U EXPLICIT
 84 | 
 85 | cumulative_union_sparse_sparse.csv
 86 | 
 87 | Unions an EMPTY accumulator with SPARSE_PROBABILISTIC multisets, each
 88 | having a single register set, twice in a row to verify that the set
 89 | properties are satisfied.
 90 | 
 91 | Format: cumulative union
 92 | Tests:
 93 | - EMPTY U SPARSE_PROBABILISTIC
 94 | - SPARSE_PROBABILISTIC U SPARSE_PROBABILISTIC
 95 | 
 96 | cumulative_union_probabilistic_probabilistic.csv
 97 | 
 98 | Unions an EMPTY accumulator with PROBABILISTIC multisets, each having
 99 | many registers set, twice in a row to verify that the set properties are
100 | satisfied.
101 | 
102 | Format: cumulative union
103 | Tests:
104 | - EMPTY U PROBABILISTIC
105 | - PROBABILISTIC U PROBABILISTIC
106 | 
107 | cumulative_union_comprehensive.csv
108 | 
109 | Unions an EMPTY accumulator with random multisets.
110 | 
111 | Format: cumulative union
112 | Tests:
113 | - hopefully all union possibilities
114 | 


--------------------------------------------------------------------------------
/tests/data/cumulative_union_sparse_full_representation.csv:
--------------------------------------------------------------------------------
1 | cardinality,HLL,union_cardinality,union_HLL
2 | 0,\x118B49,0,\x118B49
3 | 1.0002442201269182,\x148B490800000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000
000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000,1.0002442201269182,\x138B490001
4 | 1.0002442201269182,\x148B490040000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000
000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000,2.000977198748901,\x138B4900010021
5 | 1096.4497021580987,\x148B490002108421084210842108421084210842108421084210842108421084210842108421084210842108421084210842108421084210842108421084210842108421084210842108421084210842108421084210842108421084210842108421084210842108421084210842108421084210842108421084210842108421084210842108421084210842108421084210842108421084210842108421084210842108421084210842108421084210842108421084210842108421084210842108421084210842108421084210842108421084210842108421084210842108421084210842108421084210842108421084210842108421084210842108421084210842108421084210842108421084210842108421084210842108421084210842108421084210842108421084210842108421084210842108421084210842108421084210842108421084210842108421084210842108421084210842108421084210842108421084210842108421084210842108421084210842108421084210842108421084210842108421084210842108421084210842108421084210842108421084210842108421084210842108421084210842108421084210842108421084210842108421084210842108421084210842108421084210842108421084210842108421084210842108421084210842108421084210842108421084210842108421084210842108421084210842108421084200000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000
000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000,1099.8687346717188,\x148B490842108421084210842108421084210842108421084210842108421084210842108421084210842108421084210842108421084210842108421084210842108421084210842108421084210842108421084210842108421084210842108421084210842108421084210842108421084210842108421084210842108421084210842108421084210842108421084210842108421084210842108421084210842108421084210842108421084210842108421084210842108421084210842108421084210842108421084210842108421084210842108421084210842108421084210842108421084210842108421084210842108421084210842108421084210842108421084210842108421084210842108421084210842108421084210842108421084210842108421084210842108421084210842108421084210842108421084210842108421084210842108421084210842108421084210842108421084210842108421084210842108421084210842108421084210842108421084210842108421084210842108421084210842108421084210842108421084210842108421084210842108421084210842108421084210842108421084210842108421084210842108421084210842108421084210842108421084210842108421084210842108421084210842108421084210842108421084210842108421084210842108421084210842108421084210842108421084200000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000
000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000


--------------------------------------------------------------------------------
/tests/probabilistic_test_util.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | # -*- coding: utf-8 -*-
 3 | from __future__ import division
 4 | from python_hll.util import BitUtil
 5 | from math import ceil
 6 | 
 7 | 
 8 | def construct_hll_value(log2m, register_index, register_value):
 9 |     """
10 |     Constructs a value that when added raw to a HLL will set the register at
11 |     ``register_index`` to ``register_value``.
12 | 
13 |     :param log2m: The log-base-2 of the number of registers in the HLL
14 |     :type log2m: int
15 |     :param register_index: The index of the register to set
16 |     :type register_index: int
17 |     :param register_value: the value to set the register to
18 |     :type register_value: int
19 |     :rtype: int
20 |     """
21 |     partition = register_index
22 |     substream_value = BitUtil.left_shift_long(1, register_value - 1)
23 |     return BitUtil.left_shift_long(substream_value, log2m) | partition
24 | 
25 | 
def get_register_index(raw_value, log2m):
    """
    Returns the HLL register index carried by a raw value, i.e. ``raw_value``
    masked with ``(1 << log2m) - 1``.
    """
    return raw_value & (BitUtil.left_shift_int(1, log2m) - 1)
33 | 
34 | 
def get_register_value(raw_value, log2m):
    """
    Returns the HLL register value carried by a raw value.

    The substream is ``raw_value`` unsigned-right-shifted by ``log2m``. A zero
    substream maps to ``0``; otherwise the result is one plus the index of the
    least significant set bit, capped at 31 and converted to a signed byte.
    """
    substream = BitUtil.unsigned_right_shift_long(raw_value, log2m)
    if substream == 0:
        # The paper does not cover p(0x0), so the special value 0 is used.
        # 0 is the original initialization value of the registers, so the HLL
        # simply ignores it. This is acceptable because the probability is
        # 1/(2^(2^register_size_in_bits)).
        return 0
    capped = min(1 + BitUtil.least_significant_bit(substream), 31)
    return BitUtil.to_signed_byte(capped)
49 | 
50 | 
def get_required_bytes(short_word_length, register_count):
    """
    Returns the number of bytes required to pack ``register_count``
    registers of width ``short_word_length``.

    :param short_word_length: width of a single register, in bits
    :type short_word_length: int
    :param register_count: number of registers to pack
    :type register_count: int
    :returns: the total bit count divided by 8, rounded up
    :rtype: int
    """
    total_bits = register_count * short_word_length
    # Ceiling division done purely on integers: it never routes through a
    # float, so the result is exact for arbitrarily large bit counts and is
    # always an ``int`` (``math.ceil`` returns a float on Python 2).
    return -(-total_bits // 8)
57 | 


--------------------------------------------------------------------------------
/tests/test_big_endian_ascending_word_deserializer.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python
  2 | # -*- coding: utf-8 -*-
  3 | 
  4 | """Unit tests for BigEndianAscendingWordDeserializer """
  5 | 
  6 | from sys import maxsize
  7 | import random
  8 | from python_hll.serialization import BigEndianAscendingWordDeserializer, BigEndianAscendingWordSerializer
  9 | from python_hll.util import BitUtil
 10 | 
 11 | 
 12 | def test_constructor_error():
 13 |     """
 14 |     Error checking tests for constructor.
 15 |     """
 16 | 
 17 |     # word length too small
 18 |     try:
 19 |         BigEndianAscendingWordDeserializer(0, 0, [0])
 20 |         assert False, "Should complain about too-short words."
 21 |     except ValueError as e:
 22 |         assert "Word length must be" in str(e)
 23 | 
 24 |     # word length too large
 25 |     try:
 26 |         BigEndianAscendingWordDeserializer(65, 0, [0])
 27 |         assert False, "Should complain about too-long words."
 28 |     except ValueError as e:
 29 |         assert "Word length must be" in str(e)
 30 | 
 31 |     # byte padding negative
 32 |     try:
 33 |         BigEndianAscendingWordDeserializer(5, -1, [0])
 34 |     except ValueError as e:
 35 |         assert "Byte padding must be" in str(e)
 36 | 
 37 | 
 38 | def test_smoke_64_bit_word():
 39 |     serializer = BigEndianAscendingWordSerializer(64, 5, 0)
 40 | 
 41 |     # Check that the sign bit is being preserved.
 42 |     serializer.write_word(-1)
 43 |     serializer.write_word(-112894714)
 44 | 
 45 |     # CHeck "special values"
 46 |     serializer.write_word(0)
 47 |     serializer.write_word(maxsize)
 48 |     serializer.write_word(-maxsize - 1)
 49 | 
 50 |     bytes_ = serializer.get_bytes()
 51 | 
 52 |     deserializer = BigEndianAscendingWordDeserializer(64, 0, bytes_)
 53 |     assert deserializer.total_word_count() == 5
 54 | 
 55 |     assert deserializer.read_word() == -1
 56 |     assert deserializer.read_word() == -112894714
 57 |     assert deserializer.read_word() == 0
 58 |     assert deserializer.read_word() == maxsize
 59 |     assert deserializer.read_word() == -maxsize - 1
 60 | 
 61 | 
 62 | def test_ascending_smoke(fastonly):
 63 |     """
 64 |     A smoke/fuzz test for ascending (from zero) word values.
 65 |     """
 66 |     word_length = 5
 67 |     while word_length < 65:
 68 |         run_ascending_test(word_length, 3, 1000 if fastonly else 100000)
 69 |         word_length += 1
 70 | 
 71 | 
 72 | def test_random_smoke(fastonly):
 73 |     """
 74 |     A smoke/fuzz test for random word values.
 75 |     """
 76 |     word_length = 5
 77 |     while word_length < 65:
 78 |         run_random_test(word_length, 3, 1000 if fastonly else 100000, word_length)
 79 |         word_length += 1
 80 | 
 81 | 
 82 | def run_random_test(word_length, byte_padding, word_count, seed):
 83 |     """
 84 |     Runs a test which serializes and deserializes random word values.
 85 |     """
 86 |     random.seed(seed)
 87 | 
 88 |     word_mask = ~0 if word_length == 64 else BitUtil.left_shift_long(1, word_length) - 1
 89 | 
 90 |     serializer = BigEndianAscendingWordSerializer(word_length, word_count, byte_padding)
 91 | 
 92 |     for _ in range(word_count):
 93 |         value = random.randint(0, maxsize) & word_mask
 94 |         serializer.write_word(value)
 95 | 
 96 |     bytes_ = serializer.get_bytes()
 97 | 
 98 |     deserializer = BigEndianAscendingWordDeserializer(word_length, byte_padding, bytes_)
 99 | 
100 |     assert deserializer.total_word_count() == word_count
101 | 
102 |     # verification random
103 |     random.seed(seed)
104 |     for _ in range(word_count):
105 |         assert deserializer.read_word() == (random.randint(0, maxsize) & word_mask)
106 | 
107 | 
def run_ascending_test(word_length, byte_padding, word_count):
    """
    Serializes the words ``0, 1, 2, ...`` (each masked to ``word_length``
    bits), deserializes them again and checks the round trip.
    """
    if word_length == 64:
        word_mask = ~0
    else:
        word_mask = BitUtil.left_shift_long(1, word_length) - 1

    expected_words = [index & word_mask for index in range(word_count)]

    serializer = BigEndianAscendingWordSerializer(word_length, word_count, byte_padding)
    for word in expected_words:
        serializer.write_word(word)

    deserializer = BigEndianAscendingWordDeserializer(
        word_length, byte_padding, serializer.get_bytes()
    )
    assert deserializer.total_word_count() == word_count

    for word in expected_words:
        assert deserializer.read_word() == word
127 | 


--------------------------------------------------------------------------------
/tests/test_big_endian_ascending_word_serializer.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python
  2 | # -*- coding: utf-8 -*-
  3 | 
  4 | """Unit tests for BigEndianAscendingWordSerializer """
  5 | 
  6 | from python_hll.serialization import BigEndianAscendingWordSerializer
  7 | 
  8 | 
  9 | def test_constructor_error():
 10 |     """
 11 |     Test for contructors
 12 |     """
 13 | 
 14 |     # Word length is too short
 15 |     try:
 16 |         BigEndianAscendingWordSerializer(0, 1, 0)
 17 |         assert False, "Should complain about too-short words."
 18 |     except ValueError as e:
 19 |         assert 'Word length must be >= 1 and <= 64. (was: 0)' == str(e)
 20 | 
 21 |     # Word length is too long
 22 |     try:
 23 |         BigEndianAscendingWordSerializer(65, 1, 0)
 24 |         assert False, "Should complain about too-long words."
 25 |     except ValueError as e:
 26 |         assert "Word length must be" in str(e)
 27 | 
 28 |     # Word Count is negative
 29 |     try:
 30 |         BigEndianAscendingWordSerializer(5, -1, 0)
 31 |         assert False, "Should complain about negative word count."
 32 |     except ValueError as e:
 33 |         assert "Word count must be" in str(e)
 34 | 
 35 |     # Byte padding is negative
 36 |     try:
 37 |         BigEndianAscendingWordSerializer(5, 1, -1)
 38 |         assert False, "Should complain about negative byte padding."
 39 |     except ValueError as e:
 40 |         assert "Byte padding must be" in str(e)
 41 | 
 42 | 
 43 | def test_early_get_bytes():
 44 |     """
 45 |     Tests runtime exception thrown at premature call
 46 |     """
 47 | 
 48 |     serializer = BigEndianAscendingWordSerializer(5, 1, 0)
 49 |     try:
 50 |         serializer.get_bytes()
 51 |         assert False, "Should throw."
 52 |     except ValueError as r:
 53 |         assert "Not all words" in str(r)
 54 | 
 55 | 
 56 | def test_smoke_explicit_params():
 57 |     """
 58 |     Smoke test for typical parameters
 59 |     """
 60 |     short_word_length = 64
 61 | 
 62 |     # Should work on empty sequence with no padding
 63 |     serializer = BigEndianAscendingWordSerializer(short_word_length, 0, 0)
 64 |     assert serializer.get_bytes() == []
 65 | 
 66 |     # Should work on byte-divisible sequence with no padding
 67 |     serializer = BigEndianAscendingWordSerializer(short_word_length, 2, 0)
 68 |     serializer.write_word(-4995993186629670228)  # 0xBAAAAAAAAAAAAAACL
 69 |     serializer.write_word(-8070450532247928847)  # 0x8FFFFFFFFFFFFFF1L
 70 | 
 71 |     # Bytes:
 72 |     #   ======
 73 |     #   0xBA 0xAA 0xAA 0xAA 0xAA 0xAA 0xAA 0xAC
 74 |     #   0x8F 0xFF 0xFF 0xFF 0xFF 0xFF 0xFF 0xF1
 75 |     #  -70 -86 ...                        -84
 76 |     #  -113 -1 ...                        -15
 77 | 
 78 |     all_bytes = serializer.get_bytes()
 79 |     expected_bytes = [-70, -86, -86, -86, -86, -86, -86, -84, -113, -1, -1, -1, -1, -1, -1, -15]
 80 |     assert all_bytes == expected_bytes
 81 | 
 82 |     # Should pad the array correctly.
 83 |     serializer = BigEndianAscendingWordSerializer(short_word_length, 1, 1)
 84 |     serializer.write_word(1)
 85 |     all_bytes = serializer.get_bytes()
 86 |     expected_bytes = [0, 0, 0, 0, 0, 0, 0, 0, 1]
 87 |     assert all_bytes == expected_bytes
 88 | 
 89 | 
 90 | def test_smoke_probabilistic_params():
 91 |     """
 92 |     Smoke Test for typical parameters used in practice.
 93 |     """
 94 |     short_word_length = 5
 95 | 
 96 |     # Should work on an empty sequence with no padding.
 97 |     serializer = BigEndianAscendingWordSerializer(short_word_length, 0, 0)
 98 |     assert serializer.get_bytes() == []
 99 | 
100 |     # Should work on a non-byte-divisible sequence with no padding.
101 |     serializer = BigEndianAscendingWordSerializer(short_word_length, 3, 0)
102 |     serializer.write_word(9)
103 |     serializer.write_word(31)
104 |     serializer.write_word(1)
105 | 
106 |     # The values:
107 |     # -----------
108 |     # 9     |31    |1     |padding
109 | 
110 |     # Corresponding bits:
111 |     # ------------------
112 |     # 0100 1|111 11|00 001|0
113 | 
114 |     # And the hex/decimal (Are python bytes signed????????):
115 |     # -----------------------------------------------------
116 |     # 0100 1111 -> 0x4F -> 79
117 |     # 1100 0010 -> 0xC2 -> -62
118 | 
119 |     all_bytes = serializer.get_bytes()
120 |     expected_bytes = [79, -62]
121 |     assert all_bytes == expected_bytes
122 | 
123 |     # Should work on a byte-divisible sequence with no padding
124 |     serializer = BigEndianAscendingWordSerializer(short_word_length, 8, 0)
125 | 
126 |     for i in range(1, 9):
127 |         serializer.write_word(i)
128 | 
129 |     # Values: 1-8
130 |     # Corresponding bits:
131 |     # ------------------
132 |     # 00001
133 |     # 00010
134 |     # 00011
135 |     # 00100
136 |     # 00101
137 |     # 00110
138 |     # 00111
139 |     # 01000
140 | 
141 |     # And the hex:
142 |     # ------------
143 |     # 0000 1000 => 0x08 => 8
144 |     # 1000 0110 => 0x86 => -122
145 |     # 0100 0010 => 0x62 => 66
146 |     # 1001 1000 => 0x98 => -104
147 |     # 1110 1000 => 0xE8 => -24
148 | 
149 |     all_bytes = serializer.get_bytes()
150 |     expected_bytes = [8, -122, 66, -104, -24]
151 |     assert all_bytes == expected_bytes
152 | 
153 |     # Should pad the array correctly
154 |     serializer = BigEndianAscendingWordSerializer(short_word_length, 1, 1)
155 |     serializer.write_word(1)
156 | 
157 |     # 1 byte leading padding | value 1 | trailing padding
158 |     # 0000 0000 | 0000 1|000
159 |     all_bytes = serializer.get_bytes()
160 |     expected_bytes = [0, 8]
161 |     assert all_bytes == expected_bytes
162 | 
163 | 
def test_smoke_sparse_params():
    """
    Smoke test for 17-bit words, each of which spans more than two bytes.
    """
    word_length = 17

    # No words and no padding: nothing is emitted.
    empty_serializer = BigEndianAscendingWordSerializer(word_length, 0, 0)
    assert empty_serializer.get_bytes() == []

    # Three words (51 bits) and no padding: not a whole number of bytes.
    serializer = BigEndianAscendingWordSerializer(word_length, 3, 0)
    for word in (9, 42, 75):
        serializer.write_word(word)

    # Words:  9                    |42                   |75                   |padding
    # Bits:   0000 0000 0000 0100 1|000 0000 0000 1010 10|00 0000 0000 1001 011|0 0000
    #
    # Regrouped into bytes (compared as signed values):
    #   0000 0000 -> 0x00 -> 0
    #   0000 0100 -> 0x04 -> 4
    #   1000 0000 -> 0x80 -> -128
    #   0000 1010 -> 0x0A -> 10
    #   1000 0000 -> 0x80 -> -128
    #   0000 1001 -> 0x09 -> 9
    #   0110 0000 -> 0x60 -> 96
    assert serializer.get_bytes() == [0, 4, -128, 10, -128, 9, 96]

    # Eight words (136 bits) and no padding: a whole number of bytes.
    serializer = BigEndianAscendingWordSerializer(word_length, 8, 0)
    for word in range(1, 9):
        serializer.write_word(word)

    # Words 1-8 as consecutive 17-bit groups:
    #   0000 0000 0000 0000 1
    #   000 0000 0000 0000 10
    #   00 0000 0000 0000 011
    #   0 0000 0000 0000 0100
    #
    #   0000 0000 0000 0010 1
    #   000 0000 0000 0001 10
    #   00 0000 0000 0000 111
    #   0 0000 0000 0000 1000
    #
    # Regrouped into bytes (compared as signed values):
    #   0000 0000 -> 0x00 -> 0
    #   0000 0000 -> 0x00 -> 0
    #   1000 0000 -> 0x80 -> -128
    #   0000 0000 -> 0x00 -> 0
    #   1000 0000 -> 0x80 -> -128
    #   0000 0000 -> 0x00 -> 0
    #   0110 0000 -> 0x60 -> 96
    #   0000 0000 -> 0x00 -> 0
    #   0100 0000 -> 0x40 -> 64
    #   0000 0000 -> 0x00 -> 0
    #   0010 1000 -> 0x28 -> 40
    #   0000 0000 -> 0x00 -> 0
    #   0001 1000 -> 0x18 -> 24
    #   0000 0000 -> 0x00 -> 0
    #   0000 1110 -> 0x0E -> 14
    #   0000 0000 -> 0x00 -> 0
    #   0000 1000 -> 0x08 -> 8
    assert serializer.get_bytes() == [
        0, 0, -128, 0, -128, 0, 96, 0, 64, 0, 40, 0, 24, 0, 14, 0, 8,
    ]

    # One byte of padding comes first, then the 17-bit word in three bytes.
    padded_serializer = BigEndianAscendingWordSerializer(word_length, 1, 1)
    padded_serializer.write_word(1)
    assert padded_serializer.get_bytes() == [0, 0, 0, -128]
251 | 


--------------------------------------------------------------------------------
/tests/test_bit_util.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python
  2 | # -*- coding: utf-8 -*-
  3 | 
  4 | from python_hll.util import BitUtil
  5 | 
  6 | UNSIGNED_TO_SIGNED_INTEGERS = {
  7 |     0: 0,
  8 |     1: 1,
  9 |     2: 2,
 10 |     3: 3,
 11 |     4: 4,
 12 |     5: 5,
 13 |     6: 6,
 14 |     7: 7,
 15 |     8: 8,
 16 |     9: 9,
 17 |     10: 10,
 18 |     11: 11,
 19 |     12: 12,
 20 |     13: 13,
 21 |     14: 14,
 22 |     15: 15,
 23 |     16: 16,
 24 |     17: 17,
 25 |     18: 18,
 26 |     19: 19,
 27 |     20: 20,
 28 |     21: 21,
 29 |     22: 22,
 30 |     23: 23,
 31 |     24: 24,
 32 |     25: 25,
 33 |     26: 26,
 34 |     27: 27,
 35 |     28: 28,
 36 |     29: 29,
 37 |     30: 30,
 38 |     31: 31,
 39 |     32: 32,
 40 |     33: 33,
 41 |     34: 34,
 42 |     35: 35,
 43 |     36: 36,
 44 |     37: 37,
 45 |     38: 38,
 46 |     39: 39,
 47 |     40: 40,
 48 |     41: 41,
 49 |     42: 42,
 50 |     43: 43,
 51 |     44: 44,
 52 |     45: 45,
 53 |     46: 46,
 54 |     47: 47,
 55 |     48: 48,
 56 |     49: 49,
 57 |     50: 50,
 58 |     51: 51,
 59 |     52: 52,
 60 |     53: 53,
 61 |     54: 54,
 62 |     55: 55,
 63 |     56: 56,
 64 |     57: 57,
 65 |     58: 58,
 66 |     59: 59,
 67 |     60: 60,
 68 |     61: 61,
 69 |     62: 62,
 70 |     63: 63,
 71 |     64: 64,
 72 |     65: 65,
 73 |     66: 66,
 74 |     67: 67,
 75 |     68: 68,
 76 |     69: 69,
 77 |     70: 70,
 78 |     71: 71,
 79 |     72: 72,
 80 |     73: 73,
 81 |     74: 74,
 82 |     75: 75,
 83 |     76: 76,
 84 |     77: 77,
 85 |     78: 78,
 86 |     79: 79,
 87 |     80: 80,
 88 |     81: 81,
 89 |     82: 82,
 90 |     83: 83,
 91 |     84: 84,
 92 |     85: 85,
 93 |     86: 86,
 94 |     87: 87,
 95 |     88: 88,
 96 |     89: 89,
 97 |     90: 90,
 98 |     91: 91,
 99 |     92: 92,
100 |     93: 93,
101 |     94: 94,
102 |     95: 95,
103 |     96: 96,
104 |     97: 97,
105 |     98: 98,
106 |     99: 99,
107 |     100: 100,
108 |     101: 101,
109 |     102: 102,
110 |     103: 103,
111 |     104: 104,
112 |     105: 105,
113 |     106: 106,
114 |     107: 107,
115 |     108: 108,
116 |     109: 109,
117 |     110: 110,
118 |     111: 111,
119 |     112: 112,
120 |     113: 113,
121 |     114: 114,
122 |     115: 115,
123 |     116: 116,
124 |     117: 117,
125 |     118: 118,
126 |     119: 119,
127 |     120: 120,
128 |     121: 121,
129 |     122: 122,
130 |     123: 123,
131 |     124: 124,
132 |     125: 125,
133 |     126: 126,
134 |     127: 127,
135 |     128: -128,
136 |     129: -127,
137 |     130: -126,
138 |     131: -125,
139 |     132: -124,
140 |     133: -123,
141 |     134: -122,
142 |     135: -121,
143 |     136: -120,
144 |     137: -119,
145 |     138: -118,
146 |     139: -117,
147 |     140: -116,
148 |     141: -115,
149 |     142: -114,
150 |     143: -113,
151 |     144: -112,
152 |     145: -111,
153 |     146: -110,
154 |     147: -109,
155 |     148: -108,
156 |     149: -107,
157 |     150: -106,
158 |     151: -105,
159 |     152: -104,
160 |     153: -103,
161 |     154: -102,
162 |     155: -101,
163 |     156: -100,
164 |     157: -99,
165 |     158: -98,
166 |     159: -97,
167 |     160: -96,
168 |     161: -95,
169 |     162: -94,
170 |     163: -93,
171 |     164: -92,
172 |     165: -91,
173 |     166: -90,
174 |     167: -89,
175 |     168: -88,
176 |     169: -87,
177 |     170: -86,
178 |     171: -85,
179 |     172: -84,
180 |     173: -83,
181 |     174: -82,
182 |     175: -81,
183 |     176: -80,
184 |     177: -79,
185 |     178: -78,
186 |     179: -77,
187 |     180: -76,
188 |     181: -75,
189 |     182: -74,
190 |     183: -73,
191 |     184: -72,
192 |     185: -71,
193 |     186: -70,
194 |     187: -69,
195 |     188: -68,
196 |     189: -67,
197 |     190: -66,
198 |     191: -65,
199 |     192: -64,
200 |     193: -63,
201 |     194: -62,
202 |     195: -61,
203 |     196: -60,
204 |     197: -59,
205 |     198: -58,
206 |     199: -57,
207 |     200: -56,
208 |     201: -55,
209 |     202: -54,
210 |     203: -53,
211 |     204: -52,
212 |     205: -51,
213 |     206: -50,
214 |     207: -49,
215 |     208: -48,
216 |     209: -47,
217 |     210: -46,
218 |     211: -45,
219 |     212: -44,
220 |     213: -43,
221 |     214: -42,
222 |     215: -41,
223 |     216: -40,
224 |     217: -39,
225 |     218: -38,
226 |     219: -37,
227 |     220: -36,
228 |     221: -35,
229 |     222: -34,
230 |     223: -33,
231 |     224: -32,
232 |     225: -31,
233 |     226: -30,
234 |     227: -29,
235 |     228: -28,
236 |     229: -27,
237 |     230: -26,
238 |     231: -25,
239 |     232: -24,
240 |     233: -23,
241 |     234: -22,
242 |     235: -21,
243 |     236: -20,
244 |     237: -19,
245 |     238: -18,
246 |     239: -17,
247 |     240: -16,
248 |     241: -15,
249 |     242: -14,
250 |     243: -13,
251 |     244: -12,
252 |     245: -11,
253 |     246: -10,
254 |     247: -9,
255 |     248: -8,
256 |     249: -7,
257 |     250: -6,
258 |     251: -5,
259 |     252: -4,
260 |     253: -3,
261 |     254: -2,
262 |     255: -1,
263 | }
264 | 
265 | 
def test_to_signed_byte():
    """
    Checks ``BitUtil.to_signed_byte()`` against every entry of
    ``UNSIGNED_TO_SIGNED_INTEGERS``.
    """
    for unsigned_value, expected_signed in UNSIGNED_TO_SIGNED_INTEGERS.items():
        actual_signed = BitUtil.to_signed_byte(unsigned_value)
        assert expected_signed == actual_signed
269 | 
270 | 
def test_unsigned_right_shift_int():
    """
    Shifting -100 right by one bit must give 2147483598, which equals
    ``(2**32 - 100) >> 1``.
    """
    shifted = BitUtil.unsigned_right_shift_int(-100, 1)
    assert shifted == 2147483598
273 | 
274 | 
def test_unsigned_right_shift_int2():
    """
    A shift by zero bits must hand -1 back unchanged.
    """
    shifted = BitUtil.unsigned_right_shift_int(-1, 0)
    assert shifted == -1
277 | 
278 | 
def test_unsigned_right_shift_byte():
    """
    Shifting the byte -100 right by one bit must give 2147483598, the same
    result as the int variant (presumably because the byte is widened to 32
    bits before shifting; confirm against ``BitUtil``).
    """
    shifted = BitUtil.unsigned_right_shift_byte(-100, 1)
    assert shifted == 2147483598
281 | 
282 | 
def test_unsigned_right_shift_byte2():
    """
    A shift by zero bits must hand the byte -1 back unchanged.
    """
    shifted = BitUtil.unsigned_right_shift_byte(-1, 0)
    assert shifted == -1
285 | 
286 | 
def test_unsigned_right_shift_long():
    """
    Shifting -100 right by one bit must give 9223372036854775758, which
    equals ``(2**64 - 100) >> 1``.
    """
    shifted = BitUtil.unsigned_right_shift_long(-100, 1)
    assert shifted == 9223372036854775758
289 | 
290 | 
def test_unsigned_right_shift_long2():
    """
    A shift by zero bits must hand -1 back unchanged.
    """
    shifted = BitUtil.unsigned_right_shift_long(-1, 0)
    assert shifted == -1
293 | 
294 | 
def test_left_shift_long_1():
    """
    Shifting ``2**56 - 1`` left by 8 bits must give -256: the mathematical
    result ``2**64 - 256`` read as a signed 64-bit value.
    """
    shifted = BitUtil.left_shift_long(72057594037927935, 8)
    assert shifted == -256
297 | 
298 | 
def test_left_shift_long_2():
    """
    Shifting 214748364 left by 8 bits must give 54975581184
    (``214748364 * 256``).
    """
    shifted = BitUtil.left_shift_long(214748364, 8)
    assert shifted == 54975581184
301 | 
302 | 
def test_left_shift_long_3():
    """
    Shifting 128 left by 3 bits must give 1024.
    """
    shifted = BitUtil.left_shift_long(128, 3)
    assert shifted == 1024
305 | 
306 | 
def test_left_shift_int():
    """
    Shifting the int 128 left by 3 bits must give 1024.
    """
    shifted = BitUtil.left_shift_int(128, 3)
    assert shifted == 1024
309 | 
310 | 
def test_left_shift_byte():
    """
    Shifting the byte 128 left by 3 bits must give -1024 (presumably 128 is
    first read as the signed byte -128; confirm against ``BitUtil``).
    """
    shifted = BitUtil.left_shift_byte(128, 3)
    assert shifted == -1024
313 | 


--------------------------------------------------------------------------------
/tests/test_bit_vector.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | # -*- coding: utf-8 -*-
 3 | 
 4 | from python_hll.util import BitVector
 5 | 
 6 | """Unit tests for BitVector."""
 7 | 
 8 | 
 9 | def test_get_set_register():
10 |     """
11 |     Tests ``BitVector.get_register()`` and ``BitVector.set_register()``.
12 |     """
13 |     # NOTE: registers are only 5bits wide
14 |     vector1 = BitVector(5, 2**7)  # width=5, count=2^7
15 |     vector2 = BitVector(5, 2**7)
16 |     vector3 = BitVector(5, 2**7)
17 |     vector4 = BitVector(5, 2**7)
18 |     for i in range(0, 2**7):
19 |         vector1.set_register(i, 0x1F)
20 |         vector2.set_register(i, (i & 0x1F))
21 |         vector3.set_register(i, ((127 - i) & 0x1F))
22 |         vector4.set_register(i, 0x15)
23 | 
24 |     for i in range(0, 2 ** 7):
25 |         assert vector1.get_register(i) == 0x1F
26 |         assert vector2.get_register(i) == i & 0x1F
27 |         assert vector3.get_register(i) == (127 - i) & 0x1F
28 |         assert vector4.get_register(i) == 0x15
29 | 


--------------------------------------------------------------------------------
/tests/test_explicit_hll.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python
  2 | # -*- coding: utf-8 -*-
  3 | 
  4 | import random
  5 | 
  6 | from python_hll.hlltype import HLLType
  7 | from python_hll.hll import HLL
  8 | from python_hll.serialization import SerializationUtil
  9 | 
"""Unit tests for ``HLL`` of type ``HLLType.EXPLICIT``."""
 11 | 
 12 | 
 13 | def test_add_basic():
 14 |     """
 15 |     Tests basic set semantics of ``HLL.add_raw()``.
 16 |     """
 17 |     # Adding a single positive value to an empty set should work.
 18 |     hll = new_hll(128)  # arbitrary
 19 |     hll.add_raw(1)  # positive
 20 |     assert hll.cardinality() == 1
 21 | 
 22 |     # Adding a single negative value to an empty set should work.
 23 |     hll = new_hll(128)  # arbitrary
 24 |     hll.add_raw(-1)  # negative
 25 |     assert hll.cardinality() == 1
 26 | 
 27 |     # Adding a duplicate value to a set should be a no-op.
 28 |     hll = new_hll(128)  # arbitrary
 29 |     hll.add_raw(1)  # positive
 30 |     hll.add_raw(1)  # dupe
 31 |     assert hll.cardinality() == 1
 32 | 
 33 | 
 34 | def test_union():
 35 |     """
 36 |     Tests ``HLL.union()``.
 37 |     """
 38 |     # Unioning two distinct sets should work
 39 |     hll_a = new_hll(128)  # arbitrary
 40 |     hll_b = new_hll(128)  # arbitrary
 41 |     hll_a.add_raw(1)
 42 |     hll_a.add_raw(2)
 43 |     hll_b.add_raw(3)
 44 | 
 45 |     hll_a.union(hll_b)
 46 |     assert hll_a.cardinality() == 3
 47 | 
 48 |     # Unioning two sets whose union doesn't exceed the cardinality cap should not promote
 49 |     hll_a = new_hll(128)  # arbitrary
 50 |     hll_b = new_hll(128)  # arbitrary
 51 |     hll_a.add_raw(1)
 52 |     hll_a.add_raw(2)
 53 |     hll_b.add_raw(1)
 54 | 
 55 |     hll_a.union(hll_b)
 56 |     assert hll_a.cardinality() == 2
 57 |     assert hll_a.get_type() == HLLType.EXPLICIT
 58 | 
 59 |     # Unioning two sets whose union exceeds the cardinality cap should promote
 60 |     hll_a = new_hll(128)  # arbitrary
 61 |     hll_b = new_hll(128)  # arbitrary
 62 |     for i in range(0, 128):
 63 |         hll_a.add_raw(i)
 64 |         hll_b.add_raw(i+128)
 65 | 
 66 |     hll_a.union(hll_b)
 67 |     assert hll_a.get_type() == HLLType.SPARSE
 68 | 
 69 | 
 70 | def test_clear():
 71 |     """
 72 |     Tests ``HLL.clear()``
 73 |     """
 74 |     hll = new_hll(128)  # arbitrary
 75 |     hll.add_raw(1)
 76 |     hll.clear()
 77 |     assert hll.cardinality() == 0
 78 | 
 79 | 
 80 | def test_to_from_bytes():
 81 |     """
 82 |     Tests ``HLL.to_bytes() and ``HLL.from_bytes().
 83 |     """
 84 |     schema_version = SerializationUtil.DEFAULT_SCHEMA_VERSION
 85 |     type = HLLType.EXPLICIT
 86 |     padding = schema_version.padding_bytes(type)
 87 |     bytes_per_word = 8
 88 | 
 89 |     # Should work on an empty set
 90 |     hll = new_hll(128)
 91 |     bytes = hll.to_bytes(schema_version)
 92 |     assert len(bytes) == padding  # no elements, just padding
 93 | 
 94 |     in_hll = HLL.from_bytes(bytes)
 95 |     assert_elements_equal(hll, in_hll)
 96 | 
 97 |     # Should work on a partially filled set
 98 |     hll = new_hll(128)
 99 |     for i in range(0, 3):
100 |         hll.add_raw(i)
101 | 
102 |     bytes = hll.to_bytes(schema_version)
103 |     assert len(bytes) == padding + bytes_per_word * 3
104 | 
105 |     in_hll = HLL.from_bytes(bytes)
106 |     assert_elements_equal(hll, in_hll)
107 | 
108 |     # Should work on a full set
109 |     explicit_threshold = 128
110 |     hll = new_hll(explicit_threshold)
111 | 
112 |     for i in range(0, explicit_threshold):
113 |         hll.add_raw(27 + i)
114 | 
115 |     bytes = hll.to_bytes(schema_version)
116 |     assert len(bytes) == padding + bytes_per_word * explicit_threshold
117 | 
118 |     in_hll = HLL.from_bytes(bytes)
119 |     assert_elements_equal(hll, in_hll)
120 | 
121 | 
def test_random_values():
    """
    Tests correctness against ``set()``: after adding the same random values
    to both, the HLL cardinality must equal the size of the set.
    """
    explicit_threshold = 4096
    max_java_long = 9223372036854775807

    reference = set()
    hll = new_hll(explicit_threshold)

    random.seed(1)  # constant so results are reproducible
    for _ in range(explicit_threshold):
        value = random.randint(1, max_java_long)
        reference.add(value)
        hll.add_raw(value)

    assert hll.cardinality() == len(reference)
139 | 
140 | 
def test_promotion():
    """
    Tests promotion to ``HLLType.SPARSE`` and ``HLLType.FULL`` once more
    values than the explicit threshold have been added.
    """
    # One value past the threshold of the testing HLL promotes to SPARSE.
    explicit_threshold = 128
    to_sparse = HLL.create_for_testing(11, 5, explicit_threshold, 256, HLLType.EXPLICIT)
    for value in range(explicit_threshold + 1):
        to_sparse.add_raw(value)
    assert to_sparse.get_type() == HLLType.SPARSE

    # expthresh=4 => explicit_threshold=8, so the ninth value promotes to FULL.
    to_full = HLL(11, 5, 4, False, HLLType.EXPLICIT)
    for value in range(9):
        to_full.add_raw(value)
    assert to_full.get_type() == HLLType.FULL
155 | 
156 | 
157 | # ------------------------------------------------------------
158 | # assertion helpers
159 | 
160 | 
def assert_elements_equal(hll_a, hll_b):
    """
    Asserts that both HLLs hold exactly the same explicit storage.
    """
    storage_a = hll_a._explicit_storage
    storage_b = hll_b._explicit_storage
    assert storage_a == storage_b
166 | 
167 | 
def new_hll(explicit_threshold):
    """
    Creates an empty ``HLLType.EXPLICIT`` ``HLL`` through
    ``HLL.create_for_testing`` with log2m=11, regwidth=5 and the specified
    explicit threshold.

    :param explicit_threshold: explicit threshold to use for the constructed
           ``HLL``. This must be greater than zero.
    :type explicit_threshold: int
    :returns: A default-sized ``HLLType.EXPLICIT`` empty ``HLL`` instance. This
              will never be ``None``.
    :rtype: HLL
    """
    log2m, regwidth = 11, 5
    return HLL.create_for_testing(log2m, regwidth, explicit_threshold, 256, HLLType.EXPLICIT)
181 | 


--------------------------------------------------------------------------------
/tests/test_full_hll.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python
  2 | # -*- coding: utf-8 -*-
  3 | from __future__ import division
  4 | import pytest
  5 | from math import ceil, log
  6 | from python_hll.hlltype import HLLType
  7 | from python_hll.hll import HLL
  8 | from python_hll.hllutil import HLLUtil
  9 | from python_hll.serialization import SerializationUtil
 10 | from python_hll.util import BitUtil
 11 | import probabilistic_test_util
 12 | 
 13 | """Tests ``HLL`` of type ``HLLType.FULL``."""
 14 | 
 15 | 
 16 | def test_small_range_smoke():
 17 |     """
 18 |     Smoke test for HLL.cardinality() and the proper use of the
 19 |     small range correction.
 20 |     """
 21 |     log2m = 11
 22 |     m = BitUtil.left_shift_int(1, log2m)
 23 |     regwidth = 5
 24 | 
 25 |     # only one register set
 26 |     hll = HLL.create_for_testing(log2m, regwidth, 128, 256, HLLType.FULL)
 27 |     hll.add_raw(probabilistic_test_util.construct_hll_value(log2m, 0, 1))
 28 |     cardinality = hll.cardinality()
 29 | 
 30 |     # Trivially true that small correction conditions hold: one register
 31 |     # set implies zeroes exist, and estimator trivially smaller than 5m/2.
 32 |     # Small range correction: m * log(m/V)
 33 |     expected = ceil(m * log(m / (m - 1)))  # # of zeroes
 34 |     assert cardinality == expected
 35 | 
 36 |     # all but one register set
 37 |     hll = HLL.create_for_testing(log2m, regwidth, 128, 256, HLLType.FULL)
 38 |     for i in range(0, m - 1):
 39 |         hll.add_raw(probabilistic_test_util.construct_hll_value(log2m, i, 1))
 40 | 
 41 |     # Trivially true that small correction conditions hold: all but
 42 |     # one register set implies a zero exists, and estimator trivially
 43 |     # smaller than 5m/2 since it's alpha / ((m-1)/2)
 44 |     cardinality = hll.cardinality()
 45 | 
 46 |     # Small range correction: m * log(m/V)
 47 |     expected = ceil(m * log(m / 1))  # # of zeroes
 48 |     assert cardinality == expected
 49 | 
 50 | 
 51 | def test_normal_range_smoke():
 52 |     """
 53 |     Smoke test for ``HLL.cardinality()`` and the proper use of the
 54 |     uncorrected estimator.
 55 |     """
 56 |     log2m = 11
 57 |     regwidth = 5
 58 | 
 59 |     # regwidth = 5, so hash space is
 60 |     # log2m + (2^5 - 1 - 1), so L = log2m + 30
 61 |     L = log2m + 30
 62 |     m = BitUtil.left_shift_int(1, log2m)
 63 |     hll = HLL.create_for_testing(log2m, regwidth, 128, 256, HLLType.FULL)
 64 | 
 65 |     # all registers at 'medium' value
 66 |     register_value = 7  # chosen to ensure neither correction kicks in
 67 |     for i in range(0, m):
 68 |         hll.add_raw(probabilistic_test_util.construct_hll_value(log2m, i, register_value))
 69 | 
 70 |     cardinality = hll.cardinality()
 71 | 
 72 |     # Simplified estimator when all registers take same value: alpha / (m/2^val)
 73 |     estimator = HLLUtil.alpha_m_squared(m) / (m / (2**register_value))
 74 | 
 75 |     assert estimator <= (2**L)/30
 76 |     assert estimator > (5 * m / 2)
 77 | 
 78 |     expected = ceil(estimator)
 79 |     assert cardinality == expected
 80 | 
 81 | 
 82 | def test_large_range_smoke():
 83 |     """
 84 |     Smoke test for ``HLL.cardinality()`` and the proper use of the large
 85 |     range correction.
 86 |     """
 87 |     log2m = 12
 88 |     regwidth = 5
 89 |     # regwidth = 5, so hash space is
 90 |     # log2m + (2^5 - 1 - 1), so L = log2m + 30
 91 |     L = log2m + 30
 92 |     m = BitUtil.left_shift_int(1, log2m)
 93 |     hll = HLL.create_for_testing(log2m, regwidth, 128, 256, HLLType.FULL)
 94 | 
 95 |     register_value = 31  # chosen to ensure large correction kicks in
 96 |     for i in range(0, m):
 97 |         hll.add_raw(probabilistic_test_util.construct_hll_value(log2m, i, register_value))
 98 | 
 99 |     cardinality = hll.cardinality()
100 | 
101 |     # Simplified estimator when all registers take same value: alpha / (m/2^val)
102 |     estimator = HLLUtil.alpha_m_squared(m) / (m / (2**register_value))
103 | 
104 |     # Assert conditions for large range
105 | 
106 |     assert estimator > (2**L) / 30
107 | 
108 |     # Large range correction: -2^L * log(1 - E/2^L)
109 |     try:
110 |         expected = ceil(-1.0 * (2 ** L) * log(1.0 - estimator / (2 ** L)))
111 |     except ValueError:
112 |         expected = 0
113 |     assert cardinality == expected
114 | 
115 | 
def test_register_value():
    """
    Tests the bounds on a register's value for a given raw input value.

    Each case is ``(raw value, register index 'j', expected register value)``.
    With ``log2m = 4`` the lowest hex digit of every raw value below is the
    register index, which keeps the cases easy to read.
    """
    log2m = 4

    def check_cases(regwidth, cases):
        # Fresh FULL HLL per register width; values are added one at a time
        # and the targeted register is inspected right after each add.
        hll = HLL.create_for_testing(log2m, regwidth, 128, 256, HLLType.FULL)
        bit_vector = hll._probabilistic_storage
        for raw_value, register_index, expected_value in cases:
            hll.add_raw(raw_value)
            assert bit_vector.get_register(register_index) == expected_value

    # ------------------------------------------------------------
    # register width 4 (the minimum size)
    check_cases(4, (
        # lower-bounds of the register
        (0x0000000000000001, 1, 0),
        (0x0000000000000012, 2, 1),
        (0x0000000000000023, 3, 2),
        (0x0000000000000044, 4, 3),
        (0x0000000000000085, 5, 4),
        # upper-bounds of the register
        # NOTE:  bear in mind that BitVector itself does ensure that
        #        overflow of a register is prevented
        (0x0000000000010006, 6, 13),
        (0x0000000000020007, 7, 14),
        (0x0000000000040008, 8, 15),
        (0x0000000000080009, 9, 15),  # overflow
        # sanity checks: no bits above the lowest-set bit matter
        # (both give the same result as case 'j'=6 above)
        (0x000000000003000A, 10, 13),
        (0x000000000011000B, 11, 13),
    ))

    # ------------------------------------------------------------
    # register width 5
    check_cases(5, (
        # lower-bounds of the register
        (0x0000000000000001, 1, 0),
        (0x0000000000000012, 2, 1),
        (0x0000000000000023, 3, 2),
        (0x0000000000000044, 4, 3),
        (0x0000000000000085, 5, 4),
        # upper-bounds of the register
        # NOTE:  bear in mind that BitVector itself does ensure that
        #        overflow of a register is prevented
        (0x0000000100000006, 6, 29),
        (0x0000000200000007, 7, 30),
        (0x0000000400000008, 8, 31),
        (0x0000000800000009, 9, 31),  # overflow
    ))
204 | 
205 | 
def test_clear():
    """
    Tests ``HLL.clear()``: every register is given a value, the HLL is
    cleared, and every register is then expected to read zero.
    """
    log2m = 4  # 16 registers per counter
    regwidth = 5
    register_count = BitUtil.left_shift_int(1, log2m)

    hll = HLL.create_for_testing(log2m, regwidth, 128, 256, HLLType.FULL)
    registers = hll._probabilistic_storage
    for index in range(register_count):
        registers.set_register(index, index)

    hll.clear()

    for index in range(register_count):
        # zero is the default value of a register
        assert registers.get_register(index) == 0
222 | 
223 | 
224 | # ------------------------------------------------------------
225 | # Serialization
226 | 
227 | 
def test_to_from_bytes():
    """
    Round-trips ``HLLType.FULL`` instances through ``HLL.to_bytes()`` and
    ``HLL.from_bytes()`` and checks both the serialized length and the
    register values of the deserialized copy.
    """
    log2m = 11  # arbitrary
    regwidth = 5
    m = BitUtil.left_shift_int(1, log2m)  # aka 2^log2m

    schema_version = SerializationUtil.DEFAULT_SCHEMA_VERSION
    padding = schema_version.padding_bytes(HLLType.FULL)
    data_byte_count = probabilistic_test_util.get_required_bytes(regwidth, m)
    expected_byte_count = padding + data_byte_count

    def assert_round_trip(original):
        serialized = original.to_bytes(schema_version)
        # output length is correct
        assert len(serialized) == expected_byte_count
        # register values are correct
        assert_elements_equal(original, HLL.from_bytes(serialized))

    # ------------------------------------------------------------
    # Should work on an empty element
    empty = HLL.create_for_testing(log2m, regwidth, 128, 256, HLLType.FULL)
    assert_round_trip(empty)

    # ------------------------------------------------------------
    # Should work on a partially filled element
    partial = HLL.create_for_testing(log2m, regwidth, 128, 256, HLLType.FULL)
    for register_index in range(3):
        partial.add_raw(probabilistic_test_util.construct_hll_value(log2m, register_index, register_index + 9))
    assert_round_trip(partial)

    # ------------------------------------------------------------
    # Should work on a full set
    full = HLL.create_for_testing(log2m, regwidth, 128, 256, HLLType.FULL)
    for register_index in range(m):
        full.add_raw(probabilistic_test_util.construct_hll_value(log2m, register_index, (register_index % 9) + 1))
    assert_round_trip(full)
280 | 
281 | 
282 | # ------------------------------------------------------------
283 | # Assertion Helpers
284 | 
285 | 
def assert_elements_equal(hll_a, hll_b):
    """
    Asserts that the probabilistic storage of both HLLs yields exactly the
    same sequence of register values.

    The register iterators are advanced in lockstep. The check fails if any
    pair of values differs or if one iterator is exhausted before the other.
    Advancing both iterators separately is what catches a length difference
    of exactly one; a combined ``a.next() == b.next()`` inside a single
    ``try`` silently drops the extra element.

    :param hll_a: first HLL; its ``_probabilistic_storage`` is read.
    :param hll_b: second HLL; its ``_probabilistic_storage`` is read.
    """
    iter_a = hll_a._probabilistic_storage.register_iterator()
    iter_b = hll_b._probabilistic_storage.register_iterator()

    # Sentinel that cannot be confused with any register value.
    exhausted = object()

    def next_or_exhausted(register_iter):
        try:
            return register_iter.next()
        except StopIteration:
            return exhausted

    while True:
        value_a = next_or_exhausted(iter_a)
        value_b = next_or_exhausted(iter_b)
        a_done = value_a is exhausted
        b_done = value_b is exhausted
        if a_done and b_done:
            return
        if a_done or b_done:
            pytest.fail('register iterators have different lengths')
        assert value_a == value_b
310 | 


--------------------------------------------------------------------------------
/tests/test_hll_serialization.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | # -*- coding: utf-8 -*-
 3 | 
 4 | """Serialization smoke-tests."""
 5 | 
 6 | import random
 7 | import sys
 8 | from copy import deepcopy
 9 | from python_hll.hlltype import HLLType
10 | from python_hll.hll import HLL
11 | 
# A fixed random seed so that this test is reproducible; it is handed to
# ``random.seed()`` before the raw test values are drawn.
RANDOM_SEED = 1
14 | 
15 | 
def test_serialization_smoke(fastonly):
    """
    A smoke-test that covers serialization/deserialization of an HLL
    under all possible parameters, once per starting ``HLLType``.

    :param fastonly: forwarded to ``assert_cardinality()``, where a truthy
           value restricts the parameter sweep to the boundary values.
    """
    random.seed(RANDOM_SEED)
    max_java_long = 9223372036854775807
    randoms = []
    for _ in range(250):
        randoms.append(random.randint(1, max_java_long))

    for hll_type in (HLLType.EMPTY, HLLType.EXPLICIT, HLLType.SPARSE, HLLType.FULL):
        assert_cardinality(hll_type, randoms, fastonly)
29 | 
30 | 
def assert_cardinality(hll_type, items, fastonly):
    """
    Builds an ``HLL`` of the given starting type for every parameter
    combination, adds ``items`` to it and asserts that both a
    ``to_bytes()``/``from_bytes()`` round trip and a ``deepcopy`` give an
    HLL with the same cardinality, type and serialized bytes.

    :param hll_type: the ``HLLType`` each HLL is constructed with.
    :param items: raw values passed to ``HLL.add_raw()``.
    :param fastonly: when truthy only the minimum and maximum of each
           parameter are tried instead of the whole range.
    """
    # NOTE: log2m<=16 was chosen as the max log2m parameter so that the test
    #       completes in a reasonable amount of time. Not much is gained by
    #       testing larger values - there are no more known serialization
    #       related edge cases that appear as log2m gets even larger.
    if fastonly:
        log2m_values = (HLL.MINIMUM_LOG2M_PARAM, 16)
        regwidth_values = (HLL.MINIMUM_REGWIDTH_PARAM, HLL.MAXIMUM_REGWIDTH_PARAM)
        expthresh_values = (HLL.MINIMUM_EXPTHRESH_PARAM, HLL.MAXIMUM_EXPTHRESH_PARAM)
    else:
        log2m_values = range(HLL.MINIMUM_LOG2M_PARAM, 16 + 1)
        regwidth_values = range(HLL.MINIMUM_REGWIDTH_PARAM, HLL.MAXIMUM_REGWIDTH_PARAM + 1)
        expthresh_values = range(HLL.MINIMUM_EXPTHRESH_PARAM, HLL.MAXIMUM_EXPTHRESH_PARAM + 1)

    def assert_matches(duplicate, original):
        assert duplicate.cardinality() == original.cardinality()
        assert duplicate.get_type() == original.get_type()
        assert duplicate.to_bytes() == original.to_bytes()

    for log2m in log2m_values:
        for regwidth in regwidth_values:
            for expthresh in expthresh_values:
                for sparse in (True, False):
                    original = HLL(log2m, regwidth, expthresh, sparse, hll_type)
                    for item in items:
                        original.add_raw(item)

                    assert_matches(HLL.from_bytes(original.to_bytes()), original)
                    assert_matches(deepcopy(original), original)

                    # progress marker, one dot per parameter combination
                    sys.stdout.write('.')
                    sys.stdout.flush()
62 | 


--------------------------------------------------------------------------------
/tests/test_hll_util.py:
--------------------------------------------------------------------------------
 1 | """Tests ``HLLUtil`` static methods."""
 2 | 
 3 | from python_hll.hll import HLL
 4 | from python_hll.hllutil import HLLUtil
 5 | 
 6 | 
 7 | def test_large_estimator_cutoff():
 8 |     """
 9 |     Tests that ``HLLUtil.largeEstimatorCutoff()`` is the same
10 |     as a trivial implementation.
11 |     """
12 |     for log2m in range(HLL.MINIMUM_LOG2M_PARAM + 1, HLL.MAXIMUM_LOG2M_PARAM + 1):
13 |         for regWidth in range(HLL.MINIMUM_REGWIDTH_PARAM + 1, HLL.MINIMUM_REGWIDTH_PARAM + 1):
14 |             cutoff = HLLUtil.large_estimator_cutoff(log2m, regWidth)
15 |             """
16 |             See blog post (http://research.neustar.biz/2013/01/24/hyperloglog-googles-take-on-engineering-hll/)
17 |             and original paper (Fig. 3) for information on 2^L and
18 |             large range correction cutoff.
19 |             """
20 |             expected = (regWidth ** regWidth) - (2 + log2m) / 30.0
21 |             assert cutoff == expected
22 | 


--------------------------------------------------------------------------------
/tests/test_integration.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python
  2 | # -*- coding: utf-8 -*-
  3 | 
  4 | import csv
  5 | import pytest
  6 | import sys
  7 | from python_hll.util import NumberUtil
  8 | from python_hll.hll import HLL
  9 | from python_hll.hlltype import HLLType
 10 | import probabilistic_test_util
 11 | 
 12 | """
 13 | Compares the HLLs to the files in the data directory. See README.txt for
 14 | more information about this test data.
 15 | """
 16 | 
# Parameters that ``new_hll()`` passes, in this order, to
# ``HLL.create_for_testing()`` ahead of the HLL type.
LOG2M = 11
REGWIDTH = 5
EXPLICIT_THRESHOLD = 256
SPARSE_THRESHOLD = 850
 21 | 
 22 | 
def test_cumulative_add_cardinality_correction(fastonly):
    """Runs ``do_test_add()`` against ``cumulative_add_cardinality_correction.csv``."""
    do_test_add('cumulative_add_cardinality_correction.csv', fastonly)
 25 | 
 26 | 
def test_cumulative_add_comprehensive_promotion(fastonly):
    """Runs ``do_test_add()`` against ``cumulative_add_comprehensive_promotion.csv``."""
    do_test_add('cumulative_add_comprehensive_promotion.csv', fastonly)
 29 | 
 30 | 
def test_cumulative_add_sparse_edge(fastonly):
    """Runs ``do_test_add()`` against ``cumulative_add_sparse_edge.csv``."""
    do_test_add('cumulative_add_sparse_edge.csv', fastonly)
 33 | 
 34 | 
def test_cumulative_add_sparse_random(fastonly):
    """Runs ``do_test_add()`` against ``cumulative_add_sparse_random.csv``."""
    do_test_add('cumulative_add_sparse_random.csv', fastonly)
 37 | 
 38 | 
def test_cumulative_add_sparse_step(fastonly):
    """Runs ``do_test_add()`` against ``cumulative_add_sparse_step.csv``."""
    do_test_add('cumulative_add_sparse_step.csv', fastonly)
 41 | 
 42 | 
def test_cumulative_union_comprehensive(fastonly):
    """Runs ``do_test_union()`` against ``cumulative_union_comprehensive.csv``."""
    do_test_union('cumulative_union_comprehensive.csv', fastonly)
 45 | 
 46 | 
def test_cumulative_union_explicit_explicit(fastonly):
    """Runs ``do_test_union()`` against ``cumulative_union_explicit_explicit.csv``."""
    do_test_union('cumulative_union_explicit_explicit.csv', fastonly)
 49 | 
 50 | 
def test_cumulative_union_explicit_promotion(fastonly):
    """Runs ``do_test_union()`` against ``cumulative_union_explicit_promotion.csv``."""
    do_test_union('cumulative_union_explicit_promotion.csv', fastonly)
 53 | 
 54 | 
def test_cumulative_union_probabilistic_probabilistic(fastonly):
    """Runs ``do_test_union()`` against ``cumulative_union_probabilistic_probabilistic.csv``."""
    do_test_union('cumulative_union_probabilistic_probabilistic.csv', fastonly)
 57 | 
 58 | 
def test_cumulative_union_sparse_promotion(fastonly):
    """Runs ``do_test_union()`` against ``cumulative_union_sparse_promotion.csv``."""
    do_test_union('cumulative_union_sparse_promotion.csv', fastonly)
 61 | 
 62 | 
def test_cumulative_union_sparse_sparse(fastonly):
    """Runs ``do_test_union()`` against ``cumulative_union_sparse_sparse.csv``."""
    do_test_union('cumulative_union_sparse_sparse.csv', fastonly)
 65 | 
 66 | 
 67 | def test_cumulative_union_sparse_full_representation():
 68 |     # I'm not exactly sure how this test is suppossed to work - it's different
 69 |     # from the other union tests. For now I will just construct the HLLs in the
 70 |     # same way as Java's sparseFullRepresentationTest() and compare the output.
 71 | 
 72 |     # The file is generated from IntegrationTestGenerator.java.
 73 |     filename = 'cumulative_union_sparse_full_representation.csv'
 74 |     with open('tests/data/%s' % filename, mode='r') as csv_file:
 75 |         csv_reader = csv.DictReader(csv_file)
 76 |         rows = [row for row in csv_reader]
 77 |     print('')
 78 |     print('test_integration: %s: %s rows:' % (filename, len(rows)))
 79 | 
 80 |     empty_hll_1 = new_hll(HLLType.EMPTY)
 81 |     empty_hll_2 = new_hll(HLLType.EMPTY)
 82 |     assert_sparse_full_row_equals(empty_hll_1, empty_hll_2, rows[0], filename, 1)
 83 | 
 84 |     full_hll = new_hll(HLLType.FULL)
 85 |     full_hll.add_raw(probabilistic_test_util.construct_hll_value(LOG2M, 0, 1))
 86 |     sparse_hll = new_hll(HLLType.SPARSE)
 87 |     sparse_hll.add_raw(probabilistic_test_util.construct_hll_value(LOG2M, 0, 1))
 88 |     assert_sparse_full_row_equals(full_hll, sparse_hll, rows[1], filename, 2)
 89 | 
 90 |     full_hll_2 = new_hll(HLLType.FULL)
 91 |     full_hll_2.add_raw(probabilistic_test_util.construct_hll_value(LOG2M, 1, 1))
 92 |     sparse_hll.add_raw(probabilistic_test_util.construct_hll_value(LOG2M, 1, 1))
 93 |     assert_sparse_full_row_equals(full_hll_2, sparse_hll, rows[2], filename, 3)
 94 | 
 95 |     full_hll_3 = new_hll(HLLType.FULL)
 96 |     for i in range(2, SPARSE_THRESHOLD + 1):
 97 |         full_hll_3.add_raw(probabilistic_test_util.construct_hll_value(LOG2M, i, 1))
 98 |         sparse_hll.add_raw(probabilistic_test_util.construct_hll_value(LOG2M, i, 1))
 99 |     assert_sparse_full_row_equals(full_hll_3, sparse_hll, rows[3], filename, 4)
100 | 
101 | 
def assert_sparse_full_row_equals(hll, union_hll, row, filename, line):
    """
    Asserts that the given HLLs match the row in cumulative_union_sparse_full_representation.csv.

    :param hll: HLL compared with the ``cardinality`` and ``HLL`` columns.
    :param union_hll: HLL compared with the ``union_cardinality`` and ``union_HLL`` columns.
    :param row: the CSV row, as a mapping from column name to string.
    :param filename: data file name, used in the assertion message.
    :param line: row number, used in the assertion message.
    """
    location = '%s:%s' % (filename, line)
    comparisons = (
        (hll, 'cardinality', 'HLL'),
        (union_hll, 'union_cardinality', 'union_HLL'),
    )
    for candidate, cardinality_column, hex_column in comparisons:
        assert float_cardinality(candidate) == pytest.approx(float(row[cardinality_column])), location
        assert hll_to_string(candidate) == row[hex_column], location
110 | 
111 | 
def new_hll(type):
    """
    Shortcut for testing constructor, which uses the constants defined at
    the top of the file as default parameters.

    :param type: the ``HLLType`` handed to ``HLL.create_for_testing()``.
           (The name shadows the ``type`` builtin inside this function.)
    :returns: a new ``HLL`` of specified type, which uses the parameters
              ``LOG2M`` ``REGWIDTH``, ``EXPLICIT_THRESHOLD`` and ``SPARSE_THRESHOLD`` specified above.
    """
    return HLL.create_for_testing(LOG2M, REGWIDTH, EXPLICIT_THRESHOLD, SPARSE_THRESHOLD, type)
121 | 
122 | 
def do_test_add(filename, fastonly):
    """
    Tests an "add"-style test file.

    The first row only provides the starting HLL (``multiset`` column). For
    every later row the ``raw_value`` is added to the HLL built from the
    previous row and the result is compared with the row's ``cardinality``
    and ``multiset`` columns.

    :param filename: name of the CSV file inside ``tests/data``.
    :param fastonly: when truthy only the first 500 rows are used.
    """
    with open('tests/data/%s' % filename, mode='r') as csv_file:
        rows = list(csv.DictReader(csv_file))
    if fastonly:
        rows = rows[:500]
    print('')
    print('test_integration: %s: %s rows: (each . = 100 rows)' % (filename, len(rows)))

    for line, row in enumerate(rows, start=1):
        if line == 1:
            hll = string_to_hll(row['multiset'])
            continue

        location = '%s:%s' % (filename, line)
        hll.add_raw(int(row['raw_value']))
        assert float_cardinality(hll) == pytest.approx(float(row['cardinality'])), location
        assert hll_to_string(hll) == row['multiset'], location

        # continue from the HLL recorded in the file
        hll = string_to_hll(row['multiset'])
        if (line + 1) % 100 == 0:
            sys.stdout.write('.')
            sys.stdout.flush()
148 | 
149 | 
def do_test_union(filename, fastonly):
    """
    Tests an "union"-style test file.

    The first row only provides the starting HLL (``union_multiset`` column).
    For every later row the HLL in the ``multiset`` column is checked against
    ``cardinality``, unioned into the running HLL, and the result is compared
    with the ``union_cardinality`` and ``union_multiset`` columns.

    :param filename: name of the CSV file inside ``tests/data``.
    :param fastonly: when truthy only the first 500 rows are used.
    """
    with open('tests/data/%s' % filename, mode='r') as csv_file:
        rows = list(csv.DictReader(csv_file))
    if fastonly:
        rows = rows[:500]
    print('')
    print('test_integration: %s: %s rows: (each . = 100 rows)' % (filename, len(rows)))

    for line, row in enumerate(rows, start=1):
        if line == 1:
            hll = string_to_hll(row['union_multiset'])
            continue

        other_hll = string_to_hll(row['multiset'])
        assert float_cardinality(other_hll) == pytest.approx(float(row['cardinality'])), '%s:%s:multiset' % (filename, line)

        location = '%s:%s' % (filename, line)
        hll.union(other_hll)
        assert float_cardinality(hll) == pytest.approx(float(row['union_cardinality'])), location
        assert hll_to_string(hll) == row['union_multiset'], location

        # continue from the HLL recorded in the file
        hll = string_to_hll(row['union_multiset'])
        if (line + 1) % 100 == 0:
            sys.stdout.write('.')
            sys.stdout.flush()
177 | 
178 | 
def float_cardinality(hll):
    """
    Returns the algorithm-specific cardinality of the specified ``HLL``, in a
    form appropriate for comparison with the algorithm-specific cardinality
    provided by the PostgreSQL implementation.

    :param HLL hll: The HLL whose algorithm-specific cardinality is wanted.
           This cannot be ``None``.
    :returns: ``0`` for ``HLLType.EMPTY``, ``hll.cardinality()`` for
              ``HLLType.EXPLICIT`` (promotion has not yet occurred), and the
              result of the sparse or full probabilistic algorithm method
              for ``HLLType.SPARSE`` and ``HLLType.FULL``.
    :raises Exception: if the HLL reports any other type.
    """
    hll_type = hll.get_type()
    if hll_type == HLLType.EMPTY:
        return 0
    if hll_type == HLLType.EXPLICIT:
        return hll.cardinality()
    if hll_type == HLLType.SPARSE:
        return hll._sparse_probabilistic_algorithm_cardinality()
    if hll_type == HLLType.FULL:
        return hll._full_probabilistic_algorithm_cardinality()
    raise Exception('Unknown HLL type ' + str(hll_type))
200 | 
201 | 
def string_to_hll(s):
    """
    Converts a string (with \\x) to an HLL.

    The first two characters of ``s`` are dropped and the remainder is
    decoded with ``NumberUtil.from_hex()`` before deserializing.
    """
    hex_digits = s[2:]
    raw = NumberUtil.from_hex(hex_digits, 0, len(hex_digits))
    return HLL.from_bytes(raw)
208 | 
209 | 
def hll_to_string(hll):
    """
    Converts an HLL to a string (with \\x)

    The HLL is serialized with ``to_bytes()`` and hex-encoded with
    ``NumberUtil.to_hex()``.
    """
    serialized = hll.to_bytes()
    hex_digits = NumberUtil.to_hex(serialized, 0, len(serialized))
    return '\\x' + hex_digits
216 | 


--------------------------------------------------------------------------------
/tests/test_sparse_hll.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python
  2 | # -*- coding: utf-8 -*-
  3 | from __future__ import division
  4 | from math import ceil, log
  5 | import random
  6 | from python_hll.hlltype import HLLType
  7 | from python_hll.hll import HLL
  8 | from python_hll.hllutil import HLLUtil
  9 | from python_hll.serialization import SerializationUtil
 10 | from python_hll.util import BitUtil
 11 | import probabilistic_test_util
 12 | 
 13 | """Tests ``HLL`` of type ``HLLType.SPARSE``."""
 14 | 
# Module-level log2m, read by tests that do not define their own local
# ``log2m`` (e.g. ``test_add()``).
log2m = 11
 16 | 
 17 | 
 18 | def test_add():
 19 |     """
 20 |     Tests ``HLL.add_raw()``.
 21 |     """
 22 |     # ------------------------------------------------------------
 23 |     # insert an element with register value 1 (minimum set value)
 24 |     register_index = 0
 25 |     register_value = 1
 26 |     raw_value = probabilistic_test_util.construct_hll_value(log2m, register_index, register_value)
 27 | 
 28 |     hll = HLL.create_for_testing(log2m, 5, 128, 256, HLLType.SPARSE)
 29 |     hll.add_raw(raw_value)
 30 | 
 31 |     assert_one_register_set(hll, register_index, BitUtil.to_signed_byte(register_value))
 32 | 
 33 |     # ------------------------------------------------------------
 34 |     # insert an element with register value 31 (maximum set value)
 35 |     register_index = 0
 36 |     register_value = 31
 37 |     raw_value = probabilistic_test_util.construct_hll_value(log2m, register_index, register_value)
 38 | 
 39 |     hll = HLL.create_for_testing(log2m, 5, 128, 256, HLLType.SPARSE)
 40 |     hll.add_raw(raw_value)
 41 | 
 42 |     assert_one_register_set(hll, register_index, BitUtil.to_signed_byte(register_value))
 43 | 
 44 |     # ------------------------------------------------------------
 45 |     # insert an element that could overflow the register (past 31)
 46 |     register_index = 0
 47 |     register_value = 36
 48 |     raw_value = probabilistic_test_util.construct_hll_value(log2m, register_index, register_value)
 49 | 
 50 |     hll = HLL.create_for_testing(log2m, 5, 128, 256, HLLType.SPARSE)
 51 |     hll.add_raw(raw_value)
 52 | 
 53 |     assert_one_register_set(hll, register_index, BitUtil.to_signed_byte(31))  # register max
 54 | 
 55 |     # ------------------------------------------------------------
 56 |     # insert duplicate elements, observe no change
 57 |     register_index = 0
 58 |     register_value = 1
 59 |     raw_value = probabilistic_test_util.construct_hll_value(log2m, register_index, register_value)
 60 | 
 61 |     hll = HLL.create_for_testing(log2m, 5, 128, 256, HLLType.SPARSE)
 62 |     hll.add_raw(raw_value)
 63 |     hll.add_raw(raw_value)
 64 | 
 65 |     assert_one_register_set(hll, register_index, BitUtil.to_signed_byte(register_value))  # register max
 66 | 
 67 |     # ------------------------------------------------------------
 68 |     # insert elements that increase a register's value
 69 |     register_index = 0
 70 |     register_value = 1
 71 |     raw_value = probabilistic_test_util.construct_hll_value(log2m, register_index, register_value)
 72 | 
 73 |     hll = HLL.create_for_testing(log2m, 5, 128, 256, HLLType.SPARSE)
 74 |     hll.add_raw(raw_value)
 75 | 
 76 |     register_value_2 = 2
 77 |     raw_value_2 = probabilistic_test_util.construct_hll_value(log2m, register_index, register_value_2)
 78 |     hll.add_raw(raw_value_2)
 79 | 
 80 |     assert_one_register_set(hll, register_index, BitUtil.to_signed_byte(register_value_2))
 81 | 
 82 |     # ------------------------------------------------------------
 83 |     # insert elements that have lower register values, observe no change
 84 |     register_index = 0
 85 |     register_value = 2
 86 |     raw_value = probabilistic_test_util.construct_hll_value(log2m, register_index, register_value)
 87 | 
 88 |     hll = HLL.create_for_testing(log2m, 5, 128, 256, HLLType.SPARSE)
 89 |     hll.add_raw(raw_value)
 90 | 
 91 |     register_value_2 = 1
 92 |     raw_value_2 = probabilistic_test_util.construct_hll_value(log2m, register_index, register_value_2)
 93 |     hll.add_raw(raw_value_2)
 94 | 
 95 |     assert_one_register_set(hll, register_index, BitUtil.to_signed_byte(register_value))
 96 | 
 97 | 
 98 | def test_small_range_smoke():
 99 |     """
100 |     Smoke test for ``HLL.cardinality()`` and the proper use of the small
101 |     range correction.
102 |     """
103 |     log2m = 11
104 |     m = BitUtil.left_shift_int(1, log2m)
105 |     regwidth = 5
106 | 
107 |     # ------------------------------------------------------------
108 |     # only one register set
109 |     hll = HLL.create_for_testing(log2m, regwidth, 128, 256, HLLType.SPARSE)
110 |     hll.add_raw(probabilistic_test_util.construct_hll_value(log2m, 0, 1))
111 | 
112 |     cardinality = hll.cardinality()
113 | 
114 |     # Trivially true that small correction conditions hold: one register
115 |     # set implies zeroes exist, and estimator trivially smaller than 5m/2.
116 |     # Small range correction: m * log(m/V)
117 |     expected = ceil(m * log(m / (m - 1)))  # # of zeroes
118 |     assert cardinality == expected
119 | 
120 |     # ------------------------------------------------------------
121 |     # all but one register set
122 |     hll = HLL.create_for_testing(log2m, regwidth, 128, 256, HLLType.SPARSE)
123 |     for i in range(0, m - 1):
124 |         hll.add_raw(probabilistic_test_util.construct_hll_value(log2m, i, 1))
125 | 
126 |     # Trivially true that small correction conditions hold: all but
127 |     # one register set implies a zero exists, and estimator trivially
128 |     # smaller than 5m/2 since it's alpha / ((m-1)/2)
129 |     cardinality = hll.cardinality()
130 | 
131 |     # Small range correction: m * log(m/V)
132 |     expected = ceil(m * log(m / 1))  # # of zeroes
133 |     assert cardinality == expected
134 | 
135 | 
def test_normal_range_smoke():
    """
    Smoke test for ``HLL.cardinality()`` on an HLL created as
    ``HLLType.SPARSE`` whose estimate falls between the two corrections, so
    the uncorrected estimator is expected.
    """
    log2m = 11
    regwidth = 5
    m = BitUtil.left_shift_int(1, log2m)
    # With regwidth = 5 the hash space is log2m + (2^5 - 1 - 1) bits wide.
    hash_bits = log2m + 30
    # 'medium' value, chosen so that neither correction kicks in
    register_value = 7

    # m is passed in the sparse threshold position
    hll = HLL.create_for_testing(log2m, regwidth, 128, m, HLLType.SPARSE)
    for register_index in range(m):
        hll.add_raw(probabilistic_test_util.construct_hll_value(log2m, register_index, register_value))
    actual = hll.cardinality()

    # Simplified estimator when all registers take the same value: alpha / (m/2^val)
    raw_estimate = HLLUtil.alpha_m_squared(m) / (m / (2 ** register_value))

    # Preconditions of the uncorrected range.
    large_range_cutoff = (2 ** hash_bits) / 30
    small_range_limit = 5 * m / 2
    assert raw_estimate <= large_range_cutoff
    assert raw_estimate > small_range_limit

    assert actual == ceil(raw_estimate)
166 | 
167 | 
def test_large_range_smoke():
    """
    Smoke test checking that ``HLL.cardinality()`` applies the large range
    correction when every register holds a large value.
    """
    log2m = 11
    regwidth = 5
    register_count = BitUtil.left_shift_int(1, log2m)
    # regwidth = 5, so the hash space is log2m + (2^5 - 1 - 1) bits wide,
    # i.e. log2m + 30.
    hash_bits = log2m + 30
    hash_space = 2 ** hash_bits

    # chosen to ensure the large correction kicks in
    large_value = 31

    # Set every register to the same large value.
    hll = HLL.create_for_testing(log2m, regwidth, 128, register_count, HLLType.SPARSE)
    for register_index in range(register_count):
        raw_value = probabilistic_test_util.construct_hll_value(log2m, register_index, large_value)
        hll.add_raw(raw_value)

    actual = hll.cardinality()

    # When all registers share one value the estimator simplifies to
    # alpha / (m / 2^value).
    inverse_power_sum = register_count / (2 ** large_value)
    estimator = HLLUtil.alpha_m_squared(register_count) / inverse_power_sum

    # Precondition for the large range.
    assert estimator > hash_space / 30

    # Large range correction: -2^L * log(1 - E / 2^L).
    try:
        expected = ceil(-1.0 * hash_space * log(1.0 - estimator / hash_space))
    except ValueError:
        # log() rejects a non-positive argument (estimator >= 2^L); the
        # expected cardinality is taken to be 0 in that case.
        expected = 0

    assert actual == expected
201 | 
202 | 
def test_union():
    """
    Tests ``HLL.union()``.

    Covers unions of two sparse HLLs that are empty, disjoint, exactly
    overlapping and partially overlapping, plus a union that holds more
    registers than the sparse threshold and is expected to end up FULL.
    """
    log2m = 11  # arbitrary
    sparse_threshold = 256  # arbitrary

    # ------------------------------------------------------------
    # two empty multisets should union to an empty set
    hll_a = HLL.create_for_testing(log2m, 5, 128, sparse_threshold, HLLType.SPARSE)
    hll_b = HLL.create_for_testing(log2m, 5, 128, sparse_threshold, HLLType.SPARSE)

    hll_a.union(hll_b)

    assert hll_a.get_type() == HLLType.SPARSE
    assert hll_a.cardinality() == 0

    # ------------------------------------------------------------
    # two disjoint multisets should union properly
    hll_a = HLL.create_for_testing(log2m, 5, 128, sparse_threshold, HLLType.SPARSE)
    hll_a.add_raw(probabilistic_test_util.construct_hll_value(log2m, 1, 1))
    hll_b = HLL.create_for_testing(log2m, 5, 128, sparse_threshold, HLLType.SPARSE)
    # The second value goes into hll_b so that register 2 can only reach
    # hll_a through the union itself.
    hll_b.add_raw(probabilistic_test_util.construct_hll_value(log2m, 2, 1))

    hll_a.union(hll_b)

    assert hll_a.get_type() == HLLType.SPARSE  # unchanged
    assert hll_a.cardinality() == 3  # precomputed
    assert_register_present(hll_a, 1, BitUtil.to_signed_byte(1))
    assert_register_present(hll_a, 2, BitUtil.to_signed_byte(1))

    # ------------------------------------------------------------
    # two exactly overlapping multisets should union properly
    hll_a = HLL.create_for_testing(log2m, 5, 128, sparse_threshold, HLLType.SPARSE)
    hll_a.add_raw(probabilistic_test_util.construct_hll_value(log2m, 1, 10))
    hll_b = HLL.create_for_testing(log2m, 5, 128, sparse_threshold, HLLType.SPARSE)
    # Same register index in hll_b with a larger value: the union has to
    # pick the maximum of the two.
    hll_b.add_raw(probabilistic_test_util.construct_hll_value(log2m, 1, 13))

    hll_a.union(hll_b)

    assert hll_a.get_type() == HLLType.SPARSE  # unchanged
    assert hll_a.cardinality() == 2  # precomputed
    assert_one_register_set(hll_a, 1, BitUtil.to_signed_byte(13))  # max(10,13)

    # ------------------------------------------------------------
    # overlapping multisets should union properly
    hll_a = HLL.create_for_testing(log2m, 5, 128, sparse_threshold, HLLType.SPARSE)
    hll_b = HLL.create_for_testing(log2m, 5, 128, sparse_threshold, HLLType.SPARSE)
    # register index = 3
    raw_value_a = probabilistic_test_util.construct_hll_value(log2m, 3, 11)

    # register index = 4
    raw_value_b = probabilistic_test_util.construct_hll_value(log2m, 4, 13)
    raw_value_b_prime = probabilistic_test_util.construct_hll_value(log2m, 4, 21)

    # register index = 5
    raw_value_c = probabilistic_test_util.construct_hll_value(log2m, 5, 14)

    hll_a.add_raw(raw_value_a)
    hll_a.add_raw(raw_value_b)

    hll_b.add_raw(raw_value_b_prime)
    hll_b.add_raw(raw_value_c)

    hll_a.union(hll_b)
    # union should have three registers set, with partition B set to the
    # max of the two registers
    assert_register_present(hll_a, 3, BitUtil.to_signed_byte(11))
    assert_register_present(hll_a, 4, BitUtil.to_signed_byte(21))  # max(21,13)
    assert_register_present(hll_a, 5, BitUtil.to_signed_byte(14))

    # ------------------------------------------------------------
    # too-large unions should promote
    hll_a = HLL.create_for_testing(log2m, 5, 128, sparse_threshold, HLLType.SPARSE)
    hll_b = HLL.create_for_testing(log2m, 5, 128, sparse_threshold, HLLType.SPARSE)

    # fill up both sets to sparse_threshold registers each
    for i in range(0, sparse_threshold):
        hll_a.add_raw(probabilistic_test_util.construct_hll_value(log2m, i, 1))
        hll_b.add_raw(probabilistic_test_util.construct_hll_value(log2m, i + sparse_threshold, 1))  # non-overlapping

    hll_a.union(hll_b)

    assert hll_a.get_type() == HLLType.FULL
287 | 
288 | 
def test_clear():
    """
    Tests ``HLL.clear()``.

    Adds one raw value to a sparse HLL, clears it and checks that the
    reported cardinality is 0.
    """
    # Defined locally, like in the sibling tests, so this test does not rely
    # on a module-level ``log2m`` being present.
    log2m = 11  # arbitrary

    hll = HLL.create_for_testing(log2m, 5, 128, 256, HLLType.SPARSE)
    hll.add_raw(1)
    hll.clear()
    assert hll.cardinality() == 0
297 | 
298 | 
def test_to_from_bytes():
    """
    Tests ``HLL.to_bytes()`` and ``HLL.from_bytes()``.

    Each case serializes a sparse HLL, checks the length of the output,
    deserializes it again and compares the registers with the source HLL.
    """
    log2m = 11  # arbitrary
    regwidth = 5  # arbitrary
    sparse_threshold = 256  # arbitrary
    short_word_length = 16  # log2m + regwidth = 11 + 5

    schema_version = SerializationUtil.DEFAULT_SCHEMA_VERSION
    hll_type = HLLType.SPARSE
    padding = schema_version.padding_bytes(hll_type)

    def new_sparse_hll():
        # Fresh, empty sparse HLL with the parameters used by every case.
        return HLL.create_for_testing(log2m, regwidth, 128, sparse_threshold, HLLType.SPARSE)

    def check_round_trip(source_hll, expected_length):
        # Serialize, verify the size, then verify the registers survive
        # deserialization.
        serialized = source_hll.to_bytes(schema_version)
        assert len(serialized) == expected_length
        restored_hll = HLL.from_bytes(serialized)
        assert_elements_equal(source_hll, restored_hll)

    # ------------------------------------------------------------
    # Should work on an empty element:
    # output should just be padding since no registers are used
    empty_hll = new_sparse_hll()
    check_round_trip(empty_hll, padding)

    # ------------------------------------------------------------
    # Should work on a partially filled element
    partial_register_count = 3
    partial_hll = new_sparse_hll()
    for register_index in range(partial_register_count):
        partial_hll.add_raw(probabilistic_test_util.construct_hll_value(log2m, register_index, (register_index + 9)))

    check_round_trip(
        partial_hll,
        padding + probabilistic_test_util.get_required_bytes(short_word_length, partial_register_count),
    )

    # ------------------------------------------------------------
    # Should work on a full set
    full_hll = new_sparse_hll()
    for register_index in range(sparse_threshold):
        full_hll.add_raw(probabilistic_test_util.construct_hll_value(log2m, register_index, (register_index % 9) + 1))

    # one 'short word' of log2m + regwidth = 16 bits per register
    check_round_trip(
        full_hll,
        padding + probabilistic_test_util.get_required_bytes(short_word_length, sparse_threshold),
    )
359 | 
360 | 
def test_random_values():
    """
    Adds seeded pseudo-random raw values to sparse HLLs and checks that each
    touched register holds the largest register value seen for its index.
    """
    log2m = 11  # arbitrary
    regwidth = 5  # arbitrary
    sparse_threshold = 256  # arbitrary
    max_java_long = 9223372036854775807

    # Fixed seed so every execution draws the same sequence of values.
    random.seed(1)

    for _run in range(100):
        hll = HLL.create_for_testing(log2m, regwidth, 128, sparse_threshold, HLLType.SPARSE)

        # register index -> largest register value added so far
        expected_registers = {}

        for _draw in range(sparse_threshold):
            raw_value = random.randint(1, max_java_long)

            index = probabilistic_test_util.get_register_index(raw_value, log2m)
            value = probabilistic_test_util.get_register_value(raw_value, log2m)
            if value > expected_registers.get(index, 0):
                expected_registers[index] = value

            hll.add_raw(raw_value)

        for index, expected_value in expected_registers.items():
            assert_register_present(hll, index, expected_value)
388 | 
389 | # ------------------------------------------------------------
390 | # assertion helpers
391 | 
392 | 
def assert_register_present(hll, register_index, register_value):
    """
    Asserts that the sparse storage of ``hll`` holds ``register_value`` at
    ``register_index``; a register missing from the storage reads as 0.
    """
    actual_value = hll._sparse_probabilistic_storage.get(register_index, 0)
    assert actual_value == register_value
399 | 
400 | 
def assert_one_register_set(hll, register_index, register_value):
    """
    Asserts that the sparse storage of ``hll`` contains exactly one register
    and that the register at ``register_index`` has ``register_value``.
    """
    storage = hll._sparse_probabilistic_storage
    stored_count = len(storage)
    assert stored_count == 1
    stored_value = storage.get(register_index, 0)
    assert stored_value == register_value
408 | 
409 | 
def assert_elements_equal(hll_a, hll_b):
    """
    Asserts that the sparse storages of ``hll_a`` and ``hll_b`` have the same
    number of registers and that every register index stored in ``hll_a``
    maps to the same value in ``hll_b``.
    """
    storage_a = hll_a._sparse_probabilistic_storage
    storage_b = hll_b._sparse_probabilistic_storage
    assert len(storage_a) == len(storage_b)
    mismatched = [index for index in storage_a.keys() if storage_a.get(index) != storage_b.get(index)]
    assert not mismatched
416 | 


--------------------------------------------------------------------------------
/tox.ini:
--------------------------------------------------------------------------------
 1 | [tox]
 2 | envlist = py27, py34, py35, py36, flake8
 3 | 
 4 | [testenv:flake8]
 5 | basepython = python
 6 | deps = flake8
 7 | commands = flake8 python_hll
 8 | 
 9 | [testenv]
10 | setenv =
11 |     PYTHONPATH = {toxinidir}
12 | deps =
13 |     -r{toxinidir}/requirements_dev.txt
14 | ; If you want to make tox run the tests with the same versions, create a
15 | ; requirements.txt with the pinned versions and uncomment the following line:
16 | ;     -r{toxinidir}/requirements.txt
17 | commands =
18 |     pip install -U pip
19 |     py.test --capture=no --fast-only --basetemp={envtmpdir}
20 | 
21 | 
22 | 


--------------------------------------------------------------------------------