├── .github ├── ISSUE_TEMPLATE.md ├── PULL_REQUEST_TEMPLATE.md └── workflows │ ├── build_deploy.yml │ ├── changelog.yml │ └── test.yml ├── .gitignore ├── CONTRIBUTING.md ├── LICENSE ├── LICENSE-3rdparty.csv ├── NOTICE ├── README.md ├── ddsketch ├── __init__.py ├── _version.py ├── ddsketch.py ├── mapping.py ├── pb │ ├── __init__.py │ ├── ddsketch.proto │ ├── ddsketch_pb2.py │ ├── ddsketch_pre319_pb2.py │ └── proto.py ├── py.typed └── store.py ├── docker-compose.yml ├── mypy.ini ├── pyproject.toml ├── releasenotes ├── config.yaml └── notes │ ├── ddsketch-api-a84ffc0875bbacd6.yaml │ ├── extend-range-06474632c8235187.yaml │ ├── oldpy-db6189c9b26e10f7.yaml │ ├── pbopt-ec6525c1948d782f.yaml │ ├── proto4-e8646610178bef59.yaml │ ├── protobuf-min-f6af9a2d5d96f53c.yaml │ ├── py2-c963608396db7258.yaml │ ├── py310-ac5baa9b0b69008a.yaml │ ├── remove-custom-exceptions-e2bc67a72250269d.yaml │ ├── remove-numpy-25fedcd9be9d6d80.yaml │ ├── tests-wheel-bf71b228c86a9ced.yaml │ ├── toplevelapi-6c04f2ca35a49d4b.yaml │ ├── typing-25579ab88323a332.yaml │ └── version-b2a276df190a703a.yaml ├── riotfile.py ├── scripts └── check-releasenotes ├── setup.cfg ├── setup.py └── tests ├── __init__.py ├── datasets.py ├── test_ddsketch.py ├── test_mapping.py ├── test_proto.py └── test_store.py /.github/ISSUE_TEMPLATE.md: -------------------------------------------------------------------------------- 1 | **Describe what happened:** 2 | 3 | 4 | **Describe what you expected:** 5 | 6 | 7 | **Steps to reproduce the issue:** 8 | -------------------------------------------------------------------------------- /.github/PULL_REQUEST_TEMPLATE.md: -------------------------------------------------------------------------------- 1 | ### What does this PR do? 2 | 3 | A brief description of the change being made with this pull request. 4 | 5 | ### Motivation 6 | 7 | What inspired you to submit this pull request? 8 | 9 | ### Additional Notes 10 | 11 | Anything else we should know when reviewing? 12 | -------------------------------------------------------------------------------- /.github/workflows/build_deploy.yml: -------------------------------------------------------------------------------- 1 | name: Build 2 | 3 | on: 4 | pull_request: 5 | release: 6 | types: 7 | - published 8 | 9 | jobs: 10 | build_wheel: 11 | name: Build wheels 12 | runs-on: ubuntu-22.04 13 | 14 | steps: 15 | - uses: actions/checkout@v2 16 | # Include all history and tags 17 | with: 18 | fetch-depth: 0 19 | 20 | - uses: actions/setup-python@v2 21 | name: Install Python 22 | with: 23 | python-version: '3.9' 24 | 25 | - name: Build wheels 26 | run: | 27 | pip install wheel 28 | pip wheel --no-deps -w dist . 
29 | 30 | - uses: actions/upload-artifact@v2 31 | with: 32 | path: dist/*.whl 33 | 34 | build_sdist: 35 | name: Build source distribution 36 | runs-on: ubuntu-latest 37 | steps: 38 | - uses: actions/checkout@v2 39 | # Include all history and tags 40 | with: 41 | fetch-depth: 0 42 | 43 | - uses: actions/setup-python@v2 44 | name: Install Python 45 | with: 46 | python-version: '3.9' 47 | 48 | - name: Build sdist 49 | run: | 50 | python setup.py sdist 51 | 52 | - uses: actions/upload-artifact@v2 53 | with: 54 | path: dist/*.tar.gz 55 | 56 | upload_pypi: 57 | needs: [build_wheel, build_sdist] 58 | runs-on: ubuntu-latest 59 | if: github.event_name == 'release' && github.event.action == 'published' 60 | steps: 61 | - uses: actions/download-artifact@v2 62 | with: 63 | name: artifact 64 | path: dist 65 | 66 | - uses: pypa/gh-action-pypi-publish@master 67 | with: 68 | user: __token__ 69 | password: ${{ secrets.PYPI_TOKEN }} 70 | # To test: repository_url: https://test.pypi.org/legacy/ 71 | -------------------------------------------------------------------------------- /.github/workflows/changelog.yml: -------------------------------------------------------------------------------- 1 | name: Changelog 2 | on: 3 | push: 4 | branches: 5 | - master 6 | pull_request: 7 | # Important that we run on `labeled` and `unlabeled` to pick up `changelog/no-changelog` being added/removed 8 | # DEV: [opened, reopened, synchronize] is the default 9 | types: [opened, reopened, synchronize, labeled, unlabeled, ready_for_review] 10 | jobs: 11 | validate: 12 | name: Validate changelog 13 | runs-on: ubuntu-latest 14 | steps: 15 | - uses: actions/checkout@v2 16 | # Include all history and tags 17 | with: 18 | fetch-depth: 0 19 | 20 | # Ensure a new reno release note was added in this PR. 21 | # Use `reno new <slug>` to add a new note to `releasenotes/notes`, 22 | # or add `changelog/no-changelog` label if no release note is needed.
23 | - name: Ensure release note added 24 | # Only run this on pull requests 25 | if: github.event_name == 'pull_request' 26 | run: scripts/check-releasenotes 27 | 28 | - uses: actions/setup-python@v2 29 | name: Install Python 30 | with: 31 | python-version: '3.9' 32 | 33 | - name: Install Dependencies 34 | run: pip install reno docutils 35 | 36 | - name: Lint changelog notes 37 | run: reno lint 38 | 39 | - name: Generate changelog 40 | run: | 41 | reno report | tee CHANGELOG.rst 42 | rst2html.py CHANGELOG.rst CHANGELOG.html 43 | 44 | - name: Upload CHANGELOG.rst 45 | uses: actions/upload-artifact@v2 46 | with: 47 | name: changelog 48 | path: | 49 | CHANGELOG.rst 50 | CHANGELOG.html 51 | -------------------------------------------------------------------------------- /.github/workflows/test.yml: -------------------------------------------------------------------------------- 1 | name: Test 2 | on: 3 | push: 4 | branches: 5 | - master 6 | pull_request: 7 | jobs: 8 | check: 9 | runs-on: ubuntu-latest 10 | steps: 11 | - uses: actions/setup-python@v5 12 | with: 13 | python-version: '3.12' 14 | - uses: actions/checkout@v4 15 | with: 16 | fetch-depth: 0 17 | - run: pip install riot==0.19.0 18 | - run: riot -v run check_fmt 19 | - run: riot -v run -s mypy 20 | - run: riot -v run -s flake8 21 | 22 | test: 23 | strategy: 24 | matrix: 25 | os: [ubuntu-latest, macos-latest] 26 | python-version: ["3.7", "3.8", "3.9", "3.10", "3.11", "3.12"] 27 | runs-on: ${{ matrix.os }} 28 | steps: 29 | - uses: actions/checkout@v4 30 | - name: Setup Python 31 | uses: actions/setup-python@v5 32 | with: 33 | python-version: ${{ matrix.python-version }} 34 | - name: install riot 35 | # Note that pip3 has to be used since the system pip when running 36 | # under the 2.7 instance will be Python 2 pip. 37 | # (riot is not Python 2 compatible) 38 | run: pip3 install riot==0.19.0 39 | - run: | 40 | riot run -p ${{ matrix.python-version}} test 41 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | *.pyc 2 | *~ 3 | .riot/ 4 | 5 | # Generated version module 6 | ddsketch/__version.py 7 | 8 | # Ignore files generated during `python setup.py install` 9 | build/ 10 | dist/ 11 | *.egg-info/ 12 | -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | # Contributing to sketches-py 2 | 3 | First of all, thanks for contributing! 4 | 5 | * If you think you've found an issue, please open a Github issue. 6 | * To propose improvements, feel free to submit a PR. 7 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Copyright 2020 DataDog, Inc. 2 | 3 | Licensed under the Apache License, Version 2.0 (the "License"); 4 | you may not use this file except in compliance with the License. 5 | You may obtain a copy of the License at 6 | 7 | http://www.apache.org/licenses/LICENSE-2.0 8 | 9 | Unless required by applicable law or agreed to in writing, software 10 | distributed under the License is distributed on an "AS IS" BASIS, 11 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | See the License for the specific language governing permissions and 13 | limitations under the License. 
14 | -------------------------------------------------------------------------------- /LICENSE-3rdparty.csv: -------------------------------------------------------------------------------- 1 | Component,Origin,License,Copyright 2 | import,numpy,BSD-3-Clause,Copyright (c) 2005-2020 NumPy Developers.; All rights reserved. 3 | import,setuptools,MIT,Copyright (c) 2016 Jason R Coombs 4 | -------------------------------------------------------------------------------- /NOTICE: -------------------------------------------------------------------------------- 1 | Datadog sketches-py 2 | Copyright 2020 Datadog, Inc. 3 | 4 | This product includes software developed at Datadog (https://www.datadoghq.com/). 5 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # ddsketch 2 | 3 | This repo contains the Python implementation of the distributed quantile sketch 4 | algorithm DDSketch [1]. DDSketch has relative-error guarantees for any quantile 5 | q in [0, 1]. That is, if the true value of the q-th quantile is `x`, then DDSketch 6 | returns a value `y` such that `|x - y| / x < e`, where `e` is the relative-error 7 | parameter. (The default here is set to 0.01.) DDSketch is also fully mergeable, 8 | meaning that multiple sketches from distributed systems can be combined in a 9 | central node. 10 | 11 | Our default implementation, `DDSketch`, is guaranteed [1] to not grow too large 12 | in size for any data that can be described by a distribution whose tails are 13 | sub-exponential. 14 | 15 | We also provide implementations (`LogCollapsingLowestDenseDDSketch` and 16 | `LogCollapsingHighestDenseDDSketch`) where the q-quantile will be accurate up to 17 | the specified relative error for values of q that are not too small (or too large). Concretely, 18 | the q-quantile will be accurate up to the specified relative error as long as it 19 | belongs to one of the `m` bins kept by the sketch. If the data is time in 20 | seconds, the default of `m = 2048` covers 80 microseconds to 1 year. 21 | 22 | ## Installation 23 | 24 | To install this package, run `pip install ddsketch`, or clone the repo and run 25 | `python setup.py install`. This package depends on `numpy` and `protobuf`. (The 26 | protobuf dependency can be removed if it's not applicable.) 27 | 28 | ## Usage 29 | ```python 30 | from ddsketch import DDSketch 31 | 32 | sketch = DDSketch() 33 | ``` 34 | Add values to the sketch: 35 | ```python 36 | import numpy as np 37 | 38 | values = np.random.normal(size=500) 39 | for v in values: 40 | sketch.add(v) 41 | ``` 42 | Find the quantiles of `values` to within the relative error: 43 | ```python 44 | quantiles = [sketch.get_quantile_value(q) for q in [0.5, 0.75, 0.9, 1]] 45 | ``` 46 | Merge another `DDSketch` into `sketch`: 47 | ```python 48 | another_sketch = DDSketch() 49 | other_values = np.random.normal(size=500) 50 | for v in other_values: 51 | another_sketch.add(v) 52 | sketch.merge(another_sketch) 53 | ``` 54 | The quantiles of `values` concatenated with `other_values` are still accurate to within the relative error. 55 | 56 | ## Development 57 | 58 | To work on ddsketch, a Python interpreter must be installed. It is recommended to use the provided development 59 | container (requires [docker](https://www.docker.com/)) which includes all the required Python interpreters.
60 | 61 | docker-compose run dev 62 | 63 | Or, if developing outside of Docker, it is recommended to use a virtual environment: 64 | 65 | pip install virtualenv 66 | virtualenv --python=3 .venv 67 | source .venv/bin/activate 68 | 69 | 70 | ### Testing 71 | 72 | To run the tests, install `riot`: 73 | 74 | pip install riot 75 | 76 | Replace the Python version below with the interpreter(s) available. 77 | 78 | # Run tests with Python 3.9 79 | riot run -p3.9 test 80 | 81 | ### Release notes 82 | 83 | New features, bug fixes, deprecations, and other breaking changes must have 84 | release notes included. 85 | 86 | To generate a release note for the change: 87 | 88 | riot run reno new <slug> 89 | 90 | Edit the generated file to include notes on the changes made in the commit/PR 91 | and commit it. 92 | 93 | 94 | ### Formatting 95 | 96 | Format code with: 97 | 98 | riot run fmt 99 | 100 | 101 | ### Type-checking 102 | 103 | Type checking is done with [mypy](http://mypy-lang.org/): 104 | 105 | riot run mypy 106 | 107 | 108 | ### Linting 109 | 110 | Lint the code with [flake8](https://flake8.pycqa.org/en/latest/): 111 | 112 | riot run flake8 113 | 114 | 115 | ### Protobuf 116 | 117 | The protobuf definition is stored in the Go repository: https://github.com/DataDog/sketches-go/blob/master/ddsketch/pb/ddsketch.proto 118 | 119 | Install the minimum required protoc and generate the Python code: 120 | 121 | ```sh 122 | docker run -v $PWD:/code -it ubuntu:18.04 /bin/bash 123 | apt update && apt install protobuf-compiler # default is 3.0.0 124 | protoc --proto_path=ddsketch/pb/ --python_out=ddsketch/pb/ ddsketch/pb/ddsketch.proto 125 | ``` 126 | 127 | 128 | ### Releasing 129 | 130 | 1. Generate the release notes and use [`pandoc`](https://pandoc.org/) to format 131 | them for GitHub: 132 | ```bash 133 | git checkout master && git pull 134 | riot run -s reno report --no-show-source | pandoc -f rst -t gfm --wrap=none 135 | ``` 136 | Copy the output into a new release: https://github.com/DataDog/sketches-py/releases/new. 137 | 138 | 2. Enter a tag for the release (following [`semver`](https://semver.org)) (e.g. `v1.1.3`, `v1.0.3`, `v1.2.0`). 139 | 3. Use the tag without the `v` as the title. 140 | 4. Save the release as a draft and pass the link to someone else to give a quick review. 141 | 5. If all looks good, hit publish. 142 | 143 | 144 | ## References 145 | [1] Charles Masson, Jee E. Rim, and Homin K. Lee. DDSketch: A fast and fully-mergeable quantile sketch with relative-error guarantees. PVLDB, 12(12): 2195-2205, 2019.
(The code referenced in the paper, including our implementation of the Greenwald-Khanna (GK) algorithm, can be found at: https://github.com/DataDog/sketches-py/releases/tag/v0.1 ) 146 | -------------------------------------------------------------------------------- /ddsketch/__init__.py: -------------------------------------------------------------------------------- 1 | from ._version import get_version 2 | from .ddsketch import DDSketch 3 | from .ddsketch import LogCollapsingHighestDenseDDSketch 4 | from .ddsketch import LogCollapsingLowestDenseDDSketch 5 | from .mapping import CubicallyInterpolatedMapping 6 | from .mapping import LinearlyInterpolatedMapping 7 | from .mapping import LogarithmicMapping 8 | from .store import CollapsingHighestDenseStore 9 | from .store import CollapsingLowestDenseStore 10 | 11 | 12 | __version__ = get_version() 13 | 14 | 15 | __all__ = [ 16 | "DDSketch", 17 | "LogCollapsingLowestDenseDDSketch", 18 | "LogCollapsingHighestDenseDDSketch", 19 | "CubicallyInterpolatedMapping", 20 | "LinearlyInterpolatedMapping", 21 | "LogarithmicMapping", 22 | "CollapsingHighestDenseStore", 23 | "CollapsingLowestDenseStore", 24 | ] 25 | -------------------------------------------------------------------------------- /ddsketch/_version.py: -------------------------------------------------------------------------------- 1 | def get_version(): 2 | # type: () -> str 3 | """Return the package version. 4 | 5 | The write_to functionality of setuptools_scm is used (see setup.py) 6 | to output the version to ddsketch/__version.py, which we attempt to import. 7 | 8 | This is done to avoid the expensive overhead of importing pkg_resources. 9 | """ 10 | try: 11 | from .__version import version 12 | 13 | return version 14 | except ImportError: 15 | import pkg_resources 16 | 17 | return pkg_resources.get_distribution(__name__).version 18 | -------------------------------------------------------------------------------- /ddsketch/ddsketch.py: -------------------------------------------------------------------------------- 1 | # Unless explicitly stated otherwise all files in this repository are licensed 2 | # under the Apache License 2.0. 3 | # This product includes software developed at Datadog (https://www.datadoghq.com/). 4 | # Copyright 2020 Datadog, Inc. 5 | 6 | """A quantile sketch with relative-error guarantees. This sketch computes 7 | quantile values with an approximation error that is relative to the actual 8 | quantile value. It works on both negative and non-negative input values. 9 | 10 | For instance, using DDSketch with a relative accuracy guarantee set to 1%, if 11 | the expected quantile value is 100, the computed quantile value is guaranteed to 12 | be between 99 and 101. If the expected quantile value is 1000, the computed 13 | quantile value is guaranteed to be between 990 and 1010. 14 | 15 | DDSketch works by mapping floating-point input values to bins and counting the 16 | number of values for each bin. The underlying structure that keeps track of bin 17 | counts is the store. 18 | 19 | The memory size of the sketch depends on the range that is covered by the input 20 | values: the larger that range, the more bins are needed to keep track of the 21 | input values. As a rough estimate, if working on durations with a relative 22 | accuracy of 2%, about 2kB (275 bins) are needed to cover values between 1 23 | millisecond and 1 minute, and about 6kB (802 bins) to cover values between 1 24 | nanosecond and 1 day.
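As a back-of-the-envelope check of these estimates (an illustrative calculation): consecutive bin boundaries grow by a factor of gamma = (1 + alpha) / (1 - alpha), so covering a range [a, b] takes roughly ln(b / a) / ln(gamma) bins. With alpha = 0.02, gamma ≈ 1.0408 and ln(gamma) ≈ 0.040, so 1 millisecond to 1 minute needs about ln(60 / 0.001) / 0.040 ≈ 11.0 / 0.040 ≈ 275 bins, and 1 nanosecond to 1 day needs about ln(86400 / 1e-9) / 0.040 ≈ 32.1 / 0.040 ≈ 802 bins, matching the figures above.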
25 | 26 | The size of the sketch can have a fail-safe upper bound by using collapsing 27 | stores. As shown in 28 | the DDSketch paper, 29 | the likelihood of a store collapsing when using the default bound is vanishingly 30 | small for most data. 31 | 32 | DDSketch implementations are also available in: 33 | Go 34 | Python 35 | JavaScript 36 | """ 37 | import typing 38 | 39 | from .mapping import LogarithmicMapping 40 | from .store import CollapsingHighestDenseStore 41 | from .store import CollapsingLowestDenseStore 42 | from .store import DenseStore 43 | 44 | 45 | if typing.TYPE_CHECKING: 46 | from typing import Optional  # noqa: F401 47 | 48 | from .mapping import KeyMapping  # noqa: F401 49 | from .store import Store  # noqa: F401 50 | 51 | 52 | DEFAULT_REL_ACC = 0.01  # "alpha" in the paper 53 | DEFAULT_BIN_LIMIT = 2048 54 | 55 | 56 | class BaseDDSketch(object): 57 | """The base implementation of DDSketch with neither mapping nor storage specified. 58 | 59 | Args: 60 | mapping (mapping.KeyMapping): map between values and store bins 61 | store (store.Store): storage for positive values 62 | negative_store (store.Store): storage for negative values 63 | zero_count (float): the count of zero values 64 | 65 | Attributes: 66 | relative_accuracy (float): the accuracy guarantee; referred to as alpha 67 | in the paper. (0. < alpha < 1.) 68 | 69 | count: the number of values seen by the sketch 70 | min: the minimum value seen by the sketch 71 | max: the maximum value seen by the sketch 72 | sum: the sum of the values seen by the sketch 73 | """ 74 | 75 | def __init__( 76 | self, 77 | mapping, 78 | store, 79 | negative_store, 80 | zero_count, 81 | ): 82 | # type: (KeyMapping, Store, Store, float) -> None 83 | self._mapping = mapping 84 | self._store = store 85 | self._negative_store = negative_store 86 | self._zero_count = zero_count 87 | 88 | self._relative_accuracy = mapping.relative_accuracy 89 | self._count = self._negative_store.count + self._zero_count + self._store.count 90 | self._min = float("+inf") 91 | self._max = float("-inf") 92 | self._sum = 0.0 93 | 94 | def __repr__(self): 95 | # type: () -> str 96 | return ( 97 | "store: {}, negative_store: {}, " 98 | "zero_count: {}, count: {}, " 99 | "sum: {}, min: {}, max: {}" 100 | ).format( 101 | self._store, 102 | self._negative_store, 103 | self._zero_count, 104 | self._count, 105 | self._sum, 106 | self._min, 107 | self._max, 108 | ) 109 | 110 | @property 111 | def count(self): 112 | return self._count 113 | 114 | @property 115 | def name(self): 116 | # type: () -> str 117 | """str: name of the sketch""" 118 | return "DDSketch" 119 | 120 | @property 121 | def num_values(self): 122 | # type: () -> float 123 | """Return the number of values in the sketch.""" 124 | return self._count 125 | 126 | @property 127 | def avg(self): 128 | # type: () -> float 129 | """Return the exact average of the values added to the sketch.""" 130 | return self._sum / self._count 131 | 132 | @property 133 | def sum(self):  # noqa: A003 134 | # type: () -> float 135 | """Return the exact sum of the values added to the sketch.""" 136 | return self._sum 137 | 138 | def add(self, val, weight=1.0): 139 | # type: (float, float) -> None 140 | """Add a value to the sketch.""" 141 | if weight <= 0.0: 142 | raise ValueError("weight must be a positive float, got %r" % weight) 143 | 144 | if val > self._mapping.min_possible: 145 | self._store.add(self._mapping.key(val), weight) 146 | elif val < -self._mapping.min_possible: 147 | self._negative_store.add(self._mapping.key(-val),
weight) 148 | else: 149 | self._zero_count += weight 150 | 151 | # Keep track of summary stats 152 | self._count += weight 153 | self._sum += val * weight 154 | if val < self._min: 155 | self._min = val 156 | if val > self._max: 157 | self._max = val 158 | 159 | def get_quantile_value(self, quantile): 160 | # type: (float) -> Optional[float] 161 | """Return the approximate value at the specified quantile. 162 | 163 | Args: 164 | quantile (float): 0 <= q <=1 165 | 166 | Returns: 167 | the value at the specified quantile or None if the sketch is empty 168 | """ 169 | if quantile < 0 or quantile > 1 or self._count == 0: 170 | return None 171 | 172 | rank = quantile * (self._count - 1) 173 | if rank < self._negative_store.count: 174 | reversed_rank = self._negative_store.count - rank - 1 175 | key = self._negative_store.key_at_rank(reversed_rank, lower=False) 176 | quantile_value = -self._mapping.value(key) 177 | elif rank < self._zero_count + self._negative_store.count: 178 | return 0 179 | else: 180 | key = self._store.key_at_rank( 181 | rank - self._zero_count - self._negative_store.count 182 | ) 183 | quantile_value = self._mapping.value(key) 184 | return quantile_value 185 | 186 | def merge(self, sketch): 187 | # type: (BaseDDSketch) -> None 188 | """Merge the given sketch into this one. After this operation, this sketch 189 | encodes the values that were added to both this and the input sketch. 190 | """ 191 | if not self._mergeable(sketch): 192 | raise ValueError( 193 | "Cannot merge two DDSketches with different parameters, got %r and %r" 194 | % (self._mapping.gamma, sketch._mapping.gamma) 195 | ) 196 | 197 | if sketch.count == 0: 198 | return 199 | 200 | if self._count == 0: 201 | self._copy(sketch) 202 | return 203 | 204 | # Merge the stores 205 | self._store.merge(sketch._store) 206 | self._negative_store.merge(sketch._negative_store) 207 | self._zero_count += sketch._zero_count 208 | 209 | # Merge summary stats 210 | self._count += sketch._count 211 | self._sum += sketch._sum 212 | if sketch._min < self._min: 213 | self._min = sketch._min 214 | if sketch._max > self._max: 215 | self._max = sketch._max 216 | 217 | def _mergeable(self, other): 218 | # type: (BaseDDSketch) -> bool 219 | """Two sketches can be merged only if their gammas are equal.""" 220 | return self._mapping.gamma == other._mapping.gamma 221 | 222 | def _copy(self, sketch): 223 | # type: (BaseDDSketch) -> None 224 | """Copy the input sketch into this one""" 225 | self._store.copy(sketch._store) 226 | self._negative_store.copy(sketch._negative_store) 227 | self._zero_count = sketch._zero_count 228 | self._min = sketch._min 229 | self._max = sketch._max 230 | self._count = sketch._count 231 | self._sum = sketch._sum 232 | 233 | 234 | class DDSketch(BaseDDSketch): 235 | """The default implementation of BaseDDSketch, with optimized memory usage at 236 | the cost of lower ingestion speed, using an unlimited number of bins. The 237 | number of bins will not exceed a reasonable number unless the data is 238 | distributed with tails heavier than any subexponential. 239 | (cf. 
http://www.vldb.org/pvldb/vol12/p2195-masson.pdf) 240 | """ 241 | 242 | def __init__(self, relative_accuracy=None): 243 | # type: (Optional[float]) -> None 244 | # Make sure the parameters are valid 245 | if relative_accuracy is None: 246 | relative_accuracy = DEFAULT_REL_ACC 247 | 248 | mapping = LogarithmicMapping(relative_accuracy) 249 | store = DenseStore() 250 | negative_store = DenseStore() 251 | super(DDSketch, self).__init__( 252 | mapping=mapping, 253 | store=store, 254 | negative_store=negative_store, 255 | zero_count=0.0, 256 | ) 257 | 258 | 259 | class LogCollapsingLowestDenseDDSketch(BaseDDSketch): 260 | """Implementation of BaseDDSketch with optimized memory usage at the cost of 261 | lower ingestion speed, using a limited number of bins. When the maximum 262 | number of bins is reached, bins with lowest indices are collapsed, which 263 | causes the relative accuracy to be lost on the lowest quantiles. For the 264 | default bin limit, collapsing is unlikely to occur unless the data is 265 | distributed with tails heavier than any subexponential. 266 | (cf. http://www.vldb.org/pvldb/vol12/p2195-masson.pdf) 267 | """ 268 | 269 | def __init__(self, relative_accuracy=None, bin_limit=None): 270 | # type: (Optional[float], Optional[int]) -> None 271 | # Make sure the parameters are valid 272 | if relative_accuracy is None: 273 | relative_accuracy = DEFAULT_REL_ACC 274 | 275 | if bin_limit is None or bin_limit < 0: 276 | bin_limit = DEFAULT_BIN_LIMIT 277 | 278 | mapping = LogarithmicMapping(relative_accuracy) 279 | store = CollapsingLowestDenseStore(bin_limit) 280 | negative_store = CollapsingLowestDenseStore(bin_limit) 281 | super(LogCollapsingLowestDenseDDSketch, self).__init__( 282 | mapping=mapping, 283 | store=store, 284 | negative_store=negative_store, 285 | zero_count=0.0, 286 | ) 287 | 288 | 289 | class LogCollapsingHighestDenseDDSketch(BaseDDSketch): 290 | """Implementation of BaseDDSketch with optimized memory usage at the cost of 291 | lower ingestion speed, using a limited number of bins. When the maximum 292 | number of bins is reached, bins with highest indices are collapsed, which 293 | causes the relative accuracy to be lost on the highest quantiles. For the 294 | default bin limit, collapsing is unlikely to occur unless the data is 295 | distributed with tails heavier than any subexponential. 296 | (cf. http://www.vldb.org/pvldb/vol12/p2195-masson.pdf) 297 | """ 298 | 299 | def __init__(self, relative_accuracy=None, bin_limit=None): 300 | # type: (Optional[float], Optional[int]) -> None 301 | # Make sure the parameters are valid 302 | if relative_accuracy is None: 303 | relative_accuracy = DEFAULT_REL_ACC 304 | 305 | if bin_limit is None or bin_limit < 0: 306 | bin_limit = DEFAULT_BIN_LIMIT 307 | 308 | mapping = LogarithmicMapping(relative_accuracy) 309 | store = CollapsingHighestDenseStore(bin_limit) 310 | negative_store = CollapsingHighestDenseStore(bin_limit) 311 | super(LogCollapsingHighestDenseDDSketch, self).__init__( 312 | mapping=mapping, 313 | store=store, 314 | negative_store=negative_store, 315 | zero_count=0.0, 316 | ) 317 | -------------------------------------------------------------------------------- /ddsketch/mapping.py: -------------------------------------------------------------------------------- 1 | from __future__ import division 2 | 3 | 4 | # Unless explicitly stated otherwise all files in this repository are licensed 5 | # under the Apache License 2.0. 6 | # This product includes software developed at Datadog (https://www.datadoghq.com/). 
7 | # Copyright 2020 Datadog, Inc. 8 | 9 | """A mapping between values and integer indices that imposes relative accuracy 10 | guarantees. Specifically, for any value `min_possible < value < 11 | max_possible`, implementations of `KeyMapping` must be such that 12 | `value(key(v))` is close to `v` with a relative error that is less than 13 | `relative_accuracy`. 14 | 15 | In implementations of KeyMapping, there is generally a trade-off between the 16 | cost of computing the key and the number of keys that are required to cover a 17 | given range of values (memory optimality). The most memory-optimal mapping is 18 | the LogarithmicMapping, but it requires the costly evaluation of the logarithm 19 | when computing the index. Other mappings can approximate the logarithmic 20 | mapping, while being less computationally costly. 21 | """ 22 | from abc import ABCMeta 23 | from abc import abstractmethod 24 | import math 25 | import sys 26 | 27 | import six 28 | 29 | 30 | class KeyMapping(six.with_metaclass(ABCMeta)): 31 | """ 32 | Args: 33 | relative_accuracy (float): the accuracy guarantee; referred to as alpha 34 | in the paper. (0. < alpha < 1.) 35 | offset (float): an offset that can be used to shift all bin keys 36 | Attributes: 37 | gamma (float): the base for the exponential buckets. gamma = (1 + alpha) / (1 - alpha) 38 | min_possible: the smallest value the sketch can distinguish from 0 39 | max_possible: the largest value the sketch can handle 40 | _multiplier (float): used for calculating log_gamma(value); initially, _multiplier = 1 / log(gamma) 41 | """ 42 | 43 | def __init__(self, relative_accuracy, offset=0.0): 44 | # type: (float, float) -> None 45 | if relative_accuracy <= 0 or relative_accuracy >= 1: 46 | raise ValueError( 47 | "Relative accuracy must be between 0 and 1, got %r" % relative_accuracy 48 | ) 49 | self.relative_accuracy = relative_accuracy 50 | self._offset = offset 51 | 52 | gamma_mantissa = 2 * relative_accuracy / (1 - relative_accuracy) 53 | self.gamma = 1 + gamma_mantissa 54 | self._multiplier = 1 / math.log1p(gamma_mantissa) 55 | self.min_possible = sys.float_info.min * self.gamma 56 | self.max_possible = sys.float_info.max / self.gamma 57 | 58 | @classmethod 59 | def from_gamma_offset(cls, gamma, offset): 60 | # type: (float, float) -> KeyMapping 61 | """Constructor used by pb.proto""" 62 | relative_accuracy = (gamma - 1.0) / (gamma + 1.0) 63 | return cls(relative_accuracy, offset=offset) 64 | 65 | @abstractmethod 66 | def _log_gamma(self, value): 67 | # type: (float) -> float 68 | """Return (an approximation of) the logarithm of the value base gamma""" 69 | 70 | @abstractmethod 71 | def _pow_gamma(self, value): 72 | # type: (float) -> float 73 | """Return (an approximation of) gamma to the power value""" 74 | 75 | def key(self, value): 76 | # type: (float) -> int 77 | """ 78 | Args: 79 | value (float) 80 | Returns: 81 | int: the key specifying the bucket for value 82 | """ 83 | return int(math.ceil(self._log_gamma(value)) + self._offset) 84 | 85 | def value(self, key): 86 | # type: (int) -> float 87 | """ 88 | Args: 89 | key (int) 90 | Returns: 91 | float: the value represented by the bucket specified by the key 92 | """ 93 | return self._pow_gamma(key - self._offset) * (2.0 / (1 + self.gamma)) 94 | 95 | 96 | class LogarithmicMapping(KeyMapping): 97 | """A memory-optimal KeyMapping, i.e., given a targeted relative accuracy, it 98 | requires the least number of keys to cover a given range of values. This is
This is 99 | done by logarithmically mapping floating-point values to integers. 100 | """ 101 | 102 | def __init__(self, relative_accuracy, offset=0.0): 103 | # type: (float, float) -> None 104 | super(LogarithmicMapping, self).__init__(relative_accuracy, offset=offset) 105 | self._multiplier *= math.log(2) 106 | 107 | def _log_gamma(self, value): 108 | # type: (float) -> float 109 | return math.log(value, 2) * self._multiplier 110 | 111 | def _pow_gamma(self, value): 112 | # type: (float) -> float 113 | return math.pow(2.0, value / self._multiplier) 114 | 115 | 116 | def _cbrt(x): 117 | # type: (float) -> float 118 | y = float(abs(x) ** (1.0 / 3.0)) 119 | if x < 0: 120 | return -y 121 | return y 122 | 123 | 124 | class LinearlyInterpolatedMapping(KeyMapping): 125 | """A fast KeyMapping that approximates the memory-optimal 126 | LogarithmicMapping by extracting the floor value of the logarithm to the 127 | base 2 from the binary representations of floating-point values and 128 | linearly interpolating the logarithm in-between. 129 | """ 130 | 131 | def _log2_approx(self, value): 132 | # type: (float) -> float 133 | """Approximates log2 by s + f 134 | where v = (s+1) * 2 ** f for s in [0, 1) 135 | 136 | frexp(v) returns m and e s.t. 137 | v = m * 2 ** e ; (m in [0.5, 1) or 0.0) 138 | so we adjust m and e accordingly 139 | """ 140 | mantissa, exponent = math.frexp(value) 141 | significand = 2 * mantissa - 1 142 | return significand + (exponent - 1) 143 | 144 | def _exp2_approx(self, value): 145 | # type: (float) -> float 146 | """Inverse of _log2_approx""" 147 | exponent = int(math.floor(value) + 1) 148 | mantissa = (value - exponent + 2) / 2.0 149 | return math.ldexp(mantissa, exponent) 150 | 151 | def _log_gamma(self, value): 152 | # type: (float) -> float 153 | return self._log2_approx(value) * self._multiplier 154 | 155 | def _pow_gamma(self, value): 156 | # type: (float) -> float 157 | return self._exp2_approx(value / self._multiplier) 158 | 159 | 160 | class CubicallyInterpolatedMapping(KeyMapping): 161 | """A fast KeyMapping that approximates the memory-optimal LogarithmicMapping by 162 | extracting the floor value of the logarithm to the base 2 from the binary 163 | representations of floating-point values and cubically interpolating the 164 | logarithm in-between. 
165 | 166 | More detailed documentation of this method can be found in: 167 | sketches-java 168 | """ 169 | 170 | A = 6.0 / 35.0 171 | B = -3.0 / 5.0 172 | C = 10.0 / 7.0 173 | 174 | def __init__(self, relative_accuracy, offset=0.0): 175 | # type: (float, float) -> None 176 | super(CubicallyInterpolatedMapping, self).__init__( 177 | relative_accuracy, offset=offset 178 | ) 179 | self._multiplier /= self.C 180 | 181 | def _cubic_log2_approx(self, value): 182 | # type: (float) -> float 183 | """Approximates log2 using a cubic polynomial""" 184 | mantissa, exponent = math.frexp(value) 185 | significand = 2 * mantissa - 1 186 | return ( 187 | (self.A * significand + self.B) * significand + self.C 188 | ) * significand + (exponent - 1) 189 | 190 | def _cubic_exp2_approx(self, value): 191 | # type: (float) -> float 192 | # Derived from Cardano's formula 193 | exponent = int(math.floor(value)) 194 | delta_0 = self.B * self.B - 3 * self.A * self.C 195 | delta_1 = ( 196 | 2.0 * self.B * self.B * self.B 197 | - 9.0 * self.A * self.B * self.C 198 | - 27.0 * self.A * self.A * (value - exponent) 199 | ) 200 | cardano = _cbrt( 201 | (delta_1 - ((delta_1 * delta_1 - 4 * delta_0 * delta_0 * delta_0) ** 0.5)) 202 | / 2.0 203 | ) 204 | significand_plus_one = ( 205 | -(self.B + cardano + delta_0 / cardano) / (3.0 * self.A) + 1.0 206 | ) 207 | mantissa = significand_plus_one / 2 208 | return math.ldexp(mantissa, exponent + 1) 209 | 210 | def _log_gamma(self, value): 211 | # type: (float) -> float 212 | return self._cubic_log2_approx(value) * self._multiplier 213 | 214 | def _pow_gamma(self, value): 215 | # type: (float) -> float 216 | return self._cubic_exp2_approx(value / self._multiplier) 217 | -------------------------------------------------------------------------------- /ddsketch/pb/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DataDog/sketches-py/0d16e695d1f991276863b8ffaaf6c8e9bd9ad9de/ddsketch/pb/__init__.py -------------------------------------------------------------------------------- /ddsketch/pb/ddsketch.proto: -------------------------------------------------------------------------------- 1 | /* Unless explicitly stated otherwise all files in this repository are licensed under the Apache License 2.0. 2 | * This product includes software developed at Datadog (https://www.datadoghq.com/). 3 | * Copyright 2020 Datadog, Inc. 4 | */ 5 | 6 | syntax = "proto3"; 7 | 8 | // A DDSketch is essentially a histogram that partitions the range of positive values into an infinite number of 9 | // indexed bins whose size grows exponentially. It keeps track of the number of values (or possibly floating-point 10 | // weights) added to each bin. Negative values are partitioned like positive values, symmetrically to zero. 11 | // The value zero as well as its close neighborhood that would be mapped to extreme bin indexes is mapped to a specific 12 | // counter. 13 | message DDSketch { 14 | // The mapping between positive values and the bin indexes they belong to. 15 | IndexMapping mapping = 1; 16 | 17 | // The store for keeping track of positive values. 18 | Store positiveValues = 2; 19 | 20 | // The store for keeping track of negative values. A negative value v is mapped using its positive opposite -v. 21 | Store negativeValues = 3; 22 | 23 | // The count for the value zero and its close neighborhood (whose width depends on the mapping). 
24 | double zeroCount = 4; 25 | } 26 | 27 | // How to map positive values to the bins they belong to. 28 | message IndexMapping { 29 | // The gamma parameter of the mapping, such that the bin index that a value v belongs to is roughly equal to 30 | // log(v)/log(gamma). 31 | double gamma = 1; 32 | 33 | // An offset that can be used to shift all bin indexes. 34 | double indexOffset = 2; 35 | 36 | // To speed up the computation of the index a value belongs to, the computation of the log may be approximated using 37 | // the fact that the log to the base 2 of powers of 2 can be computed at a low cost from the binary representation of 38 | // the input value. Other values can be approximated by interpolating between successive powers of 2 (linearly, 39 | // quadratically or cubically). 40 | // NONE means that the log is to be computed exactly (no interpolation). 41 | Interpolation interpolation = 3; 42 | enum Interpolation { 43 | NONE = 0; 44 | LINEAR = 1; 45 | QUADRATIC = 2; 46 | CUBIC = 3; 47 | } 48 | } 49 | 50 | // A Store maps bin indexes to their respective counts. 51 | // Counts can be encoded sparsely using binCounts, but also in a contiguous way using contiguousBinCounts and 52 | // contiguousBinIndexOffset. Given that non-empty bins are in practice usually contiguous or close to one another, the 53 | // latter contiguous encoding method is usually more efficient than the sparse one. 54 | // Both encoding methods can be used conjointly. If a bin appears in both the sparse and the contiguous encodings, its 55 | // count value is the sum of the counts in each encoding. 56 | message Store { 57 | // The bin counts, encoded sparsely. 58 | map<sint32, double> binCounts = 1; 59 | 60 | // The bin counts, encoded contiguously. The values of contiguousBinCounts are the counts for the bins of indexes 61 | // o, o+1, o+2, etc., where o is contiguousBinIndexOffset. 62 | repeated double contiguousBinCounts = 2 [packed = true]; 63 | sint32 contiguousBinIndexOffset = 3; 64 | } 65 | -------------------------------------------------------------------------------- /ddsketch/pb/ddsketch_pb2.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # Generated by the protocol buffer compiler. DO NOT EDIT!
3 | # source: ddsketch.proto 4 | """Generated protocol buffer code.""" 5 | from google.protobuf import descriptor as _descriptor 6 | from google.protobuf import descriptor_pool as _descriptor_pool 7 | from google.protobuf import message as _message 8 | from google.protobuf import reflection as _reflection 9 | from google.protobuf import symbol_database as _symbol_database 10 | # @@protoc_insertion_point(imports) 11 | 12 | _sym_db = _symbol_database.Default() 13 | 14 | 15 | 16 | 17 | DESCRIPTOR = _descriptor_pool.Default().AddSerializedFile(b'\n\x0e\x64\x64sketch.proto\"}\n\x08\x44\x44Sketch\x12\x1e\n\x07mapping\x18\x01 \x01(\x0b\x32\r.IndexMapping\x12\x1e\n\x0epositiveValues\x18\x02 \x01(\x0b\x32\x06.Store\x12\x1e\n\x0enegativeValues\x18\x03 \x01(\x0b\x32\x06.Store\x12\x11\n\tzeroCount\x18\x04 \x01(\x01\"\xa7\x01\n\x0cIndexMapping\x12\r\n\x05gamma\x18\x01 \x01(\x01\x12\x13\n\x0bindexOffset\x18\x02 \x01(\x01\x12\x32\n\rinterpolation\x18\x03 \x01(\x0e\x32\x1b.IndexMapping.Interpolation\"?\n\rInterpolation\x12\x08\n\x04NONE\x10\x00\x12\n\n\x06LINEAR\x10\x01\x12\r\n\tQUADRATIC\x10\x02\x12\t\n\x05\x43UBIC\x10\x03\"\xa6\x01\n\x05Store\x12(\n\tbinCounts\x18\x01 \x03(\x0b\x32\x15.Store.BinCountsEntry\x12\x1f\n\x13\x63ontiguousBinCounts\x18\x02 \x03(\x01\x42\x02\x10\x01\x12 \n\x18\x63ontiguousBinIndexOffset\x18\x03 \x01(\x11\x1a\x30\n\x0e\x42inCountsEntry\x12\x0b\n\x03key\x18\x01 \x01(\x11\x12\r\n\x05value\x18\x02 \x01(\x01:\x02\x38\x01\x62\x06proto3') 18 | 19 | 20 | 21 | _DDSKETCH = DESCRIPTOR.message_types_by_name['DDSketch'] 22 | _INDEXMAPPING = DESCRIPTOR.message_types_by_name['IndexMapping'] 23 | _STORE = DESCRIPTOR.message_types_by_name['Store'] 24 | _STORE_BINCOUNTSENTRY = _STORE.nested_types_by_name['BinCountsEntry'] 25 | _INDEXMAPPING_INTERPOLATION = _INDEXMAPPING.enum_types_by_name['Interpolation'] 26 | DDSketch = _reflection.GeneratedProtocolMessageType('DDSketch', (_message.Message,), { 27 | 'DESCRIPTOR' : _DDSKETCH, 28 | '__module__' : 'ddsketch_pb2' 29 | # @@protoc_insertion_point(class_scope:DDSketch) 30 | }) 31 | _sym_db.RegisterMessage(DDSketch) 32 | 33 | IndexMapping = _reflection.GeneratedProtocolMessageType('IndexMapping', (_message.Message,), { 34 | 'DESCRIPTOR' : _INDEXMAPPING, 35 | '__module__' : 'ddsketch_pb2' 36 | # @@protoc_insertion_point(class_scope:IndexMapping) 37 | }) 38 | _sym_db.RegisterMessage(IndexMapping) 39 | 40 | Store = _reflection.GeneratedProtocolMessageType('Store', (_message.Message,), { 41 | 42 | 'BinCountsEntry' : _reflection.GeneratedProtocolMessageType('BinCountsEntry', (_message.Message,), { 43 | 'DESCRIPTOR' : _STORE_BINCOUNTSENTRY, 44 | '__module__' : 'ddsketch_pb2' 45 | # @@protoc_insertion_point(class_scope:Store.BinCountsEntry) 46 | }) 47 | , 48 | 'DESCRIPTOR' : _STORE, 49 | '__module__' : 'ddsketch_pb2' 50 | # @@protoc_insertion_point(class_scope:Store) 51 | }) 52 | _sym_db.RegisterMessage(Store) 53 | _sym_db.RegisterMessage(Store.BinCountsEntry) 54 | 55 | if _descriptor._USE_C_DESCRIPTORS == False: 56 | 57 | DESCRIPTOR._options = None 58 | _STORE_BINCOUNTSENTRY._options = None 59 | _STORE_BINCOUNTSENTRY._serialized_options = b'8\001' 60 | _STORE.fields_by_name['contiguousBinCounts']._options = None 61 | _STORE.fields_by_name['contiguousBinCounts']._serialized_options = b'\020\001' 62 | _DDSKETCH._serialized_start=18 63 | _DDSKETCH._serialized_end=143 64 | _INDEXMAPPING._serialized_start=146 65 | _INDEXMAPPING._serialized_end=313 66 | _INDEXMAPPING_INTERPOLATION._serialized_start=250 67 | _INDEXMAPPING_INTERPOLATION._serialized_end=313 68 | 
_STORE._serialized_start=316 69 | _STORE._serialized_end=482 70 | _STORE_BINCOUNTSENTRY._serialized_start=434 71 | _STORE_BINCOUNTSENTRY._serialized_end=482 72 | # @@protoc_insertion_point(module_scope) 73 | -------------------------------------------------------------------------------- /ddsketch/pb/ddsketch_pre319_pb2.py: -------------------------------------------------------------------------------- 1 | # Generated by the protocol buffer compiler. DO NOT EDIT! 2 | # source: ddsketch.proto 3 | 4 | import sys 5 | _b=sys.version_info[0]<3 and (lambda x:x) or (lambda x:x.encode('latin1')) 6 | from google.protobuf import descriptor as _descriptor 7 | from google.protobuf import message as _message 8 | from google.protobuf import reflection as _reflection 9 | from google.protobuf import symbol_database as _symbol_database 10 | from google.protobuf import descriptor_pb2 11 | # @@protoc_insertion_point(imports) 12 | 13 | _sym_db = _symbol_database.Default() 14 | 15 | 16 | 17 | 18 | DESCRIPTOR = _descriptor.FileDescriptor( 19 | name='ddsketch.proto', 20 | package='', 21 | syntax='proto3', 22 | serialized_pb=_b('\n\x0e\x64\x64sketch.proto\"}\n\x08\x44\x44Sketch\x12\x1e\n\x07mapping\x18\x01 \x01(\x0b\x32\r.IndexMapping\x12\x1e\n\x0epositiveValues\x18\x02 \x01(\x0b\x32\x06.Store\x12\x1e\n\x0enegativeValues\x18\x03 \x01(\x0b\x32\x06.Store\x12\x11\n\tzeroCount\x18\x04 \x01(\x01\"\xa7\x01\n\x0cIndexMapping\x12\r\n\x05gamma\x18\x01 \x01(\x01\x12\x13\n\x0bindexOffset\x18\x02 \x01(\x01\x12\x32\n\rinterpolation\x18\x03 \x01(\x0e\x32\x1b.IndexMapping.Interpolation\"?\n\rInterpolation\x12\x08\n\x04NONE\x10\x00\x12\n\n\x06LINEAR\x10\x01\x12\r\n\tQUADRATIC\x10\x02\x12\t\n\x05\x43UBIC\x10\x03\"\xa6\x01\n\x05Store\x12(\n\tbinCounts\x18\x01 \x03(\x0b\x32\x15.Store.BinCountsEntry\x12\x1f\n\x13\x63ontiguousBinCounts\x18\x02 \x03(\x01\x42\x02\x10\x01\x12 \n\x18\x63ontiguousBinIndexOffset\x18\x03 \x01(\x11\x1a\x30\n\x0e\x42inCountsEntry\x12\x0b\n\x03key\x18\x01 \x01(\x11\x12\r\n\x05value\x18\x02 \x01(\x01:\x02\x38\x01\x62\x06proto3') 23 | ) 24 | _sym_db.RegisterFileDescriptor(DESCRIPTOR) 25 | 26 | 27 | 28 | _INDEXMAPPING_INTERPOLATION = _descriptor.EnumDescriptor( 29 | name='Interpolation', 30 | full_name='IndexMapping.Interpolation', 31 | filename=None, 32 | file=DESCRIPTOR, 33 | values=[ 34 | _descriptor.EnumValueDescriptor( 35 | name='NONE', index=0, number=0, 36 | options=None, 37 | type=None), 38 | _descriptor.EnumValueDescriptor( 39 | name='LINEAR', index=1, number=1, 40 | options=None, 41 | type=None), 42 | _descriptor.EnumValueDescriptor( 43 | name='QUADRATIC', index=2, number=2, 44 | options=None, 45 | type=None), 46 | _descriptor.EnumValueDescriptor( 47 | name='CUBIC', index=3, number=3, 48 | options=None, 49 | type=None), 50 | ], 51 | containing_type=None, 52 | options=None, 53 | serialized_start=250, 54 | serialized_end=313, 55 | ) 56 | _sym_db.RegisterEnumDescriptor(_INDEXMAPPING_INTERPOLATION) 57 | 58 | 59 | _DDSKETCH = _descriptor.Descriptor( 60 | name='DDSketch', 61 | full_name='DDSketch', 62 | filename=None, 63 | file=DESCRIPTOR, 64 | containing_type=None, 65 | fields=[ 66 | _descriptor.FieldDescriptor( 67 | name='mapping', full_name='DDSketch.mapping', index=0, 68 | number=1, type=11, cpp_type=10, label=1, 69 | has_default_value=False, default_value=None, 70 | message_type=None, enum_type=None, containing_type=None, 71 | is_extension=False, extension_scope=None, 72 | options=None), 73 | _descriptor.FieldDescriptor( 74 | name='positiveValues', full_name='DDSketch.positiveValues', index=1, 75 | 
number=2, type=11, cpp_type=10, label=1, 76 | has_default_value=False, default_value=None, 77 | message_type=None, enum_type=None, containing_type=None, 78 | is_extension=False, extension_scope=None, 79 | options=None), 80 | _descriptor.FieldDescriptor( 81 | name='negativeValues', full_name='DDSketch.negativeValues', index=2, 82 | number=3, type=11, cpp_type=10, label=1, 83 | has_default_value=False, default_value=None, 84 | message_type=None, enum_type=None, containing_type=None, 85 | is_extension=False, extension_scope=None, 86 | options=None), 87 | _descriptor.FieldDescriptor( 88 | name='zeroCount', full_name='DDSketch.zeroCount', index=3, 89 | number=4, type=1, cpp_type=5, label=1, 90 | has_default_value=False, default_value=float(0), 91 | message_type=None, enum_type=None, containing_type=None, 92 | is_extension=False, extension_scope=None, 93 | options=None), 94 | ], 95 | extensions=[ 96 | ], 97 | nested_types=[], 98 | enum_types=[ 99 | ], 100 | options=None, 101 | is_extendable=False, 102 | syntax='proto3', 103 | extension_ranges=[], 104 | oneofs=[ 105 | ], 106 | serialized_start=18, 107 | serialized_end=143, 108 | ) 109 | 110 | 111 | _INDEXMAPPING = _descriptor.Descriptor( 112 | name='IndexMapping', 113 | full_name='IndexMapping', 114 | filename=None, 115 | file=DESCRIPTOR, 116 | containing_type=None, 117 | fields=[ 118 | _descriptor.FieldDescriptor( 119 | name='gamma', full_name='IndexMapping.gamma', index=0, 120 | number=1, type=1, cpp_type=5, label=1, 121 | has_default_value=False, default_value=float(0), 122 | message_type=None, enum_type=None, containing_type=None, 123 | is_extension=False, extension_scope=None, 124 | options=None), 125 | _descriptor.FieldDescriptor( 126 | name='indexOffset', full_name='IndexMapping.indexOffset', index=1, 127 | number=2, type=1, cpp_type=5, label=1, 128 | has_default_value=False, default_value=float(0), 129 | message_type=None, enum_type=None, containing_type=None, 130 | is_extension=False, extension_scope=None, 131 | options=None), 132 | _descriptor.FieldDescriptor( 133 | name='interpolation', full_name='IndexMapping.interpolation', index=2, 134 | number=3, type=14, cpp_type=8, label=1, 135 | has_default_value=False, default_value=0, 136 | message_type=None, enum_type=None, containing_type=None, 137 | is_extension=False, extension_scope=None, 138 | options=None), 139 | ], 140 | extensions=[ 141 | ], 142 | nested_types=[], 143 | enum_types=[ 144 | _INDEXMAPPING_INTERPOLATION, 145 | ], 146 | options=None, 147 | is_extendable=False, 148 | syntax='proto3', 149 | extension_ranges=[], 150 | oneofs=[ 151 | ], 152 | serialized_start=146, 153 | serialized_end=313, 154 | ) 155 | 156 | 157 | _STORE_BINCOUNTSENTRY = _descriptor.Descriptor( 158 | name='BinCountsEntry', 159 | full_name='Store.BinCountsEntry', 160 | filename=None, 161 | file=DESCRIPTOR, 162 | containing_type=None, 163 | fields=[ 164 | _descriptor.FieldDescriptor( 165 | name='key', full_name='Store.BinCountsEntry.key', index=0, 166 | number=1, type=17, cpp_type=1, label=1, 167 | has_default_value=False, default_value=0, 168 | message_type=None, enum_type=None, containing_type=None, 169 | is_extension=False, extension_scope=None, 170 | options=None), 171 | _descriptor.FieldDescriptor( 172 | name='value', full_name='Store.BinCountsEntry.value', index=1, 173 | number=2, type=1, cpp_type=5, label=1, 174 | has_default_value=False, default_value=float(0), 175 | message_type=None, enum_type=None, containing_type=None, 176 | is_extension=False, extension_scope=None, 177 | options=None), 178 | ], 179 
| extensions=[ 180 | ], 181 | nested_types=[], 182 | enum_types=[ 183 | ], 184 | options=_descriptor._ParseOptions(descriptor_pb2.MessageOptions(), _b('8\001')), 185 | is_extendable=False, 186 | syntax='proto3', 187 | extension_ranges=[], 188 | oneofs=[ 189 | ], 190 | serialized_start=434, 191 | serialized_end=482, 192 | ) 193 | 194 | _STORE = _descriptor.Descriptor( 195 | name='Store', 196 | full_name='Store', 197 | filename=None, 198 | file=DESCRIPTOR, 199 | containing_type=None, 200 | fields=[ 201 | _descriptor.FieldDescriptor( 202 | name='binCounts', full_name='Store.binCounts', index=0, 203 | number=1, type=11, cpp_type=10, label=3, 204 | has_default_value=False, default_value=[], 205 | message_type=None, enum_type=None, containing_type=None, 206 | is_extension=False, extension_scope=None, 207 | options=None), 208 | _descriptor.FieldDescriptor( 209 | name='contiguousBinCounts', full_name='Store.contiguousBinCounts', index=1, 210 | number=2, type=1, cpp_type=5, label=3, 211 | has_default_value=False, default_value=[], 212 | message_type=None, enum_type=None, containing_type=None, 213 | is_extension=False, extension_scope=None, 214 | options=_descriptor._ParseOptions(descriptor_pb2.FieldOptions(), _b('\020\001'))), 215 | _descriptor.FieldDescriptor( 216 | name='contiguousBinIndexOffset', full_name='Store.contiguousBinIndexOffset', index=2, 217 | number=3, type=17, cpp_type=1, label=1, 218 | has_default_value=False, default_value=0, 219 | message_type=None, enum_type=None, containing_type=None, 220 | is_extension=False, extension_scope=None, 221 | options=None), 222 | ], 223 | extensions=[ 224 | ], 225 | nested_types=[_STORE_BINCOUNTSENTRY, ], 226 | enum_types=[ 227 | ], 228 | options=None, 229 | is_extendable=False, 230 | syntax='proto3', 231 | extension_ranges=[], 232 | oneofs=[ 233 | ], 234 | serialized_start=316, 235 | serialized_end=482, 236 | ) 237 | 238 | _DDSKETCH.fields_by_name['mapping'].message_type = _INDEXMAPPING 239 | _DDSKETCH.fields_by_name['positiveValues'].message_type = _STORE 240 | _DDSKETCH.fields_by_name['negativeValues'].message_type = _STORE 241 | _INDEXMAPPING.fields_by_name['interpolation'].enum_type = _INDEXMAPPING_INTERPOLATION 242 | _INDEXMAPPING_INTERPOLATION.containing_type = _INDEXMAPPING 243 | _STORE_BINCOUNTSENTRY.containing_type = _STORE 244 | _STORE.fields_by_name['binCounts'].message_type = _STORE_BINCOUNTSENTRY 245 | DESCRIPTOR.message_types_by_name['DDSketch'] = _DDSKETCH 246 | DESCRIPTOR.message_types_by_name['IndexMapping'] = _INDEXMAPPING 247 | DESCRIPTOR.message_types_by_name['Store'] = _STORE 248 | 249 | DDSketch = _reflection.GeneratedProtocolMessageType('DDSketch', (_message.Message,), dict( 250 | DESCRIPTOR = _DDSKETCH, 251 | __module__ = 'ddsketch_pb2' 252 | # @@protoc_insertion_point(class_scope:DDSketch) 253 | )) 254 | _sym_db.RegisterMessage(DDSketch) 255 | 256 | IndexMapping = _reflection.GeneratedProtocolMessageType('IndexMapping', (_message.Message,), dict( 257 | DESCRIPTOR = _INDEXMAPPING, 258 | __module__ = 'ddsketch_pb2' 259 | # @@protoc_insertion_point(class_scope:IndexMapping) 260 | )) 261 | _sym_db.RegisterMessage(IndexMapping) 262 | 263 | Store = _reflection.GeneratedProtocolMessageType('Store', (_message.Message,), dict( 264 | 265 | BinCountsEntry = _reflection.GeneratedProtocolMessageType('BinCountsEntry', (_message.Message,), dict( 266 | DESCRIPTOR = _STORE_BINCOUNTSENTRY, 267 | __module__ = 'ddsketch_pb2' 268 | # @@protoc_insertion_point(class_scope:Store.BinCountsEntry) 269 | )) 270 | , 271 | DESCRIPTOR = _STORE, 272 | 
__module__ = 'ddsketch_pb2' 273 | # @@protoc_insertion_point(class_scope:Store) 274 | )) 275 | _sym_db.RegisterMessage(Store) 276 | _sym_db.RegisterMessage(Store.BinCountsEntry) 277 | 278 | 279 | _STORE_BINCOUNTSENTRY.has_options = True 280 | _STORE_BINCOUNTSENTRY._options = _descriptor._ParseOptions(descriptor_pb2.MessageOptions(), _b('8\001')) 281 | _STORE.fields_by_name['contiguousBinCounts'].has_options = True 282 | _STORE.fields_by_name['contiguousBinCounts']._options = _descriptor._ParseOptions(descriptor_pb2.FieldOptions(), _b('\020\001')) 283 | # @@protoc_insertion_point(module_scope) 284 | -------------------------------------------------------------------------------- /ddsketch/pb/proto.py: -------------------------------------------------------------------------------- 1 | from ddsketch.ddsketch import BaseDDSketch 2 | from ..mapping import ( 3 | CubicallyInterpolatedMapping, 4 | LinearlyInterpolatedMapping, 5 | LogarithmicMapping, 6 | ) 7 | from ..store import DenseStore 8 | 9 | import google.protobuf 10 | 11 | 12 | pb_version = tuple(map(int, google.protobuf.__version__.split(".")[0:2])) 13 | 14 | if pb_version >= (3, 19, 0): 15 | import ddsketch.pb.ddsketch_pb2 as pb 16 | else: 17 | import ddsketch.pb.ddsketch_pre319_pb2 as pb 18 | 19 | 20 | class KeyMappingProto: 21 | @classmethod 22 | def _proto_interpolation(cls, mapping): 23 | if type(mapping) is LogarithmicMapping: 24 | return pb.IndexMapping.NONE 25 | if type(mapping) is LinearlyInterpolatedMapping: 26 | return pb.IndexMapping.LINEAR 27 | if type(mapping) is CubicallyInterpolatedMapping: 28 | return pb.IndexMapping.CUBIC 29 | 30 | @classmethod 31 | def to_proto(cls, mapping): 32 | """serialize to protobuf""" 33 | return pb.IndexMapping( 34 | gamma=mapping.gamma, 35 | indexOffset=mapping._offset, 36 | interpolation=cls._proto_interpolation(mapping), 37 | ) 38 | 39 | @classmethod 40 | def from_proto(cls, proto): 41 | """deserialize from protobuf""" 42 | if proto.interpolation == pb.IndexMapping.NONE: 43 | return LogarithmicMapping.from_gamma_offset(proto.gamma, proto.indexOffset) 44 | elif proto.interpolation == pb.IndexMapping.LINEAR: 45 | return LinearlyInterpolatedMapping.from_gamma_offset( 46 | proto.gamma, proto.indexOffset 47 | ) 48 | elif proto.interpolation == pb.IndexMapping.CUBIC: 49 | return CubicallyInterpolatedMapping.from_gamma_offset( 50 | proto.gamma, proto.indexOffset 51 | ) 52 | else: 53 | raise ValueError("Unrecognized interpolation %r" % proto.interpolation) 54 | 55 | 56 | class StoreProto: 57 | """Currently only supports DenseStore""" 58 | 59 | @classmethod 60 | def to_proto(cls, store): 61 | """serialize to protobuf""" 62 | return pb.Store( 63 | contiguousBinCounts=store.bins, contiguousBinIndexOffset=store.offset 64 | ) 65 | 66 | @classmethod 67 | def from_proto(cls, proto): 68 | """deserialize from protobuf""" 69 | store = DenseStore() 70 | index = proto.contiguousBinIndexOffset 71 | store.offset = index 72 | for count in proto.contiguousBinCounts: 73 | store.add(index, count) 74 | index += 1 75 | return store 76 | 77 | 78 | class DDSketchProto: 79 | @classmethod 80 | def to_proto(self, ddsketch): 81 | """serialize to protobuf""" 82 | return pb.DDSketch( 83 | mapping=KeyMappingProto.to_proto(ddsketch._mapping), 84 | positiveValues=StoreProto.to_proto(ddsketch._store), 85 | negativeValues=StoreProto.to_proto(ddsketch._negative_store), 86 | zeroCount=ddsketch._zero_count, 87 | ) 88 | 89 | @classmethod 90 | def from_proto(cls, proto): 91 | """deserialize from protobuf 92 | 93 | N.B., The current 
protobuf loses any min/max/sum/avg information. 94 | """ 95 | mapping = KeyMappingProto.from_proto(proto.mapping) 96 | negative_store = StoreProto.from_proto(proto.negativeValues) 97 | store = StoreProto.from_proto(proto.positiveValues) 98 | zero_count = proto.zeroCount 99 | return BaseDDSketch( 100 | mapping=mapping, 101 | store=store, 102 | negative_store=negative_store, 103 | zero_count=zero_count, 104 | ) 105 | -------------------------------------------------------------------------------- /ddsketch/py.typed: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DataDog/sketches-py/0d16e695d1f991276863b8ffaaf6c8e9bd9ad9de/ddsketch/py.typed -------------------------------------------------------------------------------- /ddsketch/store.py: -------------------------------------------------------------------------------- 1 | from __future__ import division 2 | 3 | 4 | # Unless explicitly stated otherwise all files in this repository are licensed 5 | # under the Apache License 2.0. 6 | # This product includes software developed at Datadog (https://www.datadoghq.com/). 7 | # Copyright 2020 Datadog, Inc. 8 | 9 | """ 10 | Stores map integers to counters. They can be seen as a collection of bins. 11 | We start with 128 bins and grow the store in chunks of 128 unless specified 12 | otherwise. 13 | """ 14 | 15 | import abc 16 | import math 17 | import typing 18 | 19 | 20 | if typing.TYPE_CHECKING: 21 | from typing import List # noqa: F401 22 | from typing import Optional # noqa: F401 23 | 24 | import six 25 | 26 | 27 | CHUNK_SIZE = 128 28 | 29 | 30 | class _NegativeIntInfinity(int): 31 | def __ge__(self, x): 32 | return False 33 | 34 | __gt__ = __ge__ 35 | 36 | def __lt__(self, x): 37 | return True 38 | 39 | __le__ = __lt__ 40 | 41 | 42 | class _PositiveIntInfinity(int): 43 | def __ge__(self, x): 44 | return True 45 | 46 | __gt__ = __ge__ 47 | 48 | def __lt__(self, x): 49 | return False 50 | 51 | __le__ = __lt__ 52 | 53 | 54 | _neg_infinity = _NegativeIntInfinity() 55 | _pos_infinity = _PositiveIntInfinity() 56 | 57 | 58 | class Store(six.with_metaclass(abc.ABCMeta)): 59 | """The basic specification of a store 60 | 61 | Attributes: 62 | count (float): the sum of the counts for the bins 63 | min_key (int): the minimum key bin 64 | max_key (int): the maximum key bin 65 | """ 66 | 67 | def __init__(self): 68 | # type: () -> None 69 | self.count = 0 # type: float 70 | self.min_key = _pos_infinity # type: int 71 | self.max_key = _neg_infinity # type: int 72 | 73 | @abc.abstractmethod 74 | def copy(self, store): 75 | """Copies the input store into this one.""" 76 | 77 | @abc.abstractmethod 78 | def length(self): 79 | # type: () -> int 80 | """Return the number of bins.""" 81 | 82 | @abc.abstractmethod 83 | def add(self, key, weight=1.0): 84 | # type: (int, float) -> None 85 | """Updates the counter at the specified index key, growing the number of bins if 86 | necessary. 87 | """ 88 | 89 | @abc.abstractmethod 90 | def key_at_rank(self, rank, lower=True): 91 | # type: (float, bool) -> int 92 | """Return the key for the value at given rank. 
93 |
94 | E.g., if the non-zero bins are [1, 1] for keys a, b with no offset
95 |
96 | if lower = True:
97 | key_at_rank(x) = a for x in [0, 1)
98 | key_at_rank(x) = b for x in [1, 2)
99 |
100 | if lower = False:
101 | key_at_rank(x) = a for x in (-1, 0]
102 | key_at_rank(x) = b for x in (0, 1]
103 | """
104 |
105 | @abc.abstractmethod
106 | def merge(self, store):
107 | # type: (Store) -> None
108 | """Merge another store into this one. This should be equivalent to running the
109 | add operations that have been run on the other store on this one.
110 | """
111 |
112 |
113 | class DenseStore(Store):
114 | """A dense store that keeps all the bins between the bin for the min_key and the
115 | bin for the max_key.
116 |
117 | Args:
118 | chunk_size (int, optional): the number of bins to grow by
119 |
120 | Attributes:
121 | count (float): the sum of the counts for the bins
122 | min_key (int): the minimum key bin
123 | max_key (int): the maximum key bin
124 | offset (int): the difference between the keys and the index in which they are stored
125 | bins (List[float]): the bins
126 | """
127 |
128 | def __init__(self, chunk_size=CHUNK_SIZE):
129 | # type: (int) -> None
130 | super(DenseStore, self).__init__()
131 |
132 | self.chunk_size = chunk_size # type: int
133 | self.offset = 0 # type: int
134 | self.bins = [] # type: List[float]
135 |
136 | def __repr__(self):
137 | # type: () -> str
138 | repr_str = "{"
139 | for i, sbin in enumerate(self.bins):
140 | repr_str += "%s: %s, " % (i + self.offset, sbin)
141 | repr_str += "}, min_key:%s, max_key:%s, offset:%s" % (
142 | self.min_key,
143 | self.max_key,
144 | self.offset,
145 | )
146 | return repr_str
147 |
148 | def copy(self, store):
149 | # type: (DenseStore) -> None
150 | self.bins = store.bins[:]
151 | self.count = store.count
152 | self.min_key = store.min_key
153 | self.max_key = store.max_key
154 | self.offset = store.offset
155 |
156 | def length(self):
157 | # type: () -> int
158 | """Return the number of bins."""
159 | return len(self.bins)
160 |
161 | def add(self, key, weight=1.0):
162 | # type: (int, float) -> None
163 | idx = self._get_index(key)
164 | self.bins[idx] += weight
165 | self.count += weight
166 |
167 | def _get_index(self, key):
168 | # type: (int) -> int
169 | """Calculate the bin index for the key, extending the range if necessary."""
170 | if key < self.min_key:
171 | self._extend_range(key)
172 | elif key > self.max_key:
173 | self._extend_range(key)
174 |
175 | return key - self.offset
176 |
177 | def _get_new_length(self, new_min_key, new_max_key):
178 | # type: (int, int) -> int
179 | desired_length = new_max_key - new_min_key + 1
180 | return self.chunk_size * int(math.ceil(desired_length / self.chunk_size))
181 |
182 | def _extend_range(self, key, second_key=None):
183 | # type: (int, Optional[int]) -> None
184 | """Grow the bins as necessary and call _adjust"""
185 | if second_key is None:
186 | second_key = key
187 | new_min_key = min(key, second_key, self.min_key)
188 | new_max_key = max(key, second_key, self.max_key)
189 |
190 | if self.length() == 0:
191 | # initialize bins
192 | self.bins = [0.0] * self._get_new_length(new_min_key, new_max_key)
193 | self.offset = new_min_key
194 | self._adjust(new_min_key, new_max_key)
195 |
196 | elif new_min_key >= self.min_key and new_max_key < self.offset + self.length():
197 | # no need to change the range; just update min/max keys
198 | self.min_key = new_min_key
199 | self.max_key = new_max_key
200 |
201 | else:
202 | # grow the bins
203 | new_length =
self._get_new_length(new_min_key, new_max_key)
204 | if new_length > self.length():
205 | self.bins.extend([0.0] * (new_length - self.length()))
206 | self._adjust(new_min_key, new_max_key)
207 |
208 | def _adjust(self, new_min_key, new_max_key):
209 | # type: (int, int) -> None
210 | """Adjust the bins, the offset, the min_key, and max_key, without resizing the
211 | bins, in order to try making it fit the specified range.
212 | """
213 | self._center_bins(new_min_key, new_max_key)
214 | self.min_key = new_min_key
215 | self.max_key = new_max_key
216 |
217 | def _shift_bins(self, shift):
218 | # type: (int) -> None
219 | """Shift the bins; this changes the offset."""
220 | if shift > 0:
221 | self.bins = self.bins[:-shift]
222 | self.bins[:0] = [0.0] * shift
223 | else:
224 | self.bins = self.bins[abs(shift) :]
225 | self.bins.extend([0.0] * abs(shift))
226 | self.offset -= shift
227 |
228 | def _center_bins(self, new_min_key, new_max_key):
229 | # type: (int, int) -> None
230 | """Center the bins; this changes the offset."""
231 | middle_key = new_min_key + (new_max_key - new_min_key + 1) // 2
232 | self._shift_bins(self.offset + self.length() // 2 - middle_key)
233 |
234 | def key_at_rank(self, rank, lower=True):
235 | # type: (float, bool) -> int
236 | running_ct = 0.0
237 | for i, bin_ct in enumerate(self.bins):
238 | running_ct += bin_ct
239 | if (lower and running_ct > rank) or (not lower and running_ct >= rank + 1):
240 | return i + self.offset
241 |
242 | return self.max_key
243 |
244 | def merge(self, store): # type: ignore[override]
245 | # type: (DenseStore) -> None
246 | if store.count == 0:
247 | return
248 |
249 | if self.count == 0:
250 | self.copy(store)
251 | return
252 |
253 | if store.min_key < self.min_key or store.max_key > self.max_key:
254 | self._extend_range(store.min_key, store.max_key)
255 |
256 | for key in range(store.min_key, store.max_key + 1):
257 | self.bins[key - self.offset] += store.bins[key - store.offset]
258 |
259 | self.count += store.count
260 |
261 |
262 | class CollapsingLowestDenseStore(DenseStore):
263 | """A dense store that keeps all the bins between the bin for the min_key and the
264 | bin for the max_key, but collapses the left-most bins if the number of bins
265 | exceeds the bin_limit.
266 |
267 | Args:
268 | bin_limit (int): the maximum number of bins
269 | chunk_size (int, optional): the number of bins to grow by
270 |
271 | Attributes:
272 | count (float): the sum of the counts for the bins
273 | min_key (int): the minimum key bin
274 | max_key (int): the maximum key bin
275 | offset (int): the difference between the keys and the index in which they are stored
276 | bins (List[float]): the bins
277 | """
278 |
279 | def __init__(self, bin_limit, chunk_size=CHUNK_SIZE):
280 | # type: (int, int) -> None
281 | super(CollapsingLowestDenseStore, self).__init__(chunk_size)
282 | self.bin_limit = bin_limit
283 | self.is_collapsed = False
284 |
285 | def copy(self, store): # type: ignore[override]
286 | # type: (CollapsingLowestDenseStore) -> None
287 | self.bin_limit = store.bin_limit
288 | self.is_collapsed = store.is_collapsed
289 | super(CollapsingLowestDenseStore, self).copy(store)
290 |
291 | def _get_new_length(self, new_min_key, new_max_key):
292 | # type: (int, int) -> int
293 | desired_length = new_max_key - new_min_key + 1
294 | return min(
295 | self.chunk_size * int(math.ceil(desired_length / self.chunk_size)),
296 | self.bin_limit,
297 | )
298 |
299 | def _get_index(self, key):
300 | # type: (int) -> int
301 | """Calculate the bin index for the
key, extending the range if necessary."""
302 | if key < self.min_key:
303 | if self.is_collapsed:
304 | return 0
305 |
306 | self._extend_range(key)
307 | if self.is_collapsed:
308 | return 0
309 | elif key > self.max_key:
310 | self._extend_range(key)
311 |
312 | return key - self.offset
313 |
314 | def _adjust(self, new_min_key, new_max_key):
315 | # type: (int, int) -> None
316 | """Override. Adjust the bins, the offset, the min_key, and max_key, without
317 | resizing the bins, in order to try making it fit the specified
318 | range. Collapse to the left if necessary.
319 | """
320 | if new_max_key - new_min_key + 1 > self.length():
321 | # The range of keys is too wide, the lowest bins need to be collapsed.
322 | new_min_key = new_max_key - self.length() + 1
323 |
324 | if new_min_key >= self.max_key:
325 | # put everything in the first bin
326 | self.offset = new_min_key
327 | self.min_key = new_min_key
328 | self.bins[:] = [0.0] * self.length()
329 | self.bins[0] = self.count
330 | else:
331 | shift = self.offset - new_min_key
332 | if shift < 0:
333 | collapse_start_index = self.min_key - self.offset
334 | collapse_end_index = new_min_key - self.offset
335 | collapsed_count = sum(
336 | self.bins[collapse_start_index:collapse_end_index]
337 | )
338 | self.bins[collapse_start_index:collapse_end_index] = [0.0] * (
339 | new_min_key - self.min_key
340 | )
341 | self.bins[collapse_end_index] += collapsed_count
342 | self.min_key = new_min_key
343 | # shift the buckets to make room for new_max_key
344 | self._shift_bins(shift)
345 | else:
346 | self.min_key = new_min_key
347 | # shift the buckets to make room for new_min_key
348 | self._shift_bins(shift)
349 |
350 | self.max_key = new_max_key
351 | self.is_collapsed = True
352 | else:
353 | self._center_bins(new_min_key, new_max_key)
354 | self.min_key = new_min_key
355 | self.max_key = new_max_key
356 |
357 | def merge(self, store): # type: ignore[override]
358 | # type: (CollapsingLowestDenseStore) -> None # type: ignore[override]
359 | """Override."""
360 | if store.count == 0:
361 | return
362 |
363 | if self.count == 0:
364 | self.copy(store)
365 | return
366 |
367 | if store.min_key < self.min_key or store.max_key > self.max_key:
368 | self._extend_range(store.min_key, store.max_key)
369 |
370 | collapse_start_idx = store.min_key - store.offset
371 | collapse_end_idx = min(self.min_key, store.max_key + 1) - store.offset
372 | if collapse_end_idx > collapse_start_idx:
373 | collapse_count = sum(store.bins[collapse_start_idx:collapse_end_idx])
374 | self.bins[0] += collapse_count
375 | else:
376 | collapse_end_idx = collapse_start_idx
377 |
378 | for key in range(collapse_end_idx + store.offset, store.max_key + 1):
379 | self.bins[key - self.offset] += store.bins[key - store.offset]
380 |
381 | self.count += store.count
382 |
383 |
384 | class CollapsingHighestDenseStore(DenseStore):
385 | """A dense store that keeps all the bins between the bin for the min_key and the
386 | bin for the max_key, but collapses the right-most bins if the number of bins
387 | exceeds the bin_limit.
388 |
389 | Args:
390 | bin_limit (int): the maximum number of bins
391 | chunk_size (int, optional): the number of bins to grow by
392 |
393 | Attributes:
394 | count (float): the sum of the counts for the bins
395 | min_key (int): the minimum key bin
396 | max_key (int): the maximum key bin
397 | offset (int): the difference between the keys and the index in which they are stored
398 | bins (List[float]): the bins
399 | """
400 |
401 | def __init__(self,
bin_limit, chunk_size=CHUNK_SIZE):
402 | super(CollapsingHighestDenseStore, self).__init__(chunk_size)
403 | self.bin_limit = bin_limit
404 | self.is_collapsed = False
405 |
406 | def copy(self, store): # type: ignore[override]
407 | # type: (CollapsingHighestDenseStore) -> None
408 | self.bin_limit = store.bin_limit
409 | self.is_collapsed = store.is_collapsed
410 | super(CollapsingHighestDenseStore, self).copy(store)
411 |
412 | def _get_new_length(self, new_min_key, new_max_key):
413 | # type: (int, int) -> int
414 | desired_length = new_max_key - new_min_key + 1
415 | # For some reason mypy can't infer that min(int, int) is an int, so cast it.
416 | return int(
417 | min(
418 | self.chunk_size * int(math.ceil(desired_length / self.chunk_size)),
419 | self.bin_limit,
420 | )
421 | )
422 |
423 | def _get_index(self, key):
424 | # type: (int) -> int
425 | """Calculate the bin index for the key, extending the range if necessary."""
426 | if key > self.max_key:
427 | if self.is_collapsed:
428 | return self.length() - 1
429 |
430 | self._extend_range(key)
431 | if self.is_collapsed:
432 | return self.length() - 1
433 | elif key < self.min_key:
434 | self._extend_range(key)
435 | return key - self.offset
436 |
437 | def _adjust(self, new_min_key, new_max_key):
438 | # type: (int, int) -> None
439 | """Override. Adjust the bins, the offset, the min_key, and max_key, without
440 | resizing the bins, in order to try making it fit the specified
441 | range. Collapse to the right if necessary.
442 | """
443 | if new_max_key - new_min_key + 1 > self.length():
444 | # The range of keys is too wide, the highest bins need to be collapsed.
445 | new_max_key = new_min_key + self.length() - 1
446 |
447 | if new_max_key <= self.min_key:
448 | # put everything in the last bin
449 | self.offset = new_min_key
450 | self.max_key = new_max_key
451 | self.bins[:] = [0.0] * self.length()
452 | self.bins[-1] = self.count
453 | else:
454 | shift = self.offset - new_min_key
455 | if shift > 0:
456 | collapse_start_index = new_max_key - self.offset + 1
457 | collapse_end_index = self.max_key - self.offset + 1
458 | collapsed_count = sum(
459 | self.bins[collapse_start_index:collapse_end_index]
460 | )
461 | self.bins[collapse_start_index:collapse_end_index] = [0.0] * (
462 | self.max_key - new_max_key
463 | )
464 | self.bins[collapse_start_index - 1] += collapsed_count
465 | self.max_key = new_max_key
466 | # shift the buckets to make room for new_min_key
467 | self._shift_bins(shift)
468 | else:
469 | self.max_key = new_max_key
470 | # shift the buckets to make room for new_max_key
471 | self._shift_bins(shift)
472 |
473 | self.min_key = new_min_key
474 | self.is_collapsed = True
475 | else:
476 | self._center_bins(new_min_key, new_max_key)
477 | self.min_key = new_min_key
478 | self.max_key = new_max_key
479 |
480 | def merge(self, store): # type: ignore[override]
481 | # type: (CollapsingHighestDenseStore) -> None # type: ignore[override]
482 | """Override."""
483 | if store.count == 0:
484 | return
485 |
486 | if self.count == 0:
487 | self.copy(store)
488 | return
489 |
490 | if store.min_key < self.min_key or store.max_key > self.max_key:
491 | self._extend_range(store.min_key, store.max_key)
492 |
493 | collapse_end_idx = store.max_key - store.offset + 1
494 | collapse_start_idx = max(self.max_key + 1, store.min_key) - store.offset
495 | if collapse_end_idx > collapse_start_idx:
496 | collapse_count = sum(store.bins[collapse_start_idx:collapse_end_idx])
497 | self.bins[-1] += collapse_count
498 | else:
499 |
collapse_start_idx = collapse_end_idx 500 | 501 | for key in range(store.min_key, collapse_start_idx + store.offset): 502 | self.bins[key - self.offset] += store.bins[key - store.offset] 503 | 504 | self.count += store.count 505 | -------------------------------------------------------------------------------- /docker-compose.yml: -------------------------------------------------------------------------------- 1 | version: "3" 2 | 3 | services: 4 | dev: 5 | # The dd-trace-py image includes all required versions of Python. 6 | image: datadog/dd-trace-py:buster 7 | command: bash 8 | network_mode: host 9 | working_dir: /src 10 | volumes: 11 | - ./:/src 12 | -------------------------------------------------------------------------------- /mypy.ini: -------------------------------------------------------------------------------- 1 | [mypy] 2 | files = ddsketch,tests 3 | show_error_codes = true 4 | warn_return_any = true 5 | warn_unused_ignores = true 6 | warn_unused_configs = true 7 | no_implicit_optional = true 8 | ignore_missing_imports = true 9 | 10 | [mypy-ddsketch.pb.*] 11 | ignore_errors = true 12 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [tool.isort] 2 | force_single_line = true 3 | lines_after_imports = 2 4 | force_sort_within_sections = true 5 | known_first_party = "ddsketch" 6 | default_section = "THIRDPARTY" 7 | skip = [".riot/", ".venv/", "ddsketch/pb", "ddsketch/__version.py"] 8 | line_length = 120 9 | 10 | [tool.black] 11 | exclude = ''' 12 | ^/( 13 | ( 14 | \.riot 15 | | ddsketch/pb.* 16 | | \.venv.* 17 | | \.eggs 18 | )/ 19 | | ddsketch/__version.py 20 | ) 21 | ''' 22 | -------------------------------------------------------------------------------- /releasenotes/config.yaml: -------------------------------------------------------------------------------- 1 | --- 2 | unreleased_version_title: Unreleased 3 | -------------------------------------------------------------------------------- /releasenotes/notes/ddsketch-api-a84ffc0875bbacd6.yaml: -------------------------------------------------------------------------------- 1 | --- 2 | upgrade: 3 | - "``DDSketch`` attributes ``mapping``, ``store``, ``negative_store``, ``zero_count``, ``relative_accuracy``, ``min`` and ``max`` have been removed." 4 | - "``DDSketch.copy`` method has been removed." 5 | - "``DDSketch.count`` attribute has been made read-only." 6 | - "``DDSketch.mergeable`` method has been removed." 7 | -------------------------------------------------------------------------------- /releasenotes/notes/extend-range-06474632c8235187.yaml: -------------------------------------------------------------------------------- 1 | --- 2 | fixes: 3 | - | 4 | Fix merging stores with max_key=0. 5 | -------------------------------------------------------------------------------- /releasenotes/notes/oldpy-db6189c9b26e10f7.yaml: -------------------------------------------------------------------------------- 1 | --- 2 | other: 3 | - | 4 | This release drops support for Python versions older than 3.7. 5 | -------------------------------------------------------------------------------- /releasenotes/notes/pbopt-ec6525c1948d782f.yaml: -------------------------------------------------------------------------------- 1 | --- 2 | other: 3 | - | 4 | This change makes protobuf an optional requirement. It can be installed with ``pip install ddsketch[serialization]``. 
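For context, a minimal usage sketch of the optional serialization extra (illustrative only; it assumes protobuf is installed, and DDSketchProto and the module-level pb binding are the ones defined in ddsketch/pb/proto.py above):

    # pip install ddsketch[serialization]
    from ddsketch import DDSketch
    from ddsketch.pb import proto

    sketch = DDSketch(0.01)
    for value in (1.0, 2.5, 42.0):
        sketch.add(value)

    # Round-trip through the protobuf wire format.
    blob = proto.DDSketchProto.to_proto(sketch).SerializeToString()
    restored = proto.DDSketchProto.from_proto(proto.pb.DDSketch.FromString(blob))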
5 | -------------------------------------------------------------------------------- /releasenotes/notes/proto4-e8646610178bef59.yaml: -------------------------------------------------------------------------------- 1 | --- 2 | fixes: 3 | - | 4 | Add support for protobuf 4. 5 | -------------------------------------------------------------------------------- /releasenotes/notes/protobuf-min-f6af9a2d5d96f53c.yaml: -------------------------------------------------------------------------------- 1 | --- 2 | other: 3 | - | 4 | Add support for protobuf>=3.0.0. 5 | -------------------------------------------------------------------------------- /releasenotes/notes/py2-c963608396db7258.yaml: -------------------------------------------------------------------------------- 1 | --- 2 | features: 3 | - | 4 | Add support for Python 2. 5 | -------------------------------------------------------------------------------- /releasenotes/notes/py310-ac5baa9b0b69008a.yaml: -------------------------------------------------------------------------------- 1 | --- 2 | features: 3 | - | 4 | Add support for Python 3.10. 5 | -------------------------------------------------------------------------------- /releasenotes/notes/remove-custom-exceptions-e2bc67a72250269d.yaml: -------------------------------------------------------------------------------- 1 | --- 2 | upgrade: 3 | - | 4 | The custom exceptions ``IllegalArgumentException`` and ``UnequalSketchParametersException`` 5 | as well as the ``ddsketch.exceptions`` module have been removed. 6 | 7 | ``IllegalArgumentException`` and ``UnequalSketchParametersException`` are replaced with ``ValueError``. 8 | -------------------------------------------------------------------------------- /releasenotes/notes/remove-numpy-25fedcd9be9d6d80.yaml: -------------------------------------------------------------------------------- 1 | --- 2 | prelude: > 3 | numpy has been removed as a dependency. 4 | upgrade: 5 | - | 6 | ``BaseDDSketch.get_quantile_value`` will now return ``None`` instead of 7 | ``numpy.NaN`` if the specified quantile is empty. 8 | -------------------------------------------------------------------------------- /releasenotes/notes/tests-wheel-bf71b228c86a9ced.yaml: -------------------------------------------------------------------------------- 1 | --- 2 | fixes: 3 | - | 4 | Exclude the tests module from the package. 5 | -------------------------------------------------------------------------------- /releasenotes/notes/toplevelapi-6c04f2ca35a49d4b.yaml: -------------------------------------------------------------------------------- 1 | --- 2 | features: 3 | - | 4 | The implementations of stores and mappings are now exposed via the top 5 | level module ``ddsketch``. 6 | -------------------------------------------------------------------------------- /releasenotes/notes/typing-25579ab88323a332.yaml: -------------------------------------------------------------------------------- 1 | --- 2 | features: 3 | - | 4 | Add typing. 5 | -------------------------------------------------------------------------------- /releasenotes/notes/version-b2a276df190a703a.yaml: -------------------------------------------------------------------------------- 1 | --- 2 | features: 3 | - | 4 | The package version is now exposed through ``ddsketch.__version__``. 
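As a quick illustration (a hypothetical session, not part of the repository):

    import ddsketch
    # Prints whatever version string setuptools_scm wrote into ddsketch/__version.py.
    print(ddsketch.__version__)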
5 | -------------------------------------------------------------------------------- /riotfile.py: -------------------------------------------------------------------------------- 1 | from riot import Venv 2 | from riot import latest 3 | 4 | 5 | venv = Venv( 6 | pys=["3"], 7 | venvs=[ 8 | Venv( 9 | name="test", 10 | command="pytest {cmdargs}", 11 | pkgs={ 12 | "pytest": latest, 13 | "numpy": latest, 14 | }, 15 | venvs=[ 16 | Venv( 17 | pys=["3.7", "3.8", "3.9"], 18 | pkgs={ 19 | "protobuf": ["==3.0.0", latest], 20 | }, 21 | ), 22 | Venv( 23 | pys=["3.10", "3.11", "3.12"], 24 | pkgs={ 25 | "protobuf": ["==3.8.0", latest], 26 | }, 27 | ), 28 | ], 29 | ), 30 | Venv( 31 | pkgs={ 32 | "reno": latest, 33 | }, 34 | venvs=[ 35 | Venv( 36 | name="reno", 37 | command="reno {cmdargs}", 38 | ) 39 | ], 40 | ), 41 | Venv( 42 | name="flake8", 43 | command="flake8 {cmdargs}", 44 | pkgs={ 45 | "flake8": latest, 46 | "flake8-blind-except": latest, 47 | "flake8-builtins": latest, 48 | "flake8-docstrings": latest, 49 | "flake8-rst-docstrings": latest, 50 | # needed for some features from flake8-rst-docstrings 51 | "pygments": latest, 52 | }, 53 | ), 54 | Venv( 55 | pkgs={ 56 | "black": latest, 57 | "isort": latest, 58 | "toml": latest, 59 | }, 60 | venvs=[ 61 | Venv( 62 | name="black", 63 | command="black {cmdargs}", 64 | ), 65 | Venv( 66 | name="fmt", 67 | command="isort . && black .", 68 | ), 69 | Venv( 70 | name="check_fmt", 71 | command="isort --check . && black --check .", 72 | ), 73 | ], 74 | ), 75 | Venv( 76 | name="mypy", 77 | create=True, 78 | command="mypy --install-types --non-interactive {cmdargs}", 79 | pkgs={ 80 | "mypy": latest, 81 | "types-protobuf": latest, 82 | "types-setuptools": latest, 83 | "types-six": latest, 84 | }, 85 | ), 86 | ], 87 | ) 88 | -------------------------------------------------------------------------------- /scripts/check-releasenotes: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | set -e 3 | 4 | # If we are running outside a GitHub action, default to `master` 5 | BASE_REF="${GITHUB_BASE_REF:-master}" 6 | 7 | # Print input data 8 | echo "Base ref: origin/${BASE_REF}" 9 | echo "GitHub event path: ${GITHUB_EVENT_PATH}" 10 | echo "JQ: $(which jq)" 11 | 12 | 13 | # Skip the label check if we do not have a GitHub event path 14 | if [[ -f "${GITHUB_EVENT_PATH}" ]] && jq -e '.pull_request?.labels[]?.name | select(. == "no-changelog")' "${GITHUB_EVENT_PATH}"; 15 | then 16 | echo "PR has label 'no-changelog', skipping validation" 17 | exit 0 18 | fi 19 | 20 | # Check if they added a new file to releasenotes/notes 21 | if git diff --name-only --diff-filter=A "origin/${BASE_REF}" | grep releasenotes/notes; 22 | then 23 | echo "New release note found, success" 24 | exit 0 25 | else 26 | echo "Release note not found." 
27 | echo "Use 'reno new ' to add a new note to 'releasenotes/notes', or add the label 'no-changelog' to skip this validation" 28 | exit 1 29 | fi 30 | -------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- 1 | [flake8] 2 | application-import-name=ddsketch 3 | exclude= 4 | .riot, 5 | .git, 6 | .venv, 7 | __pycache__, 8 | *.eggs-info, 9 | build, 10 | ddsketch/pb, 11 | # E501,E231,W503: not respected by black 12 | ignore = E501,W503,E231,D100,D101,D102,D103,D104,D105,D107,D205,D400,D401,D402,E203,B902,I100 13 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | import setuptools 2 | 3 | 4 | with open("README.md", "r") as fh: 5 | long_description = fh.read() 6 | 7 | setuptools.setup( 8 | name="ddsketch", 9 | author="Jee Rim, Charles-Philippe Masson, Homin Lee", 10 | author_email="jee.rim@datadoghq.com, charles.masson@datadoghq.com, homin@datadoghq.com", 11 | description="Distributed quantile sketches", 12 | long_description=long_description, 13 | long_description_content_type="text/markdown", 14 | url="http://github.com/datadog/sketches-py", 15 | packages=setuptools.find_packages(exclude=["tests*"]), 16 | package_data={"ddsketch": ["py.typed"]}, 17 | classifiers=[ 18 | "Programming Language :: Python :: 3", 19 | "License :: OSI Approved :: Apache Software License", 20 | ], 21 | keywords=["ddsketch", "quantile", "sketch"], 22 | install_requires=[ 23 | "six", 24 | ], 25 | extras_require={"serialization": ["protobuf>=3.0.0"]}, 26 | python_requires=">=3.7", 27 | download_url="https://github.com/DataDog/sketches-py/archive/v1.0.tar.gz", 28 | setup_requires=["setuptools_scm"], 29 | use_scm_version={"write_to": "ddsketch/__version.py"}, 30 | ) 31 | -------------------------------------------------------------------------------- /tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DataDog/sketches-py/0d16e695d1f991276863b8ffaaf6c8e9bd9ad9de/tests/__init__.py -------------------------------------------------------------------------------- /tests/datasets.py: -------------------------------------------------------------------------------- 1 | # Unless explicitly stated otherwise all files in this repository are licensed 2 | # under the Apache License 2.0. 3 | # This product includes software developed at Datadog (https://www.datadoghq.com/). 4 | # Copyright 2020 Datadog, Inc. 
5 | 6 | import abc 7 | 8 | import numpy as np 9 | import six 10 | 11 | 12 | class Dataset(six.with_metaclass(abc.ABCMeta)): 13 | def __init__(self, size): 14 | self.size = int(size) 15 | self.data = self.populate() 16 | 17 | def __str__(self): 18 | return "{}_{}".format(self.name, self.size) 19 | 20 | def __len__(self): 21 | return self.size 22 | 23 | def rank(self, value): 24 | lower = np.array(sorted(self.data)) < value 25 | if np.all(lower): 26 | return self.size - 1 27 | else: 28 | return np.argmin(lower) 29 | 30 | def quantile(self, q): 31 | self.data.sort() 32 | rank = int(q * (self.size - 1)) 33 | return self.data[rank] 34 | 35 | @property 36 | def sum(self): # noqa: A003 37 | return np.sum(self.data) 38 | 39 | @property 40 | def avg(self): 41 | return np.mean(self.data) 42 | 43 | @abc.abstractmethod 44 | def name(self): 45 | """Name of dataset""" 46 | 47 | @abc.abstractmethod 48 | def populate(self): 49 | """Populate self.data with self.size values""" 50 | 51 | 52 | class EmptyDataset(Dataset): 53 | @property 54 | def name(self): 55 | return "no_name" 56 | 57 | def populate(self): 58 | return [] 59 | 60 | def add(self, val): 61 | self.size += 1 62 | self.data.append(val) 63 | 64 | def add_all(self, vals): 65 | self.size += len(vals) 66 | self.data.extend(vals) 67 | 68 | 69 | class UniformForward(Dataset): 70 | @property 71 | def name(self): 72 | return "uniform_forward" 73 | 74 | def populate(self): 75 | return list(self.generate()) 76 | 77 | def generate(self): 78 | for x in range(self.size): 79 | yield x 80 | 81 | 82 | class UniformBackward(Dataset): 83 | @property 84 | def name(self): 85 | return "uniform_backward" 86 | 87 | def populate(self): 88 | return list(self.generate()) 89 | 90 | def generate(self): 91 | for x in range(self.size, 0, -1): 92 | yield x 93 | 94 | 95 | class NegativeUniformForward(Dataset): 96 | @property 97 | def name(self): 98 | return "negative_uniform_forward" 99 | 100 | def populate(self): 101 | return list(self.generate()) 102 | 103 | def generate(self): 104 | for x in range(self.size, 0, -1): 105 | yield -x 106 | 107 | 108 | class NegativeUniformBackward(Dataset): 109 | @property 110 | def name(self): 111 | return "negative_uniform_backward" 112 | 113 | def populate(self): 114 | return list(self.generate()) 115 | 116 | def generate(self): 117 | for x in range(self.size): 118 | yield -x 119 | 120 | 121 | class NumberLineForward(Dataset): 122 | @property 123 | def name(self): 124 | return "number_line_forward" 125 | 126 | def populate(self): 127 | return list(self.generate()) 128 | 129 | def generate(self): 130 | for x in range(-self.size // 2 + 1, self.size // 2 + 1, 1): 131 | yield x 132 | 133 | 134 | class NumberLineBackward(Dataset): 135 | @property 136 | def name(self): 137 | return "number_line_backward" 138 | 139 | def populate(self): 140 | return list(self.generate()) 141 | 142 | def generate(self): 143 | for x in range(self.size // 2, -self.size // 2, -1): 144 | yield x 145 | 146 | 147 | class UniformZoomIn(Dataset): 148 | @property 149 | def name(self): 150 | return "uniform_zoomin" 151 | 152 | def populate(self): 153 | return list(self.generate()) 154 | 155 | def generate(self): 156 | if self.size % 2 == 1: 157 | for item in range(self.size // 2): 158 | yield item 159 | yield self.size - item - 1 160 | yield self.size // 2 161 | else: 162 | for item in range(self.size // 2): 163 | yield item 164 | yield self.size - item - 1 165 | 166 | 167 | class UniformZoomOut(Dataset): 168 | @property 169 | def name(self): 170 | return "uniform_zoomout" 
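# Values are emitted starting from the middle of the range and zooming
# outward, the mirror image of UniformZoomIn above.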
171 | 172 | def populate(self): 173 | return list(self.generate()) 174 | 175 | def generate(self): 176 | if self.size % 2 == 1: 177 | yield self.size // 2 178 | half = int(np.floor(self.size / 2)) 179 | for item in range(1, half + 1): 180 | yield half + item 181 | yield half - item 182 | else: 183 | half = int(np.ceil(self.size / 2)) - 0.5 184 | for item in range(0, int(half + 0.5)): 185 | yield int(half + item + 0.5) 186 | yield int(half - item - 0.5) 187 | 188 | 189 | class UniformSqrt(Dataset): 190 | @property 191 | def name(self): 192 | return "uniform_sqrt" 193 | 194 | def populate(self): 195 | return list(self.generate()) 196 | 197 | def generate(self): 198 | t = int(np.sqrt(2 * self.size)) 199 | initial_item = 0 200 | initial_skip = 1 201 | emitted = 0 202 | i = 0 203 | while emitted < self.size: 204 | item = initial_item 205 | skip = initial_skip 206 | for j in range(t - i): 207 | if item < self.size: 208 | yield item 209 | emitted += 1 210 | item += skip 211 | skip += 1 212 | if t - i > 1: 213 | initial_skip += 1 214 | initial_item += initial_skip 215 | i += 1 216 | else: 217 | initial_item += 1 218 | 219 | 220 | class Constant(Dataset): 221 | constant = 42.0 222 | 223 | @property 224 | def name(self): 225 | return "constant" 226 | 227 | def populate(self): 228 | return [self.constant] * self.size 229 | 230 | 231 | class Exponential(Dataset): 232 | scale = 0.01 233 | 234 | @classmethod 235 | def from_params(cls, scale, n): 236 | cls.scale = scale 237 | return cls(n) 238 | 239 | @property 240 | def name(self): 241 | return "exponential" 242 | 243 | def populate(self): 244 | return np.random.exponential(scale=self.scale, size=self.size) 245 | 246 | 247 | class Lognormal(Dataset): 248 | scale = 100.0 249 | 250 | @classmethod 251 | def from_params(cls, scale, n): 252 | cls.scale = scale 253 | return cls(n) 254 | 255 | @property 256 | def name(self): 257 | return "lognormal" 258 | 259 | def populate(self): 260 | return np.random.lognormal(size=self.size) / self.scale 261 | 262 | 263 | class Normal(Dataset): 264 | loc = 37.4 265 | scale = 1.0 266 | 267 | @classmethod 268 | def from_params(cls, loc, scale, n): 269 | cls.loc = loc 270 | cls.scale = scale 271 | return cls(n) 272 | 273 | @property 274 | def name(self): 275 | return "normal" 276 | 277 | def populate(self): 278 | return np.random.normal(loc=self.loc, scale=self.scale, size=self.size) 279 | 280 | 281 | class Laplace(Dataset): 282 | loc = 11278.0 283 | scale = 100.0 284 | 285 | @classmethod 286 | def from_params(cls, loc, scale, n): 287 | cls.loc = loc 288 | cls.scale = scale 289 | return cls(n) 290 | 291 | @property 292 | def name(self): 293 | return "laplace" 294 | 295 | def populate(self): 296 | return np.random.laplace(loc=self.loc, scale=self.scale, size=self.size) 297 | 298 | 299 | class Bimodal(Dataset): 300 | right_loc = 17.3 301 | left_loc = -2.0 302 | left_std = 3.0 303 | 304 | @property 305 | def name(self): 306 | return "bimodal" 307 | 308 | def populate(self): 309 | return [next(self.generate()) for _ in range(int(self.size))] 310 | 311 | def generate(self): 312 | if np.random.random() > 0.5: 313 | yield np.random.laplace(self.right_loc) 314 | else: 315 | yield np.random.normal(self.left_loc, self.left_std) 316 | 317 | 318 | class Mixed(Dataset): 319 | mean = 0.0 320 | sigma = 0.25 321 | scale_factor = 0.1 322 | 323 | loc = 10.0 324 | scale = 0.5 325 | 326 | def __init__(self, size, ratio=0.9, ignore_rank=False): 327 | self.size = int(size) 328 | self.ratio = ratio 329 | self.data = self.populate() 330 | 
self._ignore_rank = ignore_rank 331 | 332 | @property 333 | def name(self): 334 | return "mixed" 335 | 336 | def populate(self): 337 | return [next(self.generate()) for _ in range(int(self.size))] 338 | 339 | def generate(self): 340 | if np.random.random() < self.ratio: 341 | yield self.scale_factor * np.random.lognormal(self.mean, self.sigma) 342 | else: 343 | yield np.random.normal(self.loc, self.scale) 344 | 345 | 346 | class Trimodal(Dataset): 347 | right_loc = 17.3 348 | left_loc = 5.0 349 | left_std = 0.5 350 | exp_scale = 0.01 351 | 352 | @property 353 | def name(self): 354 | return "trimodal" 355 | 356 | def populate(self): 357 | return [next(self.generate()) for _ in range(int(self.size))] 358 | 359 | def generate(self): 360 | if np.random.random() > 2.0 / 3.0: 361 | yield np.random.laplace(self.right_loc) 362 | elif np.random.random() > 1.0 / 3.0: 363 | yield np.random.normal(self.left_loc, self.left_std) 364 | else: 365 | yield np.random.exponential(scale=self.exp_scale) 366 | 367 | 368 | class Integers(Dataset): 369 | loc = 4.3 370 | scale = 5.0 371 | 372 | @classmethod 373 | def from_params(cls, loc, scale, n): 374 | cls.loc = loc 375 | cls.scale = scale 376 | return cls(n) 377 | 378 | @property 379 | def name(self): 380 | return "integers" 381 | 382 | def populate(self): 383 | return [ 384 | int(x) 385 | for x in np.random.normal(loc=self.loc, scale=self.scale, size=self.size) 386 | ] 387 | -------------------------------------------------------------------------------- /tests/test_ddsketch.py: -------------------------------------------------------------------------------- 1 | # Unless explicitly stated otherwise all files in this repository are licensed 2 | # under the Apache License 2.0. 3 | # This product includes software developed at Datadog (https://www.datadoghq.com/). 4 | # Copyright 2020 Datadog, Inc. 
5 | 6 | """Tests for DDSketch""" 7 | 8 | import abc 9 | from collections import Counter 10 | from unittest import TestCase 11 | 12 | import numpy as np 13 | import pytest 14 | import six 15 | 16 | import ddsketch 17 | from ddsketch.ddsketch import DDSketch 18 | from ddsketch.ddsketch import LogCollapsingHighestDenseDDSketch 19 | from ddsketch.ddsketch import LogCollapsingLowestDenseDDSketch 20 | from tests.datasets import Bimodal 21 | from tests.datasets import Constant 22 | from tests.datasets import EmptyDataset 23 | from tests.datasets import Exponential 24 | from tests.datasets import Integers 25 | from tests.datasets import Laplace 26 | from tests.datasets import Lognormal 27 | from tests.datasets import Mixed 28 | from tests.datasets import NegativeUniformBackward 29 | from tests.datasets import NegativeUniformForward 30 | from tests.datasets import Normal 31 | from tests.datasets import NumberLineBackward 32 | from tests.datasets import NumberLineForward 33 | from tests.datasets import Trimodal 34 | from tests.datasets import UniformBackward 35 | from tests.datasets import UniformForward 36 | from tests.datasets import UniformSqrt 37 | from tests.datasets import UniformZoomIn 38 | from tests.datasets import UniformZoomOut 39 | 40 | 41 | TEST_QUANTILES = [0, 0.1, 0.25, 0.5, 0.75, 0.9, 0.95, 0.99, 0.999, 1] 42 | TEST_SIZES = [3, 5, 10, 100, 1000] 43 | DATASETS = [ 44 | UniformForward, 45 | UniformBackward, 46 | UniformZoomIn, 47 | UniformZoomOut, 48 | UniformSqrt, 49 | Constant, 50 | NegativeUniformBackward, 51 | NegativeUniformForward, 52 | NumberLineBackward, 53 | NumberLineForward, 54 | Exponential, 55 | Lognormal, 56 | Normal, 57 | Laplace, 58 | Bimodal, 59 | Trimodal, 60 | Mixed, 61 | Integers, 62 | ] 63 | 64 | TEST_REL_ACC = 0.05 65 | TEST_BIN_LIMIT = 1024 66 | 67 | 68 | class BaseTestDDSketches(six.with_metaclass(abc.ABCMeta)): 69 | """AbstractBaseClass for testing DDSketch implementations""" 70 | 71 | @staticmethod 72 | @abc.abstractmethod 73 | def _new_dd_sketch(): 74 | """Create a new DDSketch of the appropriate type""" 75 | 76 | def _evaluate_sketch_accuracy(self, sketch, data, eps, summary_stats=True): 77 | size = data.size 78 | for quantile in TEST_QUANTILES: 79 | sketch_q = sketch.get_quantile_value(quantile) 80 | data_q = data.quantile(quantile) 81 | err = abs(sketch_q - data_q) 82 | assert err - eps * abs(data_q) <= 1e-15 83 | assert sketch.num_values == size 84 | if summary_stats: 85 | assert sketch.sum == pytest.approx(data.sum) 86 | assert sketch.avg == pytest.approx(data.avg) 87 | 88 | def test_distributions(self): 89 | """Test DDSketch on values from various distributions""" 90 | for dataset in DATASETS: 91 | for size in TEST_SIZES: 92 | data = dataset(size) 93 | sketch = self._new_dd_sketch() 94 | for value in data.data: 95 | sketch.add(value) 96 | self._evaluate_sketch_accuracy(sketch, data, TEST_REL_ACC) 97 | 98 | def test_add_multiple(self): 99 | """Test DDSketch on adding integer weighted values""" 100 | data = Integers(1000) 101 | sketch = self._new_dd_sketch() 102 | for value, count in Counter(data.data).items(): 103 | sketch.add(value, count) 104 | self._evaluate_sketch_accuracy(sketch, data, TEST_REL_ACC) 105 | 106 | def test_add_decimal(self): 107 | """Test DDSketch on adding decimal weighted values""" 108 | sketch = self._new_dd_sketch() 109 | for value in range(100): 110 | sketch.add(value, 1.1) 111 | sketch.add(100, 110.0) 112 | 113 | data_median = 99 114 | sketch_median = sketch.get_quantile_value(0.5) 115 | err = abs(sketch_median - data_median) 116 
| assert err - TEST_REL_ACC * abs(data_median) <= 1e-15
117 | assert sketch.num_values == pytest.approx(110 * 2)
118 | assert sketch.sum == pytest.approx(5445 + 11000)
119 | assert sketch.avg == pytest.approx(74.75)
120 |
121 | def test_merge_equal(self):
122 | """Test merging equal-sized DDSketches"""
123 | parameters = [(35, 1), (1, 3), (15, 2), (40, 0.5)]
124 | for size in TEST_SIZES:
125 | dataset = EmptyDataset(0)
126 | target_sketch = self._new_dd_sketch()
127 | for params in parameters:
128 | generator = Normal.from_params(params[0], params[1], size)
129 | sketch = self._new_dd_sketch()
130 | for value in generator.data:
131 | sketch.add(value)
132 | dataset.add(value)
133 | target_sketch.merge(sketch)
134 | self._evaluate_sketch_accuracy(target_sketch, dataset, TEST_REL_ACC)
135 |
136 | self._evaluate_sketch_accuracy(target_sketch, dataset, TEST_REL_ACC)
137 |
138 | def test_merge_unequal(self):
139 | """Test merging variable-sized DDSketches"""
140 | ntests = 20
141 | for _ in range(ntests):
142 | for size in TEST_SIZES:
143 | dataset = Lognormal(size)
144 | sketch1 = self._new_dd_sketch()
145 | sketch2 = self._new_dd_sketch()
146 | for value in dataset.data:
147 | if np.random.random() > 0.7:
148 | sketch1.add(value)
149 | else:
150 | sketch2.add(value)
151 | sketch1.merge(sketch2)
152 | self._evaluate_sketch_accuracy(sketch1, dataset, TEST_REL_ACC)
153 |
154 | def test_merge_mixed(self):
155 | """Test merging DDSketches of different distributions"""
156 | ntests = 20
157 | test_datasets = [Normal, Exponential, Laplace, Bimodal]
158 | for _ in range(ntests):
159 | merged_dataset = EmptyDataset(0)
160 | merged_sketch = self._new_dd_sketch()
161 | for dataset in test_datasets:
162 | generator = dataset(np.random.randint(0, 500))
163 | sketch = self._new_dd_sketch()
164 | for value in generator.data:
165 | sketch.add(value)
166 | merged_dataset.add(value)
167 | merged_sketch.merge(sketch)
168 | self._evaluate_sketch_accuracy(merged_sketch, merged_dataset, TEST_REL_ACC)
169 |
170 | def test_consistent_merge(self):
171 | """Test that merge() calls do not modify the argument sketch."""
172 | sketch1 = self._new_dd_sketch()
173 | sketch2 = self._new_dd_sketch()
174 | dataset = Normal(100)
175 | for value in dataset.data:
176 | sketch1.add(value)
177 | sketch1.merge(sketch2)
178 | # sketch2 is still empty
179 | assert sketch2.num_values == 0
180 |
181 | dataset = Normal(50)
182 | for value in dataset.data:
183 | sketch2.add(value)
184 |
185 | sketch2_summary = [sketch2.get_quantile_value(q) for q in TEST_QUANTILES] + [
186 | sketch2.sum,
187 | sketch2.avg,
188 | sketch2.num_values,
189 | ]
190 | sketch1.merge(sketch2)
191 |
192 | dataset = Normal(10)
193 | for value in dataset.data:
194 | sketch1.add(value)
195 | # changes to sketch1 do not affect sketch2 after merge; sketch2 must
196 | # still match the summary captured before the merge, so it is asserted
197 | # against that snapshot rather than recomputed here
198 |
199 |
200 |
201 | assert sketch2_summary == pytest.approx(
202 | [sketch2.get_quantile_value(q) for q in TEST_QUANTILES]
203 | + [sketch2.sum, sketch2.avg, sketch2.num_values],
204 | )
205 |
206 | sketch3 = self._new_dd_sketch()
207 | sketch3.merge(sketch2)
208 | # merging into an empty sketch does not change sketch2
209 | assert sketch2_summary == pytest.approx(
210 | [sketch2.get_quantile_value(q) for q in TEST_QUANTILES]
211 | + [sketch2.sum, sketch2.avg, sketch2.num_values],
212 | )
213 |
214 |
215 | class TestDDSketch(BaseTestDDSketches, TestCase):
216 | """Class for testing
DDSketch"""
217 |
218 | @staticmethod
219 | def _new_dd_sketch():
220 | return DDSketch(TEST_REL_ACC)
221 |
222 |
223 | class TestLogCollapsingLowestDenseDDSketch(BaseTestDDSketches, TestCase):
224 | """Class for testing LogCollapsingLowestDenseDDSketch"""
225 |
226 | @staticmethod
227 | def _new_dd_sketch():
228 | return LogCollapsingLowestDenseDDSketch(TEST_REL_ACC, TEST_BIN_LIMIT)
229 |
230 |
231 | class TestLogCollapsingHighestDenseDDSketch(BaseTestDDSketches, TestCase):
232 | """Class for testing LogCollapsingHighestDenseDDSketch"""
233 |
234 | @staticmethod
235 | def _new_dd_sketch():
236 | return LogCollapsingHighestDenseDDSketch(TEST_REL_ACC, TEST_BIN_LIMIT)
237 |
238 |
239 | def test_version():
240 | """Ensure the package version is exposed by the API."""
241 | assert hasattr(ddsketch, "__version__")
242 | assert isinstance(ddsketch.__version__, str)
243 |
-------------------------------------------------------------------------------- /tests/test_mapping.py: --------------------------------------------------------------------------------
1 | # Unless explicitly stated otherwise all files in this repository are licensed
2 | # under the Apache License 2.0.
3 | # This product includes software developed at Datadog (https://www.datadoghq.com/).
4 | # Copyright 2020 Datadog, Inc.
5 |
6 | """Tests for the KeyMapping classes"""
7 |
8 | import abc
9 | import math
10 | from unittest import TestCase
11 |
12 | import numpy
13 | import pytest
14 | import six
15 |
16 | from ddsketch.mapping import CubicallyInterpolatedMapping
17 | from ddsketch.mapping import LinearlyInterpolatedMapping
18 | from ddsketch.mapping import LogarithmicMapping
19 | from ddsketch.mapping import _cbrt
20 |
21 |
22 | def _relative_error(expected_min, expected_max, actual):
23 | """Calculate the relative error"""
24 | if expected_min < 0 or expected_max < 0 or actual < 0:
25 | raise ValueError("expected and actual values must be non-negative")
26 | if (expected_min <= actual) and (actual <= expected_max):
27 | return 0.0
28 | if expected_min == 0 and expected_max == 0:
29 | return 0.0 if actual == 0 else float("+inf")
30 | if actual < expected_min:
31 | return (expected_min - actual) / expected_min
32 |
33 | return (actual - expected_max) / expected_max
34 |
35 |
36 | def _test_value_rel_acc(mapping):
37 | """Calculate the relative accuracy of a mapping on a large range of values"""
38 | value_mult = 2 - math.sqrt(2) * 1e-1
39 | max_relative_acc = 0.0
40 | value = mapping.min_possible
41 | while value < mapping.max_possible / value_mult:
42 | value *= value_mult
43 | map_val = mapping.value(mapping.key(value))
44 | rel_err = _relative_error(value, value, map_val)
45 | assert rel_err < mapping.relative_accuracy
46 | max_relative_acc = max(max_relative_acc, rel_err)
47 | max_relative_acc = max(
48 | max_relative_acc,
49 | _relative_error(
50 | mapping.max_possible,
51 | mapping.max_possible,
52 | mapping.value(mapping.key(mapping.max_possible)),
53 | ),
54 | )
55 | return max_relative_acc
56 |
57 |
58 | class BaseTestKeyMapping(six.with_metaclass(abc.ABCMeta)):
59 | """Abstract class for testing KeyMapping classes"""
60 |
61 | offsets = [0, 1, -12.23, 7768.3]
62 |
63 | @abc.abstractmethod
64 | def mapping(self, relative_accuracy, offset):
65 | """Return the KeyMapping instance to be tested"""
66 |
67 | def test_accuracy(self):
68 | """Test the mapping on a large range of relative accuracies"""
69 | rel_acc_mult = 1 - math.sqrt(2) * 1e-1
70 | min_rel_acc = 1e-8
71 | rel_acc = 1 - 1e-3
72 |
73 | while rel_acc >= min_rel_acc:
74 | mapping =
self.mapping(rel_acc, offset=0.0)
75 | max_rel_acc = _test_value_rel_acc(mapping)
76 | assert max_rel_acc < mapping.relative_accuracy
77 | rel_acc *= rel_acc_mult
78 |
79 | def test_offsets(self):
80 | """Test offsets"""
81 | for offset in self.offsets:
82 | mapping = self.mapping(0.01, offset=offset)
83 | assert mapping.key(1) == int(offset)
84 |
85 |
86 | class TestLogarithmicMapping(BaseTestKeyMapping, TestCase):
87 | """Class for testing LogarithmicMapping class"""
88 |
89 | def mapping(self, relative_accuracy, offset):
90 | return LogarithmicMapping(relative_accuracy, offset)
91 |
92 |
93 | class TestLinearlyInterpolatedMapping(BaseTestKeyMapping, TestCase):
94 | """Class for testing LinearlyInterpolatedMapping class"""
95 |
96 | def mapping(self, relative_accuracy, offset):
97 | return LinearlyInterpolatedMapping(relative_accuracy, offset)
98 |
99 |
100 | class TestCubicallyInterpolatedMapping(BaseTestKeyMapping, TestCase):
101 | """Class for testing CubicallyInterpolatedMapping class"""
102 |
103 | def mapping(self, relative_accuracy, offset):
104 | return CubicallyInterpolatedMapping(relative_accuracy, offset)
105 |
106 |
107 | @pytest.mark.parametrize("x", [-12.3, -1.0, -1.0 / 3.0, 0.0, 1.0, 1.0 / 3.0, 2.0**10])
108 | def test_cbrt(x):
109 | assert pytest.approx(_cbrt(x)) == numpy.cbrt(x)
110 |
-------------------------------------------------------------------------------- /tests/test_proto.py: --------------------------------------------------------------------------------
1 | import abc
2 | from unittest import TestCase
3 |
4 | import pytest
5 | import six
6 |
7 | from ddsketch.mapping import CubicallyInterpolatedMapping
8 | from ddsketch.mapping import LinearlyInterpolatedMapping
9 | from ddsketch.mapping import LogarithmicMapping
10 | from ddsketch.pb.proto import DDSketchProto
11 | from ddsketch.pb.proto import KeyMappingProto
12 | from ddsketch.pb.proto import StoreProto
13 | from ddsketch.store import DenseStore
14 | from tests.test_ddsketch import TestDDSketch
15 | from tests.test_store import TestDenseStore
16 |
17 |
18 | class BaseTestKeyMappingProto(six.with_metaclass(abc.ABCMeta)):
19 | offsets = [0, 1, -12.23, 7768.3]
20 |
21 | def test_round_trip(self):
22 | rel_accs = [1e-1, 1e-2, 1e-8]
23 | for rel_acc in rel_accs:
24 | for offset in self.offsets:
25 | mapping = self.mapping(rel_acc, offset)
26 | round_trip_mapping = KeyMappingProto.from_proto(
27 | KeyMappingProto.to_proto(mapping)
28 | )
29 | assert type(mapping) == type(round_trip_mapping) # noqa: E721
30 | assert mapping.relative_accuracy == pytest.approx(
31 | round_trip_mapping.relative_accuracy
32 | )
33 | assert mapping.value(0) == pytest.approx(round_trip_mapping.value(0))
34 |
35 |
36 | class TestLogarithmicMapping(BaseTestKeyMappingProto, TestCase):
37 | """Class for testing LogarithmicMapping class"""
38 |
39 | def mapping(self, relative_accuracy, offset):
40 | return LogarithmicMapping(relative_accuracy, offset)
41 |
42 |
43 | class TestLinearlyInterpolatedMapping(BaseTestKeyMappingProto, TestCase):
44 | """Class for testing LinearlyInterpolatedMapping class"""
45 |
46 | def mapping(self, relative_accuracy, offset):
47 | return LinearlyInterpolatedMapping(relative_accuracy, offset)
48 |
49 |
50 | class TestCubicallyInterpolatedMapping(BaseTestKeyMappingProto, TestCase):
51 | """Class for testing CubicallyInterpolatedMapping class"""
52 |
53 | def mapping(self, relative_accuracy, offset):
54 | return CubicallyInterpolatedMapping(relative_accuracy, offset)
55 |
56 |
57 | class
TestStoreProto(TestDenseStore, TestCase): 58 | def _test_store(self, values): 59 | store = DenseStore() 60 | for val in values: 61 | store.add(val) 62 | self._test_values(StoreProto.from_proto(StoreProto.to_proto(store)), values) 63 | 64 | 65 | class TestDDSketchProto(TestDDSketch, TestCase): 66 | def _evaluate_sketch_accuracy(self, sketch, data, eps, summary_stats=False): 67 | round_trip_sketch = DDSketchProto.from_proto(DDSketchProto.to_proto(sketch)) 68 | super(TestDDSketchProto, self)._evaluate_sketch_accuracy( 69 | round_trip_sketch, data, eps, summary_stats 70 | ) 71 | 72 | def test_add_multiple(self): 73 | """Override.""" 74 | 75 | def test_add_decimal(self): 76 | """Override.""" 77 | 78 | def test_merge_equal(self): 79 | """Override.""" 80 | 81 | def test_merge_unequal(self): 82 | """Override.""" 83 | 84 | def test_merge_mixed(self): 85 | """Override.""" 86 | 87 | def test_consistent_merge(self): 88 | """Override.""" 89 | -------------------------------------------------------------------------------- /tests/test_store.py: -------------------------------------------------------------------------------- 1 | # Unless explicitly stated otherwise all files in this repository are licensed 2 | # under the Apache License 2.0. 3 | # This product includes software developed at Datadog (https://www.datadoghq.com/). 4 | # Copyright 2020 Datadog, Inc. 5 | 6 | """Tests for the Store classes""" 7 | 8 | import abc 9 | from collections import Counter 10 | import sys 11 | from unittest import TestCase 12 | 13 | import six 14 | 15 | from ddsketch.store import CollapsingHighestDenseStore 16 | from ddsketch.store import CollapsingLowestDenseStore 17 | from ddsketch.store import DenseStore 18 | 19 | 20 | TEST_BIN_LIMIT = [1, 20, 1000] 21 | EXTREME_MAX = sys.maxsize 22 | EXTREME_MIN = -sys.maxsize - 1 23 | 24 | 25 | class BaseTestStore(six.with_metaclass(abc.ABCMeta)): 26 | """Base class for testing Store classes""" 27 | 28 | @abc.abstractmethod 29 | def _test_values(self, store, values): 30 | """Test the store's bin counts against what we expect""" 31 | 32 | @abc.abstractmethod 33 | def _test_store(self, values): 34 | """Initialize the store; add the values; call _test_values""" 35 | 36 | @abc.abstractmethod 37 | def _test_merging(self, list_values): 38 | """ 39 | Initialize the stores; for each values in list_values, add them to the 40 | corresponding store; merge the stores; test the merged store's bin 41 | counts against what we expect. 
42 | """ 43 | 44 | def test_empty(self): 45 | """Test no values""" 46 | values = [] 47 | self._test_store(values) 48 | 49 | def test_constant(self): 50 | """Test a constant stream of values""" 51 | values = [0] * 10000 52 | self._test_store(values) 53 | 54 | def test_increasing_linearly(self): 55 | """Test a stream of increasing values""" 56 | values = list(range(10000)) 57 | self._test_store(values) 58 | 59 | def test_decreasing_linearly(self): 60 | """Test a stream of decreasing values""" 61 | values = list(reversed(range(10000))) 62 | self._test_store(values) 63 | 64 | def test_increasing_exponentially(self): 65 | """Test a stream of values increasing exponentially""" 66 | values = [2**x for x in range(16)] 67 | self._test_store(values) 68 | 69 | def test_decreasing_exponentially(self): 70 | """Test a stream of values decreasing exponentially""" 71 | values = [2**x for x in reversed(range(16))] 72 | self._test_store(values) 73 | 74 | def test_bin_counts(self): 75 | """Test bin counts for positive and negative numbers""" 76 | values = [x for x in range(10) for i in range(2 * x)] 77 | self._test_store(values) 78 | 79 | values = [-x for x in range(10) for i in range(2 * x)] 80 | self._test_store(values) 81 | 82 | def test_extreme_values(self): 83 | """Test extreme values""" 84 | self._test_store([EXTREME_MAX]) 85 | self._test_store([EXTREME_MIN]) 86 | self._test_store([0, EXTREME_MIN]) 87 | self._test_store([0, EXTREME_MAX]) 88 | self._test_store([EXTREME_MIN, EXTREME_MAX]) 89 | self._test_store([EXTREME_MAX, EXTREME_MIN]) 90 | 91 | def test_merging_empty(self): 92 | """Test merging empty stores""" 93 | self._test_merging([[], []]) 94 | 95 | def test_merging_far_apart(self): 96 | """Test merging stores with values that are far apart""" 97 | self._test_merging([[-10000], [10000]]) 98 | self._test_merging([[10000], [-10000]]) 99 | self._test_merging([[10000], [-10000], [0]]) 100 | self._test_merging([[10000, 0], [-10000], [0]]) 101 | 102 | def test_merging_constant(self): 103 | """Test merging stores with the same constants""" 104 | self._test_merging([[2, 2], [2, 2, 2], [2]]) 105 | self._test_merging([[-8, -8], [-8]]) 106 | 107 | def test_merging_extreme_values(self): 108 | """Test merging stores with extreme values""" 109 | self._test_merging([[0], [EXTREME_MIN]]) 110 | self._test_merging([[0], [EXTREME_MAX]]) 111 | self._test_merging([[EXTREME_MIN], [0]]) 112 | self._test_merging([[EXTREME_MAX], [0]]) 113 | self._test_merging([[EXTREME_MIN], [EXTREME_MIN]]) 114 | self._test_merging([[EXTREME_MAX], [EXTREME_MAX]]) 115 | self._test_merging([[EXTREME_MIN], [EXTREME_MAX]]) 116 | self._test_merging([[EXTREME_MAX], [EXTREME_MIN]]) 117 | self._test_merging([[0], [EXTREME_MIN, EXTREME_MAX]]) 118 | self._test_merging([[EXTREME_MIN, EXTREME_MAX], [0]]) 119 | 120 | def test_copying_empty(self): 121 | """Test copying empty stores""" 122 | store = CollapsingLowestDenseStore(10) 123 | store.copy(CollapsingLowestDenseStore(10)) 124 | assert store.count == 0 125 | 126 | def test_copying_non_empty(self): 127 | """Test copying stores""" 128 | store = CollapsingLowestDenseStore(10) 129 | new_store = CollapsingLowestDenseStore(10) 130 | new_store.add(0) 131 | store.copy(new_store) 132 | assert store.count == 1 133 | 134 | 135 | class TestDenseStore(BaseTestStore, TestCase): 136 | """Class for testing the DenseStore class""" 137 | 138 | def _test_values(self, store, values): 139 | counter = Counter(values) 140 | 141 | expected_total_count = sum(counter.values()) 142 | assert expected_total_count == 
143 |         if expected_total_count == 0:
144 |             assert all(x == 0 for x in store.bins)
145 |         else:
146 |             assert not all(x == 0 for x in store.bins)
147 | 
148 |         counter = Counter(values)
149 |         for i, sbin in enumerate(store.bins):
150 |             if sbin != 0:
151 |                 assert counter[i + store.offset] == sbin
152 | 
153 |     def _test_store(self, values):
154 |         store = DenseStore()
155 |         for val in values:
156 |             store.add(val)
157 |         self._test_values(store, values)
158 | 
159 |     def _test_merging(self, list_values):
160 |         store = DenseStore()
161 | 
162 |         for values in list_values:
163 |             intermediate_store = DenseStore()
164 |             for val in values:
165 |                 intermediate_store.add(val)
166 |             store.merge(intermediate_store)
167 | 
168 |         flat_values = [v for values in list_values for v in values]
169 |         self._test_values(store, flat_values)
170 | 
171 |     def test_key_at_rank(self):
172 |         """Test that key_at_rank handles integer and fractional ranks"""
173 |         store = DenseStore()
174 |         store.add(4)
175 |         store.add(10)
176 |         store.add(100)
177 |         assert store.key_at_rank(0) == 4
178 |         assert store.key_at_rank(1) == 10
179 |         assert store.key_at_rank(2) == 100
180 |         assert store.key_at_rank(0, lower=False) == 4
181 |         assert store.key_at_rank(1, lower=False) == 10
182 |         assert store.key_at_rank(2, lower=False) == 100
183 |         assert store.key_at_rank(0.5) == 4
184 |         assert store.key_at_rank(1.5) == 10
185 |         assert store.key_at_rank(2.5) == 100
186 |         assert store.key_at_rank(-0.5, lower=False) == 4
187 |         assert store.key_at_rank(0.5, lower=False) == 10
188 |         assert store.key_at_rank(1.5, lower=False) == 100
189 | 
190 |     def test_extreme_values(self):
191 |         """Override. DenseStore is not meant to be used with values that are
192 |         extremely far from one another, as it would allocate an excessively
193 |         large array.
194 |         """
195 | 
196 |     def test_merging_extreme_values(self):
197 |         """Override. DenseStore is not meant to be used with values that are
198 |         extremely far from one another, as it would allocate an excessively
199 |         large array.
200 |         """
201 | 
202 | 
203 | class TestCollapsingLowestDenseStore(BaseTestStore, TestCase):
204 |     """Class for testing the CollapsingLowestDenseStore class"""
205 | 
206 |     def _test_values(self, store, values):
207 |         counter = Counter(values)
208 |         expected_total_count = sum(counter.values())
209 |         assert expected_total_count == sum(store.bins)
210 | 
211 |         if expected_total_count == 0:
212 |             assert all(x == 0 for x in store.bins)
213 |         else:
214 |             assert not all(x == 0 for x in store.bins)
215 | 
216 |             max_index = max(counter)
217 |             min_storable_index = max(float("-inf"), max_index - store.bin_limit + 1)
218 |             counter = Counter([max(x, min_storable_index) for x in values])
219 | 
220 |             for i, sbin in enumerate(store.bins):
221 |                 if sbin != 0:
222 |                     assert counter[i + store.offset] == sbin
223 | 
224 |     def _test_store(self, values):
225 |         for bin_limit in TEST_BIN_LIMIT:
226 |             store = CollapsingLowestDenseStore(bin_limit)
227 |             for val in values:
228 |                 store.add(val)
229 |             self._test_values(store, values)
230 | 
231 |     def _test_merging(self, list_values):
232 |         for bin_limit in TEST_BIN_LIMIT:
233 |             store = CollapsingLowestDenseStore(bin_limit)
234 | 
235 |             for values in list_values:
236 |                 intermediate_store = CollapsingLowestDenseStore(bin_limit)
237 |                 for val in values:
238 |                     intermediate_store.add(val)
239 |                 store.merge(intermediate_store)
240 |             flat_values = [v for values in list_values for v in values]
241 |             self._test_values(store, flat_values)
242 | 
243 | 
244 | class TestCollapsingHighestDenseStore(BaseTestStore, TestCase):
245 |     """Class for testing the CollapsingHighestDenseStore class"""
246 | 
247 |     def _test_values(self, store, values):
248 |         counter = Counter(values)
249 | 
250 |         expected_total_count = sum(counter.values())
251 |         assert expected_total_count == sum(store.bins)
252 |         if expected_total_count == 0:
253 |             assert all(x == 0 for x in store.bins)
254 |         else:
255 |             assert not all(x == 0 for x in store.bins)
256 | 
257 |             min_index = min(counter)
258 |             max_storable_index = min(float("+inf"), min_index + store.bin_limit - 1)
259 |             counter = Counter([min(x, max_storable_index) for x in values])
260 | 
261 |             for i, sbin in enumerate(store.bins):
262 |                 if sbin != 0:
263 |                     assert counter[i + store.offset] == sbin
264 | 
265 |     def _test_store(self, values):
266 |         for bin_limit in TEST_BIN_LIMIT[1:2]:
267 |             store = CollapsingHighestDenseStore(bin_limit)
268 |             for val in values:
269 |                 store.add(val)
270 |             self._test_values(store, values)
271 | 
272 |     def _test_merging(self, list_values):
273 |         for bin_limit in TEST_BIN_LIMIT:
274 |             store = CollapsingHighestDenseStore(bin_limit)
275 | 
276 |             for values in list_values:
277 |                 intermediate_store = CollapsingHighestDenseStore(bin_limit)
278 |                 for val in values:
279 |                     intermediate_store.add(val)
280 |                 store.merge(intermediate_store)
281 |             flat_values = [v for values in list_values for v in values]
282 |             self._test_values(store, flat_values)
283 | 
--------------------------------------------------------------------------------
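
The invariants that _test_values checks in each class above are: every nonzero
entry of store.bins at index i holds the count of key i + store.offset, and
sum(store.bins) equals the number of values added. The minimal standalone
sketch below illustrates them. It is not part of the repository; it relies only
on the DenseStore behavior exercised by these tests, and the sample values are
arbitrary.

    # Illustrative only: mirrors the invariants asserted in tests/test_store.py.
    from collections import Counter

    from ddsketch.store import DenseStore

    values = [4, 10, 100]

    store = DenseStore()
    for val in values:
        store.add(val)

    # Total-count invariant: the bins hold exactly one count per added value.
    assert sum(store.bins) == len(values)

    # Per-bin invariant: bin i counts occurrences of key (i + store.offset).
    counter = Counter(values)
    for i, sbin in enumerate(store.bins):
        if sbin != 0:
            assert counter[i + store.offset] == sbin

    # Ranks are zero-based over the sorted keys; a fractional rank resolves to
    # the key whose cumulative count first exceeds it (see test_key_at_rank).
    assert store.key_at_rank(0) == 4
    assert store.key_at_rank(1.5) == 10

    # Merging another store preserves the total-count invariant.
    other = DenseStore()
    other.add(10)
    store.merge(other)
    assert sum(store.bins) == len(values) + 1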