├── .github ├── ISSUE_TEMPLATE.md ├── PULL_REQUEST_TEMPLATE.md └── workflows │ ├── build_deploy.yml │ ├── changelog.yml │ └── test.yml ├── .gitignore ├── CONTRIBUTING.md ├── LICENSE ├── LICENSE-3rdparty.csv ├── NOTICE ├── README.md ├── ddsketch ├── __init__.py ├── _version.py ├── ddsketch.py ├── mapping.py ├── pb │ ├── __init__.py │ ├── ddsketch.proto │ ├── ddsketch_pb2.py │ ├── ddsketch_pre319_pb2.py │ └── proto.py ├── py.typed └── store.py ├── docker-compose.yml ├── mypy.ini ├── pyproject.toml ├── releasenotes ├── config.yaml └── notes │ ├── ddsketch-api-a84ffc0875bbacd6.yaml │ ├── extend-range-06474632c8235187.yaml │ ├── oldpy-db6189c9b26e10f7.yaml │ ├── pbopt-ec6525c1948d782f.yaml │ ├── proto4-e8646610178bef59.yaml │ ├── protobuf-min-f6af9a2d5d96f53c.yaml │ ├── py2-c963608396db7258.yaml │ ├── py310-ac5baa9b0b69008a.yaml │ ├── remove-custom-exceptions-e2bc67a72250269d.yaml │ ├── remove-numpy-25fedcd9be9d6d80.yaml │ ├── tests-wheel-bf71b228c86a9ced.yaml │ ├── toplevelapi-6c04f2ca35a49d4b.yaml │ ├── typing-25579ab88323a332.yaml │ └── version-b2a276df190a703a.yaml ├── riotfile.py ├── scripts └── check-releasenotes ├── setup.cfg ├── setup.py └── tests ├── __init__.py ├── datasets.py ├── test_ddsketch.py ├── test_mapping.py ├── test_proto.py └── test_store.py /.github/ISSUE_TEMPLATE.md: -------------------------------------------------------------------------------- 1 | **Describe what happened:** 2 | 3 | 4 | **Describe what you expected:** 5 | 6 | 7 | **Steps to reproduce the issue:** 8 | -------------------------------------------------------------------------------- /.github/PULL_REQUEST_TEMPLATE.md: -------------------------------------------------------------------------------- 1 | ### What does this PR do? 2 | 3 | A brief description of the change being made with this pull request. 4 | 5 | ### Motivation 6 | 7 | What inspired you to submit this pull request? 8 | 9 | ### Additional Notes 10 | 11 | Anything else we should know when reviewing? 12 | -------------------------------------------------------------------------------- /.github/workflows/build_deploy.yml: -------------------------------------------------------------------------------- 1 | name: Build 2 | 3 | on: 4 | pull_request: 5 | release: 6 | types: 7 | - published 8 | 9 | jobs: 10 | build_wheel: 11 | name: Build wheels 12 | runs-on: ubuntu-22.04 13 | 14 | steps: 15 | - uses: actions/checkout@v2 16 | # Include all history and tags 17 | with: 18 | fetch-depth: 0 19 | 20 | - uses: actions/setup-python@v2 21 | name: Install Python 22 | with: 23 | python-version: '3.9' 24 | 25 | - name: Build wheels 26 | run: | 27 | pip install wheel 28 | pip wheel --no-deps -w dist . 
29 | 30 | - uses: actions/upload-artifact@v2 31 | with: 32 | path: dist/*.whl 33 | 34 | build_sdist: 35 | name: Build source distribution 36 | runs-on: ubuntu-latest 37 | steps: 38 | - uses: actions/checkout@v2 39 | # Include all history and tags 40 | with: 41 | fetch-depth: 0 42 | 43 | - uses: actions/setup-python@v2 44 | name: Install Python 45 | with: 46 | python-version: '3.9' 47 | 48 | - name: Build sdist 49 | run: | 50 | python setup.py sdist 51 | 52 | - uses: actions/upload-artifact@v2 53 | with: 54 | path: dist/*.tar.gz 55 | 56 | upload_pypi: 57 | needs: [build_wheel, build_sdist] 58 | runs-on: ubuntu-latest 59 | if: github.event_name == 'release' && github.event.action == 'published' 60 | steps: 61 | - uses: actions/download-artifact@v2 62 | with: 63 | name: artifact 64 | path: dist 65 | 66 | - uses: pypa/gh-action-pypi-publish@master 67 | with: 68 | user: __token__ 69 | password: ${{ secrets.PYPI_TOKEN }} 70 | # To test: repository_url: https://test.pypi.org/legacy/ 71 | -------------------------------------------------------------------------------- /.github/workflows/changelog.yml: -------------------------------------------------------------------------------- 1 | name: Changelog 2 | on: 3 | push: 4 | branches: 5 | - master 6 | pull_request: 7 | # Important that we run on `labeled` and `unlabeled` to pick up `changelog/no-changelog` being added/removed 8 | # DEV: [opened, reopened, synchronize] is the default 9 | types: [opened, reopened, synchronize, labeled, unlabeled, ready_for_review] 10 | jobs: 11 | validate: 12 | name: Validate changelog 13 | runs-on: ubuntu-latest 14 | steps: 15 | - uses: actions/checkout@v2 16 | # Include all history and tags 17 | with: 18 | fetch-depth: 0 19 | 20 | # Ensure a new reno release note was added in this PR. 21 | # Use `reno new <slug>` to add a new note to `releasenotes/notes`, 22 | # or add `changelog/no-changelog` label if no release note is needed.
23 | - name: Ensure release note added 24 | # Only run this on pull requests 25 | if: github.event_name == 'pull_request' 26 | run: scripts/check-releasenotes 27 | 28 | - uses: actions/setup-python@v2 29 | name: Install Python 30 | with: 31 | python-version: '3.9' 32 | 33 | - name: Install Dependencies 34 | run: pip install reno docutils 35 | 36 | - name: Lint changelog notes 37 | run: reno lint 38 | 39 | - name: Generate changelog 40 | run: | 41 | reno report | tee CHANGELOG.rst 42 | rst2html.py CHANGELOG.rst CHANGELOG.html 43 | 44 | - name: Upload CHANGELOG.rst 45 | uses: actions/upload-artifact@v2 46 | with: 47 | name: changelog 48 | path: | 49 | CHANGELOG.rst 50 | CHANGELOG.html 51 | -------------------------------------------------------------------------------- /.github/workflows/test.yml: -------------------------------------------------------------------------------- 1 | name: Test 2 | on: 3 | push: 4 | branches: 5 | - master 6 | pull_request: 7 | jobs: 8 | check: 9 | runs-on: ubuntu-latest 10 | steps: 11 | - uses: actions/setup-python@v5 12 | with: 13 | python-version: '3.12' 14 | - uses: actions/checkout@v4 15 | with: 16 | fetch-depth: 0 17 | - run: pip install riot==0.19.0 18 | - run: riot -v run check_fmt 19 | - run: riot -v run -s mypy 20 | - run: riot -v run -s flake8 21 | 22 | test: 23 | strategy: 24 | matrix: 25 | os: [ubuntu-latest, macos-latest] 26 | python-version: ["3.7", "3.8", "3.9", "3.10", "3.11", "3.12"] 27 | runs-on: ${{ matrix.os }} 28 | steps: 29 | - uses: actions/checkout@v4 30 | - name: Setup Python 31 | uses: actions/setup-python@v5 32 | with: 33 | python-version: ${{ matrix.python-version }} 34 | - name: install riot 35 | # Note that pip3 has to be used since the system pip when running 36 | # under the 2.7 instance will be Python 2 pip. 37 | # (riot is not Python 2 compatible) 38 | run: pip3 install riot==0.19.0 39 | - run: | 40 | riot run -p ${{ matrix.python-version}} test 41 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | *.pyc 2 | *~ 3 | .riot/ 4 | 5 | # Generated version module 6 | ddsketch/__version.py 7 | 8 | # Ignore files generated during `python setup.py install` 9 | build/ 10 | dist/ 11 | *.egg-info/ 12 | -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | # Contributing to sketches-py 2 | 3 | First of all, thanks for contributing! 4 | 5 | * If you think you've found an issue, please open a Github issue. 6 | * To propose improvements, feel free to submit a PR. 7 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Copyright 2020 DataDog, Inc. 2 | 3 | Licensed under the Apache License, Version 2.0 (the "License"); 4 | you may not use this file except in compliance with the License. 5 | You may obtain a copy of the License at 6 | 7 | http://www.apache.org/licenses/LICENSE-2.0 8 | 9 | Unless required by applicable law or agreed to in writing, software 10 | distributed under the License is distributed on an "AS IS" BASIS, 11 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | See the License for the specific language governing permissions and 13 | limitations under the License. 
14 | -------------------------------------------------------------------------------- /LICENSE-3rdparty.csv: -------------------------------------------------------------------------------- 1 | Component,Origin,License,Copyright 2 | import,numpy,BSD-3-Clause,Copyright (c) 2005-2020 NumPy Developers.; All rights reserved. 3 | import,setuptools,MIT,Copyright (c) 2016 Jason R Coombs 4 | -------------------------------------------------------------------------------- /NOTICE: -------------------------------------------------------------------------------- 1 | Datadog sketches-py 2 | Copyright 2020 Datadog, Inc. 3 | 4 | This product includes software developed at Datadog (https://www.datadoghq.com/). 5 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # ddsketch 2 | 3 | This repo contains the Python implementation of the distributed quantile sketch 4 | algorithm DDSketch [1]. DDSketch has relative-error guarantees for any quantile 5 | q in [0, 1]. That is, if the true value of the q-th quantile is `x`, then DDSketch 6 | returns a value `y` such that `|x - y| / x < e`, where `e` is the relative-error 7 | parameter. (The default here is set to 0.01.) DDSketch is also fully mergeable, 8 | meaning that multiple sketches from distributed systems can be combined in a 9 | central node. 10 | 11 | Our default implementation, `DDSketch`, is guaranteed [1] to not grow too large 12 | in size for any data that can be described by a distribution whose tails are 13 | sub-exponential. 14 | 15 | We also provide implementations (`LogCollapsingLowestDenseDDSketch` and 16 | `LogCollapsingHighestDenseDDSketch`) where the q-quantile will be accurate up to 17 | the specified relative error for values of q that are not too small (or too large). Concretely, 18 | the q-quantile will be accurate up to the specified relative error as long as it 19 | belongs to one of the `m` bins kept by the sketch. If the data is time in 20 | seconds, the default of `m = 2048` covers 80 microseconds to 1 year. 21 | 22 | ## Installation 23 | 24 | To install this package, run `pip install ddsketch`, or clone the repo and run 25 | `python setup.py install`. This package depends on `numpy` and `protobuf`. (The 26 | protobuf dependency can be removed if it's not applicable.) 27 | 28 | ## Usage 29 | ```python 30 | from ddsketch import DDSketch 31 | 32 | sketch = DDSketch() 33 | ``` 34 | Add values to the sketch: 35 | ```python 36 | import numpy as np 37 | 38 | values = np.random.normal(size=500) 39 | for v in values: 40 | sketch.add(v) 41 | ``` 42 | Find the quantiles of `values` to within the relative error: 43 | ```python 44 | quantiles = [sketch.get_quantile_value(q) for q in [0.5, 0.75, 0.9, 1]] 45 | ``` 46 | Merge another `DDSketch` into `sketch`: 47 | ```python 48 | another_sketch = DDSketch() 49 | other_values = np.random.normal(size=500) 50 | for v in other_values: 51 | another_sketch.add(v) 52 | sketch.merge(another_sketch) 53 | ``` 54 | The quantiles of `values` concatenated with `other_values` are still accurate to within the relative error. 55 | 56 | ## Development 57 | 58 | To work on ddsketch, a Python interpreter must be installed. It is recommended to use the provided development 59 | container (requires [docker](https://www.docker.com/)) which includes all the required Python interpreters.
60 | 61 | docker-compose run dev 62 | 63 | Or, if developing outside of Docker, it is recommended to use a virtual environment: 64 | 65 | pip install virtualenv 66 | virtualenv --python=3 .venv 67 | source .venv/bin/activate 68 | 69 | 70 | ### Testing 71 | 72 | To run the tests, install `riot`: 73 | 74 | pip install riot 75 | 76 | Replace the Python version below with the interpreter(s) available. 77 | 78 | # Run tests with Python 3.9 79 | riot run -p3.9 test 80 | 81 | ### Release notes 82 | 83 | New features, bug fixes, deprecations, and other breaking changes must have 84 | release notes included. 85 | 86 | To generate a release note for the change: 87 | 88 | riot run reno new <slug> 89 | 90 | Edit the generated file to include notes on the changes made in the commit/PR 91 | and commit it. 92 | 93 | 94 | ### Formatting 95 | 96 | Format code with: 97 | 98 | riot run fmt 99 | 100 | 101 | ### Type-checking 102 | 103 | Type checking is done with [mypy](http://mypy-lang.org/): 104 | 105 | riot run mypy 106 | 107 | 108 | ### Linting 109 | 110 | Lint the code with [flake8](https://flake8.pycqa.org/en/latest/): 111 | 112 | riot run flake8 113 | 114 | 115 | ### Protobuf 116 | 117 | The protobuf definition is stored in the Go repository: https://github.com/DataDog/sketches-go/blob/master/ddsketch/pb/ddsketch.proto 118 | 119 | Install the minimum required protoc and generate the Python code: 120 | 121 | ```sh 122 | docker run -v $PWD:/code -it ubuntu:18.04 /bin/bash 123 | apt update && apt install protobuf-compiler # default is 3.0.0 124 | protoc --proto_path=ddsketch/pb/ --python_out=ddsketch/pb/ ddsketch/pb/ddsketch.proto 125 | ``` 126 | 127 | 128 | ### Releasing 129 | 130 | 1. Generate the release notes and use [`pandoc`](https://pandoc.org/) to format 131 | them for GitHub: 132 | ```bash 133 | git checkout master && git pull 134 | riot run -s reno report --no-show-source | pandoc -f rst -t gfm --wrap=none 135 | ``` 136 | Copy the output into a new release: https://github.com/DataDog/sketches-py/releases/new. 137 | 138 | 2. Enter a tag for the release (following [`semver`](https://semver.org)) (e.g. `v1.1.3`, `v1.0.3`, `v1.2.0`). 139 | 3. Use the tag without the `v` as the title. 140 | 4. Save the release as a draft and pass the link to someone else to give a quick review. 141 | 5. If all looks good, hit publish. 142 | 143 | 144 | ## References 145 | [1] Charles Masson, Jee E. Rim, and Homin K. Lee. DDSketch: A fast and fully-mergeable quantile sketch with relative-error guarantees. PVLDB, 12(12): 2195-2205, 2019.
(The code referenced in the paper, including our implementation of the Greenwald-Khanna (GK) algorithm, can be found at: https://github.com/DataDog/sketches-py/releases/tag/v0.1 ) 146 | -------------------------------------------------------------------------------- /ddsketch/__init__.py: -------------------------------------------------------------------------------- 1 | from ._version import get_version 2 | from .ddsketch import DDSketch 3 | from .ddsketch import LogCollapsingHighestDenseDDSketch 4 | from .ddsketch import LogCollapsingLowestDenseDDSketch 5 | from .mapping import CubicallyInterpolatedMapping 6 | from .mapping import LinearlyInterpolatedMapping 7 | from .mapping import LogarithmicMapping 8 | from .store import CollapsingHighestDenseStore 9 | from .store import CollapsingLowestDenseStore 10 | 11 | 12 | __version__ = get_version() 13 | 14 | 15 | __all__ = [ 16 | "DDSketch", 17 | "LogCollapsingLowestDenseDDSketch", 18 | "LogCollapsingHighestDenseDDSketch", 19 | "CubicallyInterpolatedMapping", 20 | "LinearlyInterpolatedMapping", 21 | "LogarithmicMapping", 22 | "CollapsingHighestDenseStore", 23 | "CollapsingLowestDenseStore", 24 | ] 25 | -------------------------------------------------------------------------------- /ddsketch/_version.py: -------------------------------------------------------------------------------- 1 | def get_version(): 2 | # type: () -> str 3 | """Return the package version. 4 | 5 | The write_to functionality of setuptools_scm is used (see setup.py) 6 | to output the version to ddsketch/__version.py, which we attempt to import. 7 | 8 | This is done to avoid the expensive overhead of importing pkg_resources. 9 | """ 10 | try: 11 | from .__version import version 12 | 13 | return version 14 | except ImportError: 15 | import pkg_resources 16 | 17 | return pkg_resources.get_distribution(__name__).version 18 | -------------------------------------------------------------------------------- /ddsketch/ddsketch.py: -------------------------------------------------------------------------------- 1 | # Unless explicitly stated otherwise all files in this repository are licensed 2 | # under the Apache License 2.0. 3 | # This product includes software developed at Datadog (https://www.datadoghq.com/). 4 | # Copyright 2020 Datadog, Inc. 5 | 6 | """A quantile sketch with relative-error guarantees. This sketch computes 7 | quantile values with an approximation error that is relative to the actual 8 | quantile value. It works on both negative and non-negative input values. 9 | 10 | For instance, using DDSketch with a relative accuracy guarantee set to 1%, if 11 | the expected quantile value is 100, the computed quantile value is guaranteed to 12 | be between 99 and 101. If the expected quantile value is 1000, the computed 13 | quantile value is guaranteed to be between 990 and 1010. 14 | 15 | DDSketch works by mapping floating-point input values to bins and counting the 16 | number of values for each bin. The underlying structure that keeps track of bin 17 | counts is the store. 18 | 19 | The memory size of the sketch depends on the range that is covered by the input 20 | values: the larger that range, the more bins are needed to keep track of the 21 | input values. As a rough estimate, if working on durations with a relative 22 | accuracy of 2%, about 2kB (275 bins) are needed to cover values between 1 23 | millisecond and 1 minute, and about 6kB (802 bins) to cover values between 1 24 | nanosecond and 1 day.
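As a back-of-the-envelope check of these estimates (an illustrative calculation): consecutive bin boundaries grow by a factor of gamma = (1 + alpha) / (1 - alpha), so covering a range [a, b] takes roughly ln(b / a) / ln(gamma) bins. With alpha = 0.02, gamma ≈ 1.0408 and ln(gamma) ≈ 0.040, so 1 millisecond to 1 minute needs about ln(60 / 0.001) / 0.040 ≈ 11.0 / 0.040 ≈ 275 bins, and 1 nanosecond to 1 day needs about ln(86400 / 1e-9) / 0.040 ≈ 32.1 / 0.040 ≈ 802 bins, matching the figures above.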
25 | 26 | The size of the sketch can have a fail-safe upper bound by using collapsing 27 | stores. As shown in 28 | the DDSketch paper, 29 | the likelihood of a store collapsing when using the default bound is vanishingly 30 | small for most data. 31 | 32 | DDSketch implementations are also available in: 33 | Go 34 | Python 35 | JavaScript 36 | """ 37 | import typing 38 | 39 | from .mapping import LogarithmicMapping 40 | from .store import CollapsingHighestDenseStore 41 | from .store import CollapsingLowestDenseStore 42 | from .store import DenseStore 43 | 44 | 45 | if typing.TYPE_CHECKING: 46 | from typing import Optional  # noqa: F401 47 | 48 | from .mapping import KeyMapping  # noqa: F401 49 | from .store import Store  # noqa: F401 50 | 51 | 52 | DEFAULT_REL_ACC = 0.01  # "alpha" in the paper 53 | DEFAULT_BIN_LIMIT = 2048 54 | 55 | 56 | class BaseDDSketch(object): 57 | """The base implementation of DDSketch with neither mapping nor storage specified. 58 | 59 | Args: 60 | mapping (mapping.KeyMapping): map between values and store bins 61 | store (store.Store): storage for positive values 62 | negative_store (store.Store): storage for negative values 63 | zero_count (float): the count of zero values 64 | 65 | Attributes: 66 | relative_accuracy (float): the accuracy guarantee; referred to as alpha 67 | in the paper. (0. < alpha < 1.) 68 | 69 | count: the number of values seen by the sketch 70 | min: the minimum value seen by the sketch 71 | max: the maximum value seen by the sketch 72 | sum: the sum of the values seen by the sketch 73 | """ 74 | 75 | def __init__( 76 | self, 77 | mapping, 78 | store, 79 | negative_store, 80 | zero_count, 81 | ): 82 | # type: (KeyMapping, Store, Store, float) -> None 83 | self._mapping = mapping 84 | self._store = store 85 | self._negative_store = negative_store 86 | self._zero_count = zero_count 87 | 88 | self._relative_accuracy = mapping.relative_accuracy 89 | self._count = self._negative_store.count + self._zero_count + self._store.count 90 | self._min = float("+inf") 91 | self._max = float("-inf") 92 | self._sum = 0.0 93 | 94 | def __repr__(self): 95 | # type: () -> str 96 | return ( 97 | "store: {}, negative_store: {}, " 98 | "zero_count: {}, count: {}, " 99 | "sum: {}, min: {}, max: {}" 100 | ).format( 101 | self._store, 102 | self._negative_store, 103 | self._zero_count, 104 | self._count, 105 | self._sum, 106 | self._min, 107 | self._max, 108 | ) 109 | 110 | @property 111 | def count(self): 112 | return self._count 113 | 114 | @property 115 | def name(self): 116 | # type: () -> str 117 | """str: name of the sketch""" 118 | return "DDSketch" 119 | 120 | @property 121 | def num_values(self): 122 | # type: () -> float 123 | """Return the number of values in the sketch.""" 124 | return self._count 125 | 126 | @property 127 | def avg(self): 128 | # type: () -> float 129 | """Return the exact average of the values added to the sketch.""" 130 | return self._sum / self._count 131 | 132 | @property 133 | def sum(self):  # noqa: A003 134 | # type: () -> float 135 | """Return the exact sum of the values added to the sketch.""" 136 | return self._sum 137 | 138 | def add(self, val, weight=1.0): 139 | # type: (float, float) -> None 140 | """Add a value to the sketch.""" 141 | if weight <= 0.0: 142 | raise ValueError("weight must be a positive float, got %r" % weight) 143 | 144 | if val > self._mapping.min_possible: 145 | self._store.add(self._mapping.key(val), weight) 146 | elif val < -self._mapping.min_possible: 147 | self._negative_store.add(self._mapping.key(-val),
weight) 148 | else: 149 | self._zero_count += weight 150 | 151 | # Keep track of summary stats 152 | self._count += weight 153 | self._sum += val * weight 154 | if val < self._min: 155 | self._min = val 156 | if val > self._max: 157 | self._max = val 158 | 159 | def get_quantile_value(self, quantile): 160 | # type: (float) -> Optional[float] 161 | """Return the approximate value at the specified quantile. 162 | 163 | Args: 164 | quantile (float): 0 <= q <=1 165 | 166 | Returns: 167 | the value at the specified quantile or None if the sketch is empty 168 | """ 169 | if quantile < 0 or quantile > 1 or self._count == 0: 170 | return None 171 | 172 | rank = quantile * (self._count - 1) 173 | if rank < self._negative_store.count: 174 | reversed_rank = self._negative_store.count - rank - 1 175 | key = self._negative_store.key_at_rank(reversed_rank, lower=False) 176 | quantile_value = -self._mapping.value(key) 177 | elif rank < self._zero_count + self._negative_store.count: 178 | return 0 179 | else: 180 | key = self._store.key_at_rank( 181 | rank - self._zero_count - self._negative_store.count 182 | ) 183 | quantile_value = self._mapping.value(key) 184 | return quantile_value 185 | 186 | def merge(self, sketch): 187 | # type: (BaseDDSketch) -> None 188 | """Merge the given sketch into this one. After this operation, this sketch 189 | encodes the values that were added to both this and the input sketch. 190 | """ 191 | if not self._mergeable(sketch): 192 | raise ValueError( 193 | "Cannot merge two DDSketches with different parameters, got %r and %r" 194 | % (self._mapping.gamma, sketch._mapping.gamma) 195 | ) 196 | 197 | if sketch.count == 0: 198 | return 199 | 200 | if self._count == 0: 201 | self._copy(sketch) 202 | return 203 | 204 | # Merge the stores 205 | self._store.merge(sketch._store) 206 | self._negative_store.merge(sketch._negative_store) 207 | self._zero_count += sketch._zero_count 208 | 209 | # Merge summary stats 210 | self._count += sketch._count 211 | self._sum += sketch._sum 212 | if sketch._min < self._min: 213 | self._min = sketch._min 214 | if sketch._max > self._max: 215 | self._max = sketch._max 216 | 217 | def _mergeable(self, other): 218 | # type: (BaseDDSketch) -> bool 219 | """Two sketches can be merged only if their gammas are equal.""" 220 | return self._mapping.gamma == other._mapping.gamma 221 | 222 | def _copy(self, sketch): 223 | # type: (BaseDDSketch) -> None 224 | """Copy the input sketch into this one""" 225 | self._store.copy(sketch._store) 226 | self._negative_store.copy(sketch._negative_store) 227 | self._zero_count = sketch._zero_count 228 | self._min = sketch._min 229 | self._max = sketch._max 230 | self._count = sketch._count 231 | self._sum = sketch._sum 232 | 233 | 234 | class DDSketch(BaseDDSketch): 235 | """The default implementation of BaseDDSketch, with optimized memory usage at 236 | the cost of lower ingestion speed, using an unlimited number of bins. The 237 | number of bins will not exceed a reasonable number unless the data is 238 | distributed with tails heavier than any subexponential. 239 | (cf. 
http://www.vldb.org/pvldb/vol12/p2195-masson.pdf) 240 | """ 241 | 242 | def __init__(self, relative_accuracy=None): 243 | # type: (Optional[float]) -> None 244 | # Make sure the parameters are valid 245 | if relative_accuracy is None: 246 | relative_accuracy = DEFAULT_REL_ACC 247 | 248 | mapping = LogarithmicMapping(relative_accuracy) 249 | store = DenseStore() 250 | negative_store = DenseStore() 251 | super(DDSketch, self).__init__( 252 | mapping=mapping, 253 | store=store, 254 | negative_store=negative_store, 255 | zero_count=0.0, 256 | ) 257 | 258 | 259 | class LogCollapsingLowestDenseDDSketch(BaseDDSketch): 260 | """Implementation of BaseDDSketch with optimized memory usage at the cost of 261 | lower ingestion speed, using a limited number of bins. When the maximum 262 | number of bins is reached, bins with lowest indices are collapsed, which 263 | causes the relative accuracy to be lost on the lowest quantiles. For the 264 | default bin limit, collapsing is unlikely to occur unless the data is 265 | distributed with tails heavier than any subexponential. 266 | (cf. http://www.vldb.org/pvldb/vol12/p2195-masson.pdf) 267 | """ 268 | 269 | def __init__(self, relative_accuracy=None, bin_limit=None): 270 | # type: (Optional[float], Optional[int]) -> None 271 | # Make sure the parameters are valid 272 | if relative_accuracy is None: 273 | relative_accuracy = DEFAULT_REL_ACC 274 | 275 | if bin_limit is None or bin_limit < 0: 276 | bin_limit = DEFAULT_BIN_LIMIT 277 | 278 | mapping = LogarithmicMapping(relative_accuracy) 279 | store = CollapsingLowestDenseStore(bin_limit) 280 | negative_store = CollapsingLowestDenseStore(bin_limit) 281 | super(LogCollapsingLowestDenseDDSketch, self).__init__( 282 | mapping=mapping, 283 | store=store, 284 | negative_store=negative_store, 285 | zero_count=0.0, 286 | ) 287 | 288 | 289 | class LogCollapsingHighestDenseDDSketch(BaseDDSketch): 290 | """Implementation of BaseDDSketch with optimized memory usage at the cost of 291 | lower ingestion speed, using a limited number of bins. When the maximum 292 | number of bins is reached, bins with highest indices are collapsed, which 293 | causes the relative accuracy to be lost on the highest quantiles. For the 294 | default bin limit, collapsing is unlikely to occur unless the data is 295 | distributed with tails heavier than any subexponential. 296 | (cf. http://www.vldb.org/pvldb/vol12/p2195-masson.pdf) 297 | """ 298 | 299 | def __init__(self, relative_accuracy=None, bin_limit=None): 300 | # type: (Optional[float], Optional[int]) -> None 301 | # Make sure the parameters are valid 302 | if relative_accuracy is None: 303 | relative_accuracy = DEFAULT_REL_ACC 304 | 305 | if bin_limit is None or bin_limit < 0: 306 | bin_limit = DEFAULT_BIN_LIMIT 307 | 308 | mapping = LogarithmicMapping(relative_accuracy) 309 | store = CollapsingHighestDenseStore(bin_limit) 310 | negative_store = CollapsingHighestDenseStore(bin_limit) 311 | super(LogCollapsingHighestDenseDDSketch, self).__init__( 312 | mapping=mapping, 313 | store=store, 314 | negative_store=negative_store, 315 | zero_count=0.0, 316 | ) 317 | -------------------------------------------------------------------------------- /ddsketch/mapping.py: -------------------------------------------------------------------------------- 1 | from __future__ import division 2 | 3 | 4 | # Unless explicitly stated otherwise all files in this repository are licensed 5 | # under the Apache License 2.0. 6 | # This product includes software developed at Datadog (https://www.datadoghq.com/). 
7 | # Copyright 2020 Datadog, Inc. 8 | 9 | """A mapping between values and integer indices that imposes relative accuracy 10 | guarantees. Specifically, for any value `min_possible < value < 11 | max_possible`, implementations of `KeyMapping` must be such that 12 | `value(key(v))` is close to `v` with a relative error that is less than 13 | `relative_accuracy`. 14 | 15 | In implementations of KeyMapping, there is generally a trade-off between the 16 | cost of computing the key and the number of keys that are required to cover a 17 | given range of values (memory optimality). The most memory-optimal mapping is 18 | the LogarithmicMapping, but it requires the costly evaluation of the logarithm 19 | when computing the index. Other mappings can approximate the logarithmic 20 | mapping, while being less computationally costly. 21 | """ 22 | from abc import ABCMeta 23 | from abc import abstractmethod 24 | import math 25 | import sys 26 | 27 | import six 28 | 29 | 30 | class KeyMapping(six.with_metaclass(ABCMeta)): 31 | """ 32 | Args: 33 | relative_accuracy (float): the accuracy guarantee; referred to as alpha 34 | in the paper. (0. < alpha < 1.) 35 | offset (float): an offset that can be used to shift all bin keys 36 | Attributes: 37 | gamma (float): the base for the exponential buckets. gamma = (1 + alpha) / (1 - alpha) 38 | min_possible: the smallest value the sketch can distinguish from 0 39 | max_possible: the largest value the sketch can handle 40 | _multiplier (float): used for calculating log_gamma(value); initially, _multiplier = 1 / log(gamma) 41 | """ 42 | 43 | def __init__(self, relative_accuracy, offset=0.0): 44 | # type: (float, float) -> None 45 | if relative_accuracy <= 0 or relative_accuracy >= 1: 46 | raise ValueError( 47 | "Relative accuracy must be between 0 and 1, got %r" % relative_accuracy 48 | ) 49 | self.relative_accuracy = relative_accuracy 50 | self._offset = offset 51 | 52 | gamma_mantissa = 2 * relative_accuracy / (1 - relative_accuracy) 53 | self.gamma = 1 + gamma_mantissa 54 | self._multiplier = 1 / math.log1p(gamma_mantissa) 55 | self.min_possible = sys.float_info.min * self.gamma 56 | self.max_possible = sys.float_info.max / self.gamma 57 | 58 | @classmethod 59 | def from_gamma_offset(cls, gamma, offset): 60 | # type: (float, float) -> KeyMapping 61 | """Constructor used by pb.proto""" 62 | relative_accuracy = (gamma - 1.0) / (gamma + 1.0) 63 | return cls(relative_accuracy, offset=offset) 64 | 65 | @abstractmethod 66 | def _log_gamma(self, value): 67 | # type: (float) -> float 68 | """Return (an approximation of) the logarithm of the value base gamma""" 69 | 70 | @abstractmethod 71 | def _pow_gamma(self, value): 72 | # type: (float) -> float 73 | """Return (an approximation of) gamma to the power value""" 74 | 75 | def key(self, value): 76 | # type: (float) -> int 77 | """ 78 | Args: 79 | value (float) 80 | Returns: 81 | int: the key specifying the bucket for value 82 | """ 83 | return int(math.ceil(self._log_gamma(value)) + self._offset) 84 | 85 | def value(self, key): 86 | # type: (int) -> float 87 | """ 88 | Args: 89 | key (int) 90 | Returns: 91 | float: the value represented by the bucket specified by the key 92 | """ 93 | return self._pow_gamma(key - self._offset) * (2.0 / (1 + self.gamma)) 94 | 95 | 96 | class LogarithmicMapping(KeyMapping): 97 | """A memory-optimal KeyMapping, i.e., given a targeted relative accuracy, it 98 | requires the least number of keys to cover a given range of values. This is
This is 99 | done by logarithmically mapping floating-point values to integers. 100 | """ 101 | 102 | def __init__(self, relative_accuracy, offset=0.0): 103 | # type: (float, float) -> None 104 | super(LogarithmicMapping, self).__init__(relative_accuracy, offset=offset) 105 | self._multiplier *= math.log(2) 106 | 107 | def _log_gamma(self, value): 108 | # type: (float) -> float 109 | return math.log(value, 2) * self._multiplier 110 | 111 | def _pow_gamma(self, value): 112 | # type: (float) -> float 113 | return math.pow(2.0, value / self._multiplier) 114 | 115 | 116 | def _cbrt(x): 117 | # type: (float) -> float 118 | y = float(abs(x) ** (1.0 / 3.0)) 119 | if x < 0: 120 | return -y 121 | return y 122 | 123 | 124 | class LinearlyInterpolatedMapping(KeyMapping): 125 | """A fast KeyMapping that approximates the memory-optimal 126 | LogarithmicMapping by extracting the floor value of the logarithm to the 127 | base 2 from the binary representations of floating-point values and 128 | linearly interpolating the logarithm in-between. 129 | """ 130 | 131 | def _log2_approx(self, value): 132 | # type: (float) -> float 133 | """Approximates log2 by s + f 134 | where v = (s+1) * 2 ** f for s in [0, 1) 135 | 136 | frexp(v) returns m and e s.t. 137 | v = m * 2 ** e ; (m in [0.5, 1) or 0.0) 138 | so we adjust m and e accordingly 139 | """ 140 | mantissa, exponent = math.frexp(value) 141 | significand = 2 * mantissa - 1 142 | return significand + (exponent - 1) 143 | 144 | def _exp2_approx(self, value): 145 | # type: (float) -> float 146 | """Inverse of _log2_approx""" 147 | exponent = int(math.floor(value) + 1) 148 | mantissa = (value - exponent + 2) / 2.0 149 | return math.ldexp(mantissa, exponent) 150 | 151 | def _log_gamma(self, value): 152 | # type: (float) -> float 153 | return self._log2_approx(value) * self._multiplier 154 | 155 | def _pow_gamma(self, value): 156 | # type: (float) -> float 157 | return self._exp2_approx(value / self._multiplier) 158 | 159 | 160 | class CubicallyInterpolatedMapping(KeyMapping): 161 | """A fast KeyMapping that approximates the memory-optimal LogarithmicMapping by 162 | extracting the floor value of the logarithm to the base 2 from the binary 163 | representations of floating-point values and cubically interpolating the 164 | logarithm in-between. 
165 | 166 | More detailed documentation of this method can be found in: 167 | sketches-java 168 | """ 169 | 170 | A = 6.0 / 35.0 171 | B = -3.0 / 5.0 172 | C = 10.0 / 7.0 173 | 174 | def __init__(self, relative_accuracy, offset=0.0): 175 | # type: (float, float) -> None 176 | super(CubicallyInterpolatedMapping, self).__init__( 177 | relative_accuracy, offset=offset 178 | ) 179 | self._multiplier /= self.C 180 | 181 | def _cubic_log2_approx(self, value): 182 | # type: (float) -> float 183 | """Approximates log2 using a cubic polynomial""" 184 | mantissa, exponent = math.frexp(value) 185 | significand = 2 * mantissa - 1 186 | return ( 187 | (self.A * significand + self.B) * significand + self.C 188 | ) * significand + (exponent - 1) 189 | 190 | def _cubic_exp2_approx(self, value): 191 | # type: (float) -> float 192 | # Derived from Cardano's formula 193 | exponent = int(math.floor(value)) 194 | delta_0 = self.B * self.B - 3 * self.A * self.C 195 | delta_1 = ( 196 | 2.0 * self.B * self.B * self.B 197 | - 9.0 * self.A * self.B * self.C 198 | - 27.0 * self.A * self.A * (value - exponent) 199 | ) 200 | cardano = _cbrt( 201 | (delta_1 - ((delta_1 * delta_1 - 4 * delta_0 * delta_0 * delta_0) ** 0.5)) 202 | / 2.0 203 | ) 204 | significand_plus_one = ( 205 | -(self.B + cardano + delta_0 / cardano) / (3.0 * self.A) + 1.0 206 | ) 207 | mantissa = significand_plus_one / 2 208 | return math.ldexp(mantissa, exponent + 1) 209 | 210 | def _log_gamma(self, value): 211 | # type: (float) -> float 212 | return self._cubic_log2_approx(value) * self._multiplier 213 | 214 | def _pow_gamma(self, value): 215 | # type: (float) -> float 216 | return self._cubic_exp2_approx(value / self._multiplier) 217 | -------------------------------------------------------------------------------- /ddsketch/pb/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DataDog/sketches-py/0d16e695d1f991276863b8ffaaf6c8e9bd9ad9de/ddsketch/pb/__init__.py -------------------------------------------------------------------------------- /ddsketch/pb/ddsketch.proto: -------------------------------------------------------------------------------- 1 | /* Unless explicitly stated otherwise all files in this repository are licensed under the Apache License 2.0. 2 | * This product includes software developed at Datadog (https://www.datadoghq.com/). 3 | * Copyright 2020 Datadog, Inc. 4 | */ 5 | 6 | syntax = "proto3"; 7 | 8 | // A DDSketch is essentially a histogram that partitions the range of positive values into an infinite number of 9 | // indexed bins whose size grows exponentially. It keeps track of the number of values (or possibly floating-point 10 | // weights) added to each bin. Negative values are partitioned like positive values, symmetrically to zero. 11 | // The value zero as well as its close neighborhood that would be mapped to extreme bin indexes is mapped to a specific 12 | // counter. 13 | message DDSketch { 14 | // The mapping between positive values and the bin indexes they belong to. 15 | IndexMapping mapping = 1; 16 | 17 | // The store for keeping track of positive values. 18 | Store positiveValues = 2; 19 | 20 | // The store for keeping track of negative values. A negative value v is mapped using its positive opposite -v. 21 | Store negativeValues = 3; 22 | 23 | // The count for the value zero and its close neighborhood (whose width depends on the mapping). 
24 | double zeroCount = 4; 25 | } 26 | 27 | // How to map positive values to the bins they belong to. 28 | message IndexMapping { 29 | // The gamma parameter of the mapping, such that the bin index that a value v belongs to is roughly equal to 30 | // log(v)/log(gamma). 31 | double gamma = 1; 32 | 33 | // An offset that can be used to shift all bin indexes. 34 | double indexOffset = 2; 35 | 36 | // To speed up the computation of the index a value belongs to, the computation of the log may be approximated using 37 | // the fact that the log to the base 2 of powers of 2 can be computed at a low cost from the binary representation of 38 | // the input value. Other values can be approximated by interpolating between successive powers of 2 (linearly, 39 | // quadratically or cubically). 40 | // NONE means that the log is to be computed exactly (no interpolation). 41 | Interpolation interpolation = 3; 42 | enum Interpolation { 43 | NONE = 0; 44 | LINEAR = 1; 45 | QUADRATIC = 2; 46 | CUBIC = 3; 47 | } 48 | } 49 | 50 | // A Store maps bin indexes to their respective counts. 51 | // Counts can be encoded sparsely using binCounts, but also in a contiguous way using contiguousBinCounts and 52 | // contiguousBinIndexOffset. Given that non-empty bins are in practice usually contiguous or close to one another, the 53 | // latter contiguous encoding method is usually more efficient than the sparse one. 54 | // Both encoding methods can be used conjointly. If a bin appears in both the sparse and the contiguous encodings, its 55 | // count value is the sum of the counts in each encoding. 56 | message Store { 57 | // The bin counts, encoded sparsely. 58 | map<sint32, double> binCounts = 1; 59 | 60 | // The bin counts, encoded contiguously. The values of contiguousBinCounts are the counts for the bins of indexes 61 | // o, o+1, o+2, etc., where o is contiguousBinIndexOffset. 62 | repeated double contiguousBinCounts = 2 [packed = true]; 63 | sint32 contiguousBinIndexOffset = 3; 64 | } 65 | -------------------------------------------------------------------------------- /ddsketch/pb/ddsketch_pb2.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # Generated by the protocol buffer compiler. DO NOT EDIT!
3 | # source: ddsketch.proto 4 | """Generated protocol buffer code.""" 5 | from google.protobuf import descriptor as _descriptor 6 | from google.protobuf import descriptor_pool as _descriptor_pool 7 | from google.protobuf import message as _message 8 | from google.protobuf import reflection as _reflection 9 | from google.protobuf import symbol_database as _symbol_database 10 | # @@protoc_insertion_point(imports) 11 | 12 | _sym_db = _symbol_database.Default() 13 | 14 | 15 | 16 | 17 | DESCRIPTOR = _descriptor_pool.Default().AddSerializedFile(b'\n\x0e\x64\x64sketch.proto\"}\n\x08\x44\x44Sketch\x12\x1e\n\x07mapping\x18\x01 \x01(\x0b\x32\r.IndexMapping\x12\x1e\n\x0epositiveValues\x18\x02 \x01(\x0b\x32\x06.Store\x12\x1e\n\x0enegativeValues\x18\x03 \x01(\x0b\x32\x06.Store\x12\x11\n\tzeroCount\x18\x04 \x01(\x01\"\xa7\x01\n\x0cIndexMapping\x12\r\n\x05gamma\x18\x01 \x01(\x01\x12\x13\n\x0bindexOffset\x18\x02 \x01(\x01\x12\x32\n\rinterpolation\x18\x03 \x01(\x0e\x32\x1b.IndexMapping.Interpolation\"?\n\rInterpolation\x12\x08\n\x04NONE\x10\x00\x12\n\n\x06LINEAR\x10\x01\x12\r\n\tQUADRATIC\x10\x02\x12\t\n\x05\x43UBIC\x10\x03\"\xa6\x01\n\x05Store\x12(\n\tbinCounts\x18\x01 \x03(\x0b\x32\x15.Store.BinCountsEntry\x12\x1f\n\x13\x63ontiguousBinCounts\x18\x02 \x03(\x01\x42\x02\x10\x01\x12 \n\x18\x63ontiguousBinIndexOffset\x18\x03 \x01(\x11\x1a\x30\n\x0e\x42inCountsEntry\x12\x0b\n\x03key\x18\x01 \x01(\x11\x12\r\n\x05value\x18\x02 \x01(\x01:\x02\x38\x01\x62\x06proto3') 18 | 19 | 20 | 21 | _DDSKETCH = DESCRIPTOR.message_types_by_name['DDSketch'] 22 | _INDEXMAPPING = DESCRIPTOR.message_types_by_name['IndexMapping'] 23 | _STORE = DESCRIPTOR.message_types_by_name['Store'] 24 | _STORE_BINCOUNTSENTRY = _STORE.nested_types_by_name['BinCountsEntry'] 25 | _INDEXMAPPING_INTERPOLATION = _INDEXMAPPING.enum_types_by_name['Interpolation'] 26 | DDSketch = _reflection.GeneratedProtocolMessageType('DDSketch', (_message.Message,), { 27 | 'DESCRIPTOR' : _DDSKETCH, 28 | '__module__' : 'ddsketch_pb2' 29 | # @@protoc_insertion_point(class_scope:DDSketch) 30 | }) 31 | _sym_db.RegisterMessage(DDSketch) 32 | 33 | IndexMapping = _reflection.GeneratedProtocolMessageType('IndexMapping', (_message.Message,), { 34 | 'DESCRIPTOR' : _INDEXMAPPING, 35 | '__module__' : 'ddsketch_pb2' 36 | # @@protoc_insertion_point(class_scope:IndexMapping) 37 | }) 38 | _sym_db.RegisterMessage(IndexMapping) 39 | 40 | Store = _reflection.GeneratedProtocolMessageType('Store', (_message.Message,), { 41 | 42 | 'BinCountsEntry' : _reflection.GeneratedProtocolMessageType('BinCountsEntry', (_message.Message,), { 43 | 'DESCRIPTOR' : _STORE_BINCOUNTSENTRY, 44 | '__module__' : 'ddsketch_pb2' 45 | # @@protoc_insertion_point(class_scope:Store.BinCountsEntry) 46 | }) 47 | , 48 | 'DESCRIPTOR' : _STORE, 49 | '__module__' : 'ddsketch_pb2' 50 | # @@protoc_insertion_point(class_scope:Store) 51 | }) 52 | _sym_db.RegisterMessage(Store) 53 | _sym_db.RegisterMessage(Store.BinCountsEntry) 54 | 55 | if _descriptor._USE_C_DESCRIPTORS == False: 56 | 57 | DESCRIPTOR._options = None 58 | _STORE_BINCOUNTSENTRY._options = None 59 | _STORE_BINCOUNTSENTRY._serialized_options = b'8\001' 60 | _STORE.fields_by_name['contiguousBinCounts']._options = None 61 | _STORE.fields_by_name['contiguousBinCounts']._serialized_options = b'\020\001' 62 | _DDSKETCH._serialized_start=18 63 | _DDSKETCH._serialized_end=143 64 | _INDEXMAPPING._serialized_start=146 65 | _INDEXMAPPING._serialized_end=313 66 | _INDEXMAPPING_INTERPOLATION._serialized_start=250 67 | _INDEXMAPPING_INTERPOLATION._serialized_end=313 68 | 
_STORE._serialized_start=316 69 | _STORE._serialized_end=482 70 | _STORE_BINCOUNTSENTRY._serialized_start=434 71 | _STORE_BINCOUNTSENTRY._serialized_end=482 72 | # @@protoc_insertion_point(module_scope) 73 | -------------------------------------------------------------------------------- /ddsketch/pb/ddsketch_pre319_pb2.py: -------------------------------------------------------------------------------- 1 | # Generated by the protocol buffer compiler. DO NOT EDIT! 2 | # source: ddsketch.proto 3 | 4 | import sys 5 | _b=sys.version_info[0]<3 and (lambda x:x) or (lambda x:x.encode('latin1')) 6 | from google.protobuf import descriptor as _descriptor 7 | from google.protobuf import message as _message 8 | from google.protobuf import reflection as _reflection 9 | from google.protobuf import symbol_database as _symbol_database 10 | from google.protobuf import descriptor_pb2 11 | # @@protoc_insertion_point(imports) 12 | 13 | _sym_db = _symbol_database.Default() 14 | 15 | 16 | 17 | 18 | DESCRIPTOR = _descriptor.FileDescriptor( 19 | name='ddsketch.proto', 20 | package='', 21 | syntax='proto3', 22 | serialized_pb=_b('\n\x0e\x64\x64sketch.proto\"}\n\x08\x44\x44Sketch\x12\x1e\n\x07mapping\x18\x01 \x01(\x0b\x32\r.IndexMapping\x12\x1e\n\x0epositiveValues\x18\x02 \x01(\x0b\x32\x06.Store\x12\x1e\n\x0enegativeValues\x18\x03 \x01(\x0b\x32\x06.Store\x12\x11\n\tzeroCount\x18\x04 \x01(\x01\"\xa7\x01\n\x0cIndexMapping\x12\r\n\x05gamma\x18\x01 \x01(\x01\x12\x13\n\x0bindexOffset\x18\x02 \x01(\x01\x12\x32\n\rinterpolation\x18\x03 \x01(\x0e\x32\x1b.IndexMapping.Interpolation\"?\n\rInterpolation\x12\x08\n\x04NONE\x10\x00\x12\n\n\x06LINEAR\x10\x01\x12\r\n\tQUADRATIC\x10\x02\x12\t\n\x05\x43UBIC\x10\x03\"\xa6\x01\n\x05Store\x12(\n\tbinCounts\x18\x01 \x03(\x0b\x32\x15.Store.BinCountsEntry\x12\x1f\n\x13\x63ontiguousBinCounts\x18\x02 \x03(\x01\x42\x02\x10\x01\x12 \n\x18\x63ontiguousBinIndexOffset\x18\x03 \x01(\x11\x1a\x30\n\x0e\x42inCountsEntry\x12\x0b\n\x03key\x18\x01 \x01(\x11\x12\r\n\x05value\x18\x02 \x01(\x01:\x02\x38\x01\x62\x06proto3') 23 | ) 24 | _sym_db.RegisterFileDescriptor(DESCRIPTOR) 25 | 26 | 27 | 28 | _INDEXMAPPING_INTERPOLATION = _descriptor.EnumDescriptor( 29 | name='Interpolation', 30 | full_name='IndexMapping.Interpolation', 31 | filename=None, 32 | file=DESCRIPTOR, 33 | values=[ 34 | _descriptor.EnumValueDescriptor( 35 | name='NONE', index=0, number=0, 36 | options=None, 37 | type=None), 38 | _descriptor.EnumValueDescriptor( 39 | name='LINEAR', index=1, number=1, 40 | options=None, 41 | type=None), 42 | _descriptor.EnumValueDescriptor( 43 | name='QUADRATIC', index=2, number=2, 44 | options=None, 45 | type=None), 46 | _descriptor.EnumValueDescriptor( 47 | name='CUBIC', index=3, number=3, 48 | options=None, 49 | type=None), 50 | ], 51 | containing_type=None, 52 | options=None, 53 | serialized_start=250, 54 | serialized_end=313, 55 | ) 56 | _sym_db.RegisterEnumDescriptor(_INDEXMAPPING_INTERPOLATION) 57 | 58 | 59 | _DDSKETCH = _descriptor.Descriptor( 60 | name='DDSketch', 61 | full_name='DDSketch', 62 | filename=None, 63 | file=DESCRIPTOR, 64 | containing_type=None, 65 | fields=[ 66 | _descriptor.FieldDescriptor( 67 | name='mapping', full_name='DDSketch.mapping', index=0, 68 | number=1, type=11, cpp_type=10, label=1, 69 | has_default_value=False, default_value=None, 70 | message_type=None, enum_type=None, containing_type=None, 71 | is_extension=False, extension_scope=None, 72 | options=None), 73 | _descriptor.FieldDescriptor( 74 | name='positiveValues', full_name='DDSketch.positiveValues', index=1, 75 | 
number=2, type=11, cpp_type=10, label=1, 76 | has_default_value=False, default_value=None, 77 | message_type=None, enum_type=None, containing_type=None, 78 | is_extension=False, extension_scope=None, 79 | options=None), 80 | _descriptor.FieldDescriptor( 81 | name='negativeValues', full_name='DDSketch.negativeValues', index=2, 82 | number=3, type=11, cpp_type=10, label=1, 83 | has_default_value=False, default_value=None, 84 | message_type=None, enum_type=None, containing_type=None, 85 | is_extension=False, extension_scope=None, 86 | options=None), 87 | _descriptor.FieldDescriptor( 88 | name='zeroCount', full_name='DDSketch.zeroCount', index=3, 89 | number=4, type=1, cpp_type=5, label=1, 90 | has_default_value=False, default_value=float(0), 91 | message_type=None, enum_type=None, containing_type=None, 92 | is_extension=False, extension_scope=None, 93 | options=None), 94 | ], 95 | extensions=[ 96 | ], 97 | nested_types=[], 98 | enum_types=[ 99 | ], 100 | options=None, 101 | is_extendable=False, 102 | syntax='proto3', 103 | extension_ranges=[], 104 | oneofs=[ 105 | ], 106 | serialized_start=18, 107 | serialized_end=143, 108 | ) 109 | 110 | 111 | _INDEXMAPPING = _descriptor.Descriptor( 112 | name='IndexMapping', 113 | full_name='IndexMapping', 114 | filename=None, 115 | file=DESCRIPTOR, 116 | containing_type=None, 117 | fields=[ 118 | _descriptor.FieldDescriptor( 119 | name='gamma', full_name='IndexMapping.gamma', index=0, 120 | number=1, type=1, cpp_type=5, label=1, 121 | has_default_value=False, default_value=float(0), 122 | message_type=None, enum_type=None, containing_type=None, 123 | is_extension=False, extension_scope=None, 124 | options=None), 125 | _descriptor.FieldDescriptor( 126 | name='indexOffset', full_name='IndexMapping.indexOffset', index=1, 127 | number=2, type=1, cpp_type=5, label=1, 128 | has_default_value=False, default_value=float(0), 129 | message_type=None, enum_type=None, containing_type=None, 130 | is_extension=False, extension_scope=None, 131 | options=None), 132 | _descriptor.FieldDescriptor( 133 | name='interpolation', full_name='IndexMapping.interpolation', index=2, 134 | number=3, type=14, cpp_type=8, label=1, 135 | has_default_value=False, default_value=0, 136 | message_type=None, enum_type=None, containing_type=None, 137 | is_extension=False, extension_scope=None, 138 | options=None), 139 | ], 140 | extensions=[ 141 | ], 142 | nested_types=[], 143 | enum_types=[ 144 | _INDEXMAPPING_INTERPOLATION, 145 | ], 146 | options=None, 147 | is_extendable=False, 148 | syntax='proto3', 149 | extension_ranges=[], 150 | oneofs=[ 151 | ], 152 | serialized_start=146, 153 | serialized_end=313, 154 | ) 155 | 156 | 157 | _STORE_BINCOUNTSENTRY = _descriptor.Descriptor( 158 | name='BinCountsEntry', 159 | full_name='Store.BinCountsEntry', 160 | filename=None, 161 | file=DESCRIPTOR, 162 | containing_type=None, 163 | fields=[ 164 | _descriptor.FieldDescriptor( 165 | name='key', full_name='Store.BinCountsEntry.key', index=0, 166 | number=1, type=17, cpp_type=1, label=1, 167 | has_default_value=False, default_value=0, 168 | message_type=None, enum_type=None, containing_type=None, 169 | is_extension=False, extension_scope=None, 170 | options=None), 171 | _descriptor.FieldDescriptor( 172 | name='value', full_name='Store.BinCountsEntry.value', index=1, 173 | number=2, type=1, cpp_type=5, label=1, 174 | has_default_value=False, default_value=float(0), 175 | message_type=None, enum_type=None, containing_type=None, 176 | is_extension=False, extension_scope=None, 177 | options=None), 178 | ], 179 
| extensions=[ 180 | ], 181 | nested_types=[], 182 | enum_types=[ 183 | ], 184 | options=_descriptor._ParseOptions(descriptor_pb2.MessageOptions(), _b('8\001')), 185 | is_extendable=False, 186 | syntax='proto3', 187 | extension_ranges=[], 188 | oneofs=[ 189 | ], 190 | serialized_start=434, 191 | serialized_end=482, 192 | ) 193 | 194 | _STORE = _descriptor.Descriptor( 195 | name='Store', 196 | full_name='Store', 197 | filename=None, 198 | file=DESCRIPTOR, 199 | containing_type=None, 200 | fields=[ 201 | _descriptor.FieldDescriptor( 202 | name='binCounts', full_name='Store.binCounts', index=0, 203 | number=1, type=11, cpp_type=10, label=3, 204 | has_default_value=False, default_value=[], 205 | message_type=None, enum_type=None, containing_type=None, 206 | is_extension=False, extension_scope=None, 207 | options=None), 208 | _descriptor.FieldDescriptor( 209 | name='contiguousBinCounts', full_name='Store.contiguousBinCounts', index=1, 210 | number=2, type=1, cpp_type=5, label=3, 211 | has_default_value=False, default_value=[], 212 | message_type=None, enum_type=None, containing_type=None, 213 | is_extension=False, extension_scope=None, 214 | options=_descriptor._ParseOptions(descriptor_pb2.FieldOptions(), _b('\020\001'))), 215 | _descriptor.FieldDescriptor( 216 | name='contiguousBinIndexOffset', full_name='Store.contiguousBinIndexOffset', index=2, 217 | number=3, type=17, cpp_type=1, label=1, 218 | has_default_value=False, default_value=0, 219 | message_type=None, enum_type=None, containing_type=None, 220 | is_extension=False, extension_scope=None, 221 | options=None), 222 | ], 223 | extensions=[ 224 | ], 225 | nested_types=[_STORE_BINCOUNTSENTRY, ], 226 | enum_types=[ 227 | ], 228 | options=None, 229 | is_extendable=False, 230 | syntax='proto3', 231 | extension_ranges=[], 232 | oneofs=[ 233 | ], 234 | serialized_start=316, 235 | serialized_end=482, 236 | ) 237 | 238 | _DDSKETCH.fields_by_name['mapping'].message_type = _INDEXMAPPING 239 | _DDSKETCH.fields_by_name['positiveValues'].message_type = _STORE 240 | _DDSKETCH.fields_by_name['negativeValues'].message_type = _STORE 241 | _INDEXMAPPING.fields_by_name['interpolation'].enum_type = _INDEXMAPPING_INTERPOLATION 242 | _INDEXMAPPING_INTERPOLATION.containing_type = _INDEXMAPPING 243 | _STORE_BINCOUNTSENTRY.containing_type = _STORE 244 | _STORE.fields_by_name['binCounts'].message_type = _STORE_BINCOUNTSENTRY 245 | DESCRIPTOR.message_types_by_name['DDSketch'] = _DDSKETCH 246 | DESCRIPTOR.message_types_by_name['IndexMapping'] = _INDEXMAPPING 247 | DESCRIPTOR.message_types_by_name['Store'] = _STORE 248 | 249 | DDSketch = _reflection.GeneratedProtocolMessageType('DDSketch', (_message.Message,), dict( 250 | DESCRIPTOR = _DDSKETCH, 251 | __module__ = 'ddsketch_pb2' 252 | # @@protoc_insertion_point(class_scope:DDSketch) 253 | )) 254 | _sym_db.RegisterMessage(DDSketch) 255 | 256 | IndexMapping = _reflection.GeneratedProtocolMessageType('IndexMapping', (_message.Message,), dict( 257 | DESCRIPTOR = _INDEXMAPPING, 258 | __module__ = 'ddsketch_pb2' 259 | # @@protoc_insertion_point(class_scope:IndexMapping) 260 | )) 261 | _sym_db.RegisterMessage(IndexMapping) 262 | 263 | Store = _reflection.GeneratedProtocolMessageType('Store', (_message.Message,), dict( 264 | 265 | BinCountsEntry = _reflection.GeneratedProtocolMessageType('BinCountsEntry', (_message.Message,), dict( 266 | DESCRIPTOR = _STORE_BINCOUNTSENTRY, 267 | __module__ = 'ddsketch_pb2' 268 | # @@protoc_insertion_point(class_scope:Store.BinCountsEntry) 269 | )) 270 | , 271 | DESCRIPTOR = _STORE, 272 | 
__module__ = 'ddsketch_pb2' 273 | # @@protoc_insertion_point(class_scope:Store) 274 | )) 275 | _sym_db.RegisterMessage(Store) 276 | _sym_db.RegisterMessage(Store.BinCountsEntry) 277 | 278 | 279 | _STORE_BINCOUNTSENTRY.has_options = True 280 | _STORE_BINCOUNTSENTRY._options = _descriptor._ParseOptions(descriptor_pb2.MessageOptions(), _b('8\001')) 281 | _STORE.fields_by_name['contiguousBinCounts'].has_options = True 282 | _STORE.fields_by_name['contiguousBinCounts']._options = _descriptor._ParseOptions(descriptor_pb2.FieldOptions(), _b('\020\001')) 283 | # @@protoc_insertion_point(module_scope) 284 | -------------------------------------------------------------------------------- /ddsketch/pb/proto.py: -------------------------------------------------------------------------------- 1 | from ddsketch.ddsketch import BaseDDSketch 2 | from ..mapping import ( 3 | CubicallyInterpolatedMapping, 4 | LinearlyInterpolatedMapping, 5 | LogarithmicMapping, 6 | ) 7 | from ..store import DenseStore 8 | 9 | import google.protobuf 10 | 11 | 12 | pb_version = tuple(map(int, google.protobuf.__version__.split(".")[0:2])) 13 | 14 | if pb_version >= (3, 19, 0): 15 | import ddsketch.pb.ddsketch_pb2 as pb 16 | else: 17 | import ddsketch.pb.ddsketch_pre319_pb2 as pb 18 | 19 | 20 | class KeyMappingProto: 21 | @classmethod 22 | def _proto_interpolation(cls, mapping): 23 | if type(mapping) is LogarithmicMapping: 24 | return pb.IndexMapping.NONE 25 | if type(mapping) is LinearlyInterpolatedMapping: 26 | return pb.IndexMapping.LINEAR 27 | if type(mapping) is CubicallyInterpolatedMapping: 28 | return pb.IndexMapping.CUBIC 29 | 30 | @classmethod 31 | def to_proto(cls, mapping): 32 | """serialize to protobuf""" 33 | return pb.IndexMapping( 34 | gamma=mapping.gamma, 35 | indexOffset=mapping._offset, 36 | interpolation=cls._proto_interpolation(mapping), 37 | ) 38 | 39 | @classmethod 40 | def from_proto(cls, proto): 41 | """deserialize from protobuf""" 42 | if proto.interpolation == pb.IndexMapping.NONE: 43 | return LogarithmicMapping.from_gamma_offset(proto.gamma, proto.indexOffset) 44 | elif proto.interpolation == pb.IndexMapping.LINEAR: 45 | return LinearlyInterpolatedMapping.from_gamma_offset( 46 | proto.gamma, proto.indexOffset 47 | ) 48 | elif proto.interpolation == pb.IndexMapping.CUBIC: 49 | return CubicallyInterpolatedMapping.from_gamma_offset( 50 | proto.gamma, proto.indexOffset 51 | ) 52 | else: 53 | raise ValueError("Unrecognized interpolation %r" % proto.interpolation) 54 | 55 | 56 | class StoreProto: 57 | """Currently only supports DenseStore""" 58 | 59 | @classmethod 60 | def to_proto(cls, store): 61 | """serialize to protobuf""" 62 | return pb.Store( 63 | contiguousBinCounts=store.bins, contiguousBinIndexOffset=store.offset 64 | ) 65 | 66 | @classmethod 67 | def from_proto(cls, proto): 68 | """deserialize from protobuf""" 69 | store = DenseStore() 70 | index = proto.contiguousBinIndexOffset 71 | store.offset = index 72 | for count in proto.contiguousBinCounts: 73 | store.add(index, count) 74 | index += 1 75 | return store 76 | 77 | 78 | class DDSketchProto: 79 | @classmethod 80 | def to_proto(self, ddsketch): 81 | """serialize to protobuf""" 82 | return pb.DDSketch( 83 | mapping=KeyMappingProto.to_proto(ddsketch._mapping), 84 | positiveValues=StoreProto.to_proto(ddsketch._store), 85 | negativeValues=StoreProto.to_proto(ddsketch._negative_store), 86 | zeroCount=ddsketch._zero_count, 87 | ) 88 | 89 | @classmethod 90 | def from_proto(cls, proto): 91 | """deserialize from protobuf 92 | 93 | N.B., The current 
protobuf loses any min/max/sum/avg information. 94 | """ 95 | mapping = KeyMappingProto.from_proto(proto.mapping) 96 | negative_store = StoreProto.from_proto(proto.negativeValues) 97 | store = StoreProto.from_proto(proto.positiveValues) 98 | zero_count = proto.zeroCount 99 | return BaseDDSketch( 100 | mapping=mapping, 101 | store=store, 102 | negative_store=negative_store, 103 | zero_count=zero_count, 104 | ) 105 | -------------------------------------------------------------------------------- /ddsketch/py.typed: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DataDog/sketches-py/0d16e695d1f991276863b8ffaaf6c8e9bd9ad9de/ddsketch/py.typed -------------------------------------------------------------------------------- /ddsketch/store.py: -------------------------------------------------------------------------------- 1 | from __future__ import division 2 | 3 | 4 | # Unless explicitly stated otherwise all files in this repository are licensed 5 | # under the Apache License 2.0. 6 | # This product includes software developed at Datadog (https://www.datadoghq.com/). 7 | # Copyright 2020 Datadog, Inc. 8 | 9 | """ 10 | Stores map integers to counters. They can be seen as a collection of bins. 11 | We start with 128 bins and grow the store in chunks of 128 unless specified 12 | otherwise. 13 | """ 14 | 15 | import abc 16 | import math 17 | import typing 18 | 19 | 20 | if typing.TYPE_CHECKING: 21 | from typing import List # noqa: F401 22 | from typing import Optional # noqa: F401 23 | 24 | import six 25 | 26 | 27 | CHUNK_SIZE = 128 28 | 29 | 30 | class _NegativeIntInfinity(int): 31 | def __ge__(self, x): 32 | return False 33 | 34 | __gt__ = __ge__ 35 | 36 | def __lt__(self, x): 37 | return True 38 | 39 | __le__ = __lt__ 40 | 41 | 42 | class _PositiveIntInfinity(int): 43 | def __ge__(self, x): 44 | return True 45 | 46 | __gt__ = __ge__ 47 | 48 | def __lt__(self, x): 49 | return False 50 | 51 | __le__ = __lt__ 52 | 53 | 54 | _neg_infinity = _NegativeIntInfinity() 55 | _pos_infinity = _PositiveIntInfinity() 56 | 57 | 58 | class Store(six.with_metaclass(abc.ABCMeta)): 59 | """The basic specification of a store 60 | 61 | Attributes: 62 | count (float): the sum of the counts for the bins 63 | min_key (int): the minimum key bin 64 | max_key (int): the maximum key bin 65 | """ 66 | 67 | def __init__(self): 68 | # type: () -> None 69 | self.count = 0 # type: float 70 | self.min_key = _pos_infinity # type: int 71 | self.max_key = _neg_infinity # type: int 72 | 73 | @abc.abstractmethod 74 | def copy(self, store): 75 | """Copies the input store into this one.""" 76 | 77 | @abc.abstractmethod 78 | def length(self): 79 | # type: () -> int 80 | """Return the number of bins.""" 81 | 82 | @abc.abstractmethod 83 | def add(self, key, weight=1.0): 84 | # type: (int, float) -> None 85 | """Updates the counter at the specified index key, growing the number of bins if 86 | necessary. 87 | """ 88 | 89 | @abc.abstractmethod 90 | def key_at_rank(self, rank, lower=True): 91 | # type: (float, bool) -> int 92 | """Return the key for the value at given rank. 
93 |
94 | E.g., if the non-zero bins are [1, 1] for keys a, b with no offset
95 |
96 | if lower = True:
97 | key_at_rank(x) = a for x in [0, 1)
98 | key_at_rank(x) = b for x in [1, 2)
99 |
100 | if lower = False:
101 | key_at_rank(x) = a for x in (-1, 0]
102 | key_at_rank(x) = b for x in (0, 1]
103 | """
104 |
105 | @abc.abstractmethod
106 | def merge(self, store):
107 | # type: (Store) -> None
108 | """Merge another store into this one. This should be equivalent to running the
109 | add operations that have been run on the other store on this one.
110 | """
111 |
112 |
113 | class DenseStore(Store):
114 | """A dense store that keeps all the bins between the bin for the min_key and the
115 | bin for the max_key.
116 |
117 | Args:
118 | chunk_size (int, optional): the number of bins to grow by
119 |
120 | Attributes:
121 | count (float): the sum of the counts for the bins
122 | min_key (int): the minimum key bin
123 | max_key (int): the maximum key bin
124 | offset (int): the difference between the keys and the index in which they are stored
125 | bins (List[float]): the bins
126 | """
127 |
128 | def __init__(self, chunk_size=CHUNK_SIZE):
129 | # type: (int) -> None
130 | super(DenseStore, self).__init__()
131 |
132 | self.chunk_size = chunk_size # type: int
133 | self.offset = 0 # type: int
134 | self.bins = [] # type: List[float]
135 |
136 | def __repr__(self):
137 | # type: () -> str
138 | repr_str = "{"
139 | for i, sbin in enumerate(self.bins):
140 | repr_str += "%s: %s, " % (i + self.offset, sbin)
141 | repr_str += "}, min_key:%s, max_key:%s, offset:%s" % (
142 | self.min_key,
143 | self.max_key,
144 | self.offset,
145 | )
146 | return repr_str
147 |
148 | def copy(self, store):
149 | # type: (DenseStore) -> None
150 | self.bins = store.bins[:]
151 | self.count = store.count
152 | self.min_key = store.min_key
153 | self.max_key = store.max_key
154 | self.offset = store.offset
155 |
156 | def length(self):
157 | # type: () -> int
158 | """Return the number of bins."""
159 | return len(self.bins)
160 |
161 | def add(self, key, weight=1.0):
162 | # type: (int, float) -> None
163 | idx = self._get_index(key)
164 | self.bins[idx] += weight
165 | self.count += weight
166 |
167 | def _get_index(self, key):
168 | # type: (int) -> int
169 | """Calculate the bin index for the key, extending the range if necessary."""
170 | if key < self.min_key:
171 | self._extend_range(key)
172 | elif key > self.max_key:
173 | self._extend_range(key)
174 |
175 | return key - self.offset
176 |
177 | def _get_new_length(self, new_min_key, new_max_key):
178 | # type: (int, int) -> int
179 | desired_length = new_max_key - new_min_key + 1
180 | return self.chunk_size * int(math.ceil(desired_length / self.chunk_size))
181 |
182 | def _extend_range(self, key, second_key=None):
183 | # type: (int, Optional[int]) -> None
184 | """Grow the bins as necessary and call _adjust"""
185 | if second_key is None:
186 | second_key = key
187 | new_min_key = min(key, second_key, self.min_key)
188 | new_max_key = max(key, second_key, self.max_key)
189 |
190 | if self.length() == 0:
191 | # initialize bins
192 | self.bins = [0.0] * self._get_new_length(new_min_key, new_max_key)
193 | self.offset = new_min_key
194 | self._adjust(new_min_key, new_max_key)
195 |
196 | elif new_min_key >= self.min_key and new_max_key < self.offset + self.length():
197 | # no need to change the range; just update min/max keys
198 | self.min_key = new_min_key
199 | self.max_key = new_max_key
200 |
201 | else:
202 | # grow the bins
203 | new_length =
self._get_new_length(new_min_key, new_max_key)
204 | if new_length > self.length():
205 | self.bins.extend([0.0] * (new_length - self.length()))
206 | self._adjust(new_min_key, new_max_key)
207 |
208 | def _adjust(self, new_min_key, new_max_key):
209 | # type: (int, int) -> None
210 | """Adjust the bins, the offset, the min_key, and max_key, without resizing the
211 | bins, in order to try making it fit the specified range.
212 | """
213 | self._center_bins(new_min_key, new_max_key)
214 | self.min_key = new_min_key
215 | self.max_key = new_max_key
216 |
217 | def _shift_bins(self, shift):
218 | # type: (int) -> None
219 | """Shift the bins; this changes the offset."""
220 | if shift > 0:
221 | self.bins = self.bins[:-shift]
222 | self.bins[:0] = [0.0] * shift
223 | else:
224 | self.bins = self.bins[abs(shift) :]
225 | self.bins.extend([0.0] * abs(shift))
226 | self.offset -= shift
227 |
228 | def _center_bins(self, new_min_key, new_max_key):
229 | # type: (int, int) -> None
230 | """Center the bins; this changes the offset."""
231 | middle_key = new_min_key + (new_max_key - new_min_key + 1) // 2
232 | self._shift_bins(self.offset + self.length() // 2 - middle_key)
233 |
234 | def key_at_rank(self, rank, lower=True):
235 | # type: (float, bool) -> int
236 | running_ct = 0.0
237 | for i, bin_ct in enumerate(self.bins):
238 | running_ct += bin_ct
239 | if (lower and running_ct > rank) or (not lower and running_ct >= rank + 1):
240 | return i + self.offset
241 |
242 | return self.max_key
243 |
244 | def merge(self, store): # type: ignore[override]
245 | # type: (DenseStore) -> None
246 | if store.count == 0:
247 | return
248 |
249 | if self.count == 0:
250 | self.copy(store)
251 | return
252 |
253 | if store.min_key < self.min_key or store.max_key > self.max_key:
254 | self._extend_range(store.min_key, store.max_key)
255 |
256 | for key in range(store.min_key, store.max_key + 1):
257 | self.bins[key - self.offset] += store.bins[key - store.offset]
258 |
259 | self.count += store.count
260 |
261 |
262 | class CollapsingLowestDenseStore(DenseStore):
263 | """A dense store that keeps all the bins between the bin for the min_key and the
264 | bin for the max_key, but collapses the left-most bins if the number of bins
265 | exceeds the bin_limit.
266 |
267 | Args:
268 | bin_limit (int): the maximum number of bins
269 | chunk_size (int, optional): the number of bins to grow by
270 |
271 | Attributes:
272 | count (float): the sum of the counts for the bins
273 | min_key (int): the minimum key bin
274 | max_key (int): the maximum key bin
275 | offset (int): the difference between the keys and the index in which they are stored
276 | bins (List[float]): the bins
277 | """
278 |
279 | def __init__(self, bin_limit, chunk_size=CHUNK_SIZE):
280 | # type: (int, int) -> None
281 | super(CollapsingLowestDenseStore, self).__init__(chunk_size)
282 | self.bin_limit = bin_limit
283 | self.is_collapsed = False
284 |
285 | def copy(self, store): # type: ignore[override]
286 | # type: (CollapsingLowestDenseStore) -> None
287 | self.bin_limit = store.bin_limit
288 | self.is_collapsed = store.is_collapsed
289 | super(CollapsingLowestDenseStore, self).copy(store)
290 |
291 | def _get_new_length(self, new_min_key, new_max_key):
292 | # type: (int, int) -> int
293 | desired_length = new_max_key - new_min_key + 1
294 | return min(
295 | self.chunk_size * int(math.ceil(desired_length / self.chunk_size)),
296 | self.bin_limit,
297 | )
298 |
299 | def _get_index(self, key):
300 | # type: (int) -> int
301 | """Calculate the bin index for the
key, extending the range if necessary."""
302 | if key < self.min_key:
303 | if self.is_collapsed:
304 | return 0
305 |
306 | self._extend_range(key)
307 | if self.is_collapsed:
308 | return 0
309 | elif key > self.max_key:
310 | self._extend_range(key)
311 |
312 | return key - self.offset
313 |
314 | def _adjust(self, new_min_key, new_max_key):
315 | # type: (int, int) -> None
316 | """Override. Adjust the bins, the offset, the min_key, and max_key, without
317 | resizing the bins, in order to try making it fit the specified
318 | range. Collapse to the left if necessary.
319 | """
320 | if new_max_key - new_min_key + 1 > self.length():
321 | # The range of keys is too wide, the lowest bins need to be collapsed.
322 | new_min_key = new_max_key - self.length() + 1
323 |
324 | if new_min_key >= self.max_key:
325 | # put everything in the first bin
326 | self.offset = new_min_key
327 | self.min_key = new_min_key
328 | self.bins[:] = [0.0] * self.length()
329 | self.bins[0] = self.count
330 | else:
331 | shift = self.offset - new_min_key
332 | if shift < 0:
333 | collapse_start_index = self.min_key - self.offset
334 | collapse_end_index = new_min_key - self.offset
335 | collapsed_count = sum(
336 | self.bins[collapse_start_index:collapse_end_index]
337 | )
338 | self.bins[collapse_start_index:collapse_end_index] = [0.0] * (
339 | new_min_key - self.min_key
340 | )
341 | self.bins[collapse_end_index] += collapsed_count
342 | self.min_key = new_min_key
343 | # shift the buckets to make room for new_max_key
344 | self._shift_bins(shift)
345 | else:
346 | self.min_key = new_min_key
347 | # shift the buckets to make room for new_min_key
348 | self._shift_bins(shift)
349 |
350 | self.max_key = new_max_key
351 | self.is_collapsed = True
352 | else:
353 | self._center_bins(new_min_key, new_max_key)
354 | self.min_key = new_min_key
355 | self.max_key = new_max_key
356 |
357 | def merge(self, store): # type: ignore[override]
358 | # type: (CollapsingLowestDenseStore) -> None # type: ignore[override]
359 | """Override."""
360 | if store.count == 0:
361 | return
362 |
363 | if self.count == 0:
364 | self.copy(store)
365 | return
366 |
367 | if store.min_key < self.min_key or store.max_key > self.max_key:
368 | self._extend_range(store.min_key, store.max_key)
369 |
370 | collapse_start_idx = store.min_key - store.offset
371 | collapse_end_idx = min(self.min_key, store.max_key + 1) - store.offset
372 | if collapse_end_idx > collapse_start_idx:
373 | collapse_count = sum(store.bins[collapse_start_idx:collapse_end_idx])
374 | self.bins[0] += collapse_count
375 | else:
376 | collapse_end_idx = collapse_start_idx
377 |
378 | for key in range(collapse_end_idx + store.offset, store.max_key + 1):
379 | self.bins[key - self.offset] += store.bins[key - store.offset]
380 |
381 | self.count += store.count
382 |
383 |
384 | class CollapsingHighestDenseStore(DenseStore):
385 | """A dense store that keeps all the bins between the bin for the min_key and the
386 | bin for the max_key, but collapses the right-most bins if the number of bins
387 | exceeds the bin_limit.
388 |
389 | Args:
390 | bin_limit (int): the maximum number of bins
391 | chunk_size (int, optional): the number of bins to grow by
392 |
393 | Attributes:
394 | count (float): the sum of the counts for the bins
395 | min_key (int): the minimum key bin
396 | max_key (int): the maximum key bin
397 | offset (int): the difference between the keys and the index in which they are stored
398 | bins (List[float]): the bins
399 | """
400 |
401 | def __init__(self,
bin_limit, chunk_size=CHUNK_SIZE):
402 | super(CollapsingHighestDenseStore, self).__init__(chunk_size)
403 | self.bin_limit = bin_limit
404 | self.is_collapsed = False
405 |
406 | def copy(self, store): # type: ignore[override]
407 | # type: (CollapsingHighestDenseStore) -> None
408 | self.bin_limit = store.bin_limit
409 | self.is_collapsed = store.is_collapsed
410 | super(CollapsingHighestDenseStore, self).copy(store)
411 |
412 | def _get_new_length(self, new_min_key, new_max_key):
413 | # type: (int, int) -> int
414 | desired_length = new_max_key - new_min_key + 1
415 | # For some reason mypy can't infer that min(int, int) is an int, so cast it.
416 | return int(
417 | min(
418 | self.chunk_size * int(math.ceil(desired_length / self.chunk_size)),
419 | self.bin_limit,
420 | )
421 | )
422 |
423 | def _get_index(self, key):
424 | # type: (int) -> int
425 | """Calculate the bin index for the key, extending the range if necessary."""
426 | if key > self.max_key:
427 | if self.is_collapsed:
428 | return self.length() - 1
429 |
430 | self._extend_range(key)
431 | if self.is_collapsed:
432 | return self.length() - 1
433 | elif key < self.min_key:
434 | self._extend_range(key)
435 | return key - self.offset
436 |
437 | def _adjust(self, new_min_key, new_max_key):
438 | # type: (int, int) -> None
439 | """Override. Adjust the bins, the offset, the min_key, and max_key, without
440 | resizing the bins, in order to try making it fit the specified
441 | range. Collapse to the right if necessary.
442 | """
443 | if new_max_key - new_min_key + 1 > self.length():
444 | # The range of keys is too wide, the highest bins need to be collapsed.
445 | new_max_key = new_min_key + self.length() - 1
446 |
447 | if new_max_key <= self.min_key:
448 | # put everything in the last bin
449 | self.offset = new_min_key
450 | self.max_key = new_max_key
451 | self.bins[:] = [0.0] * self.length()
452 | self.bins[-1] = self.count
453 | else:
454 | shift = self.offset - new_min_key
455 | if shift > 0:
456 | collapse_start_index = new_max_key - self.offset + 1
457 | collapse_end_index = self.max_key - self.offset + 1
458 | collapsed_count = sum(
459 | self.bins[collapse_start_index:collapse_end_index]
460 | )
461 | self.bins[collapse_start_index:collapse_end_index] = [0.0] * (
462 | self.max_key - new_max_key
463 | )
464 | self.bins[collapse_start_index - 1] += collapsed_count
465 | self.max_key = new_max_key
466 | # shift the buckets to make room for new_min_key
467 | self._shift_bins(shift)
468 | else:
469 | self.max_key = new_max_key
470 | # shift the buckets to make room for new_max_key
471 | self._shift_bins(shift)
472 |
473 | self.min_key = new_min_key
474 | self.is_collapsed = True
475 | else:
476 | self._center_bins(new_min_key, new_max_key)
477 | self.min_key = new_min_key
478 | self.max_key = new_max_key
479 |
480 | def merge(self, store): # type: ignore[override]
481 | # type: (CollapsingHighestDenseStore) -> None # type: ignore[override]
482 | """Override."""
483 | if store.count == 0:
484 | return
485 |
486 | if self.count == 0:
487 | self.copy(store)
488 | return
489 |
490 | if store.min_key < self.min_key or store.max_key > self.max_key:
491 | self._extend_range(store.min_key, store.max_key)
492 |
493 | collapse_end_idx = store.max_key - store.offset + 1
494 | collapse_start_idx = max(self.max_key + 1, store.min_key) - store.offset
495 | if collapse_end_idx > collapse_start_idx:
496 | collapse_count = sum(store.bins[collapse_start_idx:collapse_end_idx])
497 | self.bins[-1] += collapse_count
498 | else:
499 |
collapse_start_idx = collapse_end_idx 500 | 501 | for key in range(store.min_key, collapse_start_idx + store.offset): 502 | self.bins[key - self.offset] += store.bins[key - store.offset] 503 | 504 | self.count += store.count 505 | -------------------------------------------------------------------------------- /docker-compose.yml: -------------------------------------------------------------------------------- 1 | version: "3" 2 | 3 | services: 4 | dev: 5 | # The dd-trace-py image includes all required versions of Python. 6 | image: datadog/dd-trace-py:buster 7 | command: bash 8 | network_mode: host 9 | working_dir: /src 10 | volumes: 11 | - ./:/src 12 | -------------------------------------------------------------------------------- /mypy.ini: -------------------------------------------------------------------------------- 1 | [mypy] 2 | files = ddsketch,tests 3 | show_error_codes = true 4 | warn_return_any = true 5 | warn_unused_ignores = true 6 | warn_unused_configs = true 7 | no_implicit_optional = true 8 | ignore_missing_imports = true 9 | 10 | [mypy-ddsketch.pb.*] 11 | ignore_errors = true 12 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [tool.isort] 2 | force_single_line = true 3 | lines_after_imports = 2 4 | force_sort_within_sections = true 5 | known_first_party = "ddsketch" 6 | default_section = "THIRDPARTY" 7 | skip = [".riot/", ".venv/", "ddsketch/pb", "ddsketch/__version.py"] 8 | line_length = 120 9 | 10 | [tool.black] 11 | exclude = ''' 12 | ^/( 13 | ( 14 | \.riot 15 | | ddsketch/pb.* 16 | | \.venv.* 17 | | \.eggs 18 | )/ 19 | | ddsketch/__version.py 20 | ) 21 | ''' 22 | -------------------------------------------------------------------------------- /releasenotes/config.yaml: -------------------------------------------------------------------------------- 1 | --- 2 | unreleased_version_title: Unreleased 3 | -------------------------------------------------------------------------------- /releasenotes/notes/ddsketch-api-a84ffc0875bbacd6.yaml: -------------------------------------------------------------------------------- 1 | --- 2 | upgrade: 3 | - "``DDSketch`` attributes ``mapping``, ``store``, ``negative_store``, ``zero_count``, ``relative_accuracy``, ``min`` and ``max`` have been removed." 4 | - "``DDSketch.copy`` method has been removed." 5 | - "``DDSketch.count`` attribute has been made read-only." 6 | - "``DDSketch.mergeable`` method has been removed." 7 | -------------------------------------------------------------------------------- /releasenotes/notes/extend-range-06474632c8235187.yaml: -------------------------------------------------------------------------------- 1 | --- 2 | fixes: 3 | - | 4 | Fix merging stores with max_key=0. 5 | -------------------------------------------------------------------------------- /releasenotes/notes/oldpy-db6189c9b26e10f7.yaml: -------------------------------------------------------------------------------- 1 | --- 2 | other: 3 | - | 4 | This release drops support for Python versions older than 3.7. 5 | -------------------------------------------------------------------------------- /releasenotes/notes/pbopt-ec6525c1948d782f.yaml: -------------------------------------------------------------------------------- 1 | --- 2 | other: 3 | - | 4 | This change makes protobuf an optional requirement. It can be installed with ``pip install ddsketch[serialization]``. 
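For context, a minimal usage sketch of the optional serialization extra (illustrative only; it assumes protobuf is installed, and DDSketchProto and the module-level pb binding are the ones defined in ddsketch/pb/proto.py above):

    # pip install ddsketch[serialization]
    from ddsketch import DDSketch
    from ddsketch.pb import proto

    sketch = DDSketch(0.01)
    for value in (1.0, 2.5, 42.0):
        sketch.add(value)

    # Round-trip through the protobuf wire format.
    blob = proto.DDSketchProto.to_proto(sketch).SerializeToString()
    restored = proto.DDSketchProto.from_proto(proto.pb.DDSketch.FromString(blob))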
5 | -------------------------------------------------------------------------------- /releasenotes/notes/proto4-e8646610178bef59.yaml: -------------------------------------------------------------------------------- 1 | --- 2 | fixes: 3 | - | 4 | Add support for protobuf 4. 5 | -------------------------------------------------------------------------------- /releasenotes/notes/protobuf-min-f6af9a2d5d96f53c.yaml: -------------------------------------------------------------------------------- 1 | --- 2 | other: 3 | - | 4 | Add support for protobuf>=3.0.0. 5 | -------------------------------------------------------------------------------- /releasenotes/notes/py2-c963608396db7258.yaml: -------------------------------------------------------------------------------- 1 | --- 2 | features: 3 | - | 4 | Add support for Python 2. 5 | -------------------------------------------------------------------------------- /releasenotes/notes/py310-ac5baa9b0b69008a.yaml: -------------------------------------------------------------------------------- 1 | --- 2 | features: 3 | - | 4 | Add support for Python 3.10. 5 | -------------------------------------------------------------------------------- /releasenotes/notes/remove-custom-exceptions-e2bc67a72250269d.yaml: -------------------------------------------------------------------------------- 1 | --- 2 | upgrade: 3 | - | 4 | The custom exceptions ``IllegalArgumentException`` and ``UnequalSketchParametersException`` 5 | as well as the ``ddsketch.exceptions`` module have been removed. 6 | 7 | ``IllegalArgumentException`` and ``UnequalSketchParametersException`` are replaced with ``ValueError``. 8 | -------------------------------------------------------------------------------- /releasenotes/notes/remove-numpy-25fedcd9be9d6d80.yaml: -------------------------------------------------------------------------------- 1 | --- 2 | prelude: > 3 | numpy has been removed as a dependency. 4 | upgrade: 5 | - | 6 | ``BaseDDSketch.get_quantile_value`` will now return ``None`` instead of 7 | ``numpy.NaN`` if the specified quantile is empty. 8 | -------------------------------------------------------------------------------- /releasenotes/notes/tests-wheel-bf71b228c86a9ced.yaml: -------------------------------------------------------------------------------- 1 | --- 2 | fixes: 3 | - | 4 | Exclude the tests module from the package. 5 | -------------------------------------------------------------------------------- /releasenotes/notes/toplevelapi-6c04f2ca35a49d4b.yaml: -------------------------------------------------------------------------------- 1 | --- 2 | features: 3 | - | 4 | The implementations of stores and mappings are now exposed via the top 5 | level module ``ddsketch``. 6 | -------------------------------------------------------------------------------- /releasenotes/notes/typing-25579ab88323a332.yaml: -------------------------------------------------------------------------------- 1 | --- 2 | features: 3 | - | 4 | Add typing. 5 | -------------------------------------------------------------------------------- /releasenotes/notes/version-b2a276df190a703a.yaml: -------------------------------------------------------------------------------- 1 | --- 2 | features: 3 | - | 4 | The package version is now exposed through ``ddsketch.__version__``. 
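As a quick illustration (a hypothetical session, not part of the repository):

    import ddsketch
    # Prints whatever version string setuptools_scm wrote into ddsketch/__version.py.
    print(ddsketch.__version__)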
5 | -------------------------------------------------------------------------------- /riotfile.py: -------------------------------------------------------------------------------- 1 | from riot import Venv 2 | from riot import latest 3 | 4 | 5 | venv = Venv( 6 | pys=["3"], 7 | venvs=[ 8 | Venv( 9 | name="test", 10 | command="pytest {cmdargs}", 11 | pkgs={ 12 | "pytest": latest, 13 | "numpy": latest, 14 | }, 15 | venvs=[ 16 | Venv( 17 | pys=["3.7", "3.8", "3.9"], 18 | pkgs={ 19 | "protobuf": ["==3.0.0", latest], 20 | }, 21 | ), 22 | Venv( 23 | pys=["3.10", "3.11", "3.12"], 24 | pkgs={ 25 | "protobuf": ["==3.8.0", latest], 26 | }, 27 | ), 28 | ], 29 | ), 30 | Venv( 31 | pkgs={ 32 | "reno": latest, 33 | }, 34 | venvs=[ 35 | Venv( 36 | name="reno", 37 | command="reno {cmdargs}", 38 | ) 39 | ], 40 | ), 41 | Venv( 42 | name="flake8", 43 | command="flake8 {cmdargs}", 44 | pkgs={ 45 | "flake8": latest, 46 | "flake8-blind-except": latest, 47 | "flake8-builtins": latest, 48 | "flake8-docstrings": latest, 49 | "flake8-rst-docstrings": latest, 50 | # needed for some features from flake8-rst-docstrings 51 | "pygments": latest, 52 | }, 53 | ), 54 | Venv( 55 | pkgs={ 56 | "black": latest, 57 | "isort": latest, 58 | "toml": latest, 59 | }, 60 | venvs=[ 61 | Venv( 62 | name="black", 63 | command="black {cmdargs}", 64 | ), 65 | Venv( 66 | name="fmt", 67 | command="isort . && black .", 68 | ), 69 | Venv( 70 | name="check_fmt", 71 | command="isort --check . && black --check .", 72 | ), 73 | ], 74 | ), 75 | Venv( 76 | name="mypy", 77 | create=True, 78 | command="mypy --install-types --non-interactive {cmdargs}", 79 | pkgs={ 80 | "mypy": latest, 81 | "types-protobuf": latest, 82 | "types-setuptools": latest, 83 | "types-six": latest, 84 | }, 85 | ), 86 | ], 87 | ) 88 | -------------------------------------------------------------------------------- /scripts/check-releasenotes: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | set -e 3 | 4 | # If we are running outside a GitHub action, default to `master` 5 | BASE_REF="${GITHUB_BASE_REF:-master}" 6 | 7 | # Print input data 8 | echo "Base ref: origin/${BASE_REF}" 9 | echo "GitHub event path: ${GITHUB_EVENT_PATH}" 10 | echo "JQ: $(which jq)" 11 | 12 | 13 | # Skip the label check if we do not have a GitHub event path 14 | if [[ -f "${GITHUB_EVENT_PATH}" ]] && jq -e '.pull_request?.labels[]?.name | select(. == "no-changelog")' "${GITHUB_EVENT_PATH}"; 15 | then 16 | echo "PR has label 'no-changelog', skipping validation" 17 | exit 0 18 | fi 19 | 20 | # Check if they added a new file to releasenotes/notes 21 | if git diff --name-only --diff-filter=A "origin/${BASE_REF}" | grep releasenotes/notes; 22 | then 23 | echo "New release note found, success" 24 | exit 0 25 | else 26 | echo "Release note not found." 
27 | echo "Use 'reno new ' to add a new note to 'releasenotes/notes', or add the label 'no-changelog' to skip this validation" 28 | exit 1 29 | fi 30 | -------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- 1 | [flake8] 2 | application-import-name=ddsketch 3 | exclude= 4 | .riot, 5 | .git, 6 | .venv, 7 | __pycache__, 8 | *.eggs-info, 9 | build, 10 | ddsketch/pb, 11 | # E501,E231,W503: not respected by black 12 | ignore = E501,W503,E231,D100,D101,D102,D103,D104,D105,D107,D205,D400,D401,D402,E203,B902,I100 13 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | import setuptools 2 | 3 | 4 | with open("README.md", "r") as fh: 5 | long_description = fh.read() 6 | 7 | setuptools.setup( 8 | name="ddsketch", 9 | author="Jee Rim, Charles-Philippe Masson, Homin Lee", 10 | author_email="jee.rim@datadoghq.com, charles.masson@datadoghq.com, homin@datadoghq.com", 11 | description="Distributed quantile sketches", 12 | long_description=long_description, 13 | long_description_content_type="text/markdown", 14 | url="http://github.com/datadog/sketches-py", 15 | packages=setuptools.find_packages(exclude=["tests*"]), 16 | package_data={"ddsketch": ["py.typed"]}, 17 | classifiers=[ 18 | "Programming Language :: Python :: 3", 19 | "License :: OSI Approved :: Apache Software License", 20 | ], 21 | keywords=["ddsketch", "quantile", "sketch"], 22 | install_requires=[ 23 | "six", 24 | ], 25 | extras_require={"serialization": ["protobuf>=3.0.0"]}, 26 | python_requires=">=3.7", 27 | download_url="https://github.com/DataDog/sketches-py/archive/v1.0.tar.gz", 28 | setup_requires=["setuptools_scm"], 29 | use_scm_version={"write_to": "ddsketch/__version.py"}, 30 | ) 31 | -------------------------------------------------------------------------------- /tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DataDog/sketches-py/0d16e695d1f991276863b8ffaaf6c8e9bd9ad9de/tests/__init__.py -------------------------------------------------------------------------------- /tests/datasets.py: -------------------------------------------------------------------------------- 1 | # Unless explicitly stated otherwise all files in this repository are licensed 2 | # under the Apache License 2.0. 3 | # This product includes software developed at Datadog (https://www.datadoghq.com/). 4 | # Copyright 2020 Datadog, Inc. 
5 | 6 | import abc 7 | 8 | import numpy as np 9 | import six 10 | 11 | 12 | class Dataset(six.with_metaclass(abc.ABCMeta)): 13 | def __init__(self, size): 14 | self.size = int(size) 15 | self.data = self.populate() 16 | 17 | def __str__(self): 18 | return "{}_{}".format(self.name, self.size) 19 | 20 | def __len__(self): 21 | return self.size 22 | 23 | def rank(self, value): 24 | lower = np.array(sorted(self.data)) < value 25 | if np.all(lower): 26 | return self.size - 1 27 | else: 28 | return np.argmin(lower) 29 | 30 | def quantile(self, q): 31 | self.data.sort() 32 | rank = int(q * (self.size - 1)) 33 | return self.data[rank] 34 | 35 | @property 36 | def sum(self): # noqa: A003 37 | return np.sum(self.data) 38 | 39 | @property 40 | def avg(self): 41 | return np.mean(self.data) 42 | 43 | @abc.abstractmethod 44 | def name(self): 45 | """Name of dataset""" 46 | 47 | @abc.abstractmethod 48 | def populate(self): 49 | """Populate self.data with self.size values""" 50 | 51 | 52 | class EmptyDataset(Dataset): 53 | @property 54 | def name(self): 55 | return "no_name" 56 | 57 | def populate(self): 58 | return [] 59 | 60 | def add(self, val): 61 | self.size += 1 62 | self.data.append(val) 63 | 64 | def add_all(self, vals): 65 | self.size += len(vals) 66 | self.data.extend(vals) 67 | 68 | 69 | class UniformForward(Dataset): 70 | @property 71 | def name(self): 72 | return "uniform_forward" 73 | 74 | def populate(self): 75 | return list(self.generate()) 76 | 77 | def generate(self): 78 | for x in range(self.size): 79 | yield x 80 | 81 | 82 | class UniformBackward(Dataset): 83 | @property 84 | def name(self): 85 | return "uniform_backward" 86 | 87 | def populate(self): 88 | return list(self.generate()) 89 | 90 | def generate(self): 91 | for x in range(self.size, 0, -1): 92 | yield x 93 | 94 | 95 | class NegativeUniformForward(Dataset): 96 | @property 97 | def name(self): 98 | return "negative_uniform_forward" 99 | 100 | def populate(self): 101 | return list(self.generate()) 102 | 103 | def generate(self): 104 | for x in range(self.size, 0, -1): 105 | yield -x 106 | 107 | 108 | class NegativeUniformBackward(Dataset): 109 | @property 110 | def name(self): 111 | return "negative_uniform_backward" 112 | 113 | def populate(self): 114 | return list(self.generate()) 115 | 116 | def generate(self): 117 | for x in range(self.size): 118 | yield -x 119 | 120 | 121 | class NumberLineForward(Dataset): 122 | @property 123 | def name(self): 124 | return "number_line_forward" 125 | 126 | def populate(self): 127 | return list(self.generate()) 128 | 129 | def generate(self): 130 | for x in range(-self.size // 2 + 1, self.size // 2 + 1, 1): 131 | yield x 132 | 133 | 134 | class NumberLineBackward(Dataset): 135 | @property 136 | def name(self): 137 | return "number_line_backward" 138 | 139 | def populate(self): 140 | return list(self.generate()) 141 | 142 | def generate(self): 143 | for x in range(self.size // 2, -self.size // 2, -1): 144 | yield x 145 | 146 | 147 | class UniformZoomIn(Dataset): 148 | @property 149 | def name(self): 150 | return "uniform_zoomin" 151 | 152 | def populate(self): 153 | return list(self.generate()) 154 | 155 | def generate(self): 156 | if self.size % 2 == 1: 157 | for item in range(self.size // 2): 158 | yield item 159 | yield self.size - item - 1 160 | yield self.size // 2 161 | else: 162 | for item in range(self.size // 2): 163 | yield item 164 | yield self.size - item - 1 165 | 166 | 167 | class UniformZoomOut(Dataset): 168 | @property 169 | def name(self): 170 | return "uniform_zoomout" 
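# Values are emitted starting from the middle of the range and zooming
# outward, the mirror image of UniformZoomIn above.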
171 | 172 | def populate(self): 173 | return list(self.generate()) 174 | 175 | def generate(self): 176 | if self.size % 2 == 1: 177 | yield self.size // 2 178 | half = int(np.floor(self.size / 2)) 179 | for item in range(1, half + 1): 180 | yield half + item 181 | yield half - item 182 | else: 183 | half = int(np.ceil(self.size / 2)) - 0.5 184 | for item in range(0, int(half + 0.5)): 185 | yield int(half + item + 0.5) 186 | yield int(half - item - 0.5) 187 | 188 | 189 | class UniformSqrt(Dataset): 190 | @property 191 | def name(self): 192 | return "uniform_sqrt" 193 | 194 | def populate(self): 195 | return list(self.generate()) 196 | 197 | def generate(self): 198 | t = int(np.sqrt(2 * self.size)) 199 | initial_item = 0 200 | initial_skip = 1 201 | emitted = 0 202 | i = 0 203 | while emitted < self.size: 204 | item = initial_item 205 | skip = initial_skip 206 | for j in range(t - i): 207 | if item < self.size: 208 | yield item 209 | emitted += 1 210 | item += skip 211 | skip += 1 212 | if t - i > 1: 213 | initial_skip += 1 214 | initial_item += initial_skip 215 | i += 1 216 | else: 217 | initial_item += 1 218 | 219 | 220 | class Constant(Dataset): 221 | constant = 42.0 222 | 223 | @property 224 | def name(self): 225 | return "constant" 226 | 227 | def populate(self): 228 | return [self.constant] * self.size 229 | 230 | 231 | class Exponential(Dataset): 232 | scale = 0.01 233 | 234 | @classmethod 235 | def from_params(cls, scale, n): 236 | cls.scale = scale 237 | return cls(n) 238 | 239 | @property 240 | def name(self): 241 | return "exponential" 242 | 243 | def populate(self): 244 | return np.random.exponential(scale=self.scale, size=self.size) 245 | 246 | 247 | class Lognormal(Dataset): 248 | scale = 100.0 249 | 250 | @classmethod 251 | def from_params(cls, scale, n): 252 | cls.scale = scale 253 | return cls(n) 254 | 255 | @property 256 | def name(self): 257 | return "lognormal" 258 | 259 | def populate(self): 260 | return np.random.lognormal(size=self.size) / self.scale 261 | 262 | 263 | class Normal(Dataset): 264 | loc = 37.4 265 | scale = 1.0 266 | 267 | @classmethod 268 | def from_params(cls, loc, scale, n): 269 | cls.loc = loc 270 | cls.scale = scale 271 | return cls(n) 272 | 273 | @property 274 | def name(self): 275 | return "normal" 276 | 277 | def populate(self): 278 | return np.random.normal(loc=self.loc, scale=self.scale, size=self.size) 279 | 280 | 281 | class Laplace(Dataset): 282 | loc = 11278.0 283 | scale = 100.0 284 | 285 | @classmethod 286 | def from_params(cls, loc, scale, n): 287 | cls.loc = loc 288 | cls.scale = scale 289 | return cls(n) 290 | 291 | @property 292 | def name(self): 293 | return "laplace" 294 | 295 | def populate(self): 296 | return np.random.laplace(loc=self.loc, scale=self.scale, size=self.size) 297 | 298 | 299 | class Bimodal(Dataset): 300 | right_loc = 17.3 301 | left_loc = -2.0 302 | left_std = 3.0 303 | 304 | @property 305 | def name(self): 306 | return "bimodal" 307 | 308 | def populate(self): 309 | return [next(self.generate()) for _ in range(int(self.size))] 310 | 311 | def generate(self): 312 | if np.random.random() > 0.5: 313 | yield np.random.laplace(self.right_loc) 314 | else: 315 | yield np.random.normal(self.left_loc, self.left_std) 316 | 317 | 318 | class Mixed(Dataset): 319 | mean = 0.0 320 | sigma = 0.25 321 | scale_factor = 0.1 322 | 323 | loc = 10.0 324 | scale = 0.5 325 | 326 | def __init__(self, size, ratio=0.9, ignore_rank=False): 327 | self.size = int(size) 328 | self.ratio = ratio 329 | self.data = self.populate() 330 | 
self._ignore_rank = ignore_rank 331 | 332 | @property 333 | def name(self): 334 | return "mixed" 335 | 336 | def populate(self): 337 | return [next(self.generate()) for _ in range(int(self.size))] 338 | 339 | def generate(self): 340 | if np.random.random() < self.ratio: 341 | yield self.scale_factor * np.random.lognormal(self.mean, self.sigma) 342 | else: 343 | yield np.random.normal(self.loc, self.scale) 344 | 345 | 346 | class Trimodal(Dataset): 347 | right_loc = 17.3 348 | left_loc = 5.0 349 | left_std = 0.5 350 | exp_scale = 0.01 351 | 352 | @property 353 | def name(self): 354 | return "trimodal" 355 | 356 | def populate(self): 357 | return [next(self.generate()) for _ in range(int(self.size))] 358 | 359 | def generate(self): 360 | if np.random.random() > 2.0 / 3.0: 361 | yield np.random.laplace(self.right_loc) 362 | elif np.random.random() > 1.0 / 3.0: 363 | yield np.random.normal(self.left_loc, self.left_std) 364 | else: 365 | yield np.random.exponential(scale=self.exp_scale) 366 | 367 | 368 | class Integers(Dataset): 369 | loc = 4.3 370 | scale = 5.0 371 | 372 | @classmethod 373 | def from_params(cls, loc, scale, n): 374 | cls.loc = loc 375 | cls.scale = scale 376 | return cls(n) 377 | 378 | @property 379 | def name(self): 380 | return "integers" 381 | 382 | def populate(self): 383 | return [ 384 | int(x) 385 | for x in np.random.normal(loc=self.loc, scale=self.scale, size=self.size) 386 | ] 387 | -------------------------------------------------------------------------------- /tests/test_ddsketch.py: -------------------------------------------------------------------------------- 1 | # Unless explicitly stated otherwise all files in this repository are licensed 2 | # under the Apache License 2.0. 3 | # This product includes software developed at Datadog (https://www.datadoghq.com/). 4 | # Copyright 2020 Datadog, Inc. 
5 | 6 | """Tests for DDSketch""" 7 | 8 | import abc 9 | from collections import Counter 10 | from unittest import TestCase 11 | 12 | import numpy as np 13 | import pytest 14 | import six 15 | 16 | import ddsketch 17 | from ddsketch.ddsketch import DDSketch 18 | from ddsketch.ddsketch import LogCollapsingHighestDenseDDSketch 19 | from ddsketch.ddsketch import LogCollapsingLowestDenseDDSketch 20 | from tests.datasets import Bimodal 21 | from tests.datasets import Constant 22 | from tests.datasets import EmptyDataset 23 | from tests.datasets import Exponential 24 | from tests.datasets import Integers 25 | from tests.datasets import Laplace 26 | from tests.datasets import Lognormal 27 | from tests.datasets import Mixed 28 | from tests.datasets import NegativeUniformBackward 29 | from tests.datasets import NegativeUniformForward 30 | from tests.datasets import Normal 31 | from tests.datasets import NumberLineBackward 32 | from tests.datasets import NumberLineForward 33 | from tests.datasets import Trimodal 34 | from tests.datasets import UniformBackward 35 | from tests.datasets import UniformForward 36 | from tests.datasets import UniformSqrt 37 | from tests.datasets import UniformZoomIn 38 | from tests.datasets import UniformZoomOut 39 | 40 | 41 | TEST_QUANTILES = [0, 0.1, 0.25, 0.5, 0.75, 0.9, 0.95, 0.99, 0.999, 1] 42 | TEST_SIZES = [3, 5, 10, 100, 1000] 43 | DATASETS = [ 44 | UniformForward, 45 | UniformBackward, 46 | UniformZoomIn, 47 | UniformZoomOut, 48 | UniformSqrt, 49 | Constant, 50 | NegativeUniformBackward, 51 | NegativeUniformForward, 52 | NumberLineBackward, 53 | NumberLineForward, 54 | Exponential, 55 | Lognormal, 56 | Normal, 57 | Laplace, 58 | Bimodal, 59 | Trimodal, 60 | Mixed, 61 | Integers, 62 | ] 63 | 64 | TEST_REL_ACC = 0.05 65 | TEST_BIN_LIMIT = 1024 66 | 67 | 68 | class BaseTestDDSketches(six.with_metaclass(abc.ABCMeta)): 69 | """AbstractBaseClass for testing DDSketch implementations""" 70 | 71 | @staticmethod 72 | @abc.abstractmethod 73 | def _new_dd_sketch(): 74 | """Create a new DDSketch of the appropriate type""" 75 | 76 | def _evaluate_sketch_accuracy(self, sketch, data, eps, summary_stats=True): 77 | size = data.size 78 | for quantile in TEST_QUANTILES: 79 | sketch_q = sketch.get_quantile_value(quantile) 80 | data_q = data.quantile(quantile) 81 | err = abs(sketch_q - data_q) 82 | assert err - eps * abs(data_q) <= 1e-15 83 | assert sketch.num_values == size 84 | if summary_stats: 85 | assert sketch.sum == pytest.approx(data.sum) 86 | assert sketch.avg == pytest.approx(data.avg) 87 | 88 | def test_distributions(self): 89 | """Test DDSketch on values from various distributions""" 90 | for dataset in DATASETS: 91 | for size in TEST_SIZES: 92 | data = dataset(size) 93 | sketch = self._new_dd_sketch() 94 | for value in data.data: 95 | sketch.add(value) 96 | self._evaluate_sketch_accuracy(sketch, data, TEST_REL_ACC) 97 | 98 | def test_add_multiple(self): 99 | """Test DDSketch on adding integer weighted values""" 100 | data = Integers(1000) 101 | sketch = self._new_dd_sketch() 102 | for value, count in Counter(data.data).items(): 103 | sketch.add(value, count) 104 | self._evaluate_sketch_accuracy(sketch, data, TEST_REL_ACC) 105 | 106 | def test_add_decimal(self): 107 | """Test DDSketch on adding decimal weighted values""" 108 | sketch = self._new_dd_sketch() 109 | for value in range(100): 110 | sketch.add(value, 1.1) 111 | sketch.add(100, 110.0) 112 | 113 | data_median = 99 114 | sketch_median = sketch.get_quantile_value(0.5) 115 | err = abs(sketch_median - data_median) 116 
| assert err - TEST_REL_ACC * abs(data_median) <= 1e-15
117 | assert sketch.num_values == pytest.approx(110 * 2)
118 | assert sketch.sum == pytest.approx(5445 + 11000)
119 | assert sketch.avg == pytest.approx(74.75)
120 |
121 | def test_merge_equal(self):
122 | """Test merging equal-sized DDSketches"""
123 | parameters = [(35, 1), (1, 3), (15, 2), (40, 0.5)]
124 | for size in TEST_SIZES:
125 | dataset = EmptyDataset(0)
126 | target_sketch = self._new_dd_sketch()
127 | for params in parameters:
128 | generator = Normal.from_params(params[0], params[1], size)
129 | sketch = self._new_dd_sketch()
130 | for value in generator.data:
131 | sketch.add(value)
132 | dataset.add(value)
133 | target_sketch.merge(sketch)
134 | self._evaluate_sketch_accuracy(target_sketch, dataset, TEST_REL_ACC)
135 |
136 | self._evaluate_sketch_accuracy(target_sketch, dataset, TEST_REL_ACC)
137 |
138 | def test_merge_unequal(self):
139 | """Test merging variable-sized DDSketches"""
140 | ntests = 20
141 | for _ in range(ntests):
142 | for size in TEST_SIZES:
143 | dataset = Lognormal(size)
144 | sketch1 = self._new_dd_sketch()
145 | sketch2 = self._new_dd_sketch()
146 | for value in dataset.data:
147 | if np.random.random() > 0.7:
148 | sketch1.add(value)
149 | else:
150 | sketch2.add(value)
151 | sketch1.merge(sketch2)
152 | self._evaluate_sketch_accuracy(sketch1, dataset, TEST_REL_ACC)
153 |
154 | def test_merge_mixed(self):
155 | """Test merging DDSketches of different distributions"""
156 | ntests = 20
157 | test_datasets = [Normal, Exponential, Laplace, Bimodal]
158 | for _ in range(ntests):
159 | merged_dataset = EmptyDataset(0)
160 | merged_sketch = self._new_dd_sketch()
161 | for dataset in test_datasets:
162 | generator = dataset(np.random.randint(0, 500))
163 | sketch = self._new_dd_sketch()
164 | for value in generator.data:
165 | sketch.add(value)
166 | merged_dataset.add(value)
167 | merged_sketch.merge(sketch)
168 | self._evaluate_sketch_accuracy(merged_sketch, merged_dataset, TEST_REL_ACC)
169 |
170 | def test_consistent_merge(self):
171 | """Test that merge() calls do not modify the argument sketch."""
172 | sketch1 = self._new_dd_sketch()
173 | sketch2 = self._new_dd_sketch()
174 | dataset = Normal(100)
175 | for value in dataset.data:
176 | sketch1.add(value)
177 | sketch1.merge(sketch2)
178 | # sketch2 is still empty
179 | assert sketch2.num_values == 0
180 |
181 | dataset = Normal(50)
182 | for value in dataset.data:
183 | sketch2.add(value)
184 |
185 | sketch2_summary = [sketch2.get_quantile_value(q) for q in TEST_QUANTILES] + [
186 | sketch2.sum,
187 | sketch2.avg,
188 | sketch2.num_values,
189 | ]
190 | sketch1.merge(sketch2)
191 |
192 | dataset = Normal(10)
193 | for value in dataset.data:
194 | sketch1.add(value)
195 | # changes to sketch1 do not affect sketch2 after merge; sketch2 must
196 | # still match the summary captured before the merge, so it is asserted
197 | # against that snapshot rather than recomputed here
198 |
199 |
200 |
201 | assert sketch2_summary == pytest.approx(
202 | [sketch2.get_quantile_value(q) for q in TEST_QUANTILES]
203 | + [sketch2.sum, sketch2.avg, sketch2.num_values],
204 | )
205 |
206 | sketch3 = self._new_dd_sketch()
207 | sketch3.merge(sketch2)
208 | # merging into an empty sketch does not change sketch2
209 | assert sketch2_summary == pytest.approx(
210 | [sketch2.get_quantile_value(q) for q in TEST_QUANTILES]
211 | + [sketch2.sum, sketch2.avg, sketch2.num_values],
212 | )
213 |
214 |
215 | class TestDDSketch(BaseTestDDSketches, TestCase):
216 | """Class for testing
DDSketch"""
217 |
218 | @staticmethod
219 | def _new_dd_sketch():
220 | return DDSketch(TEST_REL_ACC)
221 |
222 |
223 | class TestLogCollapsingLowestDenseDDSketch(BaseTestDDSketches, TestCase):
224 | """Class for testing LogCollapsingLowestDenseDDSketch"""
225 |
226 | @staticmethod
227 | def _new_dd_sketch():
228 | return LogCollapsingLowestDenseDDSketch(TEST_REL_ACC, TEST_BIN_LIMIT)
229 |
230 |
231 | class TestLogCollapsingHighestDenseDDSketch(BaseTestDDSketches, TestCase):
232 | """Class for testing LogCollapsingHighestDenseDDSketch"""
233 |
234 | @staticmethod
235 | def _new_dd_sketch():
236 | return LogCollapsingHighestDenseDDSketch(TEST_REL_ACC, TEST_BIN_LIMIT)
237 |
238 |
239 | def test_version():
240 | """Ensure the package version is exposed by the API."""
241 | assert hasattr(ddsketch, "__version__")
242 | assert isinstance(ddsketch.__version__, str)
243 |
-------------------------------------------------------------------------------- /tests/test_mapping.py: --------------------------------------------------------------------------------
1 | # Unless explicitly stated otherwise all files in this repository are licensed
2 | # under the Apache License 2.0.
3 | # This product includes software developed at Datadog (https://www.datadoghq.com/).
4 | # Copyright 2020 Datadog, Inc.
5 |
6 | """Tests for the KeyMapping classes"""
7 |
8 | import abc
9 | import math
10 | from unittest import TestCase
11 |
12 | import numpy
13 | import pytest
14 | import six
15 |
16 | from ddsketch.mapping import CubicallyInterpolatedMapping
17 | from ddsketch.mapping import LinearlyInterpolatedMapping
18 | from ddsketch.mapping import LogarithmicMapping
19 | from ddsketch.mapping import _cbrt
20 |
21 |
22 | def _relative_error(expected_min, expected_max, actual):
23 | """Calculate the relative error"""
24 | if expected_min < 0 or expected_max < 0 or actual < 0:
25 | raise ValueError("expected and actual values must be non-negative")
26 | if (expected_min <= actual) and (actual <= expected_max):
27 | return 0.0
28 | if expected_min == 0 and expected_max == 0:
29 | return 0.0 if actual == 0 else float("+inf")
30 | if actual < expected_min:
31 | return (expected_min - actual) / expected_min
32 |
33 | return (actual - expected_max) / expected_max
34 |
35 |
36 | def _test_value_rel_acc(mapping):
37 | """Calculate the relative accuracy of a mapping on a large range of values"""
38 | value_mult = 2 - math.sqrt(2) * 1e-1
39 | max_relative_acc = 0.0
40 | value = mapping.min_possible
41 | while value < mapping.max_possible / value_mult:
42 | value *= value_mult
43 | map_val = mapping.value(mapping.key(value))
44 | rel_err = _relative_error(value, value, map_val)
45 | assert rel_err < mapping.relative_accuracy
46 | max_relative_acc = max(max_relative_acc, rel_err)
47 | max_relative_acc = max(
48 | max_relative_acc,
49 | _relative_error(
50 | mapping.max_possible,
51 | mapping.max_possible,
52 | mapping.value(mapping.key(mapping.max_possible)),
53 | ),
54 | )
55 | return max_relative_acc
56 |
57 |
58 | class BaseTestKeyMapping(six.with_metaclass(abc.ABCMeta)):
59 | """Abstract class for testing KeyMapping classes"""
60 |
61 | offsets = [0, 1, -12.23, 7768.3]
62 |
63 | @abc.abstractmethod
64 | def mapping(self, relative_accuracy, offset):
65 | """Return the KeyMapping instance to be tested"""
66 |
67 | def test_accuracy(self):
68 | """Test the mapping on a large range of relative accuracies"""
69 | rel_acc_mult = 1 - math.sqrt(2) * 1e-1
70 | min_rel_acc = 1e-8
71 | rel_acc = 1 - 1e-3
72 |
73 | while rel_acc >= min_rel_acc:
74 | mapping =
self.mapping(rel_acc, offset=0.0)
75 | max_rel_acc = _test_value_rel_acc(mapping)
76 | assert max_rel_acc < mapping.relative_accuracy
77 | rel_acc *= rel_acc_mult
78 |
79 | def test_offsets(self):
80 | """Test offsets"""
81 | for offset in self.offsets:
82 | mapping = self.mapping(0.01, offset=offset)
83 | assert mapping.key(1) == int(offset)
84 |
85 |
86 | class TestLogarithmicMapping(BaseTestKeyMapping, TestCase):
87 | """Class for testing LogarithmicMapping class"""
88 |
89 | def mapping(self, relative_accuracy, offset):
90 | return LogarithmicMapping(relative_accuracy, offset)
91 |
92 |
93 | class TestLinearlyInterpolatedMapping(BaseTestKeyMapping, TestCase):
94 | """Class for testing LinearlyInterpolatedMapping class"""
95 |
96 | def mapping(self, relative_accuracy, offset):
97 | return LinearlyInterpolatedMapping(relative_accuracy, offset)
98 |
99 |
100 | class TestCubicallyInterpolatedMapping(BaseTestKeyMapping, TestCase):
101 | """Class for testing CubicallyInterpolatedMapping class"""
102 |
103 | def mapping(self, relative_accuracy, offset):
104 | return CubicallyInterpolatedMapping(relative_accuracy, offset)
105 |
106 |
107 | @pytest.mark.parametrize("x", [-12.3, -1.0, -1.0 / 3.0, 0.0, 1.0, 1.0 / 3.0, 2.0**10])
108 | def test_cbrt(x):
109 | assert pytest.approx(_cbrt(x)) == numpy.cbrt(x)
110 |
-------------------------------------------------------------------------------- /tests/test_proto.py: --------------------------------------------------------------------------------
1 | import abc
2 | from unittest import TestCase
3 |
4 | import pytest
5 | import six
6 |
7 | from ddsketch.mapping import CubicallyInterpolatedMapping
8 | from ddsketch.mapping import LinearlyInterpolatedMapping
9 | from ddsketch.mapping import LogarithmicMapping
10 | from ddsketch.pb.proto import DDSketchProto
11 | from ddsketch.pb.proto import KeyMappingProto
12 | from ddsketch.pb.proto import StoreProto
13 | from ddsketch.store import DenseStore
14 | from tests.test_ddsketch import TestDDSketch
15 | from tests.test_store import TestDenseStore
16 |
17 |
18 | class BaseTestKeyMappingProto(six.with_metaclass(abc.ABCMeta)):
19 | offsets = [0, 1, -12.23, 7768.3]
20 |
21 | def test_round_trip(self):
22 | rel_accs = [1e-1, 1e-2, 1e-8]
23 | for rel_acc in rel_accs:
24 | for offset in self.offsets:
25 | mapping = self.mapping(rel_acc, offset)
26 | round_trip_mapping = KeyMappingProto.from_proto(
27 | KeyMappingProto.to_proto(mapping)
28 | )
29 | assert type(mapping) == type(round_trip_mapping) # noqa: E721
30 | assert mapping.relative_accuracy == pytest.approx(
31 | round_trip_mapping.relative_accuracy
32 | )
33 | assert mapping.value(0) == pytest.approx(round_trip_mapping.value(0))
34 |
35 |
36 | class TestLogarithmicMapping(BaseTestKeyMappingProto, TestCase):
37 | """Class for testing LogarithmicMapping class"""
38 |
39 | def mapping(self, relative_accuracy, offset):
40 | return LogarithmicMapping(relative_accuracy, offset)
41 |
42 |
43 | class TestLinearlyInterpolatedMapping(BaseTestKeyMappingProto, TestCase):
44 | """Class for testing LinearlyInterpolatedMapping class"""
45 |
46 | def mapping(self, relative_accuracy, offset):
47 | return LinearlyInterpolatedMapping(relative_accuracy, offset)
48 |
49 |
50 | class TestCubicallyInterpolatedMapping(BaseTestKeyMappingProto, TestCase):
51 | """Class for testing CubicallyInterpolatedMapping class"""
52 |
53 | def mapping(self, relative_accuracy, offset):
54 | return CubicallyInterpolatedMapping(relative_accuracy, offset)
55 |
56 |
57 | class
TestStoreProto(TestDenseStore, TestCase): 58 | def _test_store(self, values): 59 | store = DenseStore() 60 | for val in values: 61 | store.add(val) 62 | self._test_values(StoreProto.from_proto(StoreProto.to_proto(store)), values) 63 | 64 | 65 | class TestDDSketchProto(TestDDSketch, TestCase): 66 | def _evaluate_sketch_accuracy(self, sketch, data, eps, summary_stats=False): 67 | round_trip_sketch = DDSketchProto.from_proto(DDSketchProto.to_proto(sketch)) 68 | super(TestDDSketchProto, self)._evaluate_sketch_accuracy( 69 | round_trip_sketch, data, eps, summary_stats 70 | ) 71 | 72 | def test_add_multiple(self): 73 | """Override.""" 74 | 75 | def test_add_decimal(self): 76 | """Override.""" 77 | 78 | def test_merge_equal(self): 79 | """Override.""" 80 | 81 | def test_merge_unequal(self): 82 | """Override.""" 83 | 84 | def test_merge_mixed(self): 85 | """Override.""" 86 | 87 | def test_consistent_merge(self): 88 | """Override.""" 89 | -------------------------------------------------------------------------------- /tests/test_store.py: -------------------------------------------------------------------------------- 1 | # Unless explicitly stated otherwise all files in this repository are licensed 2 | # under the Apache License 2.0. 3 | # This product includes software developed at Datadog (https://www.datadoghq.com/). 4 | # Copyright 2020 Datadog, Inc. 5 | 6 | """Tests for the Store classes""" 7 | 8 | import abc 9 | from collections import Counter 10 | import sys 11 | from unittest import TestCase 12 | 13 | import six 14 | 15 | from ddsketch.store import CollapsingHighestDenseStore 16 | from ddsketch.store import CollapsingLowestDenseStore 17 | from ddsketch.store import DenseStore 18 | 19 | 20 | TEST_BIN_LIMIT = [1, 20, 1000] 21 | EXTREME_MAX = sys.maxsize 22 | EXTREME_MIN = -sys.maxsize - 1 23 | 24 | 25 | class BaseTestStore(six.with_metaclass(abc.ABCMeta)): 26 | """Base class for testing Store classes""" 27 | 28 | @abc.abstractmethod 29 | def _test_values(self, store, values): 30 | """Test the store's bin counts against what we expect""" 31 | 32 | @abc.abstractmethod 33 | def _test_store(self, values): 34 | """Initialize the store; add the values; call _test_values""" 35 | 36 | @abc.abstractmethod 37 | def _test_merging(self, list_values): 38 | """ 39 | Initialize the stores; for each values in list_values, add them to the 40 | corresponding store; merge the stores; test the merged store's bin 41 | counts against what we expect. 
42 | """ 43 | 44 | def test_empty(self): 45 | """Test no values""" 46 | values = [] 47 | self._test_store(values) 48 | 49 | def test_constant(self): 50 | """Test a constant stream of values""" 51 | values = [0] * 10000 52 | self._test_store(values) 53 | 54 | def test_increasing_linearly(self): 55 | """Test a stream of increasing values""" 56 | values = list(range(10000)) 57 | self._test_store(values) 58 | 59 | def test_decreasing_linearly(self): 60 | """Test a stream of decreasing values""" 61 | values = list(reversed(range(10000))) 62 | self._test_store(values) 63 | 64 | def test_increasing_exponentially(self): 65 | """Test a stream of values increasing exponentially""" 66 | values = [2**x for x in range(16)] 67 | self._test_store(values) 68 | 69 | def test_decreasing_exponentially(self): 70 | """Test a stream of values decreasing exponentially""" 71 | values = [2**x for x in reversed(range(16))] 72 | self._test_store(values) 73 | 74 | def test_bin_counts(self): 75 | """Test bin counts for positive and negative numbers""" 76 | values = [x for x in range(10) for i in range(2 * x)] 77 | self._test_store(values) 78 | 79 | values = [-x for x in range(10) for i in range(2 * x)] 80 | self._test_store(values) 81 | 82 | def test_extreme_values(self): 83 | """Test extreme values""" 84 | self._test_store([EXTREME_MAX]) 85 | self._test_store([EXTREME_MIN]) 86 | self._test_store([0, EXTREME_MIN]) 87 | self._test_store([0, EXTREME_MAX]) 88 | self._test_store([EXTREME_MIN, EXTREME_MAX]) 89 | self._test_store([EXTREME_MAX, EXTREME_MIN]) 90 | 91 | def test_merging_empty(self): 92 | """Test merging empty stores""" 93 | self._test_merging([[], []]) 94 | 95 | def test_merging_far_apart(self): 96 | """Test merging stores with values that are far apart""" 97 | self._test_merging([[-10000], [10000]]) 98 | self._test_merging([[10000], [-10000]]) 99 | self._test_merging([[10000], [-10000], [0]]) 100 | self._test_merging([[10000, 0], [-10000], [0]]) 101 | 102 | def test_merging_constant(self): 103 | """Test merging stores with the same constants""" 104 | self._test_merging([[2, 2], [2, 2, 2], [2]]) 105 | self._test_merging([[-8, -8], [-8]]) 106 | 107 | def test_merging_extreme_values(self): 108 | """Test merging stores with extreme values""" 109 | self._test_merging([[0], [EXTREME_MIN]]) 110 | self._test_merging([[0], [EXTREME_MAX]]) 111 | self._test_merging([[EXTREME_MIN], [0]]) 112 | self._test_merging([[EXTREME_MAX], [0]]) 113 | self._test_merging([[EXTREME_MIN], [EXTREME_MIN]]) 114 | self._test_merging([[EXTREME_MAX], [EXTREME_MAX]]) 115 | self._test_merging([[EXTREME_MIN], [EXTREME_MAX]]) 116 | self._test_merging([[EXTREME_MAX], [EXTREME_MIN]]) 117 | self._test_merging([[0], [EXTREME_MIN, EXTREME_MAX]]) 118 | self._test_merging([[EXTREME_MIN, EXTREME_MAX], [0]]) 119 | 120 | def test_copying_empty(self): 121 | """Test copying empty stores""" 122 | store = CollapsingLowestDenseStore(10) 123 | store.copy(CollapsingLowestDenseStore(10)) 124 | assert store.count == 0 125 | 126 | def test_copying_non_empty(self): 127 | """Test copying stores""" 128 | store = CollapsingLowestDenseStore(10) 129 | new_store = CollapsingLowestDenseStore(10) 130 | new_store.add(0) 131 | store.copy(new_store) 132 | assert store.count == 1 133 | 134 | 135 | class TestDenseStore(BaseTestStore, TestCase): 136 | """Class for testing the DenseStore class""" 137 | 138 | def _test_values(self, store, values): 139 | counter = Counter(values) 140 | 141 | expected_total_count = sum(counter.values()) 142 | assert expected_total_count == 
143 |         if expected_total_count == 0:
144 |             assert all(x == 0 for x in store.bins)
145 |         else:
146 |             assert not all(x == 0 for x in store.bins)
147 | 
148 |         counter = Counter(values)
149 |         for i, sbin in enumerate(store.bins):
150 |             if sbin != 0:
151 |                 assert counter[i + store.offset] == sbin
152 | 
153 |     def _test_store(self, values):
154 |         store = DenseStore()
155 |         for val in values:
156 |             store.add(val)
157 |         self._test_values(store, values)
158 | 
159 |     def _test_merging(self, list_values):
160 |         store = DenseStore()
161 | 
162 |         for values in list_values:
163 |             intermediate_store = DenseStore()
164 |             for val in values:
165 |                 intermediate_store.add(val)
166 |             store.merge(intermediate_store)
167 | 
168 |         flat_values = [v for values in list_values for v in values]
169 |         self._test_values(store, flat_values)
170 | 
171 |     def test_key_at_rank(self):
172 |         """Test that key_at_rank handles integer and fractional ranks"""
173 |         store = DenseStore()
174 |         store.add(4)
175 |         store.add(10)
176 |         store.add(100)
177 |         assert store.key_at_rank(0) == 4
178 |         assert store.key_at_rank(1) == 10
179 |         assert store.key_at_rank(2) == 100
180 |         assert store.key_at_rank(0, lower=False) == 4
181 |         assert store.key_at_rank(1, lower=False) == 10
182 |         assert store.key_at_rank(2, lower=False) == 100
183 |         assert store.key_at_rank(0.5) == 4
184 |         assert store.key_at_rank(1.5) == 10
185 |         assert store.key_at_rank(2.5) == 100
186 |         assert store.key_at_rank(-0.5, lower=False) == 4
187 |         assert store.key_at_rank(0.5, lower=False) == 10
188 |         assert store.key_at_rank(1.5, lower=False) == 100
189 | 
190 |     def test_extreme_values(self):
191 |         """Override. DenseStore is not meant to be used with values that are
192 |         extremely far from one another, as it would allocate an excessively
193 |         large array.
194 |         """
195 | 
196 |     def test_merging_extreme_values(self):
197 |         """Override. DenseStore is not meant to be used with values that are
198 |         extremely far from one another, as it would allocate an excessively
199 |         large array.
200 |         """
201 | 
202 | 
203 | class TestCollapsingLowestDenseStore(BaseTestStore, TestCase):
204 |     """Class for testing the CollapsingLowestDenseStore class"""
205 | 
206 |     def _test_values(self, store, values):
207 |         counter = Counter(values)
208 |         expected_total_count = sum(counter.values())
209 |         assert expected_total_count == sum(store.bins)
210 | 
211 |         if expected_total_count == 0:
212 |             assert all(x == 0 for x in store.bins)
213 |         else:
214 |             assert not all(x == 0 for x in store.bins)
215 | 
216 |             max_index = max(counter)
217 |             min_storable_index = max(float("-inf"), max_index - store.bin_limit + 1)
218 |             counter = Counter([max(x, min_storable_index) for x in values])
219 | 
220 |             for i, sbin in enumerate(store.bins):
221 |                 if sbin != 0:
222 |                     assert counter[i + store.offset] == sbin
223 | 
224 |     def _test_store(self, values):
225 |         for bin_limit in TEST_BIN_LIMIT:
226 |             store = CollapsingLowestDenseStore(bin_limit)
227 |             for val in values:
228 |                 store.add(val)
229 |             self._test_values(store, values)
230 | 
231 |     def _test_merging(self, list_values):
232 |         for bin_limit in TEST_BIN_LIMIT:
233 |             store = CollapsingLowestDenseStore(bin_limit)
234 | 
235 |             for values in list_values:
236 |                 intermediate_store = CollapsingLowestDenseStore(bin_limit)
237 |                 for val in values:
238 |                     intermediate_store.add(val)
239 |                 store.merge(intermediate_store)
240 |             flat_values = [v for values in list_values for v in values]
241 |             self._test_values(store, flat_values)
242 | 
243 | 
244 | class TestCollapsingHighestDenseStore(BaseTestStore, TestCase):
245 |     """Class for testing the CollapsingHighestDenseStore class"""
246 | 
247 |     def _test_values(self, store, values):
248 |         counter = Counter(values)
249 | 
250 |         expected_total_count = sum(counter.values())
251 |         assert expected_total_count == sum(store.bins)
252 |         if expected_total_count == 0:
253 |             assert all(x == 0 for x in store.bins)
254 |         else:
255 |             assert not all(x == 0 for x in store.bins)
256 | 
257 |             min_index = min(counter)
258 |             max_storable_index = min(float("+inf"), min_index + store.bin_limit - 1)
259 |             counter = Counter([min(x, max_storable_index) for x in values])
260 | 
261 |             for i, sbin in enumerate(store.bins):
262 |                 if sbin != 0:
263 |                     assert counter[i + store.offset] == sbin
264 | 
265 |     def _test_store(self, values):
266 |         for bin_limit in TEST_BIN_LIMIT[1:2]:
267 |             store = CollapsingHighestDenseStore(bin_limit)
268 |             for val in values:
269 |                 store.add(val)
270 |             self._test_values(store, values)
271 | 
272 |     def _test_merging(self, list_values):
273 |         for bin_limit in TEST_BIN_LIMIT:
274 |             store = CollapsingHighestDenseStore(bin_limit)
275 | 
276 |             for values in list_values:
277 |                 intermediate_store = CollapsingHighestDenseStore(bin_limit)
278 |                 for val in values:
279 |                     intermediate_store.add(val)
280 |                 store.merge(intermediate_store)
281 |             flat_values = [v for values in list_values for v in values]
282 |             self._test_values(store, flat_values)
283 | 
--------------------------------------------------------------------------------
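
The invariants that _test_values checks in each class above are: every nonzero
entry of store.bins at index i holds the count of key i + store.offset, and
sum(store.bins) equals the number of values added. The minimal standalone
sketch below illustrates them. It is not part of the repository; it relies only
on the DenseStore behavior exercised by these tests, and the sample values are
arbitrary.

    # Illustrative only: mirrors the invariants asserted in tests/test_store.py.
    from collections import Counter

    from ddsketch.store import DenseStore

    values = [4, 10, 100]

    store = DenseStore()
    for val in values:
        store.add(val)

    # Total-count invariant: the bins hold exactly one count per added value.
    assert sum(store.bins) == len(values)

    # Per-bin invariant: bin i counts occurrences of key (i + store.offset).
    counter = Counter(values)
    for i, sbin in enumerate(store.bins):
        if sbin != 0:
            assert counter[i + store.offset] == sbin

    # Ranks are zero-based over the sorted keys; a fractional rank resolves to
    # the key whose cumulative count first exceeds it (see test_key_at_rank).
    assert store.key_at_rank(0) == 4
    assert store.key_at_rank(1.5) == 10

    # Merging another store preserves the total-count invariant.
    other = DenseStore()
    other.add(10)
    store.merge(other)
    assert sum(store.bins) == len(values) + 1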