├── histogrammar ├── notebooks │ ├── __init__.py │ └── histogrammar_tutorial_exercises.ipynb ├── test_data │ ├── __init__.py │ └── test.csv.gz ├── sparksql.py ├── plot │ ├── __init__.py │ └── hist_numpy.py ├── primitives │ ├── __init__.py │ ├── count.py │ ├── sum.py │ ├── select.py │ ├── average.py │ └── fraction.py ├── version.py ├── dfinterface │ ├── __init__.py │ ├── filling_utils.py │ ├── pandas_histogrammar.py │ └── spark_histogrammar.py ├── __init__.py ├── resources.py └── convenience.py ├── MANIFEST.in ├── tests ├── jars │ ├── histogrammar_2.11-1.0.11.jar │ ├── histogrammar_2.11-1.0.20.jar │ ├── histogrammar_2.12-1.0.11.jar │ ├── histogrammar_2.12-1.0.20.jar │ ├── histogrammar_2.12-1.0.30.jar │ ├── histogrammar_2.13-1.0.30.jar │ ├── histogrammar-sparksql_2.11-1.0.11.jar │ ├── histogrammar-sparksql_2.11-1.0.20.jar │ ├── histogrammar-sparksql_2.12-1.0.11.jar │ ├── histogrammar-sparksql_2.12-1.0.20.jar │ ├── histogrammar-sparksql_2.12-1.0.30.jar │ └── histogrammar-sparksql_2.13-1.0.30.jar ├── resources │ ├── gender.json │ ├── isActive.json │ ├── eyesColor.json │ ├── date.json │ ├── latitude.json │ ├── transaction.json │ ├── longitude.json │ ├── age.json │ └── isActive_age.json ├── __init__.py ├── test_notebooks.py ├── conftest.py ├── test_spec.py ├── test_spark_histogrammar.py └── test_pandas_histogrammar.py ├── .pre-commit-config.yaml ├── .github └── workflows │ └── test.yml ├── .gitignore ├── NOTICE ├── makedocs.py ├── pyproject.toml ├── CHANGES.rst ├── docs ├── index.rst ├── make.bat ├── Makefile └── conf.py ├── README.rst └── LICENSE /histogrammar/notebooks/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /histogrammar/test_data/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /MANIFEST.in: 
-------------------------------------------------------------------------------- 1 | include requirements.txt 2 | include LICENSE 3 | include NOTICE -------------------------------------------------------------------------------- /histogrammar/test_data/test.csv.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/histogrammar/histogrammar-python/HEAD/histogrammar/test_data/test.csv.gz -------------------------------------------------------------------------------- /tests/jars/histogrammar_2.11-1.0.11.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/histogrammar/histogrammar-python/HEAD/tests/jars/histogrammar_2.11-1.0.11.jar -------------------------------------------------------------------------------- /tests/jars/histogrammar_2.11-1.0.20.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/histogrammar/histogrammar-python/HEAD/tests/jars/histogrammar_2.11-1.0.20.jar -------------------------------------------------------------------------------- /tests/jars/histogrammar_2.12-1.0.11.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/histogrammar/histogrammar-python/HEAD/tests/jars/histogrammar_2.12-1.0.11.jar -------------------------------------------------------------------------------- /tests/jars/histogrammar_2.12-1.0.20.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/histogrammar/histogrammar-python/HEAD/tests/jars/histogrammar_2.12-1.0.20.jar -------------------------------------------------------------------------------- /tests/jars/histogrammar_2.12-1.0.30.jar: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/histogrammar/histogrammar-python/HEAD/tests/jars/histogrammar_2.12-1.0.30.jar -------------------------------------------------------------------------------- /tests/jars/histogrammar_2.13-1.0.30.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/histogrammar/histogrammar-python/HEAD/tests/jars/histogrammar_2.13-1.0.30.jar -------------------------------------------------------------------------------- /tests/jars/histogrammar-sparksql_2.11-1.0.11.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/histogrammar/histogrammar-python/HEAD/tests/jars/histogrammar-sparksql_2.11-1.0.11.jar -------------------------------------------------------------------------------- /tests/jars/histogrammar-sparksql_2.11-1.0.20.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/histogrammar/histogrammar-python/HEAD/tests/jars/histogrammar-sparksql_2.11-1.0.20.jar -------------------------------------------------------------------------------- /tests/jars/histogrammar-sparksql_2.12-1.0.11.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/histogrammar/histogrammar-python/HEAD/tests/jars/histogrammar-sparksql_2.12-1.0.11.jar -------------------------------------------------------------------------------- /tests/jars/histogrammar-sparksql_2.12-1.0.20.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/histogrammar/histogrammar-python/HEAD/tests/jars/histogrammar-sparksql_2.12-1.0.20.jar -------------------------------------------------------------------------------- /tests/jars/histogrammar-sparksql_2.12-1.0.30.jar: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/histogrammar/histogrammar-python/HEAD/tests/jars/histogrammar-sparksql_2.12-1.0.30.jar -------------------------------------------------------------------------------- /tests/jars/histogrammar-sparksql_2.13-1.0.30.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/histogrammar/histogrammar-python/HEAD/tests/jars/histogrammar-sparksql_2.13-1.0.30.jar -------------------------------------------------------------------------------- /tests/resources/gender.json: -------------------------------------------------------------------------------- 1 | {"data": {"bins": {"female": 191.0, "male": 209.0}, 2 | "bins:type": "Count", 3 | "entries": 400.0}, 4 | "type": "Categorize", 5 | "version": "1.1"} -------------------------------------------------------------------------------- /tests/resources/isActive.json: -------------------------------------------------------------------------------- 1 | {"data": {"bins": {"False": 208.0, "True": 192.0}, 2 | "bins:type": "Count", 3 | "entries": 400.0}, 4 | "type": "Categorize", 5 | "version": "1.1"} -------------------------------------------------------------------------------- /histogrammar/sparksql.py: -------------------------------------------------------------------------------- 1 | # flake8: noqa 2 | 3 | # MB 20210131: moved to histogrammar/dfinterface/sparksql.py. 
Imported here (for now) for bkw compatibility 4 | from .dfinterface.addmethods import add_sparksql_methods as addMethods 5 | -------------------------------------------------------------------------------- /.pre-commit-config.yaml: -------------------------------------------------------------------------------- 1 | repos: 2 | - repo: https://github.com/astral-sh/ruff-pre-commit 3 | rev: v0.1.6 4 | hooks: 5 | - id: ruff 6 | args: [--fix] 7 | exclude: notebooks/ 8 | - id: ruff-format 9 | exclude: notebooks/ 10 | -------------------------------------------------------------------------------- /tests/resources/eyesColor.json: -------------------------------------------------------------------------------- 1 | {"data": {"bins": {"blue": 77.0, 2 | "brown": 71.0, 3 | "green": 82.0, 4 | "grey": 76.0, 5 | "red": 94.0}, 6 | "bins:type": "Count", 7 | "entries": 400.0}, 8 | "type": "Categorize", 9 | "version": "1.1"} -------------------------------------------------------------------------------- /tests/__init__.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | # Copyright 2016 DIANA-HEP 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 
16 | -------------------------------------------------------------------------------- /histogrammar/plot/__init__.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | # Copyright 2016 DIANA-HEP 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | -------------------------------------------------------------------------------- /histogrammar/primitives/__init__.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | # Copyright 2016 DIANA-HEP 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 
"""THIS FILE IS AUTO-GENERATED BY SETUP.PY."""

import re

version = "1.1.2"


def split_version_string(version_string: str) -> tuple[int, int]:
    """Return the (major, minor) components of a version string like "1.1.2".

    :param str version_string: dotted (optionally dashed) version string.
    :returns: the first two numeric components as a (major, minor) tuple.
    :raises ValueError: if a component is not an integer.
    """
    version_numbers = list(map(int, re.split(r"[-.]", version_string)))
    return version_numbers[0], version_numbers[1]


# Two-component "major.minor" specification string derived from `version`.
specification = ".".join(str(i) for i in split_version_string(version)[:2])


def compatible(serialized_version: str) -> bool:
    """Return True if data serialized with `serialized_version` is readable.

    Bug fix: the previous check, ``self_major >= other_major or
    self_minor >= other_minor``, accepted almost any serialized version —
    e.g. running 1.1 claimed compatibility with serialized 2.0, because the
    minor comparison (1 >= 0) short-circuited the failed major comparison.
    Compatibility now requires the running major version to be newer than
    the serialized one, or equal with a minor version at least as new.

    :param str serialized_version: version string found in serialized data.
    :returns: whether this library version can read that data.
    :rtype: bool
    """
    self_major, self_minor = split_version_string(version)
    other_major, other_minor = split_version_string(serialized_version)

    if self_major != other_major:
        # A strictly newer major version of the library can read older data;
        # older library code cannot read data from a newer major version.
        return self_major > other_major
    # Same major: readable only if the data's minor version is not newer.
    return self_minor >= other_minor
"/metadata/language_info", 13 | "/cells/*/execution_count", 14 | "/cells/*/outputs/*", 15 | ), 16 | exec_timeout=1800, 17 | ) 18 | 19 | 20 | def test_notebook_basic(nb_tester): 21 | nb_tester.check(notebook("histogrammar_tutorial_basic.ipynb")) 22 | 23 | 24 | def test_notebook_advanced(nb_tester): 25 | nb_tester.check(notebook("histogrammar_tutorial_advanced.ipynb")) 26 | 27 | 28 | def test_notebook_exercises(nb_tester): 29 | nb_tester.check(notebook("histogrammar_tutorial_exercises.ipynb")) 30 | -------------------------------------------------------------------------------- /tests/resources/latitude.json: -------------------------------------------------------------------------------- 1 | {"data": {"binWidth": 5, 2 | "bins": {"-1": 15.0, 3 | "-10": 10.0, 4 | "-11": 11.0, 5 | "-12": 12.0, 6 | "-13": 23.0, 7 | "-14": 5.0, 8 | "-15": 12.0, 9 | "-16": 8.0, 10 | "-17": 12.0, 11 | "-18": 5.0, 12 | "-2": 8.0, 13 | "-3": 8.0, 14 | "-4": 12.0, 15 | "-5": 9.0, 16 | "-6": 7.0, 17 | "-7": 10.0, 18 | "-8": 13.0, 19 | "-9": 13.0, 20 | "0": 6.0, 21 | "1": 11.0, 22 | "10": 13.0, 23 | "11": 10.0, 24 | "12": 12.0, 25 | "13": 13.0, 26 | "14": 12.0, 27 | "15": 9.0, 28 | "16": 13.0, 29 | "17": 8.0, 30 | "2": 18.0, 31 | "3": 13.0, 32 | "4": 9.0, 33 | "5": 13.0, 34 | "6": 7.0, 35 | "7": 10.0, 36 | "8": 18.0, 37 | "9": 12.0}, 38 | "bins:type": "Count", 39 | "entries": 400.0, 40 | "nanflow": 0.0, 41 | "nanflow:type": "Count", 42 | "origin": 0}, 43 | "type": "SparselyBin", 44 | "version": "1.1"} -------------------------------------------------------------------------------- /.github/workflows/test.yml: -------------------------------------------------------------------------------- 1 | name: test 2 | 3 | on: 4 | push: 5 | branches: [ master, develop ] 6 | pull_request: 7 | 8 | jobs: 9 | test: 10 | strategy: 11 | matrix: 12 | os: [ ubuntu-latest ] 13 | python: [ "3.9", "3.10", "3.11", "3.12" ] 14 | numpy_version: [ "numpy-latest", "numpy<2" ] 15 | runs-on: ${{ matrix.os }} 16 | 17 | 
steps: 18 | - name: Checkout code 19 | uses: actions/checkout@v2 20 | 21 | - name: Set up Python ${{ matrix.python }} 22 | uses: actions/setup-python@v1 23 | with: 24 | python-version: ${{ matrix.python }} 25 | 26 | - name: Use cache for pip dependencies 27 | uses: actions/cache@v3 28 | with: 29 | path: ~/.cache/pip 30 | key: ${{ runner.os }}-pip-${{ hashFiles('**/pyproject.toml') }} 31 | restore-keys: | 32 | ${{ runner.os }}-pip- 33 | 34 | - name: Install dependencies 35 | run: | 36 | python -m pip install --upgrade pip 37 | if [ "${{ matrix.numpy_version }}" = "numpy<2" ]; then 38 | pip install ".[test,pandas,test_spark_pre2,test_numpy_pre2]" 39 | else 40 | pip install ".[test,pandas,spark]" 41 | fi 42 | 43 | - name: Lint with pre-commit 44 | run: | 45 | pip install pre-commit 46 | pre-commit run --all-files --show-diff-on-failure 47 | 48 | - name: Test with pytest 49 | run: | 50 | pytest tests 51 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Emacs 2 | *~ 3 | \#*\# 4 | 5 | # Generated by test 6 | plot_*.html 7 | 8 | # Byte-compiled / optimized / DLL files 9 | __pycache__/ 10 | *.py[cod] 11 | *$py.class 12 | 13 | # C extensions 14 | *.so 15 | 16 | # Distribution / packaging 17 | .Python 18 | env/ 19 | build/ 20 | develop-eggs/ 21 | dist/ 22 | downloads/ 23 | eggs/ 24 | .eggs/ 25 | lib/ 26 | lib64/ 27 | parts/ 28 | sdist/ 29 | var/ 30 | *.egg-info/ 31 | .installed.cfg 32 | *.egg 33 | 34 | # PyInstaller 35 | # Usually these files are written by a python script from a template 36 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
37 | *.manifest 38 | *.spec 39 | 40 | # Installer logs 41 | pip-log.txt 42 | pip-delete-this-directory.txt 43 | 44 | # Unit test / coverage reports 45 | htmlcov/ 46 | .tox/ 47 | .coverage 48 | .coverage.* 49 | .cache 50 | nosetests.xml 51 | coverage.xml 52 | *,cover 53 | .hypothesis/ 54 | 55 | # Translations 56 | *.mo 57 | *.pot 58 | 59 | # Django stuff: 60 | *.log 61 | local_settings.py 62 | 63 | # Flask stuff: 64 | instance/ 65 | .webassets-cache 66 | 67 | # Scrapy stuff: 68 | .scrapy 69 | 70 | # Sphinx documentation 71 | docs/_build/ 72 | histogrammar.*.rst 73 | 74 | # PyBuilder 75 | target/ 76 | 77 | # IPython Notebook 78 | .ipynb_checkpoints 79 | 80 | # pyenv 81 | .python-version 82 | 83 | # celery beat schedule file 84 | celerybeat-schedule 85 | 86 | # dotenv 87 | .env 88 | 89 | # virtualenv 90 | venv/ 91 | ENV/ 92 | 93 | # Spyder project settings 94 | .spyderproject 95 | 96 | # Rope project settings 97 | .ropeproject 98 | 99 | # tests output files 100 | histogrammar/notebooks/*.json 101 | -------------------------------------------------------------------------------- /histogrammar/dfinterface/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2021 ING Wholesale Banking Advanced Analytics 2 | # 3 | # Permission is hereby granted, free of charge, to any person obtaining a copy of 4 | # this software and associated documentation files (the "Software"), to deal in 5 | # the Software without restriction, including without limitation the rights to 6 | # use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of 7 | # the Software, and to permit persons to whom the Software is furnished to do so, 8 | # subject to the following conditions: 9 | # 10 | # The above copyright notice and this permission notice shall be included in all 11 | # copies or substantial portions of the Software. 
12 | # 13 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 14 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS 15 | # FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR 16 | # COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER 17 | # IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN 18 | # CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 19 | 20 | from .addmethods import add_pandas_methods, add_sparksql_methods 21 | 22 | try: 23 | from pyspark.sql import DataFrame as sdf 24 | 25 | # add function to create histogrammar histograms 26 | add_sparksql_methods(cls=sdf, prefix="hg_") 27 | except (ModuleNotFoundError, AttributeError): 28 | pass 29 | 30 | try: 31 | from pandas import DataFrame as pdf 32 | 33 | # add function to create histogrammar histograms 34 | add_pandas_methods(cls=pdf, prefix="hg_") 35 | except (ModuleNotFoundError, AttributeError): 36 | pass 37 | -------------------------------------------------------------------------------- /histogrammar/__init__.py: -------------------------------------------------------------------------------- 1 | # flake8: noqa 2 | 3 | #!/usr/bin/env python 4 | 5 | # Copyright 2016 DIANA-HEP 6 | # 7 | # Licensed under the Apache License, Version 2.0 (the "License"); 8 | # you may not use this file except in compliance with the License. 9 | # You may obtain a copy of the License at 10 | # 11 | # http://www.apache.org/licenses/LICENSE-2.0 12 | # 13 | # Unless required by applicable law or agreed to in writing, software 14 | # distributed under the License is distributed on an "AS IS" BASIS, 15 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 16 | # See the License for the specific language governing permissions and 17 | # limitations under the License. 
18 | 19 | # handy monkey patch functions for pandas and spark dataframes 20 | import histogrammar.dfinterface 21 | from histogrammar.convenience import ( 22 | Histogram, 23 | Profile, 24 | ProfileErr, 25 | SparselyHistogram, 26 | SparselyProfile, 27 | SparselyProfileErr, 28 | TwoDimensionallyHistogram, 29 | TwoDimensionallySparselyHistogram, 30 | ) 31 | from histogrammar.defs import Container, Factory 32 | from histogrammar.primitives.average import Average 33 | from histogrammar.primitives.bag import Bag 34 | from histogrammar.primitives.bin import Bin 35 | from histogrammar.primitives.categorize import Categorize 36 | from histogrammar.primitives.centrallybin import CentrallyBin 37 | from histogrammar.primitives.collection import ( 38 | Branch, 39 | Collection, 40 | Index, 41 | Label, 42 | UntypedLabel, 43 | ) 44 | from histogrammar.primitives.count import Count 45 | from histogrammar.primitives.deviate import Deviate 46 | from histogrammar.primitives.fraction import Fraction 47 | from histogrammar.primitives.irregularlybin import IrregularlyBin 48 | from histogrammar.primitives.minmax import Maximize, Minimize 49 | from histogrammar.primitives.select import Select 50 | from histogrammar.primitives.sparselybin import SparselyBin 51 | from histogrammar.primitives.stack import Stack 52 | from histogrammar.primitives.sum import Sum 53 | -------------------------------------------------------------------------------- /NOTICE: -------------------------------------------------------------------------------- 1 | ################################################################################################ 2 | # 3 | # NOTICE: pass-through licensing of bundled components 4 | # 5 | # Histogrammar gathers together a toolkit of pre-existing third-party 6 | # open-source software components. These software components are governed by their own licenses 7 | # which Histogrammar does not modify or supersede, please consult the originating 8 | # authors. 
These components altogether have a mixture of the following licenses: Apache 2.0, MIT. 9 | # 10 | # Although we have examined the licenses to verify acceptance of commercial and non-commercial 11 | # use, please see and consult the original licenses or authors. 12 | # 13 | # Here is the full list of license dependencies: 14 | # 15 | # numpy: https://github.com/numpy/numpy/blob/master/LICENSE.txt 16 | # tqdm: https://github.com/tqdm/tqdm/blob/master/LICENCE 17 | # matplotlib: https://github.com/matplotlib/matplotlib/blob/master/LICENSE/LICENSE 18 | # joblib: https://github.com/joblib/joblib/blob/master/LICENSE.txt 19 | # root: https://root.cern.ch/license 20 | # popmon: https://github.com/ing-bank/popmon/blob/master/LICENSE 21 | # 22 | # There are several functions/classes where code or techniques have been reproduced and/or modified 23 | # from existing open-source packages. We list these here: 24 | # 25 | # Package: popmon 26 | # popmon file: histogrammar/dfinterface/spark_histogrammar.py 27 | # Class: SparkHistogrammar 28 | # Reference: https://github.com/ing-bank/popmon/blob/master/popmon/hist/filling/spark_histogrammar.py 29 | # popmon file: histogrammar/dfinterface/pandas_histogrammar.py 30 | # Class: PandasHistogrammar 31 | # Reference: https://github.com/ing-bank/popmon/blob/master/popmon/hist/filling/pandas_histogrammar.py 32 | # popmon file: histogrammar/dfinterface/histogram_filler_base.py 33 | # Class: HistogramFillerBase 34 | # Reference: https://github.com/ing-bank/popmon/blob/master/popmon/hist/filling/histogram_filler_base.py 35 | # License: MIT 36 | # For details see: https://github.com/ing-bank/popmon/blob/master/LICENSE 37 | # 38 | ################################################################################################ 39 | -------------------------------------------------------------------------------- /tests/resources/transaction.json: -------------------------------------------------------------------------------- 1 | { 2 | "type": 
"Bin", 3 | "data": { 4 | "low": -2000.0, 5 | "high": 2000.0, 6 | "entries": 400.0, 7 | "values:type": "Count", 8 | "values": [ 9 | 0.0, 10 | 0.0, 11 | 0.0, 12 | 0.0, 13 | 0.0, 14 | 0.0, 15 | 0.0, 16 | 0.0, 17 | 0.0, 18 | 0.0, 19 | 0.0, 20 | 0.0, 21 | 0.0, 22 | 0.0, 23 | 0.0, 24 | 0.0, 25 | 0.0, 26 | 0.0, 27 | 0.0, 28 | 0.0, 29 | 0.0, 30 | 0.0, 31 | 0.0, 32 | 0.0, 33 | 0.0, 34 | 6.0, 35 | 5.0, 36 | 11.0, 37 | 8.0, 38 | 5.0, 39 | 9.0, 40 | 4.0, 41 | 5.0, 42 | 5.0, 43 | 9.0, 44 | 6.0, 45 | 3.0, 46 | 9.0, 47 | 6.0, 48 | 5.0, 49 | 9.0, 50 | 7.0, 51 | 7.0, 52 | 8.0, 53 | 8.0, 54 | 6.0, 55 | 5.0, 56 | 11.0, 57 | 4.0, 58 | 12.0, 59 | 5.0, 60 | 4.0, 61 | 6.0, 62 | 10.0, 63 | 3.0, 64 | 4.0, 65 | 5.0, 66 | 8.0, 67 | 0.0, 68 | 7.0, 69 | 6.0, 70 | 5.0, 71 | 7.0, 72 | 9.0, 73 | 7.0, 74 | 5.0, 75 | 6.0, 76 | 6.0, 77 | 7.0, 78 | 2.0, 79 | 3.0, 80 | 6.0, 81 | 3.0, 82 | 8.0, 83 | 7.0, 84 | 3.0, 85 | 8.0, 86 | 8.0, 87 | 8.0, 88 | 7.0, 89 | 7.0, 90 | 8.0, 91 | 6.0, 92 | 9.0, 93 | 7.0, 94 | 8.0, 95 | 7.0, 96 | 2.0, 97 | 0.0, 98 | 0.0, 99 | 0.0, 100 | 0.0, 101 | 0.0, 102 | 0.0, 103 | 0.0, 104 | 0.0, 105 | 0.0, 106 | 0.0, 107 | 0.0, 108 | 0.0 109 | ], 110 | "underflow:type": "Count", 111 | "underflow": 0.0, 112 | "overflow:type": "Count", 113 | "overflow": 0.0, 114 | "nanflow:type": "Count", 115 | "nanflow": 0.0 116 | }, 117 | "version": "1.1" 118 | } -------------------------------------------------------------------------------- /tests/resources/longitude.json: -------------------------------------------------------------------------------- 1 | 2 | {"data": {"binWidth": 5, 3 | "bins": {"-1": 7.0, 4 | "-10": 9.0, 5 | "-11": 6.0, 6 | "-12": 4.0, 7 | "-13": 9.0, 8 | "-14": 5.0, 9 | "-15": 9.0, 10 | "-16": 5.0, 11 | "-17": 4.0, 12 | "-18": 7.0, 13 | "-19": 11.0, 14 | "-2": 3.0, 15 | "-20": 6.0, 16 | "-21": 3.0, 17 | "-22": 5.0, 18 | "-23": 6.0, 19 | "-24": 6.0, 20 | "-25": 5.0, 21 | "-26": 11.0, 22 | "-27": 5.0, 23 | "-28": 4.0, 24 | "-29": 4.0, 25 | "-3": 7.0, 26 | "-30": 7.0, 27 | "-31": 
4.0, 28 | "-32": 2.0, 29 | "-33": 4.0, 30 | "-34": 8.0, 31 | "-35": 2.0, 32 | "-36": 2.0, 33 | "-4": 5.0, 34 | "-5": 6.0, 35 | "-6": 3.0, 36 | "-7": 7.0, 37 | "-8": 2.0, 38 | "-9": 5.0, 39 | "0": 5.0, 40 | "1": 12.0, 41 | "10": 6.0, 42 | "11": 11.0, 43 | "12": 3.0, 44 | "13": 4.0, 45 | "14": 7.0, 46 | "15": 4.0, 47 | "16": 5.0, 48 | "17": 6.0, 49 | "18": 5.0, 50 | "19": 3.0, 51 | "2": 4.0, 52 | "20": 9.0, 53 | "21": 7.0, 54 | "22": 7.0, 55 | "23": 5.0, 56 | "24": 5.0, 57 | "25": 3.0, 58 | "26": 4.0, 59 | "27": 4.0, 60 | "28": 5.0, 61 | "29": 5.0, 62 | "3": 6.0, 63 | "30": 5.0, 64 | "31": 9.0, 65 | "32": 6.0, 66 | "33": 4.0, 67 | "34": 3.0, 68 | "35": 4.0, 69 | "4": 13.0, 70 | "5": 5.0, 71 | "6": 6.0, 72 | "7": 6.0, 73 | "8": 3.0, 74 | "9": 3.0}, 75 | "bins:type": "Count", 76 | "entries": 400.0, 77 | "nanflow": 0.0, 78 | "nanflow:type": "Count", 79 | "origin": 0}, 80 | "type": "SparselyBin", 81 | "version": "1.1"} -------------------------------------------------------------------------------- /makedocs.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | # Copyright 2016 DIANA-HEP 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 
import importlib
import inspect
from pathlib import Path

# Modules whose public callables get an auto-generated .rst stub under docs/.
modules = [
    "histogrammar.defs",
    "histogrammar.specialized",
    "histogrammar.util",
    "histogrammar.version",
    "histogrammar.primitives.average",
    "histogrammar.primitives.bag",
    "histogrammar.primitives.bin",
    "histogrammar.primitives.categorize",
    "histogrammar.primitives.centrallybin",
    "histogrammar.primitives.collection",
    "histogrammar.primitives.count",
    "histogrammar.primitives.deviate",
    "histogrammar.primitives.fraction",
    "histogrammar.primitives.irregularlybin",
    "histogrammar.primitives.minmax",
    "histogrammar.primitives.select",
    "histogrammar.primitives.sparselybin",
    "histogrammar.primitives.stack",
    "histogrammar.primitives.sum",
    "histogrammar.plot.bokeh",
    "histogrammar.plot.root",
]

modules = {name: importlib.import_module(name) for name in modules}

documented = []
for module_name, module in modules.items():
    for obj_name in dir(module):
        obj = getattr(module, obj_name)
        # Only document public callables that are defined in this very module
        # (skip names re-exported from elsewhere).
        if obj_name.startswith("_") or not callable(obj) or obj.__module__ != module_name:
            continue
        print(obj_name, obj)
        qualified = f"{module_name}.{obj_name}"
        documented.append(qualified)
        underline = "=" * len(qualified)
        target = Path(f"docs/{qualified}.rst")
        # NOTE(review): directive-option indentation below was reconstructed
        # from a whitespace-collapsed source — confirm against a generated file.
        if inspect.isclass(obj):
            target.write_text(
                f""":orphan:

{qualified}
{underline}

.. autoclass:: {qualified}
   :members:
   :special-members: __init__, __add__
   :inherited-members:
   :show-inheritance:
"""
            )
        else:
            target.write_text(
                f""":orphan:

{qualified}
{underline}

.. autofunction:: {qualified}
"""
            )
def _resource(resource_type, name: str) -> str:
    """Look up a named resource and return its absolute path as a string.

    :param str resource_type: The type of the resource ("data" or "notebook").
    :param str name: The name of the resource.
    :returns: The full path filename of the resource.
    :rtype: str
    :raises FileNotFoundError: If the resource cannot be found.
    """
    candidate = _RESOURCES[resource_type].get(name)
    if candidate is not None and candidate.exists():
        return str(candidate)
    raise FileNotFoundError(f'Could not find {resource_type} "{name!s}"! Does it exist?')


def data(name: str) -> str:
    """Look up a data file shipped with histogrammar.

    :param str name: The name of the data.
    :returns: The full path filename of the data.
    :rtype: str
    :raises FileNotFoundError: If the data cannot be found.
    """
    return _resource("data", name)


def notebook(name: str) -> str:
    """Look up a tutorial notebook shipped with histogrammar.

    :param str name: The name of the notebook.
    :returns: The full path filename of the notebook.
    :rtype: str
    :raises FileNotFoundError: If the notebook cannot be found.
    """
    return _resource("notebook", name)
def pytest_configure():
    """Attach shared fixture data to the pytest namespace before tests run.

    Loads the comparer DataFrames, the reference histograms stored as JSON under
    tests/resources/, and the shipped test CSV, exposing each as an attribute on
    the ``pytest`` module (e.g. ``pytest.age``, ``pytest.test_df``).
    """
    # attach common test data
    pytest.test_comparer_df = get_comparer_data()
    pytest.test_ref_comparer_df = get_ref_comparer_data()

    template_path = Path(__file__).parent / "resources"
    csv_file = "test.csv.gz"

    # Reference histograms: each <name>.json becomes a pytest.<name> attribute.
    # NOTE(review): "company" and "latitude_longitude" do not appear in the
    # tests/resources directory listing — confirm those files exist, otherwise
    # this hook raises FileNotFoundError before any test runs.
    histogram_names = [
        "age",
        "company",
        "date",
        "eyesColor",
        "gender",
        "isActive",
        "isActive_age",
        "latitude",
        "longitude",
        "latitude_longitude",
        "transaction",
    ]
    for hist_name in histogram_names:
        with (template_path / f"{hist_name}.json").open() as f:
            setattr(pytest, hist_name, load(f))

    df = pd.read_csv(resources.data(csv_file))
    df["date"] = pd.to_datetime(df["date"])

    # "balance" is a formatted currency string (e.g. "$1,234.56"); strip the
    # formatting and parse as Decimal so tests cover the Decimal datatype.
    df["amount"] = df["balance"].str.replace("$", "", regex=False).str.replace(",", "", regex=False).apply(Decimal)

    pytest.test_df = df
# PEP 621: the [project] license table accepts only a "file" or "text" key; "type" is invalid
license = { file = "LICENSE" }
Version 1.1.1, Aug 2025
-----------------------
* Compatibility with numpy v2.3: converting np.number to a dtype no longer allowed.
8 | 9 | Version 1.1.0, Dec 2024 10 | ----------------------- 11 | * Removed all ROOT, cpp and cuda code, it was no longer supported. 12 | 13 | Version 1.0.34, Dec 2024 14 | ------------------------ 15 | * Fix typo in build pipeline Python versions config list. 16 | * Fix error in SparselyBin __eq__ method. 17 | * Fix test utility corner case error (test_numpy.twosigfigs function). 18 | * Fix error in test context manager for pandas which prevented execution of tests. 19 | * Fix error in expected bin count in test_numpy.test_n_bins test. 20 | * Prevent logging zero execution time TestNumpy class. 21 | 22 | * Remove Python 3.8 environment from build pipeline. 23 | * Support numpy >= 2.0.0 (np.string_ -> np.bytes_, np.unicode_ -> np.str_). 24 | * Remove uses of pd.util.testing.makeMixedDataFrame not available in pandas >= 2.0.0. 25 | * Switch from 'pkg_resources' to 'importlib' module for resolving package files. 26 | * Switch from 'distutils.spawn' to 'shutil.which' for finding nvcc command. 27 | 28 | * Remove unused test_gpu.twosigfigs function. 29 | * Refactor tests with Numpy() and Pandas() context managers to use single 'with' statement. 30 | 31 | * Switch from setup.py to pyproject.toml 32 | * Add numpy<2,pandas<2 test environment to build pipeline test matrix 33 | 34 | Version 1.0.33, Dec 2022 35 | ------------------------ 36 | * fix of get_sub_hist() when Bin histogram is filled only with nans. 37 | 38 | Version 1.0.32, Sep 2022 39 | ------------------------ 40 | * Support for decimal datetype in pandas and spark. 41 | 42 | Version 1.0.31, Aug 2022 43 | ------------------------ 44 | * fix of spark df timestamp datatype detection (#59) 45 | * fix for invalid bin_edges for SparselyBin histogram (#60) 46 | 47 | Version 1.0.30, June 2022 48 | ------------------------- 49 | * Fix for machine-level rounding error, which can show up on in num_bins() call of Bin histogram. 
50 | * supersedes broken v1.0.29 51 | 52 | Version 1.0.28, June 2022 53 | ------------------------- 54 | * Multiple performance updates, to Bin, SparselyBin and Categorize histograms. 55 | * SparselyBin, Categorize: optimized filling with 1-d and 2-d numpy arrays 56 | * Bin, SparselyBin, Categorize: (fast) numpy arrays for bin-centers and bin-labels. 57 | * Count: new, fast filling option when float weight is known. 58 | * util.py: faster get_datatype() and get_ndim() functions. 59 | 60 | Version 1.0.27, May 2022 61 | ------------------------ 62 | * Multiple performance updates, thanks to Simon Brugman. 63 | * Use pandas functions to infer datatypes and return numpy arrays. 64 | * Turn of unnecessary specialize function (slow) for Count objects. 65 | 66 | Version 1.0.26, Apr 2022 67 | ------------------------ 68 | * Added tutorial notebook with exercises. 69 | * Fixed 2d heatmap for categorical histograms, where one column was accidentally dropped. 70 | 71 | Version 1.0.25, Apr 2021 72 | ------------------------ 73 | * Improve null handling in pandas dataframes, by inferring datatype using pandas' infer_dtype function. 74 | * nans in bool columns get converted to "NaN", so the column keeps True and False values in Categorize. 75 | * columns of type object get converted to strings using to_string(), of type string uses only_str(). 76 | 77 | Version 1.0.24, Apr 2021 78 | ------------------------ 79 | * Categorize histogram now handles nones and nans in friendlier way, they are converted to "NaN". 80 | * make_histogram() now casts spark nulls to nan in case of numeric columns. scala interprets null as 0. 81 | * SparselyBin histograms did not add up nanflow when added. Now fixed. 
82 | * Added unit test for doing checks on null conversion to nans 83 | * Use new histogrammar-scala jar files, v1.0.20 84 | * Added histogrammar-scala v1.0.20 jar files to tests/jars/ 85 | -------------------------------------------------------------------------------- /docs/index.rst: -------------------------------------------------------------------------------- 1 | Histogrammar |version| for Python 2 | ================================= 3 | 4 | All aggregation primitives descend from two classes, :doc:`Container ` and :doc:`Factory `. Container defines all the methods for the primitive to aggregate and contain data, while Factory has methods for making containers. (In other languages, the two roles are distinct.) 5 | 6 | The "functions" passed to these primitives may be Python lambda functions, normally defined functions (with ``def``), or strings, which may be interpreted different ways by different back-ends. All primitives immediately wrap your functions as :doc:`UserFcn `, which are serializable (with ``pickle``), may be cached (:doc:`CachedFcn `), and may have a name. Although the primitives wrap your function automatically, you may do it yourself to add features, like caching or a name. See :doc:`serializable `, :doc:`cached `, and :doc:`named `. 7 | 8 | The primitive classes are listed below, grouped by kind. See the index for a list of all classes, members, and functions. 9 | 10 | Zeroth kind: depend only on weights 11 | ----------------------------------- 12 | 13 | :doc:`Count `: sum of weights 14 | Count entries by accumulating the sum of all observed weights or a sum of transformed weights (e.g. sum of squares of weights). 15 | 16 | First kind: aggregate a data without sub-aggregators 17 | ---------------------------------------------------- 18 | 19 | :doc:`Sum `: sum of a given quantity 20 | Accumulate the (weighted) sum of a given quantity, calculated from the data. 
21 | 22 | :doc:`Average `: mean of a quantity 23 | Accumulate the weighted mean of a given quantity. 24 | 25 | :doc:`Deviate `: mean and variance 26 | Accumulate the weighted mean and weighted variance of a given quantity. 27 | 28 | :doc:`Minimize `: minimum value 29 | Find the minimum value of a given quantity. If no data are observed, the result is NaN. 30 | 31 | :doc:`Maximize `: maximum value 32 | Find the maximum value of a given quantity. If no data are observed, the result is NaN. 33 | 34 | :doc:`Bag `: accumulate values for scatter plots 35 | Accumulate raw numbers, vectors of numbers, or strings, with identical values merged. 36 | 37 | Second kind: pass to different sub-aggregators based on values seen in data 38 | --------------------------------------------------------------------------- 39 | 40 | :doc:`Bin `: regular binning for histograms 41 | Split a quantity into equally spaced bins between a low and high threshold and fill exactly one bin per datum. 42 | 43 | :doc:`SparselyBin `: ignore zeros 44 | Split a quantity into equally spaced bins, creating them whenever their entries would be non-zero. Exactly one sub-aggregator is filled per datum. 45 | 46 | :doc:`CentrallyBin `: irregular but fully partitioning 47 | Split a quantity into bins defined by irregularly spaced bin centers, with exactly one sub-aggregator filled per datum (the closest one). 48 | 49 | :doc:`IrregularlyBin `: exclusive filling 50 | Accumulate a suite of aggregators, each between two thresholds, filling exactly one per datum. 51 | 52 | :doc:`Categorize `: string-valued bins, bar charts 53 | Split a given quantity by its categorical value and fill only one category per datum. 54 | 55 | :doc:`Fraction `: efficiency plots 56 | Accumulate two aggregators, one containing only entries that pass a given selection (numerator) and another that contains all entries (denominator). 
57 | 58 | :doc:`Stack `: cumulative filling 59 | Accumulates a suite of aggregators, each filtered with a tighter selection on the same quantity. 60 | 61 | :doc:`Select `: apply a cut 62 | Filter or weight data according to a given selection. 63 | 64 | Third kind: broadcast to every sub-aggregator, independent of data 65 | ------------------------------------------------------------------ 66 | 67 | :doc:`Label `: directory with string-based keys 68 | Accumulate any number of aggregators of the same type and label them with strings. Every sub-aggregator is filled with every input datum. 69 | 70 | :doc:`UntypedLabel `: directory of different types 71 | Accumulate any number of aggregators of any type and label them with strings. Every sub-aggregator is filled with every input datum. 72 | 73 | :doc:`Index `: list with integer keys 74 | Accumulate any number of aggregators of the same type in a list. Every sub-aggregator is filled with every input datum. 75 | 76 | :doc:`Branch `: tuple of different types 77 | Accumulate aggregators of different types, indexed by i0 through i9. Every sub-aggregator is filled with every input datum. 78 | -------------------------------------------------------------------------------- /docs/make.bat: -------------------------------------------------------------------------------- 1 | @ECHO OFF 2 | 3 | REM Command file for Sphinx documentation 4 | 5 | if "%SPHINXBUILD%" == "" ( 6 | set SPHINXBUILD=sphinx-build 7 | ) 8 | set BUILDDIR=_build 9 | set ALLSPHINXOPTS=-d %BUILDDIR%/doctrees %SPHINXOPTS% . 10 | set I18NSPHINXOPTS=%SPHINXOPTS% . 11 | if NOT "%PAPER%" == "" ( 12 | set ALLSPHINXOPTS=-D latex_paper_size=%PAPER% %ALLSPHINXOPTS% 13 | set I18NSPHINXOPTS=-D latex_paper_size=%PAPER% %I18NSPHINXOPTS% 14 | ) 15 | 16 | if "%1" == "" goto help 17 | 18 | if "%1" == "help" ( 19 | :help 20 | echo.Please use `make ^` where ^ is one of 21 | echo. html to make standalone HTML files 22 | echo. 
dirhtml to make HTML files named index.html in directories 23 | echo. singlehtml to make a single large HTML file 24 | echo. pickle to make pickle files 25 | echo. json to make JSON files 26 | echo. htmlhelp to make HTML files and a HTML help project 27 | echo. qthelp to make HTML files and a qthelp project 28 | echo. devhelp to make HTML files and a Devhelp project 29 | echo. epub to make an epub 30 | echo. latex to make LaTeX files, you can set PAPER=a4 or PAPER=letter 31 | echo. text to make text files 32 | echo. man to make manual pages 33 | echo. texinfo to make Texinfo files 34 | echo. gettext to make PO message catalogs 35 | echo. changes to make an overview over all changed/added/deprecated items 36 | echo. linkcheck to check all external links for integrity 37 | echo. doctest to run all doctests embedded in the documentation if enabled 38 | goto end 39 | ) 40 | 41 | if "%1" == "clean" ( 42 | for /d %%i in (%BUILDDIR%\*) do rmdir /q /s %%i 43 | del /q /s %BUILDDIR%\* 44 | goto end 45 | ) 46 | 47 | if "%1" == "html" ( 48 | %SPHINXBUILD% -b html %ALLSPHINXOPTS% %BUILDDIR%/html 49 | if errorlevel 1 exit /b 1 50 | echo. 51 | echo.Build finished. The HTML pages are in %BUILDDIR%/html. 52 | goto end 53 | ) 54 | 55 | if "%1" == "dirhtml" ( 56 | %SPHINXBUILD% -b dirhtml %ALLSPHINXOPTS% %BUILDDIR%/dirhtml 57 | if errorlevel 1 exit /b 1 58 | echo. 59 | echo.Build finished. The HTML pages are in %BUILDDIR%/dirhtml. 60 | goto end 61 | ) 62 | 63 | if "%1" == "singlehtml" ( 64 | %SPHINXBUILD% -b singlehtml %ALLSPHINXOPTS% %BUILDDIR%/singlehtml 65 | if errorlevel 1 exit /b 1 66 | echo. 67 | echo.Build finished. The HTML pages are in %BUILDDIR%/singlehtml. 68 | goto end 69 | ) 70 | 71 | if "%1" == "pickle" ( 72 | %SPHINXBUILD% -b pickle %ALLSPHINXOPTS% %BUILDDIR%/pickle 73 | if errorlevel 1 exit /b 1 74 | echo. 75 | echo.Build finished; now you can process the pickle files. 
echo.^> assistant -collectionFile %BUILDDIR%\qthelp\Histogrammar.qhc
144 | echo.Build finished. The manual pages are in %BUILDDIR%/man. 145 | goto end 146 | ) 147 | 148 | if "%1" == "texinfo" ( 149 | %SPHINXBUILD% -b texinfo %ALLSPHINXOPTS% %BUILDDIR%/texinfo 150 | if errorlevel 1 exit /b 1 151 | echo. 152 | echo.Build finished. The Texinfo files are in %BUILDDIR%/texinfo. 153 | goto end 154 | ) 155 | 156 | if "%1" == "gettext" ( 157 | %SPHINXBUILD% -b gettext %I18NSPHINXOPTS% %BUILDDIR%/locale 158 | if errorlevel 1 exit /b 1 159 | echo. 160 | echo.Build finished. The message catalogs are in %BUILDDIR%/locale. 161 | goto end 162 | ) 163 | 164 | if "%1" == "changes" ( 165 | %SPHINXBUILD% -b changes %ALLSPHINXOPTS% %BUILDDIR%/changes 166 | if errorlevel 1 exit /b 1 167 | echo. 168 | echo.The overview file is in %BUILDDIR%/changes. 169 | goto end 170 | ) 171 | 172 | if "%1" == "linkcheck" ( 173 | %SPHINXBUILD% -b linkcheck %ALLSPHINXOPTS% %BUILDDIR%/linkcheck 174 | if errorlevel 1 exit /b 1 175 | echo. 176 | echo.Link check complete; look for any errors in the above output ^ 177 | or in %BUILDDIR%/linkcheck/output.txt. 178 | goto end 179 | ) 180 | 181 | if "%1" == "doctest" ( 182 | %SPHINXBUILD% -b doctest %ALLSPHINXOPTS% %BUILDDIR%/doctest 183 | if errorlevel 1 exit /b 1 184 | echo. 185 | echo.Testing of doctests in the sources finished, look at the ^ 186 | results in %BUILDDIR%/doctest/output.txt. 187 | goto end 188 | ) 189 | 190 | :end 191 | -------------------------------------------------------------------------------- /docs/Makefile: -------------------------------------------------------------------------------- 1 | # Makefile for Sphinx documentation 2 | # 3 | 4 | # You can set these variables from the command line. 5 | SPHINXOPTS = 6 | SPHINXBUILD = sphinx-build 7 | PAPER = 8 | BUILDDIR = _build 9 | 10 | # Internal variables. 11 | PAPEROPT_a4 = -D latex_paper_size=a4 12 | PAPEROPT_letter = -D latex_paper_size=letter 13 | ALLSPHINXOPTS = -d $(BUILDDIR)/doctrees $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) . 
14 | # the i18n builder cannot share the environment and doctrees with the others 15 | I18NSPHINXOPTS = $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) . 16 | 17 | .PHONY: help clean html dirhtml singlehtml pickle json htmlhelp qthelp devhelp epub latex latexpdf text man changes linkcheck doctest gettext 18 | 19 | help: 20 | @echo "Please use \`make ' where is one of" 21 | @echo " html to make standalone HTML files" 22 | @echo " dirhtml to make HTML files named index.html in directories" 23 | @echo " singlehtml to make a single large HTML file" 24 | @echo " pickle to make pickle files" 25 | @echo " json to make JSON files" 26 | @echo " htmlhelp to make HTML files and a HTML help project" 27 | @echo " qthelp to make HTML files and a qthelp project" 28 | @echo " devhelp to make HTML files and a Devhelp project" 29 | @echo " epub to make an epub" 30 | @echo " latex to make LaTeX files, you can set PAPER=a4 or PAPER=letter" 31 | @echo " latexpdf to make LaTeX files and run them through pdflatex" 32 | @echo " text to make text files" 33 | @echo " man to make manual pages" 34 | @echo " texinfo to make Texinfo files" 35 | @echo " info to make Texinfo files and run them through makeinfo" 36 | @echo " gettext to make PO message catalogs" 37 | @echo " changes to make an overview of all changed/added/deprecated items" 38 | @echo " linkcheck to check all external links for integrity" 39 | @echo " doctest to run all doctests embedded in the documentation (if enabled)" 40 | 41 | clean: 42 | -rm -rf $(BUILDDIR)/* 43 | 44 | html: 45 | $(SPHINXBUILD) -b html $(ALLSPHINXOPTS) $(BUILDDIR)/html 46 | @echo 47 | @echo "Build finished. The HTML pages are in $(BUILDDIR)/html." 48 | 49 | dirhtml: 50 | $(SPHINXBUILD) -b dirhtml $(ALLSPHINXOPTS) $(BUILDDIR)/dirhtml 51 | @echo 52 | @echo "Build finished. The HTML pages are in $(BUILDDIR)/dirhtml." 53 | 54 | singlehtml: 55 | $(SPHINXBUILD) -b singlehtml $(ALLSPHINXOPTS) $(BUILDDIR)/singlehtml 56 | @echo 57 | @echo "Build finished. 
The HTML page is in $(BUILDDIR)/singlehtml." 58 | 59 | pickle: 60 | $(SPHINXBUILD) -b pickle $(ALLSPHINXOPTS) $(BUILDDIR)/pickle 61 | @echo 62 | @echo "Build finished; now you can process the pickle files." 63 | 64 | json: 65 | $(SPHINXBUILD) -b json $(ALLSPHINXOPTS) $(BUILDDIR)/json 66 | @echo 67 | @echo "Build finished; now you can process the JSON files." 68 | 69 | htmlhelp: 70 | $(SPHINXBUILD) -b htmlhelp $(ALLSPHINXOPTS) $(BUILDDIR)/htmlhelp 71 | @echo 72 | @echo "Build finished; now you can run HTML Help Workshop with the" \ 73 | ".hhp project file in $(BUILDDIR)/htmlhelp." 74 | 75 | qthelp: 76 | $(SPHINXBUILD) -b qthelp $(ALLSPHINXOPTS) $(BUILDDIR)/qthelp 77 | @echo 78 | @echo "Build finished; now you can run "qcollectiongenerator" with the" \ 79 | ".qhcp project file in $(BUILDDIR)/qthelp, like this:" 80 | @echo "# qcollectiongenerator $(BUILDDIR)/qthelp/Histogrammar.qhcp" 81 | @echo "To view the help file:" 82 | @echo "# assistant -collectionFile $(BUILDDIR)/qthelp/Histogrammar.qhc" 83 | 84 | devhelp: 85 | $(SPHINXBUILD) -b devhelp $(ALLSPHINXOPTS) $(BUILDDIR)/devhelp 86 | @echo 87 | @echo "Build finished." 88 | @echo "To view the help file:" 89 | @echo "# mkdir -p $$HOME/.local/share/devhelp/Histogrammar" 90 | @echo "# ln -s $(BUILDDIR)/devhelp $$HOME/.local/share/devhelp/Histogrammar" 91 | @echo "# devhelp" 92 | 93 | epub: 94 | $(SPHINXBUILD) -b epub $(ALLSPHINXOPTS) $(BUILDDIR)/epub 95 | @echo 96 | @echo "Build finished. The epub file is in $(BUILDDIR)/epub." 97 | 98 | latex: 99 | $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex 100 | @echo 101 | @echo "Build finished; the LaTeX files are in $(BUILDDIR)/latex." 102 | @echo "Run \`make' in that directory to run these through (pdf)latex" \ 103 | "(use \`make latexpdf' here to do that automatically)." 104 | 105 | latexpdf: 106 | $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex 107 | @echo "Running LaTeX files through pdflatex..." 
108 | $(MAKE) -C $(BUILDDIR)/latex all-pdf 109 | @echo "pdflatex finished; the PDF files are in $(BUILDDIR)/latex." 110 | 111 | text: 112 | $(SPHINXBUILD) -b text $(ALLSPHINXOPTS) $(BUILDDIR)/text 113 | @echo 114 | @echo "Build finished. The text files are in $(BUILDDIR)/text." 115 | 116 | man: 117 | $(SPHINXBUILD) -b man $(ALLSPHINXOPTS) $(BUILDDIR)/man 118 | @echo 119 | @echo "Build finished. The manual pages are in $(BUILDDIR)/man." 120 | 121 | texinfo: 122 | $(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo 123 | @echo 124 | @echo "Build finished. The Texinfo files are in $(BUILDDIR)/texinfo." 125 | @echo "Run \`make' in that directory to run these through makeinfo" \ 126 | "(use \`make info' here to do that automatically)." 127 | 128 | info: 129 | $(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo 130 | @echo "Running Texinfo files through makeinfo..." 131 | make -C $(BUILDDIR)/texinfo info 132 | @echo "makeinfo finished; the Info files are in $(BUILDDIR)/texinfo." 133 | 134 | gettext: 135 | $(SPHINXBUILD) -b gettext $(I18NSPHINXOPTS) $(BUILDDIR)/locale 136 | @echo 137 | @echo "Build finished. The message catalogs are in $(BUILDDIR)/locale." 138 | 139 | changes: 140 | $(SPHINXBUILD) -b changes $(ALLSPHINXOPTS) $(BUILDDIR)/changes 141 | @echo 142 | @echo "The overview file is in $(BUILDDIR)/changes." 143 | 144 | linkcheck: 145 | $(SPHINXBUILD) -b linkcheck $(ALLSPHINXOPTS) $(BUILDDIR)/linkcheck 146 | @echo 147 | @echo "Link check complete; look for any errors in the above output " \ 148 | "or in $(BUILDDIR)/linkcheck/output.txt." 149 | 150 | doctest: 151 | $(SPHINXBUILD) -b doctest $(ALLSPHINXOPTS) $(BUILDDIR)/doctest 152 | @echo "Testing of doctests in the sources finished, look at the " \ 153 | "results in $(BUILDDIR)/doctest/output.txt." 
154 | -------------------------------------------------------------------------------- /histogrammar/convenience.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2020 ING Wholesale Banking Advanced Analytics 2 | # 3 | # Permission is hereby granted, free of charge, to any person obtaining a copy of 4 | # this software and associated documentation files (the "Software"), to deal in 5 | # the Software without restriction, including without limitation the rights to 6 | # use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of 7 | # the Software, and to permit persons to whom the Software is furnished to do so, 8 | # subject to the following conditions: 9 | # 10 | # The above copyright notice and this permission notice shall be included in all 11 | # copies or substantial portions of the Software. 12 | # 13 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 14 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS 15 | # FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR 16 | # COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER 17 | # IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN 18 | # CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
from histogrammar.defs import identity, unweighted
from histogrammar.primitives.average import Average
from histogrammar.primitives.bin import Bin
from histogrammar.primitives.categorize import Categorize
from histogrammar.primitives.count import Count
from histogrammar.primitives.deviate import Deviate
from histogrammar.primitives.select import Select
from histogrammar.primitives.sparselybin import SparselyBin


def _fresh_counts(n):
    """Return *n* independent Count aggregators, one per Bin slot (value/underflow/overflow/nanflow)."""
    return tuple(Count.ing() for _ in range(n))


def Histogram(num, low, high, quantity=identity):
    """Create a conventional histogram that is capable of being filled and added.

    Parameters:
        num (int): the number of bins; must be at least one.
        low (float): the minimum-value edge of the first bin.
        high (float): the maximum-value edge of the last bin; must be strictly greater than `low`.
        quantity (function returning float or string): computes the quantity of interest from the
            data; passes values through unchanged by default. A string is interpreted as
            identity(string), i.e. that column is picked up from a pandas df.
    """
    return Bin.ing(num, low, high, quantity, *_fresh_counts(4))


def HistogramCut(num, low, high, quantity=identity, selection=unweighted):
    """Create a conventional histogram with a selection cut applied before filling.

    Parameters:
        num (int): the number of bins; must be at least one.
        low (float): the minimum-value edge of the first bin.
        high (float): the maximum-value edge of the last bin; must be strictly greater than `low`.
        quantity (function returning float or string): computes the quantity of interest from the
            data; passes values through unchanged by default. A string is interpreted as
            identity(string), i.e. that column is picked up from a pandas df.
        selection (function returning boolean): decides whether a data point is accepted;
            defaults to accepting everything (lambda x: True).
    """
    binned = Bin.ing(num, low, high, quantity, *_fresh_counts(4))
    return Select.ing(selection, binned)


def SparselyHistogram(binWidth, quantity=identity, origin=0.0):
    """Create a sparsely binned histogram that is only capable of being added.

    Parameters:
        binWidth (float): the width of a bin.
        quantity (function returning float or string): computes the quantity of interest from the
            data; passes values through unchanged by default. A string is interpreted as
            identity(string), i.e. that column is picked up from a pandas df.
        origin (float): the left edge of the bin whose index is zero.
    """
    return SparselyBin.ing(binWidth, quantity, Count.ing(), Count.ing(), origin)


def CategorizeHistogram(quantity=identity):
    """Create a Categorize histogram for categorical features such as strings and booleans.

    Parameters:
        quantity (function returning float or string): computes the quantity of interest from the
            data; passes values through unchanged by default. A string is interpreted as
            identity(string), i.e. that column is picked up from a pandas df.
    """
    return Categorize.ing(quantity, Count.ing())


def Profile(num, low, high, binnedQuantity, averagedQuantity):
    """Convenience function for creating binwise averages."""
    return Bin.ing(num, low, high, binnedQuantity, Average.ing(averagedQuantity))


def SparselyProfile(binWidth, binnedQuantity, averagedQuantity, origin=0.0):
    """Convenience function for creating sparsely binned binwise averages."""
    return SparselyBin.ing(binWidth, binnedQuantity, Average.ing(averagedQuantity), Count.ing(), origin)


def ProfileErr(num, low, high, binnedQuantity, averagedQuantity):
    """Convenience function for creating a profile plot.

    This is a Profile with variances.
    """
    return Bin.ing(num, low, high, binnedQuantity, Deviate.ing(averagedQuantity))


def SparselyProfileErr(binWidth, binnedQuantity, averagedQuantity, origin=0.0):
    """Convenience function for creating a sparsely binned profile plot.

    This is a Profile with variances.
    """
    return SparselyBin.ing(binWidth, binnedQuantity, Deviate.ing(averagedQuantity), Count.ing(), origin)


def TwoDimensionallyHistogram(xnum, xlow, xhigh, xquantity, ynum, ylow, yhigh, yquantity):
    """Convenience function for creating a conventional, two-dimensional histogram."""
    inner = Bin.ing(ynum, ylow, yhigh, yquantity)
    return Bin.ing(xnum, xlow, xhigh, xquantity, inner)


def TwoDimensionallySparselyHistogram(xbinWidth, xquantity, ybinWidth, yquantity, xorigin=0.0, yorigin=0.0):
    """Convenience function for creating a sparsely binned, two-dimensional histogram."""
    inner = SparselyBin.ing(ybinWidth, yquantity, Count.ing(), Count.ing(), yorigin)
    return SparselyBin.ing(xbinWidth, xquantity, inner, Count.ing(), xorigin)
import warnings

import numpy as np


def prepare_2dgrid(hist):
    """Collect the sorted, unique x and y keys of a >=2-dimensional histogram.

    Used as input by get_2dgrid(hist).

    :param hist: input histogrammar histogram
    :return: tuple of two sorted lists: unique x keys and unique y keys
    """
    if hist.n_dim < 2:
        warnings.warn(f"Input histogram only has {hist.n_dim} dimensions (<2). Returning empty lists.")
        return [], []

    xkeys = set()
    ykeys = set()
    # keyed histograms: SparselyBin, Categorize, IrregularlyBin, CentrallyBin
    if hasattr(hist, "bins"):
        hist_bins = dict(hist.bins)
        xkeys.update(hist_bins.keys())
        sub_hists = hist_bins.values()
    # index-based histogram: Bin
    elif hasattr(hist, "values"):
        xkeys.update(range(len(hist.values)))
        sub_hists = hist.values
    else:
        sub_hists = []

    # the second dimension is scanned identically for both cases above
    for h in sub_hists:
        if hasattr(h, "bins"):
            ykeys.update(dict(h.bins).keys())
        elif hasattr(h, "values"):
            ykeys.update(range(len(h.values)))
    return sorted(xkeys), sorted(ykeys)


def set_2dgrid(hist, xkeys, ykeys):
    """Fill a 2d numpy grid with the entries of the first two dimensions of the histogram.

    Used as input by get_2dgrid(hist).

    :param hist: input histogrammar histogram
    :param list xkeys: sorted list of unique x keys (as returned by prepare_2dgrid)
    :param list ykeys: sorted list of unique y keys (as returned by prepare_2dgrid)
    :return: filled 2d numpy grid of shape (len(ykeys), len(xkeys))
    """
    grid = np.zeros((len(ykeys), len(xkeys)))

    if hist.n_dim < 2:
        warnings.warn(f"Input histogram only has {hist.n_dim} dimensions (<2). Returning original grid.")
        return grid

    # O(1) key -> row/column lookups; the original used list.index inside the
    # loops, which is O(n) per bin and accidentally quadratic overall.
    x_index = {k: i for i, k in enumerate(xkeys)}
    y_index = {k: j for j, k in enumerate(ykeys)}

    def _fill_column(i, h):
        # fill grid column i from second-dimension sub-histogram h
        if hasattr(h, "bins"):
            for ky, g in dict(h.bins).items():
                j = y_index.get(ky)
                if j is not None:
                    grid[j, i] = g.entries
        elif hasattr(h, "values"):
            for j, g in enumerate(h.values):
                grid[j, i] = g.entries

    # keyed histograms: SparselyBin, Categorize, IrregularlyBin, CentrallyBin
    if hasattr(hist, "bins"):
        for kx, h in dict(hist.bins).items():
            i = x_index.get(kx)
            if i is not None:
                _fill_column(i, h)
    # index-based histogram: Bin
    elif hasattr(hist, "values"):
        for i, h in enumerate(hist.values):
            _fill_column(i, h)
    return grid


def get_2dgrid(hist):
    """Get filled x,y grid of first two dimensions of input histogram.

    :param hist: input histogrammar histogram
    :return: x labels, y labels, and the filled 2d grid of the first two dimensions
    """
    if hist.n_dim < 2:
        warnings.warn(f"Input histogram only has {hist.n_dim} dimensions (<2). Returning empty grid.")
        # BUG FIX: return the same 3-tuple shape as the normal path; the original
        # returned a bare array here, breaking callers that unpack three values.
        return [], [], np.zeros((0, 0))

    xkeys, ykeys = prepare_2dgrid(hist)
    grid = set_2dgrid(hist, xkeys, ykeys)

    x_labels = get_x_labels(hist, xkeys)
    y_labels = get_y_labels(hist, ykeys)

    return x_labels, y_labels, grid


def get_x_labels(hist, xkeys):
    """Return the bin-center labels (as strings) along x for the given keys."""
    return [str(hist._center_from_key(key)) for key in xkeys]


def get_y_labels(hist, ykeys):
    """Return the bin-center labels (as strings) along y for the given keys."""
    # pick any second-dimension sub-histogram to convert keys into bin centers
    # keyed histograms: SparselyBin, Categorize, IrregularlyBin, CentrallyBin
    if hasattr(hist, "bins"):
        h = next(iter(dict(hist.bins).values()))
    # index-based histogram: Bin
    elif hasattr(hist, "values"):
        h = hist.values[0]
    else:
        # BUG FIX: the original fell through and raised UnboundLocalError on `h`
        raise TypeError("hist has neither a 'bins' nor a 'values' attribute")
    return [str(h._center_from_key(key)) for key in ykeys]


def prepare2Dsparse(sparse):
    """Derive the y-axis binning (bin index range, count and edges) of a 2d SparselyBin histogram."""
    yminBins = [v.minBin for v in sparse.bins.values() if v.minBin is not None]
    ymaxBins = [v.maxBin for v in sparse.bins.values() if v.maxBin is not None]
    if len(yminBins) > 0 and len(ymaxBins) > 0:
        yminBin = min(yminBins)
        ymaxBin = max(ymaxBins)
    else:
        yminBin = 0
        ymaxBin = 0
    try:
        # any sub-histogram carries the (shared) binWidth and origin
        sample = next(iter(sparse.bins.values()))
    except StopIteration:
        # the original raised a bare IndexError here on an empty histogram
        raise ValueError("cannot determine y binning: sparse histogram has no filled bins") from None
    ynum = 1 + ymaxBin - yminBin
    ylow = yminBin * sample.binWidth + sample.origin
    yhigh = (ymaxBin + 1.0) * sample.binWidth + sample.origin
    return yminBin, ymaxBin, ynum, ylow, yhigh


def set2Dsparse(sparse, yminBin, ymaxBin, grid):
    """Fill grid with the entries of a 2d SparselyBin histogram over the given y-bin index range."""
    for i, iindex in enumerate(range(sparse.minBin, sparse.maxBin + 1)):
        for j, jindex in enumerate(range(yminBin, ymaxBin + 1)):
            if iindex in sparse.bins and jindex in sparse.bins[iindex].bins:
                grid[j, i] = sparse.bins[iindex].bins[jindex].entries
    return grid
import codecs
import json
import math
import sys
import unittest

try:  # Python 2 fallback, kept for backward compatibility
    from urllib2 import urlopen
except ImportError:
    from urllib.request import urlopen

import histogrammar.version
from histogrammar import Factory, util

# all numeric comparisons in the spec tests are done within this tolerance
tolerance = 1e-12
util.relativeTolerance = tolerance
util.absoluteTolerance = tolerance


class TestSpec(unittest.TestCase):
    """Replay the reference data published for the histogrammar specification against this implementation."""

    def compare(self, x, y, name):
        """Assert that two JSON-serialized aggregators are equal; print a side-by-side diff first on mismatch."""
        if Factory.fromJson(x) != Factory.fromJson(y):
            sys.stderr.write(" FAILED " + name + "\n")
            sys.stderr.write(" PYTHON | SPECIFICATION\n")
            left = json.dumps(x, sort_keys=True, indent=2)
            right = json.dumps(y, sort_keys=True, indent=2)
            for leftline, rightline in zip(left.split("\n"), right.split("\n")):
                if leftline != rightline:
                    sys.stderr.write(f"{leftline:50s} > {rightline}\n")
                else:
                    sys.stderr.write(f"{leftline:50s} | {rightline}\n")
        self.assertEqual(Factory.fromJson(x), Factory.fromJson(y))

    def runTest(self):
        """Download the specification's test data and expected results, then check zero/one/two fills and sums."""
        reader = codecs.getreader("utf-8")
        sys.stdout.write(
            f"Downloading expected results, generated by specification {histogrammar.version.specification}...\n"
        )
        url_data = f"http://histogrammar.org/test/{histogrammar.version.specification}/test-data.json"
        try:
            testdata = json.load(reader(urlopen(url_data)))
        except Exception as err:
            # network access is best-effort: silently skip when the reference data is unavailable
            msg = f"could not download {url_data}\nbecause of {err.__class__.__name__}: {str(err)}\n"
            sys.stdout.write(msg)
            return
        url_results = f"http://histogrammar.org/test/{histogrammar.version.specification}/test-results.json"
        try:
            testresults = json.load(reader(urlopen(url_results)))
        except Exception as err:
            msg = f"could not download {url_results}\nbecause of {err.__class__.__name__}: {str(err)}\n\n"
            sys.stdout.write(msg)
            return

        # the JSON spells non-finite numbers as strings; convert them back to floats
        for x in testdata:
            for k, v in x.items():
                if k != "strings" and v in ("nan", "inf", "-inf"):
                    x[k] = float(v)

        def stripNames(x):
            # recursively drop quantity names so the aggregator matches the "anonymous" reference output
            if hasattr(x, "quantity"):
                x.quantity.name = None
            elif hasattr(x, "quantityName"):
                x.quantityName = None
            for xi in x.children:
                stripNames(xi)

        for testresult in testresults:
            if hasattr(math, "isfinite"):  # always true on Python 3; guard kept from the Python 2 era
                txt1 = (
                    'named("round(withholes)", lambda x: round(x["withholes"]) '
                    'if math.isfinite(x["withholes"]) else x["withholes"])'
                )
                testresult["expr"] = testresult["expr"].replace('"round(withholes)"', txt1)
                # BUG FIX: the original assignment ended with a trailing comma, which made
                # txt2 a 1-tuple and caused str.replace() below to raise TypeError
                txt2 = (
                    'named("[round(withholes), 2*round(withholes), 3*round(withholes)]", '
                    'lambda x: [round(x["withholes"]), 2*round(x["withholes"]), 3*round(x["withholes"])] '
                    'if math.isfinite(x["withholes"]) else [x["withholes"], x["withholes"], x["withholes"]])'
                )
                testresult["expr"] = testresult["expr"].replace(
                    '"[round(withholes), 2*round(withholes), 3*round(withholes)]"', txt2
                )

            sys.stderr.write(testresult["expr"] + "\n")

            zero = testresult["zero-named"]
            one = testresult["one-named"]
            two = testresult["two-named"]

            # expressions come from the (trusted) specification site; eval is intentional here,
            # but do not point these URLs at untrusted sources
            h1 = eval(testresult["expr"])
            h2 = eval(testresult["expr"])

            self.compare(h1.toJson(), zero, "NAMED ZERO")
            self.compare((h1 + h1).toJson(), zero, "NAMED ZERO + ZERO")
            self.compare(h1.zero().toJson(), zero, "NAMED ZERO.zero()")

            for x in testdata:
                h1.fill(x)
                h2.fill(x)
            self.compare(h1.toJson(), one, "NAMED ONE")
            self.compare(h1.zero().toJson(), zero, "NAMED ONE.zero()")
            self.compare((h1 + h1.zero()).toJson(), one, "NAMED ONE + ZERO")
            self.compare((h1.zero() + h1).toJson(), one, "NAMED ZERO + ONE")

            self.compare((h1 + h2).toJson(), two, "NAMED TWO VIA PLUS")

            for x in testdata:
                h1.fill(x)
            self.compare(h1.toJson(), two, "NAMED TWO VIA FILL")

            zero = testresult["zero-anonymous"]
            one = testresult["one-anonymous"]
            two = testresult["two-anonymous"]

            h1 = eval(testresult["expr"])
            stripNames(h1)
            h2 = eval(testresult["expr"])
            stripNames(h2)

            self.compare(h1.toJson(), zero, "ANONYMOUS ZERO")
            self.compare((h1 + h1).toJson(), zero, "ANONYMOUS ZERO + ZERO")
            self.compare(h1.zero().toJson(), zero, "ANONYMOUS ZERO.zero()")

            for x in testdata:
                h1.fill(x)
                h2.fill(x)
            self.compare(h1.toJson(), one, "ANONYMOUS ONE")
            self.compare(h1.zero().toJson(), zero, "ANONYMOUS ONE.zero()")
            self.compare((h1 + h1.zero()).toJson(), one, "ANONYMOUS ONE + ZERO")
            self.compare((h1.zero() + h1).toJson(), one, "ANONYMOUS ZERO + ONE")

            self.compare((h1 + h2).toJson(), two, "ANONYMOUS TWO VIA PLUS")

            for x in testdata:
                h1.fill(x)
            self.compare(h1.toJson(), two, "ANONYMOUS TWO VIA FILL")
This Python implementation of histogrammar has been tested to guarantee compatibility with its Scala implementation.
The `histogrammar` library requires Python 3.8+ and is pip friendly. To get started, simply do:
code-block:: python 110 | 111 | import pandas as pd 112 | import histogrammar as hg 113 | from histogrammar import resources 114 | 115 | # open synthetic data 116 | df = pd.read_csv(resources.data('test.csv.gz'), parse_dates=['date']) 117 | df.head() 118 | 119 | # create a histogram, tell it to look for column 'age' 120 | # fill the histogram with column 'age' and plot it 121 | hist = hg.Histogram(num=100, low=0, high=100, quantity='age') 122 | hist.fill.numpy(df) 123 | hist.plot.matplotlib() 124 | 125 | # generate histograms of all features in the dataframe using automatic binning 126 | # (importing histogrammar automatically adds this functionality to a pandas or spark dataframe) 127 | hists = df.hg_make_histograms() 128 | print(hists.keys()) 129 | 130 | # multi-dimensional histograms are also supported. e.g. features longitude vs latitude 131 | hists = df.hg_make_histograms(features=['longitude:latitude']) 132 | ll = hists['longitude:latitude'] 133 | ll.plot.matplotlib() 134 | 135 | # store histogram and retrieve it again 136 | ll.toJsonFile('longitude_latitude.json') 137 | ll2 = hg.Factory().fromJsonFile('longitude_latitude.json') 138 | 139 | These examples also work with Spark dataframes (sdf): 140 | 141 | .. code-block:: python 142 | 143 | from pyspark.sql.functions import col 144 | hist = hg.Histogram(num=100, low=0, high=100, quantity=col('age')) 145 | hist.fill.sparksql(sdf) 146 | 147 | For more examples please see the example notebooks and tutorials. 148 | 149 | 150 | Project contributors 151 | ==================== 152 | 153 | This package was originally authored by DIANA-HEP and is now maintained by volunteers. 154 | 155 | Contact and support 156 | =================== 157 | 158 | * Issues & Ideas & Support: https://github.com/histogrammar/histogrammar-python/issues 159 | 160 | Please note that `histogrammar` is supported only on a best-effort basis. 
161 | 162 | License 163 | ======= 164 | `histogrammar` is completely free, open-source and licensed under the `Apache-2.0 license `_. 165 | 166 | .. |notebook_basic_colab| image:: https://colab.research.google.com/assets/colab-badge.svg 167 | :alt: Open in Colab 168 | :target: https://colab.research.google.com/github/histogrammar/histogrammar-python/blob/master/histogrammar/notebooks/histogrammar_tutorial_basic.ipynb 169 | .. |notebook_advanced_colab| image:: https://colab.research.google.com/assets/colab-badge.svg 170 | :alt: Open in Colab 171 | :target: https://colab.research.google.com/github/histogrammar/histogrammar-python/blob/master/histogrammar/notebooks/histogrammar_tutorial_advanced.ipynb 172 | .. |notebook_exercises_colab| image:: https://colab.research.google.com/assets/colab-badge.svg 173 | :alt: Open in Colab 174 | :target: https://colab.research.google.com/github/histogrammar/histogrammar-python/blob/master/histogrammar/notebooks/histogrammar_tutorial_exercises.ipynb 175 | -------------------------------------------------------------------------------- /histogrammar/primitives/count.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | # Copyright 2016 DIANA-HEP 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 
import math
import numbers

from histogrammar.defs import (
    Container,
    ContainerException,
    Factory,
    JsonFormatException,
    identity,
)
from histogrammar.util import (
    datatype,
    floatToJson,
    inheritdoc,
    n_dim,
    numeq,
    serializable,
)


class Count(Factory, Container):
    """Count entries by accumulating the sum of all observed weights or a sum of transformed weights

    (e.g. collect the sum of squares of weights).

    An optional ``transform`` function can be applied to the weights before summing.
    To accumulate the sum of squares of weights, use:

    ::
        lambda x: x**2

    for instance. This is unlike any other primitive's ``quantity`` function in that its domain is
    the *weights* (always double), not *data* (any type).
    """

    @staticmethod
    def ed(entries):
        """Create a Count that is only capable of being added.

        Parameters:
            entries (float): the number of entries; may also be one of the strings
                "nan", "inf" or "-inf" (the JSON spellings of non-finite numbers).
        """
        if not isinstance(entries, numbers.Real) and entries not in (
            "nan",
            "inf",
            "-inf",
        ):
            raise TypeError(f"entries ({entries}) must be a number")
        # BUG FIX: convert before comparing -- the original evaluated `entries < 0.0`
        # first, which raises TypeError on Python 3 for the accepted string spellings
        entries = float(entries)
        if entries < 0.0:
            raise ValueError(f"entries ({entries}) cannot be negative")
        out = Count()
        out.entries = entries
        return out

    @staticmethod
    def ing(transform=identity):
        """Synonym for ``__init__``."""
        return Count(transform)

    def __init__(self, transform=identity):
        """Create a Count that is capable of being filled and added.

        Parameters:
            transform (function from float to float): transforms each weight.

        Other parameters:
            entries (float): the number of entries, initially 0.0.
        """
        self.entries = 0.0
        self.transform = serializable(transform)
        super().__init__()

    @inheritdoc(Container)
    def zero(self):
        return Count(self.transform)

    @inheritdoc(Container)
    def __add__(self, other):
        if isinstance(other, Count):
            out = Count(self.transform)
            out.entries = self.entries + other.entries
            return out
        raise ContainerException(f"cannot add {self.name} and {other.name}")

    @inheritdoc(Container)
    def __iadd__(self, other):
        if isinstance(other, Count):
            self.entries += other.entries
            return self
        raise ContainerException(f"cannot add {self.name} and {other.name}")

    @inheritdoc(Container)
    def __mul__(self, factor):
        # scaling is only meaningful when the transform is the identity; compare
        # bytecode as a fallback because `serializable` may wrap a fresh lambda
        if (
            self.transform != identity
            or not callable(self.transform.expr)
            or (
                hasattr(self.transform.expr, "func_code")
                and self.transform.expr.func_code.co_code != identity.expr.func_code.co_code
            )
            or (
                hasattr(self.transform.expr, "__code__")
                and self.transform.expr.__code__.co_code != identity.expr.__code__.co_code
            )
        ):
            raise ContainerException("Cannot scalar-multiply Count with a non-identity transform.")
        if math.isnan(factor) or factor <= 0.0:
            return self.zero()
        out = self.zero()
        out.entries = factor * self.entries
        return out

    @inheritdoc(Container)
    def __rmul__(self, factor):
        return self.__mul__(factor)

    @inheritdoc(Container)
    def fill(self, datum, weight=1.0):
        self._checkForCrossReferences()

        if weight > 0.0:
            t = self.transform(weight)
            if not isinstance(t, numbers.Real):
                raise TypeError(f"function return value ({t}) must be boolean or number")

            # no possibility of exception from here on out (for rollback)
            self.entries += t

    def _numpy(self, _, weights, shape):
        """Vectorized fill: accumulate (possibly transformed) weights with numpy."""
        import numpy

        if isinstance(weights, numpy.ndarray):
            assert len(weights.shape) == 1
            if shape[0] is not None:
                assert weights.shape[0] == shape[0]

            if self.transform is identity:
                self.entries += float(weights.sum())
            else:
                t = self.transform(weights)
                assert len(t.shape) == 1
                if shape[0] is not None:
                    assert t.shape[0] == shape[0]
                self.entries += float(t.sum())

        elif shape[0] is not None:
            # scalar weight applied to a known number of rows
            if self.transform is identity:
                self.entries += weights * shape[0]
            else:
                t = self.transform(numpy.array([weights]))
                assert len(t.shape) == 1
                assert t.shape[0] == 1
                self.entries += float(t[0])

        elif isinstance(weights, (int, float, numpy.number)):
            if self.transform is identity:
                self.entries += float(weights)
            else:
                self.entries += self.transform(weights)

        else:
            raise ValueError("cannot use Numpy to fill an isolated Count (unless the weights are given as an array)")

    def _sparksql(self, jvm, converter):
        return converter.Count()  # TODO: handle transform

    @property
    def children(self):
        """List of sub-aggregators, to make it possible to walk the tree."""
        return []

    @inheritdoc(Container)
    def toJsonFragment(self, suppressName):
        return floatToJson(self.entries)

    @staticmethod
    @inheritdoc(Factory)
    def fromJsonFragment(json, nameFromParent):
        if json in ("nan", "inf", "-inf") or isinstance(json, numbers.Real):
            return Count.ed(float(json))
        raise JsonFormatException(json, "Count")

    def __repr__(self):
        # the original returned an empty f-string, which makes instances indistinguishable
        # in logs; presumably the entry count belongs here -- matches the Scala-style repr
        return f"<Count {self.entries}>"

    def __eq__(self, other):
        return isinstance(other, Count) and numeq(self.entries, other.entries) and self.transform == other.transform

    def __ne__(self, other):
        return not self == other

    def __hash__(self):
        return hash((self.entries, self.transform))
# extra properties: number of dimensions and datatypes of sub-hists
Count.n_dim = n_dim
Count.datatype = datatype

# register extra methods
Factory.register(Count)


import math
import numbers

from histogrammar.defs import (
    Container,
    ContainerException,
    Factory,
    JsonFormatException,
    identity,
)
from histogrammar.util import (
    basestring,
    datatype,
    floatToJson,
    hasKeys,
    inheritdoc,
    maybeAdd,
    n_dim,
    numeq,
    serializable,
)


class Sum(Factory, Container):
    """Accumulate the (weighted) sum of a given quantity, calculated from the data.

    Sum differs from :doc:`Count <histogrammar.primitives.count.Count>` in that it computes a quantity on the spot,
    rather than percolating a product of weight metadata from nested primitives. Also unlike weights, the sum can add
    both positive and negative quantities (weights are always non-negative).
    """

    @staticmethod
    def ed(entries, sum):
        """Create a Sum that is only capable of being added.

        Parameters:
            entries (float): the number of entries; may also be one of the strings
                "nan", "inf" or "-inf" (the JSON spellings of non-finite numbers).
            sum (float): the sum; same string spellings accepted.
        """
        if not isinstance(entries, numbers.Real) and entries not in (
            "nan",
            "inf",
            "-inf",
        ):
            raise TypeError(f"entries ({entries}) must be a number")
        # BUG FIX: the original re-checked `entries` here instead of `sum`
        if not isinstance(sum, numbers.Real) and sum not in ("nan", "inf", "-inf"):
            raise TypeError(f"sum ({sum}) must be a number")
        # convert before comparing; comparing the accepted string spellings
        # against 0.0 raises TypeError on Python 3
        entries = float(entries)
        if entries < 0.0:
            raise ValueError(f"entries ({entries}) cannot be negative")
        out = Sum(None)
        out.entries = entries
        out.sum = float(sum)
        return out.specialize()

    @staticmethod
    def ing(quantity):
        """Synonym for ``__init__``."""
        return Sum(quantity)

    def __init__(self, quantity=identity):
        """Create a Sum that is capable of being filled and added.

        Parameters:
            quantity (function returning float): computes the quantity of interest from the data.
                A string is interpreted as identity(string), i.e. a pandas column name.

        Other parameters:
            entries (float): the number of entries, initially 0.0.
            sum (float): the running sum, initially 0.0.
        """
        self.quantity = serializable(identity(quantity) if isinstance(quantity, str) else quantity)
        self.entries = 0.0
        self.sum = 0.0
        super().__init__()
        self.specialize()

    @inheritdoc(Container)
    def zero(self):
        return Sum(self.quantity)

    @inheritdoc(Container)
    def __add__(self, other):
        if isinstance(other, Sum):
            out = Sum(self.quantity)
            out.entries = self.entries + other.entries
            out.sum = self.sum + other.sum
            return out.specialize()
        raise ContainerException(f"cannot add {self.name} and {other.name}")

    @inheritdoc(Container)
    def __iadd__(self, other):
        # CONSISTENCY FIX: reject non-Sum operands explicitly (the original blindly
        # accessed other.entries, giving an AttributeError instead of the
        # ContainerException that __add__ and Count.__iadd__ raise)
        if isinstance(other, Sum):
            self.entries += other.entries
            self.sum += other.sum
            return self
        raise ContainerException(f"cannot add {self.name} and {other.name}")

    @inheritdoc(Container)
    def __mul__(self, factor):
        if math.isnan(factor) or factor <= 0.0:
            return self.zero()
        out = self.zero()
        out.entries = factor * self.entries
        out.sum = factor * self.sum
        return out.specialize()

    @inheritdoc(Container)
    def __rmul__(self, factor):
        return self.__mul__(factor)

    @inheritdoc(Container)
    def fill(self, datum, weight=1.0, method=None):
        # `method` is unused but kept for interface compatibility with callers
        self._checkForCrossReferences()

        if weight > 0.0:
            q = self.quantity(datum)
            if not isinstance(q, numbers.Real):
                raise TypeError(f"function return value ({q}) must be boolean or number")

            # no possibility of exception from here on out (for rollback)
            self.entries += weight
            self.sum += q * weight

    def _numpy(self, data, weights, shape):
        """Vectorized fill: accumulate entries and the weighted sum with numpy, skipping NaN quantities."""
        q = self.quantity(data)
        self._checkNPQuantity(q, shape)
        self._checkNPWeights(weights, shape)
        weights = self._makeNPWeights(weights, shape)

        # no possibility of exception from here on out (for rollback)
        self.entries += float(weights.sum())

        import numpy

        # keep only finite quantities with positive weights (in-place boolean ops)
        selection = numpy.isnan(q)
        numpy.bitwise_not(selection, selection)
        numpy.bitwise_and(selection, weights > 0.0, selection)
        q = q[selection]
        weights = weights[selection]
        q = q * weights

        self.sum += float(q.sum())

    def _sparksql(self, jvm, converter):
        return converter.Sum(self.quantity.asSparkSQL())

    @property
    def children(self):
        """List of sub-aggregators, to make it possible to walk the tree."""
        return []

    @inheritdoc(Container)
    def toJsonFragment(self, suppressName):
        return maybeAdd(
            {"entries": floatToJson(self.entries), "sum": floatToJson(self.sum)},
            name=(None if suppressName else self.quantity.name),
        )

    @staticmethod
    @inheritdoc(Factory)
    def fromJsonFragment(json, nameFromParent):
        if isinstance(json, dict) and hasKeys(json.keys(), ["entries", "sum"], ["name"]):
            if json["entries"] in ("nan", "inf", "-inf") or isinstance(json["entries"], numbers.Real):
                entries = float(json["entries"])
            else:
                raise JsonFormatException(json["entries"], "Sum.entries")

            if isinstance(json.get("name", None), basestring):
                name = json["name"]
            elif json.get("name", None) is None:
                name = None
            else:
                raise JsonFormatException(json["name"], "Sum.name")

            if json["sum"] in ("nan", "inf", "-inf") or isinstance(json["sum"], numbers.Real):
                sum = float(json["sum"])
            else:
                raise JsonFormatException(json["sum"], "Sum.sum")

            out = Sum.ed(entries, sum)
            out.quantity.name = nameFromParent if name is None else name
            return out.specialize()

        raise JsonFormatException(json, "Sum")

    def __repr__(self):
        # the original returned an empty f-string; presumably the running sum
        # belongs here -- matches the Scala-style repr
        return f"<Sum {self.sum}>"

    def __eq__(self, other):
        return (
            isinstance(other, Sum)
            and self.quantity == other.quantity
            and numeq(self.entries, other.entries)
            and numeq(self.sum, other.sum)
        )

    def __ne__(self, other):
        return not self == other

    def __hash__(self):
        return hash((self.quantity, self.entries, self.sum))


# extra properties: number of dimensions and datatypes of sub-hists
Sum.n_dim = n_dim
Sum.datatype = datatype

# register extra methods
Factory.register(Sum)
19 | 20 | 21 | import numpy as np 22 | import pandas as pd 23 | 24 | NUM_NS_DAY = 24 * 3600 * int(1e9) 25 | 26 | 27 | def check_column(col, sep=":"): 28 | """Convert input column string to list of columns 29 | 30 | :param col: input string 31 | :param sep: default ":" 32 | :return: list of columns 33 | """ 34 | if isinstance(col, str): 35 | col = col.split(sep) 36 | elif not isinstance(col, list): 37 | raise TypeError(f'Columns "{col}" needs to be a string or list of strings') 38 | return col 39 | 40 | 41 | def normalize_dtype(dtype): 42 | """Convert datatype to consistent numpy datatype 43 | 44 | :param dtype: input datatype 45 | :rtype: numpy.dtype.type 46 | """ 47 | try: 48 | if hasattr(dtype, "type"): 49 | # this converts pandas types, such as pd.Int64, into numpy types 50 | dtype = type(dtype.type()) 51 | dtype = np.dtype(dtype).type 52 | if dtype in {np.str_, np.bytes_}: 53 | dtype = np.dtype(str).type 54 | # MB 20210404: nb.object_ is kept an object -> uses to_string(). str uses only_str() 55 | except BaseException: 56 | raise RuntimeError(f'unknown assigned datatype "{dtype}"') 57 | return dtype 58 | 59 | 60 | def to_ns(x): 61 | """Convert input timestamps to nanoseconds (integers). 62 | 63 | :param x: value to be converted 64 | :returns: converted value 65 | :rtype: int 66 | """ 67 | if pd.isnull(x): 68 | return 0 69 | try: 70 | return pd.to_datetime(x).value 71 | except Exception: 72 | if hasattr(x, "__str__"): 73 | return pd.to_datetime(str(x)).value 74 | return 0 75 | 76 | 77 | def to_str(val): 78 | """Convert input to (array of) string(s). 
79 | 80 | :param val: value to be converted 81 | :returns: converted value 82 | :rtype: str or np.ndarray 83 | """ 84 | if isinstance(val, str): 85 | return val 86 | if isinstance(val, pd.Series): 87 | # Note: at this point, data type of pd.series has already been inferred as being of type object (mixed) 88 | return val.astype(str).values 89 | if hasattr(val, "__iter__"): 90 | return np.asarray([(s if isinstance(s, str) else str(s) if hasattr(s, "__str__") else "") for s in val]) 91 | if hasattr(val, "__str__"): 92 | return str(val) 93 | return "None" 94 | 95 | 96 | def only_str(val): 97 | """Pass input value or array only if it is a string. 98 | 99 | :param val: value to be evaluated 100 | :returns: evaluated value 101 | :rtype: str or np.ndarray 102 | """ 103 | if isinstance(val, str): 104 | return val 105 | if isinstance(val, pd.Series): 106 | # at this point, data type of pd.series has already been inferred as *to be* 'string' 107 | dtype = np.dtype(val.dtype).type 108 | return val.values if dtype in [str, np.str_, np.bytes_] else val.astype(str).values 109 | if hasattr(val, "__iter__"): 110 | return np.asarray([s if isinstance(s, str) else "None" for s in val]) 111 | return "None" 112 | 113 | 114 | def only_bool(val): 115 | """Pass input value or array only if it is a bool. 116 | 117 | :param val: value to be evaluated 118 | :returns: evaluated value 119 | :rtype: np.bool or np.ndarray 120 | """ 121 | if isinstance(val, (np.bool_, bool)): 122 | return val 123 | if isinstance(val, pd.Series) and val.dtype in [np.bool_, bool]: 124 | return val.values 125 | if hasattr(val, "__iter__") and not isinstance(val, str): 126 | return np.asarray([s if isinstance(s, (np.bool_, bool)) else "NaN" for s in val]) 127 | return "NaN" 128 | 129 | 130 | def only_int(val): 131 | """Pass input val value or array only if it is an integer. 
132 | 133 | :param val: value to be evaluated 134 | :returns: evaluated value 135 | :rtype: np.int64 or np.ndarray 136 | """ 137 | if isinstance(val, (np.int64, int)): 138 | return val 139 | if isinstance(val, pd.Series) and val.dtype in [np.int64, int]: 140 | return val.values 141 | if hasattr(val, "__iter__") and not isinstance(val, str): 142 | return np.asarray([s if isinstance(s, (np.int64, int)) else np.nan for s in val]) 143 | return np.nan 144 | 145 | 146 | def only_float(val): 147 | """Pass input val value or array only if it is a float. 148 | 149 | :param val: value to be evaluated 150 | :returns: evaluated value 151 | :rtype: np.float64 or np.ndarray 152 | """ 153 | if isinstance(val, (np.float64, float)): 154 | return val 155 | if isinstance(val, pd.Series) and val.dtype in [np.float64, float]: 156 | return val.values 157 | if hasattr(val, "__iter__") and not isinstance(val, str): 158 | return np.asarray([s if isinstance(s, (np.float64, float)) else np.nan for s in val]) 159 | return np.nan 160 | 161 | 162 | QUANTITY = { 163 | # MB 20210404: to_string for object types b/c it's a mixed type 164 | object: to_str, 165 | np.object_: to_str, 166 | str: only_str, 167 | np.str_: only_str, 168 | int: only_int, 169 | np.int64: only_int, 170 | np.int32: only_int, 171 | bool: only_bool, 172 | np.bool_: only_bool, 173 | float: only_float, 174 | np.float64: only_float, 175 | np.datetime64: only_int, 176 | } 177 | 178 | 179 | def value_to_bin_index(val, **kwargs): 180 | """Convert value to bin index. 181 | 182 | Convert a numeric or timestamp column to an integer bin index. 
183 | 184 | :param binWidth: bin width value needed to convert column 185 | to an integer bin index 186 | :param origin: bin offset value needed to convert column 187 | to an integer bin index 188 | """ 189 | try: 190 | # NOTE this notation also works for timestamps 191 | bin_width = kwargs.get("binWidth", kwargs.get("bin_width", 1)) 192 | bin_offset = kwargs.get("origin", kwargs.get("bin_offset", 0)) 193 | return int(np.floor((val - bin_offset) / bin_width)) 194 | except BaseException: 195 | pass 196 | return val 197 | 198 | 199 | def value_to_bin_center(val, **kwargs): 200 | """Convert value to bin center. 201 | 202 | Convert a numeric or timestamp column to a common bin center value. 203 | 204 | :param binWidth: bin width value needed to convert column 205 | to a common bin center value 206 | :param origin: bin_offset value needed to convert column 207 | to a common bin center value 208 | """ 209 | try: 210 | # NOTE this notation also works for timestamps, and does not change the 211 | # unit 212 | bin_width = kwargs.get("binWidth", kwargs.get("bin_width", 1)) 213 | bin_offset = kwargs.get("origin", kwargs.get("bin_offset", 0)) 214 | bin_index = int(np.floor((val - bin_offset) / bin_width)) 215 | obj_type = type(bin_width) 216 | return bin_offset + obj_type((bin_index + 0.5) * bin_width) 217 | except BaseException: 218 | pass 219 | return val 220 | -------------------------------------------------------------------------------- /tests/resources/isActive_age.json: -------------------------------------------------------------------------------- 1 | {"data": {"bins": {"False": {"binWidth": 1.0, 2 | "bins": {"10": 2.0, 3 | "11": 1.0, 4 | "13": 2.0, 5 | "14": 2.0, 6 | "15": 1.0, 7 | "16": 2.0, 8 | "17": 3.0, 9 | "18": 2.0, 10 | "19": 3.0, 11 | "20": 5.0, 12 | "21": 3.0, 13 | "22": 1.0, 14 | "23": 2.0, 15 | "24": 3.0, 16 | "25": 2.0, 17 | "26": 3.0, 18 | "28": 3.0, 19 | "29": 1.0, 20 | "30": 3.0, 21 | "31": 2.0, 22 | "32": 7.0, 23 | "33": 3.0, 24 | "34": 3.0, 25 | 
"36": 2.0, 26 | "37": 3.0, 27 | "38": 2.0, 28 | "39": 2.0, 29 | "41": 8.0, 30 | "42": 1.0, 31 | "43": 6.0, 32 | "44": 5.0, 33 | "45": 2.0, 34 | "46": 2.0, 35 | "47": 1.0, 36 | "48": 2.0, 37 | "50": 4.0, 38 | "51": 4.0, 39 | "52": 3.0, 40 | "53": 6.0, 41 | "54": 6.0, 42 | "55": 3.0, 43 | "56": 5.0, 44 | "58": 4.0, 45 | "59": 2.0, 46 | "60": 2.0, 47 | "61": 2.0, 48 | "62": 3.0, 49 | "63": 3.0, 50 | "64": 4.0, 51 | "65": 1.0, 52 | "66": 6.0, 53 | "67": 4.0, 54 | "68": 1.0, 55 | "69": 2.0, 56 | "70": 4.0, 57 | "72": 2.0, 58 | "73": 1.0, 59 | "74": 1.0, 60 | "75": 2.0, 61 | "76": 2.0, 62 | "77": 2.0, 63 | "78": 3.0, 64 | "79": 3.0, 65 | "80": 4.0, 66 | "81": 3.0, 67 | "82": 2.0, 68 | "84": 2.0, 69 | "85": 2.0, 70 | "86": 3.0, 71 | "87": 4.0, 72 | "88": 5.0, 73 | "89": 2.0, 74 | "90": 1.0}, 75 | "bins:type": "Count", 76 | "entries": 208.0, 77 | "nanflow": 0.0, 78 | "nanflow:type": "Count", 79 | "origin": 0.0}, 80 | "True": {"binWidth": 1.0, 81 | "bins": {"10": 3.0, 82 | "11": 2.0, 83 | "12": 4.0, 84 | "13": 3.0, 85 | "14": 4.0, 86 | "15": 4.0, 87 | "16": 4.0, 88 | "17": 2.0, 89 | "18": 2.0, 90 | "19": 3.0, 91 | "20": 1.0, 92 | "21": 2.0, 93 | "22": 2.0, 94 | "23": 3.0, 95 | "24": 2.0, 96 | "25": 3.0, 97 | "26": 3.0, 98 | "27": 3.0, 99 | "29": 1.0, 100 | "30": 4.0, 101 | "31": 5.0, 102 | "32": 6.0, 103 | "33": 1.0, 104 | "34": 2.0, 105 | "35": 1.0, 106 | "37": 6.0, 107 | "38": 1.0, 108 | "39": 2.0, 109 | "40": 2.0, 110 | "41": 2.0, 111 | "42": 4.0, 112 | "44": 3.0, 113 | "45": 4.0, 114 | "46": 2.0, 115 | "47": 1.0, 116 | "48": 7.0, 117 | "49": 4.0, 118 | "50": 1.0, 119 | "51": 3.0, 120 | "52": 3.0, 121 | "54": 5.0, 122 | "55": 1.0, 123 | "56": 2.0, 124 | "57": 8.0, 125 | "58": 6.0, 126 | "59": 1.0, 127 | "60": 4.0, 128 | "61": 1.0, 129 | "62": 2.0, 130 | "63": 4.0, 131 | "64": 1.0, 132 | "65": 1.0, 133 | "66": 3.0, 134 | "67": 3.0, 135 | "68": 3.0, 136 | "69": 2.0, 137 | "70": 1.0, 138 | "71": 2.0, 139 | "72": 1.0, 140 | "73": 4.0, 141 | "77": 2.0, 142 | "78": 4.0, 143 | 
"80": 1.0, 144 | "83": 2.0, 145 | "84": 3.0, 146 | "85": 3.0, 147 | "86": 1.0, 148 | "88": 1.0, 149 | "89": 4.0, 150 | "90": 1.0}, 151 | "bins:type": "Count", 152 | "entries": 192.0, 153 | "nanflow": 0.0, 154 | "nanflow:type": "Count", 155 | "origin": 0.0}}, 156 | "bins:type": "SparselyBin", 157 | "entries": 400.0}, 158 | "type": "Categorize", 159 | "version": "1.1"} -------------------------------------------------------------------------------- /histogrammar/primitives/select.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | # Copyright 2016 DIANA-HEP 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | 17 | import math 18 | import numbers 19 | 20 | from histogrammar.defs import ( 21 | Container, 22 | ContainerException, 23 | Factory, 24 | JsonFormatException, 25 | identity, 26 | ) 27 | from histogrammar.primitives.count import Count 28 | from histogrammar.util import ( 29 | basestring, 30 | datatype, 31 | floatToJson, 32 | hasKeys, 33 | inheritdoc, 34 | maybeAdd, 35 | n_dim, 36 | numeq, 37 | serializable, 38 | ) 39 | 40 | # Select 41 | 42 | 43 | class Select(Factory, Container): 44 | """Filter or weight data according to a given selection. 45 | 46 | This primitive is a basic building block, intended to be used in conjunction with anything that needs a 47 | user-defined cut. 
In particular, a standard histogram often has a custom selection, and this can be built by 48 | nesting Select -> Bin -> Count. 49 | 50 | Select also resembles :doc:`Fraction `, but without the ``denominator``. 51 | 52 | The efficiency of a cut in a Select aggregator named ``x`` is simply ``x.cut.entries / x.entries`` 53 | (because all aggregators have an ``entries`` member). 54 | """ 55 | 56 | @staticmethod 57 | def ed(entries, cut): 58 | """Create a Select that is only capable of being added. 59 | 60 | Parameters: 61 | entries (float): the number of entries. 62 | cut (:doc:`Container `): the filled sub-aggregator. 63 | """ 64 | if not isinstance(entries, numbers.Real) and entries not in ( 65 | "nan", 66 | "inf", 67 | "-inf", 68 | ): 69 | raise TypeError(f"entries ({entries}) must be a number") 70 | if not isinstance(cut, Container): 71 | raise TypeError(f"cut ({cut}) must be a Container") 72 | if entries < 0.0: 73 | raise ValueError(f"entries ({entries}) cannot be negative") 74 | out = Select(None, cut) 75 | out.entries = float(entries) 76 | return out.specialize() 77 | 78 | @staticmethod 79 | def ing(quantity, cut=Count()): 80 | """Synonym for ``__init__``.""" 81 | return Select(quantity, cut) 82 | 83 | def __getattr__(self, attr): 84 | """Pass on searches for custom methods to the ``value``, so that Limit becomes effectively invisible.""" 85 | if attr.startswith("__") and attr.endswith("__"): 86 | return getattr(Select, attr) 87 | if attr not in self.__dict__ and hasattr(self.__dict__["cut"], attr): 88 | return getattr(self.__dict__["cut"], attr) 89 | return self.__dict__[attr] 90 | 91 | def __init__(self, quantity=identity, cut=Count()): 92 | """Create a Select that is capable of being filled and added. 93 | 94 | Parameters: 95 | quantity (function returning bool or float): computes the quantity of interest from the data and interprets 96 | it as a selection (multiplicative factor on weight). 
97 | cut (:doc:`Container `): will only be filled with data that pass the cut, 98 | and which are weighted by the cut. 99 | 100 | Other Parameters: 101 | entries (float): the number of entries, initially 0.0. 102 | """ 103 | if not isinstance(cut, Container): 104 | raise TypeError(f"cut ({cut}) must be a Container") 105 | self.entries = 0.0 106 | self.quantity = serializable(identity(quantity) if isinstance(quantity, str) else quantity) 107 | self.cut = cut 108 | super().__init__() 109 | self.specialize() 110 | 111 | def fractionPassing(self): 112 | """Fraction of weights that pass the quantity.""" 113 | return self.cut.entries / self.entries 114 | 115 | @inheritdoc(Container) 116 | def zero(self): 117 | return Select(self.quantity, self.cut.zero()) 118 | 119 | @inheritdoc(Container) 120 | def __add__(self, other): 121 | if isinstance(other, Select): 122 | out = Select(self.quantity, self.cut + other.cut) 123 | out.entries = self.entries + other.entries 124 | return out.specialize() 125 | raise ContainerException(f"cannot add {self.name} and {other.name}") 126 | 127 | @inheritdoc(Container) 128 | def __iadd__(self, other): 129 | if isinstance(other, Select): 130 | self.entries += other.entries 131 | self.cut += other.cut 132 | return self 133 | raise ContainerException(f"cannot add {self.name} and {other.name}") 134 | 135 | @inheritdoc(Container) 136 | def __mul__(self, factor): 137 | if math.isnan(factor) or factor <= 0.0: 138 | return self.zero() 139 | out = self.zero() 140 | out.entries = factor * self.entries 141 | out.cut = self.cut * factor 142 | return out.specialize() 143 | 144 | @inheritdoc(Container) 145 | def __rmul__(self, factor): 146 | return self.__mul__(factor) 147 | 148 | @inheritdoc(Container) 149 | def fill(self, datum, weight=1.0): 150 | self._checkForCrossReferences() 151 | 152 | if weight > 0.0: 153 | w = self.quantity(datum) 154 | if not isinstance(w, numbers.Real): 155 | raise TypeError(f"function return value ({w}) must be boolean or 
number") 156 | w *= weight 157 | 158 | if w > 0.0: 159 | self.cut.fill(datum, w) 160 | # no possibility of exception from here on out (for rollback) 161 | self.entries += weight 162 | 163 | def _numpy(self, data, weights, shape): 164 | w = self.quantity(data) 165 | self._checkNPQuantity(w, shape) 166 | self._checkNPWeights(weights, shape) 167 | weights = self._makeNPWeights(weights, shape) 168 | 169 | import numpy 170 | 171 | w = w * weights 172 | w[numpy.isnan(w)] = 0.0 173 | w[w < 0.0] = 0.0 174 | 175 | self.cut._numpy(data, w, shape) 176 | 177 | # no possibility of exception from here on out (for rollback) 178 | self.entries += float(weights.sum()) 179 | 180 | def _sparksql(self, jvm, converter): 181 | return converter.Select(self.quantity.asSparkSQL(), self.cut._sparksql(jvm, converter)) 182 | 183 | @property 184 | def children(self): 185 | """List of sub-aggregators, to make it possible to walk the tree.""" 186 | return [self.cut] 187 | 188 | @inheritdoc(Container) 189 | def toJsonFragment(self, suppressName): 190 | return maybeAdd( 191 | { 192 | "entries": floatToJson(self.entries), 193 | "sub:type": self.cut.name, 194 | "data": self.cut.toJsonFragment(False), 195 | }, 196 | name=(None if suppressName else self.quantity.name), 197 | ) 198 | 199 | @staticmethod 200 | @inheritdoc(Factory) 201 | def fromJsonFragment(json, nameFromParent): 202 | if isinstance(json, dict) and hasKeys(json.keys(), ["entries", "sub:type", "data"], ["name"]): 203 | if json["entries"] in ("nan", "inf", "-inf") or isinstance(json["entries"], numbers.Real): 204 | entries = float(json["entries"]) 205 | else: 206 | raise JsonFormatException(json, "Select.entries") 207 | 208 | if isinstance(json.get("name", None), basestring): 209 | name = json["name"] 210 | elif json.get("name", None) is None: 211 | name = None 212 | else: 213 | raise JsonFormatException(json["name"], "Select.name") 214 | 215 | if isinstance(json["sub:type"], basestring): 216 | factory = 
Factory.registered[json["sub:type"]] 217 | else: 218 | raise JsonFormatException(json, "Select.type") 219 | 220 | cut = factory.fromJsonFragment(json["data"], None) 221 | 222 | out = Select.ed(entries, cut) 223 | out.quantity.name = nameFromParent if name is None else name 224 | return out.specialize() 225 | 226 | raise JsonFormatException(json, "Select") 227 | 228 | def __repr__(self): 229 | return f"