├── .github └── workflows │ └── test.yml ├── .gitignore ├── .pre-commit-config.yaml ├── CHANGES.rst ├── LICENSE ├── MANIFEST.in ├── NOTICE ├── README.rst ├── docs ├── Makefile ├── conf.py ├── index.rst └── make.bat ├── histogrammar ├── __init__.py ├── convenience.py ├── defs.py ├── dfinterface │ ├── __init__.py │ ├── addmethods.py │ ├── filling_utils.py │ ├── histogram_filler_base.py │ ├── make_histograms.py │ ├── pandas_histogrammar.py │ └── spark_histogrammar.py ├── notebooks │ ├── __init__.py │ ├── histogrammar_tutorial_advanced.ipynb │ ├── histogrammar_tutorial_basic.ipynb │ └── histogrammar_tutorial_exercises.ipynb ├── plot │ ├── __init__.py │ ├── bokeh.py │ ├── hist_numpy.py │ └── matplotlib.py ├── primitives │ ├── __init__.py │ ├── average.py │ ├── bag.py │ ├── bin.py │ ├── categorize.py │ ├── centrallybin.py │ ├── collection.py │ ├── count.py │ ├── deviate.py │ ├── fraction.py │ ├── irregularlybin.py │ ├── minmax.py │ ├── select.py │ ├── sparselybin.py │ ├── stack.py │ └── sum.py ├── resources.py ├── sparksql.py ├── specialized.py ├── test_data │ ├── __init__.py │ └── test.csv.gz ├── util.py └── version.py ├── makedocs.py ├── pyproject.toml └── tests ├── __init__.py ├── conftest.py ├── jars ├── histogrammar-sparksql_2.11-1.0.11.jar ├── histogrammar-sparksql_2.11-1.0.20.jar ├── histogrammar-sparksql_2.12-1.0.11.jar ├── histogrammar-sparksql_2.12-1.0.20.jar ├── histogrammar_2.11-1.0.11.jar ├── histogrammar_2.11-1.0.20.jar ├── histogrammar_2.12-1.0.11.jar └── histogrammar_2.12-1.0.20.jar ├── resources ├── age.json ├── company.json ├── date.json ├── eyesColor.json ├── gender.json ├── isActive.json ├── isActive_age.json ├── latitude.json ├── latitude_longitude.json ├── longitude.json └── transaction.json ├── test_basic.py ├── test_notebooks.py ├── test_numpy.py ├── test_pandas_histogrammar.py ├── test_spark_histogrammar.py └── test_spec.py /.github/workflows/test.yml: -------------------------------------------------------------------------------- 1 | name: test 
2 | 3 | on: 4 | push: 5 | branches: [ master, develop ] 6 | pull_request: 7 | 8 | jobs: 9 | test: 10 | strategy: 11 | matrix: 12 | os: [ ubuntu-latest ] 13 | python: [ "3.9", "3.10", "3.11", "3.12" ] 14 | numpy_version: [ "numpy-latest", "numpy<2" ] 15 | runs-on: ${{ matrix.os }} 16 | 17 | steps: 18 | - name: Checkout code 19 | uses: actions/checkout@v2 20 | 21 | - name: Set up Python ${{ matrix.python }} 22 | uses: actions/setup-python@v1 23 | with: 24 | python-version: ${{ matrix.python }} 25 | 26 | - name: Use cache for pip dependencies 27 | uses: actions/cache@v3 28 | with: 29 | path: ~/.cache/pip 30 | key: ${{ runner.os }}-pip-${{ hashFiles('**/pyproject.toml') }} 31 | restore-keys: | 32 | ${{ runner.os }}-pip- 33 | 34 | - name: Install dependencies 35 | run: | 36 | python -m pip install --upgrade pip 37 | if [ "${{ matrix.numpy_version }}" = "numpy<2" ]; then 38 | pip install ".[test,pandas,spark,test_numpy_pre2]" 39 | else 40 | pip install ".[test,pandas,spark]" 41 | fi 42 | 43 | - name: Lint with pre-commit 44 | run: | 45 | pip install pre-commit 46 | pre-commit run --all-files --show-diff-on-failure 47 | 48 | - name: Test with pytest 49 | run: | 50 | pytest tests 51 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Emacs 2 | *~ 3 | \#*\# 4 | 5 | # Generated by test 6 | plot_*.html 7 | 8 | # Byte-compiled / optimized / DLL files 9 | __pycache__/ 10 | *.py[cod] 11 | *$py.class 12 | 13 | # C extensions 14 | *.so 15 | 16 | # Distribution / packaging 17 | .Python 18 | env/ 19 | build/ 20 | develop-eggs/ 21 | dist/ 22 | downloads/ 23 | eggs/ 24 | .eggs/ 25 | lib/ 26 | lib64/ 27 | parts/ 28 | sdist/ 29 | var/ 30 | *.egg-info/ 31 | .installed.cfg 32 | *.egg 33 | 34 | # PyInstaller 35 | # Usually these files are written by a python script from a template 36 | # before PyInstaller builds the exe, so as to inject date/other infos into 
it. 37 | *.manifest 38 | *.spec 39 | 40 | # Installer logs 41 | pip-log.txt 42 | pip-delete-this-directory.txt 43 | 44 | # Unit test / coverage reports 45 | htmlcov/ 46 | .tox/ 47 | .coverage 48 | .coverage.* 49 | .cache 50 | nosetests.xml 51 | coverage.xml 52 | *,cover 53 | .hypothesis/ 54 | 55 | # Translations 56 | *.mo 57 | *.pot 58 | 59 | # Django stuff: 60 | *.log 61 | local_settings.py 62 | 63 | # Flask stuff: 64 | instance/ 65 | .webassets-cache 66 | 67 | # Scrapy stuff: 68 | .scrapy 69 | 70 | # Sphinx documentation 71 | docs/_build/ 72 | histogrammar.*.rst 73 | 74 | # PyBuilder 75 | target/ 76 | 77 | # IPython Notebook 78 | .ipynb_checkpoints 79 | 80 | # pyenv 81 | .python-version 82 | 83 | # celery beat schedule file 84 | celerybeat-schedule 85 | 86 | # dotenv 87 | .env 88 | 89 | # virtualenv 90 | venv/ 91 | ENV/ 92 | 93 | # Spyder project settings 94 | .spyderproject 95 | 96 | # Rope project settings 97 | .ropeproject 98 | 99 | # tests output files 100 | histogrammar/notebooks/*.json 101 | -------------------------------------------------------------------------------- /.pre-commit-config.yaml: -------------------------------------------------------------------------------- 1 | repos: 2 | - repo: https://github.com/astral-sh/ruff-pre-commit 3 | rev: v0.1.6 4 | hooks: 5 | - id: ruff 6 | args: [--fix] 7 | exclude: notebooks/ 8 | - id: ruff-format 9 | exclude: notebooks/ 10 | -------------------------------------------------------------------------------- /CHANGES.rst: -------------------------------------------------------------------------------- 1 | ============= 2 | Release notes 3 | ============= 4 | 5 | Version 1.1.0, Dec 2024 6 | ----------------------- 7 | * Removed all ROOT, cpp and cuda code, it was no longer supported. 8 | 9 | Version 1.0.34, Dec 2024 10 | ------------------------ 11 | * Fix typo in build pipeline Python versions config list. 12 | * Fix error in SparselyBin __eq__ method. 
13 | * Fix test utility corner case error (test_numpy.twosigfigs function). 14 | * Fix error in test context manager for pandas which prevented execution of tests. 15 | * Fix error in expected bin count in test_numpy.test_n_bins test. 16 | * Prevent logging zero execution time in TestNumpy class. 17 | 18 | * Remove Python 3.8 environment from build pipeline. 19 | * Support numpy >= 2.0.0 (np.string_ -> np.bytes_, np.unicode_ -> np.str_). 20 | * Remove uses of pd.util.testing.makeMixedDataFrame not available in pandas >= 2.0.0. 21 | * Switch from 'pkg_resources' to 'importlib' module for resolving package files. 22 | * Switch from 'distutils.spawn' to 'shutil.which' for finding nvcc command. 23 | 24 | * Remove unused test_gpu.twosigfigs function. 25 | * Refactor tests with Numpy() and Pandas() context managers to use single 'with' statement. 26 | 27 | * Switch from setup.py to pyproject.toml 28 | * Add numpy<2,pandas<2 test environment to build pipeline test matrix 29 | 30 | Version 1.0.33, Dec 2022 31 | ------------------------ 32 | * fix of get_sub_hist() when Bin histogram is filled only with nans. 33 | 34 | Version 1.0.32, Sep 2022 35 | ------------------------ 36 | * Support for decimal datatype in pandas and spark. 37 | 38 | Version 1.0.31, Aug 2022 39 | ------------------------ 40 | * fix of spark df timestamp datatype detection (#59) 41 | * fix for invalid bin_edges for SparselyBin histogram (#60) 42 | 43 | Version 1.0.30, June 2022 44 | ------------------------- 45 | * Fix for machine-level rounding error, which can show up in num_bins() call of Bin histogram. 46 | * supersedes broken v1.0.29 47 | 48 | Version 1.0.28, June 2022 49 | ------------------------- 50 | * Multiple performance updates, to Bin, SparselyBin and Categorize histograms. 51 | * SparselyBin, Categorize: optimized filling with 1-d and 2-d numpy arrays 52 | * Bin, SparselyBin, Categorize: (fast) numpy arrays for bin-centers and bin-labels. 
53 | * Count: new, fast filling option when float weight is known. 54 | * util.py: faster get_datatype() and get_ndim() functions. 55 | 56 | Version 1.0.27, May 2022 57 | ------------------------ 58 | * Multiple performance updates, thanks to Simon Brugman. 59 | * Use pandas functions to infer datatypes and return numpy arrays. 60 | * Turn off unnecessary specialize function (slow) for Count objects. 61 | 62 | Version 1.0.26, Apr 2022 63 | ------------------------ 64 | * Added tutorial notebook with exercises. 65 | * Fixed 2d heatmap for categorical histograms, where one column was accidentally dropped. 66 | 67 | Version 1.0.25, Apr 2021 68 | ------------------------ 69 | * Improve null handling in pandas dataframes, by inferring datatype using pandas' infer_dtype function. 70 | * nans in bool columns get converted to "NaN", so the column keeps True and False values in Categorize. 71 | * columns of type object get converted to strings using to_string(), of type string uses only_str(). 72 | 73 | Version 1.0.24, Apr 2021 74 | ------------------------ 75 | * Categorize histogram now handles nones and nans in friendlier way, they are converted to "NaN". 76 | * make_histogram() now casts spark nulls to nan in case of numeric columns. scala interprets null as 0. 77 | * SparselyBin histograms did not add up nanflow when added. Now fixed. 78 | * Added unit test for doing checks on null conversion to nans 79 | * Use new histogrammar-scala jar files, v1.0.20 80 | * Added histogrammar-scala v1.0.20 jar files to tests/jars/ 81 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 
8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. 
For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. 
Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. 
You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 
122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. 
In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 
175 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include requirements.txt 2 | include LICENSE 3 | include NOTICE -------------------------------------------------------------------------------- /NOTICE: -------------------------------------------------------------------------------- 1 | ################################################################################################ 2 | # 3 | # NOTICE: pass-through licensing of bundled components 4 | # 5 | # Histogrammar gathers together a toolkit of pre-existing third-party 6 | # open-source software components. These software components are governed by their own licenses 7 | # which Histogrammar does not modify or supersede, please consult the originating 8 | # authors. These components altogether have a mixture of the following licenses: Apache 2.0, MIT. 9 | # 10 | # Although we have examined the licenses to verify acceptance of commercial and non-commercial 11 | # use, please see and consult the original licenses or authors. 12 | # 13 | # Here is the full list of license dependencies: 14 | # 15 | # numpy: https://github.com/numpy/numpy/blob/master/LICENSE.txt 16 | # tqdm: https://github.com/tqdm/tqdm/blob/master/LICENCE 17 | # matplotlib: https://github.com/matplotlib/matplotlib/blob/master/LICENSE/LICENSE 18 | # joblib: https://github.com/joblib/joblib/blob/master/LICENSE.txt 19 | # root: https://root.cern.ch/license 20 | # popmon: https://github.com/ing-bank/popmon/blob/master/LICENSE 21 | # 22 | # There are several functions/classes where code or techniques have been reproduced and/or modified 23 | # from existing open-source packages. 
We list these here: 24 | # 25 | # Package: popmon 26 | # popmon file: histogrammar/dfinterface/spark_histogrammar.py 27 | # Class: SparkHistogrammar 28 | # Reference: https://github.com/ing-bank/popmon/blob/master/popmon/hist/filling/spark_histogrammar.py 29 | # popmon file: histogrammar/dfinterface/pandas_histogrammar.py 30 | # Class: PandasHistogrammar 31 | # Reference: https://github.com/ing-bank/popmon/blob/master/popmon/hist/filling/pandas_histogrammar.py 32 | # popmon file: histogrammar/dfinterface/histogram_filler_base.py 33 | # Class: HistogramFillerBase 34 | # Reference: https://github.com/ing-bank/popmon/blob/master/popmon/hist/filling/histogram_filler_base.py 35 | # License: MIT 36 | # For details see: https://github.com/ing-bank/popmon/blob/master/LICENSE 37 | # 38 | ################################################################################################ 39 | -------------------------------------------------------------------------------- /README.rst: -------------------------------------------------------------------------------- 1 | ================================== 2 | histogrammar Python implementation 3 | ================================== 4 | 5 | histogrammar is a Python package for creating histograms. histogrammar has multiple histogram types, 6 | supports numeric and categorical features, and works with Numpy arrays and Pandas and Spark dataframes. 7 | Once a histogram is filled, it's easy to plot it, store it in JSON format (and retrieve it), or convert 8 | it to Numpy arrays for further analysis. 9 | 10 | At its core histogrammar is a suite of data aggregation primitives designed for use in parallel processing. 11 | In the simplest case, you can use this to compute histograms, but the generality of the primitives 12 | allows much more. 13 | 14 | Several common histogram types can be plotted in Matplotlib and Bokeh with a single method call. 
15 | If Numpy or Pandas is available, histograms and other aggregators can be filled from arrays ten to a hundred times 16 | more quickly via Numpy commands, rather than Python for loops. 17 | 18 | This Python implementation of histogrammar has been tested to guarantee compatibility with its Scala implementation. 19 | 20 | Latest Python release: v1.1.0 (Feb 2025). 21 | Latest update: Feb 2025. 22 | 23 | References 24 | ========== 25 | 26 | Histogrammar is a core component of `popmon `_, a package by ING bank 27 | that allows one to check the stability of a dataset. popmon works with both pandas and spark datasets, 28 | largely thanks to Histogrammar. 29 | 30 | 31 | 32 | Announcements 33 | ============= 34 | 35 | Changes 36 | ------- 37 | 38 | See Changes log `here `_. 39 | 40 | 41 | Spark 3.X 42 | --------- 43 | 44 | With Spark 3.X, based on Scala 2.12 or 2.13, make sure to pick up the correct histogrammar jar files: 45 | 46 | .. code-block:: python 47 | 48 | spark = SparkSession.builder.config("spark.jars.packages", "io.github.histogrammar:histogrammar_2.12:1.0.30,io.github.histogrammar:histogrammar-sparksql_2.12:1.0.30").getOrCreate() 49 | 50 | 51 | For Scala 2.13, in the string above simply replace "2.12" with "2.13". 52 | 53 | December, 2023 54 | 55 | 56 | Example notebooks 57 | ================= 58 | 59 | .. list-table:: 60 | :widths: 80 20 61 | :header-rows: 1 62 | 63 | * - Tutorial 64 | - Colab link 65 | * - `Basic tutorial `_ 66 | - |notebook_basic_colab| 67 | * - `Detailed example (featuring configuration, Apache Spark and more) `_ 68 | - |notebook_advanced_colab| 69 | * - `Exercises `_ 70 | - |notebook_exercises_colab| 71 | 72 | Documentation 73 | ============= 74 | 75 | See `histogrammar-docs `_ for a complete introduction to `histogrammar`. 76 | (A bit old but still good.) There you can also find documentation about the Scala implementation of `histogrammar`. 
77 | 78 | Check it out 79 | ============ 80 | 81 | The `histogrammar` library requires Python 3.8+ and is pip friendly. To get started, simply do: 82 | 83 | .. code-block:: bash 84 | 85 | $ pip install histogrammar 86 | 87 | or check out the code from our GitHub repository: 88 | 89 | .. code-block:: bash 90 | 91 | $ git clone https://github.com/histogrammar/histogrammar-python 92 | $ pip install -e histogrammar-python 93 | 94 | where in this example the code is installed in edit mode (option -e). 95 | 96 | You can now use the package in Python with: 97 | 98 | .. code-block:: python 99 | 100 | import histogrammar 101 | 102 | **Congratulations, you are now ready to use the histogrammar library!** 103 | 104 | Quick run 105 | ========= 106 | 107 | As a quick example, you can do: 108 | 109 | .. code-block:: python 110 | 111 | import pandas as pd 112 | import histogrammar as hg 113 | from histogrammar import resources 114 | 115 | # open synthetic data 116 | df = pd.read_csv(resources.data('test.csv.gz'), parse_dates=['date']) 117 | df.head() 118 | 119 | # create a histogram, tell it to look for column 'age' 120 | # fill the histogram with column 'age' and plot it 121 | hist = hg.Histogram(num=100, low=0, high=100, quantity='age') 122 | hist.fill.numpy(df) 123 | hist.plot.matplotlib() 124 | 125 | # generate histograms of all features in the dataframe using automatic binning 126 | # (importing histogrammar automatically adds this functionality to a pandas or spark dataframe) 127 | hists = df.hg_make_histograms() 128 | print(hists.keys()) 129 | 130 | # multi-dimensional histograms are also supported. e.g. 
features longitude vs latitude 131 | hists = df.hg_make_histograms(features=['longitude:latitude']) 132 | ll = hists['longitude:latitude'] 133 | ll.plot.matplotlib() 134 | 135 | # store histogram and retrieve it again 136 | ll.toJsonFile('longitude_latitude.json') 137 | ll2 = hg.Factory().fromJsonFile('longitude_latitude.json') 138 | 139 | These examples also work with Spark dataframes (sdf): 140 | 141 | .. code-block:: python 142 | 143 | from pyspark.sql.functions import col 144 | hist = hg.Histogram(num=100, low=0, high=100, quantity=col('age')) 145 | hist.fill.sparksql(sdf) 146 | 147 | For more examples please see the example notebooks and tutorials. 148 | 149 | 150 | Project contributors 151 | ==================== 152 | 153 | This package was originally authored by DIANA-HEP and is now maintained by volunteers. 154 | 155 | Contact and support 156 | =================== 157 | 158 | * Issues & Ideas & Support: https://github.com/histogrammar/histogrammar-python/issues 159 | 160 | Please note that `histogrammar` is supported only on a best-effort basis. 161 | 162 | License 163 | ======= 164 | `histogrammar` is completely free, open-source and licensed under the `Apache-2.0 license `_. 165 | 166 | .. |notebook_basic_colab| image:: https://colab.research.google.com/assets/colab-badge.svg 167 | :alt: Open in Colab 168 | :target: https://colab.research.google.com/github/histogrammar/histogrammar-python/blob/master/histogrammar/notebooks/histogrammar_tutorial_basic.ipynb 169 | .. |notebook_advanced_colab| image:: https://colab.research.google.com/assets/colab-badge.svg 170 | :alt: Open in Colab 171 | :target: https://colab.research.google.com/github/histogrammar/histogrammar-python/blob/master/histogrammar/notebooks/histogrammar_tutorial_advanced.ipynb 172 | .. 
|notebook_exercises_colab| image:: https://colab.research.google.com/assets/colab-badge.svg 173 | :alt: Open in Colab 174 | :target: https://colab.research.google.com/github/histogrammar/histogrammar-python/blob/master/histogrammar/notebooks/histogrammar_tutorial_exercises.ipynb 175 | -------------------------------------------------------------------------------- /docs/Makefile: -------------------------------------------------------------------------------- 1 | # Makefile for Sphinx documentation 2 | # 3 | 4 | # You can set these variables from the command line. 5 | SPHINXOPTS = 6 | SPHINXBUILD = sphinx-build 7 | PAPER = 8 | BUILDDIR = _build 9 | 10 | # Internal variables. 11 | PAPEROPT_a4 = -D latex_paper_size=a4 12 | PAPEROPT_letter = -D latex_paper_size=letter 13 | ALLSPHINXOPTS = -d $(BUILDDIR)/doctrees $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) . 14 | # the i18n builder cannot share the environment and doctrees with the others 15 | I18NSPHINXOPTS = $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) . 16 | 17 | .PHONY: help clean html dirhtml singlehtml pickle json htmlhelp qthelp devhelp epub latex latexpdf text man changes linkcheck doctest gettext 18 | 19 | help: 20 | @echo "Please use \`make ' where is one of" 21 | @echo " html to make standalone HTML files" 22 | @echo " dirhtml to make HTML files named index.html in directories" 23 | @echo " singlehtml to make a single large HTML file" 24 | @echo " pickle to make pickle files" 25 | @echo " json to make JSON files" 26 | @echo " htmlhelp to make HTML files and a HTML help project" 27 | @echo " qthelp to make HTML files and a qthelp project" 28 | @echo " devhelp to make HTML files and a Devhelp project" 29 | @echo " epub to make an epub" 30 | @echo " latex to make LaTeX files, you can set PAPER=a4 or PAPER=letter" 31 | @echo " latexpdf to make LaTeX files and run them through pdflatex" 32 | @echo " text to make text files" 33 | @echo " man to make manual pages" 34 | @echo " texinfo to make Texinfo files" 35 | @echo " info to make 
Texinfo files and run them through makeinfo" 36 | @echo " gettext to make PO message catalogs" 37 | @echo " changes to make an overview of all changed/added/deprecated items" 38 | @echo " linkcheck to check all external links for integrity" 39 | @echo " doctest to run all doctests embedded in the documentation (if enabled)" 40 | 41 | clean: 42 | -rm -rf $(BUILDDIR)/* 43 | 44 | html: 45 | $(SPHINXBUILD) -b html $(ALLSPHINXOPTS) $(BUILDDIR)/html 46 | @echo 47 | @echo "Build finished. The HTML pages are in $(BUILDDIR)/html." 48 | 49 | dirhtml: 50 | $(SPHINXBUILD) -b dirhtml $(ALLSPHINXOPTS) $(BUILDDIR)/dirhtml 51 | @echo 52 | @echo "Build finished. The HTML pages are in $(BUILDDIR)/dirhtml." 53 | 54 | singlehtml: 55 | $(SPHINXBUILD) -b singlehtml $(ALLSPHINXOPTS) $(BUILDDIR)/singlehtml 56 | @echo 57 | @echo "Build finished. The HTML page is in $(BUILDDIR)/singlehtml." 58 | 59 | pickle: 60 | $(SPHINXBUILD) -b pickle $(ALLSPHINXOPTS) $(BUILDDIR)/pickle 61 | @echo 62 | @echo "Build finished; now you can process the pickle files." 63 | 64 | json: 65 | $(SPHINXBUILD) -b json $(ALLSPHINXOPTS) $(BUILDDIR)/json 66 | @echo 67 | @echo "Build finished; now you can process the JSON files." 68 | 69 | htmlhelp: 70 | $(SPHINXBUILD) -b htmlhelp $(ALLSPHINXOPTS) $(BUILDDIR)/htmlhelp 71 | @echo 72 | @echo "Build finished; now you can run HTML Help Workshop with the" \ 73 | ".hhp project file in $(BUILDDIR)/htmlhelp." 74 | 75 | qthelp: 76 | $(SPHINXBUILD) -b qthelp $(ALLSPHINXOPTS) $(BUILDDIR)/qthelp 77 | @echo 78 | @echo "Build finished; now you can run "qcollectiongenerator" with the" \ 79 | ".qhcp project file in $(BUILDDIR)/qthelp, like this:" 80 | @echo "# qcollectiongenerator $(BUILDDIR)/qthelp/Histogrammar.qhcp" 81 | @echo "To view the help file:" 82 | @echo "# assistant -collectionFile $(BUILDDIR)/qthelp/Histogrammar.qhc" 83 | 84 | devhelp: 85 | $(SPHINXBUILD) -b devhelp $(ALLSPHINXOPTS) $(BUILDDIR)/devhelp 86 | @echo 87 | @echo "Build finished." 
88 | @echo "To view the help file:" 89 | @echo "# mkdir -p $$HOME/.local/share/devhelp/Histogrammar" 90 | @echo "# ln -s $(BUILDDIR)/devhelp $$HOME/.local/share/devhelp/Histogrammar" 91 | @echo "# devhelp" 92 | 93 | epub: 94 | $(SPHINXBUILD) -b epub $(ALLSPHINXOPTS) $(BUILDDIR)/epub 95 | @echo 96 | @echo "Build finished. The epub file is in $(BUILDDIR)/epub." 97 | 98 | latex: 99 | $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex 100 | @echo 101 | @echo "Build finished; the LaTeX files are in $(BUILDDIR)/latex." 102 | @echo "Run \`make' in that directory to run these through (pdf)latex" \ 103 | "(use \`make latexpdf' here to do that automatically)." 104 | 105 | latexpdf: 106 | $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex 107 | @echo "Running LaTeX files through pdflatex..." 108 | $(MAKE) -C $(BUILDDIR)/latex all-pdf 109 | @echo "pdflatex finished; the PDF files are in $(BUILDDIR)/latex." 110 | 111 | text: 112 | $(SPHINXBUILD) -b text $(ALLSPHINXOPTS) $(BUILDDIR)/text 113 | @echo 114 | @echo "Build finished. The text files are in $(BUILDDIR)/text." 115 | 116 | man: 117 | $(SPHINXBUILD) -b man $(ALLSPHINXOPTS) $(BUILDDIR)/man 118 | @echo 119 | @echo "Build finished. The manual pages are in $(BUILDDIR)/man." 120 | 121 | texinfo: 122 | $(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo 123 | @echo 124 | @echo "Build finished. The Texinfo files are in $(BUILDDIR)/texinfo." 125 | @echo "Run \`make' in that directory to run these through makeinfo" \ 126 | "(use \`make info' here to do that automatically)." 127 | 128 | info: 129 | $(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo 130 | @echo "Running Texinfo files through makeinfo..." 131 | make -C $(BUILDDIR)/texinfo info 132 | @echo "makeinfo finished; the Info files are in $(BUILDDIR)/texinfo." 133 | 134 | gettext: 135 | $(SPHINXBUILD) -b gettext $(I18NSPHINXOPTS) $(BUILDDIR)/locale 136 | @echo 137 | @echo "Build finished. The message catalogs are in $(BUILDDIR)/locale." 
138 | 139 | changes: 140 | $(SPHINXBUILD) -b changes $(ALLSPHINXOPTS) $(BUILDDIR)/changes 141 | @echo 142 | @echo "The overview file is in $(BUILDDIR)/changes." 143 | 144 | linkcheck: 145 | $(SPHINXBUILD) -b linkcheck $(ALLSPHINXOPTS) $(BUILDDIR)/linkcheck 146 | @echo 147 | @echo "Link check complete; look for any errors in the above output " \ 148 | "or in $(BUILDDIR)/linkcheck/output.txt." 149 | 150 | doctest: 151 | $(SPHINXBUILD) -b doctest $(ALLSPHINXOPTS) $(BUILDDIR)/doctest 152 | @echo "Testing of doctests in the sources finished, look at the " \ 153 | "results in $(BUILDDIR)/doctest/output.txt." 154 | -------------------------------------------------------------------------------- /docs/conf.py: -------------------------------------------------------------------------------- 1 | # 2 | # Histogrammar documentation build configuration file. 3 | # 4 | # This file is execfile()d with the current directory set to its containing dir. 5 | # 6 | # Note that not all possible configuration values are present in this 7 | # autogenerated file. 8 | # 9 | # All configuration values have a default; values that are commented out 10 | # serve to show the default. 11 | 12 | import importlib 13 | import inspect 14 | 15 | # If extensions (or modules to document with autodoc) are in another directory, 16 | # add these directories to sys.path here. If the directory is relative to the 17 | # documentation root, use os.path.abspath to make it absolute, like shown here. 18 | # sys.path.insert(0, os.path.abspath('.')) 19 | 20 | # -- General configuration ----------------------------------------------------- 21 | 22 | # If your documentation needs a minimal Sphinx version, state it here. 23 | # needs_sphinx = '1.0' 24 | 25 | # Add any Sphinx extension module names here, as strings. They can be extensions 26 | # coming with Sphinx (named 'sphinx.ext.*') or your custom ones. 
27 | extensions = [ 28 | "sphinx.ext.autodoc", 29 | "sphinxcontrib.napoleon", 30 | "sphinx.ext.doctest", 31 | "sphinx.ext.intersphinx", 32 | "sphinx.ext.todo", 33 | "sphinx.ext.coverage", 34 | "sphinx.ext.mathjax", 35 | "sphinx.ext.ifconfig", 36 | "sphinx.ext.linkcode", 37 | ] 38 | 39 | # Add any paths that contain templates here, relative to this directory. 40 | templates_path = ["_templates"] 41 | 42 | # The suffix of source filenames. 43 | source_suffix = ".rst" 44 | 45 | # The encoding of source files. 46 | # source_encoding = 'utf-8-sig' 47 | 48 | # The master toctree document. 49 | master_doc = "index" 50 | 51 | # General information about the project. 52 | project = "Histogrammar" 53 | copyright = "2016, DIANA-HEP" 54 | 55 | # The version info for the project you're documenting, acts as replacement for 56 | # |version| and |release|, also used in various other places throughout the 57 | # built documents. 58 | # 59 | # The short X.Y version. 60 | version = "1.0.11" 61 | # The full version, including alpha/beta/rc tags. 62 | release = "1.0.11" 63 | 64 | # The language for content autogenerated by Sphinx. Refer to documentation 65 | # for a list of supported languages. 66 | # language = None 67 | 68 | # There are two options for replacing |today|: either, you set today to some 69 | # non-false value, then it is used: 70 | # today = '' 71 | # Else, today_fmt is used as the format for a strftime call. 72 | # today_fmt = '%B %d, %Y' 73 | 74 | # List of patterns, relative to source directory, that match files and 75 | # directories to ignore when looking for source files. 76 | exclude_patterns = ["_build"] 77 | 78 | # The reST default role (used for this markup: `text`) to use for all documents. 79 | # default_role = None 80 | 81 | # If true, '()' will be appended to :func: etc. cross-reference text. 82 | # add_function_parentheses = True 83 | 84 | # If true, the current module name will be prepended to all description 85 | # unit titles (such as .. function::). 
86 | # add_module_names = True 87 | 88 | # If true, sectionauthor and moduleauthor directives will be shown in the 89 | # output. They are ignored by default. 90 | # show_authors = False 91 | 92 | # The name of the Pygments (syntax highlighting) style to use. 93 | pygments_style = "sphinx" 94 | 95 | # A list of ignored prefixes for module index sorting. 96 | # modindex_common_prefix = [] 97 | 98 | 99 | # -- Options for HTML output --------------------------------------------------- 100 | 101 | # The theme to use for HTML and HTML Help pages. See the documentation for 102 | # a list of builtin themes. 103 | html_theme = "sphinxdoc" 104 | 105 | # Theme options are theme-specific and customize the look and feel of a theme 106 | # further. For a list of options available for each theme, see the 107 | # documentation. 108 | html_theme_options = {"nosidebar": True} 109 | 110 | # Add any paths that contain custom themes here, relative to this directory. 111 | # html_theme_path = [] 112 | 113 | # The name for this set of Sphinx documents. If None, it defaults to 114 | # " v documentation". 115 | # html_title = None 116 | 117 | # A shorter title for the navigation bar. Default is the same as html_title. 118 | # html_short_title = None 119 | 120 | # The name of an image file (relative to this directory) to place at the top 121 | # of the sidebar. 122 | # html_logo = None 123 | 124 | # The name of an image file (within the static path) to use as favicon of the 125 | # docs. This file should be a Windows icon file (.ico) being 16x16 or 32x32 126 | # pixels large. 127 | # html_favicon = None 128 | 129 | # Add any paths that contain custom static files (such as style sheets) here, 130 | # relative to this directory. They are copied after the builtin static files, 131 | # so a file named "default.css" will overwrite the builtin "default.css". 
132 | html_static_path = ["static"] 133 | 134 | # If not '', a 'Last updated on:' timestamp is inserted at every page bottom, 135 | # using the given strftime format. 136 | # html_last_updated_fmt = '%b %d, %Y' 137 | 138 | # If true, SmartyPants will be used to convert quotes and dashes to 139 | # typographically correct entities. 140 | # html_use_smartypants = True 141 | 142 | # Custom sidebar templates, maps document names to template names. 143 | # html_sidebars = {} 144 | 145 | # Additional templates that should be rendered to pages, maps page names to 146 | # template names. 147 | # html_additional_pages = {} 148 | 149 | # If false, no module index is generated. 150 | # html_domain_indices = True 151 | 152 | # If false, no index is generated. 153 | # html_use_index = True 154 | 155 | # If true, the index is split into individual pages for each letter. 156 | # html_split_index = False 157 | 158 | # If true, links to the reST sources are added to the pages. 159 | # html_show_sourcelink = True 160 | 161 | # If true, "Created using Sphinx" is shown in the HTML footer. Default is True. 162 | # html_show_sphinx = True 163 | 164 | # If true, "(C) Copyright ..." is shown in the HTML footer. Default is True. 165 | # html_show_copyright = True 166 | 167 | # If true, an OpenSearch description file will be output, and all pages will 168 | # contain a tag referring to it. The value of this option must be the 169 | # base URL from which the finished HTML is served. 170 | # html_use_opensearch = '' 171 | 172 | # This is the file name suffix for HTML files (e.g. ".xhtml"). 173 | # html_file_suffix = None 174 | 175 | # Output file base name for HTML help builder. 176 | htmlhelp_basename = "Histogrammardoc" 177 | 178 | 179 | # -- Options for LaTeX output -------------------------------------------------- 180 | 181 | latex_elements = { 182 | # # The paper size ('letterpaper' or 'a4paper'). 183 | # 'papersize': 'letterpaper', 184 | # # The font size ('10pt', '11pt' or '12pt'). 
185 | # 'pointsize': '10pt', 186 | # # Additional stuff for the LaTeX preamble. 187 | # 'preamble': '', 188 | } 189 | 190 | # Grouping the document tree into LaTeX files. List of tuples 191 | # (source start file, target name, title, author, documentclass [howto/manual]). 192 | latex_documents = [ 193 | ("index", "Histogrammar.tex", "Histogrammar Documentation", "DIANA-HEP", "manual"), 194 | ] 195 | 196 | # The name of an image file (relative to this directory) to place at the top of 197 | # the title page. 198 | # latex_logo = None 199 | 200 | # For "manual" documents, if this is true, then toplevel headings are parts, 201 | # not chapters. 202 | # latex_use_parts = False 203 | 204 | # If true, show page references after internal links. 205 | # latex_show_pagerefs = False 206 | 207 | # If true, show URL addresses after external links. 208 | # latex_show_urls = False 209 | 210 | # Documents to append as an appendix to all manuals. 211 | # latex_appendices = [] 212 | 213 | # If false, no module index is generated. 214 | # latex_domain_indices = True 215 | 216 | 217 | # -- Options for manual page output -------------------------------------------- 218 | 219 | # One entry per manual page. List of tuples 220 | # (source start file, name, description, authors, manual section). 221 | man_pages = [("index", "histogrammar", "Histogrammar Documentation", ["DIANA-HEP"], 1)] 222 | 223 | # If true, show URL addresses after external links. 224 | # man_show_urls = False 225 | 226 | 227 | # -- Options for Texinfo output ------------------------------------------------ 228 | 229 | # Grouping the document tree into Texinfo files. 
List of tuples 230 | # (source start file, target name, title, author, 231 | # dir menu entry, description, category) 232 | texinfo_documents = [ 233 | ( 234 | "index", 235 | "Histogrammar", 236 | "Histogrammar Documentation", 237 | "DIANA-HEP", 238 | "Histogrammar", 239 | "One line description of project.", 240 | "Miscellaneous", 241 | ), 242 | ] 243 | 244 | # Documents to append as an appendix to all manuals. 245 | # texinfo_appendices = [] 246 | 247 | # If false, no module index is generated. 248 | # texinfo_domain_indices = True 249 | 250 | # How to display URL addresses: 'footnote', 'no', or 'inline'. 251 | # texinfo_show_urls = 'footnote' 252 | 253 | 254 | # -- Options for Epub output --------------------------------------------------- 255 | 256 | # Bibliographic Dublin Core info. 257 | epub_title = "Histogrammar" 258 | epub_author = "DIANA-HEP" 259 | epub_publisher = "DIANA-HEP" 260 | epub_copyright = "2016, DIANA-HEP" 261 | 262 | # The language of the text. It defaults to the language option 263 | # or en if the language is not set. 264 | # epub_language = '' 265 | 266 | # The scheme of the identifier. Typical schemes are ISBN or URL. 267 | # epub_scheme = '' 268 | 269 | # The unique identifier of the text. This can be a ISBN number 270 | # or the project homepage. 271 | # epub_identifier = '' 272 | 273 | # A unique identification for the text. 274 | # epub_uid = '' 275 | 276 | # A tuple containing the cover image and cover page html template filenames. 277 | # epub_cover = () 278 | 279 | # HTML files that should be inserted before the pages created by sphinx. 280 | # The format is a list of tuples containing the path and title. 281 | # epub_pre_files = [] 282 | 283 | # HTML files shat should be inserted after the pages created by sphinx. 284 | # The format is a list of tuples containing the path and title. 285 | # epub_post_files = [] 286 | 287 | # A list of files that should not be packed into the epub file. 
288 | # epub_exclude_files = [] 289 | 290 | # The depth of the table of contents in toc.ncx. 291 | # epub_tocdepth = 3 292 | 293 | # Allow duplicate toc entries. 294 | # epub_tocdup = True 295 | 296 | 297 | # Example configuration for intersphinx: refer to the Python standard library. 298 | intersphinx_mapping = {"http://docs.python.org/": None} 299 | 300 | 301 | def skip(app, what, name, obj, skip, options): 302 | if name == "__init__": 303 | return False 304 | return skip 305 | 306 | 307 | def setup(app): 308 | app.connect("autodoc-skip-member", skip) 309 | 310 | 311 | def linkcode_resolve(domain, info): 312 | if domain != "py": 313 | return None 314 | if not info["module"]: 315 | return None 316 | fileName = info["module"].replace(".", "/") 317 | 318 | try: 319 | lineNumber = inspect.getsourcelines(getattr(importlib.import_module(info["module"]), info["fullname"]))[1] 320 | except: # noqa: E722 321 | return None 322 | else: 323 | return "https://github.com/histogrammar/histogrammar-python/blob/%s/%s.py#L%d" % (release, fileName, lineNumber) 324 | -------------------------------------------------------------------------------- /docs/index.rst: -------------------------------------------------------------------------------- 1 | Histogrammar |version| for Python 2 | ================================= 3 | 4 | All aggregation primitives descend from two classes, :doc:`Container ` and :doc:`Factory `. Container defines all the methods for the primitive to aggregate and contain data, while Factory has methods for making containers. (In other languages, the two roles are distinct.) 5 | 6 | The "functions" passed to these primitives may be Python lambda functions, normally defined functions (with ``def``), or strings, which may be interpreted different ways by different back-ends. All primitives immediately wrap your functions as :doc:`UserFcn `, which are serializable (with ``pickle``), may be cached (:doc:`CachedFcn `), and may have a name. 
Although the primitives wrap your function automatically, you may do it yourself to add features, like caching or a name. See :doc:`serializable `, :doc:`cached `, and :doc:`named `. 7 | 8 | The primitive classes are listed below, grouped by kind. See the index for a list of all classes, members, and functions. 9 | 10 | Zeroth kind: depend only on weights 11 | ----------------------------------- 12 | 13 | :doc:`Count `: sum of weights 14 | Count entries by accumulating the sum of all observed weights or a sum of transformed weights (e.g. sum of squares of weights). 15 | 16 | First kind: aggregate data without sub-aggregators 17 | ---------------------------------------------------- 18 | 19 | :doc:`Sum `: sum of a given quantity 20 | Accumulate the (weighted) sum of a given quantity, calculated from the data. 21 | 22 | :doc:`Average `: mean of a quantity 23 | Accumulate the weighted mean of a given quantity. 24 | 25 | :doc:`Deviate `: mean and variance 26 | Accumulate the weighted mean and weighted variance of a given quantity. 27 | 28 | :doc:`Minimize `: minimum value 29 | Find the minimum value of a given quantity. If no data are observed, the result is NaN. 30 | 31 | :doc:`Maximize `: maximum value 32 | Find the maximum value of a given quantity. If no data are observed, the result is NaN. 33 | 34 | :doc:`Bag `: accumulate values for scatter plots 35 | Accumulate raw numbers, vectors of numbers, or strings, with identical values merged. 36 | 37 | Second kind: pass to different sub-aggregators based on values seen in data 38 | --------------------------------------------------------------------------- 39 | 40 | :doc:`Bin `: regular binning for histograms 41 | Split a quantity into equally spaced bins between a low and high threshold and fill exactly one bin per datum. 42 | 43 | :doc:`SparselyBin `: ignore zeros 44 | Split a quantity into equally spaced bins, creating them whenever their entries would be non-zero. Exactly one sub-aggregator is filled per datum.
45 | 46 | :doc:`CentrallyBin `: irregular but fully partitioning 47 | Split a quantity into bins defined by irregularly spaced bin centers, with exactly one sub-aggregator filled per datum (the closest one). 48 | 49 | :doc:`IrregularlyBin `: exclusive filling 50 | Accumulate a suite of aggregators, each between two thresholds, filling exactly one per datum. 51 | 52 | :doc:`Categorize `: string-valued bins, bar charts 53 | Split a given quantity by its categorical value and fill only one category per datum. 54 | 55 | :doc:`Fraction `: efficiency plots 56 | Accumulate two aggregators, one containing only entries that pass a given selection (numerator) and another that contains all entries (denominator). 57 | 58 | :doc:`Stack `: cumulative filling 59 | Accumulates a suite of aggregators, each filtered with a tighter selection on the same quantity. 60 | 61 | :doc:`Select `: apply a cut 62 | Filter or weight data according to a given selection. 63 | 64 | Third kind: broadcast to every sub-aggregator, independent of data 65 | ------------------------------------------------------------------ 66 | 67 | :doc:`Label `: directory with string-based keys 68 | Accumulate any number of aggregators of the same type and label them with strings. Every sub-aggregator is filled with every input datum. 69 | 70 | :doc:`UntypedLabel `: directory of different types 71 | Accumulate any number of aggregators of any type and label them with strings. Every sub-aggregator is filled with every input datum. 72 | 73 | :doc:`Index `: list with integer keys 74 | Accumulate any number of aggregators of the same type in a list. Every sub-aggregator is filled with every input datum. 75 | 76 | :doc:`Branch `: tuple of different types 77 | Accumulate aggregators of different types, indexed by i0 through i9. Every sub-aggregator is filled with every input datum. 
78 | -------------------------------------------------------------------------------- /docs/make.bat: -------------------------------------------------------------------------------- 1 | @ECHO OFF 2 | 3 | REM Command file for Sphinx documentation 4 | 5 | if "%SPHINXBUILD%" == "" ( 6 | set SPHINXBUILD=sphinx-build 7 | ) 8 | set BUILDDIR=_build 9 | set ALLSPHINXOPTS=-d %BUILDDIR%/doctrees %SPHINXOPTS% . 10 | set I18NSPHINXOPTS=%SPHINXOPTS% . 11 | if NOT "%PAPER%" == "" ( 12 | set ALLSPHINXOPTS=-D latex_paper_size=%PAPER% %ALLSPHINXOPTS% 13 | set I18NSPHINXOPTS=-D latex_paper_size=%PAPER% %I18NSPHINXOPTS% 14 | ) 15 | 16 | if "%1" == "" goto help 17 | 18 | if "%1" == "help" ( 19 | :help 20 | echo.Please use `make ^` where ^ is one of 21 | echo. html to make standalone HTML files 22 | echo. dirhtml to make HTML files named index.html in directories 23 | echo. singlehtml to make a single large HTML file 24 | echo. pickle to make pickle files 25 | echo. json to make JSON files 26 | echo. htmlhelp to make HTML files and a HTML help project 27 | echo. qthelp to make HTML files and a qthelp project 28 | echo. devhelp to make HTML files and a Devhelp project 29 | echo. epub to make an epub 30 | echo. latex to make LaTeX files, you can set PAPER=a4 or PAPER=letter 31 | echo. text to make text files 32 | echo. man to make manual pages 33 | echo. texinfo to make Texinfo files 34 | echo. gettext to make PO message catalogs 35 | echo. changes to make an overview over all changed/added/deprecated items 36 | echo. linkcheck to check all external links for integrity 37 | echo. doctest to run all doctests embedded in the documentation if enabled 38 | goto end 39 | ) 40 | 41 | if "%1" == "clean" ( 42 | for /d %%i in (%BUILDDIR%\*) do rmdir /q /s %%i 43 | del /q /s %BUILDDIR%\* 44 | goto end 45 | ) 46 | 47 | if "%1" == "html" ( 48 | %SPHINXBUILD% -b html %ALLSPHINXOPTS% %BUILDDIR%/html 49 | if errorlevel 1 exit /b 1 50 | echo. 51 | echo.Build finished. 
The HTML pages are in %BUILDDIR%/html. 52 | goto end 53 | ) 54 | 55 | if "%1" == "dirhtml" ( 56 | %SPHINXBUILD% -b dirhtml %ALLSPHINXOPTS% %BUILDDIR%/dirhtml 57 | if errorlevel 1 exit /b 1 58 | echo. 59 | echo.Build finished. The HTML pages are in %BUILDDIR%/dirhtml. 60 | goto end 61 | ) 62 | 63 | if "%1" == "singlehtml" ( 64 | %SPHINXBUILD% -b singlehtml %ALLSPHINXOPTS% %BUILDDIR%/singlehtml 65 | if errorlevel 1 exit /b 1 66 | echo. 67 | echo.Build finished. The HTML pages are in %BUILDDIR%/singlehtml. 68 | goto end 69 | ) 70 | 71 | if "%1" == "pickle" ( 72 | %SPHINXBUILD% -b pickle %ALLSPHINXOPTS% %BUILDDIR%/pickle 73 | if errorlevel 1 exit /b 1 74 | echo. 75 | echo.Build finished; now you can process the pickle files. 76 | goto end 77 | ) 78 | 79 | if "%1" == "json" ( 80 | %SPHINXBUILD% -b json %ALLSPHINXOPTS% %BUILDDIR%/json 81 | if errorlevel 1 exit /b 1 82 | echo. 83 | echo.Build finished; now you can process the JSON files. 84 | goto end 85 | ) 86 | 87 | if "%1" == "htmlhelp" ( 88 | %SPHINXBUILD% -b htmlhelp %ALLSPHINXOPTS% %BUILDDIR%/htmlhelp 89 | if errorlevel 1 exit /b 1 90 | echo. 91 | echo.Build finished; now you can run HTML Help Workshop with the ^ 92 | .hhp project file in %BUILDDIR%/htmlhelp. 93 | goto end 94 | ) 95 | 96 | if "%1" == "qthelp" ( 97 | %SPHINXBUILD% -b qthelp %ALLSPHINXOPTS% %BUILDDIR%/qthelp 98 | if errorlevel 1 exit /b 1 99 | echo. 100 | echo.Build finished; now you can run "qcollectiongenerator" with the ^ 101 | .qhcp project file in %BUILDDIR%/qthelp, like this: 102 | echo.^> qcollectiongenerator %BUILDDIR%\qthelp\Histogrammar.qhcp 103 | echo.To view the help file: 104 | echo.^> assistant -collectionFile %BUILDDIR%\qthelp\Histogrammar.qhc 105 | goto end 106 | ) 107 | 108 | if "%1" == "devhelp" ( 109 | %SPHINXBUILD% -b devhelp %ALLSPHINXOPTS% %BUILDDIR%/devhelp 110 | if errorlevel 1 exit /b 1 111 | echo. 112 | echo.Build finished.
113 | goto end 114 | ) 115 | 116 | if "%1" == "epub" ( 117 | %SPHINXBUILD% -b epub %ALLSPHINXOPTS% %BUILDDIR%/epub 118 | if errorlevel 1 exit /b 1 119 | echo. 120 | echo.Build finished. The epub file is in %BUILDDIR%/epub. 121 | goto end 122 | ) 123 | 124 | if "%1" == "latex" ( 125 | %SPHINXBUILD% -b latex %ALLSPHINXOPTS% %BUILDDIR%/latex 126 | if errorlevel 1 exit /b 1 127 | echo. 128 | echo.Build finished; the LaTeX files are in %BUILDDIR%/latex. 129 | goto end 130 | ) 131 | 132 | if "%1" == "text" ( 133 | %SPHINXBUILD% -b text %ALLSPHINXOPTS% %BUILDDIR%/text 134 | if errorlevel 1 exit /b 1 135 | echo. 136 | echo.Build finished. The text files are in %BUILDDIR%/text. 137 | goto end 138 | ) 139 | 140 | if "%1" == "man" ( 141 | %SPHINXBUILD% -b man %ALLSPHINXOPTS% %BUILDDIR%/man 142 | if errorlevel 1 exit /b 1 143 | echo. 144 | echo.Build finished. The manual pages are in %BUILDDIR%/man. 145 | goto end 146 | ) 147 | 148 | if "%1" == "texinfo" ( 149 | %SPHINXBUILD% -b texinfo %ALLSPHINXOPTS% %BUILDDIR%/texinfo 150 | if errorlevel 1 exit /b 1 151 | echo. 152 | echo.Build finished. The Texinfo files are in %BUILDDIR%/texinfo. 153 | goto end 154 | ) 155 | 156 | if "%1" == "gettext" ( 157 | %SPHINXBUILD% -b gettext %I18NSPHINXOPTS% %BUILDDIR%/locale 158 | if errorlevel 1 exit /b 1 159 | echo. 160 | echo.Build finished. The message catalogs are in %BUILDDIR%/locale. 161 | goto end 162 | ) 163 | 164 | if "%1" == "changes" ( 165 | %SPHINXBUILD% -b changes %ALLSPHINXOPTS% %BUILDDIR%/changes 166 | if errorlevel 1 exit /b 1 167 | echo. 168 | echo.The overview file is in %BUILDDIR%/changes. 169 | goto end 170 | ) 171 | 172 | if "%1" == "linkcheck" ( 173 | %SPHINXBUILD% -b linkcheck %ALLSPHINXOPTS% %BUILDDIR%/linkcheck 174 | if errorlevel 1 exit /b 1 175 | echo. 176 | echo.Link check complete; look for any errors in the above output ^ 177 | or in %BUILDDIR%/linkcheck/output.txt. 
178 | goto end 179 | ) 180 | 181 | if "%1" == "doctest" ( 182 | %SPHINXBUILD% -b doctest %ALLSPHINXOPTS% %BUILDDIR%/doctest 183 | if errorlevel 1 exit /b 1 184 | echo. 185 | echo.Testing of doctests in the sources finished, look at the ^ 186 | results in %BUILDDIR%/doctest/output.txt. 187 | goto end 188 | ) 189 | 190 | :end 191 | -------------------------------------------------------------------------------- /histogrammar/__init__.py: -------------------------------------------------------------------------------- 1 | # flake8: noqa 2 | 3 | #!/usr/bin/env python 4 | 5 | # Copyright 2016 DIANA-HEP 6 | # 7 | # Licensed under the Apache License, Version 2.0 (the "License"); 8 | # you may not use this file except in compliance with the License. 9 | # You may obtain a copy of the License at 10 | # 11 | # http://www.apache.org/licenses/LICENSE-2.0 12 | # 13 | # Unless required by applicable law or agreed to in writing, software 14 | # distributed under the License is distributed on an "AS IS" BASIS, 15 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 16 | # See the License for the specific language governing permissions and 17 | # limitations under the License. 
18 | 19 | # handy monkey patch functions for pandas and spark dataframes 20 | import histogrammar.dfinterface 21 | from histogrammar.convenience import ( 22 | Histogram, 23 | Profile, 24 | ProfileErr, 25 | SparselyHistogram, 26 | SparselyProfile, 27 | SparselyProfileErr, 28 | TwoDimensionallyHistogram, 29 | TwoDimensionallySparselyHistogram, 30 | ) 31 | from histogrammar.defs import Container, Factory 32 | from histogrammar.primitives.average import Average 33 | from histogrammar.primitives.bag import Bag 34 | from histogrammar.primitives.bin import Bin 35 | from histogrammar.primitives.categorize import Categorize 36 | from histogrammar.primitives.centrallybin import CentrallyBin 37 | from histogrammar.primitives.collection import ( 38 | Branch, 39 | Collection, 40 | Index, 41 | Label, 42 | UntypedLabel, 43 | ) 44 | from histogrammar.primitives.count import Count 45 | from histogrammar.primitives.deviate import Deviate 46 | from histogrammar.primitives.fraction import Fraction 47 | from histogrammar.primitives.irregularlybin import IrregularlyBin 48 | from histogrammar.primitives.minmax import Maximize, Minimize 49 | from histogrammar.primitives.select import Select 50 | from histogrammar.primitives.sparselybin import SparselyBin 51 | from histogrammar.primitives.stack import Stack 52 | from histogrammar.primitives.sum import Sum 53 | -------------------------------------------------------------------------------- /histogrammar/convenience.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2020 ING Wholesale Banking Advanced Analytics 2 | # 3 | # Permission is hereby granted, free of charge, to any person obtaining a copy of 4 | # this software and associated documentation files (the "Software"), to deal in 5 | # the Software without restriction, including without limitation the rights to 6 | # use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of 7 | # the Software, and to permit 
persons to whom the Software is furnished to do so, 8 | # subject to the following conditions: 9 | # 10 | # The above copyright notice and this permission notice shall be included in all 11 | # copies or substantial portions of the Software. 12 | # 13 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 14 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS 15 | # FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR 16 | # COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER 17 | # IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN 18 | # CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 19 | 20 | from histogrammar.defs import identity, unweighted 21 | from histogrammar.primitives.average import Average 22 | from histogrammar.primitives.bin import Bin 23 | from histogrammar.primitives.categorize import Categorize 24 | from histogrammar.primitives.count import Count 25 | from histogrammar.primitives.deviate import Deviate 26 | from histogrammar.primitives.select import Select 27 | from histogrammar.primitives.sparselybin import SparselyBin 28 | 29 | 30 | def Histogram(num, low, high, quantity=identity): 31 | """Create a conventional histogram that is capable of being filled and added. 32 | 33 | Parameters: 34 | num (int): the number of bins; must be at least one. 35 | low (float): the minimum-value edge of the first bin. 36 | high (float): the maximum-value edge of the last bin; must be strictly greater than `low`. 37 | quantity (function returning float or string): function that computes the quantity of interest from 38 | the data. pass on all values by default. If a string is given, quantity is set to identity(string), 39 | in which case that column is picked up from a pandas df. 
40 | """ 41 | return Bin.ing(num, low, high, quantity, Count.ing(), Count.ing(), Count.ing(), Count.ing()) 42 | 43 | 44 | def HistogramCut(num, low, high, quantity=identity, selection=unweighted): 45 | """Create a conventional histogram that is capable of being filled and added, with a selection cut. 46 | 47 | Parameters: 48 | num (int): the number of bins; must be at least one. 49 | low (float): the minimum-value edge of the first bin. 50 | high (float): the maximum-value edge of the last bin; must be strictly greater than `low`. 51 | quantity (function returning float or string): function that computes the quantity of interest from 52 | the data. pass on all values by default. If a string is given, quantity is set to identity(string), 53 | in which case that column is picked up from a pandas df. 54 | selection (function returning boolean): function that computes if data point is accepted or not. 55 | default is: lamba x: True 56 | """ 57 | return Select.ing( 58 | selection, 59 | Bin.ing(num, low, high, quantity, Count.ing(), Count.ing(), Count.ing(), Count.ing()), 60 | ) 61 | 62 | 63 | def SparselyHistogram(binWidth, quantity=identity, origin=0.0): 64 | """Create a sparsely binned histogram that is only capable of being added. 65 | 66 | Parameters: 67 | binWidth (float): the width of a bin. 68 | quantity (function returning float or string): function that computes the quantity of interest from 69 | the data. pass on all values by default. If a string is given, quantity is set to identity(string), 70 | in which case that column is picked up from a pandas df. 71 | origin (float): the left edge of the bin whose index is zero. 
72 | """ 73 | return SparselyBin.ing(binWidth, quantity, Count.ing(), Count.ing(), origin) 74 | 75 | 76 | def CategorizeHistogram(quantity=identity): 77 | """Create a Categorize histogram for categorical features such as strings and booleans 78 | 79 | Parameters: 80 | quantity (function returning float or string): function that computes the quantity of interest from 81 | the data. pass on all values by default. If a string is given, quantity is set to identity(string), 82 | in which case that column is picked up from a pandas df. 83 | """ 84 | return Categorize.ing(quantity, Count.ing()) 85 | 86 | 87 | def Profile(num, low, high, binnedQuantity, averagedQuantity): 88 | """Convenience function for creating binwise averages: a Bin of ``binnedQuantity`` holding one Average of ``averagedQuantity`` per bin.""" 89 | return Bin.ing(num, low, high, binnedQuantity, Average.ing(averagedQuantity)) 90 | 91 | 92 | def SparselyProfile(binWidth, binnedQuantity, averagedQuantity, origin=0.0): 93 | """Convenience function for creating sparsely binned binwise averages: a SparselyBin of ``binnedQuantity`` holding one Average of ``averagedQuantity`` per occupied bin.""" 94 | return SparselyBin.ing(binWidth, binnedQuantity, Average.ing(averagedQuantity), Count.ing(), origin) 95 | 96 | 97 | def ProfileErr(num, low, high, binnedQuantity, averagedQuantity): 98 | """Convenience function for creating a profile plot 99 | 100 | This is a Profile with variances. 101 | """ 102 | return Bin.ing(num, low, high, binnedQuantity, Deviate.ing(averagedQuantity)) 103 | 104 | 105 | def SparselyProfileErr(binWidth, binnedQuantity, averagedQuantity, origin=0.0): 106 | """Convenience function for creating a sparsely binned profile plot 107 | 108 | This is a Profile with variances.
109 | """ 110 | return SparselyBin.ing(binWidth, binnedQuantity, Deviate.ing(averagedQuantity), Count.ing(), origin) 111 | 112 | 113 | def TwoDimensionallyHistogram(xnum, xlow, xhigh, xquantity, ynum, ylow, yhigh, yquantity): 114 | """Convenience function for creating a conventional, two-dimensional histogram.""" 115 | return Bin.ing(xnum, xlow, xhigh, xquantity, Bin.ing(ynum, ylow, yhigh, yquantity)) 116 | 117 | 118 | def TwoDimensionallySparselyHistogram(xbinWidth, xquantity, ybinWidth, yquantity, xorigin=0.0, yorigin=0.0): 119 | """Convenience function for creating a sparsely binned, two-dimensional histogram.""" 120 | return SparselyBin.ing( 121 | xbinWidth, 122 | xquantity, 123 | SparselyBin.ing(ybinWidth, yquantity, Count.ing(), Count.ing(), yorigin), 124 | Count.ing(), 125 | xorigin, 126 | ) 127 | -------------------------------------------------------------------------------- /histogrammar/dfinterface/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2021 ING Wholesale Banking Advanced Analytics 2 | # 3 | # Permission is hereby granted, free of charge, to any person obtaining a copy of 4 | # this software and associated documentation files (the "Software"), to deal in 5 | # the Software without restriction, including without limitation the rights to 6 | # use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of 7 | # the Software, and to permit persons to whom the Software is furnished to do so, 8 | # subject to the following conditions: 9 | # 10 | # The above copyright notice and this permission notice shall be included in all 11 | # copies or substantial portions of the Software. 12 | # 13 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 14 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS 15 | # FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR 16 | # COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER 17 | # IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN 18 | # CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 19 | 20 | from .addmethods import add_pandas_methods, add_sparksql_methods 21 | 22 | try: 23 | from pyspark.sql import DataFrame as sdf 24 | 25 | # add function to create histogrammar histograms 26 | add_sparksql_methods(cls=sdf, prefix="hg_") 27 | except (ModuleNotFoundError, AttributeError): 28 | pass 29 | 30 | try: 31 | from pandas import DataFrame as pdf 32 | 33 | # add function to create histogrammar histograms 34 | add_pandas_methods(cls=pdf, prefix="hg_") 35 | except (ModuleNotFoundError, AttributeError): 36 | pass 37 | -------------------------------------------------------------------------------- /histogrammar/dfinterface/filling_utils.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2020 ING Wholesale Banking Advanced Analytics 2 | # 3 | # Permission is hereby granted, free of charge, to any person obtaining a copy of 4 | # this software and associated documentation files (the "Software"), to deal in 5 | # the Software without restriction, including without limitation the rights to 6 | # use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of 7 | # the Software, and to permit persons to whom the Software is furnished to do so, 8 | # subject to the following conditions: 9 | # 10 | # The above copyright notice and this permission notice shall be included in all 11 | # copies or substantial portions of the Software. 12 | # 13 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 14 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS 15 | # FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR 16 | # COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER 17 | # IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN 18 | # CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 19 | 20 | 21 | import numpy as np 22 | import pandas as pd 23 | 24 | NUM_NS_DAY = 24 * 3600 * int(1e9) 25 | 26 | 27 | def check_column(col, sep=":"): 28 | """Convert input column string to list of columns 29 | 30 | :param col: input string 31 | :param sep: default ":" 32 | :return: list of columns 33 | """ 34 | if isinstance(col, str): 35 | col = col.split(sep) 36 | elif not isinstance(col, list): 37 | raise TypeError(f'Columns "{col}" needs to be a string or list of strings') 38 | return col 39 | 40 | 41 | def normalize_dtype(dtype): 42 | """Convert datatype to consistent numpy datatype 43 | 44 | :param dtype: input datatype 45 | :rtype: numpy.dtype.type 46 | """ 47 | try: 48 | if hasattr(dtype, "type"): 49 | # this converts pandas types, such as pd.Int64, into numpy types 50 | dtype = type(dtype.type()) 51 | dtype = np.dtype(dtype).type 52 | if dtype in {np.str_, np.bytes_}: 53 | dtype = np.dtype(str).type 54 | # MB 20210404: nb.object_ is kept an object -> uses to_string(). str uses only_str() 55 | except BaseException: 56 | raise RuntimeError(f'unknown assigned datatype "{dtype}"') 57 | return dtype 58 | 59 | 60 | def to_ns(x): 61 | """Convert input timestamps to nanoseconds (integers). 62 | 63 | :param x: value to be converted 64 | :returns: converted value 65 | :rtype: int 66 | """ 67 | if pd.isnull(x): 68 | return 0 69 | try: 70 | return pd.to_datetime(x).value 71 | except Exception: 72 | if hasattr(x, "__str__"): 73 | return pd.to_datetime(str(x)).value 74 | return 0 75 | 76 | 77 | def to_str(val): 78 | """Convert input to (array of) string(s). 
79 | 80 | :param val: value to be converted 81 | :returns: converted value 82 | :rtype: str or np.ndarray 83 | """ 84 | if isinstance(val, str): 85 | return val 86 | if isinstance(val, pd.Series): 87 | # Note: at this point, data type of pd.series has already been inferred as being of type object (mixed) 88 | return val.astype(str).values 89 | if hasattr(val, "__iter__"): 90 | return np.asarray([(s if isinstance(s, str) else str(s) if hasattr(s, "__str__") else "") for s in val]) 91 | if hasattr(val, "__str__"): 92 | return str(val) 93 | return "None" 94 | 95 | 96 | def only_str(val): 97 | """Pass input value or array only if it is a string. 98 | 99 | :param val: value to be evaluated 100 | :returns: evaluated value 101 | :rtype: str or np.ndarray 102 | """ 103 | if isinstance(val, str): 104 | return val 105 | if isinstance(val, pd.Series): 106 | # at this point, data type of pd.series has already been inferred as *to be* 'string' 107 | dtype = np.dtype(val.dtype).type 108 | return val.values if dtype in [str, np.str_, np.bytes_] else val.astype(str).values 109 | if hasattr(val, "__iter__"): 110 | return np.asarray([s if isinstance(s, str) else "None" for s in val]) 111 | return "None" 112 | 113 | 114 | def only_bool(val): 115 | """Pass input value or array only if it is a bool. 116 | 117 | :param val: value to be evaluated 118 | :returns: evaluated value 119 | :rtype: np.bool or np.ndarray 120 | """ 121 | if isinstance(val, (np.bool_, bool)): 122 | return val 123 | if isinstance(val, pd.Series) and val.dtype in [np.bool_, bool]: 124 | return val.values 125 | if hasattr(val, "__iter__") and not isinstance(val, str): 126 | return np.asarray([s if isinstance(s, (np.bool_, bool)) else "NaN" for s in val]) 127 | return "NaN" 128 | 129 | 130 | def only_int(val): 131 | """Pass input val value or array only if it is an integer. 
132 | 133 | :param val: value to be evaluated 134 | :returns: evaluated value 135 | :rtype: np.int64 or np.ndarray 136 | """ 137 | if isinstance(val, (np.int64, int)): 138 | return val 139 | if isinstance(val, pd.Series) and val.dtype in [np.int64, int]: 140 | return val.values 141 | if hasattr(val, "__iter__") and not isinstance(val, str): 142 | return np.asarray([s if isinstance(s, (np.int64, int)) else np.nan for s in val]) 143 | return np.nan 144 | 145 | 146 | def only_float(val): 147 | """Pass input val value or array only if it is a float. 148 | 149 | :param val: value to be evaluated 150 | :returns: evaluated value 151 | :rtype: np.float64 or np.ndarray 152 | """ 153 | if isinstance(val, (np.float64, float)): 154 | return val 155 | if isinstance(val, pd.Series) and val.dtype in [np.float64, float]: 156 | return val.values 157 | if hasattr(val, "__iter__") and not isinstance(val, str): 158 | return np.asarray([s if isinstance(s, (np.float64, float)) else np.nan for s in val]) 159 | return np.nan 160 | 161 | 162 | QUANTITY = { 163 | # MB 20210404: to_string for object types b/c it's a mixed type 164 | object: to_str, 165 | np.object_: to_str, 166 | str: only_str, 167 | np.str_: only_str, 168 | int: only_int, 169 | np.int64: only_int, 170 | np.int32: only_int, 171 | bool: only_bool, 172 | np.bool_: only_bool, 173 | float: only_float, 174 | np.float64: only_float, 175 | np.datetime64: only_int, 176 | } 177 | 178 | 179 | def value_to_bin_index(val, **kwargs): 180 | """Convert value to bin index. 181 | 182 | Convert a numeric or timestamp column to an integer bin index. 
183 | 184 | :param binWidth: bin width value needed to convert column 185 | to an integer bin index 186 | :param origin: bin offset value needed to convert column 187 | to an integer bin index 188 | """ 189 | try: 190 | # NOTE this notation also works for timestamps 191 | bin_width = kwargs.get("binWidth", kwargs.get("bin_width", 1)) 192 | bin_offset = kwargs.get("origin", kwargs.get("bin_offset", 0)) 193 | return int(np.floor((val - bin_offset) / bin_width)) 194 | except BaseException: 195 | pass 196 | return val 197 | 198 | 199 | def value_to_bin_center(val, **kwargs): 200 | """Convert value to bin center. 201 | 202 | Convert a numeric or timestamp column to a common bin center value. 203 | 204 | :param binWidth: bin width value needed to convert column 205 | to a common bin center value 206 | :param origin: bin_offset value needed to convert column 207 | to a common bin center value 208 | """ 209 | try: 210 | # NOTE this notation also works for timestamps, and does not change the 211 | # unit 212 | bin_width = kwargs.get("binWidth", kwargs.get("bin_width", 1)) 213 | bin_offset = kwargs.get("origin", kwargs.get("bin_offset", 0)) 214 | bin_index = int(np.floor((val - bin_offset) / bin_width)) 215 | obj_type = type(bin_width) 216 | return bin_offset + obj_type((bin_index + 0.5) * bin_width) 217 | except BaseException: 218 | pass 219 | return val 220 | -------------------------------------------------------------------------------- /histogrammar/dfinterface/pandas_histogrammar.py: -------------------------------------------------------------------------------- 1 | """Copyright Eskapade: 2 | License Apache-2: https://github.com/KaveIO/Eskapade-Core/blob/master/LICENSE 3 | Reference link: 4 | https://github.com/KaveIO/Eskapade/blob/master/python/eskapade/analysis/links/hist_filler.py 5 | All modifications copyright ING WBAA. 
6 | """ 7 | 8 | import numpy as np 9 | import pandas as pd 10 | from pandas.api.types import infer_dtype 11 | from tqdm import tqdm 12 | 13 | import histogrammar as hg 14 | 15 | from .filling_utils import QUANTITY, to_ns 16 | from .histogram_filler_base import HistogramFillerBase 17 | 18 | 19 | class PandasHistogrammar(HistogramFillerBase): 20 | """Fill histogrammar histograms. 21 | 22 | Algorithm to fill histogrammar style bin, sparse-bin and category histograms. 23 | Timestamp features are converted to nanoseconds before 24 | the binning is applied. Final histograms are stored in the datastore. 25 | """ 26 | 27 | def __init__( 28 | self, 29 | features=None, 30 | binning="unit", 31 | bin_specs=None, 32 | time_axis="", 33 | var_dtype=None, 34 | read_key=None, 35 | store_key=None, 36 | nbins_1d=40, 37 | nbins_2d=20, 38 | nbins_3d=10, 39 | max_nunique=500, 40 | ): 41 | """Initialize module instance. 42 | 43 | Store and do basic check on the attributes HistogramFillerBase. 44 | 45 | :param list features: columns to pick up from input data. (default is all features) 46 | For multi-dimensional histograms, separate the column names with a : 47 | 48 | Example features list is: 49 | 50 | .. code-block:: python 51 | 52 | features = ['x', 'date', 'date:x', 'date:y', 'date:x:y'] 53 | 54 | :param str binning: default binning to revert to in case bin_specs not supplied. options are: 55 | "unit" or "auto", default is "unit". When using "auto", semi-clever binning is automatically done. 56 | :param dict bin_specs: dictionaries used for rebinning numeric or timestamp features 57 | 58 | Example bin_specs dictionary is: 59 | 60 | .. 
code-block:: python 61 | 62 | bin_specs = {'x': {'binWidth': 1, 'origin': 0}, 63 | 'y': {'num': 10, 'low': 0.0, 'high': 2.0}, 64 | 'x:y': [{}, {'num': 5, 'low': 0.0, 'high': 1.0}], 65 | 'a': {'edges': [0, 2, 10, 11, 21, 101]}, 66 | 'b': {'centers': [1, 6, 10.5, 16, 20, 100]}, 67 | 'c': {'max': True}, 68 | 'd': {'min': True}, 69 | 'e': {'sum': True}, 70 | 'f': {'average': True}, 71 | 'a:f': [{'edges': [0, 10, 101]}, {'average': True}], 72 | 'g': {'thresholds': [0, 2, 10, 11, 21, 101]}, 73 | 'h': {'bag': True}, 74 | } 75 | 76 | In the bin specs for x:y, x reverts to the 1-dim setting. 77 | 78 | :param str time_axis: name of datetime feature, used as time axis, eg 'date'. if True, will be guessed. 79 | :param dict var_dtype: dictionary with specified datatype per feature (optional) 80 | :param str read_key: key of input histogram-dict to read from data store . 81 | (only required when calling transform(datastore) as module) 82 | :param str store_key: key of output data to store in data store 83 | (only required when calling transform(datastore) as module) 84 | :param int nbins_1d: auto-binning number of bins for 1d histograms. default is 40. 85 | :param int nbins_2d: auto-binning number of bins for 2d histograms. default is 20. 86 | :param int nbins_3d: auto-binning number of bins for 3d histograms. default is 10. 87 | :param int max_nunique: auto-binning threshold for unique categorical values. default is 500. 88 | """ 89 | HistogramFillerBase.__init__( 90 | self, 91 | features, 92 | binning, 93 | bin_specs, 94 | time_axis, 95 | var_dtype, 96 | read_key, 97 | store_key, 98 | nbins_1d, 99 | nbins_2d, 100 | nbins_3d, 101 | max_nunique, 102 | ) 103 | 104 | def assert_dataframe(self, df): 105 | """Check that input data is a filled pandas data frame. 
106 | 107 | :param df: input (pandas) data frame 108 | """ 109 | if not isinstance(df, pd.DataFrame): 110 | raise TypeError(f"retrieved object not of type {pd.DataFrame}") 111 | if df.shape[0] == 0: 112 | raise RuntimeError("data is empty") 113 | return df 114 | 115 | def get_features(self, df): 116 | """Get columns of (pandas) dataframe 117 | 118 | :param df: input pandas dataframe 119 | """ 120 | return df.columns.tolist() 121 | 122 | def get_data_type(self, df, col): 123 | """Get data type of dataframe column. 124 | 125 | :param df: input data frame 126 | :param str col: column 127 | """ 128 | if col not in df.columns: 129 | raise KeyError(f'column "{col:s}" not in input dataframe') 130 | 131 | inferred = infer_dtype(df[col], skipna=True) 132 | if inferred in "string": 133 | data_type = "str" 134 | elif inferred == "integer": 135 | data_type = "int" 136 | elif inferred == "boolean": 137 | data_type = "bool" 138 | elif inferred in {"decimal", "floating", "mixed-integer-float"}: 139 | # decimal needs preprocessing (cast), signal this in metadata 140 | data_type = np.dtype("float", metadata={"decimal": True}) if inferred == "decimal" else "float" 141 | elif inferred in {"date", "datetime", "datetime64"}: 142 | data_type = "datetime64" 143 | else: # categorical, mixed, etc -> object uses to_string() 144 | data_type = np.object_ 145 | 146 | return data_type 147 | 148 | def get_quantiles(self, df, quantiles=[0.05, 0.95], columns=[]): 149 | """return dict with quantiles for given columns 150 | 151 | :param df: input pandas data frame 152 | :param quantiles: list of quantiles. default is [0.05, 0.95] 153 | :param columns: columns to select. default is all. 
154 | """ 155 | if len(columns) == 0: 156 | return {} 157 | qdf = df[columns].quantile(quantiles) 158 | return {c: qdf[c].values.tolist() for c in columns} 159 | 160 | def get_nunique(self, df, columns=[]): 161 | """return dict with number of unique entries for given columns 162 | 163 | :param df: input pandas data frame 164 | :param columns: columns to select (optional) 165 | """ 166 | if not columns: 167 | columns = df.columns 168 | return df[columns].nunique().to_dict() 169 | 170 | def process_features(self, df, cols_by_type): 171 | """Process features before histogram filling. 172 | 173 | Specifically, convert timestamp features to integers 174 | 175 | :param df: input (pandas) data frame 176 | :param cols_by_type: dictionary of column sets for each type 177 | :returns: output (pandas) data frame with converted timestamp features 178 | :rtype: pandas DataFrame 179 | """ 180 | # timestamp variables are converted to ns here 181 | # make temp df for value counting (used below) 182 | idf = df[list(cols_by_type["num"]) + list(cols_by_type["str"]) + list(cols_by_type["bool"])].copy() 183 | for col in cols_by_type["dt"]: 184 | self.logger.debug(f'Converting column "{col}" of type "{self.var_dtype[col]}" to nanosec.') 185 | idf[col] = df[col].apply(to_ns) 186 | 187 | # treat decimal as float, as decimal is not supported by .quantile 188 | # (https://github.com/pandas-dev/pandas/issues/13157) 189 | for col in cols_by_type["decimal"]: 190 | idf[col] = df[col].apply(float) 191 | 192 | return idf 193 | 194 | def fill_histograms(self, idf): 195 | """Fill the histograms 196 | 197 | :param idf: converted input dataframe 198 | """ 199 | # construct empty histograms if needed 200 | for cols in self.features: 201 | name = ":".join(cols) 202 | if name not in self._hists: 203 | # create an (empty) histogram of right type 204 | self._hists[name] = self.construct_empty_hist(cols) 205 | 206 | # histogram filling with working progress bar 207 | res = [ 208 | 
_fill_histogram(idf=idf[c], hist=self._hists[":".join(c)], features=c) 209 | for c in tqdm(self.features, total=len(self.features), ncols=100) 210 | ] 211 | 212 | # update dictionary 213 | for name, hist in res: 214 | self._hists[name] = hist 215 | 216 | def construct_empty_hist(self, features): 217 | """Create an (empty) histogram of right type. 218 | 219 | Create a multi-dim histogram by iterating through the features in 220 | reverse order and passing a single-dim hist as input to the next 221 | column. 222 | 223 | :param list features: histogram features 224 | :return: created histogram 225 | :rtype: histogrammar.Count 226 | """ 227 | hist = hg.Count() 228 | 229 | # create a multi-dim histogram by iterating through the features 230 | # in reverse order and passing a single-dim hist as input 231 | # to the next column 232 | revcols = list(reversed(features)) 233 | for idx, col in enumerate(revcols): 234 | # histogram type depends on the data type 235 | dt = self.var_dtype[col] 236 | 237 | # processing function, e.g. only accept booleans during filling 238 | f = QUANTITY[dt] 239 | # if len(features) == 1: df[col] is a pd.series 240 | # else: df[features] is a pd.Dataframe, so fix column to col 241 | quant = (lambda x, fnc=f: fnc(x)) if len(features) == 1 else (lambda x, fnc=f, clm=col: fnc(x[clm])) 242 | hist = self.get_hist_bin(hist, features, quant, col, dt) 243 | 244 | return hist 245 | 246 | 247 | def _fill_histogram(idf, hist, features): 248 | """Fill input histogram with column(s) of input dataframe. 249 | 250 | Separate function call for parallellization. 
251 | 252 | :param idf: input data frame used for filling histogram 253 | :param hist: empty histogrammar histogram about to be filled 254 | :param list features: histogram column(s) 255 | """ 256 | name = ":".join(features) 257 | clm = features[0] if len(features) == 1 else features 258 | # do the actual filling 259 | hist.fill.numpy(idf[clm]) 260 | return name, hist 261 | -------------------------------------------------------------------------------- /histogrammar/dfinterface/spark_histogrammar.py: -------------------------------------------------------------------------------- 1 | """Copyright Eskapade: 2 | License Apache-2: https://github.com/KaveIO/Eskapade-Core/blob/master/LICENSE 3 | Reference link: 4 | https://github.com/KaveIO/Eskapade-Spark/blob/master/python/eskapadespark/links/spark_histogrammar_filler.py 5 | All modifications copyright ING WBAA. 6 | """ 7 | 8 | import numpy as np 9 | from tqdm import tqdm 10 | 11 | import histogrammar as hg 12 | 13 | from .histogram_filler_base import HistogramFillerBase 14 | 15 | try: 16 | from pyspark.sql import DataFrame 17 | from pyspark.sql import functions as f 18 | from pyspark.sql.functions import approxCountDistinct 19 | except (ModuleNotFoundError, AttributeError): 20 | pass 21 | 22 | 23 | class SparkHistogrammar(HistogramFillerBase): 24 | """Fill histogrammar histograms with Spark. 25 | 26 | Algorithm to fill histogrammar style bin, sparse-bin and category histograms 27 | with Spark. Timestamp features are converted to nanoseconds before the binning 28 | is applied. Final histograms are stored in the datastore. 29 | """ 30 | 31 | def __init__( 32 | self, 33 | features=None, 34 | binning="unit", 35 | bin_specs=None, 36 | time_axis="", 37 | var_dtype=None, 38 | read_key=None, 39 | store_key=None, 40 | nbins_1d=40, 41 | nbins_2d=20, 42 | nbins_3d=10, 43 | max_nunique=500, 44 | ): 45 | """Initialize module instance. 46 | 47 | Store and do basic check on the attributes HistogramFillerBase. 
:param list features: columns to pick up from input data. (default is all features)
default is 20. 91 | :param int nbins_3d: auto-binning number of bins for 3d histograms. default is 10. 92 | :param int max_nunique: auto-binning threshold for unique categorical values. default is 500. 93 | """ 94 | HistogramFillerBase.__init__( 95 | self, 96 | features, 97 | binning, 98 | bin_specs, 99 | time_axis, 100 | var_dtype, 101 | read_key, 102 | store_key, 103 | nbins_1d, 104 | nbins_2d, 105 | nbins_3d, 106 | max_nunique, 107 | ) 108 | self._unit_timestamp_specs = { 109 | k: float(self._unit_timestamp_specs[k]) for i, k in enumerate(self._unit_timestamp_specs) 110 | } 111 | 112 | def assert_dataframe(self, df): 113 | """Check that input data is a filled spark data frame. 114 | 115 | :param df: input (spark) data frame 116 | """ 117 | if not isinstance(df, DataFrame): 118 | raise TypeError("retrieved object not of type Spark DataFrame") 119 | assert len(df.head(1)) != 0, "input dataframe is empty" 120 | return df 121 | 122 | def get_features(self, df): 123 | """Get columns of dataframe 124 | 125 | :param df: input spark dataframe 126 | """ 127 | return df.columns 128 | 129 | def get_quantiles(self, df, quantiles=[0.05, 0.95], columns=[]): 130 | """return dict with quantiles for given columns 131 | 132 | :param df: input (spark) data frame 133 | :param quantiles: list of quantiles. default is [0.05, 0.95] 134 | :param columns: columns to select. default is all. 
135 | """ 136 | if len(columns) == 0: 137 | return {} 138 | qsl = df.approxQuantile(columns, quantiles, 0.25) 139 | return dict(zip(columns, qsl)) 140 | 141 | def get_nunique(self, df, columns=[]): 142 | """return dict with number of unique entries for given columns 143 | 144 | :param df: input (spark) data frame 145 | :param columns: columns to select (optional) 146 | """ 147 | if not columns: 148 | columns = df.columns 149 | qdf = df.agg(*(approxCountDistinct(f.col(c)).alias(c) for c in columns)) 150 | return qdf.toPandas().T[0].to_dict() 151 | 152 | def get_data_type(self, df, col): 153 | """Get data type of dataframe column. 154 | 155 | :param df: input data frame 156 | :param str col: column 157 | """ 158 | if col not in df.columns: 159 | raise KeyError(f'Column "{col:s}" not in input dataframe.') 160 | dt = dict(df.dtypes)[col] 161 | # spark conversions to numpy or python equivalent 162 | if dt == "string": 163 | dt = "str" 164 | elif dt in ["timestamp", "date"]: 165 | dt = np.datetime64 166 | elif dt == "boolean": 167 | dt = bool 168 | elif dt == "bigint": 169 | dt = np.int64 170 | elif dt.startswith("decimal("): 171 | return np.dtype(float, metadata={"decimal": True}) 172 | 173 | return np.dtype(dt) 174 | 175 | def process_features(self, df, cols_by_type): 176 | """Process features before histogram filling. 
177 | 178 | Specifically, in this case convert timestamp features to nanoseconds 179 | 180 | :param df: input data frame 181 | :return: output data frame with converted timestamp features 182 | :rtype: DataFrame 183 | """ 184 | # make alias df for value counting (used below) 185 | idf = df.alias("") 186 | 187 | # timestamp variables are converted here to ns since 1970-1-1 188 | # histogrammar does not (yet) support long integers, so convert timestamps to float 189 | for col in cols_by_type["dt"]: 190 | self.logger.debug(f'Converting column "{col}" of type "{self.var_dtype[col]}" to nanosec.') 191 | # first cast to timestamp (in case column is stored as date) 192 | to_ns = f.col(col).cast("timestamp").cast("float") * 1e9 193 | idf = idf.withColumn(col, to_ns) 194 | 195 | # spark nulls are interpreted to 0 when cast to double in scala, done when given as input to numeric histograms 196 | # in columns that have them, replace by nones by nans 197 | for col in cols_by_type["num"]: 198 | if len(idf.where(f.col(col).isNull()).limit(1).collect()) > 0: 199 | self.logger.debug(f'In numeric column "{col}" converting each None to NaN.') 200 | idf = idf.withColumn( 201 | col, 202 | f.when(f.col(col).isNotNull(), f.col(col)).otherwise(float("nan")), 203 | ) 204 | 205 | return idf 206 | 207 | def construct_empty_hist(self, df, features): 208 | """Create an (empty) histogram of right type. 209 | 210 | Create a multi-dim histogram by iterating through the features in 211 | reverse order and passing a single-dim hist as input to the next 212 | column. 
213 | 214 | :param df: input dataframe 215 | :param list features: histogram features 216 | :return: created histogram 217 | :rtype: histogrammar.Count 218 | """ 219 | hist = hg.Count() 220 | 221 | # create a multi-dim histogram by iterating through 222 | # the features in reverse order and passing a single-dim hist 223 | # as input to the next column 224 | revcols = list(reversed(features)) 225 | for idx, col in enumerate(revcols): 226 | # histogram type depends on the data type 227 | dt = self.var_dtype[col] 228 | quant = df[col] 229 | hist = self.get_hist_bin(hist, features, quant, col, dt) 230 | 231 | return hist 232 | 233 | def fill_histograms(self, idf): 234 | """Fill the histograms 235 | 236 | :param idf: input data frame used for filling histogram 237 | """ 238 | for cols in tqdm(self.features, ncols=100): 239 | self.logger.debug('Processing feature "{cols}".'.format(cols=":".join(cols))) 240 | self.fill_histogram(idf, cols) 241 | 242 | def fill_histogram(self, idf, features): 243 | """Fill input histogram with column(s) of input dataframe. 
244 | 245 | :param idf: input data frame used for filling histogram 246 | :param list features: histogram column(s) 247 | """ 248 | name = ":".join(features) 249 | if name not in self._hists: 250 | # create an (empty) histogram of right type 251 | self._hists[name] = self.construct_empty_hist(idf, features) 252 | hist = self._hists[name] 253 | 254 | # do the actual filling 255 | hist.fill.sparksql(idf) 256 | self._hists[name] = hist 257 | 258 | def _execute(self, df): 259 | df.persist() 260 | hists = super()._execute(df) 261 | df.unpersist() 262 | return hists 263 | -------------------------------------------------------------------------------- /histogrammar/notebooks/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/histogrammar/histogrammar-python/642d89dfbe34f1065d215a369ec153ec11ed4c2e/histogrammar/notebooks/__init__.py -------------------------------------------------------------------------------- /histogrammar/notebooks/histogrammar_tutorial_exercises.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Histogrammar exercises\n", 8 | "\n", 9 | "Histogrammar is a Python package that allows you to make histograms from numpy arrays, and pandas and spark dataframes. \n", 10 | "\n", 11 | "(There is also a scala backend for Histogrammar, that is used by spark.) \n", 12 | "\n", 13 | "You can do the exercises below after the basic tutorial.\n", 14 | "\n", 15 | "Enjoy!" 
16 | ] 17 | }, 18 | { 19 | "cell_type": "code", 20 | "execution_count": null, 21 | "metadata": {}, 22 | "outputs": [], 23 | "source": [ 24 | "%%capture\n", 25 | "# install histogrammar (if not installed yet)\n", 26 | "\n", 27 | "!\"{sys.executable}\" -m pip install histogrammar" 28 | ] 29 | }, 30 | { 31 | "cell_type": "code", 32 | "execution_count": null, 33 | "metadata": {}, 34 | "outputs": [], 35 | "source": [ 36 | "import histogrammar as hg" 37 | ] 38 | }, 39 | { 40 | "cell_type": "code", 41 | "execution_count": null, 42 | "metadata": {}, 43 | "outputs": [], 44 | "source": [ 45 | "import pandas as pd" 46 | ] 47 | }, 48 | { 49 | "cell_type": "markdown", 50 | "metadata": {}, 51 | "source": [ 52 | "## Dataset\n", 53 | "Let's first load some data!" 54 | ] 55 | }, 56 | { 57 | "cell_type": "code", 58 | "execution_count": null, 59 | "metadata": {}, 60 | "outputs": [], 61 | "source": [ 62 | "# open a pandas dataframe for use below\n", 63 | "from histogrammar import resources\n", 64 | "\n", 65 | "df = pd.read_csv(resources.data(\"test.csv.gz\"), parse_dates=[\"date\"])" 66 | ] 67 | }, 68 | { 69 | "cell_type": "code", 70 | "execution_count": null, 71 | "metadata": {}, 72 | "outputs": [], 73 | "source": [ 74 | "df.head(2)" 75 | ] 76 | }, 77 | { 78 | "cell_type": "markdown", 79 | "metadata": {}, 80 | "source": [ 81 | "## Comparing histogram types" 82 | ] 83 | }, 84 | { 85 | "cell_type": "markdown", 86 | "metadata": {}, 87 | "source": [ 88 | "Histogrammar treats histograms as objects. You will see this has various advantages.\n", 89 | "\n", 90 | "Let's fill a simple histogram with a numpy array." 
91 | ] 92 | }, 93 | { 94 | "cell_type": "code", 95 | "execution_count": null, 96 | "metadata": {}, 97 | "outputs": [], 98 | "source": [ 99 | "# this creates a histogram with 10 even-sized bins in the (closed) range [0, 100]\n", 100 | "hist1 = hg.Bin(num=10, low=0, high=100)" 101 | ] 102 | }, 103 | { 104 | "cell_type": "code", 105 | "execution_count": null, 106 | "metadata": {}, 107 | "outputs": [], 108 | "source": [ 109 | "hist1.fill.numpy(df['age'].values)" 110 | ] 111 | }, 112 | { 113 | "cell_type": "code", 114 | "execution_count": null, 115 | "metadata": {}, 116 | "outputs": [], 117 | "source": [ 118 | "hist1.plot.matplotlib();" 119 | ] 120 | }, 121 | { 122 | "cell_type": "code", 123 | "execution_count": null, 124 | "metadata": {}, 125 | "outputs": [], 126 | "source": [ 127 | "hist2 = hg.SparselyBin(binWidth=10, origin=0)" 128 | ] 129 | }, 130 | { 131 | "cell_type": "code", 132 | "execution_count": null, 133 | "metadata": {}, 134 | "outputs": [], 135 | "source": [ 136 | "hist2.fill.numpy(df['age'].values)" 137 | ] 138 | }, 139 | { 140 | "cell_type": "code", 141 | "execution_count": null, 142 | "metadata": {}, 143 | "outputs": [], 144 | "source": [ 145 | "hist2.plot.matplotlib();" 146 | ] 147 | }, 148 | { 149 | "cell_type": "markdown", 150 | "metadata": {}, 151 | "source": [ 152 | "Q: Have a look at the .values and .bins attributes of hist1 and hist2.\n", 153 | "What types are these? (hist1.values is a ...?) \n", 154 | "Does that make sense?" 155 | ] 156 | }, 157 | { 158 | "cell_type": "code", 159 | "execution_count": null, 160 | "metadata": {}, 161 | "outputs": [], 162 | "source": [ 163 | "hist1" 164 | ] 165 | }, 166 | { 167 | "cell_type": "code", 168 | "execution_count": null, 169 | "metadata": {}, 170 | "outputs": [], 171 | "source": [ 172 | "hist2" 173 | ] 174 | }, 175 | { 176 | "cell_type": "markdown", 177 | "metadata": {}, 178 | "source": [ 179 | "Q: In each bin, what type of object is keeping track of the bin count?"
180 | ] 181 | }, 182 | { 183 | "cell_type": "markdown", 184 | "metadata": {}, 185 | "source": [ 186 | "Try filling hist1 with small values (negative) or very large (> 100) or with NaNs. \n", 187 | "Find out if and how hist1 keeps track of these?" 188 | ] 189 | }, 190 | { 191 | "cell_type": "markdown", 192 | "metadata": {}, 193 | "source": [ 194 | "Now fill hist2 with small values (negative) or very large (> 100) or with NaNs. How does hist2 keep track of these?" 195 | ] 196 | }, 197 | { 198 | "cell_type": "markdown", 199 | "metadata": {}, 200 | "source": [ 201 | "## Categorical variables\n", 202 | "\n", 203 | "For categorical variables use the Categorize histogram\n", 204 | "- Categorize histograms: accepting categorical variables such as strings and booleans.\n", 205 | "\n" 206 | ] 207 | }, 208 | { 209 | "cell_type": "code", 210 | "execution_count": null, 211 | "metadata": {}, 212 | "outputs": [], 213 | "source": [ 214 | "histx = hg.Categorize('eyeColor')" 215 | ] 216 | }, 217 | { 218 | "cell_type": "code", 219 | "execution_count": null, 220 | "metadata": {}, 221 | "outputs": [], 222 | "source": [ 223 | "histx.fill.numpy(df)" 224 | ] 225 | }, 226 | { 227 | "cell_type": "markdown", 228 | "metadata": {}, 229 | "source": [ 230 | "Q: A categorize histogram, what is it fundamentally, a dictionary or a list?" 231 | ] 232 | }, 233 | { 234 | "cell_type": "markdown", 235 | "metadata": {}, 236 | "source": [ 237 | "Q: What else can it keep track of, e.g. numbers, booleans, nans? Give it a try, fill it with more entries!" 238 | ] 239 | }, 240 | { 241 | "cell_type": "markdown", 242 | "metadata": {}, 243 | "source": [ 244 | "Fill a histogram with a boolean array (isActive), directly from the dataframe\n", 245 | "\n", 246 | "Q: what type of histogram do you get?"
247 | ] 248 | }, 249 | { 250 | "cell_type": "code", 251 | "execution_count": null, 252 | "metadata": {}, 253 | "outputs": [], 254 | "source": [ 255 | "hists = df.hg_make_histograms(features=['isActive'])" 256 | ] 257 | }, 258 | { 259 | "cell_type": "code", 260 | "execution_count": null, 261 | "metadata": {}, 262 | "outputs": [], 263 | "source": [] 264 | }, 265 | { 266 | "cell_type": "markdown", 267 | "metadata": {}, 268 | "source": [ 269 | "## Multi-dimensional histograms" 270 | ] 271 | }, 272 | { 273 | "cell_type": "markdown", 274 | "metadata": {}, 275 | "source": [ 276 | "Let's make a 3-dimensional histogram, with axes: x=favoriteFruit, y=gender, z=isActive. (In Histogrammar, a multi-dimensional histogram is composed as recursive histograms, starting with the last one.) \n", 277 | "Then fill it with the dataframe." 278 | ] 279 | }, 280 | { 281 | "cell_type": "code", 282 | "execution_count": null, 283 | "metadata": {}, 284 | "outputs": [], 285 | "source": [ 286 | "# hist1 = hg.Categorize(quantity='isActive')\n", 287 | "# hist2 = hg.Categorize(quantity='gender', value=hist1)\n", 288 | "# hist3 = hg.Categorize(quantity='favoriteFruit')" 289 | ] 290 | }, 291 | { 292 | "cell_type": "markdown", 293 | "metadata": {}, 294 | "source": [ 295 | "Q: How many data points end up in the bin: banana, male, True ?\n" 296 | ] 297 | }, 298 | { 299 | "cell_type": "markdown", 300 | "metadata": {}, 301 | "source": [ 302 | "Q: Store this histogram as a json file. What is the size of the json file?" 303 | ] 304 | }, 305 | { 306 | "cell_type": "markdown", 307 | "metadata": {}, 308 | "source": [ 309 | "Q: Read back the histogram and then plot it." 310 | ] 311 | }, 312 | { 313 | "cell_type": "markdown", 314 | "metadata": {}, 315 | "source": [ 316 | "Q: Make a histogram of the feature 'fruit', which measures the average value of 'latitude' per bin of fruit." 
317 | ] 318 | }, 319 | { 320 | "cell_type": "code", 321 | "execution_count": null, 322 | "metadata": {}, 323 | "outputs": [], 324 | "source": [ 325 | "hist1 = hg.Average(quantity='latitude')" 326 | ] 327 | }, 328 | { 329 | "cell_type": "markdown", 330 | "metadata": {}, 331 | "source": [ 332 | "Q: what is the mean value of latitude for the bin 'strawberry'?" 333 | ] 334 | } 335 | ], 336 | "metadata": { 337 | "kernel_info": { 338 | "name": "python3" 339 | }, 340 | "kernelspec": { 341 | "display_name": "Python 3", 342 | "language": "python", 343 | "name": "python3" 344 | }, 345 | "language_info": { 346 | "codemirror_mode": { 347 | "name": "ipython", 348 | "version": 3 349 | }, 350 | "file_extension": ".py", 351 | "mimetype": "text/x-python", 352 | "name": "python", 353 | "nbconvert_exporter": "python", 354 | "pygments_lexer": "ipython3", 355 | "version": "3.8.5" 356 | }, 357 | "nteract": { 358 | "version": "0.15.0" 359 | }, 360 | "pycharm": { 361 | "stem_cell": { 362 | "cell_type": "raw", 363 | "metadata": { 364 | "collapsed": false 365 | }, 366 | "source": [] 367 | } 368 | } 369 | }, 370 | "nbformat": 4, 371 | "nbformat_minor": 4 372 | } 373 | -------------------------------------------------------------------------------- /histogrammar/plot/__init__.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | # Copyright 2016 DIANA-HEP 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | -------------------------------------------------------------------------------- /histogrammar/plot/hist_numpy.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2020 ING Wholesale Banking Advanced Analytics 2 | # 3 | # Permission is hereby granted, free of charge, to any person obtaining a copy of 4 | # this software and associated documentation files (the "Software"), to deal in 5 | # the Software without restriction, including without limitation the rights to 6 | # use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of 7 | # the Software, and to permit persons to whom the Software is furnished to do so, 8 | # subject to the following conditions: 9 | # 10 | # The above copyright notice and this permission notice shall be included in all 11 | # copies or substantial portions of the Software. 12 | # 13 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 14 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS 15 | # FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR 16 | # COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER 17 | # IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN 18 | # CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 19 | 20 | 21 | import warnings 22 | 23 | import numpy as np 24 | 25 | 26 | def prepare_2dgrid(hist): 27 | """Get lists of all unique x and y keys 28 | 29 | Used as input by get_2dgrid(hist). 30 | 31 | :param hist: input histogrammar histogram 32 | :return: two comma-separated lists of unique x and y keys 33 | """ 34 | if hist.n_dim < 2: 35 | warnings.warn(f"Input histogram only has {hist.n_dim} dimensions (<2). 
Returning empty lists.") 36 | return [], [] 37 | 38 | xkeys = set() 39 | ykeys = set() 40 | # SparselyBin, Categorize, IrregularlyBin, CentrallyBin 41 | if hasattr(hist, "bins"): 42 | hist_bins = dict(hist.bins) 43 | xkeys = xkeys.union(hist_bins.keys()) 44 | for h in hist_bins.values(): 45 | if hasattr(h, "bins"): 46 | h_bins = dict(h.bins) 47 | ykeys = ykeys.union(h_bins.keys()) 48 | elif hasattr(h, "values"): 49 | ykeys = ykeys.union(range(len(h.values))) 50 | # Bin 51 | elif hasattr(hist, "values"): 52 | xkeys = xkeys.union(range(len(hist.values))) 53 | for h in hist.values: 54 | if hasattr(h, "bins"): 55 | h_bins = dict(h.bins) 56 | ykeys = ykeys.union(h_bins.keys()) 57 | elif hasattr(h, "values"): 58 | ykeys = ykeys.union(range(len(h.values))) 59 | return sorted(xkeys), sorted(ykeys) 60 | 61 | 62 | def set_2dgrid(hist, xkeys, ykeys): 63 | """Set 2d grid of first two dimenstions of input histogram 64 | 65 | Used as input by get_2dgrid(hist). 66 | 67 | :param hist: input histogrammar histogram 68 | :param list xkeys: list with unique x keys 69 | :param list ykeys: list with unique y keys 70 | :return: filled 2d numpy grid 71 | """ 72 | grid = np.zeros((len(ykeys), len(xkeys))) 73 | 74 | if hist.n_dim < 2: 75 | warnings.warn(f"Input histogram only has {hist.n_dim} dimensions (<2). 
Returning original grid.") 76 | return grid 77 | 78 | # SparselyBin, Categorize, IrregularlyBin, CentrallyBin 79 | if hasattr(hist, "bins"): 80 | hist_bins = dict(hist.bins) 81 | for k, h in hist_bins.items(): 82 | if k not in xkeys: 83 | continue 84 | i = xkeys.index(k) 85 | if hasattr(h, "bins"): 86 | h_bins = dict(h.bins) 87 | for li, g in h_bins.items(): 88 | if li not in ykeys: 89 | continue 90 | j = ykeys.index(li) 91 | grid[j, i] = g.entries 92 | elif hasattr(h, "values"): 93 | for j, g in enumerate(h.values): 94 | grid[j, i] = g.entries 95 | # Bin 96 | elif hasattr(hist, "values"): 97 | for i, h in enumerate(hist.values): 98 | if hasattr(h, "bins"): 99 | h_bins = dict(h.bins) 100 | for lj, g in h_bins.items(): 101 | if lj not in ykeys: 102 | continue 103 | j = ykeys.index(lj) 104 | grid[j, i] = g.entries 105 | elif hasattr(h, "values"): 106 | for j, g in enumerate(h.values): 107 | grid[j, i] = g.entries 108 | return grid 109 | 110 | 111 | def get_2dgrid(hist): 112 | """Get filled x,y grid of first two dimensions of input histogram 113 | 114 | :param hist: input histogrammar histogram 115 | :return: x,y,grid of first two dimenstions of input histogram 116 | """ 117 | if hist.n_dim < 2: 118 | warnings.warn(f"Input histogram only has {hist.n_dim} dimensions (<2). 
Returning empty grid.") 119 | return np.zeros((0, 0)) 120 | 121 | xkeys, ykeys = prepare_2dgrid(hist) 122 | grid = set_2dgrid(hist, xkeys, ykeys) 123 | 124 | x_labels = get_x_labels(hist, xkeys) 125 | y_labels = get_y_labels(hist, ykeys) 126 | 127 | return x_labels, y_labels, grid 128 | 129 | 130 | def get_x_labels(hist, xkeys): 131 | return [str(hist._center_from_key(key)) for key in xkeys] 132 | 133 | 134 | def get_y_labels(hist, ykeys): 135 | # SparselyBin, Categorize, IrregularlyBin, CentrallyBin 136 | if hasattr(hist, "bins"): 137 | hist_bins = dict(hist.bins) 138 | h = list(hist_bins.values())[0] 139 | # Bin 140 | elif hasattr(hist, "values"): 141 | h = hist.values[0] 142 | return [str(h._center_from_key(key)) for key in ykeys] 143 | 144 | 145 | def prepare2Dsparse(sparse): 146 | yminBins = [v.minBin for v in sparse.bins.values() if v.minBin is not None] 147 | ymaxBins = [v.maxBin for v in sparse.bins.values() if v.maxBin is not None] 148 | if len(yminBins) > 0 and len(ymaxBins) > 0: 149 | yminBin = min(yminBins) 150 | ymaxBin = max(ymaxBins) 151 | else: 152 | yminBin = 0 153 | ymaxBin = 0 154 | sample = list(sparse.bins.values())[0] 155 | ynum = 1 + ymaxBin - yminBin 156 | ylow = yminBin * sample.binWidth + sample.origin 157 | yhigh = (ymaxBin + 1.0) * sample.binWidth + sample.origin 158 | return yminBin, ymaxBin, ynum, ylow, yhigh 159 | 160 | 161 | def set2Dsparse(sparse, yminBin, ymaxBin, grid): 162 | for i, iindex in enumerate(range(sparse.minBin, sparse.maxBin + 1)): 163 | for j, jindex in enumerate(range(yminBin, ymaxBin + 1)): 164 | if iindex in sparse.bins and jindex in sparse.bins[iindex].bins: 165 | grid[j, i] = sparse.bins[iindex].bins[jindex].entries 166 | return grid 167 | -------------------------------------------------------------------------------- /histogrammar/primitives/__init__.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | # Copyright 2016 DIANA-HEP 4 | # 5 | # 
Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | -------------------------------------------------------------------------------- /histogrammar/primitives/average.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | # Copyright 2016 DIANA-HEP 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | 17 | import math 18 | import numbers 19 | 20 | from histogrammar.defs import ( 21 | Container, 22 | ContainerException, 23 | Factory, 24 | JsonFormatException, 25 | identity, 26 | ) 27 | from histogrammar.util import ( 28 | basestring, 29 | datatype, 30 | floatToJson, 31 | hasKeys, 32 | inheritdoc, 33 | maybeAdd, 34 | n_dim, 35 | numeq, 36 | serializable, 37 | ) 38 | 39 | 40 | class Average(Factory, Container): 41 | """Accumulate the weighted mean of a given quantity. 
42 | 43 | Uses the numerically stable weighted mean algorithm described in `"Incremental calculation of weighted mean 44 | and variance," <https://fanf2.user.srcf.net/hermes/doc/antiforgery/stats.pdf>`_ Tony Finch, 45 | *University of Cambridge Computing Service,* 2009. 46 | """ 47 | 48 | @staticmethod 49 | def ed(entries, mean): 50 | """Create an Average that is only capable of being added. 51 | 52 | Parameters: 53 | entries (float): the number of entries. 54 | mean (float): the mean. 55 | """ 56 | 57 | if not isinstance(entries, numbers.Real) and entries not in ( 58 | "nan", 59 | "inf", 60 | "-inf", 61 | ): 62 | raise TypeError(f"entries ({entries}) must be a number") 63 | if not isinstance(mean, numbers.Real) and entries not in ("nan", "inf", "-inf"): 64 | raise TypeError(f"mean ({mean}) must be a number") 65 | if entries < 0.0: 66 | raise ValueError(f"entries ({entries}) cannot be negative") 67 | out = Average(None) 68 | out.entries = float(entries) 69 | out.mean = float(mean) 70 | return out.specialize() 71 | 72 | @staticmethod 73 | def ing(quantity): 74 | """Synonym for ``__init__``.""" 75 | return Average(quantity) 76 | 77 | def __init__(self, quantity=identity): 78 | """Create an Average that is capable of being filled and added. 79 | 80 | Parameters: 81 | quantity (function returning float): computes the quantity of interest from the data. 82 | 83 | Other parameters: 84 | entries (float): the number of entries, initially 0.0. 85 | mean (float): the running mean, initially NaN.
86 | """ 87 | self.quantity = serializable(identity(quantity) if isinstance(quantity, str) else quantity) 88 | self.entries = 0.0 89 | self.mean = float("nan") 90 | super().__init__() 91 | self.specialize() 92 | 93 | @inheritdoc(Container) 94 | def zero(self): 95 | return Average(self.quantity) 96 | 97 | @inheritdoc(Container) 98 | def __add__(self, other): 99 | if isinstance(other, Average): 100 | out = Average(self.quantity) 101 | out.entries = self.entries + other.entries 102 | if self.entries == 0.0: 103 | out.mean = other.mean 104 | elif other.entries == 0.0: 105 | out.mean = self.mean 106 | else: 107 | out.mean = (self.entries * self.mean + other.entries * other.mean) / (self.entries + other.entries) 108 | return out.specialize() 109 | raise ContainerException(f"cannot add {self.name} and {other.name}") 110 | 111 | @inheritdoc(Container) 112 | def __iadd__(self, other): 113 | both = self + other 114 | self.entries = both.entries 115 | self.mean = both.mean 116 | return self 117 | 118 | @inheritdoc(Container) 119 | def __mul__(self, factor): 120 | if math.isnan(factor) or factor <= 0.0: 121 | return self.zero() 122 | out = self.zero() 123 | out.entries = factor * self.entries 124 | out.mean = self.mean 125 | return out.specialize() 126 | 127 | @inheritdoc(Container) 128 | def __rmul__(self, factor): 129 | return self.__mul__(factor) 130 | 131 | @inheritdoc(Container) 132 | def fill(self, datum, weight=1.0): 133 | self._checkForCrossReferences() 134 | 135 | if weight > 0.0: 136 | q = self.quantity(datum) 137 | if not isinstance(q, numbers.Real): 138 | raise TypeError(f"function return value ({q}) must be boolean or number") 139 | 140 | # no possibility of exception from here on out (for rollback) 141 | if self.entries == 0.0: 142 | self.mean = q 143 | self.entries += weight 144 | 145 | if math.isnan(self.mean) or math.isnan(q): 146 | self.mean = float("nan") 147 | 148 | elif math.isinf(self.mean) or math.isinf(q): 149 | if math.isinf(self.mean) and 
math.isinf(q) and self.mean * q < 0.0: 150 | self.mean = float("nan") # opposite-sign infinities is bad 151 | elif math.isinf(q): 152 | self.mean = q # mean becomes infinite with sign of q 153 | else: 154 | pass # mean is already infinite 155 | if math.isinf(self.entries) or math.isnan(self.entries): 156 | self.mean = float("nan") # non-finite denominator is bad 157 | 158 | else: # handle finite case 159 | delta = q - self.mean 160 | shift = delta * weight / self.entries 161 | self.mean += shift 162 | 163 | def _numpy(self, data, weights, shape): 164 | q = self.quantity(data) 165 | self._checkNPQuantity(q, shape) 166 | self._checkNPWeights(weights, shape) 167 | weights = self._makeNPWeights(weights, shape) 168 | 169 | # no possibility of exception from here on out (for rollback) 170 | ca, ma = self.entries, self.mean 171 | if ca == 0.0: 172 | ma = 0.0 173 | 174 | import numpy 175 | 176 | selection = weights > 0.0 177 | q = q[selection] 178 | weights = weights[selection] 179 | 180 | self.entries += float(weights.sum()) 181 | ca_plus_cb = self.entries 182 | 183 | if math.isinf(ca_plus_cb): 184 | self.mean = float("nan") 185 | elif ca_plus_cb > 0.0: 186 | mb = numpy.average(q, weights=weights) 187 | self.mean = float((ca * ma + (ca_plus_cb - ca) * mb) / ca_plus_cb) 188 | 189 | def _sparksql(self, jvm, converter): 190 | return converter.Average(self.quantity.asSparkSQL()) 191 | 192 | @property 193 | def children(self): 194 | """List of sub-aggregators, to make it possible to walk the tree.""" 195 | return [] 196 | 197 | @inheritdoc(Container) 198 | def toJsonFragment(self, suppressName): 199 | return maybeAdd( 200 | {"entries": floatToJson(self.entries), "mean": floatToJson(self.mean)}, 201 | name=(None if suppressName else self.quantity.name), 202 | ) 203 | 204 | @staticmethod 205 | @inheritdoc(Factory) 206 | def fromJsonFragment(json, nameFromParent): 207 | if isinstance(json, dict) and hasKeys(json.keys(), ["entries", "mean"], ["name"]): 208 | if json["entries"] in 
("nan", "inf", "-inf") or isinstance(json["entries"], numbers.Real): 209 | entries = float(json["entries"]) 210 | else: 211 | raise JsonFormatException(json["entries"], "Average.entries") 212 | 213 | if isinstance(json.get("name", None), basestring): 214 | name = json["name"] 215 | elif json.get("name", None) is None: 216 | name = None 217 | else: 218 | raise JsonFormatException(json["name"], "Average.name") 219 | 220 | if json["mean"] in ("nan", "inf", "-inf") or isinstance(json["mean"], numbers.Real): 221 | mean = float(json["mean"]) 222 | else: 223 | raise JsonFormatException(json["mean"], "Average.mean") 224 | 225 | out = Average.ed(entries, mean) 226 | out.quantity.name = nameFromParent if name is None else name 227 | return out.specialize() 228 | 229 | raise JsonFormatException(json, "Average") 230 | 231 | def __repr__(self): 232 | return f"" 233 | 234 | def __eq__(self, other): 235 | return ( 236 | isinstance(other, Average) 237 | and self.quantity == other.quantity 238 | and numeq(self.entries, other.entries) 239 | and numeq(self.mean, other.mean) 240 | ) 241 | 242 | def __ne__(self, other): 243 | return not self == other 244 | 245 | def __hash__(self): 246 | return hash((self.quantity, self.entries, self.mean)) 247 | 248 | 249 | # extra properties: number of dimensions and datatypes of sub-hists 250 | Average.n_dim = n_dim 251 | Average.datatype = datatype 252 | 253 | # register extra methods such as plotting 254 | Factory.register(Average) 255 | -------------------------------------------------------------------------------- /histogrammar/primitives/count.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | # Copyright 2016 DIANA-HEP 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 
7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | 17 | import math 18 | import numbers 19 | 20 | from histogrammar.defs import ( 21 | Container, 22 | ContainerException, 23 | Factory, 24 | JsonFormatException, 25 | identity, 26 | ) 27 | from histogrammar.util import ( 28 | datatype, 29 | floatToJson, 30 | inheritdoc, 31 | n_dim, 32 | numeq, 33 | serializable, 34 | ) 35 | 36 | 37 | class Count(Factory, Container): 38 | """Count entries by accumulating the sum of all observed weights or a sum of transformed weights 39 | 40 | (e.g. collect the sum of squares of weights). 41 | 42 | An optional ``transform`` function can be applied to the weights before summing. 43 | To accumulate the sum of squares of weights, use: 44 | 45 | :: 46 | lambda x: x**2 47 | 48 | for instance. This is unlike any other primitive's ``quantity`` function in that its domain is 49 | the *weights* (always double), not *data* (any type). 50 | """ 51 | 52 | @staticmethod 53 | def ed(entries): 54 | """Create a Count that is only capable of being added. 55 | 56 | Parameters: 57 | entries (float): the number of entries. 
58 | """ 59 | if not isinstance(entries, numbers.Real) and entries not in ( 60 | "nan", 61 | "inf", 62 | "-inf", 63 | ): 64 | raise TypeError(f"entries ({entries}) must be a number") 65 | if entries < 0.0: 66 | raise ValueError(f"entries ({entries}) cannot be negative") 67 | out = Count() 68 | out.entries = float(entries) 69 | return out 70 | 71 | @staticmethod 72 | def ing(transform=identity): 73 | """Synonym for ``__init__``.""" 74 | return Count(transform) 75 | 76 | def __init__(self, transform=identity): 77 | """Create a Count that is capable of being filled and added. 78 | 79 | Parameters: 80 | transform (function from float to float): transforms each weight. 81 | 82 | Other parameters: 83 | entries (float): the number of entries, initially 0.0. 84 | """ 85 | self.entries = 0.0 86 | self.transform = serializable(transform) 87 | super().__init__() 88 | 89 | @inheritdoc(Container) 90 | def zero(self): 91 | return Count(self.transform) 92 | 93 | @inheritdoc(Container) 94 | def __add__(self, other): 95 | if isinstance(other, Count): 96 | out = Count(self.transform) 97 | out.entries = self.entries + other.entries 98 | return out 99 | raise ContainerException(f"cannot add {self.name} and {other.name}") 100 | 101 | @inheritdoc(Container) 102 | def __iadd__(self, other): 103 | if isinstance(other, Count): 104 | self.entries += other.entries 105 | return self 106 | raise ContainerException(f"cannot add {self.name} and {other.name}") 107 | 108 | @inheritdoc(Container) 109 | def __mul__(self, factor): 110 | if ( 111 | self.transform != identity 112 | or not callable(self.transform.expr) 113 | or ( 114 | hasattr(self.transform.expr, "func_code") 115 | and self.transform.expr.func_code.co_code != identity.expr.func_code.co_code 116 | ) 117 | or ( 118 | hasattr(self.transform.expr, "__code__") 119 | and self.transform.expr.__code__.co_code != identity.expr.__code__.co_code 120 | ) 121 | ): 122 | raise ContainerException("Cannot scalar-multiply Count with a non-identity 
transform.") 123 | if math.isnan(factor) or factor <= 0.0: 124 | return self.zero() 125 | out = self.zero() 126 | out.entries = factor * self.entries 127 | return out 128 | 129 | @inheritdoc(Container) 130 | def __rmul__(self, factor): 131 | return self.__mul__(factor) 132 | 133 | @inheritdoc(Container) 134 | def fill(self, datum, weight=1.0): 135 | self._checkForCrossReferences() 136 | 137 | if weight > 0.0: 138 | t = self.transform(weight) 139 | if not isinstance(t, numbers.Real): 140 | raise TypeError(f"function return value ({t}) must be boolean or number") 141 | 142 | # no possibility of exception from here on out (for rollback) 143 | self.entries += t 144 | 145 | def _numpy(self, _, weights, shape): 146 | import numpy 147 | 148 | if isinstance(weights, numpy.ndarray): 149 | assert len(weights.shape) == 1 150 | if shape[0] is not None: 151 | assert weights.shape[0] == shape[0] 152 | 153 | if self.transform is identity: 154 | self.entries += float(weights.sum()) 155 | else: 156 | t = self.transform(weights) 157 | assert len(t.shape) == 1 158 | if shape[0] is not None: 159 | assert t.shape[0] == shape[0] 160 | self.entries += float(t.sum()) 161 | 162 | elif shape[0] is not None: 163 | if self.transform is identity: 164 | self.entries += weights * shape[0] 165 | else: 166 | t = self.transform(numpy.array([weights])) 167 | assert len(t.shape) == 1 168 | assert t.shape[0] == 1 169 | self.entries += float(t[0]) 170 | 171 | elif isinstance(weights, (int, float, numpy.number)): 172 | if self.transform is identity: 173 | self.entries += float(weights) 174 | else: 175 | self.entries += self.transform(weights) 176 | 177 | else: 178 | raise ValueError("cannot use Numpy to fill an isolated Count (unless the weights are given as an array)") 179 | 180 | def _sparksql(self, jvm, converter): 181 | return converter.Count() # TODO: handle transform 182 | 183 | @property 184 | def children(self): 185 | """List of sub-aggregators, to make it possible to walk the tree.""" 186 | 
return [] 187 | 188 | @inheritdoc(Container) 189 | def toJsonFragment(self, suppressName): 190 | return floatToJson(self.entries) 191 | 192 | @staticmethod 193 | @inheritdoc(Factory) 194 | def fromJsonFragment(json, nameFromParent): 195 | if json in ("nan", "inf", "-inf") or isinstance(json, numbers.Real): 196 | return Count.ed(float(json)) 197 | raise JsonFormatException(json, "Count") 198 | 199 | def __repr__(self): 200 | return f"" 201 | 202 | def __eq__(self, other): 203 | return isinstance(other, Count) and numeq(self.entries, other.entries) and self.transform == other.transform 204 | 205 | def __ne__(self, other): 206 | return not self == other 207 | 208 | def __hash__(self): 209 | return hash((self.entries, self.transform)) 210 | 211 | 212 | # extra properties: number of dimensions and datatypes of sub-hists 213 | Count.n_dim = n_dim 214 | Count.datatype = datatype 215 | 216 | # register extra methods 217 | Factory.register(Count) 218 | -------------------------------------------------------------------------------- /histogrammar/primitives/fraction.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | # Copyright 2016 DIANA-HEP 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 

import math
import numbers

from histogrammar.defs import (
    Container,
    ContainerException,
    Factory,
    JsonFormatException,
    identity,
)
from histogrammar.primitives.count import Count
from histogrammar.util import (
    basestring,
    datatype,
    floatToJson,
    hasKeys,
    inheritdoc,
    maybeAdd,
    n_dim,
    numeq,
    serializable,
)


class Fraction(Factory, Container):
    """Accumulate two aggregators, one numerator and one denominator.

    Accumulate two aggregators, one containing only entries that pass a given
    selection (numerator) and another that contains all entries (denominator).

    The aggregator may be a simple :doc:`Count <histogrammar.primitives.count.Count>`
    to measure the efficiency of a cut, a :doc:`Bin <histogrammar.primitives.bin.Bin>`
    to plot a turn-on curve, or anything else to be tested with and without a cut.

    As a side effect of NaN values returning false for any comparison, a NaN return
    value from the selection is treated as a failed cut (the denominator is filled
    but the numerator is not).
    """

    @staticmethod
    def ed(entries, numerator, denominator):
        """Create a Fraction that is only capable of being added.

        Parameters:
            entries (float): the number of entries; the JSON string spellings
                ``"nan"``, ``"inf"`` and ``"-inf"`` are also accepted.
            numerator (:doc:`Container <histogrammar.defs.Container>`): the filled numerator.
            denominator (:doc:`Container <histogrammar.defs.Container>`): the filled denominator.

        Raises:
            TypeError: if ``entries`` is not a number or ``numerator``/``denominator``
                is not a Container.
            ValueError: if ``entries`` is negative.
        """
        # Convert the JSON string spellings to floats *before* validating the
        # sign: on Python 3, comparing a str against 0.0 raises TypeError, so
        # the string forms permitted by the type check could never get past
        # the `entries < 0.0` test below.
        if entries in ("nan", "inf", "-inf"):
            entries = float(entries)
        if not isinstance(entries, numbers.Real):
            raise TypeError(f"entries ({entries}) must be a number")
        if not isinstance(numerator, Container):
            raise TypeError(f"numerator ({numerator}) must be a Container")
        if not isinstance(denominator, Container):
            raise TypeError(f"denominator ({denominator}) must be a Container")
        if entries < 0.0:
            raise ValueError(f"entries ({entries}) cannot be negative")

        out = Fraction(None, None)
        out.entries = float(entries)
        out.numerator = numerator
        out.denominator = denominator
        return out.specialize()

    @staticmethod
    def ing(quantity, value=Count()):
        """Synonym for ``__init__``."""
        return Fraction(quantity, value)

    def __init__(self, quantity=identity, value=Count()):
        """Create a Fraction that is capable of being filled and added.

        Parameters:
            quantity (function returning bool or float): computes the quantity of interest
                from the data and interprets it as a selection (multiplicative factor on
                weight).
            value (:doc:`Container <histogrammar.defs.Container>`): generates
                sub-aggregators for the numerator and denominator.

        Other parameters:
            entries (float): the number of entries, initially 0.0.
            numerator (:doc:`Container <histogrammar.defs.Container>`): the sub-aggregator
                of entries that pass the selection.
            denominator (:doc:`Container <histogrammar.defs.Container>`): the
                sub-aggregator of all entries.
        """
        if value is not None and not isinstance(value, Container):
            raise TypeError(f"value ({value}) must be None or a Container")
        self.entries = 0.0
        self.quantity = serializable(identity(quantity) if isinstance(quantity, str) else quantity)
        if value is not None:
            # `value` is only a template; each sub-aggregator starts empty.
            self.numerator = value.zero()
            self.denominator = value.zero()
        super().__init__()
        self.specialize()

    @staticmethod
    def build(numerator, denominator):
        """Create a Fraction out of pre-existing containers, which might have been aggregated on different streams.

        Parameters:
            numerator (:doc:`Container <histogrammar.defs.Container>`): the filled numerator.
            denominator (:doc:`Container <histogrammar.defs.Container>`): the filled denominator.

        This function will attempt to combine the ``numerator`` and ``denominator``, so
        they must have the same binning/bounds/etc.
        """
        if not isinstance(numerator, Container):
            raise TypeError(f"numerator ({numerator}) must be a Container")
        if not isinstance(denominator, Container):
            raise TypeError(f"denominator ({denominator}) must be a Container")
        # Adding the two verifies compatibility (raises if the binnings
        # differ); the sum itself is intentionally discarded.
        numerator + denominator
        return Fraction.ed(denominator.entries, numerator, denominator)

    @inheritdoc(Container)
    def zero(self):
        out = Fraction(self.quantity, None)
        out.numerator = self.numerator.zero()
        out.denominator = self.denominator.zero()
        return out.specialize()

    @inheritdoc(Container)
    def __add__(self, other):
        if isinstance(other, Fraction):
            out = Fraction(self.quantity, None)
            out.entries = self.entries + other.entries
            out.numerator = self.numerator + other.numerator
            out.denominator = self.denominator + other.denominator
            return out.specialize()
        raise ContainerException(f"cannot add {self.name} and {other.name}")

    @inheritdoc(Container)
    def __iadd__(self, other):
        if isinstance(other, Fraction):
            self.entries += other.entries
            self.numerator += other.numerator
            self.denominator += other.denominator
            return self
        raise ContainerException(f"cannot add {self.name} and {other.name}")

    @inheritdoc(Container)
    def __mul__(self, factor):
        # NaN or non-positive weights wipe out the aggregation.
        if math.isnan(factor) or factor <= 0.0:
            return self.zero()
        out = self.zero()
        out.entries = factor * self.entries
        out.numerator = self.numerator * factor
        out.denominator = self.denominator * factor
        return out.specialize()

    @inheritdoc(Container)
    def __rmul__(self, factor):
        return self.__mul__(factor)

    @inheritdoc(Container)
    def fill(self, datum, weight=1.0):
        self._checkForCrossReferences()

        if weight > 0.0:
            w = self.quantity(datum)
            if not isinstance(w, numbers.Real):
                raise TypeError(f"function return value ({w}) must be boolean or number")
            w *= weight

            # The denominator sees every positively weighted entry; the
            # numerator only those whose selection-scaled weight is positive.
            # A NaN selection compares false against 0.0, so it fills the
            # denominator but not the numerator (a "failed cut").
            self.denominator.fill(datum, weight)
            if w > 0.0:
                self.numerator.fill(datum, w)

            # no possibility of exception from here on out (for rollback)
            self.entries += weight

    def _numpy(self, data, weights, shape):
        # Vectorized fill: selection values scale the weights; NaN and
        # negative products are zeroed so they only enter the denominator.
        w = self.quantity(data)
        self._checkNPQuantity(w, shape)
        self._checkNPWeights(weights, shape)
        weights = self._makeNPWeights(weights, shape)

        import numpy

        w = w * weights
        w[numpy.isnan(w)] = 0.0
        w[w < 0.0] = 0.0

        self.numerator._numpy(data, w, shape)
        self.denominator._numpy(data, weights, shape)

        # no possibility of exception from here on out (for rollback)
        self.entries += float(weights.sum())

    def _sparksql(self, jvm, converter):
        # The numerator serves as the template for the JVM-side aggregator;
        # numerator and denominator share the same structure by construction.
        return converter.Fraction(self.quantity.asSparkSQL(), self.numerator._sparksql(jvm, converter))

    @property
    def children(self):
        """List of sub-aggregators, to make it possible to walk the tree."""
        return [self.numerator, self.denominator]

    @inheritdoc(Container)
    def toJsonFragment(self, suppressName):
        # Sub-aggregators may carry their quantity name either on a bound
        # quantity object or as a plain quantityName attribute.
        if getattr(self.numerator, "quantity", None) is not None:
            binsName = self.numerator.quantity.name
        elif getattr(self.numerator, "quantityName", None) is not None:
            binsName = self.numerator.quantityName
        else:
            binsName = None

        return maybeAdd(
            {
                "entries": floatToJson(self.entries),
                "sub:type": self.numerator.name,
                "numerator": self.numerator.toJsonFragment(True),
                "denominator": self.denominator.toJsonFragment(True),
            },
            **{
                "name": None if suppressName else self.quantity.name,
                "sub:name": binsName,
            },
        )

    @staticmethod
    @inheritdoc(Factory)
    def fromJsonFragment(json, nameFromParent):
        if isinstance(json, dict) and hasKeys(
            json.keys(),
            ["entries", "sub:type", "numerator", "denominator"],
            ["name", "sub:name"],
        ):
            if json["entries"] in ("nan", "inf", "-inf") or isinstance(json["entries"], numbers.Real):
                entries = float(json["entries"])
            else:
                raise JsonFormatException(json, "Fraction.entries")

            if isinstance(json.get("name", None), basestring):
                name = json["name"]
            elif json.get("name", None) is None:
                name = None
            else:
                raise JsonFormatException(json["name"], "Fraction.name")

            if isinstance(json["sub:type"], basestring):
                factory = Factory.registered[json["sub:type"]]
            else:
                raise JsonFormatException(json, "Fraction.type")

            if isinstance(json.get("sub:name", None), basestring):
                subName = json["sub:name"]
            elif json.get("sub:name", None) is None:
                subName = None
            else:
                raise JsonFormatException(json["sub:name"], "Fraction.sub:name")

            numerator = factory.fromJsonFragment(json["numerator"], subName)
            denominator = factory.fromJsonFragment(json["denominator"], subName)

            out = Fraction.ed(entries, numerator, denominator)
            out.quantity.name = nameFromParent if name is None else name
            return out.specialize()

        raise JsonFormatException(json, "Fraction")

    def __repr__(self):
        # NOTE(review): the original body returned an empty f-string, which is
        # almost certainly mangled text (angle-bracket content stripped);
        # restored a conventional repr naming the sub-aggregator type.
        return f"<Fraction values={self.numerator.name}>"

    def __eq__(self, other):
        return (
            isinstance(other, Fraction)
            and numeq(self.entries, other.entries)
            and self.quantity == other.quantity
            and self.numerator == other.numerator
            and self.denominator == other.denominator
        )

    def __ne__(self, other):
        return not self == other

    def __hash__(self):
        return hash((self.entries, self.quantity, self.numerator, self.denominator))


# extra properties: number of dimensions and datatypes of sub-hists
Fraction.n_dim = n_dim
Fraction.datatype = datatype

# register extra methods
Factory.register(Fraction)
#!/usr/bin/env python

# Copyright 2016 DIANA-HEP
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
16 | 17 | import math 18 | import numbers 19 | 20 | from histogrammar.defs import ( 21 | Container, 22 | ContainerException, 23 | Factory, 24 | JsonFormatException, 25 | identity, 26 | ) 27 | from histogrammar.primitives.count import Count 28 | from histogrammar.util import ( 29 | basestring, 30 | datatype, 31 | floatToJson, 32 | hasKeys, 33 | inheritdoc, 34 | maybeAdd, 35 | n_dim, 36 | numeq, 37 | serializable, 38 | ) 39 | 40 | # Select 41 | 42 | 43 | class Select(Factory, Container): 44 | """Filter or weight data according to a given selection. 45 | 46 | This primitive is a basic building block, intended to be used in conjunction with anything that needs a 47 | user-defined cut. In particular, a standard histogram often has a custom selection, and this can be built by 48 | nesting Select -> Bin -> Count. 49 | 50 | Select also resembles :doc:`Fraction `, but without the ``denominator``. 51 | 52 | The efficiency of a cut in a Select aggregator named ``x`` is simply ``x.cut.entries / x.entries`` 53 | (because all aggregators have an ``entries`` member). 54 | """ 55 | 56 | @staticmethod 57 | def ed(entries, cut): 58 | """Create a Select that is only capable of being added. 59 | 60 | Parameters: 61 | entries (float): the number of entries. 62 | cut (:doc:`Container `): the filled sub-aggregator. 
63 | """ 64 | if not isinstance(entries, numbers.Real) and entries not in ( 65 | "nan", 66 | "inf", 67 | "-inf", 68 | ): 69 | raise TypeError(f"entries ({entries}) must be a number") 70 | if not isinstance(cut, Container): 71 | raise TypeError(f"cut ({cut}) must be a Container") 72 | if entries < 0.0: 73 | raise ValueError(f"entries ({entries}) cannot be negative") 74 | out = Select(None, cut) 75 | out.entries = float(entries) 76 | return out.specialize() 77 | 78 | @staticmethod 79 | def ing(quantity, cut=Count()): 80 | """Synonym for ``__init__``.""" 81 | return Select(quantity, cut) 82 | 83 | def __getattr__(self, attr): 84 | """Pass on searches for custom methods to the ``value``, so that Limit becomes effectively invisible.""" 85 | if attr.startswith("__") and attr.endswith("__"): 86 | return getattr(Select, attr) 87 | if attr not in self.__dict__ and hasattr(self.__dict__["cut"], attr): 88 | return getattr(self.__dict__["cut"], attr) 89 | return self.__dict__[attr] 90 | 91 | def __init__(self, quantity=identity, cut=Count()): 92 | """Create a Select that is capable of being filled and added. 93 | 94 | Parameters: 95 | quantity (function returning bool or float): computes the quantity of interest from the data and interprets 96 | it as a selection (multiplicative factor on weight). 97 | cut (:doc:`Container `): will only be filled with data that pass the cut, 98 | and which are weighted by the cut. 99 | 100 | Other Parameters: 101 | entries (float): the number of entries, initially 0.0. 
102 | """ 103 | if not isinstance(cut, Container): 104 | raise TypeError(f"cut ({cut}) must be a Container") 105 | self.entries = 0.0 106 | self.quantity = serializable(identity(quantity) if isinstance(quantity, str) else quantity) 107 | self.cut = cut 108 | super().__init__() 109 | self.specialize() 110 | 111 | def fractionPassing(self): 112 | """Fraction of weights that pass the quantity.""" 113 | return self.cut.entries / self.entries 114 | 115 | @inheritdoc(Container) 116 | def zero(self): 117 | return Select(self.quantity, self.cut.zero()) 118 | 119 | @inheritdoc(Container) 120 | def __add__(self, other): 121 | if isinstance(other, Select): 122 | out = Select(self.quantity, self.cut + other.cut) 123 | out.entries = self.entries + other.entries 124 | return out.specialize() 125 | raise ContainerException(f"cannot add {self.name} and {other.name}") 126 | 127 | @inheritdoc(Container) 128 | def __iadd__(self, other): 129 | if isinstance(other, Select): 130 | self.entries += other.entries 131 | self.cut += other.cut 132 | return self 133 | raise ContainerException(f"cannot add {self.name} and {other.name}") 134 | 135 | @inheritdoc(Container) 136 | def __mul__(self, factor): 137 | if math.isnan(factor) or factor <= 0.0: 138 | return self.zero() 139 | out = self.zero() 140 | out.entries = factor * self.entries 141 | out.cut = self.cut * factor 142 | return out.specialize() 143 | 144 | @inheritdoc(Container) 145 | def __rmul__(self, factor): 146 | return self.__mul__(factor) 147 | 148 | @inheritdoc(Container) 149 | def fill(self, datum, weight=1.0): 150 | self._checkForCrossReferences() 151 | 152 | if weight > 0.0: 153 | w = self.quantity(datum) 154 | if not isinstance(w, numbers.Real): 155 | raise TypeError(f"function return value ({w}) must be boolean or number") 156 | w *= weight 157 | 158 | if w > 0.0: 159 | self.cut.fill(datum, w) 160 | # no possibility of exception from here on out (for rollback) 161 | self.entries += weight 162 | 163 | def _numpy(self, data, 
weights, shape): 164 | w = self.quantity(data) 165 | self._checkNPQuantity(w, shape) 166 | self._checkNPWeights(weights, shape) 167 | weights = self._makeNPWeights(weights, shape) 168 | 169 | import numpy 170 | 171 | w = w * weights 172 | w[numpy.isnan(w)] = 0.0 173 | w[w < 0.0] = 0.0 174 | 175 | self.cut._numpy(data, w, shape) 176 | 177 | # no possibility of exception from here on out (for rollback) 178 | self.entries += float(weights.sum()) 179 | 180 | def _sparksql(self, jvm, converter): 181 | return converter.Select(self.quantity.asSparkSQL(), self.cut._sparksql(jvm, converter)) 182 | 183 | @property 184 | def children(self): 185 | """List of sub-aggregators, to make it possible to walk the tree.""" 186 | return [self.cut] 187 | 188 | @inheritdoc(Container) 189 | def toJsonFragment(self, suppressName): 190 | return maybeAdd( 191 | { 192 | "entries": floatToJson(self.entries), 193 | "sub:type": self.cut.name, 194 | "data": self.cut.toJsonFragment(False), 195 | }, 196 | name=(None if suppressName else self.quantity.name), 197 | ) 198 | 199 | @staticmethod 200 | @inheritdoc(Factory) 201 | def fromJsonFragment(json, nameFromParent): 202 | if isinstance(json, dict) and hasKeys(json.keys(), ["entries", "sub:type", "data"], ["name"]): 203 | if json["entries"] in ("nan", "inf", "-inf") or isinstance(json["entries"], numbers.Real): 204 | entries = float(json["entries"]) 205 | else: 206 | raise JsonFormatException(json, "Select.entries") 207 | 208 | if isinstance(json.get("name", None), basestring): 209 | name = json["name"] 210 | elif json.get("name", None) is None: 211 | name = None 212 | else: 213 | raise JsonFormatException(json["name"], "Select.name") 214 | 215 | if isinstance(json["sub:type"], basestring): 216 | factory = Factory.registered[json["sub:type"]] 217 | else: 218 | raise JsonFormatException(json, "Select.type") 219 | 220 | cut = factory.fromJsonFragment(json["data"], None) 221 | 222 | out = Select.ed(entries, cut) 223 | out.quantity.name = nameFromParent 
if name is None else name 224 | return out.specialize() 225 | 226 | raise JsonFormatException(json, "Select") 227 | 228 | def __repr__(self): 229 | return f"