├── .github
    └── workflows
    │   └── test.yml
├── .gitignore
├── .pre-commit-config.yaml
├── CHANGES.rst
├── LICENSE
├── MANIFEST.in
├── NOTICE
├── README.rst
├── docs
    ├── Makefile
    ├── conf.py
    ├── index.rst
    └── make.bat
├── histogrammar
    ├── __init__.py
    ├── convenience.py
    ├── defs.py
    ├── dfinterface
    │   ├── __init__.py
    │   ├── addmethods.py
    │   ├── filling_utils.py
    │   ├── histogram_filler_base.py
    │   ├── make_histograms.py
    │   ├── pandas_histogrammar.py
    │   └── spark_histogrammar.py
    ├── notebooks
    │   ├── __init__.py
    │   ├── histogrammar_tutorial_advanced.ipynb
    │   ├── histogrammar_tutorial_basic.ipynb
    │   └── histogrammar_tutorial_exercises.ipynb
    ├── plot
    │   ├── __init__.py
    │   ├── bokeh.py
    │   ├── hist_numpy.py
    │   └── matplotlib.py
    ├── primitives
    │   ├── __init__.py
    │   ├── average.py
    │   ├── bag.py
    │   ├── bin.py
    │   ├── categorize.py
    │   ├── centrallybin.py
    │   ├── collection.py
    │   ├── count.py
    │   ├── deviate.py
    │   ├── fraction.py
    │   ├── irregularlybin.py
    │   ├── minmax.py
    │   ├── select.py
    │   ├── sparselybin.py
    │   ├── stack.py
    │   └── sum.py
    ├── resources.py
    ├── sparksql.py
    ├── specialized.py
    ├── test_data
    │   ├── __init__.py
    │   └── test.csv.gz
    ├── util.py
    └── version.py
├── makedocs.py
├── pyproject.toml
└── tests
    ├── __init__.py
    ├── conftest.py
    ├── jars
        ├── histogrammar-sparksql_2.11-1.0.11.jar
        ├── histogrammar-sparksql_2.11-1.0.20.jar
        ├── histogrammar-sparksql_2.12-1.0.11.jar
        ├── histogrammar-sparksql_2.12-1.0.20.jar
        ├── histogrammar_2.11-1.0.11.jar
        ├── histogrammar_2.11-1.0.20.jar
        ├── histogrammar_2.12-1.0.11.jar
        └── histogrammar_2.12-1.0.20.jar
    ├── resources
        ├── age.json
        ├── company.json
        ├── date.json
        ├── eyesColor.json
        ├── gender.json
        ├── isActive.json
        ├── isActive_age.json
        ├── latitude.json
        ├── latitude_longitude.json
        ├── longitude.json
        └── transaction.json
    ├── test_basic.py
    ├── test_notebooks.py
    ├── test_numpy.py
    ├── test_pandas_histogrammar.py
    ├── test_spark_histogrammar.py
    └── test_spec.py


/.github/workflows/test.yml:
--------------------------------------------------------------------------------
 1 | name: test
 2 | 
 3 | on:
 4 |   push:
 5 |     branches: [ master, develop ]
 6 |   pull_request:
 7 | 
 8 | jobs:
 9 |   test:
10 |     strategy:
11 |       matrix:
12 |         os: [ ubuntu-latest ]
13 |         python: [ "3.9", "3.10", "3.11", "3.12" ]
14 |         numpy_version: [ "numpy-latest", "numpy<2" ]
15 |     runs-on: ${{ matrix.os }}
16 | 
17 |     steps:
18 |       - name: Checkout code
19 |         uses: actions/checkout@v2
20 | 
21 |       - name: Set up Python ${{ matrix.python }}
22 |         uses: actions/setup-python@v1
23 |         with:
24 |           python-version: ${{ matrix.python }}
25 | 
26 |       - name: Use cache for pip dependencies
27 |         uses: actions/cache@v3
28 |         with:
29 |           path: ~/.cache/pip
30 |           key: ${{ runner.os }}-pip-${{ hashFiles('**/pyproject.toml') }}
31 |           restore-keys: |
32 |             ${{ runner.os }}-pip-
33 | 
34 |       - name: Install dependencies
35 |         run: |
36 |           python -m pip install --upgrade pip
37 |           if [ "${{ matrix.numpy_version }}" = "numpy<2" ]; then
38 |             pip install ".[test,pandas,spark,test_numpy_pre2]"
39 |           else
40 |             pip install ".[test,pandas,spark]"          
41 |           fi
42 | 
43 |       - name: Lint with pre-commit
44 |         run: |
45 |           pip install pre-commit
46 |           pre-commit run --all-files --show-diff-on-failure
47 | 
48 |       - name: Test with pytest
49 |         run: |
50 |           pytest tests
51 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
  1 | # Emacs
  2 | *~
  3 | \#*\#
  4 | 
  5 | # Generated by test
  6 | plot_*.html
  7 | 
  8 | # Byte-compiled / optimized / DLL files
  9 | __pycache__/
 10 | *.py[cod]
 11 | *$py.class
 12 | 
 13 | # C extensions
 14 | *.so
 15 | 
 16 | # Distribution / packaging
 17 | .Python
 18 | env/
 19 | build/
 20 | develop-eggs/
 21 | dist/
 22 | downloads/
 23 | eggs/
 24 | .eggs/
 25 | lib/
 26 | lib64/
 27 | parts/
 28 | sdist/
 29 | var/
 30 | *.egg-info/
 31 | .installed.cfg
 32 | *.egg
 33 | 
 34 | # PyInstaller
 35 | #  Usually these files are written by a python script from a template
 36 | #  before PyInstaller builds the exe, so as to inject date/other infos into it.
 37 | *.manifest
 38 | *.spec
 39 | 
 40 | # Installer logs
 41 | pip-log.txt
 42 | pip-delete-this-directory.txt
 43 | 
 44 | # Unit test / coverage reports
 45 | htmlcov/
 46 | .tox/
 47 | .coverage
 48 | .coverage.*
 49 | .cache
 50 | nosetests.xml
 51 | coverage.xml
 52 | *,cover
 53 | .hypothesis/
 54 | 
 55 | # Translations
 56 | *.mo
 57 | *.pot
 58 | 
 59 | # Django stuff:
 60 | *.log
 61 | local_settings.py
 62 | 
 63 | # Flask stuff:
 64 | instance/
 65 | .webassets-cache
 66 | 
 67 | # Scrapy stuff:
 68 | .scrapy
 69 | 
 70 | # Sphinx documentation
 71 | docs/_build/
 72 | histogrammar.*.rst
 73 | 
 74 | # PyBuilder
 75 | target/
 76 | 
 77 | # IPython Notebook
 78 | .ipynb_checkpoints
 79 | 
 80 | # pyenv
 81 | .python-version
 82 | 
 83 | # celery beat schedule file
 84 | celerybeat-schedule
 85 | 
 86 | # dotenv
 87 | .env
 88 | 
 89 | # virtualenv
 90 | venv/
 91 | ENV/
 92 | 
 93 | # Spyder project settings
 94 | .spyderproject
 95 | 
 96 | # Rope project settings
 97 | .ropeproject
 98 | 
 99 | # tests output files
100 | histogrammar/notebooks/*.json
101 | 


--------------------------------------------------------------------------------
/.pre-commit-config.yaml:
--------------------------------------------------------------------------------
 1 | repos:
 2 | - repo: https://github.com/astral-sh/ruff-pre-commit
 3 |   rev: v0.1.6
 4 |   hooks:
 5 |     - id: ruff
 6 |       args: [--fix]
 7 |       exclude: notebooks/
 8 |     - id: ruff-format
 9 |       exclude: notebooks/
10 | 


--------------------------------------------------------------------------------
/CHANGES.rst:
--------------------------------------------------------------------------------
 1 | =============
 2 | Release notes
 3 | =============
 4 | 
 5 | Version 1.1.0, Dec 2024
 6 | -----------------------
 7 | * Removed all ROOT, cpp and cuda code, it was no longer supported.
 8 | 
 9 | Version 1.0.34, Dec 2024
10 | ------------------------
11 | * Fix typo in build pipeline Python versions config list.
12 | * Fix error in SparselyBin __eq__ method.
13 | * Fix test utility corner case error (test_numpy.twosigfigs function).
14 | * Fix error in test context manager for pandas which prevented execution of tests.
15 | * Fix error in expected bin count in test_numpy.test_n_bins test.
16 | * Prevent logging zero execution time TestNumpy class.
17 | 
18 | * Remove Python 3.8 environment from build pipeline.
19 | * Support numpy >= 2.0.0 (np.string_ -> np.bytes_, np.unicode_ -> np.str_).
20 | * Remove uses of pd.util.testing.makeMixedDataFrame not available in pandas >= 2.0.0.
21 | * Switch from 'pkg_resources' to 'importlib' module for resolving package files.
22 | * Switch from 'distutils.spawn' to 'shutil.which' for finding nvcc command.
23 | 
24 | * Remove unused test_gpu.twosigfigs function.
25 | * Refactor tests with Numpy() and Pandas() context managers to use single 'with' statement.
26 | 
27 | * Switch from setup.py to pyproject.toml
28 | * Add numpy<2,pandas<2 test environment to build pipeline test matrix
29 | 
30 | Version 1.0.33, Dec 2022
31 | ------------------------
32 | * fix of get_sub_hist() when Bin histogram is filled only with nans.
33 | 
34 | Version 1.0.32, Sep 2022
35 | ------------------------
36 | * Support for decimal datetype in pandas and spark.
37 | 
38 | Version 1.0.31, Aug 2022
39 | ------------------------
40 | * fix of spark df timestamp datatype detection (#59)
41 | * fix for invalid bin_edges for SparselyBin histogram (#60)
42 | 
43 | Version 1.0.30, June 2022
44 | -------------------------
45 | * Fix for machine-level rounding error, which can show up on in num_bins() call of Bin histogram.
46 | * supersedes broken v1.0.29
47 | 
48 | Version 1.0.28, June 2022
49 | -------------------------
50 | * Multiple performance updates, to Bin, SparselyBin and Categorize histograms.
51 | * SparselyBin, Categorize: optimized filling with 1-d and 2-d numpy arrays
52 | * Bin, SparselyBin, Categorize: (fast) numpy arrays for bin-centers and bin-labels.
53 | * Count: new, fast filling option when float weight is known.
54 | * util.py: faster get_datatype() and get_ndim() functions.
55 | 
56 | Version 1.0.27, May 2022
57 | ------------------------
58 | * Multiple performance updates, thanks to Simon Brugman.
59 | * Use pandas functions to infer datatypes and return numpy arrays.
60 | * Turn of unnecessary specialize function (slow) for Count objects.
61 | 
62 | Version 1.0.26, Apr 2022
63 | ------------------------
64 | * Added tutorial notebook with exercises.
65 | * Fixed 2d heatmap for categorical histograms, where one column was accidentally dropped.
66 | 
67 | Version 1.0.25, Apr 2021
68 | ------------------------
69 | * Improve null handling in pandas dataframes, by inferring datatype using pandas' infer_dtype function.
70 | * nans in bool columns get converted to "NaN", so the column keeps True and False values in Categorize.
71 | * columns of type object get converted to strings using to_string(), of type string uses only_str().
72 | 
73 | Version 1.0.24, Apr 2021
74 | ------------------------
75 | * Categorize histogram now handles nones and nans in friendlier way, they are converted to "NaN".
76 | * make_histogram() now casts spark nulls to nan in case of numeric columns. scala interprets null as 0.
77 | * SparselyBin histograms did not add up nanflow when added. Now fixed.
78 | * Added unit test for doing checks on null conversion to nans
79 | * Use new histogrammar-scala jar files, v1.0.20
80 | * Added histogrammar-scala v1.0.20 jar files to tests/jars/
81 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
  1 |                                  Apache License
  2 |                            Version 2.0, January 2004
  3 |                         http://www.apache.org/licenses/
  4 | 
  5 |    TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
  6 | 
  7 |    1. Definitions.
  8 | 
  9 |       "License" shall mean the terms and conditions for use, reproduction,
 10 |       and distribution as defined by Sections 1 through 9 of this document.
 11 | 
 12 |       "Licensor" shall mean the copyright owner or entity authorized by
 13 |       the copyright owner that is granting the License.
 14 | 
 15 |       "Legal Entity" shall mean the union of the acting entity and all
 16 |       other entities that control, are controlled by, or are under common
 17 |       control with that entity. For the purposes of this definition,
 18 |       "control" means (i) the power, direct or indirect, to cause the
 19 |       direction or management of such entity, whether by contract or
 20 |       otherwise, or (ii) ownership of fifty percent (50%) or more of the
 21 |       outstanding shares, or (iii) beneficial ownership of such entity.
 22 | 
 23 |       "You" (or "Your") shall mean an individual or Legal Entity
 24 |       exercising permissions granted by this License.
 25 | 
 26 |       "Source" form shall mean the preferred form for making modifications,
 27 |       including but not limited to software source code, documentation
 28 |       source, and configuration files.
 29 | 
 30 |       "Object" form shall mean any form resulting from mechanical
 31 |       transformation or translation of a Source form, including but
 32 |       not limited to compiled object code, generated documentation,
 33 |       and conversions to other media types.
 34 | 
 35 |       "Work" shall mean the work of authorship, whether in Source or
 36 |       Object form, made available under the License, as indicated by a
 37 |       copyright notice that is included in or attached to the work
 38 |       (an example is provided in the Appendix below).
 39 | 
 40 |       "Derivative Works" shall mean any work, whether in Source or Object
 41 |       form, that is based on (or derived from) the Work and for which the
 42 |       editorial revisions, annotations, elaborations, or other modifications
 43 |       represent, as a whole, an original work of authorship. For the purposes
 44 |       of this License, Derivative Works shall not include works that remain
 45 |       separable from, or merely link (or bind by name) to the interfaces of,
 46 |       the Work and Derivative Works thereof.
 47 | 
 48 |       "Contribution" shall mean any work of authorship, including
 49 |       the original version of the Work and any modifications or additions
 50 |       to that Work or Derivative Works thereof, that is intentionally
 51 |       submitted to Licensor for inclusion in the Work by the copyright owner
 52 |       or by an individual or Legal Entity authorized to submit on behalf of
 53 |       the copyright owner. For the purposes of this definition, "submitted"
 54 |       means any form of electronic, verbal, or written communication sent
 55 |       to the Licensor or its representatives, including but not limited to
 56 |       communication on electronic mailing lists, source code control systems,
 57 |       and issue tracking systems that are managed by, or on behalf of, the
 58 |       Licensor for the purpose of discussing and improving the Work, but
 59 |       excluding communication that is conspicuously marked or otherwise
 60 |       designated in writing by the copyright owner as "Not a Contribution."
 61 | 
 62 |       "Contributor" shall mean Licensor and any individual or Legal Entity
 63 |       on behalf of whom a Contribution has been received by Licensor and
 64 |       subsequently incorporated within the Work.
 65 | 
 66 |    2. Grant of Copyright License. Subject to the terms and conditions of
 67 |       this License, each Contributor hereby grants to You a perpetual,
 68 |       worldwide, non-exclusive, no-charge, royalty-free, irrevocable
 69 |       copyright license to reproduce, prepare Derivative Works of,
 70 |       publicly display, publicly perform, sublicense, and distribute the
 71 |       Work and such Derivative Works in Source or Object form.
 72 | 
 73 |    3. Grant of Patent License. Subject to the terms and conditions of
 74 |       this License, each Contributor hereby grants to You a perpetual,
 75 |       worldwide, non-exclusive, no-charge, royalty-free, irrevocable
 76 |       (except as stated in this section) patent license to make, have made,
 77 |       use, offer to sell, sell, import, and otherwise transfer the Work,
 78 |       where such license applies only to those patent claims licensable
 79 |       by such Contributor that are necessarily infringed by their
 80 |       Contribution(s) alone or by combination of their Contribution(s)
 81 |       with the Work to which such Contribution(s) was submitted. If You
 82 |       institute patent litigation against any entity (including a
 83 |       cross-claim or counterclaim in a lawsuit) alleging that the Work
 84 |       or a Contribution incorporated within the Work constitutes direct
 85 |       or contributory patent infringement, then any patent licenses
 86 |       granted to You under this License for that Work shall terminate
 87 |       as of the date such litigation is filed.
 88 | 
 89 |    4. Redistribution. You may reproduce and distribute copies of the
 90 |       Work or Derivative Works thereof in any medium, with or without
 91 |       modifications, and in Source or Object form, provided that You
 92 |       meet the following conditions:
 93 | 
 94 |       (a) You must give any other recipients of the Work or
 95 |           Derivative Works a copy of this License; and
 96 | 
 97 |       (b) You must cause any modified files to carry prominent notices
 98 |           stating that You changed the files; and
 99 | 
100 |       (c) You must retain, in the Source form of any Derivative Works
101 |           that You distribute, all copyright, patent, trademark, and
102 |           attribution notices from the Source form of the Work,
103 |           excluding those notices that do not pertain to any part of
104 |           the Derivative Works; and
105 | 
106 |       (d) If the Work includes a "NOTICE" text file as part of its
107 |           distribution, then any Derivative Works that You distribute must
108 |           include a readable copy of the attribution notices contained
109 |           within such NOTICE file, excluding those notices that do not
110 |           pertain to any part of the Derivative Works, in at least one
111 |           of the following places: within a NOTICE text file distributed
112 |           as part of the Derivative Works; within the Source form or
113 |           documentation, if provided along with the Derivative Works; or,
114 |           within a display generated by the Derivative Works, if and
115 |           wherever such third-party notices normally appear. The contents
116 |           of the NOTICE file are for informational purposes only and
117 |           do not modify the License. You may add Your own attribution
118 |           notices within Derivative Works that You distribute, alongside
119 |           or as an addendum to the NOTICE text from the Work, provided
120 |           that such additional attribution notices cannot be construed
121 |           as modifying the License.
122 | 
123 |       You may add Your own copyright statement to Your modifications and
124 |       may provide additional or different license terms and conditions
125 |       for use, reproduction, or distribution of Your modifications, or
126 |       for any such Derivative Works as a whole, provided Your use,
127 |       reproduction, and distribution of the Work otherwise complies with
128 |       the conditions stated in this License.
129 | 
130 |    5. Submission of Contributions. Unless You explicitly state otherwise,
131 |       any Contribution intentionally submitted for inclusion in the Work
132 |       by You to the Licensor shall be under the terms and conditions of
133 |       this License, without any additional terms or conditions.
134 |       Notwithstanding the above, nothing herein shall supersede or modify
135 |       the terms of any separate license agreement you may have executed
136 |       with Licensor regarding such Contributions.
137 | 
138 |    6. Trademarks. This License does not grant permission to use the trade
139 |       names, trademarks, service marks, or product names of the Licensor,
140 |       except as required for reasonable and customary use in describing the
141 |       origin of the Work and reproducing the content of the NOTICE file.
142 | 
143 |    7. Disclaimer of Warranty. Unless required by applicable law or
144 |       agreed to in writing, Licensor provides the Work (and each
145 |       Contributor provides its Contributions) on an "AS IS" BASIS,
146 |       WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
147 |       implied, including, without limitation, any warranties or conditions
148 |       of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
149 |       PARTICULAR PURPOSE. You are solely responsible for determining the
150 |       appropriateness of using or redistributing the Work and assume any
151 |       risks associated with Your exercise of permissions under this License.
152 | 
153 |    8. Limitation of Liability. In no event and under no legal theory,
154 |       whether in tort (including negligence), contract, or otherwise,
155 |       unless required by applicable law (such as deliberate and grossly
156 |       negligent acts) or agreed to in writing, shall any Contributor be
157 |       liable to You for damages, including any direct, indirect, special,
158 |       incidental, or consequential damages of any character arising as a
159 |       result of this License or out of the use or inability to use the
160 |       Work (including but not limited to damages for loss of goodwill,
161 |       work stoppage, computer failure or malfunction, or any and all
162 |       other commercial damages or losses), even if such Contributor
163 |       has been advised of the possibility of such damages.
164 | 
165 |    9. Accepting Warranty or Additional Liability. While redistributing
166 |       the Work or Derivative Works thereof, You may choose to offer,
167 |       and charge a fee for, acceptance of support, warranty, indemnity,
168 |       or other liability obligations and/or rights consistent with this
169 |       License. However, in accepting such obligations, You may act only
170 |       on Your own behalf and on Your sole responsibility, not on behalf
171 |       of any other Contributor, and only if You agree to indemnify,
172 |       defend, and hold each Contributor harmless for any liability
173 |       incurred by, or claims asserted against, such Contributor by reason
174 |       of your accepting any such warranty or additional liability.
175 | 


--------------------------------------------------------------------------------
/MANIFEST.in:
--------------------------------------------------------------------------------
1 | include requirements.txt
2 | include LICENSE
3 | include NOTICE


--------------------------------------------------------------------------------
/NOTICE:
--------------------------------------------------------------------------------
 1 | ################################################################################################
 2 | #
 3 | # NOTICE: pass-through licensing of bundled components
 4 | #
 5 | # Histogrammar gathers together a toolkit of pre-existing third-party 
 6 | # open-source software components. These software components are governed by their own licenses 
 7 | # which Histogrammar does not modify or supersede, please consult the originating 
 8 | # authors. These components altogether have a mixture of the following licenses: Apache 2.0, MIT.
 9 | #
10 | # Although we have examined the licenses to verify acceptance of commercial and non-commercial
11 | # use, please see and consult the original licenses or authors.
12 | #
13 | # Here is the full list of license dependencies:
14 | #
15 | # numpy: https://github.com/numpy/numpy/blob/master/LICENSE.txt
16 | # tqdm: https://github.com/tqdm/tqdm/blob/master/LICENCE
17 | # matplotlib: https://github.com/matplotlib/matplotlib/blob/master/LICENSE/LICENSE
18 | # joblib: https://github.com/joblib/joblib/blob/master/LICENSE.txt
19 | # root: https://root.cern.ch/license
20 | # popmon: https://github.com/ing-bank/popmon/blob/master/LICENSE
21 | #
22 | # There are several functions/classes where code or techniques have been reproduced and/or modified
23 | # from existing open-source packages. We list these here:
24 | #
25 | # Package: popmon
26 | # popmon file: histogrammar/dfinterface/spark_histogrammar.py
27 | #    Class: SparkHistogrammar
28 | #    Reference: https://github.com/ing-bank/popmon/blob/master/popmon/hist/filling/spark_histogrammar.py
29 | # popmon file: histogrammar/dfinterface/pandas_histogrammar.py
30 | #    Class: PandasHistogrammar
31 | #    Reference: https://github.com/ing-bank/popmon/blob/master/popmon/hist/filling/pandas_histogrammar.py
32 | # popmon file: histogrammar/dfinterface/histogram_filler_base.py
33 | #    Class: HistogramFillerBase
34 | #    Reference: https://github.com/ing-bank/popmon/blob/master/popmon/hist/filling/histogram_filler_base.py
35 | # License: MIT
36 | #    For details see: https://github.com/ing-bank/popmon/blob/master/LICENSE
37 | #
38 | ################################################################################################
39 | 


--------------------------------------------------------------------------------
/README.rst:
--------------------------------------------------------------------------------
  1 | ==================================
  2 | histogrammar Python implementation
  3 | ==================================
  4 | 
  5 | histogrammar is a Python package for creating histograms. histogrammar has multiple histogram types,
  6 | supports numeric and categorical features, and works with Numpy arrays and Pandas and Spark dataframes.
  7 | Once a histogram is filled, it's easy to plot it, store it in JSON format (and retrieve it), or convert
  8 | it to Numpy arrays for further analysis.
  9 | 
 10 | At its core histogrammar is a suite of data aggregation primitives designed for use in parallel processing.
 11 | In the simplest case, you can use this to compute histograms, but the generality of the primitives
 12 | allows much more.
 13 | 
 14 | Several common histogram types can be plotted in Matplotlib and Bokeh with a single method call.
 15 | If Numpy or Pandas is available, histograms and other aggregators can be filled from arrays ten to a hundred times
 16 | more quickly via Numpy commands, rather than Python for loops.
 17 | 
 18 | This Python implementation of histogrammar been tested to guarantee compatibility with its Scala implementation.
 19 | 
 20 | Latest Python release: v1.1.0 (Feb 2025).
 21 | Latest update: Feb 2025.
 22 | 
 23 | References
 24 | ==========
 25 | 
 26 | Histogrammar is a core component of `popmon <https://github.com/ing-bank/popmon>`_, a package by ING bank
 27 | that allows one to check the stability of a dataset. popmon works with both pandas and spark datasets,
 28 | largely thanks to Histogrammar.
 29 | 
 30 | 
 31 | 
 32 | Announcements
 33 | =============
 34 | 
 35 | Changes
 36 | -------
 37 | 
 38 | See Changes log `here <https://github.com/histogrammar/histogrammar-python/blob/master/CHANGES.rst>`_.
 39 | 
 40 | 
 41 | Spark 3.X
 42 | ---------
 43 | 
 44 | With Spark 3.X, based on Scala 2.12 or 2.13, make sure to pick up the correct histogrammar jar files:
 45 | 
 46 | .. code-block:: python
 47 | 
 48 |   spark = SparkSession.builder.config("spark.jars.packages", "io.github.histogrammar:histogrammar_2.12:1.0.30,io.github.histogrammar:histogrammar-sparksql_2.12:1.0.30").getOrCreate()
 49 | 
 50 | 
 51 | For Scala 2.13, in the string above simply replace "2.12" with "2.13".
 52 | 
 53 | December, 2023
 54 | 
 55 | 
 56 | Example notebooks
 57 | =================
 58 | 
 59 | .. list-table::
 60 |    :widths: 80 20
 61 |    :header-rows: 1
 62 | 
 63 |    * - Tutorial
 64 |      - Colab link
 65 |    * - `Basic tutorial <https://nbviewer.jupyter.org/github/histogrammar/histogrammar-python/blob/master/histogrammar/notebooks/histogrammar_tutorial_basic.ipynb>`_
 66 |      - |notebook_basic_colab|
 67 |    * - `Detailed example (featuring configuration, Apache Spark and more) <https://nbviewer.jupyter.org/github/histogrammar/histogrammar-python/blob/master/histogrammar/notebooks/histogrammar_tutorial_advanced.ipynb>`_
 68 |      - |notebook_advanced_colab|
 69 |    * - `Exercises <https://nbviewer.jupyter.org/github/histogrammar/histogrammar-python/blob/master/histogrammar/notebooks/histogrammar_tutorial_exercises.ipynb>`_
 70 |      - |notebook_exercises_colab|
 71 | 
 72 | Documentation
 73 | =============
 74 | 
 75 | See `histogrammar-docs <https://histogrammar.github.io/histogrammar-docs/>`_ for a complete introduction to `histogrammar`.
 76 | (A bit old but still good.) There you can also find documentation about the Scala implementation of `histogrammar`.
 77 | 
 78 | Check it out
 79 | ============
 80 | 
 81 | The `historgrammar` library requires Python 3.8+ and is pip friendly. To get started, simply do:
 82 | 
 83 | .. code-block:: bash
 84 | 
 85 |   $ pip install histogrammar
 86 | 
 87 | or check out the code from our GitHub repository:
 88 | 
 89 | .. code-block:: bash
 90 | 
 91 |   $ git clone https://github.com/histogrammar/histogrammar-python
 92 |   $ pip install -e histogrammar-python
 93 | 
 94 | where in this example the code is installed in edit mode (option -e).
 95 | 
 96 | You can now use the package in Python with:
 97 | 
 98 | .. code-block:: python
 99 | 
100 |   import histogrammar
101 | 
102 | **Congratulations, you are now ready to use the histogrammar library!**
103 | 
104 | Quick run
105 | =========
106 | 
107 | As a quick example, you can do:
108 | 
109 | .. code-block:: python
110 | 
111 |   import pandas as pd
112 |   import histogrammar as hg
113 |   from histogrammar import resources
114 | 
115 |   # open synthetic data
116 |   df = pd.read_csv(resources.data('test.csv.gz'), parse_dates=['date'])
117 |   df.head()
118 | 
119 |   # create a histogram, tell it to look for column 'age'
120 |   # fill the histogram with column 'age' and plot it
121 |   hist = hg.Histogram(num=100, low=0, high=100, quantity='age')
122 |   hist.fill.numpy(df)
123 |   hist.plot.matplotlib()
124 | 
125 |   # generate histograms of all features in the dataframe using automatic binning
126 |   # (importing histogrammar automatically adds this functionality to a pandas or spark dataframe)
127 |   hists = df.hg_make_histograms()
128 |   print(hists.keys())
129 | 
130 |   # multi-dimensional histograms are also supported. e.g. features longitude vs latitude
131 |   hists = df.hg_make_histograms(features=['longitude:latitude'])
132 |   ll = hists['longitude:latitude']
133 |   ll.plot.matplotlib()
134 | 
135 |   # store histogram and retrieve it again
136 |   ll.toJsonFile('longitude_latitude.json')
137 |   ll2 = hg.Factory().fromJsonFile('longitude_latitude.json')
138 | 
139 | These examples also work with Spark dataframes (sdf):
140 | 
141 | .. code-block:: python
142 | 
143 |   from pyspark.sql.functions import col
144 |   hist = hg.Histogram(num=100, low=0, high=100, quantity=col('age'))
145 |   hist.fill.sparksql(sdf)
146 | 
147 | For more examples please see the example notebooks and tutorials.
148 | 
149 | 
150 | Project contributors
151 | ====================
152 | 
153 | This package was originally authored by DIANA-HEP and is now maintained by volunteers.
154 | 
155 | Contact and support
156 | ===================
157 | 
158 | * Issues & Ideas & Support: https://github.com/histogrammar/histogrammar-python/issues
159 | 
160 | Please note that `histogrammar` is supported only on a best-effort basis.
161 | 
162 | License
163 | =======
164 | `histogrammar` is completely free, open-source and licensed under the `Apache-2.0 license <https://en.wikipedia.org/wiki/Apache_License>`_.
165 | 
166 | .. |notebook_basic_colab| image:: https://colab.research.google.com/assets/colab-badge.svg
167 |     :alt: Open in Colab
168 |     :target: https://colab.research.google.com/github/histogrammar/histogrammar-python/blob/master/histogrammar/notebooks/histogrammar_tutorial_basic.ipynb
169 | .. |notebook_advanced_colab| image:: https://colab.research.google.com/assets/colab-badge.svg
170 |     :alt: Open in Colab
171 |     :target: https://colab.research.google.com/github/histogrammar/histogrammar-python/blob/master/histogrammar/notebooks/histogrammar_tutorial_advanced.ipynb
172 | .. |notebook_exercises_colab| image:: https://colab.research.google.com/assets/colab-badge.svg
173 |     :alt: Open in Colab
174 |     :target: https://colab.research.google.com/github/histogrammar/histogrammar-python/blob/master/histogrammar/notebooks/histogrammar_tutorial_exercises.ipynb
175 | 


--------------------------------------------------------------------------------
/docs/Makefile:
--------------------------------------------------------------------------------
  1 | # Makefile for Sphinx documentation
  2 | #
  3 | 
  4 | # You can set these variables from the command line.
  5 | SPHINXOPTS    =
  6 | SPHINXBUILD   = sphinx-build
  7 | PAPER         =
  8 | BUILDDIR      = _build
  9 | 
 10 | # Internal variables.
 11 | PAPEROPT_a4     = -D latex_paper_size=a4
 12 | PAPEROPT_letter = -D latex_paper_size=letter
 13 | ALLSPHINXOPTS   = -d $(BUILDDIR)/doctrees $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) .
 14 | # the i18n builder cannot share the environment and doctrees with the others
 15 | I18NSPHINXOPTS  = $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) .
 16 | 
 17 | .PHONY: help clean html dirhtml singlehtml pickle json htmlhelp qthelp devhelp epub latex latexpdf text man changes linkcheck doctest gettext
 18 | 
 19 | help:
 20 | 	@echo "Please use \`make <target>' where <target> is one of"
 21 | 	@echo "  html       to make standalone HTML files"
 22 | 	@echo "  dirhtml    to make HTML files named index.html in directories"
 23 | 	@echo "  singlehtml to make a single large HTML file"
 24 | 	@echo "  pickle     to make pickle files"
 25 | 	@echo "  json       to make JSON files"
 26 | 	@echo "  htmlhelp   to make HTML files and a HTML help project"
 27 | 	@echo "  qthelp     to make HTML files and a qthelp project"
 28 | 	@echo "  devhelp    to make HTML files and a Devhelp project"
 29 | 	@echo "  epub       to make an epub"
 30 | 	@echo "  latex      to make LaTeX files, you can set PAPER=a4 or PAPER=letter"
 31 | 	@echo "  latexpdf   to make LaTeX files and run them through pdflatex"
 32 | 	@echo "  text       to make text files"
 33 | 	@echo "  man        to make manual pages"
 34 | 	@echo "  texinfo    to make Texinfo files"
 35 | 	@echo "  info       to make Texinfo files and run them through makeinfo"
 36 | 	@echo "  gettext    to make PO message catalogs"
 37 | 	@echo "  changes    to make an overview of all changed/added/deprecated items"
 38 | 	@echo "  linkcheck  to check all external links for integrity"
 39 | 	@echo "  doctest    to run all doctests embedded in the documentation (if enabled)"
 40 | 
 41 | clean:
 42 | 	-rm -rf $(BUILDDIR)/*
 43 | 
 44 | html:
 45 | 	$(SPHINXBUILD) -b html $(ALLSPHINXOPTS) $(BUILDDIR)/html
 46 | 	@echo
 47 | 	@echo "Build finished. The HTML pages are in $(BUILDDIR)/html."
 48 | 
 49 | dirhtml:
 50 | 	$(SPHINXBUILD) -b dirhtml $(ALLSPHINXOPTS) $(BUILDDIR)/dirhtml
 51 | 	@echo
 52 | 	@echo "Build finished. The HTML pages are in $(BUILDDIR)/dirhtml."
 53 | 
 54 | singlehtml:
 55 | 	$(SPHINXBUILD) -b singlehtml $(ALLSPHINXOPTS) $(BUILDDIR)/singlehtml
 56 | 	@echo
 57 | 	@echo "Build finished. The HTML page is in $(BUILDDIR)/singlehtml."
 58 | 
 59 | pickle:
 60 | 	$(SPHINXBUILD) -b pickle $(ALLSPHINXOPTS) $(BUILDDIR)/pickle
 61 | 	@echo
 62 | 	@echo "Build finished; now you can process the pickle files."
 63 | 
 64 | json:
 65 | 	$(SPHINXBUILD) -b json $(ALLSPHINXOPTS) $(BUILDDIR)/json
 66 | 	@echo
 67 | 	@echo "Build finished; now you can process the JSON files."
 68 | 
 69 | htmlhelp:
 70 | 	$(SPHINXBUILD) -b htmlhelp $(ALLSPHINXOPTS) $(BUILDDIR)/htmlhelp
 71 | 	@echo
 72 | 	@echo "Build finished; now you can run HTML Help Workshop with the" \
 73 | 	      ".hhp project file in $(BUILDDIR)/htmlhelp."
 74 | 
 75 | qthelp:
 76 | 	$(SPHINXBUILD) -b qthelp $(ALLSPHINXOPTS) $(BUILDDIR)/qthelp
 77 | 	@echo
 78 | 	@echo "Build finished; now you can run "qcollectiongenerator" with the" \
 79 | 	      ".qhcp project file in $(BUILDDIR)/qthelp, like this:"
 80 | 	@echo "# qcollectiongenerator $(BUILDDIR)/qthelp/Histogrammar.qhcp"
 81 | 	@echo "To view the help file:"
 82 | 	@echo "# assistant -collectionFile $(BUILDDIR)/qthelp/Histogrammar.qhc"
 83 | 
 84 | devhelp:
 85 | 	$(SPHINXBUILD) -b devhelp $(ALLSPHINXOPTS) $(BUILDDIR)/devhelp
 86 | 	@echo
 87 | 	@echo "Build finished."
 88 | 	@echo "To view the help file:"
 89 | 	@echo "# mkdir -p $$HOME/.local/share/devhelp/Histogrammar"
 90 | 	@echo "# ln -s $(BUILDDIR)/devhelp $$HOME/.local/share/devhelp/Histogrammar"
 91 | 	@echo "# devhelp"
 92 | 
 93 | epub:
 94 | 	$(SPHINXBUILD) -b epub $(ALLSPHINXOPTS) $(BUILDDIR)/epub
 95 | 	@echo
 96 | 	@echo "Build finished. The epub file is in $(BUILDDIR)/epub."
 97 | 
 98 | latex:
 99 | 	$(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex
100 | 	@echo
101 | 	@echo "Build finished; the LaTeX files are in $(BUILDDIR)/latex."
102 | 	@echo "Run \`make' in that directory to run these through (pdf)latex" \
103 | 	      "(use \`make latexpdf' here to do that automatically)."
104 | 
105 | latexpdf:
106 | 	$(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex
107 | 	@echo "Running LaTeX files through pdflatex..."
108 | 	$(MAKE) -C $(BUILDDIR)/latex all-pdf
109 | 	@echo "pdflatex finished; the PDF files are in $(BUILDDIR)/latex."
110 | 
111 | text:
112 | 	$(SPHINXBUILD) -b text $(ALLSPHINXOPTS) $(BUILDDIR)/text
113 | 	@echo
114 | 	@echo "Build finished. The text files are in $(BUILDDIR)/text."
115 | 
116 | man:
117 | 	$(SPHINXBUILD) -b man $(ALLSPHINXOPTS) $(BUILDDIR)/man
118 | 	@echo
119 | 	@echo "Build finished. The manual pages are in $(BUILDDIR)/man."
120 | 
121 | texinfo:
122 | 	$(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo
123 | 	@echo
124 | 	@echo "Build finished. The Texinfo files are in $(BUILDDIR)/texinfo."
125 | 	@echo "Run \`make' in that directory to run these through makeinfo" \
126 | 	      "(use \`make info' here to do that automatically)."
127 | 
128 | info:
129 | 	$(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo
130 | 	@echo "Running Texinfo files through makeinfo..."
131 | 	make -C $(BUILDDIR)/texinfo info
132 | 	@echo "makeinfo finished; the Info files are in $(BUILDDIR)/texinfo."
133 | 
134 | gettext:
135 | 	$(SPHINXBUILD) -b gettext $(I18NSPHINXOPTS) $(BUILDDIR)/locale
136 | 	@echo
137 | 	@echo "Build finished. The message catalogs are in $(BUILDDIR)/locale."
138 | 
139 | changes:
140 | 	$(SPHINXBUILD) -b changes $(ALLSPHINXOPTS) $(BUILDDIR)/changes
141 | 	@echo
142 | 	@echo "The overview file is in $(BUILDDIR)/changes."
143 | 
144 | linkcheck:
145 | 	$(SPHINXBUILD) -b linkcheck $(ALLSPHINXOPTS) $(BUILDDIR)/linkcheck
146 | 	@echo
147 | 	@echo "Link check complete; look for any errors in the above output " \
148 | 	      "or in $(BUILDDIR)/linkcheck/output.txt."
149 | 
150 | doctest:
151 | 	$(SPHINXBUILD) -b doctest $(ALLSPHINXOPTS) $(BUILDDIR)/doctest
152 | 	@echo "Testing of doctests in the sources finished, look at the " \
153 | 	      "results in $(BUILDDIR)/doctest/output.txt."
154 | 


--------------------------------------------------------------------------------
/docs/conf.py:
--------------------------------------------------------------------------------
  1 | #
  2 | # Histogrammar documentation build configuration file.
  3 | #
  4 | # This file is execfile()d with the current directory set to its containing dir.
  5 | #
  6 | # Note that not all possible configuration values are present in this
  7 | # autogenerated file.
  8 | #
  9 | # All configuration values have a default; values that are commented out
 10 | # serve to show the default.
 11 | 
 12 | import importlib
 13 | import inspect
 14 | 
 15 | # If extensions (or modules to document with autodoc) are in another directory,
 16 | # add these directories to sys.path here. If the directory is relative to the
 17 | # documentation root, use os.path.abspath to make it absolute, like shown here.
 18 | # sys.path.insert(0, os.path.abspath('.'))
 19 | 
 20 | # -- General configuration -----------------------------------------------------
 21 | 
 22 | # If your documentation needs a minimal Sphinx version, state it here.
 23 | # needs_sphinx = '1.0'
 24 | 
 25 | # Add any Sphinx extension module names here, as strings. They can be extensions
 26 | # coming with Sphinx (named 'sphinx.ext.*') or your custom ones.
 27 | extensions = [
 28 |     "sphinx.ext.autodoc",
 29 |     "sphinxcontrib.napoleon",
 30 |     "sphinx.ext.doctest",
 31 |     "sphinx.ext.intersphinx",
 32 |     "sphinx.ext.todo",
 33 |     "sphinx.ext.coverage",
 34 |     "sphinx.ext.mathjax",
 35 |     "sphinx.ext.ifconfig",
 36 |     "sphinx.ext.linkcode",
 37 | ]
 38 | 
 39 | # Add any paths that contain templates here, relative to this directory.
 40 | templates_path = ["_templates"]
 41 | 
 42 | # The suffix of source filenames.
 43 | source_suffix = ".rst"
 44 | 
 45 | # The encoding of source files.
 46 | # source_encoding = 'utf-8-sig'
 47 | 
 48 | # The master toctree document.
 49 | master_doc = "index"
 50 | 
 51 | # General information about the project.
 52 | project = "Histogrammar"
 53 | copyright = "2016, DIANA-HEP"
 54 | 
 55 | # The version info for the project you're documenting, acts as replacement for
 56 | # |version| and |release|, also used in various other places throughout the
 57 | # built documents.
 58 | #
 59 | # The short X.Y version.
 60 | version = "1.0.11"
 61 | # The full version, including alpha/beta/rc tags.
 62 | release = "1.0.11"
 63 | 
 64 | # The language for content autogenerated by Sphinx. Refer to documentation
 65 | # for a list of supported languages.
 66 | # language = None
 67 | 
 68 | # There are two options for replacing |today|: either, you set today to some
 69 | # non-false value, then it is used:
 70 | # today = ''
 71 | # Else, today_fmt is used as the format for a strftime call.
 72 | # today_fmt = '%B %d, %Y'
 73 | 
 74 | # List of patterns, relative to source directory, that match files and
 75 | # directories to ignore when looking for source files.
 76 | exclude_patterns = ["_build"]
 77 | 
 78 | # The reST default role (used for this markup: `text`) to use for all documents.
 79 | # default_role = None
 80 | 
 81 | # If true, '()' will be appended to :func: etc. cross-reference text.
 82 | # add_function_parentheses = True
 83 | 
 84 | # If true, the current module name will be prepended to all description
 85 | # unit titles (such as .. function::).
 86 | # add_module_names = True
 87 | 
 88 | # If true, sectionauthor and moduleauthor directives will be shown in the
 89 | # output. They are ignored by default.
 90 | # show_authors = False
 91 | 
 92 | # The name of the Pygments (syntax highlighting) style to use.
 93 | pygments_style = "sphinx"
 94 | 
 95 | # A list of ignored prefixes for module index sorting.
 96 | # modindex_common_prefix = []
 97 | 
 98 | 
 99 | # -- Options for HTML output ---------------------------------------------------
100 | 
101 | # The theme to use for HTML and HTML Help pages.  See the documentation for
102 | # a list of builtin themes.
103 | html_theme = "sphinxdoc"
104 | 
105 | # Theme options are theme-specific and customize the look and feel of a theme
106 | # further.  For a list of options available for each theme, see the
107 | # documentation.
108 | html_theme_options = {"nosidebar": True}
109 | 
110 | # Add any paths that contain custom themes here, relative to this directory.
111 | # html_theme_path = []
112 | 
113 | # The name for this set of Sphinx documents.  If None, it defaults to
114 | # "<project> v<release> documentation".
115 | # html_title = None
116 | 
117 | # A shorter title for the navigation bar.  Default is the same as html_title.
118 | # html_short_title = None
119 | 
120 | # The name of an image file (relative to this directory) to place at the top
121 | # of the sidebar.
122 | # html_logo = None
123 | 
124 | # The name of an image file (within the static path) to use as favicon of the
125 | # docs.  This file should be a Windows icon file (.ico) being 16x16 or 32x32
126 | # pixels large.
127 | # html_favicon = None
128 | 
129 | # Add any paths that contain custom static files (such as style sheets) here,
130 | # relative to this directory. They are copied after the builtin static files,
131 | # so a file named "default.css" will overwrite the builtin "default.css".
132 | html_static_path = ["static"]
133 | 
134 | # If not '', a 'Last updated on:' timestamp is inserted at every page bottom,
135 | # using the given strftime format.
136 | # html_last_updated_fmt = '%b %d, %Y'
137 | 
138 | # If true, SmartyPants will be used to convert quotes and dashes to
139 | # typographically correct entities.
140 | # html_use_smartypants = True
141 | 
142 | # Custom sidebar templates, maps document names to template names.
143 | # html_sidebars = {}
144 | 
145 | # Additional templates that should be rendered to pages, maps page names to
146 | # template names.
147 | # html_additional_pages = {}
148 | 
149 | # If false, no module index is generated.
150 | # html_domain_indices = True
151 | 
152 | # If false, no index is generated.
153 | # html_use_index = True
154 | 
155 | # If true, the index is split into individual pages for each letter.
156 | # html_split_index = False
157 | 
158 | # If true, links to the reST sources are added to the pages.
159 | # html_show_sourcelink = True
160 | 
161 | # If true, "Created using Sphinx" is shown in the HTML footer. Default is True.
162 | # html_show_sphinx = True
163 | 
164 | # If true, "(C) Copyright ..." is shown in the HTML footer. Default is True.
165 | # html_show_copyright = True
166 | 
167 | # If true, an OpenSearch description file will be output, and all pages will
168 | # contain a <link> tag referring to it.  The value of this option must be the
169 | # base URL from which the finished HTML is served.
170 | # html_use_opensearch = ''
171 | 
172 | # This is the file name suffix for HTML files (e.g. ".xhtml").
173 | # html_file_suffix = None
174 | 
175 | # Output file base name for HTML help builder.
176 | htmlhelp_basename = "Histogrammardoc"
177 | 
178 | 
179 | # -- Options for LaTeX output --------------------------------------------------
180 | 
181 | latex_elements = {
182 |     # # The paper size ('letterpaper' or 'a4paper').
183 |     # 'papersize': 'letterpaper',
184 |     # # The font size ('10pt', '11pt' or '12pt').
185 |     # 'pointsize': '10pt',
186 |     # # Additional stuff for the LaTeX preamble.
187 |     # 'preamble': '',
188 | }
189 | 
190 | # Grouping the document tree into LaTeX files. List of tuples
191 | # (source start file, target name, title, author, documentclass [howto/manual]).
192 | latex_documents = [
193 |     ("index", "Histogrammar.tex", "Histogrammar Documentation", "DIANA-HEP", "manual"),
194 | ]
195 | 
196 | # The name of an image file (relative to this directory) to place at the top of
197 | # the title page.
198 | # latex_logo = None
199 | 
200 | # For "manual" documents, if this is true, then toplevel headings are parts,
201 | # not chapters.
202 | # latex_use_parts = False
203 | 
204 | # If true, show page references after internal links.
205 | # latex_show_pagerefs = False
206 | 
207 | # If true, show URL addresses after external links.
208 | # latex_show_urls = False
209 | 
210 | # Documents to append as an appendix to all manuals.
211 | # latex_appendices = []
212 | 
213 | # If false, no module index is generated.
214 | # latex_domain_indices = True
215 | 
216 | 
217 | # -- Options for manual page output --------------------------------------------
218 | 
219 | # One entry per manual page. List of tuples
220 | # (source start file, name, description, authors, manual section).
221 | man_pages = [("index", "histogrammar", "Histogrammar Documentation", ["DIANA-HEP"], 1)]
222 | 
223 | # If true, show URL addresses after external links.
224 | # man_show_urls = False
225 | 
226 | 
227 | # -- Options for Texinfo output ------------------------------------------------
228 | 
229 | # Grouping the document tree into Texinfo files. List of tuples
230 | # (source start file, target name, title, author,
231 | #  dir menu entry, description, category)
232 | texinfo_documents = [
233 |     (
234 |         "index",
235 |         "Histogrammar",
236 |         "Histogrammar Documentation",
237 |         "DIANA-HEP",
238 |         "Histogrammar",
239 |         "One line description of project.",
240 |         "Miscellaneous",
241 |     ),
242 | ]
243 | 
244 | # Documents to append as an appendix to all manuals.
245 | # texinfo_appendices = []
246 | 
247 | # If false, no module index is generated.
248 | # texinfo_domain_indices = True
249 | 
250 | # How to display URL addresses: 'footnote', 'no', or 'inline'.
251 | # texinfo_show_urls = 'footnote'
252 | 
253 | 
254 | # -- Options for Epub output ---------------------------------------------------
255 | 
256 | # Bibliographic Dublin Core info.
257 | epub_title = "Histogrammar"
258 | epub_author = "DIANA-HEP"
259 | epub_publisher = "DIANA-HEP"
260 | epub_copyright = "2016, DIANA-HEP"
261 | 
262 | # The language of the text. It defaults to the language option
263 | # or en if the language is not set.
264 | # epub_language = ''
265 | 
266 | # The scheme of the identifier. Typical schemes are ISBN or URL.
267 | # epub_scheme = ''
268 | 
269 | # The unique identifier of the text. This can be a ISBN number
270 | # or the project homepage.
271 | # epub_identifier = ''
272 | 
273 | # A unique identification for the text.
274 | # epub_uid = ''
275 | 
276 | # A tuple containing the cover image and cover page html template filenames.
277 | # epub_cover = ()
278 | 
279 | # HTML files that should be inserted before the pages created by sphinx.
280 | # The format is a list of tuples containing the path and title.
281 | # epub_pre_files = []
282 | 
283 | # HTML files shat should be inserted after the pages created by sphinx.
284 | # The format is a list of tuples containing the path and title.
285 | # epub_post_files = []
286 | 
287 | # A list of files that should not be packed into the epub file.
288 | # epub_exclude_files = []
289 | 
290 | # The depth of the table of contents in toc.ncx.
291 | # epub_tocdepth = 3
292 | 
293 | # Allow duplicate toc entries.
294 | # epub_tocdup = True
295 | 
296 | 
297 | # Example configuration for intersphinx: refer to the Python standard library.
298 | intersphinx_mapping = {"http://docs.python.org/": None}
299 | 
300 | 
301 | def skip(app, what, name, obj, skip, options):
302 |     if name == "__init__":
303 |         return False
304 |     return skip
305 | 
306 | 
307 | def setup(app):
308 |     app.connect("autodoc-skip-member", skip)
309 | 
310 | 
311 | def linkcode_resolve(domain, info):
312 |     if domain != "py":
313 |         return None
314 |     if not info["module"]:
315 |         return None
316 |     fileName = info["module"].replace(".", "/")
317 | 
318 |     try:
319 |         lineNumber = inspect.getsourcelines(getattr(importlib.import_module(info["module"]), info["fullname"]))[1]
320 |     except:  # noqa: E722
321 |         return None
322 |     else:
323 |         return "https://github.com/histogrammar/histogrammar-python/blob/%s/%s.py#L%d" % (release, fileName, lineNumber)
324 | 


--------------------------------------------------------------------------------
/docs/index.rst:
--------------------------------------------------------------------------------
 1 | Histogrammar |version| for Python
 2 | =================================
 3 | 
 4 | All aggregation primitives descend from two classes, :doc:`Container <histogrammar.defs.Container>` and :doc:`Factory <histogrammar.defs.Factory>`. Container defines all the methods for the primitive to aggregate and contain data, while Factory has methods for making containers. (In other languages, the two roles are distinct.)
 5 | 
 6 | The "functions" passed to these primitives may be Python lambda functions, normally defined functions (with ``def``), or strings, which may be interpreted different ways by different back-ends. All primitives immediately wrap your functions as :doc:`UserFcn <histogrammar.util.UserFcn>`, which are serializable (with ``pickle``), may be cached (:doc:`CachedFcn <histogrammar.util.CachedFcn>`), and may have a name. Although the primitives wrap your function automatically, you may do it yourself to add features, like caching or a name. See :doc:`serializable <histogrammar.util.serializable>`, :doc:`cached <histogrammar.util.cached>`, and :doc:`named <histogrammar.util.named>`.
 7 |       
 8 | The primitive classes are listed below, grouped by kind. See the index for a list of all classes, members, and functions.
 9 | 
10 | Zeroth kind: depend only on weights
11 | -----------------------------------
12 | 
13 | :doc:`Count <histogrammar.primitives.count.Count>`: sum of weights
14 |     Count entries by accumulating the sum of all observed weights or a sum of transformed weights (e.g. sum of squares of weights).
15 | 
16 | First kind: aggregate a data without sub-aggregators
17 | ----------------------------------------------------
18 | 
19 | :doc:`Sum <histogrammar.primitives.sum.Sum>`: sum of a given quantity
20 |     Accumulate the (weighted) sum of a given quantity, calculated from the data.
21 | 
22 | :doc:`Average <histogrammar.primitives.average.Average>`: mean of a quantity
23 |     Accumulate the weighted mean of a given quantity.
24 | 
25 | :doc:`Deviate <histogrammar.primitives.deviate.Deviate>`: mean and variance
26 |     Accumulate the weighted mean and weighted variance of a given quantity.
27 | 
28 | :doc:`Minimize <histogrammar.primitives.minmax.Minimize>`: minimum value
29 |     Find the minimum value of a given quantity. If no data are observed, the result is NaN.
30 | 
31 | :doc:`Maximize <histogrammar.primitives.minmax.Maximize>`: maximum value
32 |     Find the maximum value of a given quantity. If no data are observed, the result is NaN.
33 | 
34 | :doc:`Bag <histogrammar.primitives.bag.Bag>`: accumulate values for scatter plots
35 |     Accumulate raw numbers, vectors of numbers, or strings, with identical values merged.
36 | 
37 | Second kind: pass to different sub-aggregators based on values seen in data
38 | ---------------------------------------------------------------------------
39 | 
40 | :doc:`Bin <histogrammar.primitives.bin.Bin>`: regular binning for histograms
41 |     Split a quantity into equally spaced bins between a low and high threshold and fill exactly one bin per datum.
42 | 
43 | :doc:`SparselyBin <histogrammar.primitives.sparselybin.SparselyBin>`: ignore zeros
44 |     Split a quantity into equally spaced bins, creating them whenever their entries would be non-zero. Exactly one sub-aggregator is filled per datum.
45 | 
46 | :doc:`CentrallyBin <histogrammar.primitives.centrallybin.CentrallyBin>`: irregular but fully partitioning
47 |     Split a quantity into bins defined by irregularly spaced bin centers, with exactly one sub-aggregator filled per datum (the closest one).
48 | 
49 | :doc:`IrregularlyBin <histogrammar.primitives.irregularlybin.IrregularlyBin>`: exclusive filling
50 |     Accumulate a suite of aggregators, each between two thresholds, filling exactly one per datum.
51 | 
52 | :doc:`Categorize <histogrammar.primitives.categorize.Categorize>`: string-valued bins, bar charts
53 |     Split a given quantity by its categorical value and fill only one category per datum.
54 | 
55 | :doc:`Fraction <histogrammar.primitives.fraction.Fraction>`: efficiency plots
56 |     Accumulate two aggregators, one containing only entries that pass a given selection (numerator) and another that contains all entries (denominator).
57 | 
58 | :doc:`Stack <histogrammar.primitives.stack.Stack>`: cumulative filling
59 |     Accumulates a suite of aggregators, each filtered with a tighter selection on the same quantity.
60 | 
61 | :doc:`Select <histogrammar.primitives.select.Select>`: apply a cut
62 |     Filter or weight data according to a given selection.
63 | 
64 | Third kind: broadcast to every sub-aggregator, independent of data
65 | ------------------------------------------------------------------
66 | 
67 | :doc:`Label <histogrammar.primitives.collection.Label>`: directory with string-based keys
68 |     Accumulate any number of aggregators of the same type and label them with strings. Every sub-aggregator is filled with every input datum.
69 | 
70 | :doc:`UntypedLabel <histogrammar.primitives.collection.UntypedLabel>`: directory of different types
71 |     Accumulate any number of aggregators of any type and label them with strings. Every sub-aggregator is filled with every input datum.
72 | 
73 | :doc:`Index <histogrammar.primitives.collection.Index>`: list with integer keys
74 |     Accumulate any number of aggregators of the same type in a list. Every sub-aggregator is filled with every input datum.
75 | 
76 | :doc:`Branch <histogrammar.primitives.collection.Branch>`: tuple of different types
77 |     Accumulate aggregators of different types, indexed by i0 through i9. Every sub-aggregator is filled with every input datum.
78 | 


--------------------------------------------------------------------------------
/docs/make.bat:
--------------------------------------------------------------------------------
  1 | @ECHO OFF
  2 | 
  3 | REM Command file for Sphinx documentation
  4 | 
  5 | if "%SPHINXBUILD%" == "" (
  6 | 	set SPHINXBUILD=sphinx-build
  7 | )
  8 | set BUILDDIR=_build
  9 | set ALLSPHINXOPTS=-d %BUILDDIR%/doctrees %SPHINXOPTS% .
 10 | set I18NSPHINXOPTS=%SPHINXOPTS% .
 11 | if NOT "%PAPER%" == "" (
 12 | 	set ALLSPHINXOPTS=-D latex_paper_size=%PAPER% %ALLSPHINXOPTS%
 13 | 	set I18NSPHINXOPTS=-D latex_paper_size=%PAPER% %I18NSPHINXOPTS%
 14 | )
 15 | 
 16 | if "%1" == "" goto help
 17 | 
 18 | if "%1" == "help" (
 19 | 	:help
 20 | 	echo.Please use `make ^<target^>` where ^<target^> is one of
 21 | 	echo.  html       to make standalone HTML files
 22 | 	echo.  dirhtml    to make HTML files named index.html in directories
 23 | 	echo.  singlehtml to make a single large HTML file
 24 | 	echo.  pickle     to make pickle files
 25 | 	echo.  json       to make JSON files
 26 | 	echo.  htmlhelp   to make HTML files and a HTML help project
 27 | 	echo.  qthelp     to make HTML files and a qthelp project
 28 | 	echo.  devhelp    to make HTML files and a Devhelp project
 29 | 	echo.  epub       to make an epub
 30 | 	echo.  latex      to make LaTeX files, you can set PAPER=a4 or PAPER=letter
 31 | 	echo.  text       to make text files
 32 | 	echo.  man        to make manual pages
 33 | 	echo.  texinfo    to make Texinfo files
 34 | 	echo.  gettext    to make PO message catalogs
 35 | 	echo.  changes    to make an overview over all changed/added/deprecated items
 36 | 	echo.  linkcheck  to check all external links for integrity
 37 | 	echo.  doctest    to run all doctests embedded in the documentation if enabled
 38 | 	goto end
 39 | )
 40 | 
 41 | if "%1" == "clean" (
 42 | 	for /d %%i in (%BUILDDIR%\*) do rmdir /q /s %%i
 43 | 	del /q /s %BUILDDIR%\*
 44 | 	goto end
 45 | )
 46 | 
 47 | if "%1" == "html" (
 48 | 	%SPHINXBUILD% -b html %ALLSPHINXOPTS% %BUILDDIR%/html
 49 | 	if errorlevel 1 exit /b 1
 50 | 	echo.
 51 | 	echo.Build finished. The HTML pages are in %BUILDDIR%/html.
 52 | 	goto end
 53 | )
 54 | 
 55 | if "%1" == "dirhtml" (
 56 | 	%SPHINXBUILD% -b dirhtml %ALLSPHINXOPTS% %BUILDDIR%/dirhtml
 57 | 	if errorlevel 1 exit /b 1
 58 | 	echo.
 59 | 	echo.Build finished. The HTML pages are in %BUILDDIR%/dirhtml.
 60 | 	goto end
 61 | )
 62 | 
 63 | if "%1" == "singlehtml" (
 64 | 	%SPHINXBUILD% -b singlehtml %ALLSPHINXOPTS% %BUILDDIR%/singlehtml
 65 | 	if errorlevel 1 exit /b 1
 66 | 	echo.
 67 | 	echo.Build finished. The HTML pages are in %BUILDDIR%/singlehtml.
 68 | 	goto end
 69 | )
 70 | 
 71 | if "%1" == "pickle" (
 72 | 	%SPHINXBUILD% -b pickle %ALLSPHINXOPTS% %BUILDDIR%/pickle
 73 | 	if errorlevel 1 exit /b 1
 74 | 	echo.
 75 | 	echo.Build finished; now you can process the pickle files.
 76 | 	goto end
 77 | )
 78 | 
 79 | if "%1" == "json" (
 80 | 	%SPHINXBUILD% -b json %ALLSPHINXOPTS% %BUILDDIR%/json
 81 | 	if errorlevel 1 exit /b 1
 82 | 	echo.
 83 | 	echo.Build finished; now you can process the JSON files.
 84 | 	goto end
 85 | )
 86 | 
 87 | if "%1" == "htmlhelp" (
 88 | 	%SPHINXBUILD% -b htmlhelp %ALLSPHINXOPTS% %BUILDDIR%/htmlhelp
 89 | 	if errorlevel 1 exit /b 1
 90 | 	echo.
 91 | 	echo.Build finished; now you can run HTML Help Workshop with the ^
 92 | .hhp project file in %BUILDDIR%/htmlhelp.
 93 | 	goto end
 94 | )
 95 | 
 96 | if "%1" == "qthelp" (
 97 | 	%SPHINXBUILD% -b qthelp %ALLSPHINXOPTS% %BUILDDIR%/qthelp
 98 | 	if errorlevel 1 exit /b 1
 99 | 	echo.
100 | 	echo.Build finished; now you can run "qcollectiongenerator" with the ^
101 | .qhcp project file in %BUILDDIR%/qthelp, like this:
102 | 	echo.^> qcollectiongenerator %BUILDDIR%\qthelp\Histogrammar.qhcp
103 | 	echo.To view the help file:
104 | 	echo.^> assistant -collectionFile %BUILDDIR%\qthelp\Histogrammar.ghc
105 | 	goto end
106 | )
107 | 
108 | if "%1" == "devhelp" (
109 | 	%SPHINXBUILD% -b devhelp %ALLSPHINXOPTS% %BUILDDIR%/devhelp
110 | 	if errorlevel 1 exit /b 1
111 | 	echo.
112 | 	echo.Build finished.
113 | 	goto end
114 | )
115 | 
116 | if "%1" == "epub" (
117 | 	%SPHINXBUILD% -b epub %ALLSPHINXOPTS% %BUILDDIR%/epub
118 | 	if errorlevel 1 exit /b 1
119 | 	echo.
120 | 	echo.Build finished. The epub file is in %BUILDDIR%/epub.
121 | 	goto end
122 | )
123 | 
124 | if "%1" == "latex" (
125 | 	%SPHINXBUILD% -b latex %ALLSPHINXOPTS% %BUILDDIR%/latex
126 | 	if errorlevel 1 exit /b 1
127 | 	echo.
128 | 	echo.Build finished; the LaTeX files are in %BUILDDIR%/latex.
129 | 	goto end
130 | )
131 | 
132 | if "%1" == "text" (
133 | 	%SPHINXBUILD% -b text %ALLSPHINXOPTS% %BUILDDIR%/text
134 | 	if errorlevel 1 exit /b 1
135 | 	echo.
136 | 	echo.Build finished. The text files are in %BUILDDIR%/text.
137 | 	goto end
138 | )
139 | 
140 | if "%1" == "man" (
141 | 	%SPHINXBUILD% -b man %ALLSPHINXOPTS% %BUILDDIR%/man
142 | 	if errorlevel 1 exit /b 1
143 | 	echo.
144 | 	echo.Build finished. The manual pages are in %BUILDDIR%/man.
145 | 	goto end
146 | )
147 | 
148 | if "%1" == "texinfo" (
149 | 	%SPHINXBUILD% -b texinfo %ALLSPHINXOPTS% %BUILDDIR%/texinfo
150 | 	if errorlevel 1 exit /b 1
151 | 	echo.
152 | 	echo.Build finished. The Texinfo files are in %BUILDDIR%/texinfo.
153 | 	goto end
154 | )
155 | 
156 | if "%1" == "gettext" (
157 | 	%SPHINXBUILD% -b gettext %I18NSPHINXOPTS% %BUILDDIR%/locale
158 | 	if errorlevel 1 exit /b 1
159 | 	echo.
160 | 	echo.Build finished. The message catalogs are in %BUILDDIR%/locale.
161 | 	goto end
162 | )
163 | 
164 | if "%1" == "changes" (
165 | 	%SPHINXBUILD% -b changes %ALLSPHINXOPTS% %BUILDDIR%/changes
166 | 	if errorlevel 1 exit /b 1
167 | 	echo.
168 | 	echo.The overview file is in %BUILDDIR%/changes.
169 | 	goto end
170 | )
171 | 
172 | if "%1" == "linkcheck" (
173 | 	%SPHINXBUILD% -b linkcheck %ALLSPHINXOPTS% %BUILDDIR%/linkcheck
174 | 	if errorlevel 1 exit /b 1
175 | 	echo.
176 | 	echo.Link check complete; look for any errors in the above output ^
177 | or in %BUILDDIR%/linkcheck/output.txt.
178 | 	goto end
179 | )
180 | 
181 | if "%1" == "doctest" (
182 | 	%SPHINXBUILD% -b doctest %ALLSPHINXOPTS% %BUILDDIR%/doctest
183 | 	if errorlevel 1 exit /b 1
184 | 	echo.
185 | 	echo.Testing of doctests in the sources finished, look at the ^
186 | results in %BUILDDIR%/doctest/output.txt.
187 | 	goto end
188 | )
189 | 
190 | :end
191 | 


--------------------------------------------------------------------------------
/histogrammar/__init__.py:
--------------------------------------------------------------------------------
 1 | # flake8: noqa
 2 | 
 3 | #!/usr/bin/env python
 4 | 
 5 | # Copyright 2016 DIANA-HEP
 6 | #
 7 | # Licensed under the Apache License, Version 2.0 (the "License");
 8 | # you may not use this file except in compliance with the License.
 9 | # You may obtain a copy of the License at
10 | #
11 | #     http://www.apache.org/licenses/LICENSE-2.0
12 | #
13 | # Unless required by applicable law or agreed to in writing, software
14 | # distributed under the License is distributed on an "AS IS" BASIS,
15 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16 | # See the License for the specific language governing permissions and
17 | # limitations under the License.
18 | 
19 | # handy monkey patch functions for pandas and spark dataframes
20 | import histogrammar.dfinterface
21 | from histogrammar.convenience import (
22 |     Histogram,
23 |     Profile,
24 |     ProfileErr,
25 |     SparselyHistogram,
26 |     SparselyProfile,
27 |     SparselyProfileErr,
28 |     TwoDimensionallyHistogram,
29 |     TwoDimensionallySparselyHistogram,
30 | )
31 | from histogrammar.defs import Container, Factory
32 | from histogrammar.primitives.average import Average
33 | from histogrammar.primitives.bag import Bag
34 | from histogrammar.primitives.bin import Bin
35 | from histogrammar.primitives.categorize import Categorize
36 | from histogrammar.primitives.centrallybin import CentrallyBin
37 | from histogrammar.primitives.collection import (
38 |     Branch,
39 |     Collection,
40 |     Index,
41 |     Label,
42 |     UntypedLabel,
43 | )
44 | from histogrammar.primitives.count import Count
45 | from histogrammar.primitives.deviate import Deviate
46 | from histogrammar.primitives.fraction import Fraction
47 | from histogrammar.primitives.irregularlybin import IrregularlyBin
48 | from histogrammar.primitives.minmax import Maximize, Minimize
49 | from histogrammar.primitives.select import Select
50 | from histogrammar.primitives.sparselybin import SparselyBin
51 | from histogrammar.primitives.stack import Stack
52 | from histogrammar.primitives.sum import Sum
53 | 


--------------------------------------------------------------------------------
/histogrammar/convenience.py:
--------------------------------------------------------------------------------
  1 | # Copyright (c) 2020 ING Wholesale Banking Advanced Analytics
  2 | #
  3 | # Permission is hereby granted, free of charge, to any person obtaining a copy of
  4 | # this software and associated documentation files (the "Software"), to deal in
  5 | # the Software without restriction, including without limitation the rights to
  6 | # use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of
  7 | # the Software, and to permit persons to whom the Software is furnished to do so,
  8 | # subject to the following conditions:
  9 | #
 10 | # The above copyright notice and this permission notice shall be included in all
 11 | # copies or substantial portions of the Software.
 12 | #
 13 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 14 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS
 15 | # FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
 16 | # COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER
 17 | # IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
 18 | # CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 19 | 
 20 | from histogrammar.defs import identity, unweighted
 21 | from histogrammar.primitives.average import Average
 22 | from histogrammar.primitives.bin import Bin
 23 | from histogrammar.primitives.categorize import Categorize
 24 | from histogrammar.primitives.count import Count
 25 | from histogrammar.primitives.deviate import Deviate
 26 | from histogrammar.primitives.select import Select
 27 | from histogrammar.primitives.sparselybin import SparselyBin
 28 | 
 29 | 
 30 | def Histogram(num, low, high, quantity=identity):
 31 |     """Create a conventional histogram that is capable of being filled and added.
 32 | 
 33 |     Parameters:
 34 |         num (int): the number of bins; must be at least one.
 35 |         low (float): the minimum-value edge of the first bin.
 36 |         high (float): the maximum-value edge of the last bin; must be strictly greater than `low`.
 37 |         quantity (function returning float or string): function that computes the quantity of interest from
 38 |             the data. pass on all values by default. If a string is given, quantity is set to identity(string),
 39 |             in which case that column is picked up from a pandas df.
 40 |     """
 41 |     return Bin.ing(num, low, high, quantity, Count.ing(), Count.ing(), Count.ing(), Count.ing())
 42 | 
 43 | 
 44 | def HistogramCut(num, low, high, quantity=identity, selection=unweighted):
 45 |     """Create a conventional histogram that is capable of being filled and added, with a selection cut.
 46 | 
 47 |     Parameters:
 48 |         num (int): the number of bins; must be at least one.
 49 |         low (float): the minimum-value edge of the first bin.
 50 |         high (float): the maximum-value edge of the last bin; must be strictly greater than `low`.
 51 |         quantity (function returning float or string): function that computes the quantity of interest from
 52 |             the data. pass on all values by default. If a string is given, quantity is set to identity(string),
 53 |             in which case that column is picked up from a pandas df.
 54 |         selection (function returning boolean): function that computes if data point is accepted or not.
 55 |             default is: lamba x: True
 56 |     """
 57 |     return Select.ing(
 58 |         selection,
 59 |         Bin.ing(num, low, high, quantity, Count.ing(), Count.ing(), Count.ing(), Count.ing()),
 60 |     )
 61 | 
 62 | 
 63 | def SparselyHistogram(binWidth, quantity=identity, origin=0.0):
 64 |     """Create a sparsely binned histogram that is only capable of being added.
 65 | 
 66 |     Parameters:
 67 |         binWidth (float): the width of a bin.
 68 |         quantity (function returning float or string): function that computes the quantity of interest from
 69 |             the data. pass on all values by default. If a string is given, quantity is set to identity(string),
 70 |             in which case that column is picked up from a pandas df.
 71 |         origin (float): the left edge of the bin whose index is zero.
 72 |     """
 73 |     return SparselyBin.ing(binWidth, quantity, Count.ing(), Count.ing(), origin)
 74 | 
 75 | 
 76 | def CategorizeHistogram(quantity=identity):
 77 |     """Create a Categorize histogram for categorical features such as strings and booleans
 78 | 
 79 |     Parameters:
 80 |         quantity (function returning float or string): function that computes the quantity of interest from
 81 |             the data. pass on all values by default. If a string is given, quantity is set to identity(string),
 82 |             in which case that column is picked up from a pandas df.
 83 |     """
 84 |     return Categorize.ing(quantity, Count.ing())
 85 | 
 86 | 
 87 | def Profile(num, low, high, binnedQuantity, averagedQuantity):
 88 |     """Convenience function for creating binwise averages."""
 89 |     return Bin.ing(num, low, high, binnedQuantity, Average.ing(averagedQuantity))
 90 | 
 91 | 
 92 | def SparselyProfile(binWidth, binnedQuantity, averagedQuantity, origin=0.0):
 93 |     """Convenience function for creating sparsely binned binwise averages."""
 94 |     return SparselyBin.ing(binWidth, binnedQuantity, Average.ing(averagedQuantity), Count.ing(), origin)
 95 | 
 96 | 
 97 | def ProfileErr(num, low, high, binnedQuantity, averagedQuantity):
 98 |     """Convenience function for creating a profile plot
 99 | 
100 |     This is a Profile with variances.
101 |     """
102 |     return Bin.ing(num, low, high, binnedQuantity, Deviate.ing(averagedQuantity))
103 | 
104 | 
105 | def SparselyProfileErr(binWidth, binnedQuantity, averagedQuantity, origin=0.0):
106 |     """Convenience function for creating a sparsely binned profile plot
107 | 
108 |     This is a Profile with variances.
109 |     """
110 |     return SparselyBin.ing(binWidth, binnedQuantity, Deviate.ing(averagedQuantity), Count.ing(), origin)
111 | 
112 | 
113 | def TwoDimensionallyHistogram(xnum, xlow, xhigh, xquantity, ynum, ylow, yhigh, yquantity):
114 |     """Convenience function for creating a conventional, two-dimensional histogram."""
115 |     return Bin.ing(xnum, xlow, xhigh, xquantity, Bin.ing(ynum, ylow, yhigh, yquantity))
116 | 
117 | 
118 | def TwoDimensionallySparselyHistogram(xbinWidth, xquantity, ybinWidth, yquantity, xorigin=0.0, yorigin=0.0):
119 |     """Convenience function for creating a sparsely binned, two-dimensional histogram."""
120 |     return SparselyBin.ing(
121 |         xbinWidth,
122 |         xquantity,
123 |         SparselyBin.ing(ybinWidth, yquantity, Count.ing(), Count.ing(), yorigin),
124 |         Count.ing(),
125 |         xorigin,
126 |     )
127 | 


--------------------------------------------------------------------------------
/histogrammar/dfinterface/__init__.py:
--------------------------------------------------------------------------------
 1 | # Copyright (c) 2021 ING Wholesale Banking Advanced Analytics
 2 | #
 3 | # Permission is hereby granted, free of charge, to any person obtaining a copy of
 4 | # this software and associated documentation files (the "Software"), to deal in
 5 | # the Software without restriction, including without limitation the rights to
 6 | # use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of
 7 | # the Software, and to permit persons to whom the Software is furnished to do so,
 8 | # subject to the following conditions:
 9 | #
10 | # The above copyright notice and this permission notice shall be included in all
11 | # copies or substantial portions of the Software.
12 | #
13 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
14 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS
15 | # FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
16 | # COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER
17 | # IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
18 | # CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
19 | 
20 | from .addmethods import add_pandas_methods, add_sparksql_methods
21 | 
22 | try:
23 |     from pyspark.sql import DataFrame as sdf
24 | 
25 |     # add function to create histogrammar histograms
26 |     add_sparksql_methods(cls=sdf, prefix="hg_")
27 | except (ModuleNotFoundError, AttributeError):
28 |     pass
29 | 
30 | try:
31 |     from pandas import DataFrame as pdf
32 | 
33 |     # add function to create histogrammar histograms
34 |     add_pandas_methods(cls=pdf, prefix="hg_")
35 | except (ModuleNotFoundError, AttributeError):
36 |     pass
37 | 


--------------------------------------------------------------------------------
/histogrammar/dfinterface/filling_utils.py:
--------------------------------------------------------------------------------
  1 | # Copyright (c) 2020 ING Wholesale Banking Advanced Analytics
  2 | #
  3 | # Permission is hereby granted, free of charge, to any person obtaining a copy of
  4 | # this software and associated documentation files (the "Software"), to deal in
  5 | # the Software without restriction, including without limitation the rights to
  6 | # use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of
  7 | # the Software, and to permit persons to whom the Software is furnished to do so,
  8 | # subject to the following conditions:
  9 | #
 10 | # The above copyright notice and this permission notice shall be included in all
 11 | # copies or substantial portions of the Software.
 12 | #
 13 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 14 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS
 15 | # FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
 16 | # COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER
 17 | # IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
 18 | # CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 19 | 
 20 | 
 21 | import numpy as np
 22 | import pandas as pd
 23 | 
 24 | NUM_NS_DAY = 24 * 3600 * int(1e9)
 25 | 
 26 | 
 27 | def check_column(col, sep=":"):
 28 |     """Convert input column string to list of columns
 29 | 
 30 |     :param col: input string
 31 |     :param sep: default ":"
 32 |     :return: list of columns
 33 |     """
 34 |     if isinstance(col, str):
 35 |         col = col.split(sep)
 36 |     elif not isinstance(col, list):
 37 |         raise TypeError(f'Columns "{col}" needs to be a string or list of strings')
 38 |     return col
 39 | 
 40 | 
 41 | def normalize_dtype(dtype):
 42 |     """Convert datatype to consistent numpy datatype
 43 | 
 44 |     :param dtype: input datatype
 45 |     :rtype: numpy.dtype.type
 46 |     """
 47 |     try:
 48 |         if hasattr(dtype, "type"):
 49 |             # this converts pandas types, such as pd.Int64, into numpy types
 50 |             dtype = type(dtype.type())
 51 |         dtype = np.dtype(dtype).type
 52 |         if dtype in {np.str_, np.bytes_}:
 53 |             dtype = np.dtype(str).type
 54 |         # MB 20210404: nb.object_ is kept an object -> uses to_string(). str uses only_str()
 55 |     except BaseException:
 56 |         raise RuntimeError(f'unknown assigned datatype "{dtype}"')
 57 |     return dtype
 58 | 
 59 | 
 60 | def to_ns(x):
 61 |     """Convert input timestamps to nanoseconds (integers).
 62 | 
 63 |     :param x: value to be converted
 64 |     :returns: converted value
 65 |     :rtype: int
 66 |     """
 67 |     if pd.isnull(x):
 68 |         return 0
 69 |     try:
 70 |         return pd.to_datetime(x).value
 71 |     except Exception:
 72 |         if hasattr(x, "__str__"):
 73 |             return pd.to_datetime(str(x)).value
 74 |     return 0
 75 | 
 76 | 
 77 | def to_str(val):
 78 |     """Convert input to (array of) string(s).
 79 | 
 80 |     :param val: value to be converted
 81 |     :returns: converted value
 82 |     :rtype: str or np.ndarray
 83 |     """
 84 |     if isinstance(val, str):
 85 |         return val
 86 |     if isinstance(val, pd.Series):
 87 |         # Note: at this point, data type of pd.series has already been inferred as being of type object (mixed)
 88 |         return val.astype(str).values
 89 |     if hasattr(val, "__iter__"):
 90 |         return np.asarray([(s if isinstance(s, str) else str(s) if hasattr(s, "__str__") else "") for s in val])
 91 |     if hasattr(val, "__str__"):
 92 |         return str(val)
 93 |     return "None"
 94 | 
 95 | 
 96 | def only_str(val):
 97 |     """Pass input value or array only if it is a string.
 98 | 
 99 |     :param val: value to be evaluated
100 |     :returns: evaluated value
101 |     :rtype: str or np.ndarray
102 |     """
103 |     if isinstance(val, str):
104 |         return val
105 |     if isinstance(val, pd.Series):
106 |         # at this point, data type of pd.series has already been inferred as *to be* 'string'
107 |         dtype = np.dtype(val.dtype).type
108 |         return val.values if dtype in [str, np.str_, np.bytes_] else val.astype(str).values
109 |     if hasattr(val, "__iter__"):
110 |         return np.asarray([s if isinstance(s, str) else "None" for s in val])
111 |     return "None"
112 | 
113 | 
114 | def only_bool(val):
115 |     """Pass input value or array only if it is a bool.
116 | 
117 |     :param val: value to be evaluated
118 |     :returns: evaluated value
119 |     :rtype: np.bool or np.ndarray
120 |     """
121 |     if isinstance(val, (np.bool_, bool)):
122 |         return val
123 |     if isinstance(val, pd.Series) and val.dtype in [np.bool_, bool]:
124 |         return val.values
125 |     if hasattr(val, "__iter__") and not isinstance(val, str):
126 |         return np.asarray([s if isinstance(s, (np.bool_, bool)) else "NaN" for s in val])
127 |     return "NaN"
128 | 
129 | 
130 | def only_int(val):
131 |     """Pass input val value or array only if it is an integer.
132 | 
133 |     :param val: value to be evaluated
134 |     :returns: evaluated value
135 |     :rtype: np.int64 or np.ndarray
136 |     """
137 |     if isinstance(val, (np.int64, int)):
138 |         return val
139 |     if isinstance(val, pd.Series) and val.dtype in [np.int64, int]:
140 |         return val.values
141 |     if hasattr(val, "__iter__") and not isinstance(val, str):
142 |         return np.asarray([s if isinstance(s, (np.int64, int)) else np.nan for s in val])
143 |     return np.nan
144 | 
145 | 
146 | def only_float(val):
147 |     """Pass input val value or array only if it is a float.
148 | 
149 |     :param val: value to be evaluated
150 |     :returns: evaluated value
151 |     :rtype: np.float64 or np.ndarray
152 |     """
153 |     if isinstance(val, (np.float64, float)):
154 |         return val
155 |     if isinstance(val, pd.Series) and val.dtype in [np.float64, float]:
156 |         return val.values
157 |     if hasattr(val, "__iter__") and not isinstance(val, str):
158 |         return np.asarray([s if isinstance(s, (np.float64, float)) else np.nan for s in val])
159 |     return np.nan
160 | 
161 | 
162 | QUANTITY = {
163 |     # MB 20210404: to_string for object types b/c it's a mixed type
164 |     object: to_str,
165 |     np.object_: to_str,
166 |     str: only_str,
167 |     np.str_: only_str,
168 |     int: only_int,
169 |     np.int64: only_int,
170 |     np.int32: only_int,
171 |     bool: only_bool,
172 |     np.bool_: only_bool,
173 |     float: only_float,
174 |     np.float64: only_float,
175 |     np.datetime64: only_int,
176 | }
177 | 
178 | 
179 | def value_to_bin_index(val, **kwargs):
180 |     """Convert value to bin index.
181 | 
182 |     Convert a numeric or timestamp column to an integer bin index.
183 | 
184 |     :param binWidth: bin width value needed to convert column
185 |         to an integer bin index
186 |     :param origin: bin offset value needed to convert column
187 |         to an integer bin index
188 |     """
189 |     try:
190 |         # NOTE this notation also works for timestamps
191 |         bin_width = kwargs.get("binWidth", kwargs.get("bin_width", 1))
192 |         bin_offset = kwargs.get("origin", kwargs.get("bin_offset", 0))
193 |         return int(np.floor((val - bin_offset) / bin_width))
194 |     except BaseException:
195 |         pass
196 |     return val
197 | 
198 | 
199 | def value_to_bin_center(val, **kwargs):
200 |     """Convert value to bin center.
201 | 
202 |     Convert a numeric or timestamp column to a common bin center value.
203 | 
204 |     :param binWidth: bin width value needed to convert column
205 |         to a common bin center value
206 |     :param origin: bin_offset value needed to convert column
207 |         to a common bin center value
208 |     """
209 |     try:
210 |         # NOTE this notation also works for timestamps, and does not change the
211 |         # unit
212 |         bin_width = kwargs.get("binWidth", kwargs.get("bin_width", 1))
213 |         bin_offset = kwargs.get("origin", kwargs.get("bin_offset", 0))
214 |         bin_index = int(np.floor((val - bin_offset) / bin_width))
215 |         obj_type = type(bin_width)
216 |         return bin_offset + obj_type((bin_index + 0.5) * bin_width)
217 |     except BaseException:
218 |         pass
219 |     return val
220 | 


--------------------------------------------------------------------------------
/histogrammar/dfinterface/pandas_histogrammar.py:
--------------------------------------------------------------------------------
  1 | """Copyright Eskapade:
  2 | License Apache-2: https://github.com/KaveIO/Eskapade-Core/blob/master/LICENSE
  3 | Reference link:
  4 | https://github.com/KaveIO/Eskapade/blob/master/python/eskapade/analysis/links/hist_filler.py
  5 | All modifications copyright ING WBAA.
  6 | """
  7 | 
  8 | import numpy as np
  9 | import pandas as pd
 10 | from pandas.api.types import infer_dtype
 11 | from tqdm import tqdm
 12 | 
 13 | import histogrammar as hg
 14 | 
 15 | from .filling_utils import QUANTITY, to_ns
 16 | from .histogram_filler_base import HistogramFillerBase
 17 | 
 18 | 
 19 | class PandasHistogrammar(HistogramFillerBase):
 20 |     """Fill histogrammar histograms.
 21 | 
 22 |     Algorithm to fill histogrammar style bin, sparse-bin and category histograms.
 23 |     Timestamp features are converted to nanoseconds before
 24 |     the binning is applied. Final histograms are stored in the datastore.
 25 |     """
 26 | 
 27 |     def __init__(
 28 |         self,
 29 |         features=None,
 30 |         binning="unit",
 31 |         bin_specs=None,
 32 |         time_axis="",
 33 |         var_dtype=None,
 34 |         read_key=None,
 35 |         store_key=None,
 36 |         nbins_1d=40,
 37 |         nbins_2d=20,
 38 |         nbins_3d=10,
 39 |         max_nunique=500,
 40 |     ):
 41 |         """Initialize module instance.
 42 | 
 43 |         Store and do basic check on the attributes HistogramFillerBase.
 44 | 
 45 |         :param list features: columns to pick up from input data. (default is all features)
 46 |             For multi-dimensional histograms, separate the column names with a :
 47 | 
 48 |             Example features list is:
 49 | 
 50 |             .. code-block:: python
 51 | 
 52 |                 features = ['x', 'date', 'date:x', 'date:y', 'date:x:y']
 53 | 
 54 |         :param str binning: default binning to revert to in case bin_specs not supplied. options are:
 55 |             "unit" or "auto", default is "unit". When using "auto", semi-clever binning is automatically done.
 56 |         :param dict bin_specs: dictionaries used for rebinning numeric or timestamp features
 57 | 
 58 |             Example bin_specs dictionary is:
 59 | 
 60 |             .. code-block:: python
 61 | 
 62 |                 bin_specs = {'x': {'binWidth': 1, 'origin': 0},
 63 |                              'y': {'num': 10, 'low': 0.0, 'high': 2.0},
 64 |                              'x:y': [{}, {'num': 5, 'low': 0.0, 'high': 1.0}],
 65 |                              'a': {'edges': [0, 2, 10, 11, 21, 101]},
 66 |                              'b': {'centers': [1, 6, 10.5, 16, 20, 100]},
 67 |                              'c': {'max': True},
 68 |                              'd': {'min': True},
 69 |                              'e': {'sum': True},
 70 |                              'f': {'average': True},
 71 |                              'a:f': [{'edges': [0, 10, 101]}, {'average': True}],
 72 |                              'g': {'thresholds': [0, 2, 10, 11, 21, 101]},
 73 |                              'h': {'bag': True},
 74 |                              }
 75 | 
 76 |             In the bin specs for x:y, x reverts to the 1-dim setting.
 77 | 
 78 |         :param str time_axis: name of datetime feature, used as time axis, eg 'date'. if True, will be guessed.
 79 |         :param dict var_dtype: dictionary with specified datatype per feature (optional)
 80 |         :param str read_key: key of input histogram-dict to read from data store .
 81 |             (only required when calling transform(datastore) as module)
 82 |         :param str store_key: key of output data to store in data store
 83 |             (only required when calling transform(datastore) as module)
 84 |         :param int nbins_1d: auto-binning number of bins for 1d histograms. default is 40.
 85 |         :param int nbins_2d: auto-binning number of bins for 2d histograms. default is 20.
 86 |         :param int nbins_3d: auto-binning number of bins for 3d histograms. default is 10.
 87 |         :param int max_nunique: auto-binning threshold for unique categorical values. default is 500.
 88 |         """
 89 |         HistogramFillerBase.__init__(
 90 |             self,
 91 |             features,
 92 |             binning,
 93 |             bin_specs,
 94 |             time_axis,
 95 |             var_dtype,
 96 |             read_key,
 97 |             store_key,
 98 |             nbins_1d,
 99 |             nbins_2d,
100 |             nbins_3d,
101 |             max_nunique,
102 |         )
103 | 
104 |     def assert_dataframe(self, df):
105 |         """Check that input data is a filled pandas data frame.
106 | 
107 |         :param df: input (pandas) data frame
108 |         """
109 |         if not isinstance(df, pd.DataFrame):
110 |             raise TypeError(f"retrieved object not of type {pd.DataFrame}")
111 |         if df.shape[0] == 0:
112 |             raise RuntimeError("data is empty")
113 |         return df
114 | 
115 |     def get_features(self, df):
116 |         """Get columns of (pandas) dataframe
117 | 
118 |         :param df: input pandas dataframe
119 |         """
120 |         return df.columns.tolist()
121 | 
122 |     def get_data_type(self, df, col):
123 |         """Get data type of dataframe column.
124 | 
125 |         :param df: input data frame
126 |         :param str col: column
127 |         """
128 |         if col not in df.columns:
129 |             raise KeyError(f'column "{col:s}" not in input dataframe')
130 | 
131 |         inferred = infer_dtype(df[col], skipna=True)
132 |         if inferred in "string":
133 |             data_type = "str"
134 |         elif inferred == "integer":
135 |             data_type = "int"
136 |         elif inferred == "boolean":
137 |             data_type = "bool"
138 |         elif inferred in {"decimal", "floating", "mixed-integer-float"}:
139 |             # decimal needs preprocessing (cast), signal this in metadata
140 |             data_type = np.dtype("float", metadata={"decimal": True}) if inferred == "decimal" else "float"
141 |         elif inferred in {"date", "datetime", "datetime64"}:
142 |             data_type = "datetime64"
143 |         else:  # categorical, mixed, etc -> object uses to_string()
144 |             data_type = np.object_
145 | 
146 |         return data_type
147 | 
148 |     def get_quantiles(self, df, quantiles=[0.05, 0.95], columns=[]):
149 |         """return dict with quantiles for given columns
150 | 
151 |         :param df: input pandas data frame
152 |         :param quantiles: list of quantiles. default is [0.05, 0.95]
153 |         :param columns: columns to select. default is all.
154 |         """
155 |         if len(columns) == 0:
156 |             return {}
157 |         qdf = df[columns].quantile(quantiles)
158 |         return {c: qdf[c].values.tolist() for c in columns}
159 | 
160 |     def get_nunique(self, df, columns=[]):
161 |         """return dict with number of unique entries for given columns
162 | 
163 |         :param df: input pandas data frame
164 |         :param columns: columns to select (optional)
165 |         """
166 |         if not columns:
167 |             columns = df.columns
168 |         return df[columns].nunique().to_dict()
169 | 
170 |     def process_features(self, df, cols_by_type):
171 |         """Process features before histogram filling.
172 | 
173 |         Specifically, convert timestamp features to integers
174 | 
175 |         :param df: input (pandas) data frame
176 |         :param cols_by_type: dictionary of column sets for each type
177 |         :returns: output (pandas) data frame with converted timestamp features
178 |         :rtype: pandas DataFrame
179 |         """
180 |         # timestamp variables are converted to ns here
181 |         # make temp df for value counting (used below)
182 |         idf = df[list(cols_by_type["num"]) + list(cols_by_type["str"]) + list(cols_by_type["bool"])].copy()
183 |         for col in cols_by_type["dt"]:
184 |             self.logger.debug(f'Converting column "{col}" of type "{self.var_dtype[col]}" to nanosec.')
185 |             idf[col] = df[col].apply(to_ns)
186 | 
187 |         # treat decimal as float, as decimal is not supported by .quantile
188 |         # (https://github.com/pandas-dev/pandas/issues/13157)
189 |         for col in cols_by_type["decimal"]:
190 |             idf[col] = df[col].apply(float)
191 | 
192 |         return idf
193 | 
194 |     def fill_histograms(self, idf):
195 |         """Fill the histograms
196 | 
197 |         :param idf: converted input dataframe
198 |         """
199 |         # construct empty histograms if needed
200 |         for cols in self.features:
201 |             name = ":".join(cols)
202 |             if name not in self._hists:
203 |                 # create an (empty) histogram of right type
204 |                 self._hists[name] = self.construct_empty_hist(cols)
205 | 
206 |         # histogram filling with working progress bar
207 |         res = [
208 |             _fill_histogram(idf=idf[c], hist=self._hists[":".join(c)], features=c)
209 |             for c in tqdm(self.features, total=len(self.features), ncols=100)
210 |         ]
211 | 
212 |         # update dictionary
213 |         for name, hist in res:
214 |             self._hists[name] = hist
215 | 
216 |     def construct_empty_hist(self, features):
217 |         """Create an (empty) histogram of right type.
218 | 
219 |         Create a multi-dim histogram by iterating through the features in
220 |         reverse order and passing a single-dim hist as input to the next
221 |         column.
222 | 
223 |         :param list features: histogram features
224 |         :return: created histogram
225 |         :rtype: histogrammar.Count
226 |         """
227 |         hist = hg.Count()
228 | 
229 |         # create a multi-dim histogram by iterating through the features
230 |         # in reverse order and passing a single-dim hist as input
231 |         # to the next column
232 |         revcols = list(reversed(features))
233 |         for idx, col in enumerate(revcols):
234 |             # histogram type depends on the data type
235 |             dt = self.var_dtype[col]
236 | 
237 |             # processing function, e.g. only accept booleans during filling
238 |             f = QUANTITY[dt]
239 |             # if len(features) == 1: df[col] is a pd.series
240 |             # else: df[features] is a pd.Dataframe, so fix column to col
241 |             quant = (lambda x, fnc=f: fnc(x)) if len(features) == 1 else (lambda x, fnc=f, clm=col: fnc(x[clm]))
242 |             hist = self.get_hist_bin(hist, features, quant, col, dt)
243 | 
244 |         return hist
245 | 
246 | 
247 | def _fill_histogram(idf, hist, features):
248 |     """Fill input histogram with column(s) of input dataframe.
249 | 
250 |     Separate function call for parallellization.
251 | 
252 |     :param idf: input data frame used for filling histogram
253 |     :param hist: empty histogrammar histogram about to be filled
254 |     :param list features: histogram column(s)
255 |     """
256 |     name = ":".join(features)
257 |     clm = features[0] if len(features) == 1 else features
258 |     # do the actual filling
259 |     hist.fill.numpy(idf[clm])
260 |     return name, hist
261 | 


--------------------------------------------------------------------------------
/histogrammar/dfinterface/spark_histogrammar.py:
--------------------------------------------------------------------------------
  1 | """Copyright Eskapade:
  2 | License Apache-2: https://github.com/KaveIO/Eskapade-Core/blob/master/LICENSE
  3 | Reference link:
  4 | https://github.com/KaveIO/Eskapade-Spark/blob/master/python/eskapadespark/links/spark_histogrammar_filler.py
  5 | All modifications copyright ING WBAA.
  6 | """
  7 | 
  8 | import numpy as np
  9 | from tqdm import tqdm
 10 | 
 11 | import histogrammar as hg
 12 | 
 13 | from .histogram_filler_base import HistogramFillerBase
 14 | 
 15 | try:
 16 |     from pyspark.sql import DataFrame
 17 |     from pyspark.sql import functions as f
 18 |     from pyspark.sql.functions import approxCountDistinct
 19 | except (ModuleNotFoundError, AttributeError):
 20 |     pass
 21 | 
 22 | 
 23 | class SparkHistogrammar(HistogramFillerBase):
 24 |     """Fill histogrammar histograms with Spark.
 25 | 
 26 |     Algorithm to fill histogrammar style bin, sparse-bin and category histograms
 27 |     with Spark. Timestamp features are converted to nanoseconds before the binning
 28 |     is applied. Final histograms are stored in the datastore.
 29 |     """
 30 | 
 31 |     def __init__(
 32 |         self,
 33 |         features=None,
 34 |         binning="unit",
 35 |         bin_specs=None,
 36 |         time_axis="",
 37 |         var_dtype=None,
 38 |         read_key=None,
 39 |         store_key=None,
 40 |         nbins_1d=40,
 41 |         nbins_2d=20,
 42 |         nbins_3d=10,
 43 |         max_nunique=500,
 44 |     ):
 45 |         """Initialize module instance.
 46 | 
 47 |         Store and do basic check on the attributes HistogramFillerBase.
 48 | 
 49 |         :param list features: colums to pick up from input data. (default is all features)
 50 |             For multi-dimensional histograms, separate the column names with a :
 51 | 
 52 |             Example features list is:
 53 | 
 54 |             .. code-block:: python
 55 | 
 56 |                 features = ['x', 'date', 'date:x', 'date:y', 'date:x:y']
 57 | 
 58 |         :param str binning: default binning to revert to in case bin_specs not supplied. options are:
 59 |             "unit" or "auto", default is "unit". When using "auto", semi-clever binning is automatically done.
 60 |         :param dict bin_specs: dictionaries used for rebinning numeric or timestamp features
 61 | 
 62 |             Example bin_specs dictionary is:
 63 | 
 64 |             .. code-block:: python
 65 | 
 66 |                 bin_specs = {'x': {'binWidth': 1, 'origin': 0},
 67 |                              'y': {'num': 10, 'low': 0.0, 'high': 2.0},
 68 |                              'x:y': [{}, {'num': 5, 'low': 0.0, 'high': 1.0}],
 69 |                              'a': {'edges': [0, 2, 10, 11, 21, 101]},
 70 |                              'b': {'centers': [1, 6, 10.5, 16, 20, 100]},
 71 |                              'c': {'max': True},
 72 |                              'd': {'min': True},
 73 |                              'e': {'sum': True},
 74 |                              'f': {'average': True},
 75 |                              'a:f': [{'edges': [0, 10, 101]}, {'average': True}],
 76 |                              'g': {'thresholds': [0, 2, 10, 11, 21, 101]},
 77 |                              'h': {'bag': True},
 78 |                              }
 79 | 
 80 |             In the bin specs for x:y, x reverts to the 1-dim setting.
 81 | 
 82 |         :param str time_axis: name of datetime feature, used as time axis, eg 'date'. if True, will be guessed.
 83 |             If time_axis is set, if no features given, features becomes: ['date:x', 'date:y', 'date:z'] etc.
 84 |         :param dict var_dtype: dictionary with specified datatype per feature (optional)
 85 |         :param str read_key: key of input histogram-dict to read from data store .
 86 |             (only required when calling transform(datastore) as module)
 87 |         :param str store_key: key of output data to store in data store
 88 |             (only required when calling transform(datastore) as module)
 89 |         :param int nbins_1d: auto-binning number of bins for 1d histograms. default is 40.
 90 |         :param int nbins_2d: auto-binning number of bins for 2d histograms. default is 20.
 91 |         :param int nbins_3d: auto-binning number of bins for 3d histograms. default is 10.
 92 |         :param int max_nunique: auto-binning threshold for unique categorical values. default is 500.
 93 |         """
 94 |         HistogramFillerBase.__init__(
 95 |             self,
 96 |             features,
 97 |             binning,
 98 |             bin_specs,
 99 |             time_axis,
100 |             var_dtype,
101 |             read_key,
102 |             store_key,
103 |             nbins_1d,
104 |             nbins_2d,
105 |             nbins_3d,
106 |             max_nunique,
107 |         )
108 |         self._unit_timestamp_specs = {
109 |             k: float(self._unit_timestamp_specs[k]) for i, k in enumerate(self._unit_timestamp_specs)
110 |         }
111 | 
112 |     def assert_dataframe(self, df):
113 |         """Check that input data is a filled spark data frame.
114 | 
115 |         :param df: input (spark) data frame
116 |         """
117 |         if not isinstance(df, DataFrame):
118 |             raise TypeError("retrieved object not of type Spark DataFrame")
119 |         assert len(df.head(1)) != 0, "input dataframe is empty"
120 |         return df
121 | 
122 |     def get_features(self, df):
123 |         """Get columns of dataframe
124 | 
125 |         :param df: input spark dataframe
126 |         """
127 |         return df.columns
128 | 
129 |     def get_quantiles(self, df, quantiles=[0.05, 0.95], columns=[]):
130 |         """return dict with quantiles for given columns
131 | 
132 |         :param df: input (spark) data frame
133 |         :param quantiles: list of quantiles. default is [0.05, 0.95]
134 |         :param columns: columns to select. default is all.
135 |         """
136 |         if len(columns) == 0:
137 |             return {}
138 |         qsl = df.approxQuantile(columns, quantiles, 0.25)
139 |         return dict(zip(columns, qsl))
140 | 
141 |     def get_nunique(self, df, columns=[]):
142 |         """return dict with number of unique entries for given columns
143 | 
144 |         :param df: input (spark) data frame
145 |         :param columns: columns to select (optional)
146 |         """
147 |         if not columns:
148 |             columns = df.columns
149 |         qdf = df.agg(*(approxCountDistinct(f.col(c)).alias(c) for c in columns))
150 |         return qdf.toPandas().T[0].to_dict()
151 | 
152 |     def get_data_type(self, df, col):
153 |         """Get data type of dataframe column.
154 | 
155 |         :param df: input data frame
156 |         :param str col: column
157 |         """
158 |         if col not in df.columns:
159 |             raise KeyError(f'Column "{col:s}" not in input dataframe.')
160 |         dt = dict(df.dtypes)[col]
161 |         # spark conversions to numpy or python equivalent
162 |         if dt == "string":
163 |             dt = "str"
164 |         elif dt in ["timestamp", "date"]:
165 |             dt = np.datetime64
166 |         elif dt == "boolean":
167 |             dt = bool
168 |         elif dt == "bigint":
169 |             dt = np.int64
170 |         elif dt.startswith("decimal("):
171 |             return np.dtype(float, metadata={"decimal": True})
172 | 
173 |         return np.dtype(dt)
174 | 
175 |     def process_features(self, df, cols_by_type):
176 |         """Process features before histogram filling.
177 | 
178 |         Specifically, in this case convert timestamp features to nanoseconds
179 | 
180 |         :param df: input data frame
181 |         :return: output data frame with converted timestamp features
182 |         :rtype: DataFrame
183 |         """
184 |         # make alias df for value counting (used below)
185 |         idf = df.alias("")
186 | 
187 |         # timestamp variables are converted here to ns since 1970-1-1
188 |         # histogrammar does not (yet) support long integers, so convert timestamps to float
189 |         for col in cols_by_type["dt"]:
190 |             self.logger.debug(f'Converting column "{col}" of type "{self.var_dtype[col]}" to nanosec.')
191 |             # first cast to timestamp (in case column is stored as date)
192 |             to_ns = f.col(col).cast("timestamp").cast("float") * 1e9
193 |             idf = idf.withColumn(col, to_ns)
194 | 
195 |         # spark nulls are interpreted to 0 when cast to double in scala, done when given as input to numeric histograms
196 |         # in columns that have them, replace by nones by nans
197 |         for col in cols_by_type["num"]:
198 |             if len(idf.where(f.col(col).isNull()).limit(1).collect()) > 0:
199 |                 self.logger.debug(f'In numeric column "{col}" converting each None to NaN.')
200 |                 idf = idf.withColumn(
201 |                     col,
202 |                     f.when(f.col(col).isNotNull(), f.col(col)).otherwise(float("nan")),
203 |                 )
204 | 
205 |         return idf
206 | 
207 |     def construct_empty_hist(self, df, features):
208 |         """Create an (empty) histogram of right type.
209 | 
210 |         Create a multi-dim histogram by iterating through the features in
211 |         reverse order and passing a single-dim hist as input to the next
212 |         column.
213 | 
214 |         :param df: input dataframe
215 |         :param list features: histogram features
216 |         :return: created histogram
217 |         :rtype: histogrammar.Count
218 |         """
219 |         hist = hg.Count()
220 | 
221 |         # create a multi-dim histogram by iterating through
222 |         # the features in reverse order and passing a single-dim hist
223 |         # as input to the next column
224 |         revcols = list(reversed(features))
225 |         for idx, col in enumerate(revcols):
226 |             # histogram type depends on the data type
227 |             dt = self.var_dtype[col]
228 |             quant = df[col]
229 |             hist = self.get_hist_bin(hist, features, quant, col, dt)
230 | 
231 |         return hist
232 | 
233 |     def fill_histograms(self, idf):
234 |         """Fill the histograms
235 | 
236 |         :param idf: input data frame used for filling histogram
237 |         """
238 |         for cols in tqdm(self.features, ncols=100):
239 |             self.logger.debug('Processing feature "{cols}".'.format(cols=":".join(cols)))
240 |             self.fill_histogram(idf, cols)
241 | 
242 |     def fill_histogram(self, idf, features):
243 |         """Fill input histogram with column(s) of input dataframe.
244 | 
245 |         :param idf: input data frame used for filling histogram
246 |         :param list features: histogram column(s)
247 |         """
248 |         name = ":".join(features)
249 |         if name not in self._hists:
250 |             # create an (empty) histogram of right type
251 |             self._hists[name] = self.construct_empty_hist(idf, features)
252 |         hist = self._hists[name]
253 | 
254 |         # do the actual filling
255 |         hist.fill.sparksql(idf)
256 |         self._hists[name] = hist
257 | 
258 |     def _execute(self, df):
259 |         df.persist()
260 |         hists = super()._execute(df)
261 |         df.unpersist()
262 |         return hists
263 | 


--------------------------------------------------------------------------------
/histogrammar/notebooks/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/histogrammar/histogrammar-python/642d89dfbe34f1065d215a369ec153ec11ed4c2e/histogrammar/notebooks/__init__.py


--------------------------------------------------------------------------------
/histogrammar/notebooks/histogrammar_tutorial_exercises.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "markdown",
  5 |    "metadata": {},
  6 |    "source": [
  7 |     "# Histogrammar exercises\n",
  8 |     "\n",
  9 |     "Histogrammar is a Python package that allows you to make histograms from numpy arrays, and pandas and spark dataframes. \n",
 10 |     "\n",
 11 |     "(There is also a scala backend for Histogrammar, that is used by spark.) \n",
 12 |     "\n",
 13 |     "You can do the exercises below after the basic tutorial.\n",
 14 |     "\n",
 15 |     "Enjoy!"
 16 |    ]
 17 |   },
 18 |   {
 19 |    "cell_type": "code",
 20 |    "execution_count": null,
 21 |    "metadata": {},
 22 |    "outputs": [],
 23 |    "source": [
 24 |     "%%capture\n",
 25 |     "# install histogrammar (if not installed yet)\n",
 26 |     "\n",
 27 |     "!\"{sys.executable}\" -m pip install histogrammar"
 28 |    ]
 29 |   },
 30 |   {
 31 |    "cell_type": "code",
 32 |    "execution_count": null,
 33 |    "metadata": {},
 34 |    "outputs": [],
 35 |    "source": [
 36 |     "import histogrammar as hg"
 37 |    ]
 38 |   },
 39 |   {
 40 |    "cell_type": "code",
 41 |    "execution_count": null,
 42 |    "metadata": {},
 43 |    "outputs": [],
 44 |    "source": [
 45 |     "import pandas as pd"
 46 |    ]
 47 |   },
 48 |   {
 49 |    "cell_type": "markdown",
 50 |    "metadata": {},
 51 |    "source": [
 52 |     "## Dataset\n",
 53 |     "Let's first load some data!"
 54 |    ]
 55 |   },
 56 |   {
 57 |    "cell_type": "code",
 58 |    "execution_count": null,
 59 |    "metadata": {},
 60 |    "outputs": [],
 61 |    "source": [
 62 |     "# open a pandas dataframe for use below\n",
 63 |     "from histogrammar import resources\n",
 64 |     "\n",
 65 |     "df = pd.read_csv(resources.data(\"test.csv.gz\"), parse_dates=[\"date\"])"
 66 |    ]
 67 |   },
 68 |   {
 69 |    "cell_type": "code",
 70 |    "execution_count": null,
 71 |    "metadata": {},
 72 |    "outputs": [],
 73 |    "source": [
 74 |     "df.head(2)"
 75 |    ]
 76 |   },
 77 |   {
 78 |    "cell_type": "markdown",
 79 |    "metadata": {},
 80 |    "source": [
 81 |     "## Comparing histogram types"
 82 |    ]
 83 |   },
 84 |   {
 85 |    "cell_type": "markdown",
 86 |    "metadata": {},
 87 |    "source": [
 88 |     "Histogrammar treats histograms as objects. You will see this has various advantages.\n",
 89 |     "\n",
 90 |     "Let's fill a simple histogram with a numpy array."
 91 |    ]
 92 |   },
 93 |   {
 94 |    "cell_type": "code",
 95 |    "execution_count": null,
 96 |    "metadata": {},
 97 |    "outputs": [],
 98 |    "source": [
 99 |     "# this creates a histogram with 100 even-sized bins in the (closed) range [-5, 5]\n",
100 |     "hist1 = hg.Bin(num=10, low=0, high=100)"
101 |    ]
102 |   },
103 |   {
104 |    "cell_type": "code",
105 |    "execution_count": null,
106 |    "metadata": {},
107 |    "outputs": [],
108 |    "source": [
109 |     "hist1.fill.numpy(df['age'].values)"
110 |    ]
111 |   },
112 |   {
113 |    "cell_type": "code",
114 |    "execution_count": null,
115 |    "metadata": {},
116 |    "outputs": [],
117 |    "source": [
118 |     "hist1.plot.matplotlib();"
119 |    ]
120 |   },
121 |   {
122 |    "cell_type": "code",
123 |    "execution_count": null,
124 |    "metadata": {},
125 |    "outputs": [],
126 |    "source": [
127 |     "hist2 = hg.SparselyBin(binWidth=10, origin=0)"
128 |    ]
129 |   },
130 |   {
131 |    "cell_type": "code",
132 |    "execution_count": null,
133 |    "metadata": {},
134 |    "outputs": [],
135 |    "source": [
136 |     "hist2.fill.numpy(df['age'].values)"
137 |    ]
138 |   },
139 |   {
140 |    "cell_type": "code",
141 |    "execution_count": null,
142 |    "metadata": {},
143 |    "outputs": [],
144 |    "source": [
145 |     "hist2.plot.matplotlib();"
146 |    ]
147 |   },
148 |   {
149 |    "cell_type": "markdown",
150 |    "metadata": {},
151 |    "source": [
152 |     "Q: Have a look at the .values and .bins attributes of hist1 and hist2.\n",
153 |     "What types are these? (hist1.values is a ...?) \n",
154 |     "Does that make sense?"
155 |    ]
156 |   },
157 |   {
158 |    "cell_type": "code",
159 |    "execution_count": null,
160 |    "metadata": {},
161 |    "outputs": [],
162 |    "source": [
163 |     "hist1"
164 |    ]
165 |   },
166 |   {
167 |    "cell_type": "code",
168 |    "execution_count": null,
169 |    "metadata": {},
170 |    "outputs": [],
171 |    "source": [
172 |     "hist2"
173 |    ]
174 |   },
175 |   {
176 |    "cell_type": "markdown",
177 |    "metadata": {},
178 |    "source": [
179 |     "Q: In each bin, what type of object is keeping track of the bin count?"
180 |    ]
181 |   },
182 |   {
183 |    "cell_type": "markdown",
184 |    "metadata": {},
185 |    "source": [
186 |     "Try filling hist1 with small values (negative) or very large (> 100) or with NaNs. \n",
187 |     "Find out if and how hist1 keeps track of these?"
188 |    ]
189 |   },
190 |   {
191 |    "cell_type": "markdown",
192 |    "metadata": {},
193 |    "source": [
194 |     "Now fill hist2 with small values (negative) or very large (> 100) or with NaNs. How does hist2 keeps track of these?"
195 |    ]
196 |   },
197 |   {
198 |    "cell_type": "markdown",
199 |    "metadata": {},
200 |    "source": [
201 |     "## Categorical variables\n",
202 |     "\n",
203 |     "For categorical variables use the Categorize histogram\n",
204 |     "- Categorize histograms: accepting categorical variables such as strings and booleans.\n",
205 |     "\n"
206 |    ]
207 |   },
208 |   {
209 |    "cell_type": "code",
210 |    "execution_count": null,
211 |    "metadata": {},
212 |    "outputs": [],
213 |    "source": [
214 |     "histx = hg.Categorize('eyeColor')"
215 |    ]
216 |   },
217 |   {
218 |    "cell_type": "code",
219 |    "execution_count": null,
220 |    "metadata": {},
221 |    "outputs": [],
222 |    "source": [
223 |     "histx.fill.numpy(df)"
224 |    ]
225 |   },
226 |   {
227 |    "cell_type": "markdown",
228 |    "metadata": {},
229 |    "source": [
230 |     "Q: A categorize histogram, what is it fundementally, a dictionary or a list?"
231 |    ]
232 |   },
233 |   {
234 |    "cell_type": "markdown",
235 |    "metadata": {},
236 |    "source": [
237 |     "Q: What else can it keep track of, e.g. numbers, booleans, nans? Give it a try, fill it with more entries!"
238 |    ]
239 |   },
240 |   {
241 |    "cell_type": "markdown",
242 |    "metadata": {},
243 |    "source": [
244 |     "Fill a histograms with a boolean array (isActive), directly from the dataframe\n",
245 |     "\n",
246 |     "Q: what type of histogram do you get?"
247 |    ]
248 |   },
249 |   {
250 |    "cell_type": "code",
251 |    "execution_count": null,
252 |    "metadata": {},
253 |    "outputs": [],
254 |    "source": [
255 |     "hists = df.hg_make_histograms(features=['isActive'])"
256 |    ]
257 |   },
258 |   {
259 |    "cell_type": "code",
260 |    "execution_count": null,
261 |    "metadata": {},
262 |    "outputs": [],
263 |    "source": []
264 |   },
265 |   {
266 |    "cell_type": "markdown",
267 |    "metadata": {},
268 |    "source": [
269 |     "## Multi-dimensional histograms"
270 |    ]
271 |   },
272 |   {
273 |    "cell_type": "markdown",
274 |    "metadata": {},
275 |    "source": [
276 |     "Let's make a 3-dimensional histogram, with axes: x=favoriteFruit, y=gender, z=isActive. (In Histogrammar, a multi-dimensional histogram is composed as recursive histograms, starting with the last one.) \n",
277 |     "Then fill it with the dataframe."
278 |    ]
279 |   },
280 |   {
281 |    "cell_type": "code",
282 |    "execution_count": null,
283 |    "metadata": {},
284 |    "outputs": [],
285 |    "source": [
286 |     "# hist1 = hg.Categorize(quantity='isActive')\n",
287 |     "# hist2 = hg.Categorize(quantity='gender', value=hist1)\n",
288 |     "# hist3 = hg.Categorize(quantity='favoriteFruit')"
289 |    ]
290 |   },
291 |   {
292 |    "cell_type": "markdown",
293 |    "metadata": {},
294 |    "source": [
295 |     "Q: How many data points end up in the bin: banana, male, True ?\n"
296 |    ]
297 |   },
298 |   {
299 |    "cell_type": "markdown",
300 |    "metadata": {},
301 |    "source": [
302 |     "Q: Store this histogram as a json file. What is the size of the json file?"
303 |    ]
304 |   },
305 |   {
306 |    "cell_type": "markdown",
307 |    "metadata": {},
308 |    "source": [
309 |     "Q: Read back the histogram and then plot it."
310 |    ]
311 |   },
312 |   {
313 |    "cell_type": "markdown",
314 |    "metadata": {},
315 |    "source": [
316 |     "Q: Make a histogram of the feature 'fruit', which measures the average value of 'latitude' per bin of fruit."
317 |    ]
318 |   },
319 |   {
320 |    "cell_type": "code",
321 |    "execution_count": null,
322 |    "metadata": {},
323 |    "outputs": [],
324 |    "source": [
325 |     "hist1 = hg.Average(quantity='latitude')"
326 |    ]
327 |   },
328 |   {
329 |    "cell_type": "markdown",
330 |    "metadata": {},
331 |    "source": [
332 |     "Q: what is the mean value of latitude for the bin 'strawberry'?"
333 |    ]
334 |   }
335 |  ],
336 |  "metadata": {
337 |   "kernel_info": {
338 |    "name": "python3"
339 |   },
340 |   "kernelspec": {
341 |    "display_name": "Python 3",
342 |    "language": "python",
343 |    "name": "python3"
344 |   },
345 |   "language_info": {
346 |    "codemirror_mode": {
347 |     "name": "ipython",
348 |     "version": 3
349 |    },
350 |    "file_extension": ".py",
351 |    "mimetype": "text/x-python",
352 |    "name": "python",
353 |    "nbconvert_exporter": "python",
354 |    "pygments_lexer": "ipython3",
355 |    "version": "3.8.5"
356 |   },
357 |   "nteract": {
358 |    "version": "0.15.0"
359 |   },
360 |   "pycharm": {
361 |    "stem_cell": {
362 |     "cell_type": "raw",
363 |     "metadata": {
364 |      "collapsed": false
365 |     },
366 |     "source": []
367 |    }
368 |   }
369 |  },
370 |  "nbformat": 4,
371 |  "nbformat_minor": 4
372 | }
373 | 


--------------------------------------------------------------------------------
/histogrammar/plot/__init__.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | 
 3 | # Copyright 2016 DIANA-HEP
 4 | #
 5 | # Licensed under the Apache License, Version 2.0 (the "License");
 6 | # you may not use this file except in compliance with the License.
 7 | # You may obtain a copy of the License at
 8 | #
 9 | #     http://www.apache.org/licenses/LICENSE-2.0
10 | #
11 | # Unless required by applicable law or agreed to in writing, software
12 | # distributed under the License is distributed on an "AS IS" BASIS,
13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | # See the License for the specific language governing permissions and
15 | # limitations under the License.
16 | 


--------------------------------------------------------------------------------
/histogrammar/plot/hist_numpy.py:
--------------------------------------------------------------------------------
  1 | # Copyright (c) 2020 ING Wholesale Banking Advanced Analytics
  2 | #
  3 | # Permission is hereby granted, free of charge, to any person obtaining a copy of
  4 | # this software and associated documentation files (the "Software"), to deal in
  5 | # the Software without restriction, including without limitation the rights to
  6 | # use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of
  7 | # the Software, and to permit persons to whom the Software is furnished to do so,
  8 | # subject to the following conditions:
  9 | #
 10 | # The above copyright notice and this permission notice shall be included in all
 11 | # copies or substantial portions of the Software.
 12 | #
 13 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 14 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS
 15 | # FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
 16 | # COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER
 17 | # IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
 18 | # CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 19 | 
 20 | 
 21 | import warnings
 22 | 
 23 | import numpy as np
 24 | 
 25 | 
 26 | def prepare_2dgrid(hist):
 27 |     """Get lists of all unique x and y keys
 28 | 
 29 |     Used as input by get_2dgrid(hist).
 30 | 
 31 |     :param hist: input histogrammar histogram
 32 |     :return: two comma-separated lists of unique x and y keys
 33 |     """
 34 |     if hist.n_dim < 2:
 35 |         warnings.warn(f"Input histogram only has {hist.n_dim} dimensions (<2). Returning empty lists.")
 36 |         return [], []
 37 | 
 38 |     xkeys = set()
 39 |     ykeys = set()
 40 |     # SparselyBin, Categorize, IrregularlyBin, CentrallyBin
 41 |     if hasattr(hist, "bins"):
 42 |         hist_bins = dict(hist.bins)
 43 |         xkeys = xkeys.union(hist_bins.keys())
 44 |         for h in hist_bins.values():
 45 |             if hasattr(h, "bins"):
 46 |                 h_bins = dict(h.bins)
 47 |                 ykeys = ykeys.union(h_bins.keys())
 48 |             elif hasattr(h, "values"):
 49 |                 ykeys = ykeys.union(range(len(h.values)))
 50 |     # Bin
 51 |     elif hasattr(hist, "values"):
 52 |         xkeys = xkeys.union(range(len(hist.values)))
 53 |         for h in hist.values:
 54 |             if hasattr(h, "bins"):
 55 |                 h_bins = dict(h.bins)
 56 |                 ykeys = ykeys.union(h_bins.keys())
 57 |             elif hasattr(h, "values"):
 58 |                 ykeys = ykeys.union(range(len(h.values)))
 59 |     return sorted(xkeys), sorted(ykeys)
 60 | 
 61 | 
 62 | def set_2dgrid(hist, xkeys, ykeys):
 63 |     """Set 2d grid of first two dimenstions of input histogram
 64 | 
 65 |     Used as input by get_2dgrid(hist).
 66 | 
 67 |     :param hist: input histogrammar histogram
 68 |     :param list xkeys: list with unique x keys
 69 |     :param list ykeys: list with unique y keys
 70 |     :return: filled 2d numpy grid
 71 |     """
 72 |     grid = np.zeros((len(ykeys), len(xkeys)))
 73 | 
 74 |     if hist.n_dim < 2:
 75 |         warnings.warn(f"Input histogram only has {hist.n_dim} dimensions (<2). Returning original grid.")
 76 |         return grid
 77 | 
 78 |     # SparselyBin, Categorize, IrregularlyBin, CentrallyBin
 79 |     if hasattr(hist, "bins"):
 80 |         hist_bins = dict(hist.bins)
 81 |         for k, h in hist_bins.items():
 82 |             if k not in xkeys:
 83 |                 continue
 84 |             i = xkeys.index(k)
 85 |             if hasattr(h, "bins"):
 86 |                 h_bins = dict(h.bins)
 87 |                 for li, g in h_bins.items():
 88 |                     if li not in ykeys:
 89 |                         continue
 90 |                     j = ykeys.index(li)
 91 |                     grid[j, i] = g.entries
 92 |             elif hasattr(h, "values"):
 93 |                 for j, g in enumerate(h.values):
 94 |                     grid[j, i] = g.entries
 95 |     # Bin
 96 |     elif hasattr(hist, "values"):
 97 |         for i, h in enumerate(hist.values):
 98 |             if hasattr(h, "bins"):
 99 |                 h_bins = dict(h.bins)
100 |                 for lj, g in h_bins.items():
101 |                     if lj not in ykeys:
102 |                         continue
103 |                     j = ykeys.index(lj)
104 |                     grid[j, i] = g.entries
105 |             elif hasattr(h, "values"):
106 |                 for j, g in enumerate(h.values):
107 |                     grid[j, i] = g.entries
108 |     return grid
109 | 
110 | 
111 | def get_2dgrid(hist):
112 |     """Get filled x,y grid of first two dimensions of input histogram
113 | 
114 |     :param hist: input histogrammar histogram
115 |     :return: x,y,grid of first two dimenstions of input histogram
116 |     """
117 |     if hist.n_dim < 2:
118 |         warnings.warn(f"Input histogram only has {hist.n_dim} dimensions (<2). Returning empty grid.")
119 |         return np.zeros((0, 0))
120 | 
121 |     xkeys, ykeys = prepare_2dgrid(hist)
122 |     grid = set_2dgrid(hist, xkeys, ykeys)
123 | 
124 |     x_labels = get_x_labels(hist, xkeys)
125 |     y_labels = get_y_labels(hist, ykeys)
126 | 
127 |     return x_labels, y_labels, grid
128 | 
129 | 
130 | def get_x_labels(hist, xkeys):
131 |     return [str(hist._center_from_key(key)) for key in xkeys]
132 | 
133 | 
134 | def get_y_labels(hist, ykeys):
135 |     # SparselyBin, Categorize, IrregularlyBin, CentrallyBin
136 |     if hasattr(hist, "bins"):
137 |         hist_bins = dict(hist.bins)
138 |         h = list(hist_bins.values())[0]
139 |     # Bin
140 |     elif hasattr(hist, "values"):
141 |         h = hist.values[0]
142 |     return [str(h._center_from_key(key)) for key in ykeys]
143 | 
144 | 
145 | def prepare2Dsparse(sparse):
146 |     yminBins = [v.minBin for v in sparse.bins.values() if v.minBin is not None]
147 |     ymaxBins = [v.maxBin for v in sparse.bins.values() if v.maxBin is not None]
148 |     if len(yminBins) > 0 and len(ymaxBins) > 0:
149 |         yminBin = min(yminBins)
150 |         ymaxBin = max(ymaxBins)
151 |     else:
152 |         yminBin = 0
153 |         ymaxBin = 0
154 |     sample = list(sparse.bins.values())[0]
155 |     ynum = 1 + ymaxBin - yminBin
156 |     ylow = yminBin * sample.binWidth + sample.origin
157 |     yhigh = (ymaxBin + 1.0) * sample.binWidth + sample.origin
158 |     return yminBin, ymaxBin, ynum, ylow, yhigh
159 | 
160 | 
161 | def set2Dsparse(sparse, yminBin, ymaxBin, grid):
162 |     for i, iindex in enumerate(range(sparse.minBin, sparse.maxBin + 1)):
163 |         for j, jindex in enumerate(range(yminBin, ymaxBin + 1)):
164 |             if iindex in sparse.bins and jindex in sparse.bins[iindex].bins:
165 |                 grid[j, i] = sparse.bins[iindex].bins[jindex].entries
166 |     return grid
167 | 


--------------------------------------------------------------------------------
/histogrammar/primitives/__init__.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | 
 3 | # Copyright 2016 DIANA-HEP
 4 | #
 5 | # Licensed under the Apache License, Version 2.0 (the "License");
 6 | # you may not use this file except in compliance with the License.
 7 | # You may obtain a copy of the License at
 8 | #
 9 | #     http://www.apache.org/licenses/LICENSE-2.0
10 | #
11 | # Unless required by applicable law or agreed to in writing, software
12 | # distributed under the License is distributed on an "AS IS" BASIS,
13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | # See the License for the specific language governing permissions and
15 | # limitations under the License.
16 | 


--------------------------------------------------------------------------------
/histogrammar/primitives/average.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python
  2 | 
  3 | # Copyright 2016 DIANA-HEP
  4 | #
  5 | # Licensed under the Apache License, Version 2.0 (the "License");
  6 | # you may not use this file except in compliance with the License.
  7 | # You may obtain a copy of the License at
  8 | #
  9 | #     http://www.apache.org/licenses/LICENSE-2.0
 10 | #
 11 | # Unless required by applicable law or agreed to in writing, software
 12 | # distributed under the License is distributed on an "AS IS" BASIS,
 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 14 | # See the License for the specific language governing permissions and
 15 | # limitations under the License.
 16 | 
 17 | import math
 18 | import numbers
 19 | 
 20 | from histogrammar.defs import (
 21 |     Container,
 22 |     ContainerException,
 23 |     Factory,
 24 |     JsonFormatException,
 25 |     identity,
 26 | )
 27 | from histogrammar.util import (
 28 |     basestring,
 29 |     datatype,
 30 |     floatToJson,
 31 |     hasKeys,
 32 |     inheritdoc,
 33 |     maybeAdd,
 34 |     n_dim,
 35 |     numeq,
 36 |     serializable,
 37 | )
 38 | 
 39 | 
 40 | class Average(Factory, Container):
 41 |     """Accumulate the weighted mean of a given quantity.
 42 | 
 43 |     Uses the numerically stable weighted mean algorithm described in `"Incremental calculation of weighted mean
 44 |     and variance," <http://www-uxsup.csx.cam.ac.uk/~fanf2/hermes/doc/antiforgery/stats.pdf>`_ Tony Finch,
 45 |     *Univeristy of Cambridge Computing Service,* 2009.
 46 |     """
 47 | 
 48 |     @staticmethod
 49 |     def ed(entries, mean):
 50 |         """Create an Average that is only capable of being added.
 51 | 
 52 |         Parameters:
 53 |             entries (float): the number of entries.
 54 |             mean (float): the mean.
 55 |         """
 56 | 
 57 |         if not isinstance(entries, numbers.Real) and entries not in (
 58 |             "nan",
 59 |             "inf",
 60 |             "-inf",
 61 |         ):
 62 |             raise TypeError(f"entries ({entries}) must be a number")
 63 |         if not isinstance(mean, numbers.Real) and entries not in ("nan", "inf", "-inf"):
 64 |             raise TypeError(f"mean ({mean}) must be a number")
 65 |         if entries < 0.0:
 66 |             raise ValueError(f"entries ({entries}) cannot be negative")
 67 |         out = Average(None)
 68 |         out.entries = float(entries)
 69 |         out.mean = float(mean)
 70 |         return out.specialize()
 71 | 
 72 |     @staticmethod
 73 |     def ing(quantity):
 74 |         """Synonym for ``__init__``."""
 75 |         return Average(quantity)
 76 | 
 77 |     def __init__(self, quantity=identity):
 78 |         """Create an Average that is capable of being filled and added.
 79 | 
 80 |         Parameters:
 81 |             quantity (function returning float): computes the quantity of interest from the data.
 82 | 
 83 |         Other parameters:
 84 |             entries (float): the number of entries, initially 0.0.
 85 |             mean (float): the running mean, initially NaN.
 86 |         """
 87 |         self.quantity = serializable(identity(quantity) if isinstance(quantity, str) else quantity)
 88 |         self.entries = 0.0
 89 |         self.mean = float("nan")
 90 |         super().__init__()
 91 |         self.specialize()
 92 | 
 93 |     @inheritdoc(Container)
 94 |     def zero(self):
 95 |         return Average(self.quantity)
 96 | 
 97 |     @inheritdoc(Container)
 98 |     def __add__(self, other):
 99 |         if isinstance(other, Average):
100 |             out = Average(self.quantity)
101 |             out.entries = self.entries + other.entries
102 |             if self.entries == 0.0:
103 |                 out.mean = other.mean
104 |             elif other.entries == 0.0:
105 |                 out.mean = self.mean
106 |             else:
107 |                 out.mean = (self.entries * self.mean + other.entries * other.mean) / (self.entries + other.entries)
108 |             return out.specialize()
109 |         raise ContainerException(f"cannot add {self.name} and {other.name}")
110 | 
111 |     @inheritdoc(Container)
112 |     def __iadd__(self, other):
113 |         both = self + other
114 |         self.entries = both.entries
115 |         self.mean = both.mean
116 |         return self
117 | 
118 |     @inheritdoc(Container)
119 |     def __mul__(self, factor):
120 |         if math.isnan(factor) or factor <= 0.0:
121 |             return self.zero()
122 |         out = self.zero()
123 |         out.entries = factor * self.entries
124 |         out.mean = self.mean
125 |         return out.specialize()
126 | 
127 |     @inheritdoc(Container)
128 |     def __rmul__(self, factor):
129 |         return self.__mul__(factor)
130 | 
131 |     @inheritdoc(Container)
132 |     def fill(self, datum, weight=1.0):
133 |         self._checkForCrossReferences()
134 | 
135 |         if weight > 0.0:
136 |             q = self.quantity(datum)
137 |             if not isinstance(q, numbers.Real):
138 |                 raise TypeError(f"function return value ({q}) must be boolean or number")
139 | 
140 |             # no possibility of exception from here on out (for rollback)
141 |             if self.entries == 0.0:
142 |                 self.mean = q
143 |             self.entries += weight
144 | 
145 |             if math.isnan(self.mean) or math.isnan(q):
146 |                 self.mean = float("nan")
147 | 
148 |             elif math.isinf(self.mean) or math.isinf(q):
149 |                 if math.isinf(self.mean) and math.isinf(q) and self.mean * q < 0.0:
150 |                     self.mean = float("nan")  # opposite-sign infinities is bad
151 |                 elif math.isinf(q):
152 |                     self.mean = q  # mean becomes infinite with sign of q
153 |                 else:
154 |                     pass  # mean is already infinite
155 |                 if math.isinf(self.entries) or math.isnan(self.entries):
156 |                     self.mean = float("nan")  # non-finite denominator is bad
157 | 
158 |             else:  # handle finite case
159 |                 delta = q - self.mean
160 |                 shift = delta * weight / self.entries
161 |                 self.mean += shift
162 | 
163 |     def _numpy(self, data, weights, shape):
164 |         q = self.quantity(data)
165 |         self._checkNPQuantity(q, shape)
166 |         self._checkNPWeights(weights, shape)
167 |         weights = self._makeNPWeights(weights, shape)
168 | 
169 |         # no possibility of exception from here on out (for rollback)
170 |         ca, ma = self.entries, self.mean
171 |         if ca == 0.0:
172 |             ma = 0.0
173 | 
174 |         import numpy
175 | 
176 |         selection = weights > 0.0
177 |         q = q[selection]
178 |         weights = weights[selection]
179 | 
180 |         self.entries += float(weights.sum())
181 |         ca_plus_cb = self.entries
182 | 
183 |         if math.isinf(ca_plus_cb):
184 |             self.mean = float("nan")
185 |         elif ca_plus_cb > 0.0:
186 |             mb = numpy.average(q, weights=weights)
187 |             self.mean = float((ca * ma + (ca_plus_cb - ca) * mb) / ca_plus_cb)
188 | 
189 |     def _sparksql(self, jvm, converter):
190 |         return converter.Average(self.quantity.asSparkSQL())
191 | 
192 |     @property
193 |     def children(self):
194 |         """List of sub-aggregators, to make it possible to walk the tree."""
195 |         return []
196 | 
197 |     @inheritdoc(Container)
198 |     def toJsonFragment(self, suppressName):
199 |         return maybeAdd(
200 |             {"entries": floatToJson(self.entries), "mean": floatToJson(self.mean)},
201 |             name=(None if suppressName else self.quantity.name),
202 |         )
203 | 
204 |     @staticmethod
205 |     @inheritdoc(Factory)
206 |     def fromJsonFragment(json, nameFromParent):
207 |         if isinstance(json, dict) and hasKeys(json.keys(), ["entries", "mean"], ["name"]):
208 |             if json["entries"] in ("nan", "inf", "-inf") or isinstance(json["entries"], numbers.Real):
209 |                 entries = float(json["entries"])
210 |             else:
211 |                 raise JsonFormatException(json["entries"], "Average.entries")
212 | 
213 |             if isinstance(json.get("name", None), basestring):
214 |                 name = json["name"]
215 |             elif json.get("name", None) is None:
216 |                 name = None
217 |             else:
218 |                 raise JsonFormatException(json["name"], "Average.name")
219 | 
220 |             if json["mean"] in ("nan", "inf", "-inf") or isinstance(json["mean"], numbers.Real):
221 |                 mean = float(json["mean"])
222 |             else:
223 |                 raise JsonFormatException(json["mean"], "Average.mean")
224 | 
225 |             out = Average.ed(entries, mean)
226 |             out.quantity.name = nameFromParent if name is None else name
227 |             return out.specialize()
228 | 
229 |         raise JsonFormatException(json, "Average")
230 | 
231 |     def __repr__(self):
232 |         return f"<Average mean={self.mean}>"
233 | 
234 |     def __eq__(self, other):
235 |         return (
236 |             isinstance(other, Average)
237 |             and self.quantity == other.quantity
238 |             and numeq(self.entries, other.entries)
239 |             and numeq(self.mean, other.mean)
240 |         )
241 | 
242 |     def __ne__(self, other):
243 |         return not self == other
244 | 
245 |     def __hash__(self):
246 |         return hash((self.quantity, self.entries, self.mean))
247 | 
248 | 
249 | # extra properties: number of dimensions and datatypes of sub-hists
250 | Average.n_dim = n_dim
251 | Average.datatype = datatype
252 | 
253 | # register extra methods such as plotting
254 | Factory.register(Average)
255 | 


--------------------------------------------------------------------------------
/histogrammar/primitives/count.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python
  2 | 
  3 | # Copyright 2016 DIANA-HEP
  4 | #
  5 | # Licensed under the Apache License, Version 2.0 (the "License");
  6 | # you may not use this file except in compliance with the License.
  7 | # You may obtain a copy of the License at
  8 | #
  9 | #     http://www.apache.org/licenses/LICENSE-2.0
 10 | #
 11 | # Unless required by applicable law or agreed to in writing, software
 12 | # distributed under the License is distributed on an "AS IS" BASIS,
 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 14 | # See the License for the specific language governing permissions and
 15 | # limitations under the License.
 16 | 
 17 | import math
 18 | import numbers
 19 | 
 20 | from histogrammar.defs import (
 21 |     Container,
 22 |     ContainerException,
 23 |     Factory,
 24 |     JsonFormatException,
 25 |     identity,
 26 | )
 27 | from histogrammar.util import (
 28 |     datatype,
 29 |     floatToJson,
 30 |     inheritdoc,
 31 |     n_dim,
 32 |     numeq,
 33 |     serializable,
 34 | )
 35 | 
 36 | 
 37 | class Count(Factory, Container):
 38 |     """Count entries by accumulating the sum of all observed weights or a sum of transformed weights
 39 | 
 40 |     (e.g. collect the sum of squares of weights).
 41 | 
 42 |     An optional ``transform`` function can be applied to the weights before summing.
 43 |     To accumulate the sum of squares of weights, use:
 44 | 
 45 |     ::
 46 |         lambda x: x**2
 47 | 
 48 |     for instance. This is unlike any other primitive's ``quantity`` function in that its domain is
 49 |     the *weights* (always double), not *data* (any type).
 50 |     """
 51 | 
 52 |     @staticmethod
 53 |     def ed(entries):
 54 |         """Create a Count that is only capable of being added.
 55 | 
 56 |         Parameters:
 57 |             entries (float): the number of entries.
 58 |         """
 59 |         if not isinstance(entries, numbers.Real) and entries not in (
 60 |             "nan",
 61 |             "inf",
 62 |             "-inf",
 63 |         ):
 64 |             raise TypeError(f"entries ({entries}) must be a number")
 65 |         if entries < 0.0:
 66 |             raise ValueError(f"entries ({entries}) cannot be negative")
 67 |         out = Count()
 68 |         out.entries = float(entries)
 69 |         return out
 70 | 
 71 |     @staticmethod
 72 |     def ing(transform=identity):
 73 |         """Synonym for ``__init__``."""
 74 |         return Count(transform)
 75 | 
 76 |     def __init__(self, transform=identity):
 77 |         """Create a Count that is capable of being filled and added.
 78 | 
 79 |         Parameters:
 80 |             transform (function from float to float): transforms each weight.
 81 | 
 82 |         Other parameters:
 83 |             entries (float): the number of entries, initially 0.0.
 84 |         """
 85 |         self.entries = 0.0
 86 |         self.transform = serializable(transform)
 87 |         super().__init__()
 88 | 
 89 |     @inheritdoc(Container)
 90 |     def zero(self):
 91 |         return Count(self.transform)
 92 | 
 93 |     @inheritdoc(Container)
 94 |     def __add__(self, other):
 95 |         if isinstance(other, Count):
 96 |             out = Count(self.transform)
 97 |             out.entries = self.entries + other.entries
 98 |             return out
 99 |         raise ContainerException(f"cannot add {self.name} and {other.name}")
100 | 
101 |     @inheritdoc(Container)
102 |     def __iadd__(self, other):
103 |         if isinstance(other, Count):
104 |             self.entries += other.entries
105 |             return self
106 |         raise ContainerException(f"cannot add {self.name} and {other.name}")
107 | 
108 |     @inheritdoc(Container)
109 |     def __mul__(self, factor):
110 |         if (
111 |             self.transform != identity
112 |             or not callable(self.transform.expr)
113 |             or (
114 |                 hasattr(self.transform.expr, "func_code")
115 |                 and self.transform.expr.func_code.co_code != identity.expr.func_code.co_code
116 |             )
117 |             or (
118 |                 hasattr(self.transform.expr, "__code__")
119 |                 and self.transform.expr.__code__.co_code != identity.expr.__code__.co_code
120 |             )
121 |         ):
122 |             raise ContainerException("Cannot scalar-multiply Count with a non-identity transform.")
123 |         if math.isnan(factor) or factor <= 0.0:
124 |             return self.zero()
125 |         out = self.zero()
126 |         out.entries = factor * self.entries
127 |         return out
128 | 
129 |     @inheritdoc(Container)
130 |     def __rmul__(self, factor):
131 |         return self.__mul__(factor)
132 | 
133 |     @inheritdoc(Container)
134 |     def fill(self, datum, weight=1.0):
135 |         self._checkForCrossReferences()
136 | 
137 |         if weight > 0.0:
138 |             t = self.transform(weight)
139 |             if not isinstance(t, numbers.Real):
140 |                 raise TypeError(f"function return value ({t}) must be boolean or number")
141 | 
142 |             # no possibility of exception from here on out (for rollback)
143 |             self.entries += t
144 | 
145 |     def _numpy(self, _, weights, shape):
146 |         import numpy
147 | 
148 |         if isinstance(weights, numpy.ndarray):
149 |             assert len(weights.shape) == 1
150 |             if shape[0] is not None:
151 |                 assert weights.shape[0] == shape[0]
152 | 
153 |             if self.transform is identity:
154 |                 self.entries += float(weights.sum())
155 |             else:
156 |                 t = self.transform(weights)
157 |                 assert len(t.shape) == 1
158 |                 if shape[0] is not None:
159 |                     assert t.shape[0] == shape[0]
160 |                 self.entries += float(t.sum())
161 | 
162 |         elif shape[0] is not None:
163 |             if self.transform is identity:
164 |                 self.entries += weights * shape[0]
165 |             else:
166 |                 t = self.transform(numpy.array([weights]))
167 |                 assert len(t.shape) == 1
168 |                 assert t.shape[0] == 1
169 |                 self.entries += float(t[0])
170 | 
171 |         elif isinstance(weights, (int, float, numpy.number)):
172 |             if self.transform is identity:
173 |                 self.entries += float(weights)
174 |             else:
175 |                 self.entries += self.transform(weights)
176 | 
177 |         else:
178 |             raise ValueError("cannot use Numpy to fill an isolated Count (unless the weights are given as an array)")
179 | 
180 |     def _sparksql(self, jvm, converter):
181 |         return converter.Count()  # TODO: handle transform
182 | 
183 |     @property
184 |     def children(self):
185 |         """List of sub-aggregators, to make it possible to walk the tree."""
186 |         return []
187 | 
188 |     @inheritdoc(Container)
189 |     def toJsonFragment(self, suppressName):
190 |         return floatToJson(self.entries)
191 | 
192 |     @staticmethod
193 |     @inheritdoc(Factory)
194 |     def fromJsonFragment(json, nameFromParent):
195 |         if json in ("nan", "inf", "-inf") or isinstance(json, numbers.Real):
196 |             return Count.ed(float(json))
197 |         raise JsonFormatException(json, "Count")
198 | 
199 |     def __repr__(self):
200 |         return f"<Count {self.entries}>"
201 | 
202 |     def __eq__(self, other):
203 |         return isinstance(other, Count) and numeq(self.entries, other.entries) and self.transform == other.transform
204 | 
205 |     def __ne__(self, other):
206 |         return not self == other
207 | 
208 |     def __hash__(self):
209 |         return hash((self.entries, self.transform))
210 | 
211 | 
212 | # extra properties: number of dimensions and datatypes of sub-hists
213 | Count.n_dim = n_dim
214 | Count.datatype = datatype
215 | 
216 | # register extra methods
217 | Factory.register(Count)
218 | 


--------------------------------------------------------------------------------
/histogrammar/primitives/fraction.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python
  2 | 
  3 | # Copyright 2016 DIANA-HEP
  4 | #
  5 | # Licensed under the Apache License, Version 2.0 (the "License");
  6 | # you may not use this file except in compliance with the License.
  7 | # You may obtain a copy of the License at
  8 | #
  9 | #     http://www.apache.org/licenses/LICENSE-2.0
 10 | #
 11 | # Unless required by applicable law or agreed to in writing, software
 12 | # distributed under the License is distributed on an "AS IS" BASIS,
 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 14 | # See the License for the specific language governing permissions and
 15 | # limitations under the License.
 16 | 
 17 | import math
 18 | import numbers
 19 | 
 20 | from histogrammar.defs import (
 21 |     Container,
 22 |     ContainerException,
 23 |     Factory,
 24 |     JsonFormatException,
 25 |     identity,
 26 | )
 27 | from histogrammar.primitives.count import Count
 28 | from histogrammar.util import (
 29 |     basestring,
 30 |     datatype,
 31 |     floatToJson,
 32 |     hasKeys,
 33 |     inheritdoc,
 34 |     maybeAdd,
 35 |     n_dim,
 36 |     numeq,
 37 |     serializable,
 38 | )
 39 | 
 40 | 
 41 | class Fraction(Factory, Container):
 42 |     """Accumulate two aggregators, one numerator and one denominator
 43 | 
 44 |     Accumulate two aggregators, one containing only entries that pass a given selection (numerator) and another
 45 |     that contains all entries (denominator).
 46 | 
 47 |     The aggregator may be a simple :doc:`Count <histogrammar.primitives.count.Count>` to measure the efficiency of a
 48 |     cut, a :doc:`Bin <histogrammar.primitives.bin.Bin>` to plot a turn-on curve, or anything else to be tested with
 49 |     and without a cut.
 50 | 
 51 |     As a side effect of NaN values returning false for any comparison, a NaN return value from the selection is
 52 |     treated as a failed cut (the denominator is filled but the numerator is not).
 53 |     """
 54 | 
 55 |     @staticmethod
 56 |     def ed(entries, numerator, denominator):
 57 |         """Create a Fraction that is only capable of being added.
 58 | 
 59 |         Parameters:
 60 |             entries (float): the number of entries.
 61 |             numerator: (:doc:`Container <histogrammar.defs.Container>`): the filled numerator.
 62 |             denominator (:doc:`Container <histogrammar.defs.Container>`): the filled denominator.
 63 |         """
 64 |         if not isinstance(entries, numbers.Real) and entries not in (
 65 |             "nan",
 66 |             "inf",
 67 |             "-inf",
 68 |         ):
 69 |             raise TypeError(f"entries ({entries}) must be a number")
 70 |         if not isinstance(numerator, Container):
 71 |             raise TypeError(f"numerator ({numerator}) must be a Container")
 72 |         if not isinstance(denominator, Container):
 73 |             raise TypeError(f"denominator ({denominator}) must be a Container")
 74 |         if entries < 0.0:
 75 |             raise ValueError(f"entries ({entries}) cannot be negative")
 76 | 
 77 |         out = Fraction(None, None)
 78 |         out.entries = float(entries)
 79 |         out.numerator = numerator
 80 |         out.denominator = denominator
 81 |         return out.specialize()
 82 | 
 83 |     @staticmethod
 84 |     def ing(quantity, value=Count()):
 85 |         """Synonym for ``__init__``."""
 86 |         return Fraction(quantity, value)
 87 | 
 88 |     def __init__(self, quantity=identity, value=Count()):
 89 |         """Create a Fraction that is capable of being filled and added.
 90 | 
 91 |         Parameters:
 92 |             quantity (function returning bool or float): computes the quantity of interest from the data and interprets
 93 |                 it as a selection (multiplicative factor on weight).
 94 |             value (:doc:`Container <histogrammar.defs.Container>`): generates sub-aggregators for the numerator and
 95 |                 denominator.
 96 | 
 97 |         Other parameters:
 98 |             entries (float): the number of entries, initially 0.0.
 99 |             numerator (:doc:`Container <histogrammar.defs.Container>`): the sub-aggregator of entries that pass
100 |                 the selection.
101 |             denominator (:doc:`Container <histogrammar.defs.Container>`): the sub-aggregator of all entries.
102 |         """
103 |         if value is not None and not isinstance(value, Container):
104 |             raise TypeError(f"value ({value}) must be None or a Container")
105 |         self.entries = 0.0
106 |         self.quantity = serializable(identity(quantity) if isinstance(quantity, str) else quantity)
107 |         if value is not None:
108 |             self.numerator = value.zero()
109 |             self.denominator = value.zero()
110 |         super().__init__()
111 |         self.specialize()
112 | 
113 |     @staticmethod
114 |     def build(numerator, denominator):
115 |         """Create a Fraction out of pre-existing containers, which might have been aggregated on different streams.
116 | 
117 |         Parameters:
118 |             numerator (:doc:`Container <histogrammar.defs.Container>`): the filled numerator.
119 |             denominator (:doc:`Container <histogrammar.defs.Container>`): the filled denominator.
120 | 
121 |         This function will attempt to combine the ``numerator`` and ``denominator``, so they must have the same
122 |         binning/bounds/etc.
123 |         """
124 |         if not isinstance(numerator, Container):
125 |             raise TypeError(f"numerator ({numerator}) must be a Container")
126 |         if not isinstance(denominator, Container):
127 |             raise TypeError(f"denominator ({denominator}) must be a Container")
128 |         # check for compatibility
129 |         numerator + denominator
130 |         # return object
131 |         return Fraction.ed(denominator.entries, numerator, denominator)
132 | 
133 |     @inheritdoc(Container)
134 |     def zero(self):
135 |         out = Fraction(self.quantity, None)
136 |         out.numerator = self.numerator.zero()
137 |         out.denominator = self.denominator.zero()
138 |         return out.specialize()
139 | 
140 |     @inheritdoc(Container)
141 |     def __add__(self, other):
142 |         if isinstance(other, Fraction):
143 |             out = Fraction(self.quantity, None)
144 |             out.entries = self.entries + other.entries
145 |             out.numerator = self.numerator + other.numerator
146 |             out.denominator = self.denominator + other.denominator
147 |             return out.specialize()
148 |         raise ContainerException(f"cannot add {self.name} and {other.name}")
149 | 
150 |     @inheritdoc(Container)
151 |     def __iadd__(self, other):
152 |         if isinstance(other, Fraction):
153 |             self.entries += other.entries
154 |             self.numerator += other.numerator
155 |             self.denominator += other.denominator
156 |             return self
157 |         raise ContainerException(f"cannot add {self.name} and {other.name}")
158 | 
159 |     @inheritdoc(Container)
160 |     def __mul__(self, factor):
161 |         if math.isnan(factor) or factor <= 0.0:
162 |             return self.zero()
163 |         out = self.zero()
164 |         out.entries = factor * self.entries
165 |         out.numerator = self.numerator * factor
166 |         out.denominator = self.denominator * factor
167 |         return out.specialize()
168 | 
169 |     @inheritdoc(Container)
170 |     def __rmul__(self, factor):
171 |         return self.__mul__(factor)
172 | 
173 |     @inheritdoc(Container)
174 |     def fill(self, datum, weight=1.0):
175 |         self._checkForCrossReferences()
176 | 
177 |         if weight > 0.0:
178 |             w = self.quantity(datum)
179 |             if not isinstance(w, numbers.Real):
180 |                 raise TypeError(f"function return value ({w}) must be boolean or number")
181 |             w *= weight
182 | 
183 |             self.denominator.fill(datum, weight)
184 |             if w > 0.0:
185 |                 self.numerator.fill(datum, w)
186 | 
187 |             # no possibility of exception from here on out (for rollback)
188 |             self.entries += weight
189 | 
190 |     def _numpy(self, data, weights, shape):
191 |         w = self.quantity(data)
192 |         self._checkNPQuantity(w, shape)
193 |         self._checkNPWeights(weights, shape)
194 |         weights = self._makeNPWeights(weights, shape)
195 | 
196 |         import numpy
197 | 
198 |         w = w * weights
199 |         w[numpy.isnan(w)] = 0.0
200 |         w[w < 0.0] = 0.0
201 | 
202 |         self.numerator._numpy(data, w, shape)
203 |         self.denominator._numpy(data, weights, shape)
204 | 
205 |         # no possibility of exception from here on out (for rollback)
206 |         self.entries += float(weights.sum())
207 | 
208 |     def _sparksql(self, jvm, converter):
209 |         return converter.Fraction(self.quantity.asSparkSQL(), self.numerator._sparksql(jvm, converter))
210 | 
211 |     @property
212 |     def children(self):
213 |         """List of sub-aggregators, to make it possible to walk the tree."""
214 |         return [self.numerator, self.denominator]
215 | 
216 |     @inheritdoc(Container)
217 |     def toJsonFragment(self, suppressName):
218 |         if getattr(self.numerator, "quantity", None) is not None:
219 |             binsName = self.numerator.quantity.name
220 |         elif getattr(self.numerator, "quantityName", None) is not None:
221 |             binsName = self.numerator.quantityName
222 |         else:
223 |             binsName = None
224 | 
225 |         return maybeAdd(
226 |             {
227 |                 "entries": floatToJson(self.entries),
228 |                 "sub:type": self.numerator.name,
229 |                 "numerator": self.numerator.toJsonFragment(True),
230 |                 "denominator": self.denominator.toJsonFragment(True),
231 |             },
232 |             **{
233 |                 "name": None if suppressName else self.quantity.name,
234 |                 "sub:name": binsName,
235 |             },
236 |         )
237 | 
238 |     @staticmethod
239 |     @inheritdoc(Factory)
240 |     def fromJsonFragment(json, nameFromParent):
241 |         if isinstance(json, dict) and hasKeys(
242 |             json.keys(),
243 |             ["entries", "sub:type", "numerator", "denominator"],
244 |             ["name", "sub:name"],
245 |         ):
246 |             if json["entries"] in ("nan", "inf", "-inf") or isinstance(json["entries"], numbers.Real):
247 |                 entries = float(json["entries"])
248 |             else:
249 |                 raise JsonFormatException(json, "Fraction.entries")
250 | 
251 |             if isinstance(json.get("name", None), basestring):
252 |                 name = json["name"]
253 |             elif json.get("name", None) is None:
254 |                 name = None
255 |             else:
256 |                 raise JsonFormatException(json["name"], "Fraction.name")
257 | 
258 |             if isinstance(json["sub:type"], basestring):
259 |                 factory = Factory.registered[json["sub:type"]]
260 |             else:
261 |                 raise JsonFormatException(json, "Fraction.type")
262 | 
263 |             if isinstance(json.get("sub:name", None), basestring):
264 |                 subName = json["sub:name"]
265 |             elif json.get("sub:name", None) is None:
266 |                 subName = None
267 |             else:
268 |                 raise JsonFormatException(json["sub:name"], "Fraction.sub:name")
269 | 
270 |             numerator = factory.fromJsonFragment(json["numerator"], subName)
271 |             denominator = factory.fromJsonFragment(json["denominator"], subName)
272 | 
273 |             out = Fraction.ed(entries, numerator, denominator)
274 |             out.quantity.name = nameFromParent if name is None else name
275 |             return out.specialize()
276 | 
277 |         raise JsonFormatException(json, "Fraction")
278 | 
279 |     def __repr__(self):
280 |         return f"<Fraction values={self.numerator.name}>"
281 | 
282 |     def __eq__(self, other):
283 |         return (
284 |             isinstance(other, Fraction)
285 |             and numeq(self.entries, other.entries)
286 |             and self.quantity == other.quantity
287 |             and self.numerator == other.numerator
288 |             and self.denominator == other.denominator
289 |         )
290 | 
291 |     def __ne__(self, other):
292 |         return not self == other
293 | 
294 |     def __hash__(self):
295 |         return hash((self.entries, self.quantity, self.numerator, self.denominator))
296 | 
297 | 
298 | # extra properties: number of dimensions and datatypes of sub-hists
299 | Fraction.n_dim = n_dim
300 | Fraction.datatype = datatype
301 | 
302 | # register extra methods
303 | Factory.register(Fraction)
304 | 


--------------------------------------------------------------------------------
/histogrammar/primitives/select.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python
  2 | 
  3 | # Copyright 2016 DIANA-HEP
  4 | #
  5 | # Licensed under the Apache License, Version 2.0 (the "License");
  6 | # you may not use this file except in compliance with the License.
  7 | # You may obtain a copy of the License at
  8 | #
  9 | #     http://www.apache.org/licenses/LICENSE-2.0
 10 | #
 11 | # Unless required by applicable law or agreed to in writing, software
 12 | # distributed under the License is distributed on an "AS IS" BASIS,
 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 14 | # See the License for the specific language governing permissions and
 15 | # limitations under the License.
 16 | 
 17 | import math
 18 | import numbers
 19 | 
 20 | from histogrammar.defs import (
 21 |     Container,
 22 |     ContainerException,
 23 |     Factory,
 24 |     JsonFormatException,
 25 |     identity,
 26 | )
 27 | from histogrammar.primitives.count import Count
 28 | from histogrammar.util import (
 29 |     basestring,
 30 |     datatype,
 31 |     floatToJson,
 32 |     hasKeys,
 33 |     inheritdoc,
 34 |     maybeAdd,
 35 |     n_dim,
 36 |     numeq,
 37 |     serializable,
 38 | )
 39 | 
 40 | # Select
 41 | 
 42 | 
 43 | class Select(Factory, Container):
 44 |     """Filter or weight data according to a given selection.
 45 | 
 46 |     This primitive is a basic building block, intended to be used in conjunction with anything that needs a
 47 |     user-defined cut. In particular, a standard histogram often has a custom selection, and this can be built by
 48 |     nesting Select -> Bin -> Count.
 49 | 
 50 |     Select also resembles :doc:`Fraction <histogrammar.primitives.fraction.Fraction>`, but without the ``denominator``.
 51 | 
 52 |     The efficiency of a cut in a Select aggregator named ``x`` is simply ``x.cut.entries / x.entries``
 53 |     (because all aggregators have an ``entries`` member).
 54 |     """
 55 | 
 56 |     @staticmethod
 57 |     def ed(entries, cut):
 58 |         """Create a Select that is only capable of being added.
 59 | 
 60 |         Parameters:
 61 |             entries (float): the number of entries.
 62 |             cut (:doc:`Container <histogrammar.defs.Container>`): the filled sub-aggregator.
 63 |         """
 64 |         if not isinstance(entries, numbers.Real) and entries not in (
 65 |             "nan",
 66 |             "inf",
 67 |             "-inf",
 68 |         ):
 69 |             raise TypeError(f"entries ({entries}) must be a number")
 70 |         if not isinstance(cut, Container):
 71 |             raise TypeError(f"cut ({cut}) must be a Container")
 72 |         if entries < 0.0:
 73 |             raise ValueError(f"entries ({entries}) cannot be negative")
 74 |         out = Select(None, cut)
 75 |         out.entries = float(entries)
 76 |         return out.specialize()
 77 | 
 78 |     @staticmethod
 79 |     def ing(quantity, cut=Count()):
 80 |         """Synonym for ``__init__``."""
 81 |         return Select(quantity, cut)
 82 | 
 83 |     def __getattr__(self, attr):
 84 |         """Pass on searches for custom methods to the ``value``, so that Limit becomes effectively invisible."""
 85 |         if attr.startswith("__") and attr.endswith("__"):
 86 |             return getattr(Select, attr)
 87 |         if attr not in self.__dict__ and hasattr(self.__dict__["cut"], attr):
 88 |             return getattr(self.__dict__["cut"], attr)
 89 |         return self.__dict__[attr]
 90 | 
 91 |     def __init__(self, quantity=identity, cut=Count()):
 92 |         """Create a Select that is capable of being filled and added.
 93 | 
 94 |         Parameters:
 95 |             quantity (function returning bool or float): computes the quantity of interest from the data and interprets
 96 |                 it as a selection (multiplicative factor on weight).
 97 |             cut (:doc:`Container <histogrammar.defs.Container>`): will only be filled with data that pass the cut,
 98 |                 and which are weighted by the cut.
 99 | 
100 |         Other Parameters:
101 |             entries (float): the number of entries, initially 0.0.
102 |         """
103 |         if not isinstance(cut, Container):
104 |             raise TypeError(f"cut ({cut}) must be a Container")
105 |         self.entries = 0.0
106 |         self.quantity = serializable(identity(quantity) if isinstance(quantity, str) else quantity)
107 |         self.cut = cut
108 |         super().__init__()
109 |         self.specialize()
110 | 
111 |     def fractionPassing(self):
112 |         """Fraction of weights that pass the quantity."""
113 |         return self.cut.entries / self.entries
114 | 
115 |     @inheritdoc(Container)
116 |     def zero(self):
117 |         return Select(self.quantity, self.cut.zero())
118 | 
119 |     @inheritdoc(Container)
120 |     def __add__(self, other):
121 |         if isinstance(other, Select):
122 |             out = Select(self.quantity, self.cut + other.cut)
123 |             out.entries = self.entries + other.entries
124 |             return out.specialize()
125 |         raise ContainerException(f"cannot add {self.name} and {other.name}")
126 | 
127 |     @inheritdoc(Container)
128 |     def __iadd__(self, other):
129 |         if isinstance(other, Select):
130 |             self.entries += other.entries
131 |             self.cut += other.cut
132 |             return self
133 |         raise ContainerException(f"cannot add {self.name} and {other.name}")
134 | 
135 |     @inheritdoc(Container)
136 |     def __mul__(self, factor):
137 |         if math.isnan(factor) or factor <= 0.0:
138 |             return self.zero()
139 |         out = self.zero()
140 |         out.entries = factor * self.entries
141 |         out.cut = self.cut * factor
142 |         return out.specialize()
143 | 
144 |     @inheritdoc(Container)
145 |     def __rmul__(self, factor):
146 |         return self.__mul__(factor)
147 | 
148 |     @inheritdoc(Container)
149 |     def fill(self, datum, weight=1.0):
150 |         self._checkForCrossReferences()
151 | 
152 |         if weight > 0.0:
153 |             w = self.quantity(datum)
154 |             if not isinstance(w, numbers.Real):
155 |                 raise TypeError(f"function return value ({w}) must be boolean or number")
156 |             w *= weight
157 | 
158 |             if w > 0.0:
159 |                 self.cut.fill(datum, w)
160 |             # no possibility of exception from here on out (for rollback)
161 |             self.entries += weight
162 | 
163 |     def _numpy(self, data, weights, shape):
164 |         w = self.quantity(data)
165 |         self._checkNPQuantity(w, shape)
166 |         self._checkNPWeights(weights, shape)
167 |         weights = self._makeNPWeights(weights, shape)
168 | 
169 |         import numpy
170 | 
171 |         w = w * weights
172 |         w[numpy.isnan(w)] = 0.0
173 |         w[w < 0.0] = 0.0
174 | 
175 |         self.cut._numpy(data, w, shape)
176 | 
177 |         # no possibility of exception from here on out (for rollback)
178 |         self.entries += float(weights.sum())
179 | 
180 |     def _sparksql(self, jvm, converter):
181 |         return converter.Select(self.quantity.asSparkSQL(), self.cut._sparksql(jvm, converter))
182 | 
183 |     @property
184 |     def children(self):
185 |         """List of sub-aggregators, to make it possible to walk the tree."""
186 |         return [self.cut]
187 | 
188 |     @inheritdoc(Container)
189 |     def toJsonFragment(self, suppressName):
190 |         return maybeAdd(
191 |             {
192 |                 "entries": floatToJson(self.entries),
193 |                 "sub:type": self.cut.name,
194 |                 "data": self.cut.toJsonFragment(False),
195 |             },
196 |             name=(None if suppressName else self.quantity.name),
197 |         )
198 | 
199 |     @staticmethod
200 |     @inheritdoc(Factory)
201 |     def fromJsonFragment(json, nameFromParent):
202 |         if isinstance(json, dict) and hasKeys(json.keys(), ["entries", "sub:type", "data"], ["name"]):
203 |             if json["entries"] in ("nan", "inf", "-inf") or isinstance(json["entries"], numbers.Real):
204 |                 entries = float(json["entries"])
205 |             else:
206 |                 raise JsonFormatException(json, "Select.entries")
207 | 
208 |             if isinstance(json.get("name", None), basestring):
209 |                 name = json["name"]
210 |             elif json.get("name", None) is None:
211 |                 name = None
212 |             else:
213 |                 raise JsonFormatException(json["name"], "Select.name")
214 | 
215 |             if isinstance(json["sub:type"], basestring):
216 |                 factory = Factory.registered[json["sub:type"]]
217 |             else:
218 |                 raise JsonFormatException(json, "Select.type")
219 | 
220 |             cut = factory.fromJsonFragment(json["data"], None)
221 | 
222 |             out = Select.ed(entries, cut)
223 |             out.quantity.name = nameFromParent if name is None else name
224 |             return out.specialize()
225 | 
226 |         raise JsonFormatException(json, "Select")
227 | 
228 |     def __repr__(self):
229 |         return f"<Select cut={self.cut.name}>"
230 | 
231 |     def __eq__(self, other):
232 |         return isinstance(other, Select) and numeq(self.entries, other.entries) and self.cut == other.cut
233 | 
234 |     def __ne__(self, other):
235 |         return not self == other
236 | 
237 |     def __hash__(self):
238 |         return hash((self.entries, self.cut))
239 | 
240 | 
241 | # extra properties: number of dimensions and datatypes of sub-hists
242 | Select.n_dim = n_dim
243 | Select.datatype = datatype
244 | 
245 | # register extra methods
246 | Factory.register(Select)
247 | 


--------------------------------------------------------------------------------
/histogrammar/primitives/sum.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python
  2 | 
  3 | # Copyright 2016 DIANA-HEP
  4 | #
  5 | # Licensed under the Apache License, Version 2.0 (the "License");
  6 | # you may not use this file except in compliance with the License.
  7 | # You may obtain a copy of the License at
  8 | #
  9 | #     http://www.apache.org/licenses/LICENSE-2.0
 10 | #
 11 | # Unless required by applicable law or agreed to in writing, software
 12 | # distributed under the License is distributed on an "AS IS" BASIS,
 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 14 | # See the License for the specific language governing permissions and
 15 | # limitations under the License.
 16 | 
 17 | import math
 18 | import numbers
 19 | 
 20 | from histogrammar.defs import (
 21 |     Container,
 22 |     ContainerException,
 23 |     Factory,
 24 |     JsonFormatException,
 25 |     identity,
 26 | )
 27 | from histogrammar.util import (
 28 |     basestring,
 29 |     datatype,
 30 |     floatToJson,
 31 |     hasKeys,
 32 |     inheritdoc,
 33 |     maybeAdd,
 34 |     n_dim,
 35 |     numeq,
 36 |     serializable,
 37 | )
 38 | 
 39 | 
 40 | class Sum(Factory, Container):
 41 |     """Accumulate the (weighted) sum of a given quantity, calculated from the data.
 42 | 
 43 |     Sum differs from :doc:`Count <histogrammar.primitives.count.Count>` in that it computes a quantity on the spot,
 44 |     rather than percolating a product of weight metadata from nested primitives. Also unlike weights, the sum can add
 45 |     both positive and negative quantities (weights are always non-negative).
 46 |     """
 47 | 
 48 |     @staticmethod
 49 |     def ed(entries, sum):
 50 |         """Create a Sum that is only capable of being added.
 51 | 
 52 |         Parameters:
 53 |             entries (float): the number of entries.
 54 |             sum (float): the sum.
 55 |         """
 56 |         if not isinstance(entries, numbers.Real) and entries not in (
 57 |             "nan",
 58 |             "inf",
 59 |             "-inf",
 60 |         ):
 61 |             raise TypeError(f"entries ({entries}) must be a number")
 62 |         if not isinstance(sum, numbers.Real) and entries not in ("nan", "inf", "-inf"):
 63 |             raise TypeError(f"sum ({sum}) must be a number")
 64 |         if entries < 0.0:
 65 |             raise ValueError(f"entries ({entries}) cannot be negative")
 66 |         out = Sum(None)
 67 |         out.entries = float(entries)
 68 |         out.sum = float(sum)
 69 |         return out.specialize()
 70 | 
 71 |     @staticmethod
 72 |     def ing(quantity):
 73 |         """Synonym for ``__init__``."""
 74 |         return Sum(quantity)
 75 | 
 76 |     def __init__(self, quantity=identity):
 77 |         """Create a Sum that is capable of being filled and added.
 78 | 
 79 |         Parameters:
 80 |             quantity (function returning float): computes the quantity of interest from the data.
 81 | 
 82 |         Other parameters:
 83 |             entries (float): the number of entries, initially 0.0.
 84 |             sum (float): the running sum, initially 0.0.
 85 |         """
 86 |         self.quantity = serializable(identity(quantity) if isinstance(quantity, str) else quantity)
 87 |         self.entries = 0.0
 88 |         self.sum = 0.0
 89 |         super().__init__()
 90 |         self.specialize()
 91 | 
 92 |     @inheritdoc(Container)
 93 |     def zero(self):
 94 |         return Sum(self.quantity)
 95 | 
 96 |     @inheritdoc(Container)
 97 |     def __add__(self, other):
 98 |         if isinstance(other, Sum):
 99 |             out = Sum(self.quantity)
100 |             out.entries = self.entries + other.entries
101 |             out.sum = self.sum + other.sum
102 |             return out.specialize()
103 |         raise ContainerException(f"cannot add {self.name} and {other.name}")
104 | 
105 |     @inheritdoc(Container)
106 |     def __iadd__(self, other):
107 |         self.entries += other.entries
108 |         self.sum += other.sum
109 |         return self
110 | 
111 |     @inheritdoc(Container)
112 |     def __mul__(self, factor):
113 |         if math.isnan(factor) or factor <= 0.0:
114 |             return self.zero()
115 |         out = self.zero()
116 |         out.entries = factor * self.entries
117 |         out.sum = factor * self.sum
118 |         return out.specialize()
119 | 
120 |     @inheritdoc(Container)
121 |     def __rmul__(self, factor):
122 |         return self.__mul__(factor)
123 | 
124 |     @inheritdoc(Container)
125 |     def fill(self, datum, weight=1.0, method=None):
126 |         self._checkForCrossReferences()
127 | 
128 |         if weight > 0.0:
129 |             q = self.quantity(datum)
130 |             if not isinstance(q, numbers.Real):
131 |                 raise TypeError(f"function return value ({q}) must be boolean or number")
132 | 
133 |             # no possibility of exception from here on out (for rollback)
134 |             self.entries += weight
135 |             self.sum += q * weight
136 | 
137 |     def _numpy(self, data, weights, shape):
138 |         q = self.quantity(data)
139 |         self._checkNPQuantity(q, shape)
140 |         self._checkNPWeights(weights, shape)
141 |         weights = self._makeNPWeights(weights, shape)
142 | 
143 |         # no possibility of exception from here on out (for rollback)
144 |         self.entries += float(weights.sum())
145 | 
146 |         import numpy
147 | 
148 |         selection = numpy.isnan(q)
149 |         numpy.bitwise_not(selection, selection)
150 |         numpy.bitwise_and(selection, weights > 0.0, selection)
151 |         q = q[selection]
152 |         weights = weights[selection]
153 |         q = q * weights
154 | 
155 |         self.sum += float(q.sum())
156 | 
157 |     def _sparksql(self, jvm, converter):
158 |         return converter.Sum(self.quantity.asSparkSQL())
159 | 
160 |     @property
161 |     def children(self):
162 |         """List of sub-aggregators, to make it possible to walk the tree."""
163 |         return []
164 | 
165 |     @inheritdoc(Container)
166 |     def toJsonFragment(self, suppressName):
167 |         return maybeAdd(
168 |             {"entries": floatToJson(self.entries), "sum": floatToJson(self.sum)},
169 |             name=(None if suppressName else self.quantity.name),
170 |         )
171 | 
172 |     @staticmethod
173 |     @inheritdoc(Factory)
174 |     def fromJsonFragment(json, nameFromParent):
175 |         if isinstance(json, dict) and hasKeys(json.keys(), ["entries", "sum"], ["name"]):
176 |             if json["entries"] in ("nan", "inf", "-inf") or isinstance(json["entries"], numbers.Real):
177 |                 entries = float(json["entries"])
178 |             else:
179 |                 raise JsonFormatException(json["entries"], "Sum.entries")
180 | 
181 |             if isinstance(json.get("name", None), basestring):
182 |                 name = json["name"]
183 |             elif json.get("name", None) is None:
184 |                 name = None
185 |             else:
186 |                 raise JsonFormatException(json["name"], "Sum.name")
187 | 
188 |             if json["sum"] in ("nan", "inf", "-inf") or isinstance(json["sum"], numbers.Real):
189 |                 sum = float(json["sum"])
190 |             else:
191 |                 raise JsonFormatException(json["sum"], "Sum.sum")
192 | 
193 |             out = Sum.ed(entries, sum)
194 |             out.quantity.name = nameFromParent if name is None else name
195 |             return out.specialize()
196 | 
197 |         raise JsonFormatException(json, "Sum")
198 | 
199 |     def __repr__(self):
200 |         return f"<Sum sum={self.sum}>"
201 | 
202 |     def __eq__(self, other):
203 |         return (
204 |             isinstance(other, Sum)
205 |             and self.quantity == other.quantity
206 |             and numeq(self.entries, other.entries)
207 |             and numeq(self.sum, other.sum)
208 |         )
209 | 
210 |     def __ne__(self, other):
211 |         return not self == other
212 | 
213 |     def __hash__(self):
214 |         return hash((self.quantity, self.entries, self.sum))
215 | 
216 | 
217 | # extra properties: number of dimensions and datatypes of sub-hists
218 | Sum.n_dim = n_dim
219 | Sum.datatype = datatype
220 | 
221 | # register extra methods
222 | Factory.register(Sum)
223 | 


--------------------------------------------------------------------------------
/histogrammar/resources.py:
--------------------------------------------------------------------------------
 1 | # Copyright (c) 2020 ING Wholesale Banking Advanced Analytics
 2 | #
 3 | # Permission is hereby granted, free of charge, to any person obtaining a copy of
 4 | # this software and associated documentation files (the "Software"), to deal in
 5 | # the Software without restriction, including without limitation the rights to
 6 | # use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of
 7 | # the Software, and to permit persons to whom the Software is furnished to do so,
 8 | # subject to the following conditions:
 9 | #
10 | # The above copyright notice and this permission notice shall be included in all
11 | # copies or substantial portions of the Software.
12 | #
13 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
14 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS
15 | # FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
16 | # COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER
17 | # IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
18 | # CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
19 | 
20 | 
21 | # Resources lookup file for histogrammar
22 | from importlib import resources
23 | 
24 | from histogrammar import notebooks, test_data
25 | 
26 | # data files that are shipped with histogrammar.
27 | _DATA = {_.name: _ for _ in resources.files(test_data).iterdir()}
28 | 
29 | # Tutorial notebooks
30 | _NOTEBOOK = {p.name: p for p in resources.files(notebooks).iterdir() if p.suffix == ".ipynb"}
31 | 
32 | # Resource types
33 | _RESOURCES = {"data": _DATA, "notebook": _NOTEBOOK}
34 | 
35 | 
36 | def _resource(resource_type, name: str) -> str:
37 |     """Return the full path filename of a resource.
38 | 
39 |     :param str resource_type: The type of the resource.
40 |     :param str  name: The name of the resource.
41 |     :returns: The full path filename of the fixture data set.
42 |     :rtype: str
43 |     :raises FileNotFoundError: If the resource cannot be found.
44 |     """
45 |     full_path = _RESOURCES[resource_type].get(name, None)
46 | 
47 |     if full_path and full_path.exists():
48 |         return str(full_path)
49 | 
50 |     raise FileNotFoundError(f'Could not find {resource_type} "{name!s}"! Does it exist?')
51 | 
52 | 
53 | def data(name: str) -> str:
54 |     """Return the full path filename of a shipped data file.
55 | 
56 |     :param str name: The name of the data.
57 |     :returns: The full path filename of the data.
58 |     :rtype: str
59 |     :raises FileNotFoundError: If the data cannot be found.
60 |     """
61 |     return _resource("data", name)
62 | 
63 | 
64 | def notebook(name: str) -> str:
65 |     """Return the full path filename of a tutorial notebook.
66 | 
67 |     :param str name: The name of the notebook.
68 |     :returns: The full path filename of the notebook.
69 |     :rtype: str
70 |     :raises FileNotFoundError: If the notebook cannot be found.
71 |     """
72 |     return _resource("notebook", name)
73 | 


--------------------------------------------------------------------------------
/histogrammar/sparksql.py:
--------------------------------------------------------------------------------
1 | # flake8: noqa
2 | 
3 | # MB 20210131: moved to histogrammar/dfinterface/sparksql.py. Imported here (for now) for bkw compatibility
4 | from .dfinterface.addmethods import add_sparksql_methods as addMethods
5 | 


--------------------------------------------------------------------------------
/histogrammar/test_data/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/histogrammar/histogrammar-python/642d89dfbe34f1065d215a369ec153ec11ed4c2e/histogrammar/test_data/__init__.py


--------------------------------------------------------------------------------
/histogrammar/test_data/test.csv.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/histogrammar/histogrammar-python/642d89dfbe34f1065d215a369ec153ec11ed4c2e/histogrammar/test_data/test.csv.gz


--------------------------------------------------------------------------------
/histogrammar/version.py:
--------------------------------------------------------------------------------
 1 | """THIS FILE IS AUTO-GENERATED BY SETUP.PY."""
 2 | 
 3 | import re
 4 | 
 5 | version = "1.1.0"
 6 | 
 7 | 
 8 | def split_version_string(version_string: str) -> tuple[int, int]:
 9 |     version_numbers = list(map(int, re.split(r"[-.]", version_string)))
10 |     return version_numbers[0], version_numbers[1]
11 | 
12 | 
13 | specification = ".".join([str(i) for i in split_version_string(version)[:2]])
14 | 
15 | 
16 | def compatible(serialized_version: str) -> bool:
17 |     self_major, self_minor = split_version_string(version)
18 |     other_major, other_minor = split_version_string(serialized_version)
19 | 
20 |     return bool(self_major >= other_major or self_minor >= other_minor)
21 | 


--------------------------------------------------------------------------------
/makedocs.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | 
 3 | # Copyright 2016 DIANA-HEP
 4 | #
 5 | # Licensed under the Apache License, Version 2.0 (the "License");
 6 | # you may not use this file except in compliance with the License.
 7 | # You may obtain a copy of the License at
 8 | #
 9 | #     http://www.apache.org/licenses/LICENSE-2.0
10 | #
11 | # Unless required by applicable law or agreed to in writing, software
12 | # distributed under the License is distributed on an "AS IS" BASIS,
13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | # See the License for the specific language governing permissions and
15 | # limitations under the License.
16 | 
17 | import importlib
18 | import inspect
19 | from pathlib import Path
20 | 
21 | modules = [
22 |     "histogrammar.defs",
23 |     "histogrammar.specialized",
24 |     "histogrammar.util",
25 |     "histogrammar.version",
26 |     "histogrammar.primitives.average",
27 |     "histogrammar.primitives.bag",
28 |     "histogrammar.primitives.bin",
29 |     "histogrammar.primitives.categorize",
30 |     "histogrammar.primitives.centrallybin",
31 |     "histogrammar.primitives.collection",
32 |     "histogrammar.primitives.count",
33 |     "histogrammar.primitives.deviate",
34 |     "histogrammar.primitives.fraction",
35 |     "histogrammar.primitives.irregularlybin",
36 |     "histogrammar.primitives.minmax",
37 |     "histogrammar.primitives.select",
38 |     "histogrammar.primitives.sparselybin",
39 |     "histogrammar.primitives.stack",
40 |     "histogrammar.primitives.sum",
41 |     "histogrammar.plot.bokeh",
42 |     "histogrammar.plot.root",
43 | ]
44 | 
45 | modules = {name: importlib.import_module(name) for name in modules}
46 | 
47 | documented = []
48 | for moduleName, module in modules.items():
49 |     for objName in dir(module):
50 |         obj = getattr(module, objName)
51 |         if not objName.startswith("_") and callable(obj) and obj.__module__ == moduleName:
52 |             print(objName, obj)
53 |             documented.append(moduleName + "." + objName)
54 |             path = Path("docs/" + moduleName + "." + objName + ".rst")
55 |             if inspect.isclass(obj):
56 |                 path.write_text(
57 |                     """:orphan:
58 | 
59 | {0}
60 | {1}
61 | 
62 | .. autoclass:: {0}
63 |     :members:
64 |     :special-members: __init__, __add__
65 |     :inherited-members:
66 |     :show-inheritance:
67 | """.format(
68 |                         moduleName + "." + objName,
69 |                         "=" * (len(moduleName) + len(objName) + 1),
70 |                     )
71 |                 )
72 |             else:
73 |                 path.write_text(
74 |                     """:orphan:
75 | 
76 | {0}
77 | {1}
78 | 
79 | .. autofunction:: {0}
80 | """.format(
81 |                         moduleName + "." + objName,
82 |                         "=" * (len(moduleName) + len(objName) + 1),
83 |                     )
84 |                 )
85 | 


--------------------------------------------------------------------------------
/pyproject.toml:
--------------------------------------------------------------------------------
  1 | [build-system]
  2 | requires = ["setuptools"]
  3 | build-backend = "setuptools.build_meta"
  4 | 
  5 | [project]
  6 | name = "histogrammar"
  7 | description = "Composable histogram primitives for distributed data reduction"
  8 | keywords = [
  9 |     "pandas",
 10 |     "spark",
 11 |     "data-science",
 12 |     "data-analysis",
 13 |     "statistics",
 14 |     "python",
 15 |     "jupyter",
 16 |     "ipython"
 17 | ]
 18 | readme = "README.rst"
 19 | requires-python = ">=3.9"
 20 | authors = [{ name = "Jim Pivarski (DIANA-HEP)", email = "pivarski@fnal.gov" }, { name = "Max Baak", email = "maxbaak@gmail.com" }]
 21 | maintainers = [{ name = "Max Baak", email = "maxbaak@gmail.com" }]
 22 | license = { type = "Apache Software License v2", file = "LICENSE" }
 23 | dependencies = [
 24 |     "numpy",
 25 |     "tqdm",
 26 |     "joblib>=0.14.0"
 27 | ]
 28 | classifiers = ["Development Status :: 5 - Production/Stable",
 29 |     "Environment :: Console",
 30 |     "Intended Audience :: Science/Research",
 31 |     "License :: OSI Approved :: Apache Software License",
 32 |     "Topic :: Scientific/Engineering :: Information Analysis",
 33 |     "Topic :: Scientific/Engineering :: Mathematics",
 34 |     "Topic :: Scientific/Engineering :: Physics",
 35 | ]
 36 | dynamic = ["version"]
 37 | 
 38 | [project.optional-dependencies]
 39 | pandas = [
 40 |     "pandas"
 41 | ]
 42 | spark = [
 43 |     "pyspark>=3.1; python_version <= '3.11'",
 44 | ]
 45 | test = [
 46 |     "ipykernel>=5.1.3",
 47 |     "jupyter_client>=5.2.3",
 48 |     "matplotlib",
 49 |     "pandas",
 50 |     "pre-commit>=2.9.0",
 51 |     "pytest-notebook>=0.6.1",
 52 |     "pytest>=4.0.2",
 53 | ]
 54 | test_numpy_pre2 = [
 55 |     "numpy<2",
 56 |     "pandas<2",
 57 | ]
 58 | 
 59 | # files to be shipped with the installation, under: histogrammar/test_data and histogrammar/notebooks
 60 | # after installation, these can be found with the functions in resources.py
 61 | [tool.setuptools.package-data]
 62 | histogrammar = [
 63 |     "test_data/*.csv.gz",
 64 |     "test_data/*.json*",
 65 |     "notebooks/*tutorial*.ipynb",
 66 | ]
 67 | 
 68 | [project.urls]
 69 | repository = "https://github.com/histogrammar/histogrammar-python"
 70 | 
 71 | [tool.semantic_release]
 72 | version_variable = [
 73 |     "histogrammar/version.py:version",
 74 | ]
 75 | build_command = "pip install build && python -m build"
 76 | 
 77 | [tool.setuptools.dynamic]
 78 | version = { attr = "histogrammar.version.version" }
 79 | 
 80 | [tool.ruff]
 81 | line-length = 120
 82 | 
 83 | [tool.ruff.lint]
 84 | extend-select = [
 85 |     "E", # pyflakes `E`
 86 |     "W", # pyflakes `W`
 87 |     "I", # isort
 88 |     "UP", # pyupgrade
 89 |     "D212", # pydocstyle
 90 |     "D411", # pydocstyle
 91 |     "C4", # flake8-comprehensions
 92 |     "FA", # flake8-future-annotations
 93 |     "PIE", # flake8-pie
 94 |     "RET", # flake8-return
 95 |     "SIM", # flake8-simplify
 96 |     "TCH", # flake8-type-checking
 97 |     "PTH", # flake8-use-pathlib
 98 |     "PL", # pylint
 99 |     "FURB", # refurb
100 | ]
101 | ignore = [
102 |     "EM102", # f string in exception
103 |     "PLR0913", # Too many arguments in function definition
104 |     "PLR2004", # Magic value used in comparison
105 |     "PLR1722", # Use `sys.exit()` instead of `exit`
106 |     "PLR0915", # Too many statements (61 > 50)
107 |     "UP038", # Use `X | Y` in `isinstance` call instead of `(X, Y)`
108 |     "TCH003", # Move standard library import
109 |     "TCH002", # Move third-party import `airflow.models.param.Param`
110 |     "PLR0912", # Too many branches
111 | ]
112 | 


--------------------------------------------------------------------------------
/tests/__init__.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | 
 3 | # Copyright 2016 DIANA-HEP
 4 | #
 5 | # Licensed under the Apache License, Version 2.0 (the "License");
 6 | # you may not use this file except in compliance with the License.
 7 | # You may obtain a copy of the License at
 8 | #
 9 | #     http://www.apache.org/licenses/LICENSE-2.0
10 | #
11 | # Unless required by applicable law or agreed to in writing, software
12 | # distributed under the License is distributed on an "AS IS" BASIS,
13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | # See the License for the specific language governing permissions and
15 | # limitations under the License.
16 | 


--------------------------------------------------------------------------------
/tests/conftest.py:
--------------------------------------------------------------------------------
 1 | from decimal import Decimal
 2 | from json import load
 3 | from pathlib import Path
 4 | 
 5 | import numpy as np
 6 | import pandas as pd
 7 | import pytest
 8 | 
 9 | from histogrammar import resources
10 | 
11 | 
12 | def get_comparer_data():
13 |     test_comparer_df = {}
14 |     df = pd.DataFrame(
15 |         data={
16 |             "mae": [0.1, 0.11, 0.12, 0.2, 0.09],
17 |             "mse": [0.1, 0.1, 0.1, 0.1, 0.1],
18 |             "date": [2000, 2001, 2002, 2003, 2004],
19 |         }
20 |     )
21 |     df.set_index("date", inplace=True)
22 |     test_comparer_df["the_feature"] = df
23 | 
24 |     df = pd.DataFrame(
25 |         data={
26 |             "mae": [0.1, 0.11, 0.12, 0.2, 0.09],
27 |             "date": [2000, 2001, 2002, 2003, 2004],
28 |         }
29 |     )
30 |     df.set_index("date", inplace=True)
31 |     test_comparer_df["dummy_feature"] = df
32 | 
33 |     return test_comparer_df
34 | 
35 | 
36 | def get_ref_comparer_data():
37 |     ref_data = pd.DataFrame()
38 |     # we do not add "mse_std" on purpose to have some noise in the data
39 |     ref_data["metric"] = ["mae_mean", "mae_std", "mae_pull", "mse_mean"]
40 |     ref_data["value"] = [0.124, 0.0376, 0.0376, 0.09]
41 |     ref_data["feature"] = "the_feature"
42 |     ref_data["date"] = np.arange(ref_data.shape[0]) + 2010
43 | 
44 |     return ref_data
45 | 
46 | 
47 | def pytest_configure():
48 |     # attach common test data
49 |     pytest.test_comparer_df = get_comparer_data()
50 |     pytest.test_ref_comparer_df = get_ref_comparer_data()
51 | 
52 |     parent_path = Path(__file__).parent
53 |     TEMPLATE_PATH = parent_path / "resources"
54 |     CSV_FILE = "test.csv.gz"
55 | 
56 |     with (TEMPLATE_PATH / "age.json").open() as f:
57 |         pytest.age = load(f)
58 | 
59 |     with (TEMPLATE_PATH / "company.json").open() as f:
60 |         pytest.company = load(f)
61 | 
62 |     with (TEMPLATE_PATH / "date.json").open() as f:
63 |         pytest.date = load(f)
64 | 
65 |     with (TEMPLATE_PATH / "eyesColor.json").open() as f:
66 |         pytest.eyesColor = load(f)
67 | 
68 |     with (TEMPLATE_PATH / "gender.json").open() as f:
69 |         pytest.gender = load(f)
70 | 
71 |     with (TEMPLATE_PATH / "isActive.json").open() as f:
72 |         pytest.isActive = load(f)
73 | 
74 |     with (TEMPLATE_PATH / "isActive_age.json").open() as f:
75 |         pytest.isActive_age = load(f)
76 | 
77 |     with (TEMPLATE_PATH / "latitude.json").open() as f:
78 |         pytest.latitude = load(f)
79 | 
80 |     with (TEMPLATE_PATH / "longitude.json").open() as f:
81 |         pytest.longitude = load(f)
82 | 
83 |     with (TEMPLATE_PATH / "latitude_longitude.json").open() as f:
84 |         pytest.latitude_longitude = load(f)
85 | 
86 |     with (TEMPLATE_PATH / "transaction.json").open() as f:
87 |         pytest.transaction = load(f)
88 | 
89 |     df = pd.read_csv(resources.data(CSV_FILE))
90 |     df["date"] = pd.to_datetime(df["date"])
91 | 
92 |     # Decimal type
93 |     df["amount"] = df["balance"].str.replace("$", "", regex=False).str.replace(",", "", regex=False).apply(Decimal)
94 | 
95 |     pytest.test_df = df
96 | 


--------------------------------------------------------------------------------
/tests/jars/histogrammar-sparksql_2.11-1.0.11.jar:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/histogrammar/histogrammar-python/642d89dfbe34f1065d215a369ec153ec11ed4c2e/tests/jars/histogrammar-sparksql_2.11-1.0.11.jar


--------------------------------------------------------------------------------
/tests/jars/histogrammar-sparksql_2.11-1.0.20.jar:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/histogrammar/histogrammar-python/642d89dfbe34f1065d215a369ec153ec11ed4c2e/tests/jars/histogrammar-sparksql_2.11-1.0.20.jar


--------------------------------------------------------------------------------
/tests/jars/histogrammar-sparksql_2.12-1.0.11.jar:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/histogrammar/histogrammar-python/642d89dfbe34f1065d215a369ec153ec11ed4c2e/tests/jars/histogrammar-sparksql_2.12-1.0.11.jar


--------------------------------------------------------------------------------
/tests/jars/histogrammar-sparksql_2.12-1.0.20.jar:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/histogrammar/histogrammar-python/642d89dfbe34f1065d215a369ec153ec11ed4c2e/tests/jars/histogrammar-sparksql_2.12-1.0.20.jar


--------------------------------------------------------------------------------
/tests/jars/histogrammar_2.11-1.0.11.jar:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/histogrammar/histogrammar-python/642d89dfbe34f1065d215a369ec153ec11ed4c2e/tests/jars/histogrammar_2.11-1.0.11.jar


--------------------------------------------------------------------------------
/tests/jars/histogrammar_2.11-1.0.20.jar:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/histogrammar/histogrammar-python/642d89dfbe34f1065d215a369ec153ec11ed4c2e/tests/jars/histogrammar_2.11-1.0.20.jar


--------------------------------------------------------------------------------
/tests/jars/histogrammar_2.12-1.0.11.jar:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/histogrammar/histogrammar-python/642d89dfbe34f1065d215a369ec153ec11ed4c2e/tests/jars/histogrammar_2.12-1.0.11.jar


--------------------------------------------------------------------------------
/tests/jars/histogrammar_2.12-1.0.20.jar:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/histogrammar/histogrammar-python/642d89dfbe34f1065d215a369ec153ec11ed4c2e/tests/jars/histogrammar_2.12-1.0.20.jar


--------------------------------------------------------------------------------
/tests/resources/age.json:
--------------------------------------------------------------------------------
 1 | {"data": {"binWidth": 1.0,
 2 |           "bins": {"10": 5.0,
 3 |                    "11": 3.0,
 4 |                    "12": 4.0,
 5 |                    "13": 5.0,
 6 |                    "14": 6.0,
 7 |                    "15": 5.0,
 8 |                    "16": 6.0,
 9 |                    "17": 5.0,
10 |                    "18": 4.0,
11 |                    "19": 6.0,
12 |                    "20": 6.0,
13 |                    "21": 5.0,
14 |                    "22": 3.0,
15 |                    "23": 5.0,
16 |                    "24": 5.0,
17 |                    "25": 5.0,
18 |                    "26": 6.0,
19 |                    "27": 3.0,
20 |                    "28": 3.0,
21 |                    "29": 2.0,
22 |                    "30": 7.0,
23 |                    "31": 7.0,
24 |                    "32": 13.0,
25 |                    "33": 4.0,
26 |                    "34": 5.0,
27 |                    "35": 1.0,
28 |                    "36": 2.0,
29 |                    "37": 9.0,
30 |                    "38": 3.0,
31 |                    "39": 4.0,
32 |                    "40": 2.0,
33 |                    "41": 10.0,
34 |                    "42": 5.0,
35 |                    "43": 6.0,
36 |                    "44": 8.0,
37 |                    "45": 6.0,
38 |                    "46": 4.0,
39 |                    "47": 2.0,
40 |                    "48": 9.0,
41 |                    "49": 4.0,
42 |                    "50": 5.0,
43 |                    "51": 7.0,
44 |                    "52": 6.0,
45 |                    "53": 6.0,
46 |                    "54": 11.0,
47 |                    "55": 4.0,
48 |                    "56": 7.0,
49 |                    "57": 8.0,
50 |                    "58": 10.0,
51 |                    "59": 3.0,
52 |                    "60": 6.0,
53 |                    "61": 3.0,
54 |                    "62": 5.0,
55 |                    "63": 7.0,
56 |                    "64": 5.0,
57 |                    "65": 2.0,
58 |                    "66": 9.0,
59 |                    "67": 7.0,
60 |                    "68": 4.0,
61 |                    "69": 4.0,
62 |                    "70": 5.0,
63 |                    "71": 2.0,
64 |                    "72": 3.0,
65 |                    "73": 5.0,
66 |                    "74": 1.0,
67 |                    "75": 2.0,
68 |                    "76": 2.0,
69 |                    "77": 4.0,
70 |                    "78": 7.0,
71 |                    "79": 3.0,
72 |                    "80": 5.0,
73 |                    "81": 3.0,
74 |                    "82": 2.0,
75 |                    "83": 2.0,
76 |                    "84": 5.0,
77 |                    "85": 5.0,
78 |                    "86": 4.0,
79 |                    "87": 4.0,
80 |                    "88": 6.0,
81 |                    "89": 6.0,
82 |                    "90": 2.0},
83 |           "bins:type": "Count",
84 |           "entries": 400.0,
85 |           "nanflow": 0.0,
86 |           "nanflow:type": "Count",
87 |           "origin": 0.0},
88 |  "type": "SparselyBin",
89 |  "version": "1.1"}


--------------------------------------------------------------------------------
/tests/resources/date.json:
--------------------------------------------------------------------------------
 1 | {"data": {"binWidth": 2592000000000000,
 2 |           "bins": {"60": 9.0,
 3 |                    "61": 37.0,
 4 |                    "62": 42.0,
 5 |                    "63": 22.0,
 6 |                    "64": 28.0,
 7 |                    "65": 34.0,
 8 |                    "66": 34.0,
 9 |                    "67": 25.0,
10 |                    "68": 28.0,
11 |                    "69": 38.0,
12 |                    "70": 45.0,
13 |                    "71": 45.0,
14 |                    "72": 13.0},
15 |           "bins:type": "Count",
16 |           "entries": 400.0,
17 |           "nanflow": 0.0,
18 |           "nanflow:type": "Count",
19 |           "origin": 1262563200000000000},
20 |  "type": "SparselyBin",
21 |  "version": "1.1"}


--------------------------------------------------------------------------------
/tests/resources/eyesColor.json:
--------------------------------------------------------------------------------
1 | {"data": {"bins": {"blue": 77.0,
2 |                    "brown": 71.0,
3 |                    "green": 82.0,
4 |                    "grey": 76.0,
5 |                    "red": 94.0},
6 |           "bins:type": "Count",
7 |           "entries": 400.0},
8 |  "type": "Categorize",
9 |  "version": "1.1"}


--------------------------------------------------------------------------------
/tests/resources/gender.json:
--------------------------------------------------------------------------------
1 | {"data": {"bins": {"female": 191.0, "male": 209.0},
2 |           "bins:type": "Count",
3 |           "entries": 400.0},
4 |  "type": "Categorize",
5 |  "version": "1.1"}


--------------------------------------------------------------------------------
/tests/resources/isActive.json:
--------------------------------------------------------------------------------
1 | {"data": {"bins": {"False": 208.0, "True": 192.0},
2 |           "bins:type": "Count",
3 |           "entries": 400.0},
4 |  "type": "Categorize",
5 |  "version": "1.1"}


--------------------------------------------------------------------------------
/tests/resources/isActive_age.json:
--------------------------------------------------------------------------------
  1 | {"data": {"bins": {"False": {"binWidth": 1.0,
  2 |                              "bins": {"10": 2.0,
  3 |                                       "11": 1.0,
  4 |                                       "13": 2.0,
  5 |                                       "14": 2.0,
  6 |                                       "15": 1.0,
  7 |                                       "16": 2.0,
  8 |                                       "17": 3.0,
  9 |                                       "18": 2.0,
 10 |                                       "19": 3.0,
 11 |                                       "20": 5.0,
 12 |                                       "21": 3.0,
 13 |                                       "22": 1.0,
 14 |                                       "23": 2.0,
 15 |                                       "24": 3.0,
 16 |                                       "25": 2.0,
 17 |                                       "26": 3.0,
 18 |                                       "28": 3.0,
 19 |                                       "29": 1.0,
 20 |                                       "30": 3.0,
 21 |                                       "31": 2.0,
 22 |                                       "32": 7.0,
 23 |                                       "33": 3.0,
 24 |                                       "34": 3.0,
 25 |                                       "36": 2.0,
 26 |                                       "37": 3.0,
 27 |                                       "38": 2.0,
 28 |                                       "39": 2.0,
 29 |                                       "41": 8.0,
 30 |                                       "42": 1.0,
 31 |                                       "43": 6.0,
 32 |                                       "44": 5.0,
 33 |                                       "45": 2.0,
 34 |                                       "46": 2.0,
 35 |                                       "47": 1.0,
 36 |                                       "48": 2.0,
 37 |                                       "50": 4.0,
 38 |                                       "51": 4.0,
 39 |                                       "52": 3.0,
 40 |                                       "53": 6.0,
 41 |                                       "54": 6.0,
 42 |                                       "55": 3.0,
 43 |                                       "56": 5.0,
 44 |                                       "58": 4.0,
 45 |                                       "59": 2.0,
 46 |                                       "60": 2.0,
 47 |                                       "61": 2.0,
 48 |                                       "62": 3.0,
 49 |                                       "63": 3.0,
 50 |                                       "64": 4.0,
 51 |                                       "65": 1.0,
 52 |                                       "66": 6.0,
 53 |                                       "67": 4.0,
 54 |                                       "68": 1.0,
 55 |                                       "69": 2.0,
 56 |                                       "70": 4.0,
 57 |                                       "72": 2.0,
 58 |                                       "73": 1.0,
 59 |                                       "74": 1.0,
 60 |                                       "75": 2.0,
 61 |                                       "76": 2.0,
 62 |                                       "77": 2.0,
 63 |                                       "78": 3.0,
 64 |                                       "79": 3.0,
 65 |                                       "80": 4.0,
 66 |                                       "81": 3.0,
 67 |                                       "82": 2.0,
 68 |                                       "84": 2.0,
 69 |                                       "85": 2.0,
 70 |                                       "86": 3.0,
 71 |                                       "87": 4.0,
 72 |                                       "88": 5.0,
 73 |                                       "89": 2.0,
 74 |                                       "90": 1.0},
 75 |                              "bins:type": "Count",
 76 |                              "entries": 208.0,
 77 |                              "nanflow": 0.0,
 78 |                              "nanflow:type": "Count",
 79 |                              "origin": 0.0},
 80 |                    "True": {"binWidth": 1.0,
 81 |                             "bins": {"10": 3.0,
 82 |                                      "11": 2.0,
 83 |                                      "12": 4.0,
 84 |                                      "13": 3.0,
 85 |                                      "14": 4.0,
 86 |                                      "15": 4.0,
 87 |                                      "16": 4.0,
 88 |                                      "17": 2.0,
 89 |                                      "18": 2.0,
 90 |                                      "19": 3.0,
 91 |                                      "20": 1.0,
 92 |                                      "21": 2.0,
 93 |                                      "22": 2.0,
 94 |                                      "23": 3.0,
 95 |                                      "24": 2.0,
 96 |                                      "25": 3.0,
 97 |                                      "26": 3.0,
 98 |                                      "27": 3.0,
 99 |                                      "29": 1.0,
100 |                                      "30": 4.0,
101 |                                      "31": 5.0,
102 |                                      "32": 6.0,
103 |                                      "33": 1.0,
104 |                                      "34": 2.0,
105 |                                      "35": 1.0,
106 |                                      "37": 6.0,
107 |                                      "38": 1.0,
108 |                                      "39": 2.0,
109 |                                      "40": 2.0,
110 |                                      "41": 2.0,
111 |                                      "42": 4.0,
112 |                                      "44": 3.0,
113 |                                      "45": 4.0,
114 |                                      "46": 2.0,
115 |                                      "47": 1.0,
116 |                                      "48": 7.0,
117 |                                      "49": 4.0,
118 |                                      "50": 1.0,
119 |                                      "51": 3.0,
120 |                                      "52": 3.0,
121 |                                      "54": 5.0,
122 |                                      "55": 1.0,
123 |                                      "56": 2.0,
124 |                                      "57": 8.0,
125 |                                      "58": 6.0,
126 |                                      "59": 1.0,
127 |                                      "60": 4.0,
128 |                                      "61": 1.0,
129 |                                      "62": 2.0,
130 |                                      "63": 4.0,
131 |                                      "64": 1.0,
132 |                                      "65": 1.0,
133 |                                      "66": 3.0,
134 |                                      "67": 3.0,
135 |                                      "68": 3.0,
136 |                                      "69": 2.0,
137 |                                      "70": 1.0,
138 |                                      "71": 2.0,
139 |                                      "72": 1.0,
140 |                                      "73": 4.0,
141 |                                      "77": 2.0,
142 |                                      "78": 4.0,
143 |                                      "80": 1.0,
144 |                                      "83": 2.0,
145 |                                      "84": 3.0,
146 |                                      "85": 3.0,
147 |                                      "86": 1.0,
148 |                                      "88": 1.0,
149 |                                      "89": 4.0,
150 |                                      "90": 1.0},
151 |                             "bins:type": "Count",
152 |                             "entries": 192.0,
153 |                             "nanflow": 0.0,
154 |                             "nanflow:type": "Count",
155 |                             "origin": 0.0}},
156 |           "bins:type": "SparselyBin",
157 |           "entries": 400.0},
158 |  "type": "Categorize",
159 |  "version": "1.1"}


--------------------------------------------------------------------------------
/tests/resources/latitude.json:
--------------------------------------------------------------------------------
 1 | {"data": {"binWidth": 5,
 2 |           "bins": {"-1": 15.0,
 3 |                    "-10": 10.0,
 4 |                    "-11": 11.0,
 5 |                    "-12": 12.0,
 6 |                    "-13": 23.0,
 7 |                    "-14": 5.0,
 8 |                    "-15": 12.0,
 9 |                    "-16": 8.0,
10 |                    "-17": 12.0,
11 |                    "-18": 5.0,
12 |                    "-2": 8.0,
13 |                    "-3": 8.0,
14 |                    "-4": 12.0,
15 |                    "-5": 9.0,
16 |                    "-6": 7.0,
17 |                    "-7": 10.0,
18 |                    "-8": 13.0,
19 |                    "-9": 13.0,
20 |                    "0": 6.0,
21 |                    "1": 11.0,
22 |                    "10": 13.0,
23 |                    "11": 10.0,
24 |                    "12": 12.0,
25 |                    "13": 13.0,
26 |                    "14": 12.0,
27 |                    "15": 9.0,
28 |                    "16": 13.0,
29 |                    "17": 8.0,
30 |                    "2": 18.0,
31 |                    "3": 13.0,
32 |                    "4": 9.0,
33 |                    "5": 13.0,
34 |                    "6": 7.0,
35 |                    "7": 10.0,
36 |                    "8": 18.0,
37 |                    "9": 12.0},
38 |           "bins:type": "Count",
39 |           "entries": 400.0,
40 |           "nanflow": 0.0,
41 |           "nanflow:type": "Count",
42 |           "origin": 0},
43 |  "type": "SparselyBin",
44 |  "version": "1.1"}


--------------------------------------------------------------------------------
/tests/resources/longitude.json:
--------------------------------------------------------------------------------
 1 | 
 2 | {"data": {"binWidth": 5,
 3 |           "bins": {"-1": 7.0,
 4 |                    "-10": 9.0,
 5 |                    "-11": 6.0,
 6 |                    "-12": 4.0,
 7 |                    "-13": 9.0,
 8 |                    "-14": 5.0,
 9 |                    "-15": 9.0,
10 |                    "-16": 5.0,
11 |                    "-17": 4.0,
12 |                    "-18": 7.0,
13 |                    "-19": 11.0,
14 |                    "-2": 3.0,
15 |                    "-20": 6.0,
16 |                    "-21": 3.0,
17 |                    "-22": 5.0,
18 |                    "-23": 6.0,
19 |                    "-24": 6.0,
20 |                    "-25": 5.0,
21 |                    "-26": 11.0,
22 |                    "-27": 5.0,
23 |                    "-28": 4.0,
24 |                    "-29": 4.0,
25 |                    "-3": 7.0,
26 |                    "-30": 7.0,
27 |                    "-31": 4.0,
28 |                    "-32": 2.0,
29 |                    "-33": 4.0,
30 |                    "-34": 8.0,
31 |                    "-35": 2.0,
32 |                    "-36": 2.0,
33 |                    "-4": 5.0,
34 |                    "-5": 6.0,
35 |                    "-6": 3.0,
36 |                    "-7": 7.0,
37 |                    "-8": 2.0,
38 |                    "-9": 5.0,
39 |                    "0": 5.0,
40 |                    "1": 12.0,
41 |                    "10": 6.0,
42 |                    "11": 11.0,
43 |                    "12": 3.0,
44 |                    "13": 4.0,
45 |                    "14": 7.0,
46 |                    "15": 4.0,
47 |                    "16": 5.0,
48 |                    "17": 6.0,
49 |                    "18": 5.0,
50 |                    "19": 3.0,
51 |                    "2": 4.0,
52 |                    "20": 9.0,
53 |                    "21": 7.0,
54 |                    "22": 7.0,
55 |                    "23": 5.0,
56 |                    "24": 5.0,
57 |                    "25": 3.0,
58 |                    "26": 4.0,
59 |                    "27": 4.0,
60 |                    "28": 5.0,
61 |                    "29": 5.0,
62 |                    "3": 6.0,
63 |                    "30": 5.0,
64 |                    "31": 9.0,
65 |                    "32": 6.0,
66 |                    "33": 4.0,
67 |                    "34": 3.0,
68 |                    "35": 4.0,
69 |                    "4": 13.0,
70 |                    "5": 5.0,
71 |                    "6": 6.0,
72 |                    "7": 6.0,
73 |                    "8": 3.0,
74 |                    "9": 3.0},
75 |           "bins:type": "Count",
76 |           "entries": 400.0,
77 |           "nanflow": 0.0,
78 |           "nanflow:type": "Count",
79 |           "origin": 0},
80 |  "type": "SparselyBin",
81 |  "version": "1.1"}


--------------------------------------------------------------------------------
/tests/resources/transaction.json:
--------------------------------------------------------------------------------
  1 | {
  2 |     "type": "Bin",
  3 |     "data": {
  4 |         "low": -2000.0,
  5 |         "high": 2000.0,
  6 |         "entries": 400.0,
  7 |         "values:type": "Count",
  8 |         "values": [
  9 |             0.0,
 10 |             0.0,
 11 |             0.0,
 12 |             0.0,
 13 |             0.0,
 14 |             0.0,
 15 |             0.0,
 16 |             0.0,
 17 |             0.0,
 18 |             0.0,
 19 |             0.0,
 20 |             0.0,
 21 |             0.0,
 22 |             0.0,
 23 |             0.0,
 24 |             0.0,
 25 |             0.0,
 26 |             0.0,
 27 |             0.0,
 28 |             0.0,
 29 |             0.0,
 30 |             0.0,
 31 |             0.0,
 32 |             0.0,
 33 |             0.0,
 34 |             6.0,
 35 |             5.0,
 36 |             11.0,
 37 |             8.0,
 38 |             5.0,
 39 |             9.0,
 40 |             4.0,
 41 |             5.0,
 42 |             5.0,
 43 |             9.0,
 44 |             6.0,
 45 |             3.0,
 46 |             9.0,
 47 |             6.0,
 48 |             5.0,
 49 |             9.0,
 50 |             7.0,
 51 |             7.0,
 52 |             8.0,
 53 |             8.0,
 54 |             6.0,
 55 |             5.0,
 56 |             11.0,
 57 |             4.0,
 58 |             12.0,
 59 |             5.0,
 60 |             4.0,
 61 |             6.0,
 62 |             10.0,
 63 |             3.0,
 64 |             4.0,
 65 |             5.0,
 66 |             8.0,
 67 |             0.0,
 68 |             7.0,
 69 |             6.0,
 70 |             5.0,
 71 |             7.0,
 72 |             9.0,
 73 |             7.0,
 74 |             5.0,
 75 |             6.0,
 76 |             6.0,
 77 |             7.0,
 78 |             2.0,
 79 |             3.0,
 80 |             6.0,
 81 |             3.0,
 82 |             8.0,
 83 |             7.0,
 84 |             3.0,
 85 |             8.0,
 86 |             8.0,
 87 |             8.0,
 88 |             7.0,
 89 |             7.0,
 90 |             8.0,
 91 |             6.0,
 92 |             9.0,
 93 |             7.0,
 94 |             8.0,
 95 |             7.0,
 96 |             2.0,
 97 |             0.0,
 98 |             0.0,
 99 |             0.0,
100 |             0.0,
101 |             0.0,
102 |             0.0,
103 |             0.0,
104 |             0.0,
105 |             0.0,
106 |             0.0,
107 |             0.0,
108 |             0.0
109 |         ],
110 |         "underflow:type": "Count",
111 |         "underflow": 0.0,
112 |         "overflow:type": "Count",
113 |         "overflow": 0.0,
114 |         "nanflow:type": "Count",
115 |         "nanflow": 0.0
116 |     },
117 |     "version": "1.1"
118 | }


--------------------------------------------------------------------------------
/tests/test_notebooks.py:
--------------------------------------------------------------------------------
 1 | import pytest
 2 | from pytest_notebook.nb_regression import NBRegressionFixture
 3 | 
 4 | from histogrammar.resources import notebook
 5 | 
 6 | 
 7 | @pytest.fixture(scope="module")
 8 | def nb_tester():
 9 |     """Test notebooks using pytest-notebook"""
10 |     return NBRegressionFixture(
11 |         diff_ignore=(
12 |             "/metadata/language_info",
13 |             "/cells/*/execution_count",
14 |             "/cells/*/outputs/*",
15 |         ),
16 |         exec_timeout=1800,
17 |     )
18 | 
19 | 
20 | def test_notebook_basic(nb_tester):
21 |     nb_tester.check(notebook("histogrammar_tutorial_basic.ipynb"))
22 | 
23 | 
24 | def test_notebook_advanced(nb_tester):
25 |     nb_tester.check(notebook("histogrammar_tutorial_advanced.ipynb"))
26 | 
27 | 
28 | def test_notebook_exercises(nb_tester):
29 |     nb_tester.check(notebook("histogrammar_tutorial_exercises.ipynb"))
30 | 


--------------------------------------------------------------------------------
/tests/test_pandas_histogrammar.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python3
  2 | 
  3 | import numpy as np
  4 | import pandas as pd
  5 | import pytest
  6 | 
  7 | from histogrammar.dfinterface.make_histograms import (
  8 |     get_bin_specs,
  9 |     get_time_axes,
 10 |     make_histograms,
 11 | )
 12 | from histogrammar.dfinterface.pandas_histogrammar import PandasHistogrammar
 13 | 
 14 | 
 15 | def test_get_histograms():
 16 |     pandas_filler = PandasHistogrammar(
 17 |         features=[
 18 |             "date",
 19 |             "isActive",
 20 |             "age",
 21 |             "eyeColor",
 22 |             "gender",
 23 |             "company",
 24 |             "latitude",
 25 |             "longitude",
 26 |             ["isActive", "age"],
 27 |             ["latitude", "longitude"],
 28 |         ],
 29 |         bin_specs={
 30 |             "longitude": {"binWidth": 5, "origin": 0},
 31 |             "latitude": {"binWidth": 5, "origin": 0},
 32 |         },
 33 |     )
 34 |     current_hists = pandas_filler.get_histograms(pytest.test_df)
 35 | 
 36 |     assert current_hists["age"].toJson() == pytest.age
 37 |     assert current_hists["company"].toJson() == pytest.company
 38 |     assert current_hists["date"].toJson() == pytest.date
 39 |     assert current_hists["eyeColor"].toJson() == pytest.eyesColor
 40 |     assert current_hists["gender"].toJson() == pytest.gender
 41 |     assert current_hists["isActive"].toJson() == pytest.isActive
 42 |     assert current_hists["isActive:age"].toJson() == pytest.isActive_age
 43 |     assert current_hists["latitude"].toJson() == pytest.latitude
 44 |     assert current_hists["longitude"].toJson() == pytest.longitude
 45 |     assert current_hists["latitude:longitude"].toJson() == pytest.latitude_longitude
 46 | 
 47 | 
 48 | def test_make_histograms():
 49 |     features = [
 50 |         "date",
 51 |         "isActive",
 52 |         "age",
 53 |         "eyeColor",
 54 |         "gender",
 55 |         "company",
 56 |         "latitude",
 57 |         "longitude",
 58 |         ["isActive", "age"],
 59 |         ["latitude", "longitude"],
 60 |         "transaction",
 61 |     ]
 62 |     bin_specs = {
 63 |         "transaction": {"num": 100, "low": -2000, "high": 2000},
 64 |         "longitude": {"binWidth": 5, "origin": 0},
 65 |         "latitude": {"binWidth": 5, "origin": 0},
 66 |     }
 67 | 
 68 |     current_hists = make_histograms(pytest.test_df, features=features, binning="unit", bin_specs=bin_specs)
 69 | 
 70 |     assert current_hists["age"].toJson() == pytest.age
 71 |     assert current_hists["company"].toJson() == pytest.company
 72 |     assert current_hists["date"].toJson() == pytest.date
 73 |     assert current_hists["eyeColor"].toJson() == pytest.eyesColor
 74 |     assert current_hists["gender"].toJson() == pytest.gender
 75 |     assert current_hists["isActive"].toJson() == pytest.isActive
 76 |     assert current_hists["isActive:age"].toJson() == pytest.isActive_age
 77 |     assert current_hists["latitude"].toJson() == pytest.latitude
 78 |     assert current_hists["longitude"].toJson() == pytest.longitude
 79 |     assert current_hists["latitude:longitude"].toJson() == pytest.latitude_longitude
 80 |     assert current_hists["transaction"].toJson() == pytest.transaction
 81 | 
 82 | 
 83 | def test_make_histograms_no_time_axis():
 84 |     hists, features, bin_specs, time_axis, var_dtype = make_histograms(
 85 |         pytest.test_df,
 86 |         time_axis="",
 87 |         ret_specs=True,
 88 |     )
 89 | 
 90 |     assert len(hists) == 22
 91 |     assert len(features) == 22
 92 |     assert len(bin_specs) == 7
 93 |     assert len(var_dtype) == 22
 94 |     assert time_axis == ""
 95 |     assert "date" in hists
 96 |     h = hists["date"]
 97 |     assert h.binWidth == pytest.approx(751582381944448.0)
 98 |     for cols in features:
 99 |         cols_arr = cols.split(":")
100 |         assert len(cols_arr) == 1
101 |     for f, bs in bin_specs.items():
102 |         assert isinstance(bs, dict)
103 |     assert "age" in bin_specs
104 |     dateage = bin_specs["age"]
105 |     assert dateage["binWidth"] == 2.0
106 |     assert dateage["origin"] == 9.5
107 | 
108 | 
109 | def test_make_histograms_with_time_axis():
110 |     hists, features, bin_specs, time_axis, var_dtype = make_histograms(
111 |         pytest.test_df,
112 |         time_axis=True,
113 |         ret_specs=True,
114 |         time_width=None,
115 |         time_offset=None,
116 |     )
117 | 
118 |     assert len(hists) == 21
119 |     assert len(features) == 21
120 |     assert len(bin_specs) == 21
121 |     assert len(var_dtype) == 22
122 |     assert time_axis == "date"
123 |     assert "date:age" in hists
124 |     h = hists["date:age"]
125 |     assert h.binWidth == pytest.approx(751582381944448.0)
126 |     for cols in features:
127 |         cols_arr = cols.split(":")
128 |         assert len(cols_arr) == 2 and cols_arr[0] == "date"
129 |     for f, bs in bin_specs.items():
130 |         assert len(bs) == 2
131 |     assert "date:age" in bin_specs
132 |     dateage = bin_specs["date:age"]
133 |     assert dateage[0]["binWidth"] == pytest.approx(751582381944448.0)
134 |     assert dateage[1]["binWidth"] == 2.0
135 |     assert dateage[1]["origin"] == 9.5
136 | 
137 |     # test get_bin_specs 1
138 |     bin_specs = get_bin_specs(hists)
139 |     assert "date:age" in bin_specs
140 |     dateage = bin_specs["date:age"]
141 |     assert dateage[0]["binWidth"] == pytest.approx(751582381944448.0)
142 |     assert dateage[1]["binWidth"] == 2.0
143 |     assert dateage[1]["origin"] == 9.5
144 | 
145 |     # test get_bin_specs 2
146 |     bin_specs = get_bin_specs(hists, skip_first_axis=True)
147 |     assert "age" in bin_specs
148 |     age = bin_specs["age"]
149 |     assert age["binWidth"] == 2.0
150 |     assert age["origin"] == 9.5
151 | 
152 |     # test get_bin_specs 3
153 |     bin_specs = get_bin_specs(hists["date:age"])
154 |     assert bin_specs[0]["binWidth"] == pytest.approx(751582381944448.0)
155 |     assert bin_specs[1]["binWidth"] == 2.0
156 |     assert bin_specs[1]["origin"] == 9.5
157 | 
158 |     # test get_bin_specs 4
159 |     bin_specs = get_bin_specs(hists["date:age"], skip_first_axis=True)
160 |     assert bin_specs["binWidth"] == 2.0
161 |     assert bin_specs["origin"] == 9.5
162 | 
163 | 
164 | def test_make_histograms_unit_binning():
165 |     hists, features, bin_specs, time_axis, var_dtype = make_histograms(
166 |         pytest.test_df, binning="unit", time_axis="", ret_specs=True
167 |     )
168 | 
169 |     assert len(hists) == 22
170 |     assert len(features) == 22
171 |     assert len(bin_specs) == 0
172 |     assert len(var_dtype) == 22
173 |     assert time_axis == ""
174 |     assert "date" in hists
175 |     h = hists["date"]
176 |     assert h.binWidth == 2592000000000000
177 |     for cols in features:
178 |         cols_arr = cols.split(":")
179 |         assert len(cols_arr) == 1
180 |     for f, bs in bin_specs.items():
181 |         assert isinstance(bs, dict)
182 |     assert "age" in hists
183 |     h = hists["age"]
184 |     assert h.binWidth == 1.0
185 |     assert h.origin == 0.0
186 | 
187 | 
188 | def test_get_histograms_module():
189 |     pandas_filler = PandasHistogrammar(
190 |         features=[
191 |             "date",
192 |             "isActive",
193 |             "age",
194 |             "eyeColor",
195 |             "gender",
196 |             "company",
197 |             "latitude",
198 |             "longitude",
199 |             ["isActive", "age"],
200 |             ["latitude", "longitude"],
201 |         ],
202 |         bin_specs={
203 |             "longitude": {"binWidth": 5, "origin": 0},
204 |             "latitude": {"binWidth": 5, "origin": 0},
205 |         },
206 |         read_key="input",
207 |         store_key="output",
208 |     )
209 | 
210 |     datastore = pandas_filler.transform(datastore={"input": pytest.test_df})
211 | 
212 |     assert "output" in datastore
213 |     current_hists = datastore["output"]
214 |     assert current_hists["age"].toJson() == pytest.age
215 |     assert current_hists["company"].toJson() == pytest.company
216 |     assert current_hists["date"].toJson() == pytest.date
217 |     assert current_hists["eyeColor"].toJson() == pytest.eyesColor
218 |     assert current_hists["gender"].toJson() == pytest.gender
219 |     assert current_hists["isActive"].toJson() == pytest.isActive
220 |     assert current_hists["isActive:age"].toJson() == pytest.isActive_age
221 |     assert current_hists["latitude"].toJson() == pytest.latitude
222 |     assert current_hists["longitude"].toJson() == pytest.longitude
223 |     assert current_hists["latitude:longitude"].toJson() == pytest.latitude_longitude
224 | 
225 | 
226 | def test_get_time_axes():
227 |     time_axes = get_time_axes(pytest.test_df)
228 |     np.testing.assert_array_equal(time_axes, ["date"])
229 | 
230 | 
231 | def test_null_histograms():
232 |     d = {
233 |         "transaction": {0: np.nan, 1: 1.0, 2: np.nan, 3: 3.0, 4: 4.0},
234 |         "isActive": {0: None, 1: None, 2: True, 3: True, 4: False},
235 |         "eyeColor": {0: None, 1: None, 2: "Jones", 3: "USA", 4: "FL"},
236 |         "t2": {0: np.nan, 1: 2.0, 2: np.nan, 3: 4.0, 4: 5.0},
237 |         "foo": {0: np.nan, 1: np.nan, 2: np.nan, 3: True, 4: False},
238 |         "bar": {0: "a", 1: "b", 2: "c", 3: "d", 4: "e"},
239 |         "bla": {0: 1, 1: 2, 2: 3, 3: 4, 4: np.nan},
240 |         "mixed": {0: "a", 1: "b", 2: "c", 3: np.nan, 4: 1},
241 |     }
242 |     df = pd.DataFrame(d)
243 |     df["bar"] = df["bar"].astype("category")
244 | 
245 |     hists = make_histograms(df, bin_specs={"transaction": {"num": 40, "low": 0, "high": 10}})
246 | 
247 |     assert "transaction" in hists
248 |     assert "isActive" in hists
249 |     assert "eyeColor" in hists
250 |     assert "t2" in hists
251 |     assert "foo" in hists
252 |     assert "bar" in hists
253 |     assert "bla" in hists
254 |     assert "bla" in hists
255 |     assert "mixed" in hists
256 | 
257 |     h = hists["transaction"]
258 |     assert h.nanflow.entries == 2
259 | 
260 |     h = hists["t2"]
261 |     assert h.nanflow.entries == 2
262 | 
263 |     h = hists["isActive"]
264 |     assert "NaN" in h.bins
265 |     assert h.bins["NaN"].entries == 2
266 | 
267 |     h = hists["eyeColor"]
268 |     assert "None" in h.bins
269 |     assert h.bins["None"].entries == 2
270 | 
271 |     h = hists["foo"]
272 |     assert "NaN" in h.bins
273 |     assert h.bins["NaN"].entries == 3
274 | 
275 |     h = hists["bar"]
276 |     assert "NaN" not in h.bins
277 | 
278 |     h = hists["bla"]
279 |     assert h.nanflow.entries == 1
280 | 
281 |     h = hists["mixed"]
282 |     assert "nan" in h.bins
283 |     assert h.bins["nan"].entries == 1
284 | 


--------------------------------------------------------------------------------
/tests/test_spark_histogrammar.py:
--------------------------------------------------------------------------------
  1 | from pathlib import Path
  2 | 
  3 | import pandas as pd
  4 | import pytest
  5 | 
  6 | from histogrammar.dfinterface.make_histograms import make_histograms
  7 | from histogrammar.dfinterface.spark_histogrammar import SparkHistogrammar
  8 | 
  9 | try:
 10 |     from pyspark import __version__ as pyspark_version
 11 |     from pyspark.sql import SparkSession
 12 | 
 13 |     spark_found = True
 14 | except (ModuleNotFoundError, AttributeError):
 15 |     spark_found = False
 16 | 
 17 | 
 18 | def get_spark():
 19 |     if not spark_found:
 20 |         return None
 21 | 
 22 |     current_path = Path(__file__).resolve().parent
 23 | 
 24 |     scala = "2.12" if int(pyspark_version[0]) >= 3 else "2.11"
 25 |     hist_spark_jar = current_path / f"jars/histogrammar-sparksql_{scala}-1.0.20.jar"
 26 |     hist_jar = current_path / f"jars/histogrammar_{scala}-1.0.20.jar"
 27 | 
 28 |     return (
 29 |         SparkSession.builder.master("local")
 30 |         .appName("histogrammar-pytest")
 31 |         .config("spark.jars", f"{hist_spark_jar},{hist_jar}")
 32 |         .config("spark.sql.session.timeZone", "GMT")
 33 |         .getOrCreate()
 34 |     )
 35 | 
 36 | 
 37 | @pytest.fixture
 38 | def spark_co():
 39 |     """:return: Spark configuration"""
 40 |     return get_spark()
 41 | 
 42 | 
 43 | # @pytest.mark.spark
 44 | @pytest.mark.skipif(not spark_found, reason="spark not found")
 45 | @pytest.mark.filterwarnings("ignore:createDataFrame attempted Arrow optimization because")
 46 | def test_get_histograms(spark_co):
 47 |     pytest.age["data"]["name"] = "'age'"
 48 |     pytest.company["data"]["name"] = "'company'"
 49 |     pytest.eyesColor["data"]["name"] = "'eyeColor'"
 50 |     pytest.gender["data"]["name"] = "'gender'"
 51 |     pytest.isActive["data"]["name"] = "'isActive'"
 52 |     pytest.latitude["data"]["name"] = "'latitude'"
 53 |     pytest.longitude["data"]["name"] = "'longitude'"
 54 |     pytest.transaction["data"]["name"] = "'transaction'"
 55 | 
 56 |     pytest.latitude_longitude["data"]["name"] = "'latitude:longitude'"
 57 |     pytest.latitude_longitude["data"]["bins:name"] = "unit_func"
 58 | 
 59 |     spark = spark_co
 60 | 
 61 |     spark_df = spark.createDataFrame(pytest.test_df)
 62 | 
 63 |     spark_filler = SparkHistogrammar(
 64 |         features=[
 65 |             "date",
 66 |             "isActive",
 67 |             "age",
 68 |             "eyeColor",
 69 |             "gender",
 70 |             "company",
 71 |             "latitude",
 72 |             "longitude",
 73 |             ["isActive", "age"],
 74 |             ["latitude", "longitude"],
 75 |             "transaction",
 76 |             "amount",
 77 |         ],
 78 |         bin_specs={
 79 |             "transaction": {"num": 100, "low": -2000, "high": 2000},
 80 |             "longitude": {"bin_width": 5.0, "bin_offset": 0.0},
 81 |             "latitude": {"bin_width": 5.0, "bin_offset": 0.0},
 82 |         },
 83 |         read_key="input",
 84 |         store_key="output",
 85 |     )
 86 | 
 87 |     # test get_histograms() function call
 88 |     current_hists = spark_filler.get_histograms(spark_df)
 89 |     # current_hists = make_histograms(spark_df, features, bin_specs)
 90 |     assert current_hists["age"].toJson() == pytest.age
 91 |     assert current_hists["company"].toJson() == pytest.company
 92 |     assert current_hists["eyeColor"].toJson() == pytest.eyesColor
 93 |     assert current_hists["gender"].toJson() == pytest.gender
 94 |     assert current_hists["latitude"].toJson() == pytest.latitude
 95 |     assert current_hists["longitude"].toJson() == pytest.longitude
 96 |     assert current_hists["transaction"].toJson() == pytest.transaction
 97 | 
 98 |     # import json
 99 |     # with open('tests/popmon/hist/resource/transaction.json', 'w') as outfile:
100 |     #     json.dump(current_hists["transaction"].toJson(), outfile, indent=4)
101 | 
102 | 
103 | # @pytest.mark.spark
104 | @pytest.mark.skipif(not spark_found, reason="spark not found")
105 | @pytest.mark.filterwarnings("ignore:createDataFrame attempted Arrow optimization because")
106 | def test_get_histograms_module(spark_co):
107 |     pytest.age["data"]["name"] = "'age'"
108 |     pytest.company["data"]["name"] = "'company'"
109 |     pytest.eyesColor["data"]["name"] = "'eyeColor'"
110 |     pytest.gender["data"]["name"] = "'gender'"
111 |     pytest.isActive["data"]["name"] = "'isActive'"
112 |     pytest.latitude["data"]["name"] = "'latitude'"
113 |     pytest.longitude["data"]["name"] = "'longitude'"
114 | 
115 |     pytest.latitude_longitude["data"]["name"] = "'latitude:longitude'"
116 |     pytest.latitude_longitude["data"]["bins:name"] = "unit_func"
117 | 
118 |     spark = spark_co
119 | 
120 |     spark_df = spark.createDataFrame(pytest.test_df)
121 | 
122 |     spark_filler = SparkHistogrammar(
123 |         features=[
124 |             "date",
125 |             "isActive",
126 |             "age",
127 |             "eyeColor",
128 |             "gender",
129 |             "company",
130 |             "latitude",
131 |             "longitude",
132 |             ["isActive", "age"],
133 |             ["latitude", "longitude"],
134 |             "amount",
135 |         ],
136 |         bin_specs={
137 |             "longitude": {"bin_width": 5.0, "bin_offset": 0.0},
138 |             "latitude": {"bin_width": 5.0, "bin_offset": 0.0},
139 |         },
140 |         read_key="input",
141 |         store_key="output",
142 |     )
143 | 
144 |     # test transform() function call
145 |     datastore = spark_filler.transform(datastore={"input": spark_df})
146 | 
147 |     assert "output" in datastore
148 |     current_hists = datastore["output"]
149 |     assert current_hists["age"].toJson() == pytest.age
150 |     assert current_hists["company"].toJson() == pytest.company
151 |     assert current_hists["eyeColor"].toJson() == pytest.eyesColor
152 |     assert current_hists["gender"].toJson() == pytest.gender
153 |     assert current_hists["latitude"].toJson() == pytest.latitude
154 |     assert current_hists["longitude"].toJson() == pytest.longitude
155 |     # assert current_hists['date'].toJson() == pytest.date
156 |     # assert current_hists['isActive'].toJson() == pytest.isActive
157 |     # assert current_hists['isActive:age'].toJson() == pytest.isActive_age
158 |     # assert current_hists['latitude:longitude'].toJson() == pytest.latitude_longitude
159 | 
160 | 
161 | # @pytest.mark.spark
162 | @pytest.mark.skipif(not spark_found, reason="spark not found")
163 | @pytest.mark.filterwarnings("ignore:createDataFrame attempted Arrow optimization because")
164 | def test_get_histograms_timestamp(spark_co):
165 |     from pyspark.sql.functions import to_timestamp
166 | 
167 |     spark = spark_co
168 | 
169 |     data_date = [
170 |         "2018-12-10 00:00:00",
171 |         "2018-12-10 00:00:00",
172 |         "2018-12-10 00:00:00",
173 |         "2018-12-10 00:00:00",
174 |         "2018-12-10 00:00:00",
175 |         "2018-12-17 00:00:00",
176 |         "2018-12-17 00:00:00",
177 |         "2018-12-17 00:00:00",
178 |         "2018-12-17 00:00:00",
179 |         "2018-12-19 00:00:00",
180 |     ]
181 | 
182 |     df = pd.DataFrame(data_date, columns=["dt"])
183 |     sdf = spark.createDataFrame(df).withColumn("dt", to_timestamp("dt", "yyyy-MM-dd HH:mm:ss"))
184 |     expected = {
185 |         "data": {
186 |             "binWidth": 2592000000000000.0,
187 |             "bins": {"108": 9.0, "109": 1.0},
188 |             "bins:type": "Count",
189 |             "entries": 10.0,
190 |             "name": "'dt'",
191 |             "nanflow": 0.0,
192 |             "nanflow:type": "Count",
193 |             "origin": 1.2625632e18,
194 |         },
195 |         "type": "SparselyBin",
196 |         "version": "1.1",
197 |     }
198 |     filler = SparkHistogrammar(features=["dt"])
199 |     current_hists = filler.get_histograms(sdf)
200 |     assert current_hists["dt"].toJson() == expected
201 | 
202 | 
203 | # @pytest.mark.spark
204 | @pytest.mark.skipif(not spark_found, reason="spark not found")
205 | @pytest.mark.filterwarnings("ignore:createDataFrame attempted Arrow optimization because")
206 | def test_get_histograms_date(spark_co):
207 |     from pyspark.sql.functions import to_date
208 | 
209 |     spark = spark_co
210 | 
211 |     data_date = [
212 |         "2018-12-10",
213 |         "2018-12-10",
214 |         "2018-12-10",
215 |         "2018-12-10",
216 |         "2018-12-10",
217 |         "2018-12-17",
218 |         "2018-12-17",
219 |         "2018-12-17",
220 |         "2018-12-17",
221 |         "2018-12-19",
222 |     ]
223 | 
224 |     df = pd.DataFrame(data_date, columns=["dt"])
225 |     sdf = spark.createDataFrame(df).withColumn("dt", to_date("dt", "yyyy-MM-dd"))
226 |     expected = {
227 |         "data": {
228 |             "binWidth": 2592000000000000.0,
229 |             "bins": {"108": 9.0, "109": 1.0},
230 |             "bins:type": "Count",
231 |             "entries": 10.0,
232 |             "name": "'dt'",
233 |             "nanflow": 0.0,
234 |             "nanflow:type": "Count",
235 |             "origin": 1.2625632e18,
236 |         },
237 |         "type": "SparselyBin",
238 |         "version": "1.1",
239 |     }
240 |     filler = SparkHistogrammar(features=["dt"])
241 |     current_hists = filler.get_histograms(sdf)
242 |     assert current_hists["dt"].toJson() == expected
243 | 
244 | 
245 | # @pytest.mark.spark
246 | @pytest.mark.skipif(not spark_found, reason="spark not found")
247 | @pytest.mark.filterwarnings("ignore:createDataFrame attempted Arrow optimization because")
248 | def test_null_histograms(spark_co):
249 |     spark = spark_co
250 | 
251 |     data = [
252 |         (None, None, None, None),
253 |         (1, None, None, 2.0),
254 |         (None, True, "Jones", None),
255 |         (3, True, "USA", 4.0),
256 |         (4, False, "FL", 5.0),
257 |     ]
258 |     columns = ["transaction", "isActive", "eyeColor", "t2"]
259 |     sdf = spark.createDataFrame(data=data, schema=columns)
260 | 
261 |     hists = make_histograms(sdf, bin_specs={"transaction": {"num": 40, "low": 0, "high": 10}})
262 | 
263 |     assert "transaction" in hists
264 |     assert "isActive" in hists
265 |     assert "eyeColor" in hists
266 |     assert "t2" in hists
267 | 
268 |     h = hists["transaction"]
269 |     assert h.nanflow.entries == 2
270 |     h = hists["t2"]
271 |     assert h.nanflow.entries == 2
272 | 
273 |     h = hists["isActive"]
274 |     assert "NaN" in h.bins
275 |     assert h.bins["NaN"].entries == 2
276 | 
277 |     h = hists["eyeColor"]
278 |     assert "NaN" in h.bins
279 |     assert h.bins["NaN"].entries == 2
280 | 


--------------------------------------------------------------------------------
/tests/test_spec.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python
  2 | 
  3 | # Copyright 2016 DIANA-HEP
  4 | #
  5 | # Licensed under the Apache License, Version 2.0 (the "License");
  6 | # you may not use this file except in compliance with the License.
  7 | # You may obtain a copy of the License at
  8 | #
  9 | #     http://www.apache.org/licenses/LICENSE-2.0
 10 | #
 11 | # Unless required by applicable law or agreed to in writing, software
 12 | # distributed under the License is distributed on an "AS IS" BASIS,
 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 14 | # See the License for the specific language governing permissions and
 15 | # limitations under the License.
 16 | 
 17 | import codecs
 18 | import json
 19 | import math
 20 | import sys
 21 | import unittest
 22 | 
 23 | try:
 24 |     from urllib2 import urlopen
 25 | except ImportError:
 26 |     from urllib.request import urlopen
 27 | 
 28 | import histogrammar.version
 29 | from histogrammar import Factory, util
 30 | 
 31 | tolerance = 1e-12
 32 | util.relativeTolerance = tolerance
 33 | util.absoluteTolerance = tolerance
 34 | 
 35 | 
 36 | class TestSpec(unittest.TestCase):
 37 |     def compare(self, x, y, name):
 38 |         if Factory.fromJson(x) != Factory.fromJson(y):
 39 |             sys.stderr.write("                                          FAILED " + name + "\n")
 40 |             sys.stderr.write("                  PYTHON                           |                   SPECIFICATION\n")
 41 |             left = json.dumps(x, sort_keys=True, indent=2)
 42 |             right = json.dumps(y, sort_keys=True, indent=2)
 43 |             for leftline, rightline in zip(left.split("\n"), right.split("\n")):
 44 |                 if leftline != rightline:
 45 |                     sys.stderr.write(f"{leftline:50s} > {rightline}\n")
 46 |                 else:
 47 |                     sys.stderr.write(f"{leftline:50s} | {rightline}\n")
 48 |             self.assertEqual(Factory.fromJson(x), Factory.fromJson(y))
 49 | 
 50 |     def runTest(self):
 51 |         reader = codecs.getreader("utf-8")
 52 |         sys.stdout.write(
 53 |             f"Downloading expected results, generated by specification {histogrammar.version.specification}...\n"
 54 |         )
 55 |         url_data = f"http://histogrammar.org/test/{histogrammar.version.specification}/test-data.json"
 56 |         try:
 57 |             testdata = json.load(reader(urlopen(url_data)))
 58 |         except Exception as err:
 59 |             msg = f"could not download {url_data}\nbecause of {err.__class__.__name__}: {str(err)}\n"
 60 |             sys.stdout.write(msg)
 61 |             return
 62 |         url_results = f"http://histogrammar.org/test/{histogrammar.version.specification}/test-results.json"
 63 |         try:
 64 |             testresults = json.load(reader(urlopen(url_results)))
 65 |         except Exception as err:
 66 |             msg = f"could not download {url_results}\nbecause of {err.__class__.__name__}: {str(err)}\n\n"
 67 |             sys.stdout.write(msg)
 68 |             return
 69 | 
 70 |         for x in testdata:
 71 |             for k, v in x.items():
 72 |                 if k != "strings" and v in ("nan", "inf", "-inf"):
 73 |                     x[k] = float(v)
 74 | 
 75 |         def stripNames(x):
 76 |             if hasattr(x, "quantity"):
 77 |                 x.quantity.name = None
 78 |             elif hasattr(x, "quantityName"):
 79 |                 x.quantityName = None
 80 |             for xi in x.children:
 81 |                 stripNames(xi)
 82 | 
 83 |         for testresult in testresults:
 84 |             if hasattr(math, "isfinite"):
 85 |                 txt1 = (
 86 |                     'named("round(withholes)", lambda x: round(x["withholes"]) '
 87 |                     'if math.isfinite(x["withholes"]) else x["withholes"])'
 88 |                 )
 89 |                 testresult["expr"] = testresult["expr"].replace('"round(withholes)"', txt1)
 90 |                 txt2 = (
 91 |                     'named("[round(withholes), 2*round(withholes), 3*round(withholes)]", '
 92 |                     'lambda x: [round(x["withholes"]), 2*round(x["withholes"]), 3*round(x["withholes"])] '
 93 |                     'if math.isfinite(x["withholes"]) else [x["withholes"], x["withholes"], x["withholes"]])',
 94 |                 )
 95 |                 testresult["expr"] = testresult["expr"].replace(
 96 |                     '"[round(withholes), 2*round(withholes), 3*round(withholes)]"', txt2
 97 |                 )
 98 | 
 99 |             sys.stderr.write(testresult["expr"] + "\n")
100 | 
101 |             zero = testresult["zero-named"]
102 |             one = testresult["one-named"]
103 |             two = testresult["two-named"]
104 | 
105 |             h1 = eval(testresult["expr"])
106 |             h2 = eval(testresult["expr"])
107 | 
108 |             self.compare(h1.toJson(), zero, "NAMED ZERO")
109 |             self.compare((h1 + h1).toJson(), zero, "NAMED ZERO + ZERO")
110 |             self.compare(h1.zero().toJson(), zero, "NAMED ZERO.zero()")
111 | 
112 |             for x in testdata:
113 |                 h1.fill(x)
114 |                 h2.fill(x)
115 |             self.compare(h1.toJson(), one, "NAMED ONE")
116 |             self.compare(h1.zero().toJson(), zero, "NAMED ONE.zero()")
117 |             self.compare((h1 + h1.zero()).toJson(), one, "NAMED ONE + ZERO")
118 |             self.compare((h1.zero() + h1).toJson(), one, "NAMED ZERO + ONE")
119 | 
120 |             self.compare((h1 + h2).toJson(), two, "NAMED TWO VIA PLUS")
121 | 
122 |             for x in testdata:
123 |                 h1.fill(x)
124 |             self.compare(h1.toJson(), two, "NAMED TWO VIA FILL")
125 | 
126 |             zero = testresult["zero-anonymous"]
127 |             one = testresult["one-anonymous"]
128 |             two = testresult["two-anonymous"]
129 | 
130 |             h1 = eval(testresult["expr"])
131 |             stripNames(h1)
132 |             h2 = eval(testresult["expr"])
133 |             stripNames(h2)
134 | 
135 |             self.compare(h1.toJson(), zero, "ANONYMOUS ZERO")
136 |             self.compare((h1 + h1).toJson(), zero, "ANONYMOUS ZERO + ZERO")
137 |             self.compare(h1.zero().toJson(), zero, "ANONYMOUS ZERO.zero()")
138 | 
139 |             for x in testdata:
140 |                 h1.fill(x)
141 |                 h2.fill(x)
142 |             self.compare(h1.toJson(), one, "ANONYMOUS ONE")
143 |             self.compare(h1.zero().toJson(), zero, "ANONYMOUS ONE.zero()")
144 |             self.compare((h1 + h1.zero()).toJson(), one, "ANONYMOUS ONE + ZERO")
145 |             self.compare((h1.zero() + h1).toJson(), one, "ANONYMOUS ZERO + ONE")
146 | 
147 |             self.compare((h1 + h2).toJson(), two, "ANONYMOUS TWO VIA PLUS")
148 | 
149 |             for x in testdata:
150 |                 h1.fill(x)
151 |             self.compare(h1.toJson(), two, "ANONYMOUS TWO VIA FILL")
152 | 


--------------------------------------------------------------------------------