├── .gitignore ├── .gitattributes ├── docs ├── source │ ├── phik_index.rst │ ├── code.rst │ ├── phik.decorators.rst │ ├── index.rst │ ├── publication.rst │ ├── tutorials.rst │ ├── developing.rst │ ├── phik.rst │ ├── introduction.rst │ └── conf.py ├── autogenerate.sh ├── README.rst └── Makefile ├── phik ├── data │ └── fake_insurance_data.csv.gz ├── decorators │ ├── __init__.py │ └── pandas.py ├── simcore │ ├── bindings.cpp │ ├── __init__.py │ ├── asa159.hpp │ ├── simulation.hpp │ └── asa159.cpp ├── definitions.py ├── __init__.py ├── entry_points.py ├── resources.py ├── utils.py ├── betainc.py ├── data_quality.py ├── notebooks │ └── phik_tutorial_spark.ipynb ├── statistics.py ├── simulation.py ├── bivariate.py ├── binning.py ├── report.py └── significance.py ├── .github ├── dependabot.yml └── workflows │ ├── tests.yml │ ├── test_matrix.yml │ ├── valgrind.yml │ └── wheels.yml ├── .readthedocs.yml ├── .mbuild.sh ├── LICENSE ├── NOTICE ├── example.py ├── pyproject.toml ├── CMakeLists.txt ├── CHANGES.rst ├── README.rst └── tests ├── test_phik.py └── integration ├── test_phik_tutorial_advanced.py └── test_phik_tutorial_basic.py /.gitignore: -------------------------------------------------------------------------------- 1 | *.so 2 | *egg-info* 3 | -------------------------------------------------------------------------------- /.gitattributes: -------------------------------------------------------------------------------- 1 | phik/notebooks/* linguist-vendored 2 | -------------------------------------------------------------------------------- /docs/source/phik_index.rst: -------------------------------------------------------------------------------- 1 | PhiK 2 | ==== 3 | 4 | .. toctree:: 5 | :maxdepth: 4 6 | 7 | phik 8 | -------------------------------------------------------------------------------- /phik/data/fake_insurance_data.csv.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KaveIO/PhiK/HEAD/phik/data/fake_insurance_data.csv.gz -------------------------------------------------------------------------------- /docs/source/code.rst: -------------------------------------------------------------------------------- 1 | API Documentation 2 | ================= 3 | 4 | .. 
toctree:: 5 | :maxdepth: 2 6 | 7 | phik_index 8 | -------------------------------------------------------------------------------- /phik/decorators/__init__.py: -------------------------------------------------------------------------------- 1 | # flake8: noqa 2 | 3 | # import pandas DataFrame decorators 4 | from phik.decorators import pandas 5 | -------------------------------------------------------------------------------- /phik/simcore/bindings.cpp: -------------------------------------------------------------------------------- 1 | #include "simulation.hpp" 2 | #include 3 | 4 | PYBIND11_MODULE(_phik_simulation_core, m) { bind_simulation(m); } 5 | -------------------------------------------------------------------------------- /docs/autogenerate.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # (re)create required directories 4 | rm -rf autogen 5 | mkdir -p source/_static autogen 6 | 7 | # auto-generate code documentation 8 | sphinx-apidoc -f -H PhiK -o autogen ../python/phik 9 | mv autogen/modules.rst autogen/phik_index.rst 10 | mv autogen/* source/ 11 | 12 | # remove auto-gen directory 13 | rm -rf autogen 14 | -------------------------------------------------------------------------------- /.github/dependabot.yml: -------------------------------------------------------------------------------- 1 | --- 2 | version: 2 3 | updates: 4 | - package-ecosystem: pip 5 | directory: / 6 | # Check for updates once a day 7 | schedule: 8 | interval: daily 9 | allow: 10 | - dependency-type: all 11 | - package-ecosystem: github-actions 12 | directory: / 13 | # Check for updates once a week 14 | schedule: 15 | interval: weekly 16 | -------------------------------------------------------------------------------- /.readthedocs.yml: -------------------------------------------------------------------------------- 1 | # Read the Docs configuration file 2 | # See https://docs.readthedocs.io/en/stable/config-file/v2.html for details 3 | # .readthedocs.yml 4 | 5 | # Required 6 | version: 2 7 | 8 | # Set the version of Python 9 | build: 10 | os: ubuntu-22.04 11 | tools: 12 | python: "3.10" 13 | 14 | python: 15 | install: 16 | - method: pip 17 | path: . 18 | extra_requirements: 19 | - doc 20 | -------------------------------------------------------------------------------- /.mbuild.sh: -------------------------------------------------------------------------------- 1 | cmake -S . -G Ninja -B build \ 2 | -DCMAKE_BUILD_TYPE=Release \ 3 | -DSKBUILD_PROJECT_NAME="phik" \ 4 | -DSKBUILD_PROJECT_VERSION="0.12.4" \ 5 | -DPHIK_MBUILD=ON \ 6 | -DPython3_EXECUTABLE=$(python3 -c 'import sys; print(sys.executable)') \ 7 | -Dpybind11_DIR=$(python3 -c 'import pybind11; print(pybind11.get_cmake_dir())') \ 8 | -DCMAKE_EXPORT_COMPILE_COMMANDS=ON 9 | 10 | cmake --build build --target install --config Release --parallel 4 11 | -------------------------------------------------------------------------------- /docs/source/phik.decorators.rst: -------------------------------------------------------------------------------- 1 | phik.decorators package 2 | ======================= 3 | 4 | Submodules 5 | ---------- 6 | 7 | phik.decorators.pandas module 8 | ----------------------------- 9 | 10 | .. automodule:: phik.decorators.pandas 11 | :members: 12 | :undoc-members: 13 | :show-inheritance: 14 | 15 | 16 | Module contents 17 | --------------- 18 | 19 | .. 
automodule:: phik.decorators 20 | :members: 21 | :undoc-members: 22 | :show-inheritance: 23 | -------------------------------------------------------------------------------- /phik/definitions.py: -------------------------------------------------------------------------------- 1 | """Project: PhiK - correlation analyzer library 2 | 3 | Created: 2018/09/05 4 | 5 | Description: 6 | Definitions used throughout the phik package 7 | 8 | Authors: 9 | KPMG Advanced Analytics & Big Data team, Amstelveen, The Netherlands 10 | 11 | Redistribution and use in source and binary forms, with or without 12 | modification, are permitted according to the terms listed in the file 13 | LICENSE. 14 | """ 15 | 16 | # names assigned to underflow and overflow bin when assigning bin indices 17 | OF = "OF" 18 | UF = "UF" 19 | # name replacement of np.nan 20 | NaN = "NaN" 21 | -------------------------------------------------------------------------------- /.github/workflows/tests.yml: -------------------------------------------------------------------------------- 1 | name: Test 2 | 3 | on: push 4 | jobs: 5 | tests: 6 | runs-on: ubuntu-latest 7 | 8 | steps: 9 | - uses: actions/checkout@v6 10 | - name: Set up Python 11 | uses: actions/setup-python@v6 12 | with: 13 | python-version: 3.9 14 | - name: Install dependencies 15 | run: | 16 | python -m pip install --upgrade pip 17 | pip install . -v 18 | pip install "pytest>=4.0.2" "pytest-pylint>=0.13.0" 19 | - name: Test with pytest 20 | run: | 21 | cd tests 22 | pytest test_phik.py -W ignore::DeprecationWarning 23 | -------------------------------------------------------------------------------- /phik/simcore/__init__.py: -------------------------------------------------------------------------------- 1 | import importlib.util 2 | 3 | try: 4 | _ext_spec = importlib.util.find_spec("phik.lib._phik_simulation_core") 5 | except ModuleNotFoundError: 6 | _ext_spec = None 7 | 8 | if _ext_spec is not None: 9 | from phik.lib._phik_simulation_core import _sim_2d_data_patefield 10 | 11 | CPP_SUPPORT = True 12 | else: 13 | CPP_SUPPORT = False 14 | 15 | def _sim_2d_data_patefield(*args, **kwargs): 16 | msg = "Patefield requires a compiled extension that was not found." 17 | raise NotImplementedError(msg) 18 | 19 | 20 | __all__ = ["CPP_SUPPORT", "_sim_2d_data_patefield"] 21 | -------------------------------------------------------------------------------- /docs/source/index.rst: -------------------------------------------------------------------------------- 1 | .. PhiK documentation master file, created by 2 | sphinx-quickstart on Thu Jul 7 14:25:54 2016. 3 | You can adapt this file completely to your liking, but it should at least 4 | contain the root `toctree` directive. 5 | 6 | 7 | .. include:: ../../README.rst 8 | 9 | 10 | Contents 11 | ======== 12 | 13 | .. toctree:: 14 | :maxdepth: 2 15 | 16 | introduction 17 | tutorials 18 | publication 19 | developing 20 | 21 | API 22 | --- 23 | 24 | .. 
toctree:: 25 | :maxdepth: 1 26 | 27 | code 28 | 29 | Indices and tables 30 | ------------------ 31 | 32 | * :ref:`genindex` 33 | * :ref:`modindex` 34 | -------------------------------------------------------------------------------- /phik/__init__.py: -------------------------------------------------------------------------------- 1 | # flake8: noqa 2 | import importlib.metadata 3 | 4 | from phik import decorators 5 | from phik.outliers import ( 6 | outlier_significance_from_array, 7 | outlier_significance_matrices, 8 | outlier_significance_matrix, 9 | ) 10 | from phik.phik import global_phik_array, phik_from_array, phik_matrix 11 | from phik.significance import significance_from_array, significance_matrix 12 | 13 | __version__ = importlib.metadata.version("phik") 14 | 15 | __all__ = [ 16 | "decorators", 17 | "phik_from_array", 18 | "significance_from_array", 19 | "outlier_significance_from_array", 20 | "phik_matrix", 21 | "global_phik_array", 22 | "significance_matrix", 23 | "outlier_significance_matrices", 24 | "outlier_significance_matrix", 25 | ] 26 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | ############################################################################## 2 | # 3 | # Copyright 2016 KPMG Advisory N.V. (unless otherwise stated) 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | # 17 | ############################################################################## 18 | -------------------------------------------------------------------------------- /NOTICE: -------------------------------------------------------------------------------- 1 | ################################################################################################ 2 | # 3 | # NOTICE: pass-through licensing of bundled components 4 | # 5 | # PhiK gathers together a toolkit of pre-existing third-party open-source software components. 6 | # These software components are governed by their own licenses which PhiK does not 7 | # modify or supersede, please consult the originating authors. These components altogether 8 | # have a mixture of the following licenses: Apache 2.0, GPL 2.0, AGPL and LGPL, ZPL, MIT, PSF, 9 | # BSD and some BSD-like simple licenses. 10 | # For scipy and numpy see: http://docs.continuum.io/anaconda/licenses.html . 11 | # 12 | # Although we have examined the licenses to verify acceptance of commercial and non-commercial 13 | # use, please see and consult the original licenses or authors. 
14 | # 15 | ################################################################################################ 16 | -------------------------------------------------------------------------------- /phik/entry_points.py: -------------------------------------------------------------------------------- 1 | """Project: PhiK - correlation analyzer library 2 | 3 | Created: 2018/11/13 4 | 5 | Description: 6 | Collection of phik entry points 7 | 8 | Authors: 9 | KPMG Advanced Analytics & Big Data team, Amstelveen, The Netherlands 10 | 11 | Redistribution and use in source and binary forms, with or without 12 | modification, are permitted according to the terms listed in the file 13 | LICENSE. 14 | """ 15 | 16 | 17 | def phik_trial(): 18 | """Run Phi_K tests. 19 | 20 | We will keep this here until we've completed switch to pytest or nose and tox. 21 | We could also keep it, but I don't like the fact that packages etc. are 22 | hard coded. Gotta come up with 23 | a better solution. 24 | """ 25 | import sys 26 | import pytest 27 | 28 | # ['--pylint'] + 29 | # -r xs shows extra info on skips and xfails. 30 | default_options = ["-rxs"] 31 | args = sys.argv[1:] + default_options 32 | sys.exit(pytest.main(args)) 33 | -------------------------------------------------------------------------------- /phik/simcore/asa159.hpp: -------------------------------------------------------------------------------- 1 | /* 2 | * Taken from: 3 | * https://people.sc.fsu.edu/~jburkardt/cpp_src/asa159/asa159.html 4 | * 5 | * Michael Patefield, 6 | * Algorithm AS 159: An Efficient Method of Generating RXC Tables with Given Row and Column Totals, 7 | * Applied Statistics, 8 | * Volume 30, Number 1, 1981, pages 91-97. 9 | */ 10 | 11 | #ifndef PYTHON_PHIK_SIMCORE_ASA159_HPP_ 12 | #define PYTHON_PHIK_SIMCORE_ASA159_HPP_ 13 | 14 | #include 15 | 16 | int i4_max ( int i1, int i2 ); 17 | int i4_min ( int i1, int i2 ); 18 | void i4mat_print ( int m, int n, int a[], std::string title ); 19 | void i4mat_print_some ( int m, int n, int a[], int ilo, int jlo, int ihi, 20 | int jhi, std::string title ); 21 | void i4vec_print ( int n, int a[], std::string title ); 22 | int i4vec_sum ( int n, int a[] ); 23 | double r8_uniform_01 ( int *seed ); 24 | void rcont2 ( int nrow, int ncol, int nrowt[], int ncolt[], bool *key, 25 | int *seed, int matrix[], int *ierror ); 26 | void timestamp ( ); 27 | 28 | #endif // PYTHON_PHIK_SIMCORE_ASA159_HPP_ 29 | -------------------------------------------------------------------------------- /example.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | 3 | import phik 4 | from phik import resources, report 5 | 6 | # open fake car insurance data 7 | df = pd.read_csv( resources.fixture('fake_insurance_data.csv.gz') ) 8 | df.head() 9 | 10 | # Pearson's correlation matrix between numeric variables (pandas functionality) 11 | df.corr() 12 | 13 | # get the phi_k correlation matrix between all variables 14 | df.phik_matrix() 15 | 16 | # get global correlations based on phi_k correlation matrix 17 | df.global_phik() 18 | 19 | # get the significance matrix (expressed as one-sided Z) 20 | # of the hypothesis test of each variable-pair dependency 21 | df.significance_matrix() 22 | 23 | # contingency table of two columns 24 | cols = ['mileage', 'car_size'] 25 | df[cols].hist2d() 26 | 27 | # normalized residuals of contingency test applied to cols 28 | df[cols].outlier_significance_matrix() 29 | 30 | # show the normalized residuals of each variable-pair 31 | 
df.outlier_significance_matrices() 32 | 33 | # generate a phik correlation report and save as test.pdf 34 | report.correlation_report(df, pdf_file_name='test.pdf') 35 | -------------------------------------------------------------------------------- /.github/workflows/test_matrix.yml: -------------------------------------------------------------------------------- 1 | name: Test Matrix 2 | 3 | on: 4 | workflow_dispatch: 5 | pull_request: 6 | push: 7 | branches: 8 | - master 9 | 10 | jobs: 11 | build: 12 | name: ${{ matrix.platform }} Python ${{ matrix.python-version }} 13 | strategy: 14 | fail-fast: false 15 | matrix: 16 | platform: [windows-latest, macos-latest, ubuntu-latest] 17 | python-version: ["3.9", "3.10", "3.11", "3.12"] 18 | 19 | runs-on: ${{ matrix.platform }} 20 | 21 | steps: 22 | - uses: actions/checkout@v6 23 | with: 24 | submodules: true 25 | 26 | - uses: actions/setup-python@v6 27 | with: 28 | python-version: ${{ matrix.python-version }} 29 | 30 | - name: Add requirements 31 | run: | 32 | python -m pip install --upgrade pip wheel 33 | 34 | - name: Build and install 35 | run: pip install --verbose ".[test]" 36 | 37 | - name: Unit test 38 | run: | 39 | cd tests 40 | pytest test_phik.py -v -W ignore::DeprecationWarning 41 | 42 | - name: Integration test 43 | run: | 44 | cd tests 45 | pytest integration -v -W ignore::DeprecationWarning 46 | -------------------------------------------------------------------------------- /docs/source/publication.rst: -------------------------------------------------------------------------------- 1 | =================== 2 | Publication & Talks 3 | =================== 4 | 5 | Publication 6 | ----------- 7 | 8 | * peer-reviewed: https://www.sciencedirect.com/science/article/abs/pii/S0167947320301341 9 | * arXiv pre-print: https://arxiv.org/abs/1811.11440 10 | 11 | 12 | Talks 13 | ----- 14 | 15 | * Coming soon. 16 | 17 | 18 | Cite as 19 | ------- 20 | 21 | Baak, M., Koopman, R., Snoek, H., & Klous, S. (2020). A new correlation coefficient between categorical, ordinal and interval variables with Pearson characteristics. *Computational Statistics & Data Analysis*, 152, 107043. 22 | 23 | 24 | .. code-block:: latex 25 | 26 | @article{phik2020, 27 | title={A new correlation coefficient between categorical, ordinal and interval variables with Pearson characteristics}, 28 | author={Baak, M and Koopman, R and Snoek, H and Klous, S}, 29 | journal={Computational Statistics \& Data Analysis}, 30 | volume={152}, 31 | pages={107043}, 32 | year={2020}, 33 | publisher={Elsevier} 34 | } 35 | 36 | References 37 | ---------- 38 | 39 | * Web page: https://phik.readthedocs.io 40 | * Repository: https://github.com/kaveio/phik 41 | * Issues & Ideas: https://github.com/kaveio/phik/issues 42 | * Contact us at: kave [at] kpmg [dot] com 43 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [build-system] 2 | requires = ["scikit-build-core>=0.3.3", "pybind11"] 3 | build-backend = "scikit_build_core.build" 4 | 5 | [project] 6 | name = "phik" 7 | version = "0.12.5" 8 | description = "Phi_K correlation analyzer library" 9 | readme = "README.rst" 10 | authors = [{ name = "KPMG N.V. 
The Netherlands", email = "kave@kpmg.com" }] 11 | requires-python = ">=3.9" 12 | classifiers = [ 13 | "Development Status :: 4 - Beta", 14 | "License :: OSI Approved :: MIT License", 15 | "Programming Language :: Python :: 3 :: Only", 16 | "Programming Language :: Python :: 3.9", 17 | "Programming Language :: Python :: 3.10", 18 | "Programming Language :: Python :: 3.11", 19 | "Programming Language :: Python :: 3.12", 20 | ] 21 | 22 | dependencies = [ 23 | "numpy>=1.18.0", 24 | "scipy>=1.5.2", 25 | "pandas>=0.25.1", 26 | "matplotlib>=2.2.3", 27 | "joblib>=0.14.1", 28 | ] 29 | 30 | [project.optional-dependencies] 31 | test = ["pytest>=4.0.2", "pytest-pylint>=0.13.0"] 32 | 33 | [tool.scikit-build] 34 | wheel.expand-macos-universal-tags = true 35 | cmake.build-type = "Release" 36 | logging.level = "WARNING" 37 | sdist.include = ["phik/simcore", "CMakeLists.txt"] 38 | 39 | [tool.pytest.ini_options] 40 | minversion = "6.0" 41 | addopts = ["-ra"] 42 | xfail_strict = true 43 | log_cli_level = "INFO" 44 | filterwarnings = ["error"] 45 | testpaths = ["tests"] 46 | -------------------------------------------------------------------------------- /docs/source/tutorials.rst: -------------------------------------------------------------------------------- 1 | ========= 2 | Tutorials 3 | ========= 4 | 5 | This section contains materials on how to use the Phi_K correlation analysis code. 6 | There are additional side notes on how certain aspects work and where to find parts of the code. 7 | For more in depth explanations on the functionality of the code-base, try the `API docs `_. 8 | 9 | The tutorials are available in the ``phik/notebooks`` directory. We have: 10 | 11 | * A basic tutorial: this covers the basics of calculating Phi_K, the statistical significance, and interpreting the correlation. 12 | * An advanced tutorial: this shows how to use the advanced features of the ``PhiK`` library. 13 | * A spark tutorial: this shows how to calculate the Phi_K correlation matrix for a spark dataframe. 14 | 15 | You can open these notebooks directly: 16 | 17 | * Run them interactively at `MyBinder `_. 18 | * View them statically: `basic tutorial `_ and the `advanced tutorial `_ and the `spark tutorial `_. 19 | -------------------------------------------------------------------------------- /phik/decorators/pandas.py: -------------------------------------------------------------------------------- 1 | """Project: PhiK - correlation analyzer library 2 | 3 | Module: phik.decorators.pandas 4 | 5 | Created: 2018/11/14 6 | 7 | Description: 8 | Decorators for pandas DataFrame objects 9 | 10 | Authors: 11 | KPMG Advanced Analytics & Big Data team, Amstelveen, The Netherlands 12 | 13 | Redistribution and use in source and binary forms, with or without 14 | modification, are permitted according to the terms listed in the file 15 | LICENSE. 
16 | """ 17 | 18 | from pandas import DataFrame, Series 19 | 20 | # add function to create a 2d histogram 21 | from phik.binning import hist2d, hist2d_from_array 22 | DataFrame.hist2d = hist2d 23 | Series.hist2d = hist2d_from_array 24 | 25 | # add phik correlation matrix function 26 | from phik.phik import phik_matrix, global_phik_array 27 | DataFrame.phik_matrix = phik_matrix 28 | DataFrame.global_phik = global_phik_array 29 | 30 | # add significance matrix function for variable dependencies 31 | from phik.significance import significance_matrix 32 | DataFrame.significance_matrix = significance_matrix 33 | 34 | # outlier matrix 35 | from phik.outliers import outlier_significance_matrices, outlier_significance_matrix, outlier_significance_from_array 36 | DataFrame.outlier_significance_matrices = outlier_significance_matrices 37 | DataFrame.outlier_significance_matrix = outlier_significance_matrix 38 | Series.outlier_significance_matrix = outlier_significance_from_array 39 | -------------------------------------------------------------------------------- /docs/source/developing.rst: -------------------------------------------------------------------------------- 1 | =========================== 2 | Developing and Contributing 3 | =========================== 4 | 5 | 6 | Working on the package 7 | ---------------------- 8 | You have some cool feature and/or algorithm you want to add to the package. How do you go about it? 9 | 10 | First clone the package. 11 | 12 | .. code-block:: bash 13 | 14 | git clone https://github.com/KaveIO/PhiK.git 15 | 16 | then 17 | 18 | .. code-block:: bash 19 | 20 | pip install -e PhiK/ 21 | 22 | this will install ``PhiK`` in editable mode, which will allow you to edit the code and run it as 23 | you would with a normal installation of the ``PhiK`` correlation analyzer package. 24 | 25 | To make sure that everything works try executing the tests, e.g. 26 | 27 | .. code-block:: bash 28 | 29 | cd PhiK/ 30 | phik_trial . 31 | 32 | or 33 | 34 | .. code-block:: bash 35 | 36 | cd PhiK/ 37 | python setup.py test 38 | 39 | That's it. 40 | 41 | 42 | Contributing 43 | ------------ 44 | 45 | When contributing to this repository, please first discuss the change you wish to make via issue, email, or any 46 | other method with the owners of this repository before making a change. You can find the contact information on the 47 | `index `_ page. 48 | 49 | Note that when contributing that all tests should succeed. 50 | 51 | 52 | Tips and Tricks 53 | --------------- 54 | 55 | - Enable auto reload in ``jupyter``: 56 | 57 | .. code-block:: python 58 | 59 | %load_ext autoreload 60 | 61 | this will reload modules before executing any user code. 
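For the reload to actually take effect you also have to select an autoreload mode; mode ``2`` reloads all imported modules before each cell is executed. This is standard IPython/Jupyter behaviour, sketched here for completeness:

.. code-block:: python

    # load the extension once per kernel session ...
    %load_ext autoreload
    # ... and reload all imported modules before executing new code
    %autoreload 2

    import phik  # edits to the phik sources are now picked up on the next execution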
62 | -------------------------------------------------------------------------------- /.github/workflows/valgrind.yml: -------------------------------------------------------------------------------- 1 | name: Valgrind 2 | 3 | on: 4 | pull_request: 5 | branches: 6 | - master 7 | workflow_dispatch: 8 | 9 | defaults: 10 | run: 11 | shell: bash 12 | 13 | jobs: 14 | pre_job: 15 | # continue-on-error: true # Uncomment once integration is finished 16 | runs-on: ubuntu-latest 17 | # Map a step output to a job output 18 | outputs: 19 | should_skip: ${{ steps.skip_check.outputs.should_skip }} 20 | steps: 21 | - id: skip_check 22 | uses: fkirc/skip-duplicate-actions@master 23 | with: 24 | # All of these options are optional, so you can remove them if you are happy with the defaults 25 | cancel_others: 'true' 26 | do_not_skip: '["pull_request", "workflow_dispatch", "schedule"]' 27 | build: 28 | name: Valgrind 29 | needs: pre_job 30 | if: ${{ needs.pre_job.outputs.should_skip != 'true' }} 31 | runs-on: ubuntu-latest 32 | 33 | steps: 34 | # Checks-out your repository under $GITHUB_WORKSPACE, so your job can access it 35 | - uses: actions/checkout@v6 36 | with: 37 | submodules: false 38 | 39 | - uses: actions/setup-python@v6 40 | with: 41 | python-version: '3.10' 42 | 43 | - name: Install dependencies on ubuntu 44 | run: | 45 | sudo apt-get update 46 | sudo apt-get install -y valgrind 47 | 48 | - name: Install python packages 49 | run: | 50 | python -m pip install --upgrade pip pytest 51 | 52 | - name: Install 53 | run: | 54 | # temp fix for Valgrind issue with later versions 55 | pip install scipy==1.9.1 56 | CMAKE_ARGS="-DCMAKE_BUILD_TYPE=Debug" pip install . -v 57 | 58 | - name: Test 59 | run: | 60 | cd tests 61 | PYTHONMALLOC=malloc valgrind --leak-check=yes --track-origins=yes --log-file=valgrind-log.txt python -m pytest test_phik.py -W ignore::DeprecationWarning 62 | -------------------------------------------------------------------------------- /CMakeLists.txt: -------------------------------------------------------------------------------- 1 | cmake_minimum_required(VERSION 3.17...3.27) 2 | 3 | # Scikit-build-core sets these values for you, or you can just hard-code the 4 | # name and version. 
5 | project( 6 | ${SKBUILD_PROJECT_NAME} 7 | VERSION ${SKBUILD_PROJECT_VERSION} 8 | DESCRIPTION "C++ bindings for simulation RXC tables" 9 | LANGUAGES CXX) 10 | 11 | set(CMAKE_CXX_STANDARD 14) 12 | # Define CMAKE_INSTALL_xxx: LIBDIR, INCLUDEDIR 13 | include(GNUInstallDirs) 14 | 15 | find_package(Python REQUIRED COMPONENTS Interpreter Development.Module) 16 | find_package(pybind11 CONFIG REQUIRED) 17 | set(SUBPATH ${PROJECT_SOURCE_DIR}/phik/simcore/) 18 | 19 | # ############################################################################## 20 | # build ASA159 library # 21 | # ############################################################################## 22 | if(CMAKE_CXX_COMPILER_ID STREQUAL "GNU") 23 | set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fPIC") 24 | endif() 25 | 26 | add_library(_asa159 OBJECT ${SUBPATH}/asa159.cpp) 27 | 28 | target_include_directories(_asa159 PRIVATE ${SUBPATH}) 29 | # ############################################################################## 30 | # EXECUTABLE # 31 | # ############################################################################## 32 | pybind11_add_module(_phik_simulation_core MODULE ${SUBPATH}/bindings.cpp 33 | ${SUBPATH}/simulation.hpp $) 34 | 35 | target_compile_definitions(_phik_simulation_core 36 | PRIVATE VERSION_INFO=${SKBUILD_PROJECT_VERSION}) 37 | 38 | target_include_directories( 39 | _phik_simulation_core PUBLIC $ 40 | $) 41 | if(PHIK_MBUILD) 42 | set(CMAKE_INSTALL_PREFIX "${PROJECT_SOURCE_DIR}") 43 | endif() 44 | 45 | install(TARGETS _phik_simulation_core LIBRARY DESTINATION "${PROJECT_NAME}/lib") 46 | 47 | # Quiet a warning, since this project is only valid with SKBUILD 48 | set(ignoreMe "${SKBUILD}") 49 | -------------------------------------------------------------------------------- /.github/workflows/wheels.yml: -------------------------------------------------------------------------------- 1 | name: Wheels 2 | 3 | on: 4 | workflow_dispatch: 5 | pull_request: 6 | push: 7 | branches: 8 | - master 9 | release: 10 | types: 11 | - published 12 | 13 | jobs: 14 | make_sdist: 15 | name: Make SDist 16 | runs-on: ubuntu-latest 17 | steps: 18 | - uses: actions/checkout@v6 19 | 20 | - name: Build SDist 21 | run: pipx run build --sdist 22 | 23 | - uses: actions/upload-artifact@v6 24 | with: 25 | name: artifact-sdist 26 | path: dist/*.tar.gz 27 | 28 | 29 | build_wheels: 30 | name: Wheels on ${{ matrix.os }} 31 | runs-on: ${{ matrix.os }} 32 | strategy: 33 | fail-fast: false 34 | matrix: 35 | os: [ubuntu-latest, windows-latest, macos-latest] 36 | 37 | steps: 38 | - uses: actions/checkout@v6 39 | with: 40 | submodules: true 41 | 42 | - uses: actions/setup-python@v6 43 | with: 44 | python-version: "3.12" 45 | 46 | - uses: pypa/cibuildwheel@v3.3.0 47 | env: 48 | CIBW_ENVIRONMENT: MACOSX_DEPLOYMENT_TARGET=10.13 49 | CIBW_BUILD: 'cp38-* cp39-* cp310-* cp311-* cp312-* cp313-* cp314-*' 50 | CIBW_TEST_EXTRAS: test 51 | CIBW_TEST_COMMAND: pytest {project}/tests/test_phik.py -W ignore::DeprecationWarning 52 | CIBW_ARCHS: "auto64" 53 | CIBW_ARCHS_MACOS: "x86_64 arm64" 54 | # Skip 32-bit builds 55 | CIBW_SKIP: "*-win32 *-manylinux_i686 *-musllinux_x86_64" 56 | 57 | - name: Show files 58 | run: ls -lh wheelhouse 59 | shell: bash 60 | 61 | - name: Verify clean directory 62 | run: git diff --exit-code 63 | shell: bash 64 | 65 | - name: Upload wheels 66 | uses: actions/upload-artifact@v6 67 | with: 68 | name: artifact-${{ matrix.os }} 69 | path: wheelhouse/*.whl 70 | 71 | upload_all: 72 | needs: [build_wheels, make_sdist] 73 | runs-on: ubuntu-latest 74 | if: 
github.event_name == 'release' && github.event.action == 'published' 75 | steps: 76 | - uses: actions/download-artifact@v7 77 | with: 78 | pattern: artifact-* 79 | merge-multiple: true 80 | path: dist 81 | 82 | - uses: pypa/gh-action-pypi-publish@release/v1 83 | with: 84 | user: __token__ 85 | password: ${{ secrets.pypi_password }} 86 | -------------------------------------------------------------------------------- /phik/resources.py: -------------------------------------------------------------------------------- 1 | """Project: PhiK - correlation analyzer library 2 | 3 | Created: 2018/11/13 4 | 5 | Description: 6 | Collection of helper functions to get fixtures, i.e. for test data and notebooks. 7 | These are mostly used by the (integration) tests and example notebooks. 8 | 9 | Authors: 10 | KPMG Advanced Analytics & Big Data team, Amstelveen, The Netherlands 11 | 12 | Redistribution and use in source and binary forms, with or without 13 | modification, are permitted according to the terms listed in the file 14 | LICENSE. 15 | """ 16 | 17 | import pathlib 18 | from pathlib import Path 19 | 20 | ROOT_DIRECTORY = Path(__file__).resolve().parent 21 | 22 | 23 | # Fixtures 24 | _FIXTURE = {_.name: _ for _ in pathlib.Path(ROOT_DIRECTORY / "data").glob("*")} 25 | # Tutorial notebooks 26 | _NOTEBOOK = { 27 | _.name: _ for _ in pathlib.Path(ROOT_DIRECTORY / "notebooks").glob("*.ipynb") 28 | } 29 | 30 | # Resource types 31 | _RESOURCES = {"fixture": _FIXTURE, "notebook": _NOTEBOOK} 32 | 33 | 34 | def _resource(resource_type, name: str) -> str: 35 | """Return the full path filename of a resource. 36 | 37 | :param str resource_type: The type of the resource. 38 | :param str name: The name of the resource. 39 | :returns: The full path filename of the fixture data set. 40 | :rtype: str 41 | :raises FileNotFoundError: If the resource cannot be found. 42 | """ 43 | full_path = _RESOURCES[resource_type].get(name, None) 44 | 45 | if full_path and full_path.exists(): 46 | return str(full_path) 47 | 48 | raise FileNotFoundError( 49 | 'Could not find {resource_type} "{name!s}"! Does it exist?'.format( 50 | resource_type=resource_type, name=name 51 | ) 52 | ) 53 | 54 | 55 | def fixture(name: str) -> str: 56 | """Return the full path filename of a fixture data set. 57 | 58 | :param str name: The name of the fixture. 59 | :returns: The full path filename of the fixture data set. 60 | :rtype: str 61 | :raises FileNotFoundError: If the fixture cannot be found. 62 | """ 63 | return _resource("fixture", name) 64 | 65 | 66 | def notebook(name: str) -> str: 67 | """Return the full path filename of a tutorial notebook. 68 | 69 | :param str name: The name of the notebook. 70 | :returns: The full path filename of the notebook. 71 | :rtype: str 72 | :raises FileNotFoundError: If the notebook cannot be found. 73 | """ 74 | return _resource("notebook", name) 75 | -------------------------------------------------------------------------------- /docs/source/phik.rst: -------------------------------------------------------------------------------- 1 | phik package 2 | ============ 3 | 4 | Subpackages 5 | ----------- 6 | 7 | .. toctree:: 8 | 9 | phik.decorators 10 | 11 | Submodules 12 | ---------- 13 | 14 | phik.betainc module 15 | ------------------- 16 | 17 | .. automodule:: phik.betainc 18 | :members: 19 | :undoc-members: 20 | :show-inheritance: 21 | 22 | phik.binning module 23 | ------------------- 24 | 25 | .. 
automodule:: phik.binning 26 | :members: 27 | :undoc-members: 28 | :show-inheritance: 29 | 30 | phik.bivariate module 31 | --------------------- 32 | 33 | .. automodule:: phik.bivariate 34 | :members: 35 | :undoc-members: 36 | :show-inheritance: 37 | 38 | phik.data\_quality module 39 | ------------------------- 40 | 41 | .. automodule:: phik.data_quality 42 | :members: 43 | :undoc-members: 44 | :show-inheritance: 45 | 46 | phik.definitions module 47 | ----------------------- 48 | 49 | .. automodule:: phik.definitions 50 | :members: 51 | :undoc-members: 52 | :show-inheritance: 53 | 54 | phik.entry\_points module 55 | ------------------------- 56 | 57 | .. automodule:: phik.entry_points 58 | :members: 59 | :undoc-members: 60 | :show-inheritance: 61 | 62 | phik.outliers module 63 | -------------------- 64 | 65 | .. automodule:: phik.outliers 66 | :members: 67 | :undoc-members: 68 | :show-inheritance: 69 | 70 | phik.phik module 71 | ---------------- 72 | 73 | .. automodule:: phik.phik 74 | :members: 75 | :undoc-members: 76 | :show-inheritance: 77 | 78 | phik.report module 79 | ------------------ 80 | 81 | .. automodule:: phik.report 82 | :members: 83 | :undoc-members: 84 | :show-inheritance: 85 | 86 | phik.resources module 87 | --------------------- 88 | 89 | .. automodule:: phik.resources 90 | :members: 91 | :undoc-members: 92 | :show-inheritance: 93 | 94 | phik.significance module 95 | ------------------------ 96 | 97 | .. automodule:: phik.significance 98 | :members: 99 | :undoc-members: 100 | :show-inheritance: 101 | 102 | phik.simulation module 103 | ---------------------- 104 | 105 | .. automodule:: phik.simulation 106 | :members: 107 | :undoc-members: 108 | :show-inheritance: 109 | 110 | phik.statistics module 111 | ---------------------- 112 | 113 | .. automodule:: phik.statistics 114 | :members: 115 | :undoc-members: 116 | :show-inheritance: 117 | 118 | phik.version module 119 | ------------------- 120 | 121 | .. automodule:: phik.version 122 | :members: 123 | :undoc-members: 124 | :show-inheritance: 125 | 126 | 127 | Module contents 128 | --------------- 129 | 130 | .. automodule:: phik 131 | :members: 132 | :undoc-members: 133 | :show-inheritance: 134 | -------------------------------------------------------------------------------- /phik/simcore/simulation.hpp: -------------------------------------------------------------------------------- 1 | /* python/phik/simulation/simulation.hpp wrapper and bindings for 2 | * Michael Patefield, 3 | * Algorithm AS 159: An Efficient Method of Generating RXC Tables with Given Row and Column Totals, 4 | * Applied Statistics, 5 | * Volume 30, Number 1, 1981, pages 91-97. 
6 |  * 7 |  * https://people.sc.fsu.edu/~jburkardt/cpp_src/asa159/asa159.html 8 |  */ 9 | 10 | #ifndef PYTHON_PHIK_SIMCORE_SIMULATION_HPP_ 11 | #define PYTHON_PHIK_SIMCORE_SIMULATION_HPP_ 12 | #include "asa159.hpp" 13 | #include <pybind11/pybind11.h> 14 | #include <pybind11/numpy.h> 15 | 16 | namespace py = pybind11; 17 | 18 | struct simulation_error: std::exception { 19 |     const char* p_message; 20 |     explicit simulation_error(const char* message) : p_message(message) {} 21 |     const char* what() const throw() { return p_message; } 22 | }; 23 | 24 | void _sim_2d_data_patefield( 25 |     int nrow, 26 |     int ncol, 27 |     const py::array_t<int>& nrowt, 28 |     const py::array_t<int>& ncolt, 29 |     int seed, 30 |     py::array_t<int>& result 31 | ) { 32 |     bool key = false; 33 |     int ierror = 0; 34 |     int* nrowt_ptr = reinterpret_cast<int*>(nrowt.request().ptr); 35 |     int* ncolt_ptr = reinterpret_cast<int*>(ncolt.request().ptr); 36 |     int* result_ptr = reinterpret_cast<int*>(result.request().ptr); 37 | 38 |     // constructs a random two-way contingency table with given sums, 39 |     // the underlying memory of result is directly modified 40 |     rcont2(nrow, ncol, nrowt_ptr, ncolt_ptr, &key, &seed, result_ptr, &ierror); 41 |     if (ierror != 0) { 42 |         throw simulation_error("Could not construct two-way contingency table"); 43 |     } 44 |     return; 45 | } 46 | 47 | auto docstring = R"pbdoc(Construct a random two-way contingency table with given sums 48 | 49 | Parameters 50 | ---------- 51 | nrow : int 52 |     number of rows in the table, should be >= 2 53 | ncol : int 54 |     number of columns in the table, should be >= 2 55 | nrowt : np.array[int] 56 |     the row sums, note all values should be positive 57 | ncolt : np.array[int] 58 |     the col sums, note all values should be positive 59 | seed : int 60 |     random seed for the generation 61 | result : np.array[int] 62 |     initialized array where the results will be stored 63 | 64 | Reference 65 | --------- 66 | WM Patefield, 67 | Algorithm AS 159: 68 | An Efficient Method of Generating RXC Tables with 69 | Given Row and Column Totals, 70 | Applied Statistics, 71 | Volume 30, Number 1, 1981, pages 91-97. 72 | )pbdoc"; 73 | 74 | void bind_simulation(py::module &m) { 75 |     m.def( 76 |         "_sim_2d_data_patefield", 77 |         &_sim_2d_data_patefield, 78 |         docstring, 79 |         py::arg("nrow"), 80 |         py::arg("ncol"), 81 |         py::arg("nrowt"), 82 |         py::arg("ncolt"), 83 |         py::arg("seed"), 84 |         py::arg("result") 85 |     ); 86 | } 87 | 88 | #endif // PYTHON_PHIK_SIMCORE_SIMULATION_HPP_ 89 | -------------------------------------------------------------------------------- /docs/README.rst: -------------------------------------------------------------------------------- 1 | Generating Documentation with Sphinx 2 | ==================================== 3 | 4 | This README is for generating and writing documentation using Sphinx. 5 | On the repository there should already be the auto-generated files 6 | along with the regular documentation. 7 | 8 | Installing Sphinx 9 | ----------------- 10 | 11 | First install Sphinx. Go to http://www.sphinx-doc.org/en/stable/ or run 12 | 13 | :: 14 | 15 |     pip install -U Sphinx 16 |     pip install -U sphinx-rtd-theme 17 |     conda install -c conda-forge nbsphinx 18 | 19 | The ``docs`` folder has the structure of a Sphinx project. 20 | However, if you want to make a new Sphinx project run: 21 | 22 | :: 23 | 24 |     sphinx-quickstart 25 | 26 | It quickly generates a conf.py file which contains your configuration 27 | for your sphinx build.
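For orientation, a ``conf.py`` for a project like this would typically enable the autodoc machinery and the notebook/theme packages installed above. The fragment below is only an illustrative sketch; the actual settings live in ``docs/source/conf.py`` and may differ:

::

    # illustrative conf.py fragment; see docs/source/conf.py for the real configuration
    project = 'PhiK'
    extensions = [
        'sphinx.ext.autodoc',   # pull API documentation from the phik docstrings
        'sphinx.ext.viewcode',  # link documented objects to their source
        'nbsphinx',             # render the tutorial notebooks
    ]
    html_theme = 'sphinx_rtd_theme'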
28 | 29 | Update the HTML docs 30 | -------------------- 31 | 32 | Now we want Sphinx to autogenerate from docstrings and other 33 | documentation in the code base. Luckily Sphinx has the apidoc 34 | functionality. This goes through a path, finds all the python files and 35 | depending on your arguments, parses certain parts of the code 36 | (docstring, hidden classes, etc.). 37 | 38 | **First make sure your environment is set up properly. Python must be 39 | able to import all modules, otherwise it will not work!** 40 | 41 | From the root of the repository: 42 | 43 | :: 44 | 45 |     $ source setup.sh 46 | 47 | To run the autogeneration of the documentation, type in /docs/: 48 | 49 | :: 50 | 51 |     ./autogenerate.sh 52 | 53 | to scan the pyfiles and generate \*.rst files with the documentation. 54 | The script itself contains the usage of apidoc. 55 | 56 | Now to make the actual documentation files run: 57 | 58 | :: 59 | 60 |     make clean 61 | 62 | to clean up the old make of sphinx and run: 63 | 64 | :: 65 | 66 |     make html 67 | 68 | to make the new html build. It will be stored in (your config can adjust 69 | this, but the default is:) docs/build/html/. The index.html is the 70 | starting page. Open this file to see the result. 71 | 72 | Mounting a different repository to vagrant 73 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 74 | 75 | When you want to develop code that is not part of the repository that 76 | your vagrant is in, you can mount it separately. This is done in 77 | the Vagrantfile, by changing the ``#mount`` line to the path of the repository 78 | that you want to mount: 79 | 80 | :: 81 | 82 |     config.vm.synced_folder "", "", id: "esrepo" 83 | 84 | where the location to mount is e.g. /opt/eskapade. 85 | 86 | What is an .rst file? 87 | ~~~~~~~~~~~~~~~~~~~~~ 88 | 89 | R(e)ST is the format that Sphinx uses; it stands for ReSTructured 90 | (http://docutils.sourceforge.net/docs/user/rst/quickref.html). It looks 91 | for other RST files to import; see index.rst to see how the **toctree** 92 | refers to other files.
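As a concrete example, the ``toctree`` in ``docs/source/index.rst`` of this repository simply lists the other ``.rst`` files (without the extension) that should become part of the document tree:

::

    .. toctree::
       :maxdepth: 2

       introduction
       tutorials
       publication
       developing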
93 | -------------------------------------------------------------------------------- /phik/utils.py: -------------------------------------------------------------------------------- 1 | from typing import Union 2 | 3 | import pandas as pd 4 | import numpy as np 5 | 6 | 7 | def array_like_to_dataframe( 8 | x: Union[pd.Series, list, np.ndarray], y: Union[pd.Series, list, np.ndarray] 9 | ) -> pd.DataFrame: 10 | """Concat two array-like data structures into a DataFrame 11 | 12 | :param x: pd.Series, list or np.ndarray 13 | :param y: pd.Series, list or np.ndarray 14 | :return: pd.DataFrame 15 | """ 16 | x_name = x.name if isinstance(x, pd.Series) else "x" 17 | y_name = y.name if isinstance(y, pd.Series) else "y" 18 | 19 | return pd.DataFrame(np.array([x, y]).T, columns=[x_name, y_name]) 20 | 21 | 22 | def guess_interval_cols(df: pd.DataFrame, verbose: bool = False) -> list: 23 | """Select columns that have a dtype part of np.number 24 | 25 | :param df: DataFrame 26 | :param bool verbose: print all interval columns that are guessed 27 | :return: list of interval columns 28 | """ 29 | interval_cols = df.select_dtypes(include=[np.number]).columns.tolist() 30 | if interval_cols and verbose: 31 | print("interval columns not set, guessing: {}".format(str(interval_cols))) 32 | 33 | if not isinstance(interval_cols, list): 34 | raise ValueError("Could not guess interval columns") 35 | return interval_cols 36 | 37 | 38 | def make_shapes_equal(observed: pd.DataFrame, expected: pd.DataFrame) -> pd.DataFrame: 39 | """Make observed and expected (pandas) histograms equal in shape 40 | 41 | Sometimes expected histogram shape need filling / pruning to make its shape equal to observed. Give expected the 42 | same number of columns and rows. Needed for proper histogram comparison. 43 | 44 | :param pd.DataFrame observed: The observed contingency table. The table contains the observed number of occurrences in each cell. 45 | :param pd.DataFrame expected: The expected contingency table. The table contains the expected number of occurrences in each cell. 46 | :return: expected frequencies, having the same shape as observed 47 | """ 48 | # columns 49 | o_cols = observed.columns.tolist() 50 | e_cols = expected.columns.tolist() 51 | o_cols_missing = list(set(e_cols) - set(o_cols)) 52 | e_cols_missing = list(set(o_cols) - set(e_cols)) 53 | # index 54 | o_idx = observed.index.tolist() 55 | e_idx = expected.index.tolist() 56 | o_idx_missing = list(set(e_idx) - set(o_idx)) 57 | e_idx_missing = list(set(o_idx) - set(e_idx)) 58 | 59 | # make expected columns equal to observed 60 | for c in o_cols_missing: 61 | observed[c] = 0.0 62 | for c in e_cols_missing: 63 | expected[c] = 0.0 64 | observed.columns = sorted(observed.columns) 65 | expected.columns = sorted(expected.columns) 66 | # this should always be a match now 67 | assert len(observed.columns) == len(expected.columns) 68 | 69 | # make expected index equal to observed 70 | for i in o_idx_missing: 71 | observed.loc[i] = np.zeros(len(observed.columns)) 72 | for i in e_idx_missing: 73 | expected.loc[i] = np.zeros(len(expected.columns)) 74 | # this should always be a match now 75 | assert len(observed.index) == len(expected.index) 76 | 77 | return expected 78 | -------------------------------------------------------------------------------- /docs/source/introduction.rst: -------------------------------------------------------------------------------- 1 | ====================== 2 | Why did we build this? 
3 | ====================== 4 | 5 | When exploring a data set, for example to model one variable in terms of the others, 6 | it is useful to summarize the dependencies between the variables, assess their significances, and 7 | visualize the individual variable dependencies. 8 | The ``PhiK`` correlation analyzer library contains several useful functions to help one do so. 9 | 10 | * This library implements a novel correlation coefficient, :math:`\phi_{K}`, with properties that - taken together - form 11 | an advantage over existing methods. 12 | 13 | The calculation of correlation coefficients between paired data variables is a standard tool of analysis for every data analyst. 14 | Pearson's correlation coefficient is a de facto standard in most fields, but by construction only works for interval variables 15 | (sometimes called continuous variables). Pearson is unsuitable for data sets with mixed variable types, 16 | e.g. where some variables are ordinal or categorical. 17 | 18 | While many correlation coefficients exist, each with different features, we have not been able to find a 19 | correlation coefficient with Pearson-like characteristics 20 | and a sound statistical interpretation that works for interval, ordinal and categorical variable types alike. 21 | 22 | The correlation coefficient :math:`\phi_{K}` follows a uniform treatment for interval, ordinal and categorical variables, 23 | captures non-linear dependencies, and is similar to Pearson's correlation coefficient in case of a bivariate normal input distribution. 24 | 25 | * We found that, by default, popular analysis libraries such ``R`` and ``scipy`` make incorrect ("asymptotic") assumptions when assessing 26 | the statistical significance of the :math:`\chi^2` contingency test of variable independence. In particular, the actual number of 27 | degrees of freedom and the shape of the test statistic distribution can differ significantly from their theoretical 28 | predictions in case of low to medium statistics data samples. This leads to incorrect p-values for the hypothesis test of variable 29 | independence. A prescription has been implemented to fix these two mistakes. 30 | 31 | * Visualizing the dependency between variables can be tricky, especially when dealing with (unordered) categorical variables. 32 | To help interpret any variable relationship found, we provide a method for the detection of 33 | significant excesses or deficits of records with respect to the expected values in a contingency table, so-called outliers, 34 | using a statistically independent evaluation for expected frequency of records, accounting for the uncertainty on the expectation. 35 | We evaluate the significance of each outlier frequency in a table, and normalize and visualize these accordingly. 36 | The resulting plots we find to be very valuable to help interpret variable dependencies, 37 | and work alike for interval, ordinal and categorical variables. 38 | 39 | The ``PhiK`` analysis library is particularly useful in modern-day analysis when studying the dependencies between a set of 40 | variables with mixed types, where often some variables are categorical. 41 | The package has been used by us to study surveys, insurance claims, correlograms, etc. 42 | 43 | For details on the methodology behind the calculations, please see our publication. 44 | For the available examples on how to use the methods, please see the `tutorials `_ section. 
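As a short, condensed sketch of the points above, based on the example script shipped with the package: the correlation matrix, the significances of the variable dependencies, and the outlier significances of a single variable pair can all be obtained through the pandas methods that ``phik`` registers on import.

.. code-block:: python

    import pandas as pd
    import phik  # importing phik adds the methods used below to pandas DataFrames
    from phik import resources

    # fake car insurance data shipped with the package
    df = pd.read_csv(resources.fixture('fake_insurance_data.csv.gz'))

    df.phik_matrix()          # phi_K correlation matrix between all variables
    df.significance_matrix()  # one-sided Z of the independence test per variable pair
    df[['mileage', 'car_size']].outlier_significance_matrix()  # per-cell outlier significances

Interval variables are binned automatically before the contingency tables are formed; the tutorials show how to control this binning and the significance calculation in detail.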
45 | -------------------------------------------------------------------------------- /CHANGES.rst: -------------------------------------------------------------------------------- 1 | ============= 2 | Release notes 3 | ============= 4 | 5 | Version 0.12.5, Jul 2025 6 | ------------------------ 7 | 8 | - FIX: scipy 1.16.0 no longer supports mvn, code now migrated to qmvn. 9 | https://github.com/KaveIO/PhiK/issues/101 10 | https://github.com/KaveIO/PhiK/pull/102 11 | - Drop support for Python 3.8, has reached end of life. 12 | 13 | Version 0.12.4, Jan 2024 14 | ------------------------ 15 | 16 | - Add support for Python 3.12. 17 | - ENH: added plotting kwargs to correlation_report function. 18 | https://github.com/KaveIO/PhiK/issues/58 19 | - FIX: fix of bin edge values they are rounded with 1e-14 20 | https://github.com/KaveIO/PhiK/issues/60 21 | - FIX: numpy random multinomial requires integer number of samples (for nixOS) 22 | https://github.com/KaveIO/PhiK/issues/73 23 | - FIX: pandas deprecation warning 24 | https://github.com/KaveIO/PhiK/pull/74 25 | - Drop support for Python 3.7, has reached end of life. 26 | 27 | Version 0.12.3, Dec 2022 28 | ------------------------ 29 | 30 | - Add support for Python 3.11 31 | 32 | Version 0.12.2, Mar 2022 33 | ------------------------ 34 | 35 | - Fix missing setup.py and pyproject.toml in source distribution 36 | - Support wheels ARM MacOS (Apple silicone) 37 | 38 | Version 0.12.1, Mar 2022 39 | ------------------------ 40 | 41 | - Two fixes to make calculation of global phik robust: global phik capped in range [0, 1], 42 | and check for successful correlation matrix inversion. 43 | - Migration to to scikit-build 0.13.1. 44 | - Support wheels for Python 3.10. 45 | 46 | 47 | Version 0.12.0, July 2021 48 | ------------------------- 49 | 50 | C++ Extension 51 | ~~~~~~~~~~~~~ 52 | 53 | Phi_K contains an optional C++ extension to compute the significance matrix using the `hypergeometric` method 54 | (also called the`Patefield` method). 55 | 56 | Note that the PyPi distributed wheels contain a pre-build extension for Linux, MacOS and Windows. 57 | 58 | A manual (pip) setup will attempt to build and install the extension, if it fails it will install without the extension. 59 | If so, using the `hypergeometric` method without the extension will trigger a 60 | NotImplementedError. 61 | 62 | Compiler requirements through Pybind11: 63 | 64 | - Clang/LLVM 3.3 or newer (for Apple Xcode's clang, this is 5.0.0 or newer) 65 | - GCC 4.8 or newer 66 | - Microsoft Visual Studio 2015 Update 3 or newer 67 | - Intel classic C++ compiler 18 or newer (ICC 20.2 tested in CI) 68 | - Cygwin/GCC (previously tested on 2.5.1) 69 | - NVCC (CUDA 11.0 tested in CI) 70 | - NVIDIA PGI (20.9 tested in CI) 71 | 72 | 73 | Other 74 | ~~~~~ 75 | 76 | * You can now manually set the number of parallel jobs in the evaluation of Phi_K or its statistical significance 77 | (when using MC simulations). For example, to use 4 parallel jobs do: 78 | 79 | .. code-block:: python 80 | 81 | df.phik_matrix(njobs = 4) 82 | df.significance_matrix(njobs = 4) 83 | 84 | The default value is -1, in which case all available cores are used. When using ``njobs=1`` no parallel processing 85 | is applied. 86 | 87 | * Phi_K can now be calculated with an independent expectation histogram: 88 | 89 | .. 
code-block:: python 90 | 91 | from phik.phik import phik_from_hist2d 92 | 93 | cols = ["mileage", "car_size"] 94 | interval_cols = ["mileage"] 95 | 96 | observed = df1[["feature1", "feature2"]].hist2d() 97 | expected = df2[["feature1", "feature2"]].hist2d() 98 | 99 | phik_value = phik_from_hist2d(observed=observed, expected=expected) 100 | 101 | The expected histogram is taken to be (relatively) large in number of counts 102 | compared with the observed histogram. 103 | 104 | Or can compare two (pre-binned) datasets against each other directly. Again the expected dataset 105 | is assumed to be relatively large: 106 | 107 | .. code-block:: python 108 | 109 | from phik.phik import phik_observed_vs_expected_from_rebinned_df 110 | 111 | phik_matrix = phik_observed_vs_expected_from_rebinned_df(df1_binned, df2_binned) 112 | 113 | * Added links in the readme to the basic and advanced Phi_K tutorials on google colab. 114 | * Migrated the spark example Phi_K notebook from popmon to directly using histogrammar for histogram creation. 115 | 116 | 117 | 118 | 119 | Older versions 120 | -------------- 121 | 122 | * Please see documentation for full details: https://phik.readthedocs.io 123 | -------------------------------------------------------------------------------- /phik/betainc.py: -------------------------------------------------------------------------------- 1 | """Project: PhiK - correlation analyzer library 2 | 3 | Created: 2018/09/05 4 | 5 | Description: 6 | Implementation of incomplete beta function 7 | 8 | Authors: 9 | KPMG Advanced Analytics & Big Data team, Amstelveen, The Netherlands 10 | 11 | Redistribution and use in source and binary forms, with or without 12 | modification, are permitted according to the terms listed in the file 13 | LICENSE. 14 | """ 15 | import numpy as np 16 | from scipy.special import gammaln 17 | from typing import Tuple 18 | 19 | 20 | def contfractbeta( 21 | a: float, b: float, x: float, ITMAX: int = 5000, EPS: float = 1.0e-7 22 | ) -> float: 23 | """Continued fraction form of the incomplete Beta function. 24 | 25 | Code translated from: Numerical Recipes in C. 26 | 27 | Example kindly taken from blog: 28 | https://malishoaib.wordpress.com/2014/04/15/the-beautiful-beta-functions-in-raw-python/ 29 | 30 | :param float a: a 31 | :param float b: b 32 | :param float x: x 33 | :param int ITMAX: max number of iterations, default is 5000. 34 | :param float EPS: epsilon precision parameter, default is 1e-7. 35 | :returns: continued fraction form 36 | :rtype: float 37 | """ 38 | az = 1.0 39 | bm = 1.0 40 | am = 1.0 41 | qab = a + b 42 | qap = a + 1.0 43 | qam = a - 1.0 44 | bz = 1.0 - qab * x / qap 45 | 46 | for i in range(ITMAX + 1): 47 | em = float(i + 1) 48 | tem = em + em 49 | d = em * (b - em) * x / ((qam + tem) * (a + tem)) 50 | ap = az + d * am 51 | bp = bz + d * bm 52 | d = -(a + em) * (qab + em) * x / ((qap + tem) * (a + tem)) 53 | app = ap + d * az 54 | bpp = bp + d * bz 55 | aold = az 56 | am = ap / bpp 57 | bm = bp / bpp 58 | az = app / bpp 59 | bz = 1.0 60 | if abs(az - aold) < EPS * abs(az): 61 | return az 62 | 63 | raise ValueError( 64 | "a={0:f} or b={1:f} too large, or ITMAX={2:d} too small to compute incomplete beta function.".format( 65 | a, b, ITMAX 66 | ) 67 | ) 68 | 69 | 70 | def incompbeta(a: float, b: float, x: float) -> float: 71 | """Evaluation of incomplete beta function. 72 | 73 | Code translated from: Numerical Recipes in C. 74 | 75 | Here a, b > 0 and 0 <= x <= 1. 
76 | This function requires contfractbeta(a,b,x, ITMAX = 200) 77 | 78 | Example kindly taken from blog: 79 | https://malishoaib.wordpress.com/2014/04/15/the-beautiful-beta-functions-in-raw-python/ 80 | 81 | :param float a: a 82 | :param float b: b 83 | :param float x: x 84 | :returns: incomplete beta function 85 | :rtype: float 86 | """ 87 | # special cases 88 | if x == 0: 89 | return 0 90 | elif x == 1: 91 | return 1 92 | # default 93 | lbeta = gammaln(a + b) - gammaln(a) - gammaln(b) + a * np.log(x) + b * np.log(1 - x) 94 | if x < (a + 1) / (a + b + 2): 95 | p = np.exp(lbeta) * contfractbeta(a, b, x) / a 96 | else: 97 | p = 1 - np.exp(lbeta) * contfractbeta(b, a, 1 - x) / b 98 | return p 99 | 100 | 101 | def log_incompbeta(a: float, b: float, x: float) -> Tuple[float, float]: 102 | """Evaluation of logarithm of incomplete beta function 103 | 104 | Logarithm of incomplete beta function is implemented to ensure sufficient precision 105 | for values very close to zero and one. 106 | 107 | Code translated from: Numerical Recipes in C. 108 | 109 | Here a, b > 0 and 0 <= x <= 1. 110 | This function requires contfractbeta(a,b,x, ITMAX = 200) 111 | 112 | Example kindly taken from blog: 113 | https://malishoaib.wordpress.com/2014/04/15/the-beautiful-beta-functions-in-raw-python/ 114 | 115 | :param float a: a 116 | :param float b: b 117 | :param float x: x 118 | :returns: tuple of log(incb) and log(1-incb) 119 | :rtype: tuple 120 | """ 121 | # special cases 122 | if x == 0: 123 | return -np.inf, 0 124 | elif x == 1: 125 | return 0, -np.inf 126 | # default 127 | lbeta = gammaln(a + b) - gammaln(a) - gammaln(b) + a * np.log(x) + b * np.log(1 - x) 128 | 129 | if x < (a + 1) / (a + b + 2): 130 | p = np.exp(lbeta) * contfractbeta(a, b, x) / a 131 | logp = lbeta + np.log(contfractbeta(a, b, x)) - np.log(a) 132 | logq = np.log(1 - p) 133 | else: 134 | p = 1 - np.exp(lbeta) * (contfractbeta(b, a, 1 - x) / b) 135 | logp = np.log(p) 136 | logq = lbeta + np.log(contfractbeta(b, a, 1 - x)) - np.log(b) 137 | return logp, logq 138 | -------------------------------------------------------------------------------- /README.rst: -------------------------------------------------------------------------------- 1 | ========================== 2 | Phi_K Correlation Constant 3 | ========================== 4 | 5 | * Version: 0.12.5. Released: Jul 2025 6 | * Release notes: https://github.com/KaveIO/PhiK/blob/master/CHANGES.rst 7 | * Repository: https://github.com/kaveio/phik 8 | * Documentation: https://phik.readthedocs.io 9 | * Publication: `[offical] `_ `[arxiv pre-print] `_ 10 | 11 | Phi_K is a practical correlation constant that works consistently between categorical, ordinal and interval variables. 12 | It is based on several refinements to Pearson's hypothesis test of independence of two variables. Essentially, the 13 | contingency test statistic of two variables is interpreted as if coming from a rotated bi-variate normal distribution, 14 | where the tilt is interpreted as Phi_K. 15 | 16 | The combined features of Phi_K form an advantage over existing coefficients. First, it works consistently between categorical, ordinal and interval variables. 17 | Second, it captures non-linear dependency. Third, it reverts to the Pearson correlation coefficient in case of a bi-variate normal input distribution. 18 | These are useful features when studying the correlation matrix of variables with mixed types. 19 | 20 | For details on the methodology behind the calculations, please see our publication. 
Emphasis is paid to the proper evaluation of statistical significance of correlations and to the interpretation of variable relationships 21 | in a contingency table, in particular in case of low statistics samples. 22 | The presented algorithms are easy to use and available through this public Python library. 23 | 24 | Example notebooks 25 | ================= 26 | 27 | .. list-table:: 28 | :widths: 60 40 29 | :header-rows: 1 30 | 31 | * - Static link 32 | - Google Colab link 33 | * - `basic tutorial `_ 34 | - `basic on colab `_ 35 | * - `advanced tutorial (detailed configuration) `_ 36 | - `advanced on colab `_ 37 | * - `spark tutorial `_ 38 | - no spark available 39 | 40 | Documentation 41 | ============= 42 | 43 | The entire Phi_K documentation including tutorials can be found at `read-the-docs `_. 44 | See the tutorials for detailed examples on how to run the code with pandas. We also have one example on how 45 | calculate the Phi_K correlation matrix for a spark dataframe. 46 | 47 | Check it out 48 | ============ 49 | 50 | The Phi_K library requires Python >= 3.8 and is pip friendly. To get started, simply do: 51 | 52 | .. code-block:: bash 53 | 54 | $ pip install phik 55 | 56 | or check out the code from out GitHub repository: 57 | 58 | .. code-block:: bash 59 | 60 | $ git clone https://github.com/KaveIO/PhiK.git 61 | $ pip install -e PhiK/ 62 | 63 | where in this example the code is installed in edit mode (option -e). 64 | 65 | You can now use the package in Python with: 66 | 67 | .. code-block:: python 68 | 69 | import phik 70 | 71 | **Congratulations, you are now ready to use the PhiK correlation analyzer library!** 72 | 73 | 74 | Quick run 75 | ========= 76 | 77 | As a quick example, you can do: 78 | 79 | .. code-block:: python 80 | 81 | import pandas as pd 82 | import phik 83 | from phik import resources, report 84 | 85 | # open fake car insurance data 86 | df = pd.read_csv( resources.fixture('fake_insurance_data.csv.gz') ) 87 | df.head() 88 | 89 | # Pearson's correlation matrix between numeric variables (pandas functionality) 90 | df.corr() 91 | 92 | # get the phi_k correlation matrix between all variables 93 | df.phik_matrix() 94 | 95 | # get global correlations based on phi_k correlation matrix 96 | df.global_phik() 97 | 98 | # get the significance matrix (expressed as one-sided Z) 99 | # of the hypothesis test of each variable-pair dependency 100 | df.significance_matrix() 101 | 102 | # contingency table of two columns 103 | cols = ['mileage','car_size'] 104 | df[cols].hist2d() 105 | 106 | # normalized residuals of contingency test applied to cols 107 | df[cols].outlier_significance_matrix() 108 | 109 | # show the normalized residuals of each variable-pair 110 | df.outlier_significance_matrices() 111 | 112 | # generate a phik correlation report and save as test.pdf 113 | report.correlation_report(df, pdf_file_name='test.pdf') 114 | 115 | 116 | For all available examples, please see the `tutorials `_ at read-the-docs. 117 | 118 | 119 | Contact and support 120 | =================== 121 | 122 | * Issues and Ideas: https://github.com/kaveio/phik/issues 123 | 124 | Please note that support is (only) provided on a best-effort basis. 
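Finally, as a quick sanity check of the claim in the introduction that Phi_K reverts to the Pearson
correlation coefficient for a bi-variate normal input distribution, the two can be compared on
simulated data. The snippet below is only an illustrative sketch (column names, sample size and the
random seed are arbitrary choices):

.. code-block:: python

    import numpy as np
    import pandas as pd
    import phik  # noqa: F401  -- importing phik registers the DataFrame methods used below

    # draw a bi-variate normal sample with true correlation 0.6
    rng = np.random.default_rng(42)
    xy = rng.multivariate_normal(mean=[0, 0], cov=[[1.0, 0.6], [0.6, 1.0]], size=10000)
    df = pd.DataFrame(xy, columns=["x", "y"])

    # for this input the Pearson matrix (pandas) and the Phi_K matrix should show
    # similar off-diagonal values (~0.6), up to binning and sampling effects
    print(df.corr())
    print(df.phik_matrix())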
125 | -------------------------------------------------------------------------------- /phik/data_quality.py: -------------------------------------------------------------------------------- 1 | """Project: PhiK - correlation analyzer library 2 | 3 | Created: 2018/12/28 4 | 5 | Description: 6 | A set of functions to check for data quality issues in input data. 7 | 8 | Authors: 9 | KPMG Advanced Analytics & Big Data team, Amstelveen, The Netherlands 10 | 11 | Redistribution and use in source and binary forms, with or without 12 | modification, are permitted according to the terms listed in the file 13 | LICENSE. 14 | """ 15 | 16 | import warnings 17 | import copy 18 | from typing import Tuple 19 | 20 | import pandas as pd 21 | import numpy as np 22 | 23 | 24 | def dq_check_nunique_values( 25 | df: pd.DataFrame, interval_cols: list, dropna: bool = True 26 | ) -> Tuple[pd.DataFrame, list]: 27 | """ 28 | Basic data quality checks per column in a DataFrame. 29 | 30 | The following checks are done: 31 | 32 | 1. For all non-interval variables, if the number of unique values per variable is larger than 100 a warning is printed. 33 | When the number of unique values is large, the variable is likely to be an interval variable. Calculation of phik 34 | will be slow(ish) for pairs of variables where one (or two) have many different values (i.e. many bins). 35 | 36 | 2. For all interval variables, the number of unique values must be at least two. If the number of unique values is 37 | zero (i.e. all NaN) the column is removed. If the number of unique values is one, it is not possible to 38 | automatically create a binning for this variable (as min and max are the same). The variable is therefore dropped, 39 | irrespective of whether dropna is True or False. 40 | 41 | 3. For all non-interval variables, the number of unique values must be at least either 42 | a) 1 if dropna=False (NaN is now also considered a valid category), or 43 | b) 2 if dropna=True 44 | 45 | The function returns a DataFrame where all columns with invalid data are removed. Also the list of interval_cols 46 | is updated and returned. 47 | 48 | :param pd.DataFrame df: input data 49 | :param list interval_cols: column names of columns with interval variables. 50 | :param bool dropna: remove NaN values when True 51 | :returns: cleaned data, updated list of interval columns 52 | """ 53 | # check for existing columns 54 | interval_cols = [col for col in interval_cols if col in df.columns] 55 | 56 | # check non-interval variable for number of unique values 57 | for col in sorted(list(set(df.columns) - set(interval_cols))): 58 | if df[col].nunique() > 1000: 59 | warnings.warn( 60 | "The number of unique values of variable {0:s} is large: {1:d}. Are you sure this is " 61 | "not an interval variable? Analysis for pairs of variables including {0:s} can be slow.".format( 62 | col, df[col].nunique() 63 | ) 64 | ) 65 | 66 | drop_cols = [] 67 | 68 | # check for interval values whether there are at least two unique values (otherwise I cannot bin automatically) 69 | for col in interval_cols: 70 | if df[col].nunique() < 2: 71 | drop_cols.append(col) 72 | warnings.warn( 73 | "Not enough unique value for variable {0:s} for analysis {1:d}. 
Dropping this column".format( 74 | col, df[col].nunique() 75 | ) 76 | ) 77 | 78 | # check non-interval values whether there are at least two different values OR 1 value and NaN if dropna==False 79 | for col in sorted(list(set(df.columns) - set(interval_cols))): 80 | if df[col].nunique() == 0 or (df[col].nunique() == 1 and dropna): 81 | drop_cols.append(col) 82 | warnings.warn( 83 | "Not enough unique value for variable {0:s} for analysis {1:d}. Dropping this column".format( 84 | col, df[col].nunique() 85 | ) 86 | ) 87 | 88 | df_clean = df.copy() 89 | interval_cols_clean = copy.copy(interval_cols) 90 | if len(drop_cols) > 0: 91 | # preserves column order: https://github.com/KaveIO/PhiK/issues/1 92 | df_clean.drop(columns=drop_cols, inplace=True) 93 | interval_cols_clean = [col for col in interval_cols if col not in drop_cols] 94 | 95 | return df_clean, interval_cols_clean 96 | 97 | 98 | def dq_check_hist2d(hist2d: np.ndarray) -> bool: 99 | """Basic data quality checks for a contingency table 100 | 101 | The Following checks are done: 102 | 103 | 1. There must be at least two bins in both the x and y direction. 104 | 105 | 2. If the number of bins in the x and/or y direction is larger than 100 a warning is printed. 106 | 107 | :param hist2d: contingency table 108 | :return: bool passed_check 109 | """ 110 | 111 | if 0 in hist2d.shape or 1 in hist2d.shape: 112 | warnings.warn( 113 | "Too few unique values for variable x ({0:d}) or y ({1:d})".format( 114 | hist2d.shape[0], hist2d.shape[1] 115 | ) 116 | ) 117 | return False 118 | if hist2d.shape[0] > 1000: 119 | warnings.warn( 120 | "The number of unique values of variable x is large: {0:d}. " 121 | "Are you sure this is not an interval variable? Analysis might be slow.".format( 122 | hist2d.shape[0] 123 | ) 124 | ) 125 | if hist2d.shape[1] > 1000: 126 | warnings.warn( 127 | "The number of unique values of variable y is large: {0:d}. " 128 | "Are you sure this is not an interval variable? Analysis might be slow.".format( 129 | hist2d.shape[0] 130 | ) 131 | ) 132 | 133 | return True 134 | -------------------------------------------------------------------------------- /docs/source/conf.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # 3 | # PhiK documentation build configuration file for sphinx. 4 | # 5 | # 6 | 7 | import os 8 | #from unittest.mock import MagicMock 9 | 10 | import phik 11 | 12 | 13 | # Classes that use non-python modules are not always available in the 14 | # RTD environment. By mocking them we can still import these classes 15 | # in the code and RTD can subsequently go through the code and get 16 | # the docstrings. 17 | 18 | #class Mock(MagicMock): 19 | # @classmethod 20 | # def __getattr__(cls, name): 21 | # return MagicMock() 22 | 23 | # If extensions (or modules to document with autodoc) are in another directory, 24 | # add these directories to sys.path here. If the directory is relative to the 25 | # documentation root, use os.path.abspath to make it absolute, like shown here. 26 | # sys.path.insert(0, os.path.abspath('')) 27 | 28 | # -- General configuration ------------------------------------------------ 29 | 30 | # If your documentation needs a minimal Sphinx version, state it here. 31 | # needs_sphinx = '1.0' 32 | 33 | # Add any Sphinx extension module names here, as strings. They can be 34 | # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom 35 | # ones. 
36 | extensions = [ 37 | 'sphinx.ext.autodoc', 38 | 'sphinx.ext.mathjax', 39 | 'sphinx.ext.ifconfig', 40 | ] 41 | 42 | # Add any paths that contain templates here, relative to this directory. 43 | templates_path = ['_templates'] 44 | 45 | # The suffix(es) of source filenames. 46 | # You can specify multiple suffix as a list of string: 47 | # source_suffix = ['.rst', '.md'] 48 | source_suffix = '.rst' 49 | 50 | # The encoding of source files. 51 | # source_encoding = 'utf-8-sig' 52 | 53 | # The master toctree document. 54 | master_doc = 'index' 55 | 56 | # General information about the project. 57 | project = 'Phi_K correlation library' 58 | copyright = '2018, KPMG Advisory N.V.' 59 | author = 'KPMG Advanced Analytics & Big Data team' 60 | version = phik.__version__ 61 | 62 | # The language for content autogenerated by Sphinx. Refer to documentation 63 | # for a list of supported languages. 64 | # 65 | # This is also used if you do content translation via gettext catalogs. 66 | # Usually you set "language" from the command line for these cases. 67 | language = 'en' 68 | 69 | # List of patterns, relative to source directory, that match files and 70 | # directories to ignore when looking for source files. 71 | exclude_patterns = ['*test*', 'phik.tutorials.*'] 72 | 73 | # The name of the Pygments (syntax highlighting) style to use. 74 | pygments_style = 'sphinx' 75 | 76 | # If true, `todo` and `todoList` produce output, else they produce nothing. 77 | todo_include_todos = False 78 | 79 | # -- Options for HTML output ---------------------------------------------- 80 | 81 | # on_rtd is whether we are on readthedocs.org, this line of code grabbed from docs.readthedocs.org 82 | on_rtd = os.environ.get('READTHEDOCS', None) == 'True' 83 | 84 | if not on_rtd: 85 | import sphinx_rtd_theme 86 | 87 | html_theme = "sphinx_rtd_theme" 88 | html_theme_path = [sphinx_rtd_theme.get_html_theme_path()] 89 | # otherwise, readthedocs.org uses their theme by default, so no need to specify it 90 | 91 | # Add any paths that contain custom static files (such as style sheets) here, 92 | # relative to this directory. They are copied after the builtin static files, 93 | # so a file named "default.css" will overwrite the builtin "default.css". 94 | html_static_path = ['_static'] 95 | 96 | # If false, no index is generated. 97 | html_use_index = True 98 | 99 | # If true, the index is split into individual pages for each letter. 100 | # html_split_index = False 101 | 102 | # If true, links to the reST sources are added to the pages. 103 | html_show_sourcelink = True 104 | 105 | # If true, "Created using Sphinx" is shown in the HTML footer. Default is True. 106 | # html_show_sphinx = True 107 | 108 | # If true, "(C) Copyright ..." is shown in the HTML footer. Default is True. 109 | html_show_copyright = True 110 | 111 | # Language to be used for generating the HTML full-text search index. 112 | # Sphinx supports the following languages: 113 | # 'da', 'de', 'en', 'es', 'fi', 'fr', 'hu', 'it', 'ja' 114 | # 'nl', 'no', 'pt', 'ro', 'ru', 'sv', 'tr' 115 | html_search_language = 'en' 116 | 117 | # Output file base name for HTML help builder. 118 | htmlhelp_basename = 'PhiKdoc' 119 | 120 | # -- Options for LaTeX output --------------------------------------------- 121 | 122 | latex_elements = { 123 | # The paper size ('letterpaper' or 'a4paper'). 124 | # 'papersize': 'letterpaper', 125 | 126 | # The font size ('10pt', '11pt' or '12pt'). 127 | # 'pointsize': '10pt', 128 | 129 | # Additional stuff for the LaTeX preamble. 
130 | # 'preamble': '', 131 | 132 | # Latex figure (float) alignment 133 | # 'figure_align': 'htbp', 134 | } 135 | 136 | # Grouping the document tree into LaTeX files. List of tuples 137 | # (source start file, target name, title, 138 | # author, documentclass [howto, manual, or own class]). 139 | latex_documents = [ 140 | (master_doc, 'PhiK.tex', 'PhiK Documentation', 141 | 'KPMG Advanced Analytics & Big Data team', 'manual'), 142 | ] 143 | 144 | # -- Options for manual page output --------------------------------------- 145 | 146 | # One entry per manual page. List of tuples 147 | # (source start file, name, description, authors, manual section). 148 | man_pages = [ 149 | (master_doc, 'phik', 'PhiK Documentation', 150 | [author], 1) 151 | ] 152 | 153 | # -- Options for Texinfo output ------------------------------------------- 154 | 155 | # Grouping the document tree into Texinfo files. List of tuples 156 | # (source start file, target name, title, author, 157 | # dir menu entry, description, category) 158 | texinfo_documents = [ 159 | (master_doc, 'PhiK', 'PhiK Documentation', 160 | author, 'PhiK', 'One line description of project.', 161 | 'Miscellaneous'), 162 | ] 163 | 164 | 165 | def skip(app, what, name, obj, skip, options): 166 | if name == "__init__": 167 | return False 168 | return skip 169 | 170 | 171 | def setup(app): 172 | app.connect("autodoc-skip-member", skip) 173 | -------------------------------------------------------------------------------- /phik/notebooks/phik_tutorial_spark.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Phi_K spark tutorial\n", 8 | "\n", 9 | "This notebook shows you how to obtain the Phi_K correlation matrix for a spark dataframe.\n", 10 | "Calculating the Phi_K matrix consists of two steps:\n", 11 | "\n", 12 | "- Obtain the 2d contingency tables for all variable pairs. To make these we use the [`histogrammar` package](https://github.com/histogrammar/histogrammar-python).\n", 13 | "- Calculate the Phi_K value for each variable pair from its contingency table.\n", 14 | "\n", 15 | "Make sure you install the histogrammar package to make the 2d histograms, that are then used to calculate phik." 
16 | ] 17 | }, 18 | { 19 | "cell_type": "code", 20 | "execution_count": null, 21 | "metadata": {}, 22 | "outputs": [], 23 | "source": [ 24 | "%%capture\n", 25 | "# install histogrammar (if not installed yet)\n", 26 | "import sys\n", 27 | "\n", 28 | "!\"{sys.executable}\" -m pip install histogrammar" 29 | ] 30 | }, 31 | { 32 | "cell_type": "code", 33 | "execution_count": null, 34 | "metadata": {}, 35 | "outputs": [], 36 | "source": [ 37 | "import itertools\n", 38 | "\n", 39 | "import pandas as pd\n", 40 | "import histogrammar as hg\n", 41 | "from histogrammar.plot.hist_numpy import get_2dgrid\n", 42 | "\n", 43 | "import phik\n", 44 | "from phik import resources\n", 45 | "from phik.phik import spark_phik_matrix_from_hist2d_dict" 46 | ] 47 | }, 48 | { 49 | "cell_type": "markdown", 50 | "metadata": {}, 51 | "source": [ 52 | "# histogramming is done using the histogrammar library" 53 | ] 54 | }, 55 | { 56 | "cell_type": "code", 57 | "execution_count": null, 58 | "metadata": {}, 59 | "outputs": [], 60 | "source": [ 61 | "from pyspark.sql import SparkSession\n", 62 | "from pyspark import __version__ as pyspark_version\n", 63 | "\n", 64 | "scala = '2.12' if int(pyspark_version[0]) >= 3 else '2.11'\n", 65 | "hist_jar = f'io.github.histogrammar:histogrammar_{scala}:1.0.20'\n", 66 | "hist_spark_jar = f'io.github.histogrammar:histogrammar-sparksql_{scala}:1.0.20'\n", 67 | "\n", 68 | "spark = SparkSession.builder.config(\n", 69 | " \"spark.jars.packages\", f'{hist_spark_jar},{hist_jar}'\n", 70 | ").getOrCreate()\n", 71 | "\n", 72 | "spark = SparkSession.builder.config(\n", 73 | " \"spark.jars.packages\", f'{hist_spark_jar},{hist_jar}'\n", 74 | ").getOrCreate()\n", 75 | "\n", 76 | "sc = spark.sparkContext" 77 | ] 78 | }, 79 | { 80 | "cell_type": "markdown", 81 | "metadata": {}, 82 | "source": [ 83 | "# Load data\n", 84 | "\n", 85 | "A simulated dataset is part of the phik-package. The dataset concerns fake car insurance data. 
Load the dataset here:" 86 | ] 87 | }, 88 | { 89 | "cell_type": "code", 90 | "execution_count": null, 91 | "metadata": {}, 92 | "outputs": [], 93 | "source": [ 94 | "data = pd.read_csv( resources.fixture('fake_insurance_data.csv.gz') )\n", 95 | "sdf = spark.createDataFrame(data)\n", 96 | "sdf.show()" 97 | ] 98 | }, 99 | { 100 | "cell_type": "code", 101 | "execution_count": null, 102 | "metadata": {}, 103 | "outputs": [], 104 | "source": [ 105 | "combis = itertools.combinations_with_replacement(sdf.columns, 2)\n", 106 | "combis = [list(c) for c in combis]" 107 | ] 108 | }, 109 | { 110 | "cell_type": "code", 111 | "execution_count": null, 112 | "metadata": {}, 113 | "outputs": [], 114 | "source": [ 115 | "print(combis)" 116 | ] 117 | }, 118 | { 119 | "cell_type": "markdown", 120 | "metadata": {}, 121 | "source": [ 122 | "# step 1: create histograms (this runs spark histogrammar in the background)\n" 123 | ] 124 | }, 125 | { 126 | "cell_type": "code", 127 | "execution_count": null, 128 | "metadata": {}, 129 | "outputs": [], 130 | "source": [ 131 | "# see the doc-string of hg_make_histograms() for binning options.\n", 132 | "hists = sdf.hg_make_histograms(combis)" 133 | ] 134 | }, 135 | { 136 | "cell_type": "code", 137 | "execution_count": null, 138 | "metadata": {}, 139 | "outputs": [], 140 | "source": [ 141 | "# collect the numpy contingency tables into a dict\n", 142 | "grids = {k:(get_2dgrid(h)[2]) for k,h in hists.items()}\n", 143 | "print(grids)" 144 | ] 145 | }, 146 | { 147 | "cell_type": "code", 148 | "execution_count": null, 149 | "metadata": {}, 150 | "outputs": [], 151 | "source": [ 152 | "# we can store the histograms if we want to\n", 153 | "if False:\n", 154 | " import pickle\n", 155 | "\n", 156 | " with open('grids.pkl', 'wb') as outfile:\n", 157 | " pickle.dump(grids, outfile)\n", 158 | "\n", 159 | " with open('grids.pkl', 'rb') as handle:\n", 160 | " grids = pickle.load(handle)" 161 | ] 162 | }, 163 | { 164 | "cell_type": "markdown", 165 | "metadata": {}, 166 | "source": [ 167 | "# step 2: calculate phik matrix (runs rdd parallellization over all 2d histograms)" 168 | ] 169 | }, 170 | { 171 | "cell_type": "code", 172 | "execution_count": null, 173 | "metadata": {}, 174 | "outputs": [], 175 | "source": [ 176 | "phik_matrix = spark_phik_matrix_from_hist2d_dict(sc, grids)" 177 | ] 178 | }, 179 | { 180 | "cell_type": "code", 181 | "execution_count": null, 182 | "metadata": {}, 183 | "outputs": [], 184 | "source": [ 185 | "phik_matrix" 186 | ] 187 | } 188 | ], 189 | "metadata": { 190 | "kernelspec": { 191 | "display_name": "Python 3", 192 | "language": "python", 193 | "name": "python3" 194 | }, 195 | "language_info": { 196 | "codemirror_mode": { 197 | "name": "ipython", 198 | "version": 3 199 | }, 200 | "file_extension": ".py", 201 | "mimetype": "text/x-python", 202 | "name": "python", 203 | "nbconvert_exporter": "python", 204 | "pygments_lexer": "ipython3", 205 | "version": "3.6.8" 206 | } 207 | }, 208 | "nbformat": 4, 209 | "nbformat_minor": 2 210 | } 211 | -------------------------------------------------------------------------------- /phik/statistics.py: -------------------------------------------------------------------------------- 1 | """Project: PhiK - correlation coefficient package 2 | 3 | Created: 2018/09/05 4 | 5 | Description: 6 | Statistics helper functions, for the calculation of phik and significance 7 | of a contingency table. 
8 | 9 | Authors: 10 | KPMG Advanced Analytics & Big Data team, Amstelveen, The Netherlands 11 | 12 | Redistribution and use in source and binary forms, with or without 13 | modification, are permitted according to the terms listed in the file 14 | LICENSE. 15 | """ 16 | from typing import Union 17 | 18 | import numpy as np 19 | from scipy import stats 20 | 21 | 22 | def get_dependent_frequency_estimates(vals: np.ndarray) -> np.ndarray: 23 | """ 24 | Calculation of dependent expected frequencies. 25 | 26 | Calculation is based on the marginal sums of the table, i.e. dependent frequency estimates. 27 | :param vals: The contingency table. The table contains the observed number of occurrences in each category 28 | 29 | :returns exp: expected frequencies 30 | """ 31 | 32 | # use existing scipy functionality 33 | return stats.contingency.expected_freq(vals) 34 | 35 | 36 | def get_chi2_using_dependent_frequency_estimates( 37 | vals: np.ndarray, lambda_: str = "log-likelihood" 38 | ) -> float: 39 | """ 40 | Chi-square test of independence of variables in a contingency table. 41 | 42 | The expected frequencies are based on the 43 | marginal sums of the table, i.e. dependent frequency estimates. 44 | 45 | :param vals: The contingency table. The table contains the observed number of occurrences in each category 46 | :returns test_statistic: the test statistic value 47 | """ 48 | 49 | values = vals[:] 50 | 51 | # remove rows with only zeros, scipy doesn't like them. 52 | values = values[~np.all(values == 0, axis=1)] 53 | # remove columns with only zeros, scipy doesn't like them. 54 | values = values.T[~np.all(values.T == 0, axis=1)].T 55 | 56 | # use existing scipy functionality 57 | test_statistic, _, _, _ = stats.chi2_contingency(values, lambda_=lambda_) 58 | 59 | return test_statistic 60 | 61 | 62 | def get_pearson_chi_square( 63 | observed: np.ndarray, expected: np.ndarray = None, normalize: bool = True 64 | ) -> float: 65 | """Calculate pearson chi square between observed and expected 2d contingency matrix 66 | 67 | :param observed: The observed contingency table. The table contains the observed number of occurrences in each cell. 68 | :param expected: The expected contingency table. The table contains the expected number of occurrences in each cell. 69 | :param bool normalize: normalize expected frequencies, default is True. 70 | :return: the pearson chi2 value 71 | """ 72 | observed = np.asarray(observed) 73 | if np.any(observed < 0): 74 | raise ValueError("All values in `observed` must be non-negative.") 75 | if observed.size == 0: 76 | raise ValueError("No data; `observed` has size 0.") 77 | 78 | if expected is None: 79 | expected = get_dependent_frequency_estimates(observed) 80 | expected = np.asarray(expected) 81 | 82 | # important to ensure that observed and expected have same normalization 83 | if normalize: 84 | expected = expected * (np.sum(observed) / np.sum(expected)) 85 | 86 | terms = np.divide( 87 | (observed.astype(np.float64) - expected) ** 2, 88 | expected, 89 | out=np.zeros_like(expected), 90 | where=expected != 0, 91 | ) 92 | return np.sum(terms) 93 | 94 | 95 | def estimate_ndof(chi2values: Union[list, np.ndarray]) -> float: 96 | """ 97 | Estimation of the effective number of degrees of freedom. 98 | 99 | A good approximation of endof is the average value. Alternatively 100 | a fit to the chi2 distribution can be make. Both values are returned. 
101 | 102 | :param list chi2values: list of chi2 values 103 | :returns: endof0, endof 104 | """ 105 | 106 | return np.mean(chi2values) 107 | 108 | 109 | def estimate_simple_ndof(observed: np.ndarray) -> int: 110 | """ 111 | Simple estimation of the effective number of degrees of freedom. 112 | 113 | This equals the nominal calculation for ndof minus the number of empty bins in the 114 | expected contingency table. 115 | 116 | :param observed: numpy array of observed cell counts 117 | :returns: endof 118 | """ 119 | 120 | # use existing scipy functionality 121 | expected = stats.contingency.expected_freq(observed) 122 | endof = ( 123 | expected.size 124 | - np.sum(expected.shape) 125 | + expected.ndim 126 | - 1 127 | - (expected == 0).sum() 128 | ) 129 | # require minimum number of degrees of freedom 130 | if endof < 0: 131 | endof = 0 132 | return endof 133 | 134 | 135 | def theoretical_ndof(observed: np.ndarray) -> int: 136 | """ 137 | Simple estimation of the effective number of degrees of freedom. 138 | 139 | This equals the nominal calculation for ndof minus the number of empty bins in the 140 | expected contingency table. 141 | 142 | :param observed: numpy array of observed cell counts 143 | :returns: theoretical ndof 144 | """ 145 | 146 | return observed.size - np.sum(observed.shape) + observed.ndim - 1 147 | 148 | 149 | def z_from_logp(logp: float, flip_sign: bool = False) -> float: 150 | """ 151 | Convert logarithm of p-value into one-sided Z-value 152 | 153 | :param float logp: logarithm of p-value, should not be greater than 0 154 | :param bool flip_sign: flip sign of Z-value, e.g. use for input log(1-p). Default is false. 155 | :returns: statistical significance Z-value 156 | :rtype: float 157 | """ 158 | 159 | # pvalue == 0, Z = infinity 160 | if logp == -np.inf: 161 | return np.inf if not flip_sign else -np.inf 162 | 163 | p_value = np.exp(logp) 164 | 165 | # scenario where p-value is numerically too small to evaluate Z 166 | if p_value == 0: 167 | # kicks in here when Z > 37 168 | # approach valid when ~ Z > 1.5. 169 | u = -2.0 * np.log(2 * np.pi) - 2.0 * logp 170 | z_value = np.sqrt(u - np.log(u)) 171 | else: 172 | z_value = -stats.norm.ppf(p_value) 173 | 174 | if flip_sign: 175 | z_value *= -1.0 176 | 177 | return z_value 178 | -------------------------------------------------------------------------------- /phik/simulation.py: -------------------------------------------------------------------------------- 1 | """Project: PhiK - correlation analyzer library 2 | 3 | Created: 2018/09/05 4 | 5 | Description: 6 | Helper functions to simulate 2D datasets 7 | 8 | Authors: 9 | KPMG Advanced Analytics & Big Data team, Amstelveen, The Netherlands 10 | 11 | Redistribution and use in source and binary forms, with or without 12 | modification, are permitted according to the terms listed in the file 13 | LICENSE. 14 | """ 15 | 16 | import numpy as np 17 | from joblib import Parallel, delayed 18 | 19 | from .statistics import get_dependent_frequency_estimates 20 | from .statistics import get_chi2_using_dependent_frequency_estimates 21 | from phik.simcore import CPP_SUPPORT, _sim_2d_data_patefield 22 | 23 | 24 | NUMPY_INT_MAX = np.iinfo(np.int32).max - 1 25 | 26 | 27 | def sim_2d_data(hist:np.ndarray, ndata:int=0) -> np.ndarray: 28 | """ 29 | Simulate a 2 dimensional dataset given a 2 dimensional pdf 30 | 31 | :param array-like hist: contingency table, which contains the observed number of occurrences in each category. 32 | This table is used as probability density function. 
33 | :param int ndata: number of simulations 34 | :return: simulated data 35 | """ 36 | 37 | if ndata <= 0: 38 | ndata = int(np.rint(hist.sum())) 39 | if ndata <= 0: 40 | raise ValueError('ndata (or hist.sum()) has to be positive') 41 | 42 | # scale and ravel 43 | hc = hist[:] / hist.sum() 44 | hcr = hc.ravel() 45 | 46 | hout = np.random.multinomial(n=ndata, pvals=hcr) 47 | hout2d = np.reshape(hout, hc.shape) 48 | return hout2d 49 | 50 | 51 | def sim_2d_data_patefield(data: np.ndarray, seed : int = None) -> np.ndarray: 52 | """ 53 | Simulate a two dimensional dataset with fixed row and column totals. 54 | 55 | Simulation algorithm by Patefield: 56 | W. M. Patefield, Applied Statistics 30, 91 (1981) 57 | Python implementation inspired by (C version): 58 | https://people.sc.fsu.edu/~jburkardt/c_src/asa159/asa159.html 59 | 60 | :param data: contingency table, which contains the observed number of occurrences in each category.\ 61 | :param seed: optional seed for the simulation, primarily for testing purposes.\ 62 | This table is used as probability density function. 63 | :return: simulated data 64 | """ 65 | 66 | if not CPP_SUPPORT: 67 | raise NotImplementedError( 68 | 'Patefield requires a compiled extension that was not found.' 69 | ) 70 | 71 | # number of rows and columns 72 | nrows, ncols = data.shape 73 | 74 | # totals per row and column 75 | # NOTE we assume that sum will fit in a 32 bit int 76 | nrowt = np.rint(data.sum(axis=1)).astype(np.int32) 77 | ncolt = np.rint(data.sum(axis=0)).astype(np.int32) 78 | 79 | # set seed if it is None 80 | seed = seed or np.random.randint(0, NUMPY_INT_MAX) 81 | 82 | # allocate memory that will be set by _sim_2d_data_patefield 83 | matrix = np.empty(nrows * ncols, dtype=np.int32) 84 | 85 | # simulate the data, returned through matrix inplace modification 86 | _sim_2d_data_patefield(nrows, ncols, nrowt, ncolt, seed, matrix) 87 | return matrix.reshape(ncols, nrows).T 88 | 89 | 90 | def sim_2d_product_multinominal(data:np.ndarray, axis: int) -> np.ndarray: 91 | """ 92 | Simulate 2 dimensional data with either row or column totals fixed. 93 | 94 | :param data: contingency table, which contains the observed number of occurrences in each category.\ 95 | This table is used as probability density function. 96 | :param axis: fix row totals (0) or column totals (1). 97 | :return: simulated data 98 | """ 99 | 100 | if axis == 1: 101 | return np.array([list(sim_2d_data(data[i])) for i in range(data.shape[0])]) 102 | elif axis == 0: 103 | return np.array([list(sim_2d_data(data.T[i])) for i in range(data.shape[1])]).T 104 | else: 105 | raise NotImplementedError("Axis should be 0 (row) or 1 (column).") 106 | 107 | 108 | def sim_data(data:np.ndarray, method:str='multinominal') -> np.ndarray: 109 | """ 110 | Simulate a 2 dimensional dataset given a 2 dimensional pdf 111 | 112 | Several simulation methods are provided: 113 | 114 | - multinominal: Only the total number of records is fixed. 115 | - row_product_multinominal: The row totals fixed in the sampling. 116 | - col_product_multinominal: The column totals fixed in the sampling. 117 | - hypergeometric: Both the row or column totals are fixed in the sampling. Note that this type of sampling is\ 118 | only available when row and column totals are integers. 119 | 120 | :param data: contingency table 121 | :param str method: sampling method. 
Options: [multinominal, hypergeometric, row_product_multinominal,\ 122 | col_product_multinominal] 123 | :return: simulated data 124 | """ 125 | 126 | if method == 'multinominal': 127 | return sim_2d_data(data) 128 | elif method == 'hypergeometric': 129 | return sim_2d_data_patefield(data) 130 | elif method == 'row_product_multinominal': 131 | return sim_2d_product_multinominal(data, 0) 132 | elif method == 'col_product_multinominal': 133 | return sim_2d_product_multinominal(data, 1) 134 | else: 135 | raise NotImplementedError('selected method not recognized.') 136 | 137 | 138 | def sim_chi2_distribution(values: np.ndarray, nsim:int=1000, lambda_:str='log-likelihood', 139 | simulation_method:str='multinominal', alt_hypothesis:bool=False, njobs:int=-1) -> list: 140 | """ 141 | Simulate 2D data and calculate the chi-square statistic for each simulated dataset. 142 | 143 | :param values: The contingency table. The table contains the observed number of occurrences in each category 144 | :param int nsim: number of simulations (optional, default=1000) 145 | :param str simulation_method: sampling method. Options: [multinominal, hypergeometric, row_product_multinominal, 146 | col_product_multinominal] 147 | :param str lambda_: test statistic. Available options are [pearson, log-likelihood]. 148 | :param bool alt_hypothesis: if True, simulate values directly, and not its dependent frequency estimates. 149 | :param int njobs: number of parallel jobs used for simulation. default is -1. 1 uses no parallel jobs. 150 | :returns chi2s: list of chi2 values for each simulated dataset 151 | """ 152 | exp_dep = get_dependent_frequency_estimates(values) if not alt_hypothesis else values 153 | 154 | if njobs == 1: 155 | chi2s = [_simulate_and_fit(exp_dep, simulation_method, lambda_) for _ in range(nsim)] 156 | else: 157 | chi2s = Parallel(n_jobs=njobs)(delayed(_simulate_and_fit)(exp_dep, simulation_method, lambda_) 158 | for _ in range(nsim)) 159 | 160 | return chi2s 161 | 162 | 163 | def _simulate_and_fit(exp_dep: np.ndarray, simulation_method: str='multinominal', 164 | lambda_:str='log-likelihood') -> float: 165 | """split off simulate function to allow for parallellization""" 166 | simdata = sim_data(exp_dep, method=simulation_method) 167 | simchi2 = get_chi2_using_dependent_frequency_estimates(simdata, lambda_) 168 | return simchi2 169 | -------------------------------------------------------------------------------- /docs/Makefile: -------------------------------------------------------------------------------- 1 | # Makefile for Sphinx documentation 2 | # 3 | 4 | # You can set these variables from the command line. 5 | SPHINXOPTS = 6 | SPHINXBUILD = sphinx-build 7 | PAPER = 8 | BUILDDIR = build 9 | 10 | # User-friendly check for sphinx-build 11 | ifeq ($(shell which $(SPHINXBUILD) >/dev/null 2>&1; echo $$?), 1) 12 | $(error The '$(SPHINXBUILD)' command was not found. Make sure you have Sphinx installed, then set the SPHINXBUILD environment variable to point to the full path of the '$(SPHINXBUILD)' executable. Alternatively you can add the directory with the executable to your PATH. If you don't have Sphinx installed, grab it from http://sphinx-doc.org/) 13 | endif 14 | 15 | # Internal variables. 
16 | PAPEROPT_a4 = -D latex_paper_size=a4 17 | PAPEROPT_letter = -D latex_paper_size=letter 18 | ALLSPHINXOPTS = -d $(BUILDDIR)/doctrees $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) source 19 | # the i18n builder cannot share the environment and doctrees with the others 20 | I18NSPHINXOPTS = $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) source 21 | 22 | .PHONY: help clean html dirhtml singlehtml pickle json htmlhelp qthelp devhelp epub latex latexpdf text man changes linkcheck doctest coverage gettext 23 | 24 | help: 25 | @echo "Please use \`make ' where is one of" 26 | @echo " html to make standalone HTML files" 27 | @echo " dirhtml to make HTML files named index.html in directories" 28 | @echo " singlehtml to make a single large HTML file" 29 | @echo " pickle to make pickle files" 30 | @echo " json to make JSON files" 31 | @echo " htmlhelp to make HTML files and a HTML help project" 32 | @echo " qthelp to make HTML files and a qthelp project" 33 | @echo " applehelp to make an Apple Help Book" 34 | @echo " devhelp to make HTML files and a Devhelp project" 35 | @echo " epub to make an epub" 36 | @echo " latex to make LaTeX files, you can set PAPER=a4 or PAPER=letter" 37 | @echo " latexpdf to make LaTeX files and run them through pdflatex" 38 | @echo " latexpdfja to make LaTeX files and run them through platex/dvipdfmx" 39 | @echo " text to make text files" 40 | @echo " man to make manual pages" 41 | @echo " texinfo to make Texinfo files" 42 | @echo " info to make Texinfo files and run them through makeinfo" 43 | @echo " gettext to make PO message catalogs" 44 | @echo " changes to make an overview of all changed/added/deprecated items" 45 | @echo " xml to make Docutils-native XML files" 46 | @echo " pseudoxml to make pseudoxml-XML files for display purposes" 47 | @echo " linkcheck to check all external links for integrity" 48 | @echo " doctest to run all doctests embedded in the documentation (if enabled)" 49 | @echo " coverage to run coverage check of the documentation (if enabled)" 50 | 51 | clean: 52 | rm -rf $(BUILDDIR)/* 53 | 54 | html: 55 | $(SPHINXBUILD) -b html $(ALLSPHINXOPTS) $(BUILDDIR)/html 56 | @echo 57 | @echo "Build finished. The HTML pages are in $(BUILDDIR)/html." 58 | 59 | dirhtml: 60 | $(SPHINXBUILD) -b dirhtml $(ALLSPHINXOPTS) $(BUILDDIR)/dirhtml 61 | @echo 62 | @echo "Build finished. The HTML pages are in $(BUILDDIR)/dirhtml." 63 | 64 | singlehtml: 65 | $(SPHINXBUILD) -b singlehtml $(ALLSPHINXOPTS) $(BUILDDIR)/singlehtml 66 | @echo 67 | @echo "Build finished. The HTML page is in $(BUILDDIR)/singlehtml." 68 | 69 | pickle: 70 | $(SPHINXBUILD) -b pickle $(ALLSPHINXOPTS) $(BUILDDIR)/pickle 71 | @echo 72 | @echo "Build finished; now you can process the pickle files." 73 | 74 | json: 75 | $(SPHINXBUILD) -b json $(ALLSPHINXOPTS) $(BUILDDIR)/json 76 | @echo 77 | @echo "Build finished; now you can process the JSON files." 78 | 79 | htmlhelp: 80 | $(SPHINXBUILD) -b htmlhelp $(ALLSPHINXOPTS) $(BUILDDIR)/htmlhelp 81 | @echo 82 | @echo "Build finished; now you can run HTML Help Workshop with the" \ 83 | ".hhp project file in $(BUILDDIR)/htmlhelp." 
84 | 85 | qthelp: 86 | $(SPHINXBUILD) -b qthelp $(ALLSPHINXOPTS) $(BUILDDIR)/qthelp 87 | @echo 88 | @echo "Build finished; now you can run "qcollectiongenerator" with the" \ 89 | ".qhcp project file in $(BUILDDIR)/qthelp, like this:" 90 | @echo "# qcollectiongenerator $(BUILDDIR)/qthelp/DecisionEngine.qhcp" 91 | @echo "To view the help file:" 92 | @echo "# assistant -collectionFile $(BUILDDIR)/qthelp/DecisionEngine.qhc" 93 | 94 | applehelp: 95 | $(SPHINXBUILD) -b applehelp $(ALLSPHINXOPTS) $(BUILDDIR)/applehelp 96 | @echo 97 | @echo "Build finished. The help book is in $(BUILDDIR)/applehelp." 98 | @echo "N.B. You won't be able to view it unless you put it in" \ 99 | "~/Library/Documentation/Help or install it in your application" \ 100 | "bundle." 101 | 102 | devhelp: 103 | $(SPHINXBUILD) -b devhelp $(ALLSPHINXOPTS) $(BUILDDIR)/devhelp 104 | @echo 105 | @echo "Build finished." 106 | @echo "To view the help file:" 107 | @echo "# mkdir -p $$HOME/.local/share/devhelp/DecisionEngine" 108 | @echo "# ln -s $(BUILDDIR)/devhelp $$HOME/.local/share/devhelp/DecisionEngine" 109 | @echo "# devhelp" 110 | 111 | epub: 112 | $(SPHINXBUILD) -b epub $(ALLSPHINXOPTS) $(BUILDDIR)/epub 113 | @echo 114 | @echo "Build finished. The epub file is in $(BUILDDIR)/epub." 115 | 116 | latex: 117 | $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex 118 | @echo 119 | @echo "Build finished; the LaTeX files are in $(BUILDDIR)/latex." 120 | @echo "Run \`make' in that directory to run these through (pdf)latex" \ 121 | "(use \`make latexpdf' here to do that automatically)." 122 | 123 | latexpdf: 124 | $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex 125 | @echo "Running LaTeX files through pdflatex..." 126 | $(MAKE) -C $(BUILDDIR)/latex all-pdf 127 | @echo "pdflatex finished; the PDF files are in $(BUILDDIR)/latex." 128 | 129 | latexpdfja: 130 | $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex 131 | @echo "Running LaTeX files through platex and dvipdfmx..." 132 | $(MAKE) -C $(BUILDDIR)/latex all-pdf-ja 133 | @echo "pdflatex finished; the PDF files are in $(BUILDDIR)/latex." 134 | 135 | text: 136 | $(SPHINXBUILD) -b text $(ALLSPHINXOPTS) $(BUILDDIR)/text 137 | @echo 138 | @echo "Build finished. The text files are in $(BUILDDIR)/text." 139 | 140 | man: 141 | $(SPHINXBUILD) -b man $(ALLSPHINXOPTS) $(BUILDDIR)/man 142 | @echo 143 | @echo "Build finished. The manual pages are in $(BUILDDIR)/man." 144 | 145 | texinfo: 146 | $(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo 147 | @echo 148 | @echo "Build finished. The Texinfo files are in $(BUILDDIR)/texinfo." 149 | @echo "Run \`make' in that directory to run these through makeinfo" \ 150 | "(use \`make info' here to do that automatically)." 151 | 152 | info: 153 | $(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo 154 | @echo "Running Texinfo files through makeinfo..." 155 | make -C $(BUILDDIR)/texinfo info 156 | @echo "makeinfo finished; the Info files are in $(BUILDDIR)/texinfo." 157 | 158 | gettext: 159 | $(SPHINXBUILD) -b gettext $(I18NSPHINXOPTS) $(BUILDDIR)/locale 160 | @echo 161 | @echo "Build finished. The message catalogs are in $(BUILDDIR)/locale." 162 | 163 | changes: 164 | $(SPHINXBUILD) -b changes $(ALLSPHINXOPTS) $(BUILDDIR)/changes 165 | @echo 166 | @echo "The overview file is in $(BUILDDIR)/changes." 
167 | 168 | linkcheck: 169 | $(SPHINXBUILD) -b linkcheck $(ALLSPHINXOPTS) $(BUILDDIR)/linkcheck 170 | @echo 171 | @echo "Link check complete; look for any errors in the above output " \ 172 | "or in $(BUILDDIR)/linkcheck/output.txt." 173 | 174 | doctest: 175 | $(SPHINXBUILD) -b doctest $(ALLSPHINXOPTS) $(BUILDDIR)/doctest 176 | @echo "Testing of doctests in the sources finished, look at the " \ 177 | "results in $(BUILDDIR)/doctest/output.txt." 178 | 179 | coverage: 180 | $(SPHINXBUILD) -b coverage $(ALLSPHINXOPTS) $(BUILDDIR)/coverage 181 | @echo "Testing of coverage in the sources finished, look at the " \ 182 | "results in $(BUILDDIR)/coverage/python.txt." 183 | 184 | xml: 185 | $(SPHINXBUILD) -b xml $(ALLSPHINXOPTS) $(BUILDDIR)/xml 186 | @echo 187 | @echo "Build finished. The XML files are in $(BUILDDIR)/xml." 188 | 189 | pseudoxml: 190 | $(SPHINXBUILD) -b pseudoxml $(ALLSPHINXOPTS) $(BUILDDIR)/pseudoxml 191 | @echo 192 | @echo "Build finished. The pseudo-XML files are in $(BUILDDIR)/pseudoxml." 193 | -------------------------------------------------------------------------------- /phik/bivariate.py: -------------------------------------------------------------------------------- 1 | """Project: PhiK - correlation analyzer library 2 | 3 | Created: 2019/11/23 4 | 5 | Description: 6 | Convert Pearson correlation value into a chi2 value of a contingency test 7 | matrix of a bivariate gaussian, and vice-versa. 8 | Calculation uses scipy's mvn library. 9 | 10 | Authors: 11 | KPMG Advanced Analytics & Big Data team, Amstelveen, The Netherlands 12 | 13 | Redistribution and use in source and binary forms, with or without 14 | modification, are permitted according to the terms listed in the file 15 | LICENSE. 16 | """ 17 | import warnings 18 | 19 | import numpy as np 20 | import scipy 21 | from scipy import optimize 22 | 23 | _scipy_version = [int(v) for v in scipy.__version__.split('.')] 24 | USE_QMVN = True if _scipy_version[0] >= 1 and _scipy_version[1] >= 16 else False 25 | if USE_QMVN: 26 | from scipy.stats._qmvnt import _qauto, _qmvn 27 | else: 28 | from scipy.stats._mvn import mvnun 29 | 30 | 31 | 32 | 33 | def _mvn_un(rho: float, lower: tuple, upper: tuple, 34 | rng: np.random.Generator = np.random.default_rng(42)) -> float: 35 | """Perform integral of bivariate normal gauss with correlation 36 | 37 | Integral is performed using scipy's mvn library. 38 | 39 | :param float rho: tilt parameter 40 | :param tuple lower: tuple of lower corner of integral area 41 | :param tuple upper: tuple of upper corner of integral area 42 | :param np.random.Generator rng: default_rng(42), optional 43 | :returns float: integral value 44 | """ 45 | mu = np.array([0.0, 0.0]) 46 | S = np.array([[1.0, rho], [rho, 1.0]]) 47 | return _calc_mvnun(lower=lower, upper=upper, mu=mu, S=S, rng=rng) 48 | 49 | 50 | def _calc_mvnun(lower, upper, mu, S, rng = np.random.default_rng(42)): 51 | if USE_QMVN: 52 | res = _qauto(_qmvn, S, lower, upper, rng)[0] 53 | else: 54 | res = mvnun(lower, upper, mu, S)[0] 55 | return res 56 | 57 | 58 | def _mvn_array(rho: float, sx: np.ndarray, sy: np.ndarray) -> list: 59 | """Array of integrals over bivariate normal gauss with correlation 60 | 61 | Integrals are performed using scipy's mvn library. 
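    Note: to limit the number of expensive integrator calls, only half of the bins are
    integrated explicitly; the values of the mirror-image bins are copied from them.
    This relies on the bin edges being symmetric around zero, as they are constructed
    elsewhere in this module.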
62 | 63 | :param float rho: tilt parameter 64 | :param np.ndarray sx: bin edges array of x-axis 65 | :param np.ndarray sy: bin edges array of y-axis 66 | :returns list: list of integral values 67 | """ 68 | # ranges = [([sx[i], sy[j]], [sx[i+1], sy[j+1]]) for i in range(len(sx) - 1) for j in range(len(sy) - 1)] 69 | # corr = [mvn.mvnun(lower, upper, mu, S)[0] for lower, upper in ranges] 70 | # return corr 71 | 72 | # mean and covariance 73 | mu = np.array([0.0, 0.0]) 74 | S = np.array([[1.0, rho], [rho, 1.0]]) 75 | 76 | # callling mvn.mvnun is expensive, so we only calculate half of the matrix, then symmetrize 77 | # add half block, which is symmetric in x 78 | odd_odd = False 79 | ranges = [ 80 | ([sx[i], sy[j]], [sx[i + 1], sy[j + 1]]) 81 | for i in range((len(sx) - 1) // 2) 82 | for j in range(len(sy) - 1) 83 | ] 84 | # add odd middle row, which is symmetric in y 85 | if (len(sx) - 1) % 2 == 1: 86 | i = (len(sx) - 1) // 2 87 | ranges += [ 88 | ([sx[i], sy[j]], [sx[i + 1], sy[j + 1]]) for j in range((len(sy) - 1) // 2) 89 | ] 90 | # add center point, add this only once 91 | if (len(sy) - 1) % 2 == 1: 92 | j = (len(sy) - 1) // 2 93 | ranges.append(([sx[i], sy[j]], [sx[i + 1], sy[j + 1]])) 94 | odd_odd = True 95 | 96 | corr = np.array([_calc_mvnun(lower, upper, mu, S) for lower, upper in ranges]) 97 | # add second half, exclude center 98 | corr = np.concatenate([corr, corr if not odd_odd else corr[:-1]]) 99 | return corr 100 | 101 | 102 | def bivariate_normal_theory( 103 | rho: float, 104 | nx: int = -1, 105 | ny: int = -1, 106 | n: int = 1, 107 | sx: np.ndarray = None, 108 | sy: np.ndarray = None, 109 | ) -> np.ndarray: 110 | """Return binned pdf of bivariate normal distribution. 111 | 112 | This function returns a "perfect" binned bivariate normal distribution. 113 | 114 | :param float rho: tilt parameter 115 | :param int nx: number of uniform bins on x-axis. alternative to sx. 116 | :param int ny: number of uniform bins on y-axis. alternative to sy. 117 | :param np.ndarray sx: bin edges array of x-axis. default is None. 118 | :param np.ndarray sy: bin edges array of y-axis. default is None. 119 | :param int n: number of entries. default is one. 120 | :return: np.ndarray of binned bivariate normal pdf 121 | """ 122 | 123 | if n < 1: 124 | raise ValueError("Number of entries needs to be one or greater.") 125 | if sx is None: 126 | sx = np.linspace(-5, 5, nx + 1) 127 | if sy is None: 128 | sy = np.linspace(-5, 5, ny + 1) 129 | 130 | bvn = np.zeros((ny, nx)) 131 | for i in range(len(sx) - 1): 132 | for j in range(len(sy) - 1): 133 | lower = (sx[i], sy[j]) 134 | upper = (sx[i + 1], sy[j + 1]) 135 | p = _mvn_un(rho, lower, upper) 136 | bvn[j, i] = p 137 | bvn *= n 138 | 139 | # patch for entry levels that are below machine precision 140 | # (simulation does not work otherwise) 141 | bvn[bvn < np.finfo(np.float).eps] = np.finfo(np.float).eps 142 | 143 | return bvn 144 | 145 | 146 | def chi2_from_phik( 147 | rho: float, 148 | n: int, 149 | subtract_from_chi2: float = 0, 150 | corr0: list = None, 151 | scale: float = None, 152 | sx: np.ndarray = None, 153 | sy: np.ndarray = None, 154 | pedestal: float = 0, 155 | nx: int = -1, 156 | ny: int = -1, 157 | ) -> float: 158 | """Calculate chi2-value of bivariate gauss having correlation value rho 159 | 160 | Calculate no-noise chi2 value of bivar gauss with correlation rho, 161 | with respect to bivariate gauss without any correlation. 
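    In outline (a sketch of what the implementation below computes)::

        chi2 = pedestal + scale * n * sum_ij (p_rho(i,j) - p_0(i,j))**2 / p_0(i,j) - subtract_from_chi2

    where p_rho(i,j) denotes the probability content of bin (i,j) for a bivariate normal
    with correlation rho, and p_0(i,j) the same for rho = 0.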
162 | 163 | :param float rho: tilt parameter 164 | :param int n: number of records 165 | :param float subtract_from_chi2: value subtracted from chi2 calculation. default is 0. 166 | :param list corr0: mvn_array result for rho=0. Default is None. 167 | :param float scale: scale is multiplied with the chi2 if set. 168 | :param np.ndarray sx: bin edges array of x-axis. default is None. 169 | :param np.ndarray sy: bin edges array of y-axis. default is None. 170 | :param float pedestal: pedestal is added to the chi2 if set. 171 | :param int nx: number of uniform bins on x-axis. alternative to sx. 172 | :param int ny: number of uniform bins on y-axis. alternative to sy. 173 | :returns float: chi2 value 174 | """ 175 | 176 | if sx is None: 177 | sx = np.linspace(-5, 5, nx + 1) 178 | 179 | if sy is None: 180 | sy = np.linspace(-5, 5, ny + 1) 181 | 182 | if corr0 is None: 183 | corr0 = _mvn_array(0, sx, sy) 184 | if scale is None: 185 | # scale ensures that for rho=1, chi2 is the maximum possible value 186 | corr1 = _mvn_array(1, sx, sy) 187 | delta_corr2 = (corr1 - corr0) ** 2 188 | # protect against division by zero 189 | ratio = np.divide( 190 | delta_corr2, corr0, out=np.zeros_like(delta_corr2), where=corr0 != 0 191 | ) 192 | chi2_one = n * np.sum(ratio) 193 | # chi2_one = n * sum([((c1-c0)*(c1-c0)) / c0 for c0, c1 in zip(corr0, corr1)]) 194 | chi2_max = n * min(nx - 1, ny - 1) 195 | scale = (chi2_max - pedestal) / chi2_one 196 | 197 | corrr = _mvn_array(rho, sx, sy) 198 | delta_corr2 = (corrr - corr0) ** 2 199 | # protect against division by zero 200 | ratio = np.divide( 201 | delta_corr2, corr0, out=np.zeros_like(delta_corr2), where=corr0 != 0 202 | ) 203 | chi2_rho = n * np.sum(ratio) 204 | # chi2_rho = (n * sum([((cr-c0)*(cr-c0)) / c0 for c0, cr in zip(corr0, corrr)])) 205 | 206 | chi2 = pedestal + chi2_rho * scale 207 | return chi2 - subtract_from_chi2 208 | 209 | 210 | def phik_from_chi2( 211 | chi2: float, 212 | n: int, 213 | nx: int, 214 | ny: int, 215 | sx: np.ndarray = None, 216 | sy: np.ndarray = None, 217 | pedestal: float = 0, 218 | ) -> float: 219 | """ 220 | Correlation coefficient of bivariate gaussian derived from chi2-value 221 | 222 | Chi2-value gets converted into correlation coefficient of bivariate gauss 223 | with correlation value rho, assuming giving binning and number of records. 224 | Correlation coefficient value is between 0 and 1. 225 | 226 | Bivariate gaussian's range is set to [-5,5] by construction. 227 | 228 | :param float chi2: input chi2 value 229 | :param int n: number of records 230 | :param int nx: number of uniform bins on x-axis. alternative to sx. 231 | :param int ny: number of uniform bins on y-axis. alternative to sy. 232 | :param np.ndarray sx: bin edges array of x-axis. default is None. 233 | :param np.ndarray sy: bin edges array of y-axis. default is None. 234 | :param float pedestal: pedestal is added to the chi2 if set. 
235 | :returns float: correlation coefficient 236 | """ 237 | 238 | if pedestal < 0: 239 | raise ValueError("noise pedestal should be greater than zero.") 240 | 241 | if sx is None: 242 | sx = np.linspace(-5, 5, nx + 1) 243 | elif nx <= 1: 244 | raise ValueError("number of bins along x-axis is unknown") 245 | if sy is None: 246 | sy = np.linspace(-5, 5, ny + 1) 247 | elif ny <= 1: 248 | raise ValueError("number of bins along y-axis is unknown") 249 | 250 | corr0 = _mvn_array(0, sx, sy) 251 | 252 | # scale ensures that for rho=1, chi2 is the maximum possible value 253 | corr1 = _mvn_array(1, sx, sy) 254 | if 0 in corr0 and len(corr0) > 10000: 255 | warnings.warn( 256 | "Many cells: {0:d}. Are interval variables set correctly?".format( 257 | len(corr0) 258 | ) 259 | ) 260 | 261 | delta_corr2 = (corr1 - corr0) ** 2 262 | # protect against division by zero 263 | ratio = np.divide( 264 | delta_corr2, corr0, out=np.zeros_like(delta_corr2), where=corr0 != 0 265 | ) 266 | chi2_one = n * np.sum(ratio) 267 | # chi2_one = n * sum([((c1-c0)*(c1-c0)) / c0 if c0 > 0 else 0 for c0,c1 in zip(corr0,corr1)]) 268 | chi2_max = n * min(nx - 1, ny - 1) 269 | scale = (chi2_max - pedestal) / chi2_one 270 | if chi2 > chi2_max and np.isclose(chi2, chi2_max, atol=1e-14): 271 | chi2 = chi2_max 272 | 273 | # only solve for rho if chi2 exceeds noise pedestal 274 | if chi2 <= pedestal: 275 | return 0.0 276 | elif chi2 >= chi2_max: 277 | return 1.0 278 | 279 | rho = optimize.brentq( 280 | chi2_from_phik, 0, 1, args=(n, chi2, corr0, scale, sx, sy, pedestal), xtol=1e-5 281 | ) 282 | return rho 283 | -------------------------------------------------------------------------------- /phik/binning.py: -------------------------------------------------------------------------------- 1 | """Project: PhiK - correlation analyzer library 2 | 3 | Created: 2018/09/06 4 | 5 | Description: 6 | A set of rebinning functions, to help rebin two lists into a 2d histogram. 7 | 8 | Authors: 9 | KPMG Advanced Analytics & Big Data team, Amstelveen, The Netherlands 10 | 11 | Redistribution and use in source and binary forms, with or without 12 | modification, are permitted according to the terms listed in the file 13 | LICENSE. 14 | """ 15 | import sys 16 | from typing import List, Optional, Tuple, Union 17 | 18 | import numpy as np 19 | import pandas as pd 20 | 21 | from phik import definitions as defs 22 | from phik.data_quality import dq_check_nunique_values 23 | from phik.utils import array_like_to_dataframe, guess_interval_cols 24 | 25 | 26 | def bin_edges( 27 | arr: Union[np.ndarray, list, pd.Series], nbins: int, quantile: bool = False 28 | ) -> np.ndarray: 29 | """ 30 | Create uniform or quantile bin-edges for the input array. 
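    Example (illustrative only)::

        >>> import numpy as np
        >>> edges = bin_edges(np.array([0.0, 1.0, 2.0, 3.0, 4.0]), nbins=4)
        >>> len(edges)  # nbins + 1 edges, running from just below the minimum up to the maximum
        5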
31 | 32 | :param arr: array like object with input data 33 | :param int nbins: the number of bin 34 | :param bool quantile: uniform bins (False) or bins based on quantiles (True) 35 | :returns: array with bin edges 36 | """ 37 | 38 | if quantile: 39 | quantiles = np.linspace(0, 1, nbins + 1) 40 | xbins = np.quantile(arr[~np.isnan(arr)], quantiles) 41 | xbins[0] -= max(1e-14 * abs(xbins[0]), sys.float_info.min) 42 | else: 43 | min_value = np.min(arr[~np.isnan(arr)]) 44 | constant = max(1e-14 * abs(min_value), sys.float_info.min) 45 | xbins = np.linspace( 46 | min_value - constant, np.max(arr[~np.isnan(arr)]), nbins + 1 47 | ) 48 | 49 | return xbins 50 | 51 | 52 | def bin_array( 53 | arr: Union[np.ndarray, list], bin_edges: Union[np.ndarray, list] 54 | ) -> Tuple[np.ndarray, list]: 55 | """ 56 | Index the data given the bin_edges. 57 | 58 | Underflow and overflow values are indicated. 59 | 60 | :param arr: array like object with input data 61 | :param bin_edges: list with bin edges. 62 | :returns: indexed data 63 | """ 64 | 65 | # Bin data 66 | binned_arr = np.searchsorted(bin_edges, arr).astype(object) 67 | 68 | # Check if all bins are filled and store bin-labels 69 | bin_labels = [] 70 | bin_indices = pd.Series(binned_arr).value_counts().index 71 | for i in range(1, len(bin_edges)): 72 | if i in bin_indices: 73 | bin_labels.append((bin_edges[i - 1], bin_edges[i])) 74 | 75 | # NaN values are added to the overflow bin. Restore NaN values: 76 | binned_arr[np.argwhere(np.isnan(arr))] = np.nan 77 | 78 | # Set underflow values to UF 79 | binned_arr[np.argwhere(binned_arr == 0)] = defs.UF 80 | 81 | # Set overflow values to OF 82 | binned_arr[np.argwhere(binned_arr == len(bin_edges))] = defs.OF 83 | 84 | return binned_arr, bin_labels 85 | 86 | 87 | def bin_data( 88 | data: pd.DataFrame, 89 | cols: Union[list, np.ndarray, tuple] = (), 90 | bins: Union[int, list, np.ndarray, dict] = 10, 91 | quantile: bool = False, 92 | retbins: bool = False, 93 | ): 94 | """ 95 | Index the input DataFrame given the bin_edges for the columns specified in cols. 96 | 97 | :param DataFrame data: input data 98 | :param list cols: list of columns with numeric data which needs to be indexed 99 | :param bins: number of bins, or a list of bin edges (same for all columns), or a dictionary where per column the bins are specified. (default=10)\ 100 | E.g.: bins = {'mileage':5, 'driver_age':[18,25,35,45,55,65,125]} 101 | :param quantile: when bins is an integer, uniform bins (False) or bins based on quantiles (True) 102 | :returns: rebinned DataFrame 103 | :rtype: pandas.DataFrame 104 | """ 105 | xbins = None 106 | if isinstance(bins, dict): 107 | for col in cols: 108 | if col not in bins: 109 | raise ValueError( 110 | "column {0} is not included in bins dictionary.".format(col) 111 | ) 112 | elif isinstance(bins, (list, np.ndarray)): 113 | xbins = bins 114 | 115 | # MB 20210307: check for numeric bins turned off here, also done in dq_check_nunique_values(). 
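    # From here on, the bin specification is resolved per column: an integer or float is
    # turned into uniform (or quantile) edges via bin_edges(), a list or array is used as
    # the edges directly, and a dict is looked up per column. The column values are then
    # indexed with bin_array().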
116 | 117 | binned_data = data.copy() 118 | 119 | bins_dict = {} 120 | for col in cols: 121 | if np.issubdtype(type(bins), np.integer) or np.issubdtype( 122 | type(bins), np.floating 123 | ): 124 | xbins = bin_edges(data[col].astype(float), int(bins), quantile=quantile) 125 | elif isinstance(bins, dict): 126 | if np.issubdtype(type(bins[col]), np.integer) or np.issubdtype( 127 | type(bins[col]), np.floating 128 | ): 129 | xbins = bin_edges( 130 | data[col].astype(float), int(bins[col]), quantile=quantile 131 | ) 132 | elif isinstance(bins[col], (list, np.ndarray)): 133 | xbins = bins[col] 134 | elif xbins is None: 135 | raise ValueError( 136 | "Unexpected type for bins. The found type was '%s'" % str(type(bins)) 137 | ) 138 | 139 | binned_data[col], bin_labels = bin_array(data[col].astype(float).values, xbins) 140 | if retbins: 141 | bins_dict[col] = bin_labels 142 | 143 | if retbins: 144 | return binned_data, bins_dict 145 | 146 | return binned_data 147 | 148 | 149 | def auto_bin_data( 150 | df: pd.DataFrame, 151 | interval_cols: Optional[list] = None, 152 | bins: Union[int, list, np.ndarray, dict] = 10, 153 | quantile: bool = False, 154 | dropna: bool = True, 155 | verbose: bool = True, 156 | ) -> pd.DataFrame: 157 | """ 158 | Index the input DataFrame with automatic bin_edges and interval columns. 159 | 160 | :param pd.DataFrame data_binned: input data 161 | :param list interval_cols: column names of columns with interval variables. 162 | :param bins: number of bins, or a list of bin edges (same for all columns), or a dictionary where per column 163 | the bins are specified. (default=10)\ 164 | E.g.: bins = {'mileage':5, 'driver_age':[18,25,35,45,55,65,125]} 165 | :param quantile: when bins is an integer, uniform bins (False) or bins based on quantiles (True) 166 | :param bool dropna: remove NaN values with True 167 | :param bool verbose: if False, do not print all interval columns that are guessed 168 | :return: phik correlation matrix 169 | """ 170 | # guess interval columns 171 | if interval_cols is None: 172 | interval_cols = guess_interval_cols(df, verbose) 173 | 174 | # clean the data 175 | df_clean, interval_cols_clean = dq_check_nunique_values( 176 | df, interval_cols, dropna=dropna 177 | ) 178 | 179 | # perform rebinning 180 | data_binned, binning_dict = bin_data( 181 | df_clean, cols=interval_cols_clean, bins=bins, quantile=quantile, retbins=True 182 | ) 183 | return data_binned, binning_dict 184 | 185 | 186 | def create_correlation_overview_table( 187 | vals: List[Tuple[str, str, float]] 188 | ) -> pd.DataFrame: 189 | """ 190 | Create overview table of phik/significance data. 191 | 192 | :param list vals: list holding tuples of data for each variable pair formatted as ('var1', 'var2', value) 193 | :returns: symmetric table with phik/significances of all variable pairs 194 | :rtype: pandas.DataFrame 195 | """ 196 | 197 | ll = [] 198 | for c0, c1, v in vals: 199 | ll.append([c0, c1, v]) 200 | ll.append([c1, c0, v]) 201 | 202 | corr_matrix = pd.DataFrame(ll, columns=["var1", "var2", "vals"]).pivot_table( 203 | index="var1", columns="var2", values="vals" 204 | ) 205 | corr_matrix.columns.name = None 206 | corr_matrix.index.name = None 207 | return corr_matrix 208 | 209 | 210 | def hist2d_from_rebinned_df( 211 | data_binned: pd.DataFrame, 212 | dropna: bool = True, 213 | drop_underflow: bool = True, 214 | drop_overflow: bool = True, 215 | ) -> pd.DataFrame: 216 | """ 217 | Give binned 2d DataFrame of two columns of rebinned input DataFrame 218 | 219 | :param df: input data. 
DataFrame must contain exactly two columns 220 | :param bool dropna: remove NaN values with True 221 | :param bool drop_underflow: do not take into account records in underflow bin when True (relevant when binning\ 222 | a numeric variable) 223 | :param bool drop_overflow: do not take into account records in overflow bin when True (relevant when binning\ 224 | a numeric variable) 225 | :returns: histogram DataFrame 226 | """ 227 | 228 | c0, c1 = data_binned.columns 229 | 230 | if not dropna: 231 | data_binned.fillna(defs.NaN, inplace=True) 232 | if drop_underflow: 233 | data_binned.replace(defs.UF, np.nan, inplace=True) 234 | if drop_overflow: 235 | data_binned.replace(defs.OF, np.nan, inplace=True) 236 | 237 | # create a contingency table 238 | df_datahist = ( 239 | data_binned.groupby([c0, c1])[c0].count().to_frame().unstack().fillna(0) 240 | ) 241 | df_datahist.columns = df_datahist.columns.droplevel() 242 | 243 | return df_datahist 244 | 245 | 246 | def hist2d( 247 | df: pd.DataFrame, 248 | interval_cols: Optional[Union[list, np.ndarray]] = None, 249 | bins: Union[int, float, list, np.ndarray, dict] = 10, 250 | quantile: bool = False, 251 | dropna: bool = True, 252 | drop_underflow: bool = True, 253 | drop_overflow: bool = True, 254 | retbins: bool = False, 255 | verbose: bool = True, 256 | ) -> Union[pd.DataFrame, Tuple[pd.DataFrame, dict]]: 257 | """ 258 | Give binned 2d DataFrame of two columns of input DataFrame 259 | 260 | :param df: input data. DataFrame must contain exactly two columns 261 | :param interval_cols: columns with interval variables which need to be binned 262 | :param bins: number of bins, or a list of bin edges (same for all columns), or a dictionary where per column the bins are specified. (default=10)\ 263 | E.g.: bins = {'mileage':5, 'driver_age':[18,25,35,45,55,65,125]} 264 | :param bool quantile: when the number of bins is specified, use uniform binning (False) or quantile binning (True) 265 | :param bool dropna: remove NaN values with True 266 | :param bool drop_underflow: do not take into account records in underflow bin when True (relevant when binning\ 267 | a numeric variable) 268 | :param bool drop_overflow: do not take into account records in overflow bin when True (relevant when binning\ 269 | a numeric variable) 270 | :param bool verbose: if False, do not print all interval columns that are guessed 271 | :returns: histogram DataFrame 272 | """ 273 | 274 | if len(df.columns) != 2: 275 | raise ValueError("DataFrame should contain only two columns") 276 | 277 | if interval_cols is None: 278 | interval_cols = guess_interval_cols(df, verbose) 279 | 280 | data_binned, binning_dict = bin_data( 281 | df, interval_cols, retbins=True, bins=bins, quantile=quantile 282 | ) 283 | datahist = hist2d_from_rebinned_df( 284 | data_binned, 285 | dropna=dropna, 286 | drop_underflow=drop_underflow, 287 | drop_overflow=drop_overflow, 288 | ) 289 | 290 | if retbins: 291 | return datahist, binning_dict 292 | 293 | return datahist 294 | 295 | 296 | def hist2d_from_array( 297 | x: Union[pd.Series, list, np.ndarray], y: [pd.Series, list, np.ndarray], **kwargs 298 | ) -> Union[pd.DataFrame, Tuple[pd.DataFrame, dict]]: 299 | """ 300 | Give binned 2d DataFrame of two input arrays 301 | 302 | :param x: input data. First array-like. 303 | :param y: input data. Second array-like. 
304 | :param interval_cols: columns with interval variables which need to be binned 305 | :param bins: number of bins, or a list of bin edges (same for all columns), or a dictionary where per column the bins are specified. (default=10)\ 306 | E.g.: bins = {'mileage':5, 'driver_age':[18,25,35,45,55,65,125]} 307 | :param bool quantile: when the number of bins is specified, use uniform binning (False) or quantile binning (True) 308 | :param bool dropna: remove NaN values with True 309 | :param bool drop_underflow: do not take into account records in underflow bin when True (relevant when binning\ 310 | a numeric variable) 311 | :param bool drop_overflow: do not take into account records in overflow bin when True (relevant when binning\ 312 | a numeric variable) 313 | :returns: histogram DataFrame 314 | """ 315 | 316 | df = array_like_to_dataframe(x, y) 317 | return hist2d(df, **kwargs) 318 | -------------------------------------------------------------------------------- /tests/test_phik.py: -------------------------------------------------------------------------------- 1 | """Project: Phi_K - correlation coefficient package 2 | 3 | Created: 2018/11/13 4 | 5 | Description: 6 | Collection of helper functions to get fixtures, i.e. for test data. 7 | These are mostly used by the (integration) tests and example notebooks. 8 | 9 | Authors: 10 | KPMG Advanced Analytics & Big Data team, Amstelveen, The Netherlands 11 | 12 | Redistribution and use in source and binary forms, with or without 13 | modification, are permitted according to the terms listed in the file 14 | LICENSE. 15 | """ 16 | 17 | import unittest 18 | import pytest 19 | 20 | import pandas as pd 21 | import numpy as np 22 | from phik import resources, bivariate 23 | from phik.simulation import sim_2d_data_patefield, CPP_SUPPORT 24 | from phik.binning import auto_bin_data, bin_data 25 | from phik.phik import phik_observed_vs_expected_from_rebinned_df, phik_from_hist2d 26 | from phik.statistics import get_dependent_frequency_estimates 27 | 28 | 29 | @pytest.mark.filterwarnings("ignore:Using or importing the ABCs from") 30 | class PhiKTest(unittest.TestCase): 31 | """Tests for calculation of Phi_K""" 32 | 33 | def test_phik_calculation(self): 34 | """Test the calculation of Phi_K""" 35 | 36 | chi2 = bivariate.chi2_from_phik(0.5, 1000, nx=10, ny=10) 37 | self.assertTrue(np.isclose(chi2, 271.16068979654125, 1e-6)) 38 | 39 | phik = bivariate.phik_from_chi2(chi2, 1000, 10, 10) 40 | self.assertTrue(np.isclose(phik, 0.5, 1e-6)) 41 | 42 | def test_phik_from_hist2d(self): 43 | """Test the calculation of Phi_K value from hist2d""" 44 | 45 | # open fake car insurance data 46 | df = pd.read_csv(resources.fixture("fake_insurance_data.csv.gz")) 47 | 48 | # create contingency matrix 49 | cols = ["mileage", "car_size"] 50 | interval_cols = ["mileage"] 51 | observed = df[cols].hist2d(interval_cols=interval_cols) 52 | 53 | phik_value = phik_from_hist2d(observed) 54 | self.assertAlmostEqual(phik_value, 0.7685888294891855, places=3) 55 | 56 | def test_phik_observed_vs_expected_from_hist2d(self): 57 | """Test the calculation of Phi_K value from hist2d""" 58 | 59 | # open fake car insurance data 60 | df = pd.read_csv(resources.fixture("fake_insurance_data.csv.gz")) 61 | 62 | # create contingency matrix 63 | cols = ["mileage", "car_size"] 64 | interval_cols = ["mileage"] 65 | 66 | observed = df[cols].hist2d(interval_cols=interval_cols).values 67 | expected = get_dependent_frequency_estimates(observed) 68 | 69 | phik_value = phik_from_hist2d(observed=observed, 
expected=expected) 70 | self.assertAlmostEqual(phik_value, 0.7685888294891855, places=3) 71 | 72 | def test_phik_matrix(self): 73 | """Test the calculation of Phi_K""" 74 | # open fake car insurance data 75 | df = pd.read_csv(resources.fixture("fake_insurance_data.csv.gz")) 76 | cols = list(df.columns) 77 | 78 | # get the phi_k correlation matrix between all variables 79 | interval_cols = ["driver_age", "mileage"] 80 | phik_corr = df.phik_matrix(interval_cols=interval_cols) 81 | 82 | self.assertAlmostEqual( 83 | phik_corr.values[cols.index("car_color"), cols.index("area")], 84 | 0.5904561614620166, 85 | places=3, 86 | ) 87 | self.assertAlmostEqual( 88 | phik_corr.values[cols.index("area"), cols.index("car_color")], 89 | 0.5904561614620166, 90 | places=3, 91 | ) 92 | self.assertAlmostEqual( 93 | phik_corr.values[cols.index("mileage"), cols.index("car_size")], 94 | 0.768588987856336, 95 | places=3, 96 | ) 97 | self.assertAlmostEqual( 98 | phik_corr.values[cols.index("car_size"), cols.index("mileage")], 99 | 0.768588987856336, 100 | places=3, 101 | ) 102 | 103 | def test_phik_matrix_observed_vs_expected(self): 104 | """Test the calculation of Phi_K""" 105 | # open fake car insurance data 106 | df = pd.read_csv(resources.fixture("fake_insurance_data.csv.gz")) 107 | cols = list(df.columns) 108 | 109 | # get the phi_k correlation matrix between all variables 110 | binned_df, _ = auto_bin_data(df) 111 | phik_corr = phik_observed_vs_expected_from_rebinned_df(binned_df, binned_df) 112 | 113 | self.assertTrue( 114 | np.isclose( 115 | phik_corr.values[cols.index("car_color"), cols.index("area")], 0.0 116 | ) 117 | ) 118 | self.assertTrue( 119 | np.isclose( 120 | phik_corr.values[cols.index("area"), cols.index("car_color")], 0.0 121 | ) 122 | ) 123 | self.assertTrue( 124 | np.isclose( 125 | phik_corr.values[cols.index("mileage"), cols.index("car_size")], 0.0 126 | ) 127 | ) 128 | self.assertTrue( 129 | np.isclose( 130 | phik_corr.values[cols.index("car_size"), cols.index("mileage")], 0.0 131 | ) 132 | ) 133 | self.assertTrue( 134 | np.isclose( 135 | phik_corr.values[cols.index("car_size"), cols.index("car_size")], 1.0 136 | ) 137 | ) 138 | 139 | def test_global_phik(self): 140 | """Test the calculation of global Phi_K values""" 141 | 142 | # open fake car insurance data 143 | df = pd.read_csv(resources.fixture("fake_insurance_data.csv.gz")) 144 | 145 | # get the global phi_k values 146 | interval_cols = ["driver_age", "mileage"] 147 | gk = df.global_phik(interval_cols=interval_cols) 148 | 149 | area = (np.where(gk[1] == "area"))[0][0] 150 | car_size = (np.where(gk[1] == "car_size"))[0][0] 151 | mileage = (np.where(gk[1] == "mileage"))[0][0] 152 | 153 | self.assertAlmostEqual(gk[0][area][0], 0.6057528003711345, places=3) 154 | self.assertAlmostEqual(gk[0][car_size][0], 0.76858883, places=3) 155 | self.assertAlmostEqual(gk[0][mileage][0], 0.768588987856336, places=3) 156 | 157 | def test_significance_matrix_asymptotic(self): 158 | """Test significance calculation""" 159 | 160 | # open fake car insurance data 161 | df = pd.read_csv(resources.fixture("fake_insurance_data.csv.gz")) 162 | cols = list(df.columns) 163 | # get significances 164 | interval_cols = ["driver_age", "mileage"] 165 | sm = df.significance_matrix( 166 | interval_cols=interval_cols, significance_method="asymptotic" 167 | ) 168 | 169 | self.assertTrue( 170 | np.isclose( 171 | sm.values[cols.index("car_color"), cols.index("area")], 172 | 37.66184429195198, 173 | ) 174 | ) 175 | self.assertTrue( 176 | np.isclose( 177 | 
sm.values[cols.index("area"), cols.index("car_color")], 178 | 37.66184429195198, 179 | ) 180 | ) 181 | self.assertTrue( 182 | np.isclose( 183 | sm.values[cols.index("mileage"), cols.index("car_size")], 184 | 49.3323049685695, 185 | ) 186 | ) 187 | self.assertTrue( 188 | np.isclose( 189 | sm.values[cols.index("car_size"), cols.index("mileage")], 190 | 49.3323049685695, 191 | ) 192 | ) 193 | 194 | def test_significance_matrix_hybrid(self): 195 | """Test significance calculation""" 196 | 197 | # open fake car insurance data 198 | df = pd.read_csv(resources.fixture("fake_insurance_data.csv.gz")) 199 | cols = list(df.columns) 200 | # get significances 201 | interval_cols = ["driver_age", "mileage"] 202 | sm = df.significance_matrix( 203 | interval_cols=interval_cols, significance_method="hybrid" 204 | ) 205 | 206 | self.assertTrue( 207 | np.isclose( 208 | sm.values[cols.index("car_color"), cols.index("area")], 209 | 37.63086023595297, 210 | atol=10e-2, 211 | ) 212 | ) 213 | self.assertTrue( 214 | np.isclose( 215 | sm.values[cols.index("area"), cols.index("car_color")], 216 | 37.63086023595297, 217 | atol=10e-2, 218 | ) 219 | ) 220 | self.assertTrue( 221 | np.isclose( 222 | sm.values[cols.index("mileage"), cols.index("car_size")], 223 | 49.28345609465683, 224 | atol=10e-2, 225 | ) 226 | ) 227 | self.assertTrue( 228 | np.isclose( 229 | sm.values[cols.index("car_size"), cols.index("mileage")], 230 | 49.28345609465683, 231 | atol=10e-2, 232 | ) 233 | ) 234 | 235 | def test_significance_matrix_mc(self): 236 | """Test significance calculation""" 237 | 238 | # open fake car insurance data 239 | df = pd.read_csv(resources.fixture("fake_insurance_data.csv.gz")) 240 | cols = list(df.columns) 241 | # get significances 242 | interval_cols = ["driver_age", "mileage"] 243 | sm = df.significance_matrix( 244 | interval_cols=interval_cols, significance_method="MC" 245 | ) 246 | 247 | self.assertTrue( 248 | np.isclose(sm.values[cols.index("car_color"), cols.index("area")], np.inf) 249 | ) 250 | self.assertTrue( 251 | np.isclose(sm.values[cols.index("area"), cols.index("car_color")], np.inf) 252 | ) 253 | self.assertTrue( 254 | np.isclose(sm.values[cols.index("mileage"), cols.index("car_size")], np.inf) 255 | ) 256 | self.assertTrue( 257 | np.isclose(sm.values[cols.index("car_size"), cols.index("mileage")], np.inf) 258 | ) 259 | 260 | def test_hist2d(self): 261 | """Test the calculation of global Phi_K values""" 262 | 263 | # open fake car insurance data 264 | df = pd.read_csv(resources.fixture("fake_insurance_data.csv.gz")) 265 | 266 | # create contingency matrix 267 | cols = ["mileage", "car_size"] 268 | interval_cols = ["mileage"] 269 | h2d = df[cols].hist2d(interval_cols=interval_cols) 270 | 271 | self.assertEqual(h2d.values[1, 1], 10) 272 | self.assertEqual(h2d.values[5, 5], 217) 273 | 274 | def test_hist2d_array(self): 275 | """Test the calculation of global Phi_K values""" 276 | 277 | # open fake car insurance data 278 | df = pd.read_csv(resources.fixture("fake_insurance_data.csv.gz")) 279 | 280 | # create contingency matrix 281 | interval_cols = ["mileage"] 282 | h2d = df["mileage"].hist2d(df["car_size"], interval_cols=interval_cols) 283 | self.assertEqual(h2d.values[1, 1], 10) 284 | self.assertEqual(h2d.values[5, 5], 217) 285 | 286 | def test_outlier_significance_matrix(self): 287 | """Test the calculation of outlier significances""" 288 | 289 | # open fake car insurance data 290 | df = pd.read_csv(resources.fixture("fake_insurance_data.csv.gz")) 291 | 292 | # calculate outlier significances 293 | cols = 
["mileage", "car_size"] 294 | interval_cols = ["mileage"] 295 | om = df[cols].outlier_significance_matrix(interval_cols=interval_cols) 296 | 297 | self.assertTrue(np.isclose(om.values[0, 1], 21.483476494343552)) 298 | self.assertTrue(np.isclose(om.values[2, 4], -1.246784034214704)) 299 | 300 | def test_outlier_significance_matrices(self): 301 | """Test the calculation of outlier significances""" 302 | 303 | # open fake car insurance data 304 | df = pd.read_csv(resources.fixture("fake_insurance_data.csv.gz")) 305 | 306 | # calculate outlier significances 307 | interval_cols = ["mileage", "driver_age"] 308 | om = df.outlier_significance_matrices(interval_cols=interval_cols) 309 | 310 | self.assertTrue(isinstance(om, dict)) 311 | 312 | @pytest.mark.skipif(not CPP_SUPPORT, reason="cpp not supported") 313 | def test_simulation_2d_patefield(self): 314 | """Test simulation code using patefield algorithm.""" 315 | og_state = np.random.get_state() 316 | np.random.seed(42) 317 | sample = np.random.randint(1, 200, (50, 2)) 318 | 319 | # call test function 320 | res = sim_2d_data_patefield(sample, seed=42).T 321 | np.random.set_state(og_state) 322 | mean0, mean1 = res.mean(1) 323 | self.assertTrue(np.isclose(mean0, 105.46)) 324 | self.assertTrue(np.isclose(mean1, 91.18)) 325 | 326 | def test_binning_bin_data_bins_tyes(self): 327 | # Non regression test 328 | # https://github.com/KaveIO/PhiK/issues/28 329 | df = pd.DataFrame({"x": np.random.randn(10)}) 330 | bins_int = np.arange(5, 11, 1) 331 | bins_float = np.arange(5, 11, 1.0) 332 | bins_dict_int = {"x": np.uint8(10)} 333 | bins_dict_float = {"x": np.float32(10.3)} 334 | 335 | for bins in bins_int: 336 | bin_data(df, cols=["x"], bins=bins) 337 | 338 | for bins in bins_float: 339 | bin_data(df, cols=["x"], bins=bins) 340 | 341 | bin_data(df, cols=["x"], bins=bins_dict_int) 342 | bin_data(df, cols=["x"], bins=bins_dict_float) 343 | -------------------------------------------------------------------------------- /tests/integration/test_phik_tutorial_advanced.py: -------------------------------------------------------------------------------- 1 | # # Phi_K advanced tutorial 2 | # 3 | # This notebook guides you through the more advanced functionality of the phik package. This notebook will not cover all the underlying theory, but will just attempt to give an overview of all the options that are available. For a theoretical description the user is referred to our paper. 4 | # 5 | # The package offers functionality on three related topics: 6 | # 7 | # 1. Phik correlation matrix 8 | # 2. Significance matrix 9 | # 3. Outlier significance matrix 10 | 11 | # + 12 | # import standard packages 13 | import numpy as np 14 | import pandas as pd 15 | 16 | from phik import resources 17 | from phik.decorators import * 18 | 19 | # # Load data 20 | # 21 | # A simulated dataset is part of the phik-package. The dataset concerns car insurance data. Load the dataset here: 22 | 23 | 24 | def test_advanced_notebook(): 25 | data = pd.read_csv(resources.fixture("fake_insurance_data.csv.gz")) 26 | 27 | data.head() 28 | 29 | # ## Specify bin types 30 | # 31 | # The phik-package offers a way to calculate correlations between variables of mixed types. Variable types can be inferred automatically although we recommend to variable types to be specified by the user. 32 | # 33 | # Because interval type variables need to be binned in order to calculate phik and the significance, a list of interval variables is created. 
34 | 35 | # + 36 | data_types = { 37 | "severity": "interval", 38 | "driver_age": "interval", 39 | "satisfaction": "ordinal", 40 | "mileage": "interval", 41 | "car_size": "ordinal", 42 | "car_use": "ordinal", 43 | "car_color": "categorical", 44 | "area": "categorical", 45 | } 46 | 47 | interval_cols = [ 48 | col for col, v in data_types.items() if v == "interval" and col in data.columns 49 | ] 50 | # interval_cols is used below 51 | # - 52 | 53 | # # Phik correlation matrix 54 | # 55 | # Now let's start calculating the correlation phik between pairs of variables. 56 | # 57 | # Note that the original dataset is used as input, the binning of interval variables is done automatically. 58 | 59 | phik_overview = data.phik_matrix(interval_cols=interval_cols) 60 | 61 | # ### Specify binning per interval variable 62 | # 63 | # Binning can be set per interval variable individually. One can set the number of bins, or specify a list of bin edges. Note that the measured phik correlation is dependent on the chosen binning. 64 | # The default binning is uniform between the min and max values of the interval variable. 65 | 66 | bins = {"mileage": 5, "driver_age": [18, 25, 35, 45, 55, 65, 125]} 67 | phik_overview = data.phik_matrix(interval_cols=interval_cols, bins=bins) 68 | 69 | # ### Do not apply noise correction 70 | # 71 | # For low statistics samples often a correlation larger than zero is measured when no correlation is actually present in the true underlying distribution. This is not only the case for phik, but also for the pearson correlation and Cramer's phi (see figure 4 in XX ). In the phik calculation a noise correction is applied by default, to take into account erroneous correlation values as a result of low statistics. To switch off this noise cancellation (not recommended), do: 72 | 73 | phik_overview = data.phik_matrix( 74 | interval_cols=interval_cols, noise_correction=False 75 | ) 76 | 77 | # ### Using a different expectation histogram 78 | # 79 | # By default phik compares the 2d distribution of two (binned) variables with the distribution that assumes no dependency between them. One can also change the expected distribution though. Phi_K is calculated in the same way, but using the other expectation distribution. 80 | 81 | from phik.binning import auto_bin_data 82 | from phik.phik import (phik_from_hist2d, 83 | phik_observed_vs_expected_from_rebinned_df) 84 | from phik.statistics import get_dependent_frequency_estimates 85 | 86 | # get observed 2d histogram of two variables 87 | cols = ["mileage", "car_size"] 88 | icols = ["mileage"] 89 | observed = data[cols].hist2d(interval_cols=icols).values 90 | 91 | # default phik evaluation from observed distribution 92 | phik_value = phik_from_hist2d(observed) 93 | print(phik_value) 94 | 95 | # phik evaluation from an observed and expected distribution 96 | expected = get_dependent_frequency_estimates(observed) 97 | phik_value = phik_from_hist2d(observed=observed, expected=expected) 98 | print(phik_value) 99 | 100 | # one can also compare two datasets against each other, and get a full phik matrix that way. 101 | # this needs binned datasets though. 102 | # (the user needs to make sure the binnings of both datasets are identical.) 
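    # Minimal sketch of comparing two datasets (`other_data` is a hypothetical
    # second DataFrame with the same columns): passing the same explicit `bins`
    # to both auto_bin_data calls guarantees identical binnings, e.g.
    #   shared_bins = {"mileage": 5, "driver_age": [18, 25, 35, 45, 55, 65, 125]}
    #   binned_a, _ = auto_bin_data(data, interval_cols=interval_cols, bins=shared_bins)
    #   binned_b, _ = auto_bin_data(other_data, interval_cols=interval_cols, bins=shared_bins)
    #   phik_observed_vs_expected_from_rebinned_df(binned_a, binned_b)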
103 | data_binned, _ = auto_bin_data(data, interval_cols=interval_cols) 104 | 105 | # here we are comparing data_binned against itself 106 | phik_matrix = phik_observed_vs_expected_from_rebinned_df(data_binned, data_binned) 107 | 108 | # all off-diagonal entries are zero, meaning the all 2d distributions of both datasets are identical. 109 | # (by construction the diagonal is one.) 110 | 111 | # # Statistical significance of the correlation 112 | # 113 | # When assessing correlations it is good practise to evaluate both the correlation and the significance of the correlation: a large correlation may be statistically insignificant, and vice versa a small correlation may be very significant. For instance, scipy.stats.pearsonr returns both the pearson correlation and the p-value. Similarly, the phik package offers functionality the calculate a significance matrix. Significance is defined as: 114 | # 115 | # $$Z = \Phi^{-1}(1-p)\ ;\quad \Phi(z)=\frac{1}{\sqrt{2\pi}} \int_{-\infty}^{z} e^{-t^{2}/2}\,{\rm d}t $$ 116 | # 117 | # Several corrections to the 'standard' p-value calculation are taken into account, making the method more robust for low statistics and sparse data cases. The user is referred to our paper for more details. 118 | # 119 | # Due to the corrections, the significance calculation can take a few seconds. 120 | 121 | significance_overview = data.significance_matrix(interval_cols=interval_cols) 122 | 123 | # ### Specify binning per interval variable 124 | # Binning can be set per interval variable individually. One can set the number of bins, or specify a list of bin edges. Note that the measure phik correlation is dependent on the chosen binning. 125 | 126 | bins = {"mileage": 5, "driver_age": [18, 25, 35, 45, 55, 65, 125]} 127 | significance_overview = data.significance_matrix( 128 | interval_cols=interval_cols, bins=bins 129 | ) 130 | 131 | # ### Specify significance method 132 | # 133 | # The recommended method to calculate the significance of the correlation is a hybrid approach, which uses the G-test statistic. The number of degrees of freedom and an analytical, empirical description of the $\chi^2$ distribution are sed, based on Monte Carlo simulations. This method works well for both high as low statistics samples. 134 | # 135 | # Other approaches to calculate the significance are implemented: 136 | # - asymptotic: fast, but over-estimates the number of degrees of freedom for low statistics samples, leading to erroneous values of the significance 137 | # - MC: Many simulated samples are needed to accurately measure significances larger than 3, making this method computationally expensive. 138 | # 139 | 140 | significance_overview = data.significance_matrix( 141 | interval_cols=interval_cols, significance_method="asymptotic" 142 | ) 143 | 144 | # ### Simulation method 145 | # 146 | # The chi2 of a contingency table is measured using a comparison of the expected frequencies with the true frequencies in a contingency table. The expected frequencies can be simulated in a variety of ways. The following methods are implemented: 147 | # 148 | # - multinominal: Only the total number of records is fixed. (default) 149 | # - row_product_multinominal: The row totals fixed in the sampling. 150 | # - col_product_multinominal: The column totals fixed in the sampling. 151 | # - hypergeometric: Both the row or column totals are fixed in the sampling. (Note that this type of sampling is only available when row and column totals are integers, which is usually the case.) 
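    # Any of the simulation methods listed above can be selected through the
    # `simulation_method` argument (illustrative sketch, not executed here):
    #   data.significance_matrix(interval_cols=interval_cols,
    #                            simulation_method='row_product_multinominal')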
152 | 153 | # + 154 | # --- Warning, can be slow 155 | # turned off here by default for unit testing purposes 156 | 157 | # significance_overview = data.significance_matrix(interval_cols=interval_cols, simulation_method='hypergeometric') 158 | # significance_overview 159 | # - 160 | 161 | # ### Expected frequencies 162 | 163 | from phik.simulation import (sim_2d_data, sim_2d_data_patefield, 164 | sim_2d_product_multinominal) 165 | 166 | inputdata = data[["driver_age", "area"]].hist2d(interval_cols=["driver_age"]) 167 | 168 | # #### Multinominal 169 | 170 | simdata = sim_2d_data(inputdata.values) 171 | print("data total:", inputdata.sum().sum()) 172 | print("sim total:", simdata.sum().sum()) 173 | print("data row totals:", inputdata.sum(axis=0).values) 174 | print("sim row totals:", simdata.sum(axis=0)) 175 | print("data column totals:", inputdata.sum(axis=1).values) 176 | print("sim column totals:", simdata.sum(axis=1)) 177 | 178 | # #### product multinominal 179 | 180 | simdata = sim_2d_product_multinominal(inputdata.values, axis=0) 181 | print("data total:", inputdata.sum().sum()) 182 | print("sim total:", simdata.sum().sum()) 183 | print("data row totals:", inputdata.sum(axis=0).astype(int).values) 184 | print("sim row totals:", simdata.sum(axis=0).astype(int)) 185 | print("data column totals:", inputdata.sum(axis=1).astype(int).values) 186 | print("sim column totals:", simdata.sum(axis=1).astype(int)) 187 | 188 | # #### hypergeometric ("patefield") 189 | 190 | # + 191 | # patefield simulation needs compiled c++ code. 192 | # only run this if the python binding to the (compiled) patefiled simulation function is found. 193 | from phik.simcore import CPP_SUPPORT 194 | 195 | if CPP_SUPPORT: 196 | simdata = sim_2d_data_patefield(inputdata.values) 197 | print("data total:", inputdata.sum().sum()) 198 | print("sim total:", simdata.sum().sum()) 199 | print("data row totals:", inputdata.sum(axis=0).astype(int).values) 200 | print("sim row totals:", simdata.sum(axis=0)) 201 | print("data column totals:", inputdata.sum(axis=1).astype(int).values) 202 | print("sim column totals:", simdata.sum(axis=1)) 203 | # - 204 | 205 | # # Outlier significance 206 | # 207 | # The normal pearson correlation between two interval variables is easy to interpret. However, the phik correlation between two variables of mixed type is not always easy to interpret, especially when it concerns categorical variables. Therefore, functionality is provided to detect "outliers": excesses and deficits over the expected frequencies in the contingency table of two variables. 208 | # 209 | 210 | # ### Example 1: mileage versus car_size 211 | 212 | # For the categorical variable pair mileage - car_size we measured: 213 | # 214 | # $$\phi_k = 0.77 \, ,\quad\quad \mathrm{significance} = 46.3$$ 215 | # 216 | # Let's use the outlier significance functionality to gain a better understanding of this significance correlation between mileage and car size. 217 | # 218 | 219 | # + 220 | c0 = "mileage" 221 | c1 = "car_size" 222 | 223 | tmp_interval_cols = ["mileage"] 224 | # - 225 | 226 | outlier_signifs, binning_dict = data[[c0, c1]].outlier_significance_matrix( 227 | interval_cols=tmp_interval_cols, retbins=True 228 | ) 229 | 230 | # ### Specify binning per interval variable 231 | # Binning can be set per interval variable individually. One can set the number of bins, or specify a list of bin edges. 
232 | # 233 | # Note: in case a bin is created without any records this bin will be automatically dropped in the phik and (outlier) significance calculations. However, in the outlier significance calculation this will currently lead to an error as the number of provided bin edges does not match the number of bins anymore. 234 | 235 | bins = [0, 1e2, 1e3, 1e4, 1e5, 1e6] 236 | outlier_signifs, binning_dict = data[[c0, c1]].outlier_significance_matrix( 237 | interval_cols=tmp_interval_cols, bins=bins, retbins=True 238 | ) 239 | 240 | # ### Specify binning per interval variable -- dealing with underflow and overflow 241 | # 242 | # When specifying custom bins as situation can occur when the minimal (maximum) value in the data is smaller (larger) than the minimum (maximum) bin edge. Data points outside the specified range will be collected in the underflow (UF) and overflow (OF) bins. One can choose how to deal with these under/overflow bins, by setting the drop_underflow and drop_overflow variables. 243 | # 244 | # Note that the drop_underflow and drop_overflow options are also available for the calculation of the phik matrix and the significance matrix. 245 | 246 | bins = [1e2, 1e3, 1e4, 1e5] 247 | outlier_signifs, binning_dict = data[[c0, c1]].outlier_significance_matrix( 248 | interval_cols=tmp_interval_cols, 249 | bins=bins, 250 | retbins=True, 251 | drop_underflow=False, 252 | drop_overflow=False, 253 | ) 254 | 255 | # ### Dealing with NaN's in the data 256 | 257 | # Let's add some missing values to our data 258 | 259 | data.loc[np.random.choice(range(len(data)), size=10), "car_size"] = np.nan 260 | data.loc[np.random.choice(range(len(data)), size=10), "mileage"] = np.nan 261 | 262 | # Sometimes there can be information in the missing values and in which case you might want to consider the NaN values as a separate category. This can be achieved by setting the dropna argument to False. 263 | 264 | bins = [1e2, 1e3, 1e4, 1e5] 265 | outlier_signifs, binning_dict = data[[c0, c1]].outlier_significance_matrix( 266 | interval_cols=tmp_interval_cols, 267 | bins=bins, 268 | retbins=True, 269 | drop_underflow=False, 270 | drop_overflow=False, 271 | dropna=False, 272 | ) 273 | 274 | # Here OF and UF are the underflow and overflow bin of car_size, respectively. 275 | # 276 | # To just ignore records with missing values set dropna to True (default). 277 | 278 | bins = [1e2, 1e3, 1e4, 1e5] 279 | outlier_signifs, binning_dict = data[[c0, c1]].outlier_significance_matrix( 280 | interval_cols=tmp_interval_cols, 281 | bins=bins, 282 | retbins=True, 283 | drop_underflow=False, 284 | drop_overflow=False, 285 | dropna=True, 286 | ) 287 | 288 | # Note that the dropna option is also available for the calculation of the phik matrix and the significance matrix. 289 | -------------------------------------------------------------------------------- /tests/integration/test_phik_tutorial_basic.py: -------------------------------------------------------------------------------- 1 | # # Phi_K basic tutorial 2 | # 3 | # This notebook guides you through the basic functionality of the phik package. The package offers functionality on three related topics: 4 | # 5 | # 1. Phik correlation matrix 6 | # 2. Significance matrix 7 | # 3. Outlier significance matrix 8 | # 9 | # For more information on the underlying theory, the user is referred to our paper. 
10 | 11 | import itertools 12 | 13 | import matplotlib.pyplot as plt 14 | # + 15 | # import standard packages 16 | import numpy as np 17 | import pandas as pd 18 | 19 | import phik 20 | from phik import resources 21 | from phik.binning import bin_data 22 | from phik.report import plot_correlation_matrix 23 | 24 | # # Load data 25 | # 26 | # A simulated dataset is part of the phik-package. The dataset concerns fake car insurance data. Load the dataset here: 27 | 28 | 29 | def test_basic_notebook(): 30 | data = pd.read_csv(resources.fixture("fake_insurance_data.csv.gz")) 31 | 32 | # # Take a first look at the data 33 | 34 | # Let's use a simple data.head() to get an idea of what the data looks like and inspect the different types of variables. 35 | 36 | data.head() 37 | 38 | # # Specify bin types 39 | # 40 | # The phik-package offers a way to calculate correlations between variables of mixed types. Variable types can be inferred automatically although we recommend variable types to be specified by the user. 41 | # 42 | # Because interval type variables need to be binned in order to calculate phik and the significance, a list of interval variables is created. 43 | 44 | # + 45 | data_types = { 46 | "severity": "interval", 47 | "driver_age": "interval", 48 | "satisfaction": "ordinal", 49 | "mileage": "interval", 50 | "car_size": "ordinal", 51 | "car_use": "ordinal", 52 | "car_color": "categorical", 53 | "area": "categorical", 54 | } 55 | 56 | interval_cols = [ 57 | col for col, v in data_types.items() if v == "interval" and col in data.columns 58 | ] 59 | # - 60 | 61 | # # Visually inspect pairwise correlations 62 | 63 | # ## Bin the interval variables 64 | # 65 | # To get a feeling for the data, let's bin the interval variables and create 2d histograms to inspect the correlations between variables. By binning the interval variables we can treat all variable types in the same way. 
66 | # 67 | 68 | # bin the interval variables 69 | data_binned, binning_dict = bin_data(data, cols=interval_cols, retbins=True) 70 | 71 | # + 72 | # plot each variable pair 73 | plt.rc("text", usetex=False) 74 | 75 | n = 0 76 | for i in range(len(data.columns)): 77 | n = n + i 78 | 79 | ncols = 3 80 | nrows = int(np.ceil(n / ncols)) 81 | fig, axes = plt.subplots(nrows, ncols, figsize=(15, 4 * nrows)) 82 | ndecimals = 0 83 | 84 | for i, comb in enumerate(itertools.combinations(data_binned.columns.values, 2)): 85 | c = int(i % ncols) 86 | r = int((i - c) / ncols) 87 | 88 | # get data 89 | c0, c1 = comb 90 | datahist = ( 91 | data_binned.groupby([c0, c1])[c0].count().to_frame().unstack().fillna(0) 92 | ) 93 | datahist.columns = datahist.columns.droplevel() 94 | 95 | # plot data 96 | img = axes[r][c].pcolormesh(datahist.values, edgecolor="w", linewidth=1) 97 | 98 | # axis ticks and tick labels 99 | if c0 in binning_dict.keys(): 100 | ylabels = [ 101 | "{1:.{0}f}_{2:.{0}f}".format( 102 | ndecimals, binning_dict[c0][i][0], binning_dict[c0][i][1] 103 | ) 104 | for i in range(len(binning_dict[c0])) 105 | ] 106 | else: 107 | ylabels = datahist.index 108 | 109 | if c1 in binning_dict.keys(): 110 | xlabels = [ 111 | "{1:.{0}f}_{2:.{0}f}".format( 112 | ndecimals, binning_dict[c1][i][0], binning_dict[c1][i][1] 113 | ) 114 | for i in range(len(binning_dict[c1])) 115 | ] 116 | else: 117 | xlabels = datahist.columns 118 | 119 | # axis labels 120 | axes[r][c].set_yticks(np.arange(len(ylabels)) + 0.5) 121 | axes[r][c].set_xticks(np.arange(len(xlabels)) + 0.5) 122 | axes[r][c].set_xticklabels(xlabels, rotation="vertical") 123 | axes[r][c].set_yticklabels(ylabels, rotation="horizontal") 124 | axes[r][c].set_xlabel(datahist.columns.name) 125 | axes[r][c].set_ylabel(datahist.index.name) 126 | axes[r][c].set_title("data") 127 | 128 | plt.tight_layout() 129 | 130 | # - 131 | 132 | # # Correlation: mileage vs car_size 133 | # 134 | # From the above plots it seems like there might be an interesting a correlation between mileage and car_size. Let's see what phik correlation is measured for this data. 135 | 136 | # + 137 | x, y = data[["mileage", "car_size"]].T.values 138 | 139 | print("phik = %.2f" % phik.phik_from_array(x, y, num_vars=["x"])) 140 | print("significance = %.2f" % phik.significance_from_array(x, y, num_vars=["x"])[1]) 141 | 142 | # - 143 | 144 | # Indeed there is a correlation between these variables and the correlation is also significant. To better understand the correlation, we can have a look at the significance of excesses and deficits in the 2-dimensional contingency table, so-called "outlier significances". 145 | 146 | phik.outlier_significance_from_array(x, y, num_vars=["x"]) 147 | 148 | # The values displayed in the matrix are the significances of the outlier frequencies, i.e. a large value means that the measured frequency for that bin is significantly different from the expected frequency in that bin. 149 | # 150 | # Let's visualise for easier interpretation. 
151 | 152 | # + 153 | outlier_signifs = phik.outlier_significance_from_array(x, y, num_vars=["x"]) 154 | 155 | zvalues = outlier_signifs.values 156 | xlabels = outlier_signifs.columns 157 | ylabels = outlier_signifs.index 158 | xlabel = "x" 159 | ylabel = "y" 160 | 161 | plot_correlation_matrix( 162 | zvalues, 163 | x_labels=xlabels, 164 | y_labels=ylabels, 165 | x_label=xlabel, 166 | y_label=ylabel, 167 | vmin=-5, 168 | vmax=5, 169 | title="outlier significance", 170 | identity_layout=False, 171 | fontsize_factor=1.2, 172 | ) 173 | # - 174 | 175 | # # $\phi_k$ functions for dataframes 176 | # 177 | # In our data we have 5 different columns, meaning we have to evaluate 4+3+2+1=10 pairs of variables for possible correlations. In a large dataset, with many different variables, this can easily become a cumbersome task. Can we do this more efficient? yes! We have provided functions that work on dataframes, to allow you to calculate the phik correlation, significance and outlier significance for all different variable combinations at once. 178 | # 179 | 180 | # The functions are by default available after import of the phik package. 181 | 182 | # # $\phi_k$ correlation matrix 183 | # 184 | # Now let's start calculating the phik correlation coefficient between pairs of variables. 185 | # 186 | # Note that the original dataset is used as input, the binning of interval variables is done automatically. 187 | 188 | phik_overview = data.phik_matrix(interval_cols=interval_cols) 189 | phik_overview 190 | 191 | # When no interval columns are provided, the code makes an educated guess 192 | 193 | data.phik_matrix() 194 | 195 | plot_correlation_matrix( 196 | phik_overview.values, 197 | x_labels=phik_overview.columns, 198 | y_labels=phik_overview.index, 199 | vmin=0, 200 | vmax=1, 201 | color_map="Blues", 202 | title=r"correlation $\phi_K$", 203 | fontsize_factor=1.5, 204 | figsize=(7, 5.5), 205 | ) 206 | plt.tight_layout() 207 | 208 | # # Global correlation: $g_k$ 209 | # 210 | # The global correlation coefficient is a measure of the total correlation of one variable to all other variables in the dataset. They give an indication of how well on variable can be modelled in terms of the other variables. A calculation of the global correlation coefficient is provided within the phik package. 211 | 212 | global_correlation, global_labels = data.global_phik(interval_cols=interval_cols) 213 | for c, l in zip(global_correlation, global_labels): 214 | print(l, c[0]) 215 | 216 | plot_correlation_matrix( 217 | global_correlation, 218 | x_labels=[""], 219 | y_labels=global_labels, 220 | vmin=0, 221 | vmax=1, 222 | figsize=(3.5, 4), 223 | color_map="Blues", 224 | title=r"$g_k$", 225 | fontsize_factor=1.5, 226 | ) 227 | plt.tight_layout() 228 | 229 | # # Statistical significance of the correlation: $Z$-score 230 | # 231 | # When assessing correlations it is good practise to evaluate both the correlation and the significance of the correlation: a large correlation may be statistically insignificant, and vice versa a small correlation may be very significant. For instance, scipy.stats.pearsonr returns both the pearson correlation and the p-value. Similarly, the phik package offers functionality the calculate a significance matrix. 
Significance is defined as: 232 | # 233 | # $$Z = \Phi^{-1}(1-p)\ ;\quad \Phi(z)=\frac{1}{\sqrt{2\pi}} \int_{-\infty}^{z} e^{-t^{2}/2}\,{\rm d}t $$ 234 | # 235 | # Several corrections to the 'standard' p-value calculation are taken into account, making the method more robust for low statistics and sparse data cases. The user is referred to our paper for more details. 236 | # 237 | # As a result, the calculation may take a few seconds. 238 | 239 | significance_overview = data.significance_matrix(interval_cols=interval_cols) 240 | significance_overview 241 | 242 | plot_correlation_matrix( 243 | significance_overview.fillna(0).values, 244 | x_labels=significance_overview.columns, 245 | y_labels=significance_overview.index, 246 | vmin=-5, 247 | vmax=5, 248 | title="significance", 249 | usetex=False, 250 | fontsize_factor=1.5, 251 | figsize=(7, 5.5), 252 | ) 253 | plt.tight_layout() 254 | 255 | # # Outlier significance 256 | # 257 | # The normal pearson correlation between two interval variables is easy to interpret. However, the phik correlation between two variables of mixed type is not always easy to interpret, especially when it concerns categorical variables. Therefore, functionality is provided to detect "outliers": excesses and deficits over the expected frequencies in the contingency table of two variables. 258 | # 259 | 260 | # ### Example 1: car_color versus area 261 | # 262 | # For the categorical variable pair car_color - area we measured: 263 | # 264 | # $$\phi_k = 0.59 \, ,\quad\quad \mathrm{significance} = 37.6$$ 265 | # 266 | # Let's use the outlier significance functionality to gain a better understanding of the significance correlation between car color and area. 267 | # 268 | 269 | c1 = "car_color" 270 | c0 = "area" 271 | 272 | outlier_signifs, binning_dict = data[[c0, c1]].outlier_significance_matrix( 273 | retbins=True 274 | ) 275 | outlier_signifs 276 | 277 | # + 278 | zvalues = outlier_signifs.values 279 | xlabels = binning_dict[c1] if c1 in binning_dict.keys() else outlier_signifs.columns 280 | ylabels = binning_dict[c0] if c0 in binning_dict.keys() else outlier_signifs.index 281 | xlabel = c1 282 | ylabel = c0 283 | 284 | plot_correlation_matrix( 285 | zvalues, 286 | x_labels=xlabels, 287 | y_labels=ylabels, 288 | x_label=xlabel, 289 | y_label=ylabel, 290 | vmin=-5, 291 | vmax=5, 292 | title="outlier significance", 293 | identity_layout=False, 294 | fontsize_factor=1.2, 295 | ) 296 | # - 297 | 298 | # The significance of each cell is expressed in terms of Z (one-sided). 299 | # 300 | # Interesting, owners of a green car are more likely to live in the country side, and black cars are more likely to travel on unpaved roads! 301 | 302 | # ### Example 2: mileage versus car_size 303 | 304 | # For the categorical variable pair mileage - car_size we measured: 305 | # 306 | # $$\phi_k = 0.77 \, ,\quad\quad \mathrm{significance} = 46.3$$ 307 | # 308 | # Let's use the outlier significance functionality to gain a better understanding of this significance correlation between mileage and car size. 309 | # 310 | 311 | # + 312 | c0 = "mileage" 313 | c1 = "car_size" 314 | 315 | tmp_interval_cols = ["mileage"] 316 | # - 317 | 318 | outlier_signifs, binning_dict = data[[c0, c1]].outlier_significance_matrix( 319 | interval_cols=tmp_interval_cols, retbins=True 320 | ) 321 | outlier_signifs 322 | 323 | # Note that the interval variable mileage is binned automatically in 10 uniformly spaced bins! 
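    # (Illustrative, not executed here: the automatic binning could be
    #  overridden by passing e.g. bins={'mileage': 5} or an explicit list of
    #  bin edges to outlier_significance_matrix, as shown in the advanced
    #  tutorial.)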
324 | 325 | # + 326 | zvalues = outlier_signifs.values 327 | xlabels = outlier_signifs.columns 328 | ylabels = outlier_signifs.index 329 | xlabel = c1 330 | ylabel = c0 331 | 332 | plot_correlation_matrix( 333 | zvalues, 334 | x_labels=xlabels, 335 | y_labels=ylabels, 336 | x_label=xlabel, 337 | y_label=ylabel, 338 | vmin=-5, 339 | vmax=5, 340 | title="outlier significance", 341 | identity_layout=False, 342 | fontsize_factor=1.2, 343 | ) 344 | # - 345 | 346 | # # Correlation report 347 | 348 | # A full correlation report can be created automatically for a dataset by pairwise evaluation of all correlations, significances and outlier significances. 349 | # 350 | # Note that for a dataset with many different columns the number of outlier significances plots can grow large very rapidly. Therefore, the feature is implemented to only evaluate outlier significances for those variable pairs with a significance and correlation larger than the given thresholds. 351 | 352 | from phik import report 353 | 354 | rep = report.correlation_report( 355 | data, significance_threshold=3, correlation_threshold=0.5 356 | ) 357 | 358 | # # Recap 359 | 360 | # To summarize, the main functions in the phik correlation package working on a dataframe are: 361 | # 362 | # - `df[twocols].hist2d()` or `series.hist2d(other_series)` 363 | # - `df.phik_matrix()` 364 | # - `df.global_phik()` 365 | # - `df.significance_matrix()` 366 | # - `df[twocols].outlier_significance_matrix()` or `series.hist2d(other_series)` 367 | # - `df.outlier_significance_matrices()` 368 | 369 | data[["driver_age", "mileage"]].hist2d() 370 | # Alternatively: data['driver_age'].hist2d(data['mileage']) 371 | 372 | data.phik_matrix() 373 | 374 | data.global_phik() 375 | 376 | data.significance_matrix() 377 | 378 | data[["area", "mileage"]].outlier_significance_matrix() 379 | 380 | os_matrices = data.outlier_significance_matrices() 381 | 382 | os_matrices.keys() 383 | 384 | os_matrices["car_color:mileage"] 385 | -------------------------------------------------------------------------------- /phik/simcore/asa159.cpp: -------------------------------------------------------------------------------- 1 | /* 2 | * Taken from: 3 | * https://people.sc.fsu.edu/~jburkardt/cpp_src/asa159/asa159.html 4 | * 5 | * Michael Patefield, 6 | * Algorithm AS 159: An Efficient Method of Generating RXC Tables with Given Row and Column Totals, 7 | * Applied Statistics, 8 | * Volume 30, Number 1, 1981, pages 91-97. 9 | */ 10 | 11 | # include 12 | # include 13 | # include 14 | # include 15 | # include 16 | #include 17 | 18 | using namespace std; 19 | 20 | # include "asa159.hpp" 21 | 22 | //****************************************************************************80 23 | 24 | int i4_max ( int i1, int i2 ) 25 | 26 | //****************************************************************************80 27 | // 28 | // Purpose: 29 | // 30 | // I4_MAX returns the maximum of two I4's. 31 | // 32 | // Licensing: 33 | // 34 | // This code is distributed under the GNU LGPL license. 35 | // 36 | // Modified: 37 | // 38 | // 13 October 1998 39 | // 40 | // Author: 41 | // 42 | // John Burkardt 43 | // 44 | // Parameters: 45 | // 46 | // Input, int I1, I2, are two integers to be compared. 47 | // 48 | // Output, int I4_MAX, the larger of I1 and I2. 
49 | // 50 | { 51 | int value; 52 | 53 | if ( i2 < i1 ) 54 | { 55 | value = i1; 56 | } 57 | else 58 | { 59 | value = i2; 60 | } 61 | return value; 62 | } 63 | //****************************************************************************80 64 | 65 | int i4_min ( int i1, int i2 ) 66 | 67 | //****************************************************************************80 68 | // 69 | // Purpose: 70 | // 71 | // I4_MIN returns the minimum of two I4's. 72 | // 73 | // Licensing: 74 | // 75 | // This code is distributed under the GNU LGPL license. 76 | // 77 | // Modified: 78 | // 79 | // 13 October 1998 80 | // 81 | // Author: 82 | // 83 | // John Burkardt 84 | // 85 | // Parameters: 86 | // 87 | // Input, int I1, I2, two integers to be compared. 88 | // 89 | // Output, int I4_MIN, the smaller of I1 and I2. 90 | // 91 | { 92 | int value; 93 | 94 | if ( i1 < i2 ) 95 | { 96 | value = i1; 97 | } 98 | else 99 | { 100 | value = i2; 101 | } 102 | return value; 103 | } 104 | //****************************************************************************80 105 | 106 | void i4mat_print ( int m, int n, int a[], string title ) 107 | 108 | //****************************************************************************80 109 | // 110 | // Purpose: 111 | // 112 | // I4MAT_PRINT prints an I4MAT. 113 | // 114 | // Discussion: 115 | // 116 | // An I4MAT is an MxN array of I4's, stored by (I,J) -> [I+J*M]. 117 | // 118 | // Licensing: 119 | // 120 | // This code is distributed under the GNU LGPL license. 121 | // 122 | // Modified: 123 | // 124 | // 10 September 2009 125 | // 126 | // Author: 127 | // 128 | // John Burkardt 129 | // 130 | // Parameters: 131 | // 132 | // Input, int M, the number of rows in A. 133 | // 134 | // Input, int N, the number of columns in A. 135 | // 136 | // Input, int A[M*N], the M by N matrix. 137 | // 138 | // Input, string TITLE, a title. 139 | // 140 | { 141 | i4mat_print_some ( m, n, a, 1, 1, m, n, title ); 142 | 143 | return; 144 | } 145 | //****************************************************************************80 146 | 147 | void i4mat_print_some ( int m, int n, int a[], int ilo, int jlo, int ihi, 148 | int jhi, string title ) 149 | 150 | //****************************************************************************80 151 | // 152 | // Purpose: 153 | // 154 | // I4MAT_PRINT_SOME prints some of an I4MAT. 155 | // 156 | // Discussion: 157 | // 158 | // An I4MAT is an MxN array of I4's, stored by (I,J) -> [I+J*M]. 159 | // 160 | // Licensing: 161 | // 162 | // This code is distributed under the GNU LGPL license. 163 | // 164 | // Modified: 165 | // 166 | // 20 August 2010 167 | // 168 | // Author: 169 | // 170 | // John Burkardt 171 | // 172 | // Parameters: 173 | // 174 | // Input, int M, the number of rows of the matrix. 175 | // M must be positive. 176 | // 177 | // Input, int N, the number of columns of the matrix. 178 | // N must be positive. 179 | // 180 | // Input, int A[M*N], the matrix. 181 | // 182 | // Input, int ILO, JLO, IHI, JHI, designate the first row and 183 | // column, and the last row and column to be printed. 184 | // 185 | // Input, string TITLE, a title. 186 | // 187 | { 188 | # define INCX 10 189 | 190 | int i; 191 | int i2hi; 192 | int i2lo; 193 | int j; 194 | int j2hi; 195 | int j2lo; 196 | 197 | cout << "\n"; 198 | cout << title << "\n"; 199 | 200 | if ( m <= 0 || n <= 0 ) 201 | { 202 | cout << "\n"; 203 | cout << " (None)\n"; 204 | return; 205 | } 206 | // 207 | // Print the columns of the matrix, in strips of INCX. 
208 | // 209 | for ( j2lo = jlo; j2lo <= jhi; j2lo = j2lo + INCX ) 210 | { 211 | j2hi = j2lo + INCX - 1; 212 | if ( n < j2hi ) 213 | { 214 | j2hi = n; 215 | } 216 | if ( jhi < j2hi ) 217 | { 218 | j2hi = jhi; 219 | } 220 | 221 | cout << "\n"; 222 | // 223 | // For each column J in the current range... 224 | // 225 | // Write the header. 226 | // 227 | cout << " Col:"; 228 | for ( j = j2lo; j <= j2hi; j++ ) 229 | { 230 | cout << " " << setw(6) << j - 1; 231 | } 232 | cout << "\n"; 233 | cout << " Row\n"; 234 | cout << "\n"; 235 | // 236 | // Determine the range of the rows in this strip. 237 | // 238 | if ( 1 < ilo ) 239 | { 240 | i2lo = ilo; 241 | } 242 | else 243 | { 244 | i2lo = 1; 245 | } 246 | if ( ihi < m ) 247 | { 248 | i2hi = ihi; 249 | } 250 | else 251 | { 252 | i2hi = m; 253 | } 254 | 255 | for ( i = i2lo; i <= i2hi; i++ ) 256 | { 257 | // 258 | // Print out (up to INCX) entries in row I, that lie in the current strip. 259 | // 260 | cout << setw(5) << i - 1 << ":"; 261 | for ( j = j2lo; j <= j2hi; j++ ) 262 | { 263 | cout << " " << setw(6) << a[i-1+(j-1)*m]; 264 | } 265 | cout << "\n"; 266 | } 267 | } 268 | 269 | return; 270 | # undef INCX 271 | } 272 | //****************************************************************************80 273 | 274 | void i4vec_print ( int n, int a[], string title ) 275 | 276 | //****************************************************************************80 277 | // 278 | // Purpose: 279 | // 280 | // I4VEC_PRINT prints an I4VEC. 281 | // 282 | // Discussion: 283 | // 284 | // An I4VEC is a vector of I4's. 285 | // 286 | // Licensing: 287 | // 288 | // This code is distributed under the GNU LGPL license. 289 | // 290 | // Modified: 291 | // 292 | // 14 November 2003 293 | // 294 | // Author: 295 | // 296 | // John Burkardt 297 | // 298 | // Parameters: 299 | // 300 | // Input, int N, the number of components of the vector. 301 | // 302 | // Input, int A[N], the vector to be printed. 303 | // 304 | // Input, string TITLE, a title. 305 | // 306 | { 307 | int i; 308 | 309 | cout << "\n"; 310 | cout << title << "\n"; 311 | cout << "\n"; 312 | for ( i = 0; i < n; i++ ) 313 | { 314 | cout << " " << setw(8) << i 315 | << ": " << setw(8) << a[i] << "\n"; 316 | } 317 | return; 318 | } 319 | //****************************************************************************80 320 | 321 | int i4vec_sum ( int n, int a[] ) 322 | 323 | //****************************************************************************80 324 | // 325 | // Purpose: 326 | // 327 | // I4VEC_SUM sums the entries of an I4VEC. 328 | // 329 | // Discussion: 330 | // 331 | // An I4VEC is a vector of I4's. 332 | // 333 | // Example: 334 | // 335 | // Input: 336 | // 337 | // A = ( 1, 2, 3, 4 ) 338 | // 339 | // Output: 340 | // 341 | // I4VEC_SUM = 10 342 | // 343 | // Licensing: 344 | // 345 | // This code is distributed under the GNU LGPL license. 346 | // 347 | // Modified: 348 | // 349 | // 26 May 1999 350 | // 351 | // Author: 352 | // 353 | // John Burkardt 354 | // 355 | // Parameters: 356 | // 357 | // Input, int N, the number of entries in the vector. 358 | // 359 | // Input, int A[N], the vector to be summed. 360 | // 361 | // Output, int I4VEC_SUM, the sum of the entries of A. 
362 | // 363 | { 364 | int i; 365 | int sum; 366 | 367 | sum = 0; 368 | for ( i = 0; i < n; i++ ) 369 | { 370 | sum = sum + a[i]; 371 | } 372 | 373 | return sum; 374 | } 375 | //****************************************************************************80 376 | 377 | double r8_uniform_01 ( int *seed ) 378 | 379 | //****************************************************************************80 380 | // 381 | // Purpose: 382 | // 383 | // R8_UNIFORM_01 is a unit pseudorandom R8. 384 | // 385 | // Discussion: 386 | // 387 | // This routine implements the recursion 388 | // 389 | // seed = 16807 * seed mod ( 2**31 - 1 ) 390 | // unif = seed / ( 2**31 - 1 ) 391 | // 392 | // The integer arithmetic never requires more than 32 bits, 393 | // including a sign bit. 394 | // 395 | // Licensing: 396 | // 397 | // This code is distributed under the GNU LGPL license. 398 | // 399 | // Modified: 400 | // 401 | // 11 August 2004 402 | // 403 | // Reference: 404 | // 405 | // Paul Bratley, Bennett Fox, Linus Schrage, 406 | // A Guide to Simulation, 407 | // Springer Verlag, pages 201-202, 1983. 408 | // 409 | // Bennett Fox, 410 | // Algorithm 647: 411 | // Implementation and Relative Efficiency of Quasirandom 412 | // Sequence Generators, 413 | // ACM Transactions on Mathematical Software, 414 | // Volume 12, Number 4, pages 362-376, 1986. 415 | // 416 | // Parameters: 417 | // 418 | // Input/output, int *SEED, a seed for the random number generator. 419 | // 420 | // Output, double R8_UNIFORM_01, a new pseudorandom variate, strictly between 421 | // 0 and 1. 422 | // 423 | { 424 | int k; 425 | double r; 426 | 427 | k = *seed / 127773; 428 | 429 | *seed = 16807 * ( *seed - k * 127773 ) - k * 2836; 430 | 431 | if ( *seed < 0 ) 432 | { 433 | *seed = *seed + 2147483647; 434 | } 435 | 436 | r = ( double ) ( *seed ) * 4.656612875E-10; 437 | 438 | return r; 439 | } 440 | //****************************************************************************80 441 | 442 | void rcont2 ( int nrow, int ncol, int nrowt[], int ncolt[], bool *key, 443 | int *seed, int matrix[], int *ierror ) 444 | 445 | //****************************************************************************80 446 | // 447 | // Purpose: 448 | // 449 | // RCONT2 constructs a random two-way contingency table with given sums. 450 | // 451 | // Discussion: 452 | // 453 | // It is possible to specify row and column sum vectors which 454 | // correspond to no table at all. As far as I can see, this routine does 455 | // not detect such a case. 456 | // 457 | // Licensing: 458 | // 459 | // This code is distributed under the GNU LGPL license. 460 | // 461 | // Modified: 462 | // 463 | // 10 March 2009 464 | // 465 | // Author: 466 | // 467 | // Original FORTRAN77 version by WM Patefield. 468 | // C++ version by John Burkardt. 469 | // 470 | // Reference: 471 | // 472 | // WM Patefield, 473 | // Algorithm AS 159: 474 | // An Efficient Method of Generating RXC Tables with 475 | // Given Row and Column Totals, 476 | // Applied Statistics, 477 | // Volume 30, Number 1, 1981, pages 91-97. 478 | // 479 | // Parameters: 480 | // 481 | // Input, int NROW, NCOL, the number of rows and columns 482 | // in the table. NROW and NCOL must each be at least 2. 483 | // 484 | // Input, int NROWT[NROW], NCOLT[NCOL], the row and column 485 | // sums. Each entry must be positive. 486 | // 487 | // Input/output, bool *KEY, a flag that indicates whether data has 488 | // been initialized for this problem. Set KEY = .FALSE. before the first 489 | // call. 
490 | // 491 | // Input/output, int *SEED, a seed for the random number generator. 492 | // 493 | // Output, int MATRIX[NROW*NCOL], the matrix. 494 | // 495 | // Output, int *IERROR, an error flag, which is returned 496 | // as 0 if no error occurred. 497 | // 498 | { 499 | bool done1; 500 | bool done2; 501 | static double *fact = NULL; 502 | int i; 503 | int ia; 504 | int iap; 505 | int ib; 506 | int ic; 507 | int id; 508 | int idp; 509 | int ie; 510 | int igp; 511 | int ihp; 512 | int ii; 513 | int iip; 514 | int j; 515 | int jc; 516 | int *jwork; 517 | int l; 518 | bool lsm; 519 | bool lsp; 520 | int m; 521 | int nll; 522 | int nlm; 523 | int nlmp; 524 | int nrowtl; 525 | static int ntotal = 0; 526 | double r; 527 | double sumprb; 528 | double x; 529 | double y; 530 | 531 | *ierror = 0; 532 | // 533 | // On user's signal, set up the factorial table. 534 | // 535 | if ( !(*key) ) 536 | { 537 | 538 | *key = true; 539 | 540 | if ( nrow <= 1 ) 541 | { 542 | cout << "\n"; 543 | cout << "RCONT - Fatal error!\n"; 544 | cout << " Input number of rows is less than 2.\n"; 545 | *ierror = 1; 546 | return; 547 | } 548 | 549 | if ( ncol <= 1 ) 550 | { 551 | cout << "\n"; 552 | cout << "RCONT - Fatal error!\n"; 553 | cout << " The number of columns is less than 2.\n"; 554 | *ierror = 2; 555 | return; 556 | } 557 | 558 | for ( i = 0; i < nrow; i++ ) 559 | { 560 | if ( nrowt[i] <= 0 ) 561 | { 562 | cout << "\n"; 563 | cout << "RCONT - Fatal error!\n"; 564 | cout << " An entry in the row sum vector is not positive.\n"; 565 | *ierror = 3; 566 | return; 567 | } 568 | } 569 | 570 | for ( j = 0; j < ncol; j++ ) 571 | { 572 | if ( ncolt[j] <= 0 ) 573 | { 574 | cout << "\n"; 575 | cout << "RCONT - Fatal error!\n"; 576 | cout << " An entry in the column sum vector is not positive.\n"; 577 | *ierror = 4; 578 | return; 579 | } 580 | } 581 | 582 | if ( i4vec_sum ( ncol, ncolt ) != i4vec_sum ( nrow, nrowt ) ) 583 | { 584 | cout << "\n"; 585 | cout << "RCONT - Fatal error!\n"; 586 | cout << " The row and column sum vectors do not have the same sum.\n"; 587 | *ierror = 6; 588 | return; 589 | } 590 | 591 | ntotal = i4vec_sum ( ncol, ncolt ); 592 | 593 | if ( fact ) 594 | { 595 | delete [] fact; 596 | } 597 | 598 | fact = new double[ntotal+1]; 599 | // 600 | // Calculate log-factorials. 601 | // 602 | x = 0.0; 603 | fact[0] = 0.0; 604 | for ( i = 1; i <= ntotal; i++ ) 605 | { 606 | x = x + log ( ( double ) ( i ) ); 607 | fact[i] = x; 608 | } 609 | 610 | } 611 | // 612 | // Construct a random matrix. 613 | // 614 | jwork = new int[ncol]; 615 | 616 | for ( i = 0; i < ncol - 1; i++ ) 617 | { 618 | jwork[i] = ncolt[i]; 619 | } 620 | 621 | jc = ntotal; 622 | 623 | for ( l = 0; l < nrow - 1; l++ ) 624 | { 625 | nrowtl = nrowt[l]; 626 | ia = nrowtl; 627 | ic = jc; 628 | jc = jc - nrowtl; 629 | 630 | for ( m = 0; m < ncol - 1; m++ ) 631 | { 632 | id = jwork[m]; 633 | ie = ic; 634 | ic = ic - id; 635 | ib = ie - ia; 636 | ii = ib - id; 637 | // 638 | // Test for zero entries in matrix. 639 | // 640 | if ( ie == 0 ) 641 | { 642 | ia = 0; 643 | for ( j = m; j < ncol; j++ ) 644 | { 645 | matrix[l+j*nrow] = 0; 646 | } 647 | break; 648 | } 649 | // 650 | // Generate a pseudo-random number. 651 | // 652 | r = r8_uniform_01 ( seed ); 653 | // 654 | // Compute the conditional expected value of MATRIX(L,M). 
655 | // 656 | done1 = false; 657 | 658 | for ( ; ; ) 659 | { 660 | nlm = ( int ) ( ( double ) ( ia * id ) / ( double ) ( ie ) + 0.5 ); 661 | iap = ia + 1; 662 | idp = id + 1; 663 | igp = idp - nlm; 664 | ihp = iap - nlm; 665 | nlmp = nlm + 1; 666 | iip = ii + nlmp; 667 | x = exp ( fact[iap-1] + fact[ib] + fact[ic] + fact[idp-1] - 668 | fact[ie] - fact[nlmp-1] - fact[igp-1] - fact[ihp-1] - fact[iip-1] ); 669 | 670 | if ( r <= x ) 671 | { 672 | break; 673 | } 674 | 675 | sumprb = x; 676 | y = x; 677 | nll = nlm; 678 | lsp = false; 679 | lsm = false; 680 | // 681 | // Increment entry in row L, column M. 682 | // 683 | while ( !lsp ) 684 | { 685 | j = ( id - nlm ) * ( ia - nlm ); 686 | 687 | if ( j == 0 ) 688 | { 689 | lsp = true; 690 | } 691 | else 692 | { 693 | nlm = nlm + 1; 694 | x = x * ( double ) ( j ) / ( double ) ( nlm * ( ii + nlm ) ); 695 | sumprb = sumprb + x; 696 | 697 | if ( r <= sumprb ) 698 | { 699 | done1 = true; 700 | break; 701 | } 702 | } 703 | 704 | done2 = false; 705 | 706 | while ( !lsm ) 707 | { 708 | // 709 | // Decrement the entry in row L, column M. 710 | // 711 | j = nll * ( ii + nll ); 712 | 713 | if ( j == 0 ) 714 | { 715 | lsm = true; 716 | break; 717 | } 718 | 719 | nll = nll - 1; 720 | y = y * ( double ) ( j ) / ( double ) ( ( id - nll ) * ( ia - nll ) ); 721 | sumprb = sumprb + y; 722 | 723 | if ( r <= sumprb ) 724 | { 725 | nlm = nll; 726 | done2 = true; 727 | break; 728 | } 729 | 730 | if ( !lsp ) 731 | { 732 | break; 733 | } 734 | 735 | } 736 | 737 | if ( done2 ) 738 | { 739 | break; 740 | } 741 | 742 | } 743 | 744 | if ( done1 ) 745 | { 746 | break; 747 | } 748 | 749 | if ( done2 ) 750 | { 751 | break; 752 | } 753 | 754 | r = r8_uniform_01 ( seed ); 755 | r = sumprb * r; 756 | 757 | } 758 | 759 | matrix[l+m*nrow] = nlm; 760 | ia = ia - nlm; 761 | jwork[m] = jwork[m] - nlm; 762 | 763 | } 764 | matrix[l+(ncol-1)*nrow] = ia; 765 | } 766 | // 767 | // Compute the last row. 768 | // 769 | for ( j = 0; j < ncol - 1; j++ ) 770 | { 771 | matrix[nrow-1+j*nrow] = jwork[j]; 772 | } 773 | matrix[nrow-1+(ncol-1)*nrow] = ib - matrix[nrow-1+(ncol-2)*nrow]; 774 | 775 | delete [] jwork; 776 | 777 | return; 778 | } 779 | //****************************************************************************80 780 | 781 | void timestamp ( ) 782 | 783 | //****************************************************************************80 784 | // 785 | // Purpose: 786 | // 787 | // TIMESTAMP prints the current YMDHMS date as a time stamp. 788 | // 789 | // Example: 790 | // 791 | // May 31 2001 09:45:54 AM 792 | // 793 | // Licensing: 794 | // 795 | // This code is distributed under the GNU LGPL license. 
796 | // 797 | // Modified: 798 | // 799 | // 03 October 2003 800 | // 801 | // Author: 802 | // 803 | // John Burkardt 804 | // 805 | // Parameters: 806 | // 807 | // None 808 | // 809 | { 810 | # define TIME_SIZE 40 811 | 812 | static char time_buffer[TIME_SIZE]; 813 | const struct tm *tm; 814 | time_t now; 815 | 816 | now = time ( NULL ); 817 | tm = localtime ( &now ); 818 | 819 | strftime ( time_buffer, TIME_SIZE, "%d %B %Y %I:%M:%S %p", tm ); 820 | 821 | cout << time_buffer << "\n"; 822 | 823 | return; 824 | # undef TIME_SIZE 825 | } 826 | -------------------------------------------------------------------------------- /phik/report.py: -------------------------------------------------------------------------------- 1 | """Project: PhiK - correlation analyzer library 2 | 3 | Created: 2018/09/06 4 | 5 | Description: 6 | Functions to create nice correlation overview and matrix plots 7 | 8 | Authors: 9 | KPMG Advanced Analytics & Big Data team, Amstelveen, The Netherlands 10 | 11 | Redistribution and use in source and binary forms, with or without 12 | modification, are permitted according to the terms listed in the file 13 | LICENSE. 14 | """ 15 | import itertools 16 | import os 17 | from typing import Callable, Dict, Tuple, Union 18 | 19 | import matplotlib.pyplot as plt 20 | import numpy as np 21 | import pandas as pd 22 | from matplotlib import colors 23 | from matplotlib.backends.backend_pdf import PdfPages 24 | 25 | from .binning import bin_data 26 | from .data_quality import dq_check_nunique_values 27 | from .outliers import outlier_significance_matrix_from_rebinned_df 28 | from .phik import global_phik_from_rebinned_df, phik_from_rebinned_df 29 | from .significance import significance_from_rebinned_df 30 | from .utils import guess_interval_cols 31 | 32 | 33 | def plot_hist_and_func( 34 | data: Union[list, np.ndarray, pd.Series], 35 | func: Callable, 36 | funcparams, 37 | xbins=False, 38 | labels=None, 39 | xlabel="", 40 | ylabel="", 41 | title="", 42 | xlimit=None, 43 | alpha=1, 44 | ): 45 | """ 46 | Create a histogram of the provided data and overlay with a function. 
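
    A minimal usage sketch (values, mean and std are hypothetical; any density of
    the form f(x, a, b, ...) can be overlaid in the same way)::

        from scipy import stats

        plot_hist_and_func(values, stats.norm.pdf, [mean, std], xbins=50,
                           labels=['data', 'normal fit'])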
47 | 48 | :param list data: data 49 | :param function func: function of the type f(x, a, b, c) where parameters a, b, c are optional 50 | :param list funcparams: parameter values to be given to the function, to be specified as [a, b, c] 51 | :param xbins: specify binning of histogram, either by giving the number of bins or a list of bin edges 52 | :param labels: labels of histogram and function to be used in the legend 53 | :param xlabel: figure xlabel 54 | :param ylabel: figure ylabel 55 | :param title: figure title 56 | :param xlimit: x limits figure 57 | :param alpha: alpha histogram 58 | :return: 59 | """ 60 | if labels is None: 61 | labels = ["", ""] 62 | 63 | # If binning is not specified, create binning here 64 | if not np.any(xbins) and not xlimit: 65 | xmin = np.min(data) 66 | xmax = np.max(data) 67 | xnbins = int(len(data) / 50 + 1) 68 | xbins = np.linspace(xmin, xmax, xnbins) 69 | elif type(xbins) == int or type(xbins) == float: 70 | xmin = np.min(data) 71 | if xlimit: 72 | xmin = xlimit[0] 73 | xmax = np.max(data) 74 | if xmax: 75 | xmax = xlimit[1] 76 | xnbins = int(xbins + 1) 77 | xbins = np.linspace(xmin, xmax, xnbins) 78 | 79 | # Plot a histogram of the data 80 | plt.hist(data, bins=xbins, label=labels[0], alpha=alpha) 81 | 82 | # Find bin centers for plotting the function 83 | xvals = xbins[:-1] + np.diff(xbins)[0] / 2 84 | bw = xbins[1] - xbins[0] 85 | # Plot the fit 86 | plt.plot( 87 | xvals, len(data) * bw * func(xvals, *funcparams), linewidth=2, label=labels[1] 88 | ) 89 | 90 | if xlabel: 91 | plt.xlabel(xlabel) 92 | if ylabel: 93 | plt.ylabel(ylabel) 94 | if title: 95 | plt.title(title) 96 | 97 | if len(labels[0]) > 0: 98 | plt.legend() 99 | 100 | 101 | def plot_correlation_matrix( 102 | matrix_colors: np.ndarray, 103 | x_labels: list, 104 | y_labels: list, 105 | pdf_file_name: str = "", 106 | title: str = "correlation", 107 | vmin: float = -1, 108 | vmax: float = 1, 109 | color_map: str = "RdYlGn", 110 | x_label: str = "", 111 | y_label: str = "", 112 | top: int = 20, 113 | matrix_numbers: np.ndarray = None, 114 | print_both_numbers: bool = True, 115 | figsize: tuple = (7, 5), 116 | usetex: bool = False, 117 | identity_layout: bool = True, 118 | fontsize_factor: float = 1, 119 | ) -> None: 120 | """Create and plot correlation matrix. 121 | 122 | Copied with permission from the eskapade package (pip install eskapade) 123 | 124 | :param matrix_colors: input correlation matrix 125 | :param list x_labels: Labels for histogram x-axis bins 126 | :param list y_labels: Labels for histogram y-axis bins 127 | :param str pdf_file_name: if set, will store the plot in a pdf file 128 | :param str title: if set, title of the plot 129 | :param float vmin: minimum value of color legend (default is -1) 130 | :param float vmax: maximum value of color legend (default is +1) 131 | :param str x_label: Label for histogram x-axis 132 | :param str y_label: Label for histogram y-axis 133 | :param str color_map: color map passed to matplotlib pcolormesh. (default is 'RdYlGn') 134 | :param int top: only print the top 20 characters of x-labels and y-labels. (default is 20) 135 | :param matrix_numbers: input matrix used for plotting numbers. 
(default it matrix_colors) 136 | :param identity_layout: Plot diagonal from right top to bottom left (True) or bottom left to top right (False) 137 | """ 138 | if not isinstance(matrix_colors, np.ndarray): 139 | raise TypeError("matrix_colors is not a numpy array.") 140 | 141 | # basic matrix checks 142 | assert (matrix_colors.shape[0] == len(y_labels)) or ( 143 | matrix_colors.shape[0] + 1 == len(y_labels) 144 | ), "matrix_colors shape inconsistent with number of y-labels" 145 | assert (matrix_colors.shape[1] == len(x_labels)) or ( 146 | matrix_colors.shape[1] + 1 == len(x_labels) 147 | ), "matrix_colors shape inconsistent with number of x-labels" 148 | if matrix_numbers is None: 149 | matrix_numbers = matrix_colors 150 | print_both_numbers = False # only one set of numbers possible 151 | else: 152 | assert matrix_numbers.shape[0] == len( 153 | y_labels 154 | ), "matrix_numbers shape inconsistent with number of y-labels" 155 | assert matrix_numbers.shape[1] == len( 156 | x_labels 157 | ), "matrix_numbers shape inconsistent with number of x-labels" 158 | 159 | if identity_layout: 160 | matrix_colors = np.array([a[::-1] for a in matrix_colors]) 161 | x_labels = x_labels[::-1] 162 | if matrix_numbers is not None: 163 | matrix_numbers = np.array([a[::-1] for a in matrix_numbers]) 164 | 165 | plt.rc("text", usetex=usetex) 166 | 167 | fig, ax = plt.subplots(figsize=figsize) 168 | # cmap = 'RdYlGn' #'YlGn' 169 | norm = colors.Normalize(vmin=vmin, vmax=vmax) 170 | img = ax.pcolormesh( 171 | matrix_colors, cmap=color_map, edgecolor="w", linewidth=1, norm=norm 172 | ) 173 | 174 | # set x-axis properties 175 | def tick(lab): 176 | """Get tick.""" 177 | if isinstance(lab, (float, int)): 178 | lab = "NaN" if np.isnan(lab) else "{0:.0f}".format(lab) 179 | lab = str(lab) 180 | if len(lab) > top: 181 | lab = lab[:17] + "..." 182 | return lab 183 | 184 | # reduce default fontsizes in case too many labels? 
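    # Note: if a label list has one more entry than the corresponding matrix
    # dimension (e.g. bin edges), the ticks below are placed on the cell
    # boundaries; otherwise they are centred on the cells (offset by 0.5).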
185 | # nlabs = max(len(y_labels), len(x_labels)) 186 | 187 | # axis ticks and tick labels 188 | if len(x_labels) == matrix_colors.shape[1] + 1: 189 | ax.set_xticks(np.arange(len(x_labels))) 190 | else: 191 | ax.set_xticks(np.arange(len(x_labels)) + 0.5) 192 | ax.set_xticklabels( 193 | [tick(lab) for lab in x_labels], 194 | rotation="vertical", 195 | fontsize=10 * fontsize_factor, 196 | ) 197 | 198 | if len(y_labels) == matrix_colors.shape[0] + 1: 199 | ax.set_yticks(np.arange(len(y_labels))) 200 | else: 201 | ax.set_yticks(np.arange(len(y_labels)) + 0.5) 202 | ax.set_yticklabels( 203 | [tick(lab) for lab in y_labels], 204 | rotation="horizontal", 205 | fontsize=10 * fontsize_factor, 206 | ) 207 | 208 | # Turn ticks off in case no labels are provided 209 | if len(x_labels) == 1 and len(x_labels[0]) == 0: 210 | plt.tick_params( 211 | axis="x", # changes apply to the x-axis 212 | which="both", # both major and minor ticks are affected 213 | bottom=False, # ticks along the bottom edge are off 214 | top=False, # ticks along the top edge are off 215 | labelbottom=False, 216 | ) 217 | if len(y_labels) == 1 and len(y_labels[0]) == 0: 218 | plt.tick_params( 219 | axis="y", # changes apply to the x-axis 220 | which="both", # both major and minor ticks are affected 221 | left=False, # ticks along the bottom edge are off 222 | right=False, # ticks along the top edge are off 223 | labelbottom=False, 224 | ) 225 | 226 | # make plot look pretty 227 | ax.set_title(title, fontsize=14 * fontsize_factor) 228 | if x_label: 229 | ax.set_xlabel(x_label, fontsize=12 * fontsize_factor) 230 | if y_label: 231 | ax.set_ylabel(y_label, fontsize=12 * fontsize_factor) 232 | 233 | fig.colorbar(img) 234 | 235 | # annotate with correlation values 236 | numbers_set = ( 237 | [matrix_numbers] if not print_both_numbers else [matrix_numbers, matrix_colors] 238 | ) 239 | for i in range(matrix_numbers.shape[1]): 240 | for j in range(matrix_numbers.shape[0]): 241 | point_color = float(matrix_colors[j][i]) 242 | white_cond = ( 243 | (point_color < 0.7 * vmin) 244 | or (point_color >= 0.7 * vmax) 245 | or np.isnan(point_color) 246 | ) 247 | y_offset = 0.5 248 | for m, matrix in enumerate(numbers_set): 249 | if print_both_numbers: 250 | if m == 0: 251 | y_offset = 0.7 252 | elif m == 1: 253 | y_offset = 0.25 254 | point = float(matrix[j][i]) 255 | label = "NaN" if np.isnan(point) else "{0:.2f}".format(point) 256 | color = "w" if white_cond else "k" 257 | ax.annotate( 258 | label, 259 | xy=(i + 0.5, j + y_offset), 260 | color=color, 261 | horizontalalignment="center", 262 | verticalalignment="center", 263 | fontsize=10 * fontsize_factor, 264 | ) 265 | 266 | plt.tight_layout() 267 | 268 | # save plot in file 269 | if pdf_file_name: 270 | pdf_file = PdfPages(pdf_file_name) 271 | plt.savefig(pdf_file, format="pdf", bbox_inches="tight", pad_inches=0) 272 | plt.close() 273 | pdf_file.close() 274 | 275 | 276 | def correlation_report( 277 | data: pd.DataFrame, 278 | interval_cols: list = None, 279 | bins=10, 280 | quantile: bool = False, 281 | do_outliers: bool = True, 282 | pdf_file_name: str = "", 283 | significance_threshold: float = 3, 284 | correlation_threshold: float = 0.5, 285 | noise_correction: bool = True, 286 | store_each_plot: bool = False, 287 | lambda_significance: str = "log-likelihood", 288 | simulation_method: str = "multinominal", 289 | nsim_chi2: int = 1000, 290 | significance_method: str = "asymptotic", 291 | CI_method: str = "poisson", 292 | verbose: bool = True, 293 | plot_phik_matrix_kws: dict = {}, 294 | 
plot_global_phik_kws: dict = {}, 295 | plot_significance_matrix_kws: dict = {}, 296 | plot_outlier_significance_kws: dict = {}, 297 | ) -> Tuple[pd.DataFrame, pd.DataFrame, Dict[str, pd.DataFrame], Dict[str, str]]: 298 | """ 299 | Create a correlation report for the given dataset. 300 | 301 | The following quantities are calculated: 302 | 303 | * The phik correlation matrix 304 | * The significance matrix 305 | * The outlier significances measured in pairs of variables. (optional) 306 | 307 | :param data: input dataframe 308 | :param interval_cols: list of columns names of columns containing interval data 309 | :param bins: number of bins, or a list of bin edges (same for all columns), or a dictionary where per column the bins are specified. (default=10)\ 310 | E.g.: bins = {'mileage':5, 'driver_age':[18,25,35,45,55,65,125]} 311 | :param quantile: when bins is an integer, uniform bins (False) or bins based on quantiles (True) 312 | :param do_outliers: Evaluate outlier significances of variable pairs (when True) 313 | :param pdf_file_name: file name of the pdf where the results are stored 314 | :param store_each_plot: store each plot in folder derived from pdf_file_name. If true, single pdf is no longer stored. Default is false. 315 | :param significance_threshold: evaluate outlier significance for all variable pairs with a significance of \ 316 | uncorrelation higher than this threshold 317 | :param correlation_threshold: evaluate outlier significance for all variable pairs with a phik correlation \ 318 | higher than this threshold 319 | :param noise_correction: Apply noise correction in phik calculation 320 | :param lambda_significance: test statistic used in significance calculation. Options: [pearson, log-likelihood] 321 | :param simulation_method: sampling method using in significance calculation. Options: [mutlinominal, \ 322 | row_product_multinominal, col_product_multinominal, hypergeometric] 323 | :param nsim_chi2: number of simulated datasets in significance calculation. 324 | :param significance_method: method for significance calculation. Options: [asymptotic, MC, hybrid] 325 | :param CI_method: method for uncertainty calculation for outlier significance calculation. Options: [poisson, \ 326 | exact_poisson] 327 | :param bool verbose: if False, do not print all interval columns that are guessed 328 | :param dict plot_phik_matrix_kws: kwargs passed to plot_correlation_matrix() to plot the phik matrix. \ 329 | updates the default plotting values. 330 | :param dict plot_global_phik_kws: kwargs passed to plot_correlation_matrix() to plot the global-phik vector. \ 331 | updates the default plotting values. 332 | :param dict plot_significance_matrix_kws: kwargs passed to plot_correlation_matrix() to plot significance matrix. \ 333 | updates the default plotting values. 334 | :param dict plot_outlier_significance_kws: kwargs passed to plot_correlation_matrix() to plot the outlier \ 335 | significances. updates the default plotting values. 
336 | :returns: phik_matrix (pd.DataFrame), global_phik (np.array), significance_matrix (pd.DataFrame), \ 337 | outliers_overview (dictionary), output_files (dictionary) 338 | """ 339 | 340 | if interval_cols is None: 341 | interval_cols = guess_interval_cols(data, verbose) 342 | 343 | data_clean, interval_cols_clean = dq_check_nunique_values(data, interval_cols) 344 | 345 | # create pdf(s) to save plots 346 | output_files = dict() 347 | plot_file_name = "" 348 | if store_each_plot: 349 | folder = os.path.dirname(pdf_file_name) 350 | folder += "/" if folder else "./" 351 | # if each plot is stored, single overview file is no longer stored. 352 | # (b/c of problem with multiple PdfPages) 353 | pdf_file_name = "" 354 | if pdf_file_name: 355 | pdf_file = PdfPages(pdf_file_name) 356 | 357 | data_binned, binning_dict = bin_data( 358 | data_clean, interval_cols_clean, bins=bins, quantile=quantile, retbins=True 359 | ) 360 | 361 | ### 1. Phik 362 | if store_each_plot: 363 | plot_file_name = folder + "phik_matrix.pdf" 364 | output_files["phik_matrix"] = plot_file_name 365 | phik_matrix = phik_from_rebinned_df(data_binned, noise_correction) 366 | 367 | default_plot_phik_matrix = dict( 368 | x_labels=phik_matrix.columns, 369 | y_labels=phik_matrix.index, 370 | vmin=0, 371 | vmax=1, 372 | color_map="Blues", 373 | title=r"correlation $\phi_K$", 374 | fontsize_factor=1.5, 375 | figsize=(7, 5.5), 376 | pdf_file_name=plot_file_name, 377 | ) 378 | default_plot_phik_matrix.update(plot_phik_matrix_kws) 379 | plot_correlation_matrix(phik_matrix.values, **default_plot_phik_matrix) 380 | if pdf_file_name: 381 | plt.savefig(pdf_file, format="pdf", bbox_inches="tight", pad_inches=0) 382 | plt.show() 383 | 384 | ### 1b. global correlations 385 | if store_each_plot: 386 | plot_file_name = folder + "global_phik.pdf" 387 | output_files["global_phik"] = plot_file_name 388 | global_phik, global_labels = global_phik_from_rebinned_df( 389 | data_binned, noise_correction 390 | ) 391 | 392 | default_plot_global_phik = dict( 393 | x_labels=[""], 394 | y_labels=global_labels, 395 | vmin=0, 396 | vmax=1, 397 | figsize=(3.5, 4), 398 | color_map="Blues", 399 | title=r"$g_k$", 400 | fontsize_factor=1.5, 401 | pdf_file_name=plot_file_name, 402 | ) 403 | default_plot_global_phik.update(plot_global_phik_kws) 404 | plot_correlation_matrix(global_phik, **default_plot_global_phik) 405 | # plt.tight_layout() 406 | if pdf_file_name: 407 | plt.savefig(pdf_file, format="pdf", bbox_inches="tight", pad_inches=0) 408 | plt.show() 409 | 410 | ### 2. Significance 411 | if store_each_plot: 412 | plot_file_name = folder + "significance_matrix.pdf" 413 | output_files["significance_matrix"] = plot_file_name 414 | significance_matrix = significance_from_rebinned_df( 415 | data_binned, 416 | lambda_significance, 417 | simulation_method, 418 | nsim_chi2, 419 | significance_method, 420 | ) 421 | 422 | default_plot_significance_matrix = dict( 423 | x_labels=significance_matrix.columns, 424 | y_labels=significance_matrix.index, 425 | vmin=-5, 426 | vmax=5, 427 | title="significance", 428 | usetex=False, 429 | fontsize_factor=1.5, 430 | figsize=(7, 5.5), 431 | pdf_file_name=plot_file_name, 432 | ) 433 | default_plot_significance_matrix.update(plot_significance_matrix_kws) 434 | plot_correlation_matrix( 435 | significance_matrix.fillna(0).values, **default_plot_significance_matrix 436 | ) 437 | if pdf_file_name: 438 | plt.savefig(pdf_file, format="pdf", bbox_inches="tight", pad_inches=0) 439 | plt.show() 440 | 441 | ### 3. 
Outlier significance 442 | outliers_overview = {} 443 | if do_outliers: 444 | for i, comb in enumerate(itertools.combinations(data_binned.columns, 2)): 445 | c0, c1 = comb 446 | if ( 447 | abs(significance_matrix.loc[c0, c1]) < significance_threshold 448 | or phik_matrix.loc[c0, c1] < correlation_threshold 449 | ): 450 | continue 451 | 452 | zvalues_df = outlier_significance_matrix_from_rebinned_df( 453 | data_binned[[c0, c1]].copy(), binning_dict, CI_method=CI_method 454 | ) 455 | 456 | combi = ":".join(comb).replace(" ", "_") 457 | xlabels = zvalues_df.columns 458 | ylabels = zvalues_df.index 459 | xlabel = zvalues_df.columns.name 460 | ylabel = zvalues_df.index.name 461 | 462 | if store_each_plot: 463 | plot_file_name = folder + "pulls_{0:s}.pdf".format(combi) 464 | output_files[combi] = plot_file_name 465 | 466 | default_plot_outlier_significance = dict( 467 | x_labels=xlabels, 468 | y_labels=ylabels, 469 | x_label=xlabel, 470 | y_label=ylabel, 471 | vmin=-5, 472 | vmax=5, 473 | title="outlier significance", 474 | identity_layout=False, 475 | fontsize_factor=1.2, 476 | pdf_file_name=plot_file_name, 477 | ) 478 | default_plot_outlier_significance.update(plot_outlier_significance_kws) 479 | plot_correlation_matrix( 480 | zvalues_df.values, **default_plot_outlier_significance 481 | ) 482 | 483 | outliers_overview[combi] = zvalues_df 484 | 485 | if pdf_file_name: 486 | plt.savefig(pdf_file, format="pdf", bbox_inches="tight", pad_inches=0) 487 | plt.show() 488 | 489 | # save plots 490 | if pdf_file_name: 491 | output_files["all"] = pdf_file_name 492 | plt.close() 493 | pdf_file.close() 494 | 495 | return ( 496 | phik_matrix, 497 | global_phik, 498 | significance_matrix, 499 | outliers_overview, 500 | output_files, 501 | ) 502 | -------------------------------------------------------------------------------- /phik/significance.py: -------------------------------------------------------------------------------- 1 | """Project: PhiK - correlation analyzer library 2 | 3 | Created: 2018/09/05 4 | 5 | Description: 6 | Functions for doing the significance evaluation of an hypothesis test of variable independence 7 | using a contingency table. 8 | 9 | Authors: 10 | KPMG Advanced Analytics & Big Data team, Amstelveen, The Netherlands 11 | 12 | Redistribution and use in source and binary forms, with or without 13 | modification, are permitted according to the terms listed in the file 14 | LICENSE. 15 | """ 16 | from typing import Tuple, Union 17 | 18 | import numpy as np 19 | import pandas as pd 20 | import math 21 | import itertools 22 | import warnings 23 | 24 | from scipy import stats 25 | from scipy import special, optimize 26 | 27 | from phik import definitions as defs 28 | from .binning import bin_data, create_correlation_overview_table 29 | from .statistics import get_chi2_using_dependent_frequency_estimates 30 | from .statistics import estimate_ndof, theoretical_ndof 31 | from .simulation import sim_chi2_distribution 32 | from .data_quality import dq_check_nunique_values, dq_check_hist2d 33 | from .utils import array_like_to_dataframe, guess_interval_cols 34 | 35 | 36 | def fit_test_statistic_distribution( 37 | chi2s: Union[list, np.ndarray], nbins: int = 50 38 | ) -> Tuple[float, float, float, float]: 39 | """ 40 | Fit the hybrid chi2-distribution to the data to find f. 
41 | 42 | Perform a binned likelihood fit to the data to find the optimal value for the fraction f in 43 | h(x|f) = N * (f * chi2(x, ndof) + (1-f) * gauss(x, ndof, sqrt(ndof)) 44 | The parameter ndof is fixed in the fit using ndof = mean(x). The total number of datapoints N is also fixed. 45 | 46 | :param list chi2s: input data - a list of chi2 values 47 | :param int nbins: in order to fit the data a histogram is created with nbins number of bins 48 | :returns: f, ndof, sigma (width of gauss), bw (bin width) 49 | """ 50 | 51 | def myfunc(x, N, f, k, sigma): 52 | return N * (f * stats.chi2.pdf(x, k) + (1 - f) * stats.norm.pdf(x, k, sigma)) 53 | 54 | ffunc = lambda x, f: myfunc(x, nsim * bw, f, kmean, lsigma) 55 | 56 | def gtest(p, x, y): 57 | f = ffunc(x, *p) 58 | ll = f - special.xlogy(y, f) + special.gammaln(y + 1) 59 | return np.sqrt(ll) 60 | 61 | kmean = np.mean(chi2s) # NOTE: this is used to fix kmean in the fit! 62 | lsigma = np.sqrt(kmean) # NOTE: this is used to fix sigma in the fit! 63 | nsim = len(chi2s) # NOTE: this is used to fix N in fit (N=nsim*bw) ! 64 | 65 | yhist, xbounds = np.histogram(chi2s, bins=nbins) 66 | bw = xbounds[1] - xbounds[0] # NOTE: this is used to fix N in fit (N=nsim*bw) ! 67 | xhist = xbounds[:-1] + np.diff(xbounds) / 2 68 | 69 | initGuess = (1.0,) # starting value for parameter f 70 | res = optimize.least_squares( 71 | gtest, initGuess, bounds=((0.0,), (1,)), args=(xhist, yhist) 72 | ) 73 | 74 | return res.x[0], kmean, lsigma, bw 75 | 76 | 77 | def hfunc(x: float, N: float, f: float, k: float, sigma: float) -> float: 78 | """ 79 | Definition of the combined probability density function h(x) 80 | 81 | h(x|f) = N * (f * chi2(x, k) + (1-f) * gauss(x, k, sigma) 82 | 83 | :param float x: x 84 | :param float N: normalisation 85 | :param float f: fraction [0,1] 86 | :param float k: ndof of chi2 function and mean of gauss 87 | :param float sigma: width of gauss 88 | :return: h(x|f) 89 | """ 90 | return N * (f * stats.chi2.pdf(x, k) + (1 - f) * stats.norm.pdf(x, k, sigma)) 91 | 92 | 93 | def significance_from_chi2_ndof(chi2: float, ndof: float) -> Tuple[float, float]: 94 | """ 95 | Convert a chi2 into significance using knowledge about the number of degrees of freedom 96 | 97 | Conversion is done using asymptotic approximation. 98 | 99 | :param float chi2: chi2 value 100 | :param float ndof: number of degrees of freedom 101 | :returns: p_value, significance 102 | """ 103 | p_value = stats.chi2.sf(chi2, ndof) 104 | z_value = -stats.norm.ppf(p_value) 105 | 106 | # scenario where p_value is too small to evaluate Z 107 | # use Chernoff approximation for p-value upper bound 108 | # see: https://en.wikipedia.org/wiki/Chi-squared_distribution 109 | if p_value == 0: 110 | z = chi2 / ndof 111 | u = -math.log(2 * math.pi) - ndof * math.log(z) + ndof * (z - 1) 112 | z_value = math.sqrt(u - math.log(u)) 113 | 114 | return p_value, z_value 115 | 116 | 117 | def significance_from_chi2_asymptotic( 118 | values: np.ndarray, chi2: float 119 | ) -> Tuple[float, float]: 120 | """ 121 | Convert a chi2 into significance using knowledge about the number of degrees of freedom 122 | 123 | Convention is done using asymptotic approximation. 
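    (The number of degrees of freedom is the theoretical value for the
    contingency table, (r-1)*(c-1) for an r x c table, as given by theoretical_ndof.)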
124 | 125 | :param float chi2: chi2 value 126 | :param float ndof: number of degrees of freedom 127 | :returns: p_value, significance 128 | """ 129 | 130 | ndof = theoretical_ndof(values) 131 | p_value, z_value = significance_from_chi2_ndof(chi2, ndof) 132 | 133 | return p_value, z_value 134 | 135 | 136 | def significance_from_chi2_MC( 137 | chi2: float, 138 | values: np.ndarray, 139 | nsim: int = 1000, 140 | lambda_: str = "log-likelihood", 141 | simulation_method: str = "multinominal", 142 | chi2s=None, 143 | njobs: int = -1, 144 | ) -> Tuple[float, float]: 145 | """ 146 | Convert a chi2 into significance using knowledge about the shape of the chi2 distribution of simulated data 147 | 148 | Calculate significance based on simulation (MC method), using a simple percentile. 149 | 150 | :param float chi2: chi2 value 151 | :param list chi2s: provide your own chi2s values (optional) 152 | :param int njobs: number of parallel jobs used for simulation. default is -1. 1 uses no parallel jobs. 153 | :returns: pvalue, significance 154 | """ 155 | 156 | # determine effective number of degrees of freedom using simulation 157 | if chi2s is None: 158 | chi2s = sim_chi2_distribution( 159 | values, 160 | nsim=nsim, 161 | lambda_=lambda_, 162 | simulation_method=simulation_method, 163 | njobs=njobs, 164 | ) 165 | 166 | # calculate p_value based on simulation (MC method) 167 | empirical_p_value = 1.0 - stats.percentileofscore(chi2s, chi2) / 100.0 168 | empirical_z_value = -stats.norm.ppf(empirical_p_value) 169 | 170 | return empirical_p_value, empirical_z_value 171 | 172 | 173 | def significance_from_chi2_hybrid( 174 | chi2: float, 175 | values: np.ndarray, 176 | nsim: int = 1000, 177 | lambda_: str = "log-likelihood", 178 | simulation_method: str = "multinominal", 179 | chi2s=None, 180 | njobs: int = -1, 181 | ) -> Tuple[float, float]: 182 | """ 183 | Convert a chi2 into significance using a hybrid method 184 | 185 | This method combines the asymptotic method with the MC method, but applies several corrections: 186 | 187 | * use effective number of degrees of freedom instead of number of degrees of freedom. The effective number of\ 188 | degrees of freedom is measured as mean(chi2s), with chi2s a list of simulated chi2 values. 189 | * for low statistics data sets, with on average less than 4 data points per bin, the distribution of chi2-values\ 190 | is better described by h(x|f) then by the usual chi2-distribution. Use h(x|f) to convert the chi2 value to \ 191 | the pvalue and significance. 192 | 193 | h(x|f) = N * (f * chi2(x, ndof) + (1-f) * gauss(x, ndof, sqrt(ndof)) 194 | 195 | :param float chi2: chi2 value 196 | :param list chi2s: provide your own chi2s values (optional) 197 | :param float avg_per_bin: average number of data points per bin 198 | :param int njobs: number of parallel jobs used for simulation. default is -1. 1 uses no parallel jobs. 
199 | :returns: p_value, significance 200 | """ 201 | 202 | # determine effective number of degrees of freedom using simulation 203 | if chi2s is None: 204 | chi2s = sim_chi2_distribution( 205 | values, 206 | nsim=nsim, 207 | lambda_=lambda_, 208 | simulation_method=simulation_method, 209 | njobs=njobs, 210 | ) 211 | 212 | # average number of records per bin 213 | avg_per_bin = values.sum() / values.shape[0] * values.shape[1] 214 | 215 | if avg_per_bin <= 4: 216 | f, endof, lsigma, bw = fit_test_statistic_distribution(chi2s) 217 | pvalue_h = f * stats.chi2.sf(chi2, endof) + (1 - f) * stats.norm.sf( 218 | chi2, endof, lsigma 219 | ) 220 | else: 221 | endof = estimate_ndof(chi2s) 222 | pvalue_h = stats.chi2.sf(chi2, endof) 223 | 224 | zvalue_h = -stats.norm.ppf(pvalue_h) 225 | 226 | if pvalue_h == 0: 227 | # apply Chernoff approximation as upper bound for p-value 228 | # see: https://en.wikipedia.org/wiki/Chi-squared_distribution 229 | z = chi2 / endof 230 | u = -math.log(2 * math.pi) - endof * math.log(z) + endof * (z - 1) 231 | if avg_per_bin <= 4: 232 | u += -2 * math.log(f) 233 | zvalue_h = math.sqrt(u - math.log(u)) 234 | 235 | return pvalue_h, zvalue_h 236 | 237 | 238 | def significance_from_hist2d( 239 | values: np.ndarray, 240 | nsim: int = 1000, 241 | lambda_: str = "log-likelihood", 242 | simulation_method: str = "multinominal", 243 | significance_method: str = "hybrid", 244 | njobs: int = -1, 245 | ) -> Tuple[float, float]: 246 | """ 247 | Calculate the significance of correlation of two variables based on the contingency table 248 | 249 | :param values: contingency table 250 | :param int nsim: number of simulations 251 | :param str lambda_: test statistic. Available options are [pearson, log-likelihood] 252 | :param str simulation_method: simulation method. Options: [multinominal, row_product_multinominal, \ 253 | col_product_multinominal, hypergeometric]. 254 | :param str significance_method: significance_method. Options: [asymptotic, MC, hybrid] 255 | :param int njobs: number of parallel jobs used for simulation. default is -1. 1 uses no parallel jobs. 
256 | :return: pvalue, significance 257 | """ 258 | 259 | # chi2 of the data 260 | chi2 = get_chi2_using_dependent_frequency_estimates(values, lambda_=lambda_) 261 | 262 | if significance_method == "asymptotic": 263 | # calculate pvalue and zvalue based on chi2 and ndof (asymptotic method) 264 | pvalue, zvalue = significance_from_chi2_asymptotic(values, chi2) 265 | elif significance_method == "MC": 266 | # calculate pvalue based on simulation (MC method) 267 | pvalue, zvalue = significance_from_chi2_MC( 268 | chi2, 269 | values, 270 | nsim=nsim, 271 | lambda_=lambda_, 272 | simulation_method=simulation_method, 273 | njobs=njobs, 274 | ) 275 | elif significance_method == "hybrid": 276 | # low statistics : calculate pvalue and zvalue using h(x|f) and endof 277 | # high statistics: calculate pvalue and zvalue using chi2-distribution and endof 278 | pvalue, zvalue = significance_from_chi2_hybrid( 279 | chi2, 280 | values, 281 | nsim=nsim, 282 | lambda_=lambda_, 283 | simulation_method=simulation_method, 284 | njobs=njobs, 285 | ) 286 | else: 287 | raise NotImplementedError( 288 | "simulation_method {0:s} is unknown".format(simulation_method) 289 | ) 290 | 291 | return pvalue, zvalue 292 | 293 | 294 | def significance_from_rebinned_df( 295 | data_binned: pd.DataFrame, 296 | lambda_: str = "log-likelihood", 297 | simulation_method: str = "multinominal", 298 | nsim: int = 1000, 299 | significance_method: str = "hybrid", 300 | dropna: bool = True, 301 | drop_underflow: bool = True, 302 | drop_overflow: bool = True, 303 | njobs: int = -1, 304 | ) -> pd.DataFrame: 305 | """ 306 | Calculate significance of correlation of all variable combinations in the DataFrame 307 | 308 | :param data_binned: input binned DataFrame 309 | :param int nsim: number of simulations 310 | :param str lambda_: test statistic. Available options are [pearson, log-likelihood] 311 | :param str simulation_method: simulation method. Options: [mutlinominal, row_product_multinominal, \ 312 | col_product_multinominal, hypergeometric]. 313 | :param str significance_method: significance_method. Options: [asymptotic, MC, hybrid] 314 | :param bool dropna: remove NaN values with True 315 | :param bool drop_underflow: do not take into account records in underflow bin when True (relevant when binning\ 316 | a numeric variable) 317 | :param bool drop_overflow: do not take into account records in overflow bin when True (relevant when binning\ 318 | a numeric variable) 319 | :param int njobs: number of parallel jobs used for simulation. default is -1. 320 | :return: significance matrix 321 | """ 322 | 323 | if not dropna: 324 | # if not dropna replace the NaN values with the string NaN. Otherwise the rows with NaN are dropped 325 | # by the groupby. 
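        # defs.NaN is the string placeholder for missing values; defs.UF and
        # defs.OF below label the underflow and overflow bins, which are set to
        # NaN so that the groupby drops those records.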
326 | data_binned.replace(np.nan, defs.NaN, inplace=True) 327 | if drop_underflow: 328 | data_binned.replace(defs.UF, np.nan, inplace=True) 329 | if drop_overflow: 330 | data_binned.replace(defs.OF, np.nan, inplace=True) 331 | 332 | # cache column order (https://github.com/KaveIO/PhiK/issues/1) 333 | column_order = data_binned.columns 334 | signifs = [] 335 | for i, (c0, c1) in enumerate( 336 | itertools.combinations_with_replacement(data_binned.columns.values, 2) 337 | ): 338 | datahist = ( 339 | data_binned.groupby([c0, c1])[c0].count().to_frame().unstack().fillna(0) 340 | ) 341 | if 1 in datahist.shape or 0 in datahist.shape: 342 | signifs.append((c0, c1, np.nan)) 343 | warnings.warn( 344 | "Too few unique values for variable {0:s} ({1:d}) or {2:s} ({3:d}) to calculate significance".format( 345 | c0, datahist.shape[0], c1, datahist.shape[1] 346 | ) 347 | ) 348 | continue 349 | 350 | datahist.columns = datahist.columns.droplevel() 351 | datahist = datahist.values 352 | pvalue, zvalue = significance_from_hist2d( 353 | datahist, 354 | nsim=nsim, 355 | lambda_=lambda_, 356 | simulation_method=simulation_method, 357 | significance_method=significance_method, 358 | njobs=njobs, 359 | ) 360 | signifs.append((c0, c1, zvalue)) 361 | 362 | if len(signifs) == 0: 363 | return pd.DataFrame(np.nan, index=column_order, columns=column_order) 364 | 365 | significance_overview = create_correlation_overview_table(signifs) 366 | 367 | # restore column order 368 | significance_overview = significance_overview.reindex(columns=column_order) 369 | significance_overview = significance_overview.reindex(index=column_order) 370 | 371 | return significance_overview 372 | 373 | 374 | def significance_matrix( 375 | df: pd.DataFrame, 376 | interval_cols: list = None, 377 | lambda_: str = "log-likelihood", 378 | simulation_method: str = "multinominal", 379 | nsim: int = 1000, 380 | significance_method: str = "hybrid", 381 | bins: Union[int, list, np.ndarray, dict] = 10, 382 | dropna: bool = True, 383 | drop_underflow: bool = True, 384 | drop_overflow: bool = True, 385 | verbose: bool = True, 386 | njobs: int = -1, 387 | ) -> pd.DataFrame: 388 | """ 389 | Calculate significance of correlation of all variable combinations in the dataframe 390 | 391 | :param pd.DataFrame df: input data 392 | :param list interval_cols: column names of columns with interval variables. 393 | :param int nsim: number of simulations 394 | :param str lambda_: test statistic. Available options are [pearson, log-likelihood] 395 | :param str simulation_method: simulation method. Options: [mutlinominal, row_product_multinominal, \ 396 | col_product_multinominal, hypergeometric]. 397 | :param int nsim: number of simulated datasets 398 | :param str significance_method: significance_method. Options: [asymptotic, MC, hybrid] 399 | :param bool dropna: remove NaN values with True 400 | :param bins: number of bins, or a list of bin edges (same for all columns), or a dictionary where per column the bins are specified. 
(default=10)\ 401 | E.g.: bins = {'mileage':5, 'driver_age':[18,25,35,45,55,65,125]} 402 | :param bool dropna: remove NaN values with True 403 | :param bool drop_underflow: do not take into account records in underflow bin when True (relevant when binning\ 404 | a numeric variable) 405 | :param bool drop_overflow: do not take into account records in overflow bin when True (relevant when binning\ 406 | a numeric variable) 407 | :param bool verbose: if False, do not print all interval columns that are guessed 408 | :param int njobs: number of parallel jobs used for simulation. default is -1. 1 uses no parallel jobs. 409 | :return: significance matrix 410 | """ 411 | 412 | if interval_cols is None: 413 | interval_cols = guess_interval_cols(df, verbose) 414 | 415 | df_clean, interval_cols_clean = dq_check_nunique_values( 416 | df, interval_cols, dropna=dropna 417 | ) 418 | 419 | data_binned = bin_data(df_clean, interval_cols_clean, bins=bins) 420 | 421 | return significance_from_rebinned_df( 422 | data_binned, 423 | lambda_=lambda_, 424 | simulation_method=simulation_method, 425 | nsim=nsim, 426 | significance_method=significance_method, 427 | dropna=dropna, 428 | drop_underflow=drop_underflow, 429 | drop_overflow=drop_overflow, 430 | njobs=njobs, 431 | ) 432 | 433 | 434 | def significance_from_array( 435 | x: Union[np.ndarray, pd.Series], 436 | y: Union[np.ndarray, pd.Series], 437 | num_vars=None, 438 | bins: Union[int, list, np.ndarray, dict] = 10, 439 | quantile: bool = False, 440 | lambda_: str = "log-likelihood", 441 | nsim: int = 1000, 442 | significance_method: str = "hybrid", 443 | simulation_method: str = "multinominal", 444 | dropna: bool = True, 445 | drop_underflow: bool = True, 446 | drop_overflow: bool = True, 447 | njobs: int = -1, 448 | ) -> Tuple[float, float]: 449 | """ 450 | Calculate the significance of correlation 451 | 452 | Calculate the significance of correlation for two variables which can be of interval, oridnal or categorical type.\ 453 | Interval variables will be binned. 454 | 455 | :param x: array-like input 456 | :param y: array-like input 457 | :param num_vars: list of numeric variables which need to be binned, e.g. ['x'] or ['x','y'] 458 | :param bins: number of bins, or a list of bin edges (same for all columns), or a dictionary where per column the bins are specified. (default=10)\ 459 | E.g.: bins = {'mileage':5, 'driver_age':[18,25,35,45,55,65,125]} 460 | :param quantile: when bins is an integer, uniform bins (False) or bins based on quantiles (True) 461 | :param str lambda_: test statistic. Available options are [pearson, log-likelihood] 462 | :param int nsim: number of simulated datasets 463 | :param str simulation_method: simulation method. Options: [mutlinominal, row_product_multinominal, \ 464 | col_product_multinominal, hypergeometric]. 465 | :param str significance_method: significance_method. Options: [asymptotic, MC, hybrid] 466 | :param bool dropna: remove NaN values with True 467 | :param bool drop_underflow: do not take into account records in underflow bin when True (relevant when binning\ 468 | a numeric variable) 469 | :param bool drop_overflow: do not take into account records in overflow bin when True (relevant when binning\ 470 | a numeric variable) 471 | :param int njobs: number of parallel jobs used for simulation. default is -1. 1 uses no parallel jobs. 
472 | :return: p-value, significance 473 | """ 474 | if num_vars is None: 475 | num_vars = [] 476 | elif isinstance(num_vars, str): 477 | num_vars = [num_vars] 478 | 479 | if len(num_vars) > 0: 480 | df = array_like_to_dataframe(x, y) 481 | x, y = bin_data(df, num_vars, bins=bins, quantile=quantile).T.values 482 | 483 | return significance_from_binned_array( 484 | x, 485 | y, 486 | lambda_=lambda_, 487 | significance_method=significance_method, 488 | nsim=nsim, 489 | simulation_method=simulation_method, 490 | dropna=dropna, 491 | drop_underflow=drop_underflow, 492 | drop_overflow=drop_overflow, 493 | njobs=njobs, 494 | ) 495 | 496 | 497 | def significance_from_binned_array( 498 | x: Union[np.ndarray, pd.Series], 499 | y: Union[np.ndarray, pd.Series], 500 | lambda_: str = "log-likelihood", 501 | significance_method: str = "hybrid", 502 | nsim: int = 1000, 503 | simulation_method: str = "multinominal", 504 | dropna: bool = True, 505 | drop_underflow: bool = True, 506 | drop_overflow: bool = True, 507 | njobs: int = -1, 508 | ) -> Tuple[float, float]: 509 | """ 510 | Calculate the significance of correlation 511 | 512 | Calculate the significance of correlation for two variables which can be of interval, oridnal or categorical type. \ 513 | Interval variables need to be binned. 514 | 515 | :param x: array-like input 516 | :param y: array-like input 517 | :param str lambda_: test statistic. Available options are [pearson, log-likelihood] 518 | :param str simulation_method: simulation method. Options: [multinominal, row_product_multinominal, \ 519 | col_product_multinominal, hypergeometric]. 520 | :param int nsim: number of simulated datasets 521 | :param str significance_method: significance_method. Options: [asymptotic, MC, hybrid] 522 | :param bool dropna: remove NaN values with True 523 | :param bool drop_underflow: do not take into account records in underflow bin when True (relevant when binning\ 524 | a numeric variable) 525 | :param bool drop_overflow: do not take into account records in overflow bin when True (relevant when binning\ 526 | a numeric variable) 527 | :param int njobs: number of parallel jobs used for simulation. default is -1. 1 uses no parallel jobs. 528 | :return: p-value, significance 529 | """ 530 | 531 | if not dropna: 532 | x = pd.Series(x).fillna(defs.NaN).astype(str).values 533 | y = ( 534 | pd.Series(y).fillna(defs.NaN).astype(str).values 535 | ) # crosstab cannot handle mixed type y! 536 | 537 | if drop_underflow or drop_overflow: 538 | x = pd.Series(x).astype(str).values 539 | y = pd.Series(y).astype(str).values 540 | if drop_underflow: 541 | x[np.where(x == defs.UF)] = np.nan 542 | y[np.where(y == defs.UF)] = np.nan 543 | if drop_overflow: 544 | y[np.where(y == defs.OF)] = np.nan 545 | x[np.where(x == defs.OF)] = np.nan 546 | 547 | hist2d = pd.crosstab(x, y).values 548 | 549 | if not dq_check_hist2d(hist2d): 550 | return np.nan, np.nan 551 | 552 | return significance_from_hist2d( 553 | hist2d, 554 | lambda_=lambda_, 555 | significance_method=significance_method, 556 | simulation_method=simulation_method, 557 | nsim=nsim, 558 | njobs=njobs, 559 | ) 560 | --------------------------------------------------------------------------------
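A minimal sketch of how the significance API above might be used (column names and data are hypothetical):

    import numpy as np
    import pandas as pd
    from phik.significance import significance_matrix, significance_from_array

    # toy data: one interval and one categorical column
    df = pd.DataFrame({
        "driver_age": np.random.uniform(18, 80, size=500),
        "area": np.random.choice(["urban", "suburban", "rural"], size=500),
    })

    # significance of correlation for every pair of columns
    sig = significance_matrix(df, interval_cols=["driver_age"],
                              significance_method="asymptotic")

    # or for a single pair of array-like inputs ('x' marks the first, numeric, input)
    p, z = significance_from_array(df["driver_age"], df["area"], num_vars=["x"],
                                   significance_method="asymptotic")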