├── .gitignore ├── .gitattributes ├── docs ├── source │ ├── phik_index.rst │ ├── code.rst │ ├── phik.decorators.rst │ ├── index.rst │ ├── publication.rst │ ├── tutorials.rst │ ├── developing.rst │ ├── phik.rst │ ├── introduction.rst │ └── conf.py ├── autogenerate.sh ├── README.rst └── Makefile ├── phik ├── data │ └── fake_insurance_data.csv.gz ├── decorators │ ├── __init__.py │ └── pandas.py ├── simcore │ ├── bindings.cpp │ ├── __init__.py │ ├── asa159.hpp │ ├── simulation.hpp │ └── asa159.cpp ├── definitions.py ├── __init__.py ├── entry_points.py ├── resources.py ├── utils.py ├── betainc.py ├── data_quality.py ├── notebooks │ └── phik_tutorial_spark.ipynb ├── statistics.py ├── simulation.py ├── bivariate.py ├── binning.py ├── report.py └── significance.py ├── .github ├── dependabot.yml └── workflows │ ├── tests.yml │ ├── test_matrix.yml │ ├── valgrind.yml │ └── wheels.yml ├── .readthedocs.yml ├── .mbuild.sh ├── LICENSE ├── NOTICE ├── example.py ├── pyproject.toml ├── CMakeLists.txt ├── CHANGES.rst ├── README.rst └── tests ├── test_phik.py └── integration ├── test_phik_tutorial_advanced.py └── test_phik_tutorial_basic.py /.gitignore: -------------------------------------------------------------------------------- 1 | *.so 2 | *egg-info* 3 | -------------------------------------------------------------------------------- /.gitattributes: -------------------------------------------------------------------------------- 1 | phik/notebooks/* linguist-vendored 2 | -------------------------------------------------------------------------------- /docs/source/phik_index.rst: -------------------------------------------------------------------------------- 1 | PhiK 2 | ==== 3 | 4 | .. toctree:: 5 | :maxdepth: 4 6 | 7 | phik 8 | -------------------------------------------------------------------------------- /phik/data/fake_insurance_data.csv.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KaveIO/PhiK/HEAD/phik/data/fake_insurance_data.csv.gz -------------------------------------------------------------------------------- /docs/source/code.rst: -------------------------------------------------------------------------------- 1 | API Documentation 2 | ================= 3 | 4 | .. 
toctree:: 5 | :maxdepth: 2 6 | 7 | phik_index 8 | -------------------------------------------------------------------------------- /phik/decorators/__init__.py: -------------------------------------------------------------------------------- 1 | # flake8: noqa 2 | 3 | # import pandas DataFrame decorators 4 | from phik.decorators import pandas 5 | -------------------------------------------------------------------------------- /phik/simcore/bindings.cpp: -------------------------------------------------------------------------------- 1 | #include "simulation.hpp" 2 | #include 3 | 4 | PYBIND11_MODULE(_phik_simulation_core, m) { bind_simulation(m); } 5 | -------------------------------------------------------------------------------- /docs/autogenerate.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # (re)create required directories 4 | rm -rf autogen 5 | mkdir -p source/_static autogen 6 | 7 | # auto-generate code documentation 8 | sphinx-apidoc -f -H PhiK -o autogen ../python/phik 9 | mv autogen/modules.rst autogen/phik_index.rst 10 | mv autogen/* source/ 11 | 12 | # remove auto-gen directory 13 | rm -rf autogen 14 | -------------------------------------------------------------------------------- /.github/dependabot.yml: -------------------------------------------------------------------------------- 1 | --- 2 | version: 2 3 | updates: 4 | - package-ecosystem: pip 5 | directory: / 6 | # Check for updates once a day 7 | schedule: 8 | interval: daily 9 | allow: 10 | - dependency-type: all 11 | - package-ecosystem: github-actions 12 | directory: / 13 | # Check for updates once a week 14 | schedule: 15 | interval: weekly 16 | -------------------------------------------------------------------------------- /.readthedocs.yml: -------------------------------------------------------------------------------- 1 | # Read the Docs configuration file 2 | # See https://docs.readthedocs.io/en/stable/config-file/v2.html for details 3 | # .readthedocs.yml 4 | 5 | # Required 6 | version: 2 7 | 8 | # Set the version of Python 9 | build: 10 | os: ubuntu-22.04 11 | tools: 12 | python: "3.10" 13 | 14 | python: 15 | install: 16 | - method: pip 17 | path: . 18 | extra_requirements: 19 | - doc 20 | -------------------------------------------------------------------------------- /.mbuild.sh: -------------------------------------------------------------------------------- 1 | cmake -S . -G Ninja -B build \ 2 | -DCMAKE_BUILD_TYPE=Release \ 3 | -DSKBUILD_PROJECT_NAME="phik" \ 4 | -DSKBUILD_PROJECT_VERSION="0.12.4" \ 5 | -DPHIK_MBUILD=ON \ 6 | -DPython3_EXECUTABLE=$(python3 -c 'import sys; print(sys.executable)') \ 7 | -Dpybind11_DIR=$(python3 -c 'import pybind11; print(pybind11.get_cmake_dir())') \ 8 | -DCMAKE_EXPORT_COMPILE_COMMANDS=ON 9 | 10 | cmake --build build --target install --config Release --parallel 4 11 | -------------------------------------------------------------------------------- /docs/source/phik.decorators.rst: -------------------------------------------------------------------------------- 1 | phik.decorators package 2 | ======================= 3 | 4 | Submodules 5 | ---------- 6 | 7 | phik.decorators.pandas module 8 | ----------------------------- 9 | 10 | .. automodule:: phik.decorators.pandas 11 | :members: 12 | :undoc-members: 13 | :show-inheritance: 14 | 15 | 16 | Module contents 17 | --------------- 18 | 19 | .. 
automodule:: phik.decorators 20 | :members: 21 | :undoc-members: 22 | :show-inheritance: 23 | -------------------------------------------------------------------------------- /phik/definitions.py: -------------------------------------------------------------------------------- 1 | """Project: PhiK - correlation analyzer library 2 | 3 | Created: 2018/09/05 4 | 5 | Description: 6 | Definitions used throughout the phik package 7 | 8 | Authors: 9 | KPMG Advanced Analytics & Big Data team, Amstelveen, The Netherlands 10 | 11 | Redistribution and use in source and binary forms, with or without 12 | modification, are permitted according to the terms listed in the file 13 | LICENSE. 14 | """ 15 | 16 | # names assigned to underflow and overflow bin when assigning bin indices 17 | OF = "OF" 18 | UF = "UF" 19 | # name replacement of np.nan 20 | NaN = "NaN" 21 | -------------------------------------------------------------------------------- /.github/workflows/tests.yml: -------------------------------------------------------------------------------- 1 | name: Test 2 | 3 | on: push 4 | jobs: 5 | tests: 6 | runs-on: ubuntu-latest 7 | 8 | steps: 9 | - uses: actions/checkout@v6 10 | - name: Set up Python 11 | uses: actions/setup-python@v6 12 | with: 13 | python-version: 3.9 14 | - name: Install dependencies 15 | run: | 16 | python -m pip install --upgrade pip 17 | pip install . -v 18 | pip install "pytest>=4.0.2" "pytest-pylint>=0.13.0" 19 | - name: Test with pytest 20 | run: | 21 | cd tests 22 | pytest test_phik.py -W ignore::DeprecationWarning 23 | -------------------------------------------------------------------------------- /phik/simcore/__init__.py: -------------------------------------------------------------------------------- 1 | import importlib.util 2 | 3 | try: 4 | _ext_spec = importlib.util.find_spec("phik.lib._phik_simulation_core") 5 | except ModuleNotFoundError: 6 | _ext_spec = None 7 | 8 | if _ext_spec is not None: 9 | from phik.lib._phik_simulation_core import _sim_2d_data_patefield 10 | 11 | CPP_SUPPORT = True 12 | else: 13 | CPP_SUPPORT = False 14 | 15 | def _sim_2d_data_patefield(*args, **kwargs): 16 | msg = "Patefield requires a compiled extension that was not found." 17 | raise NotImplementedError(msg) 18 | 19 | 20 | __all__ = ["CPP_SUPPORT", "_sim_2d_data_patefield"] 21 | -------------------------------------------------------------------------------- /docs/source/index.rst: -------------------------------------------------------------------------------- 1 | .. PhiK documentation master file, created by 2 | sphinx-quickstart on Thu Jul 7 14:25:54 2016. 3 | You can adapt this file completely to your liking, but it should at least 4 | contain the root `toctree` directive. 5 | 6 | 7 | .. include:: ../../README.rst 8 | 9 | 10 | Contents 11 | ======== 12 | 13 | .. toctree:: 14 | :maxdepth: 2 15 | 16 | introduction 17 | tutorials 18 | publication 19 | developing 20 | 21 | API 22 | --- 23 | 24 | .. 
toctree:: 25 | :maxdepth: 1 26 | 27 | code 28 | 29 | Indices and tables 30 | ------------------ 31 | 32 | * :ref:`genindex` 33 | * :ref:`modindex` 34 | -------------------------------------------------------------------------------- /phik/__init__.py: -------------------------------------------------------------------------------- 1 | # flake8: noqa 2 | import importlib.metadata 3 | 4 | from phik import decorators 5 | from phik.outliers import ( 6 | outlier_significance_from_array, 7 | outlier_significance_matrices, 8 | outlier_significance_matrix, 9 | ) 10 | from phik.phik import global_phik_array, phik_from_array, phik_matrix 11 | from phik.significance import significance_from_array, significance_matrix 12 | 13 | __version__ = importlib.metadata.version("phik") 14 | 15 | __all__ = [ 16 | "decorators", 17 | "phik_from_array", 18 | "significance_from_array", 19 | "outlier_significance_from_array", 20 | "phik_matrix", 21 | "global_phik_array", 22 | "significance_matrix", 23 | "outlier_significance_matrices", 24 | "outlier_significance_matrix", 25 | ] 26 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | ############################################################################## 2 | # 3 | # Copyright 2016 KPMG Advisory N.V. (unless otherwise stated) 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | # 17 | ############################################################################## 18 | -------------------------------------------------------------------------------- /NOTICE: -------------------------------------------------------------------------------- 1 | ################################################################################################ 2 | # 3 | # NOTICE: pass-through licensing of bundled components 4 | # 5 | # PhiK gathers together a toolkit of pre-existing third-party open-source software components. 6 | # These software components are governed by their own licenses which PhiK does not 7 | # modify or supersede, please consult the originating authors. These components altogether 8 | # have a mixture of the following licenses: Apache 2.0, GPL 2.0, AGPL and LGPL, ZPL, MIT, PSF, 9 | # BSD and some BSD-like simple licenses. 10 | # For scipy and numpy see: http://docs.continuum.io/anaconda/licenses.html . 11 | # 12 | # Although we have examined the licenses to verify acceptance of commercial and non-commercial 13 | # use, please see and consult the original licenses or authors. 
14 | # 15 | ################################################################################################ 16 | -------------------------------------------------------------------------------- /phik/entry_points.py: -------------------------------------------------------------------------------- 1 | """Project: PhiK - correlation analyzer library 2 | 3 | Created: 2018/11/13 4 | 5 | Description: 6 | Collection of phik entry points 7 | 8 | Authors: 9 | KPMG Advanced Analytics & Big Data team, Amstelveen, The Netherlands 10 | 11 | Redistribution and use in source and binary forms, with or without 12 | modification, are permitted according to the terms listed in the file 13 | LICENSE. 14 | """ 15 | 16 | 17 | def phik_trial(): 18 | """Run Phi_K tests. 19 | 20 | We will keep this here until we've completed switch to pytest or nose and tox. 21 | We could also keep it, but I don't like the fact that packages etc. are 22 | hard coded. Gotta come up with 23 | a better solution. 24 | """ 25 | import sys 26 | import pytest 27 | 28 | # ['--pylint'] + 29 | # -r xs shows extra info on skips and xfails. 30 | default_options = ["-rxs"] 31 | args = sys.argv[1:] + default_options 32 | sys.exit(pytest.main(args)) 33 | -------------------------------------------------------------------------------- /phik/simcore/asa159.hpp: -------------------------------------------------------------------------------- 1 | /* 2 | * Taken from: 3 | * https://people.sc.fsu.edu/~jburkardt/cpp_src/asa159/asa159.html 4 | * 5 | * Michael Patefield, 6 | * Algorithm AS 159: An Efficient Method of Generating RXC Tables with Given Row and Column Totals, 7 | * Applied Statistics, 8 | * Volume 30, Number 1, 1981, pages 91-97. 9 | */ 10 | 11 | #ifndef PYTHON_PHIK_SIMCORE_ASA159_HPP_ 12 | #define PYTHON_PHIK_SIMCORE_ASA159_HPP_ 13 | 14 | #include 15 | 16 | int i4_max ( int i1, int i2 ); 17 | int i4_min ( int i1, int i2 ); 18 | void i4mat_print ( int m, int n, int a[], std::string title ); 19 | void i4mat_print_some ( int m, int n, int a[], int ilo, int jlo, int ihi, 20 | int jhi, std::string title ); 21 | void i4vec_print ( int n, int a[], std::string title ); 22 | int i4vec_sum ( int n, int a[] ); 23 | double r8_uniform_01 ( int *seed ); 24 | void rcont2 ( int nrow, int ncol, int nrowt[], int ncolt[], bool *key, 25 | int *seed, int matrix[], int *ierror ); 26 | void timestamp ( ); 27 | 28 | #endif // PYTHON_PHIK_SIMCORE_ASA159_HPP_ 29 | -------------------------------------------------------------------------------- /example.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | 3 | import phik 4 | from phik import resources, report 5 | 6 | # open fake car insurance data 7 | df = pd.read_csv( resources.fixture('fake_insurance_data.csv.gz') ) 8 | df.head() 9 | 10 | # Pearson's correlation matrix between numeric variables (pandas functionality) 11 | df.corr() 12 | 13 | # get the phi_k correlation matrix between all variables 14 | df.phik_matrix() 15 | 16 | # get global correlations based on phi_k correlation matrix 17 | df.global_phik() 18 | 19 | # get the significance matrix (expressed as one-sided Z) 20 | # of the hypothesis test of each variable-pair dependency 21 | df.significance_matrix() 22 | 23 | # contingency table of two columns 24 | cols = ['mileage', 'car_size'] 25 | df[cols].hist2d() 26 | 27 | # normalized residuals of contingency test applied to cols 28 | df[cols].outlier_significance_matrix() 29 | 30 | # show the normalized residuals of each variable-pair 31 | 
df.outlier_significance_matrices() 32 | 33 | # generate a phik correlation report and save as test.pdf 34 | report.correlation_report(df, pdf_file_name='test.pdf') 35 | -------------------------------------------------------------------------------- /.github/workflows/test_matrix.yml: -------------------------------------------------------------------------------- 1 | name: Test Matrix 2 | 3 | on: 4 | workflow_dispatch: 5 | pull_request: 6 | push: 7 | branches: 8 | - master 9 | 10 | jobs: 11 | build: 12 | name: ${{ matrix.platform }} Python ${{ matrix.python-version }} 13 | strategy: 14 | fail-fast: false 15 | matrix: 16 | platform: [windows-latest, macos-latest, ubuntu-latest] 17 | python-version: ["3.9", "3.10", "3.11", "3.12"] 18 | 19 | runs-on: ${{ matrix.platform }} 20 | 21 | steps: 22 | - uses: actions/checkout@v6 23 | with: 24 | submodules: true 25 | 26 | - uses: actions/setup-python@v6 27 | with: 28 | python-version: ${{ matrix.python-version }} 29 | 30 | - name: Add requirements 31 | run: | 32 | python -m pip install --upgrade pip wheel 33 | 34 | - name: Build and install 35 | run: pip install --verbose ".[test]" 36 | 37 | - name: Unit test 38 | run: | 39 | cd tests 40 | pytest test_phik.py -v -W ignore::DeprecationWarning 41 | 42 | - name: Integration test 43 | run: | 44 | cd tests 45 | pytest integration -v -W ignore::DeprecationWarning 46 | -------------------------------------------------------------------------------- /docs/source/publication.rst: -------------------------------------------------------------------------------- 1 | =================== 2 | Publication & Talks 3 | =================== 4 | 5 | Publication 6 | ----------- 7 | 8 | * peer-reviewed: https://www.sciencedirect.com/science/article/abs/pii/S0167947320301341 9 | * arXiv pre-print: https://arxiv.org/abs/1811.11440 10 | 11 | 12 | Talks 13 | ----- 14 | 15 | * Coming soon. 16 | 17 | 18 | Cite as 19 | ------- 20 | 21 | Baak, M., Koopman, R., Snoek, H., & Klous, S. (2020). A new correlation coefficient between categorical, ordinal and interval variables with Pearson characteristics. *Computational Statistics & Data Analysis*, 152, 107043. 22 | 23 | 24 | .. code-block:: latex 25 | 26 | @article{phik2020, 27 | title={A new correlation coefficient between categorical, ordinal and interval variables with Pearson characteristics}, 28 | author={Baak, M and Koopman, R and Snoek, H and Klous, S}, 29 | journal={Computational Statistics \& Data Analysis}, 30 | volume={152}, 31 | pages={107043}, 32 | year={2020}, 33 | publisher={Elsevier} 34 | } 35 | 36 | References 37 | ---------- 38 | 39 | * Web page: https://phik.readthedocs.io 40 | * Repository: https://github.com/kaveio/phik 41 | * Issues & Ideas: https://github.com/kaveio/phik/issues 42 | * Contact us at: kave [at] kpmg [dot] com 43 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [build-system] 2 | requires = ["scikit-build-core>=0.3.3", "pybind11"] 3 | build-backend = "scikit_build_core.build" 4 | 5 | [project] 6 | name = "phik" 7 | version = "0.12.5" 8 | description = "Phi_K correlation analyzer library" 9 | readme = "README.rst" 10 | authors = [{ name = "KPMG N.V. 
The Netherlands", email = "kave@kpmg.com" }] 11 | requires-python = ">=3.9" 12 | classifiers = [ 13 | "Development Status :: 4 - Beta", 14 | "License :: OSI Approved :: MIT License", 15 | "Programming Language :: Python :: 3 :: Only", 16 | "Programming Language :: Python :: 3.9", 17 | "Programming Language :: Python :: 3.10", 18 | "Programming Language :: Python :: 3.11", 19 | "Programming Language :: Python :: 3.12", 20 | ] 21 | 22 | dependencies = [ 23 | "numpy>=1.18.0", 24 | "scipy>=1.5.2", 25 | "pandas>=0.25.1", 26 | "matplotlib>=2.2.3", 27 | "joblib>=0.14.1", 28 | ] 29 | 30 | [project.optional-dependencies] 31 | test = ["pytest>=4.0.2", "pytest-pylint>=0.13.0"] 32 | 33 | [tool.scikit-build] 34 | wheel.expand-macos-universal-tags = true 35 | cmake.build-type = "Release" 36 | logging.level = "WARNING" 37 | sdist.include = ["phik/simcore", "CMakeLists.txt"] 38 | 39 | [tool.pytest.ini_options] 40 | minversion = "6.0" 41 | addopts = ["-ra"] 42 | xfail_strict = true 43 | log_cli_level = "INFO" 44 | filterwarnings = ["error"] 45 | testpaths = ["tests"] 46 | -------------------------------------------------------------------------------- /docs/source/tutorials.rst: -------------------------------------------------------------------------------- 1 | ========= 2 | Tutorials 3 | ========= 4 | 5 | This section contains materials on how to use the Phi_K correlation analysis code. 6 | There are additional side notes on how certain aspects work and where to find parts of the code. 7 | For more in depth explanations on the functionality of the code-base, try the `API docs `_. 8 | 9 | The tutorials are available in the ``phik/notebooks`` directory. We have: 10 | 11 | * A basic tutorial: this covers the basics of calculating Phi_K, the statistical significance, and interpreting the correlation. 12 | * An advanced tutorial: this shows how to use the advanced features of the ``PhiK`` library. 13 | * A spark tutorial: this shows how to calculate the Phi_K correlation matrix for a spark dataframe. 14 | 15 | You can open these notebooks directly: 16 | 17 | * Run them interactively at `MyBinder `_. 18 | * View them statically: `basic tutorial `_ and the `advanced tutorial `_ and the `spark tutorial `_. 19 | -------------------------------------------------------------------------------- /phik/decorators/pandas.py: -------------------------------------------------------------------------------- 1 | """Project: PhiK - correlation analyzer library 2 | 3 | Module: phik.decorators.pandas 4 | 5 | Created: 2018/11/14 6 | 7 | Description: 8 | Decorators for pandas DataFrame objects 9 | 10 | Authors: 11 | KPMG Advanced Analytics & Big Data team, Amstelveen, The Netherlands 12 | 13 | Redistribution and use in source and binary forms, with or without 14 | modification, are permitted according to the terms listed in the file 15 | LICENSE. 
16 | """ 17 | 18 | from pandas import DataFrame, Series 19 | 20 | # add function to create a 2d histogram 21 | from phik.binning import hist2d, hist2d_from_array 22 | DataFrame.hist2d = hist2d 23 | Series.hist2d = hist2d_from_array 24 | 25 | # add phik correlation matrix function 26 | from phik.phik import phik_matrix, global_phik_array 27 | DataFrame.phik_matrix = phik_matrix 28 | DataFrame.global_phik = global_phik_array 29 | 30 | # add significance matrix function for variable dependencies 31 | from phik.significance import significance_matrix 32 | DataFrame.significance_matrix = significance_matrix 33 | 34 | # outlier matrix 35 | from phik.outliers import outlier_significance_matrices, outlier_significance_matrix, outlier_significance_from_array 36 | DataFrame.outlier_significance_matrices = outlier_significance_matrices 37 | DataFrame.outlier_significance_matrix = outlier_significance_matrix 38 | Series.outlier_significance_matrix = outlier_significance_from_array 39 | -------------------------------------------------------------------------------- /docs/source/developing.rst: -------------------------------------------------------------------------------- 1 | =========================== 2 | Developing and Contributing 3 | =========================== 4 | 5 | 6 | Working on the package 7 | ---------------------- 8 | You have some cool feature and/or algorithm you want to add to the package. How do you go about it? 9 | 10 | First clone the package. 11 | 12 | .. code-block:: bash 13 | 14 | git clone https://github.com/KaveIO/PhiK.git 15 | 16 | then 17 | 18 | .. code-block:: bash 19 | 20 | pip install -e PhiK/ 21 | 22 | this will install ``PhiK`` in editable mode, which will allow you to edit the code and run it as 23 | you would with a normal installation of the ``PhiK`` correlation analyzer package. 24 | 25 | To make sure that everything works try executing the tests, e.g. 26 | 27 | .. code-block:: bash 28 | 29 | cd PhiK/ 30 | phik_trial . 31 | 32 | or 33 | 34 | .. code-block:: bash 35 | 36 | cd PhiK/ 37 | python setup.py test 38 | 39 | That's it. 40 | 41 | 42 | Contributing 43 | ------------ 44 | 45 | When contributing to this repository, please first discuss the change you wish to make via issue, email, or any 46 | other method with the owners of this repository before making a change. You can find the contact information on the 47 | `index `_ page. 48 | 49 | Note that when contributing that all tests should succeed. 50 | 51 | 52 | Tips and Tricks 53 | --------------- 54 | 55 | - Enable auto reload in ``jupyter``: 56 | 57 | .. code-block:: python 58 | 59 | %load_ext autoreload 60 | 61 | this will reload modules before executing any user code. 
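For the reload to actually take effect you also have to select an autoreload mode; mode ``2`` reloads all imported modules before each cell is executed. This is standard IPython/Jupyter behaviour, sketched here for completeness:

.. code-block:: python

    # load the extension once per kernel session ...
    %load_ext autoreload
    # ... and reload all imported modules before executing new code
    %autoreload 2

    import phik  # edits to the phik sources are now picked up on the next execution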
62 | -------------------------------------------------------------------------------- /.github/workflows/valgrind.yml: -------------------------------------------------------------------------------- 1 | name: Valgrind 2 | 3 | on: 4 | pull_request: 5 | branches: 6 | - master 7 | workflow_dispatch: 8 | 9 | defaults: 10 | run: 11 | shell: bash 12 | 13 | jobs: 14 | pre_job: 15 | # continue-on-error: true # Uncomment once integration is finished 16 | runs-on: ubuntu-latest 17 | # Map a step output to a job output 18 | outputs: 19 | should_skip: ${{ steps.skip_check.outputs.should_skip }} 20 | steps: 21 | - id: skip_check 22 | uses: fkirc/skip-duplicate-actions@master 23 | with: 24 | # All of these options are optional, so you can remove them if you are happy with the defaults 25 | cancel_others: 'true' 26 | do_not_skip: '["pull_request", "workflow_dispatch", "schedule"]' 27 | build: 28 | name: Valgrind 29 | needs: pre_job 30 | if: ${{ needs.pre_job.outputs.should_skip != 'true' }} 31 | runs-on: ubuntu-latest 32 | 33 | steps: 34 | # Checks-out your repository under $GITHUB_WORKSPACE, so your job can access it 35 | - uses: actions/checkout@v6 36 | with: 37 | submodules: false 38 | 39 | - uses: actions/setup-python@v6 40 | with: 41 | python-version: '3.10' 42 | 43 | - name: Install dependencies on ubuntu 44 | run: | 45 | sudo apt-get update 46 | sudo apt-get install -y valgrind 47 | 48 | - name: Install python packages 49 | run: | 50 | python -m pip install --upgrade pip pytest 51 | 52 | - name: Install 53 | run: | 54 | # temp fix for Valgrind issue with later versions 55 | pip install scipy==1.9.1 56 | CMAKE_ARGS="-DCMAKE_BUILD_TYPE=Debug" pip install . -v 57 | 58 | - name: Test 59 | run: | 60 | cd tests 61 | PYTHONMALLOC=malloc valgrind --leak-check=yes --track-origins=yes --log-file=valgrind-log.txt python -m pytest test_phik.py -W ignore::DeprecationWarning 62 | -------------------------------------------------------------------------------- /CMakeLists.txt: -------------------------------------------------------------------------------- 1 | cmake_minimum_required(VERSION 3.17...3.27) 2 | 3 | # Scikit-build-core sets these values for you, or you can just hard-code the 4 | # name and version. 
5 | project( 6 | ${SKBUILD_PROJECT_NAME} 7 | VERSION ${SKBUILD_PROJECT_VERSION} 8 | DESCRIPTION "C++ bindings for simulation RXC tables" 9 | LANGUAGES CXX) 10 | 11 | set(CMAKE_CXX_STANDARD 14) 12 | # Define CMAKE_INSTALL_xxx: LIBDIR, INCLUDEDIR 13 | include(GNUInstallDirs) 14 | 15 | find_package(Python REQUIRED COMPONENTS Interpreter Development.Module) 16 | find_package(pybind11 CONFIG REQUIRED) 17 | set(SUBPATH ${PROJECT_SOURCE_DIR}/phik/simcore/) 18 | 19 | # ############################################################################## 20 | # build ASA159 library # 21 | # ############################################################################## 22 | if(CMAKE_CXX_COMPILER_ID STREQUAL "GNU") 23 | set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fPIC") 24 | endif() 25 | 26 | add_library(_asa159 OBJECT ${SUBPATH}/asa159.cpp) 27 | 28 | target_include_directories(_asa159 PRIVATE ${SUBPATH}) 29 | # ############################################################################## 30 | # EXECUTABLE # 31 | # ############################################################################## 32 | pybind11_add_module(_phik_simulation_core MODULE ${SUBPATH}/bindings.cpp 33 | ${SUBPATH}/simulation.hpp $) 34 | 35 | target_compile_definitions(_phik_simulation_core 36 | PRIVATE VERSION_INFO=${SKBUILD_PROJECT_VERSION}) 37 | 38 | target_include_directories( 39 | _phik_simulation_core PUBLIC $ 40 | $) 41 | if(PHIK_MBUILD) 42 | set(CMAKE_INSTALL_PREFIX "${PROJECT_SOURCE_DIR}") 43 | endif() 44 | 45 | install(TARGETS _phik_simulation_core LIBRARY DESTINATION "${PROJECT_NAME}/lib") 46 | 47 | # Quiet a warning, since this project is only valid with SKBUILD 48 | set(ignoreMe "${SKBUILD}") 49 | -------------------------------------------------------------------------------- /.github/workflows/wheels.yml: -------------------------------------------------------------------------------- 1 | name: Wheels 2 | 3 | on: 4 | workflow_dispatch: 5 | pull_request: 6 | push: 7 | branches: 8 | - master 9 | release: 10 | types: 11 | - published 12 | 13 | jobs: 14 | make_sdist: 15 | name: Make SDist 16 | runs-on: ubuntu-latest 17 | steps: 18 | - uses: actions/checkout@v6 19 | 20 | - name: Build SDist 21 | run: pipx run build --sdist 22 | 23 | - uses: actions/upload-artifact@v6 24 | with: 25 | name: artifact-sdist 26 | path: dist/*.tar.gz 27 | 28 | 29 | build_wheels: 30 | name: Wheels on ${{ matrix.os }} 31 | runs-on: ${{ matrix.os }} 32 | strategy: 33 | fail-fast: false 34 | matrix: 35 | os: [ubuntu-latest, windows-latest, macos-latest] 36 | 37 | steps: 38 | - uses: actions/checkout@v6 39 | with: 40 | submodules: true 41 | 42 | - uses: actions/setup-python@v6 43 | with: 44 | python-version: "3.12" 45 | 46 | - uses: pypa/cibuildwheel@v3.3.0 47 | env: 48 | CIBW_ENVIRONMENT: MACOSX_DEPLOYMENT_TARGET=10.13 49 | CIBW_BUILD: 'cp38-* cp39-* cp310-* cp311-* cp312-* cp313-* cp314-*' 50 | CIBW_TEST_EXTRAS: test 51 | CIBW_TEST_COMMAND: pytest {project}/tests/test_phik.py -W ignore::DeprecationWarning 52 | CIBW_ARCHS: "auto64" 53 | CIBW_ARCHS_MACOS: "x86_64 arm64" 54 | # Skip 32-bit builds 55 | CIBW_SKIP: "*-win32 *-manylinux_i686 *-musllinux_x86_64" 56 | 57 | - name: Show files 58 | run: ls -lh wheelhouse 59 | shell: bash 60 | 61 | - name: Verify clean directory 62 | run: git diff --exit-code 63 | shell: bash 64 | 65 | - name: Upload wheels 66 | uses: actions/upload-artifact@v6 67 | with: 68 | name: artifact-${{ matrix.os }} 69 | path: wheelhouse/*.whl 70 | 71 | upload_all: 72 | needs: [build_wheels, make_sdist] 73 | runs-on: ubuntu-latest 74 | if: 
github.event_name == 'release' && github.event.action == 'published' 75 | steps: 76 | - uses: actions/download-artifact@v7 77 | with: 78 | pattern: artifact-* 79 | merge-multiple: true 80 | path: dist 81 | 82 | - uses: pypa/gh-action-pypi-publish@release/v1 83 | with: 84 | user: __token__ 85 | password: ${{ secrets.pypi_password }} 86 | -------------------------------------------------------------------------------- /phik/resources.py: -------------------------------------------------------------------------------- 1 | """Project: PhiK - correlation analyzer library 2 | 3 | Created: 2018/11/13 4 | 5 | Description: 6 | Collection of helper functions to get fixtures, i.e. for test data and notebooks. 7 | These are mostly used by the (integration) tests and example notebooks. 8 | 9 | Authors: 10 | KPMG Advanced Analytics & Big Data team, Amstelveen, The Netherlands 11 | 12 | Redistribution and use in source and binary forms, with or without 13 | modification, are permitted according to the terms listed in the file 14 | LICENSE. 15 | """ 16 | 17 | import pathlib 18 | from pathlib import Path 19 | 20 | ROOT_DIRECTORY = Path(__file__).resolve().parent 21 | 22 | 23 | # Fixtures 24 | _FIXTURE = {_.name: _ for _ in pathlib.Path(ROOT_DIRECTORY / "data").glob("*")} 25 | # Tutorial notebooks 26 | _NOTEBOOK = { 27 | _.name: _ for _ in pathlib.Path(ROOT_DIRECTORY / "notebooks").glob("*.ipynb") 28 | } 29 | 30 | # Resource types 31 | _RESOURCES = {"fixture": _FIXTURE, "notebook": _NOTEBOOK} 32 | 33 | 34 | def _resource(resource_type, name: str) -> str: 35 | """Return the full path filename of a resource. 36 | 37 | :param str resource_type: The type of the resource. 38 | :param str name: The name of the resource. 39 | :returns: The full path filename of the fixture data set. 40 | :rtype: str 41 | :raises FileNotFoundError: If the resource cannot be found. 42 | """ 43 | full_path = _RESOURCES[resource_type].get(name, None) 44 | 45 | if full_path and full_path.exists(): 46 | return str(full_path) 47 | 48 | raise FileNotFoundError( 49 | 'Could not find {resource_type} "{name!s}"! Does it exist?'.format( 50 | resource_type=resource_type, name=name 51 | ) 52 | ) 53 | 54 | 55 | def fixture(name: str) -> str: 56 | """Return the full path filename of a fixture data set. 57 | 58 | :param str name: The name of the fixture. 59 | :returns: The full path filename of the fixture data set. 60 | :rtype: str 61 | :raises FileNotFoundError: If the fixture cannot be found. 62 | """ 63 | return _resource("fixture", name) 64 | 65 | 66 | def notebook(name: str) -> str: 67 | """Return the full path filename of a tutorial notebook. 68 | 69 | :param str name: The name of the notebook. 70 | :returns: The full path filename of the notebook. 71 | :rtype: str 72 | :raises FileNotFoundError: If the notebook cannot be found. 73 | """ 74 | return _resource("notebook", name) 75 | -------------------------------------------------------------------------------- /docs/source/phik.rst: -------------------------------------------------------------------------------- 1 | phik package 2 | ============ 3 | 4 | Subpackages 5 | ----------- 6 | 7 | .. toctree:: 8 | 9 | phik.decorators 10 | 11 | Submodules 12 | ---------- 13 | 14 | phik.betainc module 15 | ------------------- 16 | 17 | .. automodule:: phik.betainc 18 | :members: 19 | :undoc-members: 20 | :show-inheritance: 21 | 22 | phik.binning module 23 | ------------------- 24 | 25 | .. 
automodule:: phik.binning 26 | :members: 27 | :undoc-members: 28 | :show-inheritance: 29 | 30 | phik.bivariate module 31 | --------------------- 32 | 33 | .. automodule:: phik.bivariate 34 | :members: 35 | :undoc-members: 36 | :show-inheritance: 37 | 38 | phik.data\_quality module 39 | ------------------------- 40 | 41 | .. automodule:: phik.data_quality 42 | :members: 43 | :undoc-members: 44 | :show-inheritance: 45 | 46 | phik.definitions module 47 | ----------------------- 48 | 49 | .. automodule:: phik.definitions 50 | :members: 51 | :undoc-members: 52 | :show-inheritance: 53 | 54 | phik.entry\_points module 55 | ------------------------- 56 | 57 | .. automodule:: phik.entry_points 58 | :members: 59 | :undoc-members: 60 | :show-inheritance: 61 | 62 | phik.outliers module 63 | -------------------- 64 | 65 | .. automodule:: phik.outliers 66 | :members: 67 | :undoc-members: 68 | :show-inheritance: 69 | 70 | phik.phik module 71 | ---------------- 72 | 73 | .. automodule:: phik.phik 74 | :members: 75 | :undoc-members: 76 | :show-inheritance: 77 | 78 | phik.report module 79 | ------------------ 80 | 81 | .. automodule:: phik.report 82 | :members: 83 | :undoc-members: 84 | :show-inheritance: 85 | 86 | phik.resources module 87 | --------------------- 88 | 89 | .. automodule:: phik.resources 90 | :members: 91 | :undoc-members: 92 | :show-inheritance: 93 | 94 | phik.significance module 95 | ------------------------ 96 | 97 | .. automodule:: phik.significance 98 | :members: 99 | :undoc-members: 100 | :show-inheritance: 101 | 102 | phik.simulation module 103 | ---------------------- 104 | 105 | .. automodule:: phik.simulation 106 | :members: 107 | :undoc-members: 108 | :show-inheritance: 109 | 110 | phik.statistics module 111 | ---------------------- 112 | 113 | .. automodule:: phik.statistics 114 | :members: 115 | :undoc-members: 116 | :show-inheritance: 117 | 118 | phik.version module 119 | ------------------- 120 | 121 | .. automodule:: phik.version 122 | :members: 123 | :undoc-members: 124 | :show-inheritance: 125 | 126 | 127 | Module contents 128 | --------------- 129 | 130 | .. automodule:: phik 131 | :members: 132 | :undoc-members: 133 | :show-inheritance: 134 | -------------------------------------------------------------------------------- /phik/simcore/simulation.hpp: -------------------------------------------------------------------------------- 1 | /* python/phik/simulation/simulation.hpp wrapper and bindings for 2 | * Michael Patefield, 3 | * Algorithm AS 159: An Efficient Method of Generating RXC Tables with Given Row and Column Totals, 4 | * Applied Statistics, 5 | * Volume 30, Number 1, 1981, pages 91-97. 
6 |  * 7 |  * https://people.sc.fsu.edu/~jburkardt/cpp_src/asa159/asa159.html 8 |  */ 9 | 10 | #ifndef PYTHON_PHIK_SIMCORE_SIMULATION_HPP_ 11 | #define PYTHON_PHIK_SIMCORE_SIMULATION_HPP_ 12 | #include "asa159.hpp" 13 | #include <pybind11/pybind11.h> 14 | #include <pybind11/numpy.h> 15 | 16 | namespace py = pybind11; 17 | 18 | struct simulation_error: std::exception { 19 |     const char* p_message; 20 |     explicit simulation_error(const char* message) : p_message(message) {} 21 |     const char* what() const throw() { return p_message; } 22 | }; 23 | 24 | void _sim_2d_data_patefield( 25 |     int nrow, 26 |     int ncol, 27 |     const py::array_t<int>& nrowt, 28 |     const py::array_t<int>& ncolt, 29 |     int seed, 30 |     py::array_t<int>& result 31 | ) { 32 |     bool key = false; 33 |     int ierror = 0; 34 |     int* nrowt_ptr = reinterpret_cast<int*>(nrowt.request().ptr); 35 |     int* ncolt_ptr = reinterpret_cast<int*>(ncolt.request().ptr); 36 |     int* result_ptr = reinterpret_cast<int*>(result.request().ptr); 37 | 38 |     // constructs a random two-way contingency table with given sums, 39 |     // the underlying memory of result is directly modified 40 |     rcont2(nrow, ncol, nrowt_ptr, ncolt_ptr, &key, &seed, result_ptr, &ierror); 41 |     if (ierror != 0) { 42 |         throw simulation_error("Could not construct two-way contingency table"); 43 |     } 44 |     return; 45 | } 46 | 47 | auto docstring = R"pbdoc(Construct a random two-way contingency table with given sums 48 | 49 | Parameters 50 | ---------- 51 | nrow : int 52 |     number of rows in the table, should be >= 2 53 | ncol : int 54 |     number of columns in the table, should be >= 2 55 | nrowt : np.array[int] 56 |     the row sums, note all values should be positive 57 | ncolt : np.array[int] 58 |     the col sums, note all values should be positive 59 | seed : int 60 |     random seed for the generation 61 | result : np.array[int] 62 |     initialized array where the results will be stored 63 | 64 | Reference 65 | --------- 66 | WM Patefield, 67 | Algorithm AS 159: 68 | An Efficient Method of Generating RXC Tables with 69 | Given Row and Column Totals, 70 | Applied Statistics, 71 | Volume 30, Number 1, 1981, pages 91-97. 72 | )pbdoc"; 73 | 74 | void bind_simulation(py::module &m) { 75 |     m.def( 76 |         "_sim_2d_data_patefield", 77 |         &_sim_2d_data_patefield, 78 |         docstring, 79 |         py::arg("nrow"), 80 |         py::arg("ncol"), 81 |         py::arg("nrowt"), 82 |         py::arg("ncolt"), 83 |         py::arg("seed"), 84 |         py::arg("result") 85 |     ); 86 | } 87 | 88 | #endif // PYTHON_PHIK_SIMCORE_SIMULATION_HPP_ 89 | -------------------------------------------------------------------------------- /docs/README.rst: -------------------------------------------------------------------------------- 1 | Generating Documentation with Sphinx 2 | ==================================== 3 | 4 | This README is for generating and writing documentation using Sphinx. 5 | On the repository there should already be the auto-generated files 6 | along with the regular documentation. 7 | 8 | Installing Sphinx 9 | ----------------- 10 | 11 | First install Sphinx. Go to http://www.sphinx-doc.org/en/stable/ or run 12 | 13 | :: 14 | 15 |     pip install -U Sphinx 16 |     pip install -U sphinx-rtd-theme 17 |     conda install -c conda-forge nbsphinx 18 | 19 | The ``docs`` folder has the structure of a Sphinx project. 20 | However, if you want to make a new Sphinx project run: 21 | 22 | :: 23 | 24 |     sphinx-quickstart 25 | 26 | It quickly generates a conf.py file which contains your configuration 27 | for your sphinx build.
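For orientation, a ``conf.py`` for a project like this would typically enable the autodoc machinery and the notebook/theme packages installed above. The fragment below is only an illustrative sketch; the actual settings live in ``docs/source/conf.py`` and may differ:

::

    # illustrative conf.py fragment; see docs/source/conf.py for the real configuration
    project = 'PhiK'
    extensions = [
        'sphinx.ext.autodoc',   # pull API documentation from the phik docstrings
        'sphinx.ext.viewcode',  # link documented objects to their source
        'nbsphinx',             # render the tutorial notebooks
    ]
    html_theme = 'sphinx_rtd_theme'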
28 | 29 | Update the HTML docs 30 | -------------------- 31 | 32 | Now we want Sphinx to autogenerate from docstrings and other 33 | documentation in the code base. Luckily Sphinx has the apidoc 34 | functionality. This goes through a path, finds all the python files and 35 | depending on your arguments, parses certain parts of the code 36 | (docstring, hidden classes, etc.). 37 | 38 | **First make sure your environment is set up properly. Python must be 39 | able to import all modules, otherwise it will not work!** 40 | 41 | From the root of the repository: 42 | 43 | :: 44 | 45 |     $ source setup.sh 46 | 47 | To run the autogeneration of the documentation, type in /docs/: 48 | 49 | :: 50 | 51 |     ./autogenerate.sh 52 | 53 | to scan the pyfiles and generate \*.rst files with the documentation. 54 | The script itself contains the usage of apidoc. 55 | 56 | Now to make the actual documentation files run: 57 | 58 | :: 59 | 60 |     make clean 61 | 62 | to clean up the old make of sphinx and run: 63 | 64 | :: 65 | 66 |     make html 67 | 68 | to make the new html build. It will be stored in (your config can adjust 69 | this, but the default is:) docs/build/html/. The index.html is the 70 | starting page. Open this file to see the result. 71 | 72 | Mounting a different repository to vagrant 73 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 74 | 75 | When you want to develop code that is not part of the repository that 76 | your vagrant is in, you can mount it separately. This is done in 77 | the Vagrantfile, by changing the ``#mount`` line to the path of the repository 78 | that you want to mount: 79 | 80 | :: 81 | 82 |     config.vm.synced_folder "", "", id: "esrepo" 83 | 84 | where the location to mount is e.g. /opt/eskapade. 85 | 86 | What is an .rst file? 87 | ~~~~~~~~~~~~~~~~~~~~~ 88 | 89 | R(e)ST is the format that Sphinx uses; it stands for ReSTructured 90 | (http://docutils.sourceforge.net/docs/user/rst/quickref.html). It looks 91 | for other RST files to import; see index.rst to see how the **toctree** 92 | refers to other files.
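As a concrete example, the ``toctree`` in ``docs/source/index.rst`` of this repository simply lists the other ``.rst`` files (without the extension) that should become part of the document tree:

::

    .. toctree::
       :maxdepth: 2

       introduction
       tutorials
       publication
       developing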
93 | -------------------------------------------------------------------------------- /phik/utils.py: -------------------------------------------------------------------------------- 1 | from typing import Union 2 | 3 | import pandas as pd 4 | import numpy as np 5 | 6 | 7 | def array_like_to_dataframe( 8 | x: Union[pd.Series, list, np.ndarray], y: Union[pd.Series, list, np.ndarray] 9 | ) -> pd.DataFrame: 10 | """Concat two array-like data structures into a DataFrame 11 | 12 | :param x: pd.Series, list or np.ndarray 13 | :param y: pd.Series, list or np.ndarray 14 | :return: pd.DataFrame 15 | """ 16 | x_name = x.name if isinstance(x, pd.Series) else "x" 17 | y_name = y.name if isinstance(y, pd.Series) else "y" 18 | 19 | return pd.DataFrame(np.array([x, y]).T, columns=[x_name, y_name]) 20 | 21 | 22 | def guess_interval_cols(df: pd.DataFrame, verbose: bool = False) -> list: 23 | """Select columns that have a dtype part of np.number 24 | 25 | :param df: DataFrame 26 | :param bool verbose: print all interval columns that are guessed 27 | :return: list of interval columns 28 | """ 29 | interval_cols = df.select_dtypes(include=[np.number]).columns.tolist() 30 | if interval_cols and verbose: 31 | print("interval columns not set, guessing: {}".format(str(interval_cols))) 32 | 33 | if not isinstance(interval_cols, list): 34 | raise ValueError("Could not guess interval columns") 35 | return interval_cols 36 | 37 | 38 | def make_shapes_equal(observed: pd.DataFrame, expected: pd.DataFrame) -> pd.DataFrame: 39 | """Make observed and expected (pandas) histograms equal in shape 40 | 41 | Sometimes expected histogram shape need filling / pruning to make its shape equal to observed. Give expected the 42 | same number of columns and rows. Needed for proper histogram comparison. 43 | 44 | :param pd.DataFrame observed: The observed contingency table. The table contains the observed number of occurrences in each cell. 45 | :param pd.DataFrame expected: The expected contingency table. The table contains the expected number of occurrences in each cell. 46 | :return: expected frequencies, having the same shape as observed 47 | """ 48 | # columns 49 | o_cols = observed.columns.tolist() 50 | e_cols = expected.columns.tolist() 51 | o_cols_missing = list(set(e_cols) - set(o_cols)) 52 | e_cols_missing = list(set(o_cols) - set(e_cols)) 53 | # index 54 | o_idx = observed.index.tolist() 55 | e_idx = expected.index.tolist() 56 | o_idx_missing = list(set(e_idx) - set(o_idx)) 57 | e_idx_missing = list(set(o_idx) - set(e_idx)) 58 | 59 | # make expected columns equal to observed 60 | for c in o_cols_missing: 61 | observed[c] = 0.0 62 | for c in e_cols_missing: 63 | expected[c] = 0.0 64 | observed.columns = sorted(observed.columns) 65 | expected.columns = sorted(expected.columns) 66 | # this should always be a match now 67 | assert len(observed.columns) == len(expected.columns) 68 | 69 | # make expected index equal to observed 70 | for i in o_idx_missing: 71 | observed.loc[i] = np.zeros(len(observed.columns)) 72 | for i in e_idx_missing: 73 | expected.loc[i] = np.zeros(len(expected.columns)) 74 | # this should always be a match now 75 | assert len(observed.index) == len(expected.index) 76 | 77 | return expected 78 | -------------------------------------------------------------------------------- /docs/source/introduction.rst: -------------------------------------------------------------------------------- 1 | ====================== 2 | Why did we build this? 
3 | ====================== 4 | 5 | When exploring a data set, for example to model one variable in terms of the others, 6 | it is useful to summarize the dependencies between the variables, assess their significances, and 7 | visualize the individual variable dependencies. 8 | The ``PhiK`` correlation analyzer library contains several useful functions to help one do so. 9 | 10 | * This library implements a novel correlation coefficient, :math:`\phi_{K}`, with properties that - taken together - form 11 | an advantage over existing methods. 12 | 13 | The calculation of correlation coefficients between paired data variables is a standard tool of analysis for every data analyst. 14 | Pearson's correlation coefficient is a de facto standard in most fields, but by construction only works for interval variables 15 | (sometimes called continuous variables). Pearson is unsuitable for data sets with mixed variable types, 16 | e.g. where some variables are ordinal or categorical. 17 | 18 | While many correlation coefficients exist, each with different features, we have not been able to find a 19 | correlation coefficient with Pearson-like characteristics 20 | and a sound statistical interpretation that works for interval, ordinal and categorical variable types alike. 21 | 22 | The correlation coefficient :math:`\phi_{K}` follows a uniform treatment for interval, ordinal and categorical variables, 23 | captures non-linear dependencies, and is similar to Pearson's correlation coefficient in case of a bivariate normal input distribution. 24 | 25 | * We found that, by default, popular analysis libraries such ``R`` and ``scipy`` make incorrect ("asymptotic") assumptions when assessing 26 | the statistical significance of the :math:`\chi^2` contingency test of variable independence. In particular, the actual number of 27 | degrees of freedom and the shape of the test statistic distribution can differ significantly from their theoretical 28 | predictions in case of low to medium statistics data samples. This leads to incorrect p-values for the hypothesis test of variable 29 | independence. A prescription has been implemented to fix these two mistakes. 30 | 31 | * Visualizing the dependency between variables can be tricky, especially when dealing with (unordered) categorical variables. 32 | To help interpret any variable relationship found, we provide a method for the detection of 33 | significant excesses or deficits of records with respect to the expected values in a contingency table, so-called outliers, 34 | using a statistically independent evaluation for expected frequency of records, accounting for the uncertainty on the expectation. 35 | We evaluate the significance of each outlier frequency in a table, and normalize and visualize these accordingly. 36 | The resulting plots we find to be very valuable to help interpret variable dependencies, 37 | and work alike for interval, ordinal and categorical variables. 38 | 39 | The ``PhiK`` analysis library is particularly useful in modern-day analysis when studying the dependencies between a set of 40 | variables with mixed types, where often some variables are categorical. 41 | The package has been used by us to study surveys, insurance claims, correlograms, etc. 42 | 43 | For details on the methodology behind the calculations, please see our publication. 44 | For the available examples on how to use the methods, please see the `tutorials `_ section. 
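As a short, condensed sketch of the points above, based on the example script shipped with the package: the correlation matrix, the significances of the variable dependencies, and the outlier significances of a single variable pair can all be obtained through the pandas methods that ``phik`` registers on import.

.. code-block:: python

    import pandas as pd
    import phik  # importing phik adds the methods used below to pandas DataFrames
    from phik import resources

    # fake car insurance data shipped with the package
    df = pd.read_csv(resources.fixture('fake_insurance_data.csv.gz'))

    df.phik_matrix()          # phi_K correlation matrix between all variables
    df.significance_matrix()  # one-sided Z of the independence test per variable pair
    df[['mileage', 'car_size']].outlier_significance_matrix()  # per-cell outlier significances

Interval variables are binned automatically before the contingency tables are formed; the tutorials show how to control this binning and the significance calculation in detail.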
45 | -------------------------------------------------------------------------------- /CHANGES.rst: -------------------------------------------------------------------------------- 1 | ============= 2 | Release notes 3 | ============= 4 | 5 | Version 0.12.5, Jul 2025 6 | ------------------------ 7 | 8 | - FIX: scipy 1.16.0 no longer supports mvn, code now migrated to qmvn. 9 | https://github.com/KaveIO/PhiK/issues/101 10 | https://github.com/KaveIO/PhiK/pull/102 11 | - Drop support for Python 3.8, has reached end of life. 12 | 13 | Version 0.12.4, Jan 2024 14 | ------------------------ 15 | 16 | - Add support for Python 3.12. 17 | - ENH: added plotting kwargs to correlation_report function. 18 | https://github.com/KaveIO/PhiK/issues/58 19 | - FIX: fix of bin edge values they are rounded with 1e-14 20 | https://github.com/KaveIO/PhiK/issues/60 21 | - FIX: numpy random multinomial requires integer number of samples (for nixOS) 22 | https://github.com/KaveIO/PhiK/issues/73 23 | - FIX: pandas deprecation warning 24 | https://github.com/KaveIO/PhiK/pull/74 25 | - Drop support for Python 3.7, has reached end of life. 26 | 27 | Version 0.12.3, Dec 2022 28 | ------------------------ 29 | 30 | - Add support for Python 3.11 31 | 32 | Version 0.12.2, Mar 2022 33 | ------------------------ 34 | 35 | - Fix missing setup.py and pyproject.toml in source distribution 36 | - Support wheels ARM MacOS (Apple silicone) 37 | 38 | Version 0.12.1, Mar 2022 39 | ------------------------ 40 | 41 | - Two fixes to make calculation of global phik robust: global phik capped in range [0, 1], 42 | and check for successful correlation matrix inversion. 43 | - Migration to to scikit-build 0.13.1. 44 | - Support wheels for Python 3.10. 45 | 46 | 47 | Version 0.12.0, July 2021 48 | ------------------------- 49 | 50 | C++ Extension 51 | ~~~~~~~~~~~~~ 52 | 53 | Phi_K contains an optional C++ extension to compute the significance matrix using the `hypergeometric` method 54 | (also called the`Patefield` method). 55 | 56 | Note that the PyPi distributed wheels contain a pre-build extension for Linux, MacOS and Windows. 57 | 58 | A manual (pip) setup will attempt to build and install the extension, if it fails it will install without the extension. 59 | If so, using the `hypergeometric` method without the extension will trigger a 60 | NotImplementedError. 61 | 62 | Compiler requirements through Pybind11: 63 | 64 | - Clang/LLVM 3.3 or newer (for Apple Xcode's clang, this is 5.0.0 or newer) 65 | - GCC 4.8 or newer 66 | - Microsoft Visual Studio 2015 Update 3 or newer 67 | - Intel classic C++ compiler 18 or newer (ICC 20.2 tested in CI) 68 | - Cygwin/GCC (previously tested on 2.5.1) 69 | - NVCC (CUDA 11.0 tested in CI) 70 | - NVIDIA PGI (20.9 tested in CI) 71 | 72 | 73 | Other 74 | ~~~~~ 75 | 76 | * You can now manually set the number of parallel jobs in the evaluation of Phi_K or its statistical significance 77 | (when using MC simulations). For example, to use 4 parallel jobs do: 78 | 79 | .. code-block:: python 80 | 81 | df.phik_matrix(njobs = 4) 82 | df.significance_matrix(njobs = 4) 83 | 84 | The default value is -1, in which case all available cores are used. When using ``njobs=1`` no parallel processing 85 | is applied. 86 | 87 | * Phi_K can now be calculated with an independent expectation histogram: 88 | 89 | .. 
code-block:: python 90 | 91 | from phik.phik import phik_from_hist2d 92 | 93 | cols = ["mileage", "car_size"] 94 | interval_cols = ["mileage"] 95 | 96 | observed = df1[["feature1", "feature2"]].hist2d() 97 | expected = df2[["feature1", "feature2"]].hist2d() 98 | 99 | phik_value = phik_from_hist2d(observed=observed, expected=expected) 100 | 101 | The expected histogram is taken to be (relatively) large in number of counts 102 | compared with the observed histogram. 103 | 104 | Or can compare two (pre-binned) datasets against each other directly. Again the expected dataset 105 | is assumed to be relatively large: 106 | 107 | .. code-block:: python 108 | 109 | from phik.phik import phik_observed_vs_expected_from_rebinned_df 110 | 111 | phik_matrix = phik_observed_vs_expected_from_rebinned_df(df1_binned, df2_binned) 112 | 113 | * Added links in the readme to the basic and advanced Phi_K tutorials on google colab. 114 | * Migrated the spark example Phi_K notebook from popmon to directly using histogrammar for histogram creation. 115 | 116 | 117 | 118 | 119 | Older versions 120 | -------------- 121 | 122 | * Please see documentation for full details: https://phik.readthedocs.io 123 | -------------------------------------------------------------------------------- /phik/betainc.py: -------------------------------------------------------------------------------- 1 | """Project: PhiK - correlation analyzer library 2 | 3 | Created: 2018/09/05 4 | 5 | Description: 6 | Implementation of incomplete beta function 7 | 8 | Authors: 9 | KPMG Advanced Analytics & Big Data team, Amstelveen, The Netherlands 10 | 11 | Redistribution and use in source and binary forms, with or without 12 | modification, are permitted according to the terms listed in the file 13 | LICENSE. 14 | """ 15 | import numpy as np 16 | from scipy.special import gammaln 17 | from typing import Tuple 18 | 19 | 20 | def contfractbeta( 21 | a: float, b: float, x: float, ITMAX: int = 5000, EPS: float = 1.0e-7 22 | ) -> float: 23 | """Continued fraction form of the incomplete Beta function. 24 | 25 | Code translated from: Numerical Recipes in C. 26 | 27 | Example kindly taken from blog: 28 | https://malishoaib.wordpress.com/2014/04/15/the-beautiful-beta-functions-in-raw-python/ 29 | 30 | :param float a: a 31 | :param float b: b 32 | :param float x: x 33 | :param int ITMAX: max number of iterations, default is 5000. 34 | :param float EPS: epsilon precision parameter, default is 1e-7. 35 | :returns: continued fraction form 36 | :rtype: float 37 | """ 38 | az = 1.0 39 | bm = 1.0 40 | am = 1.0 41 | qab = a + b 42 | qap = a + 1.0 43 | qam = a - 1.0 44 | bz = 1.0 - qab * x / qap 45 | 46 | for i in range(ITMAX + 1): 47 | em = float(i + 1) 48 | tem = em + em 49 | d = em * (b - em) * x / ((qam + tem) * (a + tem)) 50 | ap = az + d * am 51 | bp = bz + d * bm 52 | d = -(a + em) * (qab + em) * x / ((qap + tem) * (a + tem)) 53 | app = ap + d * az 54 | bpp = bp + d * bz 55 | aold = az 56 | am = ap / bpp 57 | bm = bp / bpp 58 | az = app / bpp 59 | bz = 1.0 60 | if abs(az - aold) < EPS * abs(az): 61 | return az 62 | 63 | raise ValueError( 64 | "a={0:f} or b={1:f} too large, or ITMAX={2:d} too small to compute incomplete beta function.".format( 65 | a, b, ITMAX 66 | ) 67 | ) 68 | 69 | 70 | def incompbeta(a: float, b: float, x: float) -> float: 71 | """Evaluation of incomplete beta function. 72 | 73 | Code translated from: Numerical Recipes in C. 74 | 75 | Here a, b > 0 and 0 <= x <= 1. 
76 | This function requires contfractbeta(a,b,x, ITMAX = 200) 77 | 78 | Example kindly taken from blog: 79 | https://malishoaib.wordpress.com/2014/04/15/the-beautiful-beta-functions-in-raw-python/ 80 | 81 | :param float a: a 82 | :param float b: b 83 | :param float x: x 84 | :returns: incomplete beta function 85 | :rtype: float 86 | """ 87 | # special cases 88 | if x == 0: 89 | return 0 90 | elif x == 1: 91 | return 1 92 | # default 93 | lbeta = gammaln(a + b) - gammaln(a) - gammaln(b) + a * np.log(x) + b * np.log(1 - x) 94 | if x < (a + 1) / (a + b + 2): 95 | p = np.exp(lbeta) * contfractbeta(a, b, x) / a 96 | else: 97 | p = 1 - np.exp(lbeta) * contfractbeta(b, a, 1 - x) / b 98 | return p 99 | 100 | 101 | def log_incompbeta(a: float, b: float, x: float) -> Tuple[float, float]: 102 | """Evaluation of logarithm of incomplete beta function 103 | 104 | Logarithm of incomplete beta function is implemented to ensure sufficient precision 105 | for values very close to zero and one. 106 | 107 | Code translated from: Numerical Recipes in C. 108 | 109 | Here a, b > 0 and 0 <= x <= 1. 110 | This function requires contfractbeta(a,b,x, ITMAX = 200) 111 | 112 | Example kindly taken from blog: 113 | https://malishoaib.wordpress.com/2014/04/15/the-beautiful-beta-functions-in-raw-python/ 114 | 115 | :param float a: a 116 | :param float b: b 117 | :param float x: x 118 | :returns: tuple of log(incb) and log(1-incb) 119 | :rtype: tuple 120 | """ 121 | # special cases 122 | if x == 0: 123 | return -np.inf, 0 124 | elif x == 1: 125 | return 0, -np.inf 126 | # default 127 | lbeta = gammaln(a + b) - gammaln(a) - gammaln(b) + a * np.log(x) + b * np.log(1 - x) 128 | 129 | if x < (a + 1) / (a + b + 2): 130 | p = np.exp(lbeta) * contfractbeta(a, b, x) / a 131 | logp = lbeta + np.log(contfractbeta(a, b, x)) - np.log(a) 132 | logq = np.log(1 - p) 133 | else: 134 | p = 1 - np.exp(lbeta) * (contfractbeta(b, a, 1 - x) / b) 135 | logp = np.log(p) 136 | logq = lbeta + np.log(contfractbeta(b, a, 1 - x)) - np.log(b) 137 | return logp, logq 138 | -------------------------------------------------------------------------------- /README.rst: -------------------------------------------------------------------------------- 1 | ========================== 2 | Phi_K Correlation Constant 3 | ========================== 4 | 5 | * Version: 0.12.5. Released: Jul 2025 6 | * Release notes: https://github.com/KaveIO/PhiK/blob/master/CHANGES.rst 7 | * Repository: https://github.com/kaveio/phik 8 | * Documentation: https://phik.readthedocs.io 9 | * Publication: `[offical] `_ `[arxiv pre-print] `_ 10 | 11 | Phi_K is a practical correlation constant that works consistently between categorical, ordinal and interval variables. 12 | It is based on several refinements to Pearson's hypothesis test of independence of two variables. Essentially, the 13 | contingency test statistic of two variables is interpreted as if coming from a rotated bi-variate normal distribution, 14 | where the tilt is interpreted as Phi_K. 15 | 16 | The combined features of Phi_K form an advantage over existing coefficients. First, it works consistently between categorical, ordinal and interval variables. 17 | Second, it captures non-linear dependency. Third, it reverts to the Pearson correlation coefficient in case of a bi-variate normal input distribution. 18 | These are useful features when studying the correlation matrix of variables with mixed types. 19 | 20 | For details on the methodology behind the calculations, please see our publication. 
Emphasis is paid to the proper evaluation of statistical significance of correlations and to the interpretation of variable relationships 21 | in a contingency table, in particular in case of low statistics samples. 22 | The presented algorithms are easy to use and available through this public Python library. 23 | 24 | Example notebooks 25 | ================= 26 | 27 | .. list-table:: 28 | :widths: 60 40 29 | :header-rows: 1 30 | 31 | * - Static link 32 | - Google Colab link 33 | * - `basic tutorial `_ 34 | - `basic on colab `_ 35 | * - `advanced tutorial (detailed configuration) `_ 36 | - `advanced on colab `_ 37 | * - `spark tutorial `_ 38 | - no spark available 39 | 40 | Documentation 41 | ============= 42 | 43 | The entire Phi_K documentation including tutorials can be found at `read-the-docs `_. 44 | See the tutorials for detailed examples on how to run the code with pandas. We also have one example on how 45 | calculate the Phi_K correlation matrix for a spark dataframe. 46 | 47 | Check it out 48 | ============ 49 | 50 | The Phi_K library requires Python >= 3.8 and is pip friendly. To get started, simply do: 51 | 52 | .. code-block:: bash 53 | 54 | $ pip install phik 55 | 56 | or check out the code from out GitHub repository: 57 | 58 | .. code-block:: bash 59 | 60 | $ git clone https://github.com/KaveIO/PhiK.git 61 | $ pip install -e PhiK/ 62 | 63 | where in this example the code is installed in edit mode (option -e). 64 | 65 | You can now use the package in Python with: 66 | 67 | .. code-block:: python 68 | 69 | import phik 70 | 71 | **Congratulations, you are now ready to use the PhiK correlation analyzer library!** 72 | 73 | 74 | Quick run 75 | ========= 76 | 77 | As a quick example, you can do: 78 | 79 | .. code-block:: python 80 | 81 | import pandas as pd 82 | import phik 83 | from phik import resources, report 84 | 85 | # open fake car insurance data 86 | df = pd.read_csv( resources.fixture('fake_insurance_data.csv.gz') ) 87 | df.head() 88 | 89 | # Pearson's correlation matrix between numeric variables (pandas functionality) 90 | df.corr() 91 | 92 | # get the phi_k correlation matrix between all variables 93 | df.phik_matrix() 94 | 95 | # get global correlations based on phi_k correlation matrix 96 | df.global_phik() 97 | 98 | # get the significance matrix (expressed as one-sided Z) 99 | # of the hypothesis test of each variable-pair dependency 100 | df.significance_matrix() 101 | 102 | # contingency table of two columns 103 | cols = ['mileage','car_size'] 104 | df[cols].hist2d() 105 | 106 | # normalized residuals of contingency test applied to cols 107 | df[cols].outlier_significance_matrix() 108 | 109 | # show the normalized residuals of each variable-pair 110 | df.outlier_significance_matrices() 111 | 112 | # generate a phik correlation report and save as test.pdf 113 | report.correlation_report(df, pdf_file_name='test.pdf') 114 | 115 | 116 | For all available examples, please see the `tutorials `_ at read-the-docs. 117 | 118 | 119 | Contact and support 120 | =================== 121 | 122 | * Issues and Ideas: https://github.com/kaveio/phik/issues 123 | 124 | Please note that support is (only) provided on a best-effort basis. 
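Finally, as a quick sanity check of the claim in the introduction that Phi_K reverts to the Pearson
correlation coefficient for a bi-variate normal input distribution, the two can be compared on
simulated data. The snippet below is only an illustrative sketch (column names, sample size and the
random seed are arbitrary choices):

.. code-block:: python

    import numpy as np
    import pandas as pd
    import phik  # noqa: F401  -- importing phik registers the DataFrame methods used below

    # draw a bi-variate normal sample with true correlation 0.6
    rng = np.random.default_rng(42)
    xy = rng.multivariate_normal(mean=[0, 0], cov=[[1.0, 0.6], [0.6, 1.0]], size=10000)
    df = pd.DataFrame(xy, columns=["x", "y"])

    # for this input the Pearson matrix (pandas) and the Phi_K matrix should show
    # similar off-diagonal values (~0.6), up to binning and sampling effects
    print(df.corr())
    print(df.phik_matrix())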
125 | -------------------------------------------------------------------------------- /phik/data_quality.py: -------------------------------------------------------------------------------- 1 | """Project: PhiK - correlation analyzer library 2 | 3 | Created: 2018/12/28 4 | 5 | Description: 6 | A set of functions to check for data quality issues in input data. 7 | 8 | Authors: 9 | KPMG Advanced Analytics & Big Data team, Amstelveen, The Netherlands 10 | 11 | Redistribution and use in source and binary forms, with or without 12 | modification, are permitted according to the terms listed in the file 13 | LICENSE. 14 | """ 15 | 16 | import warnings 17 | import copy 18 | from typing import Tuple 19 | 20 | import pandas as pd 21 | import numpy as np 22 | 23 | 24 | def dq_check_nunique_values( 25 | df: pd.DataFrame, interval_cols: list, dropna: bool = True 26 | ) -> Tuple[pd.DataFrame, list]: 27 | """ 28 | Basic data quality checks per column in a DataFrame. 29 | 30 | The following checks are done: 31 | 32 | 1. For all non-interval variables, if the number of unique values per variable is larger than 100 a warning is printed. 33 | When the number of unique values is large, the variable is likely to be an interval variable. Calculation of phik 34 | will be slow(ish) for pairs of variables where one (or two) have many different values (i.e. many bins). 35 | 36 | 2. For all interval variables, the number of unique values must be at least two. If the number of unique values is 37 | zero (i.e. all NaN) the column is removed. If the number of unique values is one, it is not possible to 38 | automatically create a binning for this variable (as min and max are the same). The variable is therefore dropped, 39 | irrespective of whether dropna is True or False. 40 | 41 | 3. For all non-interval variables, the number of unique values must be at least either 42 | a) 1 if dropna=False (NaN is now also considered a valid category), or 43 | b) 2 if dropna=True 44 | 45 | The function returns a DataFrame where all columns with invalid data are removed. Also the list of interval_cols 46 | is updated and returned. 47 | 48 | :param pd.DataFrame df: input data 49 | :param list interval_cols: column names of columns with interval variables. 50 | :param bool dropna: remove NaN values when True 51 | :returns: cleaned data, updated list of interval columns 52 | """ 53 | # check for existing columns 54 | interval_cols = [col for col in interval_cols if col in df.columns] 55 | 56 | # check non-interval variable for number of unique values 57 | for col in sorted(list(set(df.columns) - set(interval_cols))): 58 | if df[col].nunique() > 1000: 59 | warnings.warn( 60 | "The number of unique values of variable {0:s} is large: {1:d}. Are you sure this is " 61 | "not an interval variable? Analysis for pairs of variables including {0:s} can be slow.".format( 62 | col, df[col].nunique() 63 | ) 64 | ) 65 | 66 | drop_cols = [] 67 | 68 | # check for interval values whether there are at least two unique values (otherwise I cannot bin automatically) 69 | for col in interval_cols: 70 | if df[col].nunique() < 2: 71 | drop_cols.append(col) 72 | warnings.warn( 73 | "Not enough unique value for variable {0:s} for analysis {1:d}. 
Dropping this column".format( 74 | col, df[col].nunique() 75 | ) 76 | ) 77 | 78 | # check non-interval values whether there are at least two different values OR 1 value and NaN if dropna==False 79 | for col in sorted(list(set(df.columns) - set(interval_cols))): 80 | if df[col].nunique() == 0 or (df[col].nunique() == 1 and dropna): 81 | drop_cols.append(col) 82 | warnings.warn( 83 | "Not enough unique value for variable {0:s} for analysis {1:d}. Dropping this column".format( 84 | col, df[col].nunique() 85 | ) 86 | ) 87 | 88 | df_clean = df.copy() 89 | interval_cols_clean = copy.copy(interval_cols) 90 | if len(drop_cols) > 0: 91 | # preserves column order: https://github.com/KaveIO/PhiK/issues/1 92 | df_clean.drop(columns=drop_cols, inplace=True) 93 | interval_cols_clean = [col for col in interval_cols if col not in drop_cols] 94 | 95 | return df_clean, interval_cols_clean 96 | 97 | 98 | def dq_check_hist2d(hist2d: np.ndarray) -> bool: 99 | """Basic data quality checks for a contingency table 100 | 101 | The Following checks are done: 102 | 103 | 1. There must be at least two bins in both the x and y direction. 104 | 105 | 2. If the number of bins in the x and/or y direction is larger than 100 a warning is printed. 106 | 107 | :param hist2d: contingency table 108 | :return: bool passed_check 109 | """ 110 | 111 | if 0 in hist2d.shape or 1 in hist2d.shape: 112 | warnings.warn( 113 | "Too few unique values for variable x ({0:d}) or y ({1:d})".format( 114 | hist2d.shape[0], hist2d.shape[1] 115 | ) 116 | ) 117 | return False 118 | if hist2d.shape[0] > 1000: 119 | warnings.warn( 120 | "The number of unique values of variable x is large: {0:d}. " 121 | "Are you sure this is not an interval variable? Analysis might be slow.".format( 122 | hist2d.shape[0] 123 | ) 124 | ) 125 | if hist2d.shape[1] > 1000: 126 | warnings.warn( 127 | "The number of unique values of variable y is large: {0:d}. " 128 | "Are you sure this is not an interval variable? Analysis might be slow.".format( 129 | hist2d.shape[0] 130 | ) 131 | ) 132 | 133 | return True 134 | -------------------------------------------------------------------------------- /docs/source/conf.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # 3 | # PhiK documentation build configuration file for sphinx. 4 | # 5 | # 6 | 7 | import os 8 | #from unittest.mock import MagicMock 9 | 10 | import phik 11 | 12 | 13 | # Classes that use non-python modules are not always available in the 14 | # RTD environment. By mocking them we can still import these classes 15 | # in the code and RTD can subsequently go through the code and get 16 | # the docstrings. 17 | 18 | #class Mock(MagicMock): 19 | # @classmethod 20 | # def __getattr__(cls, name): 21 | # return MagicMock() 22 | 23 | # If extensions (or modules to document with autodoc) are in another directory, 24 | # add these directories to sys.path here. If the directory is relative to the 25 | # documentation root, use os.path.abspath to make it absolute, like shown here. 26 | # sys.path.insert(0, os.path.abspath('')) 27 | 28 | # -- General configuration ------------------------------------------------ 29 | 30 | # If your documentation needs a minimal Sphinx version, state it here. 31 | # needs_sphinx = '1.0' 32 | 33 | # Add any Sphinx extension module names here, as strings. They can be 34 | # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom 35 | # ones. 
36 | extensions = [ 37 | 'sphinx.ext.autodoc', 38 | 'sphinx.ext.mathjax', 39 | 'sphinx.ext.ifconfig', 40 | ] 41 | 42 | # Add any paths that contain templates here, relative to this directory. 43 | templates_path = ['_templates'] 44 | 45 | # The suffix(es) of source filenames. 46 | # You can specify multiple suffix as a list of string: 47 | # source_suffix = ['.rst', '.md'] 48 | source_suffix = '.rst' 49 | 50 | # The encoding of source files. 51 | # source_encoding = 'utf-8-sig' 52 | 53 | # The master toctree document. 54 | master_doc = 'index' 55 | 56 | # General information about the project. 57 | project = 'Phi_K correlation library' 58 | copyright = '2018, KPMG Advisory N.V.' 59 | author = 'KPMG Advanced Analytics & Big Data team' 60 | version = phik.__version__ 61 | 62 | # The language for content autogenerated by Sphinx. Refer to documentation 63 | # for a list of supported languages. 64 | # 65 | # This is also used if you do content translation via gettext catalogs. 66 | # Usually you set "language" from the command line for these cases. 67 | language = 'en' 68 | 69 | # List of patterns, relative to source directory, that match files and 70 | # directories to ignore when looking for source files. 71 | exclude_patterns = ['*test*', 'phik.tutorials.*'] 72 | 73 | # The name of the Pygments (syntax highlighting) style to use. 74 | pygments_style = 'sphinx' 75 | 76 | # If true, `todo` and `todoList` produce output, else they produce nothing. 77 | todo_include_todos = False 78 | 79 | # -- Options for HTML output ---------------------------------------------- 80 | 81 | # on_rtd is whether we are on readthedocs.org, this line of code grabbed from docs.readthedocs.org 82 | on_rtd = os.environ.get('READTHEDOCS', None) == 'True' 83 | 84 | if not on_rtd: 85 | import sphinx_rtd_theme 86 | 87 | html_theme = "sphinx_rtd_theme" 88 | html_theme_path = [sphinx_rtd_theme.get_html_theme_path()] 89 | # otherwise, readthedocs.org uses their theme by default, so no need to specify it 90 | 91 | # Add any paths that contain custom static files (such as style sheets) here, 92 | # relative to this directory. They are copied after the builtin static files, 93 | # so a file named "default.css" will overwrite the builtin "default.css". 94 | html_static_path = ['_static'] 95 | 96 | # If false, no index is generated. 97 | html_use_index = True 98 | 99 | # If true, the index is split into individual pages for each letter. 100 | # html_split_index = False 101 | 102 | # If true, links to the reST sources are added to the pages. 103 | html_show_sourcelink = True 104 | 105 | # If true, "Created using Sphinx" is shown in the HTML footer. Default is True. 106 | # html_show_sphinx = True 107 | 108 | # If true, "(C) Copyright ..." is shown in the HTML footer. Default is True. 109 | html_show_copyright = True 110 | 111 | # Language to be used for generating the HTML full-text search index. 112 | # Sphinx supports the following languages: 113 | # 'da', 'de', 'en', 'es', 'fi', 'fr', 'hu', 'it', 'ja' 114 | # 'nl', 'no', 'pt', 'ro', 'ru', 'sv', 'tr' 115 | html_search_language = 'en' 116 | 117 | # Output file base name for HTML help builder. 118 | htmlhelp_basename = 'PhiKdoc' 119 | 120 | # -- Options for LaTeX output --------------------------------------------- 121 | 122 | latex_elements = { 123 | # The paper size ('letterpaper' or 'a4paper'). 124 | # 'papersize': 'letterpaper', 125 | 126 | # The font size ('10pt', '11pt' or '12pt'). 127 | # 'pointsize': '10pt', 128 | 129 | # Additional stuff for the LaTeX preamble. 
130 | # 'preamble': '', 131 | 132 | # Latex figure (float) alignment 133 | # 'figure_align': 'htbp', 134 | } 135 | 136 | # Grouping the document tree into LaTeX files. List of tuples 137 | # (source start file, target name, title, 138 | # author, documentclass [howto, manual, or own class]). 139 | latex_documents = [ 140 | (master_doc, 'PhiK.tex', 'PhiK Documentation', 141 | 'KPMG Advanced Analytics & Big Data team', 'manual'), 142 | ] 143 | 144 | # -- Options for manual page output --------------------------------------- 145 | 146 | # One entry per manual page. List of tuples 147 | # (source start file, name, description, authors, manual section). 148 | man_pages = [ 149 | (master_doc, 'phik', 'PhiK Documentation', 150 | [author], 1) 151 | ] 152 | 153 | # -- Options for Texinfo output ------------------------------------------- 154 | 155 | # Grouping the document tree into Texinfo files. List of tuples 156 | # (source start file, target name, title, author, 157 | # dir menu entry, description, category) 158 | texinfo_documents = [ 159 | (master_doc, 'PhiK', 'PhiK Documentation', 160 | author, 'PhiK', 'One line description of project.', 161 | 'Miscellaneous'), 162 | ] 163 | 164 | 165 | def skip(app, what, name, obj, skip, options): 166 | if name == "__init__": 167 | return False 168 | return skip 169 | 170 | 171 | def setup(app): 172 | app.connect("autodoc-skip-member", skip) 173 | -------------------------------------------------------------------------------- /phik/notebooks/phik_tutorial_spark.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Phi_K spark tutorial\n", 8 | "\n", 9 | "This notebook shows you how to obtain the Phi_K correlation matrix for a spark dataframe.\n", 10 | "Calculating the Phi_K matrix consists of two steps:\n", 11 | "\n", 12 | "- Obtain the 2d contingency tables for all variable pairs. To make these we use the [`histogrammar` package](https://github.com/histogrammar/histogrammar-python).\n", 13 | "- Calculate the Phi_K value for each variable pair from its contingency table.\n", 14 | "\n", 15 | "Make sure you install the histogrammar package to make the 2d histograms, that are then used to calculate phik." 
16 | ] 17 | }, 18 | { 19 | "cell_type": "code", 20 | "execution_count": null, 21 | "metadata": {}, 22 | "outputs": [], 23 | "source": [ 24 | "%%capture\n", 25 | "# install histogrammar (if not installed yet)\n", 26 | "import sys\n", 27 | "\n", 28 | "!\"{sys.executable}\" -m pip install histogrammar" 29 | ] 30 | }, 31 | { 32 | "cell_type": "code", 33 | "execution_count": null, 34 | "metadata": {}, 35 | "outputs": [], 36 | "source": [ 37 | "import itertools\n", 38 | "\n", 39 | "import pandas as pd\n", 40 | "import histogrammar as hg\n", 41 | "from histogrammar.plot.hist_numpy import get_2dgrid\n", 42 | "\n", 43 | "import phik\n", 44 | "from phik import resources\n", 45 | "from phik.phik import spark_phik_matrix_from_hist2d_dict" 46 | ] 47 | }, 48 | { 49 | "cell_type": "markdown", 50 | "metadata": {}, 51 | "source": [ 52 | "# histogramming is done using the histogrammar library" 53 | ] 54 | }, 55 | { 56 | "cell_type": "code", 57 | "execution_count": null, 58 | "metadata": {}, 59 | "outputs": [], 60 | "source": [ 61 | "from pyspark.sql import SparkSession\n", 62 | "from pyspark import __version__ as pyspark_version\n", 63 | "\n", 64 | "scala = '2.12' if int(pyspark_version[0]) >= 3 else '2.11'\n", 65 | "hist_jar = f'io.github.histogrammar:histogrammar_{scala}:1.0.20'\n", 66 | "hist_spark_jar = f'io.github.histogrammar:histogrammar-sparksql_{scala}:1.0.20'\n", 67 | "\n", 68 | "spark = SparkSession.builder.config(\n", 69 | " \"spark.jars.packages\", f'{hist_spark_jar},{hist_jar}'\n", 70 | ").getOrCreate()\n", 71 | "\n", 72 | "spark = SparkSession.builder.config(\n", 73 | " \"spark.jars.packages\", f'{hist_spark_jar},{hist_jar}'\n", 74 | ").getOrCreate()\n", 75 | "\n", 76 | "sc = spark.sparkContext" 77 | ] 78 | }, 79 | { 80 | "cell_type": "markdown", 81 | "metadata": {}, 82 | "source": [ 83 | "# Load data\n", 84 | "\n", 85 | "A simulated dataset is part of the phik-package. The dataset concerns fake car insurance data. 
Load the dataset here:" 86 | ] 87 | }, 88 | { 89 | "cell_type": "code", 90 | "execution_count": null, 91 | "metadata": {}, 92 | "outputs": [], 93 | "source": [ 94 | "data = pd.read_csv( resources.fixture('fake_insurance_data.csv.gz') )\n", 95 | "sdf = spark.createDataFrame(data)\n", 96 | "sdf.show()" 97 | ] 98 | }, 99 | { 100 | "cell_type": "code", 101 | "execution_count": null, 102 | "metadata": {}, 103 | "outputs": [], 104 | "source": [ 105 | "combis = itertools.combinations_with_replacement(sdf.columns, 2)\n", 106 | "combis = [list(c) for c in combis]" 107 | ] 108 | }, 109 | { 110 | "cell_type": "code", 111 | "execution_count": null, 112 | "metadata": {}, 113 | "outputs": [], 114 | "source": [ 115 | "print(combis)" 116 | ] 117 | }, 118 | { 119 | "cell_type": "markdown", 120 | "metadata": {}, 121 | "source": [ 122 | "# step 1: create histograms (this runs spark histogrammar in the background)\n" 123 | ] 124 | }, 125 | { 126 | "cell_type": "code", 127 | "execution_count": null, 128 | "metadata": {}, 129 | "outputs": [], 130 | "source": [ 131 | "# see the doc-string of hg_make_histograms() for binning options.\n", 132 | "hists = sdf.hg_make_histograms(combis)" 133 | ] 134 | }, 135 | { 136 | "cell_type": "code", 137 | "execution_count": null, 138 | "metadata": {}, 139 | "outputs": [], 140 | "source": [ 141 | "# collect the numpy contingency tables into a dict\n", 142 | "grids = {k:(get_2dgrid(h)[2]) for k,h in hists.items()}\n", 143 | "print(grids)" 144 | ] 145 | }, 146 | { 147 | "cell_type": "code", 148 | "execution_count": null, 149 | "metadata": {}, 150 | "outputs": [], 151 | "source": [ 152 | "# we can store the histograms if we want to\n", 153 | "if False:\n", 154 | " import pickle\n", 155 | "\n", 156 | " with open('grids.pkl', 'wb') as outfile:\n", 157 | " pickle.dump(grids, outfile)\n", 158 | "\n", 159 | " with open('grids.pkl', 'rb') as handle:\n", 160 | " grids = pickle.load(handle)" 161 | ] 162 | }, 163 | { 164 | "cell_type": "markdown", 165 | "metadata": {}, 166 | "source": [ 167 | "# step 2: calculate phik matrix (runs rdd parallellization over all 2d histograms)" 168 | ] 169 | }, 170 | { 171 | "cell_type": "code", 172 | "execution_count": null, 173 | "metadata": {}, 174 | "outputs": [], 175 | "source": [ 176 | "phik_matrix = spark_phik_matrix_from_hist2d_dict(sc, grids)" 177 | ] 178 | }, 179 | { 180 | "cell_type": "code", 181 | "execution_count": null, 182 | "metadata": {}, 183 | "outputs": [], 184 | "source": [ 185 | "phik_matrix" 186 | ] 187 | } 188 | ], 189 | "metadata": { 190 | "kernelspec": { 191 | "display_name": "Python 3", 192 | "language": "python", 193 | "name": "python3" 194 | }, 195 | "language_info": { 196 | "codemirror_mode": { 197 | "name": "ipython", 198 | "version": 3 199 | }, 200 | "file_extension": ".py", 201 | "mimetype": "text/x-python", 202 | "name": "python", 203 | "nbconvert_exporter": "python", 204 | "pygments_lexer": "ipython3", 205 | "version": "3.6.8" 206 | } 207 | }, 208 | "nbformat": 4, 209 | "nbformat_minor": 2 210 | } 211 | -------------------------------------------------------------------------------- /phik/statistics.py: -------------------------------------------------------------------------------- 1 | """Project: PhiK - correlation coefficient package 2 | 3 | Created: 2018/09/05 4 | 5 | Description: 6 | Statistics helper functions, for the calculation of phik and significance 7 | of a contingency table. 
8 | 9 | Authors: 10 | KPMG Advanced Analytics & Big Data team, Amstelveen, The Netherlands 11 | 12 | Redistribution and use in source and binary forms, with or without 13 | modification, are permitted according to the terms listed in the file 14 | LICENSE. 15 | """ 16 | from typing import Union 17 | 18 | import numpy as np 19 | from scipy import stats 20 | 21 | 22 | def get_dependent_frequency_estimates(vals: np.ndarray) -> np.ndarray: 23 | """ 24 | Calculation of dependent expected frequencies. 25 | 26 | Calculation is based on the marginal sums of the table, i.e. dependent frequency estimates. 27 | :param vals: The contingency table. The table contains the observed number of occurrences in each category 28 | 29 | :returns exp: expected frequencies 30 | """ 31 | 32 | # use existing scipy functionality 33 | return stats.contingency.expected_freq(vals) 34 | 35 | 36 | def get_chi2_using_dependent_frequency_estimates( 37 | vals: np.ndarray, lambda_: str = "log-likelihood" 38 | ) -> float: 39 | """ 40 | Chi-square test of independence of variables in a contingency table. 41 | 42 | The expected frequencies are based on the 43 | marginal sums of the table, i.e. dependent frequency estimates. 44 | 45 | :param vals: The contingency table. The table contains the observed number of occurrences in each category 46 | :returns test_statistic: the test statistic value 47 | """ 48 | 49 | values = vals[:] 50 | 51 | # remove rows with only zeros, scipy doesn't like them. 52 | values = values[~np.all(values == 0, axis=1)] 53 | # remove columns with only zeros, scipy doesn't like them. 54 | values = values.T[~np.all(values.T == 0, axis=1)].T 55 | 56 | # use existing scipy functionality 57 | test_statistic, _, _, _ = stats.chi2_contingency(values, lambda_=lambda_) 58 | 59 | return test_statistic 60 | 61 | 62 | def get_pearson_chi_square( 63 | observed: np.ndarray, expected: np.ndarray = None, normalize: bool = True 64 | ) -> float: 65 | """Calculate pearson chi square between observed and expected 2d contingency matrix 66 | 67 | :param observed: The observed contingency table. The table contains the observed number of occurrences in each cell. 68 | :param expected: The expected contingency table. The table contains the expected number of occurrences in each cell. 69 | :param bool normalize: normalize expected frequencies, default is True. 70 | :return: the pearson chi2 value 71 | """ 72 | observed = np.asarray(observed) 73 | if np.any(observed < 0): 74 | raise ValueError("All values in `observed` must be non-negative.") 75 | if observed.size == 0: 76 | raise ValueError("No data; `observed` has size 0.") 77 | 78 | if expected is None: 79 | expected = get_dependent_frequency_estimates(observed) 80 | expected = np.asarray(expected) 81 | 82 | # important to ensure that observed and expected have same normalization 83 | if normalize: 84 | expected = expected * (np.sum(observed) / np.sum(expected)) 85 | 86 | terms = np.divide( 87 | (observed.astype(np.float64) - expected) ** 2, 88 | expected, 89 | out=np.zeros_like(expected), 90 | where=expected != 0, 91 | ) 92 | return np.sum(terms) 93 | 94 | 95 | def estimate_ndof(chi2values: Union[list, np.ndarray]) -> float: 96 | """ 97 | Estimation of the effective number of degrees of freedom. 98 | 99 | A good approximation of endof is the average value. Alternatively 100 | a fit to the chi2 distribution can be make. Both values are returned. 
101 | 102 | :param list chi2values: list of chi2 values 103 | :returns: endof0, endof 104 | """ 105 | 106 | return np.mean(chi2values) 107 | 108 | 109 | def estimate_simple_ndof(observed: np.ndarray) -> int: 110 | """ 111 | Simple estimation of the effective number of degrees of freedom. 112 | 113 | This equals the nominal calculation for ndof minus the number of empty bins in the 114 | expected contingency table. 115 | 116 | :param observed: numpy array of observed cell counts 117 | :returns: endof 118 | """ 119 | 120 | # use existing scipy functionality 121 | expected = stats.contingency.expected_freq(observed) 122 | endof = ( 123 | expected.size 124 | - np.sum(expected.shape) 125 | + expected.ndim 126 | - 1 127 | - (expected == 0).sum() 128 | ) 129 | # require minimum number of degrees of freedom 130 | if endof < 0: 131 | endof = 0 132 | return endof 133 | 134 | 135 | def theoretical_ndof(observed: np.ndarray) -> int: 136 | """ 137 | Simple estimation of the effective number of degrees of freedom. 138 | 139 | This equals the nominal calculation for ndof minus the number of empty bins in the 140 | expected contingency table. 141 | 142 | :param observed: numpy array of observed cell counts 143 | :returns: theoretical ndof 144 | """ 145 | 146 | return observed.size - np.sum(observed.shape) + observed.ndim - 1 147 | 148 | 149 | def z_from_logp(logp: float, flip_sign: bool = False) -> float: 150 | """ 151 | Convert logarithm of p-value into one-sided Z-value 152 | 153 | :param float logp: logarithm of p-value, should not be greater than 0 154 | :param bool flip_sign: flip sign of Z-value, e.g. use for input log(1-p). Default is false. 155 | :returns: statistical significance Z-value 156 | :rtype: float 157 | """ 158 | 159 | # pvalue == 0, Z = infinity 160 | if logp == -np.inf: 161 | return np.inf if not flip_sign else -np.inf 162 | 163 | p_value = np.exp(logp) 164 | 165 | # scenario where p-value is numerically too small to evaluate Z 166 | if p_value == 0: 167 | # kicks in here when Z > 37 168 | # approach valid when ~ Z > 1.5. 169 | u = -2.0 * np.log(2 * np.pi) - 2.0 * logp 170 | z_value = np.sqrt(u - np.log(u)) 171 | else: 172 | z_value = -stats.norm.ppf(p_value) 173 | 174 | if flip_sign: 175 | z_value *= -1.0 176 | 177 | return z_value 178 | -------------------------------------------------------------------------------- /phik/simulation.py: -------------------------------------------------------------------------------- 1 | """Project: PhiK - correlation analyzer library 2 | 3 | Created: 2018/09/05 4 | 5 | Description: 6 | Helper functions to simulate 2D datasets 7 | 8 | Authors: 9 | KPMG Advanced Analytics & Big Data team, Amstelveen, The Netherlands 10 | 11 | Redistribution and use in source and binary forms, with or without 12 | modification, are permitted according to the terms listed in the file 13 | LICENSE. 14 | """ 15 | 16 | import numpy as np 17 | from joblib import Parallel, delayed 18 | 19 | from .statistics import get_dependent_frequency_estimates 20 | from .statistics import get_chi2_using_dependent_frequency_estimates 21 | from phik.simcore import CPP_SUPPORT, _sim_2d_data_patefield 22 | 23 | 24 | NUMPY_INT_MAX = np.iinfo(np.int32).max - 1 25 | 26 | 27 | def sim_2d_data(hist:np.ndarray, ndata:int=0) -> np.ndarray: 28 | """ 29 | Simulate a 2 dimensional dataset given a 2 dimensional pdf 30 | 31 | :param array-like hist: contingency table, which contains the observed number of occurrences in each category. 32 | This table is used as probability density function. 
33 | :param int ndata: number of simulations 34 | :return: simulated data 35 | """ 36 | 37 | if ndata <= 0: 38 | ndata = int(np.rint(hist.sum())) 39 | if ndata <= 0: 40 | raise ValueError('ndata (or hist.sum()) has to be positive') 41 | 42 | # scale and ravel 43 | hc = hist[:] / hist.sum() 44 | hcr = hc.ravel() 45 | 46 | hout = np.random.multinomial(n=ndata, pvals=hcr) 47 | hout2d = np.reshape(hout, hc.shape) 48 | return hout2d 49 | 50 | 51 | def sim_2d_data_patefield(data: np.ndarray, seed : int = None) -> np.ndarray: 52 | """ 53 | Simulate a two dimensional dataset with fixed row and column totals. 54 | 55 | Simulation algorithm by Patefield: 56 | W. M. Patefield, Applied Statistics 30, 91 (1981) 57 | Python implementation inspired by (C version): 58 | https://people.sc.fsu.edu/~jburkardt/c_src/asa159/asa159.html 59 | 60 | :param data: contingency table, which contains the observed number of occurrences in each category.\ 61 | :param seed: optional seed for the simulation, primarily for testing purposes.\ 62 | This table is used as probability density function. 63 | :return: simulated data 64 | """ 65 | 66 | if not CPP_SUPPORT: 67 | raise NotImplementedError( 68 | 'Patefield requires a compiled extension that was not found.' 69 | ) 70 | 71 | # number of rows and columns 72 | nrows, ncols = data.shape 73 | 74 | # totals per row and column 75 | # NOTE we assume that sum will fit in a 32 bit int 76 | nrowt = np.rint(data.sum(axis=1)).astype(np.int32) 77 | ncolt = np.rint(data.sum(axis=0)).astype(np.int32) 78 | 79 | # set seed if it is None 80 | seed = seed or np.random.randint(0, NUMPY_INT_MAX) 81 | 82 | # allocate memory that will be set by _sim_2d_data_patefield 83 | matrix = np.empty(nrows * ncols, dtype=np.int32) 84 | 85 | # simulate the data, returned through matrix inplace modification 86 | _sim_2d_data_patefield(nrows, ncols, nrowt, ncolt, seed, matrix) 87 | return matrix.reshape(ncols, nrows).T 88 | 89 | 90 | def sim_2d_product_multinominal(data:np.ndarray, axis: int) -> np.ndarray: 91 | """ 92 | Simulate 2 dimensional data with either row or column totals fixed. 93 | 94 | :param data: contingency table, which contains the observed number of occurrences in each category.\ 95 | This table is used as probability density function. 96 | :param axis: fix row totals (0) or column totals (1). 97 | :return: simulated data 98 | """ 99 | 100 | if axis == 1: 101 | return np.array([list(sim_2d_data(data[i])) for i in range(data.shape[0])]) 102 | elif axis == 0: 103 | return np.array([list(sim_2d_data(data.T[i])) for i in range(data.shape[1])]).T 104 | else: 105 | raise NotImplementedError("Axis should be 0 (row) or 1 (column).") 106 | 107 | 108 | def sim_data(data:np.ndarray, method:str='multinominal') -> np.ndarray: 109 | """ 110 | Simulate a 2 dimensional dataset given a 2 dimensional pdf 111 | 112 | Several simulation methods are provided: 113 | 114 | - multinominal: Only the total number of records is fixed. 115 | - row_product_multinominal: The row totals fixed in the sampling. 116 | - col_product_multinominal: The column totals fixed in the sampling. 117 | - hypergeometric: Both the row or column totals are fixed in the sampling. Note that this type of sampling is\ 118 | only available when row and column totals are integers. 119 | 120 | :param data: contingency table 121 | :param str method: sampling method. 
Options: [multinominal, hypergeometric, row_product_multinominal,\ 122 | col_product_multinominal] 123 | :return: simulated data 124 | """ 125 | 126 | if method == 'multinominal': 127 | return sim_2d_data(data) 128 | elif method == 'hypergeometric': 129 | return sim_2d_data_patefield(data) 130 | elif method == 'row_product_multinominal': 131 | return sim_2d_product_multinominal(data, 0) 132 | elif method == 'col_product_multinominal': 133 | return sim_2d_product_multinominal(data, 1) 134 | else: 135 | raise NotImplementedError('selected method not recognized.') 136 | 137 | 138 | def sim_chi2_distribution(values: np.ndarray, nsim:int=1000, lambda_:str='log-likelihood', 139 | simulation_method:str='multinominal', alt_hypothesis:bool=False, njobs:int=-1) -> list: 140 | """ 141 | Simulate 2D data and calculate the chi-square statistic for each simulated dataset. 142 | 143 | :param values: The contingency table. The table contains the observed number of occurrences in each category 144 | :param int nsim: number of simulations (optional, default=1000) 145 | :param str simulation_method: sampling method. Options: [multinominal, hypergeometric, row_product_multinominal, 146 | col_product_multinominal] 147 | :param str lambda_: test statistic. Available options are [pearson, log-likelihood]. 148 | :param bool alt_hypothesis: if True, simulate values directly, and not its dependent frequency estimates. 149 | :param int njobs: number of parallel jobs used for simulation. default is -1. 1 uses no parallel jobs. 150 | :returns chi2s: list of chi2 values for each simulated dataset 151 | """ 152 | exp_dep = get_dependent_frequency_estimates(values) if not alt_hypothesis else values 153 | 154 | if njobs == 1: 155 | chi2s = [_simulate_and_fit(exp_dep, simulation_method, lambda_) for _ in range(nsim)] 156 | else: 157 | chi2s = Parallel(n_jobs=njobs)(delayed(_simulate_and_fit)(exp_dep, simulation_method, lambda_) 158 | for _ in range(nsim)) 159 | 160 | return chi2s 161 | 162 | 163 | def _simulate_and_fit(exp_dep: np.ndarray, simulation_method: str='multinominal', 164 | lambda_:str='log-likelihood') -> float: 165 | """split off simulate function to allow for parallellization""" 166 | simdata = sim_data(exp_dep, method=simulation_method) 167 | simchi2 = get_chi2_using_dependent_frequency_estimates(simdata, lambda_) 168 | return simchi2 169 | -------------------------------------------------------------------------------- /docs/Makefile: -------------------------------------------------------------------------------- 1 | # Makefile for Sphinx documentation 2 | # 3 | 4 | # You can set these variables from the command line. 5 | SPHINXOPTS = 6 | SPHINXBUILD = sphinx-build 7 | PAPER = 8 | BUILDDIR = build 9 | 10 | # User-friendly check for sphinx-build 11 | ifeq ($(shell which $(SPHINXBUILD) >/dev/null 2>&1; echo $$?), 1) 12 | $(error The '$(SPHINXBUILD)' command was not found. Make sure you have Sphinx installed, then set the SPHINXBUILD environment variable to point to the full path of the '$(SPHINXBUILD)' executable. Alternatively you can add the directory with the executable to your PATH. If you don't have Sphinx installed, grab it from http://sphinx-doc.org/) 13 | endif 14 | 15 | # Internal variables. 
16 | PAPEROPT_a4 = -D latex_paper_size=a4 17 | PAPEROPT_letter = -D latex_paper_size=letter 18 | ALLSPHINXOPTS = -d $(BUILDDIR)/doctrees $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) source 19 | # the i18n builder cannot share the environment and doctrees with the others 20 | I18NSPHINXOPTS = $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) source 21 | 22 | .PHONY: help clean html dirhtml singlehtml pickle json htmlhelp qthelp devhelp epub latex latexpdf text man changes linkcheck doctest coverage gettext 23 | 24 | help: 25 | @echo "Please use \`make ' where is one of" 26 | @echo " html to make standalone HTML files" 27 | @echo " dirhtml to make HTML files named index.html in directories" 28 | @echo " singlehtml to make a single large HTML file" 29 | @echo " pickle to make pickle files" 30 | @echo " json to make JSON files" 31 | @echo " htmlhelp to make HTML files and a HTML help project" 32 | @echo " qthelp to make HTML files and a qthelp project" 33 | @echo " applehelp to make an Apple Help Book" 34 | @echo " devhelp to make HTML files and a Devhelp project" 35 | @echo " epub to make an epub" 36 | @echo " latex to make LaTeX files, you can set PAPER=a4 or PAPER=letter" 37 | @echo " latexpdf to make LaTeX files and run them through pdflatex" 38 | @echo " latexpdfja to make LaTeX files and run them through platex/dvipdfmx" 39 | @echo " text to make text files" 40 | @echo " man to make manual pages" 41 | @echo " texinfo to make Texinfo files" 42 | @echo " info to make Texinfo files and run them through makeinfo" 43 | @echo " gettext to make PO message catalogs" 44 | @echo " changes to make an overview of all changed/added/deprecated items" 45 | @echo " xml to make Docutils-native XML files" 46 | @echo " pseudoxml to make pseudoxml-XML files for display purposes" 47 | @echo " linkcheck to check all external links for integrity" 48 | @echo " doctest to run all doctests embedded in the documentation (if enabled)" 49 | @echo " coverage to run coverage check of the documentation (if enabled)" 50 | 51 | clean: 52 | rm -rf $(BUILDDIR)/* 53 | 54 | html: 55 | $(SPHINXBUILD) -b html $(ALLSPHINXOPTS) $(BUILDDIR)/html 56 | @echo 57 | @echo "Build finished. The HTML pages are in $(BUILDDIR)/html." 58 | 59 | dirhtml: 60 | $(SPHINXBUILD) -b dirhtml $(ALLSPHINXOPTS) $(BUILDDIR)/dirhtml 61 | @echo 62 | @echo "Build finished. The HTML pages are in $(BUILDDIR)/dirhtml." 63 | 64 | singlehtml: 65 | $(SPHINXBUILD) -b singlehtml $(ALLSPHINXOPTS) $(BUILDDIR)/singlehtml 66 | @echo 67 | @echo "Build finished. The HTML page is in $(BUILDDIR)/singlehtml." 68 | 69 | pickle: 70 | $(SPHINXBUILD) -b pickle $(ALLSPHINXOPTS) $(BUILDDIR)/pickle 71 | @echo 72 | @echo "Build finished; now you can process the pickle files." 73 | 74 | json: 75 | $(SPHINXBUILD) -b json $(ALLSPHINXOPTS) $(BUILDDIR)/json 76 | @echo 77 | @echo "Build finished; now you can process the JSON files." 78 | 79 | htmlhelp: 80 | $(SPHINXBUILD) -b htmlhelp $(ALLSPHINXOPTS) $(BUILDDIR)/htmlhelp 81 | @echo 82 | @echo "Build finished; now you can run HTML Help Workshop with the" \ 83 | ".hhp project file in $(BUILDDIR)/htmlhelp." 
84 | 85 | qthelp: 86 | $(SPHINXBUILD) -b qthelp $(ALLSPHINXOPTS) $(BUILDDIR)/qthelp 87 | @echo 88 | @echo "Build finished; now you can run "qcollectiongenerator" with the" \ 89 | ".qhcp project file in $(BUILDDIR)/qthelp, like this:" 90 | @echo "# qcollectiongenerator $(BUILDDIR)/qthelp/DecisionEngine.qhcp" 91 | @echo "To view the help file:" 92 | @echo "# assistant -collectionFile $(BUILDDIR)/qthelp/DecisionEngine.qhc" 93 | 94 | applehelp: 95 | $(SPHINXBUILD) -b applehelp $(ALLSPHINXOPTS) $(BUILDDIR)/applehelp 96 | @echo 97 | @echo "Build finished. The help book is in $(BUILDDIR)/applehelp." 98 | @echo "N.B. You won't be able to view it unless you put it in" \ 99 | "~/Library/Documentation/Help or install it in your application" \ 100 | "bundle." 101 | 102 | devhelp: 103 | $(SPHINXBUILD) -b devhelp $(ALLSPHINXOPTS) $(BUILDDIR)/devhelp 104 | @echo 105 | @echo "Build finished." 106 | @echo "To view the help file:" 107 | @echo "# mkdir -p $$HOME/.local/share/devhelp/DecisionEngine" 108 | @echo "# ln -s $(BUILDDIR)/devhelp $$HOME/.local/share/devhelp/DecisionEngine" 109 | @echo "# devhelp" 110 | 111 | epub: 112 | $(SPHINXBUILD) -b epub $(ALLSPHINXOPTS) $(BUILDDIR)/epub 113 | @echo 114 | @echo "Build finished. The epub file is in $(BUILDDIR)/epub." 115 | 116 | latex: 117 | $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex 118 | @echo 119 | @echo "Build finished; the LaTeX files are in $(BUILDDIR)/latex." 120 | @echo "Run \`make' in that directory to run these through (pdf)latex" \ 121 | "(use \`make latexpdf' here to do that automatically)." 122 | 123 | latexpdf: 124 | $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex 125 | @echo "Running LaTeX files through pdflatex..." 126 | $(MAKE) -C $(BUILDDIR)/latex all-pdf 127 | @echo "pdflatex finished; the PDF files are in $(BUILDDIR)/latex." 128 | 129 | latexpdfja: 130 | $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex 131 | @echo "Running LaTeX files through platex and dvipdfmx..." 132 | $(MAKE) -C $(BUILDDIR)/latex all-pdf-ja 133 | @echo "pdflatex finished; the PDF files are in $(BUILDDIR)/latex." 134 | 135 | text: 136 | $(SPHINXBUILD) -b text $(ALLSPHINXOPTS) $(BUILDDIR)/text 137 | @echo 138 | @echo "Build finished. The text files are in $(BUILDDIR)/text." 139 | 140 | man: 141 | $(SPHINXBUILD) -b man $(ALLSPHINXOPTS) $(BUILDDIR)/man 142 | @echo 143 | @echo "Build finished. The manual pages are in $(BUILDDIR)/man." 144 | 145 | texinfo: 146 | $(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo 147 | @echo 148 | @echo "Build finished. The Texinfo files are in $(BUILDDIR)/texinfo." 149 | @echo "Run \`make' in that directory to run these through makeinfo" \ 150 | "(use \`make info' here to do that automatically)." 151 | 152 | info: 153 | $(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo 154 | @echo "Running Texinfo files through makeinfo..." 155 | make -C $(BUILDDIR)/texinfo info 156 | @echo "makeinfo finished; the Info files are in $(BUILDDIR)/texinfo." 157 | 158 | gettext: 159 | $(SPHINXBUILD) -b gettext $(I18NSPHINXOPTS) $(BUILDDIR)/locale 160 | @echo 161 | @echo "Build finished. The message catalogs are in $(BUILDDIR)/locale." 162 | 163 | changes: 164 | $(SPHINXBUILD) -b changes $(ALLSPHINXOPTS) $(BUILDDIR)/changes 165 | @echo 166 | @echo "The overview file is in $(BUILDDIR)/changes." 
167 | 168 | linkcheck: 169 | $(SPHINXBUILD) -b linkcheck $(ALLSPHINXOPTS) $(BUILDDIR)/linkcheck 170 | @echo 171 | @echo "Link check complete; look for any errors in the above output " \ 172 | "or in $(BUILDDIR)/linkcheck/output.txt." 173 | 174 | doctest: 175 | $(SPHINXBUILD) -b doctest $(ALLSPHINXOPTS) $(BUILDDIR)/doctest 176 | @echo "Testing of doctests in the sources finished, look at the " \ 177 | "results in $(BUILDDIR)/doctest/output.txt." 178 | 179 | coverage: 180 | $(SPHINXBUILD) -b coverage $(ALLSPHINXOPTS) $(BUILDDIR)/coverage 181 | @echo "Testing of coverage in the sources finished, look at the " \ 182 | "results in $(BUILDDIR)/coverage/python.txt." 183 | 184 | xml: 185 | $(SPHINXBUILD) -b xml $(ALLSPHINXOPTS) $(BUILDDIR)/xml 186 | @echo 187 | @echo "Build finished. The XML files are in $(BUILDDIR)/xml." 188 | 189 | pseudoxml: 190 | $(SPHINXBUILD) -b pseudoxml $(ALLSPHINXOPTS) $(BUILDDIR)/pseudoxml 191 | @echo 192 | @echo "Build finished. The pseudo-XML files are in $(BUILDDIR)/pseudoxml." 193 | -------------------------------------------------------------------------------- /phik/bivariate.py: -------------------------------------------------------------------------------- 1 | """Project: PhiK - correlation analyzer library 2 | 3 | Created: 2019/11/23 4 | 5 | Description: 6 | Convert Pearson correlation value into a chi2 value of a contingency test 7 | matrix of a bivariate gaussian, and vice-versa. 8 | Calculation uses scipy's mvn library. 9 | 10 | Authors: 11 | KPMG Advanced Analytics & Big Data team, Amstelveen, The Netherlands 12 | 13 | Redistribution and use in source and binary forms, with or without 14 | modification, are permitted according to the terms listed in the file 15 | LICENSE. 16 | """ 17 | import warnings 18 | 19 | import numpy as np 20 | import scipy 21 | from scipy import optimize 22 | 23 | _scipy_version = [int(v) for v in scipy.__version__.split('.')] 24 | USE_QMVN = True if _scipy_version[0] >= 1 and _scipy_version[1] >= 16 else False 25 | if USE_QMVN: 26 | from scipy.stats._qmvnt import _qauto, _qmvn 27 | else: 28 | from scipy.stats._mvn import mvnun 29 | 30 | 31 | 32 | 33 | def _mvn_un(rho: float, lower: tuple, upper: tuple, 34 | rng: np.random.Generator = np.random.default_rng(42)) -> float: 35 | """Perform integral of bivariate normal gauss with correlation 36 | 37 | Integral is performed using scipy's mvn library. 38 | 39 | :param float rho: tilt parameter 40 | :param tuple lower: tuple of lower corner of integral area 41 | :param tuple upper: tuple of upper corner of integral area 42 | :param np.random.Generator rng: default_rng(42), optional 43 | :returns float: integral value 44 | """ 45 | mu = np.array([0.0, 0.0]) 46 | S = np.array([[1.0, rho], [rho, 1.0]]) 47 | return _calc_mvnun(lower=lower, upper=upper, mu=mu, S=S, rng=rng) 48 | 49 | 50 | def _calc_mvnun(lower, upper, mu, S, rng = np.random.default_rng(42)): 51 | if USE_QMVN: 52 | res = _qauto(_qmvn, S, lower, upper, rng)[0] 53 | else: 54 | res = mvnun(lower, upper, mu, S)[0] 55 | return res 56 | 57 | 58 | def _mvn_array(rho: float, sx: np.ndarray, sy: np.ndarray) -> list: 59 | """Array of integrals over bivariate normal gauss with correlation 60 | 61 | Integrals are performed using scipy's mvn library. 
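    Note: to limit the number of expensive integrator calls, only half of the bins are
    integrated explicitly; the values of the mirror-image bins are copied from them.
    This relies on the bin edges being symmetric around zero, as they are constructed
    elsewhere in this module.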
62 | 63 | :param float rho: tilt parameter 64 | :param np.ndarray sx: bin edges array of x-axis 65 | :param np.ndarray sy: bin edges array of y-axis 66 | :returns list: list of integral values 67 | """ 68 | # ranges = [([sx[i], sy[j]], [sx[i+1], sy[j+1]]) for i in range(len(sx) - 1) for j in range(len(sy) - 1)] 69 | # corr = [mvn.mvnun(lower, upper, mu, S)[0] for lower, upper in ranges] 70 | # return corr 71 | 72 | # mean and covariance 73 | mu = np.array([0.0, 0.0]) 74 | S = np.array([[1.0, rho], [rho, 1.0]]) 75 | 76 | # callling mvn.mvnun is expensive, so we only calculate half of the matrix, then symmetrize 77 | # add half block, which is symmetric in x 78 | odd_odd = False 79 | ranges = [ 80 | ([sx[i], sy[j]], [sx[i + 1], sy[j + 1]]) 81 | for i in range((len(sx) - 1) // 2) 82 | for j in range(len(sy) - 1) 83 | ] 84 | # add odd middle row, which is symmetric in y 85 | if (len(sx) - 1) % 2 == 1: 86 | i = (len(sx) - 1) // 2 87 | ranges += [ 88 | ([sx[i], sy[j]], [sx[i + 1], sy[j + 1]]) for j in range((len(sy) - 1) // 2) 89 | ] 90 | # add center point, add this only once 91 | if (len(sy) - 1) % 2 == 1: 92 | j = (len(sy) - 1) // 2 93 | ranges.append(([sx[i], sy[j]], [sx[i + 1], sy[j + 1]])) 94 | odd_odd = True 95 | 96 | corr = np.array([_calc_mvnun(lower, upper, mu, S) for lower, upper in ranges]) 97 | # add second half, exclude center 98 | corr = np.concatenate([corr, corr if not odd_odd else corr[:-1]]) 99 | return corr 100 | 101 | 102 | def bivariate_normal_theory( 103 | rho: float, 104 | nx: int = -1, 105 | ny: int = -1, 106 | n: int = 1, 107 | sx: np.ndarray = None, 108 | sy: np.ndarray = None, 109 | ) -> np.ndarray: 110 | """Return binned pdf of bivariate normal distribution. 111 | 112 | This function returns a "perfect" binned bivariate normal distribution. 113 | 114 | :param float rho: tilt parameter 115 | :param int nx: number of uniform bins on x-axis. alternative to sx. 116 | :param int ny: number of uniform bins on y-axis. alternative to sy. 117 | :param np.ndarray sx: bin edges array of x-axis. default is None. 118 | :param np.ndarray sy: bin edges array of y-axis. default is None. 119 | :param int n: number of entries. default is one. 120 | :return: np.ndarray of binned bivariate normal pdf 121 | """ 122 | 123 | if n < 1: 124 | raise ValueError("Number of entries needs to be one or greater.") 125 | if sx is None: 126 | sx = np.linspace(-5, 5, nx + 1) 127 | if sy is None: 128 | sy = np.linspace(-5, 5, ny + 1) 129 | 130 | bvn = np.zeros((ny, nx)) 131 | for i in range(len(sx) - 1): 132 | for j in range(len(sy) - 1): 133 | lower = (sx[i], sy[j]) 134 | upper = (sx[i + 1], sy[j + 1]) 135 | p = _mvn_un(rho, lower, upper) 136 | bvn[j, i] = p 137 | bvn *= n 138 | 139 | # patch for entry levels that are below machine precision 140 | # (simulation does not work otherwise) 141 | bvn[bvn < np.finfo(np.float).eps] = np.finfo(np.float).eps 142 | 143 | return bvn 144 | 145 | 146 | def chi2_from_phik( 147 | rho: float, 148 | n: int, 149 | subtract_from_chi2: float = 0, 150 | corr0: list = None, 151 | scale: float = None, 152 | sx: np.ndarray = None, 153 | sy: np.ndarray = None, 154 | pedestal: float = 0, 155 | nx: int = -1, 156 | ny: int = -1, 157 | ) -> float: 158 | """Calculate chi2-value of bivariate gauss having correlation value rho 159 | 160 | Calculate no-noise chi2 value of bivar gauss with correlation rho, 161 | with respect to bivariate gauss without any correlation. 
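    In outline (a sketch of what the implementation below computes)::

        chi2 = pedestal + scale * n * sum_ij (p_rho(i,j) - p_0(i,j))**2 / p_0(i,j) - subtract_from_chi2

    where p_rho(i,j) denotes the probability content of bin (i,j) for a bivariate normal
    with correlation rho, and p_0(i,j) the same for rho = 0.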
162 | 163 | :param float rho: tilt parameter 164 | :param int n: number of records 165 | :param float subtract_from_chi2: value subtracted from chi2 calculation. default is 0. 166 | :param list corr0: mvn_array result for rho=0. Default is None. 167 | :param float scale: scale is multiplied with the chi2 if set. 168 | :param np.ndarray sx: bin edges array of x-axis. default is None. 169 | :param np.ndarray sy: bin edges array of y-axis. default is None. 170 | :param float pedestal: pedestal is added to the chi2 if set. 171 | :param int nx: number of uniform bins on x-axis. alternative to sx. 172 | :param int ny: number of uniform bins on y-axis. alternative to sy. 173 | :returns float: chi2 value 174 | """ 175 | 176 | if sx is None: 177 | sx = np.linspace(-5, 5, nx + 1) 178 | 179 | if sy is None: 180 | sy = np.linspace(-5, 5, ny + 1) 181 | 182 | if corr0 is None: 183 | corr0 = _mvn_array(0, sx, sy) 184 | if scale is None: 185 | # scale ensures that for rho=1, chi2 is the maximum possible value 186 | corr1 = _mvn_array(1, sx, sy) 187 | delta_corr2 = (corr1 - corr0) ** 2 188 | # protect against division by zero 189 | ratio = np.divide( 190 | delta_corr2, corr0, out=np.zeros_like(delta_corr2), where=corr0 != 0 191 | ) 192 | chi2_one = n * np.sum(ratio) 193 | # chi2_one = n * sum([((c1-c0)*(c1-c0)) / c0 for c0, c1 in zip(corr0, corr1)]) 194 | chi2_max = n * min(nx - 1, ny - 1) 195 | scale = (chi2_max - pedestal) / chi2_one 196 | 197 | corrr = _mvn_array(rho, sx, sy) 198 | delta_corr2 = (corrr - corr0) ** 2 199 | # protect against division by zero 200 | ratio = np.divide( 201 | delta_corr2, corr0, out=np.zeros_like(delta_corr2), where=corr0 != 0 202 | ) 203 | chi2_rho = n * np.sum(ratio) 204 | # chi2_rho = (n * sum([((cr-c0)*(cr-c0)) / c0 for c0, cr in zip(corr0, corrr)])) 205 | 206 | chi2 = pedestal + chi2_rho * scale 207 | return chi2 - subtract_from_chi2 208 | 209 | 210 | def phik_from_chi2( 211 | chi2: float, 212 | n: int, 213 | nx: int, 214 | ny: int, 215 | sx: np.ndarray = None, 216 | sy: np.ndarray = None, 217 | pedestal: float = 0, 218 | ) -> float: 219 | """ 220 | Correlation coefficient of bivariate gaussian derived from chi2-value 221 | 222 | Chi2-value gets converted into correlation coefficient of bivariate gauss 223 | with correlation value rho, assuming giving binning and number of records. 224 | Correlation coefficient value is between 0 and 1. 225 | 226 | Bivariate gaussian's range is set to [-5,5] by construction. 227 | 228 | :param float chi2: input chi2 value 229 | :param int n: number of records 230 | :param int nx: number of uniform bins on x-axis. alternative to sx. 231 | :param int ny: number of uniform bins on y-axis. alternative to sy. 232 | :param np.ndarray sx: bin edges array of x-axis. default is None. 233 | :param np.ndarray sy: bin edges array of y-axis. default is None. 234 | :param float pedestal: pedestal is added to the chi2 if set. 
235 | :returns float: correlation coefficient 236 | """ 237 | 238 | if pedestal < 0: 239 | raise ValueError("noise pedestal should be greater than zero.") 240 | 241 | if sx is None: 242 | sx = np.linspace(-5, 5, nx + 1) 243 | elif nx <= 1: 244 | raise ValueError("number of bins along x-axis is unknown") 245 | if sy is None: 246 | sy = np.linspace(-5, 5, ny + 1) 247 | elif ny <= 1: 248 | raise ValueError("number of bins along y-axis is unknown") 249 | 250 | corr0 = _mvn_array(0, sx, sy) 251 | 252 | # scale ensures that for rho=1, chi2 is the maximum possible value 253 | corr1 = _mvn_array(1, sx, sy) 254 | if 0 in corr0 and len(corr0) > 10000: 255 | warnings.warn( 256 | "Many cells: {0:d}. Are interval variables set correctly?".format( 257 | len(corr0) 258 | ) 259 | ) 260 | 261 | delta_corr2 = (corr1 - corr0) ** 2 262 | # protect against division by zero 263 | ratio = np.divide( 264 | delta_corr2, corr0, out=np.zeros_like(delta_corr2), where=corr0 != 0 265 | ) 266 | chi2_one = n * np.sum(ratio) 267 | # chi2_one = n * sum([((c1-c0)*(c1-c0)) / c0 if c0 > 0 else 0 for c0,c1 in zip(corr0,corr1)]) 268 | chi2_max = n * min(nx - 1, ny - 1) 269 | scale = (chi2_max - pedestal) / chi2_one 270 | if chi2 > chi2_max and np.isclose(chi2, chi2_max, atol=1e-14): 271 | chi2 = chi2_max 272 | 273 | # only solve for rho if chi2 exceeds noise pedestal 274 | if chi2 <= pedestal: 275 | return 0.0 276 | elif chi2 >= chi2_max: 277 | return 1.0 278 | 279 | rho = optimize.brentq( 280 | chi2_from_phik, 0, 1, args=(n, chi2, corr0, scale, sx, sy, pedestal), xtol=1e-5 281 | ) 282 | return rho 283 | -------------------------------------------------------------------------------- /phik/binning.py: -------------------------------------------------------------------------------- 1 | """Project: PhiK - correlation analyzer library 2 | 3 | Created: 2018/09/06 4 | 5 | Description: 6 | A set of rebinning functions, to help rebin two lists into a 2d histogram. 7 | 8 | Authors: 9 | KPMG Advanced Analytics & Big Data team, Amstelveen, The Netherlands 10 | 11 | Redistribution and use in source and binary forms, with or without 12 | modification, are permitted according to the terms listed in the file 13 | LICENSE. 14 | """ 15 | import sys 16 | from typing import List, Optional, Tuple, Union 17 | 18 | import numpy as np 19 | import pandas as pd 20 | 21 | from phik import definitions as defs 22 | from phik.data_quality import dq_check_nunique_values 23 | from phik.utils import array_like_to_dataframe, guess_interval_cols 24 | 25 | 26 | def bin_edges( 27 | arr: Union[np.ndarray, list, pd.Series], nbins: int, quantile: bool = False 28 | ) -> np.ndarray: 29 | """ 30 | Create uniform or quantile bin-edges for the input array. 
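    Example (illustrative only)::

        >>> import numpy as np
        >>> edges = bin_edges(np.array([0.0, 1.0, 2.0, 3.0, 4.0]), nbins=4)
        >>> len(edges)  # nbins + 1 edges, running from just below the minimum up to the maximum
        5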
31 | 32 | :param arr: array like object with input data 33 | :param int nbins: the number of bin 34 | :param bool quantile: uniform bins (False) or bins based on quantiles (True) 35 | :returns: array with bin edges 36 | """ 37 | 38 | if quantile: 39 | quantiles = np.linspace(0, 1, nbins + 1) 40 | xbins = np.quantile(arr[~np.isnan(arr)], quantiles) 41 | xbins[0] -= max(1e-14 * abs(xbins[0]), sys.float_info.min) 42 | else: 43 | min_value = np.min(arr[~np.isnan(arr)]) 44 | constant = max(1e-14 * abs(min_value), sys.float_info.min) 45 | xbins = np.linspace( 46 | min_value - constant, np.max(arr[~np.isnan(arr)]), nbins + 1 47 | ) 48 | 49 | return xbins 50 | 51 | 52 | def bin_array( 53 | arr: Union[np.ndarray, list], bin_edges: Union[np.ndarray, list] 54 | ) -> Tuple[np.ndarray, list]: 55 | """ 56 | Index the data given the bin_edges. 57 | 58 | Underflow and overflow values are indicated. 59 | 60 | :param arr: array like object with input data 61 | :param bin_edges: list with bin edges. 62 | :returns: indexed data 63 | """ 64 | 65 | # Bin data 66 | binned_arr = np.searchsorted(bin_edges, arr).astype(object) 67 | 68 | # Check if all bins are filled and store bin-labels 69 | bin_labels = [] 70 | bin_indices = pd.Series(binned_arr).value_counts().index 71 | for i in range(1, len(bin_edges)): 72 | if i in bin_indices: 73 | bin_labels.append((bin_edges[i - 1], bin_edges[i])) 74 | 75 | # NaN values are added to the overflow bin. Restore NaN values: 76 | binned_arr[np.argwhere(np.isnan(arr))] = np.nan 77 | 78 | # Set underflow values to UF 79 | binned_arr[np.argwhere(binned_arr == 0)] = defs.UF 80 | 81 | # Set overflow values to OF 82 | binned_arr[np.argwhere(binned_arr == len(bin_edges))] = defs.OF 83 | 84 | return binned_arr, bin_labels 85 | 86 | 87 | def bin_data( 88 | data: pd.DataFrame, 89 | cols: Union[list, np.ndarray, tuple] = (), 90 | bins: Union[int, list, np.ndarray, dict] = 10, 91 | quantile: bool = False, 92 | retbins: bool = False, 93 | ): 94 | """ 95 | Index the input DataFrame given the bin_edges for the columns specified in cols. 96 | 97 | :param DataFrame data: input data 98 | :param list cols: list of columns with numeric data which needs to be indexed 99 | :param bins: number of bins, or a list of bin edges (same for all columns), or a dictionary where per column the bins are specified. (default=10)\ 100 | E.g.: bins = {'mileage':5, 'driver_age':[18,25,35,45,55,65,125]} 101 | :param quantile: when bins is an integer, uniform bins (False) or bins based on quantiles (True) 102 | :returns: rebinned DataFrame 103 | :rtype: pandas.DataFrame 104 | """ 105 | xbins = None 106 | if isinstance(bins, dict): 107 | for col in cols: 108 | if col not in bins: 109 | raise ValueError( 110 | "column {0} is not included in bins dictionary.".format(col) 111 | ) 112 | elif isinstance(bins, (list, np.ndarray)): 113 | xbins = bins 114 | 115 | # MB 20210307: check for numeric bins turned off here, also done in dq_check_nunique_values(). 
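    # From here on, the bin specification is resolved per column: an integer or float is
    # turned into uniform (or quantile) edges via bin_edges(), a list or array is used as
    # the edges directly, and a dict is looked up per column. The column values are then
    # indexed with bin_array().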
116 | 117 | binned_data = data.copy() 118 | 119 | bins_dict = {} 120 | for col in cols: 121 | if np.issubdtype(type(bins), np.integer) or np.issubdtype( 122 | type(bins), np.floating 123 | ): 124 | xbins = bin_edges(data[col].astype(float), int(bins), quantile=quantile) 125 | elif isinstance(bins, dict): 126 | if np.issubdtype(type(bins[col]), np.integer) or np.issubdtype( 127 | type(bins[col]), np.floating 128 | ): 129 | xbins = bin_edges( 130 | data[col].astype(float), int(bins[col]), quantile=quantile 131 | ) 132 | elif isinstance(bins[col], (list, np.ndarray)): 133 | xbins = bins[col] 134 | elif xbins is None: 135 | raise ValueError( 136 | "Unexpected type for bins. The found type was '%s'" % str(type(bins)) 137 | ) 138 | 139 | binned_data[col], bin_labels = bin_array(data[col].astype(float).values, xbins) 140 | if retbins: 141 | bins_dict[col] = bin_labels 142 | 143 | if retbins: 144 | return binned_data, bins_dict 145 | 146 | return binned_data 147 | 148 | 149 | def auto_bin_data( 150 | df: pd.DataFrame, 151 | interval_cols: Optional[list] = None, 152 | bins: Union[int, list, np.ndarray, dict] = 10, 153 | quantile: bool = False, 154 | dropna: bool = True, 155 | verbose: bool = True, 156 | ) -> pd.DataFrame: 157 | """ 158 | Index the input DataFrame with automatic bin_edges and interval columns. 159 | 160 | :param pd.DataFrame data_binned: input data 161 | :param list interval_cols: column names of columns with interval variables. 162 | :param bins: number of bins, or a list of bin edges (same for all columns), or a dictionary where per column 163 | the bins are specified. (default=10)\ 164 | E.g.: bins = {'mileage':5, 'driver_age':[18,25,35,45,55,65,125]} 165 | :param quantile: when bins is an integer, uniform bins (False) or bins based on quantiles (True) 166 | :param bool dropna: remove NaN values with True 167 | :param bool verbose: if False, do not print all interval columns that are guessed 168 | :return: phik correlation matrix 169 | """ 170 | # guess interval columns 171 | if interval_cols is None: 172 | interval_cols = guess_interval_cols(df, verbose) 173 | 174 | # clean the data 175 | df_clean, interval_cols_clean = dq_check_nunique_values( 176 | df, interval_cols, dropna=dropna 177 | ) 178 | 179 | # perform rebinning 180 | data_binned, binning_dict = bin_data( 181 | df_clean, cols=interval_cols_clean, bins=bins, quantile=quantile, retbins=True 182 | ) 183 | return data_binned, binning_dict 184 | 185 | 186 | def create_correlation_overview_table( 187 | vals: List[Tuple[str, str, float]] 188 | ) -> pd.DataFrame: 189 | """ 190 | Create overview table of phik/significance data. 191 | 192 | :param list vals: list holding tuples of data for each variable pair formatted as ('var1', 'var2', value) 193 | :returns: symmetric table with phik/significances of all variable pairs 194 | :rtype: pandas.DataFrame 195 | """ 196 | 197 | ll = [] 198 | for c0, c1, v in vals: 199 | ll.append([c0, c1, v]) 200 | ll.append([c1, c0, v]) 201 | 202 | corr_matrix = pd.DataFrame(ll, columns=["var1", "var2", "vals"]).pivot_table( 203 | index="var1", columns="var2", values="vals" 204 | ) 205 | corr_matrix.columns.name = None 206 | corr_matrix.index.name = None 207 | return corr_matrix 208 | 209 | 210 | def hist2d_from_rebinned_df( 211 | data_binned: pd.DataFrame, 212 | dropna: bool = True, 213 | drop_underflow: bool = True, 214 | drop_overflow: bool = True, 215 | ) -> pd.DataFrame: 216 | """ 217 | Give binned 2d DataFrame of two columns of rebinned input DataFrame 218 | 219 | :param df: input data. 
DataFrame must contain exactly two columns 220 | :param bool dropna: remove NaN values with True 221 | :param bool drop_underflow: do not take into account records in underflow bin when True (relevant when binning\ 222 | a numeric variable) 223 | :param bool drop_overflow: do not take into account records in overflow bin when True (relevant when binning\ 224 | a numeric variable) 225 | :returns: histogram DataFrame 226 | """ 227 | 228 | c0, c1 = data_binned.columns 229 | 230 | if not dropna: 231 | data_binned.fillna(defs.NaN, inplace=True) 232 | if drop_underflow: 233 | data_binned.replace(defs.UF, np.nan, inplace=True) 234 | if drop_overflow: 235 | data_binned.replace(defs.OF, np.nan, inplace=True) 236 | 237 | # create a contingency table 238 | df_datahist = ( 239 | data_binned.groupby([c0, c1])[c0].count().to_frame().unstack().fillna(0) 240 | ) 241 | df_datahist.columns = df_datahist.columns.droplevel() 242 | 243 | return df_datahist 244 | 245 | 246 | def hist2d( 247 | df: pd.DataFrame, 248 | interval_cols: Optional[Union[list, np.ndarray]] = None, 249 | bins: Union[int, float, list, np.ndarray, dict] = 10, 250 | quantile: bool = False, 251 | dropna: bool = True, 252 | drop_underflow: bool = True, 253 | drop_overflow: bool = True, 254 | retbins: bool = False, 255 | verbose: bool = True, 256 | ) -> Union[pd.DataFrame, Tuple[pd.DataFrame, dict]]: 257 | """ 258 | Give binned 2d DataFrame of two columns of input DataFrame 259 | 260 | :param df: input data. DataFrame must contain exactly two columns 261 | :param interval_cols: columns with interval variables which need to be binned 262 | :param bins: number of bins, or a list of bin edges (same for all columns), or a dictionary where per column the bins are specified. (default=10)\ 263 | E.g.: bins = {'mileage':5, 'driver_age':[18,25,35,45,55,65,125]} 264 | :param bool quantile: when the number of bins is specified, use uniform binning (False) or quantile binning (True) 265 | :param bool dropna: remove NaN values with True 266 | :param bool drop_underflow: do not take into account records in underflow bin when True (relevant when binning\ 267 | a numeric variable) 268 | :param bool drop_overflow: do not take into account records in overflow bin when True (relevant when binning\ 269 | a numeric variable) 270 | :param bool verbose: if False, do not print all interval columns that are guessed 271 | :returns: histogram DataFrame 272 | """ 273 | 274 | if len(df.columns) != 2: 275 | raise ValueError("DataFrame should contain only two columns") 276 | 277 | if interval_cols is None: 278 | interval_cols = guess_interval_cols(df, verbose) 279 | 280 | data_binned, binning_dict = bin_data( 281 | df, interval_cols, retbins=True, bins=bins, quantile=quantile 282 | ) 283 | datahist = hist2d_from_rebinned_df( 284 | data_binned, 285 | dropna=dropna, 286 | drop_underflow=drop_underflow, 287 | drop_overflow=drop_overflow, 288 | ) 289 | 290 | if retbins: 291 | return datahist, binning_dict 292 | 293 | return datahist 294 | 295 | 296 | def hist2d_from_array( 297 | x: Union[pd.Series, list, np.ndarray], y: [pd.Series, list, np.ndarray], **kwargs 298 | ) -> Union[pd.DataFrame, Tuple[pd.DataFrame, dict]]: 299 | """ 300 | Give binned 2d DataFrame of two input arrays 301 | 302 | :param x: input data. First array-like. 303 | :param y: input data. Second array-like. 
304 | :param interval_cols: columns with interval variables which need to be binned 305 | :param bins: number of bins, or a list of bin edges (same for all columns), or a dictionary where per column the bins are specified. (default=10)\ 306 | E.g.: bins = {'mileage':5, 'driver_age':[18,25,35,45,55,65,125]} 307 | :param bool quantile: when the number of bins is specified, use uniform binning (False) or quantile binning (True) 308 | :param bool dropna: remove NaN values with True 309 | :param bool drop_underflow: do not take into account records in underflow bin when True (relevant when binning\ 310 | a numeric variable) 311 | :param bool drop_overflow: do not take into account records in overflow bin when True (relevant when binning\ 312 | a numeric variable) 313 | :returns: histogram DataFrame 314 | """ 315 | 316 | df = array_like_to_dataframe(x, y) 317 | return hist2d(df, **kwargs) 318 | -------------------------------------------------------------------------------- /tests/test_phik.py: -------------------------------------------------------------------------------- 1 | """Project: Phi_K - correlation coefficient package 2 | 3 | Created: 2018/11/13 4 | 5 | Description: 6 | Collection of helper functions to get fixtures, i.e. for test data. 7 | These are mostly used by the (integration) tests and example notebooks. 8 | 9 | Authors: 10 | KPMG Advanced Analytics & Big Data team, Amstelveen, The Netherlands 11 | 12 | Redistribution and use in source and binary forms, with or without 13 | modification, are permitted according to the terms listed in the file 14 | LICENSE. 15 | """ 16 | 17 | import unittest 18 | import pytest 19 | 20 | import pandas as pd 21 | import numpy as np 22 | from phik import resources, bivariate 23 | from phik.simulation import sim_2d_data_patefield, CPP_SUPPORT 24 | from phik.binning import auto_bin_data, bin_data 25 | from phik.phik import phik_observed_vs_expected_from_rebinned_df, phik_from_hist2d 26 | from phik.statistics import get_dependent_frequency_estimates 27 | 28 | 29 | @pytest.mark.filterwarnings("ignore:Using or importing the ABCs from") 30 | class PhiKTest(unittest.TestCase): 31 | """Tests for calculation of Phi_K""" 32 | 33 | def test_phik_calculation(self): 34 | """Test the calculation of Phi_K""" 35 | 36 | chi2 = bivariate.chi2_from_phik(0.5, 1000, nx=10, ny=10) 37 | self.assertTrue(np.isclose(chi2, 271.16068979654125, 1e-6)) 38 | 39 | phik = bivariate.phik_from_chi2(chi2, 1000, 10, 10) 40 | self.assertTrue(np.isclose(phik, 0.5, 1e-6)) 41 | 42 | def test_phik_from_hist2d(self): 43 | """Test the calculation of Phi_K value from hist2d""" 44 | 45 | # open fake car insurance data 46 | df = pd.read_csv(resources.fixture("fake_insurance_data.csv.gz")) 47 | 48 | # create contingency matrix 49 | cols = ["mileage", "car_size"] 50 | interval_cols = ["mileage"] 51 | observed = df[cols].hist2d(interval_cols=interval_cols) 52 | 53 | phik_value = phik_from_hist2d(observed) 54 | self.assertAlmostEqual(phik_value, 0.7685888294891855, places=3) 55 | 56 | def test_phik_observed_vs_expected_from_hist2d(self): 57 | """Test the calculation of Phi_K value from hist2d""" 58 | 59 | # open fake car insurance data 60 | df = pd.read_csv(resources.fixture("fake_insurance_data.csv.gz")) 61 | 62 | # create contingency matrix 63 | cols = ["mileage", "car_size"] 64 | interval_cols = ["mileage"] 65 | 66 | observed = df[cols].hist2d(interval_cols=interval_cols).values 67 | expected = get_dependent_frequency_estimates(observed) 68 | 69 | phik_value = phik_from_hist2d(observed=observed, 
expected=expected) 70 | self.assertAlmostEqual(phik_value, 0.7685888294891855, places=3) 71 | 72 | def test_phik_matrix(self): 73 | """Test the calculation of Phi_K""" 74 | # open fake car insurance data 75 | df = pd.read_csv(resources.fixture("fake_insurance_data.csv.gz")) 76 | cols = list(df.columns) 77 | 78 | # get the phi_k correlation matrix between all variables 79 | interval_cols = ["driver_age", "mileage"] 80 | phik_corr = df.phik_matrix(interval_cols=interval_cols) 81 | 82 | self.assertAlmostEqual( 83 | phik_corr.values[cols.index("car_color"), cols.index("area")], 84 | 0.5904561614620166, 85 | places=3, 86 | ) 87 | self.assertAlmostEqual( 88 | phik_corr.values[cols.index("area"), cols.index("car_color")], 89 | 0.5904561614620166, 90 | places=3, 91 | ) 92 | self.assertAlmostEqual( 93 | phik_corr.values[cols.index("mileage"), cols.index("car_size")], 94 | 0.768588987856336, 95 | places=3, 96 | ) 97 | self.assertAlmostEqual( 98 | phik_corr.values[cols.index("car_size"), cols.index("mileage")], 99 | 0.768588987856336, 100 | places=3, 101 | ) 102 | 103 | def test_phik_matrix_observed_vs_expected(self): 104 | """Test the calculation of Phi_K""" 105 | # open fake car insurance data 106 | df = pd.read_csv(resources.fixture("fake_insurance_data.csv.gz")) 107 | cols = list(df.columns) 108 | 109 | # get the phi_k correlation matrix between all variables 110 | binned_df, _ = auto_bin_data(df) 111 | phik_corr = phik_observed_vs_expected_from_rebinned_df(binned_df, binned_df) 112 | 113 | self.assertTrue( 114 | np.isclose( 115 | phik_corr.values[cols.index("car_color"), cols.index("area")], 0.0 116 | ) 117 | ) 118 | self.assertTrue( 119 | np.isclose( 120 | phik_corr.values[cols.index("area"), cols.index("car_color")], 0.0 121 | ) 122 | ) 123 | self.assertTrue( 124 | np.isclose( 125 | phik_corr.values[cols.index("mileage"), cols.index("car_size")], 0.0 126 | ) 127 | ) 128 | self.assertTrue( 129 | np.isclose( 130 | phik_corr.values[cols.index("car_size"), cols.index("mileage")], 0.0 131 | ) 132 | ) 133 | self.assertTrue( 134 | np.isclose( 135 | phik_corr.values[cols.index("car_size"), cols.index("car_size")], 1.0 136 | ) 137 | ) 138 | 139 | def test_global_phik(self): 140 | """Test the calculation of global Phi_K values""" 141 | 142 | # open fake car insurance data 143 | df = pd.read_csv(resources.fixture("fake_insurance_data.csv.gz")) 144 | 145 | # get the global phi_k values 146 | interval_cols = ["driver_age", "mileage"] 147 | gk = df.global_phik(interval_cols=interval_cols) 148 | 149 | area = (np.where(gk[1] == "area"))[0][0] 150 | car_size = (np.where(gk[1] == "car_size"))[0][0] 151 | mileage = (np.where(gk[1] == "mileage"))[0][0] 152 | 153 | self.assertAlmostEqual(gk[0][area][0], 0.6057528003711345, places=3) 154 | self.assertAlmostEqual(gk[0][car_size][0], 0.76858883, places=3) 155 | self.assertAlmostEqual(gk[0][mileage][0], 0.768588987856336, places=3) 156 | 157 | def test_significance_matrix_asymptotic(self): 158 | """Test significance calculation""" 159 | 160 | # open fake car insurance data 161 | df = pd.read_csv(resources.fixture("fake_insurance_data.csv.gz")) 162 | cols = list(df.columns) 163 | # get significances 164 | interval_cols = ["driver_age", "mileage"] 165 | sm = df.significance_matrix( 166 | interval_cols=interval_cols, significance_method="asymptotic" 167 | ) 168 | 169 | self.assertTrue( 170 | np.isclose( 171 | sm.values[cols.index("car_color"), cols.index("area")], 172 | 37.66184429195198, 173 | ) 174 | ) 175 | self.assertTrue( 176 | np.isclose( 177 | 
sm.values[cols.index("area"), cols.index("car_color")], 178 | 37.66184429195198, 179 | ) 180 | ) 181 | self.assertTrue( 182 | np.isclose( 183 | sm.values[cols.index("mileage"), cols.index("car_size")], 184 | 49.3323049685695, 185 | ) 186 | ) 187 | self.assertTrue( 188 | np.isclose( 189 | sm.values[cols.index("car_size"), cols.index("mileage")], 190 | 49.3323049685695, 191 | ) 192 | ) 193 | 194 | def test_significance_matrix_hybrid(self): 195 | """Test significance calculation""" 196 | 197 | # open fake car insurance data 198 | df = pd.read_csv(resources.fixture("fake_insurance_data.csv.gz")) 199 | cols = list(df.columns) 200 | # get significances 201 | interval_cols = ["driver_age", "mileage"] 202 | sm = df.significance_matrix( 203 | interval_cols=interval_cols, significance_method="hybrid" 204 | ) 205 | 206 | self.assertTrue( 207 | np.isclose( 208 | sm.values[cols.index("car_color"), cols.index("area")], 209 | 37.63086023595297, 210 | atol=10e-2, 211 | ) 212 | ) 213 | self.assertTrue( 214 | np.isclose( 215 | sm.values[cols.index("area"), cols.index("car_color")], 216 | 37.63086023595297, 217 | atol=10e-2, 218 | ) 219 | ) 220 | self.assertTrue( 221 | np.isclose( 222 | sm.values[cols.index("mileage"), cols.index("car_size")], 223 | 49.28345609465683, 224 | atol=10e-2, 225 | ) 226 | ) 227 | self.assertTrue( 228 | np.isclose( 229 | sm.values[cols.index("car_size"), cols.index("mileage")], 230 | 49.28345609465683, 231 | atol=10e-2, 232 | ) 233 | ) 234 | 235 | def test_significance_matrix_mc(self): 236 | """Test significance calculation""" 237 | 238 | # open fake car insurance data 239 | df = pd.read_csv(resources.fixture("fake_insurance_data.csv.gz")) 240 | cols = list(df.columns) 241 | # get significances 242 | interval_cols = ["driver_age", "mileage"] 243 | sm = df.significance_matrix( 244 | interval_cols=interval_cols, significance_method="MC" 245 | ) 246 | 247 | self.assertTrue( 248 | np.isclose(sm.values[cols.index("car_color"), cols.index("area")], np.inf) 249 | ) 250 | self.assertTrue( 251 | np.isclose(sm.values[cols.index("area"), cols.index("car_color")], np.inf) 252 | ) 253 | self.assertTrue( 254 | np.isclose(sm.values[cols.index("mileage"), cols.index("car_size")], np.inf) 255 | ) 256 | self.assertTrue( 257 | np.isclose(sm.values[cols.index("car_size"), cols.index("mileage")], np.inf) 258 | ) 259 | 260 | def test_hist2d(self): 261 | """Test the calculation of global Phi_K values""" 262 | 263 | # open fake car insurance data 264 | df = pd.read_csv(resources.fixture("fake_insurance_data.csv.gz")) 265 | 266 | # create contingency matrix 267 | cols = ["mileage", "car_size"] 268 | interval_cols = ["mileage"] 269 | h2d = df[cols].hist2d(interval_cols=interval_cols) 270 | 271 | self.assertEqual(h2d.values[1, 1], 10) 272 | self.assertEqual(h2d.values[5, 5], 217) 273 | 274 | def test_hist2d_array(self): 275 | """Test the calculation of global Phi_K values""" 276 | 277 | # open fake car insurance data 278 | df = pd.read_csv(resources.fixture("fake_insurance_data.csv.gz")) 279 | 280 | # create contingency matrix 281 | interval_cols = ["mileage"] 282 | h2d = df["mileage"].hist2d(df["car_size"], interval_cols=interval_cols) 283 | self.assertEqual(h2d.values[1, 1], 10) 284 | self.assertEqual(h2d.values[5, 5], 217) 285 | 286 | def test_outlier_significance_matrix(self): 287 | """Test the calculation of outlier significances""" 288 | 289 | # open fake car insurance data 290 | df = pd.read_csv(resources.fixture("fake_insurance_data.csv.gz")) 291 | 292 | # calculate outlier significances 293 | cols = 
["mileage", "car_size"] 294 | interval_cols = ["mileage"] 295 | om = df[cols].outlier_significance_matrix(interval_cols=interval_cols) 296 | 297 | self.assertTrue(np.isclose(om.values[0, 1], 21.483476494343552)) 298 | self.assertTrue(np.isclose(om.values[2, 4], -1.246784034214704)) 299 | 300 | def test_outlier_significance_matrices(self): 301 | """Test the calculation of outlier significances""" 302 | 303 | # open fake car insurance data 304 | df = pd.read_csv(resources.fixture("fake_insurance_data.csv.gz")) 305 | 306 | # calculate outlier significances 307 | interval_cols = ["mileage", "driver_age"] 308 | om = df.outlier_significance_matrices(interval_cols=interval_cols) 309 | 310 | self.assertTrue(isinstance(om, dict)) 311 | 312 | @pytest.mark.skipif(not CPP_SUPPORT, reason="cpp not supported") 313 | def test_simulation_2d_patefield(self): 314 | """Test simulation code using patefield algorithm.""" 315 | og_state = np.random.get_state() 316 | np.random.seed(42) 317 | sample = np.random.randint(1, 200, (50, 2)) 318 | 319 | # call test function 320 | res = sim_2d_data_patefield(sample, seed=42).T 321 | np.random.set_state(og_state) 322 | mean0, mean1 = res.mean(1) 323 | self.assertTrue(np.isclose(mean0, 105.46)) 324 | self.assertTrue(np.isclose(mean1, 91.18)) 325 | 326 | def test_binning_bin_data_bins_tyes(self): 327 | # Non regression test 328 | # https://github.com/KaveIO/PhiK/issues/28 329 | df = pd.DataFrame({"x": np.random.randn(10)}) 330 | bins_int = np.arange(5, 11, 1) 331 | bins_float = np.arange(5, 11, 1.0) 332 | bins_dict_int = {"x": np.uint8(10)} 333 | bins_dict_float = {"x": np.float32(10.3)} 334 | 335 | for bins in bins_int: 336 | bin_data(df, cols=["x"], bins=bins) 337 | 338 | for bins in bins_float: 339 | bin_data(df, cols=["x"], bins=bins) 340 | 341 | bin_data(df, cols=["x"], bins=bins_dict_int) 342 | bin_data(df, cols=["x"], bins=bins_dict_float) 343 | -------------------------------------------------------------------------------- /tests/integration/test_phik_tutorial_advanced.py: -------------------------------------------------------------------------------- 1 | # # Phi_K advanced tutorial 2 | # 3 | # This notebook guides you through the more advanced functionality of the phik package. This notebook will not cover all the underlying theory, but will just attempt to give an overview of all the options that are available. For a theoretical description the user is referred to our paper. 4 | # 5 | # The package offers functionality on three related topics: 6 | # 7 | # 1. Phik correlation matrix 8 | # 2. Significance matrix 9 | # 3. Outlier significance matrix 10 | 11 | # + 12 | # import standard packages 13 | import numpy as np 14 | import pandas as pd 15 | 16 | from phik import resources 17 | from phik.decorators import * 18 | 19 | # # Load data 20 | # 21 | # A simulated dataset is part of the phik-package. The dataset concerns car insurance data. Load the dataset here: 22 | 23 | 24 | def test_advanced_notebook(): 25 | data = pd.read_csv(resources.fixture("fake_insurance_data.csv.gz")) 26 | 27 | data.head() 28 | 29 | # ## Specify bin types 30 | # 31 | # The phik-package offers a way to calculate correlations between variables of mixed types. Variable types can be inferred automatically although we recommend to variable types to be specified by the user. 32 | # 33 | # Because interval type variables need to be binned in order to calculate phik and the significance, a list of interval variables is created. 
34 | 35 | # + 36 | data_types = { 37 | "severity": "interval", 38 | "driver_age": "interval", 39 | "satisfaction": "ordinal", 40 | "mileage": "interval", 41 | "car_size": "ordinal", 42 | "car_use": "ordinal", 43 | "car_color": "categorical", 44 | "area": "categorical", 45 | } 46 | 47 | interval_cols = [ 48 | col for col, v in data_types.items() if v == "interval" and col in data.columns 49 | ] 50 | # interval_cols is used below 51 | # - 52 | 53 | # # Phik correlation matrix 54 | # 55 | # Now let's start calculating the correlation phik between pairs of variables. 56 | # 57 | # Note that the original dataset is used as input, the binning of interval variables is done automatically. 58 | 59 | phik_overview = data.phik_matrix(interval_cols=interval_cols) 60 | 61 | # ### Specify binning per interval variable 62 | # 63 | # Binning can be set per interval variable individually. One can set the number of bins, or specify a list of bin edges. Note that the measured phik correlation is dependent on the chosen binning. 64 | # The default binning is uniform between the min and max values of the interval variable. 65 | 66 | bins = {"mileage": 5, "driver_age": [18, 25, 35, 45, 55, 65, 125]} 67 | phik_overview = data.phik_matrix(interval_cols=interval_cols, bins=bins) 68 | 69 | # ### Do not apply noise correction 70 | # 71 | # For low statistics samples often a correlation larger than zero is measured when no correlation is actually present in the true underlying distribution. This is not only the case for phik, but also for the pearson correlation and Cramer's phi (see figure 4 in XX ). In the phik calculation a noise correction is applied by default, to take into account erroneous correlation values as a result of low statistics. To switch off this noise cancellation (not recommended), do: 72 | 73 | phik_overview = data.phik_matrix( 74 | interval_cols=interval_cols, noise_correction=False 75 | ) 76 | 77 | # ### Using a different expectation histogram 78 | # 79 | # By default phik compares the 2d distribution of two (binned) variables with the distribution that assumes no dependency between them. One can also change the expected distribution though. Phi_K is calculated in the same way, but using the other expectation distribution. 80 | 81 | from phik.binning import auto_bin_data 82 | from phik.phik import (phik_from_hist2d, 83 | phik_observed_vs_expected_from_rebinned_df) 84 | from phik.statistics import get_dependent_frequency_estimates 85 | 86 | # get observed 2d histogram of two variables 87 | cols = ["mileage", "car_size"] 88 | icols = ["mileage"] 89 | observed = data[cols].hist2d(interval_cols=icols).values 90 | 91 | # default phik evaluation from observed distribution 92 | phik_value = phik_from_hist2d(observed) 93 | print(phik_value) 94 | 95 | # phik evaluation from an observed and expected distribution 96 | expected = get_dependent_frequency_estimates(observed) 97 | phik_value = phik_from_hist2d(observed=observed, expected=expected) 98 | print(phik_value) 99 | 100 | # one can also compare two datasets against each other, and get a full phik matrix that way. 101 | # this needs binned datasets though. 102 | # (the user needs to make sure the binnings of both datasets are identical.) 
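    # Minimal sketch of comparing two datasets (`other_data` is a hypothetical
    # second DataFrame with the same columns): passing the same explicit `bins`
    # to both auto_bin_data calls guarantees identical binnings, e.g.
    #   shared_bins = {"mileage": 5, "driver_age": [18, 25, 35, 45, 55, 65, 125]}
    #   binned_a, _ = auto_bin_data(data, interval_cols=interval_cols, bins=shared_bins)
    #   binned_b, _ = auto_bin_data(other_data, interval_cols=interval_cols, bins=shared_bins)
    #   phik_observed_vs_expected_from_rebinned_df(binned_a, binned_b)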
103 | data_binned, _ = auto_bin_data(data, interval_cols=interval_cols) 104 | 105 | # here we are comparing data_binned against itself 106 | phik_matrix = phik_observed_vs_expected_from_rebinned_df(data_binned, data_binned) 107 | 108 | # all off-diagonal entries are zero, meaning the all 2d distributions of both datasets are identical. 109 | # (by construction the diagonal is one.) 110 | 111 | # # Statistical significance of the correlation 112 | # 113 | # When assessing correlations it is good practise to evaluate both the correlation and the significance of the correlation: a large correlation may be statistically insignificant, and vice versa a small correlation may be very significant. For instance, scipy.stats.pearsonr returns both the pearson correlation and the p-value. Similarly, the phik package offers functionality the calculate a significance matrix. Significance is defined as: 114 | # 115 | # $$Z = \Phi^{-1}(1-p)\ ;\quad \Phi(z)=\frac{1}{\sqrt{2\pi}} \int_{-\infty}^{z} e^{-t^{2}/2}\,{\rm d}t $$ 116 | # 117 | # Several corrections to the 'standard' p-value calculation are taken into account, making the method more robust for low statistics and sparse data cases. The user is referred to our paper for more details. 118 | # 119 | # Due to the corrections, the significance calculation can take a few seconds. 120 | 121 | significance_overview = data.significance_matrix(interval_cols=interval_cols) 122 | 123 | # ### Specify binning per interval variable 124 | # Binning can be set per interval variable individually. One can set the number of bins, or specify a list of bin edges. Note that the measure phik correlation is dependent on the chosen binning. 125 | 126 | bins = {"mileage": 5, "driver_age": [18, 25, 35, 45, 55, 65, 125]} 127 | significance_overview = data.significance_matrix( 128 | interval_cols=interval_cols, bins=bins 129 | ) 130 | 131 | # ### Specify significance method 132 | # 133 | # The recommended method to calculate the significance of the correlation is a hybrid approach, which uses the G-test statistic. The number of degrees of freedom and an analytical, empirical description of the $\chi^2$ distribution are sed, based on Monte Carlo simulations. This method works well for both high as low statistics samples. 134 | # 135 | # Other approaches to calculate the significance are implemented: 136 | # - asymptotic: fast, but over-estimates the number of degrees of freedom for low statistics samples, leading to erroneous values of the significance 137 | # - MC: Many simulated samples are needed to accurately measure significances larger than 3, making this method computationally expensive. 138 | # 139 | 140 | significance_overview = data.significance_matrix( 141 | interval_cols=interval_cols, significance_method="asymptotic" 142 | ) 143 | 144 | # ### Simulation method 145 | # 146 | # The chi2 of a contingency table is measured using a comparison of the expected frequencies with the true frequencies in a contingency table. The expected frequencies can be simulated in a variety of ways. The following methods are implemented: 147 | # 148 | # - multinominal: Only the total number of records is fixed. (default) 149 | # - row_product_multinominal: The row totals fixed in the sampling. 150 | # - col_product_multinominal: The column totals fixed in the sampling. 151 | # - hypergeometric: Both the row or column totals are fixed in the sampling. (Note that this type of sampling is only available when row and column totals are integers, which is usually the case.) 
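    # Any of the simulation methods listed above can be selected through the
    # `simulation_method` argument (illustrative sketch, not executed here):
    #   data.significance_matrix(interval_cols=interval_cols,
    #                            simulation_method='row_product_multinominal')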
152 | 153 | # + 154 | # --- Warning, can be slow 155 | # turned off here by default for unit testing purposes 156 | 157 | # significance_overview = data.significance_matrix(interval_cols=interval_cols, simulation_method='hypergeometric') 158 | # significance_overview 159 | # - 160 | 161 | # ### Expected frequencies 162 | 163 | from phik.simulation import (sim_2d_data, sim_2d_data_patefield, 164 | sim_2d_product_multinominal) 165 | 166 | inputdata = data[["driver_age", "area"]].hist2d(interval_cols=["driver_age"]) 167 | 168 | # #### Multinominal 169 | 170 | simdata = sim_2d_data(inputdata.values) 171 | print("data total:", inputdata.sum().sum()) 172 | print("sim total:", simdata.sum().sum()) 173 | print("data row totals:", inputdata.sum(axis=0).values) 174 | print("sim row totals:", simdata.sum(axis=0)) 175 | print("data column totals:", inputdata.sum(axis=1).values) 176 | print("sim column totals:", simdata.sum(axis=1)) 177 | 178 | # #### product multinominal 179 | 180 | simdata = sim_2d_product_multinominal(inputdata.values, axis=0) 181 | print("data total:", inputdata.sum().sum()) 182 | print("sim total:", simdata.sum().sum()) 183 | print("data row totals:", inputdata.sum(axis=0).astype(int).values) 184 | print("sim row totals:", simdata.sum(axis=0).astype(int)) 185 | print("data column totals:", inputdata.sum(axis=1).astype(int).values) 186 | print("sim column totals:", simdata.sum(axis=1).astype(int)) 187 | 188 | # #### hypergeometric ("patefield") 189 | 190 | # + 191 | # patefield simulation needs compiled c++ code. 192 | # only run this if the python binding to the (compiled) patefiled simulation function is found. 193 | from phik.simcore import CPP_SUPPORT 194 | 195 | if CPP_SUPPORT: 196 | simdata = sim_2d_data_patefield(inputdata.values) 197 | print("data total:", inputdata.sum().sum()) 198 | print("sim total:", simdata.sum().sum()) 199 | print("data row totals:", inputdata.sum(axis=0).astype(int).values) 200 | print("sim row totals:", simdata.sum(axis=0)) 201 | print("data column totals:", inputdata.sum(axis=1).astype(int).values) 202 | print("sim column totals:", simdata.sum(axis=1)) 203 | # - 204 | 205 | # # Outlier significance 206 | # 207 | # The normal pearson correlation between two interval variables is easy to interpret. However, the phik correlation between two variables of mixed type is not always easy to interpret, especially when it concerns categorical variables. Therefore, functionality is provided to detect "outliers": excesses and deficits over the expected frequencies in the contingency table of two variables. 208 | # 209 | 210 | # ### Example 1: mileage versus car_size 211 | 212 | # For the categorical variable pair mileage - car_size we measured: 213 | # 214 | # $$\phi_k = 0.77 \, ,\quad\quad \mathrm{significance} = 46.3$$ 215 | # 216 | # Let's use the outlier significance functionality to gain a better understanding of this significance correlation between mileage and car size. 217 | # 218 | 219 | # + 220 | c0 = "mileage" 221 | c1 = "car_size" 222 | 223 | tmp_interval_cols = ["mileage"] 224 | # - 225 | 226 | outlier_signifs, binning_dict = data[[c0, c1]].outlier_significance_matrix( 227 | interval_cols=tmp_interval_cols, retbins=True 228 | ) 229 | 230 | # ### Specify binning per interval variable 231 | # Binning can be set per interval variable individually. One can set the number of bins, or specify a list of bin edges. 
232 | # 233 | # Note: in case a bin is created without any records this bin will be automatically dropped in the phik and (outlier) significance calculations. However, in the outlier significance calculation this will currently lead to an error as the number of provided bin edges does not match the number of bins anymore. 234 | 235 | bins = [0, 1e2, 1e3, 1e4, 1e5, 1e6] 236 | outlier_signifs, binning_dict = data[[c0, c1]].outlier_significance_matrix( 237 | interval_cols=tmp_interval_cols, bins=bins, retbins=True 238 | ) 239 | 240 | # ### Specify binning per interval variable -- dealing with underflow and overflow 241 | # 242 | # When specifying custom bins as situation can occur when the minimal (maximum) value in the data is smaller (larger) than the minimum (maximum) bin edge. Data points outside the specified range will be collected in the underflow (UF) and overflow (OF) bins. One can choose how to deal with these under/overflow bins, by setting the drop_underflow and drop_overflow variables. 243 | # 244 | # Note that the drop_underflow and drop_overflow options are also available for the calculation of the phik matrix and the significance matrix. 245 | 246 | bins = [1e2, 1e3, 1e4, 1e5] 247 | outlier_signifs, binning_dict = data[[c0, c1]].outlier_significance_matrix( 248 | interval_cols=tmp_interval_cols, 249 | bins=bins, 250 | retbins=True, 251 | drop_underflow=False, 252 | drop_overflow=False, 253 | ) 254 | 255 | # ### Dealing with NaN's in the data 256 | 257 | # Let's add some missing values to our data 258 | 259 | data.loc[np.random.choice(range(len(data)), size=10), "car_size"] = np.nan 260 | data.loc[np.random.choice(range(len(data)), size=10), "mileage"] = np.nan 261 | 262 | # Sometimes there can be information in the missing values and in which case you might want to consider the NaN values as a separate category. This can be achieved by setting the dropna argument to False. 263 | 264 | bins = [1e2, 1e3, 1e4, 1e5] 265 | outlier_signifs, binning_dict = data[[c0, c1]].outlier_significance_matrix( 266 | interval_cols=tmp_interval_cols, 267 | bins=bins, 268 | retbins=True, 269 | drop_underflow=False, 270 | drop_overflow=False, 271 | dropna=False, 272 | ) 273 | 274 | # Here OF and UF are the underflow and overflow bin of car_size, respectively. 275 | # 276 | # To just ignore records with missing values set dropna to True (default). 277 | 278 | bins = [1e2, 1e3, 1e4, 1e5] 279 | outlier_signifs, binning_dict = data[[c0, c1]].outlier_significance_matrix( 280 | interval_cols=tmp_interval_cols, 281 | bins=bins, 282 | retbins=True, 283 | drop_underflow=False, 284 | drop_overflow=False, 285 | dropna=True, 286 | ) 287 | 288 | # Note that the dropna option is also available for the calculation of the phik matrix and the significance matrix. 289 | -------------------------------------------------------------------------------- /tests/integration/test_phik_tutorial_basic.py: -------------------------------------------------------------------------------- 1 | # # Phi_K basic tutorial 2 | # 3 | # This notebook guides you through the basic functionality of the phik package. The package offers functionality on three related topics: 4 | # 5 | # 1. Phik correlation matrix 6 | # 2. Significance matrix 7 | # 3. Outlier significance matrix 8 | # 9 | # For more information on the underlying theory, the user is referred to our paper. 
10 | 11 | import itertools 12 | 13 | import matplotlib.pyplot as plt 14 | # + 15 | # import standard packages 16 | import numpy as np 17 | import pandas as pd 18 | 19 | import phik 20 | from phik import resources 21 | from phik.binning import bin_data 22 | from phik.report import plot_correlation_matrix 23 | 24 | # # Load data 25 | # 26 | # A simulated dataset is part of the phik-package. The dataset concerns fake car insurance data. Load the dataset here: 27 | 28 | 29 | def test_basic_notebook(): 30 | data = pd.read_csv(resources.fixture("fake_insurance_data.csv.gz")) 31 | 32 | # # Take a first look at the data 33 | 34 | # Let's use a simple data.head() to get an idea of what the data looks like and inspect the different types of variables. 35 | 36 | data.head() 37 | 38 | # # Specify bin types 39 | # 40 | # The phik-package offers a way to calculate correlations between variables of mixed types. Variable types can be inferred automatically although we recommend variable types to be specified by the user. 41 | # 42 | # Because interval type variables need to be binned in order to calculate phik and the significance, a list of interval variables is created. 43 | 44 | # + 45 | data_types = { 46 | "severity": "interval", 47 | "driver_age": "interval", 48 | "satisfaction": "ordinal", 49 | "mileage": "interval", 50 | "car_size": "ordinal", 51 | "car_use": "ordinal", 52 | "car_color": "categorical", 53 | "area": "categorical", 54 | } 55 | 56 | interval_cols = [ 57 | col for col, v in data_types.items() if v == "interval" and col in data.columns 58 | ] 59 | # - 60 | 61 | # # Visually inspect pairwise correlations 62 | 63 | # ## Bin the interval variables 64 | # 65 | # To get a feeling for the data, let's bin the interval variables and create 2d histograms to inspect the correlations between variables. By binning the interval variables we can treat all variable types in the same way. 
66 | # 67 | 68 | # bin the interval variables 69 | data_binned, binning_dict = bin_data(data, cols=interval_cols, retbins=True) 70 | 71 | # + 72 | # plot each variable pair 73 | plt.rc("text", usetex=False) 74 | 75 | n = 0 76 | for i in range(len(data.columns)): 77 | n = n + i 78 | 79 | ncols = 3 80 | nrows = int(np.ceil(n / ncols)) 81 | fig, axes = plt.subplots(nrows, ncols, figsize=(15, 4 * nrows)) 82 | ndecimals = 0 83 | 84 | for i, comb in enumerate(itertools.combinations(data_binned.columns.values, 2)): 85 | c = int(i % ncols) 86 | r = int((i - c) / ncols) 87 | 88 | # get data 89 | c0, c1 = comb 90 | datahist = ( 91 | data_binned.groupby([c0, c1])[c0].count().to_frame().unstack().fillna(0) 92 | ) 93 | datahist.columns = datahist.columns.droplevel() 94 | 95 | # plot data 96 | img = axes[r][c].pcolormesh(datahist.values, edgecolor="w", linewidth=1) 97 | 98 | # axis ticks and tick labels 99 | if c0 in binning_dict.keys(): 100 | ylabels = [ 101 | "{1:.{0}f}_{2:.{0}f}".format( 102 | ndecimals, binning_dict[c0][i][0], binning_dict[c0][i][1] 103 | ) 104 | for i in range(len(binning_dict[c0])) 105 | ] 106 | else: 107 | ylabels = datahist.index 108 | 109 | if c1 in binning_dict.keys(): 110 | xlabels = [ 111 | "{1:.{0}f}_{2:.{0}f}".format( 112 | ndecimals, binning_dict[c1][i][0], binning_dict[c1][i][1] 113 | ) 114 | for i in range(len(binning_dict[c1])) 115 | ] 116 | else: 117 | xlabels = datahist.columns 118 | 119 | # axis labels 120 | axes[r][c].set_yticks(np.arange(len(ylabels)) + 0.5) 121 | axes[r][c].set_xticks(np.arange(len(xlabels)) + 0.5) 122 | axes[r][c].set_xticklabels(xlabels, rotation="vertical") 123 | axes[r][c].set_yticklabels(ylabels, rotation="horizontal") 124 | axes[r][c].set_xlabel(datahist.columns.name) 125 | axes[r][c].set_ylabel(datahist.index.name) 126 | axes[r][c].set_title("data") 127 | 128 | plt.tight_layout() 129 | 130 | # - 131 | 132 | # # Correlation: mileage vs car_size 133 | # 134 | # From the above plots it seems like there might be an interesting a correlation between mileage and car_size. Let's see what phik correlation is measured for this data. 135 | 136 | # + 137 | x, y = data[["mileage", "car_size"]].T.values 138 | 139 | print("phik = %.2f" % phik.phik_from_array(x, y, num_vars=["x"])) 140 | print("significance = %.2f" % phik.significance_from_array(x, y, num_vars=["x"])[1]) 141 | 142 | # - 143 | 144 | # Indeed there is a correlation between these variables and the correlation is also significant. To better understand the correlation, we can have a look at the significance of excesses and deficits in the 2-dimensional contingency table, so-called "outlier significances". 145 | 146 | phik.outlier_significance_from_array(x, y, num_vars=["x"]) 147 | 148 | # The values displayed in the matrix are the significances of the outlier frequencies, i.e. a large value means that the measured frequency for that bin is significantly different from the expected frequency in that bin. 149 | # 150 | # Let's visualise for easier interpretation. 
151 | 152 | # + 153 | outlier_signifs = phik.outlier_significance_from_array(x, y, num_vars=["x"]) 154 | 155 | zvalues = outlier_signifs.values 156 | xlabels = outlier_signifs.columns 157 | ylabels = outlier_signifs.index 158 | xlabel = "x" 159 | ylabel = "y" 160 | 161 | plot_correlation_matrix( 162 | zvalues, 163 | x_labels=xlabels, 164 | y_labels=ylabels, 165 | x_label=xlabel, 166 | y_label=ylabel, 167 | vmin=-5, 168 | vmax=5, 169 | title="outlier significance", 170 | identity_layout=False, 171 | fontsize_factor=1.2, 172 | ) 173 | # - 174 | 175 | # # $\phi_k$ functions for dataframes 176 | # 177 | # In our data we have 5 different columns, meaning we have to evaluate 4+3+2+1=10 pairs of variables for possible correlations. In a large dataset, with many different variables, this can easily become a cumbersome task. Can we do this more efficient? yes! We have provided functions that work on dataframes, to allow you to calculate the phik correlation, significance and outlier significance for all different variable combinations at once. 178 | # 179 | 180 | # The functions are by default available after import of the phik package. 181 | 182 | # # $\phi_k$ correlation matrix 183 | # 184 | # Now let's start calculating the phik correlation coefficient between pairs of variables. 185 | # 186 | # Note that the original dataset is used as input, the binning of interval variables is done automatically. 187 | 188 | phik_overview = data.phik_matrix(interval_cols=interval_cols) 189 | phik_overview 190 | 191 | # When no interval columns are provided, the code makes an educated guess 192 | 193 | data.phik_matrix() 194 | 195 | plot_correlation_matrix( 196 | phik_overview.values, 197 | x_labels=phik_overview.columns, 198 | y_labels=phik_overview.index, 199 | vmin=0, 200 | vmax=1, 201 | color_map="Blues", 202 | title=r"correlation $\phi_K$", 203 | fontsize_factor=1.5, 204 | figsize=(7, 5.5), 205 | ) 206 | plt.tight_layout() 207 | 208 | # # Global correlation: $g_k$ 209 | # 210 | # The global correlation coefficient is a measure of the total correlation of one variable to all other variables in the dataset. They give an indication of how well on variable can be modelled in terms of the other variables. A calculation of the global correlation coefficient is provided within the phik package. 211 | 212 | global_correlation, global_labels = data.global_phik(interval_cols=interval_cols) 213 | for c, l in zip(global_correlation, global_labels): 214 | print(l, c[0]) 215 | 216 | plot_correlation_matrix( 217 | global_correlation, 218 | x_labels=[""], 219 | y_labels=global_labels, 220 | vmin=0, 221 | vmax=1, 222 | figsize=(3.5, 4), 223 | color_map="Blues", 224 | title=r"$g_k$", 225 | fontsize_factor=1.5, 226 | ) 227 | plt.tight_layout() 228 | 229 | # # Statistical significance of the correlation: $Z$-score 230 | # 231 | # When assessing correlations it is good practise to evaluate both the correlation and the significance of the correlation: a large correlation may be statistically insignificant, and vice versa a small correlation may be very significant. For instance, scipy.stats.pearsonr returns both the pearson correlation and the p-value. Similarly, the phik package offers functionality the calculate a significance matrix. 
Significance is defined as: 232 | # 233 | # $$Z = \Phi^{-1}(1-p)\ ;\quad \Phi(z)=\frac{1}{\sqrt{2\pi}} \int_{-\infty}^{z} e^{-t^{2}/2}\,{\rm d}t $$ 234 | # 235 | # Several corrections to the 'standard' p-value calculation are taken into account, making the method more robust for low statistics and sparse data cases. The user is referred to our paper for more details. 236 | # 237 | # As a result, the calculation may take a few seconds. 238 | 239 | significance_overview = data.significance_matrix(interval_cols=interval_cols) 240 | significance_overview 241 | 242 | plot_correlation_matrix( 243 | significance_overview.fillna(0).values, 244 | x_labels=significance_overview.columns, 245 | y_labels=significance_overview.index, 246 | vmin=-5, 247 | vmax=5, 248 | title="significance", 249 | usetex=False, 250 | fontsize_factor=1.5, 251 | figsize=(7, 5.5), 252 | ) 253 | plt.tight_layout() 254 | 255 | # # Outlier significance 256 | # 257 | # The normal pearson correlation between two interval variables is easy to interpret. However, the phik correlation between two variables of mixed type is not always easy to interpret, especially when it concerns categorical variables. Therefore, functionality is provided to detect "outliers": excesses and deficits over the expected frequencies in the contingency table of two variables. 258 | # 259 | 260 | # ### Example 1: car_color versus area 261 | # 262 | # For the categorical variable pair car_color - area we measured: 263 | # 264 | # $$\phi_k = 0.59 \, ,\quad\quad \mathrm{significance} = 37.6$$ 265 | # 266 | # Let's use the outlier significance functionality to gain a better understanding of the significance correlation between car color and area. 267 | # 268 | 269 | c1 = "car_color" 270 | c0 = "area" 271 | 272 | outlier_signifs, binning_dict = data[[c0, c1]].outlier_significance_matrix( 273 | retbins=True 274 | ) 275 | outlier_signifs 276 | 277 | # + 278 | zvalues = outlier_signifs.values 279 | xlabels = binning_dict[c1] if c1 in binning_dict.keys() else outlier_signifs.columns 280 | ylabels = binning_dict[c0] if c0 in binning_dict.keys() else outlier_signifs.index 281 | xlabel = c1 282 | ylabel = c0 283 | 284 | plot_correlation_matrix( 285 | zvalues, 286 | x_labels=xlabels, 287 | y_labels=ylabels, 288 | x_label=xlabel, 289 | y_label=ylabel, 290 | vmin=-5, 291 | vmax=5, 292 | title="outlier significance", 293 | identity_layout=False, 294 | fontsize_factor=1.2, 295 | ) 296 | # - 297 | 298 | # The significance of each cell is expressed in terms of Z (one-sided). 299 | # 300 | # Interesting, owners of a green car are more likely to live in the country side, and black cars are more likely to travel on unpaved roads! 301 | 302 | # ### Example 2: mileage versus car_size 303 | 304 | # For the categorical variable pair mileage - car_size we measured: 305 | # 306 | # $$\phi_k = 0.77 \, ,\quad\quad \mathrm{significance} = 46.3$$ 307 | # 308 | # Let's use the outlier significance functionality to gain a better understanding of this significance correlation between mileage and car size. 309 | # 310 | 311 | # + 312 | c0 = "mileage" 313 | c1 = "car_size" 314 | 315 | tmp_interval_cols = ["mileage"] 316 | # - 317 | 318 | outlier_signifs, binning_dict = data[[c0, c1]].outlier_significance_matrix( 319 | interval_cols=tmp_interval_cols, retbins=True 320 | ) 321 | outlier_signifs 322 | 323 | # Note that the interval variable mileage is binned automatically in 10 uniformly spaced bins! 
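    # (Illustrative, not executed here: the automatic binning could be
    #  overridden by passing e.g. bins={'mileage': 5} or an explicit list of
    #  bin edges to outlier_significance_matrix, as shown in the advanced
    #  tutorial.)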
324 | 325 | # + 326 | zvalues = outlier_signifs.values 327 | xlabels = outlier_signifs.columns 328 | ylabels = outlier_signifs.index 329 | xlabel = c1 330 | ylabel = c0 331 | 332 | plot_correlation_matrix( 333 | zvalues, 334 | x_labels=xlabels, 335 | y_labels=ylabels, 336 | x_label=xlabel, 337 | y_label=ylabel, 338 | vmin=-5, 339 | vmax=5, 340 | title="outlier significance", 341 | identity_layout=False, 342 | fontsize_factor=1.2, 343 | ) 344 | # - 345 | 346 | # # Correlation report 347 | 348 | # A full correlation report can be created automatically for a dataset by pairwise evaluation of all correlations, significances and outlier significances. 349 | # 350 | # Note that for a dataset with many different columns the number of outlier significances plots can grow large very rapidly. Therefore, the feature is implemented to only evaluate outlier significances for those variable pairs with a significance and correlation larger than the given thresholds. 351 | 352 | from phik import report 353 | 354 | rep = report.correlation_report( 355 | data, significance_threshold=3, correlation_threshold=0.5 356 | ) 357 | 358 | # # Recap 359 | 360 | # To summarize, the main functions in the phik correlation package working on a dataframe are: 361 | # 362 | # - `df[twocols].hist2d()` or `series.hist2d(other_series)` 363 | # - `df.phik_matrix()` 364 | # - `df.global_phik()` 365 | # - `df.significance_matrix()` 366 | # - `df[twocols].outlier_significance_matrix()` or `series.hist2d(other_series)` 367 | # - `df.outlier_significance_matrices()` 368 | 369 | data[["driver_age", "mileage"]].hist2d() 370 | # Alternatively: data['driver_age'].hist2d(data['mileage']) 371 | 372 | data.phik_matrix() 373 | 374 | data.global_phik() 375 | 376 | data.significance_matrix() 377 | 378 | data[["area", "mileage"]].outlier_significance_matrix() 379 | 380 | os_matrices = data.outlier_significance_matrices() 381 | 382 | os_matrices.keys() 383 | 384 | os_matrices["car_color:mileage"] 385 | -------------------------------------------------------------------------------- /phik/simcore/asa159.cpp: -------------------------------------------------------------------------------- 1 | /* 2 | * Taken from: 3 | * https://people.sc.fsu.edu/~jburkardt/cpp_src/asa159/asa159.html 4 | * 5 | * Michael Patefield, 6 | * Algorithm AS 159: An Efficient Method of Generating RXC Tables with Given Row and Column Totals, 7 | * Applied Statistics, 8 | * Volume 30, Number 1, 1981, pages 91-97. 9 | */ 10 | 11 | # include 12 | # include 13 | # include 14 | # include 15 | # include 16 | #include 17 | 18 | using namespace std; 19 | 20 | # include "asa159.hpp" 21 | 22 | //****************************************************************************80 23 | 24 | int i4_max ( int i1, int i2 ) 25 | 26 | //****************************************************************************80 27 | // 28 | // Purpose: 29 | // 30 | // I4_MAX returns the maximum of two I4's. 31 | // 32 | // Licensing: 33 | // 34 | // This code is distributed under the GNU LGPL license. 35 | // 36 | // Modified: 37 | // 38 | // 13 October 1998 39 | // 40 | // Author: 41 | // 42 | // John Burkardt 43 | // 44 | // Parameters: 45 | // 46 | // Input, int I1, I2, are two integers to be compared. 47 | // 48 | // Output, int I4_MAX, the larger of I1 and I2. 
49 | // 50 | { 51 | int value; 52 | 53 | if ( i2 < i1 ) 54 | { 55 | value = i1; 56 | } 57 | else 58 | { 59 | value = i2; 60 | } 61 | return value; 62 | } 63 | //****************************************************************************80 64 | 65 | int i4_min ( int i1, int i2 ) 66 | 67 | //****************************************************************************80 68 | // 69 | // Purpose: 70 | // 71 | // I4_MIN returns the minimum of two I4's. 72 | // 73 | // Licensing: 74 | // 75 | // This code is distributed under the GNU LGPL license. 76 | // 77 | // Modified: 78 | // 79 | // 13 October 1998 80 | // 81 | // Author: 82 | // 83 | // John Burkardt 84 | // 85 | // Parameters: 86 | // 87 | // Input, int I1, I2, two integers to be compared. 88 | // 89 | // Output, int I4_MIN, the smaller of I1 and I2. 90 | // 91 | { 92 | int value; 93 | 94 | if ( i1 < i2 ) 95 | { 96 | value = i1; 97 | } 98 | else 99 | { 100 | value = i2; 101 | } 102 | return value; 103 | } 104 | //****************************************************************************80 105 | 106 | void i4mat_print ( int m, int n, int a[], string title ) 107 | 108 | //****************************************************************************80 109 | // 110 | // Purpose: 111 | // 112 | // I4MAT_PRINT prints an I4MAT. 113 | // 114 | // Discussion: 115 | // 116 | // An I4MAT is an MxN array of I4's, stored by (I,J) -> [I+J*M]. 117 | // 118 | // Licensing: 119 | // 120 | // This code is distributed under the GNU LGPL license. 121 | // 122 | // Modified: 123 | // 124 | // 10 September 2009 125 | // 126 | // Author: 127 | // 128 | // John Burkardt 129 | // 130 | // Parameters: 131 | // 132 | // Input, int M, the number of rows in A. 133 | // 134 | // Input, int N, the number of columns in A. 135 | // 136 | // Input, int A[M*N], the M by N matrix. 137 | // 138 | // Input, string TITLE, a title. 139 | // 140 | { 141 | i4mat_print_some ( m, n, a, 1, 1, m, n, title ); 142 | 143 | return; 144 | } 145 | //****************************************************************************80 146 | 147 | void i4mat_print_some ( int m, int n, int a[], int ilo, int jlo, int ihi, 148 | int jhi, string title ) 149 | 150 | //****************************************************************************80 151 | // 152 | // Purpose: 153 | // 154 | // I4MAT_PRINT_SOME prints some of an I4MAT. 155 | // 156 | // Discussion: 157 | // 158 | // An I4MAT is an MxN array of I4's, stored by (I,J) -> [I+J*M]. 159 | // 160 | // Licensing: 161 | // 162 | // This code is distributed under the GNU LGPL license. 163 | // 164 | // Modified: 165 | // 166 | // 20 August 2010 167 | // 168 | // Author: 169 | // 170 | // John Burkardt 171 | // 172 | // Parameters: 173 | // 174 | // Input, int M, the number of rows of the matrix. 175 | // M must be positive. 176 | // 177 | // Input, int N, the number of columns of the matrix. 178 | // N must be positive. 179 | // 180 | // Input, int A[M*N], the matrix. 181 | // 182 | // Input, int ILO, JLO, IHI, JHI, designate the first row and 183 | // column, and the last row and column to be printed. 184 | // 185 | // Input, string TITLE, a title. 186 | // 187 | { 188 | # define INCX 10 189 | 190 | int i; 191 | int i2hi; 192 | int i2lo; 193 | int j; 194 | int j2hi; 195 | int j2lo; 196 | 197 | cout << "\n"; 198 | cout << title << "\n"; 199 | 200 | if ( m <= 0 || n <= 0 ) 201 | { 202 | cout << "\n"; 203 | cout << " (None)\n"; 204 | return; 205 | } 206 | // 207 | // Print the columns of the matrix, in strips of INCX. 
208 | // 209 | for ( j2lo = jlo; j2lo <= jhi; j2lo = j2lo + INCX ) 210 | { 211 | j2hi = j2lo + INCX - 1; 212 | if ( n < j2hi ) 213 | { 214 | j2hi = n; 215 | } 216 | if ( jhi < j2hi ) 217 | { 218 | j2hi = jhi; 219 | } 220 | 221 | cout << "\n"; 222 | // 223 | // For each column J in the current range... 224 | // 225 | // Write the header. 226 | // 227 | cout << " Col:"; 228 | for ( j = j2lo; j <= j2hi; j++ ) 229 | { 230 | cout << " " << setw(6) << j - 1; 231 | } 232 | cout << "\n"; 233 | cout << " Row\n"; 234 | cout << "\n"; 235 | // 236 | // Determine the range of the rows in this strip. 237 | // 238 | if ( 1 < ilo ) 239 | { 240 | i2lo = ilo; 241 | } 242 | else 243 | { 244 | i2lo = 1; 245 | } 246 | if ( ihi < m ) 247 | { 248 | i2hi = ihi; 249 | } 250 | else 251 | { 252 | i2hi = m; 253 | } 254 | 255 | for ( i = i2lo; i <= i2hi; i++ ) 256 | { 257 | // 258 | // Print out (up to INCX) entries in row I, that lie in the current strip. 259 | // 260 | cout << setw(5) << i - 1 << ":"; 261 | for ( j = j2lo; j <= j2hi; j++ ) 262 | { 263 | cout << " " << setw(6) << a[i-1+(j-1)*m]; 264 | } 265 | cout << "\n"; 266 | } 267 | } 268 | 269 | return; 270 | # undef INCX 271 | } 272 | //****************************************************************************80 273 | 274 | void i4vec_print ( int n, int a[], string title ) 275 | 276 | //****************************************************************************80 277 | // 278 | // Purpose: 279 | // 280 | // I4VEC_PRINT prints an I4VEC. 281 | // 282 | // Discussion: 283 | // 284 | // An I4VEC is a vector of I4's. 285 | // 286 | // Licensing: 287 | // 288 | // This code is distributed under the GNU LGPL license. 289 | // 290 | // Modified: 291 | // 292 | // 14 November 2003 293 | // 294 | // Author: 295 | // 296 | // John Burkardt 297 | // 298 | // Parameters: 299 | // 300 | // Input, int N, the number of components of the vector. 301 | // 302 | // Input, int A[N], the vector to be printed. 303 | // 304 | // Input, string TITLE, a title. 305 | // 306 | { 307 | int i; 308 | 309 | cout << "\n"; 310 | cout << title << "\n"; 311 | cout << "\n"; 312 | for ( i = 0; i < n; i++ ) 313 | { 314 | cout << " " << setw(8) << i 315 | << ": " << setw(8) << a[i] << "\n"; 316 | } 317 | return; 318 | } 319 | //****************************************************************************80 320 | 321 | int i4vec_sum ( int n, int a[] ) 322 | 323 | //****************************************************************************80 324 | // 325 | // Purpose: 326 | // 327 | // I4VEC_SUM sums the entries of an I4VEC. 328 | // 329 | // Discussion: 330 | // 331 | // An I4VEC is a vector of I4's. 332 | // 333 | // Example: 334 | // 335 | // Input: 336 | // 337 | // A = ( 1, 2, 3, 4 ) 338 | // 339 | // Output: 340 | // 341 | // I4VEC_SUM = 10 342 | // 343 | // Licensing: 344 | // 345 | // This code is distributed under the GNU LGPL license. 346 | // 347 | // Modified: 348 | // 349 | // 26 May 1999 350 | // 351 | // Author: 352 | // 353 | // John Burkardt 354 | // 355 | // Parameters: 356 | // 357 | // Input, int N, the number of entries in the vector. 358 | // 359 | // Input, int A[N], the vector to be summed. 360 | // 361 | // Output, int I4VEC_SUM, the sum of the entries of A. 
362 | // 363 | { 364 | int i; 365 | int sum; 366 | 367 | sum = 0; 368 | for ( i = 0; i < n; i++ ) 369 | { 370 | sum = sum + a[i]; 371 | } 372 | 373 | return sum; 374 | } 375 | //****************************************************************************80 376 | 377 | double r8_uniform_01 ( int *seed ) 378 | 379 | //****************************************************************************80 380 | // 381 | // Purpose: 382 | // 383 | // R8_UNIFORM_01 is a unit pseudorandom R8. 384 | // 385 | // Discussion: 386 | // 387 | // This routine implements the recursion 388 | // 389 | // seed = 16807 * seed mod ( 2**31 - 1 ) 390 | // unif = seed / ( 2**31 - 1 ) 391 | // 392 | // The integer arithmetic never requires more than 32 bits, 393 | // including a sign bit. 394 | // 395 | // Licensing: 396 | // 397 | // This code is distributed under the GNU LGPL license. 398 | // 399 | // Modified: 400 | // 401 | // 11 August 2004 402 | // 403 | // Reference: 404 | // 405 | // Paul Bratley, Bennett Fox, Linus Schrage, 406 | // A Guide to Simulation, 407 | // Springer Verlag, pages 201-202, 1983. 408 | // 409 | // Bennett Fox, 410 | // Algorithm 647: 411 | // Implementation and Relative Efficiency of Quasirandom 412 | // Sequence Generators, 413 | // ACM Transactions on Mathematical Software, 414 | // Volume 12, Number 4, pages 362-376, 1986. 415 | // 416 | // Parameters: 417 | // 418 | // Input/output, int *SEED, a seed for the random number generator. 419 | // 420 | // Output, double R8_UNIFORM_01, a new pseudorandom variate, strictly between 421 | // 0 and 1. 422 | // 423 | { 424 | int k; 425 | double r; 426 | 427 | k = *seed / 127773; 428 | 429 | *seed = 16807 * ( *seed - k * 127773 ) - k * 2836; 430 | 431 | if ( *seed < 0 ) 432 | { 433 | *seed = *seed + 2147483647; 434 | } 435 | 436 | r = ( double ) ( *seed ) * 4.656612875E-10; 437 | 438 | return r; 439 | } 440 | //****************************************************************************80 441 | 442 | void rcont2 ( int nrow, int ncol, int nrowt[], int ncolt[], bool *key, 443 | int *seed, int matrix[], int *ierror ) 444 | 445 | //****************************************************************************80 446 | // 447 | // Purpose: 448 | // 449 | // RCONT2 constructs a random two-way contingency table with given sums. 450 | // 451 | // Discussion: 452 | // 453 | // It is possible to specify row and column sum vectors which 454 | // correspond to no table at all. As far as I can see, this routine does 455 | // not detect such a case. 456 | // 457 | // Licensing: 458 | // 459 | // This code is distributed under the GNU LGPL license. 460 | // 461 | // Modified: 462 | // 463 | // 10 March 2009 464 | // 465 | // Author: 466 | // 467 | // Original FORTRAN77 version by WM Patefield. 468 | // C++ version by John Burkardt. 469 | // 470 | // Reference: 471 | // 472 | // WM Patefield, 473 | // Algorithm AS 159: 474 | // An Efficient Method of Generating RXC Tables with 475 | // Given Row and Column Totals, 476 | // Applied Statistics, 477 | // Volume 30, Number 1, 1981, pages 91-97. 478 | // 479 | // Parameters: 480 | // 481 | // Input, int NROW, NCOL, the number of rows and columns 482 | // in the table. NROW and NCOL must each be at least 2. 483 | // 484 | // Input, int NROWT[NROW], NCOLT[NCOL], the row and column 485 | // sums. Each entry must be positive. 486 | // 487 | // Input/output, bool *KEY, a flag that indicates whether data has 488 | // been initialized for this problem. Set KEY = .FALSE. before the first 489 | // call. 
490 | // 491 | // Input/output, int *SEED, a seed for the random number generator. 492 | // 493 | // Output, int MATRIX[NROW*NCOL], the matrix. 494 | // 495 | // Output, int *IERROR, an error flag, which is returned 496 | // as 0 if no error occurred. 497 | // 498 | { 499 | bool done1; 500 | bool done2; 501 | static double *fact = NULL; 502 | int i; 503 | int ia; 504 | int iap; 505 | int ib; 506 | int ic; 507 | int id; 508 | int idp; 509 | int ie; 510 | int igp; 511 | int ihp; 512 | int ii; 513 | int iip; 514 | int j; 515 | int jc; 516 | int *jwork; 517 | int l; 518 | bool lsm; 519 | bool lsp; 520 | int m; 521 | int nll; 522 | int nlm; 523 | int nlmp; 524 | int nrowtl; 525 | static int ntotal = 0; 526 | double r; 527 | double sumprb; 528 | double x; 529 | double y; 530 | 531 | *ierror = 0; 532 | // 533 | // On user's signal, set up the factorial table. 534 | // 535 | if ( !(*key) ) 536 | { 537 | 538 | *key = true; 539 | 540 | if ( nrow <= 1 ) 541 | { 542 | cout << "\n"; 543 | cout << "RCONT - Fatal error!\n"; 544 | cout << " Input number of rows is less than 2.\n"; 545 | *ierror = 1; 546 | return; 547 | } 548 | 549 | if ( ncol <= 1 ) 550 | { 551 | cout << "\n"; 552 | cout << "RCONT - Fatal error!\n"; 553 | cout << " The number of columns is less than 2.\n"; 554 | *ierror = 2; 555 | return; 556 | } 557 | 558 | for ( i = 0; i < nrow; i++ ) 559 | { 560 | if ( nrowt[i] <= 0 ) 561 | { 562 | cout << "\n"; 563 | cout << "RCONT - Fatal error!\n"; 564 | cout << " An entry in the row sum vector is not positive.\n"; 565 | *ierror = 3; 566 | return; 567 | } 568 | } 569 | 570 | for ( j = 0; j < ncol; j++ ) 571 | { 572 | if ( ncolt[j] <= 0 ) 573 | { 574 | cout << "\n"; 575 | cout << "RCONT - Fatal error!\n"; 576 | cout << " An entry in the column sum vector is not positive.\n"; 577 | *ierror = 4; 578 | return; 579 | } 580 | } 581 | 582 | if ( i4vec_sum ( ncol, ncolt ) != i4vec_sum ( nrow, nrowt ) ) 583 | { 584 | cout << "\n"; 585 | cout << "RCONT - Fatal error!\n"; 586 | cout << " The row and column sum vectors do not have the same sum.\n"; 587 | *ierror = 6; 588 | return; 589 | } 590 | 591 | ntotal = i4vec_sum ( ncol, ncolt ); 592 | 593 | if ( fact ) 594 | { 595 | delete [] fact; 596 | } 597 | 598 | fact = new double[ntotal+1]; 599 | // 600 | // Calculate log-factorials. 601 | // 602 | x = 0.0; 603 | fact[0] = 0.0; 604 | for ( i = 1; i <= ntotal; i++ ) 605 | { 606 | x = x + log ( ( double ) ( i ) ); 607 | fact[i] = x; 608 | } 609 | 610 | } 611 | // 612 | // Construct a random matrix. 613 | // 614 | jwork = new int[ncol]; 615 | 616 | for ( i = 0; i < ncol - 1; i++ ) 617 | { 618 | jwork[i] = ncolt[i]; 619 | } 620 | 621 | jc = ntotal; 622 | 623 | for ( l = 0; l < nrow - 1; l++ ) 624 | { 625 | nrowtl = nrowt[l]; 626 | ia = nrowtl; 627 | ic = jc; 628 | jc = jc - nrowtl; 629 | 630 | for ( m = 0; m < ncol - 1; m++ ) 631 | { 632 | id = jwork[m]; 633 | ie = ic; 634 | ic = ic - id; 635 | ib = ie - ia; 636 | ii = ib - id; 637 | // 638 | // Test for zero entries in matrix. 639 | // 640 | if ( ie == 0 ) 641 | { 642 | ia = 0; 643 | for ( j = m; j < ncol; j++ ) 644 | { 645 | matrix[l+j*nrow] = 0; 646 | } 647 | break; 648 | } 649 | // 650 | // Generate a pseudo-random number. 651 | // 652 | r = r8_uniform_01 ( seed ); 653 | // 654 | // Compute the conditional expected value of MATRIX(L,M). 
655 | // 656 | done1 = false; 657 | 658 | for ( ; ; ) 659 | { 660 | nlm = ( int ) ( ( double ) ( ia * id ) / ( double ) ( ie ) + 0.5 ); 661 | iap = ia + 1; 662 | idp = id + 1; 663 | igp = idp - nlm; 664 | ihp = iap - nlm; 665 | nlmp = nlm + 1; 666 | iip = ii + nlmp; 667 | x = exp ( fact[iap-1] + fact[ib] + fact[ic] + fact[idp-1] - 668 | fact[ie] - fact[nlmp-1] - fact[igp-1] - fact[ihp-1] - fact[iip-1] ); 669 | 670 | if ( r <= x ) 671 | { 672 | break; 673 | } 674 | 675 | sumprb = x; 676 | y = x; 677 | nll = nlm; 678 | lsp = false; 679 | lsm = false; 680 | // 681 | // Increment entry in row L, column M. 682 | // 683 | while ( !lsp ) 684 | { 685 | j = ( id - nlm ) * ( ia - nlm ); 686 | 687 | if ( j == 0 ) 688 | { 689 | lsp = true; 690 | } 691 | else 692 | { 693 | nlm = nlm + 1; 694 | x = x * ( double ) ( j ) / ( double ) ( nlm * ( ii + nlm ) ); 695 | sumprb = sumprb + x; 696 | 697 | if ( r <= sumprb ) 698 | { 699 | done1 = true; 700 | break; 701 | } 702 | } 703 | 704 | done2 = false; 705 | 706 | while ( !lsm ) 707 | { 708 | // 709 | // Decrement the entry in row L, column M. 710 | // 711 | j = nll * ( ii + nll ); 712 | 713 | if ( j == 0 ) 714 | { 715 | lsm = true; 716 | break; 717 | } 718 | 719 | nll = nll - 1; 720 | y = y * ( double ) ( j ) / ( double ) ( ( id - nll ) * ( ia - nll ) ); 721 | sumprb = sumprb + y; 722 | 723 | if ( r <= sumprb ) 724 | { 725 | nlm = nll; 726 | done2 = true; 727 | break; 728 | } 729 | 730 | if ( !lsp ) 731 | { 732 | break; 733 | } 734 | 735 | } 736 | 737 | if ( done2 ) 738 | { 739 | break; 740 | } 741 | 742 | } 743 | 744 | if ( done1 ) 745 | { 746 | break; 747 | } 748 | 749 | if ( done2 ) 750 | { 751 | break; 752 | } 753 | 754 | r = r8_uniform_01 ( seed ); 755 | r = sumprb * r; 756 | 757 | } 758 | 759 | matrix[l+m*nrow] = nlm; 760 | ia = ia - nlm; 761 | jwork[m] = jwork[m] - nlm; 762 | 763 | } 764 | matrix[l+(ncol-1)*nrow] = ia; 765 | } 766 | // 767 | // Compute the last row. 768 | // 769 | for ( j = 0; j < ncol - 1; j++ ) 770 | { 771 | matrix[nrow-1+j*nrow] = jwork[j]; 772 | } 773 | matrix[nrow-1+(ncol-1)*nrow] = ib - matrix[nrow-1+(ncol-2)*nrow]; 774 | 775 | delete [] jwork; 776 | 777 | return; 778 | } 779 | //****************************************************************************80 780 | 781 | void timestamp ( ) 782 | 783 | //****************************************************************************80 784 | // 785 | // Purpose: 786 | // 787 | // TIMESTAMP prints the current YMDHMS date as a time stamp. 788 | // 789 | // Example: 790 | // 791 | // May 31 2001 09:45:54 AM 792 | // 793 | // Licensing: 794 | // 795 | // This code is distributed under the GNU LGPL license. 
796 | // 797 | // Modified: 798 | // 799 | // 03 October 2003 800 | // 801 | // Author: 802 | // 803 | // John Burkardt 804 | // 805 | // Parameters: 806 | // 807 | // None 808 | // 809 | { 810 | # define TIME_SIZE 40 811 | 812 | static char time_buffer[TIME_SIZE]; 813 | const struct tm *tm; 814 | time_t now; 815 | 816 | now = time ( NULL ); 817 | tm = localtime ( &now ); 818 | 819 | strftime ( time_buffer, TIME_SIZE, "%d %B %Y %I:%M:%S %p", tm ); 820 | 821 | cout << time_buffer << "\n"; 822 | 823 | return; 824 | # undef TIME_SIZE 825 | } 826 | -------------------------------------------------------------------------------- /phik/report.py: -------------------------------------------------------------------------------- 1 | """Project: PhiK - correlation analyzer library 2 | 3 | Created: 2018/09/06 4 | 5 | Description: 6 | Functions to create nice correlation overview and matrix plots 7 | 8 | Authors: 9 | KPMG Advanced Analytics & Big Data team, Amstelveen, The Netherlands 10 | 11 | Redistribution and use in source and binary forms, with or without 12 | modification, are permitted according to the terms listed in the file 13 | LICENSE. 14 | """ 15 | import itertools 16 | import os 17 | from typing import Callable, Dict, Tuple, Union 18 | 19 | import matplotlib.pyplot as plt 20 | import numpy as np 21 | import pandas as pd 22 | from matplotlib import colors 23 | from matplotlib.backends.backend_pdf import PdfPages 24 | 25 | from .binning import bin_data 26 | from .data_quality import dq_check_nunique_values 27 | from .outliers import outlier_significance_matrix_from_rebinned_df 28 | from .phik import global_phik_from_rebinned_df, phik_from_rebinned_df 29 | from .significance import significance_from_rebinned_df 30 | from .utils import guess_interval_cols 31 | 32 | 33 | def plot_hist_and_func( 34 | data: Union[list, np.ndarray, pd.Series], 35 | func: Callable, 36 | funcparams, 37 | xbins=False, 38 | labels=None, 39 | xlabel="", 40 | ylabel="", 41 | title="", 42 | xlimit=None, 43 | alpha=1, 44 | ): 45 | """ 46 | Create a histogram of the provided data and overlay with a function. 
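
    A minimal usage sketch (values, mean and std are hypothetical; any density of
    the form f(x, a, b, ...) can be overlaid in the same way)::

        from scipy import stats

        plot_hist_and_func(values, stats.norm.pdf, [mean, std], xbins=50,
                           labels=['data', 'normal fit'])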
47 | 48 | :param list data: data 49 | :param function func: function of the type f(x, a, b, c) where parameters a, b, c are optional 50 | :param list funcparams: parameter values to be given to the function, to be specified as [a, b, c] 51 | :param xbins: specify binning of histogram, either by giving the number of bins or a list of bin edges 52 | :param labels: labels of histogram and function to be used in the legend 53 | :param xlabel: figure xlabel 54 | :param ylabel: figure ylabel 55 | :param title: figure title 56 | :param xlimit: x limits figure 57 | :param alpha: alpha histogram 58 | :return: 59 | """ 60 | if labels is None: 61 | labels = ["", ""] 62 | 63 | # If binning is not specified, create binning here 64 | if not np.any(xbins) and not xlimit: 65 | xmin = np.min(data) 66 | xmax = np.max(data) 67 | xnbins = int(len(data) / 50 + 1) 68 | xbins = np.linspace(xmin, xmax, xnbins) 69 | elif type(xbins) == int or type(xbins) == float: 70 | xmin = np.min(data) 71 | if xlimit: 72 | xmin = xlimit[0] 73 | xmax = np.max(data) 74 | if xmax: 75 | xmax = xlimit[1] 76 | xnbins = int(xbins + 1) 77 | xbins = np.linspace(xmin, xmax, xnbins) 78 | 79 | # Plot a histogram of the data 80 | plt.hist(data, bins=xbins, label=labels[0], alpha=alpha) 81 | 82 | # Find bin centers for plotting the function 83 | xvals = xbins[:-1] + np.diff(xbins)[0] / 2 84 | bw = xbins[1] - xbins[0] 85 | # Plot the fit 86 | plt.plot( 87 | xvals, len(data) * bw * func(xvals, *funcparams), linewidth=2, label=labels[1] 88 | ) 89 | 90 | if xlabel: 91 | plt.xlabel(xlabel) 92 | if ylabel: 93 | plt.ylabel(ylabel) 94 | if title: 95 | plt.title(title) 96 | 97 | if len(labels[0]) > 0: 98 | plt.legend() 99 | 100 | 101 | def plot_correlation_matrix( 102 | matrix_colors: np.ndarray, 103 | x_labels: list, 104 | y_labels: list, 105 | pdf_file_name: str = "", 106 | title: str = "correlation", 107 | vmin: float = -1, 108 | vmax: float = 1, 109 | color_map: str = "RdYlGn", 110 | x_label: str = "", 111 | y_label: str = "", 112 | top: int = 20, 113 | matrix_numbers: np.ndarray = None, 114 | print_both_numbers: bool = True, 115 | figsize: tuple = (7, 5), 116 | usetex: bool = False, 117 | identity_layout: bool = True, 118 | fontsize_factor: float = 1, 119 | ) -> None: 120 | """Create and plot correlation matrix. 121 | 122 | Copied with permission from the eskapade package (pip install eskapade) 123 | 124 | :param matrix_colors: input correlation matrix 125 | :param list x_labels: Labels for histogram x-axis bins 126 | :param list y_labels: Labels for histogram y-axis bins 127 | :param str pdf_file_name: if set, will store the plot in a pdf file 128 | :param str title: if set, title of the plot 129 | :param float vmin: minimum value of color legend (default is -1) 130 | :param float vmax: maximum value of color legend (default is +1) 131 | :param str x_label: Label for histogram x-axis 132 | :param str y_label: Label for histogram y-axis 133 | :param str color_map: color map passed to matplotlib pcolormesh. (default is 'RdYlGn') 134 | :param int top: only print the top 20 characters of x-labels and y-labels. (default is 20) 135 | :param matrix_numbers: input matrix used for plotting numbers. 
(default it matrix_colors) 136 | :param identity_layout: Plot diagonal from right top to bottom left (True) or bottom left to top right (False) 137 | """ 138 | if not isinstance(matrix_colors, np.ndarray): 139 | raise TypeError("matrix_colors is not a numpy array.") 140 | 141 | # basic matrix checks 142 | assert (matrix_colors.shape[0] == len(y_labels)) or ( 143 | matrix_colors.shape[0] + 1 == len(y_labels) 144 | ), "matrix_colors shape inconsistent with number of y-labels" 145 | assert (matrix_colors.shape[1] == len(x_labels)) or ( 146 | matrix_colors.shape[1] + 1 == len(x_labels) 147 | ), "matrix_colors shape inconsistent with number of x-labels" 148 | if matrix_numbers is None: 149 | matrix_numbers = matrix_colors 150 | print_both_numbers = False # only one set of numbers possible 151 | else: 152 | assert matrix_numbers.shape[0] == len( 153 | y_labels 154 | ), "matrix_numbers shape inconsistent with number of y-labels" 155 | assert matrix_numbers.shape[1] == len( 156 | x_labels 157 | ), "matrix_numbers shape inconsistent with number of x-labels" 158 | 159 | if identity_layout: 160 | matrix_colors = np.array([a[::-1] for a in matrix_colors]) 161 | x_labels = x_labels[::-1] 162 | if matrix_numbers is not None: 163 | matrix_numbers = np.array([a[::-1] for a in matrix_numbers]) 164 | 165 | plt.rc("text", usetex=usetex) 166 | 167 | fig, ax = plt.subplots(figsize=figsize) 168 | # cmap = 'RdYlGn' #'YlGn' 169 | norm = colors.Normalize(vmin=vmin, vmax=vmax) 170 | img = ax.pcolormesh( 171 | matrix_colors, cmap=color_map, edgecolor="w", linewidth=1, norm=norm 172 | ) 173 | 174 | # set x-axis properties 175 | def tick(lab): 176 | """Get tick.""" 177 | if isinstance(lab, (float, int)): 178 | lab = "NaN" if np.isnan(lab) else "{0:.0f}".format(lab) 179 | lab = str(lab) 180 | if len(lab) > top: 181 | lab = lab[:17] + "..." 182 | return lab 183 | 184 | # reduce default fontsizes in case too many labels? 
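    # Note: if a label list has one more entry than the corresponding matrix
    # dimension (e.g. bin edges), the ticks below are placed on the cell
    # boundaries; otherwise they are centred on the cells (offset by 0.5).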
185 | # nlabs = max(len(y_labels), len(x_labels)) 186 | 187 | # axis ticks and tick labels 188 | if len(x_labels) == matrix_colors.shape[1] + 1: 189 | ax.set_xticks(np.arange(len(x_labels))) 190 | else: 191 | ax.set_xticks(np.arange(len(x_labels)) + 0.5) 192 | ax.set_xticklabels( 193 | [tick(lab) for lab in x_labels], 194 | rotation="vertical", 195 | fontsize=10 * fontsize_factor, 196 | ) 197 | 198 | if len(y_labels) == matrix_colors.shape[0] + 1: 199 | ax.set_yticks(np.arange(len(y_labels))) 200 | else: 201 | ax.set_yticks(np.arange(len(y_labels)) + 0.5) 202 | ax.set_yticklabels( 203 | [tick(lab) for lab in y_labels], 204 | rotation="horizontal", 205 | fontsize=10 * fontsize_factor, 206 | ) 207 | 208 | # Turn ticks off in case no labels are provided 209 | if len(x_labels) == 1 and len(x_labels[0]) == 0: 210 | plt.tick_params( 211 | axis="x", # changes apply to the x-axis 212 | which="both", # both major and minor ticks are affected 213 | bottom=False, # ticks along the bottom edge are off 214 | top=False, # ticks along the top edge are off 215 | labelbottom=False, 216 | ) 217 | if len(y_labels) == 1 and len(y_labels[0]) == 0: 218 | plt.tick_params( 219 | axis="y", # changes apply to the x-axis 220 | which="both", # both major and minor ticks are affected 221 | left=False, # ticks along the bottom edge are off 222 | right=False, # ticks along the top edge are off 223 | labelbottom=False, 224 | ) 225 | 226 | # make plot look pretty 227 | ax.set_title(title, fontsize=14 * fontsize_factor) 228 | if x_label: 229 | ax.set_xlabel(x_label, fontsize=12 * fontsize_factor) 230 | if y_label: 231 | ax.set_ylabel(y_label, fontsize=12 * fontsize_factor) 232 | 233 | fig.colorbar(img) 234 | 235 | # annotate with correlation values 236 | numbers_set = ( 237 | [matrix_numbers] if not print_both_numbers else [matrix_numbers, matrix_colors] 238 | ) 239 | for i in range(matrix_numbers.shape[1]): 240 | for j in range(matrix_numbers.shape[0]): 241 | point_color = float(matrix_colors[j][i]) 242 | white_cond = ( 243 | (point_color < 0.7 * vmin) 244 | or (point_color >= 0.7 * vmax) 245 | or np.isnan(point_color) 246 | ) 247 | y_offset = 0.5 248 | for m, matrix in enumerate(numbers_set): 249 | if print_both_numbers: 250 | if m == 0: 251 | y_offset = 0.7 252 | elif m == 1: 253 | y_offset = 0.25 254 | point = float(matrix[j][i]) 255 | label = "NaN" if np.isnan(point) else "{0:.2f}".format(point) 256 | color = "w" if white_cond else "k" 257 | ax.annotate( 258 | label, 259 | xy=(i + 0.5, j + y_offset), 260 | color=color, 261 | horizontalalignment="center", 262 | verticalalignment="center", 263 | fontsize=10 * fontsize_factor, 264 | ) 265 | 266 | plt.tight_layout() 267 | 268 | # save plot in file 269 | if pdf_file_name: 270 | pdf_file = PdfPages(pdf_file_name) 271 | plt.savefig(pdf_file, format="pdf", bbox_inches="tight", pad_inches=0) 272 | plt.close() 273 | pdf_file.close() 274 | 275 | 276 | def correlation_report( 277 | data: pd.DataFrame, 278 | interval_cols: list = None, 279 | bins=10, 280 | quantile: bool = False, 281 | do_outliers: bool = True, 282 | pdf_file_name: str = "", 283 | significance_threshold: float = 3, 284 | correlation_threshold: float = 0.5, 285 | noise_correction: bool = True, 286 | store_each_plot: bool = False, 287 | lambda_significance: str = "log-likelihood", 288 | simulation_method: str = "multinominal", 289 | nsim_chi2: int = 1000, 290 | significance_method: str = "asymptotic", 291 | CI_method: str = "poisson", 292 | verbose: bool = True, 293 | plot_phik_matrix_kws: dict = {}, 294 | 
plot_global_phik_kws: dict = {}, 295 | plot_significance_matrix_kws: dict = {}, 296 | plot_outlier_significance_kws: dict = {}, 297 | ) -> Tuple[pd.DataFrame, pd.DataFrame, Dict[str, pd.DataFrame], Dict[str, str]]: 298 | """ 299 | Create a correlation report for the given dataset. 300 | 301 | The following quantities are calculated: 302 | 303 | * The phik correlation matrix 304 | * The significance matrix 305 | * The outlier significances measured in pairs of variables. (optional) 306 | 307 | :param data: input dataframe 308 | :param interval_cols: list of columns names of columns containing interval data 309 | :param bins: number of bins, or a list of bin edges (same for all columns), or a dictionary where per column the bins are specified. (default=10)\ 310 | E.g.: bins = {'mileage':5, 'driver_age':[18,25,35,45,55,65,125]} 311 | :param quantile: when bins is an integer, uniform bins (False) or bins based on quantiles (True) 312 | :param do_outliers: Evaluate outlier significances of variable pairs (when True) 313 | :param pdf_file_name: file name of the pdf where the results are stored 314 | :param store_each_plot: store each plot in folder derived from pdf_file_name. If true, single pdf is no longer stored. Default is false. 315 | :param significance_threshold: evaluate outlier significance for all variable pairs with a significance of \ 316 | uncorrelation higher than this threshold 317 | :param correlation_threshold: evaluate outlier significance for all variable pairs with a phik correlation \ 318 | higher than this threshold 319 | :param noise_correction: Apply noise correction in phik calculation 320 | :param lambda_significance: test statistic used in significance calculation. Options: [pearson, log-likelihood] 321 | :param simulation_method: sampling method using in significance calculation. Options: [mutlinominal, \ 322 | row_product_multinominal, col_product_multinominal, hypergeometric] 323 | :param nsim_chi2: number of simulated datasets in significance calculation. 324 | :param significance_method: method for significance calculation. Options: [asymptotic, MC, hybrid] 325 | :param CI_method: method for uncertainty calculation for outlier significance calculation. Options: [poisson, \ 326 | exact_poisson] 327 | :param bool verbose: if False, do not print all interval columns that are guessed 328 | :param dict plot_phik_matrix_kws: kwargs passed to plot_correlation_matrix() to plot the phik matrix. \ 329 | updates the default plotting values. 330 | :param dict plot_global_phik_kws: kwargs passed to plot_correlation_matrix() to plot the global-phik vector. \ 331 | updates the default plotting values. 332 | :param dict plot_significance_matrix_kws: kwargs passed to plot_correlation_matrix() to plot significance matrix. \ 333 | updates the default plotting values. 334 | :param dict plot_outlier_significance_kws: kwargs passed to plot_correlation_matrix() to plot the outlier \ 335 | significances. updates the default plotting values. 
336 | :returns: phik_matrix (pd.DataFrame), global_phik (np.array), significance_matrix (pd.DataFrame), \ 337 | outliers_overview (dictionary), output_files (dictionary) 338 | """ 339 | 340 | if interval_cols is None: 341 | interval_cols = guess_interval_cols(data, verbose) 342 | 343 | data_clean, interval_cols_clean = dq_check_nunique_values(data, interval_cols) 344 | 345 | # create pdf(s) to save plots 346 | output_files = dict() 347 | plot_file_name = "" 348 | if store_each_plot: 349 | folder = os.path.dirname(pdf_file_name) 350 | folder += "/" if folder else "./" 351 | # if each plot is stored, single overview file is no longer stored. 352 | # (b/c of problem with multiple PdfPages) 353 | pdf_file_name = "" 354 | if pdf_file_name: 355 | pdf_file = PdfPages(pdf_file_name) 356 | 357 | data_binned, binning_dict = bin_data( 358 | data_clean, interval_cols_clean, bins=bins, quantile=quantile, retbins=True 359 | ) 360 | 361 | ### 1. Phik 362 | if store_each_plot: 363 | plot_file_name = folder + "phik_matrix.pdf" 364 | output_files["phik_matrix"] = plot_file_name 365 | phik_matrix = phik_from_rebinned_df(data_binned, noise_correction) 366 | 367 | default_plot_phik_matrix = dict( 368 | x_labels=phik_matrix.columns, 369 | y_labels=phik_matrix.index, 370 | vmin=0, 371 | vmax=1, 372 | color_map="Blues", 373 | title=r"correlation $\phi_K$", 374 | fontsize_factor=1.5, 375 | figsize=(7, 5.5), 376 | pdf_file_name=plot_file_name, 377 | ) 378 | default_plot_phik_matrix.update(plot_phik_matrix_kws) 379 | plot_correlation_matrix(phik_matrix.values, **default_plot_phik_matrix) 380 | if pdf_file_name: 381 | plt.savefig(pdf_file, format="pdf", bbox_inches="tight", pad_inches=0) 382 | plt.show() 383 | 384 | ### 1b. global correlations 385 | if store_each_plot: 386 | plot_file_name = folder + "global_phik.pdf" 387 | output_files["global_phik"] = plot_file_name 388 | global_phik, global_labels = global_phik_from_rebinned_df( 389 | data_binned, noise_correction 390 | ) 391 | 392 | default_plot_global_phik = dict( 393 | x_labels=[""], 394 | y_labels=global_labels, 395 | vmin=0, 396 | vmax=1, 397 | figsize=(3.5, 4), 398 | color_map="Blues", 399 | title=r"$g_k$", 400 | fontsize_factor=1.5, 401 | pdf_file_name=plot_file_name, 402 | ) 403 | default_plot_global_phik.update(plot_global_phik_kws) 404 | plot_correlation_matrix(global_phik, **default_plot_global_phik) 405 | # plt.tight_layout() 406 | if pdf_file_name: 407 | plt.savefig(pdf_file, format="pdf", bbox_inches="tight", pad_inches=0) 408 | plt.show() 409 | 410 | ### 2. Significance 411 | if store_each_plot: 412 | plot_file_name = folder + "significance_matrix.pdf" 413 | output_files["significance_matrix"] = plot_file_name 414 | significance_matrix = significance_from_rebinned_df( 415 | data_binned, 416 | lambda_significance, 417 | simulation_method, 418 | nsim_chi2, 419 | significance_method, 420 | ) 421 | 422 | default_plot_significance_matrix = dict( 423 | x_labels=significance_matrix.columns, 424 | y_labels=significance_matrix.index, 425 | vmin=-5, 426 | vmax=5, 427 | title="significance", 428 | usetex=False, 429 | fontsize_factor=1.5, 430 | figsize=(7, 5.5), 431 | pdf_file_name=plot_file_name, 432 | ) 433 | default_plot_significance_matrix.update(plot_significance_matrix_kws) 434 | plot_correlation_matrix( 435 | significance_matrix.fillna(0).values, **default_plot_significance_matrix 436 | ) 437 | if pdf_file_name: 438 | plt.savefig(pdf_file, format="pdf", bbox_inches="tight", pad_inches=0) 439 | plt.show() 440 | 441 | ### 3. 
Outlier significance 442 | outliers_overview = {} 443 | if do_outliers: 444 | for i, comb in enumerate(itertools.combinations(data_binned.columns, 2)): 445 | c0, c1 = comb 446 | if ( 447 | abs(significance_matrix.loc[c0, c1]) < significance_threshold 448 | or phik_matrix.loc[c0, c1] < correlation_threshold 449 | ): 450 | continue 451 | 452 | zvalues_df = outlier_significance_matrix_from_rebinned_df( 453 | data_binned[[c0, c1]].copy(), binning_dict, CI_method=CI_method 454 | ) 455 | 456 | combi = ":".join(comb).replace(" ", "_") 457 | xlabels = zvalues_df.columns 458 | ylabels = zvalues_df.index 459 | xlabel = zvalues_df.columns.name 460 | ylabel = zvalues_df.index.name 461 | 462 | if store_each_plot: 463 | plot_file_name = folder + "pulls_{0:s}.pdf".format(combi) 464 | output_files[combi] = plot_file_name 465 | 466 | default_plot_outlier_significance = dict( 467 | x_labels=xlabels, 468 | y_labels=ylabels, 469 | x_label=xlabel, 470 | y_label=ylabel, 471 | vmin=-5, 472 | vmax=5, 473 | title="outlier significance", 474 | identity_layout=False, 475 | fontsize_factor=1.2, 476 | pdf_file_name=plot_file_name, 477 | ) 478 | default_plot_outlier_significance.update(plot_outlier_significance_kws) 479 | plot_correlation_matrix( 480 | zvalues_df.values, **default_plot_outlier_significance 481 | ) 482 | 483 | outliers_overview[combi] = zvalues_df 484 | 485 | if pdf_file_name: 486 | plt.savefig(pdf_file, format="pdf", bbox_inches="tight", pad_inches=0) 487 | plt.show() 488 | 489 | # save plots 490 | if pdf_file_name: 491 | output_files["all"] = pdf_file_name 492 | plt.close() 493 | pdf_file.close() 494 | 495 | return ( 496 | phik_matrix, 497 | global_phik, 498 | significance_matrix, 499 | outliers_overview, 500 | output_files, 501 | ) 502 | -------------------------------------------------------------------------------- /phik/significance.py: -------------------------------------------------------------------------------- 1 | """Project: PhiK - correlation analyzer library 2 | 3 | Created: 2018/09/05 4 | 5 | Description: 6 | Functions for doing the significance evaluation of an hypothesis test of variable independence 7 | using a contingency table. 8 | 9 | Authors: 10 | KPMG Advanced Analytics & Big Data team, Amstelveen, The Netherlands 11 | 12 | Redistribution and use in source and binary forms, with or without 13 | modification, are permitted according to the terms listed in the file 14 | LICENSE. 15 | """ 16 | from typing import Tuple, Union 17 | 18 | import numpy as np 19 | import pandas as pd 20 | import math 21 | import itertools 22 | import warnings 23 | 24 | from scipy import stats 25 | from scipy import special, optimize 26 | 27 | from phik import definitions as defs 28 | from .binning import bin_data, create_correlation_overview_table 29 | from .statistics import get_chi2_using_dependent_frequency_estimates 30 | from .statistics import estimate_ndof, theoretical_ndof 31 | from .simulation import sim_chi2_distribution 32 | from .data_quality import dq_check_nunique_values, dq_check_hist2d 33 | from .utils import array_like_to_dataframe, guess_interval_cols 34 | 35 | 36 | def fit_test_statistic_distribution( 37 | chi2s: Union[list, np.ndarray], nbins: int = 50 38 | ) -> Tuple[float, float, float, float]: 39 | """ 40 | Fit the hybrid chi2-distribution to the data to find f. 
41 | 42 | Perform a binned likelihood fit to the data to find the optimal value for the fraction f in 43 | h(x|f) = N * (f * chi2(x, ndof) + (1-f) * gauss(x, ndof, sqrt(ndof)) 44 | The parameter ndof is fixed in the fit using ndof = mean(x). The total number of datapoints N is also fixed. 45 | 46 | :param list chi2s: input data - a list of chi2 values 47 | :param int nbins: in order to fit the data a histogram is created with nbins number of bins 48 | :returns: f, ndof, sigma (width of gauss), bw (bin width) 49 | """ 50 | 51 | def myfunc(x, N, f, k, sigma): 52 | return N * (f * stats.chi2.pdf(x, k) + (1 - f) * stats.norm.pdf(x, k, sigma)) 53 | 54 | ffunc = lambda x, f: myfunc(x, nsim * bw, f, kmean, lsigma) 55 | 56 | def gtest(p, x, y): 57 | f = ffunc(x, *p) 58 | ll = f - special.xlogy(y, f) + special.gammaln(y + 1) 59 | return np.sqrt(ll) 60 | 61 | kmean = np.mean(chi2s) # NOTE: this is used to fix kmean in the fit! 62 | lsigma = np.sqrt(kmean) # NOTE: this is used to fix sigma in the fit! 63 | nsim = len(chi2s) # NOTE: this is used to fix N in fit (N=nsim*bw) ! 64 | 65 | yhist, xbounds = np.histogram(chi2s, bins=nbins) 66 | bw = xbounds[1] - xbounds[0] # NOTE: this is used to fix N in fit (N=nsim*bw) ! 67 | xhist = xbounds[:-1] + np.diff(xbounds) / 2 68 | 69 | initGuess = (1.0,) # starting value for parameter f 70 | res = optimize.least_squares( 71 | gtest, initGuess, bounds=((0.0,), (1,)), args=(xhist, yhist) 72 | ) 73 | 74 | return res.x[0], kmean, lsigma, bw 75 | 76 | 77 | def hfunc(x: float, N: float, f: float, k: float, sigma: float) -> float: 78 | """ 79 | Definition of the combined probability density function h(x) 80 | 81 | h(x|f) = N * (f * chi2(x, k) + (1-f) * gauss(x, k, sigma) 82 | 83 | :param float x: x 84 | :param float N: normalisation 85 | :param float f: fraction [0,1] 86 | :param float k: ndof of chi2 function and mean of gauss 87 | :param float sigma: width of gauss 88 | :return: h(x|f) 89 | """ 90 | return N * (f * stats.chi2.pdf(x, k) + (1 - f) * stats.norm.pdf(x, k, sigma)) 91 | 92 | 93 | def significance_from_chi2_ndof(chi2: float, ndof: float) -> Tuple[float, float]: 94 | """ 95 | Convert a chi2 into significance using knowledge about the number of degrees of freedom 96 | 97 | Conversion is done using asymptotic approximation. 98 | 99 | :param float chi2: chi2 value 100 | :param float ndof: number of degrees of freedom 101 | :returns: p_value, significance 102 | """ 103 | p_value = stats.chi2.sf(chi2, ndof) 104 | z_value = -stats.norm.ppf(p_value) 105 | 106 | # scenario where p_value is too small to evaluate Z 107 | # use Chernoff approximation for p-value upper bound 108 | # see: https://en.wikipedia.org/wiki/Chi-squared_distribution 109 | if p_value == 0: 110 | z = chi2 / ndof 111 | u = -math.log(2 * math.pi) - ndof * math.log(z) + ndof * (z - 1) 112 | z_value = math.sqrt(u - math.log(u)) 113 | 114 | return p_value, z_value 115 | 116 | 117 | def significance_from_chi2_asymptotic( 118 | values: np.ndarray, chi2: float 119 | ) -> Tuple[float, float]: 120 | """ 121 | Convert a chi2 into significance using knowledge about the number of degrees of freedom 122 | 123 | Convention is done using asymptotic approximation. 
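    (The number of degrees of freedom is the theoretical value for the
    contingency table, (r-1)*(c-1) for an r x c table, as given by theoretical_ndof.)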
124 | 125 | :param float chi2: chi2 value 126 | :param float ndof: number of degrees of freedom 127 | :returns: p_value, significance 128 | """ 129 | 130 | ndof = theoretical_ndof(values) 131 | p_value, z_value = significance_from_chi2_ndof(chi2, ndof) 132 | 133 | return p_value, z_value 134 | 135 | 136 | def significance_from_chi2_MC( 137 | chi2: float, 138 | values: np.ndarray, 139 | nsim: int = 1000, 140 | lambda_: str = "log-likelihood", 141 | simulation_method: str = "multinominal", 142 | chi2s=None, 143 | njobs: int = -1, 144 | ) -> Tuple[float, float]: 145 | """ 146 | Convert a chi2 into significance using knowledge about the shape of the chi2 distribution of simulated data 147 | 148 | Calculate significance based on simulation (MC method), using a simple percentile. 149 | 150 | :param float chi2: chi2 value 151 | :param list chi2s: provide your own chi2s values (optional) 152 | :param int njobs: number of parallel jobs used for simulation. default is -1. 1 uses no parallel jobs. 153 | :returns: pvalue, significance 154 | """ 155 | 156 | # determine effective number of degrees of freedom using simulation 157 | if chi2s is None: 158 | chi2s = sim_chi2_distribution( 159 | values, 160 | nsim=nsim, 161 | lambda_=lambda_, 162 | simulation_method=simulation_method, 163 | njobs=njobs, 164 | ) 165 | 166 | # calculate p_value based on simulation (MC method) 167 | empirical_p_value = 1.0 - stats.percentileofscore(chi2s, chi2) / 100.0 168 | empirical_z_value = -stats.norm.ppf(empirical_p_value) 169 | 170 | return empirical_p_value, empirical_z_value 171 | 172 | 173 | def significance_from_chi2_hybrid( 174 | chi2: float, 175 | values: np.ndarray, 176 | nsim: int = 1000, 177 | lambda_: str = "log-likelihood", 178 | simulation_method: str = "multinominal", 179 | chi2s=None, 180 | njobs: int = -1, 181 | ) -> Tuple[float, float]: 182 | """ 183 | Convert a chi2 into significance using a hybrid method 184 | 185 | This method combines the asymptotic method with the MC method, but applies several corrections: 186 | 187 | * use effective number of degrees of freedom instead of number of degrees of freedom. The effective number of\ 188 | degrees of freedom is measured as mean(chi2s), with chi2s a list of simulated chi2 values. 189 | * for low statistics data sets, with on average less than 4 data points per bin, the distribution of chi2-values\ 190 | is better described by h(x|f) then by the usual chi2-distribution. Use h(x|f) to convert the chi2 value to \ 191 | the pvalue and significance. 192 | 193 | h(x|f) = N * (f * chi2(x, ndof) + (1-f) * gauss(x, ndof, sqrt(ndof)) 194 | 195 | :param float chi2: chi2 value 196 | :param list chi2s: provide your own chi2s values (optional) 197 | :param float avg_per_bin: average number of data points per bin 198 | :param int njobs: number of parallel jobs used for simulation. default is -1. 1 uses no parallel jobs. 
199 | :returns: p_value, significance 200 | """ 201 | 202 | # determine effective number of degrees of freedom using simulation 203 | if chi2s is None: 204 | chi2s = sim_chi2_distribution( 205 | values, 206 | nsim=nsim, 207 | lambda_=lambda_, 208 | simulation_method=simulation_method, 209 | njobs=njobs, 210 | ) 211 | 212 | # average number of records per bin 213 | avg_per_bin = values.sum() / values.shape[0] * values.shape[1] 214 | 215 | if avg_per_bin <= 4: 216 | f, endof, lsigma, bw = fit_test_statistic_distribution(chi2s) 217 | pvalue_h = f * stats.chi2.sf(chi2, endof) + (1 - f) * stats.norm.sf( 218 | chi2, endof, lsigma 219 | ) 220 | else: 221 | endof = estimate_ndof(chi2s) 222 | pvalue_h = stats.chi2.sf(chi2, endof) 223 | 224 | zvalue_h = -stats.norm.ppf(pvalue_h) 225 | 226 | if pvalue_h == 0: 227 | # apply Chernoff approximation as upper bound for p-value 228 | # see: https://en.wikipedia.org/wiki/Chi-squared_distribution 229 | z = chi2 / endof 230 | u = -math.log(2 * math.pi) - endof * math.log(z) + endof * (z - 1) 231 | if avg_per_bin <= 4: 232 | u += -2 * math.log(f) 233 | zvalue_h = math.sqrt(u - math.log(u)) 234 | 235 | return pvalue_h, zvalue_h 236 | 237 | 238 | def significance_from_hist2d( 239 | values: np.ndarray, 240 | nsim: int = 1000, 241 | lambda_: str = "log-likelihood", 242 | simulation_method: str = "multinominal", 243 | significance_method: str = "hybrid", 244 | njobs: int = -1, 245 | ) -> Tuple[float, float]: 246 | """ 247 | Calculate the significance of correlation of two variables based on the contingency table 248 | 249 | :param values: contingency table 250 | :param int nsim: number of simulations 251 | :param str lambda_: test statistic. Available options are [pearson, log-likelihood] 252 | :param str simulation_method: simulation method. Options: [multinominal, row_product_multinominal, \ 253 | col_product_multinominal, hypergeometric]. 254 | :param str significance_method: significance_method. Options: [asymptotic, MC, hybrid] 255 | :param int njobs: number of parallel jobs used for simulation. default is -1. 1 uses no parallel jobs. 
256 | :return: pvalue, significance 257 | """ 258 | 259 | # chi2 of the data 260 | chi2 = get_chi2_using_dependent_frequency_estimates(values, lambda_=lambda_) 261 | 262 | if significance_method == "asymptotic": 263 | # calculate pvalue and zvalue based on chi2 and ndof (asymptotic method) 264 | pvalue, zvalue = significance_from_chi2_asymptotic(values, chi2) 265 | elif significance_method == "MC": 266 | # calculate pvalue based on simulation (MC method) 267 | pvalue, zvalue = significance_from_chi2_MC( 268 | chi2, 269 | values, 270 | nsim=nsim, 271 | lambda_=lambda_, 272 | simulation_method=simulation_method, 273 | njobs=njobs, 274 | ) 275 | elif significance_method == "hybrid": 276 | # low statistics : calculate pvalue and zvalue using h(x|f) and endof 277 | # high statistics: calculate pvalue and zvalue using chi2-distribution and endof 278 | pvalue, zvalue = significance_from_chi2_hybrid( 279 | chi2, 280 | values, 281 | nsim=nsim, 282 | lambda_=lambda_, 283 | simulation_method=simulation_method, 284 | njobs=njobs, 285 | ) 286 | else: 287 | raise NotImplementedError( 288 | "simulation_method {0:s} is unknown".format(simulation_method) 289 | ) 290 | 291 | return pvalue, zvalue 292 | 293 | 294 | def significance_from_rebinned_df( 295 | data_binned: pd.DataFrame, 296 | lambda_: str = "log-likelihood", 297 | simulation_method: str = "multinominal", 298 | nsim: int = 1000, 299 | significance_method: str = "hybrid", 300 | dropna: bool = True, 301 | drop_underflow: bool = True, 302 | drop_overflow: bool = True, 303 | njobs: int = -1, 304 | ) -> pd.DataFrame: 305 | """ 306 | Calculate significance of correlation of all variable combinations in the DataFrame 307 | 308 | :param data_binned: input binned DataFrame 309 | :param int nsim: number of simulations 310 | :param str lambda_: test statistic. Available options are [pearson, log-likelihood] 311 | :param str simulation_method: simulation method. Options: [mutlinominal, row_product_multinominal, \ 312 | col_product_multinominal, hypergeometric]. 313 | :param str significance_method: significance_method. Options: [asymptotic, MC, hybrid] 314 | :param bool dropna: remove NaN values with True 315 | :param bool drop_underflow: do not take into account records in underflow bin when True (relevant when binning\ 316 | a numeric variable) 317 | :param bool drop_overflow: do not take into account records in overflow bin when True (relevant when binning\ 318 | a numeric variable) 319 | :param int njobs: number of parallel jobs used for simulation. default is -1. 320 | :return: significance matrix 321 | """ 322 | 323 | if not dropna: 324 | # if not dropna replace the NaN values with the string NaN. Otherwise the rows with NaN are dropped 325 | # by the groupby. 
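        # defs.NaN is the string placeholder for missing values; defs.UF and
        # defs.OF below label the underflow and overflow bins, which are set to
        # NaN so that the groupby drops those records.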
326 | data_binned.replace(np.nan, defs.NaN, inplace=True) 327 | if drop_underflow: 328 | data_binned.replace(defs.UF, np.nan, inplace=True) 329 | if drop_overflow: 330 | data_binned.replace(defs.OF, np.nan, inplace=True) 331 | 332 | # cache column order (https://github.com/KaveIO/PhiK/issues/1) 333 | column_order = data_binned.columns 334 | signifs = [] 335 | for i, (c0, c1) in enumerate( 336 | itertools.combinations_with_replacement(data_binned.columns.values, 2) 337 | ): 338 | datahist = ( 339 | data_binned.groupby([c0, c1])[c0].count().to_frame().unstack().fillna(0) 340 | ) 341 | if 1 in datahist.shape or 0 in datahist.shape: 342 | signifs.append((c0, c1, np.nan)) 343 | warnings.warn( 344 | "Too few unique values for variable {0:s} ({1:d}) or {2:s} ({3:d}) to calculate significance".format( 345 | c0, datahist.shape[0], c1, datahist.shape[1] 346 | ) 347 | ) 348 | continue 349 | 350 | datahist.columns = datahist.columns.droplevel() 351 | datahist = datahist.values 352 | pvalue, zvalue = significance_from_hist2d( 353 | datahist, 354 | nsim=nsim, 355 | lambda_=lambda_, 356 | simulation_method=simulation_method, 357 | significance_method=significance_method, 358 | njobs=njobs, 359 | ) 360 | signifs.append((c0, c1, zvalue)) 361 | 362 | if len(signifs) == 0: 363 | return pd.DataFrame(np.nan, index=column_order, columns=column_order) 364 | 365 | significance_overview = create_correlation_overview_table(signifs) 366 | 367 | # restore column order 368 | significance_overview = significance_overview.reindex(columns=column_order) 369 | significance_overview = significance_overview.reindex(index=column_order) 370 | 371 | return significance_overview 372 | 373 | 374 | def significance_matrix( 375 | df: pd.DataFrame, 376 | interval_cols: list = None, 377 | lambda_: str = "log-likelihood", 378 | simulation_method: str = "multinominal", 379 | nsim: int = 1000, 380 | significance_method: str = "hybrid", 381 | bins: Union[int, list, np.ndarray, dict] = 10, 382 | dropna: bool = True, 383 | drop_underflow: bool = True, 384 | drop_overflow: bool = True, 385 | verbose: bool = True, 386 | njobs: int = -1, 387 | ) -> pd.DataFrame: 388 | """ 389 | Calculate significance of correlation of all variable combinations in the dataframe 390 | 391 | :param pd.DataFrame df: input data 392 | :param list interval_cols: column names of columns with interval variables. 393 | :param int nsim: number of simulations 394 | :param str lambda_: test statistic. Available options are [pearson, log-likelihood] 395 | :param str simulation_method: simulation method. Options: [mutlinominal, row_product_multinominal, \ 396 | col_product_multinominal, hypergeometric]. 397 | :param int nsim: number of simulated datasets 398 | :param str significance_method: significance_method. Options: [asymptotic, MC, hybrid] 399 | :param bool dropna: remove NaN values with True 400 | :param bins: number of bins, or a list of bin edges (same for all columns), or a dictionary where per column the bins are specified. 
(default=10)\ 401 | E.g.: bins = {'mileage':5, 'driver_age':[18,25,35,45,55,65,125]} 402 | :param bool dropna: remove NaN values with True 403 | :param bool drop_underflow: do not take into account records in underflow bin when True (relevant when binning\ 404 | a numeric variable) 405 | :param bool drop_overflow: do not take into account records in overflow bin when True (relevant when binning\ 406 | a numeric variable) 407 | :param bool verbose: if False, do not print all interval columns that are guessed 408 | :param int njobs: number of parallel jobs used for simulation. default is -1. 1 uses no parallel jobs. 409 | :return: significance matrix 410 | """ 411 | 412 | if interval_cols is None: 413 | interval_cols = guess_interval_cols(df, verbose) 414 | 415 | df_clean, interval_cols_clean = dq_check_nunique_values( 416 | df, interval_cols, dropna=dropna 417 | ) 418 | 419 | data_binned = bin_data(df_clean, interval_cols_clean, bins=bins) 420 | 421 | return significance_from_rebinned_df( 422 | data_binned, 423 | lambda_=lambda_, 424 | simulation_method=simulation_method, 425 | nsim=nsim, 426 | significance_method=significance_method, 427 | dropna=dropna, 428 | drop_underflow=drop_underflow, 429 | drop_overflow=drop_overflow, 430 | njobs=njobs, 431 | ) 432 | 433 | 434 | def significance_from_array( 435 | x: Union[np.ndarray, pd.Series], 436 | y: Union[np.ndarray, pd.Series], 437 | num_vars=None, 438 | bins: Union[int, list, np.ndarray, dict] = 10, 439 | quantile: bool = False, 440 | lambda_: str = "log-likelihood", 441 | nsim: int = 1000, 442 | significance_method: str = "hybrid", 443 | simulation_method: str = "multinominal", 444 | dropna: bool = True, 445 | drop_underflow: bool = True, 446 | drop_overflow: bool = True, 447 | njobs: int = -1, 448 | ) -> Tuple[float, float]: 449 | """ 450 | Calculate the significance of correlation 451 | 452 | Calculate the significance of correlation for two variables which can be of interval, oridnal or categorical type.\ 453 | Interval variables will be binned. 454 | 455 | :param x: array-like input 456 | :param y: array-like input 457 | :param num_vars: list of numeric variables which need to be binned, e.g. ['x'] or ['x','y'] 458 | :param bins: number of bins, or a list of bin edges (same for all columns), or a dictionary where per column the bins are specified. (default=10)\ 459 | E.g.: bins = {'mileage':5, 'driver_age':[18,25,35,45,55,65,125]} 460 | :param quantile: when bins is an integer, uniform bins (False) or bins based on quantiles (True) 461 | :param str lambda_: test statistic. Available options are [pearson, log-likelihood] 462 | :param int nsim: number of simulated datasets 463 | :param str simulation_method: simulation method. Options: [mutlinominal, row_product_multinominal, \ 464 | col_product_multinominal, hypergeometric]. 465 | :param str significance_method: significance_method. Options: [asymptotic, MC, hybrid] 466 | :param bool dropna: remove NaN values with True 467 | :param bool drop_underflow: do not take into account records in underflow bin when True (relevant when binning\ 468 | a numeric variable) 469 | :param bool drop_overflow: do not take into account records in overflow bin when True (relevant when binning\ 470 | a numeric variable) 471 | :param int njobs: number of parallel jobs used for simulation. default is -1. 1 uses no parallel jobs. 
472 | :return: p-value, significance 473 | """ 474 | if num_vars is None: 475 | num_vars = [] 476 | elif isinstance(num_vars, str): 477 | num_vars = [num_vars] 478 | 479 | if len(num_vars) > 0: 480 | df = array_like_to_dataframe(x, y) 481 | x, y = bin_data(df, num_vars, bins=bins, quantile=quantile).T.values 482 | 483 | return significance_from_binned_array( 484 | x, 485 | y, 486 | lambda_=lambda_, 487 | significance_method=significance_method, 488 | nsim=nsim, 489 | simulation_method=simulation_method, 490 | dropna=dropna, 491 | drop_underflow=drop_underflow, 492 | drop_overflow=drop_overflow, 493 | njobs=njobs, 494 | ) 495 | 496 | 497 | def significance_from_binned_array( 498 | x: Union[np.ndarray, pd.Series], 499 | y: Union[np.ndarray, pd.Series], 500 | lambda_: str = "log-likelihood", 501 | significance_method: str = "hybrid", 502 | nsim: int = 1000, 503 | simulation_method: str = "multinominal", 504 | dropna: bool = True, 505 | drop_underflow: bool = True, 506 | drop_overflow: bool = True, 507 | njobs: int = -1, 508 | ) -> Tuple[float, float]: 509 | """ 510 | Calculate the significance of correlation 511 | 512 | Calculate the significance of correlation for two variables which can be of interval, oridnal or categorical type. \ 513 | Interval variables need to be binned. 514 | 515 | :param x: array-like input 516 | :param y: array-like input 517 | :param str lambda_: test statistic. Available options are [pearson, log-likelihood] 518 | :param str simulation_method: simulation method. Options: [multinominal, row_product_multinominal, \ 519 | col_product_multinominal, hypergeometric]. 520 | :param int nsim: number of simulated datasets 521 | :param str significance_method: significance_method. Options: [asymptotic, MC, hybrid] 522 | :param bool dropna: remove NaN values with True 523 | :param bool drop_underflow: do not take into account records in underflow bin when True (relevant when binning\ 524 | a numeric variable) 525 | :param bool drop_overflow: do not take into account records in overflow bin when True (relevant when binning\ 526 | a numeric variable) 527 | :param int njobs: number of parallel jobs used for simulation. default is -1. 1 uses no parallel jobs. 528 | :return: p-value, significance 529 | """ 530 | 531 | if not dropna: 532 | x = pd.Series(x).fillna(defs.NaN).astype(str).values 533 | y = ( 534 | pd.Series(y).fillna(defs.NaN).astype(str).values 535 | ) # crosstab cannot handle mixed type y! 536 | 537 | if drop_underflow or drop_overflow: 538 | x = pd.Series(x).astype(str).values 539 | y = pd.Series(y).astype(str).values 540 | if drop_underflow: 541 | x[np.where(x == defs.UF)] = np.nan 542 | y[np.where(y == defs.UF)] = np.nan 543 | if drop_overflow: 544 | y[np.where(y == defs.OF)] = np.nan 545 | x[np.where(x == defs.OF)] = np.nan 546 | 547 | hist2d = pd.crosstab(x, y).values 548 | 549 | if not dq_check_hist2d(hist2d): 550 | return np.nan, np.nan 551 | 552 | return significance_from_hist2d( 553 | hist2d, 554 | lambda_=lambda_, 555 | significance_method=significance_method, 556 | simulation_method=simulation_method, 557 | nsim=nsim, 558 | njobs=njobs, 559 | ) 560 | --------------------------------------------------------------------------------
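A minimal sketch of how the significance API above might be used (column names and data are hypothetical):

    import numpy as np
    import pandas as pd
    from phik.significance import significance_matrix, significance_from_array

    # toy data: one interval and one categorical column
    df = pd.DataFrame({
        "driver_age": np.random.uniform(18, 80, size=500),
        "area": np.random.choice(["urban", "suburban", "rural"], size=500),
    })

    # significance of correlation for every pair of columns
    sig = significance_matrix(df, interval_cols=["driver_age"],
                              significance_method="asymptotic")

    # or for a single pair of array-like inputs ('x' marks the first, numeric, input)
    p, z = significance_from_array(df["driver_age"], df["area"], num_vars=["x"],
                                   significance_method="asymptotic")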