├── .dockerignore ├── .github └── FUNDING.yml ├── .gitignore ├── .readthedocs.yml ├── Dockerfile ├── LICENSE.txt ├── MANIFEST.in ├── Makefile ├── Makefile.bat ├── README.md ├── build.sh ├── docs ├── Makefile ├── autobuild.bat ├── autobuild.sh ├── make.bat └── source │ ├── _code │ ├── binary-demo.py │ ├── biserial-demo.py │ ├── categorical-demo.py │ ├── concordance-demo.py │ ├── confusion-demo.py │ ├── continuous-demo.py │ ├── corr-ratio-demo.py │ ├── dataframe-tip.py │ ├── multiprocessing-tip.py │ └── spark-demo.py │ ├── _logo │ ├── logo-1000.png │ ├── logo-150.png │ ├── logo-250.png │ ├── logo-50.png │ └── logo-500.png │ ├── _static │ ├── .gitkeep │ ├── css │ │ └── override.css │ ├── favicon.ico │ └── images │ │ ├── logo-small.png │ │ ├── logo.png │ │ ├── ooc-logo.png │ │ └── ooc-small.png │ ├── _templates │ └── .gitkeep │ ├── conf.py │ ├── deepdives.rst │ ├── index.rst │ ├── intro.rst │ ├── modules.rst │ ├── pypair.rst │ ├── quicklist.rst │ ├── quickstart.rst │ ├── refs.bib │ ├── robots.txt │ └── zzz-bib.rst ├── logo.png ├── misc ├── SPARK.md ├── binary-measures.csv ├── binary-measures.py ├── count-measures.py ├── ipynb │ ├── binary-binary.ipynb │ └── cat-cat.ipynb └── scratch.py ├── publish.sh ├── pypair ├── __init__.py ├── association.py ├── biserial.py ├── contingency.py ├── continuous.py ├── decorator.py ├── spark.py └── util.py ├── requirements.txt ├── setup.cfg ├── setup.py └── tests ├── __init__.py ├── test_association.py ├── test_contingency.py ├── test_spark.py └── test_table.py /.dockerignore: -------------------------------------------------------------------------------- 1 | **/*.pyc 2 | .idea/ 3 | docs/build/ 4 | .pytest_cache/ 5 | build/ 6 | coverage/ 7 | dist/ 8 | pypair.egg-info/ 9 | docs/build/ 10 | .coverage 11 | .noseids 12 | .ipynb_checkpoints/ 13 | joblib_memmap/ 14 | .DS_store -------------------------------------------------------------------------------- /.github/FUNDING.yml: -------------------------------------------------------------------------------- 1 | # These are supported funding model platforms 2 | 3 | github: vangj 4 | patreon: vangj 5 | open_collective: # Replace with a single Open Collective username 6 | ko_fi: # Replace with a single Ko-fi username 7 | tidelift: # Replace with a single Tidelift platform-name/package-name e.g., npm/babel 8 | community_bridge: # Replace with a single Community Bridge project-name e.g., cloud-foundry 9 | liberapay: # Replace with a single Liberapay username 10 | issuehunt: # Replace with a single IssueHunt username 11 | otechie: # Replace with a single Otechie username 12 | custom: https://oneoffcoder.com/ 13 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | **/*.pyc 2 | .idea/ 3 | docs/build/ 4 | coverage/ 5 | .coverage 6 | .noseids 7 | dist/ 8 | pypair.egg-info/ 9 | build/ 10 | .ipynb_checkpoints/ 11 | .pypirc 12 | .pypircc 13 | joblib_memmap/ 14 | .pytest_cache/ 15 | .DS_store 16 | .vscode -------------------------------------------------------------------------------- /.readthedocs.yml: -------------------------------------------------------------------------------- 1 | # .readthedocs.yml 2 | # Read the Docs configuration file 3 | # See https://docs.readthedocs.io/en/stable/config-file/v2.html for details 4 | 5 | # Required 6 | version: 2 7 | 8 | # Build documentation in the docs/ directory with Sphinx 9 | sphinx: 10 | configuration: docs/source/conf.py 11 | 12 | # Build documentation 
with MkDocs 13 | #mkdocs: 14 | # configuration: mkdocs.yml 15 | 16 | # Optionally build your docs in additional formats such as PDF 17 | formats: 18 | - htmlzip 19 | 20 | # Optionally set the version of Python and requirements required to build your docs 21 | python: 22 | version: 3.8 23 | install: 24 | - requirements: requirements.txt -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | FROM oneoffcoder/python-java:latest 2 | 3 | LABEL author="Jee Vang, Ph.D." 4 | LABEL email="vangjee@gmail.com" 5 | 6 | ARG AAPI_VERSION 7 | ARG APYPI_REPO 8 | 9 | ENV API_VERSION=$AAPI_VERSION 10 | ENV PYPI_REPO=$APYPI_REPO 11 | 12 | RUN apt-get update \ 13 | && apt-get upgrade -y 14 | COPY . /py-pair 15 | RUN pip install -r /py-pair/requirements.txt 16 | RUN /py-pair/publish.sh -------------------------------------------------------------------------------- /LICENSE.txt: -------------------------------------------------------------------------------- 1 | 2 | Apache License 3 | Version 2.0, January 2004 4 | http://www.apache.org/licenses/ 5 | 6 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 7 | 8 | 1. Definitions. 9 | 10 | "License" shall mean the terms and conditions for use, reproduction, 11 | and distribution as defined by Sections 1 through 9 of this document. 12 | 13 | "Licensor" shall mean the copyright owner or entity authorized by 14 | the copyright owner that is granting the License. 15 | 16 | "Legal Entity" shall mean the union of the acting entity and all 17 | other entities that control, are controlled by, or are under common 18 | control with that entity. For the purposes of this definition, 19 | "control" means (i) the power, direct or indirect, to cause the 20 | direction or management of such entity, whether by contract or 21 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 22 | outstanding shares, or (iii) beneficial ownership of such entity. 23 | 24 | "You" (or "Your") shall mean an individual or Legal Entity 25 | exercising permissions granted by this License. 26 | 27 | "Source" form shall mean the preferred form for making modifications, 28 | including but not limited to software source code, documentation 29 | source, and configuration files. 30 | 31 | "Object" form shall mean any form resulting from mechanical 32 | transformation or translation of a Source form, including but 33 | not limited to compiled object code, generated documentation, 34 | and conversions to other media types. 35 | 36 | "Work" shall mean the work of authorship, whether in Source or 37 | Object form, made available under the License, as indicated by a 38 | copyright notice that is included in or attached to the work 39 | (an example is provided in the Appendix below). 40 | 41 | "Derivative Works" shall mean any work, whether in Source or Object 42 | form, that is based on (or derived from) the Work and for which the 43 | editorial revisions, annotations, elaborations, or other modifications 44 | represent, as a whole, an original work of authorship. For the purposes 45 | of this License, Derivative Works shall not include works that remain 46 | separable from, or merely link (or bind by name) to the interfaces of, 47 | the Work and Derivative Works thereof. 
48 | 49 | "Contribution" shall mean any work of authorship, including 50 | the original version of the Work and any modifications or additions 51 | to that Work or Derivative Works thereof, that is intentionally 52 | submitted to Licensor for inclusion in the Work by the copyright owner 53 | or by an individual or Legal Entity authorized to submit on behalf of 54 | the copyright owner. For the purposes of this definition, "submitted" 55 | means any form of electronic, verbal, or written communication sent 56 | to the Licensor or its representatives, including but not limited to 57 | communication on electronic mailing lists, source code control systems, 58 | and issue tracking systems that are managed by, or on behalf of, the 59 | Licensor for the purpose of discussing and improving the Work, but 60 | excluding communication that is conspicuously marked or otherwise 61 | designated in writing by the copyright owner as "Not a Contribution." 62 | 63 | "Contributor" shall mean Licensor and any individual or Legal Entity 64 | on behalf of whom a Contribution has been received by Licensor and 65 | subsequently incorporated within the Work. 66 | 67 | 2. Grant of Copyright License. Subject to the terms and conditions of 68 | this License, each Contributor hereby grants to You a perpetual, 69 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 70 | copyright license to reproduce, prepare Derivative Works of, 71 | publicly display, publicly perform, sublicense, and distribute the 72 | Work and such Derivative Works in Source or Object form. 73 | 74 | 3. Grant of Patent License. Subject to the terms and conditions of 75 | this License, each Contributor hereby grants to You a perpetual, 76 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 77 | (except as stated in this section) patent license to make, have made, 78 | use, offer to sell, sell, import, and otherwise transfer the Work, 79 | where such license applies only to those patent claims licensable 80 | by such Contributor that are necessarily infringed by their 81 | Contribution(s) alone or by combination of their Contribution(s) 82 | with the Work to which such Contribution(s) was submitted. If You 83 | institute patent litigation against any entity (including a 84 | cross-claim or counterclaim in a lawsuit) alleging that the Work 85 | or a Contribution incorporated within the Work constitutes direct 86 | or contributory patent infringement, then any patent licenses 87 | granted to You under this License for that Work shall terminate 88 | as of the date such litigation is filed. 89 | 90 | 4. Redistribution. 
You may reproduce and distribute copies of the 91 | Work or Derivative Works thereof in any medium, with or without 92 | modifications, and in Source or Object form, provided that You 93 | meet the following conditions: 94 | 95 | (a) You must give any other recipients of the Work or 96 | Derivative Works a copy of this License; and 97 | 98 | (b) You must cause any modified files to carry prominent notices 99 | stating that You changed the files; and 100 | 101 | (c) You must retain, in the Source form of any Derivative Works 102 | that You distribute, all copyright, patent, trademark, and 103 | attribution notices from the Source form of the Work, 104 | excluding those notices that do not pertain to any part of 105 | the Derivative Works; and 106 | 107 | (d) If the Work includes a "NOTICE" text file as part of its 108 | distribution, then any Derivative Works that You distribute must 109 | include a readable copy of the attribution notices contained 110 | within such NOTICE file, excluding those notices that do not 111 | pertain to any part of the Derivative Works, in at least one 112 | of the following places: within a NOTICE text file distributed 113 | as part of the Derivative Works; within the Source form or 114 | documentation, if provided along with the Derivative Works; or, 115 | within a display generated by the Derivative Works, if and 116 | wherever such third-party notices normally appear. The contents 117 | of the NOTICE file are for informational purposes only and 118 | do not modify the License. You may add Your own attribution 119 | notices within Derivative Works that You distribute, alongside 120 | or as an addendum to the NOTICE text from the Work, provided 121 | that such additional attribution notices cannot be construed 122 | as modifying the License. 123 | 124 | You may add Your own copyright statement to Your modifications and 125 | may provide additional or different license terms and conditions 126 | for use, reproduction, or distribution of Your modifications, or 127 | for any such Derivative Works as a whole, provided Your use, 128 | reproduction, and distribution of the Work otherwise complies with 129 | the conditions stated in this License. 130 | 131 | 5. Submission of Contributions. Unless You explicitly state otherwise, 132 | any Contribution intentionally submitted for inclusion in the Work 133 | by You to the Licensor shall be under the terms and conditions of 134 | this License, without any additional terms or conditions. 135 | Notwithstanding the above, nothing herein shall supersede or modify 136 | the terms of any separate license agreement you may have executed 137 | with Licensor regarding such Contributions. 138 | 139 | 6. Trademarks. This License does not grant permission to use the trade 140 | names, trademarks, service marks, or product names of the Licensor, 141 | except as required for reasonable and customary use in describing the 142 | origin of the Work and reproducing the content of the NOTICE file. 143 | 144 | 7. Disclaimer of Warranty. Unless required by applicable law or 145 | agreed to in writing, Licensor provides the Work (and each 146 | Contributor provides its Contributions) on an "AS IS" BASIS, 147 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 148 | implied, including, without limitation, any warranties or conditions 149 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 150 | PARTICULAR PURPOSE. 
You are solely responsible for determining the 151 | appropriateness of using or redistributing the Work and assume any 152 | risks associated with Your exercise of permissions under this License. 153 | 154 | 8. Limitation of Liability. In no event and under no legal theory, 155 | whether in tort (including negligence), contract, or otherwise, 156 | unless required by applicable law (such as deliberate and grossly 157 | negligent acts) or agreed to in writing, shall any Contributor be 158 | liable to You for damages, including any direct, indirect, special, 159 | incidental, or consequential damages of any character arising as a 160 | result of this License or out of the use or inability to use the 161 | Work (including but not limited to damages for loss of goodwill, 162 | work stoppage, computer failure or malfunction, or any and all 163 | other commercial damages or losses), even if such Contributor 164 | has been advised of the possibility of such damages. 165 | 166 | 9. Accepting Warranty or Additional Liability. While redistributing 167 | the Work or Derivative Works thereof, You may choose to offer, 168 | and charge a fee for, acceptance of support, warranty, indemnity, 169 | or other liability obligations and/or rights consistent with this 170 | License. However, in accepting such obligations, You may act only 171 | on Your own behalf and on Your sole responsibility, not on behalf 172 | of any other Contributor, and only if You agree to indemnify, 173 | defend, and hold each Contributor harmless for any liability 174 | incurred by, or claims asserted against, such Contributor by reason 175 | of your accepting any such warranty or additional liability. 176 | 177 | END OF TERMS AND CONDITIONS 178 | 179 | APPENDIX: How to apply the Apache License to your work. 180 | 181 | To apply the Apache License to your work, attach the following 182 | boilerplate notice, with the fields enclosed by brackets "[]" 183 | replaced with your own identifying information. (Don't include 184 | the brackets!) The text should be enclosed in the appropriate 185 | comment syntax for the file format. We also recommend that a 186 | file or class name and description of purpose be included on the 187 | same "printed page" as the copyright notice for easier 188 | identification within third-party archives. 189 | 190 | Copyright 2017 Jee Vang 191 | 192 | Licensed under the Apache License, Version 2.0 (the "License"); 193 | you may not use this file except in compliance with the License. 194 | You may obtain a copy of the License at 195 | 196 | http://www.apache.org/licenses/LICENSE-2.0 197 | 198 | Unless required by applicable law or agreed to in writing, software 199 | distributed under the License is distributed on an "AS IS" BASIS, 200 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 201 | See the License for the specific language governing permissions and 202 | limitations under the License. 
-------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include requirements.txt README.md 2 | prune tests* -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | .PHONY: init clean lint test 2 | .DEFAULT_GOAL := build 3 | 4 | init: 5 | pip install -r requirements.txt 6 | 7 | lint: 8 | python -m flake8 ./pypair 9 | 10 | test: clean lint 11 | nosetests tests 12 | 13 | build: test 14 | python setup.py bdist_egg sdist bdist_wheel 15 | 16 | install: build 17 | python setup.py install 18 | 19 | publish: build 20 | python setup.py sdist upload -r pypi 21 | 22 | compile: 23 | python -m compileall -f ./pypair 24 | 25 | clean: 26 | find . -type f -name '*.pyc' -delete 27 | find . -type d -name '__pycache__' -delete 28 | rm -fr coverage/ 29 | rm -fr dist/ 30 | rm -fr build/ 31 | rm -fr pypair.egg-info/ 32 | rm -fr pypair/pypair.egg-info/ 33 | rm -fr jupyter/.ipynb_checkpoints/ 34 | rm -fr joblib_memmap/ 35 | rm -fr docs/build/ 36 | rm -fr .pytest_cache/ 37 | rm -f .coverage 38 | rm -f .noseids 39 | 40 | -------------------------------------------------------------------------------- /Makefile.bat: -------------------------------------------------------------------------------- 1 | @ECHO off 2 | if /I %1 == default goto :default 3 | if /I %1 == init goto :init 4 | if /I %1 == lint goto :lint 5 | if /I %1 == test goto :test 6 | if /I %1 == clean goto :clean 7 | if /I %1 == build goto :build 8 | if /I %1 == install goto :install 9 | 10 | goto :eof ::can be omitted to run the `default` function similarly to makefiles 11 | 12 | :default 13 | goto :test 14 | 15 | :init 16 | pip install -r requirements.txt 17 | goto :eof 18 | 19 | :lint 20 | python -m flake8 ./pypair 21 | goto :eof 22 | 23 | :test 24 | nosetests tests 25 | goto :eof 26 | 27 | :clean 28 | del /S *.pyc 29 | rmdir /S /Q coverage 30 | rmdir /S /Q dist 31 | rmdir /S /Q build 32 | rmdir /S /Q pypair.egg-info 33 | rmdir /S /Q pypair/pypair.egg-info 34 | rmdir /S /Q jupyter/.ipynb_checkpoints 35 | rmdir /S /Q docs/build 36 | rmdir /S /Q joblib_memmap 37 | rmdir /S /Q .pytest_cache 38 | del .coverage 39 | del .noseids 40 | goto :eof 41 | 42 | :build 43 | python setup.py bdist_egg sdist bdist_wheel 44 | goto :eof 45 | 46 | :install 47 | python setup.py install 48 | goto :eof -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | ![pypair logo](https://py-pair.readthedocs.io/_images/logo.png) 2 | 3 | # PyPair 4 | 5 | PyPair is a statistical library to compute pairwise association between any two variables. In general, statistical variable types are viewed as `categorical` or `continuous`. Categorical variables have no inherent order to their values, while continuous variables do. This API has `over 130 association measures` implemented for any combination of categorical and/or continuous variables. 6 | 7 | To install. 8 | 9 | ```bash 10 | pip install pypair 11 | ``` 12 | 13 | Additional links. 14 | 15 | - [Documentation](https://py-pair.readthedocs.io/) 16 | - [PyPi](https://pypi.org/project/pypair/) 17 | - [Gitter](https://gitter.im/dataflava/py-pair) 18 | 19 | Here's a short and sweet snippet for using the API against a dataframe that stores strictly binary data.
The Pandas `DataFrame.corr()` method no longer processes non-numeric fields! 20 | 21 | ```python 22 | from pypair.association import binary_binary 23 | from pypair.util import corr 24 | 25 | jaccard = lambda a, b: binary_binary(a, b, measure='jaccard') 26 | tanimoto = lambda a, b: binary_binary(a, b, measure='tanimoto_i') 27 | 28 | df = get_a_pandas_binary_dataframe() 29 | 30 | jaccard_corr = corr(df, jaccard) 31 | tanimoto_corr = corr(df, tanimoto) 32 | 33 | print(jaccard_corr) 34 | print('-' * 15) 35 | print(tanimoto_corr) 36 | ``` 37 | 38 | Another way to get started with PyPair is to use the `convenience` methods whose names indicate the variable pair types. 39 | 40 | ```python 41 | from pypair.association import binary_binary, categorical_categorical, \ 42 | binary_continuous, concordance, categorical_continuous, continuous_continuous, confusion, agreement 43 | 44 | # assume a and b are the appropriate iterables of values for 2 variables 45 | jaccard = binary_binary(a, b, measure='jaccard') 46 | acc = confusion(a, b, measure='acc') 47 | phi = categorical_categorical(a, b, measure='phi') 48 | kappa = agreement(a, b, measure='cohen_k') 49 | biserial = binary_continuous(a, b, measure='biserial') 50 | tau = concordance(a, b, measure='kendall_tau') 51 | eta = categorical_continuous(a, b, measure='eta') 52 | pearson = continuous_continuous(a, b, measure='pearson') 53 | ``` 54 | 55 | # Software Copyright 56 | 57 | ``` 58 | Copyright 2020 One-Off Coder 59 | 60 | Licensed under the Apache License, Version 2.0 (the "License"); 61 | you may not use this file except in compliance with the License. 62 | You may obtain a copy of the License at 63 | 64 | http://www.apache.org/licenses/LICENSE-2.0 65 | 66 | Unless required by applicable law or agreed to in writing, software 67 | distributed under the License is distributed on an "AS IS" BASIS, 68 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 69 | See the License for the specific language governing permissions and 70 | limitations under the License. 71 | ``` 72 | 73 | # Book Copyright 74 | 75 | Copyright 2020 One-Off Coder 76 | 77 | This work is licensed under a [Creative Commons Attribution 4.0 International License](https://creativecommons.org/licenses/by/4.0/) by [One-Off Coder](https://www.oneoffcoder.com). 
78 | 79 | ![Creative Commons Attribution 4.0 International License](https://i.creativecommons.org/l/by/4.0/88x31.png "Creative Commons Attribution 4.0 International License") 80 | 81 | # Art Copyright 82 | 83 | Copyright 2020 Daytchia Vang 84 | 85 | # Citation 86 | 87 | ``` 88 | @misc{oneoffcoder_pypair_2020, 89 | title={PyPair, A Statistical API for Bivariate Association Measures}, 90 | url={https://github.com/oneoffcoder/py-pair}, 91 | author={Jee Vang}, 92 | year={2020}, 93 | month={Nov}} 94 | ``` 95 | 96 | # Sponsor, Love 97 | 98 | - [Patreon](https://www.patreon.com/vangj) 99 | - [GitHub](https://github.com/sponsors/vangj) -------------------------------------------------------------------------------- /build.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | DOCKER_FILE=Dockerfile 4 | DOCKER_REPO=pypair 5 | DOCKER_TAG=local 6 | AAPI_VERSION=version 7 | APYPI_REPO=repo 8 | 9 | while getopts v:r: option 10 | do 11 | case "${option}" 12 | in 13 | v) AAPI_VERSION=${OPTARG};; 14 | r) APYPI_REPO=${OPTARG};; 15 | esac 16 | done 17 | 18 | if [[ "version" == $AAPI_VERSION || "repo" == $APYPI_REPO ]]; then 19 | echo "Usage: ./build.sh -r [pypi|testpypi] -v [version]" 20 | echo " -r repository, pypi or testpypi" 21 | echo " -v version e.g. 0.2.5" 22 | else 23 | docker build --no-cache \ 24 | -f $DOCKER_FILE \ 25 | --build-arg AAPI_VERSION=$AAPI_VERSION \ 26 | --build-arg APYPI_REPO=$APYPI_REPO \ 27 | -t ${DOCKER_REPO}:${DOCKER_TAG} . 28 | fi -------------------------------------------------------------------------------- /docs/Makefile: -------------------------------------------------------------------------------- 1 | # Minimal makefile for Sphinx documentation 2 | # 3 | 4 | # You can set these variables from the command line, and also 5 | # from the environment for the first two. 6 | SPHINXOPTS ?= 7 | SPHINXBUILD ?= sphinx-build 8 | SOURCEDIR = source 9 | BUILDDIR = build 10 | 11 | # Put it first so that "make" without argument is like "make help". 12 | help: 13 | @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 14 | 15 | .PHONY: help Makefile 16 | 17 | # Catch-all target: route all unknown targets to Sphinx using the new 18 | # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). 19 | %: Makefile 20 | @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 21 | -------------------------------------------------------------------------------- /docs/autobuild.bat: -------------------------------------------------------------------------------- 1 | python -m sphinx_autobuild ./source ./build -b html --host 0.0.0.0 --port 8000 -------------------------------------------------------------------------------- /docs/autobuild.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | python -m sphinx_autobuild ./source ./build -b html --host 0.0.0.0 --port 8000 4 | -------------------------------------------------------------------------------- /docs/make.bat: -------------------------------------------------------------------------------- 1 | @ECHO OFF 2 | 3 | pushd %~dp0 4 | 5 | REM Command file for Sphinx documentation 6 | 7 | if "%SPHINXBUILD%" == "" ( 8 | set SPHINXBUILD=sphinx-build 9 | ) 10 | set SOURCEDIR=source 11 | set BUILDDIR=build 12 | 13 | if "%1" == "" goto help 14 | 15 | %SPHINXBUILD% >NUL 2>NUL 16 | if errorlevel 9009 ( 17 | echo. 18 | echo.The 'sphinx-build' command was not found.
Make sure you have Sphinx 19 | echo.installed, then set the SPHINXBUILD environment variable to point 20 | echo.to the full path of the 'sphinx-build' executable. Alternatively you 21 | echo.may add the Sphinx directory to PATH. 22 | echo. 23 | echo.If you don't have Sphinx installed, grab it from 24 | echo.http://sphinx-doc.org/ 25 | exit /b 1 26 | ) 27 | 28 | %SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% 29 | goto end 30 | 31 | :help 32 | %SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% 33 | 34 | :end 35 | popd 36 | -------------------------------------------------------------------------------- /docs/source/_code/binary-demo.py: -------------------------------------------------------------------------------- 1 | from pypair.association import binary_binary 2 | from pypair.contingency import BinaryTable 3 | 4 | get_data = lambda x, y, n: [(x, y) for _ in range(n)] 5 | data = get_data(1, 1, 207) + get_data(1, 0, 282) + get_data(0, 1, 231) + get_data(0, 0, 242) 6 | a = [a for a, _ in data] 7 | b = [b for _, b in data] 8 | 9 | for m in BinaryTable.measures(): 10 | r = binary_binary(a, b, m) 11 | print(f'{r}: {m}') 12 | 13 | print('-' * 15) 14 | 15 | table = BinaryTable(a, b) 16 | for m in table.measures(): 17 | r = table.get(m) 18 | print(f'{r}: {m}') 19 | -------------------------------------------------------------------------------- /docs/source/_code/biserial-demo.py: -------------------------------------------------------------------------------- 1 | from pypair.association import binary_continuous 2 | from pypair.biserial import Biserial 3 | 4 | get_data = lambda x, y, n: [(x, y) for _ in range(n)] 5 | data = get_data(1, 1, 207) + get_data(1, 0, 282) + get_data(0, 1, 231) + get_data(0, 0, 242) 6 | a = [a for a, _ in data] 7 | b = [b for _, b in data] 8 | 9 | for m in Biserial.measures(): 10 | r = binary_continuous(a, b, m) 11 | print(f'{r}: {m}') 12 | 13 | print('-' * 15) 14 | 15 | biserial = Biserial(a, b) 16 | for m in biserial.measures(): 17 | r = biserial.get(m) 18 | print(f'{r}: {m}') 19 | -------------------------------------------------------------------------------- /docs/source/_code/categorical-demo.py: -------------------------------------------------------------------------------- 1 | from pypair.association import categorical_categorical 2 | from pypair.contingency import CategoricalTable 3 | 4 | get_data = lambda x, y, n: [(x, y) for _ in range(n)] 5 | data = get_data(1, 1, 207) + get_data(1, 0, 282) + get_data(0, 1, 231) + get_data(0, 0, 242) 6 | a = [a for a, _ in data] 7 | b = [b for _, b in data] 8 | 9 | for m in CategoricalTable.measures(): 10 | r = categorical_categorical(a, b, m) 11 | print(f'{r}: {m}') 12 | 13 | print('-' * 15) 14 | 15 | table = CategoricalTable(a, b) 16 | for m in table.measures(): 17 | r = table.get(m) 18 | print(f'{r}: {m}') 19 | -------------------------------------------------------------------------------- /docs/source/_code/concordance-demo.py: -------------------------------------------------------------------------------- 1 | from pypair.association import concordance 2 | from pypair.continuous import Concordance 3 | 4 | a = [1, 2, 3] 5 | b = [3, 2, 1] 6 | 7 | for m in Concordance.measures(): 8 | r = concordance(a, b, m) 9 | print(f'{r}: {m}') 10 | 11 | print('-' * 15) 12 | 13 | con = Concordance(a, b) 14 | for m in con.measures(): 15 | r = con.get(m) 16 | print(f'{r}: {m}') 17 | -------------------------------------------------------------------------------- /docs/source/_code/confusion-demo.py: 
-------------------------------------------------------------------------------- 1 | from pypair.association import confusion 2 | from pypair.contingency import ConfusionMatrix 3 | 4 | 5 | def get_data(): 6 | """ 7 | Data taken from `here `_. 8 | A pair of binary variables, `a` and `p`, are returned. 9 | 10 | :return: a, p 11 | """ 12 | tn = [(0, 0) for _ in range(50)] 13 | fp = [(0, 1) for _ in range(10)] 14 | fn = [(1, 0) for _ in range(5)] 15 | tp = [(1, 1) for _ in range(100)] 16 | data = tn + fp + fn + tp 17 | a = [a for a, _ in data] 18 | p = [b for _, b in data] 19 | return a, p 20 | 21 | 22 | a, p = get_data() 23 | 24 | # if you need to quickly get just one association measure 25 | r = confusion(a, p, measure='acc') 26 | print(r) 27 | 28 | print('-' * 15) 29 | 30 | # you can also get a list of available association measures 31 | # and loop over to call confusion(...) 32 | # this is more convenient, but less fast 33 | for m in ConfusionMatrix.measures(): 34 | r = confusion(a, p, m) 35 | print(f'{r}: {m}') 36 | 37 | print('-' * 15) 38 | 39 | # if you need multiple association measures, then 40 | # build the confusion matrix table 41 | # this is less convenient, but much faster 42 | matrix = ConfusionMatrix(a, p) 43 | for m in matrix.measures(): 44 | r = matrix.get(m) 45 | print(f'{r}: {m}') 46 | -------------------------------------------------------------------------------- /docs/source/_code/continuous-demo.py: -------------------------------------------------------------------------------- 1 | from pypair.association import continuous_continuous 2 | from pypair.continuous import Continuous 3 | 4 | x = [x for x in range(10)] 5 | y = [y for y in range(10)] 6 | 7 | for m in Continuous.measures(): 8 | r = continuous_continuous(x, y, m) 9 | print(f'{r}: {m}') 10 | 11 | print('-' * 15) 12 | 13 | con = Continuous(x, y) 14 | for m in con.measures(): 15 | r = con.get(m) 16 | print(f'{r}: {m}') 17 | -------------------------------------------------------------------------------- /docs/source/_code/corr-ratio-demo.py: -------------------------------------------------------------------------------- 1 | from pypair.association import categorical_continuous 2 | from pypair.continuous import CorrelationRatio 3 | 4 | data = [ 5 | ('a', 45), ('a', 70), ('a', 29), ('a', 15), ('a', 21), 6 | ('g', 40), ('g', 20), ('g', 30), ('g', 42), 7 | ('s', 65), ('s', 95), ('s', 80), ('s', 70), ('s', 85), ('s', 73) 8 | ] 9 | x = [x for x, _ in data] 10 | y = [y for _, y in data] 11 | for m in CorrelationRatio.measures(): 12 | r = categorical_continuous(x, y, m) 13 | print(f'{r}: {m}') 14 | 15 | print('-' * 15) 16 | 17 | cr = CorrelationRatio(x, y) 18 | for m in cr.measures(): 19 | r = cr.get(m) 20 | print(f'{r}: {m}') 21 | -------------------------------------------------------------------------------- /docs/source/_code/dataframe-tip.py: -------------------------------------------------------------------------------- 1 | from random import randint 2 | 3 | import pandas as pd 4 | 5 | from pypair.association import binary_binary 6 | from pypair.util import corr 7 | 8 | 9 | def get_data(n_rows=1000, n_cols=5): 10 | data = [tuple([randint(0, 1) for _ in range(n_cols)]) for _ in range(n_rows)] 11 | cols = [f'x{i}' for i in range(n_cols)] 12 | return pd.DataFrame(data, columns=cols) 13 | 14 | 15 | if __name__ == '__main__': 16 | jaccard = lambda a, b: binary_binary(a, b, measure='jaccard') 17 | tanimoto = lambda a, b: binary_binary(a, b, measure='tanimoto_i') 18 | 19 | df = get_data() 20 | jaccard_corr = corr(df, 
jaccard) 21 | tanimoto_corr = corr(df, tanimoto) 22 | 23 | print(jaccard_corr) 24 | print('-' * 15) 25 | print(tanimoto_corr) 26 | -------------------------------------------------------------------------------- /docs/source/_code/multiprocessing-tip.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import numpy as np 3 | import random 4 | from random import randint 5 | from pypair.association import binary_binary 6 | from itertools import combinations 7 | from multiprocessing import Pool 8 | 9 | np.random.seed(37) 10 | random.seed(37) 11 | 12 | def get_data(n_rows=1000, n_cols=5): 13 | data = [tuple([randint(0, 1) for _ in range(n_cols)]) for _ in range(n_rows)] 14 | cols = [f'x{i}' for i in range(n_cols)] 15 | return pd.DataFrame(data, columns=cols) 16 | 17 | def compute(a, b, df): 18 | x = df[a] 19 | y = df[b] 20 | return f'{a}_{b}', binary_binary(x, y, measure='jaccard') 21 | 22 | if __name__ == '__main__': 23 | df = get_data() 24 | 25 | with Pool(10) as pool: 26 | pairs = ((a, b, df) for a, b in combinations(df.columns, 2)) 27 | bc = pool.starmap(compute, pairs) 28 | 29 | bc = sorted(bc, key=lambda tup: tup[0]) 30 | print(dict(bc)) -------------------------------------------------------------------------------- /docs/source/_code/spark-demo.py: -------------------------------------------------------------------------------- 1 | import json 2 | from random import choice 3 | 4 | import pandas as pd 5 | from pyspark.sql import SparkSession 6 | 7 | from pypair.spark import binary_binary, confusion, categorical_categorical, agreement, binary_continuous, concordance, \ 8 | categorical_continuous, continuous_continuous 9 | 10 | 11 | def _get_binary_binary_data(spark): 12 | """ 13 | Gets dummy binary-binary data in a Spark dataframe. 14 | 15 | :return: Spark dataframe. 16 | """ 17 | get_data = lambda x, y, n: [(x, y) * 2 for _ in range(n)] 18 | data = get_data(1, 1, 207) + get_data(1, 0, 282) + get_data(0, 1, 231) + get_data(0, 0, 242) 19 | pdf = pd.DataFrame(data, columns=['x1', 'x2', 'x3', 'x4']) 20 | sdf = spark.createDataFrame(pdf) 21 | return sdf 22 | 23 | 24 | def _get_confusion_data(spark): 25 | """ 26 | Gets dummy binary-binary data in Spark dataframe. For use with confusion matrix analysis. 27 | 28 | :return: Spark dataframe. 29 | """ 30 | tn = [(0, 0) * 2 for _ in range(50)] 31 | fp = [(0, 1) * 2 for _ in range(10)] 32 | fn = [(1, 0) * 2 for _ in range(5)] 33 | tp = [(1, 1) * 2 for _ in range(100)] 34 | data = tn + fp + fn + tp 35 | pdf = pd.DataFrame(data, columns=['x1', 'x2', 'x3', 'x4']) 36 | sdf = spark.createDataFrame(pdf) 37 | return sdf 38 | 39 | 40 | def _get_categorical_categorical_data(spark): 41 | """ 42 | Gets dummy categorical-categorical data in Spark dataframe. 43 | 44 | :return: Spark dataframe. 45 | """ 46 | x_domain = ['a', 'b', 'c'] 47 | y_domain = ['a', 'b'] 48 | 49 | get_x = lambda: choice(x_domain) 50 | get_y = lambda: choice(y_domain) 51 | get_data = lambda: {f'x{i}': v for i, v in enumerate((get_x(), get_y(), get_x(), get_y()))} 52 | 53 | pdf = pd.DataFrame([get_data() for _ in range(100)]) 54 | sdf = spark.createDataFrame(pdf) 55 | return sdf 56 | 57 | 58 | def _get_binary_continuous_data(spark): 59 | """ 60 | Gets dummy `binary-continuous data `_. 61 | 62 | :return: Spark dataframe. 
63 | """ 64 | data = [ 65 | (1, 10), (1, 11), (1, 6), (1, 11), (0, 4), 66 | (0, 3), (1, 12), (0, 2), (0, 2), (0, 1) 67 | ] 68 | pdf = pd.DataFrame(data, columns=['gender', 'years']) 69 | sdf = spark.createDataFrame(pdf) 70 | return sdf 71 | 72 | 73 | def _get_concordance_data(spark): 74 | """ 75 | Gets dummy concordance data. 76 | 77 | :return: Spark dataframe. 78 | """ 79 | a = [1, 2, 3] 80 | b = [3, 2, 1] 81 | pdf = pd.DataFrame({'a': a, 'b': b, 'c': a, 'd': b}) 82 | sdf = spark.createDataFrame(pdf) 83 | return sdf 84 | 85 | 86 | def _get_categorical_continuous_data(spark): 87 | data = [ 88 | ('a', 45), ('a', 70), ('a', 29), ('a', 15), ('a', 21), 89 | ('g', 40), ('g', 20), ('g', 30), ('g', 42), 90 | ('s', 65), ('s', 95), ('s', 80), ('s', 70), ('s', 85), ('s', 73) 91 | ] 92 | data = [tup * 2 for tup in data] 93 | pdf = pd.DataFrame(data, columns=['x1', 'x2', 'x3', 'x4']) 94 | sdf = spark.createDataFrame(pdf) 95 | return sdf 96 | 97 | 98 | def _get_continuous_continuous_data(spark): 99 | """ 100 | Gets dummy continuous-continuous data. 101 | See `site `_. 102 | 103 | :return: Spark dataframe. 104 | """ 105 | data = [ 106 | (12, 9), 107 | (10, 12), 108 | (9, 12), 109 | (14, 11), 110 | (10, 8), 111 | (11, 9), 112 | (10, 9), 113 | (10, 6), 114 | (14, 12), 115 | (9, 11), 116 | (11, 12), 117 | (10, 7), 118 | (11, 13), 119 | (15, 14), 120 | (8, 11), 121 | (11, 11), 122 | (9, 8), 123 | (9, 9), 124 | (10, 11), 125 | (12, 9), 126 | (11, 12), 127 | (10, 12), 128 | (9, 7), 129 | (7, 9), 130 | (12, 14) 131 | ] 132 | pdf = pd.DataFrame([item * 2 for item in data], columns=['x1', 'x2', 'x3', 'x4']) 133 | sdf = spark.createDataFrame(pdf) 134 | return sdf 135 | 136 | 137 | spark = None 138 | 139 | try: 140 | # create a spark session 141 | spark = (SparkSession.builder 142 | .master('local[4]') 143 | .appName('local-testing-pyspark') 144 | .getOrCreate()) 145 | 146 | # create some spark dataframes 147 | bin_sdf = _get_binary_binary_data(spark) 148 | con_sdf = _get_confusion_data(spark) 149 | cat_sdf = _get_categorical_categorical_data(spark) 150 | bcn_sdf = _get_binary_continuous_data(spark) 151 | crd_sdf = _get_concordance_data(spark) 152 | ccn_sdf = _get_categorical_continuous_data(spark) 153 | cnt_sdf = _get_continuous_continuous_data(spark) 154 | 155 | # call these methods to get the association measures 156 | bin_results = binary_binary(bin_sdf).collect() 157 | con_results = confusion(con_sdf).collect() 158 | cat_results = categorical_categorical(cat_sdf).collect() 159 | agr_results = agreement(bin_sdf).collect() 160 | bcn_results = binary_continuous(bcn_sdf, binary=['gender'], continuous=['years']).collect() 161 | crd_results = concordance(crd_sdf).collect() 162 | ccn_results = categorical_continuous(ccn_sdf, ['x1', 'x3'], ['x2', 'x4']).collect() 163 | cnt_results = continuous_continuous(cnt_sdf).collect() 164 | 165 | # convert the lists to dictionaries 166 | bin_results = {tup[0]: tup[1] for tup in bin_results} 167 | con_results = {tup[0]: tup[1] for tup in con_results} 168 | cat_results = {tup[0]: tup[1] for tup in cat_results} 169 | agr_results = {tup[0]: tup[1] for tup in agr_results} 170 | bcn_results = {tup[0]: tup[1] for tup in bcn_results} 171 | crd_results = {tup[0]: tup[1] for tup in crd_results} 172 | ccn_results = {tup[0]: tup[1] for tup in ccn_results} 173 | cnt_results = {tup[0]: tup[1] for tup in cnt_results} 174 | 175 | # pretty print 176 | to_json = lambda r: json.dumps({f'{k[0]}_{k[1]}': v for k, v in r.items()}, indent=1) 177 | print(to_json(bin_results)) 178 | print('-' * 10) 179 
| print(to_json(con_results)) 180 | print('*' * 10) 181 | print(to_json(cat_results)) 182 | print('~' * 10) 183 | print(to_json(agr_results)) 184 | print('-' * 10) 185 | print(to_json(bcn_results)) 186 | print('=' * 10) 187 | print(to_json(crd_results)) 188 | print('`' * 10) 189 | print(to_json(ccn_results)) 190 | print('/' * 10) 191 | print(to_json(cnt_results)) 192 | except Exception as e: 193 | print(e) 194 | finally: 195 | try: 196 | spark.stop() 197 | print('closed spark') 198 | except Exception as e: 199 | print(e) 200 | -------------------------------------------------------------------------------- /docs/source/_logo/logo-1000.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/oneoffcoder/py-pair/79e6e75c32333fc9421aad36c02c2e1043ae4a05/docs/source/_logo/logo-1000.png -------------------------------------------------------------------------------- /docs/source/_logo/logo-150.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/oneoffcoder/py-pair/79e6e75c32333fc9421aad36c02c2e1043ae4a05/docs/source/_logo/logo-150.png -------------------------------------------------------------------------------- /docs/source/_logo/logo-250.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/oneoffcoder/py-pair/79e6e75c32333fc9421aad36c02c2e1043ae4a05/docs/source/_logo/logo-250.png -------------------------------------------------------------------------------- /docs/source/_logo/logo-50.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/oneoffcoder/py-pair/79e6e75c32333fc9421aad36c02c2e1043ae4a05/docs/source/_logo/logo-50.png -------------------------------------------------------------------------------- /docs/source/_logo/logo-500.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/oneoffcoder/py-pair/79e6e75c32333fc9421aad36c02c2e1043ae4a05/docs/source/_logo/logo-500.png -------------------------------------------------------------------------------- /docs/source/_static/.gitkeep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/oneoffcoder/py-pair/79e6e75c32333fc9421aad36c02c2e1043ae4a05/docs/source/_static/.gitkeep -------------------------------------------------------------------------------- /docs/source/_static/css/override.css: -------------------------------------------------------------------------------- 1 | table.expand { 2 | width: 100%; 3 | } 4 | table.rc-headers, th.rc-headers, td.rc-headers { 5 | border: 1px dashed blue; 6 | border-collapse: collapse; 7 | padding: 5px; 8 | } 9 | th.heading, td.heading { 10 | font-weight: bold; 11 | } -------------------------------------------------------------------------------- /docs/source/_static/favicon.ico: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/oneoffcoder/py-pair/79e6e75c32333fc9421aad36c02c2e1043ae4a05/docs/source/_static/favicon.ico -------------------------------------------------------------------------------- /docs/source/_static/images/logo-small.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/oneoffcoder/py-pair/79e6e75c32333fc9421aad36c02c2e1043ae4a05/docs/source/_static/images/logo-small.png -------------------------------------------------------------------------------- /docs/source/_static/images/logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/oneoffcoder/py-pair/79e6e75c32333fc9421aad36c02c2e1043ae4a05/docs/source/_static/images/logo.png -------------------------------------------------------------------------------- /docs/source/_static/images/ooc-logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/oneoffcoder/py-pair/79e6e75c32333fc9421aad36c02c2e1043ae4a05/docs/source/_static/images/ooc-logo.png -------------------------------------------------------------------------------- /docs/source/_static/images/ooc-small.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/oneoffcoder/py-pair/79e6e75c32333fc9421aad36c02c2e1043ae4a05/docs/source/_static/images/ooc-small.png -------------------------------------------------------------------------------- /docs/source/_templates/.gitkeep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/oneoffcoder/py-pair/79e6e75c32333fc9421aad36c02c2e1043ae4a05/docs/source/_templates/.gitkeep -------------------------------------------------------------------------------- /docs/source/conf.py: -------------------------------------------------------------------------------- 1 | # Configuration file for the Sphinx documentation builder. 2 | # 3 | # This file only contains a selection of the most common options. For a full 4 | # list see the documentation: 5 | # https://www.sphinx-doc.org/en/master/usage/configuration.html 6 | 7 | # -- Path setup -------------------------------------------------------------- 8 | 9 | # If extensions (or modules to document with autodoc) are in another directory, 10 | # add these directories to sys.path here. If the directory is relative to the 11 | # documentation root, use os.path.abspath to make it absolute, like shown here. 12 | # 13 | import os 14 | import sys 15 | sys.path.insert(0, os.path.abspath('../../')) 16 | 17 | 18 | # -- Project information ----------------------------------------------------- 19 | 20 | project = 'PyPair' 21 | copyright = '2020, One-Off Coder' 22 | author = 'Jee Vang' 23 | 24 | # The full version, including alpha/beta/rc tags 25 | release = '3.0.8' 26 | 27 | 28 | # -- General configuration --------------------------------------------------- 29 | 30 | # Add any Sphinx extension module names here, as strings. They can be 31 | # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom 32 | # ones. 33 | extensions = [ 34 | 'sphinx.ext.autodoc', 35 | 'sphinx.ext.doctest', 36 | 'sphinx.ext.todo', 37 | 'sphinx.ext.coverage', 38 | 'sphinx.ext.mathjax', 39 | 'sphinx.ext.githubpages', 40 | 'sphinxcontrib.bibtex', 41 | 'sphinxcontrib.blockdiag', 42 | 'sphinx_sitemap' 43 | ] 44 | 45 | bibtex_bibfiles = ['refs.bib'] 46 | 47 | # Add any paths that contain templates here, relative to this directory. 48 | templates_path = ['_templates'] 49 | 50 | # List of patterns, relative to source directory, that match files and 51 | # directories to ignore when looking for source files. 52 | # This pattern also affects html_static_path and html_extra_path. 
53 | exclude_patterns = [] 54 | 55 | 56 | # -- Options for HTML output ------------------------------------------------- 57 | 58 | # The theme to use for HTML and HTML Help pages. See the documentation for 59 | # a list of builtin themes. 60 | # 61 | html_theme = 'sphinx_rtd_theme' 62 | 63 | # Add any paths that contain custom static files (such as style sheets) here, 64 | # relative to this directory. They are copied after the builtin static files, 65 | # so a file named "default.css" will overwrite the builtin "default.css". 66 | html_static_path = ['_static'] 67 | html_css_files = [ 68 | 'css/override.css', 69 | ] 70 | html_show_sourcelink = False 71 | html_show_sphinx = False 72 | html_last_updated_fmt = '%b %d, %Y, %X' 73 | html_logo = '_static/images/logo-small.png' 74 | html_favicon = '_static/favicon.ico' 75 | html_theme_options = { 76 | 'canonical_url': 'https://py-pair.readthedocs.io/', 77 | 'analytics_id': 'UA-150762273-1', # Provided by Google in your dashboard 78 | 'logo_only': False, 79 | 'display_version': True, 80 | 'prev_next_buttons_location': 'bottom', 81 | 'style_external_links': True, 82 | 'style_nav_header_background': '#0085CA', 83 | # Toc options 84 | 'collapse_navigation': True, 85 | 'sticky_navigation': True, 86 | 'navigation_depth': 4, 87 | 'includehidden': True, 88 | 'titles_only': False 89 | } -------------------------------------------------------------------------------- /docs/source/deepdives.rst: -------------------------------------------------------------------------------- 1 | Selected Deep Dives 2 | =================== 3 | 4 | Let's go into some association measures in more detail. 5 | 6 | Binary association 7 | ------------------ 8 | 9 | The association between binary variables has been studied prolifically in the last 100 years :cite:`2010:choi,1970:cox,1984:reynolds,2019:warrens,2020:ibm-proximities`. A binary variable has only two values. It is typical to re-encode these values into 0 or 1. How and why each of these two values is mapped to 0 or 1 is subjective, arbitrary and/or context-specific. For example, if we have a variable that captures a person's handedness, favoring the left or right hand, we could map left to 0 and right to 1, or left to 1 and right to 0. The 0-1 representation of a binary variable's values is the common foundation for understanding association. Below is a contingency table created from two binary variables. Notice the main values of the table are `a`, `b`, `c` and `d`. 10 | 11 | - :math:`a = N_{11}` is the count of when the two variables have a value of 1 12 | - :math:`b = N_{10}` is the count of when the row variable has a value of 1 and the column variable has a value of 0 13 | - :math:`c = N_{01}` is the count of when the row variable has a value of 0 and the column variable has a value of 1 14 | - :math:`d = N_{00}` is the count of when the two variables have a value of 0 15 | 16 | Also, look at how the table is structured with the value 1 coming before the value 0 in both the rows and columns. 17 | 18 | .. list-table:: Contingency table for two binary variables 19 | 20 | * - 21 | - 1 22 | - 0 23 | - Total 24 | * - 1 25 | - a 26 | - b 27 | - a + b 28 | * - 0 29 | - c 30 | - d 31 | - c + d 32 | * - Total 33 | - a + c 34 | - b + d 35 | - n = a + b + c + d 36 | 37 | Note that a and d are `matches` and b and c are `mismatches`. Sometimes, depending on the context, matching on 0 is not considered a match. For example, if 1 is the presence of something and 0 is the absence, then it does not feel right to consider a joint observation of absence and absence a match (you cannot say two things match on what is not there). Additionally, when 1 is presence and 0 is absence, and the data is very sparse (a lot of 0's compared to 1's), treating absence and absence as matching will make the two variables appear very similar. 38 |
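To make the `a`, `b`, `c` and `d` tallies concrete, here is a minimal sketch (illustrative only, not the PyPair API; the two variables are made up) that counts the four cells from a pair of 0-1 lists.

.. code-block:: python

    # Two made-up binary variables observed together six times.
    x = [1, 1, 1, 0, 0, 0]
    y = [1, 1, 0, 1, 0, 0]

    a = sum(1 for u, v in zip(x, y) if u == 1 and v == 1)  # matches on 1
    b = sum(1 for u, v in zip(x, y) if u == 1 and v == 0)  # mismatch: row 1, column 0
    c = sum(1 for u, v in zip(x, y) if u == 0 and v == 1)  # mismatch: row 0, column 1
    d = sum(1 for u, v in zip(x, y) if u == 0 and v == 0)  # matches on 0
    n = a + b + c + d

    print(a, b, c, d, n)  # 2 1 1 2 6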
39 | In :cite:`2010:choi`, there are 76 similarity and distance measures identified (some are not unique and/or redundant). Similarity is how `alike` two things are, and distance is how `different` two things are; or, in other words, similarity is how close two things are and distance is how far apart they are. If a similarity or distance measure produces a value in :math:`[0, 1]`, then we can convert between the two easily. 40 | 41 | - If :math:`s` is the similarity, then :math:`d = 1 - s` is the distance. 42 | - If :math:`d` is the distance, then :math:`s = 1 - d` is the similarity. 43 | 44 | If we use a contingency table to summarize bivariate binary data, the following similarity and distance measures may be derived entirely from `a`, `b`, `c` and/or `d`. The general pattern is that a similarity or distance measure is always a ratio. The numerator in the ratio defines what we are interested in measuring. When we have `a` and/or `d` in the numerator, it is likely we are measuring similarity; when we have `b` and/or `c` in the numerator, it is likely we are measuring distance. The denominator captures what is considered important: the matches, the mismatches or both. The following tables list some identified similarity and distance measures based on 2 x 2 contingency tables. 45 | 46 | .. list-table:: Similarity measures for 2 x 2 contingency table :cite:`2010:choi,2019:warrens,2020:psu-binary` 47 | :header-rows: 1 48 | 49 | * - Name 50 | - Computation 51 | * - 3W-Jaccard 52 | - :math:`\frac{3a}{3a+b+c}` 53 | * - Ample 54 | - :math:`\left|\frac{a(c+d)}{c(a+b)}\right|` 55 | * - Anderberg 56 | - :math:`\frac{\sigma-\sigma'}{2n}` 57 | * - Baroni-Urbani-Buser-I 58 | - :math:`\frac{\sqrt{ad}+a}{\sqrt{ad}+a+b+c}` 59 | * - Baroni-Urbani-Buser-II 60 | - :math:`\frac{\sqrt{ad}+a-(b+c)}{\sqrt{ad}+a+b+c}` 61 | * - Braun-Banquet 62 | - :math:`\frac{a}{\max(a+b,a+c)}` 63 | * - Cole :cite:`2010:choi,2019:warrens` 64 | - :math:`\frac{\sqrt{2}(ad-bc)}{\sqrt{(ad-bc)^2-(a+b)(a+c)(b+d)(c+d)}}` 65 | * - 66 | - :math:`\frac{ad-bc}{\min((a+b)(a+c),(b+d)(c+d))}` 67 | * - Cosine 68 | - :math:`\frac{a}{\sqrt{(a+b)(a+c)}}` 69 | * - Dennis 70 | - :math:`\frac{ad-bc}{\sqrt{n(a+b)(a+c)}}` 71 | * - Dice; Czekanowski; Nei-Li 72 | - :math:`\frac{2a}{2a+b+c}` 73 | * - Dispersion 74 | - :math:`\frac{ad-bc}{(a+b+c+d)^2}` 75 | * - Driver-Kroeber 76 | - :math:`\frac{a}{2}\left(\frac{1}{a+b}+\frac{1}{a+c}\right)` 77 | * - Eyraud 78 | - :math:`\frac{n^2(na-(a+b)(a+c))}{(a+b)(a+c)(b+d)(c+d)}` 79 | * - Fager-McGowan 80 | - :math:`\frac{a}{\sqrt{(a+b)(a+c)}}-\frac{\max(a+b,a+c)}{2}` 81 | * - Faith 82 | - :math:`\frac{a+0.5d}{a+b+c+d}` 83 | * - Forbes-II 84 | - :math:`\frac{na-(a+b)(a+c)}{n \min(a+b,a+c) - (a+b)(a+c)}` 85 | * - Forbesi 86 | - :math:`\frac{na}{(a+b)(a+c)}` 87 | * - Fossum 88 | - :math:`\frac{n(a-0.5)^2}{(a+b)(a+c)}` 89 | * - Gilbert-Wells 90 | - :math:`\log a - \log n - \log \frac{a+b}{n} - \log \frac{a+c}{n}` 91 | * - Goodman-Kruskal 92 | - :math:`\frac{\sigma - \sigma'}{2n-\sigma'}` 93 | * - 94 | - :math:`\sigma=\max(a,b)+\max(c,d)+\max(a,c)+\max(b,d)` 95 | * - 96 | - 
:math:`\sigma'=\max(a+c,b+d)+\max(a+b,c+d)` 97 | * - Gower 98 | - :math:`\frac{a+d}{\sqrt{(a+b)(a+c)(b+d)(c+d)}}` 99 | * - Gower-Legendre 100 | - :math:`\frac{a+d}{a+0.5b+0.5c+d}` 101 | * - Hamann 102 | - :math:`\frac{(a+d)-(b+c)}{a+b+c+d}` 103 | * - Inner Product 104 | - :math:`a+d` 105 | * - Intersection 106 | - :math:`a` 107 | * - Jaccard :cite:`2020:wiki-jaccard` 108 | - :math:`\frac{a}{a+b+c}` 109 | * - Johnson 110 | - :math:`\frac{a}{a+b}+\frac{a}{a+c}` 111 | * - Kulczynski-I 112 | - :math:`\frac{a}{b+c}` 113 | * - Kulczynski-II 114 | - :math:`\frac{0.5a(2a+b+c)}{(a+b)(a+c)}` 115 | * - 116 | - :math:`\frac{1}{2}\left(\frac{a}{a + b} + \frac{a}{a + c}\right)` 117 | * - McConnaughey 118 | - :math:`\frac{a^2 - bc}{(a+b)(a+c)}` 119 | * - Michael 120 | - :math:`\frac{4(ad-bc)}{(a+d)^2+(b+c)^2}` 121 | * - Mountford 122 | - :math:`\frac{a}{0.5(ab + ac) + bc}` 123 | * - Ochiai-I :cite:`2020:stack-sim`; Otsuka; Fowlkes-Mallows Index :cite:`2020:wiki-fowlkes` 124 | - :math:`\frac{a}{\sqrt{(a+b)(a+c)}}` 125 | * - 126 | - :math:`\sqrt{\frac{a}{a + b}\frac{a}{a + c}}` 127 | * - Ochiai-II 128 | - :math:`\frac{ad}{\sqrt{(a+b)(a+c)(b+d)(c+d)}}` 129 | * - Pearson-Heron-I 130 | - :math:`\frac{ad-bc}{\sqrt{(a+b)(a+c)(b+d)(c+d)}}` 131 | * - Pearson-Heron-II 132 | - :math:`\cos\left(\frac{\pi \sqrt{bc}}{\sqrt{ad}+\sqrt{bc}}\right)` 133 | * - Pearson-I 134 | - :math:`\chi^2=\frac{n(ad-bc)^2}{(a+b)(a+c)(c+d)(b+d)}` 135 | * - Pearson-II 136 | - :math:`\sqrt{\frac{\chi^2}{n+\chi^2}}` 137 | * - Pearson-III 138 | - :math:`\sqrt{\frac{\rho}{n+\rho}}` 139 | * - 140 | - :math:`\rho=\frac{ad-bc}{\sqrt{(a+b)(a+c)(b+d)(c+d)}}` 141 | * - Peirce 142 | - :math:`\frac{ab+bc}{ab+2bc+cd}` 143 | * - Roger-Tanimoto 144 | - :math:`\frac{a+d}{a+2b+2c+d}` 145 | * - Russell-Rao 146 | - :math:`\frac{a}{a+b+c+d}` 147 | * - Simpson; Overlap :cite:`2020:wiki-overlap` 148 | - :math:`\frac{a}{\min(a+b,a+c)}` 149 | * - Sokal-Michener; Rand Index 150 | - :math:`\frac{a+d}{a+b+c+d}` 151 | * - Sokal-Sneath-I 152 | - :math:`\frac{a}{a+2b+2c}` 153 | * - Sokal-Sneath-II 154 | - :math:`\frac{2a+2d}{2a+b+c+2d}` 155 | * - Sokal-Sneath-III 156 | - :math:`\frac{a+d}{b+c}` 157 | * - Sokal-Sneath-IV 158 | - :math:`\frac{1}{4}\left(\frac{a}{a+b}+\frac{a}{a+c}+\frac{d}{b+d}+\frac{d}{c+d}\right)` 159 | * - Sokal-Sneath-V 160 | - :math:`\frac{ad}{(a+b)(a+c)(b+d)\sqrt{c+d}}` 161 | * - Sørensen–Dice :cite:`2020:wiki-dice` 162 | - :math:`\frac{2(a + d)}{2(a + d) + b + c}` 163 | * - Sorgenfrei 164 | - :math:`\frac{a^2}{(a+b)(a+c)}` 165 | * - Stiles 166 | - :math:`\log_{10} \frac{n\left(|ad-bc|-\frac{n}{2}\right)^2}{(a+b)(a+c)(b+d)(c+d)}` 167 | * - Tanimoto-I 168 | - :math:`\frac{a}{2a+b+c}` 169 | * - Tanimoto-II :cite:`2020:wiki-jaccard` 170 | - :math:`\frac{a}{b + c}` 171 | * - Tarwid 172 | - :math:`\frac{na - (a+b)(a+c)}{na + (a+b)(a+c)}` 173 | * - Tarantula 174 | - :math:`\frac{a(c+d)}{c(a+b)}` 175 | * - Tetrachoric 176 | - :math:`\frac{y-1}{y+1}` 177 | * - 178 | - :math:`y = \left(\frac{ad}{bc}\right)^{\frac{\pi}{4}}` 179 | * - Tversky Index :cite:`2020:wiki-tversky` 180 | - :math:`\frac{a}{a+\theta b+ \phi c}` 181 | * - 182 | - :math:`\theta` and :math:`\phi` are user-supplied parameters 183 | * - Yule-Q 184 | - :math:`\frac{ad-bc}{ad+bc}` 185 | * - Yule-w 186 | - :math:`\frac{\sqrt{ad}-\sqrt{bc}}{\sqrt{ad}+\sqrt{bc}}` 187 | 188 |
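To see how a few of the similarity measures above fall out of `a`, `b`, `c` and `d`, here is a small hand-rolled sketch (illustrative only; in practice, PyPair's `binary_binary` and `BinaryTable` shown in the demo scripts compute these for you). The counts are the same ones used in the demo scripts.

.. code-block:: python

    from math import sqrt

    # Cell counts of a 2 x 2 contingency table (same counts as the demo scripts).
    a, b, c, d = 207, 282, 231, 242
    n = a + b + c + d

    jaccard = a / (a + b + c)
    dice = 2 * a / (2 * a + b + c)
    ochiai_i = a / sqrt((a + b) * (a + c))
    sokal_michener = (a + d) / n  # also known as the Rand index

    print(jaccard, dice, ochiai_i, sokal_michener)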
.. list-table:: Distance measures for 2 x 2 contingency table :cite:`2010:choi` 189 | :header-rows: 1 190 | 191 | * - Name 192 | - Computation 193 | * - Chord 194 | - :math:`\sqrt{2\left(1 - \frac{a}{\sqrt{(a+b)(a+c)}}\right)}` 195 | * - Euclid 196 | - :math:`\sqrt{b+c}` 197 | * - Hamming; Canberra; Manhattan; Cityblock; Minkowski 198 | - :math:`b+c` 199 | * - Hellinger 200 | - :math:`2\sqrt{1 - \frac{a}{\sqrt{(a+b)(a+c)}}}` 201 | * - Jaccard distance :cite:`2020:wiki-jaccard` 202 | - :math:`\frac{b + c}{a + b + c}` 203 | * - Lance-Williams; Bray-Curtis 204 | - :math:`\frac{b+c}{2a+b+c}` 205 | * - Mean-Manhattan 206 | - :math:`\frac{b+c}{a+b+c+d}` 207 | * - Pattern Difference 208 | - :math:`\frac{4bc}{(a+b+c+d)^2}` 209 | * - Shape Difference 210 | - :math:`\frac{n(b+c)-(b-c)^2}{(a+b+c+d)^2}` 211 | * - Size Difference 212 | - :math:`\frac{(b+c)^2}{(a+b+c+d)^2}` 213 | * - Squared-Euclid 214 | - :math:`\sqrt{(b+c)^2}` 215 | * - Vari 216 | - :math:`\frac{b+c}{4a+4b+4c+4d}` 217 | * - Yule-Q 218 | - :math:`\frac{2bc}{ad+bc}` 219 | 220 | Instead of using `a`, `b`, `c` and `d` from a contingency table to define these association measures, it is common to use set notation. For two binary variables, :math:`X` and :math:`Y`, the following are equivalent. 221 | 222 | - :math:`|X \cap Y| = a` 223 | - :math:`|X \setminus Y| = b` 224 | - :math:`|Y \setminus X| = c` 225 | - :math:`|X \cup Y| = a + b + c` 226 | 227 | You will notice that `d` does not show up in the above relationship. 228 |
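The set correspondence is easy to check with plain Python sets of the indices where each variable equals 1. Here is a throwaway sketch (not the PyPair API); note that `d` cannot be recovered from these sets alone, which is why it is absent above.

.. code-block:: python

    # Sets of observation indices where each binary variable equals 1.
    x = [1, 1, 1, 0, 0, 0]
    y = [1, 1, 0, 1, 0, 0]
    X = {i for i, v in enumerate(x) if v == 1}
    Y = {i for i, v in enumerate(y) if v == 1}

    a = len(X & Y)      # |X intersection Y|
    b = len(X - Y)      # |X set-minus Y|
    c = len(Y - X)      # |Y set-minus X|
    union = len(X | Y)  # |X union Y| = a + b + c

    print(a, b, c, union)  # 2 1 1 4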
Another way is to use rules; both are equivalent. Because we will use abstract notation to describe the math and the rules that determine concordance, discordance or ties for each pair, and because we are striving for clarity, let's expand these observation pairs into their component pairs of values and also their corresponding :math:`X` and :math:`Y` indexed notation. 290 | 291 | - :math:`O_1, O_2 = (1, 3), (1, 3) = (X_1, Y_1), (X_2, Y_2)` 292 | - :math:`O_1, O_3 = (1, 3), (2, 4) = (X_1, Y_1), (X_3, Y_3)` 293 | - :math:`O_1, O_4 = (1, 3), (0, 2) = (X_1, Y_1), (X_4, Y_4)` 294 | - :math:`O_1, O_5 = (1, 3), (0, 4) = (X_1, Y_1), (X_5, Y_5)` 295 | - :math:`O_1, O_6 = (1, 3), (2, 2) = (X_1, Y_1), (X_6, Y_6)` 296 | - :math:`O_2, O_3 = (1, 3), (2, 4) = (X_2, Y_2), (X_3, Y_3)` 297 | - :math:`O_2, O_4 = (1, 3), (0, 2) = (X_2, Y_2), (X_4, Y_4)` 298 | - :math:`O_2, O_5 = (1, 3), (0, 4) = (X_2, Y_2), (X_5, Y_5)` 299 | - :math:`O_2, O_6 = (1, 3), (2, 2) = (X_2, Y_2), (X_6, Y_6)` 300 | - :math:`O_3, O_4 = (2, 4), (0, 2) = (X_3, Y_3), (X_4, Y_4)` 301 | - :math:`O_3, O_5 = (2, 4), (0, 4) = (X_3, Y_3), (X_5, Y_5)` 302 | - :math:`O_3, O_6 = (2, 4), (2, 2) = (X_3, Y_3), (X_6, Y_6)` 303 | - :math:`O_4, O_5 = (0, 2), (0, 4) = (X_4, Y_4), (X_5, Y_5)` 304 | - :math:`O_4, O_6 = (0, 2), (2, 2) = (X_4, Y_4), (X_6, Y_6)` 305 | - :math:`O_5, O_6 = (0, 4), (2, 2) = (X_5, Y_5), (X_6, Y_6)` 306 | 307 | Now we can finally describe how to determine whether any pair of observations is concordant, discordant or tied. If we want to use math to do so, then, for any two observations :math:`(X_i, Y_i)` and :math:`(X_j, Y_j)`, the following determines the status. 308 | 309 | - concordant when :math:`(X_j - X_i)(Y_j - Y_i) > 0` 310 | - discordant when :math:`(X_j - X_i)(Y_j - Y_i) < 0` 311 | - tied when :math:`(X_j - X_i)(Y_j - Y_i) = 0` 312 | 313 | If we prefer rules, then the following determines the status. 314 | 315 | - concordant if :math:`X_i < X_j` and :math:`Y_i < Y_j` **or** :math:`X_i > X_j` and :math:`Y_i > Y_j` 316 | - discordant if :math:`X_i < X_j` and :math:`Y_i > Y_j` **or** :math:`X_i > X_j` and :math:`Y_i < Y_j` 317 | - tied if :math:`X_i = X_j` **or** :math:`Y_i = Y_j` 318 | 319 | All pairs of observations will evaluate categorically to one of these statuses. Continuing with our dummy data above, the concordancy status of the 15 pairs of observations is as follows (where concordant is C, discordant is D and tied is T). 320 | 321 | .. 
list-table:: Concordancy Status 322 |    :header-rows: 1 323 | 324 |    * - :math:`(X_i, Y_i)` 325 |      - :math:`(X_j, Y_j)` 326 |      - status 327 |    * - :math:`(1, 3)` 328 |      - :math:`(1, 3)` 329 |      - T 330 |    * - :math:`(1, 3)` 331 |      - :math:`(2, 4)` 332 |      - C 333 |    * - :math:`(1, 3)` 334 |      - :math:`(0, 2)` 335 |      - C 336 |    * - :math:`(1, 3)` 337 |      - :math:`(0, 4)` 338 |      - D 339 |    * - :math:`(1, 3)` 340 |      - :math:`(2, 2)` 341 |      - D 342 |    * - :math:`(1, 3)` 343 |      - :math:`(2, 4)` 344 |      - C 345 |    * - :math:`(1, 3)` 346 |      - :math:`(0, 2)` 347 |      - C 348 |    * - :math:`(1, 3)` 349 |      - :math:`(0, 4)` 350 |      - D 351 |    * - :math:`(1, 3)` 352 |      - :math:`(2, 2)` 353 |      - D 354 |    * - :math:`(2, 4)` 355 |      - :math:`(0, 2)` 356 |      - C 357 |    * - :math:`(2, 4)` 358 |      - :math:`(0, 4)` 359 |      - T 360 |    * - :math:`(2, 4)` 361 |      - :math:`(2, 2)` 362 |      - T 363 |    * - :math:`(0, 2)` 364 |      - :math:`(0, 4)` 365 |      - T 366 |    * - :math:`(0, 2)` 367 |      - :math:`(2, 2)` 368 |      - T 369 |    * - :math:`(0, 4)` 370 |      - :math:`(2, 2)` 371 |      - D 372 | 373 | In this data set, the counts are :math:`C=5`, :math:`D=5` and :math:`T=5`. Note that the pair :math:`(2, 4), (0, 4)` is tied because :math:`Y_i = Y_j = 4`. If we divide these counts by the total number of pairs of observations, then we get the following probabilities. 374 | 375 | - :math:`\pi_C = \frac{C}{{n \choose 2}} = \frac{5}{15} = 0.33` 376 | - :math:`\pi_D = \frac{D}{{n \choose 2}} = \frac{5}{15} = 0.33` 377 | - :math:`\pi_T = \frac{T}{{n \choose 2}} = \frac{5}{15} = 0.33` 378 | 379 | Sometimes, it is desirable to distinguish between the types of ties. There are three possible types of ties. 380 | 381 | - :math:`T^X` are ties on only :math:`X` 382 | - :math:`T^Y` are ties on only :math:`Y` 383 | - :math:`T^{XY}` are ties on both :math:`X` and :math:`Y` 384 | 385 | Note, :math:`T = T^X + T^Y + T^{XY}`. If we want to distinguish between the tie types, then the status of each pair of observations is as follows. 386 | 387 | .. list-table:: Concordancy Status 388 |    :header-rows: 1 389 | 390 |    * - :math:`(X_i, Y_i)` 391 |      - :math:`(X_j, Y_j)` 392 |      - status 393 |    * - :math:`(1, 3)` 394 |      - :math:`(1, 3)` 395 |      - :math:`T^{XY}` 396 |    * - :math:`(1, 3)` 397 |      - :math:`(2, 4)` 398 |      - C 399 |    * - :math:`(1, 3)` 400 |      - :math:`(0, 2)` 401 |      - C 402 |    * - :math:`(1, 3)` 403 |      - :math:`(0, 4)` 404 |      - D 405 |    * - :math:`(1, 3)` 406 |      - :math:`(2, 2)` 407 |      - D 408 |    * - :math:`(1, 3)` 409 |      - :math:`(2, 4)` 410 |      - C 411 |    * - :math:`(1, 3)` 412 |      - :math:`(0, 2)` 413 |      - C 414 |    * - :math:`(1, 3)` 415 |      - :math:`(0, 4)` 416 |      - D 417 |    * - :math:`(1, 3)` 418 |      - :math:`(2, 2)` 419 |      - D 420 |    * - :math:`(2, 4)` 421 |      - :math:`(0, 2)` 422 |      - C 423 |    * - :math:`(2, 4)` 424 |      - :math:`(0, 4)` 425 |      - :math:`T^Y` 426 |    * - :math:`(2, 4)` 427 |      - :math:`(2, 2)` 428 |      - :math:`T^X` 429 |    * - :math:`(0, 2)` 430 |      - :math:`(0, 4)` 431 |      - :math:`T^X` 432 |    * - :math:`(0, 2)` 433 |      - :math:`(2, 2)` 434 |      - :math:`T^Y` 435 |    * - :math:`(0, 4)` 436 |      - :math:`(2, 2)` 437 |      - D 438 | 439 | Distinguishing between ties, in this data set, the counts are :math:`C=5`, :math:`D=5`, :math:`T^X=2`, :math:`T^Y=2` and :math:`T^{XY}=1`. 
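As a sanity check, here is a minimal Python sketch (not part of the PyPair API; the variable names are ours) that tallies these statuses for the dummy data using the mathematical rule above.

.. code:: python

    from itertools import combinations

    # the dummy data from above
    data = [(1, 3), (1, 3), (2, 4), (0, 2), (0, 4), (2, 2)]

    counts = {'C': 0, 'D': 0, 'T_X': 0, 'T_Y': 0, 'T_XY': 0}
    for (x_i, y_i), (x_j, y_j) in combinations(data, 2):
        v = (x_j - x_i) * (y_j - y_i)
        if v > 0:
            counts['C'] += 1
        elif v < 0:
            counts['D'] += 1
        elif x_i == x_j and y_i == y_j:
            counts['T_XY'] += 1
        elif x_i == x_j:
            counts['T_X'] += 1
        else:
            counts['T_Y'] += 1

    print(counts)  # {'C': 5, 'D': 5, 'T_X': 2, 'T_Y': 2, 'T_XY': 1}

The probabilities of these statuses are as follows.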
440 | 441 | - :math:`\pi_C = \frac{C}{{n \choose 2}} = \frac{5}{15} = 0.33` 442 | - :math:`\pi_D = \frac{D}{{n \choose 2}} = \frac{5}{15} = 0.33` 443 | - :math:`\pi_{T^X} = \frac{T^X}{{n \choose 2}} = \frac{2}{15} = 0.13` 444 | - :math:`\pi_{T^Y} = \frac{T^Y}{{n \choose 2}} = \frac{2}{15} = 0.13` 445 | - :math:`\pi_{T^{XY}} = \frac{T^{XY}}{{n \choose 2}} = \frac{1}{15} = 0.07` 446 | 447 | There are quite a few measures of association using concordance as the basis for strength of association. 448 | 449 | .. list-table:: Association measures using concordance 450 |    :header-rows: 1 451 | 452 |    * - Association Measure 453 |      - Formula 454 |    * - Goodman-Kruskal's :math:`\gamma` 455 |      - :math:`\gamma = \frac{\pi_C - \pi_D}{1 - \pi_T}` 456 |    * - Somers' :math:`d` 457 |      - :math:`d_{Y \cdot X} = \frac{\pi_C - \pi_D}{\pi_C + \pi_D + \pi_{T^Y}}` 458 |    * - 459 |      - :math:`d_{X \cdot Y} = \frac{\pi_C - \pi_D}{\pi_C + \pi_D + \pi_{T^X}}` 460 |    * - Kendall's :math:`\tau` 461 |      - :math:`\tau = \frac{C - D}{{n \choose 2}}` 462 | 463 | .. note:: 464 |     Sometimes `Somers' d` is written as `Somers' D`, `Somers' Delta` or even incorrectly as `Somer's D` :cite:`2017:glen,2020:wiki-somersd`. Somers' d has two versions, one that is symmetric and one that is asymmetric. The asymmetric Somers' d is the one most typically referred to :cite:`2017:glen`. The definition of Somers' d presented here is the asymmetric one, which explains :math:`d_{Y \cdot X}` and :math:`d_{X \cdot Y}`. 465 | 466 | Goodman-Kruskal's :math:`\lambda` 467 | --------------------------------- 468 | 469 | Goodman-Kruskal's lambda :math:`\lambda_{A|B}` measures the `proportional reduction in error` ``PRE`` for two categorical variables, :math:`A` and :math:`B`, when we want to understand how knowing :math:`B` reduces the probability of an error in predicting :math:`A`. :math:`\lambda_{A|B}` is estimated as follows. 470 | 471 | :math:`\lambda_{A|B} = \frac{P_E - P_{E|B}}{P_E}` 472 | 473 | Where, 474 | 475 | - :math:`P_E = 1 - \frac{\max_c N_{+c}}{N_{++}}` 476 | - :math:`P_{E|B} = 1 - \frac{\sum_r \max_c N_{rc}}{N_{++}}` 477 | 478 | In meaningful language. 479 | 480 | - :math:`P_E` is the probability of an error in predicting :math:`A` 481 | - :math:`P_{E|B}` is the probability of an error in predicting :math:`A` given knowledge of :math:`B` 482 | 483 | The terms :math:`N_{+c}`, :math:`N_{rc}` and :math:`N_{++}` come from the contingency table we build from :math:`A` and :math:`B` (:math:`A` is in the columns and :math:`B` is in the rows) and denote the column marginal for the `c-th` column, the count for the cell in the `r-th` row and `c-th` column, and the total count, respectively. To be clear. 484 | 485 | - :math:`N_{+c}` is the column marginal for the `c-th` column 486 | - :math:`N_{rc}` is the count for the cell in the `r-th` row and `c-th` column 487 | - :math:`N_{++}` is the total number of observations 488 | 489 | The contingency table induced with :math:`A` in the columns and :math:`B` in the rows will look like the following. Note that :math:`A` has `C` columns and :math:`B` has `R` rows, or, in other words, :math:`A` has `C` values and :math:`B` has `R` values. 490 | 491 | .. 
list-table:: Contingency Table for :math:`A` and :math:`B` 492 | 493 |    * - 494 |      - :math:`A_1` 495 |      - :math:`A_2` 496 |      - :math:`\dotsb` 497 |      - :math:`A_C` 498 |    * - :math:`B_1` 499 |      - :math:`N_{11}` 500 |      - :math:`N_{12}` 501 |      - :math:`\dotsb` 502 |      - :math:`N_{1C}` 503 |    * - :math:`B_2` 504 |      - :math:`N_{21}` 505 |      - :math:`N_{22}` 506 |      - :math:`\dotsb` 507 |      - :math:`N_{2C}` 508 |    * - :math:`\vdots` 509 |      - :math:`\vdots` 510 |      - :math:`\vdots` 511 |      - 512 |      - :math:`\vdots` 513 |    * - :math:`B_R` 514 |      - :math:`N_{R1}` 515 |      - :math:`N_{R2}` 516 |      - :math:`\dotsb` 517 |      - :math:`N_{RC}` 518 | 519 | The table above only shows the cell counts :math:`N_{11}, N_{12}, \ldots, N_{RC}` and **not** the row and column marginals. Below, we expand the contingency table to include 520 | 521 | - the row marginals :math:`N_{1+}, N_{2+}, \ldots, N_{R+}`, as well as, 522 | - the column marginals :math:`N_{+1}, N_{+2}, \ldots, N_{+C}`. 523 | 524 | .. list-table:: Contingency Table for :math:`A` and :math:`B` 525 | 526 |    * - 527 |      - :math:`A_1` 528 |      - :math:`A_2` 529 |      - :math:`\dotsb` 530 |      - :math:`A_C` 531 |      - 532 |    * - :math:`B_1` 533 |      - :math:`N_{11}` 534 |      - :math:`N_{12}` 535 |      - :math:`\dotsb` 536 |      - :math:`N_{1C}` 537 |      - :math:`N_{1+}` 538 |    * - :math:`B_2` 539 |      - :math:`N_{21}` 540 |      - :math:`N_{22}` 541 |      - :math:`\dotsb` 542 |      - :math:`N_{2C}` 543 |      - :math:`N_{2+}` 544 |    * - :math:`\vdots` 545 |      - :math:`\vdots` 546 |      - :math:`\vdots` 547 |      - 548 |      - :math:`\vdots` 549 |      - :math:`\vdots` 550 |    * - :math:`B_R` 551 |      - :math:`N_{R1}` 552 |      - :math:`N_{R2}` 553 |      - :math:`\dotsb` 554 |      - :math:`N_{RC}` 555 |      - :math:`N_{R+}` 556 |    * - 557 |      - :math:`N_{+1}` 558 |      - :math:`N_{+2}` 559 |      - :math:`\dotsb` 560 |      - :math:`N_{+C}` 561 |      - :math:`N_{++}` 562 | 563 | Note that the row marginal for a row is the sum of the values across the columns, and the column marginal for a column is the sum of the values down the rows. 564 | 565 | - :math:`N_{r+} = \sum_c N_{rc}` 566 | - :math:`N_{+c} = \sum_r N_{rc}` 567 | 568 | Also, :math:`N_{++}` is just the sum over all the cells (excluding the row and column marginals). :math:`N_{++}` is really just the sample size. 569 | 570 | - :math:`N_{++} = \sum_r \sum_c N_{rc}` 571 | 572 | Let's go back to computing :math:`P_E` and :math:`P_{E|B}`. 573 | 574 | :math:`P_E` is given as follows. 575 | 576 | - :math:`P_E = 1 - \frac{\max_c N_{+c}}{N_{++}}` 577 | 578 | :math:`\max_c N_{+c}` returns the maximum of the column marginals, and :math:`\frac{\max_c N_{+c}}{N_{++}}` is just a probability. Which probability is it? It is the largest probability associated with a value of :math:`A` (specifically, the value of :math:`A` with the largest count). If we were to predict which value of :math:`A` would show up, we would choose the value of :math:`A` with the highest probability (it is the most likely). We would be correct :math:`\frac{\max_c N_{+c}}{N_{++}}` percent of the time, and we would be wrong :math:`1 - \frac{\max_c N_{+c}}{N_{++}}` percent of the time. Thus, :math:`P_E` is the error in predicting :math:`A` (knowing nothing else other than the distribution, or `probability mass function` ``PMF``, of :math:`A`). 579 | 
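Before unpacking :math:`P_{E|B}`, here is a minimal numpy sketch (a toy table of our own invention, not part of the PyPair API, and assuming numpy is available) that computes :math:`P_E`, :math:`P_{E|B}` and :math:`\lambda_{A|B}` directly from a contingency table with :math:`A` in the columns and :math:`B` in the rows; :math:`P_{E|B}` is unpacked right after.

.. code:: python

    import numpy as np

    # hypothetical contingency table: A in the columns, B in the rows
    N = np.array([[10, 20, 5],
                  [40, 4, 6]])
    n = N.sum()                          # N_{++}, the sample size

    p_e = 1 - N.sum(axis=0).max() / n    # 1 - (max column marginal) / N_{++}
    p_e_b = 1 - N.max(axis=1).sum() / n  # 1 - (sum of row-wise maxima) / N_{++}
    lambda_a_b = (p_e - p_e_b) / p_e

    print(p_e, p_e_b, lambda_a_b)        # approximately 0.41, 0.29, 0.29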
580 | :math:`P_{E|B}` is given as follows. 581 | 582 | - :math:`P_{E|B} = 1 - \frac{\sum_r \max_c N_{rc}}{N_{++}}` 583 | 584 | What is :math:`\max_c N_{rc}` giving us? It is the maximum cell count in the `r-th` row. :math:`\sum_r \max_c N_{rc}` adds up all the largest values in each row, and :math:`\frac{\sum_r \max_c N_{rc}}{N_{++}}` is again a probability. Which probability is it? This probability is the one associated with predicting the value of :math:`A` when we know :math:`B`. When we know the value of :math:`B`, the predicted value of :math:`A` should be the one with the largest count (it has the highest probability, or, equivalently, the highest count). If we always choose the value of :math:`A` with the highest count associated with the observed value of :math:`B`, we are correct :math:`\frac{\sum_r \max_c N_{rc}}{N_{++}}` percent of the time and incorrect :math:`1 - \frac{\sum_r \max_c N_{rc}}{N_{++}}` percent of the time. Thus, :math:`P_{E|B}` is the error in predicting :math:`A` when we know the value of :math:`B` and the PMF of :math:`A` given :math:`B`. 585 | 586 | The expression :math:`P_E - P_{E|B}` is the reduction in the probability of an error in predicting :math:`A` given knowledge of :math:`B`. This expression represents the `reduction in error` in the phrase/term ``PRE``. The proportional part in ``PRE`` comes from the expression :math:`\frac{P_E - P_{E|B}}{P_E}`, which is a proportion. 587 | 588 | What :math:`\lambda_{A|B}` is trying to compute is the reduction of error in predicting :math:`A` when we know :math:`B`. Did we reduce any prediction error of :math:`A` by knowing :math:`B`? 589 | 590 | - When :math:`\lambda_{A|B} = 0`, this value means that knowing :math:`B` did not reduce any prediction error in :math:`A`. The only way to get :math:`\lambda_{A|B} = 0` is when :math:`P_E = P_{E|B}`. 591 | - When :math:`\lambda_{A|B} = 1`, this value means that knowing :math:`B` completely reduced all prediction errors in :math:`A`. The only way to get :math:`\lambda_{A|B} = 1` is when :math:`P_{E|B} = 0`. 592 | 593 | Generally speaking, :math:`\lambda_{A|B} \neq \lambda_{B|A}`, and :math:`\lambda` is thus an asymmetric association measure. To compute :math:`\lambda_{B|A}`, simply put :math:`B` in the columns and :math:`A` in the rows and reuse the formulas above. 594 | 595 | Furthermore, :math:`\lambda` can be used in studies of causality :cite:`1983:liebetrau`. We are not saying it is appropriate or even possible to entertain causality with just two variables alone :cite:`2020:pearl,2016:pearl,2009:pearl,1988:pearl`, but, when we have two categorical variables and want to know which is likely the cause and which the effect, the asymmetry between :math:`\lambda_{A|B}` and :math:`\lambda_{B|A}` may prove informational :cite:`2020:wiki-prospect`. Causal analysis based on two variables alone has been studied :cite:`2008:nips`. 596 | -------------------------------------------------------------------------------- /docs/source/index.rst: -------------------------------------------------------------------------------- 1 | .. meta:: 2 | :description: A statistical API for bivariate association measures. 3 | :keywords: python, statistics, bivariate, association, categorical, binary, nominal, ordinal, continuous, ratio, interval, contingency table analysis, apache spark, spark, high performance computing, massively parallel processing, hpc, mpp, causality, symmetric, asymmetric, correlation, confusion matrix, concordance, ranking 4 | :robots: index, follow 5 | :abstract: A statistical API for bivariate association measures. There are over 130 association measures identified between the product of categorical and continuous variable types. 
6 | :author: Jee Vang, Ph.D. 7 | :contact: g@oneoffcoder.com 8 | :copyright: One-Off Coder 9 | :content: global 10 | :generator: Sphinx 11 | :language: English 12 | :rating: general 13 | :reply-to: info@oneoffcoder.com 14 | :web_author: Jee Vang, Ph.D. 15 | :revisit-after: 1 days 16 | 17 | .. PyPair documentation master file, created by 18 | sphinx-quickstart on Wed Nov 11 22:56:50 2020. 19 | You can adapt this file completely to your liking, but it should at least 20 | contain the root `toctree` directive. 21 | 22 | PyPair 23 | ====== 24 | 25 | .. image:: _static/images/logo.png 26 | :align: center 27 | :alt: pypair logo. 28 | 29 | PyPair is a statistical library to compute pairwise association between any two types of variables. You can use the library locally on your laptop or desktop, or, you may use it on a `Spark `_ cluster. 30 | 31 | .. blockdiag:: 32 | 33 | diagram { 34 | default_shape = roundedbox 35 | span_width = 32 36 | span_height = 20 37 | default_fontsize = 11 38 | edge_layout = normal 39 | orientation = landscape 40 | 41 | V [label = "Variable", color = pink] 42 | C [label = "Continuous", color = "#edfa78"] 43 | I [label = "Interval", color = "#def514"] 44 | R [label = "Ratio", color = "#def514"] 45 | A [label = "Categorical", color = "#e0e0e0"] 46 | B [label = "Binary", color ="#e4ede6"] 47 | N [label = "Nominal", color ="#e4ede6"] 48 | O [label = "Ordinal", color ="#e4ede6"] 49 | 50 | V -> A, C 51 | C -> I, R 52 | A -> B, N, O 53 | } 54 | 55 | You may install ``py-pair`` from `pypi `_. 56 | 57 | .. code:: bash 58 | 59 | pip install pypair 60 | 61 | .. toctree:: 62 | :maxdepth: 2 63 | :caption: Contents 64 | 65 | intro 66 | quicklist 67 | quickstart 68 | deepdives 69 | zzz-bib 70 | 71 | .. toctree:: 72 | :maxdepth: 2 73 | :caption: API Documentation 74 | 75 | modules 76 | 77 | Indices and tables 78 | ================== 79 | 80 | * :ref:`genindex` 81 | * :ref:`modindex` 82 | * :ref:`search` 83 | 84 | About 85 | ===== 86 | 87 | .. image:: _static/images/ooc-logo.png 88 | :alt: One-Off Coder logo. 89 | 90 | One-Off Coder is an educational, service and product company. Please visit us online to discover how we may help you achieve life-long success in your personal coding career or with your company's business goals and objectives. 91 | 92 | - |Website_Link| 93 | - |Facebook_Link| 94 | - |Twitter_Link| 95 | - |Instagram_Link| 96 | - |YouTube_Link| 97 | - |LinkedIn_Link| 98 | 99 | .. |Website_Link| raw:: html 100 | 101 | Website 102 | 103 | .. |Facebook_Link| raw:: html 104 | 105 | Facebook 106 | 107 | .. |Twitter_Link| raw:: html 108 | 109 | Twitter 110 | 111 | .. |Instagram_Link| raw:: html 112 | 113 | Instagram 114 | 115 | .. |YouTube_Link| raw:: html 116 | 117 | YouTube 118 | 119 | .. |LinkedIn_Link| raw:: html 120 | 121 | LinkedIn 122 | 123 | Copyright 124 | ========= 125 | 126 | Documentation 127 | ------------- 128 | 129 | .. raw:: html 130 | 131 | 132 | This work is licensed under a Creative Commons Attribution 4.0 International License by One-Off Coder. 133 |
134 |
135 | 136 | Creative Commons License 137 | 138 |
139 |
140 | 141 | 142 | Software 143 | -------- 144 | 145 | :: 146 | 147 | Copyright 2020 One-Off Coder 148 | 149 | Licensed under the Apache License, Version 2.0 (the "License"); 150 | you may not use this file except in compliance with the License. 151 | You may obtain a copy of the License at 152 | 153 | http://www.apache.org/licenses/LICENSE-2.0 154 | 155 | Unless required by applicable law or agreed to in writing, software 156 | distributed under the License is distributed on an "AS IS" BASIS, 157 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 158 | See the License for the specific language governing permissions and 159 | limitations under the License. 160 | 161 | Art 162 | --- 163 | 164 | :: 165 | 166 | Copyright 2020 Daytchia Vang 167 | 168 | Citation 169 | ======== 170 | 171 | :: 172 | 173 | @misc{oneoffcoder_pypair_2020, 174 | title={PyPair, A Statistical API for Bivariate Association Measures}, 175 | url={https://github.com/oneoffcoder/py-pair}, 176 | author={Jee Vang}, 177 | year={2020}, 178 | month={Nov}} 179 | 180 | Author 181 | ====== 182 | 183 | Jee Vang, Ph.D. 184 | 185 | - |Patreon_Link| 186 | - |Github_Link| 187 | 188 | .. |Patreon_Link| raw:: html 189 | 190 | Patreon: support is appreciated 191 | 192 | .. |Github_Link| raw:: html 193 | 194 | GitHub: sponsorship will help us change the world for the better 195 | 196 | Help 197 | ==== 198 | 199 | - |Source_Link| 200 | - |Gitter_Link| 201 | 202 | .. |Source_Link| raw:: html 203 | 204 | GitHub: source code 205 | 206 | .. |Gitter_Link| raw:: html 207 | 208 | Gitter: chat -------------------------------------------------------------------------------- /docs/source/intro.rst: -------------------------------------------------------------------------------- 1 | Introduction 2 | ============ 3 | 4 | PyPair is a statistical library to compute pairwise association between any two variables. A reasonable taxonomy of variable types in statistics is as follows :cite:`2020:uom,2020:idre,2020:laerd,2020:graphpad,2020:minitab`. 5 | 6 | - ``Categorical``: A variable whose values have no intrinsic ordering. An example is a variable indicating the continents: North America, South America, Asia, Arctic, Antarctica, Africa and Europe. There is no ordering to these continents; we cannot say North America comes before Africa. Categorical variables are also referred to as `qualitative` variables. 7 | - ``Binary``: A categorical variable that has only 2 values. An example is a variable indicating whether or not someone likes to eat pizza; the values could be ``yes`` or ``no``. It is common to encode the binary values to ``0`` and ``1`` for storage and numerical convenience, but do not be fooled, there is still no numerical ordering. These variables are also referred to in the wild as `dichotomous` variables. 8 | - ``Nominal``: A categorical variable that has 3 or more values. When most people think of categorical variables, they think of nominal variables. 9 | - ``Ordinal``: A categorical variable whose values have a logical order but the difference between any two values do not give meaningful numerical magnitude. An example of an ordinal variable is one that indicates the performance on a test: good, better, best. We know that good is the base, better is the comparative and best is the superlative, however, we cannot say that the difference between best and good is two numbers up. For all we know, best can be orders of magnitude away from good. 
10 | - ``Continuous``: A variable whose values are (basically) numbers, and thus, have meaningful ordering. A continuous variable may have an infinite number of values. Continuous variables are also referred to as `quantitative` variables. 11 | - ``Interval``: A continuous variable whose values exist on a continuum of numerical values. Temperature measured in Celsius or Fahrenheit is an example of an interval variable. 12 | - ``Ratio``: An interval variable with a true zero. Temperature measured in Kelvin is an example of a ratio variable. 13 | 14 | .. note:: 15 |     If we have a variable capturing eye colors, the possible values may be blue, green or brown. On first sight, this variable may be considered a nominal variable. Instead of capturing the eye color categorically, what if we measure the wavelengths of eye colors? Below are estimations of each of the wavelengths (nanometers) corresponding to these colors. 16 | 17 |     - blue: 450 18 |     - green: 550 19 |     - brown: 600 20 | 21 |     Which variable type does the eye color variable become? 22 | 23 | .. note:: 24 |     There is also much use of the term ``discrete variable``, and sometimes it refers to categorical or continuous variables. In general, a discrete variable has a finite set of values, and in this sense, a discrete variable could be a categorical variable. We have seen many cases of a continuous variable (infinite values) undergoing `discretization` (finite values). The resulting variable from discretization is often treated as a categorical variable by applying statistical operations appropriate for that type of variable. Yet, in some cases, a continuous variable can also be a discrete variable. If we have a variable to capture age (whole numbers only), we might observe a range :math:`[0, 120]`. There are 121 values (zero is included), but still, we can treat this age variable like a ratio variable. 25 | 26 | Assuming we have data and we know the variable types in this data using the taxonomy above, we might want to make a progression of analyses from univariate, to bivariate, to multivariate analyses. Along the way, for bivariate analysis, we are often curious about the association between any pair of variables. We want to know both the magnitude (the strength, is it small or big?) and direction (the sign, is it positive or negative?) of the association. When the variables are all of the same type, association measures abound for conducting pairwise association; if all the variables are continuous, we might just want to apply canonical Pearson correlation. 27 | 28 | The tough situation is when we have a dataset of mixed variable types, and this tough situation is quite often the normal situation. How do we find the association between a continuous and a categorical variable? We can create a table as below to map the available association measure approaches for any two types of variables :cite:`2020:calkins,2020:psu-continuous`. (In the table below, we collapse all categorical and continuous variable types.) 29 | 30 | .. raw:: html 31 | 32 | 
    <table>
    <tr>
    <th></th>
    <th>Categorical</th>
    <th>Continuous</th>
    </tr>
    <tr>
    <th>Categorical</th>
    <td>
    <ul>
    <li>Confusion Matrix</li>
    <li>2x2 Contingency Table Analysis</li>
    <li>RxC Contingency Table Analysis</li>
    <li>Agreement Analysis</li>
    </ul>
    </td>
    <td>-</td>
    </tr>
    <tr>
    <th>Continuous</th>
    <td>
    <ul>
    <li>Biserial</li>
    <li>Correlation Ratio</li>
    <li>ANOVA</li>
    <li>Clustering Coefficients</li>
    </ul>
    </td>
    <td>
    <ul>
    <li>Correlation</li>
    <li>Concordance</li>
    </ul>
    </td>
    </tr>
    </table>
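To make the mapping concrete, here is a minimal sketch (with made-up data; the measure names are taken from the quick list) of how the table above translates into picking one of PyPair's convenience functions based on the types of the two variables being paired.

.. code:: python

    from pypair.association import (
        binary_binary, binary_continuous,
        categorical_continuous, concordance, continuous_continuous)

    # made-up data
    a_bin = [0, 1, 1, 0, 1, 0, 1, 1]
    b_bin = [0, 1, 0, 0, 1, 1, 1, 1]
    x_cat = ['r', 'g', 'b', 'r', 'g', 'b', 'r', 'g']
    y_con = [1.2, 3.4, 2.2, 0.8, 3.1, 2.7, 1.1, 3.3]
    z_con = [2.0, 3.0, 1.0, 0.5, 3.2, 2.5, 1.4, 2.9]

    print(binary_binary(a_bin, b_bin, measure='jaccard'))
    print(binary_continuous(a_bin, y_con, measure='point_biserial'))
    print(categorical_continuous(x_cat, y_con, measure='eta'))
    print(concordance(y_con, z_con, measure='kendall_tau'))
    print(continuous_continuous(y_con, z_con, measure='pearson'))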
68 | 69 | The ultimate goal of this project is to identify as many measures of association for these unique pairs of variable types as possible and to implement these association measures in a unified application programming interface (API). 70 | 71 | .. note:: 72 |     We use the term `association` over `correlation` since the latter typically connotes canonical Pearson correlation or association between two continuous variables. The term `association` is more general and can cover specific types of association, such as `agreement` measures, alongside those dealing with continuous variables :cite:`1983:liebetrau`. 73 | -------------------------------------------------------------------------------- /docs/source/modules.rst: -------------------------------------------------------------------------------- 1 | .. toctree:: 2 |    :maxdepth: 4 3 | 4 |    pypair 5 | -------------------------------------------------------------------------------- /docs/source/pypair.rst: -------------------------------------------------------------------------------- 1 | PyPair 2 | ====== 3 | 4 | Contingency Table Analysis 5 | -------------------------- 6 | 7 | These are the basic contingency tables used to analyze categorical data. 8 | 9 | - CategoricalTable 10 | - BinaryTable 11 | - ConfusionMatrix 12 | - AgreementTable 13 | 14 | .. automodule:: pypair.contingency 15 |     :members: 16 |     :undoc-members: 17 |     :show-inheritance: 18 |     :special-members: __init__ 19 | 20 | Biserial 21 | -------- 22 | 23 | These are the biserial association measures. 24 | 25 | .. automodule:: pypair.biserial 26 |     :members: 27 |     :undoc-members: 28 |     :show-inheritance: 29 |     :special-members: __init__ 30 | 31 | Continuous 32 | ---------- 33 | 34 | These are the continuous association measures. 35 | 36 | .. automodule:: pypair.continuous 37 |     :members: 38 |     :undoc-members: 39 |     :show-inheritance: 40 |     :special-members: __init__ 41 | 42 | Associations 43 | ------------ 44 | 45 | Some of the functions here are just wrappers around the contingency tables and may be looked at as convenience methods to simply pass in data for two variables. If you need more than the specific association, you are encouraged to build the appropriate contingency table and then call upon the measures you need. 46 | 47 | .. automodule:: pypair.association 48 |     :members: 49 |     :undoc-members: 50 |     :show-inheritance: 51 |     :special-members: __init__ 52 | 53 | Decorators 54 | ---------- 55 | 56 | These are decorators. 57 | 58 | .. automodule:: pypair.decorator 59 |     :members: 60 |     :undoc-members: 61 |     :show-inheritance: 62 |     :special-members: __init__ 63 | 64 | Utility 65 | ------- 66 | 67 | These are utility functions. 68 | 69 | .. automodule:: pypair.util 70 |     :members: 71 |     :undoc-members: 72 |     :show-inheritance: 73 |     :special-members: __init__ 74 | 75 | Spark 76 | ----- 77 | 78 | These are functions that you can use with Spark. You must pass in a Spark dataframe and you will get a ``pair-RDD`` as output. The pair-RDD will have the following as its keys and values. 79 | 80 | - key: in the form of a tuple of strings ``(k1, k2)``, where k1 and k2 are names of variables (column names) 81 | - value: a dictionary ``{'acc': 0.8, 'tpr': 0.9, 'fpr': 0.8, ...}``, where keys are association measure names and values are the corresponding association values 82 | 83 | 84 | 85 | .. 
automodule:: pypair.spark 86 | :members: 87 | :undoc-members: 88 | :show-inheritance: 89 | :special-members: __init__ 90 | -------------------------------------------------------------------------------- /docs/source/quicklist.rst: -------------------------------------------------------------------------------- 1 | Quick List 2 | ========== 3 | 4 | Below are just some quick listing of association measures without any description. These association measures are grouped by variable pair types and/or approach. 5 | 6 | Binary-Binary (88) 7 | ------------------ 8 | 9 | - adjusted_rand_index 10 | - ample 11 | - anderberg 12 | - baroni_urbani_buser_i 13 | - baroni_urbani_buser_ii 14 | - braun_banquet 15 | - chisq 16 | - chisq 17 | - chisq_dof 18 | - chord 19 | - cole_i 20 | - cole_ii 21 | - contingency_coefficient 22 | - cosine 23 | - cramer_v 24 | - dennis 25 | - dice 26 | - disperson 27 | - driver_kroeber 28 | - euclid 29 | - eyraud 30 | - fager_mcgowan 31 | - faith 32 | - forbes_ii 33 | - forbesi 34 | - fossum 35 | - gilbert_wells 36 | - gk_lambda 37 | - gk_lambda_reversed 38 | - goodman_kruskal 39 | - gower 40 | - gower_legendre 41 | - hamann 42 | - hamming 43 | - hellinger 44 | - inner_product 45 | - intersection 46 | - jaccard 47 | - jaccard_3w 48 | - jaccard_distance 49 | - johnson 50 | - kulcyznski_ii 51 | - kulczynski_i 52 | - lance_williams 53 | - mcconnaughey 54 | - mcnemar_test 55 | - mean_manhattan 56 | - michael 57 | - mountford 58 | - mutual_information 59 | - ochia_i 60 | - ochia_ii 61 | - odds_ratio 62 | - pattern_difference 63 | - pearson_heron_i 64 | - pearson_heron_ii 65 | - pearson_i 66 | - peirce 67 | - person_ii 68 | - phi 69 | - roger_tanimoto 70 | - russel_rao 71 | - shape_difference 72 | - simpson 73 | - size_difference 74 | - sokal_michener 75 | - sokal_sneath_i 76 | - sokal_sneath_ii 77 | - sokal_sneath_iii 78 | - sokal_sneath_iv 79 | - sokal_sneath_v 80 | - sorensen_dice 81 | - sorgenfrei 82 | - stiles 83 | - tanimoto_distance 84 | - tanimoto_i 85 | - tanimoto_ii 86 | - tarantula 87 | - tarwid 88 | - tetrachoric 89 | - tschuprow_t 90 | - uncertainty_coefficient 91 | - uncertainty_coefficient_reversed 92 | - vari 93 | - yule_q 94 | - yule_q_difference 95 | - yule_w 96 | - yule_y 97 | 98 | Confusion Matrix, Binary-Binary (29) 99 | ------------------------------------ 100 | 101 | - acc 102 | - ba 103 | - bm 104 | - dor 105 | - f1 106 | - fdr 107 | - fn 108 | - fnr 109 | - fomr 110 | - fp 111 | - fpr 112 | - mcc 113 | - mk 114 | - n 115 | - nlr 116 | - npv 117 | - plr 118 | - ppv 119 | - precision 120 | - prevalence 121 | - pt 122 | - recall 123 | - sensitivity 124 | - specificity 125 | - tn 126 | - tnr 127 | - tp 128 | - tpr 129 | - ts 130 | 131 | Categorical-Categorical (9) 132 | --------------------------- 133 | 134 | - adjusted_rand_index 135 | - chisq 136 | - chisq_dof 137 | - gk_lambda 138 | - gk_lambda_reversed 139 | - mutual_information 140 | - phi 141 | - uncertainty_coefficient 142 | - uncertainty_coefficient_reversed 143 | 144 | Categorical-Continuous, Biserial (3) 145 | ------------------------------------ 146 | 147 | - biserial 148 | - point_biserial 149 | - rank_biserial 150 | 151 | Categorical-Continuous (7) 152 | -------------------------- 153 | 154 | - anova 155 | - calinski_harabasz 156 | - davies_bouldin 157 | - eta 158 | - eta_squared 159 | - kruskal 160 | - silhouette 161 | 162 | Ordinal-Ordinal, Concordance (3) 163 | -------------------------------- 164 | 165 | - goodman_kruskal_gamma 166 | - kendall_tau 167 | - somers_d 168 | 169 | 
Continuous-Continuous (4) 170 | ------------------------- 171 | 172 | - kendall 173 | - pearson 174 | - regression 175 | - spearman -------------------------------------------------------------------------------- /docs/source/quickstart.rst: -------------------------------------------------------------------------------- 1 | Quickstart 2 | ========== 3 | 4 | Installation 5 | ------------ 6 | 7 | Use PyPi to install the `package `_. 8 | 9 | .. code:: bash 10 | 11 | pip install pypair 12 | 13 | Confusion Matrix 14 | ---------------- 15 | 16 | A confusion matrix is typically used to judge binary classification performance. There are two variables, :math:`A` and :math:`P`, where :math:`A` is the actual value (ground truth) and :math:`P` is the predicted value. The example below shows how to use the convenience method ``confusion()`` and the class ``ConfusionMatrix`` to get association measures derived from the confusion matrix. 17 | 18 | .. literalinclude:: _code/confusion-demo.py 19 | :language: python 20 | :linenos: 21 | 22 | Binary-Binary 23 | ------------- 24 | 25 | Association measures for binary-binary variables are computed using ``binary_binary()`` or ``BinaryTable``. 26 | 27 | .. literalinclude:: _code/binary-demo.py 28 | :language: python 29 | :linenos: 30 | 31 | Categorical-Categorical 32 | ----------------------- 33 | 34 | Association measures for categorical-categorical variables are computed using ``categorical_categorical()`` or ``CategoricalTable``. 35 | 36 | .. literalinclude:: _code/categorical-demo.py 37 | :language: python 38 | :linenos: 39 | 40 | Binary-Continuous 41 | ----------------- 42 | 43 | Association measures for binary-continuous variables are computed using ``binary_continuous()`` or ``Biserial``. 44 | 45 | .. literalinclude:: _code/biserial-demo.py 46 | :language: python 47 | :linenos: 48 | 49 | Ordinal-Ordinal, Concordance 50 | ---------------------------- 51 | Concordance measures are used for ordinal-ordinal or continuous-continuous variables using ``concordance()`` or ``Concordance()``. 52 | 53 | .. literalinclude:: _code/concordance-demo.py 54 | :language: python 55 | :linenos: 56 | 57 | Categorical-Continuous 58 | ---------------------- 59 | Categorical-continuous association measures are computed using ``categorical_continuous()`` or ``CorrelationRatio``. 60 | 61 | .. literalinclude:: _code/corr-ratio-demo.py 62 | :language: python 63 | :linenos: 64 | 65 | Continuous-Continuous 66 | --------------------- 67 | 68 | Association measures for continuous-continuous variables are computed using ``continuous_continuous()`` or ``Continuous``. 69 | 70 | .. literalinclude:: _code/continuous-demo.py 71 | :language: python 72 | :linenos: 73 | 74 | Recipe 75 | ------ 76 | 77 | Here's a recipe in using multiprocessing to compute pairwise association with binary data. 78 | 79 | .. literalinclude:: _code/multiprocessing-tip.py 80 | :language: python 81 | :linenos: 82 | 83 | Here's a nifty utility method to create a correlation matrix. The input data frame must be all the same type and you must supply a function. Note that Pandas ``DataFrame.corr()`` no longer supports processing non-numeric data; fields that are not numeric will be simply skipped over. Why? 84 | 85 | .. literalinclude:: _code/dataframe-tip.py 86 | :language: python 87 | :linenos: 88 | 89 | Apache Spark 90 | ------------ 91 | 92 | Spark is supported for some of the association measures. `Active support `_ is appreciated. Below are some code samples to get you started. 93 | 94 | .. 
literalinclude:: _code/spark-demo.py 95 | :language: python 96 | :linenos: -------------------------------------------------------------------------------- /docs/source/refs.bib: -------------------------------------------------------------------------------- 1 | @misc{2020:calkins, 2 | author = {Keith G. Calkins}, 3 | title = {More Correlation Coefficients}, 4 | url = {https://www.andrews.edu/~calkins/math/edrm611/edrm13.htm}, 5 | addendum = "(accessed: 11.12.2020)" 6 | } 7 | @misc{2020:uom, 8 | author = {University of Minnesota}, 9 | title = {Types of Variables}, 10 | url = {https://cyfar.org/types-variables}, 11 | addendum = "(accessed: 11.12.2020)" 12 | } 13 | @misc{2020:idre, 14 | author = {Institute for Digital Research and Education}, 15 | title = {What is the difference between categorical, ordinal and numerical variables?}, 16 | url = {https://stats.idre.ucla.edu/other/mult-pkg/whatstat/what-is-the-difference-between-categorical-ordinal-and-numerical-variables/}, 17 | addendum = "(accessed: 11.12.2020)" 18 | } 19 | @misc{2020:minitab, 20 | author = {Minitab}, 21 | title = {What are categorical, discrete, and continuous variables?}, 22 | url = {https://support.minitab.com/en-us/minitab-express/1/help-and-how-to/modeling-statistics/regression/supporting-topics/basics/what-are-categorical-discrete-and-continuous-variables/}, 23 | addendum = "(accessed: 11.12.2020)" 24 | } 25 | @misc{2020:laerd, 26 | author = {Laerd Statistics}, 27 | title = {Types of Variable}, 28 | url = {https://statistics.laerd.com/statistical-guides/types-of-variable.php}, 29 | addendum = "(accessed: 11.12.2020)" 30 | } 31 | @misc{2020:graphpad, 32 | author = {GraphPad}, 33 | title = {What is the difference between ordinal, interval and ratio variables? Why should I care?}, 34 | url = {https://www.graphpad.com/support/faq/what-is-the-difference-between-ordinal-interval-and-ratio-variables-why-should-i-care/}, 35 | addendum = "(accessed: 11.12.2020)" 36 | } 37 | @misc{2020:wiki-somersd, 38 | author = {Wikipedia}, 39 | title = {Somer's D}, 40 | url = {https://en.wikipedia.org/wiki/Somers%27_D}, 41 | addendum = "(accessed: 11.12.2020)" 42 | } 43 | @misc{2020:wiki-jaccard, 44 | author = {Wikipedia}, 45 | title = {Jaccard index}, 46 | url = {https://en.wikipedia.org/wiki/Jaccard_index}, 47 | addendum = "(accessed: 11.14.2020)" 48 | } 49 | @misc{2020:wiki-dice, 50 | author = {Wikipedia}, 51 | title = {Sørensen–Dice coefficient}, 52 | url = {https://en.wikipedia.org/wiki/S%C3%B8rensen%E2%80%93Dice_coefficient}, 53 | addendum = "(accessed: 11.14.2020)" 54 | } 55 | @misc{2020:wiki-tversky, 56 | author = {Wikipedia}, 57 | title = {Tversky index}, 58 | url = {https://en.wikipedia.org/wiki/Tversky_index}, 59 | addendum = "(accessed: 11.14.2020)" 60 | } 61 | @misc{2020:wiki-prospect, 62 | author = {Wikipedia}, 63 | title = {Prospect theory}, 64 | url = {https://en.wikipedia.org/wiki/Prospect_theory}, 65 | addendum = "(accessed: 11.14.2020)" 66 | } 67 | @misc{2020:wiki-overlap, 68 | author = {Wikipedia}, 69 | title = {Overlap coefficient}, 70 | url = {https://en.wikipedia.org/wiki/Overlap_coefficient}, 71 | addendum = "(accessed: 11.14.2020)" 72 | } 73 | @misc{2020:wiki-fowlkes, 74 | author = {Wikipedia}, 75 | title = {Fowlkes-Mallows Index}, 76 | url = {https://en.wikipedia.org/wiki/Fowlkes%E2%80%93Mallows_index}, 77 | addendum = "(accessed: 11.14.2020)" 78 | } 79 | @misc{2008:nips, 80 | author = {NIPS}, 81 | title = {NIPS 2008 Workshop on Causality}, 82 | url = {http://clopinet.com/isabelle/Projects/NIPS2008/}, 83 | addendum 
= "(accessed: 11.13.2020)" 84 | } 85 | @Book{1983:liebetrau, 86 | author = {Albert M. Liebetrau}, 87 | title = {Measures of association}, 88 | publisher = {Sage Publications, Inc.}, 89 | year = {1983} 90 | } 91 | @Book{1984:reynolds, 92 | author = {H. T. Reynolds}, 93 | title = {Analysis of nominal data}, 94 | publisher = {Sage Publications, Inc.}, 95 | year = {1984} 96 | } 97 | @Book{1970:cox, 98 | author = {D. R. Cox}, 99 | title = {Analysis of binary data}, 100 | publisher = {Chapman and Hall}, 101 | year = {1970} 102 | } 103 | @Book{2020:pearl, 104 | author = {Judea Pearl}, 105 | title = {The Book of Why: The New Science of Cause and Effect}, 106 | publisher = {Basic Books}, 107 | year = {2020} 108 | } 109 | @Book{2016:pearl, 110 | author = {Judea Pearl}, 111 | title = {Causal Inference in Statistics - A Primer}, 112 | publisher = {Wiley}, 113 | year = {2016} 114 | } 115 | @Book{2009:pearl, 116 | author = {Judea Pearl}, 117 | title = {Causality: Models, Reasoning and Inference}, 118 | publisher = {Chapman and Hall}, 119 | year = {2009} 120 | } 121 | @Book{1988:pearl, 122 | author = {Judea Pearl}, 123 | title = {Probabilistic Reasoning in Intelligent Systems: Networks of Plausible Inference}, 124 | publisher = {Morgan Kaufmann}, 125 | year = {1988} 126 | } 127 | @misc{2017:glen, 128 | author = {Stephanie Glen}, 129 | title = {What is Somers’ Delta?}, 130 | url = {https://www.statisticshowto.com/somers-d}, 131 | addendum = "(accessed: 11.14.2020)" 132 | } 133 | @misc{2020:psu-binary, 134 | author = {Penn State University}, 135 | title = {Measures of Association for Binary Variables}, 136 | url = {https://online.stat.psu.edu/stat505/lesson/14/14.3}, 137 | addendum = "(accessed: 11.14.2020)" 138 | } 139 | @misc{2020:psu-continuous, 140 | author = {Penn State University}, 141 | title = {Measures of Association for Continuous Variables}, 142 | url = {https://online.stat.psu.edu/stat505/lesson/14/14.2}, 143 | addendum = "(accessed: 11.14.2020)" 144 | } 145 | @misc{2020:ibm-proximities, 146 | author = {IBM Proximities}, 147 | title = {Measures for binary data}, 148 | url = {https://www.ibm.com/support/knowledgecenter/SSLVMB_24.0.0/spss/base/syn_proximities_measures_binary_data.html}, 149 | addendum = "(accessed: 11.14.2020)" 150 | } 151 | @misc{2020:stack-sim, 152 | author = {Stack Exchange}, 153 | title = {Measures for binary data}, 154 | url = {https://stats.stackexchange.com/questions/61705/similarity-coefficients-for-binary-data-why-choose-jaccard-over-russell-and-rao}, 155 | addendum = "(accessed: 11.14.2020)" 156 | } 157 | @article{2010:choi, 158 | author = {Seung-Seok Choi, Sung-Hyuk Cha, Charles C. Tappert}, 159 | title = {A Survey of Binary Similarity and Distance Measures}, 160 | journal = {Systemics, Cybernetics and Informatics}, 161 | year = 2010, 162 | number = 1, 163 | volume = 8 164 | } 165 | @article{2019:warrens, 166 | author = {Matthijs J. 
Warrens}, 167 | title = {Similarity measures for 2 x 2 tables}, 168 | journal = {Journal of Intelligent & Fuzzy Systems}, 169 | year = 2019, 170 | volume = 36 171 | } 172 | 173 | -------------------------------------------------------------------------------- /docs/source/robots.txt: -------------------------------------------------------------------------------- 1 | User-agent: * 2 | Allow: / 3 | Sitemap: https://py-pair.readthedocs.io/sitemap.xml -------------------------------------------------------------------------------- /docs/source/zzz-bib.rst: -------------------------------------------------------------------------------- 1 | Bibliography 2 | ------------ 3 | 4 | .. bibliography:: refs.bib 5 | :all: -------------------------------------------------------------------------------- /logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/oneoffcoder/py-pair/79e6e75c32333fc9421aad36c02c2e1043ae4a05/logo.png -------------------------------------------------------------------------------- /misc/SPARK.md: -------------------------------------------------------------------------------- 1 | # Spark Tinkering 2 | 3 | To run Spark + Jupyter container. Then go to [http://localhost:8888](http://localhost:8888). 4 | 5 | On Linux. 6 | 7 | ```bash 8 | docker run -it \ 9 | -p 9870:9870 \ 10 | -p 8088:8088 \ 11 | -p 8080:8080 \ 12 | -p 18080:18080 \ 13 | -p 9000:9000 \ 14 | -p 8888:8888 \ 15 | -p 9864:9864 \ 16 | -v $HOME/git/py-pair/misc/ipynb:/root/ipynb \ 17 | -e PYSPARK_MASTER=spark://localhost:7077 \ 18 | -e NOTEBOOK_PASSWORD='' \ 19 | oneoffcoder/spark-jupyter 20 | ``` 21 | 22 | On Windows. 23 | 24 | ```bash 25 | docker run -it ^ 26 | -p 9870:9870 ^ 27 | -p 8088:8088 ^ 28 | -p 8080:8080 ^ 29 | -p 18080:18080 ^ 30 | -p 9000:9000 ^ 31 | -p 8888:8888 ^ 32 | -p 9864:9864 ^ 33 | -v ./git/py-pair/misc/ipynb:/root/ipynb ^ 34 | -e PYSPARK_MASTER=spark://localhost:7077 ^ 35 | -e NOTEBOOK_PASSWORD='' ^ 36 | oneoffcoder/spark-jupyter 37 | ``` -------------------------------------------------------------------------------- /misc/binary-measures.csv: -------------------------------------------------------------------------------- 1 | type,name,equation 2 | s,Jaccard,"\frac{a}{a+b+c}" 3 | s,Dice;Czekanowski;Nei-Li,"\frac{2a}{2a+b+c}" 4 | s,3W-Jaccard,"\frac{3a}{3a+b+c}" 5 | s,Sokal-Sneath-I,"\frac{a}{a+2b+2c}" 6 | s,Sokal-Michener,"\frac{a+d}{a+b+c+d}" 7 | s,Sokal-Sneath-II,"\frac{2a+2d}{2a+b+c+2d}" 8 | s,Roger-Tanimoto,"\frac{a+d}{a+2b+2c+d}" 9 | s,Faith,"\frac{a+0.5d}{a+b+c+d}" 10 | s,Gower-Legendre,"\frac{a+d}{a+0.5b+0.5c+d}" 11 | s,Intersection,"a" 12 | s,Inner Product,"a+d" 13 | s,Russell-Rao,"\frac{a}{a+b+c+d}" 14 | s,Cosine,"\frac{a}{(a+b)(a+c)}" 15 | s,Gilbert-Wells,"\log a - \log p - \log \frac{a+b}{p} - \log \frac{a+c}{p}" 16 | s,Ochiai-I;Otsuka,"\frac{a}{\sqrt{(a+b)(a+c)}}" 17 | s,Forbesi,"\frac{pa}{(a+b)(a+c)}" 18 | s,Fossum,"\frac{n(a-0.5)^2}{(a+b)(a+c)}" 19 | s,Sorgenfrei,"\frac{a^2}{(a+b)(a+c)}" 20 | s,Mountford,"\frac{a}{0.5(ab + ac) + bc}" 21 | s,McConnaughey,"\frac{a^2 - bc}{(a+b)(a+c)}" 22 | s,Tarwid,"\frac{na - (a+b)(a+c)}{na + (a+b)(a+c)}" 23 | s,Kulczynski-II,"\frac{0.5a(2a+b+c)}{(a+b)(a+c)}" 24 | s,Driver-Kroeber,"\frac{a}{2}\left(\frac{1}{a+b}+\frac{1}{a+c}\right)" 25 | s,Johnson,"\frac{a}{a+b}+\frac{a}{a+c}" 26 | s,Dennis,"\frac{ad-bc}{\sqrt{n(a+b)(a+c)}}" 27 | s,Simpson,"\frac{a}{\min(a+b,a+c)}" 28 | s,Braun-Banquet,"\frac{a}{\max(a+b,a+c)}" 29 | 
s,Fager-McGowan,"\frac{a}{\sqrt{(a+b)(a+c)}}-\frac{max(a+b,a+c)}{2}" 30 | s,Forbes-II,"\frac{na-(a+b)(a+c)}{n \min(a+b,a+c) - (a+b)(a+c)}" 31 | s,Sokal-Sneath-IV,"\frac{1}{4}\left(\frac{a}{a+b}+\frac{a}{a+c}+\frac{d}{b+d}+\frac{d}{b+d}\right)" 32 | s,Gower,"\frac{a+d}{\sqrt{(a+b)(a+c)(b+d)(c+d)}}" 33 | s,Pearson-I,"\chi^2=\frac{n(ad-bc)^2}{(a+b)(a+c)(c+d)(b+d)}" 34 | s,Pearson-II,"\sqrt{\frac{\chi^2}{n+\chi^2}}" 35 | s,Pearson-II,"\sqrt{\frac{\rho}{n+\rho}}; \rho=\frac{ad-bc}{\sqrt{(a+b)(a+c)(b+d)(c+d)}}" 36 | s,Pearson-Heron-I,"\frac{ad-bc}{\sqrt{(a+b)(a+c)(b+d)(c+d)}}" 37 | s,Pearson-Heron-II,"\cos\left(\frac{\pi \sqrt{bc}}{\sqrt{ad}+\sqrt{bc}}\right)" 38 | s,Sokal-Sneath-III,"\frac{a+d}{b+c}" 39 | s,Sokal-Sneath-V,"\frac{ad}{(a+b)(a+c)(b+d)\sqrt{c+d}}" 40 | s,Cole,"\frac{\sqrt{2}(ad-bc)}{\sqrt{(ad-bc)^2-(a+b)(a+c)(b+d)(c+d)}}" 41 | s,Stiles,"\log_{10} \frac{n\left(|ad-bc|-\frac{n}{2}\right)^2}{(a+b)(a+c)(b+d)(c+d)}" 42 | s,Ochiai-II,"\frac{ad}{\sqrt{(a+b)(a+c)(b+d)(c+d)}}" 43 | s,Yule-Q,"\frac{ad-bc}{ad+bc}" 44 | s,Yule-w,"\frac{\sqrt{ad}-\sqrt{bc}}{\sqrt{ad}+\sqrt{bc}}" 45 | s,Kulczynski-I,"\frac{a}{b+c}" 46 | s,Tanimoto,"\frac{a}{2a+b+c}" 47 | s,Disperson,"\frac{ad-bc}{(a+b+c+d)^2}" 48 | s,Hamann,"\frac{(a+d)-(b+c)}{a+b+c+d}" 49 | s,Micahel,"\frac{4(ad-bc)}{(a+d)^2+(b+c)^2}" 50 | s,Goodman-Kruskal,"\frac{\sigma - \sigma'}{2n-\sigma'}; \sigma=\max(a,b)+\max(c,d)+\max(a,c)+\max(b,d); \sigma'=\max(a+c,b+d)+\max(a+b,c+d)" 51 | s,Anderberg,"\frac{\sigma-\sigma'}{2n}" 52 | s,Baroni-Urbani-Buser-I,"\frac{\sqrt{ad}+a}{\sqrt{ad}+a+b+c}" 53 | s,Baroni-Urbani-Buser-II,"\frac{\sqrt{ad}+a-(b+c)}{\sqrt{ad}+a+b+c}" 54 | s,Peirce,"\frac{ab+bc}{ab+2bc+cd}" 55 | s,Eyraud,"\frac{n^2(na-(a+b)(a+c))}{(a+b)(a+c)(b+d)(c+d)}" 56 | s,Trantuala,"\frac{a(c+d)}{c(a+b)}" 57 | s,Ample,"\left|\frac{a(c+d)}{c(a+b)}\right|" 58 | d,Hamming;Canberra;Manhattan;Cityblock;Minkowski,"b+c" 59 | d,Euclid,"\sqrt{b+c}" 60 | d,Squared-Euclid,"\sqrt{(b+c)^2}" 61 | d,Mean-Manhattan,"\frac{b+c}{a+b+c+d}" 62 | d,Vari,"\frac{b+c}{4a+4b+4c+4d}" 63 | d,Size Difference,"\frac{(b+c)^2}{(a+b+c+d)^2}" 64 | d,Shape Difference,"\frac{n(b+c)-(b-c)^2}{(a+b+c+d)^2}" 65 | d,Pattern Difference,"\frac{4bc}{(a+b+c+d)^2}" 66 | d,Lance-Williams;Bray-Curtis,"\frac{b+c}{2a+b+c}" 67 | d,Hellinger,"2\sqrt{1 - \frac{a}{\sqrt{(a+b)(a+c)}}}" 68 | d,Chord,"\sqrt{2\left(1 - \frac{a}{\sqrt{(a+b)(a+c)}}\right)}" 69 | d,Yule-Q,"\frac{2bc}{ad+bc}" -------------------------------------------------------------------------------- /misc/binary-measures.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | 3 | df = pd.read_csv('binary-measures.csv') 4 | print(df) 5 | print(df.columns) 6 | print(df.shape) 7 | print(len(df.equation.unique())) 8 | 9 | equations = [r.equation for _, r in df.iterrows()] 10 | print(len(equations)) 11 | m = {e: 0 for e in equations} 12 | for e in equations: 13 | m[e] += 1 14 | for e, c in m.items(): 15 | if c > 1: 16 | print(e, c) 17 | 18 | for _, r in df[df.type == 'd'].sort_values(['name']).iterrows(): 19 | e_name = r['name'] 20 | e_form = r.equation 21 | name = f' * - {e_name}' 22 | eqn = f' - :math:`{e_form}`' 23 | print(name) 24 | print(eqn) 25 | -------------------------------------------------------------------------------- /misc/count-measures.py: -------------------------------------------------------------------------------- 1 | from pypair.biserial import Biserial 2 | from pypair.contingency import BinaryTable, CategoricalTable, ConfusionMatrix 3 | from pypair.continuous 
import Concordance, CorrelationRatio, Continuous 4 | 5 | measures = [ 6 | ('Binary-Binary', BinaryTable.measures()), 7 | ('Confusion Matrix, Binary-Binary', ConfusionMatrix.measures()), 8 | ('Categorical-Categorical', CategoricalTable.measures()), 9 | ('Categorical-Continuous, Biserial', Biserial.measures()), 10 | ('Categorical-Continuous', CorrelationRatio.measures()), 11 | ('Ordinal-Ordinal, Concordance', Concordance.measures()), 12 | ('Continuous-Continuous', Continuous.measures()) 13 | ] 14 | print(sum([len(m) for _, m in measures])) 15 | 16 | for n, items in measures: 17 | title = f'{n} ({len(items)})' 18 | print(title) 19 | print('-' * len(title)) 20 | print('') 21 | for m in items: 22 | print(f'- {m}') 23 | print('') 24 | 25 | -------------------------------------------------------------------------------- /misc/ipynb/binary-binary.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 10, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "import pandas as pd\n", 10 | "\n", 11 | "get_data = lambda x, y, n: [(x, y) * 2 for _ in range(n)]\n", 12 | "data = get_data(1, 1, 207) + get_data(1, 0, 282) + get_data(0, 1, 231) + get_data(0, 0, 242)\n", 13 | "pdf = pd.DataFrame(data, columns=['x1', 'x2', 'x3', 'x4'])\n", 14 | "sdf = sqlContext.createDataFrame(pdf)" 15 | ] 16 | }, 17 | { 18 | "cell_type": "code", 19 | "execution_count": 13, 20 | "metadata": {}, 21 | "outputs": [ 22 | { 23 | "name": "stdout", 24 | "output_type": "stream", 25 | "text": [ 26 | "root\n", 27 | " |-- x1: long (nullable = true)\n", 28 | " |-- x2: long (nullable = true)\n", 29 | " |-- x3: long (nullable = true)\n", 30 | " |-- x4: long (nullable = true)\n", 31 | "\n" 32 | ] 33 | } 34 | ], 35 | "source": [ 36 | "sdf.printSchema()" 37 | ] 38 | }, 39 | { 40 | "cell_type": "code", 41 | "execution_count": 29, 42 | "metadata": {}, 43 | "outputs": [], 44 | "source": [ 45 | "from itertools import combinations\n", 46 | "\n", 47 | "def to_counts(d):\n", 48 | " def as_key(k1, k2):\n", 49 | " keys = sorted([k1, k2])\n", 50 | " return keys[0], keys[1]\n", 51 | " \n", 52 | " def as_count(v1, v2):\n", 53 | " a, b, c, d = 0, 0, 0, 0\n", 54 | " if v1 is not None and v2 is not None:\n", 55 | " if v1 == 1 and v2 == 1:\n", 56 | " a = 1\n", 57 | " elif v1 == 1 and v2 == 0:\n", 58 | " b = 1\n", 59 | " elif v1 == 0 and v2 == 1:\n", 60 | " c = 1\n", 61 | " else:\n", 62 | " d = 1\n", 63 | " return a, b, c, d\n", 64 | " \n", 65 | " def transform(k1, k2):\n", 66 | " v1, v2 = d[k1], d[k2]\n", 67 | " return as_key(k1, k2), as_count(v1, v2)\n", 68 | " \n", 69 | " return [transform(k1, k2) for k1, k2 in combinations(d.keys(), 2)]\n", 70 | "\n", 71 | "def add_counts(a, b):\n", 72 | " return a[0] + b[0], a[1] + b[1], a[2] + b[2], a[3] + b[3]\n", 73 | "\n", 74 | "# to_counts({'x1': 1, 'x2': 1, 'x3': 1, 'x4': 1})" 75 | ] 76 | }, 77 | { 78 | "cell_type": "code", 79 | "execution_count": 31, 80 | "metadata": {}, 81 | "outputs": [ 82 | { 83 | "data": { 84 | "text/plain": [ 85 | "[(('x1', 'x2'), (207, 282, 231, 242)),\n", 86 | " (('x1', 'x3'), (489, 0, 0, 473)),\n", 87 | " (('x1', 'x4'), (207, 282, 231, 242)),\n", 88 | " (('x2', 'x3'), (207, 231, 282, 242)),\n", 89 | " (('x2', 'x4'), (438, 0, 0, 524)),\n", 90 | " (('x3', 'x4'), (207, 282, 231, 242))]" 91 | ] 92 | }, 93 | "execution_count": 31, 94 | "metadata": {}, 95 | "output_type": "execute_result" 96 | } 97 | ], 98 | "source": [ 99 | "sdf.rdd\\\n", 100 | " .flatMap(lambda r: 
to_counts(r.asDict()))\\\n", 101 | " .reduceByKey(lambda a, b: add_counts(a, b))\\\n", 102 | " .sortByKey()\\\n", 103 | " .collect()" 104 | ] 105 | }, 106 | { 107 | "cell_type": "code", 108 | "execution_count": null, 109 | "metadata": {}, 110 | "outputs": [], 111 | "source": [] 112 | } 113 | ], 114 | "metadata": { 115 | "kernelspec": { 116 | "display_name": "Python 3", 117 | "language": "python", 118 | "name": "python3" 119 | }, 120 | "language_info": { 121 | "codemirror_mode": { 122 | "name": "ipython", 123 | "version": 3 124 | }, 125 | "file_extension": ".py", 126 | "mimetype": "text/x-python", 127 | "name": "python", 128 | "nbconvert_exporter": "python", 129 | "pygments_lexer": "ipython3", 130 | "version": "3.7.6" 131 | } 132 | }, 133 | "nbformat": 4, 134 | "nbformat_minor": 4 135 | } 136 | -------------------------------------------------------------------------------- /misc/ipynb/cat-cat.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 10, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "from random import choice\n", 10 | "\n", 11 | "\n", 12 | "x_domain = ['a', 'b', 'c']\n", 13 | "y_domain = ['a', 'b']\n", 14 | "\n", 15 | "get_x = lambda: choice(x_domain)\n", 16 | "get_y = lambda: choice(y_domain)\n", 17 | "get_data = lambda: {f'x{i}':v for i, v in enumerate((get_x(), get_y(), get_x(), get_y()))}\n", 18 | "\n", 19 | "data = [get_data() for _ in range(10)]" 20 | ] 21 | }, 22 | { 23 | "cell_type": "code", 24 | "execution_count": 19, 25 | "metadata": {}, 26 | "outputs": [], 27 | "source": [ 28 | "from itertools import combinations, chain\n", 29 | "\n", 30 | "def to_count(d):\n", 31 | " def count(k1, k2):\n", 32 | " tups = [(k1, d[k1]), (k2, d[k2])]\n", 33 | " tups = sorted(tups, key=lambda t: t[0])\n", 34 | " \n", 35 | " return (tups[0][0], tups[1][0], tups[0][1], tups[1][1]), 1\n", 36 | " \n", 37 | " return [count(k1, k2) for k1, k2 in combinations(d.keys(), 2)]\n", 38 | " \n", 39 | "t = map(lambda d: to_count(d), data)\n", 40 | "t = chain(*t)" 41 | ] 42 | }, 43 | { 44 | "cell_type": "code", 45 | "execution_count": 20, 46 | "metadata": {}, 47 | "outputs": [ 48 | { 49 | "data": { 50 | "text/plain": [ 51 | "[(('x0', 'x1', 'c', 'a'), 1),\n", 52 | " (('x0', 'x2', 'c', 'c'), 1),\n", 53 | " (('x0', 'x3', 'c', 'a'), 1),\n", 54 | " (('x1', 'x2', 'a', 'c'), 1),\n", 55 | " (('x1', 'x3', 'a', 'a'), 1),\n", 56 | " (('x2', 'x3', 'c', 'a'), 1),\n", 57 | " (('x0', 'x1', 'a', 'a'), 1),\n", 58 | " (('x0', 'x2', 'a', 'b'), 1),\n", 59 | " (('x0', 'x3', 'a', 'b'), 1),\n", 60 | " (('x1', 'x2', 'a', 'b'), 1),\n", 61 | " (('x1', 'x3', 'a', 'b'), 1),\n", 62 | " (('x2', 'x3', 'b', 'b'), 1),\n", 63 | " (('x0', 'x1', 'b', 'a'), 1),\n", 64 | " (('x0', 'x2', 'b', 'a'), 1),\n", 65 | " (('x0', 'x3', 'b', 'b'), 1),\n", 66 | " (('x1', 'x2', 'a', 'a'), 1),\n", 67 | " (('x1', 'x3', 'a', 'b'), 1),\n", 68 | " (('x2', 'x3', 'a', 'b'), 1),\n", 69 | " (('x0', 'x1', 'a', 'b'), 1),\n", 70 | " (('x0', 'x2', 'a', 'a'), 1),\n", 71 | " (('x0', 'x3', 'a', 'a'), 1),\n", 72 | " (('x1', 'x2', 'b', 'a'), 1),\n", 73 | " (('x1', 'x3', 'b', 'a'), 1),\n", 74 | " (('x2', 'x3', 'a', 'a'), 1),\n", 75 | " (('x0', 'x1', 'b', 'a'), 1),\n", 76 | " (('x0', 'x2', 'b', 'a'), 1),\n", 77 | " (('x0', 'x3', 'b', 'a'), 1),\n", 78 | " (('x1', 'x2', 'a', 'a'), 1),\n", 79 | " (('x1', 'x3', 'a', 'a'), 1),\n", 80 | " (('x2', 'x3', 'a', 'a'), 1),\n", 81 | " (('x0', 'x1', 'b', 'a'), 1),\n", 82 | " (('x0', 'x2', 'b', 'b'), 1),\n", 83 | " 
(('x0', 'x3', 'b', 'b'), 1),\n", 84 | " (('x1', 'x2', 'a', 'b'), 1),\n", 85 | " (('x1', 'x3', 'a', 'b'), 1),\n", 86 | " (('x2', 'x3', 'b', 'b'), 1),\n", 87 | " (('x0', 'x1', 'c', 'b'), 1),\n", 88 | " (('x0', 'x2', 'c', 'a'), 1),\n", 89 | " (('x0', 'x3', 'c', 'a'), 1),\n", 90 | " (('x1', 'x2', 'b', 'a'), 1),\n", 91 | " (('x1', 'x3', 'b', 'a'), 1),\n", 92 | " (('x2', 'x3', 'a', 'a'), 1),\n", 93 | " (('x0', 'x1', 'b', 'b'), 1),\n", 94 | " (('x0', 'x2', 'b', 'b'), 1),\n", 95 | " (('x0', 'x3', 'b', 'a'), 1),\n", 96 | " (('x1', 'x2', 'b', 'b'), 1),\n", 97 | " (('x1', 'x3', 'b', 'a'), 1),\n", 98 | " (('x2', 'x3', 'b', 'a'), 1),\n", 99 | " (('x0', 'x1', 'a', 'a'), 1),\n", 100 | " (('x0', 'x2', 'a', 'a'), 1),\n", 101 | " (('x0', 'x3', 'a', 'b'), 1),\n", 102 | " (('x1', 'x2', 'a', 'a'), 1),\n", 103 | " (('x1', 'x3', 'a', 'b'), 1),\n", 104 | " (('x2', 'x3', 'a', 'b'), 1),\n", 105 | " (('x0', 'x1', 'b', 'a'), 1),\n", 106 | " (('x0', 'x2', 'b', 'b'), 1),\n", 107 | " (('x0', 'x3', 'b', 'a'), 1),\n", 108 | " (('x1', 'x2', 'a', 'b'), 1),\n", 109 | " (('x1', 'x3', 'a', 'a'), 1),\n", 110 | " (('x2', 'x3', 'b', 'a'), 1)]" 111 | ] 112 | }, 113 | "execution_count": 20, 114 | "metadata": {}, 115 | "output_type": "execute_result" 116 | } 117 | ], 118 | "source": [ 119 | "list(t)" 120 | ] 121 | }, 122 | { 123 | "cell_type": "code", 124 | "execution_count": null, 125 | "metadata": {}, 126 | "outputs": [], 127 | "source": [] 128 | } 129 | ], 130 | "metadata": { 131 | "kernelspec": { 132 | "display_name": "Python 3", 133 | "language": "python", 134 | "name": "python3" 135 | }, 136 | "language_info": { 137 | "codemirror_mode": { 138 | "name": "ipython", 139 | "version": 3 140 | }, 141 | "file_extension": ".py", 142 | "mimetype": "text/x-python", 143 | "name": "python", 144 | "nbconvert_exporter": "python", 145 | "pygments_lexer": "ipython3", 146 | "version": "3.7.3" 147 | } 148 | }, 149 | "nbformat": 4, 150 | "nbformat_minor": 2 151 | } 152 | -------------------------------------------------------------------------------- /misc/scratch.py: -------------------------------------------------------------------------------- 1 | from pypair.contingency import CategoricalTable, BinaryTable, ConfusionMatrix, AgreementTable 2 | 3 | a = [0, 0, 0, 0, 0, 1, 1, 1, 1, 1] 4 | b = [0, 0, 0, 1, 1, 0, 0, 1, 1, 1] 5 | 6 | cat = CategoricalTable(a, b) 7 | bin = BinaryTable(a, b) 8 | con = ConfusionMatrix(a, b) 9 | agr = AgreementTable(a, b) 10 | 11 | print(cat.measures()) 12 | print(CategoricalTable.measures()) 13 | print('-' * 15) 14 | print(bin.measures()) 15 | print(BinaryTable.measures()) 16 | print('-' * 15) 17 | print(con.measures()) 18 | print(ConfusionMatrix.measures()) 19 | print('-' * 15) 20 | print(agr.measures()) 21 | print(AgreementTable.measures()) 22 | 23 | print('~' * 15) 24 | print('~' * 15) 25 | 26 | 27 | def print_measures(computer): 28 | r = {m: computer.get(m) for m in computer.measures()} 29 | print(r) 30 | 31 | 32 | print_measures(cat) 33 | print_measures(bin) 34 | print_measures(con) 35 | print_measures(agr) 36 | -------------------------------------------------------------------------------- /publish.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | SOURCE_DIST=/py-pair/dist/pypair-${API_VERSION}.tar.gz 4 | 5 | buildCode() { 6 | echo "start the build" 7 | cd /py-pair \ 8 | && make clean \ 9 | && make \ 10 | && twine check dist/* 11 | } 12 | 13 | updateVersion() { 14 | echo "replace version of software to ${API_VERSION}" 15 | sed -i "s/version = 
0.0.1/version = ${API_VERSION}/g" /py-pair/setup.cfg 16 | } 17 | 18 | copyCredentials() { 19 | if [[ -f /py-pair/.pypirc ]]; then 20 | echo "copying over .pypirc" 21 | cp /py-pair/.pypirc /root/.pypirc 22 | fi 23 | } 24 | 25 | publish() { 26 | echo "python publish" 27 | 28 | if [[ -f /root/.pypirc ]]; then 29 | if [[ -f ${SOURCE_DIST} ]]; then 30 | echo "uploading source" 31 | cd /py-pair \ 32 | && make clean \ 33 | && python setup.py sdist \ 34 | && twine upload --repository ${PYPI_REPO} ${SOURCE_DIST} 35 | else 36 | echo "no ${SOURCE_DIST} found!" 37 | fi 38 | else 39 | echo "no .pypirc found!" 40 | fi 41 | } 42 | 43 | cleanUp() { 44 | if [[ -f /root/.pypirc ]]; then 45 | echo "cleaning up" 46 | rm -f /root/.pypirc 47 | fi 48 | } 49 | 50 | build() { 51 | echo "python build" 52 | buildCode 53 | publish 54 | } 55 | 56 | conda init bash 57 | . /root/.bashrc 58 | updateVersion 59 | copyCredentials 60 | build 61 | cleanUp 62 | 63 | echo "done!" -------------------------------------------------------------------------------- /pypair/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/oneoffcoder/py-pair/79e6e75c32333fc9421aad36c02c2e1043ae4a05/pypair/__init__.py -------------------------------------------------------------------------------- /pypair/association.py: -------------------------------------------------------------------------------- 1 | from pypair.biserial import Biserial 2 | from pypair.contingency import BinaryTable, CategoricalTable, ConfusionMatrix, AgreementTable 3 | from pypair.continuous import Concordance, CorrelationRatio, Continuous 4 | 5 | 6 | def confusion(a, b, measure='acc', a_0=0, a_1=1, b_0=0, b_1=1): 7 | """ 8 | Gets the specified confusion matrix stats. 9 | 10 | :param a: Binary variable (iterable). 11 | :param b: Binary variable (iterable). 12 | :param measure: Measure. Default is `acc`. 13 | :param a_0: The a zero value. Default 0. 14 | :param a_1: The a one value. Default 1. 15 | :param b_0: The b zero value. Default 0. 16 | :param b_1: The b one value. Default 1. 17 | :return: Measure. 18 | """ 19 | if measure not in ConfusionMatrix.measures(): 20 | raise ValueError(f'{measure} is not a valid association measure.') 21 | return ConfusionMatrix(a, b, a_0=a_0, a_1=a_1, b_0=b_0, b_1=b_1).get(measure) 22 | 23 | 24 | def binary_binary(a, b, measure='chisq', a_0=0, a_1=1, b_0=0, b_1=1): 25 | """ 26 | Gets the binary-binary association. 27 | 28 | :param a: Binary variable (iterable). 29 | :param b: Binary variable (iterable). 30 | :param measure: Measure. Default is `chisq`. 31 | :param a_0: The a zero value. Default 0. 32 | :param a_1: The a one value. Default 1. 33 | :param b_0: The b zero value. Default 0. 34 | :param b_1: The b one value. Default 1. 35 | :return: Measure. 36 | """ 37 | if measure not in BinaryTable.measures(): 38 | raise ValueError(f'{measure} is not a valid association measure.') 39 | return BinaryTable(a, b, a_0=a_0, a_1=a_1, b_0=b_0, b_1=b_1).get(measure) 40 | 41 | 42 | def categorical_categorical(a, b, measure='chisq', a_vals=None, b_vals=None): 43 | """ 44 | Gets the categorical-categorical association. 45 | 46 | :param a: Categorical variable (iterable). 47 | :param b: Categorical variable (iterable). 48 | :param measure: Measure. Default is `chisq`. 49 | :param a_vals: The unique values in `a`. 50 | :param b_vals: The unique values in `b`. 51 | :return: Measure. 
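A minimal usage sketch (the data below is made up purely for illustration)::

    from pypair.association import categorical_categorical

    a = ['red', 'blue', 'red', 'green', 'blue', 'red']
    b = ['hot', 'cold', 'hot', 'cold', 'cold', 'hot']

    chisq = categorical_categorical(a, b, measure='chisq')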
52 | """ 53 | if measure not in CategoricalTable.measures(): 54 | raise ValueError(f'{measure} is not a valid association measure.') 55 | return CategoricalTable(a, b, a_vals=a_vals, b_vals=b_vals).get(measure) 56 | 57 | 58 | def agreement(a, b, measure='chohen_k', a_vals=None, b_vals=None): 59 | """ 60 | Gets the agreement association. 61 | 62 | :param a: Categorical variable (iterable). 63 | :param b: Categorical variable (iterable). 64 | :param measure: Measure. Default is `chohen_k`. 65 | :param a_vals: The unique values in `a`. 66 | :param b_vals: The unique values in `b`. 67 | :return: Measure. 68 | """ 69 | if measure not in AgreementTable.measures(): 70 | raise ValueError(f'{measure} is not a valid association measure.') 71 | return AgreementTable(a, b, a_vals=a_vals, b_vals=b_vals).get(measure) 72 | 73 | 74 | def binary_continuous(b, c, measure='biserial', b_0=0, b_1=1): 75 | """ 76 | Gets the binary-continuous association. 77 | 78 | :param b: Binary variable (iterable). 79 | :param c: Continuous variable (iterable). 80 | :param measure: Measure. Default is `biserial`. 81 | :param b_0: Value when `b` is zero. Default 0. 82 | :param b_1: Value when `b` is one. Default is 1. 83 | :return: Measure. 84 | """ 85 | if measure not in Biserial.measures(): 86 | raise ValueError(f'{measure} is not a valid association measure.') 87 | return Biserial(b, c, b_0=b_0, b_1=b_1).get(measure) 88 | 89 | 90 | def categorical_continuous(x, y, measure='eta'): 91 | """ 92 | Gets the categorical-continuous association. 93 | 94 | :param x: Categorical variable (iterable). 95 | :param y: Continuous variable (iterable). 96 | :param measure: Measure. Default is `eta`. 97 | :return: Measure. 98 | """ 99 | if measure not in CorrelationRatio.measures(): 100 | raise ValueError(f'{measure} is not a valid association measure.') 101 | return CorrelationRatio(x, y).get(measure) 102 | 103 | 104 | def concordance(x, y, measure='kendall_tau'): 105 | """ 106 | Gets the specified concordance between the two variables. 107 | 108 | :param x: Continuous or ordinal variable (iterable). 109 | :param y: Continuous or ordinal variable (iterable). 110 | :param measure: Measure. Default is `kendall_tau`. 111 | :return: Measure. 112 | """ 113 | if measure not in Concordance.measures(): 114 | raise ValueError(f'{measure} is not a valid association measure.') 115 | return Concordance(x, y).get(measure) 116 | 117 | 118 | def continuous_continuous(x, y, measure='pearson'): 119 | """ 120 | Gets the continuous-continuous association. 121 | 122 | :param x: Continuous variable (iterable). 123 | :param y: Continuous variable (iterable). 124 | :param measure: Measure. Default is 'pearson'. 125 | :return: Measure. 126 | """ 127 | if measure not in Continuous.measures(): 128 | raise ValueError(f'{measure} is not a valid association measure.') 129 | return Continuous(x, y).get(measure) 130 | -------------------------------------------------------------------------------- /pypair/biserial.py: -------------------------------------------------------------------------------- 1 | from functools import lru_cache 2 | from math import sqrt 3 | 4 | import pandas as pd 5 | from scipy.stats import norm 6 | 7 | from pypair.util import MeasureMixin 8 | 9 | 10 | class BiserialMixin(object): 11 | """ 12 | Biserial computations based off of :math:`n, p, q, y_0, y_1, \\sigma`. 13 | 14 | """ 15 | 16 | @property 17 | @lru_cache(maxsize=None) 18 | def __params(self): 19 | """ 20 | Gets the parameters associated with the data. 
21 | 22 | - n: total 23 | - p: P(b=0) 24 | - q: 1 - p 25 | - y_0: average of c when b=0 26 | - y_1: average of c when b=1 27 | - std: standard deviation of c 28 | 29 | :return: n, p, q, y_0, y_1, std 30 | """ 31 | return self._n, self._p, self._q, self._y_0, self._y_1, self._std 32 | 33 | @property 34 | @lru_cache(maxsize=None) 35 | def biserial(self): 36 | """ 37 | Computes the biserial correlation between a binary and continuous variable. The biserial correlation 38 | :math:`r_b` can be computed from the point-biserial correlation :math:`r_{\\mathrm{pb}}` as follows. 39 | 40 | :math:`r_b = \\frac{r_{\\mathrm{pb}}}{h} \\sqrt{pq}` 41 | 42 | The tricky thing to explain is the :math:`h` parameter. :math:`h` is defined as the 43 | height of the standard normal distribution at z, where :math:`P(z'z) = p`. 44 | The way to get :math:`h` in practice is take the inverse standard normal of :math:`q`, and 45 | then take the standard normal probability of that result. Using Scipy `norm.pdf(norm.ppf(q))`. 46 | 47 | References 48 | 49 | - `Point-Biserial Correlation & Biserial Correlation: Definition, Examples `_ 50 | - `Point-Biserial and Biserial Correlations `_ 51 | - `Real Statistics Using Excel `_ 52 | - `NORM.S.DIST function `_ 53 | - `NORM.S.INV function `_ 54 | - `scipy.stats.norm `_ 55 | - `How to calculate the inverse of the normal cumulative distribution function in python? `_ 56 | 57 | :return: Biserial correlation coefficient. 58 | """ 59 | n, p, q, y_0, y_1, std = self.__params 60 | 61 | r_pb = (y_1 - y_0) * sqrt(p * q) / std 62 | 63 | y = norm.pdf(norm.ppf(q)) 64 | r_b = r_pb * sqrt(p * q) / y 65 | 66 | return r_b 67 | 68 | @property 69 | @lru_cache(maxsize=None) 70 | def point_biserial(self): 71 | """ 72 | Computes the `point-biserial correlation coefficient `_ 73 | between a binary variable :math:`X` and a continuous variable :math:`Y`. 74 | 75 | :math:`r_{\\mathrm{pb}} = \\frac{(Y_1 - Y_0) \\sqrt{pq}}{\\sigma_Y}` 76 | 77 | Where 78 | 79 | - :math:`Y_0` is the average of :math:`Y` when :math:`X=0` 80 | - :math:`Y_1` is the average of :math:`Y` when :math:`X=1` 81 | - :math:`\\sigma_Y`` is the standard deviation of :math:`Y` 82 | - :math:`p` is :math:`P(X=1)` 83 | - :math:`q` is :math:`1 - p` 84 | 85 | :return: Point-biserial correlation coefficient. 86 | """ 87 | n, p, q, y_0, y_1, std = self.__params 88 | 89 | r = (y_1 - y_0) * sqrt(p * q) / std 90 | return r 91 | 92 | @property 93 | @lru_cache(maxsize=None) 94 | def rank_biserial(self): 95 | """ 96 | Computes the rank-biserial correlation between a binary variable :math:`X` and a continuous variable :math:`Y`. 97 | 98 | :math:`r_r = \\frac{2 (Y_1 - Y_0)}{n}` 99 | 100 | Where 101 | 102 | - :math:`Y_0` is the average of :math:`Y` when :math:`X=0` 103 | - :math:`Y_1` is the average of :math:`Y` when :math:`X=1` 104 | - :math:`n` is the total number of data 105 | 106 | :return: Rank-biserial correlation. 107 | """ 108 | n, p, q, y_0, y_1, std = self.__params 109 | 110 | r = 2 * (y_1 - y_0) / n 111 | return r 112 | 113 | 114 | class Biserial(MeasureMixin, BiserialMixin, object): 115 | """ 116 | Biserial association between a binary and continuous variable. 117 | """ 118 | 119 | def __init__(self, b, c, b_0=0, b_1=1): 120 | """ 121 | ctor. 122 | 123 | :param b: Binary variable (iterable). 124 | :param c: Continuous variable (iterable). 125 | :param b_0: Value for b is zero. Default 0. 126 | :param b_1: Value for b is one. Default 1. 
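Example (a sketch with made-up data)::

    from pypair.biserial import Biserial

    b = [0, 0, 0, 1, 1, 1]
    c = [2.3, 1.8, 2.1, 4.5, 3.9, 4.2]

    bis = Biserial(b, c)
    r_pb = bis.point_biserial
    r_b = bis.biserial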
127 | """ 128 | df = pd.DataFrame([(x, y) for x, y in zip(b, c) if pd.notna(x)], columns=['b', 'c']) 129 | 130 | n = df.shape[0] 131 | p = df[df.b == b_1].shape[0] / n 132 | q = 1.0 - p 133 | 134 | y_0 = df[df.b == b_0].c.mean() 135 | y_1 = df[df.b == b_1].c.mean() 136 | std = df.c.std() 137 | 138 | self._n = n 139 | self._p = p 140 | self._q = q 141 | self._y_0 = y_0 142 | self._y_1 = y_1 143 | self._std = std 144 | 145 | 146 | class BiserialStats(MeasureMixin, BiserialMixin, object): 147 | """ 148 | Computes biserial stats. 149 | """ 150 | 151 | def __init__(self, n, p, y_0, y_1, std): 152 | """ 153 | ctor. 154 | 155 | :param n: Total number of samples. 156 | :param p: :math:`P(Y|X=0)`. 157 | :param y_0: Average of :math:`Y` when :math:`X=0`. :math:`\\bar{Y}_0` 158 | :param y_1: Average of :math:`Y` when :math:`X=1`. :math:`\\bar{Y}_1` 159 | :param std: Standard deviation of :math:`Y`, :math:`\\sigma`. 160 | """ 161 | self._n = n 162 | self._p = p 163 | self._q = 1.0 - p 164 | self._y_0 = y_0 165 | self._y_1 = y_1 166 | self._std = std 167 | -------------------------------------------------------------------------------- /pypair/continuous.py: -------------------------------------------------------------------------------- 1 | from functools import reduce, lru_cache 2 | from itertools import combinations 3 | from math import sqrt 4 | 5 | import pandas as pd 6 | from scipy.stats import pearsonr, spearmanr, kendalltau, f_oneway, kruskal, linregress 7 | from sklearn.metrics import silhouette_score, calinski_harabasz_score, davies_bouldin_score 8 | 9 | from pypair.util import MeasureMixin 10 | 11 | 12 | class ConcordantCounts(object): 13 | """ 14 | Stores the concordance, discordant and tie counts. 15 | """ 16 | 17 | def __init__(self, d, t_xy, t_x, t_y, c): 18 | """ 19 | ctor. 20 | 21 | :param d: Discordant. 22 | :param t_xy: Tie. 23 | :param t_x: Tie on X. 24 | :param t_y: Tie on Y. 25 | :param c: Concordant. 26 | """ 27 | self.d = d 28 | self.t_xy = t_xy 29 | self.t_x = t_x 30 | self.t_y = t_y 31 | self.c = c 32 | 33 | def __add__(self, other): 34 | d = self.d + other.d 35 | t_xy = self.t_xy + other.t_xy 36 | t_x = self.t_x + other.t_x 37 | t_y = self.t_y + other.t_y 38 | c = self.c + other.c 39 | return ConcordantCounts(d, t_xy, t_x, t_y, c) 40 | 41 | 42 | class Continuous(MeasureMixin, object): 43 | def __init__(self, a, b): 44 | """ 45 | ctor. 46 | 47 | :param a: Continuous variable (iterable). 48 | :param b: Continuous variable (iterable). 49 | """ 50 | self.__a = a 51 | self.__b = b 52 | 53 | @property 54 | @lru_cache(maxsize=None) 55 | def pearson(self): 56 | """ 57 | `Pearson's r `_. 58 | 59 | :return: Pearson's r, p-value. 60 | """ 61 | return pearsonr(self.__a, self.__b) 62 | 63 | @property 64 | @lru_cache(maxsize=None) 65 | def spearman(self): 66 | """ 67 | `Spearman's r `_. 68 | 69 | :return: Spearman's r, p-value. 70 | """ 71 | r = spearmanr(self.__a, self.__b) 72 | return r.correlation, r.pvalue 73 | 74 | @property 75 | @lru_cache(maxsize=None) 76 | def kendall(self): 77 | """ 78 | `Kendall's tau `_. 79 | 80 | :return: Kendall's tau, p-value. 81 | """ 82 | r = kendalltau(self.__a, self.__b) 83 | return r.correlation, r.pvalue 84 | 85 | @property 86 | @lru_cache(maxsize=None) 87 | def regression(self): 88 | """ 89 | `Line regression `_. 
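That is, a simple linear regression fit via ``scipy.stats.linregress``; only the
``r_value`` and ``p_value`` of the fit are surfaced::

    slope, intercept, r_value, p_value, std_err = linregress(a, b)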
90 | 91 | :return: Coefficient, p-value 92 | """ 93 | slope, intercept, r_value, p_value, std_err = linregress(self.__a, self.__b) 94 | return r_value, p_value 95 | 96 | 97 | class CorrelationRatio(MeasureMixin, object): 98 | """ 99 | `Correlation ratio `_. 100 | 101 | """ 102 | 103 | def __init__(self, x, y): 104 | """ 105 | ctor. 106 | 107 | :param x: Categorical variable (iterable). 108 | :param y: Continuous variable (iterable). 109 | """ 110 | is_valid = lambda a, b: a is not None and b is not None 111 | self.__df = pd.DataFrame([(a, b) for a, b, in zip(x, y) if is_valid(a, b)], columns=['x', 'y']) 112 | 113 | @property 114 | @lru_cache(maxsize=None) 115 | def __mean(self): 116 | """ 117 | Gets the mean of :math:`\\bar{y}`. 118 | 119 | :return: :math:`\\bar{y}`. 120 | """ 121 | return self.__df.y.mean() 122 | 123 | @property 124 | @lru_cache(maxsize=None) 125 | def __sigma_cat(self): 126 | """ 127 | Gets :math:`\\sigma_{\\bar{y}}^2` 128 | 129 | :return: :math:`\\sigma_{\\bar{y}}^2`. 130 | """ 131 | stats = self.__df.groupby(['x']).agg(['count', 'mean']).reset_index() 132 | stats.columns = stats.columns.droplevel(0) 133 | stats = stats.rename(columns={'': 'x', 'count': 'n_x', 'mean': 'y_x'}) 134 | y = self.__mean 135 | 136 | sigma = sum([r.n_x * (r.y_x - y) ** 2 for _, r in stats.iterrows()]) 137 | 138 | return sigma 139 | 140 | @property 141 | def __sigma_sam(self): 142 | """ 143 | Gets :math:`\\sigma_{y}^2` 144 | 145 | :return: :math:`\\sigma_{y}^2`. 146 | """ 147 | y = self.__mean 148 | sigma = sum((self.__df.y - y) ** 2) 149 | 150 | return sigma 151 | 152 | @property 153 | @lru_cache(maxsize=None) 154 | def eta_squared(self): 155 | """ 156 | Gets :math:`\\eta^2 = \\frac{\\sigma_{\\bar{y}}^2}{\\sigma_{y}^2}` 157 | 158 | :return: :math:`\\eta^2`. 159 | """ 160 | sigma_cat = self.__sigma_cat 161 | sigma_sam = self.__sigma_sam 162 | eta = sigma_cat / sigma_sam 163 | return eta 164 | 165 | @property 166 | @lru_cache(maxsize=None) 167 | def eta(self): 168 | """ 169 | Gets :math:`\\eta`. 170 | 171 | :returns: :math:`\\eta`. 172 | """ 173 | return sqrt(self.eta_squared) 174 | 175 | @property 176 | @lru_cache(maxsize=None) 177 | def anova(self): 178 | """ 179 | Computes an `ANOVA test `_. 180 | 181 | :return: F-statistic, p-value. 182 | """ 183 | df = self.__df 184 | samples = [df[df.x == x].y for x in df.x.unique()] 185 | r = f_oneway(*samples) 186 | return r.statistic, r.pvalue 187 | 188 | @property 189 | @lru_cache(maxsize=None) 190 | def kruskal(self): 191 | """ 192 | Computes the `Kruskal-Wallis H-test `_. 193 | 194 | :return: H-statistic, p-value. 195 | """ 196 | df = self.__df 197 | samples = [df[df.x == x].y for x in df.x.unique()] 198 | r = kruskal(*samples) 199 | return r.statistic, r.pvalue 200 | 201 | @property 202 | @lru_cache(maxsize=None) 203 | def silhouette(self): 204 | """ 205 | `Silhouette coefficient `_. 206 | 207 | :return: Silhouette coefficient. 208 | """ 209 | labels = self.__df.x 210 | X = self.__df[['y']] 211 | return silhouette_score(X, labels) 212 | 213 | @property 214 | @lru_cache(maxsize=None) 215 | def davies_bouldin(self): 216 | """ 217 | `Davies-Bouldin Index `_. 218 | 219 | :return: Davies-Bouldin Index. 220 | """ 221 | labels = self.__df.x 222 | X = self.__df[['y']] 223 | return davies_bouldin_score(X, labels) 224 | 225 | @property 226 | @lru_cache(maxsize=None) 227 | def calinski_harabasz(self): 228 | """ 229 | `Calinski-Harabasz Index `_. 230 | 231 | :return: Calinski-Harabasz Index. 
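Example (a sketch with made-up data; the categories act as cluster labels and
the continuous variable as the lone feature)::

    from pypair.continuous import CorrelationRatio

    x = ['a', 'a', 'b', 'b', 'c', 'c']
    y = [45.0, 70.0, 40.0, 20.0, 65.0, 95.0]

    cr = CorrelationRatio(x, y)
    score = cr.calinski_harabasz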
232 | """ 233 | labels = self.__df.x 234 | X = self.__df[['y']] 235 | return calinski_harabasz_score(X, labels) 236 | 237 | 238 | class ConcordanceMixin(object): 239 | 240 | @property 241 | @lru_cache(maxsize=None) 242 | def __counts(self): 243 | return self._d, self._t_xy, self._t_x, self._t_y, self._c, self._n 244 | 245 | @property 246 | @lru_cache(maxsize=None) 247 | def __probs(self): 248 | n = self._n 249 | return self._d / n, self._t_xy / n, self._t_x / n, self._t_y / n, self._c / n, n 250 | 251 | @property 252 | @lru_cache(maxsize=None) 253 | def kendall_tau(self): 254 | """ 255 | Kendall's :math:`\\tau` is defined as follows. 256 | 257 | :math:`\\tau = \\frac{C - D}{{{n}\\choose{2}}}` 258 | 259 | Where 260 | 261 | - :math:`C` is the number of concordant pairs 262 | - :math:`D` is the number of discordant pairs 263 | - :math:`n` is the sample size 264 | 265 | :return: :math:`\\tau`. 266 | """ 267 | d, t_xy, t_x, t_y, c, n = self.__counts 268 | t = (c - d) / (n * (n - 1) / 2) 269 | return t 270 | 271 | @property 272 | @lru_cache(maxsize=None) 273 | def somers_d(self): 274 | """ 275 | Computes `Somers' d `_ for two continuous 276 | variables. Note that Somers' d is defined for :math:`d_{X \\cdot Y}` and :math:`d_{Y \\cdot X}` 277 | and in general :math:`d_{X \\cdot Y} \\neq d_{Y \\cdot X}`. 278 | 279 | - :math:`d_{Y \\cdot X} = \\frac{\\pi_c - \\pi_d}{\\pi_c + \\pi_d + \\pi_t^Y}` 280 | - :math:`d_{X \\cdot Y} = \\frac{\\pi_c - \\pi_d}{\\pi_c + \\pi_d + \\pi_t^X}` 281 | 282 | Where 283 | 284 | - :math:`\\pi_c = \\frac{C}{n}` 285 | - :math:`\\pi_d = \\frac{D}{n}` 286 | - :math:`\\pi_t^X = \\frac{T^X}{n}` 287 | - :math:`\\pi_t^Y = \\frac{T^Y}{n}` 288 | - :math:`C` is the number of concordant pairs 289 | - :math:`D` is the number of discordant pairs 290 | - :math:`T^X` is the number of ties on :math:`X` 291 | - :math:`T^Y` is the number of ties on :math:`Y` 292 | - :math:`n` is the sample size 293 | 294 | :return: :math:`d_{X \\cdot Y}`, :math:`d_{Y \\cdot X}`. 295 | """ 296 | p_d, p_txy, p_tx, p_ty, p_c, n = self.__probs 297 | 298 | d_yx = (p_c - p_d) / (p_c + p_d + p_ty) 299 | d_xy = (p_c - p_d) / (p_c + p_d + p_tx) 300 | 301 | return d_yx, d_xy 302 | 303 | @property 304 | @lru_cache(maxsize=None) 305 | def goodman_kruskal_gamma(self): 306 | """ 307 | Goodman-Kruskal :math:`\\gamma` is like Somer's D. It is defined as follows. 308 | 309 | :math:`\\gamma = \\frac{\\pi_c - \\pi_d}{1 - \\pi_t}` 310 | 311 | Where 312 | 313 | - :math:`\\pi_c = \\frac{C}{n}` 314 | - :math:`\\pi_d = \\frac{D}{n}` 315 | - :math:`\\pi_t = \\frac{T}{n}` 316 | - :math:`C` is the number of concordant pairs 317 | - :math:`D` is the number of discordant pairs 318 | - :math:`T` is the number of ties 319 | - :math:`n` is the sample size 320 | 321 | :return: :math:`\\gamma`. 322 | """ 323 | p_d, p_txy, p_tx, p_ty, p_c, n = self.__probs 324 | p_t = p_txy + p_tx + p_ty 325 | 326 | gamma = (p_c - p_d) / (1 - p_t) 327 | 328 | return gamma 329 | 330 | 331 | class Concordance(MeasureMixin, ConcordanceMixin, object): 332 | """ 333 | Concordance for continuous and ordinal data. 334 | """ 335 | 336 | def __init__(self, x, y): 337 | """ 338 | ctor. 339 | 340 | :param x: Continuous or ordinal data (iterable). 341 | :param y: Continuous or ordinal data (iterable). 
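Example::

    from pypair.continuous import Concordance

    x = [1, 2, 3, 4]
    y = [2, 3, 1, 4]

    con = Concordance(x, y)
    tau = con.kendall_tau
    gamma = con.goodman_kruskal_gamma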
342 | """ 343 | d, t_xy, t_x, t_y, c, n = Concordance.__to_counts(x, y) 344 | self._d = d 345 | self._t_xy = t_xy 346 | self._t_x = t_x 347 | self._t_y = t_y 348 | self._c = c 349 | self._n = n 350 | 351 | @staticmethod 352 | def __to_counts(x, y): 353 | """ 354 | Gets the count of concordance, discordance or tie. Two pairs of variables :math:`(X_i, Y_i)` 355 | and :math:`(X_j, Y_j)` are 356 | 357 | - concordant if :math:`X_i < X_j` and :math:`Y_i < Y_j` **or** :math:`X_i > X_j` and :math:`Y_i > Y_j`, 358 | - discordant if :math:`X_i < X_j` and :math:`Y_i > Y_j` **or** :math:`X_i > X_j` and :math:`Y_i < Y_j`, and 359 | - tied if :math:`X_i = X_j` and :math:`Y_i = Y_j`. 360 | 361 | Equivalently. 362 | 363 | - concordant if :math:`(X_j - X_i)(Y_j - Y_i) > 0` 364 | - discordant if :math:`(X_j - X_i)(Y_j - Y_i) < 0` 365 | - tied if :math:`(X_j - X_i)(Y_j - Y_i) = 0` 366 | 367 | Any two pairs of observations are necessarily concordant, discordant or tied. 368 | 369 | :return: Counts(D, T_XY, T_X, T_Y, C), n. 370 | """ 371 | 372 | def get_concordance(p1, p2): 373 | x_i, y_i = p1 374 | x_j, y_j = p2 375 | 376 | d = 0 377 | t_xy = 0 378 | t_x = 0 379 | t_y = 0 380 | c = 0 381 | 382 | r = (x_j - x_i) * (y_j - y_i) 383 | 384 | if r > 0: 385 | c = 1 386 | elif r < 0: 387 | d = 1 388 | else: 389 | if x_i == x_j and y_i == y_j: 390 | t_xy = 1 391 | elif x_i == x_j: 392 | t_x = 1 393 | elif y_i == y_j: 394 | t_y = 1 395 | 396 | return ConcordantCounts(d, t_xy, t_x, t_y, c) 397 | 398 | is_valid = lambda a, b: a is not None and b is not None 399 | data = [(a, b) for a, b in zip(x, y) if is_valid(a, b)] 400 | results = combinations(data, 2) 401 | results = map(lambda tup: get_concordance(tup[0], tup[1]), results) 402 | c = reduce(lambda c1, c2: c1 + c2, results) 403 | n = len(data) 404 | return c.d, c.t_xy, c.t_x, c.t_y, c.c, n 405 | 406 | 407 | class ConcordanceStats(MeasureMixin, ConcordanceMixin): 408 | """ 409 | Computes concordance stats. 410 | """ 411 | 412 | def __init__(self, d, t_xy, t_x, t_y, c, n): 413 | """ 414 | ctor. 415 | 416 | :param d: Number of discordant pairs. 417 | :param t_xy: Number of ties on XY pairs. 418 | :param t_x: Number of ties on X pairs. 419 | :param t_y: Number of ties on Y pairs. 420 | :param c: Number of concordant pairs. 421 | :param n: Total number of pairs. 422 | """ 423 | self._d = d 424 | self._t_xy = t_xy 425 | self._t_x = t_x 426 | self._t_y = t_y 427 | self._t_c = c 428 | self._c = c 429 | self._n = n 430 | -------------------------------------------------------------------------------- /pypair/decorator.py: -------------------------------------------------------------------------------- 1 | import time 2 | from functools import wraps 3 | 4 | 5 | def timeit(f): 6 | """ 7 | Benchmarks the time it takes (seconds) to execute. 8 | """ 9 | 10 | @wraps(f) 11 | def wrapper(*args, **kwargs): 12 | start = time.time() 13 | output = f(*args, **kwargs) 14 | # diff = time.time() - start 15 | time.time() - start 16 | # print(f'{f.__name__}: {diff}') 17 | return output 18 | 19 | return wrapper 20 | 21 | 22 | def similarity(f): 23 | """ 24 | Marker for similarity functions. 25 | """ 26 | 27 | @wraps(f) 28 | def wrapper(*args, **kwargs): 29 | return f(*args, **kwargs) 30 | 31 | return wrapper 32 | 33 | 34 | def distance(f): 35 | """ 36 | Marker for distance functions. 
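Example (``hamming`` is a hypothetical function; the decorator only tags it and
leaves its behavior unchanged)::

    @distance
    def hamming(a, b):
        return sum(1 for x, y in zip(a, b) if x != y)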
37 | """ 38 | 39 | @wraps(f) 40 | def wrapper(*args, **kwargs): 41 | return f(*args, **kwargs) 42 | 43 | return wrapper 44 | -------------------------------------------------------------------------------- /pypair/spark.py: -------------------------------------------------------------------------------- 1 | from collections import namedtuple 2 | from itertools import combinations, product, chain 3 | from math import sqrt 4 | 5 | from pypair.biserial import BiserialStats 6 | from pypair.contingency import ConfusionStats, CategoricalStats, \ 7 | BinaryStats, AgreementStats 8 | from pypair.continuous import ConcordanceStats 9 | 10 | 11 | def __as_key(k1, k2): 12 | """ 13 | Creates a key (tuple) out of the two specified. The key is always ordered. 14 | If k2 < k1, then (k2, k1), else, (k1, k2). 15 | 16 | :param k1: Key (string). 17 | :param k2: Key (string). 18 | :return: (k1, k2) or (k2, k1). 19 | """ 20 | keys = sorted([k1, k2]) 21 | return keys[0], keys[1] 22 | 23 | 24 | def __to_abcd_counts(d): 25 | """ 26 | Maps the paired keys in the dictionary and their associated values to a form: ``(k1, k2), (a, b, c, d)``. 27 | 28 | :param d: A dictionary. Names are variable names. Values are 0 or 1. 29 | :return: A list of tuples of the form: (k1, k2), (a, b, c, d). 30 | """ 31 | 32 | def as_count(v1, v2): 33 | """ 34 | Maps the specified values to a (TP or 11), b (FN or 10), c (FP or 01) and d (TN or 00). 35 | Only one of these will be 1, and the others will be 0. Look below for example. 36 | 37 | - 1, 1 = (1, 0, 0, 0) 38 | - 1, 0 = (0, 1, 0, 0) 39 | - 0, 1 = (0, 0, 1, 0) 40 | - 0, 0 = (0, 0, 0, 1) 41 | 42 | :param v1: Value (0 or 1). 43 | :param v2: Value (0 or 1). 44 | :return: a, b, c, d 45 | """ 46 | a, b, c, d = 0, 0, 0, 0 47 | if v1 is not None and v2 is not None: 48 | if v1 == 1 and v2 == 1: 49 | a = 1 50 | elif v1 == 1 and v2 == 0: 51 | b = 1 52 | elif v1 == 0 and v2 == 1: 53 | c = 1 54 | else: 55 | d = 1 56 | return a, b, c, d 57 | 58 | def transform(k1, k2): 59 | """ 60 | Transforms the keys and associated value to the form (a tuple of tuples): (k1, k2), (a, b, c, d). 61 | 62 | :param k1: Key (string). 63 | :param k2: Key (string). 64 | :return: (k1, k2), (a, b, c, d) 65 | """ 66 | v1, v2 = d[k1], d[k2] 67 | return __as_key(k1, k2), as_count(v1, v2) 68 | 69 | return [transform(k1, k2) for k1, k2 in combinations(d.keys(), 2)] 70 | 71 | 72 | def __add_abcd_counts(x, y): 73 | """ 74 | Adds two tuples. For example. 75 | 76 | :math:`x + y = (x_a + y_a, x_b + y_b, x_c + y_c, x_d + y_d)` 77 | 78 | :param x: Tuple (a, b, c, d). 79 | :param y: Tuple (a, b, c, d). 80 | :return: Tuple (a, b, c, d). 81 | """ 82 | return x[0] + y[0], x[1] + y[1], x[2] + y[2], x[3] + y[3] 83 | 84 | 85 | def __add_concordance_counts(x, y): 86 | """ 87 | Adds two tuples. For example. 88 | 89 | :math:`x + y = (x_d + y_d, x_t_{xy} + y_t_{xy}, x_t_x + y_t_x, x_t_y + y_t_y, x_c + y_c, x_n + y_n)` 90 | 91 | :param x: Tuple (d, t_xy, t_x, t_y, c, n). 92 | :param y: Tuple (d, t_xy, t_x, t_y, c, n). 93 | :return: Tuple (d, t_xy, t_x, t_y, c, n). 94 | """ 95 | return x[0] + y[0], x[1] + y[1], x[2] + y[2], x[3] + y[3], x[4] + y[4], x[5] + y[5] 96 | 97 | 98 | def __get_contingency_table(sdf): 99 | """ 100 | Gets the pairwise contingency tables. Each record in the pair-RDD returns has the following form. 
101 | 102 | `(k1, k2), (table, row_marginals, col_marginals, domain1, domain2)` 103 | 104 | - k1 is the name of a variable 105 | - k2 is the name of a variable 106 | - table is a list of list (a table, matrix) of counts 107 | - row_marginals contain the row marginals 108 | - col_marginals contain the column marginals 109 | - domain1 is a list of all the values of variable 1 110 | - domain2 is a list of all the values of variable 2 111 | 112 | :param sdf: Spark dataframe. 113 | :return: Spark pair-RDD. 114 | """ 115 | 116 | def to_count(d): 117 | def count(k1, k2): 118 | tups = [(k1, d[k1]), (k2, d[k2])] 119 | tups = sorted(tups, key=lambda t: t[0]) 120 | 121 | return (tups[0][0], tups[1][0], tups[0][1], tups[1][1]), 1 122 | 123 | return [count(k1, k2) for k1, k2 in combinations(d.keys(), 2)] 124 | 125 | def attach_domains(tup): 126 | key, d = tup 127 | v1 = sorted(list({k[0] for k, _ in d.items()})) 128 | v2 = sorted(list({k[1] for k, _ in d.items()})) 129 | 130 | return key, (d, v1, v2) 131 | 132 | def to_contingency_table(tup): 133 | key, (d, v1, v2) = tup 134 | table = [[d[(a, b)] if (a, b) in d else 0 for b in v2] for a in v1] 135 | 136 | return key, table 137 | 138 | return sdf.rdd \ 139 | .flatMap(lambda r: to_count(r.asDict())) \ 140 | .reduceByKey(lambda a, b: a + b) \ 141 | .map(lambda tup: ((tup[0][0], tup[0][1]), (tup[0][2], tup[0][3], tup[1]))) \ 142 | .map(lambda tup: (tup[0], {(tup[1][0], tup[1][1]): tup[1][2]})) \ 143 | .reduceByKey(lambda a, b: {**a, **b}) \ 144 | .map(lambda tup: attach_domains(tup)) \ 145 | .map(lambda tup: to_contingency_table(tup)) \ 146 | .sortByKey() 147 | 148 | 149 | def binary_binary(sdf): 150 | """ 151 | Gets all the pairwise binary-binary association measures. The result is a Spark pair-RDD, 152 | where the keys are tuples of variable names e.g. (k1, k2), and values are dictionaries 153 | of association names and measures e.g. {'phi': 1, 'lambda': 0.8}. Each record in the pair-RDD is of the form. 154 | 155 | - (k1, k2), {'phi': 1, 'lambda': 0.8, ...} 156 | 157 | :param sdf: Spark dataframe. Should be all 1's and 0's. 158 | :return: Spark pair-RDD. 159 | """ 160 | 161 | def to_results(counts): 162 | """ 163 | Converts the result of the contingency table counts to a dictionary of association measures. 164 | 165 | :param counts: Tuple of tuples: (k1, k2), (a, b, c, d). 166 | :return: (x1, x2), {'measure1': val1, 'measure2': val2, ...}. 167 | """ 168 | (x1, x2), (a, b, c, d) = counts 169 | 170 | computer = BinaryStats([[a + 1, b + 1], [c + 1, d + 1]]) 171 | measures = {m: computer.get(m) for m in computer.measures()} 172 | return (x1, x2), measures 173 | 174 | return sdf.rdd \ 175 | .flatMap(lambda r: __to_abcd_counts(r.asDict())) \ 176 | .reduceByKey(lambda a, b: __add_abcd_counts(a, b)) \ 177 | .sortByKey() \ 178 | .map(lambda counts: to_results(counts)) 179 | 180 | 181 | def confusion(sdf): 182 | """ 183 | Gets all the pairwise confusion matrix metrics. The result is a Spark pair-RDD, 184 | where the keys are tuples of variable names e.g. (k1, k2), and values are dictionaries 185 | of association names and metrics e.g. {'acc': 0.9, 'fpr': 0.2}. 186 | Each record in the pair-RDD is of the form. 187 | 188 | - (k1, k2), {'acc': 0.9, 'fpr': 0.2, ...} 189 | 190 | :param sdf: Spark dataframe. Should be all 1's and 0's. 191 | :return: Spark pair-RDD. 192 | """ 193 | 194 | def to_results(counts): 195 | """ 196 | Converts the result of the contingency table counts to a dictionary of association measures. 
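Note that each cell is clamped below at 1 (``max(1, tp)`` and friends, below)
before the stats are computed; this reads as a guard against degenerate zero
cells, at the cost of slightly biased estimates on small data. For example::

    (x1, x2), (0, 5, 0, 10)  ->  table [[1, 5], [1, 10]]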
197 | 198 | :param counts: Tuple of tuples: (x1, x2), (tp, fn, fp, tn). 199 | :return: (x1, x2), {'metric1': val1, 'metric2': val2, ...}. 200 | """ 201 | (x1, x2), (tp, fn, fp, tn) = counts 202 | 203 | tp = max(1, tp) 204 | fn = max(1, fn) 205 | fp = max(1, fp) 206 | tn = max(1, tn) 207 | 208 | computer = ConfusionStats([[tp, fn], [fp, tn]]) 209 | measures = {m: computer.get(m) for m in computer.measures()} 210 | return (x1, x2), measures 211 | 212 | return sdf.rdd \ 213 | .flatMap(lambda r: __to_abcd_counts(r.asDict())) \ 214 | .reduceByKey(lambda a, b: __add_abcd_counts(a, b)) \ 215 | .map(lambda counts: to_results(counts)) \ 216 | .sortByKey() 217 | 218 | 219 | def categorical_categorical(sdf): 220 | """ 221 | Gets all pairwise categorical-categorical association measures. The result is a Spark pair-RDD, 222 | where the keys are tuples of variable names e.g. (k1, k2), and values are dictionaries of 223 | association names and metrics e.g. {‘phi’: 0.9, ‘chisq’: 0.2}. Each record in the pair-RDD is of the form. 224 | 225 | - (k1, k2), {‘phi’: 0.9, ‘chisq’: 0.2, ...} 226 | 227 | :param sdf: Spark dataframe. Should be strings or whole numbers to represent the values. 228 | :return: Spark pair-RDD. 229 | """ 230 | 231 | def to_results(tup): 232 | key, table = tup 233 | computer = CategoricalStats(table) 234 | measures = {m: computer.get(m) for m in computer.measures()} 235 | return key, measures 236 | 237 | return __get_contingency_table(sdf) \ 238 | .map(lambda tup: to_results(tup)) \ 239 | .sortByKey() 240 | 241 | 242 | def agreement(sdf): 243 | """ 244 | Gets all pairwise categorical-categorical `agreement` association measures. The result is a Spark pair-RDD, 245 | where the keys are tuples of variable names e.g. (k1, k2), and values are dictionaries of 246 | association names and metrics e.g. {‘kappa’: 0.9, ‘delta’: 0.2}. Each record in the pair-RDD is of the form. 247 | 248 | - (k1, k2), {‘kappa’: 0.9, ‘delta’: 0.2, ...} 249 | 250 | :param sdf: Spark dataframe. Should be strings or whole numbers to represent the values. 251 | :return: Spark pair-RDD. 252 | """ 253 | 254 | def to_results(tup): 255 | key, table = tup 256 | computer = AgreementStats(table) 257 | measures = {m: computer.get(m) for m in computer.measures()} 258 | return key, measures 259 | 260 | return __get_contingency_table(sdf) \ 261 | .map(lambda tup: to_results(tup)) \ 262 | .sortByKey() 263 | 264 | 265 | def binary_continuous(sdf, binary, continuous, b_0=0, b_1=1): 266 | """ 267 | Gets all pairwise binary-continuous association measures. The result is a Spark pair-RDD, 268 | where the keys are tuples of variable names e.g. (k1, k2), and values are dictionaries of 269 | association names and metrics e.g. {‘biserial’: 0.9, ‘point_biserial’: 0.2}. Each record 270 | in the pair-RDD is of the form. 271 | 272 | - (k1, k2), {‘biserial’: 0.9, ‘point_biserial’: 0.2, ...} 273 | 274 | All the binary fields/columns should be encoded in the same way. For example, if you 275 | are using 1 and 0, then all binary fields should only have those values, not a mixture 276 | of 1 and 0, True and False, -1 and 1, etc. 277 | 278 | :param sdf: Spark dataframe. 279 | :param binary: List of fields that are binary. 280 | :param continuous: List of fields that are continuous. 281 | :param b_0: Zero value for binary field. 282 | :param b_1: One value for binary field. 283 | :return: Spark pair-RDD. 284 | """ 285 | 286 | def to_pair1(d): 287 | """ 288 | Creates a list of tuples. 289 | 290 | :param d: Dictionary of data. 
291 | :return: List of (b, c, b_val), (sum_c, sum_c_sq, sum_b). 292 | """ 293 | return [((b, c, d[b]), (d[c], d[c] ** 2, 1)) for b, c in product(*[binary, continuous])] 294 | 295 | def to_pair2(tup): 296 | """ 297 | Makes a new pair. 298 | 299 | :param tup: (b, c, b_val), (sum_c, sum_c_sq, sum_b) 300 | :return: (b, c), (b_val, sum_c, sum_c_sq, sum_b) 301 | """ 302 | (b, c, b_val), (sum_c, sum_c_sq, sum_b) = tup 303 | return (b, c), (b_val, sum_c, sum_c_sq, sum_b) 304 | 305 | def compute_stats(tup): 306 | """ 307 | `Computational formula for variance and standard deviation `_. 308 | 309 | - :math:`SS = \\sum (X - \\bar{X})^2 = \\sum X^2 - \\frac{\\left(\\sum X\\right)^2}{N}` 310 | - :math:`\\sigma^2 = \\frac{SS}{N - 1}` 311 | - :math:`\\sigma = \\sqrt{\\sigma^2}` 312 | 313 | :param tup: (b, c), [(b_val, sum_c, sum_c_sq, sum_b), (b_val, sum_c, sum_c_sq, sum_b)] 314 | :return: (b, c), (n, p, y_0, y_1, std) 315 | """ 316 | (b, c), data = tup 317 | 318 | data = list(data) 319 | data_0 = data[0] if data[0][0] == b_0 else data[1] 320 | data_1 = data[0] if data[0][0] == b_1 else data[0] 321 | 322 | _, sum_c_0, sum_c_sq_0, sum_b_0 = data_0 323 | _, sum_c_1, sum_c_sq_1, sum_b_1 = data_1 324 | 325 | n = sum_b_0 + sum_b_1 326 | p = sum_b_1 / n 327 | y_0 = sum_c_0 / sum_b_0 328 | y_1 = sum_c_1 / sum_b_1 329 | ss = (sum_c_sq_0 + sum_c_sq_1) - ((sum_c_0 + sum_c_1) ** 2 / n) 330 | v = ss / (n - 1) 331 | std = sqrt(v) 332 | 333 | return (b, c), (n, p, y_0, y_1, std) 334 | 335 | def to_results(tup): 336 | """ 337 | Computes the results. 338 | 339 | :param tup: (b, c), (n, p, y_0, y_1, std) 340 | :return: (b, c), {'measure1': val1, 'measure2': val2, ...} 341 | """ 342 | key, (n, p, y_0, y_1, std) = tup 343 | computer = BiserialStats(n, p, y_0, y_1, std) 344 | measures = {m: computer.get(m) for m in computer.measures()} 345 | return key, measures 346 | 347 | return sdf.rdd \ 348 | .flatMap(lambda r: to_pair1(r.asDict())) \ 349 | .reduceByKey(lambda x, y: (x[0] + y[0], x[1] + y[1], x[2] + y[2])) \ 350 | .map(lambda tup: to_pair2(tup)) \ 351 | .groupByKey() \ 352 | .map(lambda tup: compute_stats(tup)) \ 353 | .map(lambda tup: to_results(tup)) \ 354 | .sortByKey() 355 | 356 | 357 | def categorical_continuous(sdf, categorical, continuous): 358 | """ 359 | Gets all pairwise categorical-continuous association measures. The result is a Spark pair-RDD, 360 | where the keys are tuples of variable names e.g. (k1, k2), and values are dictionaries of 361 | association names and metrics e.g. {‘eta_sq’: 0.9, 'eta': 0.95}. Each record 362 | in the pair-RDD is of the form. 363 | 364 | - (k1, k2), {‘eta_sq’: 0.9, 'eta': 0.95} 365 | 366 | For now, only ``eta`` :math:`\\eta^2` is supported. 367 | 368 | :param sdf: Spark dataframe. 369 | :param categorical: List of categorical variables. 370 | :param continuous: List of continuous variables. 371 | :return: Spark pair-RDD. 372 | """ 373 | 374 | def to_pair1(d): 375 | """ 376 | Creates a list of tuples. 377 | 378 | :param d: Dictionary of data. 379 | :return: List of (b, c, b_val), (sum_c, sum_c_sq, sum_b). 380 | """ 381 | kv_0 = lambda cat, con: ((cat, con, d[cat]), (d[con], 0, 1)) 382 | kv_1 = lambda cat, con: ((cat, con, '__*_avg_*__'), (d[con], 0, 1)) 383 | kv_2 = lambda cat, con: ((cat, con, '__*_den_*__'), (d[con], d[con] ** 2, 1)) 384 | explode = lambda cat, con: [kv_0(cat, con), kv_1(cat, con), kv_2(cat, con)] 385 | return chain(*(explode(cat, con) for cat, con in product(*[categorical, continuous]))) 386 | 387 | def to_pair2(tup): 388 | """ 389 | Makes a new pair. 
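The sentinel flags ``'__*_avg_*__'`` and ``'__*_den_*__'`` emitted by
``to_pair1`` ride through the same reduce as the per-category keys, so the
grand mean and the total sum of squares come out of a single pass. A sketch of
the mapping::

    ('x1', 'x2', '__*_avg_*__')  ->  grand mean of x2
    ('x1', 'x2', '__*_den_*__')  ->  total sum of squares of x2
    ('x1', 'x2', 'a')            ->  (mean of x2 where x1 == 'a', count)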
390 | 391 | :param tup: (b, c, b_val), (sum_c, sum_c_sq, sum_b) 392 | :return: (b, c), (b_val, stats) 393 | """ 394 | ss = lambda x, x_sq, n: (x_sq - (x ** 2 / n)) 395 | (cat, con, flag), (sum_c, sum_c_sq, sum_b) = tup 396 | key = cat, con 397 | 398 | if flag == '__*_den_*__': 399 | val = ss(sum_c, sum_c_sq, sum_b) 400 | elif flag == '__*_avg_*__': 401 | val = sum_c / sum_b 402 | else: 403 | val = sum_c / sum_b, sum_b 404 | 405 | return key, (flag, val) 406 | 407 | def to_results(tup): 408 | """ 409 | Computes the results. 410 | 411 | :param tup: (b, c), (flag, val) 412 | :return: (b, c), {'measure1': val1, 'measure2': val2, ...} 413 | """ 414 | (b, c), data = tup 415 | data = {k: v for k, v in data} 416 | 417 | y_avg = data['__*_avg_*__'] 418 | num = sum([v[1] * ((v[0] - y_avg) ** 2) for k, v in data.items() if isinstance(v, tuple)]) 419 | den = data['__*_den_*__'] 420 | 421 | eta = num / den 422 | return (b, c), {'eta_sq': eta, 'eta': sqrt(eta)} 423 | 424 | return sdf.rdd \ 425 | .flatMap(lambda r: to_pair1(r.asDict())) \ 426 | .reduceByKey(lambda x, y: (x[0] + y[0], x[1] + y[1], x[2] + y[2])) \ 427 | .map(lambda tup: to_pair2(tup)) \ 428 | .groupByKey() \ 429 | .map(lambda tup: to_results(tup)) \ 430 | .sortByKey() 431 | 432 | 433 | def concordance(sdf): 434 | """ 435 | Gets all the pairwise ordinal-ordinal concordance measures. The result is a Spark pair-RDD, 436 | where the keys are tuples of variable names e.g. (k1, k2), and values are dictionaries 437 | of association names and measures e.g. {'kendall': 1, 'gamma': 0.8}. Each record in the pair-RDD is of the form. 438 | 439 | - (k1, k2), {'kendall': 1, 'gamma': 0.8, ...} 440 | 441 | :param sdf: Spark dataframe. Should be all ordinal data (numeric). 442 | :return: Spark pair-RDD. 443 | """ 444 | 445 | def as_pair1(n1, n2, v1, v2): 446 | """ 447 | Creates a pair of the form (n1, n2), (v1, v2) where the first tuple are sorted and the second 448 | tuple have corresponding values to the elements of the first tuple. 449 | 450 | :param n1: String (variable name). 451 | :param n2: String (Variable name). 452 | :param v1: Value. 453 | :param v2: Value. 454 | :return: (n1, n2), (v1, v2). 455 | """ 456 | tups = sorted([(n1, v1), (n2, v2)], key=lambda t: t[0]) 457 | 458 | k1, j1 = tups[0] 459 | k2, j2 = tups[1] 460 | 461 | return (k1, k2), (j1, j2) 462 | 463 | def to_pair1(d): 464 | """ 465 | Creates a list of pairs of variables and values. Keys are names of variables and values are values of 466 | those variables. 467 | 468 | :param d: Dictionary. 469 | :return: List of (k1, k2), (v1, v2). 470 | """ 471 | return [as_pair1(n1, n2, d[n1], d[n2]) for n1, n2 in combinations(d.keys(), 2)] 472 | 473 | def as_count(v1, v2): 474 | """ 475 | Maps the specified pairs of values to concordance status. Concordance status can be the follow. 476 | 477 | - discordant: :math:`(y_j - y_i)(x_j - x_i) < 0` 478 | - tied: :math:`(y_j - y_i)(x_j - x_i) = 0` 479 | - concordant: :math:`(y_j - y_i)(x_j - x_i) > 0` 480 | 481 | Ties are differentiated as follows. 482 | 483 | - tied on ``x``: `x_i = x_j` 484 | - tied on ``y``: `y_i = y_j` 485 | - tied on ``xy``: `x_i = x_j \\land y_i = y_j` 486 | 487 | A tuple that looks like the following will be mapped from the concordance status. 488 | 489 | - discordant: (1, 0, 0, 0, 0, 1) 490 | - tie on ``x`` and ``y``: (0, 1, 0, 0, 0, 1) 491 | - tie on ``x``: (0, 0, 1, 0, 0, 1) 492 | - tie on ``y``: (0, 0, 0, 1, 0, 1) 493 | - concordant: (0, 0, 0, 0, 1, 1) 494 | 495 | :param v1: Pair (x_i, y_i). 496 | :param v2: Pair (x_j, y_j). 
497 | :return: d, t_xy, t_x, t_y, c, n 498 | """ 499 | d, t_xy, t_x, t_y, c, n = 0, 0, 0, 0, 0, 1 500 | 501 | if v1 is not None and v2 is not None: 502 | x_i, y_i = v1 503 | x_j, y_j = v2 504 | r = (y_j - y_i) * (x_j - x_i) 505 | 506 | if r > 0: 507 | c = 1 508 | elif r < 0: 509 | d = 1 510 | else: 511 | if x_i == x_j and y_i == y_j: 512 | t_xy = 1 513 | elif x_i == x_j: 514 | t_x = 1 515 | else: 516 | t_y = 1 517 | 518 | return d, t_xy, t_x, t_y, c, n 519 | 520 | def to_pair2(tup): 521 | """ 522 | Creates concordant status counts for each pair of observations. 523 | 524 | :param tup: (key, iterable). 525 | :return: Generator of (k1, k2), (d, t_xy, t_x, t_y, c, n). 526 | """ 527 | key, data = tup 528 | 529 | return ((key, as_count(v1, v2)) for v1, v2 in combinations(data, 2)) 530 | 531 | def to_results(counts): 532 | """ 533 | Converts the results of concordance to a dictionary of measures. 534 | 535 | :param counts: Tuple of tuples: (x1, x2), (a, b, c, d). 536 | :return: (x1, x2), {'measure1': val1, 'measure2': val2, ...}. 537 | """ 538 | (x1, x2), (d, t_xy, t_x, t_y, c, n) = counts 539 | 540 | d += 1 541 | t_xy += 1 542 | t_x += 1 543 | t_y += 1 544 | c += 1 545 | n += 5 546 | 547 | computer = ConcordanceStats(d, t_xy, t_x, t_y, c, n) 548 | measures = {m: computer.get(m) for m in computer.measures()} 549 | return (x1, x2), measures 550 | 551 | return sdf.rdd \ 552 | .flatMap(lambda r: to_pair1(r.asDict())) \ 553 | .groupByKey() \ 554 | .flatMap(lambda tup: to_pair2(tup)) \ 555 | .reduceByKey(lambda x, y: __add_concordance_counts(x, y)) \ 556 | .map(lambda tup: to_results(tup)) \ 557 | .sortByKey() 558 | 559 | 560 | def continuous_continuous(sdf): 561 | """ 562 | Gets all the pairwise continuous-continuous association measures. The result is a Spark pair-RDD, 563 | where the keys are tuples of variable names e.g. (k1, k2), and values are dictionaries 564 | of association names and measures e.g. {'pearson': 1}. Each record in the pair-RDD is of the form. 565 | 566 | - (k1, k2), {'pearson': 1} 567 | 568 | Only pearson is supported at the moment. 569 | 570 | :param sdf: Spark dataframe. Should be all ordinal data (numeric). 571 | :return: Spark pair-RDD. 572 | """ 573 | 574 | CorrItem = namedtuple('CorrItem', 'x y xy x_sq y_sq n') 575 | 576 | def to_items(d): 577 | """ 578 | Converts the dictionary to (n1, n2), CorrItem. 579 | 580 | :param d: Dictionary. 581 | :return: (n1, n2), CorrItem. 582 | """ 583 | as_item = lambda n1, n2: CorrItem(d[n1], d[n2], d[n1] * d[n2], d[n1] ** 2, d[n2] ** 2, 1) 584 | return (((n1, n2), as_item(n1, n2)) for n1, n2 in combinations(d.keys(), 2)) 585 | 586 | def add_items(a, b): 587 | """ 588 | Adds two CorrItems. 589 | 590 | :param a: CorrItem. 591 | :param b: CorrItem. 592 | :return: CorrItem. 593 | """ 594 | return CorrItem(a.x + b.x, a.y + b.y, a.xy + b.xy, a.x_sq + b.x_sq, a.y_sq + b.y_sq, a.n + b.n) 595 | 596 | def to_results(tup): 597 | """ 598 | Converts the tup to a result. 599 | 600 | :param tup: (n1, n2), CorrItem. 601 | :return: (n1, n2), {'measure': value}. 
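The Pearson correlation is computed with the one-pass computational formula.

:math:`r = \\frac{\\sum xy - \\frac{\\sum x \\sum y}{n}}{\\sqrt{\\sum x^2 - \\frac{(\\sum x)^2}{n}} \\sqrt{\\sum y^2 - \\frac{(\\sum y)^2}{n}}}`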
602 | """
603 | (n1, n2), item = tup
604 | n = item.xy - (item.x * item.y) / item.n
605 | d = sqrt(item.x_sq - (item.x ** 2 / item.n)) * sqrt(item.y_sq - (item.y ** 2 / item.n))
606 | r = n / d
607 | return (n1, n2), {'pearson': r}
608 |
609 | return sdf.rdd \
610 | .flatMap(lambda r: to_items(r.asDict())) \
611 | .reduceByKey(lambda a, b: add_items(a, b)) \
612 | .map(lambda tup: to_results(tup)) \
613 | .sortByKey()
614 |
-------------------------------------------------------------------------------- /pypair/util.py: --------------------------------------------------------------------------------
1 | from abc import ABC
2 | from functools import lru_cache
3 | from itertools import combinations
4 |
5 | import numpy as np
6 | import pandas as pd
7 |
8 |
9 | class MeasureMixin(ABC):
10 | """
11 | Measure mixin. Able to list the functions decorated with `@property` and also
12 | access such properties by name.
13 | """
14 |
15 | @classmethod
16 | def measures(cls):
17 | """
18 | Gets a list of all the measures.
19 |
20 | :return: List of all the measures.
21 | """
22 | return get_measures(cls)
23 |
24 | @lru_cache(maxsize=None)
25 | def get(self, measure):
26 | """
27 | Gets the specified measure.
28 |
29 | :param measure: Name of measure.
30 | :return: Measure.
31 | """
32 | return getattr(self, measure)
33 |
34 | @lru_cache(maxsize=None)
35 | def get_measures(self):
36 | """
37 | Gets a list of all the measures.
38 |
39 | :return: List of all the measures.
40 | """
41 | return get_measures(self.__class__)
42 |
43 |
44 | def get_measures(clazz):
45 | """
46 | Gets all the measures of a clazz.
47 |
48 | :param clazz: Clazz.
49 | :return: List of measures.
50 | """
51 | from itertools import chain
52 |
53 | is_property = lambda v: isinstance(v, property)
54 | is_public = lambda n: not n.startswith('_')
55 | is_valid = lambda n, v: is_public(n) and is_property(v)
56 |
57 | measures = sorted(list(chain(*[[n for n, v in vars(c).items() if is_valid(n, v)] for c in clazz.__mro__])))
58 |
59 | return measures
60 |
61 |
62 | def corr(df, f):
63 | """
64 | Computes the pairwise association matrix. ALL fields/columns must be of the same type
65 | so that the specified function ``f`` can compute the pairwise associations.
66 |
67 | :param df: Pandas data frame.
68 | :param f: Callable function; e.g.
lambda a, b: categorical_categorical(a, b, measure='phi') 69 | """ 70 | fields = list(df.columns) 71 | idx_map = {f: i for i, f in enumerate(fields)} 72 | associations = ((idx_map[a], idx_map[b], f(df[a], df[b])) for a, b in combinations(fields, 2)) 73 | 74 | n = df.shape[1] 75 | mat = np.empty((n, n)) 76 | for i, j, a in associations: 77 | mat[i, j] = mat[j, i] = a 78 | 79 | df = pd.DataFrame(mat, columns=fields, index=fields) 80 | return df 81 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | # TEST 2 | nose 3 | coverage 4 | # LINT OR DIE 5 | flake8 6 | pep8 7 | pyflakes 8 | # LIBS 9 | numpy 10 | scipy 11 | pandas 12 | scikit-learn 13 | pyspark 14 | # DOCUMENTATION 15 | sphinx 16 | sphinx_rtd_theme 17 | sphinxcontrib-bibtex 18 | sphinxcontrib-blockdiag 19 | sphinx-sitemap 20 | # PUBLISHING 21 | twine 22 | setuptools -------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- 1 | [metadata] 2 | name = pypair 3 | version = 0.0.1 4 | author = Jee Vang 5 | author_email = vangjee@gmail.com 6 | description = Pairwise association measures of statistical variable types 7 | long_description: file: README.md 8 | long_description_content_type = text/markdown 9 | description_file = README.md 10 | url = https://github.com/oneoffcoder/py-pair 11 | keywords = statistics, pairwise, association, correlation, concordance, measurement, strength, pyspark 12 | install_requires = scipy, numpy, pandas, scikit-learn, pyspark 13 | classifiers = 14 | Programming Language :: Python :: 3 15 | License :: OSI Approved :: Apache Software License 16 | Operating System :: OS Independent 17 | Topic :: Scientific/Engineering :: Artificial Intelligence 18 | Intended Audience :: Developers 19 | Intended Audience :: Science/Research 20 | Development Status :: 5 - Production/Stable 21 | include_package_data = True 22 | test_suite = nose.collector 23 | 24 | [flake8] 25 | max-line-length = 120 26 | ignore = E501 E731 27 | 28 | [nosetests] 29 | verbosity = 3 30 | with-doctest = 1 31 | with-coverage = 1 32 | with-id = 1 33 | cover-erase = 1 34 | cover-html = 1 35 | cover-html-dir = coverage 36 | cover-package = pypair 37 | detailed-errors = 1 -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import setup, find_packages 2 | 3 | setup( 4 | packages=find_packages(exclude=('*.tests', '*.tests.*', 'tests.*', 'tests')) 5 | ) 6 | -------------------------------------------------------------------------------- /tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/oneoffcoder/py-pair/79e6e75c32333fc9421aad36c02c2e1043ae4a05/tests/__init__.py -------------------------------------------------------------------------------- /tests/test_association.py: -------------------------------------------------------------------------------- 1 | import random 2 | from itertools import combinations 3 | 4 | import numpy as np 5 | import pandas as pd 6 | from nose import with_setup 7 | 8 | from pypair.association import binary_binary, categorical_categorical, \ 9 | binary_continuous, concordance, categorical_continuous, continuous_continuous, confusion 10 | from pypair.biserial import 
Biserial 11 | from pypair.contingency import BinaryTable, CategoricalTable, ConfusionMatrix 12 | from pypair.continuous import Concordance, CorrelationRatio, Continuous 13 | from pypair.util import corr 14 | 15 | 16 | def setup(): 17 | """ 18 | Setup. 19 | :return: None. 20 | """ 21 | np.random.seed(37) 22 | random.seed(37) 23 | 24 | 25 | def teardown(): 26 | """ 27 | Teardown. 28 | :return: None. 29 | """ 30 | pass 31 | 32 | 33 | @with_setup(setup, teardown) 34 | def test_binary_binary(): 35 | """ 36 | Tests binary-binary. 37 | 38 | :return: None. 39 | """ 40 | get_data = lambda x, y, n: [(x, y) for _ in range(n)] 41 | data = get_data(1, 1, 207) + get_data(1, 0, 282) + get_data(0, 1, 231) + get_data(0, 0, 242) 42 | a = [a for a, _ in data] 43 | b = [b for _, b in data] 44 | 45 | for m in BinaryTable.measures(): 46 | r = binary_binary(a, b, m) 47 | print(f'{r}: {m}') 48 | 49 | 50 | @with_setup(setup, teardown) 51 | def test_categorical_categorical(): 52 | """ 53 | Tests categorical-categorical. 54 | 55 | :return: None. 56 | """ 57 | get_data = lambda x, y, n: [(x, y) for _ in range(n)] 58 | data = get_data(1, 1, 207) + get_data(1, 0, 282) + get_data(0, 1, 231) + get_data(0, 0, 242) 59 | a = [a for a, _ in data] 60 | b = [b for _, b in data] 61 | 62 | for m in CategoricalTable.measures(): 63 | r = categorical_categorical(a, b, m) 64 | print(f'{r}: {m}') 65 | 66 | 67 | @with_setup(setup, teardown) 68 | def test_binary_continuous(): 69 | """ 70 | Tests binary-continuous. 71 | 72 | :return: None. 73 | """ 74 | get_data = lambda x, y, n: [(x, y) for _ in range(n)] 75 | data = get_data(1, 1, 207) + get_data(1, 0, 282) + get_data(0, 1, 231) + get_data(0, 0, 242) 76 | a = [a for a, _ in data] 77 | b = [b for _, b in data] 78 | 79 | for m in Biserial.measures(): 80 | r = binary_continuous(a, b, m) 81 | print(f'{r}: {m}') 82 | 83 | 84 | @with_setup(setup, teardown) 85 | def test_concordance(): 86 | """ 87 | Tests concordance. 88 | 89 | :return: None. 90 | """ 91 | a = [1, 2, 3] 92 | b = [3, 2, 1] 93 | 94 | for m in Concordance.measures(): 95 | r = concordance(a, b, m) 96 | print(f'{r}: {m}') 97 | 98 | 99 | @with_setup(setup, teardown) 100 | def test_categorical_continuous(): 101 | """ 102 | Tests categorical-continuous. Data taken from `Wikipedia `_. 103 | 104 | :return: None. 105 | """ 106 | data = [ 107 | ('a', 45), ('a', 70), ('a', 29), ('a', 15), ('a', 21), 108 | ('g', 40), ('g', 20), ('g', 30), ('g', 42), 109 | ('s', 65), ('s', 95), ('s', 80), ('s', 70), ('s', 85), ('s', 73) 110 | ] 111 | x = [x for x, _ in data] 112 | y = [y for _, y in data] 113 | for m in CorrelationRatio.measures(): 114 | r = categorical_continuous(x, y, m) 115 | print(f'{r}: {m}') 116 | 117 | 118 | @with_setup(setup, teardown) 119 | def test_continuous_continuous(): 120 | """ 121 | Tests continuous-continuous. 122 | 123 | :return: None. 124 | """ 125 | x = [x for x in range(10)] 126 | y = [y for y in range(10)] 127 | for m in Continuous.measures(): 128 | r = continuous_continuous(x, y, m) 129 | print(f'{r}: {m}') 130 | 131 | 132 | @with_setup(setup, teardown) 133 | def test_confusion(): 134 | """ 135 | Tests confusion matrix. Data taken from `here `_. 
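With 100 TP, 50 TN, 10 FP and 5 FN below, accuracy should land at
(100 + 50) / 165, roughly 0.909, which makes for a quick hand check against
the printed output.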
136 | 137 | :return: None 138 | """ 139 | tn = [(0, 0) for _ in range(50)] 140 | fp = [(0, 1) for _ in range(10)] 141 | fn = [(1, 0) for _ in range(5)] 142 | tp = [(1, 1) for _ in range(100)] 143 | data = tn + fp + fn + tp 144 | a = [a for a, _ in data] 145 | b = [b for _, b in data] 146 | 147 | for m in ConfusionMatrix.measures(): 148 | r = confusion(a, b, m) 149 | print(f'{r}: {m}') 150 | 151 | 152 | @with_setup(setup, teardown) 153 | def test_pandas_categorical(): 154 | """ 155 | Tests categorical correlation using Pandas dataframe ``.corr()``. 156 | 157 | :return: None 158 | """ 159 | 160 | # FIXME: pandas.corr() is broken; no longer supports non-numeric columns! 161 | def get_associations(a, b): 162 | d = {'x': a, 'y': b} 163 | measures = {m: categorical_categorical(df[a], df[b], measure=m) for m in ['chisq', 'phi', 'mutual_information']} 164 | d = {**d, **measures} 165 | return d 166 | 167 | df = pd.DataFrame({ 168 | 'x1': ['on', 'on', 'on', 'on', 'on', 'off', 'off', 'off', 'off', 'off'], 169 | 'x2': ['on', 'off', 'on', 'off', 'on', 'off', 'on', 'off', 'on', 'off'], 170 | 'x3': ['off', 'off', 'off', 'off', 'off', 'on', 'on', 'on', 'on', 'on'], 171 | 'x4': ['on', 'on', 'on', 'on', 'off', 'off', 'off', 'off', 'off', 'on'], 172 | }) 173 | 174 | associations = [get_associations(a, b) for a, b in combinations(df.columns, 2)] 175 | 176 | expected = { 177 | 'chisq': [0.2857142857142857, 7.142857142857143, 2.5714285714285716, 0.2857142857142857, 0.2857142857142857, 178 | 2.5714285714285716], 179 | 'phi': [0.14285714285714285, 0.7142857142857143, 0.4285714285714286, 0.14285714285714285, 0.14285714285714285, 180 | 0.4285714285714286], 181 | 'mutual_information': [0.010239075859473604, 0.2830308622715362, 0.09487759197468806, 0.010239075859473604, 182 | 0.010239075859473604, 183 | 0.09487759197468805] 184 | } 185 | 186 | a_df = pd.DataFrame(associations) 187 | 188 | for field, e in expected.items(): 189 | o = list(a_df[field]) 190 | for v1, v2 in zip(o, e): 191 | assert abs(v1 - v2) < 0.0001 192 | 193 | 194 | @with_setup(setup, teardown) 195 | def test_get_correlation_matrix(): 196 | """ 197 | Tests getting correlation matrix as Pandas dataframe. 198 | 199 | :return: None. 
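Note: ``corr`` fills only the off-diagonal cells of an ``np.empty`` matrix, so
the diagonal holds whatever bytes were already in memory; that is why the
expected value for ``x1`` below starts with the garbage-looking
``4.66486312813147e-310``. A hypothetical fix (not in the library) would be to
seed the diagonal explicitly, e.g. ``np.fill_diagonal(mat, 1.0)``, before
building the dataframe.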
200 | """ 201 | df = pd.DataFrame({ 202 | 'x1': ['on', 'on', 'on', 'on', 'on', 'off', 'off', 'off', 'off', 'off'], 203 | 'x2': ['on', 'off', 'on', 'off', 'on', 'off', 'on', 'off', 'on', 'off'], 204 | 'x3': ['off', 'off', 'off', 'off', 'off', 'on', 'on', 'on', 'on', 'on'], 205 | 'x4': ['on', 'on', 'on', 'on', 'off', 'off', 'off', 'off', 'off', 'on'], 206 | }) 207 | 208 | f = lambda a, b: categorical_categorical(a, b, measure='mutual_information') 209 | p = corr(df, f) 210 | print(list(p.x1)) 211 | print(list(p.x2)) 212 | print(list(p.x3)) 213 | print(list(p.x4)) 214 | 215 | expected = { 216 | 'x1': [4.66486312813147e-310, 0.010239075859473604, 0.2830308622715362, 0.09487759197468806], 217 | 'x2': [0.010239075859473604, 0.0, 0.010239075859473604, 0.010239075859473604], 218 | 'x3': [0.2830308622715362, 0.010239075859473604, 0.0, 0.09487759197468805], 219 | 'x4': [0.09487759197468806, 0.010239075859473604, 0.09487759197468805, 0.0] 220 | } 221 | 222 | for field, e in expected.items(): 223 | o = list(p[field]) 224 | 225 | for v1, v2 in zip(o, e): 226 | diff = abs(v1 - v2) 227 | assert diff < 0.0001 228 | -------------------------------------------------------------------------------- /tests/test_contingency.py: -------------------------------------------------------------------------------- 1 | import random 2 | 3 | import numpy as np 4 | import pandas as pd 5 | from nose import with_setup 6 | 7 | from pypair.association import categorical_categorical 8 | 9 | 10 | def setup(): 11 | """ 12 | Setup. 13 | :return: None. 14 | """ 15 | np.random.seed(37) 16 | random.seed(37) 17 | 18 | 19 | def teardown(): 20 | """ 21 | Teardown. 22 | :return: None. 23 | """ 24 | pass 25 | 26 | 27 | @with_setup(setup, teardown) 28 | def test_contingency_with_nulls(): 29 | """ 30 | Tests creating contingency table with nulls. 31 | 32 | :return: None. 33 | """ 34 | df = pd.DataFrame([ 35 | (0, 0), 36 | (0, 1), 37 | (1, 0), 38 | (1, 1), 39 | (0, None), 40 | (None, 0), 41 | (None, None) 42 | ], columns=['a', 'b']) 43 | v = categorical_categorical(df.a, df.b, measure='phi') 44 | print(v) 45 | -------------------------------------------------------------------------------- /tests/test_spark.py: -------------------------------------------------------------------------------- 1 | import json 2 | import logging 3 | import random 4 | import unittest 5 | from random import choice 6 | 7 | import pandas as pd 8 | from pyspark.sql import SparkSession 9 | 10 | from pypair.spark import binary_binary, confusion, categorical_categorical, agreement, binary_continuous, concordance, \ 11 | categorical_continuous, continuous_continuous 12 | 13 | 14 | class PySparkTest(unittest.TestCase): 15 | """ 16 | PySpark test class. 17 | """ 18 | 19 | @classmethod 20 | def supress_py4j_logging(cls): 21 | """ 22 | Supresses p4j logging. 23 | 24 | :return: None. 25 | """ 26 | logger = logging.getLogger('py4j') 27 | logger.setLevel(logging.WARN) 28 | 29 | @classmethod 30 | def create_pyspark_session(cls): 31 | """ 32 | Creates a PySpark session. 33 | 34 | :return: PySpark session. 35 | """ 36 | return (SparkSession.builder 37 | .master('local[4]') 38 | .appName('local-testing-pyspark') 39 | .getOrCreate()) 40 | 41 | @classmethod 42 | def setUpClass(cls): 43 | """ 44 | Sets up the class. 45 | 46 | :return: None. 47 | """ 48 | cls.supress_py4j_logging() 49 | cls.spark = cls.create_pyspark_session() 50 | random.seed(37) 51 | 52 | @classmethod 53 | def tearDownClass(cls): 54 | """ 55 | Tears down the class. 56 | 57 | :return: None. 
58 |         """ 59 |         cls.spark.stop() 60 | 61 |     def _get_binary_binary_data(self): 62 |         """ 63 |         Gets dummy binary-binary data in a Spark dataframe. 64 | 65 |         :return: Spark dataframe. 66 |         """ 67 |         get_data = lambda x, y, n: [(x, y) * 2 for _ in range(n)]  # (x, y) * 2 == (x, y, x, y): one pair repeated across columns x1..x4 68 |         data = get_data(1, 1, 207) + get_data(1, 0, 282) + get_data(0, 1, 231) + get_data(0, 0, 242) 69 |         pdf = pd.DataFrame(data, columns=['x1', 'x2', 'x3', 'x4']) 70 |         sdf = self.spark.createDataFrame(pdf) 71 |         return sdf 72 | 73 |     def _get_confusion_data(self): 74 |         """ 75 |         Gets dummy binary-binary data in a Spark dataframe for use with confusion matrix analysis. 76 | 77 |         :return: Spark dataframe. 78 |         """ 79 |         tn = [(0, 0) * 2 for _ in range(50)] 80 |         fp = [(0, 1) * 2 for _ in range(10)] 81 |         fn = [(1, 0) * 2 for _ in range(5)] 82 |         tp = [(1, 1) * 2 for _ in range(100)] 83 |         data = tn + fp + fn + tp 84 |         pdf = pd.DataFrame(data, columns=['x1', 'x2', 'x3', 'x4']) 85 |         sdf = self.spark.createDataFrame(pdf) 86 |         return sdf 87 | 88 |     def _get_categorical_categorical_data(self): 89 |         """ 90 |         Gets dummy categorical-categorical data in a Spark dataframe. 91 | 92 |         :return: Spark dataframe. 93 |         """ 94 |         x_domain = ['a', 'b', 'c'] 95 |         y_domain = ['a', 'b'] 96 | 97 |         get_x = lambda: choice(x_domain) 98 |         get_y = lambda: choice(y_domain) 99 |         get_data = lambda: {f'x{i}': v for i, v in enumerate((get_x(), get_y(), get_x(), get_y()), 1)}  # columns x1..x4, matching the other helpers 100 | 101 |         pdf = pd.DataFrame([get_data() for _ in range(100)]) 102 |         sdf = self.spark.createDataFrame(pdf) 103 |         return sdf 104 | 105 |     def _get_binary_continuous_data(self): 106 |         """ 107 |         Gets dummy `binary-continuous data `_. 108 | 109 |         :return: Spark dataframe. 110 |         """ 111 |         data = [ 112 |             (1, 10), (1, 11), (1, 6), (1, 11), (0, 4), 113 |             (0, 3), (1, 12), (0, 2), (0, 2), (0, 1) 114 |         ] 115 |         pdf = pd.DataFrame(data, columns=['gender', 'years']) 116 |         sdf = self.spark.createDataFrame(pdf) 117 |         return sdf 118 | 119 |     def _get_concordance_data(self): 120 |         """ 121 |         Gets dummy concordance data. 122 | 123 |         :return: Spark dataframe. 124 |         """ 125 |         a = [1, 2, 3] 126 |         b = [3, 2, 1] 127 |         pdf = pd.DataFrame({'a': a, 'b': b, 'c': a, 'd': b}) 128 |         sdf = self.spark.createDataFrame(pdf) 129 |         return sdf 130 | 131 |     def _get_categorical_continuous_data(self): 132 |         """ 133 |         Gets dummy categorical-continuous data. 134 |         See `site `_. 135 | 136 |         :return: Spark dataframe. 137 |         """ 138 |         data = [ 139 |             ('a', 45), ('a', 70), ('a', 29), ('a', 15), ('a', 21), 140 |             ('g', 40), ('g', 20), ('g', 30), ('g', 42), 141 |             ('s', 65), ('s', 95), ('s', 80), ('s', 70), ('s', 85), ('s', 73) 142 |         ] 143 |         data = [tup * 2 for tup in data]  # duplicate each pair so the frame has four columns 144 |         pdf = pd.DataFrame(data, columns=['x1', 'x2', 'x3', 'x4']) 145 |         sdf = self.spark.createDataFrame(pdf) 146 |         return sdf 147 | 148 |     def _get_continuous_continuous_data(self): 149 |         """ 150 |         Gets dummy continuous-continuous data. 151 |         See `site `_. 152 | 153 |         :return: Spark dataframe.
154 | """ 155 | data = [ 156 | (12, 9), 157 | (10, 12), 158 | (9, 12), 159 | (14, 11), 160 | (10, 8), 161 | (11, 9), 162 | (10, 9), 163 | (10, 6), 164 | (14, 12), 165 | (9, 11), 166 | (11, 12), 167 | (10, 7), 168 | (11, 13), 169 | (15, 14), 170 | (8, 11), 171 | (11, 11), 172 | (9, 8), 173 | (9, 9), 174 | (10, 11), 175 | (12, 9), 176 | (11, 12), 177 | (10, 12), 178 | (9, 7), 179 | (7, 9), 180 | (12, 14) 181 | ] 182 | pdf = pd.DataFrame([item * 2 for item in data], columns=['x1', 'x2', 'x3', 'x4']) 183 | sdf = self.spark.createDataFrame(pdf) 184 | return sdf 185 | 186 | 187 | class SparkTest(PySparkTest): 188 | """ 189 | Tests Spark operations. 190 | """ 191 | 192 | def test_binary_binary(self): 193 | """ 194 | Tests binary-binary Spark operation. 195 | 196 | :return: None. 197 | """ 198 | sdf = self._get_binary_binary_data() 199 | results = {tup[0]: tup[1] for tup in binary_binary(sdf).collect()} 200 | 201 | print(json.dumps({f'{k[0]}_{k[1]}': v for k, v in results.items()}, indent=1)) 202 | 203 | def test_confusion(self): 204 | """ 205 | Tests confusion Spark operation. 206 | 207 | :return: None. 208 | """ 209 | sdf = self._get_confusion_data() 210 | results = {tup[0]: tup[1] for tup in confusion(sdf).collect()} 211 | 212 | print(json.dumps({f'{k[0]}_{k[1]}': v for k, v in results.items()}, indent=1)) 213 | 214 | def test_categorical_categorical(self): 215 | """ 216 | Tests categorical-categorical Spark operation. 217 | 218 | :return: None. 219 | """ 220 | sdf = self._get_categorical_categorical_data() 221 | results = {tup[0]: tup[1] for tup in categorical_categorical(sdf).collect()} 222 | 223 | print(json.dumps({f'{k[0]}_{k[1]}': v for k, v in results.items()}, indent=1)) 224 | 225 | def test_agreement(self): 226 | """ 227 | Tests agreement Spark operation. 228 | 229 | :return: None. 230 | """ 231 | sdf = self._get_binary_binary_data() 232 | results = {tup[0]: tup[1] for tup in agreement(sdf).collect()} 233 | 234 | print(json.dumps({f'{k[0]}_{k[1]}': v for k, v in results.items()}, indent=1)) 235 | 236 | def test_biserial(self): 237 | """ 238 | Tests binary-continuous Spark operation. 239 | 240 | :return: None. 241 | """ 242 | sdf = self._get_binary_continuous_data() 243 | results = {tup[0]: tup[1] for tup in binary_continuous(sdf, binary=['gender'], continuous=['years']).collect()} 244 | 245 | print(json.dumps({f'{k[0]}_{k[1]}': v for k, v in results.items()}, indent=1)) 246 | 247 | def test_concordance(self): 248 | """ 249 | Tests concordance Spark operation. 250 | 251 | :return: None. 252 | """ 253 | sdf = self._get_concordance_data() 254 | results = {tup[0]: tup[1] for tup in concordance(sdf).collect()} 255 | 256 | print(json.dumps({f'{k[0]}_{k[1]}': v for k, v in results.items()}, indent=1)) 257 | 258 | def test_categorical_continuous(self): 259 | """ 260 | Tests categorical-continuous Spark operation. 261 | 262 | :return: None. 263 | """ 264 | sdf = self._get_categorical_continuous_data() 265 | results = {tup[0]: tup[1] for tup in categorical_continuous(sdf, ['x1', 'x3'], ['x2', 'x4']).collect()} 266 | 267 | print(json.dumps({f'{k[0]}_{k[1]}': v for k, v in results.items()}, indent=1)) 268 | 269 | def test_continuous_continuous(self): 270 | """ 271 | Tests continuous-continuous Spark operation. 272 | 273 | :return: None. 
274 | """ 275 | sdf = self._get_continuous_continuous_data() 276 | results = {tup[0]: tup[1] for tup in continuous_continuous(sdf).collect()} 277 | 278 | print(json.dumps({f'{k[0]}_{k[1]}': v for k, v in results.items()}, indent=1)) 279 | -------------------------------------------------------------------------------- /tests/test_table.py: -------------------------------------------------------------------------------- 1 | import random 2 | 3 | import numpy as np 4 | from nose import with_setup 5 | 6 | from pypair.contingency import BinaryTable, CategoricalTable, ConfusionMatrix 7 | 8 | 9 | def setup(): 10 | """ 11 | Setup. 12 | :return: None. 13 | """ 14 | np.random.seed(37) 15 | random.seed(37) 16 | 17 | 18 | def teardown(): 19 | """ 20 | Teardown. 21 | :return: None. 22 | """ 23 | pass 24 | 25 | 26 | @with_setup(setup, teardown) 27 | def test_confusion_matrix_creation(): 28 | """ 29 | Tests creating ConfusionMatrix. 30 | 31 | :return: None. 32 | """ 33 | a = [0, 0, 0, 0, 0, 1, 1, 1, 1, 1] 34 | b = [0, 0, 0, 1, 1, 0, 0, 1, 1, 1] 35 | 36 | table = ConfusionMatrix(a, b) 37 | for measure in ConfusionMatrix.measures(): 38 | stats = table.get(measure) 39 | if isinstance(stats, tuple): 40 | print(f'{stats[0]:.8f}, {stats[1]:.8f}: {measure}') 41 | else: 42 | print(f'{stats:.8f}: {measure}') 43 | 44 | 45 | @with_setup(setup, teardown) 46 | def test_binary_table_creation(): 47 | """ 48 | Tests creating BinaryTable. The data is simulated from this `site `_. 49 | 50 | :return: None. 51 | """ 52 | get_data = lambda x, y, n: [(x, y) for _ in range(n)] 53 | data = get_data(1, 1, 207) + get_data(1, 0, 282) + get_data(0, 1, 231) + get_data(0, 0, 242) 54 | a = [a for a, _ in data] 55 | b = [b for _, b in data] 56 | 57 | table = BinaryTable(a, b) 58 | for measure in BinaryTable.measures(): 59 | stats = table.get(measure) 60 | if isinstance(stats, tuple): 61 | print(f'{stats[0]:.8f}, {stats[1]:.8f}: {measure}') 62 | else: 63 | print(f'{stats:.8f}: {measure}') 64 | 65 | 66 | @with_setup(setup, teardown) 67 | def test_categorical_table_creation(): 68 | """ 69 | Tests creating CategoricalTable. 70 | 71 | :return: None. 72 | """ 73 | a = [0, 0, 0, 0, 0, 1, 1, 1, 1, 1] 74 | b = [0, 0, 0, 1, 1, 0, 0, 1, 1, 1] 75 | 76 | table = CategoricalTable(a, b) 77 | chisq = table.get('chisq') 78 | phi = table.get('phi') 79 | print(chisq, phi) 80 | --------------------------------------------------------------------------------
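A note on the FIXME in tests/test_association.py: recent pandas releases reject non-numeric (object) columns in DataFrame.corr(), which is why test_pandas_categorical computes its measures pairwise. Below is a minimal sketch of one possible workaround, under two assumptions not stated in the repository: that categorical_categorical accepts integer-coded inputs (contingency-based measures depend only on the label partition, not the label names), and that the installed pandas supports a callable for the method= argument of .corr().

import pandas as pd

from pypair.association import categorical_categorical

df = pd.DataFrame({
    'x1': ['on', 'on', 'on', 'off', 'off'],
    'x2': ['on', 'off', 'on', 'off', 'on'],
})

# Integer-code each categorical column so .corr() will accept the frame.
codes = df.apply(lambda s: pd.factorize(s)[0])

# .corr() accepts a callable over two 1-D arrays returning a float; note that
# pandas forces the diagonal of the result to 1.0 when a callable is used.
mi = codes.corr(method=lambda a, b: categorical_categorical(a, b, measure='mutual_information'))
print(mi)

Compared with the corr(df, f) helper exercised in test_get_correlation_matrix, this keeps everything inside pandas at the cost of losing control over the diagonal entries.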