├── .dockerignore
├── .github
│   └── FUNDING.yml
├── .gitignore
├── .readthedocs.yml
├── Dockerfile
├── LICENSE.txt
├── MANIFEST.in
├── Makefile
├── Makefile.bat
├── README.md
├── build.sh
├── docs
│   ├── Makefile
│   ├── autobuild.bat
│   ├── autobuild.sh
│   ├── make.bat
│   └── source
│       ├── _code
│       │   ├── binary-demo.py
│       │   ├── biserial-demo.py
│       │   ├── categorical-demo.py
│       │   ├── concordance-demo.py
│       │   ├── confusion-demo.py
│       │   ├── continuous-demo.py
│       │   ├── corr-ratio-demo.py
│       │   ├── dataframe-tip.py
│       │   ├── multiprocessing-tip.py
│       │   └── spark-demo.py
│       ├── _logo
│       │   ├── logo-1000.png
│       │   ├── logo-150.png
│       │   ├── logo-250.png
│       │   ├── logo-50.png
│       │   └── logo-500.png
│       ├── _static
│       │   ├── .gitkeep
│       │   ├── css
│       │   │   └── override.css
│       │   ├── favicon.ico
│       │   └── images
│       │       ├── logo-small.png
│       │       ├── logo.png
│       │       ├── ooc-logo.png
│       │       └── ooc-small.png
│       ├── _templates
│       │   └── .gitkeep
│       ├── conf.py
│       ├── deepdives.rst
│       ├── index.rst
│       ├── intro.rst
│       ├── modules.rst
│       ├── pypair.rst
│       ├── quicklist.rst
│       ├── quickstart.rst
│       ├── refs.bib
│       ├── robots.txt
│       └── zzz-bib.rst
├── logo.png
├── misc
│   ├── SPARK.md
│   ├── binary-measures.csv
│   ├── binary-measures.py
│   ├── count-measures.py
│   ├── ipynb
│   │   ├── binary-binary.ipynb
│   │   └── cat-cat.ipynb
│   └── scratch.py
├── publish.sh
├── pypair
│   ├── __init__.py
│   ├── association.py
│   ├── biserial.py
│   ├── contingency.py
│   ├── continuous.py
│   ├── decorator.py
│   ├── spark.py
│   └── util.py
├── requirements.txt
├── setup.cfg
├── setup.py
└── tests
    ├── __init__.py
    ├── test_association.py
    ├── test_contingency.py
    ├── test_spark.py
    └── test_table.py
/.dockerignore:
--------------------------------------------------------------------------------
1 | **/*.pyc
2 | .idea/
3 | docs/build/
4 | .pytest_cache/
5 | build/
6 | coverage/
7 | dist/
8 | pypair.egg-info/
9 | .coverage
10 | .noseids
11 | .ipynb_checkpoints/
12 | joblib_memmap/
13 | .DS_store
--------------------------------------------------------------------------------
/.github/FUNDING.yml:
--------------------------------------------------------------------------------
1 | # These are supported funding model platforms
2 |
3 | github: vangj
4 | patreon: vangj
5 | open_collective: # Replace with a single Open Collective username
6 | ko_fi: # Replace with a single Ko-fi username
7 | tidelift: # Replace with a single Tidelift platform-name/package-name e.g., npm/babel
8 | community_bridge: # Replace with a single Community Bridge project-name e.g., cloud-foundry
9 | liberapay: # Replace with a single Liberapay username
10 | issuehunt: # Replace with a single IssueHunt username
11 | otechie: # Replace with a single Otechie username
12 | custom: https://oneoffcoder.com/
13 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | **/*.pyc
2 | .idea/
3 | docs/build/
4 | coverage/
5 | .coverage
6 | .noseids
7 | dist/
8 | pypair.egg-info/
9 | build/
10 | .ipynb_checkpoints/
11 | .pypirc
12 | .pypircc
13 | joblib_memmap/
14 | .pytest_cache/
15 | .DS_store
16 | .vscode
--------------------------------------------------------------------------------
/.readthedocs.yml:
--------------------------------------------------------------------------------
1 | # .readthedocs.yml
2 | # Read the Docs configuration file
3 | # See https://docs.readthedocs.io/en/stable/config-file/v2.html for details
4 |
5 | # Required
6 | version: 2
7 |
8 | # Build documentation in the docs/ directory with Sphinx
9 | sphinx:
10 | configuration: docs/source/conf.py
11 |
12 | # Build documentation with MkDocs
13 | #mkdocs:
14 | # configuration: mkdocs.yml
15 |
16 | # Optionally build your docs in additional formats such as PDF
17 | formats:
18 | - htmlzip
19 |
20 | # Optionally set the version of Python and requirements required to build your docs
21 | python:
22 | version: 3.8
23 | install:
24 | - requirements: requirements.txt
--------------------------------------------------------------------------------
/Dockerfile:
--------------------------------------------------------------------------------
1 | FROM oneoffcoder/python-java:latest
2 |
3 | LABEL author="Jee Vang, Ph.D."
4 | LABEL email="vangjee@gmail.com"
5 |
6 | ARG AAPI_VERSION
7 | ARG APYPI_REPO
8 |
9 | ENV API_VERSION=$AAPI_VERSION
10 | ENV PYPI_REPO=$APYPI_REPO
11 |
12 | RUN apt-get update \
13 | && apt-get upgrade -y
14 | COPY . /py-pair
15 | RUN pip install -r /py-pair/requirements.txt
16 | RUN /py-pair/publish.sh
--------------------------------------------------------------------------------
/LICENSE.txt:
--------------------------------------------------------------------------------
1 |
2 | Apache License
3 | Version 2.0, January 2004
4 | http://www.apache.org/licenses/
5 |
6 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
7 |
8 | 1. Definitions.
9 |
10 | "License" shall mean the terms and conditions for use, reproduction,
11 | and distribution as defined by Sections 1 through 9 of this document.
12 |
13 | "Licensor" shall mean the copyright owner or entity authorized by
14 | the copyright owner that is granting the License.
15 |
16 | "Legal Entity" shall mean the union of the acting entity and all
17 | other entities that control, are controlled by, or are under common
18 | control with that entity. For the purposes of this definition,
19 | "control" means (i) the power, direct or indirect, to cause the
20 | direction or management of such entity, whether by contract or
21 | otherwise, or (ii) ownership of fifty percent (50%) or more of the
22 | outstanding shares, or (iii) beneficial ownership of such entity.
23 |
24 | "You" (or "Your") shall mean an individual or Legal Entity
25 | exercising permissions granted by this License.
26 |
27 | "Source" form shall mean the preferred form for making modifications,
28 | including but not limited to software source code, documentation
29 | source, and configuration files.
30 |
31 | "Object" form shall mean any form resulting from mechanical
32 | transformation or translation of a Source form, including but
33 | not limited to compiled object code, generated documentation,
34 | and conversions to other media types.
35 |
36 | "Work" shall mean the work of authorship, whether in Source or
37 | Object form, made available under the License, as indicated by a
38 | copyright notice that is included in or attached to the work
39 | (an example is provided in the Appendix below).
40 |
41 | "Derivative Works" shall mean any work, whether in Source or Object
42 | form, that is based on (or derived from) the Work and for which the
43 | editorial revisions, annotations, elaborations, or other modifications
44 | represent, as a whole, an original work of authorship. For the purposes
45 | of this License, Derivative Works shall not include works that remain
46 | separable from, or merely link (or bind by name) to the interfaces of,
47 | the Work and Derivative Works thereof.
48 |
49 | "Contribution" shall mean any work of authorship, including
50 | the original version of the Work and any modifications or additions
51 | to that Work or Derivative Works thereof, that is intentionally
52 | submitted to Licensor for inclusion in the Work by the copyright owner
53 | or by an individual or Legal Entity authorized to submit on behalf of
54 | the copyright owner. For the purposes of this definition, "submitted"
55 | means any form of electronic, verbal, or written communication sent
56 | to the Licensor or its representatives, including but not limited to
57 | communication on electronic mailing lists, source code control systems,
58 | and issue tracking systems that are managed by, or on behalf of, the
59 | Licensor for the purpose of discussing and improving the Work, but
60 | excluding communication that is conspicuously marked or otherwise
61 | designated in writing by the copyright owner as "Not a Contribution."
62 |
63 | "Contributor" shall mean Licensor and any individual or Legal Entity
64 | on behalf of whom a Contribution has been received by Licensor and
65 | subsequently incorporated within the Work.
66 |
67 | 2. Grant of Copyright License. Subject to the terms and conditions of
68 | this License, each Contributor hereby grants to You a perpetual,
69 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable
70 | copyright license to reproduce, prepare Derivative Works of,
71 | publicly display, publicly perform, sublicense, and distribute the
72 | Work and such Derivative Works in Source or Object form.
73 |
74 | 3. Grant of Patent License. Subject to the terms and conditions of
75 | this License, each Contributor hereby grants to You a perpetual,
76 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable
77 | (except as stated in this section) patent license to make, have made,
78 | use, offer to sell, sell, import, and otherwise transfer the Work,
79 | where such license applies only to those patent claims licensable
80 | by such Contributor that are necessarily infringed by their
81 | Contribution(s) alone or by combination of their Contribution(s)
82 | with the Work to which such Contribution(s) was submitted. If You
83 | institute patent litigation against any entity (including a
84 | cross-claim or counterclaim in a lawsuit) alleging that the Work
85 | or a Contribution incorporated within the Work constitutes direct
86 | or contributory patent infringement, then any patent licenses
87 | granted to You under this License for that Work shall terminate
88 | as of the date such litigation is filed.
89 |
90 | 4. Redistribution. You may reproduce and distribute copies of the
91 | Work or Derivative Works thereof in any medium, with or without
92 | modifications, and in Source or Object form, provided that You
93 | meet the following conditions:
94 |
95 | (a) You must give any other recipients of the Work or
96 | Derivative Works a copy of this License; and
97 |
98 | (b) You must cause any modified files to carry prominent notices
99 | stating that You changed the files; and
100 |
101 | (c) You must retain, in the Source form of any Derivative Works
102 | that You distribute, all copyright, patent, trademark, and
103 | attribution notices from the Source form of the Work,
104 | excluding those notices that do not pertain to any part of
105 | the Derivative Works; and
106 |
107 | (d) If the Work includes a "NOTICE" text file as part of its
108 | distribution, then any Derivative Works that You distribute must
109 | include a readable copy of the attribution notices contained
110 | within such NOTICE file, excluding those notices that do not
111 | pertain to any part of the Derivative Works, in at least one
112 | of the following places: within a NOTICE text file distributed
113 | as part of the Derivative Works; within the Source form or
114 | documentation, if provided along with the Derivative Works; or,
115 | within a display generated by the Derivative Works, if and
116 | wherever such third-party notices normally appear. The contents
117 | of the NOTICE file are for informational purposes only and
118 | do not modify the License. You may add Your own attribution
119 | notices within Derivative Works that You distribute, alongside
120 | or as an addendum to the NOTICE text from the Work, provided
121 | that such additional attribution notices cannot be construed
122 | as modifying the License.
123 |
124 | You may add Your own copyright statement to Your modifications and
125 | may provide additional or different license terms and conditions
126 | for use, reproduction, or distribution of Your modifications, or
127 | for any such Derivative Works as a whole, provided Your use,
128 | reproduction, and distribution of the Work otherwise complies with
129 | the conditions stated in this License.
130 |
131 | 5. Submission of Contributions. Unless You explicitly state otherwise,
132 | any Contribution intentionally submitted for inclusion in the Work
133 | by You to the Licensor shall be under the terms and conditions of
134 | this License, without any additional terms or conditions.
135 | Notwithstanding the above, nothing herein shall supersede or modify
136 | the terms of any separate license agreement you may have executed
137 | with Licensor regarding such Contributions.
138 |
139 | 6. Trademarks. This License does not grant permission to use the trade
140 | names, trademarks, service marks, or product names of the Licensor,
141 | except as required for reasonable and customary use in describing the
142 | origin of the Work and reproducing the content of the NOTICE file.
143 |
144 | 7. Disclaimer of Warranty. Unless required by applicable law or
145 | agreed to in writing, Licensor provides the Work (and each
146 | Contributor provides its Contributions) on an "AS IS" BASIS,
147 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
148 | implied, including, without limitation, any warranties or conditions
149 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
150 | PARTICULAR PURPOSE. You are solely responsible for determining the
151 | appropriateness of using or redistributing the Work and assume any
152 | risks associated with Your exercise of permissions under this License.
153 |
154 | 8. Limitation of Liability. In no event and under no legal theory,
155 | whether in tort (including negligence), contract, or otherwise,
156 | unless required by applicable law (such as deliberate and grossly
157 | negligent acts) or agreed to in writing, shall any Contributor be
158 | liable to You for damages, including any direct, indirect, special,
159 | incidental, or consequential damages of any character arising as a
160 | result of this License or out of the use or inability to use the
161 | Work (including but not limited to damages for loss of goodwill,
162 | work stoppage, computer failure or malfunction, or any and all
163 | other commercial damages or losses), even if such Contributor
164 | has been advised of the possibility of such damages.
165 |
166 | 9. Accepting Warranty or Additional Liability. While redistributing
167 | the Work or Derivative Works thereof, You may choose to offer,
168 | and charge a fee for, acceptance of support, warranty, indemnity,
169 | or other liability obligations and/or rights consistent with this
170 | License. However, in accepting such obligations, You may act only
171 | on Your own behalf and on Your sole responsibility, not on behalf
172 | of any other Contributor, and only if You agree to indemnify,
173 | defend, and hold each Contributor harmless for any liability
174 | incurred by, or claims asserted against, such Contributor by reason
175 | of your accepting any such warranty or additional liability.
176 |
177 | END OF TERMS AND CONDITIONS
178 |
179 | APPENDIX: How to apply the Apache License to your work.
180 |
181 | To apply the Apache License to your work, attach the following
182 | boilerplate notice, with the fields enclosed by brackets "[]"
183 | replaced with your own identifying information. (Don't include
184 | the brackets!) The text should be enclosed in the appropriate
185 | comment syntax for the file format. We also recommend that a
186 | file or class name and description of purpose be included on the
187 | same "printed page" as the copyright notice for easier
188 | identification within third-party archives.
189 |
190 | Copyright 2017 Jee Vang
191 |
192 | Licensed under the Apache License, Version 2.0 (the "License");
193 | you may not use this file except in compliance with the License.
194 | You may obtain a copy of the License at
195 |
196 | http://www.apache.org/licenses/LICENSE-2.0
197 |
198 | Unless required by applicable law or agreed to in writing, software
199 | distributed under the License is distributed on an "AS IS" BASIS,
200 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
201 | See the License for the specific language governing permissions and
202 | limitations under the License.
--------------------------------------------------------------------------------
/MANIFEST.in:
--------------------------------------------------------------------------------
1 | include requirements.txt README.md
2 | prune tests*
--------------------------------------------------------------------------------
/Makefile:
--------------------------------------------------------------------------------
1 | .PHONY: init clean lint test build install publish compile
2 | .DEFAULT_GOAL := build
3 |
4 | init:
5 | pip install -r requirements.txt
6 |
7 | lint:
8 | python -m flake8 ./pypair
9 |
10 | test: clean lint
11 | nosetests tests
12 |
13 | build: test
14 | python setup.py bdist_egg sdist bdist_wheel
15 |
16 | install: build
17 | python setup.py install
18 |
19 | publish: build
20 | python setup.py sdist upload -r pypi
21 |
22 | compile:
23 | python -m compileall -f ./pypair
24 |
25 | clean:
26 | find . -type f -name '*.pyc' -delete
27 | find . -type d -name '__pycache__' -delete
28 | rm -fr coverage/
29 | rm -fr dist/
30 | rm -fr build/
31 | rm -fr pypair.egg-info/
32 | rm -fr pypair/pypair.egg-info/
33 | rm -fr jupyter/.ipynb_checkpoints/
34 | rm -fr joblib_memmap/
35 | rm -fr docs/build/
36 | rm -fr .pytest_cache/
37 | rm -f .coverage
38 | rm -f .noseids
39 |
40 |
--------------------------------------------------------------------------------
/Makefile.bat:
--------------------------------------------------------------------------------
1 | @ECHO off
2 | if /I %1 == default goto :default
3 | if /I %1 == init goto :init
4 | if /I %1 == lint goto :lint
5 | if /I %1 == test goto :test
6 | if /I %1 == clean goto :clean
7 | if /I %1 == build goto :build
8 | if /I %1 == install goto :install
9 |
10 | goto :eof ::can be omitted to run the `default` function similarly to makefiles
11 |
12 | :default
13 | goto :test
14 |
15 | :init
16 | pip install -r requirements.txt
17 | goto :eof
18 |
19 | :lint
20 | python -m flake8 ./pypair
21 | goto :eof
22 |
23 | :test
24 | nosetests tests
25 | goto :eof
26 |
27 | :clean
28 | del /S *.pyc
29 | rmdir /S /Q coverage
30 | rmdir /S /Q dist
31 | rmdir /S /Q build
32 | rmdir /S /Q pypair.egg-info
33 | rmdir /S /Q pypair/pypair.egg-info
34 | rmdir /S /Q jupyter/.ipynb_checkpoints
35 | rmdir /S /Q docs/build
36 | rmdir /S /Q joblib_memmap
37 | rmdir /S /Q .pytest_cache
38 | del .coverage
39 | del .noseids
40 | goto :eof
41 |
42 | :build
43 | python setup.py bdist_egg sdist bdist_wheel
44 | goto :eof
45 |
46 | :install
47 | python setup.py install
48 | goto :eof
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | 
2 |
3 | # PyPair
4 |
5 | PyPair is a statistical library to compute pairwise association between any two variables. In general, statistical variable types are viewed as `categorical` or `continuous`. Categorical variables have no inherent order to their values, while continuous variables do. This API has `over 130 association measures` implemented for any combination of categorical and/or continuous variables.
6 |
7 | To install:
8 |
9 | ```bash
10 | pip install pypair
11 | ```
12 |
13 | Additional links:
14 |
15 | - [Documentation](https://py-pair.readthedocs.io/)
16 | - [PyPi](https://pypi.org/project/pypair/)
17 | - [Gitter](https://gitter.im/dataflava/py-pair)
18 |
19 | Here's a short and sweet snippet for using the API against a dataframe that stores strictly binary data. The Pandas `DataFrame.corr()` method no longer processes non-numeric fields!
20 |
21 | ```python
22 | from pypair.association import binary_binary
23 | from pypair.util import corr
24 |
25 | jaccard = lambda a, b: binary_binary(a, b, measure='jaccard')
26 | tanimoto = lambda a, b: binary_binary(a, b, measure='tanimoto_i')
27 |
28 | df = get_a_pandas_binary_dataframe()
29 |
30 | jaccard_corr = corr(df, jaccard)
31 | tanimoto_corr = corr(df, tanimoto)
32 |
33 | print(jaccard_corr)
34 | print('-' * 15)
35 | print(tanimoto_corr)
36 | ```
37 |
38 | Another way to get started with PyPair is to use the `convenience` methods whose names indicate the variable pair types.
39 |
40 | ```python
41 | from pypair.association import binary_binary, categorical_categorical, \
42 | binary_continuous, concordance, categorical_continuous, continuous_continuous, confusion, agreement
43 |
44 | # assume a and b are the appropriate iterables of values for 2 variables
45 | jaccard = binary_binary(a, b, measure='jaccard')
46 | acc = confusion(a, b, measure='acc')
47 | phi = categorical_categorical(a, b, measure='phi')
48 | kappa = agreement(a, b, measure='cohen_k')
49 | biserial = binary_continuous(a, b, measure='biserial')
50 | tau = concordance(a, b, measure='kendall_tau')
51 | eta = categorical_continuous(a, b, measure='eta')
52 | pearson = continuous_continuous(a, b, measure='pearson')
53 | ```
54 |
55 | # Software Copyright
56 |
57 | ```
58 | Copyright 2020 One-Off Coder
59 |
60 | Licensed under the Apache License, Version 2.0 (the "License");
61 | you may not use this file except in compliance with the License.
62 | You may obtain a copy of the License at
63 |
64 | http://www.apache.org/licenses/LICENSE-2.0
65 |
66 | Unless required by applicable law or agreed to in writing, software
67 | distributed under the License is distributed on an "AS IS" BASIS,
68 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
69 | See the License for the specific language governing permissions and
70 | limitations under the License.
71 | ```
72 |
73 | # Book Copyright
74 |
75 | Copyright 2020 One-Off Coder
76 |
77 | This work is licensed under a [Creative Commons Attribution 4.0 International License](https://creativecommons.org/licenses/by/4.0/) by [One-Off Coder](https://www.oneoffcoder.com).
78 |
79 | 
80 |
81 | # Art Copyright
82 |
83 | Copyright 2020 Daytchia Vang
84 |
85 | # Citation
86 |
87 | ```
88 | @misc{oneoffcoder_pypair_2020,
89 | title={PyPair, A Statistical API for Bivariate Association Measures},
90 | url={https://github.com/oneoffcoder/py-pair},
91 | author={Jee Vang},
92 | year={2020},
93 | month={Nov}}
94 | ```
95 |
96 | # Sponsor, Love
97 |
98 | - [Patreon](https://www.patreon.com/vangj)
99 | - [GitHub](https://github.com/sponsors/vangj)
--------------------------------------------------------------------------------
/build.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | DOCKER_FILE=Dockerfile
4 | DOCKER_REPO=pypair
5 | DOCKER_TAG=local
6 | AAPI_VERSION=version
7 | APYPI_REPO=repo
8 |
9 | while getopts v:r: option
10 | do
11 | case "${option}"
12 | in
13 | v) AAPI_VERSION=${OPTARG};;
14 | r) APYPI_REPO=${OPTARG};;
15 | esac
16 | done
17 |
18 | if [[ "version" == AAPI_VERSION || "repo" == $APYPI_REPO ]]; then
19 | echo "Usage: ./build.sh -r [pypi|testpypi] -v [version]"
20 | echo " -r repository, pypi or testpypi"
21 | echo " -v version e.g. 0.2.5"
22 | else
23 | docker build --no-cache \
24 | -f $DOCKER_FILE \
25 | --build-arg AAPI_VERSION=$AAPI_VERSION \
26 | --build-arg APYPI_REPO=$APYPI_REPO \
27 | -t ${DOCKER_REPO}:${DOCKER_TAG} .
28 | fi
--------------------------------------------------------------------------------
/docs/Makefile:
--------------------------------------------------------------------------------
1 | # Minimal makefile for Sphinx documentation
2 | #
3 |
4 | # You can set these variables from the command line, and also
5 | # from the environment for the first two.
6 | SPHINXOPTS ?=
7 | SPHINXBUILD ?= sphinx-build
8 | SOURCEDIR = source
9 | BUILDDIR = build
10 |
11 | # Put it first so that "make" without argument is like "make help".
12 | help:
13 | @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
14 |
15 | .PHONY: help Makefile
16 |
17 | # Catch-all target: route all unknown targets to Sphinx using the new
18 | # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS).
19 | %: Makefile
20 | @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
21 |
--------------------------------------------------------------------------------
/docs/autobuild.bat:
--------------------------------------------------------------------------------
1 | python -m sphinx_autobuild ./source ./build -b html --host 0.0.0.0 --port 8000
--------------------------------------------------------------------------------
/docs/autobuild.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | python -m sphinx_autobuild ./source ./build -b html --host 0.0.0.0 --port 8000
4 |
--------------------------------------------------------------------------------
/docs/make.bat:
--------------------------------------------------------------------------------
1 | @ECHO OFF
2 |
3 | pushd %~dp0
4 |
5 | REM Command file for Sphinx documentation
6 |
7 | if "%SPHINXBUILD%" == "" (
8 | set SPHINXBUILD=sphinx-build
9 | )
10 | set SOURCEDIR=source
11 | set BUILDDIR=build
12 |
13 | if "%1" == "" goto help
14 |
15 | %SPHINXBUILD% >NUL 2>NUL
16 | if errorlevel 9009 (
17 | echo.
18 | echo.The 'sphinx-build' command was not found. Make sure you have Sphinx
19 | echo.installed, then set the SPHINXBUILD environment variable to point
20 | echo.to the full path of the 'sphinx-build' executable. Alternatively you
21 | echo.may add the Sphinx directory to PATH.
22 | echo.
23 | echo.If you don't have Sphinx installed, grab it from
24 | echo.http://sphinx-doc.org/
25 | exit /b 1
26 | )
27 |
28 | %SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O%
29 | goto end
30 |
31 | :help
32 | %SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O%
33 |
34 | :end
35 | popd
36 |
--------------------------------------------------------------------------------
/docs/source/_code/binary-demo.py:
--------------------------------------------------------------------------------
1 | from pypair.association import binary_binary
2 | from pypair.contingency import BinaryTable
3 |
4 | get_data = lambda x, y, n: [(x, y) for _ in range(n)]
5 | data = get_data(1, 1, 207) + get_data(1, 0, 282) + get_data(0, 1, 231) + get_data(0, 0, 242)
6 | a = [a for a, _ in data]
7 | b = [b for _, b in data]
8 |
9 | for m in BinaryTable.measures():
10 | r = binary_binary(a, b, m)
11 | print(f'{r}: {m}')
12 |
13 | print('-' * 15)
14 |
15 | table = BinaryTable(a, b)
16 | for m in table.measures():
17 | r = table.get(m)
18 | print(f'{r}: {m}')
19 |
--------------------------------------------------------------------------------
/docs/source/_code/biserial-demo.py:
--------------------------------------------------------------------------------
1 | from pypair.association import binary_continuous
2 | from pypair.biserial import Biserial
3 |
4 | get_data = lambda x, y, n: [(x, y) for _ in range(n)]
5 | data = get_data(1, 1, 207) + get_data(1, 0, 282) + get_data(0, 1, 231) + get_data(0, 0, 242)
6 | a = [a for a, _ in data]
7 | b = [b for _, b in data]
8 |
9 | for m in Biserial.measures():
10 | r = binary_continuous(a, b, m)
11 | print(f'{r}: {m}')
12 |
13 | print('-' * 15)
14 |
15 | biserial = Biserial(a, b)
16 | for m in biserial.measures():
17 | r = biserial.get(m)
18 | print(f'{r}: {m}')
19 |
--------------------------------------------------------------------------------
/docs/source/_code/categorical-demo.py:
--------------------------------------------------------------------------------
1 | from pypair.association import categorical_categorical
2 | from pypair.contingency import CategoricalTable
3 |
4 | get_data = lambda x, y, n: [(x, y) for _ in range(n)]
5 | data = get_data(1, 1, 207) + get_data(1, 0, 282) + get_data(0, 1, 231) + get_data(0, 0, 242)
6 | a = [a for a, _ in data]
7 | b = [b for _, b in data]
8 |
9 | for m in CategoricalTable.measures():
10 | r = categorical_categorical(a, b, m)
11 | print(f'{r}: {m}')
12 |
13 | print('-' * 15)
14 |
15 | table = CategoricalTable(a, b)
16 | for m in table.measures():
17 | r = table.get(m)
18 | print(f'{r}: {m}')
19 |
--------------------------------------------------------------------------------
/docs/source/_code/concordance-demo.py:
--------------------------------------------------------------------------------
1 | from pypair.association import concordance
2 | from pypair.continuous import Concordance
3 |
4 | a = [1, 2, 3]
5 | b = [3, 2, 1]
6 |
7 | for m in Concordance.measures():
8 | r = concordance(a, b, m)
9 | print(f'{r}: {m}')
10 |
11 | print('-' * 15)
12 |
13 | con = Concordance(a, b)
14 | for m in con.measures():
15 | r = con.get(m)
16 | print(f'{r}: {m}')
17 |
--------------------------------------------------------------------------------
/docs/source/_code/confusion-demo.py:
--------------------------------------------------------------------------------
1 | from pypair.association import confusion
2 | from pypair.contingency import ConfusionMatrix
3 |
4 |
5 | def get_data():
6 | """
7 | Data taken from `here `_.
8 | A pair of binary variables, `a` and `p`, are returned.
9 |
10 | :return: a, p
11 | """
12 | tn = [(0, 0) for _ in range(50)]
13 | fp = [(0, 1) for _ in range(10)]
14 | fn = [(1, 0) for _ in range(5)]
15 | tp = [(1, 1) for _ in range(100)]
16 | data = tn + fp + fn + tp
17 | a = [a for a, _ in data]
18 | p = [b for _, b in data]
19 | return a, p
20 |
21 |
22 | a, p = get_data()
23 |
24 | # if you need to quickly get just one association measure
25 | r = confusion(a, p, measure='acc')
26 | print(r)
27 |
28 | print('-' * 15)
29 |
30 | # you can also get a list of available association measures
31 | # and loop over to call confusion(...)
32 | # this is more convenient, but less fast
33 | for m in ConfusionMatrix.measures():
34 | r = confusion(a, p, m)
35 | print(f'{r}: {m}')
36 |
37 | print('-' * 15)
38 |
39 | # if you need multiple association measures, then
40 | # build the confusion matrix table
41 | # this is less convenient, but much faster
42 | matrix = ConfusionMatrix(a, p)
43 | for m in matrix.measures():
44 | r = matrix.get(m)
45 | print(f'{r}: {m}')
46 |
--------------------------------------------------------------------------------
/docs/source/_code/continuous-demo.py:
--------------------------------------------------------------------------------
1 | from pypair.association import continuous_continuous
2 | from pypair.continuous import Continuous
3 |
4 | x = [x for x in range(10)]
5 | y = [y for y in range(10)]
6 |
7 | for m in Continuous.measures():
8 | r = continuous_continuous(x, y, m)
9 | print(f'{r}: {m}')
10 |
11 | print('-' * 15)
12 |
13 | con = Continuous(x, y)
14 | for m in con.measures():
15 | r = con.get(m)
16 | print(f'{r}: {m}')
17 |
--------------------------------------------------------------------------------
/docs/source/_code/corr-ratio-demo.py:
--------------------------------------------------------------------------------
1 | from pypair.association import categorical_continuous
2 | from pypair.continuous import CorrelationRatio
3 |
4 | data = [
5 | ('a', 45), ('a', 70), ('a', 29), ('a', 15), ('a', 21),
6 | ('g', 40), ('g', 20), ('g', 30), ('g', 42),
7 | ('s', 65), ('s', 95), ('s', 80), ('s', 70), ('s', 85), ('s', 73)
8 | ]
9 | x = [x for x, _ in data]
10 | y = [y for _, y in data]
11 | for m in CorrelationRatio.measures():
12 | r = categorical_continuous(x, y, m)
13 | print(f'{r}: {m}')
14 |
15 | print('-' * 15)
16 |
17 | cr = CorrelationRatio(x, y)
18 | for m in cr.measures():
19 | r = cr.get(m)
20 | print(f'{r}: {m}')
21 |
--------------------------------------------------------------------------------
/docs/source/_code/dataframe-tip.py:
--------------------------------------------------------------------------------
1 | from random import randint
2 |
3 | import pandas as pd
4 |
5 | from pypair.association import binary_binary
6 | from pypair.util import corr
7 |
8 |
9 | def get_data(n_rows=1000, n_cols=5):
10 | data = [tuple([randint(0, 1) for _ in range(n_cols)]) for _ in range(n_rows)]
11 | cols = [f'x{i}' for i in range(n_cols)]
12 | return pd.DataFrame(data, columns=cols)
13 |
14 |
15 | if __name__ == '__main__':
16 | jaccard = lambda a, b: binary_binary(a, b, measure='jaccard')
17 | tanimoto = lambda a, b: binary_binary(a, b, measure='tanimoto_i')
18 |
19 | df = get_data()
20 | jaccard_corr = corr(df, jaccard)
21 | tanimoto_corr = corr(df, tanimoto)
22 |
23 | print(jaccard_corr)
24 | print('-' * 15)
25 | print(tanimoto_corr)
26 |
--------------------------------------------------------------------------------
/docs/source/_code/multiprocessing-tip.py:
--------------------------------------------------------------------------------
1 | import pandas as pd
2 | import numpy as np
3 | import random
4 | from random import randint
5 | from pypair.association import binary_binary
6 | from itertools import combinations
7 | from multiprocessing import Pool
8 |
9 | np.random.seed(37)
10 | random.seed(37)
11 |
12 | def get_data(n_rows=1000, n_cols=5):
13 | data = [tuple([randint(0, 1) for _ in range(n_cols)]) for _ in range(n_rows)]
14 | cols = [f'x{i}' for i in range(n_cols)]
15 | return pd.DataFrame(data, columns=cols)
16 |
17 | def compute(a, b, df):
18 | x = df[a]
19 | y = df[b]
20 | return f'{a}_{b}', binary_binary(x, y, measure='jaccard')
21 |
22 | if __name__ == '__main__':
23 | df = get_data()
24 |
25 | with Pool(10) as pool:
26 | pairs = ((a, b, df) for a, b in combinations(df.columns, 2))
27 | bc = pool.starmap(compute, pairs)
28 |
29 | bc = sorted(bc, key=lambda tup: tup[0])
30 | print(dict(bc))
--------------------------------------------------------------------------------
/docs/source/_code/spark-demo.py:
--------------------------------------------------------------------------------
1 | import json
2 | from random import choice
3 |
4 | import pandas as pd
5 | from pyspark.sql import SparkSession
6 |
7 | from pypair.spark import binary_binary, confusion, categorical_categorical, agreement, binary_continuous, concordance, \
8 | categorical_continuous, continuous_continuous
9 |
10 |
11 | def _get_binary_binary_data(spark):
12 | """
13 | Gets dummy binary-binary data in a Spark dataframe.
14 |
15 | :return: Spark dataframe.
16 | """
17 | get_data = lambda x, y, n: [(x, y) * 2 for _ in range(n)]
18 | data = get_data(1, 1, 207) + get_data(1, 0, 282) + get_data(0, 1, 231) + get_data(0, 0, 242)
19 | pdf = pd.DataFrame(data, columns=['x1', 'x2', 'x3', 'x4'])
20 | sdf = spark.createDataFrame(pdf)
21 | return sdf
22 |
23 |
24 | def _get_confusion_data(spark):
25 | """
26 | Gets dummy binary-binary data in Spark dataframe. For use with confusion matrix analysis.
27 |
28 | :return: Spark dataframe.
29 | """
30 | tn = [(0, 0) * 2 for _ in range(50)]
31 | fp = [(0, 1) * 2 for _ in range(10)]
32 | fn = [(1, 0) * 2 for _ in range(5)]
33 | tp = [(1, 1) * 2 for _ in range(100)]
34 | data = tn + fp + fn + tp
35 | pdf = pd.DataFrame(data, columns=['x1', 'x2', 'x3', 'x4'])
36 | sdf = spark.createDataFrame(pdf)
37 | return sdf
38 |
39 |
40 | def _get_categorical_categorical_data(spark):
41 | """
42 | Gets dummy categorical-categorical data in Spark dataframe.
43 |
44 | :return: Spark dataframe.
45 | """
46 | x_domain = ['a', 'b', 'c']
47 | y_domain = ['a', 'b']
48 |
49 | get_x = lambda: choice(x_domain)
50 | get_y = lambda: choice(y_domain)
51 | get_data = lambda: {f'x{i}': v for i, v in enumerate((get_x(), get_y(), get_x(), get_y()))}
52 |
53 | pdf = pd.DataFrame([get_data() for _ in range(100)])
54 | sdf = spark.createDataFrame(pdf)
55 | return sdf
56 |
57 |
58 | def _get_binary_continuous_data(spark):
59 | """
60 | Gets dummy `binary-continuous data `_.
61 |
62 | :return: Spark dataframe.
63 | """
64 | data = [
65 | (1, 10), (1, 11), (1, 6), (1, 11), (0, 4),
66 | (0, 3), (1, 12), (0, 2), (0, 2), (0, 1)
67 | ]
68 | pdf = pd.DataFrame(data, columns=['gender', 'years'])
69 | sdf = spark.createDataFrame(pdf)
70 | return sdf
71 |
72 |
73 | def _get_concordance_data(spark):
74 | """
75 | Gets dummy concordance data.
76 |
77 | :return: Spark dataframe.
78 | """
79 | a = [1, 2, 3]
80 | b = [3, 2, 1]
81 | pdf = pd.DataFrame({'a': a, 'b': b, 'c': a, 'd': b})
82 | sdf = spark.createDataFrame(pdf)
83 | return sdf
84 |
85 |
86 | def _get_categorical_continuous_data(spark):
87 | data = [
88 | ('a', 45), ('a', 70), ('a', 29), ('a', 15), ('a', 21),
89 | ('g', 40), ('g', 20), ('g', 30), ('g', 42),
90 | ('s', 65), ('s', 95), ('s', 80), ('s', 70), ('s', 85), ('s', 73)
91 | ]
92 | data = [tup * 2 for tup in data]
93 | pdf = pd.DataFrame(data, columns=['x1', 'x2', 'x3', 'x4'])
94 | sdf = spark.createDataFrame(pdf)
95 | return sdf
96 |
97 |
98 | def _get_continuous_continuous_data(spark):
99 | """
100 | Gets dummy continuous-continuous data.
101 | See `site `_.
102 |
103 | :return: Spark dataframe.
104 | """
105 | data = [
106 | (12, 9),
107 | (10, 12),
108 | (9, 12),
109 | (14, 11),
110 | (10, 8),
111 | (11, 9),
112 | (10, 9),
113 | (10, 6),
114 | (14, 12),
115 | (9, 11),
116 | (11, 12),
117 | (10, 7),
118 | (11, 13),
119 | (15, 14),
120 | (8, 11),
121 | (11, 11),
122 | (9, 8),
123 | (9, 9),
124 | (10, 11),
125 | (12, 9),
126 | (11, 12),
127 | (10, 12),
128 | (9, 7),
129 | (7, 9),
130 | (12, 14)
131 | ]
132 | pdf = pd.DataFrame([item * 2 for item in data], columns=['x1', 'x2', 'x3', 'x4'])
133 | sdf = spark.createDataFrame(pdf)
134 | return sdf
135 |
136 |
137 | spark = None
138 |
139 | try:
140 | # create a spark session
141 | spark = (SparkSession.builder
142 | .master('local[4]')
143 | .appName('local-testing-pyspark')
144 | .getOrCreate())
145 |
146 | # create some spark dataframes
147 | bin_sdf = _get_binary_binary_data(spark)
148 | con_sdf = _get_confusion_data(spark)
149 | cat_sdf = _get_categorical_categorical_data(spark)
150 | bcn_sdf = _get_binary_continuous_data(spark)
151 | crd_sdf = _get_concordance_data(spark)
152 | ccn_sdf = _get_categorical_continuous_data(spark)
153 | cnt_sdf = _get_continuous_continuous_data(spark)
154 |
155 | # call these methods to get the association measures
156 | bin_results = binary_binary(bin_sdf).collect()
157 | con_results = confusion(con_sdf).collect()
158 | cat_results = categorical_categorical(cat_sdf).collect()
159 | agr_results = agreement(bin_sdf).collect()
160 | bcn_results = binary_continuous(bcn_sdf, binary=['gender'], continuous=['years']).collect()
161 | crd_results = concordance(crd_sdf).collect()
162 | ccn_results = categorical_continuous(ccn_sdf, ['x1', 'x3'], ['x2', 'x4']).collect()
163 | cnt_results = continuous_continuous(cnt_sdf).collect()
164 |
165 | # convert the lists to dictionaries
166 | bin_results = {tup[0]: tup[1] for tup in bin_results}
167 | con_results = {tup[0]: tup[1] for tup in con_results}
168 | cat_results = {tup[0]: tup[1] for tup in cat_results}
169 | agr_results = {tup[0]: tup[1] for tup in agr_results}
170 | bcn_results = {tup[0]: tup[1] for tup in bcn_results}
171 | crd_results = {tup[0]: tup[1] for tup in crd_results}
172 | ccn_results = {tup[0]: tup[1] for tup in ccn_results}
173 | cnt_results = {tup[0]: tup[1] for tup in cnt_results}
174 |
175 | # pretty print
176 | to_json = lambda r: json.dumps({f'{k[0]}_{k[1]}': v for k, v in r.items()}, indent=1)
177 | print(to_json(bin_results))
178 | print('-' * 10)
179 | print(to_json(con_results))
180 | print('*' * 10)
181 | print(to_json(cat_results))
182 | print('~' * 10)
183 | print(to_json(agr_results))
184 | print('-' * 10)
185 | print(to_json(bcn_results))
186 | print('=' * 10)
187 | print(to_json(crd_results))
188 | print('`' * 10)
189 | print(to_json(ccn_results))
190 | print('/' * 10)
191 | print(to_json(cnt_results))
192 | except Exception as e:
193 | print(e)
194 | finally:
195 | try:
196 | spark.stop()
197 | print('closed spark')
198 | except Exception as e:
199 | print(e)
200 |
--------------------------------------------------------------------------------
/docs/source/_logo/logo-1000.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/oneoffcoder/py-pair/79e6e75c32333fc9421aad36c02c2e1043ae4a05/docs/source/_logo/logo-1000.png
--------------------------------------------------------------------------------
/docs/source/_logo/logo-150.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/oneoffcoder/py-pair/79e6e75c32333fc9421aad36c02c2e1043ae4a05/docs/source/_logo/logo-150.png
--------------------------------------------------------------------------------
/docs/source/_logo/logo-250.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/oneoffcoder/py-pair/79e6e75c32333fc9421aad36c02c2e1043ae4a05/docs/source/_logo/logo-250.png
--------------------------------------------------------------------------------
/docs/source/_logo/logo-50.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/oneoffcoder/py-pair/79e6e75c32333fc9421aad36c02c2e1043ae4a05/docs/source/_logo/logo-50.png
--------------------------------------------------------------------------------
/docs/source/_logo/logo-500.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/oneoffcoder/py-pair/79e6e75c32333fc9421aad36c02c2e1043ae4a05/docs/source/_logo/logo-500.png
--------------------------------------------------------------------------------
/docs/source/_static/.gitkeep:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/oneoffcoder/py-pair/79e6e75c32333fc9421aad36c02c2e1043ae4a05/docs/source/_static/.gitkeep
--------------------------------------------------------------------------------
/docs/source/_static/css/override.css:
--------------------------------------------------------------------------------
1 | table.expand {
2 | width: 100%;
3 | }
4 | table.rc-headers, th.rc-headers, td.rc-headers {
5 | border: 1px dashed blue;
6 | border-collapse: collapse;
7 | padding: 5px;
8 | }
9 | th.heading, td.heading {
10 | font-weight: bold;
11 | }
--------------------------------------------------------------------------------
/docs/source/_static/favicon.ico:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/oneoffcoder/py-pair/79e6e75c32333fc9421aad36c02c2e1043ae4a05/docs/source/_static/favicon.ico
--------------------------------------------------------------------------------
/docs/source/_static/images/logo-small.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/oneoffcoder/py-pair/79e6e75c32333fc9421aad36c02c2e1043ae4a05/docs/source/_static/images/logo-small.png
--------------------------------------------------------------------------------
/docs/source/_static/images/logo.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/oneoffcoder/py-pair/79e6e75c32333fc9421aad36c02c2e1043ae4a05/docs/source/_static/images/logo.png
--------------------------------------------------------------------------------
/docs/source/_static/images/ooc-logo.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/oneoffcoder/py-pair/79e6e75c32333fc9421aad36c02c2e1043ae4a05/docs/source/_static/images/ooc-logo.png
--------------------------------------------------------------------------------
/docs/source/_static/images/ooc-small.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/oneoffcoder/py-pair/79e6e75c32333fc9421aad36c02c2e1043ae4a05/docs/source/_static/images/ooc-small.png
--------------------------------------------------------------------------------
/docs/source/_templates/.gitkeep:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/oneoffcoder/py-pair/79e6e75c32333fc9421aad36c02c2e1043ae4a05/docs/source/_templates/.gitkeep
--------------------------------------------------------------------------------
/docs/source/conf.py:
--------------------------------------------------------------------------------
1 | # Configuration file for the Sphinx documentation builder.
2 | #
3 | # This file only contains a selection of the most common options. For a full
4 | # list see the documentation:
5 | # https://www.sphinx-doc.org/en/master/usage/configuration.html
6 |
7 | # -- Path setup --------------------------------------------------------------
8 |
9 | # If extensions (or modules to document with autodoc) are in another directory,
10 | # add these directories to sys.path here. If the directory is relative to the
11 | # documentation root, use os.path.abspath to make it absolute, like shown here.
12 | #
13 | import os
14 | import sys
15 | sys.path.insert(0, os.path.abspath('../../'))
16 |
17 |
18 | # -- Project information -----------------------------------------------------
19 |
20 | project = 'PyPair'
21 | copyright = '2020, One-Off Coder'
22 | author = 'Jee Vang'
23 |
24 | # The full version, including alpha/beta/rc tags
25 | release = '3.0.8'
26 |
27 |
28 | # -- General configuration ---------------------------------------------------
29 |
30 | # Add any Sphinx extension module names here, as strings. They can be
31 | # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom
32 | # ones.
33 | extensions = [
34 | 'sphinx.ext.autodoc',
35 | 'sphinx.ext.doctest',
36 | 'sphinx.ext.todo',
37 | 'sphinx.ext.coverage',
38 | 'sphinx.ext.mathjax',
39 | 'sphinx.ext.githubpages',
40 | 'sphinxcontrib.bibtex',
41 | 'sphinxcontrib.blockdiag',
42 | 'sphinx_sitemap'
43 | ]
44 |
45 | bibtex_bibfiles = ['refs.bib']
46 |
47 | # Add any paths that contain templates here, relative to this directory.
48 | templates_path = ['_templates']
49 |
50 | # List of patterns, relative to source directory, that match files and
51 | # directories to ignore when looking for source files.
52 | # This pattern also affects html_static_path and html_extra_path.
53 | exclude_patterns = []
54 |
55 |
56 | # -- Options for HTML output -------------------------------------------------
57 |
58 | # The theme to use for HTML and HTML Help pages. See the documentation for
59 | # a list of builtin themes.
60 | #
61 | html_theme = 'sphinx_rtd_theme'
62 |
63 | # Add any paths that contain custom static files (such as style sheets) here,
64 | # relative to this directory. They are copied after the builtin static files,
65 | # so a file named "default.css" will overwrite the builtin "default.css".
66 | html_static_path = ['_static']
67 | html_css_files = [
68 | 'css/override.css',
69 | ]
70 | html_show_sourcelink = False
71 | html_show_sphinx = False
72 | html_last_updated_fmt = '%b %d, %Y, %X'
73 | html_logo = '_static/images/logo-small.png'
74 | html_favicon = '_static/favicon.ico'
75 | html_theme_options = {
76 | 'canonical_url': 'https://py-pair.readthedocs.io/',
77 | 'analytics_id': 'UA-150762273-1', # Provided by Google in your dashboard
78 | 'logo_only': False,
79 | 'display_version': True,
80 | 'prev_next_buttons_location': 'bottom',
81 | 'style_external_links': True,
82 | 'style_nav_header_background': '#0085CA',
83 | # Toc options
84 | 'collapse_navigation': True,
85 | 'sticky_navigation': True,
86 | 'navigation_depth': 4,
87 | 'includehidden': True,
88 | 'titles_only': False
89 | }
--------------------------------------------------------------------------------
/docs/source/deepdives.rst:
--------------------------------------------------------------------------------
1 | Selected Deep Dives
2 | ===================
3 |
4 | Let's go into some association measures in more detail.
5 |
6 | Binary association
7 | ------------------
8 |
9 | The association between binary variables has been studied prolifically over the last 100 years :cite:`2010:choi,1970:cox,1984:reynolds,2019:warrens,2020:ibm-proximities`. A binary variable has only two values. It is typical to re-encode these values into 0 or 1. How and why each of these two values is mapped to 0 or 1 is subjective, arbitrary and/or context-specific. For example, if we have a variable that captures the handedness of a person, favoring the left or right hand, we could map left to 0 and right to 1, or left to 1 and right to 0. The 0-1 value representation of a binary variable's values is the common foundation for understanding association. Below is a contingency table created from two binary variables. Notice that the main values of the table are `a`, `b`, `c` and `d`.
10 |
11 | - :math:`a = N_{11}` is the count of when the two variables have a value of 1
12 | - :math:`b = N_{10}` is the count of when the row variable has a value of 1 and the column variable has a value of 0
13 | - :math:`c = N_{01}` is the count of when the row variable has a value of 0 and the column variable has a value of 1
14 | - :math:`d = N_{00}` is the count of when the two variables have a value of 0
15 |
16 | Also, look at how the table is structured with the value 1 coming before the value 0 in both the rows and columns.
17 |
18 | .. list-table:: Contingency table for two binary variables
19 |
20 | * -
21 | - 1
22 | - 0
23 | - Total
24 | * - 1
25 | - a
26 | - b
27 | - a + b
28 | * - 0
29 | - c
30 | - d
31 | - c + d
32 | * - Total
33 | - a + c
34 | - b + d
35 | - n = a + b + c + d
36 |
37 | Note that a and d are `matches` and b and c are `mismatches`. Sometimes, depending on the context, matching on 0 is not considered a match. For example, if 1 is the presence of something and 0 is the absence, then an observation of absence and absence does not really feel right to consider as a match (you cannot say two things match on what is not there). Additionally, when 1 is presence and 0 is absence, and the data is very sparse (a lot of 0's compared to 1's), considering absence and absence as matching will make it appear that the two variables are very similar.
38 |
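As a quick illustration of how these four counts arise, below is a minimal sketch (plain Python, with made-up 0-1 lists purely for illustration) that tallies `a`, `b`, `c` and `d` directly from a pair of binary variables.

.. code-block:: python

    # a minimal sketch: tally a, b, c and d from two 0-1 encoded variables
    # the x and y values below are made up purely for illustration
    x = [1, 1, 1, 0, 0, 1, 0, 0, 1, 1]
    y = [1, 0, 1, 0, 1, 1, 0, 0, 1, 0]

    a = sum(1 for u, v in zip(x, y) if u == 1 and v == 1)  # both are 1
    b = sum(1 for u, v in zip(x, y) if u == 1 and v == 0)  # row 1, column 0
    c = sum(1 for u, v in zip(x, y) if u == 0 and v == 1)  # row 0, column 1
    d = sum(1 for u, v in zip(x, y) if u == 0 and v == 0)  # both are 0

    n = a + b + c + d
    print(a, b, c, d, n)
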
39 | In :cite:`2010:choi`, 76 similarity and distance measures are identified (some are duplicates and/or redundant). Similarity is how `alike` two things are, and distance is how `different` two things are; in other words, similarity is how close two things are and distance is how far apart they are. If a similarity or distance measure produces a value in :math:`[0, 1]`, then we can convert between the two easily, as sketched below.
40 |
41 | - If :math:`s` is the similarity, then :math:`d = 1 - s` is the distance.
42 | - If :math:`d` is the distance, then :math:`s = 1 - d` is the similarity.
43 |
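For example, using this package's own `binary_binary` function (demonstrated throughout this documentation), a minimal sketch of flipping a Jaccard similarity into its distance might look like the following; the binary data is made up for illustration.

.. code-block:: python

    from pypair.association import binary_binary

    # made-up binary data, purely for illustration
    x = [1, 1, 0, 0, 1, 0, 1, 1]
    y = [1, 0, 0, 1, 1, 0, 1, 0]

    s = binary_binary(x, y, measure='jaccard')  # similarity in [0, 1]
    d = 1 - s                                   # the corresponding distance
    print(s, d)
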
44 | If we use a contingency table to summarize bivariate binary data, the following similarity and distance measures may be derived entirely from `a`, `b`, `c` and/or `d`. The general pattern is that each similarity and distance measure is a ratio. The numerator in the ratio defines what we are interested in measuring. When we have `a` and/or `d` in the numerator, it is likely we are measuring similarity; when we have `b` and/or `c` in the numerator, it is likely we are measuring distance. The denominator reflects what is deemed important: the matches, the mismatches or both. The following tables list some identified similarity and distance measures based on 2 x 2 contingency tables.
45 |
46 | .. list-table:: Similarity measures for 2 x 2 contingency table :cite:`2010:choi,2019:warrens,2020:psu-binary`
47 | :header-rows: 1
48 |
49 | * - Name
50 | - Computation
51 | * - 3W-Jaccard
52 | - :math:`\frac{3a}{3a+b+c}`
53 | * - Ample
54 | - :math:`\left|\frac{a(c+d)}{c(a+b)}\right|`
55 | * - Anderberg
56 | - :math:`\frac{\sigma-\sigma'}{2n}`
57 | * - Baroni-Urbani-Buser-I
58 | - :math:`\frac{\sqrt{ad}+a}{\sqrt{ad}+a+b+c}`
59 | * - Baroni-Urbani-Buser-II
60 | - :math:`\frac{\sqrt{ad}+a-(b+c)}{\sqrt{ad}+a+b+c}`
61 | * - Braun-Banquet
62 | - :math:`\frac{a}{\max(a+b,a+c)}`
63 | * - Cole :cite:`2010:choi,2019:warrens`
64 | - :math:`\frac{\sqrt{2}(ad-bc)}{\sqrt{(ad-bc)^2-(a+b)(a+c)(b+d)(c+d)}}`
65 | * -
66 | - :math:`\frac{ad-bc}{\min((a+b)(a+c),(b+d)(c+d))}`
67 | * - Cosine
68 | - :math:`\frac{a}{(a+b)(a+c)}`
69 | * - Dennis
70 | - :math:`\frac{ad-bc}{\sqrt{n(a+b)(a+c)}}`
71 | * - Dice; Czekanowski; Nei-Li
72 | - :math:`\frac{2a}{2a+b+c}`
73 | * - Dispersion
74 | - :math:`\frac{ad-bc}{(a+b+c+d)^2}`
75 | * - Driver-Kroeber
76 | - :math:`\frac{a}{2}\left(\frac{1}{a+b}+\frac{1}{a+c}\right)`
77 | * - Eyraud
78 | - :math:`\frac{n^2(na-(a+b)(a+c))}{(a+b)(a+c)(b+d)(c+d)}`
79 | * - Fager-McGowan
80 | - :math:`\frac{a}{\sqrt{(a+b)(a+c)}}-\frac{\max(a+b,a+c)}{2}`
81 | * - Faith
82 | - :math:`\frac{a+0.5d}{a+b+c+d}`
83 | * - Forbes-II
84 | - :math:`\frac{na-(a+b)(a+c)}{n \min(a+b,a+c) - (a+b)(a+c)}`
85 | * - Forbesi
86 | - :math:`\frac{na}{(a+b)(a+c)}`
87 | * - Fossum
88 | - :math:`\frac{n(a-0.5)^2}{(a+b)(a+c)}`
89 | * - Gilbert-Wells
90 | - :math:`\log a - \log n - \log \frac{a+b}{n} - \log \frac{a+c}{n}`
91 | * - Goodman-Kruskal
92 | - :math:`\frac{\sigma - \sigma'}{2n-\sigma'}`
93 | * -
94 | - :math:`\sigma=\max(a,b)+\max(c,d)+\max(a,c)+\max(b,d)`
95 | * -
96 | - :math:`\sigma'=\max(a+c,b+d)+\max(a+b,c+d)`
97 | * - Gower
98 | - :math:`\frac{a+d}{\sqrt{(a+b)(a+c)(b+d)(c+d)}}`
99 | * - Gower-Legendre
100 | - :math:`\frac{a+d}{a+0.5b+0.5c+d}`
101 | * - Hamann
102 | - :math:`\frac{(a+d)-(b+c)}{a+b+c+d}`
103 | * - Inner Product
104 | - :math:`a+d`
105 | * - Intersection
106 | - :math:`a`
107 | * - Jaccard :cite:`2020:wiki-jaccard`
108 | - :math:`\frac{a}{a+b+c}`
109 | * - Johnson
110 | - :math:`\frac{a}{a+b}+\frac{a}{a+c}`
111 | * - Kulczynski-I
112 | - :math:`\frac{a}{b+c}`
113 | * - Kulczynski-II
114 | - :math:`\frac{0.5a(2a+b+c)}{(a+b)(a+c)}`
115 | * -
116 | - :math:`\frac{1}{2}\left(\frac{a}{a + b} + \frac{a}{a + c}\right)`
117 | * - McConnaughey
118 | - :math:`\frac{a^2 - bc}{(a+b)(a+c)}`
119 | * - Michael
120 | - :math:`\frac{4(ad-bc)}{(a+d)^2+(b+c)^2}`
121 | * - Mountford
122 | - :math:`\frac{a}{0.5(ab + ac) + bc}`
123 | * - Ochiai-I :cite:`2020:stack-sim`; Otsuka; Fowlkes-Mallows Index :cite:`2020:wiki-fowlkes`
124 | - :math:`\frac{a}{\sqrt{(a+b)(a+c)}}`
125 | * -
126 | - :math:`\sqrt{\frac{a}{a + b}\frac{a}{a + c}}`
127 | * - Ochiai-II
128 | - :math:`\frac{ad}{\sqrt{(a+b)(a+c)(b+d)(c+d)}}`
129 | * - Pearson-Heron-I
130 | - :math:`\frac{ad-bc}{\sqrt{(a+b)(a+c)(b+d)(c+d)}}`
131 | * - Pearson-Heron-II
132 | - :math:`\cos\left(\frac{\pi \sqrt{bc}}{\sqrt{ad}+\sqrt{bc}}\right)`
133 | * - Pearson-I
134 | - :math:`\chi^2=\frac{n(ad-bc)^2}{(a+b)(a+c)(c+d)(b+d)}`
135 | * - Pearson-II
136 | - :math:`\sqrt{\frac{\chi^2}{n+\chi^2}}`
137 | * - Pearson-III
138 | - :math:`\sqrt{\frac{\rho}{n+\rho}}`
139 | * -
140 | - :math:`\rho=\frac{ad-bc}{\sqrt{(a+b)(a+c)(b+d)(c+d)}}`
141 | * - Peirce
142 | - :math:`\frac{ab+bc}{ab+2bc+cd}`
143 | * - Roger-Tanimoto
144 | - :math:`\frac{a+d}{a+2b+2c+d}`
145 | * - Russell-Rao
146 | - :math:`\frac{a}{a+b+c+d}`
147 | * - Simpson; Overlap :cite:`2020:wiki-overlap`
148 | - :math:`\frac{a}{\min(a+b,a+c)}`
149 | * - Sokal-Michener; Rand Index
150 | - :math:`\frac{a+d}{a+b+c+d}`
151 | * - Sokal-Sneath-I
152 | - :math:`\frac{a}{a+2b+2c}`
153 | * - Sokal-Sneath-II
154 | - :math:`\frac{2a+2d}{2a+b+c+2d}`
155 | * - Sokal-Sneath-III
156 | - :math:`\frac{a+d}{b+c}`
157 | * - Sokal-Sneath-IV
158 | - :math:`\frac{1}{4}\left(\frac{a}{a+b}+\frac{a}{a+c}+\frac{d}{b+d}+\frac{d}{c+d}\right)`
159 | * - Sokal-Sneath-V
160 | - :math:`\frac{ad}{(a+b)(a+c)(b+d)\sqrt{c+d}}`
161 | * - Sørensen–Dice :cite:`2020:wiki-dice`
162 | - :math:`\frac{2(a + d)}{2(a + d) + b + c}`
163 | * - Sorgenfrei
164 | - :math:`\frac{a^2}{(a+b)(a+c)}`
165 | * - Stiles
166 | - :math:`\log_{10} \frac{n\left(|ad-bc|-\frac{n}{2}\right)^2}{(a+b)(a+c)(b+d)(c+d)}`
167 | * - Tanimoto-I
168 | - :math:`\frac{a}{2a+b+c}`
169 | * - Tanimoto-II :cite:`2020:wiki-jaccard`
170 | - :math:`\frac{a}{b + c}`
171 | * - Tarwid
172 | - :math:`\frac{na - (a+b)(a+c)}{na + (a+b)(a+c)}`
173 | * - Tarantula
174 | - :math:`\frac{a(c+d)}{c(a+b)}`
175 | * - Tetrachoric
176 | - :math:`\frac{y-1}{y+1}`
177 | * -
178 | - :math:`y = \left(\frac{ad}{bc}\right)^{\frac{\pi}{4}}`
179 | * - Tversky Index :cite:`2020:wiki-tversky`
180 | - :math:`\frac{a}{a+\theta b+ \phi c}`
181 | * -
182 | - :math:`\theta` and :math:`\phi` are user-supplied parameters
183 | * - Yule-Q
184 | - :math:`\frac{ad-bc}{ad+bc}`
185 | * - Yule-w
186 | - :math:`\frac{\sqrt{ad}-\sqrt{bc}}{\sqrt{ad}+\sqrt{bc}}`
187 |
188 | .. list-table:: Distance measures for 2 x 2 contingency table :cite:`2010:choi`
189 | :header-rows: 1
190 |
191 | * - Name
192 | - Computation
193 | * - Chord
194 | - :math:`\sqrt{2\left(1 - \frac{a}{\sqrt{(a+b)(a+c)}}\right)}`
195 | * - Euclid
196 | - :math:`\sqrt{b+c}`
197 | * - Hamming; Canberra; Manhattan; Cityblock; Minkowski
198 | - :math:`b+c`
199 | * - Hellinger
200 | - :math:`2\sqrt{1 - \frac{a}{\sqrt{(a+b)(a+c)}}}`
201 | * - Jaccard distance :cite:`2020:wiki-jaccard`
202 | - :math:`\frac{b + c}{a + b + c}`
203 | * - Lance-Williams; Bray-Curtis
204 | - :math:`\frac{b+c}{2a+b+c}`
205 | * - Mean-Manhattan
206 | - :math:`\frac{b+c}{a+b+c+d}`
207 | * - Pattern Difference
208 | - :math:`\frac{4bc}{(a+b+c+d)^2}`
209 | * - Shape Difference
210 | - :math:`\frac{n(b+c)-(b-c)^2}{(a+b+c+d)^2}`
211 | * - Size Difference
212 | - :math:`\frac{(b+c)^2}{(a+b+c+d)^2}`
213 | * - Squared-Euclid
214 | - :math:`\sqrt{(b+c)^2}`
215 | * - Vari
216 | - :math:`\frac{b+c}{4a+4b+4c+4d}`
217 | * - Yule-Q
218 | - :math:`\frac{2bc}{ad+bc}`
219 |
220 | Instead of using `a`, `b`, `c` and `d` from a contingency table to define these association measures, it is common to use set notation, where :math:`X` and :math:`Y` denote the sets of observations for which each binary variable is 1. For two binary variables, :math:`X` and :math:`Y`, the following are equivalent.
221 |
222 | - :math:`|X \cap Y| = a`
223 | - :math:`|X \setminus Y| = b`
224 | - :math:`|Y \setminus X| = c`
225 | - :math:`|X \cup Y| = a + b + c`
226 |
227 | You will notice that `d` does not show up in these relationships; `d` counts joint absences, which cannot be expressed through unions, intersections or differences of :math:`X` and :math:`Y`.
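
To make the mapping concrete, here is a minimal sketch (plain Python, with made-up binary vectors) that tallies `a`, `b`, `c` and `d` from raw data and evaluates a few of the measures above.

.. code:: python

    x = [1, 1, 0, 1, 0, 0]
    y = [1, 0, 0, 1, 1, 0]

    # Tally the cells of the 2 x 2 contingency table.
    a = sum(1 for u, v in zip(x, y) if u == 1 and v == 1)  # both present
    b = sum(1 for u, v in zip(x, y) if u == 1 and v == 0)  # only x
    c = sum(1 for u, v in zip(x, y) if u == 0 and v == 1)  # only y
    d = sum(1 for u, v in zip(x, y) if u == 0 and v == 0)  # joint absences

    jaccard = a / (a + b + c)                   # similarity; ignores d
    sokal_michener = (a + d) / (a + b + c + d)  # similarity; uses d
    hamming = b + c                             # distance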
228 |
229 | Concordant, discordant, tie
230 | ---------------------------
231 |
232 | Let's try to understand how to determine whether a pair of observations is concordant, discordant or tied. We have made up an example dataset below with two variables :math:`X` and :math:`Y`. Note that there are 6 observations, and as such, each observation is associated with an index from 1 to 6. An observation has a pair of values, one for :math:`X` and one for :math:`Y`.
233 |
234 | .. warning::
235 | Do **not** get the `pair of values of an observation` confused with a `pair of observations`.
236 |
237 | .. list-table:: Raw Data for :math:`X` and :math:`Y`
238 | :header-rows: 1
239 |
240 | * - Index
241 | - :math:`X`
242 | - :math:`Y`
243 | * - 1
244 | - 1
245 | - 3
246 | * - 2
247 | - 1
248 | - 3
249 | * - 3
250 | - 2
251 | - 4
252 | * - 4
253 | - 0
254 | - 2
255 | * - 5
256 | - 0
257 | - 4
258 | * - 6
259 | - 2
260 | - 2
261 |
262 | Because there are 6 observations, there are :math:`{{6}\choose{2}} = 15` possible pairs of observations. If we denote an observation by its corresponding index as :math:`O_i`, then the observations are as follows.
263 |
264 | - :math:`O_1 = (1, 3)`
265 | - :math:`O_2 = (1, 3)`
266 | - :math:`O_3 = (2, 4)`
267 | - :math:`O_4 = (0, 2)`
268 | - :math:`O_5 = (0, 4)`
269 | - :math:`O_6 = (2, 2)`
270 |
271 | The 15 possible `combinations` of observation pairings are as follows.
272 |
273 | - :math:`O_1, O_2`
274 | - :math:`O_1, O_3`
275 | - :math:`O_1, O_4`
276 | - :math:`O_1, O_5`
277 | - :math:`O_1, O_6`
278 | - :math:`O_2, O_3`
279 | - :math:`O_2, O_4`
280 | - :math:`O_2, O_5`
281 | - :math:`O_2, O_6`
282 | - :math:`O_3, O_4`
283 | - :math:`O_3, O_5`
284 | - :math:`O_3, O_6`
285 | - :math:`O_4, O_5`
286 | - :math:`O_4, O_6`
287 | - :math:`O_5, O_6`
288 |
289 | For each one of these observation pairs, we can determine whether the pair is concordant, discordant or tied. There are a couple of ways to make this determination: mathematically, or with rules. The two are equivalent. Because we will use abstract notation to describe the math and the rules, and because we are striving for clarity, let's expand these observation pairs into their component pairs of values along with the corresponding :math:`X` and :math:`Y` indexed notation.
290 |
291 | - :math:`O_1, O_2 = (1, 3), (1, 3) = (X_1, Y_1), (X_2, Y_2)`
292 | - :math:`O_1, O_3 = (1, 3), (2, 4) = (X_1, Y_1), (X_3, Y_3)`
293 | - :math:`O_1, O_4 = (1, 3), (0, 2) = (X_1, Y_1), (X_4, Y_4)`
294 | - :math:`O_1, O_5 = (1, 3), (0, 4) = (X_1, Y_1), (X_5, Y_5)`
295 | - :math:`O_1, O_6 = (1, 3), (2, 2) = (X_1, Y_1), (X_6, Y_6)`
296 | - :math:`O_2, O_3 = (1, 3), (2, 4) = (X_2, Y_2), (X_3, Y_3)`
297 | - :math:`O_2, O_4 = (1, 3), (0, 2) = (X_2, Y_2), (X_4, Y_4)`
298 | - :math:`O_2, O_5 = (1, 3), (0, 4) = (X_2, Y_2), (X_5, Y_5)`
299 | - :math:`O_2, O_6 = (1, 3), (2, 2) = (X_2, Y_2), (X_6, Y_6)`
300 | - :math:`O_3, O_4 = (2, 4), (0, 2) = (X_3, Y_3), (X_4, Y_4)`
301 | - :math:`O_3, O_5 = (2, 4), (0, 4) = (X_3, Y_3), (X_5, Y_5)`
302 | - :math:`O_3, O_6 = (2, 4), (2, 2) = (X_3, Y_3), (X_6, Y_6)`
303 | - :math:`O_4, O_5 = (0, 2), (0, 4) = (X_4, Y_4), (X_5, Y_5)`
304 | - :math:`O_4, O_6 = (0, 2), (2, 2) = (X_4, Y_4), (X_6, Y_6)`
305 | - :math:`O_5, O_6 = (0, 4), (2, 2) = (X_5, Y_5), (X_6, Y_6)`
306 |
307 | Now we can describe how to determine whether any pair of observations is concordant, discordant or tied. If we want to use math, then, for any two observations :math:`(X_i, Y_i)` and :math:`(X_j, Y_j)`, the following determines the status.
308 |
309 | - concordant when :math:`(X_j - X_i)(Y_j - Y_i) > 0`
310 | - discordant when :math:`(X_j - X_i)(Y_j - Y_i) < 0`
311 | - tied when :math:`(X_j - X_i)(Y_j - Y_i) = 0`
312 |
313 | If we prefer rules, then the following equivalent conditions determine the status (a sketch implementing both follows the list).
314 |
315 | - concordant if :math:`X_i < X_j` and :math:`Y_i < Y_j` **or** :math:`X_i > X_j` and :math:`Y_i > Y_j`
316 | - discordant if :math:`X_i < X_j` and :math:`Y_i > Y_j` **or** :math:`X_i > X_j` and :math:`Y_i < Y_j`
317 | - tied if :math:`X_i = X_j` **or** :math:`Y_i = Y_j`
318 |
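Here is a minimal sketch (plain Python, reusing the dummy data above) that classifies every pair with the sign of the product; it also distinguishes the tie types discussed further below.

.. code:: python

    from itertools import combinations

    xs = [1, 1, 2, 0, 0, 2]
    ys = [3, 3, 4, 2, 4, 2]

    C = D = T_X = T_Y = T_XY = 0
    for (x_i, y_i), (x_j, y_j) in combinations(zip(xs, ys), 2):
        p = (x_j - x_i) * (y_j - y_i)
        if p > 0:
            C += 1                            # concordant
        elif p < 0:
            D += 1                            # discordant
        elif x_i == x_j and y_i == y_j:
            T_XY += 1                         # tied on both X and Y
        elif x_i == x_j:
            T_X += 1                          # tied only on X
        else:
            T_Y += 1                          # tied only on Y

    print(C, D, T_X, T_Y, T_XY)  # 5 5 2 2 1
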
319 | Every pair of observations evaluates categorically to exactly one of these statuses. Continuing with our dummy data above, the concordancy statuses of the 15 pairs of observations are as follows (where concordant is C, discordant is D and tied is T).
320 |
321 | .. list-table:: Concordancy Status
322 | :header-rows: 1
323 |
324 | * - :math:`(X_i, Y_i)`
325 | - :math:`(X_j, Y_j)`
326 | - status
327 | * - :math:`(1, 3)`
328 | - :math:`(1, 3)`
329 | - T
330 | * - :math:`(1, 3)`
331 | - :math:`(2, 4)`
332 | - C
333 | * - :math:`(1, 3)`
334 | - :math:`(0, 2)`
335 | - C
336 | * - :math:`(1, 3)`
337 | - :math:`(0, 4)`
338 | - D
339 | * - :math:`(1, 3)`
340 | - :math:`(2, 2)`
341 | - D
342 | * - :math:`(1, 3)`
343 | - :math:`(2, 4)`
344 | - C
345 | * - :math:`(1, 3)`
346 | - :math:`(0, 2)`
347 | - C
348 | * - :math:`(1, 3)`
349 | - :math:`(0, 4)`
350 | - D
351 | * - :math:`(1, 3)`
352 | - :math:`(2, 2)`
353 | - D
354 | * - :math:`(2, 4)`
355 | - :math:`(0, 2)`
356 | - C
357 | * - :math:`(2, 4)`
358 | - :math:`(0, 4)`
359 | - T
360 | * - :math:`(2, 4)`
361 | - :math:`(2, 2)`
362 | - T
363 | * - :math:`(0, 2)`
364 | - :math:`(0, 4)`
365 | - T
366 | * - :math:`(0, 2)`
367 | - :math:`(2, 2)`
368 | - T
369 | * - :math:`(0, 4)`
370 | - :math:`(2, 2)`
371 | - D
372 |
373 | In this data set, the counts are :math:`C=5`, :math:`D=5` and :math:`T=5`. If we divide these counts by the total number of pairs of observations, then we get the following probabilities.
374 |
375 | - :math:`\pi_C = \frac{C}{{n \choose 2}} = \frac{5}{15} \approx 0.33`
376 | - :math:`\pi_D = \frac{D}{{n \choose 2}} = \frac{5}{15} \approx 0.33`
377 | - :math:`\pi_T = \frac{T}{{n \choose 2}} = \frac{5}{15} \approx 0.33`
378 |
379 | Sometimes, it is desirable to distinguish between the types of ties. There are three possible types of ties.
380 |
381 | - :math:`T^X` are ties on only :math:`X`
382 | - :math:`T^Y` are ties on only :math:`Y`
383 | - :math:`T^{XY}` are ties on both :math:`X` and :math:`Y`
384 |
385 | Note, :math:`T = T^X + T^Y + T^{XY}`. If we want to distinguish between the tie types, then the status of each pair of observations is as follows.
386 |
387 | .. list-table:: Concordancy Status
388 | :header-rows: 1
389 |
390 | * - :math:`(X_i, Y_i)`
391 | - :math:`(X_j, Y_j)`
392 | - status
393 | * - :math:`(1, 3)`
394 | - :math:`(1, 3)`
395 | - :math:`T^{XY}`
396 | * - :math:`(1, 3)`
397 | - :math:`(2, 4)`
398 | - C
399 | * - :math:`(1, 3)`
400 | - :math:`(0, 2)`
401 | - C
402 | * - :math:`(1, 3)`
403 | - :math:`(0, 4)`
404 | - D
405 | * - :math:`(1, 3)`
406 | - :math:`(2, 2)`
407 | - D
408 | * - :math:`(1, 3)`
409 | - :math:`(2, 4)`
410 | - C
411 | * - :math:`(1, 3)`
412 | - :math:`(0, 2)`
413 | - C
414 | * - :math:`(1, 3)`
415 | - :math:`(0, 4)`
416 | - D
417 | * - :math:`(1, 3)`
418 | - :math:`(2, 2)`
419 | - D
420 | * - :math:`(2, 4)`
421 | - :math:`(0, 2)`
422 | - C
423 | * - :math:`(2, 4)`
424 | - :math:`(0, 4)`
425 | - :math:`T^Y`
426 | * - :math:`(2, 4)`
427 | - :math:`(2, 2)`
428 | - :math:`T^X`
429 | * - :math:`(0, 2)`
430 | - :math:`(0, 4)`
431 | - :math:`T^X`
432 | * - :math:`(0, 2)`
433 | - :math:`(2, 2)`
434 | - :math:`T^Y`
435 | * - :math:`(0, 4)`
436 | - :math:`(2, 2)`
437 | - D
438 |
439 | Distinguishing between ties, in this data set, the counts are :math:`C=5`, :math:`D=5`, :math:`T^X=2`, :math:`T^Y=2` and :math:`T^{XY}=1`. The probabilities of these statuses are as follows.
440 |
441 | - :math:`\pi_C = \frac{C}{{n \choose 2}} = \frac{5}{15} \approx 0.33`
442 | - :math:`\pi_D = \frac{D}{{n \choose 2}} = \frac{5}{15} \approx 0.33`
443 | - :math:`\pi_{T^X} = \frac{T^X}{{n \choose 2}} = \frac{2}{15} \approx 0.13`
444 | - :math:`\pi_{T^Y} = \frac{T^Y}{{n \choose 2}} = \frac{2}{15} \approx 0.13`
445 | - :math:`\pi_{T^{XY}} = \frac{T^{XY}}{{n \choose 2}} = \frac{1}{15} \approx 0.07`
446 |
447 | There are quite a few measures of association that use concordance as the basis for the strength of association.
448 |
449 | .. list-table:: Association measures using concordance
450 | :header-rows: 1
451 |
452 | * - Association Measure
453 | - Formula
454 | * - Goodman-Kruskal's :math:`\gamma`
455 | - :math:`\gamma = \frac{\pi_C - \pi_D}{1 - \pi_T}`
456 | * - Somers' :math:`d`
457 | - :math:`d_{Y \cdot X} = \frac{\pi_C - \pi_D}{\pi_C + \pi_D + \pi_{T^Y}}`
458 | * -
459 | - :math:`d_{X \cdot Y} = \frac{\pi_C - \pi_D}{\pi_C + \pi_D + \pi_{T^X}}`
460 | * - Kendall's :math:`\tau`
461 | - :math:`\tau = \frac{C - D}{{n \choose 2}}`
462 |
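These formulas translate directly into code; a minimal sketch (plain Python) reusing the counts from the sketch above follows. For this toy data, every measure is zero because :math:`C = D`.

.. code:: python

    n_pairs = 15  # 6 choose 2
    C, D, T, T_X, T_Y = 5, 5, 5, 2, 2

    p_c, p_d, p_t = C / n_pairs, D / n_pairs, T / n_pairs
    p_tx, p_ty = T_X / n_pairs, T_Y / n_pairs

    gamma = (p_c - p_d) / (1 - p_t)          # Goodman-Kruskal's gamma
    d_yx = (p_c - p_d) / (p_c + p_d + p_ty)  # Somers' d_{Y.X}
    d_xy = (p_c - p_d) / (p_c + p_d + p_tx)  # Somers' d_{X.Y}
    tau = (C - D) / n_pairs                  # Kendall's tau
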
463 | .. note::
464 | Sometimes `Somers' d` is written as `Somers' D`, `Somers' Delta` or even incorrectly as `Somer's D` :cite:`2017:glen,2020:wiki-somersd`. Somers' d has two versions, one that is symmetric and one that is asymmetric. The asymmetric Somers' d is the one most typically referred to :cite:`2017:glen`. The definition of Somers' d presented here is the asymmetric one, which explains :math:`d_{Y \cdot X}` and :math:`d_{X \cdot Y}`.
465 |
466 | Goodman-Kruskal's :math:`\lambda`
467 | ---------------------------------
468 |
469 | Goodman-Kruskal's lambda :math:`\lambda_{A|B}` measures the `proportional reduction in error` ``PRE`` for two categorical variables, :math:`A` and :math:`B`, when we want to understand how knowing :math:`B` reduces the probability of an error in predicting :math:`A`. :math:`\lambda_{A|B}` is estimated as follows.
470 |
471 | :math:`\lambda_{A|B} = \frac{P_E - P_{E|B}}{P_E}`
472 |
473 | Where,
474 |
475 | - :math:`P_E = 1 - \frac{\max_c N_{+c}}{N_{++}}`
476 | - :math:`P_{E|B} = 1 - \frac{\sum_r \max_c N_{rc}}{N_{++}}`
477 |
478 | In plain language:
479 |
480 | - :math:`P_E` is the probability of an error in predicting :math:`A`
481 | - :math:`P_{E|B}` is the probability of an error in predicting :math:`A` given knowledge of :math:`B`
482 |
483 | The terms :math:`N_{+c}`, :math:`N_{rc}` and :math:`N_{++}` come from the contingency table we build from :math:`A` and :math:`B` (:math:`A` is in the columns and :math:`B` is in the rows) and denote the column marginal for the `c-th` column, the count in the cell at the `r-th` row and `c-th` column, and the total count, respectively. To be clear:
484 |
485 | - :math:`N_{+c}` is the column marginal for the `c-th` column
486 | - :math:`N_{rc}` is the count in the cell at the `r-th` row and `c-th` column
487 | - :math:`N_{++}` is the total number of observations
488 |
489 | The contingency table induced with :math:`A` in the columns and :math:`B` in the rows will look like the following. Note that :math:`A` has `C` columns and :math:`B` has `R` rows, or, in other words, :math:`A` has `C` values and :math:`B` has `R` values.
490 |
491 | .. list-table:: Contingency Table for :math:`A` and :math:`B`
492 |
493 | * -
494 | - :math:`A_1`
495 | - :math:`A_2`
496 | - :math:`\dotsb`
497 | - :math:`A_C`
498 | * - :math:`B_1`
499 | - :math:`N_{11}`
500 | - :math:`N_{12}`
501 | - :math:`\dotsb`
502 | - :math:`N_{1C}`
503 | * - :math:`B_2`
504 | - :math:`N_{21}`
505 | - :math:`N_{22}`
506 | - :math:`\dotsb`
507 | - :math:`N_{2C}`
508 | * - :math:`\vdots`
509 | - :math:`\vdots`
510 | - :math:`\vdots`
511 | -
512 | - :math:`\vdots`
513 | * - :math:`B_R`
514 | - :math:`N_{R1}`
515 | - :math:`N_{R2}`
516 | - :math:`\dotsb`
517 | - :math:`N_{RC}`
518 |
519 | The table above only shows the cell counts :math:`N_{11}, N_{12}, \ldots, N_{RC}` and **not** the row and column marginals. Below, we expand the contingency table to include
520 |
521 | - the row marginals :math:`N_{1+}, N_{2+}, \ldots, N_{R+}`, as well as,
522 | - the column marginals :math:`N_{+1}, N_{+2}, \ldots, N_{+C}`.
523 |
524 | .. list-table:: Contingency Table for :math:`A` and :math:`B`
525 |
526 | * -
527 | - :math:`A_1`
528 | - :math:`A_2`
529 | - :math:`\dotsb`
530 | - :math:`A_C`
531 | -
532 | * - :math:`B_1`
533 | - :math:`N_{11}`
534 | - :math:`N_{12}`
535 | - :math:`\dotsb`
536 | - :math:`N_{1C}`
537 | - :math:`N_{1+}`
538 | * - :math:`B_2`
539 | - :math:`N_{21}`
540 | - :math:`N_{22}`
541 | - :math:`\dotsb`
542 | - :math:`N_{2C}`
543 | - :math:`N_{2+}`
544 | * - :math:`\vdots`
545 | - :math:`\vdots`
546 | - :math:`\vdots`
547 | -
548 | - :math:`\vdots`
549 | - :math:`\vdots`
550 | * - :math:`B_R`
551 | - :math:`N_{R1}`
552 | - :math:`N_{R2}`
553 | - :math:`\dotsb`
554 | - :math:`N_{RC}`
555 | - :math:`N_{R+}`
556 | * -
557 | - :math:`N_{+1}`
558 | - :math:`N_{+2}`
559 | - :math:`\dotsb`
560 | - :math:`N_{+C}`
561 | - :math:`N_{++}`
562 |
563 | Note that the row marginal for a row is the sum of the values across the columns, and the column marginal for a column is the sum of the values down the rows.
564 |
565 | - :math:`N_{r+} = \sum_c N_{rc}`
566 | - :math:`N_{+c} = \sum_r N_{rc}`
567 |
568 | Also, :math:`N_{++}` is just the sum over all the cells (excluding the row and column marginals). :math:`N_{++}` is really just the sample size.
569 |
570 | - :math:`N_{++} = \sum_r \sum_c N_{rc}`
571 |
572 | Let's go back to computing :math:`P_E` and :math:`P_{E|B}`.
573 |
574 | :math:`P_E` is given as follows.
575 |
576 | - :math:`P_E = 1 - \frac{\max_c N_{+c}}{N_{++}}`
577 |
578 | :math:`\max_c N_{+c}` returns the maximum of the column marginals, and :math:`\frac{\max_c N_{+c}}{N_{++}}` is just a probability. Which probability is it? It is the largest probability associated with a value of :math:`A` (specifically, the value of :math:`A` with the largest count). If we were to predict which value of :math:`A` would show up, we would choose the value of :math:`A` with the highest probability (it is the most likely). We would be correct a fraction :math:`\frac{\max_c N_{+c}}{N_{++}}` of the time, and we would be wrong a fraction :math:`1 - \frac{\max_c N_{+c}}{N_{++}}` of the time. Thus, :math:`P_E` is the probability of an error in predicting :math:`A` knowing nothing other than the distribution, or `probability mass function` ``PMF``, of :math:`A`.
579 |
580 | :math:`P_{E|B}` is given as follows.
581 |
582 | - :math:`P_{E|B} = 1 - \frac{\sum_r \max_c N_{rc}}{N_{++}}`
583 |
584 | What is :math:`\max_c N_{rc}` giving us? It is the maximum cell count in the `r-th` row. :math:`\sum_r \max_c N_{rc}` adds up all the largest values in each row, and :math:`\frac{\sum_r \max_c N_{rc}}{N_{++}}` is again a probability. Which probability is it? It is the probability of a correct prediction of :math:`A` when we know :math:`B`. When we know the value of :math:`B`, the predicted value of :math:`A` should be the one with the largest count for that value of :math:`B` (it has the highest probability, or, equivalently, the highest count). By always choosing the value of :math:`A` with the highest count associated with the observed value of :math:`B`, we are correct a fraction :math:`\frac{\sum_r \max_c N_{rc}}{N_{++}}` of the time and incorrect a fraction :math:`1 - \frac{\sum_r \max_c N_{rc}}{N_{++}}` of the time. Thus, :math:`P_{E|B}` is the probability of an error in predicting :math:`A` when we know the value of :math:`B` and the PMF of :math:`A` given :math:`B`.
585 |
586 | The expression :math:`P_E - P_{E|B}` is the reduction in the probability of an error in predicting :math:`A` given knowledge of :math:`B`. This expression represents the `reduction in error` in the phrase/term ``PRE``. The proportional part in ``PRE`` comes from the expression :math:`\frac{P_E - P_{E|B}}{P_E}`, which is a proportion.
587 |
588 | What :math:`\lambda_{A|B}` is trying to compute is the reduction of error in predicting :math:`A` when we know :math:`B`. Did we reduce any prediction error of :math:`A` by knowing :math:`B`?
589 |
590 | - When :math:`\lambda_{A|B} = 0`, this value means that knowing :math:`B` did not reduce any prediction error in :math:`A`. The only way to get :math:`\lambda_{A|B} = 0` is when :math:`P_E = P_{E|B}`.
591 | - When :math:`\lambda_{A|B} = 1`, this value means that knowing :math:`B` completely reduced all prediction errors in :math:`A`. The only way to get :math:`\lambda_{A|B} = 1` is when :math:`P_{E|B} = 0`.
592 |
593 | Generally speaking, :math:`\lambda_{A|B} \neq \lambda_{B|A}`, and :math:`\lambda` is thus an asymmetric association measure. To compute :math:`\lambda_{B|A}`, simply put :math:`B` in the columns and :math:`A` in the rows and reuse the formulas above.
594 |
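Putting the pieces together, below is a minimal ``numpy`` sketch (the table counts are made up) that computes :math:`\lambda_{A|B}`, and :math:`\lambda_{B|A}` by transposing the table.

.. code:: python

    import numpy as np

    # Hypothetical contingency table: A in the columns, B in the rows.
    N = np.array([
        [10,  5,  5],
        [ 2, 20,  3],
        [ 4,  6, 15]])

    def gk_lambda(N):
        n = N.sum()                          # N_{++}, the sample size
        p_e = 1 - N.sum(axis=0).max() / n    # error knowing only the PMF of A
        p_e_b = 1 - N.max(axis=1).sum() / n  # error also knowing B
        return (p_e - p_e_b) / p_e

    lambda_a_b = gk_lambda(N)    # ~0.36
    lambda_b_a = gk_lambda(N.T)  # ~0.44; transposing swaps the roles of A and B
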
595 | Furthermore, :math:`\lambda` can be used in studies of causality :cite:`1983:liebetrau`. We are not saying it is appropriate or even possible to entertain causality with just two variables alone :cite:`2020:pearl,2016:pearl,2009:pearl,1988:pearl`, but, when we have two categorical variables and want to know which is likely the cause and which the effect, the asymmetry between :math:`\lambda_{A|B}` and :math:`\lambda_{B|A}` may prove informative :cite:`2020:wiki-prospect`. Causal analysis based on two variables alone has been studied :cite:`2008:nips`.
596 |
--------------------------------------------------------------------------------
/docs/source/index.rst:
--------------------------------------------------------------------------------
1 | .. meta::
2 | :description: A statistical API for bivariate association measures.
3 | :keywords: python, statistics, bivariate, association, categorical, binary, nominal, ordinal, continuous, ratio, interval, contingency table analysis, apache spark, spark, high performance computing, massively parallel processing, hpc, mpp, causality, symmetric, asymmetric, correlation, confusion matrix, concordance, ranking
4 | :robots: index, follow
5 | :abstract: A statistical API for bivariate association measures. There are over 130 association measures identified between the product of categorical and continuous variable types.
6 | :author: Jee Vang, Ph.D.
7 | :contact: g@oneoffcoder.com
8 | :copyright: One-Off Coder
9 | :content: global
10 | :generator: Sphinx
11 | :language: English
12 | :rating: general
13 | :reply-to: info@oneoffcoder.com
14 | :web_author: Jee Vang, Ph.D.
15 | :revisit-after: 1 days
16 |
17 | .. PyPair documentation master file, created by
18 | sphinx-quickstart on Wed Nov 11 22:56:50 2020.
19 | You can adapt this file completely to your liking, but it should at least
20 | contain the root `toctree` directive.
21 |
22 | PyPair
23 | ======
24 |
25 | .. image:: _static/images/logo.png
26 | :align: center
27 | :alt: pypair logo.
28 |
29 | PyPair is a statistical library to compute pairwise association between any two types of variables. You can use the library locally on your laptop or desktop, or you may use it on a `Spark <https://spark.apache.org/>`_ cluster.
30 |
31 | .. blockdiag::
32 |
33 | diagram {
34 | default_shape = roundedbox
35 | span_width = 32
36 | span_height = 20
37 | default_fontsize = 11
38 | edge_layout = normal
39 | orientation = landscape
40 |
41 | V [label = "Variable", color = pink]
42 | C [label = "Continuous", color = "#edfa78"]
43 | I [label = "Interval", color = "#def514"]
44 | R [label = "Ratio", color = "#def514"]
45 | A [label = "Categorical", color = "#e0e0e0"]
46 | B [label = "Binary", color ="#e4ede6"]
47 | N [label = "Nominal", color ="#e4ede6"]
48 | O [label = "Ordinal", color ="#e4ede6"]
49 |
50 | V -> A, C
51 | C -> I, R
52 | A -> B, N, O
53 | }
54 |
55 | You may install ``py-pair`` from `PyPI <https://pypi.org/project/pypair/>`_.
56 |
57 | .. code:: bash
58 |
59 | pip install pypair
60 |
61 | .. toctree::
62 | :maxdepth: 2
63 | :caption: Contents
64 |
65 | intro
66 | quicklist
67 | quickstart
68 | deepdives
69 | zzz-bib
70 |
71 | .. toctree::
72 | :maxdepth: 2
73 | :caption: API Documentation
74 |
75 | modules
76 |
77 | Indices and tables
78 | ==================
79 |
80 | * :ref:`genindex`
81 | * :ref:`modindex`
82 | * :ref:`search`
83 |
84 | About
85 | =====
86 |
87 | .. image:: _static/images/ooc-logo.png
88 | :alt: One-Off Coder logo.
89 |
90 | One-Off Coder is an educational, service and product company. Please visit us online to discover how we may help you achieve life-long success in your personal coding career or with your company's business goals and objectives.
91 |
92 | - |Website_Link|
93 | - |Facebook_Link|
94 | - |Twitter_Link|
95 | - |Instagram_Link|
96 | - |YouTube_Link|
97 | - |LinkedIn_Link|
98 |
99 | .. |Website_Link| raw:: html
100 |
101 | Website
102 |
103 | .. |Facebook_Link| raw:: html
104 |
105 | Facebook
106 |
107 | .. |Twitter_Link| raw:: html
108 |
109 | Twitter
110 |
111 | .. |Instagram_Link| raw:: html
112 |
113 | Instagram
114 |
115 | .. |YouTube_Link| raw:: html
116 |
117 | YouTube
118 |
119 | .. |LinkedIn_Link| raw:: html
120 |
121 | LinkedIn
122 |
123 | Copyright
124 | =========
125 |
126 | Documentation
127 | -------------
128 |
129 | .. An embedded HTML license notice for this documentation appeared here; its
130 |    markup was stripped during extraction.
131 |
142 | Software
143 | --------
144 |
145 | ::
146 |
147 | Copyright 2020 One-Off Coder
148 |
149 | Licensed under the Apache License, Version 2.0 (the "License");
150 | you may not use this file except in compliance with the License.
151 | You may obtain a copy of the License at
152 |
153 | http://www.apache.org/licenses/LICENSE-2.0
154 |
155 | Unless required by applicable law or agreed to in writing, software
156 | distributed under the License is distributed on an "AS IS" BASIS,
157 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
158 | See the License for the specific language governing permissions and
159 | limitations under the License.
160 |
161 | Art
162 | ---
163 |
164 | ::
165 |
166 | Copyright 2020 Daytchia Vang
167 |
168 | Citation
169 | ========
170 |
171 | ::
172 |
173 | @misc{oneoffcoder_pypair_2020,
174 | title={PyPair, A Statistical API for Bivariate Association Measures},
175 | url={https://github.com/oneoffcoder/py-pair},
176 | author={Jee Vang},
177 | year={2020},
178 | month={Nov}}
179 |
180 | Author
181 | ======
182 |
183 | Jee Vang, Ph.D.
184 |
185 | - |Patreon_Link|
186 | - |Github_Link|
187 |
188 | .. |Patreon_Link| raw:: html
189 |
190 | Patreon: support is appreciated
191 |
192 | .. |Github_Link| raw:: html
193 |
194 | GitHub: sponsorship will help us change the world for the better
195 |
196 | Help
197 | ====
198 |
199 | - |Source_Link|
200 | - |Gitter_Link|
201 |
202 | .. |Source_Link| raw:: html
203 |
204 | GitHub: source code
205 |
206 | .. |Gitter_Link| raw:: html
207 |
208 | Gitter: chat
--------------------------------------------------------------------------------
/docs/source/intro.rst:
--------------------------------------------------------------------------------
1 | Introduction
2 | ============
3 |
4 | PyPair is a statistical library to compute pairwise association between any two variables. A reasonable taxonomy of variable types in statistics is as follows :cite:`2020:uom,2020:idre,2020:laerd,2020:graphpad,2020:minitab`.
5 |
6 | - ``Categorical``: A variable whose values have no intrinsic ordering. An example is a variable indicating the continents: North America, South America, Asia, Arctic, Antarctica, Africa and Europe. There is no ordering to these continents; we cannot say North America comes before Africa. Categorical variables are also referred to as `qualitative` variables.
7 | - ``Binary``: A categorical variable that has only 2 values. An example is a variable indicating whether or not someone likes to eat pizza; the values could be ``yes`` or ``no``. It is common to encode the binary values to ``0`` and ``1`` for storage and numerical convenience, but do not be fooled, there is still no numerical ordering. These variables are also referred to in the wild as `dichotomous` variables.
8 | - ``Nominal``: A categorical variable that has 3 or more values. When most people think of categorical variables, they think of nominal variables.
9 | - ``Ordinal``: A categorical variable whose values have a logical order, but the difference between any two values does not give a meaningful numerical magnitude. An example of an ordinal variable is one that indicates the performance on a test: good, better, best. We know that good is the base, better is the comparative and best is the superlative, however, we cannot say that the difference between best and good is two numbers up. For all we know, best can be orders of magnitude away from good.
10 | - ``Continuous``: A variable whose values are (basically) numbers, and thus, have meaningful ordering. A continuous variable may have an infinite number of values. Continuous variables are also referred to as `quantitative` variables.
11 | - ``Interval``: A continuous variable whose values exist on a continuum of numerical values. Temperature measured in Celsius or Fahrenheit is an example of an interval variable.
12 | - ``Ratio``: An interval variable with a true zero. Temperature measured in Kelvin is an example of a ratio variable.
13 |
14 | .. note::
15 | If we have a variable capturing eye colors, the possible values may be blue, green or brown. On first sight, this variable may be considered a nominal variable. Instead of capturing the eye color categorically, what if we measure the wavelengths of eye colors? Below are estimations of each of the wavelengths (nanometers) corresponding to these colors.
16 |
17 | - blue: 450
18 | - green: 550
19 | - brown: 600
20 |
21 | Which variable type does the eye color variable become?
22 |
23 | .. note::
24 | There is also much use of the term ``discrete variable``, and sometimes it refers to categorical or continuous variables. In general, a discrete variable has a finite set of values, and in this sense, a discrete variable could be a categorical variable. We have seen many cases of a continuous variable (infinite values) undergoing `discretization` (finite values). The resulting variable from discretization is often treated as a categorical variable by applying statistical operations appropriate for that type of variable. Yet, in some cases, a continuous variable can also be a discrete variable. If we have a variable to capture age (whole numbers only), we might observe a range :math:`[0, 120]`. There are 121 values (zero is included), but still, we can treat this age variable like a ratio variable.
25 |
26 | Assuming we have data and we know the variable types in this data using the taxonomy above, we might want to make a progression of analyses from univariate to bivariate to multivariate analyses. Along the way, for bivariate analysis, we are often curious about the association between any pair of variables. We want to know both the magnitude (the strength, is it small or big?) and direction (the sign, is it positive or negative?) of the association. When the variables are all of the same type, association measures abound for conducting pairwise association; if all the variables are continuous, we might just apply the canonical Pearson correlation.
27 |
28 | The tough situation is when we have a mixed variable type of dataset; and this tough situation is quite often the normal situation. How do we find the association between a continuous and categorical variable? We can create a table as below to map the available association measure approaches for any two types of variables :cite:`2020:calkins,2020:psu-continuous`. (In the table below, we collapse all categorical and continuous variable types).
29 |
30 | .. An HTML table appeared here; its markup was stripped during extraction. It
31 |    mapped each pair of variable types (categorical vs. categorical, categorical
32 |    vs. continuous, continuous vs. continuous) to the applicable association
33 |    measure approaches.
34 |
69 | The ultimate goal of this project is to identify as many measures of association as possible for these unique pairs of variable types and to implement these association measures in a unified application programming interface (API).
70 |
71 | .. note::
72 | We use the term `association` over `correlation` since the latter typically connotes canonical Pearson correlation or association between two continuous variables. The term `association` is more general and can cover specific types of association, such as `agreement` measures, alongside those dealing with continuous variables :cite:`1983:liebetrau`.
73 |
--------------------------------------------------------------------------------
/docs/source/modules.rst:
--------------------------------------------------------------------------------
1 | .. toctree::
2 | :maxdepth: 4
3 |
4 | pypair
5 |
--------------------------------------------------------------------------------
/docs/source/pypair.rst:
--------------------------------------------------------------------------------
1 | PyPair
2 | ======
3 |
4 | Contingency Table Analysis
5 | --------------------------
6 |
7 | These are the basic contingency tables used to analyze categorical data.
8 |
9 | - CategoricalTable
10 | - BinaryTable
11 | - ConfusionMatrix
12 | - AgreementTable
13 |
14 | .. automodule:: pypair.contingency
15 | :members:
16 | :undoc-members:
17 | :show-inheritance:
18 | :special-members: __init__
19 |
20 | Biserial
21 | --------
22 |
23 | These are the biserial association measures.
24 |
25 | .. automodule:: pypair.biserial
26 | :members:
27 | :undoc-members:
28 | :show-inheritance:
29 | :special-members: __init__
30 |
31 | Continuous
32 | ----------
33 |
34 | These are the continuous association measures.
35 |
36 | .. automodule:: pypair.continuous
37 | :members:
38 | :undoc-members:
39 | :show-inheritance:
40 | :special-members: __init__
41 |
42 | Associations
43 | ------------
44 |
45 | Some of the functions here are just wrappers around the contingency tables and may be looked at as convenience methods that simply take in the data of two variables. If you need more than one specific association measure, you are encouraged to build the appropriate contingency table and then call upon the measures you need, as in the sketch below.
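
A minimal sketch of the contrast (the exact signatures, measure names and properties are assumptions; consult the generated API documentation below):

.. code:: python

    from pypair.association import binary_binary
    from pypair.contingency import BinaryTable

    a = [1, 1, 0, 1, 0, 0]
    b = [1, 0, 0, 1, 1, 0]

    # Convenience wrapper: one measure per call.
    jaccard = binary_binary(a, b, measure='jaccard')

    # Contingency table: build once, then read off several measures.
    table = BinaryTable(a, b)
    jaccard, chisq = table.jaccard, table.chisq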
46 |
47 | .. automodule:: pypair.association
48 | :members:
49 | :undoc-members:
50 | :show-inheritance:
51 | :special-members: __init__
52 |
53 | Decorators
54 | ----------
55 |
56 | These are decorators.
57 |
58 | .. automodule:: pypair.decorator
59 | :members:
60 | :undoc-members:
61 | :show-inheritance:
62 | :special-members: __init__
63 |
64 | Utility
65 | -------
66 |
67 | These are utility functions.
68 |
69 | .. automodule:: pypair.util
70 | :members:
71 | :undoc-members:
72 | :show-inheritance:
73 | :special-members: __init__
74 |
75 | Spark
76 | -----
77 |
78 | These are functions that you can use in a Spark environment. You must pass in a Spark dataframe, and you will get a ``pair-RDD`` as output. The pair-RDD will have the following as its keys and values.
79 |
80 | - key: in the form of a tuple of strings ``(k1, k2)`` where k1 and k2 are names of variables (column names)
81 | - value: a dictionary ``{'acc': 0.8, 'tpr': 0.9, 'fpr': 0.8, ...}`` where keys are association measure names and values are the corresponding association values
82 |
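A hedged usage sketch (assuming ``pypair.spark`` exposes ``binary_binary(sdf)``, as the Spark demo in the quickstart suggests; the Spark session and dataframe ``sdf`` are given):

.. code:: python

    from pypair.spark import binary_binary

    # sdf: a Spark DataFrame whose columns are all 0/1 variables.
    pair_rdd = binary_binary(sdf)

    # Materialize as {(k1, k2): {measure_name: value, ...}}.
    results = pair_rdd.collectAsMap()
    jaccard = results[('x1', 'x2')]['jaccard']  # measure key name assumed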
83 |
84 |
85 | .. automodule:: pypair.spark
86 | :members:
87 | :undoc-members:
88 | :show-inheritance:
89 | :special-members: __init__
90 |
--------------------------------------------------------------------------------
/docs/source/quicklist.rst:
--------------------------------------------------------------------------------
1 | Quick List
2 | ==========
3 |
4 | Below is a quick listing of the association measures without any descriptions. These association measures are grouped by variable pair types and/or approach.
5 |
6 | Binary-Binary (88)
7 | ------------------
8 |
9 | - adjusted_rand_index
10 | - ample
11 | - anderberg
12 | - baroni_urbani_buser_i
13 | - baroni_urbani_buser_ii
14 | - braun_banquet
15 | - chisq
16 | - chisq
17 | - chisq_dof
18 | - chord
19 | - cole_i
20 | - cole_ii
21 | - contingency_coefficient
22 | - cosine
23 | - cramer_v
24 | - dennis
25 | - dice
26 | - disperson
27 | - driver_kroeber
28 | - euclid
29 | - eyraud
30 | - fager_mcgowan
31 | - faith
32 | - forbes_ii
33 | - forbesi
34 | - fossum
35 | - gilbert_wells
36 | - gk_lambda
37 | - gk_lambda_reversed
38 | - goodman_kruskal
39 | - gower
40 | - gower_legendre
41 | - hamann
42 | - hamming
43 | - hellinger
44 | - inner_product
45 | - intersection
46 | - jaccard
47 | - jaccard_3w
48 | - jaccard_distance
49 | - johnson
50 | - kulcyznski_ii
51 | - kulczynski_i
52 | - lance_williams
53 | - mcconnaughey
54 | - mcnemar_test
55 | - mean_manhattan
56 | - michael
57 | - mountford
58 | - mutual_information
59 | - ochia_i
60 | - ochia_ii
61 | - odds_ratio
62 | - pattern_difference
63 | - pearson_heron_i
64 | - pearson_heron_ii
65 | - pearson_i
66 | - peirce
67 | - person_ii
68 | - phi
69 | - roger_tanimoto
70 | - russel_rao
71 | - shape_difference
72 | - simpson
73 | - size_difference
74 | - sokal_michener
75 | - sokal_sneath_i
76 | - sokal_sneath_ii
77 | - sokal_sneath_iii
78 | - sokal_sneath_iv
79 | - sokal_sneath_v
80 | - sorensen_dice
81 | - sorgenfrei
82 | - stiles
83 | - tanimoto_distance
84 | - tanimoto_i
85 | - tanimoto_ii
86 | - tarantula
87 | - tarwid
88 | - tetrachoric
89 | - tschuprow_t
90 | - uncertainty_coefficient
91 | - uncertainty_coefficient_reversed
92 | - vari
93 | - yule_q
94 | - yule_q_difference
95 | - yule_w
96 | - yule_y
97 |
98 | Confusion Matrix, Binary-Binary (29)
99 | ------------------------------------
100 |
101 | - acc
102 | - ba
103 | - bm
104 | - dor
105 | - f1
106 | - fdr
107 | - fn
108 | - fnr
109 | - fomr
110 | - fp
111 | - fpr
112 | - mcc
113 | - mk
114 | - n
115 | - nlr
116 | - npv
117 | - plr
118 | - ppv
119 | - precision
120 | - prevalence
121 | - pt
122 | - recall
123 | - sensitivity
124 | - specificity
125 | - tn
126 | - tnr
127 | - tp
128 | - tpr
129 | - ts
130 |
131 | Categorical-Categorical (9)
132 | ---------------------------
133 |
134 | - adjusted_rand_index
135 | - chisq
136 | - chisq_dof
137 | - gk_lambda
138 | - gk_lambda_reversed
139 | - mutual_information
140 | - phi
141 | - uncertainty_coefficient
142 | - uncertainty_coefficient_reversed
143 |
144 | Categorical-Continuous, Biserial (3)
145 | ------------------------------------
146 |
147 | - biserial
148 | - point_biserial
149 | - rank_biserial
150 |
151 | Categorical-Continuous (7)
152 | --------------------------
153 |
154 | - anova
155 | - calinski_harabasz
156 | - davies_bouldin
157 | - eta
158 | - eta_squared
159 | - kruskal
160 | - silhouette
161 |
162 | Ordinal-Ordinal, Concordance (3)
163 | --------------------------------
164 |
165 | - goodman_kruskal_gamma
166 | - kendall_tau
167 | - somers_d
168 |
169 | Continuous-Continuous (4)
170 | -------------------------
171 |
172 | - kendall
173 | - pearson
174 | - regression
175 | - spearman
--------------------------------------------------------------------------------
/docs/source/quickstart.rst:
--------------------------------------------------------------------------------
1 | Quickstart
2 | ==========
3 |
4 | Installation
5 | ------------
6 |
7 | Use PyPI to install the `package <https://pypi.org/project/pypair/>`_.
8 |
9 | .. code:: bash
10 |
11 | pip install pypair
12 |
13 | Confusion Matrix
14 | ----------------
15 |
16 | A confusion matrix is typically used to judge binary classification performance. There are two variables, :math:`A` and :math:`P`, where :math:`A` is the actual value (ground truth) and :math:`P` is the predicted value. The example below shows how to use the convenience method ``confusion()`` and the class ``ConfusionMatrix`` to get association measures derived from the confusion matrix.
17 |
18 | .. literalinclude:: _code/confusion-demo.py
19 | :language: python
20 | :linenos:
21 |
22 | Binary-Binary
23 | -------------
24 |
25 | Association measures for binary-binary variables are computed using ``binary_binary()`` or ``BinaryTable``.
26 |
27 | .. literalinclude:: _code/binary-demo.py
28 | :language: python
29 | :linenos:
30 |
31 | Categorical-Categorical
32 | -----------------------
33 |
34 | Association measures for categorical-categorical variables are computed using ``categorical_categorical()`` or ``CategoricalTable``.
35 |
36 | .. literalinclude:: _code/categorical-demo.py
37 | :language: python
38 | :linenos:
39 |
40 | Binary-Continuous
41 | -----------------
42 |
43 | Association measures for binary-continuous variables are computed using ``binary_continuous()`` or ``Biserial``.
44 |
45 | .. literalinclude:: _code/biserial-demo.py
46 | :language: python
47 | :linenos:
48 |
49 | Ordinal-Ordinal, Concordance
50 | ----------------------------
51 | Concordance measures are used for ordinal-ordinal or continuous-continuous variables using ``concordance()`` or ``Concordance()``.
52 |
53 | .. literalinclude:: _code/concordance-demo.py
54 | :language: python
55 | :linenos:
56 |
57 | Categorical-Continuous
58 | ----------------------
59 | Categorical-continuous association measures are computed using ``categorical_continuous()`` or ``CorrelationRatio``.
60 |
61 | .. literalinclude:: _code/corr-ratio-demo.py
62 | :language: python
63 | :linenos:
64 |
65 | Continuous-Continuous
66 | ---------------------
67 |
68 | Association measures for continuous-continuous variables are computed using ``continuous_continuous()`` or ``Continuous``.
69 |
70 | .. literalinclude:: _code/continuous-demo.py
71 | :language: python
72 | :linenos:
73 |
74 | Recipe
75 | ------
76 |
77 | Here's a recipe for using multiprocessing to compute pairwise associations with binary data.
78 |
79 | .. literalinclude:: _code/multiprocessing-tip.py
80 | :language: python
81 | :linenos:
82 |
83 | Here's a nifty utility method to create a correlation matrix. The fields of the input data frame must all be of the same type, and you must supply a function. Note that Pandas ``DataFrame.corr()`` no longer supports processing non-numeric data; fields that are not numeric will simply be skipped over.
84 |
85 | .. literalinclude:: _code/dataframe-tip.py
86 | :language: python
87 | :linenos:
88 |
89 | Apache Spark
90 | ------------
91 |
92 | Spark is supported for some of the association measures. `Active support `_ is appreciated. Below are some code samples to get you started.
93 |
94 | .. literalinclude:: _code/spark-demo.py
95 | :language: python
96 | :linenos:
--------------------------------------------------------------------------------
/docs/source/refs.bib:
--------------------------------------------------------------------------------
1 | @misc{2020:calkins,
2 | author = {Keith G. Calkins},
3 | title = {More Correlation Coefficients},
4 | url = {https://www.andrews.edu/~calkins/math/edrm611/edrm13.htm},
5 | addendum = "(accessed: 11.12.2020)"
6 | }
7 | @misc{2020:uom,
8 | author = {University of Minnesota},
9 | title = {Types of Variables},
10 | url = {https://cyfar.org/types-variables},
11 | addendum = "(accessed: 11.12.2020)"
12 | }
13 | @misc{2020:idre,
14 | author = {Institute for Digital Research and Education},
15 | title = {What is the difference between categorical, ordinal and numerical variables?},
16 | url = {https://stats.idre.ucla.edu/other/mult-pkg/whatstat/what-is-the-difference-between-categorical-ordinal-and-numerical-variables/},
17 | addendum = "(accessed: 11.12.2020)"
18 | }
19 | @misc{2020:minitab,
20 | author = {Minitab},
21 | title = {What are categorical, discrete, and continuous variables?},
22 | url = {https://support.minitab.com/en-us/minitab-express/1/help-and-how-to/modeling-statistics/regression/supporting-topics/basics/what-are-categorical-discrete-and-continuous-variables/},
23 | addendum = "(accessed: 11.12.2020)"
24 | }
25 | @misc{2020:laerd,
26 | author = {Laerd Statistics},
27 | title = {Types of Variable},
28 | url = {https://statistics.laerd.com/statistical-guides/types-of-variable.php},
29 | addendum = "(accessed: 11.12.2020)"
30 | }
31 | @misc{2020:graphpad,
32 | author = {GraphPad},
33 | title = {What is the difference between ordinal, interval and ratio variables? Why should I care?},
34 | url = {https://www.graphpad.com/support/faq/what-is-the-difference-between-ordinal-interval-and-ratio-variables-why-should-i-care/},
35 | addendum = "(accessed: 11.12.2020)"
36 | }
37 | @misc{2020:wiki-somersd,
38 | author = {Wikipedia},
39 |   title = {Somers' D},
40 | url = {https://en.wikipedia.org/wiki/Somers%27_D},
41 | addendum = "(accessed: 11.12.2020)"
42 | }
43 | @misc{2020:wiki-jaccard,
44 | author = {Wikipedia},
45 | title = {Jaccard index},
46 | url = {https://en.wikipedia.org/wiki/Jaccard_index},
47 | addendum = "(accessed: 11.14.2020)"
48 | }
49 | @misc{2020:wiki-dice,
50 | author = {Wikipedia},
51 | title = {Sørensen–Dice coefficient},
52 | url = {https://en.wikipedia.org/wiki/S%C3%B8rensen%E2%80%93Dice_coefficient},
53 | addendum = "(accessed: 11.14.2020)"
54 | }
55 | @misc{2020:wiki-tversky,
56 | author = {Wikipedia},
57 | title = {Tversky index},
58 | url = {https://en.wikipedia.org/wiki/Tversky_index},
59 | addendum = "(accessed: 11.14.2020)"
60 | }
61 | @misc{2020:wiki-prospect,
62 | author = {Wikipedia},
63 | title = {Prospect theory},
64 | url = {https://en.wikipedia.org/wiki/Prospect_theory},
65 | addendum = "(accessed: 11.14.2020)"
66 | }
67 | @misc{2020:wiki-overlap,
68 | author = {Wikipedia},
69 | title = {Overlap coefficient},
70 | url = {https://en.wikipedia.org/wiki/Overlap_coefficient},
71 | addendum = "(accessed: 11.14.2020)"
72 | }
73 | @misc{2020:wiki-fowlkes,
74 | author = {Wikipedia},
75 | title = {Fowlkes-Mallows Index},
76 | url = {https://en.wikipedia.org/wiki/Fowlkes%E2%80%93Mallows_index},
77 | addendum = "(accessed: 11.14.2020)"
78 | }
79 | @misc{2008:nips,
80 | author = {NIPS},
81 | title = {NIPS 2008 Workshop on Causality},
82 | url = {http://clopinet.com/isabelle/Projects/NIPS2008/},
83 | addendum = "(accessed: 11.13.2020)"
84 | }
85 | @Book{1983:liebetrau,
86 | author = {Albert M. Liebetrau},
87 | title = {Measures of association},
88 | publisher = {Sage Publications, Inc.},
89 | year = {1983}
90 | }
91 | @Book{1984:reynolds,
92 | author = {H. T. Reynolds},
93 | title = {Analysis of nominal data},
94 | publisher = {Sage Publications, Inc.},
95 | year = {1984}
96 | }
97 | @Book{1970:cox,
98 | author = {D. R. Cox},
99 | title = {Analysis of binary data},
100 | publisher = {Chapman and Hall},
101 | year = {1970}
102 | }
103 | @Book{2020:pearl,
104 | author = {Judea Pearl},
105 | title = {The Book of Why: The New Science of Cause and Effect},
106 | publisher = {Basic Books},
107 | year = {2020}
108 | }
109 | @Book{2016:pearl,
110 | author = {Judea Pearl},
111 | title = {Causal Inference in Statistics - A Primer},
112 | publisher = {Wiley},
113 | year = {2016}
114 | }
115 | @Book{2009:pearl,
116 | author = {Judea Pearl},
117 | title = {Causality: Models, Reasoning and Inference},
118 | publisher = {Chapman and Hall},
119 | year = {2009}
120 | }
121 | @Book{1988:pearl,
122 | author = {Judea Pearl},
123 | title = {Probabilistic Reasoning in Intelligent Systems: Networks of Plausible Inference},
124 | publisher = {Morgan Kaufmann},
125 | year = {1988}
126 | }
127 | @misc{2017:glen,
128 | author = {Stephanie Glen},
129 | title = {What is Somers’ Delta?},
130 | url = {https://www.statisticshowto.com/somers-d},
131 | addendum = "(accessed: 11.14.2020)"
132 | }
133 | @misc{2020:psu-binary,
134 | author = {Penn State University},
135 | title = {Measures of Association for Binary Variables},
136 | url = {https://online.stat.psu.edu/stat505/lesson/14/14.3},
137 | addendum = "(accessed: 11.14.2020)"
138 | }
139 | @misc{2020:psu-continuous,
140 | author = {Penn State University},
141 | title = {Measures of Association for Continuous Variables},
142 | url = {https://online.stat.psu.edu/stat505/lesson/14/14.2},
143 | addendum = "(accessed: 11.14.2020)"
144 | }
145 | @misc{2020:ibm-proximities,
146 | author = {IBM Proximities},
147 | title = {Measures for binary data},
148 | url = {https://www.ibm.com/support/knowledgecenter/SSLVMB_24.0.0/spss/base/syn_proximities_measures_binary_data.html},
149 | addendum = "(accessed: 11.14.2020)"
150 | }
151 | @misc{2020:stack-sim,
152 | author = {Stack Exchange},
153 | title = {Measures for binary data},
154 | url = {https://stats.stackexchange.com/questions/61705/similarity-coefficients-for-binary-data-why-choose-jaccard-over-russell-and-rao},
155 | addendum = "(accessed: 11.14.2020)"
156 | }
157 | @article{2010:choi,
158 |   author = {Seung-Seok Choi and Sung-Hyuk Cha and Charles C. Tappert},
159 | title = {A Survey of Binary Similarity and Distance Measures},
160 | journal = {Systemics, Cybernetics and Informatics},
161 | year = 2010,
162 | number = 1,
163 | volume = 8
164 | }
165 | @article{2019:warrens,
166 | author = {Matthijs J. Warrens},
167 | title = {Similarity measures for 2 x 2 tables},
168 | journal = {Journal of Intelligent & Fuzzy Systems},
169 | year = 2019,
170 | volume = 36
171 | }
172 |
173 |
--------------------------------------------------------------------------------
/docs/source/robots.txt:
--------------------------------------------------------------------------------
1 | User-agent: *
2 | Allow: /
3 | Sitemap: https://py-pair.readthedocs.io/sitemap.xml
--------------------------------------------------------------------------------
/docs/source/zzz-bib.rst:
--------------------------------------------------------------------------------
1 | Bibliography
2 | ------------
3 |
4 | .. bibliography:: refs.bib
5 | :all:
--------------------------------------------------------------------------------
/logo.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/oneoffcoder/py-pair/79e6e75c32333fc9421aad36c02c2e1043ae4a05/logo.png
--------------------------------------------------------------------------------
/misc/SPARK.md:
--------------------------------------------------------------------------------
1 | # Spark Tinkering
2 |
3 | Run the Spark + Jupyter container, then go to [http://localhost:8888](http://localhost:8888).
4 |
5 | On Linux.
6 |
7 | ```bash
8 | docker run -it \
9 | -p 9870:9870 \
10 | -p 8088:8088 \
11 | -p 8080:8080 \
12 | -p 18080:18080 \
13 | -p 9000:9000 \
14 | -p 8888:8888 \
15 | -p 9864:9864 \
16 | -v $HOME/git/py-pair/misc/ipynb:/root/ipynb \
17 | -e PYSPARK_MASTER=spark://localhost:7077 \
18 | -e NOTEBOOK_PASSWORD='' \
19 | oneoffcoder/spark-jupyter
20 | ```
21 |
22 | On Windows.
23 |
24 | ```batch
25 | docker run -it ^
26 | -p 9870:9870 ^
27 | -p 8088:8088 ^
28 | -p 8080:8080 ^
29 | -p 18080:18080 ^
30 | -p 9000:9000 ^
31 | -p 8888:8888 ^
32 | -p 9864:9864 ^
33 | -v ./git/py-pair/misc/ipynb:/root/ipynb ^
34 | -e PYSPARK_MASTER=spark://localhost:7077 ^
35 | -e NOTEBOOK_PASSWORD='' ^
36 | oneoffcoder/spark-jupyter
37 | ```
--------------------------------------------------------------------------------
/misc/binary-measures.csv:
--------------------------------------------------------------------------------
1 | type,name,equation
2 | s,Jaccard,"\frac{a}{a+b+c}"
3 | s,Dice;Czekanowski;Nei-Li,"\frac{2a}{2a+b+c}"
4 | s,3W-Jaccard,"\frac{3a}{3a+b+c}"
5 | s,Sokal-Sneath-I,"\frac{a}{a+2b+2c}"
6 | s,Sokal-Michener,"\frac{a+d}{a+b+c+d}"
7 | s,Sokal-Sneath-II,"\frac{2a+2d}{2a+b+c+2d}"
8 | s,Roger-Tanimoto,"\frac{a+d}{a+2b+2c+d}"
9 | s,Faith,"\frac{a+0.5d}{a+b+c+d}"
10 | s,Gower-Legendre,"\frac{a+d}{a+0.5b+0.5c+d}"
11 | s,Intersection,"a"
12 | s,Inner Product,"a+d"
13 | s,Russell-Rao,"\frac{a}{a+b+c+d}"
14 | s,Cosine,"\frac{a}{(a+b)(a+c)}"
15 | s,Gilbert-Wells,"\log a - \log p - \log \frac{a+b}{p} - \log \frac{a+c}{p}"
16 | s,Ochiai-I;Otsuka,"\frac{a}{\sqrt{(a+b)(a+c)}}"
17 | s,Forbesi,"\frac{pa}{(a+b)(a+c)}"
18 | s,Fossum,"\frac{n(a-0.5)^2}{(a+b)(a+c)}"
19 | s,Sorgenfrei,"\frac{a^2}{(a+b)(a+c)}"
20 | s,Mountford,"\frac{a}{0.5(ab + ac) + bc}"
21 | s,McConnaughey,"\frac{a^2 - bc}{(a+b)(a+c)}"
22 | s,Tarwid,"\frac{na - (a+b)(a+c)}{na + (a+b)(a+c)}"
23 | s,Kulczynski-II,"\frac{0.5a(2a+b+c)}{(a+b)(a+c)}"
24 | s,Driver-Kroeber,"\frac{a}{2}\left(\frac{1}{a+b}+\frac{1}{a+c}\right)"
25 | s,Johnson,"\frac{a}{a+b}+\frac{a}{a+c}"
26 | s,Dennis,"\frac{ad-bc}{\sqrt{n(a+b)(a+c)}}"
27 | s,Simpson,"\frac{a}{\min(a+b,a+c)}"
28 | s,Braun-Banquet,"\frac{a}{\max(a+b,a+c)}"
29 | s,Fager-McGowan,"\frac{a}{\sqrt{(a+b)(a+c)}}-\frac{max(a+b,a+c)}{2}"
30 | s,Forbes-II,"\frac{na-(a+b)(a+c)}{n \min(a+b,a+c) - (a+b)(a+c)}"
31 | s,Sokal-Sneath-IV,"\frac{1}{4}\left(\frac{a}{a+b}+\frac{a}{a+c}+\frac{d}{b+d}+\frac{d}{c+d}\right)"
32 | s,Gower,"\frac{a+d}{\sqrt{(a+b)(a+c)(b+d)(c+d)}}"
33 | s,Pearson-I,"\chi^2=\frac{n(ad-bc)^2}{(a+b)(a+c)(c+d)(b+d)}"
34 | s,Pearson-II,"\sqrt{\frac{\chi^2}{n+\chi^2}}"
35 | s,Pearson-III,"\sqrt{\frac{\rho}{n+\rho}}; \rho=\frac{ad-bc}{\sqrt{(a+b)(a+c)(b+d)(c+d)}}"
36 | s,Pearson-Heron-I,"\frac{ad-bc}{\sqrt{(a+b)(a+c)(b+d)(c+d)}}"
37 | s,Pearson-Heron-II,"\cos\left(\frac{\pi \sqrt{bc}}{\sqrt{ad}+\sqrt{bc}}\right)"
38 | s,Sokal-Sneath-III,"\frac{a+d}{b+c}"
39 | s,Sokal-Sneath-V,"\frac{ad}{(a+b)(a+c)(b+d)\sqrt{c+d}}"
40 | s,Cole,"\frac{\sqrt{2}(ad-bc)}{\sqrt{(ad-bc)^2-(a+b)(a+c)(b+d)(c+d)}}"
41 | s,Stiles,"\log_{10} \frac{n\left(|ad-bc|-\frac{n}{2}\right)^2}{(a+b)(a+c)(b+d)(c+d)}"
42 | s,Ochiai-II,"\frac{ad}{\sqrt{(a+b)(a+c)(b+d)(c+d)}}"
43 | s,Yule-Q,"\frac{ad-bc}{ad+bc}"
44 | s,Yule-w,"\frac{\sqrt{ad}-\sqrt{bc}}{\sqrt{ad}+\sqrt{bc}}"
45 | s,Kulczynski-I,"\frac{a}{b+c}"
46 | s,Tanimoto,"\frac{a}{2a+b+c}"
47 | s,Disperson,"\frac{ad-bc}{(a+b+c+d)^2}"
48 | s,Hamann,"\frac{(a+d)-(b+c)}{a+b+c+d}"
49 | s,Michael,"\frac{4(ad-bc)}{(a+d)^2+(b+c)^2}"
50 | s,Goodman-Kruskal,"\frac{\sigma - \sigma'}{2n-\sigma'}; \sigma=\max(a,b)+\max(c,d)+\max(a,c)+\max(b,d); \sigma'=\max(a+c,b+d)+\max(a+b,c+d)"
51 | s,Anderberg,"\frac{\sigma-\sigma'}{2n}"
52 | s,Baroni-Urbani-Buser-I,"\frac{\sqrt{ad}+a}{\sqrt{ad}+a+b+c}"
53 | s,Baroni-Urbani-Buser-II,"\frac{\sqrt{ad}+a-(b+c)}{\sqrt{ad}+a+b+c}"
54 | s,Peirce,"\frac{ab+bc}{ab+2bc+cd}"
55 | s,Eyraud,"\frac{n^2(na-(a+b)(a+c))}{(a+b)(a+c)(b+d)(c+d)}"
56 | s,Tarantula,"\frac{a(c+d)}{c(a+b)}"
57 | s,Ample,"\left|\frac{a(c+d)}{c(a+b)}\right|"
58 | d,Hamming;Canberra;Manhattan;Cityblock;Minkowski,"b+c"
59 | d,Euclid,"\sqrt{b+c}"
60 | d,Squared-Euclid,"\sqrt{(b+c)^2}"
61 | d,Mean-Manhattan,"\frac{b+c}{a+b+c+d}"
62 | d,Vari,"\frac{b+c}{4a+4b+4c+4d}"
63 | d,Size Difference,"\frac{(b+c)^2}{(a+b+c+d)^2}"
64 | d,Shape Difference,"\frac{n(b+c)-(b-c)^2}{(a+b+c+d)^2}"
65 | d,Pattern Difference,"\frac{4bc}{(a+b+c+d)^2}"
66 | d,Lance-Williams;Bray-Curtis,"\frac{b+c}{2a+b+c}"
67 | d,Hellinger,"2\sqrt{1 - \frac{a}{\sqrt{(a+b)(a+c)}}}"
68 | d,Chord,"\sqrt{2\left(1 - \frac{a}{\sqrt{(a+b)(a+c)}}\right)}"
69 | d,Yule-Q,"\frac{2bc}{ad+bc}"
--------------------------------------------------------------------------------
/misc/binary-measures.py:
--------------------------------------------------------------------------------
1 | import pandas as pd
2 |
3 | df = pd.read_csv('binary-measures.csv')
4 | print(df)
5 | print(df.columns)
6 | print(df.shape)
7 | print(len(df.equation.unique()))
8 |
9 | equations = [r.equation for _, r in df.iterrows()]
10 | print(len(equations))
11 | m = {e: 0 for e in equations}
12 | for e in equations:
13 | m[e] += 1
14 | for e, c in m.items():
15 | if c > 1:
16 | print(e, c)
17 |
18 | for _, r in df[df.type == 'd'].sort_values(['name']).iterrows():
19 | e_name = r['name']
20 | e_form = r.equation
21 | name = f' * - {e_name}'
22 | eqn = f' - :math:`{e_form}`'
23 | print(name)
24 | print(eqn)
25 |
--------------------------------------------------------------------------------
/misc/count-measures.py:
--------------------------------------------------------------------------------
1 | from pypair.biserial import Biserial
2 | from pypair.contingency import BinaryTable, CategoricalTable, ConfusionMatrix
3 | from pypair.continuous import Concordance, CorrelationRatio, Continuous
4 |
5 | measures = [
6 | ('Binary-Binary', BinaryTable.measures()),
7 | ('Confusion Matrix, Binary-Binary', ConfusionMatrix.measures()),
8 | ('Categorical-Categorical', CategoricalTable.measures()),
9 | ('Categorical-Continuous, Biserial', Biserial.measures()),
10 | ('Categorical-Continuous', CorrelationRatio.measures()),
11 | ('Ordinal-Ordinal, Concordance', Concordance.measures()),
12 | ('Continuous-Continuous', Continuous.measures())
13 | ]
14 | print(sum([len(m) for _, m in measures]))
15 |
16 | for n, items in measures:
17 | title = f'{n} ({len(items)})'
18 | print(title)
19 | print('-' * len(title))
20 | print('')
21 | for m in items:
22 | print(f'- {m}')
23 | print('')
24 |
25 |
--------------------------------------------------------------------------------
/misc/ipynb/binary-binary.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": 10,
6 | "metadata": {},
7 | "outputs": [],
8 | "source": [
9 | "import pandas as pd\n",
10 | "\n",
11 | "get_data = lambda x, y, n: [(x, y) * 2 for _ in range(n)]\n",
12 | "data = get_data(1, 1, 207) + get_data(1, 0, 282) + get_data(0, 1, 231) + get_data(0, 0, 242)\n",
13 | "pdf = pd.DataFrame(data, columns=['x1', 'x2', 'x3', 'x4'])\n",
14 | "sdf = sqlContext.createDataFrame(pdf)"
15 | ]
16 | },
17 | {
18 | "cell_type": "code",
19 | "execution_count": 13,
20 | "metadata": {},
21 | "outputs": [
22 | {
23 | "name": "stdout",
24 | "output_type": "stream",
25 | "text": [
26 | "root\n",
27 | " |-- x1: long (nullable = true)\n",
28 | " |-- x2: long (nullable = true)\n",
29 | " |-- x3: long (nullable = true)\n",
30 | " |-- x4: long (nullable = true)\n",
31 | "\n"
32 | ]
33 | }
34 | ],
35 | "source": [
36 | "sdf.printSchema()"
37 | ]
38 | },
39 | {
40 | "cell_type": "code",
41 | "execution_count": 29,
42 | "metadata": {},
43 | "outputs": [],
44 | "source": [
45 | "from itertools import combinations\n",
46 | "\n",
47 | "def to_counts(d):\n",
48 | " def as_key(k1, k2):\n",
49 | " keys = sorted([k1, k2])\n",
50 | " return keys[0], keys[1]\n",
51 | " \n",
52 | " def as_count(v1, v2):\n",
53 | " a, b, c, d = 0, 0, 0, 0\n",
54 | " if v1 is not None and v2 is not None:\n",
55 | " if v1 == 1 and v2 == 1:\n",
56 | " a = 1\n",
57 | " elif v1 == 1 and v2 == 0:\n",
58 | " b = 1\n",
59 | " elif v1 == 0 and v2 == 1:\n",
60 | " c = 1\n",
61 | " else:\n",
62 | " d = 1\n",
63 | " return a, b, c, d\n",
64 | " \n",
65 | " def transform(k1, k2):\n",
66 | " v1, v2 = d[k1], d[k2]\n",
67 | " return as_key(k1, k2), as_count(v1, v2)\n",
68 | " \n",
69 | " return [transform(k1, k2) for k1, k2 in combinations(d.keys(), 2)]\n",
70 | "\n",
71 | "def add_counts(a, b):\n",
72 | " return a[0] + b[0], a[1] + b[1], a[2] + b[2], a[3] + b[3]\n",
73 | "\n",
74 | "# to_counts({'x1': 1, 'x2': 1, 'x3': 1, 'x4': 1})"
75 | ]
76 | },
77 | {
78 | "cell_type": "code",
79 | "execution_count": 31,
80 | "metadata": {},
81 | "outputs": [
82 | {
83 | "data": {
84 | "text/plain": [
85 | "[(('x1', 'x2'), (207, 282, 231, 242)),\n",
86 | " (('x1', 'x3'), (489, 0, 0, 473)),\n",
87 | " (('x1', 'x4'), (207, 282, 231, 242)),\n",
88 | " (('x2', 'x3'), (207, 231, 282, 242)),\n",
89 | " (('x2', 'x4'), (438, 0, 0, 524)),\n",
90 | " (('x3', 'x4'), (207, 282, 231, 242))]"
91 | ]
92 | },
93 | "execution_count": 31,
94 | "metadata": {},
95 | "output_type": "execute_result"
96 | }
97 | ],
98 | "source": [
99 | "sdf.rdd\\\n",
100 | " .flatMap(lambda r: to_counts(r.asDict()))\\\n",
101 | " .reduceByKey(lambda a, b: add_counts(a, b))\\\n",
102 | " .sortByKey()\\\n",
103 | " .collect()"
104 | ]
105 | },
106 | {
107 | "cell_type": "code",
108 | "execution_count": null,
109 | "metadata": {},
110 | "outputs": [],
111 | "source": []
112 | }
113 | ],
114 | "metadata": {
115 | "kernelspec": {
116 | "display_name": "Python 3",
117 | "language": "python",
118 | "name": "python3"
119 | },
120 | "language_info": {
121 | "codemirror_mode": {
122 | "name": "ipython",
123 | "version": 3
124 | },
125 | "file_extension": ".py",
126 | "mimetype": "text/x-python",
127 | "name": "python",
128 | "nbconvert_exporter": "python",
129 | "pygments_lexer": "ipython3",
130 | "version": "3.7.6"
131 | }
132 | },
133 | "nbformat": 4,
134 | "nbformat_minor": 4
135 | }
136 |
--------------------------------------------------------------------------------
/misc/ipynb/cat-cat.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": 10,
6 | "metadata": {},
7 | "outputs": [],
8 | "source": [
9 | "from random import choice\n",
10 | "\n",
11 | "\n",
12 | "x_domain = ['a', 'b', 'c']\n",
13 | "y_domain = ['a', 'b']\n",
14 | "\n",
15 | "get_x = lambda: choice(x_domain)\n",
16 | "get_y = lambda: choice(y_domain)\n",
17 | "get_data = lambda: {f'x{i}':v for i, v in enumerate((get_x(), get_y(), get_x(), get_y()))}\n",
18 | "\n",
19 | "data = [get_data() for _ in range(10)]"
20 | ]
21 | },
22 | {
23 | "cell_type": "code",
24 | "execution_count": 19,
25 | "metadata": {},
26 | "outputs": [],
27 | "source": [
28 | "from itertools import combinations, chain\n",
29 | "\n",
30 | "def to_count(d):\n",
31 | " def count(k1, k2):\n",
32 | " tups = [(k1, d[k1]), (k2, d[k2])]\n",
33 | " tups = sorted(tups, key=lambda t: t[0])\n",
34 | " \n",
35 | " return (tups[0][0], tups[1][0], tups[0][1], tups[1][1]), 1\n",
36 | " \n",
37 | " return [count(k1, k2) for k1, k2 in combinations(d.keys(), 2)]\n",
38 | " \n",
39 | "t = map(lambda d: to_count(d), data)\n",
40 | "t = chain(*t)"
41 | ]
42 | },
43 | {
44 | "cell_type": "code",
45 | "execution_count": 20,
46 | "metadata": {},
47 | "outputs": [
48 | {
49 | "data": {
50 | "text/plain": [
51 | "[(('x0', 'x1', 'c', 'a'), 1),\n",
52 | " (('x0', 'x2', 'c', 'c'), 1),\n",
53 | " (('x0', 'x3', 'c', 'a'), 1),\n",
54 | " (('x1', 'x2', 'a', 'c'), 1),\n",
55 | " (('x1', 'x3', 'a', 'a'), 1),\n",
56 | " (('x2', 'x3', 'c', 'a'), 1),\n",
57 | " (('x0', 'x1', 'a', 'a'), 1),\n",
58 | " (('x0', 'x2', 'a', 'b'), 1),\n",
59 | " (('x0', 'x3', 'a', 'b'), 1),\n",
60 | " (('x1', 'x2', 'a', 'b'), 1),\n",
61 | " (('x1', 'x3', 'a', 'b'), 1),\n",
62 | " (('x2', 'x3', 'b', 'b'), 1),\n",
63 | " (('x0', 'x1', 'b', 'a'), 1),\n",
64 | " (('x0', 'x2', 'b', 'a'), 1),\n",
65 | " (('x0', 'x3', 'b', 'b'), 1),\n",
66 | " (('x1', 'x2', 'a', 'a'), 1),\n",
67 | " (('x1', 'x3', 'a', 'b'), 1),\n",
68 | " (('x2', 'x3', 'a', 'b'), 1),\n",
69 | " (('x0', 'x1', 'a', 'b'), 1),\n",
70 | " (('x0', 'x2', 'a', 'a'), 1),\n",
71 | " (('x0', 'x3', 'a', 'a'), 1),\n",
72 | " (('x1', 'x2', 'b', 'a'), 1),\n",
73 | " (('x1', 'x3', 'b', 'a'), 1),\n",
74 | " (('x2', 'x3', 'a', 'a'), 1),\n",
75 | " (('x0', 'x1', 'b', 'a'), 1),\n",
76 | " (('x0', 'x2', 'b', 'a'), 1),\n",
77 | " (('x0', 'x3', 'b', 'a'), 1),\n",
78 | " (('x1', 'x2', 'a', 'a'), 1),\n",
79 | " (('x1', 'x3', 'a', 'a'), 1),\n",
80 | " (('x2', 'x3', 'a', 'a'), 1),\n",
81 | " (('x0', 'x1', 'b', 'a'), 1),\n",
82 | " (('x0', 'x2', 'b', 'b'), 1),\n",
83 | " (('x0', 'x3', 'b', 'b'), 1),\n",
84 | " (('x1', 'x2', 'a', 'b'), 1),\n",
85 | " (('x1', 'x3', 'a', 'b'), 1),\n",
86 | " (('x2', 'x3', 'b', 'b'), 1),\n",
87 | " (('x0', 'x1', 'c', 'b'), 1),\n",
88 | " (('x0', 'x2', 'c', 'a'), 1),\n",
89 | " (('x0', 'x3', 'c', 'a'), 1),\n",
90 | " (('x1', 'x2', 'b', 'a'), 1),\n",
91 | " (('x1', 'x3', 'b', 'a'), 1),\n",
92 | " (('x2', 'x3', 'a', 'a'), 1),\n",
93 | " (('x0', 'x1', 'b', 'b'), 1),\n",
94 | " (('x0', 'x2', 'b', 'b'), 1),\n",
95 | " (('x0', 'x3', 'b', 'a'), 1),\n",
96 | " (('x1', 'x2', 'b', 'b'), 1),\n",
97 | " (('x1', 'x3', 'b', 'a'), 1),\n",
98 | " (('x2', 'x3', 'b', 'a'), 1),\n",
99 | " (('x0', 'x1', 'a', 'a'), 1),\n",
100 | " (('x0', 'x2', 'a', 'a'), 1),\n",
101 | " (('x0', 'x3', 'a', 'b'), 1),\n",
102 | " (('x1', 'x2', 'a', 'a'), 1),\n",
103 | " (('x1', 'x3', 'a', 'b'), 1),\n",
104 | " (('x2', 'x3', 'a', 'b'), 1),\n",
105 | " (('x0', 'x1', 'b', 'a'), 1),\n",
106 | " (('x0', 'x2', 'b', 'b'), 1),\n",
107 | " (('x0', 'x3', 'b', 'a'), 1),\n",
108 | " (('x1', 'x2', 'a', 'b'), 1),\n",
109 | " (('x1', 'x3', 'a', 'a'), 1),\n",
110 | " (('x2', 'x3', 'b', 'a'), 1)]"
111 | ]
112 | },
113 | "execution_count": 20,
114 | "metadata": {},
115 | "output_type": "execute_result"
116 | }
117 | ],
118 | "source": [
119 | "list(t)"
120 | ]
121 | },
122 | {
123 | "cell_type": "code",
124 | "execution_count": null,
125 | "metadata": {},
126 | "outputs": [],
127 | "source": []
128 | }
129 | ],
130 | "metadata": {
131 | "kernelspec": {
132 | "display_name": "Python 3",
133 | "language": "python",
134 | "name": "python3"
135 | },
136 | "language_info": {
137 | "codemirror_mode": {
138 | "name": "ipython",
139 | "version": 3
140 | },
141 | "file_extension": ".py",
142 | "mimetype": "text/x-python",
143 | "name": "python",
144 | "nbconvert_exporter": "python",
145 | "pygments_lexer": "ipython3",
146 | "version": "3.7.3"
147 | }
148 | },
149 | "nbformat": 4,
150 | "nbformat_minor": 2
151 | }
152 |
--------------------------------------------------------------------------------
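The cells above stop after emitting the ((k1, k2, v1, v2), 1) pairs; the aggregation they are building toward (Spark's reduceByKey(lambda a, b: a + b)) can be sketched locally. A minimal sketch, assuming to_count and data from the cells above are in scope:

    from collections import Counter
    from itertools import chain

    pairs = chain(*map(to_count, data))
    counts = Counter(k for k, _ in pairs)  # sums the 1's per (k1, k2, v1, v2) key
    # e.g. counts[('x0', 'x1', 'a', 'a')] is the number of rows where x0 == 'a' and x1 == 'a'.

--------------------------------------------------------------------------------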
/misc/scratch.py:
--------------------------------------------------------------------------------
1 | from pypair.contingency import CategoricalTable, BinaryTable, ConfusionMatrix, AgreementTable
2 |
3 | a = [0, 0, 0, 0, 0, 1, 1, 1, 1, 1]
4 | b = [0, 0, 0, 1, 1, 0, 0, 1, 1, 1]
5 |
6 | cat = CategoricalTable(a, b)
7 | bin = BinaryTable(a, b)
8 | con = ConfusionMatrix(a, b)
9 | agr = AgreementTable(a, b)
10 |
11 | print(cat.measures())
12 | print(CategoricalTable.measures())
13 | print('-' * 15)
14 | print(bin.measures())
15 | print(BinaryTable.measures())
16 | print('-' * 15)
17 | print(con.measures())
18 | print(ConfusionMatrix.measures())
19 | print('-' * 15)
20 | print(agr.measures())
21 | print(AgreementTable.measures())
22 |
23 | print('~' * 15)
24 | print('~' * 15)
25 |
26 |
27 | def print_measures(computer):
28 | r = {m: computer.get(m) for m in computer.measures()}
29 | print(r)
30 |
31 |
32 | print_measures(cat)
33 | print_measures(bin)
34 | print_measures(con)
35 | print_measures(agr)
36 |
--------------------------------------------------------------------------------
/publish.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | SOURCE_DIST=/py-pair/dist/pypair-${API_VERSION}.tar.gz
4 |
5 | buildCode() {
6 | echo "start the build"
7 | cd /py-pair \
8 | && make clean \
9 | && make \
10 | && twine check dist/*
11 | }
12 |
13 | updateVersion() {
14 | echo "replace version of software to ${API_VERSION}"
15 | sed -i "s/version = 0.0.1/version = ${API_VERSION}/g" /py-pair/setup.cfg
16 | }
17 |
18 | copyCredentials() {
19 | if [[ -f /py-pair/.pypirc ]]; then
20 | echo "copying over .pypirc"
21 | cp /py-pair/.pypirc /root/.pypirc
22 | fi
23 | }
24 |
25 | publish() {
26 | echo "python publish"
27 |
28 | if [[ -f /root/.pypirc ]]; then
29 | if [[ -f ${SOURCE_DIST} ]]; then
30 | echo "uploading source"
31 | cd /py-pair \
32 | && make clean \
33 | && python setup.py sdist \
34 | && twine upload --repository ${PYPI_REPO} ${SOURCE_DIST}
35 | else
36 | echo "no ${SOURCE_DIST} found!"
37 | fi
38 | else
39 | echo "no .pypirc found!"
40 | fi
41 | }
42 |
43 | cleanUp() {
44 | if [[ -f /root/.pypirc ]]; then
45 | echo "cleaning up"
46 | rm -f /root/.pypirc
47 | fi
48 | }
49 |
50 | build() {
51 | echo "python build"
52 | buildCode
53 | publish
54 | }
55 |
56 | conda init bash
57 | . /root/.bashrc
58 | updateVersion
59 | copyCredentials
60 | build
61 | cleanUp
62 |
63 | echo "done!"
--------------------------------------------------------------------------------
/pypair/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/oneoffcoder/py-pair/79e6e75c32333fc9421aad36c02c2e1043ae4a05/pypair/__init__.py
--------------------------------------------------------------------------------
/pypair/association.py:
--------------------------------------------------------------------------------
1 | from pypair.biserial import Biserial
2 | from pypair.contingency import BinaryTable, CategoricalTable, ConfusionMatrix, AgreementTable
3 | from pypair.continuous import Concordance, CorrelationRatio, Continuous
4 |
5 |
6 | def confusion(a, b, measure='acc', a_0=0, a_1=1, b_0=0, b_1=1):
7 | """
8 | Gets the specified confusion matrix stats.
9 |
10 | :param a: Binary variable (iterable).
11 | :param b: Binary variable (iterable).
12 | :param measure: Measure. Default is `acc`.
13 | :param a_0: The a zero value. Default 0.
14 | :param a_1: The a one value. Default 1.
15 | :param b_0: The b zero value. Default 0.
16 | :param b_1: The b one value. Default 1.
17 | :return: Measure.
18 | """
19 | if measure not in ConfusionMatrix.measures():
20 | raise ValueError(f'{measure} is not a valid association measure.')
21 | return ConfusionMatrix(a, b, a_0=a_0, a_1=a_1, b_0=b_0, b_1=b_1).get(measure)
22 |
23 |
24 | def binary_binary(a, b, measure='chisq', a_0=0, a_1=1, b_0=0, b_1=1):
25 | """
26 | Gets the binary-binary association.
27 |
28 | :param a: Binary variable (iterable).
29 | :param b: Binary variable (iterable).
30 | :param measure: Measure. Default is `chisq`.
31 | :param a_0: The a zero value. Default 0.
32 | :param a_1: The a one value. Default 1.
33 | :param b_0: The b zero value. Default 0.
34 | :param b_1: The b one value. Default 1.
35 | :return: Measure.
36 | """
37 | if measure not in BinaryTable.measures():
38 | raise ValueError(f'{measure} is not a valid association measure.')
39 | return BinaryTable(a, b, a_0=a_0, a_1=a_1, b_0=b_0, b_1=b_1).get(measure)
40 |
41 |
42 | def categorical_categorical(a, b, measure='chisq', a_vals=None, b_vals=None):
43 | """
44 | Gets the categorical-categorical association.
45 |
46 | :param a: Categorical variable (iterable).
47 | :param b: Categorical variable (iterable).
48 | :param measure: Measure. Default is `chisq`.
49 | :param a_vals: The unique values in `a`.
50 | :param b_vals: The unique values in `b`.
51 | :return: Measure.
52 | """
53 | if measure not in CategoricalTable.measures():
54 | raise ValueError(f'{measure} is not a valid association measure.')
55 | return CategoricalTable(a, b, a_vals=a_vals, b_vals=b_vals).get(measure)
56 |
57 |
58 | def agreement(a, b, measure='chohen_k', a_vals=None, b_vals=None):
59 | """
60 | Gets the agreement association.
61 |
62 | :param a: Categorical variable (iterable).
63 | :param b: Categorical variable (iterable).
64 | :param measure: Measure. Default is `chohen_k`.
65 | :param a_vals: The unique values in `a`.
66 | :param b_vals: The unique values in `b`.
67 | :return: Measure.
68 | """
69 | if measure not in AgreementTable.measures():
70 | raise ValueError(f'{measure} is not a valid association measure.')
71 | return AgreementTable(a, b, a_vals=a_vals, b_vals=b_vals).get(measure)
72 |
73 |
74 | def binary_continuous(b, c, measure='biserial', b_0=0, b_1=1):
75 | """
76 | Gets the binary-continuous association.
77 |
78 | :param b: Binary variable (iterable).
79 | :param c: Continuous variable (iterable).
80 | :param measure: Measure. Default is `biserial`.
81 | :param b_0: Value when `b` is zero. Default 0.
82 | :param b_1: Value when `b` is one. Default is 1.
83 | :return: Measure.
84 | """
85 | if measure not in Biserial.measures():
86 | raise ValueError(f'{measure} is not a valid association measure.')
87 | return Biserial(b, c, b_0=b_0, b_1=b_1).get(measure)
88 |
89 |
90 | def categorical_continuous(x, y, measure='eta'):
91 | """
92 | Gets the categorical-continuous association.
93 |
94 | :param x: Categorical variable (iterable).
95 | :param y: Continuous variable (iterable).
96 | :param measure: Measure. Default is `eta`.
97 | :return: Measure.
98 | """
99 | if measure not in CorrelationRatio.measures():
100 | raise ValueError(f'{measure} is not a valid association measure.')
101 | return CorrelationRatio(x, y).get(measure)
102 |
103 |
104 | def concordance(x, y, measure='kendall_tau'):
105 | """
106 | Gets the specified concordance between the two variables.
107 |
108 | :param x: Continuous or ordinal variable (iterable).
109 | :param y: Continuous or ordinal variable (iterable).
110 | :param measure: Measure. Default is `kendall_tau`.
111 | :return: Measure.
112 | """
113 | if measure not in Concordance.measures():
114 | raise ValueError(f'{measure} is not a valid association measure.')
115 | return Concordance(x, y).get(measure)
116 |
117 |
118 | def continuous_continuous(x, y, measure='pearson'):
119 | """
120 | Gets the continuous-continuous association.
121 |
122 | :param x: Continuous variable (iterable).
123 | :param y: Continuous variable (iterable).
124 | :param measure: Measure. Default is 'pearson'.
125 | :return: Measure.
126 | """
127 | if measure not in Continuous.measures():
128 | raise ValueError(f'{measure} is not a valid association measure.')
129 | return Continuous(x, y).get(measure)
130 |
--------------------------------------------------------------------------------
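A minimal usage sketch of the convenience functions above; the data is illustrative, and the valid measure names for each function can be listed through the corresponding table class's measures() classmethod:

    from pypair.association import binary_binary, continuous_continuous
    from pypair.contingency import BinaryTable

    a = [1, 1, 0, 0, 1, 0, 1, 0]
    b = [1, 0, 0, 1, 1, 0, 1, 1]
    print(BinaryTable.measures())                # names accepted by binary_binary
    print(binary_binary(a, b, measure='chisq'))  # the default measure

    x = [1, 2, 3, 4, 5]
    y = [2, 4, 5, 4, 5]
    print(continuous_continuous(x, y, measure='pearson'))  # (r, p-value) from scipy.stats.pearsonr

--------------------------------------------------------------------------------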
/pypair/biserial.py:
--------------------------------------------------------------------------------
1 | from functools import lru_cache
2 | from math import sqrt
3 |
4 | import pandas as pd
5 | from scipy.stats import norm
6 |
7 | from pypair.util import MeasureMixin
8 |
9 |
10 | class BiserialMixin(object):
11 | """
12 |     Biserial computations based on :math:`n, p, q, y_0, y_1, \\sigma`.
13 |
14 | """
15 |
16 | @property
17 | @lru_cache(maxsize=None)
18 | def __params(self):
19 | """
20 | Gets the parameters associated with the data.
21 |
22 | - n: total
23 |         - p: P(b=1)
24 | - q: 1 - p
25 | - y_0: average of c when b=0
26 | - y_1: average of c when b=1
27 | - std: standard deviation of c
28 |
29 | :return: n, p, q, y_0, y_1, std
30 | """
31 | return self._n, self._p, self._q, self._y_0, self._y_1, self._std
32 |
33 | @property
34 | @lru_cache(maxsize=None)
35 | def biserial(self):
36 | """
37 | Computes the biserial correlation between a binary and continuous variable. The biserial correlation
38 | :math:`r_b` can be computed from the point-biserial correlation :math:`r_{\\mathrm{pb}}` as follows.
39 |
40 | :math:`r_b = \\frac{r_{\\mathrm{pb}}}{h} \\sqrt{pq}`
41 |
42 | The tricky thing to explain is the :math:`h` parameter. :math:`h` is defined as the
43 |         height of the standard normal distribution at :math:`z`, where :math:`P(z' > z) = p`.
44 |         The way to get :math:`h` in practice is to take the inverse standard normal of :math:`q`, and
45 |         then take the standard normal density at that result. Using SciPy: ``norm.pdf(norm.ppf(q))``.
46 |
47 | References
48 |
49 | - `Point-Biserial Correlation & Biserial Correlation: Definition, Examples `_
50 | - `Point-Biserial and Biserial Correlations `_
51 | - `Real Statistics Using Excel `_
52 | - `NORM.S.DIST function `_
53 | - `NORM.S.INV function `_
54 | - `scipy.stats.norm `_
55 | - `How to calculate the inverse of the normal cumulative distribution function in python? `_
56 |
57 | :return: Biserial correlation coefficient.
58 | """
59 | n, p, q, y_0, y_1, std = self.__params
60 |
61 | r_pb = (y_1 - y_0) * sqrt(p * q) / std
62 |
63 | y = norm.pdf(norm.ppf(q))
64 | r_b = r_pb * sqrt(p * q) / y
65 |
66 | return r_b
67 |
68 | @property
69 | @lru_cache(maxsize=None)
70 | def point_biserial(self):
71 | """
72 | Computes the `point-biserial correlation coefficient `_
73 | between a binary variable :math:`X` and a continuous variable :math:`Y`.
74 |
75 | :math:`r_{\\mathrm{pb}} = \\frac{(Y_1 - Y_0) \\sqrt{pq}}{\\sigma_Y}`
76 |
77 | Where
78 |
79 | - :math:`Y_0` is the average of :math:`Y` when :math:`X=0`
80 | - :math:`Y_1` is the average of :math:`Y` when :math:`X=1`
81 |         - :math:`\\sigma_Y` is the standard deviation of :math:`Y`
82 | - :math:`p` is :math:`P(X=1)`
83 | - :math:`q` is :math:`1 - p`
84 |
85 | :return: Point-biserial correlation coefficient.
86 | """
87 | n, p, q, y_0, y_1, std = self.__params
88 |
89 | r = (y_1 - y_0) * sqrt(p * q) / std
90 | return r
91 |
92 | @property
93 | @lru_cache(maxsize=None)
94 | def rank_biserial(self):
95 | """
96 | Computes the rank-biserial correlation between a binary variable :math:`X` and a continuous variable :math:`Y`.
97 |
98 | :math:`r_r = \\frac{2 (Y_1 - Y_0)}{n}`
99 |
100 | Where
101 |
102 | - :math:`Y_0` is the average of :math:`Y` when :math:`X=0`
103 | - :math:`Y_1` is the average of :math:`Y` when :math:`X=1`
104 | - :math:`n` is the total number of data
105 |
106 | :return: Rank-biserial correlation.
107 | """
108 | n, p, q, y_0, y_1, std = self.__params
109 |
110 | r = 2 * (y_1 - y_0) / n
111 | return r
112 |
113 |
114 | class Biserial(MeasureMixin, BiserialMixin, object):
115 | """
116 | Biserial association between a binary and continuous variable.
117 | """
118 |
119 | def __init__(self, b, c, b_0=0, b_1=1):
120 | """
121 | ctor.
122 |
123 | :param b: Binary variable (iterable).
124 | :param c: Continuous variable (iterable).
125 | :param b_0: Value for b is zero. Default 0.
126 | :param b_1: Value for b is one. Default 1.
127 | """
128 | df = pd.DataFrame([(x, y) for x, y in zip(b, c) if pd.notna(x)], columns=['b', 'c'])
129 |
130 | n = df.shape[0]
131 | p = df[df.b == b_1].shape[0] / n
132 | q = 1.0 - p
133 |
134 | y_0 = df[df.b == b_0].c.mean()
135 | y_1 = df[df.b == b_1].c.mean()
136 | std = df.c.std()
137 |
138 | self._n = n
139 | self._p = p
140 | self._q = q
141 | self._y_0 = y_0
142 | self._y_1 = y_1
143 | self._std = std
144 |
145 |
146 | class BiserialStats(MeasureMixin, BiserialMixin, object):
147 | """
148 | Computes biserial stats.
149 | """
150 |
151 | def __init__(self, n, p, y_0, y_1, std):
152 | """
153 | ctor.
154 |
155 | :param n: Total number of samples.
156 |         :param p: :math:`P(X=1)`.
157 | :param y_0: Average of :math:`Y` when :math:`X=0`. :math:`\\bar{Y}_0`
158 | :param y_1: Average of :math:`Y` when :math:`X=1`. :math:`\\bar{Y}_1`
159 | :param std: Standard deviation of :math:`Y`, :math:`\\sigma`.
160 | """
161 | self._n = n
162 | self._p = p
163 | self._q = 1.0 - p
164 | self._y_0 = y_0
165 | self._y_1 = y_1
166 | self._std = std
167 |
--------------------------------------------------------------------------------
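A minimal sketch showing that Biserial (raw data) and BiserialStats (pre-computed summary statistics) agree; the data below is made up so that p = P(b=1) = 0.5, the group means are 2.5 and 6.5, and the sample standard deviation is sqrt(6):

    from pypair.biserial import Biserial, BiserialStats

    b = [0, 0, 0, 0, 1, 1, 1, 1]
    c = [1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0]

    bis = Biserial(b, c)
    print(bis.point_biserial, bis.biserial, bis.rank_biserial)

    # The same measures from summary statistics alone.
    stats = BiserialStats(n=8, p=0.5, y_0=2.5, y_1=6.5, std=6 ** 0.5)
    print(stats.point_biserial)  # matches bis.point_biserial

--------------------------------------------------------------------------------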
/pypair/continuous.py:
--------------------------------------------------------------------------------
1 | from functools import reduce, lru_cache
2 | from itertools import combinations
3 | from math import sqrt
4 |
5 | import pandas as pd
6 | from scipy.stats import pearsonr, spearmanr, kendalltau, f_oneway, kruskal, linregress
7 | from sklearn.metrics import silhouette_score, calinski_harabasz_score, davies_bouldin_score
8 |
9 | from pypair.util import MeasureMixin
10 |
11 |
12 | class ConcordantCounts(object):
13 | """
14 |     Stores the concordant, discordant and tie counts.
15 | """
16 |
17 | def __init__(self, d, t_xy, t_x, t_y, c):
18 | """
19 | ctor.
20 |
21 | :param d: Discordant.
22 | :param t_xy: Tie.
23 | :param t_x: Tie on X.
24 | :param t_y: Tie on Y.
25 | :param c: Concordant.
26 | """
27 | self.d = d
28 | self.t_xy = t_xy
29 | self.t_x = t_x
30 | self.t_y = t_y
31 | self.c = c
32 |
33 | def __add__(self, other):
34 | d = self.d + other.d
35 | t_xy = self.t_xy + other.t_xy
36 | t_x = self.t_x + other.t_x
37 | t_y = self.t_y + other.t_y
38 | c = self.c + other.c
39 | return ConcordantCounts(d, t_xy, t_x, t_y, c)
40 |
41 |
42 | class Continuous(MeasureMixin, object):
43 | def __init__(self, a, b):
44 | """
45 | ctor.
46 |
47 | :param a: Continuous variable (iterable).
48 | :param b: Continuous variable (iterable).
49 | """
50 | self.__a = a
51 | self.__b = b
52 |
53 | @property
54 | @lru_cache(maxsize=None)
55 | def pearson(self):
56 | """
57 | `Pearson's r `_.
58 |
59 | :return: Pearson's r, p-value.
60 | """
61 | return pearsonr(self.__a, self.__b)
62 |
63 | @property
64 | @lru_cache(maxsize=None)
65 | def spearman(self):
66 | """
67 | `Spearman's r `_.
68 |
69 | :return: Spearman's r, p-value.
70 | """
71 | r = spearmanr(self.__a, self.__b)
72 | return r.correlation, r.pvalue
73 |
74 | @property
75 | @lru_cache(maxsize=None)
76 | def kendall(self):
77 | """
78 | `Kendall's tau `_.
79 |
80 | :return: Kendall's tau, p-value.
81 | """
82 | r = kendalltau(self.__a, self.__b)
83 | return r.correlation, r.pvalue
84 |
85 | @property
86 | @lru_cache(maxsize=None)
87 | def regression(self):
88 | """
89 |         `Linear regression `_.
90 |
91 |         :return: Correlation coefficient (r-value), p-value.
92 | """
93 | slope, intercept, r_value, p_value, std_err = linregress(self.__a, self.__b)
94 | return r_value, p_value
95 |
96 |
97 | class CorrelationRatio(MeasureMixin, object):
98 | """
99 | `Correlation ratio `_.
100 |
101 | """
102 |
103 | def __init__(self, x, y):
104 | """
105 | ctor.
106 |
107 | :param x: Categorical variable (iterable).
108 | :param y: Continuous variable (iterable).
109 | """
110 | is_valid = lambda a, b: a is not None and b is not None
111 | self.__df = pd.DataFrame([(a, b) for a, b, in zip(x, y) if is_valid(a, b)], columns=['x', 'y'])
112 |
113 | @property
114 | @lru_cache(maxsize=None)
115 | def __mean(self):
116 | """
117 | Gets the mean of :math:`\\bar{y}`.
118 |
119 | :return: :math:`\\bar{y}`.
120 | """
121 | return self.__df.y.mean()
122 |
123 | @property
124 | @lru_cache(maxsize=None)
125 | def __sigma_cat(self):
126 | """
127 | Gets :math:`\\sigma_{\\bar{y}}^2`
128 |
129 | :return: :math:`\\sigma_{\\bar{y}}^2`.
130 | """
131 | stats = self.__df.groupby(['x']).agg(['count', 'mean']).reset_index()
132 | stats.columns = stats.columns.droplevel(0)
133 | stats = stats.rename(columns={'': 'x', 'count': 'n_x', 'mean': 'y_x'})
134 | y = self.__mean
135 |
136 | sigma = sum([r.n_x * (r.y_x - y) ** 2 for _, r in stats.iterrows()])
137 |
138 | return sigma
139 |
140 | @property
141 | def __sigma_sam(self):
142 | """
143 | Gets :math:`\\sigma_{y}^2`
144 |
145 | :return: :math:`\\sigma_{y}^2`.
146 | """
147 | y = self.__mean
148 | sigma = sum((self.__df.y - y) ** 2)
149 |
150 | return sigma
151 |
152 | @property
153 | @lru_cache(maxsize=None)
154 | def eta_squared(self):
155 | """
156 | Gets :math:`\\eta^2 = \\frac{\\sigma_{\\bar{y}}^2}{\\sigma_{y}^2}`
157 |
158 | :return: :math:`\\eta^2`.
159 | """
160 | sigma_cat = self.__sigma_cat
161 | sigma_sam = self.__sigma_sam
162 | eta = sigma_cat / sigma_sam
163 | return eta
164 |
165 | @property
166 | @lru_cache(maxsize=None)
167 | def eta(self):
168 | """
169 | Gets :math:`\\eta`.
170 |
171 | :returns: :math:`\\eta`.
172 | """
173 | return sqrt(self.eta_squared)
174 |
175 | @property
176 | @lru_cache(maxsize=None)
177 | def anova(self):
178 | """
179 | Computes an `ANOVA test `_.
180 |
181 | :return: F-statistic, p-value.
182 | """
183 | df = self.__df
184 | samples = [df[df.x == x].y for x in df.x.unique()]
185 | r = f_oneway(*samples)
186 | return r.statistic, r.pvalue
187 |
188 | @property
189 | @lru_cache(maxsize=None)
190 | def kruskal(self):
191 | """
192 | Computes the `Kruskal-Wallis H-test `_.
193 |
194 | :return: H-statistic, p-value.
195 | """
196 | df = self.__df
197 | samples = [df[df.x == x].y for x in df.x.unique()]
198 | r = kruskal(*samples)
199 | return r.statistic, r.pvalue
200 |
201 | @property
202 | @lru_cache(maxsize=None)
203 | def silhouette(self):
204 | """
205 | `Silhouette coefficient `_.
206 |
207 | :return: Silhouette coefficient.
208 | """
209 | labels = self.__df.x
210 | X = self.__df[['y']]
211 | return silhouette_score(X, labels)
212 |
213 | @property
214 | @lru_cache(maxsize=None)
215 | def davies_bouldin(self):
216 | """
217 | `Davies-Bouldin Index `_.
218 |
219 | :return: Davies-Bouldin Index.
220 | """
221 | labels = self.__df.x
222 | X = self.__df[['y']]
223 | return davies_bouldin_score(X, labels)
224 |
225 | @property
226 | @lru_cache(maxsize=None)
227 | def calinski_harabasz(self):
228 | """
229 | `Calinski-Harabasz Index `_.
230 |
231 | :return: Calinski-Harabasz Index.
232 | """
233 | labels = self.__df.x
234 | X = self.__df[['y']]
235 | return calinski_harabasz_score(X, labels)
236 |
237 |
238 | class ConcordanceMixin(object):
239 |
240 | @property
241 | @lru_cache(maxsize=None)
242 | def __counts(self):
243 | return self._d, self._t_xy, self._t_x, self._t_y, self._c, self._n
244 |
245 | @property
246 | @lru_cache(maxsize=None)
247 | def __probs(self):
248 | n = self._n
249 | return self._d / n, self._t_xy / n, self._t_x / n, self._t_y / n, self._c / n, n
250 |
251 | @property
252 | @lru_cache(maxsize=None)
253 | def kendall_tau(self):
254 | """
255 | Kendall's :math:`\\tau` is defined as follows.
256 |
257 | :math:`\\tau = \\frac{C - D}{{{n}\\choose{2}}}`
258 |
259 | Where
260 |
261 | - :math:`C` is the number of concordant pairs
262 | - :math:`D` is the number of discordant pairs
263 | - :math:`n` is the sample size
264 |
265 | :return: :math:`\\tau`.
266 | """
267 | d, t_xy, t_x, t_y, c, n = self.__counts
268 | t = (c - d) / (n * (n - 1) / 2)
269 | return t
270 |
271 | @property
272 | @lru_cache(maxsize=None)
273 | def somers_d(self):
274 | """
275 | Computes `Somers' d `_ for two continuous
276 | variables. Note that Somers' d is defined for :math:`d_{X \\cdot Y}` and :math:`d_{Y \\cdot X}`
277 | and in general :math:`d_{X \\cdot Y} \\neq d_{Y \\cdot X}`.
278 |
279 | - :math:`d_{Y \\cdot X} = \\frac{\\pi_c - \\pi_d}{\\pi_c + \\pi_d + \\pi_t^Y}`
280 | - :math:`d_{X \\cdot Y} = \\frac{\\pi_c - \\pi_d}{\\pi_c + \\pi_d + \\pi_t^X}`
281 |
282 | Where
283 |
284 | - :math:`\\pi_c = \\frac{C}{n}`
285 | - :math:`\\pi_d = \\frac{D}{n}`
286 | - :math:`\\pi_t^X = \\frac{T^X}{n}`
287 | - :math:`\\pi_t^Y = \\frac{T^Y}{n}`
288 | - :math:`C` is the number of concordant pairs
289 | - :math:`D` is the number of discordant pairs
290 | - :math:`T^X` is the number of ties on :math:`X`
291 | - :math:`T^Y` is the number of ties on :math:`Y`
292 | - :math:`n` is the sample size
293 |
294 | :return: :math:`d_{X \\cdot Y}`, :math:`d_{Y \\cdot X}`.
295 | """
296 | p_d, p_txy, p_tx, p_ty, p_c, n = self.__probs
297 |
298 | d_yx = (p_c - p_d) / (p_c + p_d + p_ty)
299 | d_xy = (p_c - p_d) / (p_c + p_d + p_tx)
300 |
301 | return d_yx, d_xy
302 |
303 | @property
304 | @lru_cache(maxsize=None)
305 | def goodman_kruskal_gamma(self):
306 | """
307 |         Goodman-Kruskal :math:`\\gamma` is like Somers' d. It is defined as follows.
308 |
309 | :math:`\\gamma = \\frac{\\pi_c - \\pi_d}{1 - \\pi_t}`
310 |
311 | Where
312 |
313 | - :math:`\\pi_c = \\frac{C}{n}`
314 | - :math:`\\pi_d = \\frac{D}{n}`
315 | - :math:`\\pi_t = \\frac{T}{n}`
316 | - :math:`C` is the number of concordant pairs
317 | - :math:`D` is the number of discordant pairs
318 | - :math:`T` is the number of ties
319 | - :math:`n` is the sample size
320 |
321 | :return: :math:`\\gamma`.
322 | """
323 | p_d, p_txy, p_tx, p_ty, p_c, n = self.__probs
324 | p_t = p_txy + p_tx + p_ty
325 |
326 | gamma = (p_c - p_d) / (1 - p_t)
327 |
328 | return gamma
329 |
330 |
331 | class Concordance(MeasureMixin, ConcordanceMixin, object):
332 | """
333 | Concordance for continuous and ordinal data.
334 | """
335 |
336 | def __init__(self, x, y):
337 | """
338 | ctor.
339 |
340 | :param x: Continuous or ordinal data (iterable).
341 | :param y: Continuous or ordinal data (iterable).
342 | """
343 | d, t_xy, t_x, t_y, c, n = Concordance.__to_counts(x, y)
344 | self._d = d
345 | self._t_xy = t_xy
346 | self._t_x = t_x
347 | self._t_y = t_y
348 | self._c = c
349 | self._n = n
350 |
351 | @staticmethod
352 | def __to_counts(x, y):
353 | """
354 | Gets the count of concordance, discordance or tie. Two pairs of variables :math:`(X_i, Y_i)`
355 | and :math:`(X_j, Y_j)` are
356 |
357 | - concordant if :math:`X_i < X_j` and :math:`Y_i < Y_j` **or** :math:`X_i > X_j` and :math:`Y_i > Y_j`,
358 | - discordant if :math:`X_i < X_j` and :math:`Y_i > Y_j` **or** :math:`X_i > X_j` and :math:`Y_i < Y_j`, and
359 | - tied if :math:`X_i = X_j` and :math:`Y_i = Y_j`.
360 |
361 | Equivalently.
362 |
363 | - concordant if :math:`(X_j - X_i)(Y_j - Y_i) > 0`
364 | - discordant if :math:`(X_j - X_i)(Y_j - Y_i) < 0`
365 | - tied if :math:`(X_j - X_i)(Y_j - Y_i) = 0`
366 |
367 | Any two pairs of observations are necessarily concordant, discordant or tied.
368 |
369 | :return: Counts(D, T_XY, T_X, T_Y, C), n.
370 | """
371 |
372 | def get_concordance(p1, p2):
373 | x_i, y_i = p1
374 | x_j, y_j = p2
375 |
376 | d = 0
377 | t_xy = 0
378 | t_x = 0
379 | t_y = 0
380 | c = 0
381 |
382 | r = (x_j - x_i) * (y_j - y_i)
383 |
384 | if r > 0:
385 | c = 1
386 | elif r < 0:
387 | d = 1
388 | else:
389 | if x_i == x_j and y_i == y_j:
390 | t_xy = 1
391 | elif x_i == x_j:
392 | t_x = 1
393 | elif y_i == y_j:
394 | t_y = 1
395 |
396 | return ConcordantCounts(d, t_xy, t_x, t_y, c)
397 |
398 | is_valid = lambda a, b: a is not None and b is not None
399 | data = [(a, b) for a, b in zip(x, y) if is_valid(a, b)]
400 | results = combinations(data, 2)
401 | results = map(lambda tup: get_concordance(tup[0], tup[1]), results)
402 | c = reduce(lambda c1, c2: c1 + c2, results)
403 | n = len(data)
404 | return c.d, c.t_xy, c.t_x, c.t_y, c.c, n
405 |
406 |
407 | class ConcordanceStats(MeasureMixin, ConcordanceMixin):
408 | """
409 | Computes concordance stats.
410 | """
411 |
412 | def __init__(self, d, t_xy, t_x, t_y, c, n):
413 | """
414 | ctor.
415 |
416 | :param d: Number of discordant pairs.
417 | :param t_xy: Number of ties on XY pairs.
418 | :param t_x: Number of ties on X pairs.
419 | :param t_y: Number of ties on Y pairs.
420 | :param c: Number of concordant pairs.
421 | :param n: Total number of pairs.
422 | """
423 | self._d = d
424 | self._t_xy = t_xy
425 | self._t_x = t_x
426 | self._t_y = t_y
427 | self._t_c = c
428 | self._c = c
429 | self._n = n
430 |
--------------------------------------------------------------------------------
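A minimal sketch of the concordance measures on a small ordinal sample; with x = [1, 2, 3, 4, 5] and y = [1, 3, 2, 5, 4], 8 of the 10 pairs are concordant and 2 are discordant, so Kendall's tau is (8 - 2) / 10 = 0.6:

    from pypair.continuous import Concordance

    x = [1, 2, 3, 4, 5]
    y = [1, 3, 2, 5, 4]

    con = Concordance(x, y)
    print(con.kendall_tau)            # 0.6
    print(con.goodman_kruskal_gamma)  # gamma as implemented by ConcordanceMixin

--------------------------------------------------------------------------------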
/pypair/decorator.py:
--------------------------------------------------------------------------------
1 | import time
2 | from functools import wraps
3 |
4 |
5 | def timeit(f):
6 | """
7 | Benchmarks the time it takes (seconds) to execute.
8 | """
9 |
10 | @wraps(f)
11 | def wrapper(*args, **kwargs):
12 | start = time.time()
13 | output = f(*args, **kwargs)
14 |         # Timing is computed but intentionally not reported; uncomment the print to benchmark.
15 |         diff = time.time() - start  # noqa: F841
16 |         # print(f'{f.__name__}: {diff}')
17 | return output
18 |
19 | return wrapper
20 |
21 |
22 | def similarity(f):
23 | """
24 | Marker for similarity functions.
25 | """
26 |
27 | @wraps(f)
28 | def wrapper(*args, **kwargs):
29 | return f(*args, **kwargs)
30 |
31 | return wrapper
32 |
33 |
34 | def distance(f):
35 | """
36 | Marker for distance functions.
37 | """
38 |
39 | @wraps(f)
40 | def wrapper(*args, **kwargs):
41 | return f(*args, **kwargs)
42 |
43 | return wrapper
44 |
--------------------------------------------------------------------------------
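A minimal sketch of the decorators above; timeit computes but does not report the elapsed time (see the commented print), so this mainly shows that decorated calls pass through unchanged. The jaccard function is hypothetical, written only to have something to decorate:

    from pypair.decorator import similarity, timeit

    @timeit
    @similarity
    def jaccard(a, b):
        # intersection over union of two 0/1 vectors
        s = sum(1 for x, y in zip(a, b) if x == 1 and y == 1)
        t = sum(1 for x, y in zip(a, b) if x == 1 or y == 1)
        return s / t

    print(jaccard([1, 0, 1, 1], [1, 1, 0, 1]))  # 0.5

--------------------------------------------------------------------------------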
/pypair/spark.py:
--------------------------------------------------------------------------------
1 | from collections import namedtuple
2 | from itertools import combinations, product, chain
3 | from math import sqrt
4 |
5 | from pypair.biserial import BiserialStats
6 | from pypair.contingency import ConfusionStats, CategoricalStats, \
7 | BinaryStats, AgreementStats
8 | from pypair.continuous import ConcordanceStats
9 |
10 |
11 | def __as_key(k1, k2):
12 | """
13 | Creates a key (tuple) out of the two specified. The key is always ordered.
14 | If k2 < k1, then (k2, k1), else, (k1, k2).
15 |
16 | :param k1: Key (string).
17 | :param k2: Key (string).
18 | :return: (k1, k2) or (k2, k1).
19 | """
20 | keys = sorted([k1, k2])
21 | return keys[0], keys[1]
22 |
23 |
24 | def __to_abcd_counts(d):
25 | """
26 |     Maps each pair of keys in the dictionary and their associated values to the form ``(k1, k2), (a, b, c, d)``.
27 |
28 | :param d: A dictionary. Names are variable names. Values are 0 or 1.
29 | :return: A list of tuples of the form: (k1, k2), (a, b, c, d).
30 | """
31 |
32 | def as_count(v1, v2):
33 | """
34 | Maps the specified values to a (TP or 11), b (FN or 10), c (FP or 01) and d (TN or 00).
35 | Only one of these will be 1, and the others will be 0. Look below for example.
36 |
37 | - 1, 1 = (1, 0, 0, 0)
38 | - 1, 0 = (0, 1, 0, 0)
39 | - 0, 1 = (0, 0, 1, 0)
40 | - 0, 0 = (0, 0, 0, 1)
41 |
42 | :param v1: Value (0 or 1).
43 | :param v2: Value (0 or 1).
44 | :return: a, b, c, d
45 | """
46 | a, b, c, d = 0, 0, 0, 0
47 | if v1 is not None and v2 is not None:
48 | if v1 == 1 and v2 == 1:
49 | a = 1
50 | elif v1 == 1 and v2 == 0:
51 | b = 1
52 | elif v1 == 0 and v2 == 1:
53 | c = 1
54 | else:
55 | d = 1
56 | return a, b, c, d
57 |
58 | def transform(k1, k2):
59 | """
60 | Transforms the keys and associated value to the form (a tuple of tuples): (k1, k2), (a, b, c, d).
61 |
62 | :param k1: Key (string).
63 | :param k2: Key (string).
64 | :return: (k1, k2), (a, b, c, d)
65 | """
66 | v1, v2 = d[k1], d[k2]
67 | return __as_key(k1, k2), as_count(v1, v2)
68 |
69 | return [transform(k1, k2) for k1, k2 in combinations(d.keys(), 2)]
70 |
71 |
72 | def __add_abcd_counts(x, y):
73 | """
74 | Adds two tuples. For example.
75 |
76 | :math:`x + y = (x_a + y_a, x_b + y_b, x_c + y_c, x_d + y_d)`
77 |
78 | :param x: Tuple (a, b, c, d).
79 | :param y: Tuple (a, b, c, d).
80 | :return: Tuple (a, b, c, d).
81 | """
82 | return x[0] + y[0], x[1] + y[1], x[2] + y[2], x[3] + y[3]
83 |
84 |
85 | def __add_concordance_counts(x, y):
86 | """
87 | Adds two tuples. For example.
88 |
89 |     :math:`x + y = (x_d + y_d, x_{t_{xy}} + y_{t_{xy}}, x_{t_x} + y_{t_x}, x_{t_y} + y_{t_y}, x_c + y_c, x_n + y_n)`
90 |
91 | :param x: Tuple (d, t_xy, t_x, t_y, c, n).
92 | :param y: Tuple (d, t_xy, t_x, t_y, c, n).
93 | :return: Tuple (d, t_xy, t_x, t_y, c, n).
94 | """
95 | return x[0] + y[0], x[1] + y[1], x[2] + y[2], x[3] + y[3], x[4] + y[4], x[5] + y[5]
96 |
97 |
98 | def __get_contingency_table(sdf):
99 | """
100 |     Gets the pairwise contingency tables. Each record in the returned pair-RDD has the following form.
101 |
102 |     `(k1, k2), table`
103 |
104 |     - k1 is the name of a variable
105 |     - k2 is the name of a variable
106 |     - table is a list of lists (a matrix) of counts
107 |     - the rows of table are indexed by the sorted values of k1's variable
108 |     - the columns of table are indexed by the sorted values of k2's variable
109 |     - each cell holds the count of records where the (row value, column value) pair co-occurs
110 |     - value combinations that never co-occur are filled in as zero counts
111 |
112 | :param sdf: Spark dataframe.
113 | :return: Spark pair-RDD.
114 | """
115 |
116 | def to_count(d):
117 | def count(k1, k2):
118 | tups = [(k1, d[k1]), (k2, d[k2])]
119 | tups = sorted(tups, key=lambda t: t[0])
120 |
121 | return (tups[0][0], tups[1][0], tups[0][1], tups[1][1]), 1
122 |
123 | return [count(k1, k2) for k1, k2 in combinations(d.keys(), 2)]
124 |
125 | def attach_domains(tup):
126 | key, d = tup
127 | v1 = sorted(list({k[0] for k, _ in d.items()}))
128 | v2 = sorted(list({k[1] for k, _ in d.items()}))
129 |
130 | return key, (d, v1, v2)
131 |
132 | def to_contingency_table(tup):
133 | key, (d, v1, v2) = tup
134 | table = [[d[(a, b)] if (a, b) in d else 0 for b in v2] for a in v1]
135 |
136 | return key, table
137 |
138 | return sdf.rdd \
139 | .flatMap(lambda r: to_count(r.asDict())) \
140 | .reduceByKey(lambda a, b: a + b) \
141 | .map(lambda tup: ((tup[0][0], tup[0][1]), (tup[0][2], tup[0][3], tup[1]))) \
142 | .map(lambda tup: (tup[0], {(tup[1][0], tup[1][1]): tup[1][2]})) \
143 | .reduceByKey(lambda a, b: {**a, **b}) \
144 | .map(lambda tup: attach_domains(tup)) \
145 | .map(lambda tup: to_contingency_table(tup)) \
146 | .sortByKey()
147 |
148 |
149 | def binary_binary(sdf):
150 | """
151 | Gets all the pairwise binary-binary association measures. The result is a Spark pair-RDD,
152 | where the keys are tuples of variable names e.g. (k1, k2), and values are dictionaries
153 | of association names and measures e.g. {'phi': 1, 'lambda': 0.8}. Each record in the pair-RDD is of the form.
154 |
155 | - (k1, k2), {'phi': 1, 'lambda': 0.8, ...}
156 |
157 | :param sdf: Spark dataframe. Should be all 1's and 0's.
158 | :return: Spark pair-RDD.
159 | """
160 |
161 | def to_results(counts):
162 | """
163 | Converts the result of the contingency table counts to a dictionary of association measures.
164 |
165 | :param counts: Tuple of tuples: (k1, k2), (a, b, c, d).
166 | :return: (x1, x2), {'measure1': val1, 'measure2': val2, ...}.
167 | """
168 | (x1, x2), (a, b, c, d) = counts
169 |
170 | computer = BinaryStats([[a + 1, b + 1], [c + 1, d + 1]])
171 | measures = {m: computer.get(m) for m in computer.measures()}
172 | return (x1, x2), measures
173 |
174 | return sdf.rdd \
175 | .flatMap(lambda r: __to_abcd_counts(r.asDict())) \
176 | .reduceByKey(lambda a, b: __add_abcd_counts(a, b)) \
177 | .sortByKey() \
178 | .map(lambda counts: to_results(counts))
179 |
180 |
181 | def confusion(sdf):
182 | """
183 | Gets all the pairwise confusion matrix metrics. The result is a Spark pair-RDD,
184 | where the keys are tuples of variable names e.g. (k1, k2), and values are dictionaries
185 | of association names and metrics e.g. {'acc': 0.9, 'fpr': 0.2}.
186 | Each record in the pair-RDD is of the form.
187 |
188 | - (k1, k2), {'acc': 0.9, 'fpr': 0.2, ...}
189 |
190 | :param sdf: Spark dataframe. Should be all 1's and 0's.
191 | :return: Spark pair-RDD.
192 | """
193 |
194 | def to_results(counts):
195 | """
196 | Converts the result of the contingency table counts to a dictionary of association measures.
197 |
198 | :param counts: Tuple of tuples: (x1, x2), (tp, fn, fp, tn).
199 | :return: (x1, x2), {'metric1': val1, 'metric2': val2, ...}.
200 | """
201 | (x1, x2), (tp, fn, fp, tn) = counts
202 |
203 | tp = max(1, tp)
204 | fn = max(1, fn)
205 | fp = max(1, fp)
206 | tn = max(1, tn)
207 |
208 | computer = ConfusionStats([[tp, fn], [fp, tn]])
209 | measures = {m: computer.get(m) for m in computer.measures()}
210 | return (x1, x2), measures
211 |
212 | return sdf.rdd \
213 | .flatMap(lambda r: __to_abcd_counts(r.asDict())) \
214 | .reduceByKey(lambda a, b: __add_abcd_counts(a, b)) \
215 | .map(lambda counts: to_results(counts)) \
216 | .sortByKey()
217 |
218 |
219 | def categorical_categorical(sdf):
220 | """
221 | Gets all pairwise categorical-categorical association measures. The result is a Spark pair-RDD,
222 | where the keys are tuples of variable names e.g. (k1, k2), and values are dictionaries of
223 |     association names and metrics e.g. {'phi': 0.9, 'chisq': 0.2}. Each record in the pair-RDD is of the form.
224 |
225 |     - (k1, k2), {'phi': 0.9, 'chisq': 0.2, ...}
226 |
227 | :param sdf: Spark dataframe. Should be strings or whole numbers to represent the values.
228 | :return: Spark pair-RDD.
229 | """
230 |
231 | def to_results(tup):
232 | key, table = tup
233 | computer = CategoricalStats(table)
234 | measures = {m: computer.get(m) for m in computer.measures()}
235 | return key, measures
236 |
237 | return __get_contingency_table(sdf) \
238 | .map(lambda tup: to_results(tup)) \
239 | .sortByKey()
240 |
241 |
242 | def agreement(sdf):
243 | """
244 | Gets all pairwise categorical-categorical `agreement` association measures. The result is a Spark pair-RDD,
245 | where the keys are tuples of variable names e.g. (k1, k2), and values are dictionaries of
246 |     association names and metrics e.g. {'kappa': 0.9, 'delta': 0.2}. Each record in the pair-RDD is of the form.
247 |
248 |     - (k1, k2), {'kappa': 0.9, 'delta': 0.2, ...}
249 |
250 | :param sdf: Spark dataframe. Should be strings or whole numbers to represent the values.
251 | :return: Spark pair-RDD.
252 | """
253 |
254 | def to_results(tup):
255 | key, table = tup
256 | computer = AgreementStats(table)
257 | measures = {m: computer.get(m) for m in computer.measures()}
258 | return key, measures
259 |
260 | return __get_contingency_table(sdf) \
261 | .map(lambda tup: to_results(tup)) \
262 | .sortByKey()
263 |
264 |
265 | def binary_continuous(sdf, binary, continuous, b_0=0, b_1=1):
266 | """
267 | Gets all pairwise binary-continuous association measures. The result is a Spark pair-RDD,
268 | where the keys are tuples of variable names e.g. (k1, k2), and values are dictionaries of
269 |     association names and metrics e.g. {'biserial': 0.9, 'point_biserial': 0.2}. Each record
270 | in the pair-RDD is of the form.
271 |
272 |     - (k1, k2), {'biserial': 0.9, 'point_biserial': 0.2, ...}
273 |
274 | All the binary fields/columns should be encoded in the same way. For example, if you
275 | are using 1 and 0, then all binary fields should only have those values, not a mixture
276 | of 1 and 0, True and False, -1 and 1, etc.
277 |
278 | :param sdf: Spark dataframe.
279 | :param binary: List of fields that are binary.
280 | :param continuous: List of fields that are continuous.
281 | :param b_0: Zero value for binary field.
282 | :param b_1: One value for binary field.
283 | :return: Spark pair-RDD.
284 | """
285 |
286 | def to_pair1(d):
287 | """
288 | Creates a list of tuples.
289 |
290 | :param d: Dictionary of data.
291 | :return: List of (b, c, b_val), (sum_c, sum_c_sq, sum_b).
292 | """
293 | return [((b, c, d[b]), (d[c], d[c] ** 2, 1)) for b, c in product(*[binary, continuous])]
294 |
295 | def to_pair2(tup):
296 | """
297 | Makes a new pair.
298 |
299 | :param tup: (b, c, b_val), (sum_c, sum_c_sq, sum_b)
300 | :return: (b, c), (b_val, sum_c, sum_c_sq, sum_b)
301 | """
302 | (b, c, b_val), (sum_c, sum_c_sq, sum_b) = tup
303 | return (b, c), (b_val, sum_c, sum_c_sq, sum_b)
304 |
305 | def compute_stats(tup):
306 | """
307 | `Computational formula for variance and standard deviation `_.
308 |
309 | - :math:`SS = \\sum (X - \\bar{X})^2 = \\sum X^2 - \\frac{\\left(\\sum X\\right)^2}{N}`
310 | - :math:`\\sigma^2 = \\frac{SS}{N - 1}`
311 | - :math:`\\sigma = \\sqrt{\\sigma^2}`
312 |
313 | :param tup: (b, c), [(b_val, sum_c, sum_c_sq, sum_b), (b_val, sum_c, sum_c_sq, sum_b)]
314 | :return: (b, c), (n, p, y_0, y_1, std)
315 | """
316 | (b, c), data = tup
317 |
318 | data = list(data)
319 | data_0 = data[0] if data[0][0] == b_0 else data[1]
320 |         data_1 = data[0] if data[0][0] == b_1 else data[1]
321 |
322 | _, sum_c_0, sum_c_sq_0, sum_b_0 = data_0
323 | _, sum_c_1, sum_c_sq_1, sum_b_1 = data_1
324 |
325 | n = sum_b_0 + sum_b_1
326 | p = sum_b_1 / n
327 | y_0 = sum_c_0 / sum_b_0
328 | y_1 = sum_c_1 / sum_b_1
329 | ss = (sum_c_sq_0 + sum_c_sq_1) - ((sum_c_0 + sum_c_1) ** 2 / n)
330 | v = ss / (n - 1)
331 | std = sqrt(v)
332 |
333 | return (b, c), (n, p, y_0, y_1, std)
334 |
335 | def to_results(tup):
336 | """
337 | Computes the results.
338 |
339 | :param tup: (b, c), (n, p, y_0, y_1, std)
340 | :return: (b, c), {'measure1': val1, 'measure2': val2, ...}
341 | """
342 | key, (n, p, y_0, y_1, std) = tup
343 | computer = BiserialStats(n, p, y_0, y_1, std)
344 | measures = {m: computer.get(m) for m in computer.measures()}
345 | return key, measures
346 |
347 | return sdf.rdd \
348 | .flatMap(lambda r: to_pair1(r.asDict())) \
349 | .reduceByKey(lambda x, y: (x[0] + y[0], x[1] + y[1], x[2] + y[2])) \
350 | .map(lambda tup: to_pair2(tup)) \
351 | .groupByKey() \
352 | .map(lambda tup: compute_stats(tup)) \
353 | .map(lambda tup: to_results(tup)) \
354 | .sortByKey()
355 |
356 |
357 | def categorical_continuous(sdf, categorical, continuous):
358 | """
359 | Gets all pairwise categorical-continuous association measures. The result is a Spark pair-RDD,
360 | where the keys are tuples of variable names e.g. (k1, k2), and values are dictionaries of
361 |     association names and metrics e.g. {'eta_sq': 0.9, 'eta': 0.95}. Each record
362 | in the pair-RDD is of the form.
363 |
364 |     - (k1, k2), {'eta_sq': 0.9, 'eta': 0.95}
365 |
366 |     For now, only ``eta_sq`` (:math:`\\eta^2`) and ``eta`` (:math:`\\eta`) are supported.
367 |
368 | :param sdf: Spark dataframe.
369 | :param categorical: List of categorical variables.
370 | :param continuous: List of continuous variables.
371 | :return: Spark pair-RDD.
372 | """
373 |
374 | def to_pair1(d):
375 | """
376 |         Creates an iterable of tuples.
377 |
378 |         :param d: Dictionary of data.
379 |         :return: Iterable of (cat, con, flag), (sum_c, sum_c_sq, sum_b).
380 | """
381 | kv_0 = lambda cat, con: ((cat, con, d[cat]), (d[con], 0, 1))
382 | kv_1 = lambda cat, con: ((cat, con, '__*_avg_*__'), (d[con], 0, 1))
383 | kv_2 = lambda cat, con: ((cat, con, '__*_den_*__'), (d[con], d[con] ** 2, 1))
384 | explode = lambda cat, con: [kv_0(cat, con), kv_1(cat, con), kv_2(cat, con)]
385 | return chain(*(explode(cat, con) for cat, con in product(*[categorical, continuous])))
386 |
387 | def to_pair2(tup):
388 | """
389 | Makes a new pair.
390 |
391 |         :param tup: (cat, con, flag), (sum_c, sum_c_sq, sum_b)
392 |         :return: (cat, con), (flag, val)
393 | """
394 | ss = lambda x, x_sq, n: (x_sq - (x ** 2 / n))
395 | (cat, con, flag), (sum_c, sum_c_sq, sum_b) = tup
396 | key = cat, con
397 |
398 | if flag == '__*_den_*__':
399 | val = ss(sum_c, sum_c_sq, sum_b)
400 | elif flag == '__*_avg_*__':
401 | val = sum_c / sum_b
402 | else:
403 | val = sum_c / sum_b, sum_b
404 |
405 | return key, (flag, val)
406 |
407 | def to_results(tup):
408 | """
409 | Computes the results.
410 |
411 |         :param tup: (b, c), iterable of (flag, val)
412 | :return: (b, c), {'measure1': val1, 'measure2': val2, ...}
413 | """
414 | (b, c), data = tup
415 | data = {k: v for k, v in data}
416 |
417 | y_avg = data['__*_avg_*__']
418 | num = sum([v[1] * ((v[0] - y_avg) ** 2) for k, v in data.items() if isinstance(v, tuple)])
419 | den = data['__*_den_*__']
420 |
421 | eta = num / den
422 | return (b, c), {'eta_sq': eta, 'eta': sqrt(eta)}
423 |
424 | return sdf.rdd \
425 | .flatMap(lambda r: to_pair1(r.asDict())) \
426 | .reduceByKey(lambda x, y: (x[0] + y[0], x[1] + y[1], x[2] + y[2])) \
427 | .map(lambda tup: to_pair2(tup)) \
428 | .groupByKey() \
429 | .map(lambda tup: to_results(tup)) \
430 | .sortByKey()
431 |
432 |
433 | def concordance(sdf):
434 | """
435 | Gets all the pairwise ordinal-ordinal concordance measures. The result is a Spark pair-RDD,
436 | where the keys are tuples of variable names e.g. (k1, k2), and values are dictionaries
437 | of association names and measures e.g. {'kendall': 1, 'gamma': 0.8}. Each record in the pair-RDD is of the form.
438 |
439 | - (k1, k2), {'kendall': 1, 'gamma': 0.8, ...}
440 |
441 | :param sdf: Spark dataframe. Should be all ordinal data (numeric).
442 | :return: Spark pair-RDD.
443 | """
444 |
445 | def as_pair1(n1, n2, v1, v2):
446 | """
447 |         Creates a pair of the form (n1, n2), (v1, v2) where the first tuple is sorted and the second
448 |         tuple holds the values corresponding to the elements of the first tuple.
449 |
450 | :param n1: String (variable name).
451 | :param n2: String (Variable name).
452 | :param v1: Value.
453 | :param v2: Value.
454 | :return: (n1, n2), (v1, v2).
455 | """
456 | tups = sorted([(n1, v1), (n2, v2)], key=lambda t: t[0])
457 |
458 | k1, j1 = tups[0]
459 | k2, j2 = tups[1]
460 |
461 | return (k1, k2), (j1, j2)
462 |
463 | def to_pair1(d):
464 | """
465 | Creates a list of pairs of variables and values. Keys are names of variables and values are values of
466 | those variables.
467 |
468 | :param d: Dictionary.
469 | :return: List of (k1, k2), (v1, v2).
470 | """
471 | return [as_pair1(n1, n2, d[n1], d[n2]) for n1, n2 in combinations(d.keys(), 2)]
472 |
473 | def as_count(v1, v2):
474 | """
475 |         Maps the specified pairs of values to a concordance status. Concordance status can be one of the following.
476 |
477 | - discordant: :math:`(y_j - y_i)(x_j - x_i) < 0`
478 | - tied: :math:`(y_j - y_i)(x_j - x_i) = 0`
479 | - concordant: :math:`(y_j - y_i)(x_j - x_i) > 0`
480 |
481 | Ties are differentiated as follows.
482 |
483 |         - tied on ``x``: :math:`x_i = x_j`
484 |         - tied on ``y``: :math:`y_i = y_j`
485 |         - tied on ``xy``: :math:`x_i = x_j \\land y_i = y_j`
486 |
487 | A tuple that looks like the following will be mapped from the concordance status.
488 |
489 | - discordant: (1, 0, 0, 0, 0, 1)
490 | - tie on ``x`` and ``y``: (0, 1, 0, 0, 0, 1)
491 | - tie on ``x``: (0, 0, 1, 0, 0, 1)
492 | - tie on ``y``: (0, 0, 0, 1, 0, 1)
493 | - concordant: (0, 0, 0, 0, 1, 1)
494 |
495 | :param v1: Pair (x_i, y_i).
496 | :param v2: Pair (x_j, y_j).
497 | :return: d, t_xy, t_x, t_y, c, n
498 | """
499 | d, t_xy, t_x, t_y, c, n = 0, 0, 0, 0, 0, 1
500 |
501 | if v1 is not None and v2 is not None:
502 | x_i, y_i = v1
503 | x_j, y_j = v2
504 | r = (y_j - y_i) * (x_j - x_i)
505 |
506 | if r > 0:
507 | c = 1
508 | elif r < 0:
509 | d = 1
510 | else:
511 | if x_i == x_j and y_i == y_j:
512 | t_xy = 1
513 | elif x_i == x_j:
514 | t_x = 1
515 | else:
516 | t_y = 1
517 |
518 | return d, t_xy, t_x, t_y, c, n
519 |
520 | def to_pair2(tup):
521 | """
522 | Creates concordant status counts for each pair of observations.
523 |
524 | :param tup: (key, iterable).
525 | :return: Generator of (k1, k2), (d, t_xy, t_x, t_y, c, n).
526 | """
527 | key, data = tup
528 |
529 | return ((key, as_count(v1, v2)) for v1, v2 in combinations(data, 2))
530 |
531 | def to_results(counts):
532 | """
533 | Converts the results of concordance to a dictionary of measures.
534 |
535 |         :param counts: Tuple of tuples: (x1, x2), (d, t_xy, t_x, t_y, c, n).
536 | :return: (x1, x2), {'measure1': val1, 'measure2': val2, ...}.
537 | """
538 | (x1, x2), (d, t_xy, t_x, t_y, c, n) = counts
539 |
540 | d += 1
541 | t_xy += 1
542 | t_x += 1
543 | t_y += 1
544 | c += 1
545 | n += 5
546 |
547 | computer = ConcordanceStats(d, t_xy, t_x, t_y, c, n)
548 | measures = {m: computer.get(m) for m in computer.measures()}
549 | return (x1, x2), measures
550 |
551 | return sdf.rdd \
552 | .flatMap(lambda r: to_pair1(r.asDict())) \
553 | .groupByKey() \
554 | .flatMap(lambda tup: to_pair2(tup)) \
555 | .reduceByKey(lambda x, y: __add_concordance_counts(x, y)) \
556 | .map(lambda tup: to_results(tup)) \
557 | .sortByKey()
558 |
559 |
560 | def continuous_continuous(sdf):
561 | """
562 | Gets all the pairwise continuous-continuous association measures. The result is a Spark pair-RDD,
563 | where the keys are tuples of variable names e.g. (k1, k2), and values are dictionaries
564 | of association names and measures e.g. {'pearson': 1}. Each record in the pair-RDD is of the form.
565 |
566 | - (k1, k2), {'pearson': 1}
567 |
568 | Only pearson is supported at the moment.
569 |
570 | :param sdf: Spark dataframe. Should be all ordinal data (numeric).
571 | :return: Spark pair-RDD.
572 | """
573 |
574 | CorrItem = namedtuple('CorrItem', 'x y xy x_sq y_sq n')
575 |
576 | def to_items(d):
577 | """
578 | Converts the dictionary to (n1, n2), CorrItem.
579 |
580 | :param d: Dictionary.
581 | :return: (n1, n2), CorrItem.
582 | """
583 | as_item = lambda n1, n2: CorrItem(d[n1], d[n2], d[n1] * d[n2], d[n1] ** 2, d[n2] ** 2, 1)
584 | return (((n1, n2), as_item(n1, n2)) for n1, n2 in combinations(d.keys(), 2))
585 |
586 | def add_items(a, b):
587 | """
588 | Adds two CorrItems.
589 |
590 | :param a: CorrItem.
591 | :param b: CorrItem.
592 | :return: CorrItem.
593 | """
594 | return CorrItem(a.x + b.x, a.y + b.y, a.xy + b.xy, a.x_sq + b.x_sq, a.y_sq + b.y_sq, a.n + b.n)
595 |
596 | def to_results(tup):
597 | """
598 | Converts the tup to a result.
599 |
600 | :param tup: (n1, n2), CorrItem.
601 | :return: (n1, n2), {'measure': value}.
602 | """
603 | (n1, n2), item = tup
604 | n = item.xy - (item.x * item.y) / item.n
605 | d = sqrt(item.x_sq - (item.x ** 2 / item.n)) * sqrt(item.y_sq - (item.y ** 2 / item.n))
606 | r = n / d
607 | return (n1, n2), {'pearson': r}
608 |
609 | return sdf.rdd \
610 | .flatMap(lambda r: to_items(r.asDict())) \
611 | .reduceByKey(lambda a, b: add_items(a, b)) \
612 | .map(lambda tup: to_results(tup)) \
613 | .sortByKey()
614 |
--------------------------------------------------------------------------------
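A hedged end-to-end sketch of driving the functions above from a local Spark session; it assumes pyspark is installed, and the data mirrors misc/ipynb/binary-binary.ipynb:

    import pandas as pd
    from pyspark.sql import SparkSession

    from pypair.spark import binary_binary

    spark = SparkSession.builder.master('local[*]').getOrCreate()

    get_data = lambda x, y, n: [(x, y, x, y) for _ in range(n)]
    data = get_data(1, 1, 207) + get_data(1, 0, 282) + get_data(0, 1, 231) + get_data(0, 0, 242)
    sdf = spark.createDataFrame(pd.DataFrame(data, columns=['x1', 'x2', 'x3', 'x4']))

    for (k1, k2), measures in binary_binary(sdf).collect():
        print((k1, k2), len(measures))  # each pair of columns maps to a dict of measures

    spark.stop()

--------------------------------------------------------------------------------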
/pypair/util.py:
--------------------------------------------------------------------------------
1 | from abc import ABC
2 | from functools import lru_cache
3 | from itertools import combinations
4 |
5 | import numpy as np
6 | import pandas as pd
7 |
8 |
9 | class MeasureMixin(ABC):
10 | """
11 |     Measure mixin. Able to list the functions decorated with ``@property`` and also
12 |     access such properties by name.
13 | """
14 |
15 | @classmethod
16 | def measures(cls):
17 | """
18 | Gets a list of all the measures.
19 |
20 | :return: List of all the measures.
21 | """
22 | return get_measures(cls)
23 |
24 | @lru_cache(maxsize=None)
25 | def get(self, measure):
26 | """
27 | Gets the specified measure.
28 |
29 | :param measure: Name of measure.
30 | :return: Measure.
31 | """
32 | return getattr(self, measure)
33 |
34 | @lru_cache(maxsize=None)
35 | def get_measures(self):
36 | """
37 | Gets a list of all the measures.
38 |
39 | :return: List of all the measures.
40 | """
41 | return get_measures(self.__class__)
42 |
43 |
44 | def get_measures(clazz):
45 | """
46 | Gets all the measures of a clazz.
47 |
48 | :param clazz: Clazz.
49 | :return: List of measures.
50 | """
51 | from itertools import chain
52 |
53 | is_property = lambda v: isinstance(v, property)
54 | is_public = lambda n: not n.startswith('_')
55 | is_valid = lambda n, v: is_public(n) and is_property(v)
56 |
57 | measures = sorted(list(chain(*[[n for n, v in vars(c).items() if is_valid(n, v)] for c in clazz.__mro__])))
58 |
59 | return measures
60 |
61 |
62 | def corr(df, f):
63 | """
64 | Computes the pairwise association matrix. ALL fields/columns must be the same type and so that the specified field
65 | ``f`` will be able to compute the pairwise associations.
66 |
67 | :param df: Pandas data frame.
68 | :param f: Callable function; e.g. lambda a, b: categorical_categorical(a, b, measure='phi')
69 | """
70 | fields = list(df.columns)
71 | idx_map = {f: i for i, f in enumerate(fields)}
72 | associations = ((idx_map[a], idx_map[b], f(df[a], df[b])) for a, b in combinations(fields, 2))
73 |
74 | n = df.shape[1]
75 |     mat = np.full((n, n), np.nan)  # diagonal stays NaN; self-association depends on the measure
76 | for i, j, a in associations:
77 | mat[i, j] = mat[j, i] = a
78 |
79 | df = pd.DataFrame(mat, columns=fields, index=fields)
80 | return df
81 |
--------------------------------------------------------------------------------
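A minimal sketch of corr() building a pairwise association matrix over a small binary frame; 'phi' is used here on the assumption that it is among BinaryTable.measures(), and the diagonal is left as NaN since self-association depends on the chosen measure:

    import pandas as pd

    from pypair.association import binary_binary
    from pypair.util import corr

    df = pd.DataFrame({
        'a': [1, 0, 1, 0, 1, 0],
        'b': [1, 1, 0, 0, 1, 0],
        'c': [0, 0, 1, 1, 0, 1],
    })
    mat = corr(df, lambda a, b: binary_binary(a, b, measure='phi'))
    print(mat)

--------------------------------------------------------------------------------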
/requirements.txt:
--------------------------------------------------------------------------------
1 | # TEST
2 | nose
3 | coverage
4 | # LINT OR DIE
5 | flake8
6 | pep8
7 | pyflakes
8 | # LIBS
9 | numpy
10 | scipy
11 | pandas
12 | scikit-learn
13 | pyspark
14 | # DOCUMENTATION
15 | sphinx
16 | sphinx_rtd_theme
17 | sphinxcontrib-bibtex
18 | sphinxcontrib-blockdiag
19 | sphinx-sitemap
20 | # PUBLISHING
21 | twine
22 | setuptools
--------------------------------------------------------------------------------
/setup.cfg:
--------------------------------------------------------------------------------
1 | [metadata]
2 | name = pypair
3 | version = 0.0.1
4 | author = Jee Vang
5 | author_email = vangjee@gmail.com
6 | description = Pairwise association measures of statistical variable types
7 | long_description = file: README.md
8 | long_description_content_type = text/markdown
9 | description_file = README.md
10 | url = https://github.com/oneoffcoder/py-pair
11 | keywords = statistics, pairwise, association, correlation, concordance, measurement, strength, pyspark
12 | install_requires = scipy, numpy, pandas, scikit-learn, pyspark
13 | classifiers =
14 | Programming Language :: Python :: 3
15 | License :: OSI Approved :: Apache Software License
16 | Operating System :: OS Independent
17 | Topic :: Scientific/Engineering :: Artificial Intelligence
18 | Intended Audience :: Developers
19 | Intended Audience :: Science/Research
20 | Development Status :: 5 - Production/Stable
21 | include_package_data = True
22 | test_suite = nose.collector
23 |
24 | [flake8]
25 | max-line-length = 120
26 | ignore = E501 E731
27 |
28 | [nosetests]
29 | verbosity = 3
30 | with-doctest = 1
31 | with-coverage = 1
32 | with-id = 1
33 | cover-erase = 1
34 | cover-html = 1
35 | cover-html-dir = coverage
36 | cover-package = pypair
37 | detailed-errors = 1
--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
1 | from setuptools import setup, find_packages
2 |
3 | setup(
4 | packages=find_packages(exclude=('*.tests', '*.tests.*', 'tests.*', 'tests'))
5 | )
6 |
--------------------------------------------------------------------------------
/tests/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/oneoffcoder/py-pair/79e6e75c32333fc9421aad36c02c2e1043ae4a05/tests/__init__.py
--------------------------------------------------------------------------------
/tests/test_association.py:
--------------------------------------------------------------------------------
1 | import random
2 | from itertools import combinations
3 |
4 | import numpy as np
5 | import pandas as pd
6 | from nose import with_setup
7 |
8 | from pypair.association import binary_binary, categorical_categorical, \
9 | binary_continuous, concordance, categorical_continuous, continuous_continuous, confusion
10 | from pypair.biserial import Biserial
11 | from pypair.contingency import BinaryTable, CategoricalTable, ConfusionMatrix
12 | from pypair.continuous import Concordance, CorrelationRatio, Continuous
13 | from pypair.util import corr
14 |
15 |
16 | def setup():
17 | """
18 | Setup.
19 | :return: None.
20 | """
21 | np.random.seed(37)
22 | random.seed(37)
23 |
24 |
25 | def teardown():
26 | """
27 | Teardown.
28 | :return: None.
29 | """
30 | pass
31 |
32 |
33 | @with_setup(setup, teardown)
34 | def test_binary_binary():
35 | """
36 | Tests binary-binary.
37 |
38 | :return: None.
39 | """
40 | get_data = lambda x, y, n: [(x, y) for _ in range(n)]
41 | data = get_data(1, 1, 207) + get_data(1, 0, 282) + get_data(0, 1, 231) + get_data(0, 0, 242)
42 | a = [a for a, _ in data]
43 | b = [b for _, b in data]
44 |
45 | for m in BinaryTable.measures():
46 | r = binary_binary(a, b, m)
47 | print(f'{r}: {m}')
48 |
49 |
50 | @with_setup(setup, teardown)
51 | def test_categorical_categorical():
52 | """
53 | Tests categorical-categorical.
54 |
55 | :return: None.
56 | """
57 | get_data = lambda x, y, n: [(x, y) for _ in range(n)]
58 | data = get_data(1, 1, 207) + get_data(1, 0, 282) + get_data(0, 1, 231) + get_data(0, 0, 242)
59 | a = [a for a, _ in data]
60 | b = [b for _, b in data]
61 |
62 | for m in CategoricalTable.measures():
63 | r = categorical_categorical(a, b, m)
64 | print(f'{r}: {m}')
65 |
66 |
67 | @with_setup(setup, teardown)
68 | def test_binary_continuous():
69 | """
70 | Tests binary-continuous.
71 |
72 | :return: None.
73 | """
74 | get_data = lambda x, y, n: [(x, y) for _ in range(n)]
75 | data = get_data(1, 1, 207) + get_data(1, 0, 282) + get_data(0, 1, 231) + get_data(0, 0, 242)
76 | a = [a for a, _ in data]
77 | b = [b for _, b in data]
78 |
79 | for m in Biserial.measures():
80 | r = binary_continuous(a, b, m)
81 | print(f'{r}: {m}')
82 |
83 |
84 | @with_setup(setup, teardown)
85 | def test_concordance():
86 | """
87 | Tests concordance.
88 |
89 | :return: None.
90 | """
91 | a = [1, 2, 3]
92 | b = [3, 2, 1]
93 |
94 | for m in Concordance.measures():
95 | r = concordance(a, b, m)
96 | print(f'{r}: {m}')
97 |
98 |
99 | @with_setup(setup, teardown)
100 | def test_categorical_continuous():
101 | """
102 |     Tests categorical-continuous. Data taken from `Wikipedia <https://en.wikipedia.org/wiki/Correlation_ratio>`_.
103 |
104 | :return: None.
105 | """
106 | data = [
107 | ('a', 45), ('a', 70), ('a', 29), ('a', 15), ('a', 21),
108 | ('g', 40), ('g', 20), ('g', 30), ('g', 42),
109 | ('s', 65), ('s', 95), ('s', 80), ('s', 70), ('s', 85), ('s', 73)
110 | ]
111 | x = [x for x, _ in data]
112 | y = [y for _, y in data]
113 | for m in CorrelationRatio.measures():
114 | r = categorical_continuous(x, y, m)
115 | print(f'{r}: {m}')
116 |
117 |
118 | @with_setup(setup, teardown)
119 | def test_continuous_continuous():
120 | """
121 | Tests continuous-continuous.
122 |
123 | :return: None.
124 | """
125 |     x = list(range(10))
126 |     y = list(range(10))
127 | for m in Continuous.measures():
128 | r = continuous_continuous(x, y, m)
129 | print(f'{r}: {m}')
130 |
131 |
132 | @with_setup(setup, teardown)
133 | def test_confusion():
134 | """
135 |     Tests confusion matrix. Data taken from a published worked example.
136 | 
137 |     :return: None.
138 | """
139 | tn = [(0, 0) for _ in range(50)]
140 | fp = [(0, 1) for _ in range(10)]
141 | fn = [(1, 0) for _ in range(5)]
142 | tp = [(1, 1) for _ in range(100)]
143 | data = tn + fp + fn + tp
144 | a = [a for a, _ in data]
145 | b = [b for _, b in data]
146 |
147 | for m in ConfusionMatrix.measures():
148 | r = confusion(a, b, m)
149 | print(f'{r}: {m}')
150 |
151 |
152 | @with_setup(setup, teardown)
153 | def test_pandas_categorical():
154 | """
155 |     Tests pairwise categorical association over the columns of a Pandas dataframe.
156 | 
157 |     :return: None.
158 | """
159 |
160 |     # NOTE: pandas .corr() no longer supports non-numeric columns, so the associations are computed directly.
161 | def get_associations(a, b):
162 | d = {'x': a, 'y': b}
163 | measures = {m: categorical_categorical(df[a], df[b], measure=m) for m in ['chisq', 'phi', 'mutual_information']}
164 | d = {**d, **measures}
165 | return d
166 |
167 | df = pd.DataFrame({
168 | 'x1': ['on', 'on', 'on', 'on', 'on', 'off', 'off', 'off', 'off', 'off'],
169 | 'x2': ['on', 'off', 'on', 'off', 'on', 'off', 'on', 'off', 'on', 'off'],
170 | 'x3': ['off', 'off', 'off', 'off', 'off', 'on', 'on', 'on', 'on', 'on'],
171 | 'x4': ['on', 'on', 'on', 'on', 'off', 'off', 'off', 'off', 'off', 'on'],
172 | })
173 |
174 | associations = [get_associations(a, b) for a, b in combinations(df.columns, 2)]
175 |
176 | expected = {
177 | 'chisq': [0.2857142857142857, 7.142857142857143, 2.5714285714285716, 0.2857142857142857, 0.2857142857142857,
178 | 2.5714285714285716],
179 | 'phi': [0.14285714285714285, 0.7142857142857143, 0.4285714285714286, 0.14285714285714285, 0.14285714285714285,
180 | 0.4285714285714286],
181 | 'mutual_information': [0.010239075859473604, 0.2830308622715362, 0.09487759197468806, 0.010239075859473604,
182 | 0.010239075859473604,
183 | 0.09487759197468805]
184 | }
185 |
186 | a_df = pd.DataFrame(associations)
187 |
188 | for field, e in expected.items():
189 | o = list(a_df[field])
190 | for v1, v2 in zip(o, e):
191 | assert abs(v1 - v2) < 0.0001
192 |
193 |
194 | @with_setup(setup, teardown)
195 | def test_get_correlation_matrix():
196 | """
197 | Tests getting correlation matrix as Pandas dataframe.
198 |
199 | :return: None.
200 | """
201 | df = pd.DataFrame({
202 | 'x1': ['on', 'on', 'on', 'on', 'on', 'off', 'off', 'off', 'off', 'off'],
203 | 'x2': ['on', 'off', 'on', 'off', 'on', 'off', 'on', 'off', 'on', 'off'],
204 | 'x3': ['off', 'off', 'off', 'off', 'off', 'on', 'on', 'on', 'on', 'on'],
205 | 'x4': ['on', 'on', 'on', 'on', 'off', 'off', 'off', 'off', 'off', 'on'],
206 | })
207 |
208 | f = lambda a, b: categorical_categorical(a, b, measure='mutual_information')
209 | p = corr(df, f)
210 | print(list(p.x1))
211 | print(list(p.x2))
212 | print(list(p.x3))
213 | print(list(p.x4))
214 |
215 | expected = {
216 |         'x1': [0.0, 0.010239075859473604, 0.2830308622715362, 0.09487759197468806],
217 | 'x2': [0.010239075859473604, 0.0, 0.010239075859473604, 0.010239075859473604],
218 | 'x3': [0.2830308622715362, 0.010239075859473604, 0.0, 0.09487759197468805],
219 | 'x4': [0.09487759197468806, 0.010239075859473604, 0.09487759197468805, 0.0]
220 | }
221 |
222 | for field, e in expected.items():
223 | o = list(p[field])
224 |
225 | for v1, v2 in zip(o, e):
226 | diff = abs(v1 - v2)
227 | assert diff < 0.0001
228 |
--------------------------------------------------------------------------------
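
The tests above share one pattern: each variable-pair type has an association function that takes two sequences plus a measure name, and the valid names are enumerated by the matching table class's measures(). A minimal sketch built only from that usage, collecting every binary-binary measure into a single table (some measures return tuples, which pandas stores as-is):

    import pandas as pd

    from pypair.association import binary_binary
    from pypair.contingency import BinaryTable

    a = [1, 1, 0, 0, 1, 0, 1, 1]
    b = [1, 0, 0, 1, 1, 0, 1, 0]

    # One row per measure supported for binary-binary pairs.
    rows = [{'measure': m, 'value': binary_binary(a, b, m)} for m in BinaryTable.measures()]
    print(pd.DataFrame(rows))
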
/tests/test_contingency.py:
--------------------------------------------------------------------------------
1 | import random
2 |
3 | import numpy as np
4 | import pandas as pd
5 | from nose import with_setup
6 |
7 | from pypair.association import categorical_categorical
8 |
9 |
10 | def setup():
11 | """
12 | Setup.
13 | :return: None.
14 | """
15 | np.random.seed(37)
16 | random.seed(37)
17 |
18 |
19 | def teardown():
20 | """
21 | Teardown.
22 | :return: None.
23 | """
24 | pass
25 |
26 |
27 | @with_setup(setup, teardown)
28 | def test_contingency_with_nulls():
29 | """
30 | Tests creating contingency table with nulls.
31 |
32 | :return: None.
33 | """
34 | df = pd.DataFrame([
35 | (0, 0),
36 | (0, 1),
37 | (1, 0),
38 | (1, 1),
39 | (0, None),
40 | (None, 0),
41 | (None, None)
42 | ], columns=['a', 'b'])
43 | v = categorical_categorical(df.a, df.b, measure='phi')
44 | print(v)
45 |
--------------------------------------------------------------------------------
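
The test above only checks that nulls do not raise; it does not assert how null pairs are counted. When that matters, dropping incomplete rows up front is a safe pattern. A minimal sketch using pandas with the same data and call as the test:

    import pandas as pd

    from pypair.association import categorical_categorical

    df = pd.DataFrame([
        (0, 0), (0, 1), (1, 0), (1, 1),
        (0, None), (None, 0), (None, None)
    ], columns=['a', 'b'])

    # Keep only rows where both variables are observed.
    clean = df.dropna()
    print(categorical_categorical(clean.a, clean.b, measure='phi'))
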
/tests/test_spark.py:
--------------------------------------------------------------------------------
1 | import json
2 | import logging
3 | import random
4 | import unittest
5 | from random import choice
6 |
7 | import pandas as pd
8 | from pyspark.sql import SparkSession
9 |
10 | from pypair.spark import binary_binary, confusion, categorical_categorical, agreement, binary_continuous, concordance, \
11 | categorical_continuous, continuous_continuous
12 |
13 |
14 | class PySparkTest(unittest.TestCase):
15 | """
16 | PySpark test class.
17 | """
18 |
19 | @classmethod
20 |     def suppress_py4j_logging(cls):
21 | """
22 |         Suppresses py4j logging.
23 |
24 | :return: None.
25 | """
26 | logger = logging.getLogger('py4j')
27 | logger.setLevel(logging.WARN)
28 |
29 | @classmethod
30 | def create_pyspark_session(cls):
31 | """
32 | Creates a PySpark session.
33 |
34 | :return: PySpark session.
35 | """
36 | return (SparkSession.builder
37 | .master('local[4]')
38 | .appName('local-testing-pyspark')
39 | .getOrCreate())
40 |
41 | @classmethod
42 | def setUpClass(cls):
43 | """
44 | Sets up the class.
45 |
46 | :return: None.
47 | """
48 |         cls.suppress_py4j_logging()
49 | cls.spark = cls.create_pyspark_session()
50 | random.seed(37)
51 |
52 | @classmethod
53 | def tearDownClass(cls):
54 | """
55 | Tears down the class.
56 |
57 | :return: None.
58 | """
59 | cls.spark.stop()
60 |
61 | def _get_binary_binary_data(self):
62 | """
63 | Gets dummy binary-binary data in a Spark dataframe.
64 |
65 | :return: Spark dataframe.
66 | """
67 |         get_data = lambda x, y, n: [(x, y) * 2 for _ in range(n)]  # (x, y) * 2 -> (x, y, x, y): four columns
68 | data = get_data(1, 1, 207) + get_data(1, 0, 282) + get_data(0, 1, 231) + get_data(0, 0, 242)
69 | pdf = pd.DataFrame(data, columns=['x1', 'x2', 'x3', 'x4'])
70 | sdf = self.spark.createDataFrame(pdf)
71 | return sdf
72 |
73 | def _get_confusion_data(self):
74 | """
75 |         Gets dummy binary-binary data in a Spark dataframe for use with confusion matrix analysis.
76 |
77 | :return: Spark dataframe.
78 | """
79 | tn = [(0, 0) * 2 for _ in range(50)]
80 | fp = [(0, 1) * 2 for _ in range(10)]
81 | fn = [(1, 0) * 2 for _ in range(5)]
82 | tp = [(1, 1) * 2 for _ in range(100)]
83 | data = tn + fp + fn + tp
84 | pdf = pd.DataFrame(data, columns=['x1', 'x2', 'x3', 'x4'])
85 | sdf = self.spark.createDataFrame(pdf)
86 | return sdf
87 |
88 | def _get_categorical_categorical_data(self):
89 | """
90 |         Gets dummy categorical-categorical data in a Spark dataframe.
91 |
92 | :return: Spark dataframe.
93 | """
94 | x_domain = ['a', 'b', 'c']
95 | y_domain = ['a', 'b']
96 |
97 | get_x = lambda: choice(x_domain)
98 | get_y = lambda: choice(y_domain)
99 | get_data = lambda: {f'x{i}': v for i, v in enumerate((get_x(), get_y(), get_x(), get_y()))}
100 |
101 | pdf = pd.DataFrame([get_data() for _ in range(100)])
102 | sdf = self.spark.createDataFrame(pdf)
103 | return sdf
104 |
105 | def _get_binary_continuous_data(self):
106 | """
107 |         Gets dummy binary-continuous data.
108 |
109 | :return: Spark dataframe.
110 | """
111 | data = [
112 | (1, 10), (1, 11), (1, 6), (1, 11), (0, 4),
113 | (0, 3), (1, 12), (0, 2), (0, 2), (0, 1)
114 | ]
115 | pdf = pd.DataFrame(data, columns=['gender', 'years'])
116 | sdf = self.spark.createDataFrame(pdf)
117 | return sdf
118 |
119 | def _get_concordance_data(self):
120 | """
121 | Gets dummy concordance data.
122 |
123 | :return: Spark dataframe.
124 | """
125 | a = [1, 2, 3]
126 | b = [3, 2, 1]
127 | pdf = pd.DataFrame({'a': a, 'b': b, 'c': a, 'd': b})
128 | sdf = self.spark.createDataFrame(pdf)
129 | return sdf
130 |
131 | def _get_categorical_continuous_data(self):
132 | """
133 | Gets dummy categorical-continuous data.
134 |         See `Wikipedia <https://en.wikipedia.org/wiki/Correlation_ratio>`_.
135 |
136 | :return: Spark dataframe.
137 | """
138 | data = [
139 | ('a', 45), ('a', 70), ('a', 29), ('a', 15), ('a', 21),
140 | ('g', 40), ('g', 20), ('g', 30), ('g', 42),
141 | ('s', 65), ('s', 95), ('s', 80), ('s', 70), ('s', 85), ('s', 73)
142 | ]
143 |         data = [tup * 2 for tup in data]  # duplicate each pair into four columns
144 | pdf = pd.DataFrame(data, columns=['x1', 'x2', 'x3', 'x4'])
145 | sdf = self.spark.createDataFrame(pdf)
146 | return sdf
147 |
148 | def _get_continuous_continuous_data(self):
149 | """
150 | Gets dummy continuous-continuous data.
151 |         Data taken from a published worked example.
152 |
153 | :return: Spark dataframe.
154 | """
155 | data = [
156 | (12, 9),
157 | (10, 12),
158 | (9, 12),
159 | (14, 11),
160 | (10, 8),
161 | (11, 9),
162 | (10, 9),
163 | (10, 6),
164 | (14, 12),
165 | (9, 11),
166 | (11, 12),
167 | (10, 7),
168 | (11, 13),
169 | (15, 14),
170 | (8, 11),
171 | (11, 11),
172 | (9, 8),
173 | (9, 9),
174 | (10, 11),
175 | (12, 9),
176 | (11, 12),
177 | (10, 12),
178 | (9, 7),
179 | (7, 9),
180 | (12, 14)
181 | ]
182 | pdf = pd.DataFrame([item * 2 for item in data], columns=['x1', 'x2', 'x3', 'x4'])
183 | sdf = self.spark.createDataFrame(pdf)
184 | return sdf
185 |
186 |
187 | class SparkTest(PySparkTest):
188 | """
189 | Tests Spark operations.
190 | """
191 |
192 | def test_binary_binary(self):
193 | """
194 | Tests binary-binary Spark operation.
195 |
196 | :return: None.
197 | """
198 | sdf = self._get_binary_binary_data()
199 | results = {tup[0]: tup[1] for tup in binary_binary(sdf).collect()}
200 |
201 | print(json.dumps({f'{k[0]}_{k[1]}': v for k, v in results.items()}, indent=1))
202 |
203 | def test_confusion(self):
204 | """
205 | Tests confusion Spark operation.
206 |
207 | :return: None.
208 | """
209 | sdf = self._get_confusion_data()
210 | results = {tup[0]: tup[1] for tup in confusion(sdf).collect()}
211 |
212 | print(json.dumps({f'{k[0]}_{k[1]}': v for k, v in results.items()}, indent=1))
213 |
214 | def test_categorical_categorical(self):
215 | """
216 | Tests categorical-categorical Spark operation.
217 |
218 | :return: None.
219 | """
220 | sdf = self._get_categorical_categorical_data()
221 | results = {tup[0]: tup[1] for tup in categorical_categorical(sdf).collect()}
222 |
223 | print(json.dumps({f'{k[0]}_{k[1]}': v for k, v in results.items()}, indent=1))
224 |
225 | def test_agreement(self):
226 | """
227 | Tests agreement Spark operation.
228 |
229 | :return: None.
230 | """
231 | sdf = self._get_binary_binary_data()
232 | results = {tup[0]: tup[1] for tup in agreement(sdf).collect()}
233 |
234 | print(json.dumps({f'{k[0]}_{k[1]}': v for k, v in results.items()}, indent=1))
235 |
236 | def test_biserial(self):
237 | """
238 | Tests binary-continuous Spark operation.
239 |
240 | :return: None.
241 | """
242 | sdf = self._get_binary_continuous_data()
243 | results = {tup[0]: tup[1] for tup in binary_continuous(sdf, binary=['gender'], continuous=['years']).collect()}
244 |
245 | print(json.dumps({f'{k[0]}_{k[1]}': v for k, v in results.items()}, indent=1))
246 |
247 | def test_concordance(self):
248 | """
249 | Tests concordance Spark operation.
250 |
251 | :return: None.
252 | """
253 | sdf = self._get_concordance_data()
254 | results = {tup[0]: tup[1] for tup in concordance(sdf).collect()}
255 |
256 | print(json.dumps({f'{k[0]}_{k[1]}': v for k, v in results.items()}, indent=1))
257 |
258 | def test_categorical_continuous(self):
259 | """
260 | Tests categorical-continuous Spark operation.
261 |
262 | :return: None.
263 | """
264 | sdf = self._get_categorical_continuous_data()
265 | results = {tup[0]: tup[1] for tup in categorical_continuous(sdf, ['x1', 'x3'], ['x2', 'x4']).collect()}
266 |
267 | print(json.dumps({f'{k[0]}_{k[1]}': v for k, v in results.items()}, indent=1))
268 |
269 | def test_continuous_continuous(self):
270 | """
271 | Tests continuous-continuous Spark operation.
272 |
273 | :return: None.
274 | """
275 | sdf = self._get_continuous_continuous_data()
276 | results = {tup[0]: tup[1] for tup in continuous_continuous(sdf).collect()}
277 |
278 | print(json.dumps({f'{k[0]}_{k[1]}': v for k, v in results.items()}, indent=1))
279 |
--------------------------------------------------------------------------------
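
As the result-unpacking in every test above shows, each Spark function returns rows that collect() into tuples of ((col_a, col_b), {measure: value}). A minimal sketch for flattening such results into a readable table; the results dict below is hypothetical and only mirrors the shape the tests unpack:

    import pandas as pd

    # Hypothetical collected output, shaped like the tests' results.
    results = {
        ('x1', 'x2'): {'phi': 0.71, 'chisq': 7.14},
        ('x3', 'x4'): {'phi': 0.14, 'chisq': 0.29},
    }

    # One row per column pair, one column per measure.
    rows = [{'pair': f'{k[0]}_{k[1]}', **v} for k, v in results.items()]
    print(pd.DataFrame(rows))
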
/tests/test_table.py:
--------------------------------------------------------------------------------
1 | import random
2 |
3 | import numpy as np
4 | from nose import with_setup
5 |
6 | from pypair.contingency import BinaryTable, CategoricalTable, ConfusionMatrix
7 |
8 |
9 | def setup():
10 | """
11 | Setup.
12 | :return: None.
13 | """
14 | np.random.seed(37)
15 | random.seed(37)
16 |
17 |
18 | def teardown():
19 | """
20 | Teardown.
21 | :return: None.
22 | """
23 | pass
24 |
25 |
26 | @with_setup(setup, teardown)
27 | def test_confusion_matrix_creation():
28 | """
29 | Tests creating ConfusionMatrix.
30 |
31 | :return: None.
32 | """
33 | a = [0, 0, 0, 0, 0, 1, 1, 1, 1, 1]
34 | b = [0, 0, 0, 1, 1, 0, 0, 1, 1, 1]
35 |
36 | table = ConfusionMatrix(a, b)
37 | for measure in ConfusionMatrix.measures():
38 | stats = table.get(measure)
39 | if isinstance(stats, tuple):
40 | print(f'{stats[0]:.8f}, {stats[1]:.8f}: {measure}')
41 | else:
42 | print(f'{stats:.8f}: {measure}')
43 |
44 |
45 | @with_setup(setup, teardown)
46 | def test_binary_table_creation():
47 | """
48 |     Tests creating BinaryTable. The data is simulated from a published 2x2 example.
49 |
50 | :return: None.
51 | """
52 | get_data = lambda x, y, n: [(x, y) for _ in range(n)]
53 | data = get_data(1, 1, 207) + get_data(1, 0, 282) + get_data(0, 1, 231) + get_data(0, 0, 242)
54 | a = [a for a, _ in data]
55 | b = [b for _, b in data]
56 |
57 | table = BinaryTable(a, b)
58 | for measure in BinaryTable.measures():
59 | stats = table.get(measure)
60 | if isinstance(stats, tuple):
61 | print(f'{stats[0]:.8f}, {stats[1]:.8f}: {measure}')
62 | else:
63 | print(f'{stats:.8f}: {measure}')
64 |
65 |
66 | @with_setup(setup, teardown)
67 | def test_categorical_table_creation():
68 | """
69 | Tests creating CategoricalTable.
70 |
71 | :return: None.
72 | """
73 | a = [0, 0, 0, 0, 0, 1, 1, 1, 1, 1]
74 | b = [0, 0, 0, 1, 1, 0, 0, 1, 1, 1]
75 |
76 | table = CategoricalTable(a, b)
77 | chisq = table.get('chisq')
78 | phi = table.get('phi')
79 | print(chisq, phi)
80 |
--------------------------------------------------------------------------------
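
Building on the table usage above, every measure of a table can be gathered in one pass. A minimal sketch reusing the ConfusionMatrix data from the first test:

    from pypair.contingency import ConfusionMatrix

    a = [0, 0, 0, 0, 0, 1, 1, 1, 1, 1]
    b = [0, 0, 0, 1, 1, 0, 0, 1, 1, 1]

    # Map every supported measure name to its computed statistic
    # (tuple-valued statistics stay as tuples).
    table = ConfusionMatrix(a, b)
    stats = {m: table.get(m) for m in ConfusionMatrix.measures()}
    print(stats)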