├── MANIFEST.in
├── images
│   ├── logo.png
│   ├── flowchart.png
│   ├── plot-conover.png
│   ├── result-conover.png
│   ├── melted-dataframe.png
│   ├── plot-conover-custom-cmap.png
│   └── flowchart.gv
├── paper
│   ├── figure.png
│   ├── codemeta.json
│   ├── paper.bib
│   ├── paper.md
│   └── generate.rb
├── docs
│   ├── requirements.txt
│   ├── source
│   │   ├── _static
│   │   │   ├── flowchart.png
│   │   │   ├── cd_diagram0.png
│   │   │   ├── cd_diagram1.png
│   │   │   ├── cd_diagram2.png
│   │   │   ├── plot-conover.png
│   │   │   ├── plot-conover-custom-cmap.png
│   │   │   └── cd_diagram_example_sig_plot.png
│   │   ├── global_api.rst
│   │   ├── omnibus_api.rst
│   │   ├── outliers_api.rst
│   │   ├── plotting_api.rst
│   │   ├── posthocs_api.rst
│   │   ├── installation.rst
│   │   ├── index.rst
│   │   ├── intro.rst
│   │   ├── conf.py
│   │   └── tutorial.rst
│   ├── Makefile
│   └── make.bat
├── tests
│   ├── __init__.py
│   └── test_posthocs.py
├── .readthedocs.yml
├── CONTRIBUTING.md
├── .github
│   ├── workflows
│   │   ├── package-publish.yml
│   │   ├── package-pull.yml
│   │   └── package-test.yml
│   └── ISSUE_TEMPLATE
│       └── bug_report.md
├── LICENSE
├── scikit_posthocs
│   ├── __init__.py
│   ├── _global.py
│   ├── _outliers.py
│   ├── _omnibus.py
│   ├── _posthocs.py
│   └── _plotting.py
├── pyproject.toml
├── CODE_OF_CONDUCT.md
├── DESCRIPTION.rst
├── usage-examples.ipynb
└── README.rst
/MANIFEST.in:
--------------------------------------------------------------------------------
1 | include LICENSE
2 | include DESCRIPTION.rst
3 | recursive-exclude tests *
4 |
--------------------------------------------------------------------------------
/images/logo.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/maximtrp/scikit-posthocs/HEAD/images/logo.png
--------------------------------------------------------------------------------
/paper/figure.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/maximtrp/scikit-posthocs/HEAD/paper/figure.png
--------------------------------------------------------------------------------
/images/flowchart.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/maximtrp/scikit-posthocs/HEAD/images/flowchart.png
--------------------------------------------------------------------------------
/docs/requirements.txt:
--------------------------------------------------------------------------------
1 | sphinx_rtd_theme
2 | numpydoc
3 | git+https://github.com/maximtrp/scikit-posthocs
4 |
--------------------------------------------------------------------------------
/images/plot-conover.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/maximtrp/scikit-posthocs/HEAD/images/plot-conover.png
--------------------------------------------------------------------------------
/images/result-conover.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/maximtrp/scikit-posthocs/HEAD/images/result-conover.png
--------------------------------------------------------------------------------
/images/melted-dataframe.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/maximtrp/scikit-posthocs/HEAD/images/melted-dataframe.png
--------------------------------------------------------------------------------
/docs/source/_static/flowchart.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/maximtrp/scikit-posthocs/HEAD/docs/source/_static/flowchart.png
--------------------------------------------------------------------------------
/docs/source/_static/cd_diagram0.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/maximtrp/scikit-posthocs/HEAD/docs/source/_static/cd_diagram0.png
--------------------------------------------------------------------------------
/docs/source/_static/cd_diagram1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/maximtrp/scikit-posthocs/HEAD/docs/source/_static/cd_diagram1.png
--------------------------------------------------------------------------------
/docs/source/_static/cd_diagram2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/maximtrp/scikit-posthocs/HEAD/docs/source/_static/cd_diagram2.png
--------------------------------------------------------------------------------
/docs/source/_static/plot-conover.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/maximtrp/scikit-posthocs/HEAD/docs/source/_static/plot-conover.png
--------------------------------------------------------------------------------
/images/plot-conover-custom-cmap.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/maximtrp/scikit-posthocs/HEAD/images/plot-conover-custom-cmap.png
--------------------------------------------------------------------------------
/docs/source/_static/plot-conover-custom-cmap.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/maximtrp/scikit-posthocs/HEAD/docs/source/_static/plot-conover-custom-cmap.png
--------------------------------------------------------------------------------
/docs/source/_static/cd_diagram_example_sig_plot.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/maximtrp/scikit-posthocs/HEAD/docs/source/_static/cd_diagram_example_sig_plot.png
--------------------------------------------------------------------------------
/tests/__init__.py:
--------------------------------------------------------------------------------
1 | import unittest
2 | import tests.test_posthocs
3 |
4 | def posthocs_suite():
5 |     loader = unittest.TestLoader()
6 |     suite = loader.loadTestsFromModule(tests.test_posthocs)
7 |     return suite
8 |
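9 | if __name__ == "__main__":
10 |     # Optional direct entry point (assumes the repository root is on
11 |     # sys.path so that tests.test_posthocs is importable): run the suite
12 |     # with the default text runner.
13 |     unittest.TextTestRunner(verbosity=2).run(posthocs_suite())
14 |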
--------------------------------------------------------------------------------
/docs/source/global_api.rst:
--------------------------------------------------------------------------------
1 | Global Tests API reference
2 | --------------------------
3 |
4 | .. currentmodule:: scikit_posthocs
5 | .. autosummary::
6 |    :toctree: generated
7 |
8 |    global_f_test
9 |    global_simes_test
10 |
--------------------------------------------------------------------------------
/docs/source/omnibus_api.rst:
--------------------------------------------------------------------------------
1 | Omnibus API reference
2 | ---------------------
3 |
4 | .. currentmodule:: scikit_posthocs
5 | .. autosummary::
6 |    :toctree: generated
7 |
8 |    test_mackwolfe
9 |    test_osrt
10 |    test_durbin
11 |
--------------------------------------------------------------------------------
/docs/source/outliers_api.rst:
--------------------------------------------------------------------------------
1 | Outliers API reference
2 | ----------------------
3 |
4 | .. currentmodule:: scikit_posthocs
5 | .. autosummary::
6 |    :toctree: generated
7 |
8 |    outliers_iqr
9 |    outliers_gesd
10 |    outliers_grubbs
11 |    outliers_tietjen
12 |
--------------------------------------------------------------------------------
/docs/source/plotting_api.rst:
--------------------------------------------------------------------------------
1 | Plotting API reference
2 | ----------------------
3 |
4 | .. currentmodule:: scikit_posthocs
5 | .. autosummary::
6 |    :toctree: generated
7 |
8 |    sign_array
9 |    sign_table
10 |    sign_plot
11 |    critical_difference_diagram
12 |
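13 | A minimal usage sketch (data are illustrative; it assumes the post hoc
14 | function returns a DataFrame of p values, which ``sign_plot`` renders as a
15 | significance heatmap):
16 |
17 | .. code:: python
18 |
19 |    import scikit_posthocs as sp
20 |
21 |    x = [[1, 2, 3, 5, 1], [12, 31, 54], [10, 12, 6, 74, 11]]
22 |    pc = sp.posthoc_conover(x, p_adjust='holm')
23 |    sp.sign_plot(pc)
24 |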
--------------------------------------------------------------------------------
/.readthedocs.yml:
--------------------------------------------------------------------------------
1 | version: 2
2 |
3 | build:
4 |   os: ubuntu-22.04
5 |   tools:
6 |     python: "3.11"
7 |
8 | sphinx:
9 |   configuration: docs/source/conf.py
10 |
11 | formats:
12 |   - pdf
13 |
14 | python:
15 |   install:
16 |     - requirements: docs/requirements.txt
17 |
--------------------------------------------------------------------------------
/CONTRIBUTING.md:
--------------------------------------------------------------------------------
1 | # Support
2 |
3 | Currently, all support is provided on GitHub. Please open an issue with your
4 | bug, question, or suggestion.
5 |
6 | ## Bugs
7 |
8 | If you have found a bug, open a GitHub issue using the `Bug report` template.
9 | Make sure you include the following information:
10 |
11 | - The full error traceback.
12 | - Steps to reproduce the bug.
13 | - The dataset that triggers the bug.
14 |
15 | ## Contribution
16 |
17 | Your contribution is highly welcome. You may open a pull request or an issue
18 | describing an improvement or implementation of new functionality.
19 |
--------------------------------------------------------------------------------
/docs/Makefile:
--------------------------------------------------------------------------------
1 | # Minimal makefile for Sphinx documentation
2 | #
3 |
4 | # You can set these variables from the command line.
5 | SPHINXOPTS =
6 | SPHINXBUILD = sphinx-build
7 | SOURCEDIR = source
8 | BUILDDIR = .
9 |
10 | # Put it first so that "make" without argument is like "make help".
11 | help:
12 | 	@$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
13 |
14 | .PHONY: help Makefile
15 |
16 | # Catch-all target: route all unknown targets to Sphinx using the new
17 | # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS).
18 | %: Makefile
19 | 	@$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
20 |
--------------------------------------------------------------------------------
/docs/source/posthocs_api.rst:
--------------------------------------------------------------------------------
1 | Post-hocs API reference
2 | -----------------------
3 |
4 | .. currentmodule:: scikit_posthocs
5 | .. autosummary::
6 |    :toctree: generated
7 |
8 |    posthoc_conover
9 |    posthoc_dunn
10 |    posthoc_nemenyi
11 |    posthoc_nemenyi_friedman
12 |    posthoc_conover_friedman
13 |    posthoc_siegel_friedman
14 |    posthoc_miller_friedman
15 |    posthoc_npm_test
16 |    posthoc_durbin
17 |    posthoc_anderson
18 |    posthoc_quade
19 |    posthoc_vanwaerden
20 |    posthoc_tukey_hsd
21 |    posthoc_ttest
22 |    posthoc_mannwhitney
23 |    posthoc_wilcoxon
24 |    posthoc_scheffe
25 |    posthoc_tamhane
26 |    posthoc_tukey
27 |    posthoc_dscf
28 |    posthoc_dunnett
29 |
--------------------------------------------------------------------------------
/.github/workflows/package-publish.yml:
--------------------------------------------------------------------------------
1 | name: Package Upload to PyPI
2 |
3 | on:
4 |   release:
5 |     types: [published]
6 |
7 | permissions:
8 |   contents: read
9 |
10 | jobs:
11 |   deploy:
12 |     runs-on: ubuntu-latest
13 |     steps:
14 |       - uses: actions/checkout@v4
15 |       - name: Set up Python
16 |         uses: actions/setup-python@v5
17 |         with:
18 |           python-version: '3.x'
19 |       - name: Install dependencies
20 |         run: |
21 |           python -m pip install --upgrade pip
22 |           pip install build
23 |       - name: Build package
24 |         run: python -m build
25 |       - name: Publish package
26 |         uses: pypa/gh-action-pypi-publish@release/v1
27 |         with:
28 |           user: __token__
29 |           password: ${{ secrets.PYPI_API_TOKEN }}
30 |
--------------------------------------------------------------------------------
/.github/workflows/package-pull.yml:
--------------------------------------------------------------------------------
1 | name: Run tests on pull requests
2 |
3 | on:
4 |   pull_request:
5 |     branches:
6 |       - master
7 |
8 | jobs:
9 |   build:
10 |     runs-on: ubuntu-latest
11 |     strategy:
12 |       matrix:
13 |         python-version: ["3.9", "3.12"]
14 |     steps:
15 |       - uses: actions/checkout@v4
16 |       - name: Set up Python ${{ matrix.python-version }}
17 |         uses: actions/setup-python@v5
18 |         with:
19 |           python-version: ${{ matrix.python-version }}
20 |           cache: 'pip'
21 |       - name: Install dependencies
22 |         run: |
23 |           python -m pip install --upgrade pip
24 |           python -m pip install .
25 |           python -m pip install .[test]
26 |       - name: Testing with pytest
27 |         run: |
28 |           pytest .
29 |
--------------------------------------------------------------------------------
/.github/ISSUE_TEMPLATE/bug_report.md:
--------------------------------------------------------------------------------
1 | ---
2 | name: Bug report
3 | about: Create a report to help us improve
4 | title: ''
5 | labels: bug
6 | assignees: maximtrp
7 |
8 | ---
9 |
10 | **Describe the bug**
11 | A clear and concise description of what the bug is.
12 |
13 | **Dataset**
14 | Please provide a link to the dataset that triggers the bug.
15 |
16 | **To Reproduce**
17 | Steps to reproduce the behavior:
18 | 1. Go to '...'
19 | 2. Click on '....'
20 | 3. Scroll down to '....'
21 | 4. See error
22 |
23 | **Expected behavior**
24 | A clear and concise description of what you expected to happen.
25 |
26 | **System and package information (please complete the following information):**
27 | - OS: (e.g. Linux 4.20.0-arch1-1-ARCH x86_64 GNU/Linux)
28 | - Package version: (e.g. 0.4.0)
29 |
30 | **Additional context**
31 | Add any other context about the problem here.
32 |
--------------------------------------------------------------------------------
/docs/make.bat:
--------------------------------------------------------------------------------
1 | @ECHO OFF
2 |
3 | pushd %~dp0
4 |
5 | REM Command file for Sphinx documentation
6 |
7 | if "%SPHINXBUILD%" == "" (
8 | 	set SPHINXBUILD=sphinx-build
9 | )
10 | set SOURCEDIR=.
11 | set BUILDDIR=_build
12 |
13 | if "%1" == "" goto help
14 |
15 | %SPHINXBUILD% >NUL 2>NUL
16 | if errorlevel 9009 (
17 | 	echo.
18 | 	echo.The 'sphinx-build' command was not found. Make sure you have Sphinx
19 | 	echo.installed, then set the SPHINXBUILD environment variable to point
20 | 	echo.to the full path of the 'sphinx-build' executable. Alternatively you
21 | 	echo.may add the Sphinx directory to PATH.
22 | 	echo.
23 | 	echo.If you don't have Sphinx installed, grab it from
24 | 	echo.http://sphinx-doc.org/
25 | 	exit /b 1
26 | )
27 |
28 | %SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS%
29 | goto end
30 |
31 | :help
32 | %SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS%
33 |
34 | :end
35 | popd
36 |
--------------------------------------------------------------------------------
/paper/codemeta.json:
--------------------------------------------------------------------------------
1 | {
2 | "@context": "https://raw.githubusercontent.com/codemeta/codemeta/master/codemeta.jsonld",
3 | "@type": "Code",
4 | "author": [
5 | {
6 | "@id": "https://orcid.org/0000-0003-2586-4633",
7 | "@type": "Person",
8 | "email": "maximtrp@gmail.com",
9 | "name": "Maksim A. Terpilowski",
10 | "affiliation": "Institute of Evolutionary Physiology and Biochemistry, Saint Petersburg, Russia"
11 | }
12 | ],
13 | "identifier": "",
14 | "codeRepository": "https://github.com/maximtrp/scikit-posthocs",
15 | "datePublished": "2018-12-06",
16 | "dateModified": "2018-12-06",
17 | "dateCreated": "2018-12-06",
18 | "description": "A Python package for pairwise multiple comparison post hoc tests and outliers detection",
19 | "keywords": "python,statistics,posthoc",
20 | "license": "BSD 3-Clause License",
21 | "title": "scikit-posthocs: Pairwise multiple comparison tests in Python",
22 | "version": "v0.4.0"
23 | }
24 |
--------------------------------------------------------------------------------
/docs/source/installation.rst:
--------------------------------------------------------------------------------
1 | Installation
2 | ============
3 |
4 | The latest version can be installed from PyPI using ``pip``:
5 |
6 | .. code:: sh
7 |
8 |    pip install scikit-posthocs
9 |
10 | Or from the conda-forge repository using ``conda``:
11 |
12 | .. code:: sh
13 |
14 |    conda install -c conda-forge scikit-posthocs
15 |
16 | You can also use ``pip`` to install the development version from GitHub:
17 |
18 | .. code:: sh
19 |
20 |    pip install git+https://github.com/maximtrp/scikit-posthocs.git
21 |
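22 | To verify the installation (a minimal check; the package version string is
23 | exposed as ``scikit_posthocs.__version__``):
24 |
25 | .. code:: python
26 |
27 |    import scikit_posthocs as sp
28 |    print(sp.__version__)
29 |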
30 | Dependencies
31 | ------------
32 |
33 | The package supports Python 3.9 or later and has the following dependencies:
34 |
35 | * `NumPy <https://numpy.org/>`_
36 | * `SciPy <https://scipy.org/>`_
37 | * `Statsmodels <https://www.statsmodels.org/>`_
38 | * `Pandas <https://pandas.pydata.org/>`_
39 | * `Seaborn <https://seaborn.pydata.org/>`_
40 | * `Matplotlib <https://matplotlib.org/>`_
41 |
42 | Bugs
43 | ----
44 |
45 | Please report any bugs using the issue tracker on `GitHub <https://github.com/maximtrp/scikit-posthocs/issues>`_.
46 |
--------------------------------------------------------------------------------
/.github/workflows/package-test.yml:
--------------------------------------------------------------------------------
1 | name: Run tests on push
2 |
3 | on:
4 |   push:
5 |     branches:
6 |       - master
7 |
8 | jobs:
9 |   build:
10 |     runs-on: ubuntu-latest
11 |     strategy:
12 |       matrix:
13 |         python-version: ["3.9", "3.12"]
14 |     steps:
15 |       - uses: actions/checkout@v4
16 |       - name: Set up Python ${{ matrix.python-version }}
17 |         uses: actions/setup-python@v5
18 |         with:
19 |           python-version: ${{ matrix.python-version }}
20 |           cache: 'pip'
21 |       - name: Install dependencies
22 |         run: |
23 |           python -m pip install --upgrade pip
24 |           python -m pip install .
25 |           python -m pip install .[test]
26 |       - name: Testing with pytest and measuring coverage
27 |         run: |
28 |           coverage run --source scikit_posthocs -m pytest .
29 |           coverage xml
30 |       - name: Reporting coverage to Codacy
31 |         uses: codacy/codacy-coverage-reporter-action@v1
32 |         with:
33 |           project-token: ${{ secrets.CODACY_PROJECT_TOKEN }}
34 |           coverage-reports: coverage.xml
35 |
--------------------------------------------------------------------------------
/docs/source/index.rst:
--------------------------------------------------------------------------------
1 | scikit-posthocs
2 | ===============
3 |
4 | **scikit-posthocs** is a Python package which provides post hoc tests for
5 | pairwise multiple comparisons, usually performed in statistical data analysis
6 | to assess the differences between group levels after a statistically
7 | significant result of an ANOVA test has been obtained.
8 |
9 | **scikit-posthocs** is tightly integrated with Pandas DataFrames and NumPy
10 | arrays to ensure fast computations and convenient data import and storage.
11 |
12 | This package will be useful for statisticians, data analysts, and researchers
13 | who use Python in their work.
14 |
15 |
16 | .. toctree::
17 |    :caption: Documentation
18 |    :maxdepth: 2
19 |    :hidden:
20 |
21 |    Introduction <intro>
22 |    Installation <installation>
23 |    Tutorial <tutorial>
24 |
25 | .. toctree::
26 |    :caption: API
27 |    :hidden:
28 |    :maxdepth: 2
29 |
30 |    Global Tests API <global_api>
31 |    Omnibus API <omnibus_api>
32 |    Outliers API <outliers_api>
33 |    Plotting API <plotting_api>
34 |    Post-hocs API <posthocs_api>
35 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | Copyright (c) 2024 Maksim Terpilovskii
2 |
3 | Permission is hereby granted, free of charge, to any person obtaining a copy
4 | of this software and associated documentation files (the "Software"), to deal
5 | in the Software without restriction, including without limitation the rights
6 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
7 | copies of the Software, and to permit persons to whom the Software is
8 | furnished to do so, subject to the following conditions:
9 |
10 | The above copyright notice and this permission notice shall be included in all
11 | copies or substantial portions of the Software.
12 |
13 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
14 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
15 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
16 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
17 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
18 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
19 | SOFTWARE.
20 |
--------------------------------------------------------------------------------
/scikit_posthocs/__init__.py:
--------------------------------------------------------------------------------
1 | __version__ = "0.11.4"
2 |
3 | from scikit_posthocs._global import global_simes_test, global_f_test
4 | from scikit_posthocs._omnibus import test_osrt, test_durbin, test_mackwolfe
5 |
6 | from scikit_posthocs._posthocs import (
7 |     posthoc_anderson,
8 |     posthoc_conover,
9 |     posthoc_conover_friedman,
10 |     posthoc_dscf,
11 |     posthoc_dunn,
12 |     posthoc_durbin,
13 |     posthoc_mannwhitney,
14 |     posthoc_miller_friedman,
15 |     posthoc_nemenyi,
16 |     posthoc_nemenyi_friedman,
17 |     posthoc_npm_test,
18 |     posthoc_quade,
19 |     posthoc_scheffe,
20 |     posthoc_siegel_friedman,
21 |     posthoc_tamhane,
22 |     posthoc_ttest,
23 |     posthoc_tukey,
24 |     posthoc_tukey_hsd,
25 |     posthoc_vanwaerden,
26 |     posthoc_wilcoxon,
27 |     posthoc_dunnett,
28 |     __convert_to_df,
29 |     __convert_to_block_df,
30 | )
31 |
32 | from scikit_posthocs._plotting import (
33 |     sign_array,
34 |     sign_plot,
35 |     sign_table,
36 |     critical_difference_diagram,
37 | )
38 | from scikit_posthocs._outliers import (
39 |     outliers_gesd,
40 |     outliers_grubbs,
41 |     outliers_iqr,
42 |     outliers_tietjen,
43 | )
44 |
--------------------------------------------------------------------------------
/paper/paper.bib:
--------------------------------------------------------------------------------
1 | @inproceedings{Seabold2010,
2 |   title = {Statsmodels: Econometric and statistical modeling with python},
3 |   author = {{Seabold}, S. and {Perktold}, J.},
4 |   booktitle = {Proceedings of the 9th Python in Science Conference},
5 |   volume = {57},
6 |   pages = {61},
7 |   year = {2010},
8 |   organization = {SciPy society Austin}
9 | }
10 |
11 | @misc{Jones2001,
12 |   author = {{Jones}, E. and {Oliphant}, T. and {Peterson}, P.},
13 |   title = {SciPy: Open source scientific tools for Python},
14 |   year = {2001},
15 |   url = {http://www.scipy.org/}
16 | }
17 |
18 | @online{Pohlert2018,
19 |   title = {PMCMRplus: Calculate Pairwise Multiple Comparisons of Mean Rank Sums Extended},
20 |   author = {{Pohlert}, T.},
21 |   year = {2018},
22 |   note = {R package version 1.4.1},
23 |   url = {https://CRAN.R-project.org/package=PMCMRplus}
24 | }
25 |
26 | @inproceedings{McKinney2010,
27 |   title = {Data structures for statistical computing in Python},
28 |   author = {{McKinney}, W.},
29 |   booktitle = {Proceedings of the 9th Python in Science Conference},
30 |   volume = {445},
31 |   pages = {51-56},
32 |   year = {2010},
33 |   organization = {Austin, TX}
34 | }
35 |
36 | @book{Oliphant2006,
37 |   title = {A guide to NumPy},
38 |   author = {{Oliphant}, T. E.},
39 |   volume = {1},
40 |   year = {2006},
41 |   publisher = {Trelgol Publishing USA}
42 | }
43 |
--------------------------------------------------------------------------------
/pyproject.toml:
--------------------------------------------------------------------------------
1 | [build-system]
2 | requires = ["setuptools>=61.0", "wheel"]
3 | build-backend = "setuptools.build_meta"
4 |
5 | [project]
6 | name = "scikit-posthocs"
7 | dynamic = ["version"]
8 | description = "Statistical post-hoc analysis and outlier detection algorithms"
9 | readme = "DESCRIPTION.rst"
10 | requires-python = ">=3.9"
11 | keywords = ["statistics", "stats", "posthoc", "anova", "data science"]
12 | license.file = "LICENSE"
13 | authors = [
14 |     { name = "Maksim Terpilovskii", email = "maximtrp@gmail.com" },
15 | ]
16 | classifiers = [
17 |     "Development Status :: 5 - Production/Stable",
18 |     "Intended Audience :: Education",
19 |     "Intended Audience :: Information Technology",
20 |     "Intended Audience :: Science/Research",
21 |     "Topic :: Scientific/Engineering :: Information Analysis",
22 |     "Topic :: Scientific/Engineering :: Mathematics",
23 |     "License :: OSI Approved :: MIT License",
24 |     "Programming Language :: Python :: 3",
25 |     "Programming Language :: Python :: 3.9",
26 |     "Programming Language :: Python :: 3.10",
27 |     "Programming Language :: Python :: 3.11",
28 |     "Programming Language :: Python :: 3.12",
29 |     "Programming Language :: Python :: 3.13",
30 | ]
31 | urls.homepage = "https://github.com/maximtrp/scikit-posthocs"
32 | urls.documentation = "https://scikit-posthocs.rtfd.io"
33 | dependencies = [
34 |     "numpy",
35 |     "scipy>=1.9.0",
36 |     "statsmodels",
37 |     "pandas>=0.20.0",
38 |     "seaborn",
39 |     "matplotlib",
40 | ]
41 |
42 | [tool.basedpyright]
43 | pythonVersion = "3.9"
44 |
45 | [tool.ruff]
46 | target-version = "py39"
47 | respect-gitignore = true
48 | line-length = 100
49 |
50 | [tool.setuptools]
51 | packages = ["scikit_posthocs"]
52 |
53 | [tool.setuptools.dynamic]
54 | version = {attr = "scikit_posthocs.__version__"}
55 |
56 | [project.optional-dependencies]
57 | test = ["pytest", "coverage"]
58 |
59 | [tool.pytest.ini_options]
60 | log_cli = true
61 | log_cli_level = "INFO"
62 | log_cli_format = "%(asctime)s [%(levelname)8s] %(message)s (%(filename)s:%(lineno)s)"
63 | log_cli_date_format = "%Y-%m-%d %H:%M:%S"
64 |
--------------------------------------------------------------------------------
/scikit_posthocs/_global.py:
--------------------------------------------------------------------------------
1 | from typing import Union, List, Tuple
2 | from numpy import array, ndarray, log
3 | from scipy.stats import rankdata, chi2
4 |
5 |
6 | def global_simes_test(p_vals: Union[List, ndarray]) -> float:
7 |     '''Global Simes test of the intersection null hypothesis.
8 |
9 |     Computes the combined p value as min(n * p(i) / i), where p(1), ..., p(n)
10 |     are the ordered p values [1]_.
11 |
12 |     Parameters
13 |     ----------
14 |     p_vals : Union[List, ndarray]
15 |         An array of p values.
16 |
17 |     Returns
18 |     -------
19 |     p_value : float
20 |         Global p value.
21 |
22 |     References
23 |     ----------
24 |     .. [1] Simes, R. J. (1986). An improved Bonferroni procedure for multiple
25 |        tests of significance. Biometrika, 73(3):751-754.
26 |
27 |     Examples
28 |     --------
29 |     >>> import scikit_posthocs as sp
30 |     >>> arr = [0.04, 0.03, 0.98, 0.01, 0.43, 0.99, 1.0, 0.002]
31 |     >>> sp.global_simes_test(arr)
32 |     '''
33 |     arr = array(p_vals)
34 |     ranks = rankdata(arr)
35 |     p_value = min(arr.size * arr / ranks)
36 |     return p_value
37 |
38 |
39 | def global_f_test(
40 |         p_vals: Union[List, ndarray],
41 |         stat: bool = False) -> Union[float, Tuple[float, float]]:
42 |     '''Fisher's combination test for the global null hypothesis.
43 |
44 |     Computes the combined p value using the chi-squared distribution and the
45 |     T statistic: -2 * sum(log(x)) [1]_.
46 |
47 |     Parameters
48 |     ----------
49 |     p_vals : Union[List, ndarray]
50 |         An array or a list of p values.
51 |     stat : bool
52 |         Defines if the statistic should be returned.
53 |
54 |     Returns
55 |     -------
56 |     p_value : float
57 |         Global p value.
58 |     t_stat : float
59 |         Statistic (returned only if ``stat`` is True).
60 |
61 |     References
62 |     ----------
63 |     .. [1] Fisher RA. Statistical methods for research workers,
64 |        London: Oliver and Boyd, 1932.
65 |
66 |     Examples
67 |     --------
68 |     >>> import scikit_posthocs as sp
69 |     >>> x = [0.04, 0.03, 0.98, 0.01, 0.43, 0.99, 1.0, 0.002]
70 |     >>> sp.global_f_test(x)
71 |     '''
72 |     arr = array(p_vals)
73 |     t_stat = -2 * sum(log(arr))
74 |     p_value = chi2.sf(t_stat, df=2 * len(arr))
75 |     return (p_value, t_stat) if stat else p_value
76 |
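77 |
78 | if __name__ == "__main__":
79 |     # A quick, illustrative sanity check (values are arbitrary): Fisher's
80 |     # method combines p values via T = -2 * sum(log(p)); with stat=True
81 |     # both the global p value and T are returned.
82 |     pvals = [0.04, 0.03, 0.98, 0.01, 0.43, 0.99, 1.0, 0.002]
83 |     print(global_f_test(pvals, stat=True))
84 |     print(global_simes_test(pvals))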
--------------------------------------------------------------------------------
/docs/source/intro.rst:
--------------------------------------------------------------------------------
1 | Introduction
2 | ============
3 |
4 | Background
5 | ----------
6 |
7 | The Python statistical ecosystem comprises multiple packages. However, it
8 | still has numerous gaps and is surpassed by R in the range of available
9 | packages and capabilities.
10 |
11 | `SciPy <https://scipy.org/>`_ (version 1.2.0) offers *Student*, *Wilcoxon*,
12 | and *Mann-Whitney* tests which are not adapted to multiple pairwise comparisons.
13 | `Statsmodels <https://www.statsmodels.org/>`_ (version 0.9.0) features the
14 | *TukeyHSD* test which needs some extra actions to be fluently integrated into a
15 | data analysis pipeline. `Statsmodels` also has good helper methods:
16 | ``allpairtest`` (adapts an external function such as ``scipy.stats.ttest_ind``
17 | to multiple pairwise comparisons) and ``multipletests`` (adjusts *p* values to
18 | minimize type I and II errors). `PMCMRplus <https://cran.r-project.org/package=PMCMRplus>`_
19 | is a very good R package which has no rivals in Python as it offers more than 40
20 | various tests (including post hoc tests) for factorial and block design data.
21 | PMCMRplus was an inspiration and a reference for *scikit-posthocs*.
21 |
22 | *scikit-posthocs* attempts to improve Python statistical capabilities by
23 | offering a lot of parametric and nonparametric post hoc tests along with
24 | outliers detection and basic plotting methods.
25 |
26 | Features
27 | --------
28 |
29 | .. image:: _static/flowchart.png
30 |
31 | - *Omnibus* tests:
32 |
33 |   - Durbin test (for balanced incomplete block design).
34 |
35 | - *Parametric* pairwise multiple comparisons tests:
36 |
37 |   - Scheffe test.
38 |   - Student T test.
39 |   - Tamhane T2 test.
40 |   - TukeyHSD test.
41 |
42 | - *Non-parametric* tests for factorial design:
43 |
44 |   - Conover test.
45 |   - Dunn test.
46 |   - Dwass, Steel, Critchlow, and Fligner test.
47 |   - Mann-Whitney test.
48 |   - Nashimoto and Wright (NPM) test.
49 |   - Nemenyi test.
50 |   - van Waerden test.
51 |   - Wilcoxon test.
52 |
53 | - *Non-parametric* tests for block design:
54 |
55 |   - Conover test.
56 |   - Durbin and Conover test.
57 |   - Miller test.
58 |   - Nemenyi test.
59 |   - Quade test.
60 |   - Siegel test.
61 |
62 | - Other tests:
63 |
64 |   - Anderson-Darling test.
65 |   - Mack-Wolfe test.
66 |   - Hayter (OSRT) test.
67 |
68 | - Outliers detection tests:
69 |
70 |   - Simple test based on interquartile range (IQR).
71 |   - Grubbs test.
72 |   - Tietjen-Moore test.
73 |   - Generalized Extreme Studentized Deviate test (ESD test).
74 |
75 | - Plotting functionality:
76 |
77 |   - Significance plots.
78 |   - Critical difference diagrams.
79 |
80 | All post hoc tests are capable of p value adjustments for multiple pairwise
81 | comparisons.
82 |
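83 | Example
84 | -------
85 |
86 | A minimal sketch of a typical factorial-design workflow (data are
87 | illustrative): an omnibus test from SciPy followed by a post hoc test with
88 | adjusted *p* values.
89 |
90 | .. code:: python
91 |
92 |    import scikit_posthocs as sp
93 |    from scipy import stats
94 |
95 |    x = [[1, 2, 3, 5, 1], [12, 31, 54], [10, 12, 6, 74, 11]]
96 |    print(stats.kruskal(*x))
97 |    # Pairwise comparisons with Holm-adjusted p values
98 |    print(sp.posthoc_dunn(x, p_adjust='holm'))
99 |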
--------------------------------------------------------------------------------
/paper/paper.md:
--------------------------------------------------------------------------------
1 | ---
2 | title: 'scikit-posthocs: Pairwise multiple comparison tests in Python'
3 | tags:
4 | - Python
5 | - statistics
6 | - post hoc
7 | authors:
8 | - name: Maksim A. Terpilowski
9 | orcid: 0000-0003-2586-4633
10 | affiliation: 1
11 | affiliations:
12 | - name: Institute of Evolutionary Physiology and Biochemistry, Saint Petersburg, Russia
13 | index: 1
14 | date: 6 December 2018
15 | bibliography: paper.bib
16 | ---
17 |
18 | # Summary
19 |
20 | **scikit-posthocs** is a Python package providing multiple pairwise comparison tests (post hocs). Statisticians, data scientists, and researchers will find it useful in a statistical analysis routine to assess the differences between group levels if a statistically significant result of a parametric or nonparametric analysis of variance (ANOVA) test has been obtained.
21 |
22 | Most statistical software packages (such as SPSS and Stata) provide a number of post hoc tests. However, the Python ecosystem still lacks implementations of these tests for factorial and block design data. Currently, *SciPy* offers the *Student*, *Wilcoxon*, and *Mann-Whitney* tests, which are not adapted to multiple pairwise comparisons. The *Statsmodels* package includes the *TukeyHSD* test, which needs extra actions to be fluently integrated into a data analysis pipeline.
23 |
24 | The ``scikit-posthocs`` package is aimed at filling this gap by providing a number of nonparametric and parametric pairwise comparison tests as well as outlier detection algorithms implemented in Python.
25 |
26 | Currently, the following tests are implemented in this package:
27 |
28 | 1. *Parametric* pairwise multiple comparisons tests: Scheffe, Student T, Tamhane T2, and TukeyHSD tests.
29 | 2. *Non-parametric* tests for factorial design: Conover, Dunn, Dwass-Steel-Critchlow-Fligner, Mann-Whitney, Nashimoto-Wright (NPM), Nemenyi, van Waerden, and Wilcoxon tests.
30 | 3. *Non-parametric* tests for block design: Conover, Durbin and Conover, Miller, Nemenyi, Quade, and Siegel tests.
31 | 4. Additional tests: Anderson-Darling, Mack-Wolfe, and Hayter (OSRT) tests.
32 |
33 | ``scikit-posthocs`` provides tests for outlier detection: the interquartile range (IQR) test, Grubbs test, Tietjen-Moore test, and generalized extreme Studentized deviate (ESD) test. It also has plotting functionality to present the results of pairwise comparisons as a heatmap (significance plot, see figure).
34 |
35 | ![Significance plot.](figure.png)
36 |
37 | ``scikit-posthocs`` is compatible with both Python 2 and 3; it relies heavily on, and extends the functionality of, the ``statsmodels``, ``SciPy``, and ``PMCMRplus`` packages [@Seabold2010; @Jones2001; @Pohlert2018]. It is also integrated with ``Pandas`` [@McKinney2010] and ``NumPy`` [@Oliphant2006] for efficient computations and data analysis. The package is fully documented and comes with a Jupyter notebook example.
38 |
39 | # References
40 |
--------------------------------------------------------------------------------
/images/flowchart.gv:
--------------------------------------------------------------------------------
1 | digraph {
2 |   graph [truecolor=true, bgcolor="#ff000000"];
3 |   node [fontname="Roboto", fontsize=14, style="filled", fillcolor="#ffffff"];
4 |
5 |   factorial [
6 |     label="Factorial Design",
7 |     style="filled",
8 |     shape=box,
9 |     fillcolor="#C7DFFF",
10 |     color="#2666ba",
11 |     fontcolor="#184074"
12 |   ];
13 |
14 |   anova [
15 |     fillcolor="#eec4c6",
16 |     color="#8d1e22",
17 |     fontcolor="#581315",
18 |     label="ANOVA",
19 |     shape=box,
20 |     style="filled",
21 |     width=1.5,
22 |   ];
23 |
24 |   param [
25 |     shape=box,
26 |     style="rounded,filled",
27 |     fontcolor="#746522",
28 |     color="#baa136",
29 |     fillcolor="#fff5cd",
30 |     label="Parametric\nANOVA\n\nscipy.stats.f_oneway()\ntest_osrt()",
31 |     width=0.5
32 |   ];
33 |
34 |   param_posthocs [
35 |     shape=box,
36 |     fontname="Iosevka",
37 |     label="posthoc_scheffe()\nposthoc_tamhane()\nposthoc_ttest()\nposthoc_tukey()\nposthoc_tukey_hsd()"
38 |   ];
39 |
40 |   nonparam [
41 |     shape=box,
42 |     style="rounded,filled",
43 |     fontcolor="#746522",
44 |     color="#baa136",
45 |     fillcolor="#fff5cd",
46 |     label="Non-parametric\nANOVA\n\nscipy.stats.kruskal()",
47 |     width=0.5
48 |   ];
49 |
50 |   nonparam_posthocs [
51 |     shape=box,
52 |     fontname="Iosevka",
53 |     label="posthoc_conover()\nposthoc_dscf()\nposthoc_mannwhitney()\nposthoc_nemenyi()\nposthoc_dunn()\nposthoc_npm_test()\nposthoc_vanwaerden()\nposthoc_wilcoxon()"
54 |   ];
55 |
56 |   blocked [
57 |     label="Block Design",
58 |     shape=box,
59 |     style="filled",
60 |     fillcolor="#C7DFFF",
61 |     color="#2666ba",
62 |     fontcolor="#184074"
63 |   ];
64 |
65 |   friedman [
66 |     fillcolor="#eec4c6",
67 |     color="#8d1e22",
68 |     fontcolor="#581315",
69 |     label="Friedman test\nComplete block design\n\nscipy.stats.friedmanchisquare()",
70 |     shape=box,
71 |     style="filled",
72 |     width=0.5,
73 |   ];
74 |
75 |   friedman_posthocs [
76 |     shape=box,
77 |     fontname="Iosevka",
78 |     label="posthoc_conover_friedman()\nposthoc_miller_friedman()\nposthoc_nemenyi_friedman()\nposthoc_siegel_friedman()\nposthoc_quade()"
79 |   ];
80 |
81 |   durbin [
82 |     fillcolor="#eec4c6",
83 |     color="#8d1e22",
84 |     fontcolor="#581315",
85 |     label="Durbin test\nIncomplete block design\n\ntest_durbin()",
86 |     shape=box,
87 |     style="filled",
88 |     width=0.5,
89 |   ];
90 |
91 |   durbin_posthocs [
92 |     shape=box,
93 |     fontname="Iosevka",
94 |     label="posthoc_durbin()"
95 |   ];
96 |
97 |
98 |   factorial -> anova;
99 |   anova -> param;
100 |   anova -> nonparam;
101 |
102 |   param -> param_posthocs;
103 |   nonparam -> nonparam_posthocs;
104 |
105 |   blocked -> {friedman, durbin};
106 |   friedman -> friedman_posthocs;
107 |   durbin -> durbin_posthocs;
108 | }
109 |
--------------------------------------------------------------------------------
/CODE_OF_CONDUCT.md:
--------------------------------------------------------------------------------
1 | # Contributor Covenant Code of Conduct
2 |
3 | ## Our Pledge
4 |
5 | In the interest of fostering an open and welcoming environment, we as
6 | contributors and maintainers pledge to making participation in our project and
7 | our community a harassment-free experience for everyone, regardless of age, body
8 | size, disability, ethnicity, sex characteristics, gender identity and
9 | expression, level of experience, education, socio-economic status, nationality,
10 | personal appearance, race, religion, or sexual identity and orientation.
11 |
12 | ## Our Standards
13 |
14 | Examples of behavior that contributes to creating a positive environment
15 | include:
16 |
17 | - Using welcoming and inclusive language
18 | - Being respectful of differing viewpoints and experiences
19 | - Gracefully accepting constructive criticism
20 | - Focusing on what is best for the community
21 | - Showing empathy towards other community members
22 |
23 | Examples of unacceptable behavior by participants include:
24 |
25 | - The use of sexualized language or imagery and unwelcome sexual attention or
26 | advances
27 |
28 | - Trolling, insulting/derogatory comments, and personal or political attacks
29 |
30 | - Public or private harassment
31 |
32 | - Publishing others' private information, such as a physical or electronic
33 | address, without explicit permission
34 |
35 | - Other conduct which could reasonably be considered inappropriate in a
36 | professional setting
37 |
38 | ## Our Responsibilities
39 |
40 | Project maintainers are responsible for clarifying the standards of acceptable
41 | behavior and are expected to take appropriate and fair corrective action in
42 | response to any instances of unacceptable behavior.
43 |
44 | Project maintainers have the right and responsibility to remove, edit, or
45 | reject comments, commits, code, wiki edits, issues, and other contributions
46 | that are not aligned to this Code of Conduct, or to ban temporarily or
47 | permanently any contributor for other behaviors that they deem inappropriate,
48 | threatening, offensive, or harmful.
49 |
50 | ## Scope
51 |
52 | This Code of Conduct applies both within project spaces and in public spaces
53 | when an individual is representing the project or its community. Examples of
54 | representing a project or community include using an official project e-mail
55 | address, posting via an official social media account, or acting as an appointed
56 | representative at an online or offline event. Representation of a project may be
57 | further defined and clarified by project maintainers.
58 |
59 | ## Enforcement
60 |
61 | Instances of abusive, harassing, or otherwise unacceptable behavior may be
62 | reported by contacting the project maintainer. All complaints will be reviewed
63 | and investigated and will result in a response that is deemed necessary
64 | and appropriate to the circumstances. The project team is obligated to
65 | maintain confidentiality with regard to the reporter of an incident.
66 | Further details of specific enforcement policies may be posted separately.
67 |
68 | Project maintainers who do not follow or enforce the Code of Conduct in good
69 | faith may face temporary or permanent repercussions as determined by other
70 | members of the project's leadership.
71 |
72 | ## Attribution
73 |
74 | This Code of Conduct is adapted from the [Contributor Covenant][homepage],
75 | version 1.4, available at
76 | <https://www.contributor-covenant.org/version/1/4/code-of-conduct.html>
77 |
78 | For answers to common questions about this code of conduct, see
79 | <https://www.contributor-covenant.org/faq>
80 |
81 | [homepage]: https://www.contributor-covenant.org
82 |
--------------------------------------------------------------------------------
/DESCRIPTION.rst:
--------------------------------------------------------------------------------
1 | scikit-posthocs
2 | ===============
3 |
4 | **scikit-posthocs** is a Python package that provides post hoc tests for
5 | pairwise multiple comparisons, which are usually performed in statistical
6 | data analysis to assess the differences between group levels after a
7 | statistically significant result of an ANOVA test has been obtained.
8 |
9 | **scikit-posthocs** is tightly integrated with Pandas DataFrames and NumPy
10 | arrays to ensure fast computations and convenient data import and storage.
11 |
12 | This package will be useful for statisticians, data analysts, and researchers
13 | who use Python in their work.
14 |
15 |
16 | Background
17 | ----------
18 |
19 | The Python statistical ecosystem comprises multiple packages. However, it still
20 | has numerous gaps and is surpassed by R in the range of available packages and
21 | capabilities.
22 |
23 | `SciPy <https://scipy.org/>`_ (version 1.2.0) offers *Student*, *Wilcoxon*,
24 | and *Mann-Whitney* tests that are not adapted to multiple pairwise comparisons.
25 | `Statsmodels <https://www.statsmodels.org/>`_ (version 0.9.0) features the
26 | *TukeyHSD* test that needs some extra actions to be fluently integrated into
27 | a data analysis pipeline. Statsmodels also has good helper methods:
28 | ``allpairtest`` (adapts an external function such
29 | as ``scipy.stats.ttest_ind`` to multiple pairwise comparisons) and
30 | ``multipletests`` (adjusts *p* values to minimize type I and II errors).
31 | `PMCMRplus <https://cran.r-project.org/package=PMCMRplus>`_ is a very good R
32 | package that has no rivals in Python as it offers more than 40 various tests
33 | (including post hoc tests) for factorial and block design data. PMCMRplus was
34 | an inspiration and a reference for *scikit-posthocs*.
34 |
35 | *scikit-posthocs* attempts to improve Python statistical capabilities by
36 | offering a lot of parametric and nonparametric post hoc tests along with
37 | outliers detection and basic plotting methods.
38 |
39 |
40 | Features
41 | --------
42 |
43 | - *Parametric* pairwise multiple comparisons tests:
44 |
45 |   - Scheffe test.
46 |   - Student T test.
47 |   - Tamhane T2 test.
48 |   - TukeyHSD test.
49 |
50 | - *Non-parametric* tests for factorial design:
51 |
52 |   - Conover test.
53 |   - Dunn test.
54 |   - Dwass, Steel, Critchlow, and Fligner test.
55 |   - Mann-Whitney test.
56 |   - Nashimoto and Wright (NPM) test.
57 |   - Nemenyi test.
58 |   - van Waerden test.
59 |   - Wilcoxon test.
60 |
61 | - *Non-parametric* tests for block design:
62 |
63 |   - Conover test.
64 |   - Durbin and Conover test.
65 |   - Miller test.
66 |   - Nemenyi test.
67 |   - Quade test.
68 |   - Siegel test.
69 |
70 | - Other tests:
71 |
72 |   - Anderson-Darling test.
73 |   - Mack-Wolfe test.
74 |   - Hayter (OSRT) test.
75 |
76 | - Outliers detection tests:
77 |
78 |   - Simple test based on interquartile range (IQR).
79 |   - Grubbs test.
80 |   - Tietjen-Moore test.
81 |   - Generalized Extreme Studentized Deviate test (ESD test).
82 |
83 | - Plotting functionality (e.g. significance plots).
84 |
85 | All post hoc tests are capable of p adjustments for multiple pairwise
86 | comparisons.
87 |
88 | Dependencies
89 | ------------
90 |
91 | - `NumPy and SciPy packages <https://scipy.org/>`_
92 | - `Statsmodels <https://www.statsmodels.org/>`_
93 | - `Pandas <https://pandas.pydata.org/>`_
94 | - `Matplotlib <https://matplotlib.org/>`_
95 | - `Seaborn <https://seaborn.pydata.org/>`_
96 |
97 | Compatibility
98 | -------------
99 |
100 | The package is compatible with Python 3.9 or later.
101 |
102 | Install
103 | -------
104 |
105 | You can install the package using ``pip``:
106 |
107 | .. code:: bash
108 |
109 |    $ pip install scikit-posthocs
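110 |
111 | Example
112 | -------
113 |
114 | A minimal sketch of outlier detection with the IQR test (data are
115 | illustrative; the ``ret='outliers'`` option asks for the outlier values
116 | themselves rather than the filtered array):
117 |
118 | .. code:: python
119 |
120 |    import numpy as np
121 |    import scikit_posthocs as sp
122 |
123 |    x = np.array([4, 5, 6, 10, 12, 4, 3, 1, 2, 3, 23, 5, 3])
124 |    print(sp.outliers_iqr(x, ret='outliers'))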
110 |
--------------------------------------------------------------------------------
/paper/generate.rb:
--------------------------------------------------------------------------------
1 | #!/usr/bin/ruby
2 |
3 | # For an OO language, this is distinctly procedural. Should probably fix that.
4 | require 'json'
5 |
6 | details = Hash.new({})
7 |
8 | capture_params = [
9 |   { :name => "title", :message => "Enter project name." },
10 |   { :name => "url", :message => "Enter the URL of the project repository." },
11 |   { :name => "description", :message => "Enter the (short) project description." },
12 |   { :name => "license", :message => "Enter the license this software is shared under. (hit enter to skip)\nFor example MIT, BSD, GPL v3.0, Apache 2.0" },
13 |   { :name => "doi", :message => "Enter the DOI of the archived version of this code. (hit enter to skip)\nFor example http://dx.doi.org/10.6084/m9.figshare.828487" },
14 |   { :name => "keywords", :message => "Enter keywords that should be associated with this project (hit enter to skip)\nComma-separated, for example: turkey, chicken, pot pie" },
15 |   { :name => "version", :message => "Enter the version of your software (hit enter to skip)\nSEMVER preferred: http://semver.org e.g. v1.0.0" }
16 | ]
17 |
18 | puts "I'm going to try and help you prepare some things for your JOSS submission"
19 | puts "If all goes well then we'll have a nice codemeta.json file soon..."
20 | puts ""
21 | puts "************************************"
22 | puts "* First, some basic details *"
23 | puts "************************************"
24 | puts ""
25 |
26 | # Loop through the desired captures and print out for clarity
27 | capture_params.each do |param|
28 |   puts param[:message]
29 |   print "> "
30 |   input = gets
31 |
32 |   details[param[:name]] = input.chomp
33 |
34 |   puts ""
35 |   puts "OK, your project has #{param[:name]}: #{input}"
36 |   puts ""
37 | end
38 |
39 | puts ""
40 | puts "************************************"
41 | puts "* Experimental stuff *"
42 | puts "************************************"
43 | puts ""
44 |
45 | puts "Would you like me to try and build a list of authors for you?"
46 | puts "(You need to be running this script in a git repository for this to work)"
47 | print "> (Y/N)"
48 | answer = gets.chomp
49 |
50 | case answer.downcase
51 | when "y", "yes"
52 |
53 |   # Use git shortlog to extract a list of author names and commit counts.
54 |   # Note we don't extract emails here as there's often different emails for
55 |   # each user. Instead we capture emails at the end.
56 |
57 |   git_log = `git shortlog --summary --numbered --no-merges`
58 |
59 |   # ["252\tMichael Jackson", "151\tMC Hammer"]
60 |   authors_and_counts = git_log.split("\n").map(&:strip)
61 |
62 |   authors_and_counts.each do |author_count|
63 |     count, author = author_count.split("\t").map(&:strip)
64 |
65 |     puts "Looks like #{author} made #{count} commits"
66 |     puts "Add them to the output?"
67 |     print "> (Y/N)"
68 |     answer = gets.chomp
69 |
70 |     # If a user chooses to add this author to the output then we ask for some
71 |     # additional information including their email, ORCID and affiliation.
72 |     case answer.downcase
73 |     when "y", "yes"
74 |       puts "What is #{author}'s email address? (hit enter to skip)"
75 |       print "> "
76 |       email = gets.chomp
77 |
78 |       puts "What is #{author}'s ORCID? (hit enter to skip)"
79 |       puts "For example: http://orcid.org/0000-0000-0000-0000"
80 |       print "> "
81 |       orcid = gets.chomp
82 |
83 |       puts "What is #{author}'s affiliation? (hit enter to skip)"
84 |       print "> "
85 |       affiliation = gets.chomp
86 |
87 |
88 |       details['authors'].merge!(author => { 'commits' => count,
89 |                                             'email' => email,
90 |                                             'orcid' => orcid,
91 |                                             'affiliation' => affiliation })
92 |
93 |     when "n", "no"
94 |       puts "OK boss..."
95 |       puts ""
96 |     end
97 |   end
98 | when "n", "no"
99 |   puts "OK boss..."
100 |   puts ""
101 | end
102 |
103 | puts "Reticulating splines"
104 |
105 | 5.times do
106 |   print "."
107 |   sleep 0.5
108 | end
109 |
110 | puts ""
111 | puts "Generating some JSON goodness..."
112 |
113 | # TODO: work out how to use some kind of JSON template here.
114 | # Build the output list of authors from the inputs we've collected.
115 | output_authors = []
116 |
117 | details['authors'].each do |author_name, values|
118 |   entry = {
119 |     "@id" => values['orcid'],
120 |     "@type" => "Person",
121 |     "email" => values['email'],
122 |     "name" => author_name,
123 |     "affiliation" => values['affiliation']
124 |   }
125 |   output_authors << entry
126 | end
127 |
128 | # TODO: this is currently a static template (written out here). It would be good
129 | # to do something smarter here.
130 | output = {
131 |   "@context" => "https://raw.githubusercontent.com/codemeta/codemeta/master/codemeta.jsonld",
132 |   "@type" => "Code",
133 |   "author" => output_authors,
134 |   "identifier" => details['doi'],
135 |   "codeRepository" => details['url'],
136 |   "datePublished" => Time.now.strftime("%Y-%m-%d"),
137 |   "dateModified" => Time.now.strftime("%Y-%m-%d"),
138 |   "dateCreated" => Time.now.strftime("%Y-%m-%d"),
139 |   "description" => details['description'],
140 |   "keywords" => details['keywords'],
141 |   "license" => details['license'],
142 |   "title" => details['title'],
143 |   "version" => details['version']
144 | }
145 |
146 | File.open('codemeta.json', 'w') {|f| f.write(JSON.pretty_generate(output)) }
147 |
--------------------------------------------------------------------------------
/docs/source/conf.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | #
3 | # Configuration file for the Sphinx documentation builder.
4 | #
5 | # This file does only contain a selection of the most common options. For a
6 | # full list see the documentation:
7 | # http://www.sphinx-doc.org/en/master/config
8 |
9 |
10 | # -- Path setup --------------------------------------------------------------
11 |
12 | # If extensions (or modules to document with autodoc) are in another directory,
13 | # add these directories to sys.path here. If the directory is relative to the
14 | # documentation root, use os.path.abspath to make it absolute, like shown here.
15 | #
16 | #import os
17 | #import sys
18 | #import scikit_posthocs
19 | #import sphinx_rtd_theme
20 |
21 | #sys.path.insert(0, os.path.abspath('../../'))
22 |
23 |
24 | # -- Project information -----------------------------------------------------
25 |
26 | project = 'scikit-posthocs'
27 | author = 'Maksim Terpilowski'
28 |
29 | # The short X.Y version
30 | version = '0.11'
31 | # The full version, including alpha/beta/rc tags
32 | release = '0.11.4'
33 |
34 |
35 | # -- General configuration ---------------------------------------------------
36 |
37 | # If your documentation needs a minimal Sphinx version, state it here.
38 | #
39 | # needs_sphinx = '1.0'
40 |
41 | # Add any Sphinx extension module names here, as strings. They can be
42 | # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom
43 | # ones.
44 | extensions = [
45 |     'sphinx.ext.autosummary',
46 |     'sphinx.ext.napoleon',
47 |     'sphinx.ext.coverage',
48 |     'sphinx.ext.mathjax',
49 | ]
50 |
51 | autosummary_generate = True
52 |
53 | # Add any paths that contain templates here, relative to this directory.
54 | templates_path = ['_templates']
55 |
56 | # The suffix(es) of source filenames.
57 | # You can specify multiple suffix as a list of string:
58 | #
59 | # source_suffix = ['.rst', '.md']
60 | source_suffix = '.rst'
61 |
62 | # The master toctree document.
63 | master_doc = 'index'
64 |
65 | # The language for content autogenerated by Sphinx. Refer to documentation
66 | # for a list of supported languages.
67 | #
68 | # This is also used if you do content translation via gettext catalogs.
69 | # Usually you set "language" from the command line for these cases.
70 | language = 'en'
71 |
72 | # List of patterns, relative to source directory, that match files and
73 | # directories to ignore when looking for source files.
74 | # This pattern also affects html_static_path and html_extra_path.
75 | exclude_patterns = ['_build', 'Thumbs.db', '.DS_Store']
76 |
77 | # The name of the Pygments (syntax highlighting) style to use.
78 | pygments_style = None
79 |
80 |
81 | # -- Options for HTML output -------------------------------------------------
82 |
83 | # The theme to use for HTML and HTML Help pages. See the documentation for
84 | # a list of builtin themes.
85 | #
86 | html_theme = 'sphinx_rtd_theme'
87 | #html_theme_path = sphinx_bootstrap_theme.get_html_theme_path()
88 |
89 | # Theme options are theme-specific and customize the look and feel of a theme
90 | # further. For a list of options available for each theme, see the
91 | # documentation.
92 | #
93 | html_theme_options = {
94 |     'collapse_navigation': False,
95 |     'sticky_navigation': False,
96 |     'navigation_depth': 4,
97 |     'includehidden': True,
98 |     'titles_only': False
99 | }
100 |
101 | # Add any paths that contain custom static files (such as style sheets) here,
102 | # relative to this directory. They are copied after the builtin static files,
103 | # so a file named "default.css" will overwrite the builtin "default.css".
104 | html_static_path = ['_static']
105 |
106 | # Custom sidebar templates, must be a dictionary that maps document names
107 | # to template names.
108 | #
109 | # The default sidebars (for documents that don't match any pattern) are
110 | # defined by theme itself. Builtin themes are using these templates by
111 | # default: ``['localtoc.html', 'relations.html', 'sourcelink.html',
112 | # 'searchbox.html']``.
113 | #
114 | # html_sidebars = {}
115 |
116 |
117 | # -- Options for HTMLHelp output ---------------------------------------------
118 |
119 | # Output file base name for HTML help builder.
120 | htmlhelp_basename = 'scikit-posthocs-doc'
121 |
122 |
123 | # -- Options for LaTeX output ------------------------------------------------
124 |
125 | latex_elements = {
126 | # The paper size ('letterpaper' or 'a4paper').
127 | #
128 | # 'papersize': 'letterpaper',
129 |
130 | # The font size ('10pt', '11pt' or '12pt').
131 | #
132 | # 'pointsize': '10pt',
133 |
134 | # Additional stuff for the LaTeX preamble.
135 | #
136 | # 'preamble': '',
137 |
138 | # Latex figure (float) alignment
139 | #
140 | # 'figure_align': 'htbp',
141 | }
142 |
143 | # Grouping the document tree into LaTeX files. List of tuples
144 | # (source start file, target name, title,
145 | # author, documentclass [howto, manual, or own class]).
146 | latex_documents = [
147 |     (master_doc, 'scikit-posthocs.tex', 'scikit-posthocs Documentation',
148 |      'Maksim Terpilowski', 'manual'),
149 | ]
150 |
151 |
152 | # -- Options for manual page output ------------------------------------------
153 |
154 | # One entry per manual page. List of tuples
155 | # (source start file, name, description, authors, manual section).
156 | man_pages = [
157 |     (master_doc, 'scikit-posthocs', 'scikit-posthocs Documentation',
158 |      [author], 1)
159 | ]
160 |
161 |
162 | # -- Options for Texinfo output ----------------------------------------------
163 |
164 | # Grouping the document tree into Texinfo files. List of tuples
165 | # (source start file, target name, title, author,
166 | # dir menu entry, description, category)
167 | texinfo_documents = [
168 |     (master_doc, 'scikit-posthocs', 'scikit-posthocs Documentation',
169 |      author, 'scikit-posthocs',
170 |      'Statistical post-hoc analysis and outlier detection algorithms.',
171 |      'Miscellaneous'),
172 | ]
172 |
173 |
174 | # -- Options for Epub output -------------------------------------------------
175 |
176 | # Bibliographic Dublin Core info.
177 | epub_title = project
178 |
179 | # The unique identifier of the text. This can be a ISBN number
180 | # or the project homepage.
181 | #
182 | # epub_identifier = ''
183 |
184 | # A unique identification for the text.
185 | #
186 | # epub_uid = ''
187 |
188 | # A list of files that should not be packed into the epub file.
189 | epub_exclude_files = ['search.html']
190 |
191 |
192 | # -- Extension configuration -------------------------------------------------
193 |
--------------------------------------------------------------------------------
/usage-examples.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": 1,
6 | "metadata": {},
7 | "outputs": [],
8 | "source": [
9 | "import scikit_posthocs as sp\n",
10 | "import numpy as np\n",
11 | "import pandas as pd"
12 | ]
13 | },
14 | {
15 | "cell_type": "markdown",
16 | "metadata": {},
17 | "source": [
18 | "## Without p adjustments"
19 | ]
20 | },
21 | {
22 | "cell_type": "code",
23 | "execution_count": 2,
24 | "metadata": {},
25 | "outputs": [
26 | {
27 | "data": {
28 | "text/plain": [
29 | "array([[ -1.00000000e+00, 3.98391078e-04, 1.39164426e-03],\n",
30 | " [ 3.98391078e-04, -1.00000000e+00, 1.86722274e-01],\n",
31 | " [ 1.39164426e-03, 1.86722274e-01, -1.00000000e+00]])"
32 | ]
33 | },
34 | "execution_count": 2,
35 | "metadata": {},
36 | "output_type": "execute_result"
37 | }
38 | ],
39 | "source": [
40 | "x = [[1,2,3,5,1], [12,31,54], [10,12,6,74,11]]\n",
41 | "sp.posthoc_conover(x)"
42 | ]
43 | },
44 | {
45 | "cell_type": "markdown",
46 | "metadata": {},
47 | "source": [
48 | "## With Holm p adjustment"
49 | ]
50 | },
51 | {
52 | "cell_type": "code",
53 | "execution_count": 3,
54 | "metadata": {},
55 | "outputs": [
56 | {
57 | "data": {
58 | "text/plain": [
59 | "array([[-1. , 0.00119517, 0.00278329],\n",
60 | " [ 0.00119517, -1. , 0.18672227],\n",
61 | " [ 0.00278329, 0.18672227, -1. ]])"
62 | ]
63 | },
64 | "execution_count": 3,
65 | "metadata": {},
66 | "output_type": "execute_result"
67 | }
68 | ],
69 | "source": [
70 | "sp.posthoc_conover(x, p_adjust = 'holm')"
71 | ]
72 | },
73 | {
74 | "cell_type": "markdown",
75 | "metadata": {},
76 | "source": [
77 | "## Exporting to pandas"
78 | ]
79 | },
80 | {
81 | "cell_type": "code",
82 | "execution_count": 4,
83 | "metadata": {},
84 | "outputs": [
85 | {
86 | "data": {
87 | "text/html": [
88 | "\n",
89 | "\n",
102 | "
\n",
103 | " \n",
104 | " \n",
105 | " | \n",
106 | " groups | \n",
107 | " vals | \n",
108 | "
\n",
109 | " \n",
110 | " \n",
111 | " \n",
112 | " | 0 | \n",
113 | " a | \n",
114 | " 1 | \n",
115 | "
\n",
116 | " \n",
117 | " | 1 | \n",
118 | " a | \n",
119 | " 2 | \n",
120 | "
\n",
121 | " \n",
122 | " | 2 | \n",
123 | " a | \n",
124 | " 3 | \n",
125 | "
\n",
126 | " \n",
127 | " | 3 | \n",
128 | " a | \n",
129 | " 5 | \n",
130 | "
\n",
131 | " \n",
132 | " | 4 | \n",
133 | " a | \n",
134 | " 1 | \n",
135 | "
\n",
136 | " \n",
137 | " | 5 | \n",
138 | " b | \n",
139 | " 12 | \n",
140 | "
\n",
141 | " \n",
142 | " | 6 | \n",
143 | " b | \n",
144 | " 31 | \n",
145 | "
\n",
146 | " \n",
147 | " | 7 | \n",
148 | " b | \n",
149 | " 54 | \n",
150 | "
\n",
151 | " \n",
152 | " | 8 | \n",
153 | " c | \n",
154 | " 10 | \n",
155 | "
\n",
156 | " \n",
157 | " | 9 | \n",
158 | " c | \n",
159 | " 12 | \n",
160 | "
\n",
161 | " \n",
162 | " | 10 | \n",
163 | " c | \n",
164 | " 6 | \n",
165 | "
\n",
166 | " \n",
167 | " | 11 | \n",
168 | " c | \n",
169 | " 74 | \n",
170 | "
\n",
171 | " \n",
172 | " | 12 | \n",
173 | " c | \n",
174 | " 11 | \n",
175 | "
\n",
176 | " \n",
177 | "
\n",
178 | "
"
179 | ],
180 | "text/plain": [
181 | " groups vals\n",
182 | "0 a 1\n",
183 | "1 a 2\n",
184 | "2 a 3\n",
185 | "3 a 5\n",
186 | "4 a 1\n",
187 | "5 b 12\n",
188 | "6 b 31\n",
189 | "7 b 54\n",
190 | "8 c 10\n",
191 | "9 c 12\n",
192 | "10 c 6\n",
193 | "11 c 74\n",
194 | "12 c 11"
195 | ]
196 | },
197 | "execution_count": 4,
198 | "metadata": {},
199 | "output_type": "execute_result"
200 | }
201 | ],
202 | "source": [
203 | "groups = [['a']*5, ['b']*3, ['c']*5]\n",
204 | "df = pd.DataFrame({'vals': np.concatenate(x), 'groups': np.concatenate(groups)})\n",
205 | "df"
206 | ]
207 | },
208 | {
209 | "cell_type": "code",
210 | "execution_count": 5,
211 | "metadata": {},
212 | "outputs": [
213 | {
214 | "data": {
215 | "text/html": [
216 | "\n",
217 | "\n",
230 | "
\n",
231 | " \n",
232 | " \n",
233 | " | \n",
234 | " a | \n",
235 | " b | \n",
236 | " c | \n",
237 | "
\n",
238 | " \n",
239 | " \n",
240 | " \n",
241 | " | a | \n",
242 | " -1.000000 | \n",
243 | " 0.000398 | \n",
244 | " 0.001392 | \n",
245 | "
\n",
246 | " \n",
247 | " | b | \n",
248 | " 0.000398 | \n",
249 | " -1.000000 | \n",
250 | " 0.186722 | \n",
251 | "
\n",
252 | " \n",
253 | " | c | \n",
254 | " 0.001392 | \n",
255 | " 0.186722 | \n",
256 | " -1.000000 | \n",
257 | "
\n",
258 | " \n",
259 | "
\n",
260 | "
"
261 | ],
262 | "text/plain": [
263 | " a b c\n",
264 | "a -1.000000 0.000398 0.001392\n",
265 | "b 0.000398 -1.000000 0.186722\n",
266 | "c 0.001392 0.186722 -1.000000"
267 | ]
268 | },
269 | "execution_count": 5,
270 | "metadata": {},
271 | "output_type": "execute_result"
272 | }
273 | ],
274 | "source": [
275 | "result = sp.posthoc_conover(df, val_col='vals', group_col='groups')\n",
276 | "result"
277 | ]
278 | }
279 | ],
280 | "metadata": {
281 | "kernelspec": {
282 | "display_name": "Python 3",
283 | "language": "python",
284 | "name": "python3"
285 | },
286 | "language_info": {
287 | "codemirror_mode": {
288 | "name": "ipython",
289 | "version": 3
290 | },
291 | "file_extension": ".py",
292 | "mimetype": "text/x-python",
293 | "name": "python",
294 | "nbconvert_exporter": "python",
295 | "pygments_lexer": "ipython3",
296 | "version": "3.6.3"
297 | }
298 | },
299 | "nbformat": 4,
300 | "nbformat_minor": 2
301 | }
302 |
--------------------------------------------------------------------------------
/scikit_posthocs/_outliers.py:
--------------------------------------------------------------------------------
1 | from typing import Union, List
2 | import numpy as np
3 | from numpy.typing import ArrayLike
4 | from scipy.stats import t
5 |
6 |
7 | def outliers_iqr(
8 | x: Union[List, np.ndarray], ret: str = "filtered", coef: float = 1.5
9 | ) -> np.ndarray:
10 | """Simple detection of potential outliers based on interquartile range
11 | (IQR). Data that lie within the lower and upper limits are considered
12 | non-outliers. The lower limit is the number that lies 1.5 IQRs below
13 | (coefficient may be changed with an argument, see Parameters)
14 | the first quartile; the upper limit is the number that lies 1.5 IQRs
15 | above the third quartile.
16 |
17 | Parameters
18 | ----------
19 | x : Union[List, np.ndarray]
20 | An array, any object exposing the array interface, containing
21 | data values to check for outliers.
22 |
23 | ret : str = 'filtered'
24 | Specifies object to be returned. Available options are:
25 |
26 | - ``filtered``: return a filtered array (default)
27 | - ``outliers``: return outliers
28 | - ``indices``: return indices of non-outliers
29 | - ``outliers_indices``: return indices of outliers
30 |
31 | coef : float = 1.5
32 | Coefficient by which IQR is multiplied.
33 |
34 | Returns
35 | -------
36 | numpy.ndarray
37 | One of the following objects:
38 |
39 | - Filtered array (default) if ``ret`` is set to ``filtered``.
40 | - Array with indices of elements lying within the specified limits
41 | if ``ret`` is set to ``indices``.
42 | - Array with outliers if ``ret`` is set to ``outliers``.
43 | - Array with indices of outlier elements
44 | if ``ret`` is set to ``outliers_indices``.
45 |
46 | Examples
47 | --------
48 | >>> x = np.array([4, 5, 6, 10, 12, 4, 3, 1, 2, 3, 23, 5, 3])
49 | >>> outliers_iqr(x, ret = 'outliers')
50 | array([12, 23])
51 | """
52 | arr = np.copy(x)
53 |
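54 | # Quartiles and cutoffs: ll = Q1 - coef * IQR, ul = Q3 + coef * IQR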
54 | q1, q3 = np.percentile(arr, [25, 75])
55 | iqr = q3 - q1
56 | ll = q1 - iqr * coef
57 | ul = q3 + iqr * coef
58 |
59 | if ret == "indices":
60 | return np.where((arr > ll) & (arr < ul))[0]
61 | elif ret == "outliers":
62 | return arr[(arr < ll) | (arr > ul)]
63 | elif ret == "outliers_indices":
64 | return np.where((arr < ll) | (arr > ul))[0]
65 | else:
66 | return arr[(arr > ll) & (arr < ul)]
67 |
68 |
69 | def outliers_grubbs(
70 | x: Union[List, np.ndarray], hypo: bool = False, alpha: float = 0.05
71 | ) -> Union[np.ndarray, bool]:
72 | """Grubbs' Test for Outliers [1]_. This is the two-sided version
73 | of the test. The null hypothesis implies that there are no outliers
74 | in the data set.
75 |
76 | Parameters
77 | ----------
78 | x : Union[List, np.ndarray]
79 | An array, any object exposing the array interface, containing
80 | data to test for an outlier in.
81 |
82 | hypo : bool = False
83 | Specifies whether to return a bool value of a hypothesis test result.
84 | Returns ``True`` when we can reject the null hypothesis.
85 | Otherwise, ``False``. Available options are:
86 |
87 | - ``True``: return a hypothesis test result
88 | - ``False``: return a filtered array without an outlier (default)
89 |
90 | alpha : float = 0.05
91 | Significance level for a hypothesis test.
92 |
93 | Returns
94 | -------
95 | Union[np.ndarray, bool]
96 | Returns a filtered array if the alternative hypothesis is accepted,
97 | otherwise the unfiltered array. Returns the hypothesis test result
98 | (bool) instead of an array if ``hypo`` is set to ``True``.
99 |
100 | Notes
101 | -----
102 | .. [1] http://www.itl.nist.gov/div898/handbook/eda/section3/eda35h1.htm
103 |
104 | Examples
105 | --------
106 | >>> x = np.array([199.31,199.53,200.19,200.82,201.92,201.95,202.18,245.57])
107 | >>> outliers_grubbs(x)
108 | array([ 199.31, 199.53, 200.19, 200.82, 201.92, 201.95, 202.18])
109 | """
110 | arr = np.copy(x)
111 | val = np.max(np.abs(arr - np.mean(arr)))
112 | ind = np.argmax(np.abs(arr - np.mean(arr)))
113 | G = val / np.std(arr, ddof=1)
114 | N = len(arr)
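115 | # Reject H0 if G exceeds the two-sided Grubbs critical value
116 | # (N - 1) / sqrt(N) * sqrt(t**2 / (N - 2 + t**2)), where t is the
117 | # 1 - alpha/(2N) quantile of Student's t with N - 2 degrees of freedom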
115 | result = G > (N - 1) / np.sqrt(N) * np.sqrt(
116 | (t.ppf(1 - alpha / (2 * N), N - 2) ** 2) / (N - 2 + t.ppf(1 - alpha / (2 * N), N - 2) ** 2)
117 | )
118 |
119 | if hypo:
120 | return result
121 | else:
122 | if result:
123 | return np.delete(arr, ind)
124 | else:
125 | return arr
126 |
127 |
128 | def outliers_tietjen(
129 | x: Union[List, np.ndarray], k: int, hypo: bool = False, alpha: float = 0.05
130 | ) -> Union[np.ndarray, bool]:
131 | """Tietjen-Moore test [1]_ to detect multiple outliers in a univariate
132 | data set that follows an approximately normal distribution.
133 | The Tietjen-Moore test [2]_ is a generalization of the Grubbs' test to
134 | the case of multiple outliers. If testing for a single outlier,
135 | the Tietjen-Moore test is equivalent to the Grubbs' test.
136 |
137 | The null hypothesis implies that there are no outliers in the data set.
138 |
139 | Parameters
140 | ----------
141 | x : Union[List, np.ndarray]
142 | An array, any object exposing the array interface, containing
143 | data to test for an outlier in.
144 |
145 | k : int
146 | Number of potential outliers to test for. Function tests for
147 | outliers in both tails.
148 |
149 | hypo : bool = False
150 | Specifies whether to return a bool value of a hypothesis test result.
151 | Returns ``True`` when we can reject the null hypothesis.
152 | Otherwise, ``False``. Available options are:
153 |
154 | - ``True``: return a hypothesis test result
155 | - ``False``: return a filtered array without outliers (default).
156 |
157 | alpha : float = 0.05
158 | Significance level for a hypothesis test.
159 |
160 | Returns
161 | -------
162 | Union[numpy.ndarray, bool]
163 | Returns a filtered array if the alternative hypothesis is accepted,
164 | otherwise the unfiltered array. Returns the hypothesis test result
165 | (bool) instead of an array if ``hypo`` is set to ``True``.
166 |
167 | Notes
168 | -----
169 | .. [1] Tietjen and Moore (August 1972), Some Grubbs-Type Statistics
170 | for the Detection of Outliers, Technometrics, 14(3), pp. 583-597.
171 | .. [2] http://www.itl.nist.gov/div898/handbook/eda/section3/eda35h2.htm
172 |
173 | Examples
174 | --------
175 | >>> x = np.array([-1.40, -0.44, -0.30, -0.24, -0.22, -0.13, -0.05, 0.06,
176 | 0.10, 0.18, 0.20, 0.39, 0.48, 0.63, 1.01])
177 | >>> outliers_tietjen(x, 2)
178 | array([-0.44, -0.3 , -0.24, -0.22, -0.13, -0.05, 0.06, 0.1 , 0.18,
179 | 0.2 , 0.39, 0.48, 0.63])
180 | """
181 | arr = np.copy(x)
182 | n = arr.size
183 |
184 | def tietjen(x_, k_):
185 | x_mean = x_.mean()
186 | r = np.abs(x_ - x_mean)
187 | z = x_[r.argsort()]
188 | E = np.sum((z[:-k_] - z[:-k_].mean()) ** 2) / np.sum((z - x_mean) ** 2)
189 | return E
190 |
191 | e_x = tietjen(arr, k)
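192 | # Approximate the null distribution of E by Monte Carlo simulation:
193 | # recompute the statistic on 10000 standard normal samples of size n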
192 | e_norm = np.zeros(10000)
193 |
194 | for i in np.arange(10000):
195 | norm = np.random.normal(size=n)
196 | e_norm[i] = tietjen(norm, k)
197 |
198 | CV = np.percentile(e_norm, alpha * 100)
199 | result = e_x < CV
200 |
201 | if hypo:
202 | return result
203 | else:
204 | if result:
205 | ind = np.argpartition(np.abs(arr - arr.mean()), -k)[-k:]
206 | return np.delete(arr, ind)
207 | else:
208 | return arr
209 |
210 |
211 | def outliers_gesd(
212 | x: ArrayLike,
213 | outliers: int = 5,
214 | hypo: bool = False,
215 | report: bool = False,
216 | alpha: float = 0.05,
217 | ) -> np.ndarray:
218 | """The generalized (Extreme Studentized Deviate) ESD test is used
219 | to detect one or more outliers in a univariate data set that follows
220 | an approximately normal distribution [1]_.
221 |
222 | Parameters
223 | ----------
224 | x : ArrayLike
225 | An array, any object exposing the array interface, containing
226 | data to test for outliers.
227 |
228 | outliers : int = 5
229 | Number of potential outliers to test for. Test is two-tailed, i.e.
230 | maximum and minimum values are checked for potential outliers.
231 |
232 | hypo : bool = False
233 | Specifies whether to return a bool value of a hypothesis test result.
234 | Returns True when we can reject the null hypothesis. Otherwise, False.
235 | Available options are:
236 | 
237 | - ``True``: return a hypothesis test result
238 | - ``False``: return a filtered array without outliers (default)
238 |
239 | report : bool = False
240 | Specifies whether to print a summary table of the test.
241 |
242 | alpha : float = 0.05
243 | Significance level for a hypothesis test.
244 |
245 | Returns
246 | -------
247 | np.ndarray
248 | If hypo is True, returns a boolean array where True indicates an outlier.
249 | If hypo is False, returns the filtered array with outliers removed.
250 |
251 | Notes
252 | -----
253 | .. [1] Rosner, Bernard (May 1983), Percentage Points for a Generalized
254 | ESD Many-Outlier Procedure, Technometrics, 25(2), pp. 165-172.
255 |
256 | Examples
257 | --------
258 | >>> data = np.array([-0.25, 0.68, 0.94, 1.15, 1.2, 1.26, 1.26, 1.34,
259 | 1.38, 1.43, 1.49, 1.49, 1.55, 1.56, 1.58, 1.65, 1.69, 1.7, 1.76,
260 | 1.77, 1.81, 1.91, 1.94, 1.96, 1.99, 2.06, 2.09, 2.1, 2.14, 2.15,
261 | 2.23, 2.24, 2.26, 2.35, 2.37, 2.4, 2.47, 2.54, 2.62, 2.64, 2.9,
262 | 2.92, 2.92, 2.93, 3.21, 3.26, 3.3, 3.59, 3.68, 4.3, 4.64, 5.34,
263 | 5.42, 6.01])
264 | >>> outliers_gesd(data, 5)
265 | array([-0.25, 0.68, 0.94, 1.15, 1.2 , 1.26, 1.26, 1.34, 1.38,
266 | 1.43, 1.49, 1.49, 1.55, 1.56, 1.58, 1.65, 1.69, 1.7 ,
267 | 1.76, 1.77, 1.81, 1.91, 1.94, 1.96, 1.99, 2.06, 2.09,
268 | 2.1 , 2.14, 2.15, 2.23, 2.24, 2.26, 2.35, 2.37, 2.4 ,
269 | 2.47, 2.54, 2.62, 2.64, 2.9 , 2.92, 2.92, 2.93, 3.21,
270 | 3.26, 3.3 , 3.59, 3.68, 4.3 , 4.64])
271 | >>> outliers_gesd(data, outliers = 5, report = True)
272 | H0: no outliers in the data
273 | Ha: up to 5 outliers in the data
274 | Significance level: α = 0.05
275 | Reject H0 if Ri > Critical Value (λi)
276 | Summary Table for Two-Tailed Test
277 | ---------------------------------------
278 | Exact Test Critical
279 | Number of Statistic Value, λi
280 | Outliers, i Value, Ri 5 %
281 | ---------------------------------------
282 | 1 3.119 3.159
283 | 2 2.943 3.151
284 | 3 3.179 3.144 *
285 | 4 2.81 3.136
286 | 5 2.816 3.128
287 | """
288 | rs, ls = np.zeros(outliers, dtype=float), np.zeros(outliers, dtype=float)
289 | ms = []
290 |
291 | data_proc = np.copy(x)
292 | argsort_index = np.argsort(data_proc)
293 | data = data_proc[argsort_index]
294 | n = data_proc.size
295 |
296 | # Lambda values (critical values): do not depend on the outliers.
297 | nol = np.arange(outliers) # the number of outliers
298 | df = n - nol - 2 # degrees of freedom
299 | t_ppr = t.ppf(1 - alpha / (2 * (n - nol)), df)
300 | ls = ((n - nol - 1) * t_ppr) / np.sqrt((df + t_ppr**2) * (n - nol))
301 |
302 | for i in np.arange(outliers):
303 | abs_d = np.abs(data_proc - np.mean(data_proc))
304 |
305 | # R-value calculation
306 | R = np.max(abs_d) / np.std(data_proc, ddof=1)
307 | rs[i] = R
308 |
309 | # Masked values
310 | lms = ms[-1] if len(ms) > 0 else []
311 | ms.append(lms + [np.where(data == data_proc[np.argmax(abs_d)])[0][0]])
312 |
313 | # Remove the observation that maximizes |xi − xmean|
314 | data_proc = np.delete(data_proc, np.argmax(abs_d))
315 |
316 | if report:
317 | report_str = [
318 | "H0: no outliers in the data",
319 | "Ha: up to " + str(outliers) + " outliers in the data",
320 | "Significance level: α = " + str(alpha),
321 | "Reject H0 if Ri > Critical Value (λi)",
322 | "",
323 | "Summary Table for Two-Tailed Test",
324 | "---------------------------------------",
325 | " Exact Test Critical",
326 | " Number of Statistic Value, λi",
327 | "Outliers, i Value, Ri {:5.3g} %".format(100 * alpha),
328 | "---------------------------------------",
329 | ]
330 |
331 | for i, (stat, crit_val) in enumerate(zip(rs, ls)):
332 | report_str.append(
333 | "{: >11s}".format(str(i + 1))
334 | + "{: >15s}".format(str(np.round(stat, 3)))
335 | + "{: >13s}".format(str(np.round(crit_val, 3)))
336 | + (" *" if stat > crit_val else "")
337 | )
338 |
339 | print("\n".join(report_str))
340 |
341 | # Remove masked values
342 | # for which the test statistic is greater
343 | # than the critical value and return the result
344 | if hypo:
345 | data = np.zeros(n, dtype=bool)
346 | if any(rs > ls):
347 | data[ms[np.max(np.where(rs > ls))]] = True
348 | return data
349 | else:
350 | if any(rs > ls):
351 | return np.delete(data, ms[np.max(np.where(rs > ls))])
352 | return data
353 |
--------------------------------------------------------------------------------
/scikit_posthocs/_omnibus.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | from typing import Optional, Union, List, cast
4 | import itertools as it
5 | import numpy as np
6 | from numpy.typing import ArrayLike
7 | import scipy.stats as ss
8 | from pandas import DataFrame, Categorical, Series
9 | from scikit_posthocs._posthocs import __convert_to_df, __convert_to_block_df
10 |
11 |
12 | def test_mackwolfe(
13 | data: Union[ArrayLike, DataFrame],
14 | val_col: Optional[str] = None,
15 | group_col: Optional[str] = None,
16 | p: Optional[int] = None,
17 | n_perm: int = 100,
18 | sort: bool = False,
19 | ) -> tuple[float, float]:
20 | """Mack-Wolfe Test for Umbrella Alternatives.
21 |
22 | In dose-finding studies one may assume an increasing treatment effect with
23 | increasing dose level. However, the test subject may actually succumb to
24 | toxic effects at high doses, which leads to decreasing treatment
25 | effects [1]_, [2]_.
26 |
27 | The scope of the Mack-Wolfe Test is to test for umbrella alternatives for
28 | either a known or unknown point P (i.e. dose-level), where the peak
29 | (umbrella point) is present.
30 |
31 | Parameters
32 | ----------
33 | data : Union[List, numpy.ndarray, DataFrame]
34 | An array, any object exposing the array interface or a pandas
35 | DataFrame with data values.
36 |
37 | val_col : str = None
38 | Name of a DataFrame column that contains dependent variable values
39 | (test or response variable). Values should have a non-nominal scale.
40 | Must be specified if ``data`` is a pandas DataFrame object.
41 |
42 | group_col : str = None
43 | Name of a DataFrame column that contains independent variable values
44 | (grouping or predictor variable). Values should have a nominal scale
45 | (categorical). Must be specified if ``data`` is a pandas DataFrame object.
46 |
47 | p : int = None
48 | The a priori known peak as an ordinal number of the treatment group,
49 | including the zero dose level, i.e. p = {1, ..., k}.
50 | Defaults to None.
51 |
52 | n_perm: int = 100
53 | Number of permutations used to estimate the p value when ``p`` is unknown.
54 |
55 | sort : bool = False
56 | If ``True``, sort data by the group column.
57 |
58 | Returns
59 | -------
60 | tuple[float, float]
61 | P value and statistic.
62 |
63 | References
64 | ----------
65 | .. [1] Chen, I.Y. (1991) Notes on the Mack-Wolfe and Chen-Wolfe Tests for
66 | Umbrella Alternatives. Biom. J., 33, 281-290.
67 | .. [2] Mack, G.A., Wolfe, D. A. (1981) K-sample rank tests for umbrella
68 | alternatives. J. Amer. Statist. Assoc., 76, 175-181.
69 |
70 | Examples
71 | --------
72 | >>> x = [[22, 23, 35], [60, 59, 54], [98, 78, 50], [60, 82, 59], [22, 44, 33], [23, 21, 25]]
73 | >>> sp.test_mackwolfe(x)
74 | """
75 | x, _val_col, _group_col = __convert_to_df(data, val_col, group_col)
76 |
77 | if not sort:
78 | x[_group_col] = Categorical(x[_group_col], categories=x[_group_col].unique(), ordered=True)
79 | x.sort_values(by=[_group_col], ascending=True, inplace=True)
80 |
81 | k = x[_group_col].unique().size
82 |
83 | if p and p > k:
84 | print("Selected 'p' > number of groups:", str(p), " > ", str(k))
85 | return (np.nan, np.nan)
86 | elif p is not None and p < 1:
87 | print("Selected 'p' < 1: ", str(p))
88 | return (np.nan, np.nan)
89 |
90 | Rij = x[_val_col].rank()
91 | n = cast(Series, x.groupby(_group_col, observed=True)[_val_col].count())
92 |
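93 | # _fn is a Mann-Whitney-type count: for each rank in Ri, the number
94 | # of ranks in Rj that exceed it, summed over all of Ri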
93 | def _fn(Ri, Rj):
94 | return np.sum(Ri.apply(lambda x: Rj[Rj > x].size))
95 |
96 | def _ustat(Rij, g, k):
97 | levels = np.unique(g)
98 | U = np.identity(k)
99 |
100 | for i in range(k):
101 | for j in range(i):
102 | U[i, j] = _fn(Rij[x[_group_col] == levels[i]], Rij[x[_group_col] == levels[j]])
103 | U[j, i] = _fn(Rij[x[_group_col] == levels[j]], Rij[x[_group_col] == levels[i]])
104 |
105 | return U
106 |
107 | def _ap(p, U) -> float:
108 | tmp1 = 0.0
109 | if p > 0:
110 | for i in range(p):
111 | for j in range(i + 1, p + 1):
112 | tmp1 += U[i, j]
113 | tmp2 = 0.0
114 | if p < k:
115 | for i in range(p, k):
116 | for j in range(i + 1, k):
117 | tmp2 += U[j, i]
118 |
119 | return tmp1 + tmp2
120 |
121 | def _n1(p: int, n: Series) -> float:
122 | return np.sum(n[: p + 1])
123 |
124 | def _n2(p: int, n: Series) -> float:
125 | return np.sum(n[p:k])
126 |
127 | def _mean_at(p, n) -> float:
128 | N1 = _n1(p, n)
129 | N2 = _n2(p, n)
130 | return (N1**2.0 + N2**2.0 - np.sum(n**2.0) - n.iloc[p] ** 2.0) / 4.0
131 |
132 | def _var_at(p: int, n: Series) -> float:
133 | N1 = _n1(p, n)
134 | N2 = _n2(p, n)
135 | N = np.sum(n)
136 |
137 | var = (
138 | 2.0 * (N1**3 + N2**3)
139 | + 3.0 * (N1**2 + N2**2)
140 | - np.sum(n**2 * (2 * n + 3.0))
141 | - n.iloc[p] ** 2.0 * (2.0 * n.iloc[p] + 3.0)
142 | + 12.0 * n.iloc[p] * N1 * N2
143 | - 12.0 * n.iloc[p] ** 2.0 * N
144 | ) / 72.0
145 | return var
146 |
147 | if p:
148 | # if (x.groupby(_val_col).count() > 1).any().any():
149 | # print("Ties are present")
150 | U = _ustat(Rij, x[_group_col], k)
151 | est = _ap(p, U)
152 | mean = _mean_at(p, n)
153 | sd = np.sqrt(_var_at(p, n))
154 | stat = (est - mean) / sd
155 | p_value = ss.norm.sf(stat).item()
156 | else:
157 | U = _ustat(Rij, x[_group_col], k)
158 | Ap = np.array([_ap(i, U) for i in range(k)]).ravel()
159 | mean = np.array([_mean_at(i, n) for i in range(k)]).ravel()
160 | var = np.array([_var_at(i, n) for i in range(k)]).ravel()
161 | A = (Ap - mean) / np.sqrt(var)
162 | stat = float(np.max(A))
163 |
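164 | # Unknown peak: estimate the p value by permuting the ranks and
165 | # recomputing the maximum standardized A(p) statistic n_perm times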
164 | mt = []
165 | for _ in range(n_perm):
166 | ix = Series(np.random.permutation(Rij))
167 | uix = _ustat(ix, x[_group_col], k)
168 | apix = np.array([_ap(i, uix) for i in range(k)])
169 | astarix = (apix - mean) / np.sqrt(var)
170 | mt.append(np.max(astarix))
171 |
172 | mt = np.array(mt)
173 | p_value = mt[mt > stat].size / n_perm
174 |
175 | return p_value, stat
176 |
177 |
178 | def test_osrt(
179 | data: Union[List, np.ndarray, DataFrame],
180 | val_col: Optional[str] = None,
181 | group_col: Optional[str] = None,
182 | sort: bool = False,
183 | ) -> tuple[float, float, int]:
184 | """Hayter's one-sided studentised range test (OSRT)
185 |
186 | Tests a hypothesis against an ordered alternative for normal data with
187 | equal variances [1]_.
188 |
189 | Parameters
190 | ----------
191 | data : Union[List, numpy.ndarray, DataFrame]
192 | An array, any object exposing the array interface or a pandas
193 | DataFrame with data values.
194 |
195 | val_col : str = None
196 | Name of a DataFrame column that contains dependent variable values
197 | (test or response variable). Values should have a non-nominal scale.
198 | Must be specified if ``data`` is a pandas DataFrame object.
199 |
200 | group_col : str = None
201 | Name of a DataFrame column that contains independent variable values
202 | (grouping or predictor variable). Values should have a nominal scale
203 | (categorical). Must be specified if ``data`` is a pandas DataFrame object.
204 |
205 | sort : bool = False
206 | If True, sort data by the group column.
207 |
208 | Returns
209 | -------
210 | tuple[float, float, int]
211 | P value, statistic, and number of degrees of freedom.
212 |
213 | Notes
214 | -----
215 | P values are computed from the studentized range (Tukey) distribution.
216 |
217 | References
218 | ----------
219 | .. [1] Hayter, A.J.(1990) A One-Sided Studentised Range Test for Testing
220 | Against a Simple Ordered Alternative, Journal of the American
221 | Statistical Association, 85, 778-785.
222 |
223 | Examples
224 | --------
225 | >>> import scikit_posthocs as sp
226 | >>> import pandas as pd
227 | >>> x = pd.DataFrame({"a": [1,2,3,5,1], "b": [12,31,54,62,12], "c": [10,12,6,74,11]})
228 | >>> x = x.melt(var_name='groups', value_name='values')
229 | >>> sp.test_osrt(x, val_col='values', group_col='groups')
230 | """
231 | x, _val_col, _group_col = __convert_to_df(data, val_col, group_col)
232 |
233 | if not sort:
234 | x[_group_col] = Categorical(x[_group_col], categories=x[_group_col].unique(), ordered=True)
235 |
236 | x.sort_values(by=[_group_col], ascending=True, inplace=True)
237 | groups = np.unique(x[_group_col])
238 | x_grouped = x.groupby(_group_col, observed=True)[_val_col]
239 |
240 | xi = x_grouped.mean()
241 | ni = x_grouped.count()
242 | k = groups.size
243 | n = len(x.index)
244 | df = n - k
245 |
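246 | # Pooled within-group variance estimate with df = n - k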
246 | sigma2 = 0
247 | c = -1
248 |
249 | for i in range(k):
250 | for j in range(ni.iloc[i]):
251 | c += 1
252 | sigma2 += (x[_val_col].iloc[c] - xi.iloc[i]) ** 2.0 / df
253 |
254 | sigma = np.sqrt(sigma2)
255 |
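256 | # Studentized-range-type statistic: q = |mean_j - mean_i| / (sigma * sqrt((1/n_i + 1/n_j) / 2))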
256 | def compare(i, j):
257 | dif = xi.loc[groups[j]] - xi.loc[groups[i]]
258 | A = sigma / np.sqrt(2.0) * np.sqrt(1.0 / ni[groups[j]] + 1.0 / ni[groups[i]])
259 | qval = np.abs(dif) / A
260 | return qval
261 |
262 | vs = np.zeros((k, k), dtype=float)
263 | combs = it.combinations(range(k), 2)
264 |
265 | for i, j in combs:
266 | vs[i, j] = compare(i, j)
267 |
268 | stat = np.max(vs)
269 | pval = ss.studentized_range.sf(stat, k, df)
270 | return pval, stat, df
271 |
272 |
273 | def test_durbin(
274 | data: Union[List, np.ndarray, DataFrame],
275 | y_col: Optional[Union[str, int]] = None,
276 | group_col: Optional[Union[str, int]] = None,
277 | block_col: Optional[Union[str, int]] = None,
278 | block_id_col: Optional[Union[str, int]] = None,
279 | melted: bool = False,
280 | sort: bool = True,
281 | ) -> tuple[float, float, int]:
282 | """Durbin's test whether k groups (or treatments) in a two-way
283 | balanced incomplete block design (BIBD) have identical effects. See
284 | references for additional information [1]_, [2]_.
285 |
286 | Parameters
287 | ----------
288 | data : Union[List, np.ndarray, DataFrame]
289 | An array, any object exposing the array interface or a pandas
290 | DataFrame with data values.
291 |
292 | If the ``melted`` argument is set to False (default), ``data`` is a typical
293 | block design matrix, i.e. rows are blocks, and columns are groups.
294 | In this case, you do not need to specify col arguments.
295 | 
296 | If ``data`` is an array and ``melted`` is set to True,
297 | y_col, block_col and group_col must specify the indices of columns
298 | containing elements of the corresponding type.
299 | 
300 | If ``data`` is a Pandas DataFrame and ``melted`` is set to True,
301 | y_col, block_col and group_col must specify column names (strings).
302 |
303 | y_col : Union[str, int] = None
304 | Must be specified if ``data`` is a melted pandas DataFrame object.
305 | Name of the column that contains y data.
306 |
307 | group_col : Union[str, int] = None
308 | Must be specified if ``data`` is a melted pandas DataFrame object.
309 | Name of the column that contains group names.
310 |
311 | block_col : Union[str, int] = None
312 | Must be specified if ``data`` is a melted pandas DataFrame object.
313 | Name of the column that contains block names.
314 |
315 | block_id_col : Union[str, int] = None
316 | Must be specified if ``data`` is a melted pandas DataFrame object.
317 | Name of the column that contains identifiers of block names.
318 | In most cases, this is the same as `block_col` except for those
319 | cases when you have multiple instances of the same blocks.
320 |
321 | melted : bool = False
322 | Specifies if data are given as melted columns "y", "blocks", and
323 | "groups".
324 |
325 | sort : bool = True
326 | If True, sort data by block and group columns.
327 |
328 | Returns
329 | -------
330 | tuple[float, float, int]
331 | P value, statistic, and number of degrees of freedom.
332 |
333 | References
334 | ----------
335 | .. [1] N. A. Heckert, J. J. Filliben. (2003) NIST Handbook 148: Dataplot Reference
336 | Manual, Volume 2: Let Subcommands and Library Functions. National Institute of
337 | Standards and Technology Handbook Series, June 2003.
338 | .. [2] W. J. Conover (1999), Practical nonparametric Statistics,
339 | 3rd. edition, Wiley.
340 |
341 | Examples
342 | --------
343 | >>> x = np.array([[31,27,24],[31,28,31],[45,29,46],[21,18,48],[42,36,46],[32,17,40]])
344 | >>> sp.test_durbin(x)
345 | """
346 | x, _y_col, _group_col, _block_col, _block_id_col = __convert_to_block_df(
347 | data, y_col, group_col, block_col, block_id_col, melted
348 | )
349 |
350 | groups = x[_group_col].unique()
351 | blocks = x[_block_id_col].unique()
352 | if not sort:
353 | x[_group_col] = Categorical(x[_group_col], categories=groups, ordered=True)
354 | x[_block_col] = Categorical(x[_block_col], categories=blocks, ordered=True)
355 | x.sort_values(by=[_block_col, _group_col], ascending=True, inplace=True)
356 | x.dropna(inplace=True)
357 |
358 | t = len(groups)
359 | b = len(blocks)
360 | r = float(b)
361 | k = float(t)
362 |
363 | x["y_ranks"] = x.groupby(_block_id_col, observed=True)[_y_col].rank()
364 | rs = x.groupby(_group_col, observed=True)["y_ranks"].sum().to_numpy()
365 |
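366 | # Durbin statistic T1 is approximately chi-squared distributed
367 | # with t - 1 degrees of freedom under the null hypothesis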
366 | A = float(np.sum(x["y_ranks"] ** 2.0))
367 | C = float(b * k * (k + 1) ** 2.0) / 4.0
368 | D = float(np.sum(rs**2.0)) - r * C
369 | T1 = (t - 1.0) / (A - C) * D
370 | stat = T1
371 | df = t - 1
372 | pval = ss.chi2.sf(stat, df).item()
373 |
374 | return pval, stat, df
375 |
--------------------------------------------------------------------------------
/docs/source/tutorial.rst:
--------------------------------------------------------------------------------
1 | Tutorial
2 | ========
3 |
4 | Parametric ANOVA with post hoc tests
5 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
6 |
7 | Here is a simple example of the one-way analysis of variance (ANOVA) with post hoc tests used to compare *sepal width* means of three groups (three iris species) in the *iris* dataset.
8 |
9 | To begin, we will import the dataset using the statsmodels ``get_rdataset()`` method.
10 |
11 | .. code:: python
12 |
13 | >>> import statsmodels.api as sa
14 | >>> import statsmodels.formula.api as sfa
15 | >>> import scikit_posthocs as sp
16 | >>> df = sa.datasets.get_rdataset('iris').data
17 | >>> df.head()
18 | Sepal.Length Sepal.Width Petal.Length Petal.Width Species
19 | 0 5.1 3.5 1.4 0.2 setosa
20 | 1 4.9 3.0 1.4 0.2 setosa
21 | 2 4.7 3.2 1.3 0.2 setosa
22 | 3 4.6 3.1 1.5 0.2 setosa
23 | 4 5.0 3.6 1.4 0.2 setosa
24 |
25 | Now, we will build a model and run ANOVA using the statsmodels ``ols()`` and ``anova_lm()`` methods. Columns ``Species`` and ``Sepal.Width`` contain independent (predictor) and dependent (response) variable values, respectively.
26 |
27 | .. code:: python
28 |
29 | >>> lm = sfa.ols('Sepal.Width ~ C(Species)', data=df).fit()
30 | >>> anova = sa.stats.anova_lm(lm)
31 | >>> print(anova)
32 | df sum_sq mean_sq F PR(>F)
33 | C(Species) 2.0 11.344933 5.672467 49.16004 4.492017e-17
34 | Residual 147.0 16.962000 0.115388 NaN NaN
35 |
36 | The results tell us that there is a significant difference between group means (p = 4.49e-17), but they do not tell us which group pairs differ. To obtain pairwise group differences, we will carry out a posteriori (post hoc) analysis using the ``scikit-posthocs`` package. Student's T test applied pairwise gives us the following p values:
37 |
38 | .. code:: python
39 |
40 | >>> sp.posthoc_ttest(df, val_col='Sepal.Width', group_col='Species', p_adjust='holm')
41 | setosa versicolor virginica
42 | setosa -1.000000e+00 5.535780e-15 8.492711e-09
43 | versicolor 5.535780e-15 -1.000000e+00 1.819100e-03
44 | virginica 8.492711e-09 1.819100e-03 -1.000000e+00
45 |
46 | Remember to use a `FWER controlling procedure <https://en.wikipedia.org/wiki/Family-wise_error_rate>`_, such as the Holm procedure, when making multiple comparisons. As seen from this table, significant differences in group means are obtained for all group pairs.
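47 | 
48 | For reference, the ``p_adjust`` argument accepts any correction method implemented in ``statsmodels.stats.multitest.multipletests``, such as ``bonferroni``, ``holm``, or ``fdr_bh``. A Benjamini-Hochberg FDR correction, chosen here purely for illustration, would be requested as follows:
49 | 
50 | .. code:: python
51 | 
52 | >>> sp.posthoc_ttest(df, val_col='Sepal.Width', group_col='Species', p_adjust='fdr_bh')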
47 |
48 | Non-parametric ANOVA with post hoc tests
49 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
50 |
51 | If normality and other `assumptions <https://en.wikipedia.org/wiki/Analysis_of_variance>`_ are violated, one can use a non-parametric Kruskal-Wallis H test (one-way non-parametric ANOVA) to test if samples came from the same distribution.
52 |
53 | Let's use the same dataset just to demonstrate the procedure. The Kruskal-Wallis test is implemented in the SciPy package. The ``scipy.stats.kruskal`` method accepts array-like structures, but not DataFrames.
54 |
55 | .. code:: python
56 |
57 | >>> import scipy.stats as ss
58 | >>> import statsmodels.api as sa
59 | >>> import scikit_posthocs as sp
60 | >>> df = sa.datasets.get_rdataset('iris').data
61 | >>> data = [df.loc[ids, 'Sepal.Width'].values for ids in df.groupby('Species').groups.values()]
62 |
63 | ``data`` is a list of 1D arrays containing *sepal width* values, one array per species. Now we can run the Kruskal-Wallis analysis of variance.
64 |
65 | .. code:: python
66 |
67 | >>> H, p = ss.kruskal(*data)
68 | >>> p
69 | 1.5692820940316782e-14
70 |
71 | The p value tells us we may reject the null hypothesis that the population medians of all of the groups are equal. To learn which groups (species) differ in their medians, we need to run post hoc tests. ``scikit-posthocs`` provides many non-parametric post hoc tests. Let's choose Conover's test.
72 |
73 | .. code:: python
74 |
75 | >>> sp.posthoc_conover(df, val_col='Sepal.Width', group_col='Species', p_adjust = 'holm')
76 | setosa versicolor virginica
77 | setosa -1.000000e+00 2.278515e-18 1.293888e-10
78 | versicolor 2.278515e-18 -1.000000e+00 1.881294e-03
79 | virginica 1.293888e-10 1.881294e-03 -1.000000e+00
80 |
81 | Pairwise comparisons show that we may reject the null hypothesis (p < 0.01) for each pair of species and conclude that all groups (species) differ in their sepal widths.
82 |
83 | Block design
84 | ~~~~~~~~~~~~
85 |
86 | In the block design case, we have a primary factor (e.g. treatment) and a blocking factor (e.g. age or gender). A blocking factor is also called a *nuisance* factor, and it is usually a source of variability that needs to be accounted for.
87 |
88 | An example scenario is testing the effect of four fertilizers on crop yield in four cornfields. We can represent the results with a matrix in which rows correspond to the blocking factor (field) and columns correspond to the primary factor (fertilizer).
89 |
90 | The following dataset is artificial and created just for demonstration of the procedure:
91 |
92 | .. code:: python
93 |
94 | >>> data = np.array([[ 8.82, 11.8 , 10.37, 12.08],
95 | [ 8.92, 9.58, 10.59, 11.89],
96 | [ 8.27, 11.46, 10.24, 11.6 ],
97 | [ 8.83, 13.25, 8.33, 11.51]])
98 |
99 | First, we need to perform an omnibus test, the Friedman rank sum test. It is implemented in the ``scipy.stats`` subpackage:
100 |
101 | .. code:: python
102 |
103 | >>> import scipy.stats as ss
104 | >>> ss.friedmanchisquare(*data.T)
105 | FriedmanchisquareResult(statistic=8.700000000000003, pvalue=0.03355726870553798)
106 |
107 | We can reject the null hypothesis that our treatments have the same distribution, because the p value is less than 0.05. A number of post hoc tests are available in the ``scikit-posthocs`` package for unreplicated block design data. In the following example, Nemenyi's test is used:
108 |
109 | .. code:: python
110 |
111 | >>> import scikit_posthocs as sp
112 | >>> sp.posthoc_nemenyi_friedman(data)
113 | 0 1 2 3
114 | 0 -1.000000 0.220908 0.823993 0.031375
115 | 1 0.220908 -1.000000 0.670273 0.823993
116 | 2 0.823993 0.670273 -1.000000 0.220908
117 | 3 0.031375 0.823993 0.220908 -1.000000
118 |
119 | This function returns a DataFrame with p values obtained in pairwise comparisons between all treatments.
120 | One can also pass a DataFrame and specify the names of columns containing dependent variable values, blocking and primary factor values. The following code creates a DataFrame with the same data:
121 |
122 | .. code:: python
123 |
124 | >>> data = pd.DataFrame.from_dict({'blocks': {0: 0, 1: 1, 2: 2, 3: 3, 4: 0, 5: 1, 6:
125 | 2, 7: 3, 8: 0, 9: 1, 10: 2, 11: 3, 12: 0, 13: 1, 14: 2, 15: 3}, 'groups': {0:
126 | 0, 1: 0, 2: 0, 3: 0, 4: 1, 5: 1, 6: 1, 7: 1, 8: 2, 9: 2, 10: 2, 11: 2, 12: 3,
127 | 13: 3, 14: 3, 15: 3}, 'y': {0: 8.82, 1: 8.92, 2: 8.27, 3: 8.83, 4: 11.8, 5:
128 | 9.58, 6: 11.46, 7: 13.25, 8: 10.37, 9: 10.59, 10: 10.24, 11: 8.33, 12: 12.08,
129 | 13: 11.89, 14: 11.6, 15: 11.51}})
130 | >>> data
131 | blocks groups y
132 | 0 0 0 8.82
133 | 1 1 0 8.92
134 | 2 2 0 8.27
135 | 3 3 0 8.83
136 | 4 0 1 11.80
137 | 5 1 1 9.58
138 | 6 2 1 11.46
139 | 7 3 1 13.25
140 | 8 0 2 10.37
141 | 9 1 2 10.59
142 | 10 2 2 10.24
143 | 11 3 2 8.33
144 | 12 0 3 12.08
145 | 13 1 3 11.89
146 | 14 2 3 11.60
147 | 15 3 3 11.51
148 |
149 | This is a *melted* and ready-to-use DataFrame. Do not forget to pass the ``melted`` argument:
150 |
151 | .. code:: python
152 |
153 | >>> sp.posthoc_nemenyi_friedman(data, y_col='y', block_col='blocks', group_col='groups', melted=True)
154 | 0 1 2 3
155 | 0 -1.000000 0.220908 0.823993 0.031375
156 | 1 0.220908 -1.000000 0.670273 0.823993
157 | 2 0.823993 0.670273 -1.000000 0.220908
158 | 3 0.031375 0.823993 0.220908 -1.000000
159 |
160 |
161 | Data types
162 | ~~~~~~~~~~
163 |
164 | Internally, ``scikit-posthocs`` uses NumPy ndarrays and pandas DataFrames to store and process data. Python lists, NumPy ndarrays, and pandas DataFrames are supported as *input* data types. Below are usage examples of various input data structures.
165 |
166 | Lists and arrays
167 | ^^^^^^^^^^^^^^^^
168 |
169 | .. code:: python
170 |
171 | >>> x = [[1,2,1,3,1,4], [12,3,11,9,3,8,1], [10,22,12,9,8,3]]
172 | >>> # or
173 | >>> x = np.array([[1,2,1,3,1,4], [12,3,11,9,3,8,1], [10,22,12,9,8,3]])
174 | >>> sp.posthoc_conover(x, p_adjust='holm')
175 | 1 2 3
176 | 1 -1.000000 0.057606 0.007888
177 | 2 0.057606 -1.000000 0.215761
178 | 3 0.007888 0.215761 -1.000000
179 |
180 | You can check how it is processed with a hidden function ``__convert_to_df()``:
181 |
182 | .. code:: python
183 |
184 | >>> sp.__convert_to_df(x)
185 | ( vals groups
186 | 0 1 1
187 | 1 2 1
188 | 2 1 1
189 | 3 3 1
190 | 4 1 1
191 | 5 4 1
192 | 6 12 2
193 | 7 3 2
194 | 8 11 2
195 | 9 9 2
196 | 10 3 2
197 | 11 8 2
198 | 12 1 2
199 | 13 10 3
200 | 14 22 3
201 | 15 12 3
202 | 16 9 3
203 | 17 8 3
204 | 18 3 3, 'vals', 'groups')
205 |
206 | It returns a tuple of a DataFrame representation and names of the columns containing dependent (``vals``) and independent (``groups``) variable values.
207 |
208 | *Block design* matrix passed as a NumPy ndarray is processed with a hidden ``__convert_to_block_df()`` function:
209 |
210 | .. code:: python
211 |
212 | >>> data = np.array([[ 8.82, 11.8 , 10.37, 12.08],
213 | [ 8.92, 9.58, 10.59, 11.89],
214 | [ 8.27, 11.46, 10.24, 11.6 ],
215 | [ 8.83, 13.25, 8.33, 11.51]])
216 | >>> sp.__convert_to_block_df(data)
217 | ( blocks groups y
218 | 0 0 0 8.82
219 | 1 1 0 8.92
220 | 2 2 0 8.27
221 | 3 3 0 8.83
222 | 4 0 1 11.80
223 | 5 1 1 9.58
224 | 6 2 1 11.46
225 | 7 3 1 13.25
226 | 8 0 2 10.37
227 | 9 1 2 10.59
228 | 10 2 2 10.24
229 | 11 3 2 8.33
230 | 12 0 3 12.08
231 | 13 1 3 11.89
232 | 14 2 3 11.60
233 | 15 3 3 11.51, 'y', 'groups', 'blocks')
234 |
235 | DataFrames
236 | ^^^^^^^^^^
237 |
238 | If you are using DataFrames, you need to pass column names containing variable values to a post hoc function:
239 |
240 | .. code:: python
241 |
242 | >>> import statsmodels.api as sa
243 | >>> import scikit_posthocs as sp
244 | >>> df = sa.datasets.get_rdataset('iris').data
245 | >>> sp.posthoc_conover(df, val_col='Sepal.Width', group_col='Species', p_adjust = 'holm')
246 |
247 | ``val_col`` and ``group_col`` arguments specify the names of the columns containing dependent (response) and independent (grouping) variable values.
248 |
249 | Significance plots
250 | ~~~~~~~~~~~~~~~~~~
251 |
252 | P values can be plotted using a heatmap:
253 |
254 | .. code:: python
255 |
256 | pc = sp.posthoc_conover(x, val_col='values', group_col='groups')
257 | heatmap_args = {'linewidths': 0.25, 'linecolor': '0.5', 'clip_on': False, 'square': True, 'cbar_ax_bbox': [0.80, 0.35, 0.04, 0.3]}
258 | sp.sign_plot(pc, **heatmap_args)
259 |
260 | .. image:: _static/plot-conover.png
261 |
262 | Custom colormap applied to a plot:
263 |
264 | .. code:: python
265 |
266 | pc = sp.posthoc_conover(x, val_col='values', group_col='groups')
267 | # Format: diagonal, non-significant, p<0.001, p<0.01, p<0.05
268 | cmap = ['1', '#fb6a4a', '#08306b', '#4292c6', '#c6dbef']
269 | heatmap_args = {'cmap': cmap, 'linewidths': 0.25, 'linecolor': '0.5', 'clip_on': False, 'square': True, 'cbar_ax_bbox': [0.80, 0.35, 0.04, 0.3]}
270 | sp.sign_plot(pc, **heatmap_args)
271 |
272 | .. image:: _static/plot-conover-custom-cmap.png
273 |
274 |
275 | Critical difference diagrams
276 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~
277 |
278 | Critical difference diagrams are another interesting way of visualizing post
279 | hoc test statistics. Firstly, in a block design scenario, the values within
280 | each block are ranked, and the average rank across all blocks for each
281 | treatment is plotted along the x axis. A crossbar is then drawn over each
282 | group of treatments that do not show a statistically significant difference
283 | among themselves.
284 |
285 | .. image:: _static/cd_diagram0.png
286 |
287 | As an example, suppose we have a set of 8 treatments with 30 measurements
288 | (blocks) each, as simulated below. It could, for instance, represent scores for
289 | eight machine learning models in a 30-fold cross-validation setting.
290 |
291 | .. code:: python
292 |
293 | >>> rng = np.random.default_rng(1)
294 | >>> dict_data = {
295 | 'model1': rng.normal(loc=0.2, scale=0.1, size=30),
296 | 'model2': rng.normal(loc=0.2, scale=0.1, size=30),
297 | 'model3': rng.normal(loc=0.4, scale=0.1, size=30),
298 | 'model4': rng.normal(loc=0.5, scale=0.1, size=30),
299 | 'model5': rng.normal(loc=0.7, scale=0.1, size=30),
300 | 'model6': rng.normal(loc=0.7, scale=0.1, size=30),
301 | 'model7': rng.normal(loc=0.8, scale=0.1, size=30),
302 | 'model8': rng.normal(loc=0.9, scale=0.1, size=30),
303 | }
304 | >>> data = (
305 | pd.DataFrame(dict_data)
306 | .rename_axis('cv_fold')
307 | .melt(
308 | var_name='estimator',
309 | value_name='score',
310 | ignore_index=False,
311 | )
312 | .reset_index()
313 | )
314 | >>> data
315 | cv_fold estimator score
316 | 0 0 model1 0.234558
317 | 1 1 model1 0.282162
318 | 2 2 model1 0.233044
319 | 3 3 model1 0.069684
320 | 4 4 model1 0.290536
321 | .. ... ... ...
322 | 235 25 model8 0.925956
323 | 236 26 model8 0.758762
324 | 237 27 model8 0.977032
325 | 238 28 model8 0.829890
326 | 239 29 model8 0.787381
327 |
328 | [240 rows x 3 columns]
329 |
330 | The average (percentile) ranks could be calculated as follows:
331 |
332 | .. code:: python
333 |
334 | >>> avg_rank = data.groupby('cv_fold').score.rank(pct=True).groupby(data.estimator).mean()
335 | >>> avg_rank
336 |
337 | estimator
338 | model1 0.208333
339 | model2 0.191667
340 | model3 0.366667
341 | model4 0.495833
342 | model5 0.708333
343 | model6 0.737500
344 | model7 0.850000
345 | model8 0.941667
346 | Name: score, dtype: float64
347 |
348 | Again, the omnibus test result shows we can confidently reject the null
349 | hypothesis that all models come from the same distribution and proceed to the
350 | post hoc analysis.
351 |
352 | .. code:: python
353 |
354 | >>> import scipy.stats as ss
355 | >>> ss.friedmanchisquare(*dict_data.values())
356 | FriedmanchisquareResult(statistic=186.9000000000001, pvalue=6.787361102785178e-37)
357 |
358 | The results of a post hoc Conover test are collected:
359 |
360 | .. code:: python
361 |
362 | >>> test_results = sp.posthoc_conover_friedman(
363 | >>> data,
364 | >>> melted=True,
365 | >>> block_col='cv_fold',
366 | >>> group_col='estimator',
367 | >>> y_col='score',
368 | >>> )
369 | >>> sp.sign_plot(test_results)
370 |
371 | .. image:: _static/cd_diagram_example_sig_plot.png
372 |
373 | Finally, the average ranks and post hoc significance results can be passed to
374 | the ``critical_difference_diagram()`` function to plot the diagram:
375 |
376 | .. code:: python
377 |
378 | >>> plt.figure(figsize=(10, 2), dpi=100)
379 | >>> plt.title('Critical difference diagram of average score ranks')
380 | >>> sp.critical_difference_diagram(avg_rank, test_results)
381 |
382 | .. image:: _static/cd_diagram1.png
383 |
384 | The diagram shows that model 8 is significantly better ranked than all models
385 | but model 7, that models 1 and 2 are worse than the others, and that 3 and 4
386 | are also worse ranked than models 5, 6 and 7. Other comparisons, however, do
387 | not have sufficient statistical evidence to support them.
388 |
389 | Several style customization options are available:
390 |
391 | .. code:: python
392 |
393 | >>> plt.figure(figsize=(10, 2), dpi=100)
394 | >>> plt.title('Critical difference diagram of average score ranks')
395 | >>> sp.critical_difference_diagram(
396 | >>> ranks=avg_rank,
397 | >>> sig_matrix=test_results,
398 | >>> label_fmt_left='{label} [{rank:.3f}] ',
399 | >>> label_fmt_right=' [{rank:.3f}] {label}',
400 | >>> text_h_margin=0.3,
401 | >>> label_props={'color': 'black', 'fontweight': 'bold'},
402 | >>> crossbar_props={'color': None, 'marker': 'o'},
403 | >>> marker_props={'marker': '*', 's': 150, 'color': 'y', 'edgecolor': 'k'},
404 | >>> elbow_props={'color': 'gray'},
405 | >>> )
406 |
407 | .. image:: _static/cd_diagram2.png
408 |
--------------------------------------------------------------------------------
/README.rst:
--------------------------------------------------------------------------------
1 | .. image:: images/logo.png
2 |
3 | ===============
4 |
5 | .. image:: http://joss.theoj.org/papers/10.21105/joss.01169/status.svg
6 | :target: https://doi.org/10.21105/joss.01169
7 | .. image:: https://img.shields.io/github/actions/workflow/status/maximtrp/scikit-posthocs/package-test.yml?label=build
8 | :target: https://github.com/maximtrp/scikit-posthocs/actions/workflows/package-test.yml
9 | .. image:: https://img.shields.io/readthedocs/scikit-posthocs.svg
10 | :target: https://scikit-posthocs.readthedocs.io
11 | .. image:: https://img.shields.io/codacy/coverage/50d2a82a6dd84b51b515cebf931067d7/master
12 | :target: https://app.codacy.com/gh/maximtrp/scikit-posthocs/dashboard
13 | .. image:: https://img.shields.io/codacy/grade/50d2a82a6dd84b51b515cebf931067d7
14 | :target: https://www.codacy.com/gh/maximtrp/scikit-posthocs/dashboard
15 | .. image:: https://static.pepy.tech/badge/scikit-posthocs
16 | :target: https://pepy.tech/project/scikit-posthocs
17 | .. image:: https://img.shields.io/github/issues/maximtrp/scikit-posthocs.svg
18 | :target: https://github.com/maximtrp/scikit-posthocs/issues
19 | .. image:: https://img.shields.io/pypi/v/scikit-posthocs.svg
20 | :target: https://pypi.python.org/pypi/scikit-posthocs/
21 | .. image:: https://img.shields.io/conda/vn/conda-forge/scikit-posthocs.svg
22 | :target: https://anaconda.org/conda-forge/scikit-posthocs
23 |
24 | ===============
25 |
26 | **scikit-posthocs** is a Python package that provides post hoc tests for
27 | pairwise multiple comparisons that are usually performed in statistical
28 | data analysis to assess the differences between group levels if a statistically
29 | significant result of an ANOVA test has been obtained.
30 |
31 | **scikit-posthocs** is tightly integrated with Pandas DataFrames and NumPy
32 | arrays to ensure fast computations and convenient data import and storage.
33 |
34 | This package will be useful for statisticians, data analysts, and
35 | researchers who use Python in their work.
36 |
37 |
38 | Background
39 | ----------
40 |
41 | The Python statistical ecosystem comprises multiple packages. However, it
42 | still has numerous gaps and is surpassed by R and its packages in some areas.
43 |
44 | `SciPy <https://scipy.org>`_ (version 1.2.0) offers *Student*, *Wilcoxon*,
45 | and *Mann-Whitney* tests that are not adapted to multiple pairwise
46 | comparisons. `Statsmodels <https://www.statsmodels.org>`_ (version 0.9.0)
47 | features a *TukeyHSD* test that needs some extra work to be fluently
48 | integrated into a data analysis pipeline.
49 | `Statsmodels <https://www.statsmodels.org>`_ also has good helper
50 | methods: ``allpairtest`` (adapts an external function such as
51 | ``scipy.stats.ttest_ind`` to multiple pairwise comparisons) and
52 | ``multipletests`` (adjusts *p* values to minimize type I and II errors).
53 | `PMCMRplus <https://cran.r-project.org/package=PMCMRplus>`_ is a very good R package that
54 | has no rivals in Python as it offers more than 40 various tests (including
55 | post hoc tests) for factorial and block design data. PMCMRplus was an
56 | inspiration and a reference for *scikit-posthocs*.
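57 | 
58 | For instance, ``multipletests`` can be used on its own to adjust a set of
59 | *p* values (the values below are made up for illustration):
60 | 
61 | .. code:: python
62 | 
63 | >>> from statsmodels.stats.multitest import multipletests
64 | >>> pvals = [0.01, 0.04, 0.03]
65 | >>> reject, p_adjusted, _, _ = multipletests(pvals, method='holm')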
57 |
58 | **scikit-posthocs** attempts to improve Python statistical capabilities by
59 | offering many parametric and nonparametric post hoc tests along with
60 | outlier detection and basic plotting methods.
61 |
62 |
63 | Features
64 | --------
65 |
66 | .. image:: images/flowchart.png
67 | :alt: Tests Flowchart
68 |
69 | - *Omnibus* tests:
70 |
71 | - Durbin test (for balanced incomplete block design).
72 | - Mack-Wolfe test.
73 | - Hayter (OSRT) test.
74 |
75 | - *Parametric* pairwise multiple comparisons tests:
76 |
77 | - Scheffe test.
78 | - Student T test.
79 | - Tamhane T2 test.
80 | - TukeyHSD test.
81 |
82 | - *Non-parametric* tests for factorial design:
83 |
84 | - Conover test.
85 | - Dunn test.
86 | - Dwass, Steel, Critchlow, and Fligner test.
87 | - Mann-Whitney test.
88 | - Nashimoto and Wright (NPM) test.
89 | - Nemenyi test.
90 | - van der Waerden test.
91 | - Wilcoxon test.
92 |
93 | - *Non-parametric* tests for block design:
94 |
95 | - Conover test.
96 | - Durbin and Conover test.
97 | - Miller test.
98 | - Nemenyi test.
99 | - Quade test.
100 | - Siegel test.
101 |
102 | - Outliers detection tests:
103 |
104 | - Simple test based on interquartile range (IQR).
105 | - Grubbs test.
106 | - Tietjen-Moore test.
107 | - Generalized Extreme Studentized Deviate test (ESD test).
108 |
109 | - Other tests:
110 |
111 | - Anderson-Darling test.
112 |
113 | - Global null hypothesis tests:
114 |
115 | - Fisher's combination test.
116 | - Simes test.
117 |
118 | - Plotting functionality (e.g. significance plots).
119 |
120 | All post hoc tests are capable of p value adjustment for multiple
121 | pairwise comparisons.
122 |
123 | Dependencies
124 | ------------
125 |
126 | - `NumPy <https://numpy.org>`_ and `SciPy <https://scipy.org>`_
127 | - `Statsmodels <https://www.statsmodels.org>`_
128 | - `Pandas <https://pandas.pydata.org>`_
129 | - `Matplotlib <https://matplotlib.org>`_
130 | - `Seaborn <https://seaborn.pydata.org>`_
131 |
132 | Compatibility
133 | -------------
134 |
135 | The package is compatible with Python 3 only.
136 |
137 | Install
138 | -------
139 |
140 | You can install the package using ``pip`` (from PyPI):
141 |
142 | .. code:: bash
143 |
144 | pip install scikit-posthocs
145 |
146 | Or using ``conda`` (from conda-forge repo):
147 |
148 | .. code:: bash
149 |
150 | conda install -c conda-forge scikit-posthocs
151 |
152 | The latest version from GitHub can be installed using:
153 |
154 | .. code:: bash
155 |
156 | pip install git+https://github.com/maximtrp/scikit-posthocs.git
157 |
158 | Examples
159 | --------
160 |
161 | Parametric ANOVA with post hoc tests
162 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
163 |
164 | Here is a simple example of the one-way analysis of variance (ANOVA)
165 | with post hoc tests used to compare *sepal width* means of three
166 | groups (three iris species) in the *iris* dataset.
167 |
168 | To begin, we will import the dataset using the statsmodels
169 | ``get_rdataset()`` method.
170 |
171 | .. code:: python
172 |
173 | >>> import statsmodels.api as sa
174 | >>> import statsmodels.formula.api as sfa
175 | >>> import scikit_posthocs as sp
176 | >>> df = sa.datasets.get_rdataset('iris').data
177 | >>> df.columns = df.columns.str.replace('.', '')
178 | >>> df.head()
179 | SepalLength SepalWidth PetalLength PetalWidth Species
180 | 0 5.1 3.5 1.4 0.2 setosa
181 | 1 4.9 3.0 1.4 0.2 setosa
182 | 2 4.7 3.2 1.3 0.2 setosa
183 | 3 4.6 3.1 1.5 0.2 setosa
184 | 4 5.0 3.6 1.4 0.2 setosa
185 |
186 | Now, we will build a model and run ANOVA using the statsmodels ``ols()``
187 | and ``anova_lm()`` methods. Columns ``Species`` and ``SepalWidth``
188 | contain independent (predictor) and dependent (response) variable
189 | values, respectively.
190 |
191 | .. code:: python
192 |
193 | >>> lm = sfa.ols('SepalWidth ~ C(Species)', data=df).fit()
194 | >>> anova = sa.stats.anova_lm(lm)
195 | >>> print(anova)
196 | df sum_sq mean_sq F PR(>F)
197 | C(Species) 2.0 11.344933 5.672467 49.16004 4.492017e-17
198 | Residual 147.0 16.962000 0.115388 NaN NaN
199 |
200 | The results tell us that there is a significant difference between
201 | group means (p = 4.49e-17), but they do not tell us which group pairs
202 | differ. To obtain pairwise group differences, we will carry out
203 | a posteriori (post hoc) analysis using the ``scikit-posthocs`` package.
204 | Student's T test applied pairwise gives us the following p values:
205 |
206 | .. code:: python
207 |
208 | >>> sp.posthoc_ttest(df, val_col='SepalWidth', group_col='Species', p_adjust='holm')
209 | setosa versicolor virginica
210 | setosa -1.000000e+00 5.535780e-15 8.492711e-09
211 | versicolor 5.535780e-15 -1.000000e+00 1.819100e-03
212 | virginica 8.492711e-09 1.819100e-03 -1.000000e+00
213 |
214 | Remember to use a `FWER controlling procedure <https://en.wikipedia.org/wiki/Family-wise_error_rate>`_,
215 | such as the Holm procedure, when making multiple comparisons. As seen from this
216 | table, significant differences in group means are obtained for all group pairs.
217 |
218 | Non-parametric ANOVA with post hoc tests
219 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
220 |
221 | If normality and other `assumptions <https://en.wikipedia.org/wiki/Analysis_of_variance>`_
222 | are violated, one can use a non-parametric Kruskal-Wallis H test (one-way
223 | non-parametric ANOVA) to test if samples came from the same distribution.
224 |
225 | Let's use the same dataset just to demonstrate the procedure. The Kruskal-Wallis
226 | test is implemented in the SciPy package. The ``scipy.stats.kruskal`` method
227 | accepts array-like structures, but not DataFrames.
228 |
229 | .. code:: python
230 |
231 | >>> import scipy.stats as ss
232 | >>> import statsmodels.api as sa
233 | >>> import scikit_posthocs as sp
234 | >>> df = sa.datasets.get_rdataset('iris').data
235 | >>> df.columns = df.columns.str.replace('.', '')
236 | >>> data = [df.loc[ids, 'SepalWidth'].values for ids in df.groupby('Species').groups.values()]
237 |
238 | ``data`` is a list of 1D arrays containing *sepal width* values, one array
239 | per species. Now we can run the Kruskal-Wallis analysis of variance.
240 |
241 | .. code:: python
242 |
243 | >>> H, p = ss.kruskal(*data)
244 | >>> p
245 | 1.5692820940316782e-14
246 |
247 | The p value tells us we may reject the null hypothesis that the population medians
248 | of all of the groups are equal. To learn which groups (species) differ in their
249 | medians, we need to run post hoc tests. ``scikit-posthocs`` provides many of the
250 | non-parametric tests mentioned above. Let's choose Conover's test.
251 |
252 | .. code:: python
253 |
254 | >>> sp.posthoc_conover(df, val_col='SepalWidth', group_col='Species', p_adjust = 'holm')
255 | setosa versicolor virginica
256 | setosa -1.000000e+00 2.278515e-18 1.293888e-10
257 | versicolor 2.278515e-18 -1.000000e+00 1.881294e-03
258 | virginica 1.293888e-10 1.881294e-03 -1.000000e+00
259 |
260 | Pairwise comparisons show that we may reject the null hypothesis (p < 0.01) for
261 | each pair of species and conclude that all groups (species) differ in their
262 | sepal widths.
263 |
264 | Block design
265 | ~~~~~~~~~~~~
266 |
267 | In the block design case, we have a primary factor (e.g. treatment) and a blocking
268 | factor (e.g. age or gender). A blocking factor is also called a *nuisance*
269 | factor, and it is usually a source of variability that needs to be accounted
270 | for.
271 |
272 | An example scenario is testing the effect of four fertilizers on crop yield in
273 | four cornfields. We can represent the results with a matrix in which rows
274 | correspond to the blocking factor (field) and columns correspond to the
275 | primary factor (fertilizer).
276 |
277 | The following dataset is artificial and created just for demonstration
278 | of the procedure:
279 |
280 | .. code:: python
281 |
282 | >>> data = np.array([[ 8.82, 11.8 , 10.37, 12.08],
283 | [ 8.92, 9.58, 10.59, 11.89],
284 | [ 8.27, 11.46, 10.24, 11.6 ],
285 | [ 8.83, 13.25, 8.33, 11.51]])
286 |
287 | First, we need to perform an omnibus test, the Friedman rank sum test. It is
288 | implemented in the ``scipy.stats`` subpackage:
289 |
290 | .. code:: python
291 |
292 | >>> import scipy.stats as ss
293 | >>> ss.friedmanchisquare(*data.T)
294 | FriedmanchisquareResult(statistic=8.700000000000003, pvalue=0.03355726870553798)
295 |
296 | We can reject the null hypothesis that our treatments have the same
297 | distribution, because the p value is less than 0.05. A number of post hoc tests
298 | are available in the ``scikit-posthocs`` package for unreplicated block design
299 | data. In the following example, Nemenyi's test is used:
300 |
301 | .. code:: python
302 |
303 | >>> import scikit_posthocs as sp
304 | >>> sp.posthoc_nemenyi_friedman(data)
305 | 0 1 2 3
306 | 0 -1.000000 0.220908 0.823993 0.031375
307 | 1 0.220908 -1.000000 0.670273 0.823993
308 | 2 0.823993 0.670273 -1.000000 0.220908
309 | 3 0.031375 0.823993 0.220908 -1.000000
310 |
311 | This function returns a DataFrame with p values obtained in pairwise
312 | comparisons between all treatments.
313 | One can also pass a DataFrame and specify the names of the columns containing
314 | the dependent variable, the blocking factor, and the primary factor values.
315 | The following code creates a DataFrame with the same data:
316 |
317 | .. code:: python
318 |
319 | >>> import pandas as pd
    | >>> data = pd.DataFrame.from_dict(
    | ...     {'blocks': {0: 0, 1: 1, 2: 2, 3: 3, 4: 0, 5: 1, 6: 2, 7: 3,
    | ...                 8: 0, 9: 1, 10: 2, 11: 3, 12: 0, 13: 1, 14: 2, 15: 3},
    | ...      'groups': {0: 0, 1: 0, 2: 0, 3: 0, 4: 1, 5: 1, 6: 1, 7: 1,
    | ...                 8: 2, 9: 2, 10: 2, 11: 2, 12: 3, 13: 3, 14: 3, 15: 3},
    | ...      'y': {0: 8.82, 1: 8.92, 2: 8.27, 3: 8.83, 4: 11.8, 5: 9.58,
    | ...            6: 11.46, 7: 13.25, 8: 10.37, 9: 10.59, 10: 10.24,
    | ...            11: 8.33, 12: 12.08, 13: 11.89, 14: 11.6, 15: 11.51}})
325 | >>> data
326 | blocks groups y
327 | 0 0 0 8.82
328 | 1 1 0 8.92
329 | 2 2 0 8.27
330 | 3 3 0 8.83
331 | 4 0 1 11.80
332 | 5 1 1 9.58
333 | 6 2 1 11.46
334 | 7 3 1 13.25
335 | 8 0 2 10.37
336 | 9 1 2 10.59
337 | 10 2 2 10.24
338 | 11 3 2 8.33
339 | 12 0 3 12.08
340 | 13 1 3 11.89
341 | 14 2 3 11.60
342 | 15 3 3 11.51
343 |
344 | This is a *melted*, ready-to-use DataFrame. Do not forget to pass the
345 | ``melted`` argument:
346 |
347 | .. code:: python
348 |
349 | >>> sp.posthoc_nemenyi_friedman(data, y_col='y', block_col='blocks', group_col='groups', melted=True)
350 | 0 1 2 3
351 | 0 -1.000000 0.220908 0.823993 0.031375
352 | 1 0.220908 -1.000000 0.670273 0.823993
353 | 2 0.823993 0.670273 -1.000000 0.220908
354 | 3 0.031375 0.823993 0.220908 -1.000000
355 |
356 |
357 | Data types
358 | ~~~~~~~~~~
359 |
360 | Internally, ``scikit-posthocs`` uses NumPy ndarrays and pandas DataFrames to
361 | store and process data. Python lists, NumPy ndarrays, and pandas DataFrames
362 | are supported as *input* data types. Below are usage examples of various
363 | input data structures.
364 |
365 | Lists and arrays
366 | ^^^^^^^^^^^^^^^^
367 |
368 | .. code:: python
369 |
370 | >>> x = [[1,2,1,3,1,4], [12,3,11,9,3,8,1], [10,22,12,9,8,3]]
371 | >>> # or, as a NumPy array (dtype=object, since the groups differ in length)
372 | >>> x = np.array([[1,2,1,3,1,4], [12,3,11,9,3,8,1], [10,22,12,9,8,3]], dtype=object)
373 | >>> sp.posthoc_conover(x, p_adjust='holm')
374 | 1 2 3
375 | 1 -1.000000 0.057606 0.007888
376 | 2 0.057606 -1.000000 0.215761
377 | 3 0.007888 0.215761 -1.000000
378 |
379 | You can check how the input is processed with the private ``__convert_to_df()`` function:
380 |
381 | .. code:: python
382 |
383 | >>> sp.__convert_to_df(x)
384 | ( vals groups
385 | 0 1 1
386 | 1 2 1
387 | 2 1 1
388 | 3 3 1
389 | 4 1 1
390 | 5 4 1
391 | 6 12 2
392 | 7 3 2
393 | 8 11 2
394 | 9 9 2
395 | 10 3 2
396 | 11 8 2
397 | 12 1 2
398 | 13 10 3
399 | 14 22 3
400 | 15 12 3
401 | 16 9 3
402 | 17 8 3
403 | 18 3 3, 'vals', 'groups')
404 |
405 | It returns a tuple of a DataFrame representation and the names of the columns
406 | containing dependent (``vals``) and independent (``groups``) variable values.
407 |
408 | A *block design* matrix passed as a NumPy ndarray is processed with the
409 | private ``__convert_to_block_df()`` function:
410 |
411 | .. code:: python
412 |
413 | >>> data = np.array([[ 8.82, 11.8 , 10.37, 12.08],
414 | [ 8.92, 9.58, 10.59, 11.89],
415 | [ 8.27, 11.46, 10.24, 11.6 ],
416 | [ 8.83, 13.25, 8.33, 11.51]])
417 | >>> sp.__convert_to_block_df(data)
418 | ( blocks groups y
419 | 0 0 0 8.82
420 | 1 1 0 8.92
421 | 2 2 0 8.27
422 | 3 3 0 8.83
423 | 4 0 1 11.80
424 | 5 1 1 9.58
425 | 6 2 1 11.46
426 | 7 3 1 13.25
427 | 8 0 2 10.37
428 | 9 1 2 10.59
429 | 10 2 2 10.24
430 | 11 3 2 8.33
431 | 12 0 3 12.08
432 | 13 1 3 11.89
433 | 14 2 3 11.60
434 | 15 3 3 11.51, 'y', 'groups', 'blocks')
435 |
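    | The same block design input is accepted by the other Friedman-based post hoc
    | tests, e.g. (a sketch, reusing the ``data`` matrix defined above):
    | 
    | .. code:: python
    | 
    | >>> sp.posthoc_conover_friedman(data)
    | 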
436 | DataFrames
437 | ^^^^^^^^^^
438 |
439 | If you are using DataFrames, you need to pass the names of the columns
440 | containing variable values to a post hoc function:
441 |
442 | .. code:: python
443 |
444 | >>> import statsmodels.api as sa
445 | >>> import scikit_posthocs as sp
446 | >>> df = sa.datasets.get_rdataset('iris').data
447 | >>> df.columns = df.columns.str.replace('.', '', regex=False)
448 | >>> sp.posthoc_conover(df, val_col='SepalWidth', group_col='Species', p_adjust='holm')
449 |
450 | ``val_col`` and ``group_col`` arguments specify the names of the columns
451 | containing dependent (response) and independent (grouping) variable values.
452 |
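    | With melted NumPy arrays, integer column indices can be used instead of
    | names. A minimal sketch, assuming ``nd`` is a two-column array whose first
    | column holds the values and whose second column holds the group labels:
    | 
    | .. code:: python
    | 
    | >>> sp.posthoc_mannwhitney(nd, val_col=0, group_col=1)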
453 |
454 | Significance plots
455 | ------------------
456 |
457 | P values can be plotted using a heatmap:
458 |
459 | .. code:: python
460 |
461 | >>> pc = sp.posthoc_conover(x, val_col='values', group_col='groups')
462 | >>> heatmap_args = {'linewidths': 0.25, 'linecolor': '0.5', 'clip_on': False, 'square': True, 'cbar_ax_bbox': [0.80, 0.35, 0.04, 0.3]}
463 | >>> sp.sign_plot(pc, **heatmap_args)
464 |
465 | .. image:: images/plot-conover.png
466 |
467 | Custom colormap applied to a plot:
468 |
469 | .. code:: python
470 |
471 | >>> pc = sp.posthoc_conover(x, val_col='values', group_col='groups')
472 | >>> # Format: diagonal, non-significant, p<0.001, p<0.01, p<0.05
473 | >>> cmap = ['1', '#fb6a4a', '#08306b', '#4292c6', '#c6dbef']
474 | >>> heatmap_args = {'cmap': cmap, 'linewidths': 0.25, 'linecolor': '0.5', 'clip_on': False, 'square': True, 'cbar_ax_bbox': [0.80, 0.35, 0.04, 0.3]}
475 | >>> sp.sign_plot(pc, **heatmap_args)
476 |
477 | .. image:: images/plot-conover-custom-cmap.png
478 |
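    | The plotting submodule also provides critical difference diagrams through
    | the ``critical_difference_diagram`` function, which places the average rank
    | of each group on an axis and joins groups that do not differ significantly
    | with a crossbar. A minimal sketch, assuming ``data`` is the melted block
    | design DataFrame from the previous section:
    | 
    | .. code:: python
    | 
    | >>> avg_rank = data.groupby('blocks').y.rank(pct=True).groupby(data.groups).mean()
    | >>> pc = sp.posthoc_nemenyi_friedman(data, y_col='y', block_col='blocks', group_col='groups', melted=True)
    | >>> sp.critical_difference_diagram(ranks=avg_rank, sig_matrix=pc)
    | 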
479 | Citing
480 | ------
481 |
482 | If you want to cite *scikit-posthocs*, please refer to the publication in
483 | the `Journal of Open Source Software <https://doi.org/10.21105/joss.01169>`_:
484 |
485 | Terpilowski, M. (2019). scikit-posthocs: Pairwise multiple comparison tests in
486 | Python. Journal of Open Source Software, 4(36), 1169, https://doi.org/10.21105/joss.01169
487 |
488 | .. code::
489 |
490 | @ARTICLE{Terpilowski2019,
491 | title = {scikit-posthocs: Pairwise multiple comparison tests in Python},
492 | author = {Terpilowski, Maksim},
493 | journal = {The Journal of Open Source Software},
494 | volume = {4},
495 | number = {36},
496 | pages = {1169},
497 | year = {2019},
498 | doi = {10.21105/joss.01169}
499 | }
500 |
501 | Acknowledgement
502 | ---------------
503 |
504 | Thorsten Pohlert, PMCMR author and maintainer
505 |
--------------------------------------------------------------------------------
/scikit_posthocs/_plotting.py:
--------------------------------------------------------------------------------
1 | from copy import deepcopy
2 | from typing import Dict, List, Optional, Set, Tuple, Union
3 |
4 | import numpy as np
5 | from matplotlib import colors, pyplot
6 | from matplotlib.axes import Axes
7 | from matplotlib.colorbar import Colorbar, ColorbarBase
8 | from matplotlib.colors import ListedColormap
9 | from pandas import DataFrame, Index, Series
10 | from seaborn import heatmap
11 |
12 |
13 | def sign_array(p_values: Union[List, np.ndarray, DataFrame], alpha: float = 0.05) -> np.ndarray:
14 | """Significance array.
15 |
16 | Converts an array with p values to a significance array where
17 | 0 is False (not significant), 1 is True (significant),
18 | and -1 is for diagonal elements.
19 |
20 | Parameters
21 | ----------
22 | p_values : Union[List, np.ndarray, DataFrame]
23 | Any object exposing the array interface and containing
24 | p values.
25 |
26 | alpha : float = 0.05
27 | Significance level. Default is 0.05.
28 |
29 | Returns
30 | -------
31 | result : numpy.ndarray
32 | Array where 0 is False (not significant), 1 is True (significant),
33 | and -1 is for diagonal elements.
34 |
35 | Examples
36 | --------
37 | >>> p_values = np.array([[ 1. , 0.00119517, 0.00278329],
38 | [ 0.00119517, 1. , 0.18672227],
39 | [ 0.00278329, 0.18672227, 1. ]])
40 | >>> ph.sign_array(p_values)
41 | array([[-1, 1, 1],
42 | [ 1, -1, 0],
43 | [ 1, 0, -1]])
44 | """
45 | sig_array = deepcopy(np.array(p_values))
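    |     # Replace exact zeros with a tiny positive value so the thresholding
    |     # below marks them as significant rather than as "not significant".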
46 | sig_array[sig_array == 0] = 1e-10
47 | sig_array[sig_array > alpha] = 0
48 | sig_array[(sig_array < alpha) & (sig_array > 0)] = 1
49 | np.fill_diagonal(sig_array, -1)
50 |
51 | return sig_array
52 |
53 |
54 | def sign_table(
55 | p_values: Union[List, np.ndarray, DataFrame], lower: bool = True, upper: bool = True
56 | ) -> Union[DataFrame, np.ndarray]:
57 | """Significance table.
58 |
59 | Returns table that can be used in a publication. P values are replaced
60 | with asterisks: \\* - p < 0.05, \\*\\* - p < 0.01, \\*\\*\\* - p < 0.001.
61 |
62 | Parameters
63 | ----------
64 | p_values : Union[List, np.ndarray, DataFrame]
65 | Any object exposing the array interface and containing
66 | p values.
67 |
68 | lower : bool
69 | Defines whether to return the lower triangle.
70 |
71 | upper : bool
72 | Defines whether to return the upper triangle.
73 |
74 | Returns
75 | -------
76 | result : Union[DataFrame, np.ndarray]
77 | P values masked with asterisks.
78 |
79 | Examples
80 | --------
81 | >>> p_values = np.array([[-1. , 0.00119517, 0.00278329],
82 | [ 0.00119517, -1. , 0.18672227],
83 | [ 0.00278329, 0.18672227, -1. ]])
84 | >>> ph.sign_table(p_values)
85 | array([['-', '**', '**'],
86 | ['**', '-', 'NS'],
87 | ['**', 'NS', '-']], dtype=object)
88 | """
89 | if not any([lower, upper]):
90 | raise ValueError("Either lower or upper triangle must be returned")
91 |
92 | pv = DataFrame(p_values, copy=True) if not isinstance(p_values, DataFrame) else p_values.copy()
93 |
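    |     # Build boolean masks on the numeric p values first; the frame is cast
    |     # to strings below before the masks are applied.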
94 | ns = pv > 0.05
95 | three = (pv < 0.001) & (pv >= 0)
96 | two = (pv < 0.01) & (pv >= 0.001)
97 | one = (pv < 0.05) & (pv >= 0.01)
98 |
99 | pv = pv.astype(str)
100 | pv[ns] = "NS"
101 | pv[three] = "***"
102 | pv[two] = "**"
103 | pv[one] = "*"
104 |
105 | np.fill_diagonal(pv.values, "-")
106 | if not lower:
107 | pv.values[np.tril_indices(pv.shape[0], -1)] = ""
108 | elif not upper:
109 | pv.values[np.triu_indices(pv.shape[0], 1)] = ""
110 |
111 | return pv
112 |
113 |
114 | def sign_plot(
115 | x: Union[List, np.ndarray, DataFrame],
116 | g: Union[List, np.ndarray, None] = None,
117 | flat: bool = False,
118 | labels: bool = True,
119 | cmap: Optional[List] = None,
120 | cbar_ax_bbox: Optional[Tuple[float, float, float, float]] = None,
121 | ax: Optional[Axes] = None,
122 | **kwargs,
123 | ) -> Union[Axes, Tuple[Axes, Colorbar]]:
124 | """Significance plot, a heatmap of p values (based on Seaborn).
125 |
126 | Parameters
127 | ----------
128 | x : Union[List, np.ndarray, DataFrame]
129 | If `flat` is False (default), `x` must be a square array, any object
130 | exposing the array interface, containing p values. If `flat` is True,
131 | `x` must be a sign_array
132 | (returned by :py:meth:`scikit_posthocs.sign_array` function).
133 |
134 | g : Union[List, np.ndarray]
135 | An array, any object exposing the array interface, containing
136 | group names.
137 |
138 | flat : bool
139 | If `flat` is True, plots a significance array as a heatmap using
140 | seaborn. If `flat` is False (default), plots an array of p values.
141 | Non-flat mode is useful if you need to differentiate significance
142 | levels visually. It is the preferred mode.
143 |
144 | labels : bool
145 | Plot axes labels (default) or not.
146 |
147 | cmap : list
148 | 1) If flat is False (default):
149 | List consisting of five elements, that will be exported to
150 | ListedColormap method of matplotlib. First is for diagonal
151 | elements, second is for non-significant elements, third is for
152 | p < 0.001, fourth is for p < 0.01, fifth is for p < 0.05.
153 |
154 | 2) If flat is True:
155 | List consisting of three elements, that will be exported to
156 | ListedColormap method of matplotlib. First is for diagonal
157 | elements, second is for non-significant elements, third is for
158 | significant ones.
159 | 3) If not defined, default colormaps will be used.
160 |
161 | cbar_ax_bbox : list
162 | Colorbar axes position rect [left, bottom, width, height] where
163 | all quantities are in fractions of figure width and height.
164 | Refer to `matplotlib.figure.Figure.add_axes` for more information.
165 | Default is [0.95, 0.35, 0.04, 0.3].
166 |
167 | ax : SubplotBase
168 | Axes in which to draw the plot, otherwise use the currently-active
169 | Axes.
170 |
171 | kwargs
172 | Keyword arguments to be passed to seaborn heatmap method. These
173 | keyword args cannot be used: cbar, vmin, vmax, center.
174 |
175 | Returns
176 | -------
177 | ax : matplotlib.axes._subplots.AxesSubplot
178 | Axes object with the heatmap.
179 |
180 | cbar : matplotlib.colorbar.Colorbar
181 | ColorBar object if `flat` is set to False.
182 |
183 | Examples
184 | --------
185 | >>> x = np.array([[ 1, 1, 1],
186 | [ 1, 1, 0],
187 | [ 1, 0, 1]])
188 | >>> ph.sign_plot(x, flat = True)
189 | """
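    |     # Drop keyword arguments that would conflict with the fixed heatmap
    |     # settings used below (cbar, vmin, vmax, and center are set explicitly).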
190 | for key in ["cbar", "vmin", "vmax", "center"]:
191 | if key in kwargs:
192 | del kwargs[key]
193 |
194 | if isinstance(x, DataFrame):
195 | df = x.copy()
196 | else:
197 |         g = g if g is not None else np.arange(len(x))
198 | df = DataFrame(x, index=Index(g), columns=Index(g), copy=True)
199 |
200 | dtype = df.values.dtype
201 |
202 | if not np.issubdtype(dtype, np.integer) and flat:
203 | raise ValueError("X should be a sign_array or DataFrame of integers")
204 | elif not np.issubdtype(dtype, np.floating) and not flat:
205 | raise ValueError("X should be an array or DataFrame of float p values")
206 |
207 | if not cmap and flat:
208 | # format: diagonal, non-significant, significant
209 | cmap = ["1", "#fbd7d4", "#1a9641"]
210 | elif not cmap:
211 | # format: diagonal, non-significant, p<0.001, p<0.01, p<0.05
212 | cmap = ["1", "#fbd7d4", "#005a32", "#238b45", "#a1d99b"]
213 |
214 | if flat:
215 | np.fill_diagonal(df.values, -1)
216 | hax = heatmap(df, vmin=-1, vmax=1, cmap=ListedColormap(cmap), cbar=False, ax=ax, **kwargs)
217 | if not labels:
218 | hax.set_xlabel("")
219 | hax.set_ylabel("")
220 | return hax
221 |
222 | else:
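    |         # Recode p values into discrete color levels: 1 for p < 0.001,
    |         # 2 for p < 0.01, 3 for p < 0.05, and 0 for non-significant.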
223 | xc = df.values.copy()
224 | df[(xc < 0.001) & (xc >= 0)] = 1
225 | df[(xc < 0.01) & (xc >= 0.001)] = 2
226 | df[(xc < 0.05) & (xc >= 0.01)] = 3
227 | df[(xc >= 0.05)] = 0
228 |
229 | np.fill_diagonal(df.values, -1)
230 |
231 | if len(cmap) != 5:
232 | raise ValueError("Cmap list must contain 5 items")
233 |
234 | hax = heatmap(
235 | df,
236 | vmin=-1,
237 | vmax=3,
238 | cmap=ListedColormap(cmap),
239 | center=1,
240 | cbar=False,
241 | ax=ax,
242 | **kwargs,
243 | )
244 | if not labels:
245 | hax.set_xlabel("")
246 | hax.set_ylabel("")
247 |
248 | cbar_ax = hax.figure.add_axes(cbar_ax_bbox or (0.95, 0.35, 0.04, 0.3))
249 | cbar = ColorbarBase(
250 | cbar_ax,
251 | cmap=(ListedColormap(cmap[2:] + [cmap[1]])),
252 | norm=colors.NoNorm(),
253 | boundaries=[0, 1, 2, 3, 4],
254 | )
255 | cbar.set_ticks(
256 | list(np.linspace(0, 3, 4)),
257 | labels=["p < 0.001", "p < 0.01", "p < 0.05", "NS"],
258 | )
259 |
260 | cbar.outline.set_linewidth(1)
261 | cbar.outline.set_edgecolor("0.5")
262 | cbar.ax.tick_params(size=0)
263 |
264 | return hax, cbar
265 |
266 |
267 | def _find_maximal_cliques(adj_matrix: DataFrame) -> List[Set]:
268 | """Wrapper function over the recursive Bron-Kerbosch algorithm.
269 |
270 | Will be used to find points that are under the same crossbar in critical
271 | difference diagrams.
272 |
273 | Parameters
274 | ----------
275 | adj_matrix : pandas.DataFrame
276 | Binary matrix with 1 if row item and column item do NOT significantly
277 | differ. Values in the main diagonal are not considered.
278 |
279 | Returns
280 | -------
281 | list[set]
282 | Largest fully connected subgraphs, represented as sets of indices of
283 | adj_matrix.
284 |
285 | Raises
286 | ------
287 | ValueError
288 | If the input matrix is empty or not symmetric.
289 | If the input matrix is not binary.
290 |
291 | """
292 | if (adj_matrix.index != adj_matrix.columns).any():
293 | raise ValueError("adj_matrix must be symmetric, indices do not match")
294 | if not adj_matrix.isin((0, 1)).values.all():
295 | raise ValueError("Input matrix must be binary")
296 | if adj_matrix.empty or not (adj_matrix.T == adj_matrix).values.all():
297 | raise ValueError("Input matrix must be non-empty and symmetric")
298 |
299 | result = []
300 | _bron_kerbosch(
301 | current_clique=set(),
302 | candidates=set(adj_matrix.index),
303 | visited=set(),
304 | adj_matrix=adj_matrix,
305 | result=result,
306 | )
307 | return result
308 |
309 |
310 | def _bron_kerbosch(
311 | current_clique: Set,
312 | candidates: Set,
313 | visited: Set,
314 | adj_matrix: DataFrame,
315 | result: List[Set],
316 | ) -> None:
317 | """Recursive algorithm to find the maximal fully connected subgraphs.
318 |
319 | See [1]_ for more information.
320 |
321 | Parameters
322 | ----------
323 | current_clique : set
324 | A set of vertices known to be fully connected.
325 | candidates : set
326 | Set of vertices that could potentially be added to the clique.
327 | visited : set
328 | Set of vertices already known to be part of another previously explored
329 | clique, that is not current_clique.
330 | adj_matrix : pandas.DataFrame
331 | Binary matrix with 1 if row item and column item do NOT significantly
332 | differ. Diagonal must be zeroed.
333 | result : list[set]
334 | List where to append the maximal cliques.
335 |
336 | Returns
337 | -------
338 | None
339 |
340 | References
341 | ----------
342 | .. [1] https://en.wikipedia.org/wiki/Bron%E2%80%93Kerbosch_algorithm
343 | """
344 | while candidates:
345 | v = candidates.pop()
346 | _bron_kerbosch(
347 | current_clique | {v},
348 | # Restrict candidate vertices to the neighbors of v
349 | {n for n in candidates if adj_matrix.loc[v, n]},
350 | # Restrict visited vertices to the neighbors of v
351 | {n for n in visited if adj_matrix.loc[v, n]},
352 | adj_matrix,
353 | result,
354 | )
355 | visited.add(v)
356 |
357 |     # We do not need to report a clique if a child call already did it.
358 |     if not visited:
359 |         # No visited vertex can extend current_clique: it is maximal and not yet reported.
360 | result.append(current_clique)
361 |
362 |
363 | def critical_difference_diagram(
364 | ranks: Union[dict, Series],
365 | sig_matrix: DataFrame,
366 | *,
367 | alpha: float = 0.05,
368 | ax: Optional[Axes] = None,
369 | label_fmt_left: str = "{label} ({rank:.2g})",
370 | label_fmt_right: str = "({rank:.2g}) {label}",
371 | label_props: Optional[dict] = None,
372 | marker_props: Optional[dict] = None,
373 | elbow_props: Optional[dict] = None,
374 | crossbar_props: Optional[dict] = None,
375 | color_palette: Union[Dict[str, str], List, None] = None,
376 | text_h_margin: float = 0.01,
377 | left_only: bool = False,
378 | ) -> Dict[str, list]:
379 | """Plot a Critical Difference diagram from ranks and post-hoc results.
380 |
381 | The diagram arranges the average ranks of multiple groups on the x axis
382 | in order to facilitate performance comparisons between them. The groups
383 | that could not be statistically deemed as different are linked by a
384 | horizontal crossbar [1]_, [2]_.
385 |
386 | ::
387 |
388 | rank markers
389 | X axis ---------O----O-------------------O-O------------O---------
390 | |----| | | |
391 | | | |---crossbar---|
392 | clf1 ----| | | | |---- clf3
393 | clf2 ---------| | |----------------- clf4
394 | |------------------- clf5
395 | |____|
396 | text_h_margin
397 |
398 | In the drawing above, the two crossbars indicate that clf1 and clf2 cannot
399 | be statistically differentiated, the same occurring between clf3, clf4 and
400 | clf5. However, clf1 and clf2 are each significantly lower ranked than clf3,
401 | clf4 and clf5.
402 |
403 | Parameters
404 | ----------
405 | ranks : dict or Series
406 | Indicates the rank value for each sample or estimator (as keys or index).
407 |
408 | sig_matrix : DataFrame
409 | The corresponding p-value matrix outputted by post-hoc tests, with
410 | indices matching the labels in the ranks argument.
411 |
412 | alpha : float, optional = 0.05
413 | Significance level. Default is 0.05.
414 | Values below this will be considered statistically different.
415 |
416 | ax : matplotlib.SubplotBase, optional
417 | The object in which the plot will be built. Gets the current Axes
418 | by default (if None is passed).
419 |
420 | label_fmt_left : str, optional
421 | The format string to apply to the labels on the left side. The keywords
422 | label and rank can be used to specify the sample/estimator name and
423 | rank value, respectively, by default '{label} ({rank:.2g})'.
424 |
425 | label_fmt_right : str, optional
426 | The same, but for the labels on the right side of the plot.
427 | By default '({rank:.2g}) {label}'.
428 |
429 | label_props : dict, optional
430 | Parameters to be passed to pyplot.text() when creating the labels,
431 | by default None.
432 |
433 | marker_props : dict, optional
434 | Parameters to be passed to pyplot.scatter() when plotting the rank
435 | markers on the axis, by default None.
436 |
437 | elbow_props : dict, optional
438 | Parameters to be passed to pyplot.plot() when creating the elbow lines,
439 | by default None.
440 |
441 | crossbar_props : dict, optional
442 | Parameters to be passed to pyplot.plot() when creating the crossbars
443 | that indicate lack of statistically significant difference. By default
444 | None.
445 |
446 |     color_palette : dict or list, optional
447 |         Specific colors for each category, given as a label-to-color mapping or a list of colors.
448 |
449 | text_h_margin : float, optional
450 | Space between the text labels and the nearest vertical line of an
451 | elbow, by default 0.01.
452 |
453 |     left_only : bool, optional
454 |         Set all labels in a single left-sided block instead of splitting them
455 |         into two blocks, one for the left and one for the right.
456 |
457 |
458 | Returns
459 | -------
460 | dict[str, list[matplotlib.Artist]]
461 | Lists of Artists created.
462 |
463 | Examples
464 | --------
465 | See the :doc:`/tutorial`.
466 |
467 | References
468 | ----------
469 | .. [1] Demšar, J. (2006). Statistical comparisons of classifiers over multiple
470 | data sets. The Journal of Machine learning research, 7, 1-30.
471 |
472 | .. [2] https://mirkobunse.github.io/CriticalDifferenceDiagrams.jl/stable/
473 | """
474 |     # Check color_palette consistency with the given ranks
475 | if not color_palette or len(color_palette) == 0:
476 | pass
477 | elif isinstance(color_palette, Dict) and (
478 | (len(set(ranks.keys()) & set(color_palette.keys()))) == len(ranks)
479 | ):
480 | pass
481 | elif isinstance(color_palette, List) and (len(ranks) <= len(color_palette)):
482 | pass
483 | else:
484 | raise ValueError("color_palette keys are not consistent, or list size too small")
485 |
486 | elbow_props = elbow_props or {}
487 | marker_props = {"zorder": 3, **(marker_props or {})}
488 | label_props = {"va": "center", **(label_props or {})}
489 | crossbar_props = {
490 | "color": "k",
491 | "zorder": 3,
492 | "linewidth": 2,
493 | **(crossbar_props or {}),
494 | }
495 |
496 | ax = ax or pyplot.gca()
497 | ax.yaxis.set_visible(False)
498 | ax.spines["right"].set_visible(False)
499 | ax.spines["left"].set_visible(False)
500 | ax.spines["bottom"].set_visible(False)
501 | ax.xaxis.set_ticks_position("top")
502 | ax.spines["top"].set_position("zero")
503 |
504 | # lists of artists to be returned
505 | markers = []
506 | elbows = []
507 | labels = []
508 | crossbars = []
509 |
510 | # True if pairwise comparison is NOT significant
511 | adj_matrix = DataFrame(
512 | 1 - sign_array(sig_matrix, alpha=alpha),
513 | index=sig_matrix.index,
514 | columns=sig_matrix.columns,
515 | dtype=bool,
516 | )
517 |
518 | ranks = Series(ranks).sort_values() # Standardize if ranks is dict
519 | if left_only:
520 | points_left = ranks
521 | else:
522 | points_left, points_right = (
523 | ranks.iloc[: len(ranks) // 2],
524 | ranks.iloc[len(ranks) // 2 :],
525 | )
526 | # points_left, points_right = np.array_split(ranks.sort_values(), 2)
527 |
528 | # Sets of points under the same crossbar
529 | crossbar_sets = _find_maximal_cliques(adj_matrix)
530 |
531 | # Sort by lowest rank and filter single-valued sets
532 | crossbar_sets = sorted(
533 | (x for x in crossbar_sets if len(x) > 1), key=lambda x: ranks[list(x)].min()
534 | )
535 |
536 | # Create stacking of crossbars: for each level, try to fit the crossbar,
537 | # so that it does not intersect with any other in the level. If it does not
538 | # fit in any level, create a new level for it.
539 | crossbar_levels: list[list[set]] = []
540 | for bar in crossbar_sets:
541 | for level, bars_in_level in enumerate(crossbar_levels):
542 | if not any(bool(bar & bar_in_lvl) for bar_in_lvl in bars_in_level):
543 | ypos = -level - 1
544 | bars_in_level.append(bar)
545 | break
546 | else:
547 | ypos = -len(crossbar_levels) - 1
548 | crossbar_levels.append([bar])
549 |
550 | crossbars.append(
551 | ax.plot(
552 | # Adding a separate line between each pair enables showing a
553 | # marker over each elbow with crossbar_props={'marker': 'o'}.
554 | [ranks.loc[i] for i in bar],
555 | [ypos] * len(bar),
556 | **crossbar_props,
557 | )
558 | )
559 |
560 | lowest_crossbar_ypos = -len(crossbar_levels)
561 |
562 | def plot_items(points, xpos, label_fmt, color_palette, label_props):
563 | """Plot each marker + elbow + label."""
564 | ypos = lowest_crossbar_ypos - 1
565 | for idx, (label, rank) in enumerate(points.items()):
566 | if not color_palette or len(color_palette) == 0:
567 | elbow, *_ = ax.plot(
568 | [xpos, rank, rank],
569 | [ypos, ypos, 0],
570 | **elbow_props,
571 | )
572 | else:
573 | elbow, *_ = ax.plot(
574 | [xpos, rank, rank],
575 | [ypos, ypos, 0],
576 | c=color_palette[label]
577 | if isinstance(color_palette, Dict)
578 | else color_palette[idx],
579 | **elbow_props,
580 | )
581 |
582 | elbows.append(elbow)
583 | curr_color = elbow.get_color()
584 | markers.append(ax.scatter(rank, 0, **{"color": curr_color, **marker_props}))
585 | labels.append(
586 | ax.text(
587 | xpos,
588 | ypos,
589 | label_fmt.format(label=label, rank=rank),
590 | color=curr_color,
591 | **label_props,
592 | )
593 | )
594 | ypos -= 1
595 |
596 | plot_items(
597 | points_left,
598 | xpos=points_left.iloc[0] - text_h_margin,
599 | label_fmt=label_fmt_left,
600 | color_palette=color_palette,
601 | label_props={
602 | "ha": "right",
603 | **label_props,
604 | },
605 | )
606 |
607 | if not left_only:
608 | plot_items(
609 | points_right[::-1],
610 | xpos=points_right.iloc[-1] + text_h_margin,
611 | label_fmt=label_fmt_right,
612 | color_palette=list(reversed(color_palette))
613 | if isinstance(color_palette, list)
614 | else color_palette,
615 | label_props={"ha": "left", **label_props},
616 | )
617 |
618 | return {
619 | "markers": markers,
620 | "elbows": elbows,
621 | "labels": labels,
622 | "crossbars": crossbars,
623 | }
624 |
--------------------------------------------------------------------------------
/tests/test_posthocs.py:
--------------------------------------------------------------------------------
1 | import os
2 | import sys
3 | import unittest
4 | import matplotlib as mpl
5 | import scikit_posthocs._posthocs as sp
6 | import scikit_posthocs._omnibus as som
7 | import scikit_posthocs._outliers as so
8 | import scikit_posthocs._plotting as splt
9 | import scikit_posthocs._global as spg
10 | import seaborn as sb
11 | import numpy as np
12 | import matplotlib.axes as ma
13 | from pandas import DataFrame, Series
14 |
15 | if os.environ.get("DISPLAY", "") == "":
16 | print("No display found. Using non-interactive Agg backend")
17 | mpl.use("Agg")
18 | sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), "..")))
19 |
20 |
21 | class TestPosthocs(unittest.TestCase):
22 | # Global tests
23 | def test_global_simes_test(self):
24 | a = np.array([0.9, 0.1, 0.01, 0.99, 1.0, 0.02, 0.04])
25 | result = spg.global_simes_test(a)
26 | self.assertAlmostEqual(result, 0.07)
27 |
28 | def test_global_f_test(self):
29 | a = np.array([0.9, 0.1, 0.01, 0.99, 1.0, 0.02, 0.04])
30 | result, _ = spg.global_f_test(a)
31 | self.assertAlmostEqual(result, 0.01294562)
32 |
33 | # Plotting tests
34 | def test_sign_array(self):
35 | p_values = np.array(
36 | [
37 | [0.0, 0.00119517, 0.00278329],
38 | [0.00119517, 0.0, 0.18672227],
39 | [0.00278329, 0.18672227, 0.0],
40 | ]
41 | )
42 | test_results = splt.sign_array(p_values)
43 | correct_results = np.array([[-1, 1, 1], [1, -1, 0], [1, 0, -1]])
44 | self.assertTrue(np.all(test_results == correct_results))
45 |
46 | def test_sign_table(self):
47 | p_values = np.array(
48 | [
49 | [1.0, 0.00119517, 0.00278329],
50 | [0.00119517, 1.0, 0.18672227],
51 | [0.00278329, 0.18672227, 1.0],
52 | ]
53 | )
54 |
55 | correct_results = np.array(
56 | [["-", "**", "**"], ["**", "-", "NS"], ["**", "NS", "-"]], dtype=object
57 | )
58 | correct_resultsl = np.array(
59 | [["-", "", ""], ["**", "-", ""], ["**", "NS", "-"]], dtype=object
60 | )
61 | correct_resultsu = np.array(
62 | [["-", "**", "**"], ["", "-", "NS"], ["", "", "-"]], dtype=object
63 | )
64 |
65 | with self.assertRaises(ValueError):
66 | splt.sign_table(p_values, lower=False, upper=False)
67 |
68 | self.assertTrue(
69 | np.all(splt.sign_table(p_values, lower=False, upper=True) == correct_resultsu)
70 | )
71 | self.assertTrue(
72 | np.all(splt.sign_table(p_values, lower=True, upper=False) == correct_resultsl)
73 | )
74 | self.assertTrue(
75 | np.all(splt.sign_table(p_values, lower=True, upper=True) == correct_results)
76 | )
77 |
78 | def test_sign_plot(self):
79 | x = np.array([[1, 1, 1], [1, 1, 0], [1, 0, 1]])
80 | a = splt.sign_plot(x, flat=True, labels=False)
81 | with self.assertRaises(ValueError):
82 | splt.sign_plot(x.astype(float), flat=True, labels=False)
83 | self.assertTrue(isinstance(a, ma._axes.Axes))
84 |
85 | def test_sign_plot_nonflat(self):
86 | x = np.array(
87 | [
88 | [1.0, 0.00119517, 0.00278329],
89 | [0.00119517, 1.0, 0.18672227],
90 | [0.00278329, 0.18672227, 1.0],
91 | ]
92 | )
93 | a, cbar = splt.sign_plot(x, cbar=True, labels=False)
94 |
95 | with self.assertRaises(ValueError):
96 | splt.sign_plot(x, cmap=[1, 1], labels=False)
97 | with self.assertRaises(ValueError):
98 | splt.sign_plot(x.astype(np.int64), labels=False)
99 |
100 | self.assertTrue(
101 | isinstance(a, ma._axes.Axes) and isinstance(cbar, mpl.colorbar.ColorbarBase)
102 | )
103 |
104 | def test_find_maximal_cliques_input_validation(self):
105 | with self.assertRaisesRegex(ValueError, ".*indices do not match"):
106 | splt._find_maximal_cliques(
107 | DataFrame(
108 | [[0, 1], [1, 0]],
109 | index=["a", "b"],
110 | columns=["b", "a"],
111 | )
112 | )
113 | with self.assertRaises(ValueError, msg="Input matrix must be binary"):
114 | splt._find_maximal_cliques(DataFrame([[0, 3], [3, 0]]))
115 | with self.assertRaisesRegex(ValueError, ".*empty and symmetric"):
116 | splt._find_maximal_cliques(DataFrame())
117 | with self.assertRaisesRegex(ValueError, ".*empty and symmetric"):
118 | splt._find_maximal_cliques(DataFrame([[1, 0], [1, 0]]))
119 |
120 | def test_find_maximal_cliques_1x1(self):
121 | adj_matrix = DataFrame([[0]], columns=["a"], index=["a"])
122 | expected = [{"a"}]
123 | self.assertEqual(splt._find_maximal_cliques(adj_matrix), expected)
124 |
125 | def test_find_maximal_cliques_2x2(self):
126 | adj_matrix = DataFrame(
127 | [[0, 1], [1, 0]],
128 | columns=["a", "b"],
129 | index=["a", "b"],
130 | )
131 | expected = [{"a", "b"}]
132 | self.assertEqual(splt._find_maximal_cliques(adj_matrix), expected)
133 |
134 | def test_find_maximal_cliques_3x3(self):
135 | adj_matrix = DataFrame(
136 | [[0, 0, 1], [0, 0, 0], [1, 0, 0]],
137 | columns=["a", "b", "c"],
138 | index=["a", "b", "c"],
139 | )
140 | expected = [{"a", "c"}, {"b"}]
141 | self.assertEqual(
142 | set(map(frozenset, splt._find_maximal_cliques(adj_matrix))),
143 | set(map(frozenset, expected)),
144 | )
145 |
146 | def test_find_maximal_cliques_6x6(self):
147 | adj_matrix = DataFrame(
148 | [
149 | [0, 1, 0, 0, 0, 0],
150 | [1, 0, 1, 1, 1, 0],
151 | [0, 1, 0, 1, 1, 0],
152 | [0, 1, 1, 0, 1, 0],
153 | [0, 1, 1, 1, 0, 0],
154 | [0, 0, 0, 0, 0, 0],
155 | ]
156 | )
157 | expected = [{0, 1}, {1, 2, 3, 4}, {5}]
158 | self.assertEqual(
159 | set(map(frozenset, splt._find_maximal_cliques(adj_matrix))),
160 | set(map(frozenset, expected)),
161 | )
162 |
163 | def test_cd_diagram_number_of_artists(self):
164 | index = list("abcdef")
165 | ranks = Series([2.1, 1.2, 4.5, 3.2, 5.7, 6.5], index=index)
166 | sig_matrix = DataFrame(
167 | [
168 | [0.08, 0.08, 0.01, 0.01, 0.01, 0.01],
169 | [0.08, 0.08, 0.08, 0.08, 0.08, 0.01],
170 | [0.01, 0.08, 0.08, 0.08, 0.08, 0.01],
171 | [0.01, 0.08, 0.08, 0.08, 0.08, 0.01],
172 | [0.01, 0.08, 0.08, 0.08, 0.08, 0.01],
173 | [0.01, 0.01, 0.01, 0.01, 0.01, 0.08],
174 | ],
175 | index=index,
176 | columns=index,
177 | )
178 |
179 | output = splt.critical_difference_diagram(ranks, sig_matrix)
180 | self.assertEqual(len(output["markers"]), len(ranks))
181 | self.assertEqual(len(output["elbows"]), len(ranks))
182 | self.assertEqual(len(output["labels"]), len(ranks))
183 | self.assertEqual(len(output["crossbars"]), 2)
184 |
185 | # Outliers tests
186 | def test_outliers_iqr(self):
187 | x = np.array([4, 5, 6, 10, 12, 4, 3, 1, 2, 3, 23, 5, 3])
188 |
189 | x_filtered = np.array([4, 5, 6, 10, 4, 3, 1, 2, 3, 5, 3])
190 | indices = np.delete(np.arange(13), [4, 10])
191 | outliers_indices = np.array([4, 10])
192 | outliers = np.array([12, 23])
193 |
194 | test_outliers = so.outliers_iqr(x, ret="outliers")
195 | test_outliers_indices = so.outliers_iqr(x, ret="outliers_indices")
196 | test_indices = so.outliers_iqr(x, ret="indices")
197 | test_filtered = so.outliers_iqr(x, ret="filtered")
198 |
199 | self.assertTrue(
200 | np.all(test_outliers == outliers)
201 | and np.all(test_outliers_indices == outliers_indices)
202 | and np.all(test_indices == indices)
203 | and np.all(test_filtered == x_filtered)
204 | )
205 |
206 | def test_outliers_grubbs(self):
207 | x = np.array([199.31, 199.53, 200.19, 200.82, 201.92, 201.95, 202.18, 245.57])
208 | test_results = so.outliers_grubbs(x)
209 | correct_results = np.array([199.31, 199.53, 200.19, 200.82, 201.92, 201.95, 202.18])
210 | self.assertTrue(so.outliers_grubbs(x, hypo=True))
211 | self.assertTrue(np.all(test_results == correct_results))
212 |
213 | def test_outliers_tietjen(self):
214 | x = np.array(
215 | [
216 | -1.40,
217 | -0.44,
218 | -0.30,
219 | -0.24,
220 | -0.22,
221 | -0.13,
222 | -0.05,
223 | 0.06,
224 | 0.10,
225 | 0.18,
226 | 0.20,
227 | 0.39,
228 | 0.48,
229 | 0.63,
230 | 1.01,
231 | ]
232 | )
233 | test_results = so.outliers_tietjen(x, 2)
234 | correct_results = np.array(
235 | [
236 | -0.44,
237 | -0.3,
238 | -0.24,
239 | -0.22,
240 | -0.13,
241 | -0.05,
242 | 0.06,
243 | 0.1,
244 | 0.18,
245 | 0.2,
246 | 0.39,
247 | 0.48,
248 | 0.63,
249 | ]
250 | )
251 | self.assertTrue(so.outliers_tietjen(x, 2, hypo=True))
252 | self.assertTrue(np.all(test_results == correct_results))
253 |
254 | def test_outliers_gesd(self):
255 | x = np.array(
256 | [
257 | -0.25,
258 | 0.68,
259 | 0.94,
260 | 1.15,
261 | 1.2,
262 | 1.26,
263 | 1.26,
264 | 1.34,
265 | 1.38,
266 | 1.43,
267 | 1.49,
268 | 1.49,
269 | 1.55,
270 | 1.56,
271 | 1.58,
272 | 1.65,
273 | 1.69,
274 | 1.7,
275 | 1.76,
276 | 1.77,
277 | 1.81,
278 | 1.91,
279 | 1.94,
280 | 1.96,
281 | 1.99,
282 | 2.06,
283 | 2.09,
284 | 2.1,
285 | 2.14,
286 | 2.15,
287 | 2.23,
288 | 2.24,
289 | 2.26,
290 | 2.35,
291 | 2.37,
292 | 2.4,
293 | 2.47,
294 | 2.54,
295 | 2.62,
296 | 2.64,
297 | 2.9,
298 | 2.92,
299 | 2.92,
300 | 2.93,
301 | 3.21,
302 | 3.26,
303 | 3.3,
304 | 3.59,
305 | 3.68,
306 | 4.3,
307 | 4.64,
308 | 5.34,
309 | 5.42,
310 | 6.01,
311 | ]
312 | )
313 | correct_mask = np.zeros_like(x, dtype=bool)
314 | correct_mask[-3:] = True
315 | test_results = so.outliers_gesd(x, 5)
316 | test_mask_results = so.outliers_gesd(x, 5, hypo=True)
317 | correct_results = np.array(
318 | [
319 | -0.25,
320 | 0.68,
321 | 0.94,
322 | 1.15,
323 | 1.2,
324 | 1.26,
325 | 1.26,
326 | 1.34,
327 | 1.38,
328 | 1.43,
329 | 1.49,
330 | 1.49,
331 | 1.55,
332 | 1.56,
333 | 1.58,
334 | 1.65,
335 | 1.69,
336 | 1.7,
337 | 1.76,
338 | 1.77,
339 | 1.81,
340 | 1.91,
341 | 1.94,
342 | 1.96,
343 | 1.99,
344 | 2.06,
345 | 2.09,
346 | 2.1,
347 | 2.14,
348 | 2.15,
349 | 2.23,
350 | 2.24,
351 | 2.26,
352 | 2.35,
353 | 2.37,
354 | 2.4,
355 | 2.47,
356 | 2.54,
357 | 2.62,
358 | 2.64,
359 | 2.9,
360 | 2.92,
361 | 2.92,
362 | 2.93,
363 | 3.21,
364 | 3.26,
365 | 3.3,
366 | 3.59,
367 | 3.68,
368 | 4.3,
369 | 4.64,
370 | ]
371 | )
372 | self.assertTrue(isinstance(so.outliers_gesd(x, 5, report=True), np.ndarray))
373 | self.assertTrue(np.array_equal(test_results, correct_results))
374 | self.assertTrue(np.array_equal(test_mask_results, correct_mask))
375 | self.assertTrue(
376 | np.array_equal(so.outliers_gesd(correct_results, 5, hypo=False), correct_results)
377 | )
378 | self.assertTrue(
379 | np.array_equal(
380 | so.outliers_gesd(correct_results, 5, hypo=True),
381 | np.zeros_like(correct_results, dtype=bool),
382 | )
383 | )
384 |
385 | # Statistical tests
386 | df = sb.load_dataset("exercise")
387 | df[df.columns[df.dtypes == "category"]] = df[df.columns[df.dtypes == "category"]].astype(object)
388 | df_bn = np.array([[4, 3, 4, 4, 5, 6, 3], [1, 2, 3, 5, 6, 7, 7], [1, 2, 6, 4, 1, 5, 1]])
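    |     # Unreplicated block design data: 3 blocks (rows) by 7 groups (columns)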
389 |
390 | # DataFrame conversion tests
391 | def test_convert_to_block_df(self):
392 | a = np.array(
393 | [
394 | [0, 0, 0, 4],
395 | [1, 1, 0, 1],
396 | [2, 2, 0, 1],
397 | [0, 0, 1, 3],
398 | [1, 1, 1, 2],
399 | [2, 2, 1, 2],
400 | [0, 0, 2, 4],
401 | [1, 1, 2, 3],
402 | [2, 2, 2, 6],
403 | [0, 0, 3, 4],
404 | [1, 1, 3, 5],
405 | [2, 2, 3, 4],
406 | [0, 0, 4, 5],
407 | [1, 1, 4, 6],
408 | [2, 2, 4, 1],
409 | [0, 0, 5, 6],
410 | [1, 1, 5, 7],
411 | [2, 2, 5, 5],
412 | [0, 0, 6, 3],
413 | [1, 1, 6, 7],
414 | [2, 2, 6, 1],
415 | ],
416 | dtype=float,
417 | )
418 | df_a = DataFrame(a, columns=["blk_col", "blk_id_col", "grp_col", "y_col"])
419 |
420 | result = sp.posthoc_nemenyi_friedman(
421 | a, y_col=3, group_col=2, block_col=0, block_id_col=1, melted=True
422 | )[0].values
423 | result2 = sp.posthoc_nemenyi_friedman(self.df_bn)[0].values
424 | result3 = sp.posthoc_nemenyi_friedman(
425 | df_a,
426 | y_col="y_col",
427 | group_col="grp_col",
428 | block_col="blk_col",
429 | block_id_col="blk_id_col",
430 | melted=True,
431 | )[0].values
432 | self.assertTrue(np.allclose(result, result2))
433 | self.assertTrue(np.allclose(result, result3))
434 | self.assertTrue(np.allclose(result2, result3))
435 |
436 | # Omnibox tests
437 | def test_osrt(self):
438 | df = DataFrame(dict(zip(["a", "b", "c"], self.df_bn.tolist()))).melt()
439 | p, _, _ = som.test_osrt(df, val_col="value", group_col="variable")
440 | result = 0.3157646
441 | self.assertTrue(np.allclose(p, result, atol=1.0e-3))
442 |
443 | def test_durbin(self):
444 | r_result = np.array([0.205758, 8.468354, 6])
445 | result = som.test_durbin(self.df_bn)
446 | self.assertTrue(np.allclose(result, r_result))
447 |
448 | def test_mackwolfe(self):
449 | x = [
450 | [22, 23, 35],
451 | [60, 59, 54],
452 | [98, 78, 50],
453 | [60, 82, 59],
454 | [22, 44, 33],
455 | [23, 21, 25],
456 | ]
457 | result, _ = som.test_mackwolfe(x, p=2)
458 | self.assertEqual(som.test_mackwolfe(x, p=20), (np.nan, np.nan))
459 | self.assertEqual(som.test_mackwolfe(x, p=0), (np.nan, np.nan))
460 | self.assertTrue(np.allclose(result, 0.0006812725))
461 |
462 | def test_mackwolfe_nperm(self):
463 | x = [
464 | [22, 23, 35],
465 | [60, 59, 54],
466 | [98, 78, 50],
467 | [60, 82, 59],
468 | [22, 44, 33],
469 | [23, 21, 25],
470 | ]
471 | _, stat = som.test_mackwolfe(x, n_perm=50)
472 | self.assertTrue(np.allclose(stat, 3.2024699769846983))
473 |
474 | # Post hoc tests
475 | def test_posthoc_anderson(self):
476 | r_results = np.array(
477 | [
478 | [1, 1.35079e-02, 8.64418e-09],
479 | [1.35079e-02, 1, 1.644534e-05],
480 | [8.64418e-09, 1.644534e-05, 1],
481 | ]
482 | )
483 |
484 | results = sp.posthoc_anderson(self.df, val_col="pulse", group_col="kind", p_adjust="holm")
485 | self.assertTrue(np.allclose(results.values, r_results, atol=3.0e-3))
486 |
487 | def test_posthoc_conover(self):
488 | r_results = np.array(
489 | [
490 | [1, 9.354690e-11, 1.131263e-02],
491 | [9.354690e-11, 1, 5.496288e-06],
492 | [1.131263e-02, 5.496288e-06, 1],
493 | ]
494 | )
495 |
496 | results = sp.posthoc_conover(
497 | self.df, val_col="pulse", group_col="kind", p_adjust="holm"
498 | ).values
499 | self.assertTrue(np.allclose(results, r_results))
500 |
501 | def test_posthoc_dunn(self):
502 | r_results = np.array(
503 | [
504 | [1, 9.570998e-09, 4.390066e-02],
505 | [9.570998e-09, 1, 1.873208e-04],
506 | [4.390066e-02, 1.873208e-04, 1],
507 | ]
508 | )
509 |
510 | results = sp.posthoc_dunn(
511 | self.df, val_col="pulse", group_col="kind", p_adjust="holm"
512 | ).values
513 | self.assertTrue(np.allclose(results, r_results))
514 |
515 | def test_posthoc_nemenyi(self):
516 | r_results = np.array(
517 | [
518 | [1, 2.431833e-08, 1.313107e-01],
519 | [2.431833e-08, 1, 4.855675e-04],
520 | [1.313107e-01, 4.855675e-04, 1],
521 | ]
522 | )
523 |
524 | results = sp.posthoc_nemenyi(self.df, val_col="pulse", group_col="kind").values
525 | self.assertTrue(np.allclose(results, r_results))
526 |
527 | def test_posthoc_nemenyi_tukey(self):
528 | r_results = np.array(
529 | [
530 | [1, 9.793203e-09, 1.088785e-01],
531 | [9.793203e-09, 1, 0.0002789016],
532 | [1.088785e-01, 0.0002789016, 1],
533 | ]
534 | )
535 |
536 | results = sp.posthoc_nemenyi(
537 | self.df, val_col="pulse", group_col="kind", dist="tukey"
538 | ).values
539 | self.assertTrue(np.allclose(results, r_results, atol=1.0e-3))
540 |
541 | def test_posthoc_nemenyi_friedman(self):
542 | p_results = np.array(
543 | [
544 | [
545 | 1.0,
546 | np.nan,
547 | np.nan,
548 | np.nan,
549 | np.nan,
550 | np.nan,
551 | np.nan,
552 | ],
553 | [
554 | 0.9999999,
555 | 1.0,
556 | np.nan,
557 | np.nan,
558 | np.nan,
559 | np.nan,
560 | np.nan,
561 | ],
562 | [
563 | 0.8414506,
564 | 0.8833015,
565 | 1.0,
566 | np.nan,
567 | np.nan,
568 | np.nan,
569 | np.nan,
570 | ],
571 | [0.9177741, 0.9449086, 0.9999962, 1.0, np.nan, np.nan, np.nan],
572 | [0.9177741, 0.9449086, 0.9999962, 1.0000000, 1.0, np.nan, np.nan],
573 | [0.2147827, 0.2597539, 0.9449086, 0.8833015, 0.8833015, 1.0, np.nan],
574 | [0.9976902, 0.9991770, 0.9888953, 0.9976902, 0.9976902, 0.5511935, 1.0],
575 | ]
576 | )
577 | tri_upper = np.triu_indices(p_results.shape[0], 1)
578 | p_results[tri_upper] = np.transpose(p_results)[tri_upper]
579 | results = sp.posthoc_nemenyi_friedman(self.df_bn)
580 | self.assertTrue(np.allclose(results, p_results))
581 |
582 | def test_posthoc_conover_friedman(self):
583 | results = sp.posthoc_conover_friedman(self.df_bn, p_adjust="bonferroni")
584 | p_results = (
585 | np.array(
586 | [
587 | [1.0000000, np.nan, np.nan, np.nan, np.nan, np.nan, np.nan],
588 | [0.9147508, 1.00000000, np.nan, np.nan, np.nan, np.nan, np.nan],
589 | [0.1518030, 0.18071036, 1.0000000, np.nan, np.nan, np.nan, np.nan],
590 | [
591 | 0.2140927,
592 | 0.25232845,
593 | 0.8305955,
594 | 1.000000,
595 | np.nan,
596 | np.nan,
597 | np.nan,
598 | ],
599 | [
600 | 0.2140927,
601 | 0.25232845,
602 | 0.8305955,
603 | 1.000000,
604 | 1.000000,
605 | np.nan,
606 | np.nan,
607 | ],
608 | [
609 | 0.0181602,
610 | 0.02222747,
611 | 0.2523284,
612 | 0.1807104,
613 | 0.1807104,
614 | 1.00009000,
615 | np.nan,
616 | ],
617 | [
618 | 0.5242303,
619 | 0.59465124,
620 | 0.3989535,
621 | 0.5242303,
622 | 0.5242303,
623 | 0.05991984,
624 | 1.000000,
625 | ],
626 | ]
627 | )
628 | * 21
629 | )
630 | p_results[p_results > 1] = 1.0
631 | tri_upper = np.triu_indices(p_results.shape[0], 1)
632 | p_results[tri_upper] = np.transpose(p_results)[tri_upper]
633 | np.fill_diagonal(p_results, 1)
634 | self.assertTrue(np.allclose(results, p_results))
635 |
636 | def test_posthoc_conover_friedman_tukey(self):
637 | results = sp.posthoc_conover_friedman(self.df_bn, p_adjust="single-step")
638 | p_results = np.array(
639 | [
640 | [1.00000000, np.nan, np.nan, np.nan, np.nan, np.nan, np.nan],
641 | [0.99999986, 1.0000000, np.nan, np.nan, np.nan, np.nan, np.nan],
642 | [0.72638075, 0.7905289, 1.0000000, np.nan, np.nan, np.nan, np.nan],
643 | [0.84667448, 0.8934524, 0.9999910, 1.0000000, np.nan, np.nan, np.nan],
644 | [
645 | 0.84667448,
646 | 0.8934524,
647 | 0.9999910,
648 | 1.0000000,
649 | 1.0000000,
650 | np.nan,
651 | np.nan,
652 | ],
653 | [
654 | 0.09013677,
655 | 0.1187580,
656 | 0.8934524,
657 | 0.7905289,
658 | 0.7905289,
659 | 1.0000000,
660 | np.nan,
661 | ],
662 | [
663 | 0.99482447,
664 | 0.9981178,
665 | 0.9763466,
666 | 0.9948245,
667 | 0.9948245,
668 | 0.3662675,
669 | 1.000000,
670 | ],
671 | ]
672 | )
673 | tri_upper = np.triu_indices(p_results.shape[0], 1)
674 | p_results[tri_upper] = np.transpose(p_results)[tri_upper]
675 | np.fill_diagonal(p_results, 1)
676 | self.assertTrue(np.allclose(results, p_results, atol=1e-3))
677 |
678 | def test_posthoc_conover_friedman_non_melted(self):
679 | df = DataFrame(self.df_bn)
680 | results = sp.posthoc_conover_friedman(df, melted=False)
681 | p_results = np.array(
682 | [
683 | [1.0000000, np.nan, np.nan, np.nan, np.nan, np.nan, np.nan],
684 | [0.9147508, 1.00000000, np.nan, np.nan, np.nan, np.nan, np.nan],
685 | [0.1518030, 0.18071036, 1.0000000, np.nan, np.nan, np.nan, np.nan],
686 | [0.2140927, 0.25232845, 0.8305955, 1.000000, np.nan, np.nan, np.nan],
687 | [0.2140927, 0.25232845, 0.8305955, 1.000000, 1.000000, np.nan, np.nan],
688 | [
689 | 0.0181602,
690 | 0.02222747,
691 | 0.2523284,
692 | 0.1807104,
693 | 0.1807104,
694 | 1.00009000,
695 | np.nan,
696 | ],
697 | [
698 | 0.5242303,
699 | 0.59465124,
700 | 0.3989535,
701 | 0.5242303,
702 | 0.5242303,
703 | 0.05991984,
704 | 1.000000,
705 | ],
706 | ]
707 | )
708 | tri_upper = np.triu_indices(p_results.shape[0], 1)
709 | p_results[tri_upper] = np.transpose(p_results)[tri_upper]
710 | np.fill_diagonal(p_results, 1)
711 | self.assertTrue(np.allclose(results, p_results))
712 |
713 | def test_posthoc_miller_friedman(self):
714 | results = sp.posthoc_miller_friedman(self.df_bn)
715 |
716 | p_results = np.array(
717 | [
718 | [
719 | 1.0,
720 | 1.0,
721 | 0.9411963,
722 | 0.9724396000000001,
723 | 0.9724396000000001,
724 | 0.4717981,
725 | 0.9993864,
726 | ],
727 | [
728 | 1.0,
729 | 1.0,
730 | 0.9588993,
731 | 0.9823818000000001,
732 | 0.9823818000000001,
733 | 0.5256257,
734 | 0.9997869,
735 | ],
736 | [
737 | 0.9411963,
738 | 0.9588993,
739 | 1.0,
740 | 0.9999991,
741 | 0.9999991,
742 | 0.9823818000000001,
743 | 0.9968575999999999,
744 | ],
745 | [
746 | 0.9724396000000001,
747 | 0.9823818000000001,
748 | 0.9999991,
749 | 1.0,
750 | 1.0,
751 | 0.9588993,
752 | 0.9993864,
753 | ],
754 | [
755 | 0.9724396000000001,
756 | 0.9823818000000001,
757 | 0.9999991,
758 | 1.0,
759 | 1.0,
760 | 0.9588993,
761 | 0.9993864,
762 | ],
763 | [
764 | 0.4717981,
765 | 0.5256257,
766 | 0.9823818000000001,
767 | 0.9588993,
768 | 0.9588993,
769 | 1.0,
770 | 0.7803545999999999,
771 | ],
772 | [
773 | 0.9993864,
774 | 0.9997869,
775 | 0.9968575999999999,
776 | 0.9993864,
777 | 0.9993864,
778 | 0.7803545999999999,
779 | 1.0,
780 | ],
781 | ]
782 | )
783 |
784 | self.assertTrue(np.allclose(results, p_results))
785 |
786 | def test_posthoc_siegel_friedman(self):
787 | results = sp.posthoc_siegel_friedman(self.df_bn, p_adjust="bonferroni")
788 |
789 | p_results = (
790 | np.array(
791 | [
792 | [
793 | 1.000000,
794 | 0.92471904,
795 | 0.18587673,
796 | 0.25683926,
797 | 0.25683926,
798 | 0.01816302,
799 | 0.57075039,
800 | ],
801 | [
802 | 0.92471904,
803 | 1.0000000,
804 | 0.2193026,
805 | 0.2986177,
806 | 0.2986177,
807 | 0.0233422,
808 | 0.6366016,
809 | ],
810 | [
811 | 0.18587673,
812 | 0.2193026,
813 | 1.0000000,
814 | 0.8501067,
815 | 0.8501067,
816 | 0.2986177,
817 | 0.4496918,
818 | ],
819 | [
820 | 0.25683926,
821 | 0.2986177,
822 | 0.8501067,
823 | 1.000000,
824 | 1.0000000,
825 | 0.2193026,
826 | 0.5707504,
827 | ],
828 | [
829 | 0.25683926,
830 | 0.2986177,
831 | 0.8501067,
832 | 1.0000000,
833 | 1.0000000,
834 | 0.2193026,
835 | 0.5707504,
836 | ],
837 | [
838 | 0.01816302,
839 | 0.0233422,
840 | 0.2986177,
841 | 0.2193026,
842 | 0.2193026,
843 | 1.000000,
844 | 0.07260094,
845 | ],
846 | [
847 | 0.57075039,
848 | 0.6366016,
849 | 0.4496918,
850 | 0.5707504,
851 | 0.5707504,
852 | 0.07260094,
853 | 1.000000,
854 | ],
855 | ]
856 | )
857 | * 21
858 | )
859 | p_results[p_results > 1] = 1.0
860 |
861 | self.assertTrue(np.allclose(results, p_results))
862 |
863 | def test_posthoc_durbin(self):
864 | results = sp.posthoc_durbin(self.df_bn, p_adjust="holm")
865 |
866 | p_results = np.array(
867 | [
868 | [1.000000, 1.000000, 1.0, 1.0, 1.0, 0.381364, 1.0],
869 | [1.000000, 1.000000, 1.0, 1.0, 1.0, 0.444549, 1.0],
870 | [1.000000, 1.000000, 1.0, 1.0, 1.0, 1.000000, 1.0],
871 | [1.000000, 1.000000, 1.0, 1.0, 1.0, 1.000000, 1.0],
872 | [1.000000, 1.000000, 1.0, 1.0, 1.0, 1.000000, 1.0],
873 | [0.381364, 0.444549, 1.0, 1.0, 1.0, 1.000000, 1.0],
874 | [1.000000, 1.000000, 1.0, 1.0, 1.0, 1.000000, 1.0],
875 | ]
876 | )
877 | self.assertTrue(np.allclose(results, p_results))
878 |
879 | def test_posthoc_quade(self):
880 | results = sp.posthoc_quade(self.df_bn, p_adjust="bonferroni")
881 |
882 | p_results = (
883 | np.array(
884 | [
885 | [
886 | 1.00000000,
887 | 0.67651326,
888 | 0.15432143,
889 | 0.17954686,
890 | 0.2081421,
891 | 0.02267043,
892 | 0.2081421,
893 | ],
894 | [
895 | 0.67651326,
896 | 1.00000000,
897 | 0.29595042,
898 | 0.33809987,
899 | 0.38443835,
900 | 0.0494024,
901 | 0.38443835,
902 | ],
903 | [
904 | 0.15432143,
905 | 0.29595042,
906 | 1.00000000,
907 | 0.92586499,
908 | 0.85245022,
909 | 0.29595042,
910 | 0.85245022,
911 | ],
912 | [
913 | 0.17954686,
914 | 0.33809987,
915 | 0.92586499,
916 | 1.00000000,
917 | 0.92586499,
918 | 0.25789648,
919 | 0.92586499,
920 | ],
921 | [
922 | 0.2081421,
923 | 0.38443835,
924 | 0.85245022,
925 | 0.92586499,
926 | 1.00000000,
927 | 0.22378308,
928 | 1.00000000,
929 | ],
930 | [
931 | 0.02267043,
932 | 0.0494024,
933 | 0.29595042,
934 | 0.25789648,
935 | 0.22378308,
936 | 1.00000000,
937 | 0.22378308,
938 | ],
939 | [
940 | 0.2081421,
941 | 0.38443835,
942 | 0.85245022,
943 | 0.92586499,
944 | 1.00000000,
945 | 0.22378308,
946 | 1.00000000,
947 | ],
948 | ]
949 | )
950 | * 21
951 | )
952 | p_results[p_results > 1.0] = 1.0
953 | self.assertTrue(np.allclose(results, p_results))
954 |
955 | def test_posthoc_quade_norm(self):
956 | results = sp.posthoc_quade(self.df_bn, dist="normal")
957 |
958 | p_results = np.array(
959 | [
960 | [1.00000000, np.nan, np.nan, np.nan, np.nan, np.nan, np.nan],
961 | [0.5693540320, 1.00000000, np.nan, np.nan, np.nan, np.nan, np.nan],
962 | [0.0430605548, 0.145913303, 1.00000000, np.nan, np.nan, np.nan, np.nan],
963 | [
964 | 0.0578705783,
965 | 0.184285855,
966 | 0.8993796,
967 | 1.00000000,
968 | np.nan,
969 | np.nan,
970 | np.nan,
971 | ],
972 | [
973 | 0.0766885196,
974 | 0.229662468,
975 | 0.8003530,
976 | 0.8993796,
977 | 1.00000000,
978 | np.nan,
979 | np.nan,
980 | ],
981 | [
982 | 0.0005066018,
983 | 0.003634715,
984 | 0.1459133,
985 | 0.1139777,
986 | 0.08782032,
987 | 1.00000000,
988 | np.nan,
989 | ],
990 | [
991 | 0.0766885196,
992 | 0.229662468,
993 | 0.8003530,
994 | 0.8993796,
995 | 1.00000000,
996 | 0.08782032,
997 | 1.00000000,
998 | ],
999 | ]
1000 | )
1001 | tri_upper = np.triu_indices(p_results.shape[0], 1)
1002 | p_results[tri_upper] = np.transpose(p_results)[tri_upper]
1003 | self.assertTrue(np.allclose(results, p_results))
1004 |
1005 | def test_posthoc_npm_test(self):
1006 | data = np.array(
1007 | [
1008 | [2.4, 3, 3, 2.2, 2.2, 2.2, 2.2, 2.8, 2, 3],
1009 | [2.8, 2.2, 3.8, 9.4, 8.4, 3, 3.2, 4.4, 3.2, 7.4],
1010 | [9.8, 3.2, 5.8, 7.8, 2.6, 2.2, 6.2, 9.4, 7.8, 3.4],
1011 | [7, 9.8, 9.4, 8.8, 8.8, 3.4, 9, 8.4, 2.4, 7.8],
1012 | ]
1013 | )
1014 |
1015 | results = sp.posthoc_npm_test(data)
1016 |
1017 | p_results = np.array(
1018 | [
1019 | [1.0, 0.0077, 0.0020, 2e-16],
1020 | [0.0077, 1.0, 0.2884, 0.0854],
1021 | [0.0020, 0.2884, 1.0, 0.1385],
1022 | [2e-16, 0.0854, 0.1385, 1.0],
1023 | ]
1024 | )
1025 |
1026 | self.assertTrue(np.allclose(results, p_results, rtol=4))
1027 |
1028 | def test_posthoc_vanwaerden(self):
1029 | r_results = np.array(
1030 | [
1031 | [1, 1.054709e-02, 6.476665e-11],
1032 | [1.054709e-02, 1, 4.433141e-06],
1033 | [6.476665e-11, 4.433141e-06, 1],
1034 | ]
1035 | )
1036 |
1037 | results = sp.posthoc_vanwaerden(self.df, val_col="pulse", group_col="kind", p_adjust="holm")
1038 | self.assertTrue(np.allclose(results, r_results))
1039 |
1040 | def test_posthoc_dscf(self):
1041 | r_results = np.array(
1042 | [
1043 | [1, 4.430682e-02, 9.828003e-08],
1044 | [4.430682e-02, 1, 5.655274e-05],
1045 | [9.828003e-08, 5.655274e-05, 1],
1046 | ]
1047 | )
1048 |
1049 | results = sp.posthoc_dscf(self.df, val_col="pulse", group_col="kind")
1050 | self.assertTrue(np.allclose(results, r_results, atol=0.001))
1051 |
1052 | def test_posthoc_ttest(self):
1053 | r_results = np.array(
1054 | [
1055 | [1, 9.757069e-03, 4.100954e-07],
1056 | [9.757069e-03, 1, 1.556010e-05],
1057 | [4.100954e-07, 1.556010e-05, 1],
1058 | ]
1059 | )
1060 |
1061 | results = sp.posthoc_ttest(
1062 | self.df, val_col="pulse", group_col="kind", equal_var=False, p_adjust="holm"
1063 | )
1064 | self.assertTrue(np.allclose(results, r_results))
1065 |
1066 | def test_posthoc_ttest_pooled(self):
1067 | x = [[1, 2, 3, 5, 1], [12, 31, 54, 50, 40], [10, 12, 6, 74, 11]]
1068 | r_results = np.array(
1069 | [
1070 | [1, 0.04226866, 0.24706893],
1071 | [0.04226866, 1, 0.2482456],
1072 | [0.24706893, 0.2482456, 1],
1073 | ]
1074 | )
1075 |
1076 | results = sp.posthoc_ttest(x, equal_var=False, p_adjust="holm", pool_sd=True)
1077 | self.assertTrue(np.allclose(results, r_results))
1078 |
1079 | def test_posthoc_tukey_hsd(self):
1080 | x = [[1, 2, 3, 4, 5], [35, 31, 75, 40, 21], [10, 6, 9, 6, 1]]
1081 | results = sp.posthoc_tukey_hsd(x)
1082 | n_results = np.array(
1083 | [
1084 | [1.0, 0.000991287, 0.897449027],
1085 | [0.000991287, 1.0, 0.00210909],
1086 | [0.897449027, 0.00210909, 1.0],
1087 | ]
1088 | )
1089 | self.assertTrue(np.allclose(n_results, results))
1090 |
1091 | def test_posthoc_mannwhitney(self):
1092 | r_results = (
1093 | np.array(
1094 | [
1095 | [1, 3.420508e-08, 1.714393e-02],
1096 | [3.420508e-08, 1, 1.968352e-05],
1097 | [1.714393e-02, 1.968352e-05, 1],
1098 | ]
1099 | )
1100 | * 3
1101 | )
1102 | np.fill_diagonal(r_results, 1)
1103 |
1104 | results = sp.posthoc_mannwhitney(
1105 | self.df, val_col="pulse", group_col="kind", p_adjust="bonferroni"
1106 | ).values
1107 | self.assertTrue(np.allclose(results, r_results))
1108 |
1109 | def test_posthoc_mannwhitney_ndarray(self):
1110 | _x = [[1, 2, 3, 5, 1], [12, 31, 54, 50, 40], [10, 12, 6, 74, 11]]
1111 | x = np.array(_x)
1112 | g = np.repeat([0, 1, 2], 5)
1113 | nd = np.column_stack((x.ravel(), g))
1114 | xdf = DataFrame(dict(zip(list("abc"), _x))).melt(var_name="groups", value_name="vals")
1115 | results = sp.posthoc_mannwhitney(xdf, val_col="vals", group_col="groups").values
1116 | nd_results = sp.posthoc_mannwhitney(nd, val_col=0, group_col=1).values
1117 | self.assertTrue(np.allclose(nd_results, results))
1118 |
1119 | def test_posthoc_wilcoxon(self):
1120 | r_results = (
1121 | np.array(
1122 | [
1123 | [1, 2.337133e-03, 2.857818e-06],
1124 | [2.337133e-03, 1, 1.230888e-05],
1125 | [2.857818e-06, 1.230888e-05, 1],
1126 | ]
1127 | )
1128 | * 3
1129 | )
1130 | np.fill_diagonal(r_results, 1)
1131 |
1132 | results = sp.posthoc_wilcoxon(
1133 | self.df.sort_index(),
1134 | val_col="pulse",
1135 | group_col="kind",
1136 | p_adjust="bonferroni",
1137 | )
1138 | self.assertTrue(np.allclose(results, r_results, atol=1e-4))
1139 |
1140 | def test_posthoc_scheffe(self):
1141 | r_results = np.array(
1142 | [
1143 | [1.0, 3.378449e-01, 3.047472e-10],
1144 | [3.378449e-01, 1.0, 2.173209e-07],
1145 | [3.047472e-10, 2.173209e-07, 1.0],
1146 | ]
1147 | )
1148 |
1149 | results = sp.posthoc_scheffe(self.df.sort_index(), val_col="pulse", group_col="kind")
1150 | self.assertTrue(np.allclose(results, r_results))
1151 |
1152 | def test_posthoc_tamhane(self):
1153 | r_results = np.array(
1154 | [
1155 | [1, 2.898653e-02, 4.100954e-07],
1156 | [2.898653e-02, 1, 2.333996e-05],
1157 | [4.100954e-07, 2.333996e-05, 1],
1158 | ]
1159 | )
1160 |
1161 | results = sp.posthoc_tamhane(self.df.sort_index(), val_col="pulse", group_col="kind")
1162 | self.assertTrue(np.allclose(results, r_results))
1163 |
1164 | def test_posthoc_tamhane_nw(self):
1165 | r_results = np.array(
1166 | [
1167 | [1, 2.883219e-02, 4.780682e-08],
1168 | [2.883219e-02, 1, 8.643683e-06],
1169 | [4.780682e-08, 8.643683e-06, 1],
1170 | ]
1171 | )
1172 |
1173 | results = sp.posthoc_tamhane(
1174 | self.df.sort_index(), val_col="pulse", group_col="kind", welch=False
1175 | )
1176 | self.assertTrue(np.allclose(results, r_results))
1177 |
1178 | def test_posthoc_tukey(self):
1179 | r_results = np.array(
1180 | [
1181 | [1, 3.042955e-01, 4.308631e-10],
1182 | [3.042955e-01, 1, 9.946571e-08],
1183 | [4.308631e-10, 9.946571e-08, 1],
1184 | ]
1185 | )
1186 |
1187 | results = sp.posthoc_tukey(self.df.sort_index(), val_col="pulse", group_col="kind")
1188 | self.assertTrue(np.allclose(results, r_results, atol=1.0e-3))
1189 |
1190 | def test_posthoc_dunnett(self):
1191 | r_results = [8.125844e-11, 2.427434e-01]
1192 |
1193 |         # SciPy uses randomized quasi-Monte Carlo integration of the multivariate-t
1194 |         # distribution to compute the p values, so results may vary from run to run.
1195 |         # We run the test 100 times (maximum absolute tolerance = 1e-4 for example data).
1196 | is_close = []
1197 | for i in range(100):
1198 | results = sp.posthoc_dunnett(
1199 | self.df.sort_index(),
1200 | val_col="pulse",
1201 | group_col="kind",
1202 | control="rest",
1203 | to_matrix=False,
1204 | )
1205 | is_close.append(np.allclose(results, r_results, atol=1e-4))
1206 |
1207 | is_close_mt = []
1208 | for i in range(100):
1209 | df_results = sp.posthoc_dunnett(
1210 | self.df.sort_index(),
1211 | val_col="pulse",
1212 | group_col="kind",
1213 | control="rest",
1214 | to_matrix=True,
1215 | )
1216 | results = [
1217 | df_results.loc["rest", "running"],
1218 | df_results.loc["rest", "walking"],
1219 | ]
1220 | is_close_mt.append(np.allclose(results, r_results, atol=1e-4))
1221 | self.assertTrue(sum(is_close) > 95)
1222 | self.assertTrue(sum(is_close_mt) > 95)
1223 |
1224 |
1225 | if __name__ == "__main__":
1226 | unittest.main()
1227 |
--------------------------------------------------------------------------------