├── MANIFEST.in ├── images ├── logo.png ├── flowchart.png ├── plot-conover.png ├── result-conover.png ├── melted-dataframe.png ├── plot-conover-custom-cmap.png └── flowchart.gv ├── paper ├── figure.png ├── codemeta.json ├── paper.bib ├── paper.md └── generate.rb ├── docs ├── requirements.txt ├── source │ ├── _static │ │ ├── flowchart.png │ │ ├── cd_diagram0.png │ │ ├── cd_diagram1.png │ │ ├── cd_diagram2.png │ │ ├── plot-conover.png │ │ ├── plot-conover-custom-cmap.png │ │ └── cd_diagram_example_sig_plot.png │ ├── global_api.rst │ ├── omnibus_api.rst │ ├── outliers_api.rst │ ├── plotting_api.rst │ ├── posthocs_api.rst │ ├── installation.rst │ ├── index.rst │ ├── intro.rst │ ├── conf.py │ └── tutorial.rst ├── Makefile └── make.bat ├── tests ├── __init__.py └── test_posthocs.py ├── .readthedocs.yml ├── CONTRIBUTING.md ├── .github ├── workflows │ ├── package-publish.yml │ ├── package-pull.yml │ └── package-test.yml └── ISSUE_TEMPLATE │ └── bug_report.md ├── LICENSE ├── scikit_posthocs ├── __init__.py ├── _global.py ├── _outliers.py ├── _omnibus.py └── _plotting.py ├── pyproject.toml ├── CODE_OF_CONDUCT.md ├── DESCRIPTION.rst ├── usage-examples.ipynb └── README.rst /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include LICENSE 2 | include DESCRIPTION.rst 3 | recursive-exclude tests * 4 | -------------------------------------------------------------------------------- /images/logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/maximtrp/scikit-posthocs/HEAD/images/logo.png -------------------------------------------------------------------------------- /paper/figure.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/maximtrp/scikit-posthocs/HEAD/paper/figure.png -------------------------------------------------------------------------------- /images/flowchart.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/maximtrp/scikit-posthocs/HEAD/images/flowchart.png -------------------------------------------------------------------------------- /docs/requirements.txt: -------------------------------------------------------------------------------- 1 | sphinx_rtd_theme 2 | numpydoc 3 | git+https://github.com/maximtrp/scikit-posthocs 4 | -------------------------------------------------------------------------------- /images/plot-conover.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/maximtrp/scikit-posthocs/HEAD/images/plot-conover.png -------------------------------------------------------------------------------- /images/result-conover.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/maximtrp/scikit-posthocs/HEAD/images/result-conover.png -------------------------------------------------------------------------------- /images/melted-dataframe.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/maximtrp/scikit-posthocs/HEAD/images/melted-dataframe.png -------------------------------------------------------------------------------- /docs/source/_static/flowchart.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/maximtrp/scikit-posthocs/HEAD/docs/source/_static/flowchart.png -------------------------------------------------------------------------------- /docs/source/_static/cd_diagram0.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/maximtrp/scikit-posthocs/HEAD/docs/source/_static/cd_diagram0.png -------------------------------------------------------------------------------- /docs/source/_static/cd_diagram1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/maximtrp/scikit-posthocs/HEAD/docs/source/_static/cd_diagram1.png -------------------------------------------------------------------------------- /docs/source/_static/cd_diagram2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/maximtrp/scikit-posthocs/HEAD/docs/source/_static/cd_diagram2.png -------------------------------------------------------------------------------- /docs/source/_static/plot-conover.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/maximtrp/scikit-posthocs/HEAD/docs/source/_static/plot-conover.png -------------------------------------------------------------------------------- /images/plot-conover-custom-cmap.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/maximtrp/scikit-posthocs/HEAD/images/plot-conover-custom-cmap.png -------------------------------------------------------------------------------- /docs/source/_static/plot-conover-custom-cmap.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/maximtrp/scikit-posthocs/HEAD/docs/source/_static/plot-conover-custom-cmap.png -------------------------------------------------------------------------------- /docs/source/_static/cd_diagram_example_sig_plot.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/maximtrp/scikit-posthocs/HEAD/docs/source/_static/cd_diagram_example_sig_plot.png -------------------------------------------------------------------------------- /tests/__init__.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | import tests.test_posthocs 3 | 4 | def posthocs_suite(): 5 | loader = unittest.TestLoader() 6 | suite = loader.loadTestsFromModule(tests.test_posthocs) 7 | return suite 8 | -------------------------------------------------------------------------------- /docs/source/global_api.rst: -------------------------------------------------------------------------------- 1 | Global Tests API reference 2 | -------------------------- 3 | 4 | .. currentmodule:: scikit_posthocs 5 | .. autosummary:: 6 | :toctree: generated 7 | 8 | global_f_test 9 | global_simes_test 10 | -------------------------------------------------------------------------------- /docs/source/omnibus_api.rst: -------------------------------------------------------------------------------- 1 | Omnibus API reference 2 | --------------------- 3 | 4 | .. currentmodule:: scikit_posthocs 5 | .. 
autosummary:: 6 | :toctree: generated 7 | 8 | test_mackwolfe 9 | test_osrt 10 | test_durbin 11 | -------------------------------------------------------------------------------- /docs/source/outliers_api.rst: -------------------------------------------------------------------------------- 1 | Outliers API reference 2 | ---------------------- 3 | 4 | .. currentmodule:: scikit_posthocs 5 | .. autosummary:: 6 | :toctree: generated 7 | 8 | outliers_iqr 9 | outliers_gesd 10 | outliers_grubbs 11 | outliers_tietjen 12 | -------------------------------------------------------------------------------- /docs/source/plotting_api.rst: -------------------------------------------------------------------------------- 1 | Plotting API reference 2 | ---------------------- 3 | 4 | .. currentmodule:: scikit_posthocs 5 | .. autosummary:: 6 | :toctree: generated 7 | 8 | sign_array 9 | sign_table 10 | sign_plot 11 | critical_difference_diagram 12 | -------------------------------------------------------------------------------- /.readthedocs.yml: -------------------------------------------------------------------------------- 1 | version: 2 2 | 3 | build: 4 | os: ubuntu-22.04 5 | tools: 6 | python: "3.11" 7 | 8 | sphinx: 9 | configuration: docs/source/conf.py 10 | 11 | formats: 12 | - pdf 13 | 14 | python: 15 | install: 16 | - requirements: docs/requirements.txt 17 | -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | # Support 2 | 3 | Currently, all support is provided on GitHub. Please open an issue with your 4 | bug, question, or suggestion. 5 | 6 | ## Bugs 7 | 8 | If you have found a bug, open a GitHub issue using `Bug report` template. Ensure 9 | that you have included the following information: 10 | 11 | - Full error traceback. 12 | - Steps to reproduce a bug. 13 | - Dataset you get a bug with. 14 | 15 | ## Contribution 16 | 17 | Your contribution is highly welcome. You may open a pull request or an issue 18 | describing an improvement or implementation of new functionality. 19 | -------------------------------------------------------------------------------- /docs/Makefile: -------------------------------------------------------------------------------- 1 | # Minimal makefile for Sphinx documentation 2 | # 3 | 4 | # You can set these variables from the command line. 5 | SPHINXOPTS = 6 | SPHINXBUILD = sphinx-build 7 | SOURCEDIR = source 8 | BUILDDIR = . 9 | 10 | # Put it first so that "make" without argument is like "make help". 11 | help: 12 | @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 13 | 14 | .PHONY: help Makefile 15 | 16 | # Catch-all target: route all unknown targets to Sphinx using the new 17 | # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). 18 | %: Makefile 19 | @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 20 | -------------------------------------------------------------------------------- /docs/source/posthocs_api.rst: -------------------------------------------------------------------------------- 1 | Post-hocs API reference 2 | ----------------------- 3 | 4 | .. currentmodule:: scikit_posthocs 5 | .. 
autosummary:: 6 | :toctree: generated 7 | 8 | posthoc_conover 9 | posthoc_dunn 10 | posthoc_nemenyi 11 | posthoc_nemenyi_friedman 12 | posthoc_conover_friedman 13 | posthoc_siegel_friedman 14 | posthoc_miller_friedman 15 | posthoc_npm_test 16 | posthoc_durbin 17 | posthoc_anderson 18 | posthoc_quade 19 | posthoc_vanwaerden 20 | posthoc_tukey_hsd 21 | posthoc_ttest 22 | posthoc_mannwhitney 23 | posthoc_wilcoxon 24 | posthoc_scheffe 25 | posthoc_tamhane 26 | posthoc_tukey 27 | posthoc_dscf 28 | posthoc_dunnett 29 | -------------------------------------------------------------------------------- /.github/workflows/package-publish.yml: -------------------------------------------------------------------------------- 1 | name: Package Upload to PyPi 2 | 3 | on: 4 | release: 5 | types: [published] 6 | 7 | permissions: 8 | contents: read 9 | 10 | jobs: 11 | deploy: 12 | runs-on: ubuntu-latest 13 | steps: 14 | - uses: actions/checkout@v4 15 | - name: Set up Python 16 | uses: actions/setup-python@v5 17 | with: 18 | python-version: '3.x' 19 | - name: Install dependencies 20 | run: | 21 | python -m pip install --upgrade pip 22 | pip install build 23 | - name: Build package 24 | run: python -m build 25 | - name: Publish package 26 | uses: pypa/gh-action-pypi-publish@release/v1 27 | with: 28 | user: __token__ 29 | password: ${{ secrets.PYPI_API_TOKEN }} 30 | -------------------------------------------------------------------------------- /.github/workflows/package-pull.yml: -------------------------------------------------------------------------------- 1 | name: Run tests on pull requests 2 | 3 | on: 4 | pull_request: 5 | branches: 6 | - master 7 | 8 | jobs: 9 | build: 10 | runs-on: ubuntu-latest 11 | strategy: 12 | matrix: 13 | python-version: ["3.9", "3.12"] 14 | steps: 15 | - uses: actions/checkout@v4 16 | - name: Set up Python ${{ matrix.python-version }} 17 | uses: actions/setup-python@v5 18 | with: 19 | python-version: ${{ matrix.python-version }} 20 | cache: 'pip' 21 | - name: Install dependencies 22 | run: | 23 | python -m pip install --upgrade pip 24 | python -m pip install . 25 | python -m pip install .[test] 26 | - name: Testing with pytest 27 | run: | 28 | pytest . 29 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/bug_report.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Bug report 3 | about: Create a report to help us improve 4 | title: '' 5 | labels: bug 6 | assignees: maximtrp 7 | 8 | --- 9 | 10 | **Describe the bug** 11 | A clear and concise description of what the bug is. 12 | 13 | **Dataset** 14 | Please provide a link to the dataset you get the bug with. 15 | 16 | **To Reproduce** 17 | Steps to reproduce the behavior: 18 | 1. Go to '...' 19 | 2. Click on '....' 20 | 3. Scroll down to '....' 21 | 4. See error 22 | 23 | **Expected behavior** 24 | A clear and concise description of what you expected to happen. 25 | 26 | **System and package information (please complete the following information):** 27 | - OS: (e.g. Linux 4.20.0-arch1-1-ARCH x86_64 GNU/Linux) 28 | - Package version: (e.g. 0.4.0) 29 | 30 | **Additional context** 31 | Add any other context about the problem here. 
32 | 
--------------------------------------------------------------------------------
/docs/make.bat:
--------------------------------------------------------------------------------
1 | @ECHO OFF
2 | 
3 | pushd %~dp0
4 | 
5 | REM Command file for Sphinx documentation
6 | 
7 | if "%SPHINXBUILD%" == "" (
8 | 	set SPHINXBUILD=sphinx-build
9 | )
10 | set SOURCEDIR=source
11 | set BUILDDIR=_build
12 | 
13 | if "%1" == "" goto help
14 | 
15 | %SPHINXBUILD% >NUL 2>NUL
16 | if errorlevel 9009 (
17 | 	echo.
18 | 	echo.The 'sphinx-build' command was not found. Make sure you have Sphinx
19 | 	echo.installed, then set the SPHINXBUILD environment variable to point
20 | 	echo.to the full path of the 'sphinx-build' executable. Alternatively you
21 | 	echo.may add the Sphinx directory to PATH.
22 | 	echo.
23 | 	echo.If you don't have Sphinx installed, grab it from
24 | 	echo.http://sphinx-doc.org/
25 | 	exit /b 1
26 | )
27 | 
28 | %SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS%
29 | goto end
30 | 
31 | :help
32 | %SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS%
33 | 
34 | :end
35 | popd
36 | 
--------------------------------------------------------------------------------
/paper/codemeta.json:
--------------------------------------------------------------------------------
1 | {
2 |   "@context": "https://raw.githubusercontent.com/codemeta/codemeta/master/codemeta.jsonld",
3 |   "@type": "Code",
4 |   "author": [
5 |     {
6 |       "@id": "https://orcid.org/0000-0003-2586-4633",
7 |       "@type": "Person",
8 |       "email": "maximtrp@gmail.com",
9 |       "name": "Maksim A. Terpilowski",
10 |       "affiliation": "Institute of Evolutionary Physiology and Biochemistry, Saint Petersburg, Russia"
11 |     }
12 |   ],
13 |   "identifier": "",
14 |   "codeRepository": "https://github.com/maximtrp/scikit-posthocs",
15 |   "datePublished": "2018-12-06",
16 |   "dateModified": "2018-12-06",
17 |   "dateCreated": "2018-12-06",
18 |   "description": "A Python package for pairwise multiple comparison post hoc tests and outliers detection",
19 |   "keywords": "python,statistics,posthoc",
20 |   "license": "BSD 3-Clause License",
21 |   "title": "scikit-posthocs: Pairwise multiple comparison tests in Python",
22 |   "version": "v0.4.0"
23 | }
24 | 
--------------------------------------------------------------------------------
/docs/source/installation.rst:
--------------------------------------------------------------------------------
1 | Installation
2 | ============
3 | 
4 | The latest version can be installed from PyPI using ``pip``:
5 | 
6 | .. code:: sh
7 | 
8 |    pip install scikit-posthocs
9 | 
10 | Or from the conda-forge repository using ``conda``:
11 | 
12 | .. code:: sh
13 | 
14 |    conda install -c conda-forge scikit-posthocs
15 | 
16 | You can also use ``pip`` to install the development version from GitHub:
17 | 
18 | .. code:: sh
19 | 
20 |    pip install git+https://github.com/maximtrp/scikit-posthocs.git
21 | 
22 | Dependencies
23 | ------------
24 | 
25 | The package is compatible with Python 3 (3.9 or later) and has the following dependencies:
26 | 
27 | * `NumPy <https://numpy.org/>`_
28 | * `SciPy <https://scipy.org/>`_
29 | * `Statsmodels <https://www.statsmodels.org/>`_
30 | * `Pandas <https://pandas.pydata.org/>`_
31 | * `Seaborn <https://seaborn.pydata.org/>`_
32 | * `Matplotlib <https://matplotlib.org/>`_
33 | 
34 | Bugs
35 | ----
36 | 
37 | Please report any bugs using the issue tracker on `GitHub <https://github.com/maximtrp/scikit-posthocs/issues>`_.
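38 | 
39 | Verifying the installation
40 | --------------------------
41 | 
42 | As a minimal smoke test (this only assumes the install succeeded; the
43 | ``__version__`` string is defined in the package's ``__init__.py``):
44 | 
45 | .. code:: python
46 | 
47 |    import scikit_posthocs as sp
48 |    print(sp.__version__)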
38 | -------------------------------------------------------------------------------- /.github/workflows/package-test.yml: -------------------------------------------------------------------------------- 1 | name: Run tests on push 2 | 3 | on: 4 | push: 5 | branches: 6 | - master 7 | 8 | jobs: 9 | build: 10 | runs-on: ubuntu-latest 11 | strategy: 12 | matrix: 13 | python-version: ["3.9", "3.12"] 14 | steps: 15 | - uses: actions/checkout@v4 16 | - name: Set up Python ${{ matrix.python-version }} 17 | uses: actions/setup-python@v5 18 | with: 19 | python-version: ${{ matrix.python-version }} 20 | cache: 'pip' 21 | - name: Install dependencies 22 | run: | 23 | python -m pip install --upgrade pip 24 | python -m pip install . 25 | python -m pip install .[test] 26 | - name: Testing with pytest and measuring coverage 27 | run: | 28 | coverage run --source scikit_posthocs -m pytest . 29 | coverage xml 30 | - name: Reporting coverage to Codacy 31 | uses: codacy/codacy-coverage-reporter-action@v1 32 | with: 33 | project-token: ${{ secrets.CODACY_PROJECT_TOKEN }} 34 | coverage-reports: coverage.xml 35 | -------------------------------------------------------------------------------- /docs/source/index.rst: -------------------------------------------------------------------------------- 1 | scikit-posthocs 2 | =============== 3 | 4 | **scikit-posthocs** is a Python package which provides post hoc tests for 5 | pairwise multiple comparisons that are usually performed in statistical data 6 | analysis to assess the differences between group levels if a statistically 7 | significant result of ANOVA test has been obtained. 8 | 9 | **scikit-posthocs** is tightly integrated with Pandas DataFrames and NumPy 10 | arrays to ensure fast computations and convenient data import and storage. 11 | 12 | This package will be useful for statisticians, data analysts, and researchers 13 | who use Python in their work. 14 | 15 | 16 | .. toctree:: 17 | :caption: Documentation 18 | :maxdepth: 2 19 | :hidden: 20 | 21 | Introduction 22 | Installation 23 | Tutorial 24 | 25 | .. toctree:: 26 | :caption: API 27 | :hidden: 28 | :maxdepth: 2 29 | 30 | Global Tests API 31 | Omnibus API 32 | Outliers API 33 | Plotting API 34 | Post-hocs API 35 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Copyright (c) 2024 Maksim Terpilovskii 2 | 3 | Permission is hereby granted, free of charge, to any person obtaining a copy 4 | of this software and associated documentation files (the "Software"), to deal 5 | in the Software without restriction, including without limitation the rights 6 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 7 | copies of the Software, and to permit persons to whom the Software is 8 | furnished to do so, subject to the following conditions: 9 | 10 | The above copyright notice and this permission notice shall be included in all 11 | copies or substantial portions of the Software. 12 | 13 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 14 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 15 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE 16 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 17 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 18 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 19 | SOFTWARE. 20 | -------------------------------------------------------------------------------- /scikit_posthocs/__init__.py: -------------------------------------------------------------------------------- 1 | __version__ = "0.11.4" 2 | 3 | from scikit_posthocs._global import global_simes_test, global_f_test 4 | from scikit_posthocs._omnibus import test_osrt, test_durbin, test_mackwolfe 5 | 6 | from scikit_posthocs._posthocs import ( 7 | posthoc_anderson, 8 | posthoc_conover, 9 | posthoc_conover_friedman, 10 | posthoc_dscf, 11 | posthoc_dunn, 12 | posthoc_durbin, 13 | posthoc_mannwhitney, 14 | posthoc_miller_friedman, 15 | posthoc_nemenyi, 16 | posthoc_nemenyi_friedman, 17 | posthoc_npm_test, 18 | posthoc_quade, 19 | posthoc_scheffe, 20 | posthoc_siegel_friedman, 21 | posthoc_tamhane, 22 | posthoc_ttest, 23 | posthoc_tukey, 24 | posthoc_tukey_hsd, 25 | posthoc_vanwaerden, 26 | posthoc_wilcoxon, 27 | posthoc_dunnett, 28 | __convert_to_df, 29 | __convert_to_block_df, 30 | ) 31 | 32 | from scikit_posthocs._plotting import ( 33 | sign_array, 34 | sign_plot, 35 | sign_table, 36 | critical_difference_diagram, 37 | ) 38 | from scikit_posthocs._outliers import ( 39 | outliers_gesd, 40 | outliers_grubbs, 41 | outliers_iqr, 42 | outliers_tietjen, 43 | ) 44 | -------------------------------------------------------------------------------- /paper/paper.bib: -------------------------------------------------------------------------------- 1 | @inproceedings{Seabold2010, 2 | title = {Statsmodels: Econometric and statistical modeling with python}, 3 | author = {{Seabold}, S. and {Perktold}, J.}, 4 | booktitle = {Proceedings of the 9th Python in Science Conference}, 5 | volume = {57}, 6 | pages = {61}, 7 | year = {2010}, 8 | organization = {SciPy society Austin} 9 | } 10 | 11 | @misc{Jones2001, 12 | author = {{Jones}, E. and {Oliphant}, T. and {Peterson}, P.}, 13 | title = {SciPy: Open source scientific tools for Python}, 14 | year = {2001}, 15 | url = {http://www.scipy.org/} 16 | } 17 | 18 | @online{Pohlert2018, 19 | title = {PMCMRplus: Calculate Pairwise Multiple Comparisons of Mean Rank Sums Extended}, 20 | author = {{Pohlert}, T.}, 21 | year = {2018}, 22 | note = {R package version 1.4.1}, 23 | url = {https://CRAN.R-project.org/package=PMCMRplus} 24 | } 25 | 26 | @inproceedings{McKinney2010, 27 | title={Data structures for statistical computing in Python}, 28 | author={{McKinney}, W.}, 29 | booktitle={Proceedings of the 9th Python in Science Conference}, 30 | volume={445}, 31 | pages={51-56}, 32 | year={2010}, 33 | organization={Austin, TX} 34 | } 35 | 36 | @book{Oliphant2006, 37 | title={A guide to NumPy}, 38 | author={{Oliphant}, T. 
E.}, 39 | volume={1}, 40 | year={2006}, 41 | publisher={Trelgol Publishing USA} 42 | } 43 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [build-system] 2 | requires = ["setuptools>=61.0", "wheel"] 3 | build-backend = "setuptools.build_meta" 4 | 5 | [project] 6 | name = "scikit-posthocs" 7 | dynamic = ["version"] 8 | description = "Statistical post-hoc analysis and outlier detection algorithms" 9 | readme = "DESCRIPTION.rst" 10 | requires-python = ">=3.9" 11 | keywords = ["statistics", "stats", "posthoc", "anova", "data science"] 12 | license.file = "LICENSE" 13 | authors = [ 14 | { name = "Maksim Terpilovskii", email = "maximtrp@gmail.com" }, 15 | ] 16 | classifiers = [ 17 | "Development Status :: 5 - Production/Stable", 18 | "Intended Audience :: Education", 19 | "Intended Audience :: Information Technology", 20 | "Intended Audience :: Science/Research", 21 | "Topic :: Scientific/Engineering :: Information Analysis", 22 | "Topic :: Scientific/Engineering :: Mathematics", 23 | "License :: OSI Approved :: MIT License", 24 | "Programming Language :: Python :: 3", 25 | "Programming Language :: Python :: 3.9", 26 | "Programming Language :: Python :: 3.10", 27 | "Programming Language :: Python :: 3.11", 28 | "Programming Language :: Python :: 3.12", 29 | "Programming Language :: Python :: 3.13", 30 | ] 31 | urls.homepage = "https://github.com/maximtrp/scikit-posthocs" 32 | urls.documentation = "https://scikit-posthocs.rtfd.io" 33 | dependencies = [ 34 | "numpy", 35 | "scipy>=1.9.0", 36 | "statsmodels", 37 | "pandas>=0.20.0", 38 | "seaborn", 39 | "matplotlib", 40 | ] 41 | 42 | [tool.basedpyright] 43 | pythonVersion = "3.9" 44 | 45 | [tool.ruff] 46 | target-version = "py39" 47 | respect-gitignore = true 48 | line-length = 100 49 | 50 | [tool.setuptools] 51 | packages = ["scikit_posthocs"] 52 | 53 | [tool.setuptools.dynamic] 54 | version = {attr = "scikit_posthocs.__version__"} 55 | 56 | [project.optional-dependencies] 57 | test = ["pytest", "coverage"] 58 | 59 | [tool.pytest.ini_options] 60 | log_cli = true 61 | log_cli_level = "INFO" 62 | log_cli_format = "%(asctime)s [%(levelname)8s] %(message)s (%(filename)s:%(lineno)s)" 63 | log_cli_date_format = "%Y-%m-%d %H:%M:%S" 64 | -------------------------------------------------------------------------------- /scikit_posthocs/_global.py: -------------------------------------------------------------------------------- 1 | from typing import Union, List, Tuple 2 | from numpy import array, ndarray, log 3 | from scipy.stats import rankdata, chi2 4 | 5 | 6 | def global_simes_test(p_vals: Union[List, ndarray]) -> float: 7 | '''Global Simes test of the intersection null hypothesis. 8 | 9 | Computes the combined p value as min(np(i)/i), where p(1), ..., p(n) are 10 | the ordered p values [1]_. 11 | 12 | Parameters 13 | ---------- 14 | p_vals : Union[List, ndarray] 15 | An array of p values. 16 | 17 | Returns 18 | ------- 19 | p_value : float 20 | Global p value. 21 | 22 | References 23 | ---------- 24 | .. [1] Simes, R. J. (1986). An improved Bonferroni procedure for multiple 25 | tests of significance. Biometrika, 73(3):751-754. 
26 | 
27 |     Examples
28 |     --------
29 |     >>> arr = [0.04, 0.03, 0.98, 0.01, 0.43, 0.99, 1.0, 0.002]
30 |     >>> sp.global_simes_test(arr)
31 |     '''
32 |     arr = array(p_vals)
33 |     ranks = rankdata(arr)
34 |     p_value = min(arr.size * arr / ranks)
35 |     return p_value
36 | 
37 | 
38 | def global_f_test(
39 |         p_vals: Union[List, ndarray],
40 |         stat: bool = False) -> Union[float, Tuple[float, float]]:
41 |     '''Fisher's combination test for the global null hypothesis.
42 | 
43 |     Computes the combined p value using the chi-squared distribution and the
44 |     T statistic: -2 * sum(log(x)) [1]_.
45 | 
46 |     Parameters
47 |     ----------
48 |     p_vals : Union[List, ndarray]
49 |         An array or a list of p values.
50 |     stat : bool
51 |         Defines if the statistic should be returned.
52 | 
53 |     Returns
54 |     -------
55 |     p_value : float
56 |         Global p value.
57 |     t_stat : float
58 |         Statistic.
59 | 
60 |     References
61 |     ----------
62 |     .. [1] Fisher RA. Statistical methods for research workers,
63 |         London: Oliver and Boyd, 1932.
64 | 
65 |     Examples
66 |     --------
67 |     >>> x = [0.04, 0.03, 0.98, 0.01, 0.43, 0.99, 1.0, 0.002]
68 |     >>> sp.global_f_test(x)
69 |     '''
70 |     arr = array(p_vals)
71 |     t_stat = -2 * sum(log(arr))
72 |     p_value = chi2.sf(t_stat, df=2 * len(arr))
73 |     # Parentheses matter here: without them, the conditional expression binds
74 |     # only to t_stat and a (p_value, p_value) tuple is returned when stat=False.
75 |     return (p_value, t_stat) if stat else p_value
76 | 
--------------------------------------------------------------------------------
/docs/source/intro.rst:
--------------------------------------------------------------------------------
1 | Introduction
2 | ============
3 | 
4 | Background
5 | ----------
6 | 
7 | The Python statistical ecosystem comprises multiple packages. However, it
8 | still has numerous gaps and is surpassed by R and its packages.
9 | 
10 | `SciPy <https://scipy.org/>`_ (version 1.2.0) offers *Student*, *Wilcoxon*,
11 | and *Mann-Whitney* tests, which are not adapted to multiple pairwise comparisons.
12 | `Statsmodels <https://www.statsmodels.org/>`_ (version 0.9.0) features
13 | the *TukeyHSD* test, which needs some extra actions to be fluently integrated
14 | into a data analysis pipeline. `Statsmodels` also has good helper methods:
15 | ``allpairtest`` (adapts an external function such as ``scipy.stats.ttest_ind``
16 | to multiple pairwise comparisons) and ``multipletests`` (adjusts *p* values to
17 | minimize type I and II errors). `PMCMRplus <https://CRAN.R-project.org/package=PMCMRplus>`_
18 | is a very good R package which has no rivals in Python as it offers more than 40
19 | various tests (including post hoc tests) for factorial and block design data.
20 | PMCMRplus was an inspiration and a reference for *scikit-posthocs*.
21 | 
22 | *scikit-posthocs* attempts to improve Python statistical capabilities by
23 | offering many parametric and nonparametric post hoc tests along with
24 | outliers detection and basic plotting methods.
25 | 
26 | Features
27 | --------
28 | 
29 | .. image:: _static/flowchart.png
30 | 
31 | - *Omnibus* tests:
32 | 
33 |   - Durbin test (for balanced incomplete block design).
34 | 
35 | - *Parametric* pairwise multiple comparisons tests:
36 | 
37 |   - Scheffe test.
38 |   - Student T test.
39 |   - Tamhane T2 test.
40 |   - TukeyHSD test.
41 | 
42 | - *Non-parametric* tests for factorial design:
43 | 
44 |   - Conover test.
45 |   - Dunn test.
46 |   - Dwass, Steel, Critchlow, and Fligner test.
47 |   - Mann-Whitney test.
48 |   - Nashimoto and Wright (NPM) test.
49 |   - Nemenyi test.
50 |   - van Waerden test.
51 |   - Wilcoxon test.
52 | 
53 | - *Non-parametric* tests for block design:
54 | 
55 |   - Conover test.
56 |   - Durbin and Conover test.
57 |   - Miller test.
58 |   - Nemenyi test.
59 |   - Quade test.
60 |   - Siegel test.
61 | 
62 | - Other tests:
63 | 
64 |   - Anderson-Darling test.
65 |   - Mack-Wolfe test.
66 |   - Hayter (OSRT) test.
67 | 
68 | - Outliers detection tests:
69 | 
70 |   - Simple test based on interquartile range (IQR).
71 |   - Grubbs test.
72 |   - Tietjen-Moore test.
73 |   - Generalized Extreme Studentized Deviate test (ESD test).
74 | 
75 | - Plotting functionality:
76 | 
77 |   - Significance plots.
78 |   - Critical difference diagrams.
79 | 
80 | All post hoc tests are capable of p value adjustments for multiple pairwise
81 | comparisons.
82 | 
--------------------------------------------------------------------------------
/paper/paper.md:
--------------------------------------------------------------------------------
1 | ---
2 | title: 'scikit-posthocs: Pairwise multiple comparison tests in Python'
3 | tags:
4 |   - Python
5 |   - statistics
6 |   - post hoc
7 | authors:
8 |   - name: Maksim A. Terpilowski
9 |     orcid: 0000-0003-2586-4633
10 |     affiliation: 1
11 | affiliations:
12 |   - name: Institute of Evolutionary Physiology and Biochemistry, Saint Petersburg, Russia
13 |     index: 1
14 | date: 6 December 2018
15 | bibliography: paper.bib
16 | ---
17 | 
18 | # Summary
19 | 
20 | **scikit-posthocs** is a Python package providing multiple pairwise comparison tests (post hocs). Statisticians, data scientists, and researchers will find it useful in a statistical analysis routine to assess the differences between group levels if a statistically significant result of a parametric or nonparametric analysis of variance (ANOVA) test has been obtained.
21 | 
22 | Most statistical software packages (such as SPSS and Stata) provide a number of post hoc tests. However, the Python ecosystem still lacks implementations of these tests for factorial and block design data. Currently, *SciPy* offers the *Student*, *Wilcoxon*, and *Mann-Whitney* tests, which are not adapted to multiple pairwise comparisons. The *Statsmodels* package includes the *TukeyHSD* test, which needs extra actions to be fluently integrated into a data analysis pipeline.
23 | 
24 | The ``scikit-posthocs`` package aims to fill this gap by providing a number of nonparametric and parametric pairwise comparison tests, as well as outliers detection algorithms, implemented in Python.
25 | 
26 | Currently, the following tests are implemented in this package:
27 | 
28 | 1. *Parametric* pairwise multiple comparisons tests: Scheffe, Student T, Tamhane T2, and TukeyHSD tests.
29 | 2. *Non-parametric* tests for factorial design: Conover, Dunn, Dwass-Steel-Critchlow-Fligner, Mann-Whitney, Nashimoto-Wright (NPM), Nemenyi, van Waerden, and Wilcoxon tests.
30 | 3. *Non-parametric* tests for block design: Conover, Durbin and Conover, Miller, Nemenyi, Quade, and Siegel tests.
31 | 4. Additional tests: Anderson-Darling, Mack-Wolfe, and Hayter (OSRT) tests.
32 | 
33 | ``scikit-posthocs`` provides tests for outliers detection: the interquartile range (IQR) test, Grubbs test, Tietjen-Moore test, and generalized extreme Studentized deviate (ESD) test. It also has plotting functionality to present the results of pairwise comparisons as a heatmap (significance plot, see figure).
34 | 
35 | ![Significance plot](figure.png)
36 | 
37 | ``scikit-posthocs`` is compatible with Python 2 and 3 versions, relies heavily on and extends the functionality of the ``statsmodels``, ``SciPy``, and ``PMCMRplus`` packages [@Seabold2010], [@Jones2001], [@Pohlert2018]. It is also integrated with ``Pandas`` [@McKinney2010] and ``NumPy`` [@Oliphant2006] for efficient computations and data analysis.
The package is fully documented and comes with a Jupyter notebook example. 38 | 39 | # References 40 | -------------------------------------------------------------------------------- /images/flowchart.gv: -------------------------------------------------------------------------------- 1 | digraph { 2 | graph [truecolor=true, bgcolor="#ff000000"]; 3 | node [fontname="Roboto", fontsize=14, style="filled", fillcolor="#ffffff"]; 4 | 5 | factorial [ 6 | label="Factorial Design", 7 | style="filled", 8 | shape=box, 9 | fillcolor="#C7DFFF", 10 | color="#2666ba", 11 | fontcolor="#184074" 12 | ]; 13 | 14 | anova [ 15 | fillcolor="#eec4c6", 16 | color="#8d1e22", 17 | fontcolor="#581315", 18 | label="ANOVA", 19 | shape=box, 20 | style="filled", 21 | width=1.5, 22 | ]; 23 | 24 | param [ 25 | shape=box, 26 | style="rounded,filled", 27 | fontcolor="#746522", 28 | color="#baa136", 29 | fillcolor="#fff5cd", 30 | label="Parametric\nANOVA\n\nscipy.stats.f_oneway()\ntest_osrt()", 31 | width=0.5 32 | ]; 33 | 34 | param_posthocs [ 35 | shape=box, 36 | fontname="Iosevka", 37 | label="posthoc_scheffe()\nposthoc_tamhane()\nposthoc_ttest()\nposthoc_tukey()\nposthoc_tukey_hsd()" 38 | ]; 39 | 40 | nonparam [ 41 | shape=box, 42 | style="rounded,filled", 43 | fontcolor="#746522", 44 | color="#baa136", 45 | fillcolor="#fff5cd", 46 | label="Non-parametric\nANOVA\n\nscipy.stats.kruskal()", 47 | width=0.5 48 | ]; 49 | 50 | nonparam_posthocs [ 51 | shape=box, 52 | fontname="Iosevka", 53 | label="posthoc_conover()\nposthoc_dscf()\nposthoc_mannwhitney()\nposthoc_nemenyi()\nposthoc_dunn()\nposthoc_npm_test()\nposthoc_vanwaerden()\nposthoc_wilcoxon()" 54 | ]; 55 | 56 | blocked [ 57 | label="Block Design", 58 | shape=box, 59 | style="filled", 60 | fillcolor="#C7DFFF", 61 | color="#2666ba", 62 | fontcolor="#184074" 63 | ]; 64 | 65 | friedman [ 66 | fillcolor="#eec4c6", 67 | color="#8d1e22", 68 | fontcolor="#581315", 69 | label="Friedman test\nComplete block design\n\nscipy.stats.friedmanchisquare()", 70 | shape=box, 71 | style="filled", 72 | width=0.5, 73 | ]; 74 | 75 | friedman_posthocs [ 76 | shape=box, 77 | fontname="Iosevka" 78 | label="posthoc_conover_friedman()\nposthoc_miller_friedman()\nposthoc_nemenyi_friedman()\nposthoc_siegel_friedman()\nposthoc_quade()" 79 | ]; 80 | 81 | durbin [ 82 | fillcolor="#eec4c6", 83 | color="#8d1e22", 84 | fontcolor="#581315", 85 | label="Durbin test\nIncomplete block design\n\ntest_durbin()", 86 | shape=box, 87 | style="filled", 88 | width=0.5, 89 | ]; 90 | 91 | durbin_posthocs [ 92 | shape=box, 93 | fontname="Iosevka" 94 | label="posthoc_durbin()" 95 | ]; 96 | 97 | 98 | factorial -> anova; 99 | anova -> param; 100 | anova -> nonparam; 101 | 102 | param -> param_posthocs; 103 | nonparam -> nonparam_posthocs; 104 | 105 | blocked -> {friedman, durbin}; 106 | friedman -> friedman_posthocs; 107 | durbin -> durbin_posthocs; 108 | } 109 | -------------------------------------------------------------------------------- /CODE_OF_CONDUCT.md: -------------------------------------------------------------------------------- 1 | # Contributor Covenant Code of Conduct 2 | 3 | ## Our Pledge 4 | 5 | In the interest of fostering an open and welcoming environment, we as 6 | contributors and maintainers pledge to making participation in our project and 7 | our community a harassment-free experience for everyone, regardless of age, body 8 | size, disability, ethnicity, sex characteristics, gender identity and 9 | expression, level of experience, education, socio-economic status, nationality, 10 | personal 
appearance, race, religion, or sexual identity and orientation. 11 | 12 | ## Our Standards 13 | 14 | Examples of behavior that contributes to creating a positive environment 15 | include: 16 | 17 | - Using welcoming and inclusive language 18 | - Being respectful of differing viewpoints and experiences 19 | - Gracefully accepting constructive criticism 20 | - Focusing on what is best for the community 21 | - Showing empathy towards other community members 22 | 23 | Examples of unacceptable behavior by participants include: 24 | 25 | - The use of sexualized language or imagery and unwelcome sexual attention or 26 | advances 27 | 28 | - Trolling, insulting/derogatory comments, and personal or political attacks 29 | 30 | - Public or private harassment 31 | 32 | - Publishing others' private information, such as a physical or electronic 33 | address, without explicit permission 34 | 35 | - Other conduct which could reasonably be considered inappropriate in a 36 | professional setting 37 | 38 | ## Our Responsibilities 39 | 40 | Project maintainers are responsible for clarifying the standards of acceptable 41 | behavior and are expected to take appropriate and fair corrective action in 42 | response to any instances of unacceptable behavior. 43 | 44 | Project maintainers have the right and responsibility to remove, edit, or 45 | reject comments, commits, code, wiki edits, issues, and other contributions 46 | that are not aligned to this Code of Conduct, or to ban temporarily or 47 | permanently any contributor for other behaviors that they deem inappropriate, 48 | threatening, offensive, or harmful. 49 | 50 | ## Scope 51 | 52 | This Code of Conduct applies both within project spaces and in public spaces 53 | when an individual is representing the project or its community. Examples of 54 | representing a project or community include using an official project e-mail 55 | address, posting via an official social media account, or acting as an appointed 56 | representative at an online or offline event. Representation of a project may be 57 | further defined and clarified by project maintainers. 58 | 59 | ## Enforcement 60 | 61 | Instances of abusive, harassing, or otherwise unacceptable behavior may be 62 | reported by contacting the project maintainer. All complaints will be reviewed 63 | and investigated and will result in a response that is deemed necessary 64 | and appropriate to the circumstances. The project team is obligated to 65 | maintain confidentiality with regard to the reporter of an incident. 66 | Further details of specific enforcement policies may be posted separately. 67 | 68 | Project maintainers who do not follow or enforce the Code of Conduct in good 69 | faith may face temporary or permanent repercussions as determined by other 70 | members of the project's leadership. 
71 | 
72 | ## Attribution
73 | 
74 | This Code of Conduct is adapted from the [Contributor Covenant][homepage],
75 | version 1.4, available at
76 | <https://www.contributor-covenant.org/version/1/4/code-of-conduct.html>
77 | 
78 | For answers to common questions about this code of conduct, see
79 | <https://www.contributor-covenant.org/faq>
80 | 
81 | [homepage]: https://www.contributor-covenant.org
82 | 
--------------------------------------------------------------------------------
/DESCRIPTION.rst:
--------------------------------------------------------------------------------
1 | scikit-posthocs
2 | ===============
3 | 
4 | **scikit-posthocs** is a Python package that provides post hoc tests for
5 | pairwise multiple comparisons that are usually performed in statistical
6 | data analysis to assess the differences between group levels if a
7 | statistically significant result of an ANOVA test has been obtained.
8 | 
9 | **scikit-posthocs** is tightly integrated with Pandas DataFrames and NumPy
10 | arrays to ensure fast computations and convenient data import and storage.
11 | 
12 | This package will be useful for statisticians, data analysts, and researchers
13 | who use Python in their work.
14 | 
15 | 
16 | Background
17 | ----------
18 | 
19 | The Python statistical ecosystem comprises multiple packages. However, it still
20 | has numerous gaps and is surpassed by R and its packages.
21 | 
22 | `SciPy <https://scipy.org/>`_ (version 1.2.0) offers *Student*, *Wilcoxon*,
23 | and *Mann-Whitney* tests that are not adapted to multiple pairwise comparisons.
24 | `Statsmodels <https://www.statsmodels.org/>`_ (version 0.9.0) features
25 | the *TukeyHSD* test, which needs some extra actions to be fluently integrated
26 | into a data analysis pipeline. Statsmodels also has good helper methods:
27 | ``allpairtest`` (adapts an external function such
28 | as ``scipy.stats.ttest_ind`` to multiple pairwise comparisons) and
29 | ``multipletests`` (adjusts *p* values to minimize type I and II errors).
30 | `PMCMRplus <https://CRAN.R-project.org/package=PMCMRplus>`_ is a very good R package that
31 | has no rivals in Python as it offers more than 40 various tests (including post
32 | hoc tests) for factorial and block design data. PMCMRplus was an inspiration
33 | and a reference for *scikit-posthocs*.
34 | 
35 | *scikit-posthocs* attempts to improve Python statistical capabilities by
36 | offering many parametric and nonparametric post hoc tests along with
37 | outliers detection and basic plotting methods.
38 | 
39 | 
40 | Features
41 | --------
42 | 
43 | - *Parametric* pairwise multiple comparisons tests:
44 | 
45 |   - Scheffe test.
46 |   - Student T test.
47 |   - Tamhane T2 test.
48 |   - TukeyHSD test.
49 | 
50 | - *Non-parametric* tests for factorial design:
51 | 
52 |   - Conover test.
53 |   - Dunn test.
54 |   - Dwass, Steel, Critchlow, and Fligner test.
55 |   - Mann-Whitney test.
56 |   - Nashimoto and Wright (NPM) test.
57 |   - Nemenyi test.
58 |   - van Waerden test.
59 |   - Wilcoxon test.
60 | 
61 | - *Non-parametric* tests for block design:
62 | 
63 |   - Conover test.
64 |   - Durbin and Conover test.
65 |   - Miller test.
66 |   - Nemenyi test.
67 |   - Quade test.
68 |   - Siegel test.
69 | 
70 | - Other tests:
71 | 
72 |   - Anderson-Darling test.
73 |   - Mack-Wolfe test.
74 |   - Hayter (OSRT) test.
75 | 
76 | - Outliers detection tests:
77 | 
78 |   - Simple test based on interquartile range (IQR).
79 |   - Grubbs test.
80 |   - Tietjen-Moore test.
81 |   - Generalized Extreme Studentized Deviate test (ESD test).
82 | 
83 | - Plotting functionality (e.g. significance plots).
84 | 
85 | All post hoc tests are capable of p value adjustments for multiple pairwise
86 | comparisons.
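87 | 
88 | For example, a Conover test with Holm step-down adjustment (the data and the
89 | ``p_adjust`` argument below mirror the bundled usage-examples notebook):
90 | 
91 | .. code:: python
92 | 
93 |    import scikit_posthocs as sp
94 | 
95 |    x = [[1, 2, 3, 5, 1], [12, 31, 54], [10, 12, 6, 74, 11]]
96 |    sp.posthoc_conover(x, p_adjust='holm')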
97 | 
98 | Dependencies
99 | ------------
100 | 
101 | - `NumPy and SciPy packages <https://scipy.org/>`_
102 | - `Statsmodels <https://www.statsmodels.org/>`_
103 | - `Pandas <https://pandas.pydata.org/>`_
104 | - `Matplotlib <https://matplotlib.org/>`_
105 | - `Seaborn <https://seaborn.pydata.org/>`_
106 | 
107 | Compatibility
108 | -------------
109 | 
110 | The package is compatible with Python 3 (3.9 or later).
111 | 
112 | Install
113 | -------
114 | 
115 | You can install the package using ``pip``:
116 | 
117 | .. code:: bash
118 | 
119 |    $ pip install scikit-posthocs
120 | 
--------------------------------------------------------------------------------
/paper/generate.rb:
--------------------------------------------------------------------------------
1 | #!/usr/bin/ruby
2 | 
3 | # For an OO language, this is distinctly procedural. Should probably fix that.
4 | require 'json'
5 | 
6 | details = Hash.new({})
7 | 
8 | capture_params = [
9 |   { :name => "title", :message => "Enter project name." },
10 |   { :name => "url", :message => "Enter the URL of the project repository." },
11 |   { :name => "description", :message => "Enter the (short) project description." },
12 |   { :name => "license", :message => "Enter the license this software is shared under. (hit enter to skip)\nFor example MIT, BSD, GPL v3.0, Apache 2.0" },
13 |   { :name => "doi", :message => "Enter the DOI of the archived version of this code. (hit enter to skip)\nFor example http://dx.doi.org/10.6084/m9.figshare.828487" },
14 |   { :name => "keywords", :message => "Enter keywords that should be associated with this project (hit enter to skip)\nComma-separated, for example: turkey, chicken, pot pie" },
15 |   { :name => "version", :message => "Enter the version of your software (hit enter to skip)\nSEMVER preferred: http://semver.org e.g. v1.0.0" }
16 | ]
17 | 
18 | puts "I'm going to try and help you prepare some things for your JOSS submission"
19 | puts "If all goes well then we'll have a nice codemeta.json file soon..."
20 | puts ""
21 | puts "************************************"
22 | puts "*    First, some basic details     *"
23 | puts "************************************"
24 | puts ""
25 | 
26 | # Loop through the desired captures and print out for clarity
27 | capture_params.each do |param|
28 |   puts param[:message]
29 |   print "> "
30 |   input = gets
31 | 
32 |   details[param[:name]] = input.chomp
33 | 
34 |   puts ""
35 |   puts "OK, your project has #{param[:name]}: #{input}"
36 |   puts ""
37 | end
38 | 
39 | puts ""
40 | puts "************************************"
41 | puts "*      Experimental stuff          *"
42 | puts "************************************"
43 | puts ""
44 | 
45 | puts "Would you like me to try and build a list of authors for you?"
46 | puts "(You need to be running this script in a git repository for this to work)"
47 | print "> (Y/N)"
48 | answer = gets.chomp
49 | 
50 | case answer.downcase
51 | when "y", "yes"
52 | 
53 |   # Use git shortlog to extract a list of author names and commit counts.
54 |   # Note we don't extract emails here as there's often different emails for
55 |   # each user. Instead we capture emails at the end.
56 | 
57 |   git_log = `git shortlog --summary --numbered --no-merges`
58 | 
59 |   # ["252\tMichael Jackson", "151\tMC Hammer"]
60 |   authors_and_counts = git_log.split("\n").map(&:strip)
61 | 
62 |   authors_and_counts.each do |author_count|
63 |     count, author = author_count.split("\t").map(&:strip)
64 | 
65 |     puts "Looks like #{author} made #{count} commits"
66 |     puts "Add them to the output?"
67 | print "> (Y/N)" 68 | answer = gets.chomp 69 | 70 | # If a user chooses to add this author to the output then we ask for some 71 | # additional information including their email, ORCID and affiliation. 72 | case answer.downcase 73 | when "y", "yes" 74 | puts "What is #{author}'s email address? (hit enter to skip)" 75 | print "> " 76 | email = gets.chomp 77 | 78 | puts "What is #{author}'s ORCID? (hit enter to skip)" 79 | puts "For example: http://orcid.org/0000-0000-0000-0000" 80 | print "> " 81 | orcid = gets.chomp 82 | 83 | puts "What is #{author}'s affiliation? (hit enter to skip)" 84 | print "> " 85 | affiliation = gets.chomp 86 | 87 | 88 | details['authors'].merge!(author => { 'commits' => count, 89 | 'email' => email, 90 | 'orcid' => orcid, 91 | 'affiliation' => affiliation }) 92 | 93 | when "n", "no" 94 | puts "OK boss..." 95 | puts "" 96 | end 97 | end 98 | when "n", "no" 99 | puts "OK boss..." 100 | puts "" 101 | end 102 | 103 | puts "Reticulating splines" 104 | 105 | 5.times do 106 | print "." 107 | sleep 0.5 108 | end 109 | 110 | puts "" 111 | puts "Generating some JSON goodness..." 112 | 113 | # TODO: work out how to use some kind of JSON template here. 114 | # Build the output list of authors from the inputs we've collected. 115 | output_authors = [] 116 | 117 | details['authors'].each do |author_name, values| 118 | entry = { 119 | "@id" => values['orcid'], 120 | "@type" => "Person", 121 | "email" => values['email'], 122 | "name" => author_name, 123 | "affiliation" => values['affiliation'] 124 | } 125 | output_authors << entry 126 | end 127 | 128 | # TODO: this is currently a static template (written out here). It would be good 129 | # to do something smarter here. 130 | output = { 131 | "@context" => "https://raw.githubusercontent.com/codemeta/codemeta/master/codemeta.jsonld", 132 | "@type" => "Code", 133 | "author" => output_authors, 134 | "identifier" => details['doi'], 135 | "codeRepository" => details['url'], 136 | "datePublished" => Time.now.strftime("%Y-%m-%d"), 137 | "dateModified" => Time.now.strftime("%Y-%m-%d"), 138 | "dateCreated" => Time.now.strftime("%Y-%m-%d"), 139 | "description" => details['description'], 140 | "keywords" => details['keywords'], 141 | "license" => details['license'], 142 | "title" => details['title'], 143 | "version" => details['version'] 144 | } 145 | 146 | File.open('codemeta.json', 'w') {|f| f.write(JSON.pretty_generate(output)) } 147 | -------------------------------------------------------------------------------- /docs/source/conf.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # 3 | # Configuration file for the Sphinx documentation builder. 4 | # 5 | # This file does only contain a selection of the most common options. For a 6 | # full list see the documentation: 7 | # http://www.sphinx-doc.org/en/master/config 8 | 9 | 10 | # -- Path setup -------------------------------------------------------------- 11 | 12 | # If extensions (or modules to document with autodoc) are in another directory, 13 | # add these directories to sys.path here. If the directory is relative to the 14 | # documentation root, use os.path.abspath to make it absolute, like shown here. 
15 | # 16 | #import os 17 | #import sys 18 | #import scikit_posthocs 19 | #import sphinx_rtd_theme 20 | 21 | #sys.path.insert(0, os.path.abspath('../../')) 22 | 23 | 24 | # -- Project information ----------------------------------------------------- 25 | 26 | project = 'scikit-posthocs' 27 | author = 'Maksim Terpilowski' 28 | 29 | # The short X.Y version 30 | version = '0.7.0' 31 | # The full version, including alpha/beta/rc tags 32 | release = '0.7.0' 33 | 34 | 35 | # -- General configuration --------------------------------------------------- 36 | 37 | # If your documentation needs a minimal Sphinx version, state it here. 38 | # 39 | # needs_sphinx = '1.0' 40 | 41 | # Add any Sphinx extension module names here, as strings. They can be 42 | # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom 43 | # ones. 44 | extensions = [ 45 | 'sphinx.ext.autosummary', 46 | 'sphinx.ext.napoleon', 47 | 'sphinx.ext.coverage', 48 | 'sphinx.ext.mathjax', 49 | ] 50 | 51 | autosummary_generate = True 52 | 53 | # Add any paths that contain templates here, relative to this directory. 54 | templates_path = ['_templates'] 55 | 56 | # The suffix(es) of source filenames. 57 | # You can specify multiple suffix as a list of string: 58 | # 59 | # source_suffix = ['.rst', '.md'] 60 | source_suffix = '.rst' 61 | 62 | # The master toctree document. 63 | master_doc = 'index' 64 | 65 | # The language for content autogenerated by Sphinx. Refer to documentation 66 | # for a list of supported languages. 67 | # 68 | # This is also used if you do content translation via gettext catalogs. 69 | # Usually you set "language" from the command line for these cases. 70 | language = 'en' 71 | 72 | # List of patterns, relative to source directory, that match files and 73 | # directories to ignore when looking for source files. 74 | # This pattern also affects html_static_path and html_extra_path. 75 | exclude_patterns = ['_build', 'Thumbs.db', '.DS_Store'] 76 | 77 | # The name of the Pygments (syntax highlighting) style to use. 78 | pygments_style = None 79 | 80 | 81 | # -- Options for HTML output ------------------------------------------------- 82 | 83 | # The theme to use for HTML and HTML Help pages. See the documentation for 84 | # a list of builtin themes. 85 | # 86 | html_theme = 'sphinx_rtd_theme' 87 | #html_theme_path = sphinx_bootstrap_theme.get_html_theme_path() 88 | 89 | # Theme options are theme-specific and customize the look and feel of a theme 90 | # further. For a list of options available for each theme, see the 91 | # documentation. 92 | # 93 | html_theme_options = { 94 | 'collapse_navigation': False, 95 | 'sticky_navigation': False, 96 | 'navigation_depth': 4, 97 | 'includehidden': True, 98 | 'titles_only': False 99 | } 100 | 101 | # Add any paths that contain custom static files (such as style sheets) here, 102 | # relative to this directory. They are copied after the builtin static files, 103 | # so a file named "default.css" will overwrite the builtin "default.css". 104 | html_static_path = ['_static'] 105 | 106 | # Custom sidebar templates, must be a dictionary that maps document names 107 | # to template names. 108 | # 109 | # The default sidebars (for documents that don't match any pattern) are 110 | # defined by theme itself. Builtin themes are using these templates by 111 | # default: ``['localtoc.html', 'relations.html', 'sourcelink.html', 112 | # 'searchbox.html']``. 
113 | # 114 | # html_sidebars = {} 115 | 116 | 117 | # -- Options for HTMLHelp output --------------------------------------------- 118 | 119 | # Output file base name for HTML help builder. 120 | htmlhelp_basename = 'scikit-posthocs-doc' 121 | 122 | 123 | # -- Options for LaTeX output ------------------------------------------------ 124 | 125 | latex_elements = { 126 | # The paper size ('letterpaper' or 'a4paper'). 127 | # 128 | # 'papersize': 'letterpaper', 129 | 130 | # The font size ('10pt', '11pt' or '12pt'). 131 | # 132 | # 'pointsize': '10pt', 133 | 134 | # Additional stuff for the LaTeX preamble. 135 | # 136 | # 'preamble': '', 137 | 138 | # Latex figure (float) alignment 139 | # 140 | # 'figure_align': 'htbp', 141 | } 142 | 143 | # Grouping the document tree into LaTeX files. List of tuples 144 | # (source start file, target name, title, 145 | # author, documentclass [howto, manual, or own class]). 146 | latex_documents = [ 147 | (master_doc, 'scikit-posthocs.tex', 'scikit-posthocs Documentation', 148 | 'Maksim Terpilowski', 'manual'), 149 | ] 150 | 151 | 152 | # -- Options for manual page output ------------------------------------------ 153 | 154 | # One entry per manual page. List of tuples 155 | # (source start file, name, description, authors, manual section). 156 | man_pages = [ 157 | (master_doc, 'scikit-posthocs', 'scikit-posthocs Documentation', 158 | [author], 1) 159 | ] 160 | 161 | 162 | # -- Options for Texinfo output ---------------------------------------------- 163 | 164 | # Grouping the document tree into Texinfo files. List of tuples 165 | # (source start file, target name, title, author, 166 | # dir menu entry, description, category) 167 | texinfo_documents = [ 168 | (master_doc, 'scikit-posthocs', 'scikit-posthocs Documentation', 169 | author, 'scikit-posthocs', 'One line description of project.', 170 | 'Miscellaneous'), 171 | ] 172 | 173 | 174 | # -- Options for Epub output ------------------------------------------------- 175 | 176 | # Bibliographic Dublin Core info. 177 | epub_title = project 178 | 179 | # The unique identifier of the text. This can be a ISBN number 180 | # or the project homepage. 181 | # 182 | # epub_identifier = '' 183 | 184 | # A unique identification for the text. 185 | # 186 | # epub_uid = '' 187 | 188 | # A list of files that should not be packed into the epub file. 
189 | epub_exclude_files = ['search.html'] 190 | 191 | 192 | # -- Extension configuration ------------------------------------------------- 193 | -------------------------------------------------------------------------------- /usage-examples.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "import scikit_posthocs as sp\n", 10 | "import numpy as np\n", 11 | "import pandas as pd" 12 | ] 13 | }, 14 | { 15 | "cell_type": "markdown", 16 | "metadata": {}, 17 | "source": [ 18 | "## Without p adjustments" 19 | ] 20 | }, 21 | { 22 | "cell_type": "code", 23 | "execution_count": 2, 24 | "metadata": {}, 25 | "outputs": [ 26 | { 27 | "data": { 28 | "text/plain": [ 29 | "array([[ -1.00000000e+00, 3.98391078e-04, 1.39164426e-03],\n", 30 | " [ 3.98391078e-04, -1.00000000e+00, 1.86722274e-01],\n", 31 | " [ 1.39164426e-03, 1.86722274e-01, -1.00000000e+00]])" 32 | ] 33 | }, 34 | "execution_count": 2, 35 | "metadata": {}, 36 | "output_type": "execute_result" 37 | } 38 | ], 39 | "source": [ 40 | "x = [[1,2,3,5,1], [12,31,54], [10,12,6,74,11]]\n", 41 | "sp.posthoc_conover(x)" 42 | ] 43 | }, 44 | { 45 | "cell_type": "markdown", 46 | "metadata": {}, 47 | "source": [ 48 | "## With Holm p adjustment" 49 | ] 50 | }, 51 | { 52 | "cell_type": "code", 53 | "execution_count": 3, 54 | "metadata": {}, 55 | "outputs": [ 56 | { 57 | "data": { 58 | "text/plain": [ 59 | "array([[-1. , 0.00119517, 0.00278329],\n", 60 | " [ 0.00119517, -1. , 0.18672227],\n", 61 | " [ 0.00278329, 0.18672227, -1. ]])" 62 | ] 63 | }, 64 | "execution_count": 3, 65 | "metadata": {}, 66 | "output_type": "execute_result" 67 | } 68 | ], 69 | "source": [ 70 | "sp.posthoc_conover(x, p_adjust = 'holm')" 71 | ] 72 | }, 73 | { 74 | "cell_type": "markdown", 75 | "metadata": {}, 76 | "source": [ 77 | "## Exporting to pandas" 78 | ] 79 | }, 80 | { 81 | "cell_type": "code", 82 | "execution_count": 4, 83 | "metadata": {}, 84 | "outputs": [ 85 | { 86 | "data": { 87 | "text/html": [ 88 | "
\n", 89 | "\n", 102 | "\n", 103 | " \n", 104 | " \n", 105 | " \n", 106 | " \n", 107 | " \n", 108 | " \n", 109 | " \n", 110 | " \n", 111 | " \n", 112 | " \n", 113 | " \n", 114 | " \n", 115 | " \n", 116 | " \n", 117 | " \n", 118 | " \n", 119 | " \n", 120 | " \n", 121 | " \n", 122 | " \n", 123 | " \n", 124 | " \n", 125 | " \n", 126 | " \n", 127 | " \n", 128 | " \n", 129 | " \n", 130 | " \n", 131 | " \n", 132 | " \n", 133 | " \n", 134 | " \n", 135 | " \n", 136 | " \n", 137 | " \n", 138 | " \n", 139 | " \n", 140 | " \n", 141 | " \n", 142 | " \n", 143 | " \n", 144 | " \n", 145 | " \n", 146 | " \n", 147 | " \n", 148 | " \n", 149 | " \n", 150 | " \n", 151 | " \n", 152 | " \n", 153 | " \n", 154 | " \n", 155 | " \n", 156 | " \n", 157 | " \n", 158 | " \n", 159 | " \n", 160 | " \n", 161 | " \n", 162 | " \n", 163 | " \n", 164 | " \n", 165 | " \n", 166 | " \n", 167 | " \n", 168 | " \n", 169 | " \n", 170 | " \n", 171 | " \n", 172 | " \n", 173 | " \n", 174 | " \n", 175 | " \n", 176 | " \n", 177 | "
groupsvals
0a1
1a2
2a3
3a5
4a1
5b12
6b31
7b54
8c10
9c12
10c6
11c74
12c11
\n", 178 | "
" 179 | ], 180 | "text/plain": [ 181 | " groups vals\n", 182 | "0 a 1\n", 183 | "1 a 2\n", 184 | "2 a 3\n", 185 | "3 a 5\n", 186 | "4 a 1\n", 187 | "5 b 12\n", 188 | "6 b 31\n", 189 | "7 b 54\n", 190 | "8 c 10\n", 191 | "9 c 12\n", 192 | "10 c 6\n", 193 | "11 c 74\n", 194 | "12 c 11" 195 | ] 196 | }, 197 | "execution_count": 4, 198 | "metadata": {}, 199 | "output_type": "execute_result" 200 | } 201 | ], 202 | "source": [ 203 | "groups = [['a']*5, ['b']*3, ['c']*5]\n", 204 | "df = pd.DataFrame({'vals': np.concatenate(x), 'groups': np.concatenate(groups)})\n", 205 | "df" 206 | ] 207 | }, 208 | { 209 | "cell_type": "code", 210 | "execution_count": 5, 211 | "metadata": {}, 212 | "outputs": [ 213 | { 214 | "data": { 215 | "text/html": [ 216 | "
\n", 217 | "\n", 230 | "\n", 231 | " \n", 232 | " \n", 233 | " \n", 234 | " \n", 235 | " \n", 236 | " \n", 237 | " \n", 238 | " \n", 239 | " \n", 240 | " \n", 241 | " \n", 242 | " \n", 243 | " \n", 244 | " \n", 245 | " \n", 246 | " \n", 247 | " \n", 248 | " \n", 249 | " \n", 250 | " \n", 251 | " \n", 252 | " \n", 253 | " \n", 254 | " \n", 255 | " \n", 256 | " \n", 257 | " \n", 258 | " \n", 259 | "
abc
a-1.0000000.0003980.001392
b0.000398-1.0000000.186722
c0.0013920.186722-1.000000
\n", 260 | "
" 261 | ], 262 | "text/plain": [ 263 | " a b c\n", 264 | "a -1.000000 0.000398 0.001392\n", 265 | "b 0.000398 -1.000000 0.186722\n", 266 | "c 0.001392 0.186722 -1.000000" 267 | ] 268 | }, 269 | "execution_count": 5, 270 | "metadata": {}, 271 | "output_type": "execute_result" 272 | } 273 | ], 274 | "source": [ 275 | "result = sp.posthoc_conover(df, val_col='vals', group_col='groups')\n", 276 | "result" 277 | ] 278 | } 279 | ], 280 | "metadata": { 281 | "kernelspec": { 282 | "display_name": "Python 3", 283 | "language": "python", 284 | "name": "python3" 285 | }, 286 | "language_info": { 287 | "codemirror_mode": { 288 | "name": "ipython", 289 | "version": 3 290 | }, 291 | "file_extension": ".py", 292 | "mimetype": "text/x-python", 293 | "name": "python", 294 | "nbconvert_exporter": "python", 295 | "pygments_lexer": "ipython3", 296 | "version": "3.6.3" 297 | } 298 | }, 299 | "nbformat": 4, 300 | "nbformat_minor": 2 301 | } 302 | -------------------------------------------------------------------------------- /scikit_posthocs/_outliers.py: -------------------------------------------------------------------------------- 1 | from typing import Union, List 2 | import numpy as np 3 | from numpy.typing import ArrayLike 4 | from scipy.stats import t 5 | 6 | 7 | def outliers_iqr( 8 | x: Union[List, np.ndarray], ret: str = "filtered", coef: float = 1.5 9 | ) -> np.ndarray: 10 | """Simple detection of potential outliers based on interquartile range 11 | (IQR). Data that lie within the lower and upper limits are considered 12 | non-outliers. The lower limit is the number that lies 1.5 IQRs below 13 | (coefficient may be changed with an argument, see Parameters) 14 | the first quartile; the upper limit is the number that lies 1.5 IQRs 15 | above the third quartile. 16 | 17 | Parameters 18 | ---------- 19 | x : Union[List, np.ndarray] 20 | An array, any object exposing the array interface, containing 21 | p values. 22 | 23 | ret : str = 'filtered' 24 | Specifies object to be returned. Available options are: 25 | 26 | - ``filtered``: return a filtered array (default) 27 | - ``outliers``: return outliers 28 | - ``indices``: return indices of non-outliers 29 | - ``outliers_indices``: return indices of outliers 30 | 31 | coef : float = 1.5 32 | Coefficient by which IQR is multiplied. 33 | 34 | Returns 35 | ------- 36 | numpy.ndarray 37 | One of the following objects: 38 | 39 | - Filtered array (default) if ``ret`` is set to ``filtered``. 40 | - Array with indices of elements lying within the specified limits 41 | if ``ret`` is set to ``indices``. 42 | - Array with outliers if ``ret`` is set to ``outliers``. 43 | - Array with indices of outlier elements 44 | if ``ret`` is set to ``outliers_indices``. 45 | 46 | Examples 47 | -------- 48 | >>> x = np.array([4, 5, 6, 10, 12, 4, 3, 1, 2, 3, 23, 5, 3]) 49 | >>> outliers_iqr(x, ret = 'outliers') 50 | array([12, 23]) 51 | """ 52 | arr = np.copy(x) 53 | 54 | q1, q3 = np.percentile(arr, [25, 75]) 55 | iqr = q3 - q1 56 | ll = q1 - iqr * coef 57 | ul = q3 + iqr * coef 58 | 59 | if ret == "indices": 60 | return np.where((arr > ll) & (arr < ul))[0] 61 | elif ret == "outliers": 62 | return arr[(arr < ll) | (arr > ul)] 63 | elif ret == "outliers_indices": 64 | return np.where((arr < ll) | (arr > ul))[0] 65 | else: 66 | return x[(x > ll) & (x < ul)] 67 | 68 | 69 | def outliers_grubbs( 70 | x: Union[List, np.ndarray], hypo: bool = False, alpha: float = 0.05 71 | ) -> Union[np.ndarray, bool]: 72 | """Grubbs' Test for Outliers [1]_. This is the two-sided version 73 | of the test. 
The null hypothesis implies that there are no outliers 74 | in the data set. 75 | 76 | Parameters 77 | ---------- 78 | x : Union[List, np.ndarray] 79 | An array, any object exposing the array interface, containing 80 | data to test for an outlier in. 81 | 82 | hypo : bool = False 83 | Specifies whether to return a bool value of a hypothesis test result. 84 | Returns ``True`` when we can reject the null hypothesis. 85 | Otherwise, ``False``. Available options are: 86 | 87 | - ``True``: return a hypothesis test result 88 | - ``False``: return a filtered array without an outlier (default) 89 | 90 | alpha : float = 0.05 91 | Significance level for a hypothesis test. 92 | 93 | Returns 94 | ------- 95 | Union[np.ndarray, bool] 96 | Returns a filtered array if alternative hypothesis is true, otherwise 97 | an unfiltered array. Returns null hypothesis test result instead of an 98 | array if ``hypo`` argument is set to ``True``. 99 | 100 | Notes 101 | ----- 102 | .. [1] http://www.itl.nist.gov/div898/handbook/eda/section3/eda35h1.htm 103 | 104 | Examples 105 | -------- 106 | >>> x = np.array([199.31,199.53,200.19,200.82,201.92,201.95,202.18,245.57]) 107 | >>> ph.outliers_grubbs(x) 108 | array([ 199.31, 199.53, 200.19, 200.82, 201.92, 201.95, 202.18]) 109 | """ 110 | arr = np.copy(x) 111 | val = np.max(np.abs(arr - np.mean(arr))) 112 | ind = np.argmax(np.abs(arr - np.mean(arr))) 113 | G = val / np.std(arr, ddof=1) 114 | N = len(arr) 115 | result = G > (N - 1) / np.sqrt(N) * np.sqrt( 116 | (t.ppf(1 - alpha / (2 * N), N - 2) ** 2) / (N - 2 + t.ppf(1 - alpha / (2 * N), N - 2) ** 2) 117 | ) 118 | 119 | if hypo: 120 | return result 121 | else: 122 | if result: 123 | return np.delete(arr, ind) 124 | else: 125 | return arr 126 | 127 | 128 | def outliers_tietjen( 129 | x: Union[List, np.ndarray], k: int, hypo: bool = False, alpha: float = 0.05 130 | ) -> Union[np.ndarray, bool]: 131 | """Tietjen-Moore test [1]_ to detect multiple outliers in a univariate 132 | data set that follows an approximately normal distribution. 133 | The Tietjen-Moore test [2]_ is a generalization of the Grubbs' test to 134 | the case of multiple outliers. If testing for a single outlier, 135 | the Tietjen-Moore test is equivalent to the Grubbs' test. 136 | 137 | The null hypothesis implies that there are no outliers in the data set. 138 | 139 | Parameters 140 | ---------- 141 | x : Union[List, np.ndarray] 142 | An array, any object exposing the array interface, containing 143 | data to test for an outlier in. 144 | 145 | k : int 146 | Number of potential outliers to test for. Function tests for 147 | outliers in both tails. 148 | 149 | hypo : bool = False 150 | Specifies whether to return a bool value of a hypothesis test result. 151 | Returns ``True`` when we can reject the null hypothesis. 152 | Otherwise, ``False``. Available options are: 153 | 154 | - ``True``: return a hypothesis test result 155 | - ``False``: return a filtered array without outliers (default). 156 | 157 | alpha : float = 0.05 158 | Significance level for a hypothesis test. 159 | 160 | Returns 161 | ------- 162 | Union[numpy.ndarray, bool] 163 | Returns a filtered array if alternative hypothesis is true, otherwise 164 | an unfiltered array. Returns null hypothesis test result instead of an 165 | array if ``hypo`` argument is set to True. 166 | 167 | Notes 168 | ----- 169 | .. [1] Tietjen and Moore (August 1972), Some Grubbs-Type Statistics 170 | for the Detection of Outliers, Technometrics, 14(3), pp. 583-597. 171 | .. 
[2] http://www.itl.nist.gov/div898/handbook/eda/section3/eda35h2.htm 172 | 173 | Examples 174 | -------- 175 | >>> x = np.array([-1.40, -0.44, -0.30, -0.24, -0.22, -0.13, -0.05, 0.06, 176 | 0.10, 0.18, 0.20, 0.39, 0.48, 0.63, 1.01]) 177 | >>> outliers_tietjen(x, 2) 178 | array([-0.44, -0.3 , -0.24, -0.22, -0.13, -0.05, 0.06, 0.1 , 0.18, 179 | 0.2 , 0.39, 0.48, 0.63]) 180 | """ 181 | arr = np.copy(x) 182 | n = arr.size 183 | 184 | def tietjen(x_, k_): 185 | x_mean = x_.mean() 186 | r = np.abs(x_ - x_mean) 187 | z = x_[r.argsort()] 188 | E = np.sum((z[:-k_] - z[:-k_].mean()) ** 2) / np.sum((z - x_mean) ** 2) 189 | return E 190 | 191 | e_x = tietjen(arr, k) 192 | e_norm = np.zeros(10000) 193 | 194 | for i in np.arange(10000): 195 | norm = np.random.normal(size=n) 196 | e_norm[i] = tietjen(norm, k) 197 | 198 | CV = np.percentile(e_norm, alpha * 100) 199 | result = e_x < CV 200 | 201 | if hypo: 202 | return result 203 | else: 204 | if result: 205 | ind = np.argpartition(np.abs(arr - arr.mean()), -k)[-k:] 206 | return np.delete(arr, ind) 207 | else: 208 | return arr 209 | 210 | 211 | def outliers_gesd( 212 | x: ArrayLike, 213 | outliers: int = 5, 214 | hypo: bool = False, 215 | report: bool = False, 216 | alpha: float = 0.05, 217 | ) -> np.ndarray: 218 | """The generalized (Extreme Studentized Deviate) ESD test is used 219 | to detect one or more outliers in a univariate data set that follows 220 | an approximately normal distribution [1]_. 221 | 222 | Parameters 223 | ---------- 224 | x : Union[List, np.ndarray] 225 | An array, any object exposing the array interface, containing 226 | data to test for outliers. 227 | 228 | outliers : int = 5 229 | Number of potential outliers to test for. Test is two-tailed, i.e. 230 | maximum and minimum values are checked for potential outliers. 231 | 232 | hypo : bool = False 233 | Specifies whether to return a bool value of a hypothesis test result. 234 | Returns True when we can reject the null hypothesis. Otherwise, False. 235 | Available options are: 236 | 1) True - return a hypothesis test result. 237 | 2) False - return a filtered array without an outlier (default). 238 | 239 | report : bool = False 240 | Specifies whether to print a summary table of the test. 241 | 242 | alpha : float = 0.05 243 | Significance level for a hypothesis test. 244 | 245 | Returns 246 | ------- 247 | np.ndarray 248 | If hypo is True, returns a boolean array where True indicates an outlier. 249 | If hypo is False, returns the filtered array with outliers removed. 250 | 251 | Notes 252 | ----- 253 | .. [1] Rosner, Bernard (May 1983), Percentage Points for a Generalized 254 | ESD Many-Outlier Procedure,Technometrics, 25(2), pp. 165-172. 
255 | 256 | Examples 257 | -------- 258 | >>> data = np.array([-0.25, 0.68, 0.94, 1.15, 1.2, 1.26, 1.26, 1.34, 259 | 1.38, 1.43, 1.49, 1.49, 1.55, 1.56, 1.58, 1.65, 1.69, 1.7, 1.76, 260 | 1.77, 1.81, 1.91, 1.94, 1.96, 1.99, 2.06, 2.09, 2.1, 2.14, 2.15, 261 | 2.23, 2.24, 2.26, 2.35, 2.37, 2.4, 2.47, 2.54, 2.62, 2.64, 2.9, 262 | 2.92, 2.92, 2.93, 3.21, 3.26, 3.3, 3.59, 3.68, 4.3, 4.64, 5.34, 263 | 5.42, 6.01]) 264 | >>> outliers_gesd(data, 5) 265 | array([-0.25, 0.68, 0.94, 1.15, 1.2 , 1.26, 1.26, 1.34, 1.38, 266 | 1.43, 1.49, 1.49, 1.55, 1.56, 1.58, 1.65, 1.69, 1.7 , 267 | 1.76, 1.77, 1.81, 1.91, 1.94, 1.96, 1.99, 2.06, 2.09, 268 | 2.1 , 2.14, 2.15, 2.23, 2.24, 2.26, 2.35, 2.37, 2.4 , 269 | 2.47, 2.54, 2.62, 2.64, 2.9 , 2.92, 2.92, 2.93, 3.21, 270 | 3.26, 3.3 , 3.59, 3.68, 4.3 , 4.64]) 271 | >>> outliers_gesd(data, outliers = 5, report = True) 272 | H0: no outliers in the data 273 | Ha: up to 5 outliers in the data 274 | Significance level: α = 0.05 275 | Reject H0 if Ri > Critical Value (λi) 276 | Summary Table for Two-Tailed Test 277 | --------------------------------------- 278 | Exact Test Critical 279 | Number of Statistic Value, λi 280 | Outliers, i Value, Ri 5 % 281 | --------------------------------------- 282 | 1 3.119 3.159 283 | 2 2.943 3.151 284 | 3 3.179 3.144 * 285 | 4 2.81 3.136 286 | 5 2.816 3.128 287 | """ 288 | rs, ls = np.zeros(outliers, dtype=float), np.zeros(outliers, dtype=float) 289 | ms = [] 290 | 291 | data_proc = np.copy(x) 292 | argsort_index = np.argsort(data_proc) 293 | data = data_proc[argsort_index] 294 | n = data_proc.size 295 | 296 | # Lambda values (critical values): do not depend on the outliers. 297 | nol = np.arange(outliers) # the number of outliers 298 | df = n - nol - 2 # degrees of freedom 299 | t_ppr = t.ppf(1 - alpha / (2 * (n - nol)), df) 300 | ls = ((n - nol - 1) * t_ppr) / np.sqrt((df + t_ppr**2) * (n - nol)) 301 | 302 | for i in np.arange(outliers): 303 | abs_d = np.abs(data_proc - np.mean(data_proc)) 304 | 305 | # R-value calculation 306 | R = np.max(abs_d) / np.std(data_proc, ddof=1) 307 | rs[i] = R 308 | 309 | # Masked values 310 | lms = ms[-1] if len(ms) > 0 else [] 311 | ms.append(lms + [np.where(data == data_proc[np.argmax(abs_d)])[0][0]]) 312 | 313 | # Remove the observation that maximizes |xi − xmean| 314 | data_proc = np.delete(data_proc, np.argmax(abs_d)) 315 | 316 | if report: 317 | report_str = [ 318 | "H0: no outliers in the data", 319 | "Ha: up to " + str(outliers) + " outliers in the data", 320 | "Significance level: α = " + str(alpha), 321 | "Reject H0 if Ri > Critical Value (λi)", 322 | "", 323 | "Summary Table for Two-Tailed Test", 324 | "---------------------------------------", 325 | " Exact Test Critical", 326 | " Number of Statistic Value, λi", 327 | "Outliers, i Value, Ri {:5.3g} %".format(100 * alpha), 328 | "---------------------------------------", 329 | ] 330 | 331 | for i, (stat, crit_val) in enumerate(zip(rs, ls)): 332 | report_str.append( 333 | "{: >11s}".format(str(i + 1)) 334 | + "{: >15s}".format(str(np.round(stat, 3))) 335 | + "{: >13s}".format(str(np.round(crit_val, 3))) 336 | + (" *" if stat > crit_val else "") 337 | ) 338 | 339 | print("\n".join(report_str)) 340 | 341 | # Remove masked values 342 | # for which the test statistic is greater 343 | # than the critical value and return the result 344 | if hypo: 345 | data = np.zeros(n, dtype=bool) 346 | if any(rs > ls): 347 | data[ms[np.max(np.where(rs > ls))]] = True 348 | return data 349 | else: 350 | if any(rs > ls): 351 | return np.delete(data, 
ms[np.max(np.where(rs > ls))])
352 | return data
353 |
--------------------------------------------------------------------------------
/scikit_posthocs/_omnibus.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | from typing import Optional, Union, List, cast
4 | import itertools as it
5 | import numpy as np
6 | from numpy.typing import ArrayLike
7 | import scipy.stats as ss
8 | from pandas import DataFrame, Categorical, Series
9 | from scikit_posthocs._posthocs import __convert_to_df, __convert_to_block_df
10 |
11 |
12 | def test_mackwolfe(
13 |     data: Union[ArrayLike, DataFrame],
14 |     val_col: Optional[str] = None,
15 |     group_col: Optional[str] = None,
16 |     p: Optional[int] = None,
17 |     n_perm: int = 100,
18 |     sort: bool = False,
19 | ) -> tuple[float, float]:
20 |     """Mack-Wolfe Test for Umbrella Alternatives.
21 |
22 |     In dose-finding studies one may assume an increasing treatment effect with
23 |     increasing dose level. However, the test subject may actually succumb to
24 |     toxic effects at high doses, which leads to decreasing treatment
25 |     effects [1]_, [2]_.
26 |
27 |     The Mack-Wolfe test is used to test for umbrella alternatives for
28 |     either a known or an unknown point P (i.e. dose level), where the peak
29 |     (umbrella point) is present.
30 |
31 |     Parameters
32 |     ----------
33 |     data : Union[List, numpy.ndarray, DataFrame]
34 |         An array, any object exposing the array interface or a pandas
35 |         DataFrame with data values.
36 |
37 |     val_col : str = None
38 |         Name of a DataFrame column that contains dependent variable values
39 |         (test or response variable). Values should have a non-nominal scale.
40 |         Must be specified if ``data`` is a pandas DataFrame object.
41 |
42 |     group_col : str = None
43 |         Name of a DataFrame column that contains independent variable values
44 |         (grouping or predictor variable). Values should have a nominal scale
45 |         (categorical). Must be specified if ``data`` is a pandas DataFrame object.
46 |
47 |     p : int = None
48 |         The a priori known peak as an ordinal number of the treatment group,
49 |         including the zero dose level, i.e. p = {0, ..., k-1}.
50 |         Defaults to None.
51 |
52 |     n_perm : int = 100
53 |         Number of permutations.
54 |
55 |     sort : bool = False
56 |         If ``True``, sort data by the group column.
57 |
58 |     Returns
59 |     -------
60 |     tuple[float, float]
61 |         P value and statistic.
62 |
63 |     References
64 |     ----------
65 |     .. [1] Chen, I.Y. (1991) Notes on the Mack-Wolfe and Chen-Wolfe Tests for
66 |         Umbrella Alternatives. Biom. J., 33, 281-290.
67 |     .. [2] Mack, G.A., Wolfe, D.A. (1981) K-sample rank tests for umbrella
68 |         alternatives. J. Amer. Statist. Assoc., 76, 175-181.
69 | 70 | Examples 71 | -------- 72 | >>> x = [[22, 23, 35], [60, 59, 54], [98, 78, 50], [60, 82, 59], [22, 44, 33], [23, 21, 25]] 73 | >>> sp.posthoc_mackwolfe(x) 74 | """ 75 | x, _val_col, _group_col = __convert_to_df(data, val_col, group_col) 76 | 77 | if not sort: 78 | x[_group_col] = Categorical(x[_group_col], categories=x[_group_col].unique(), ordered=True) 79 | x.sort_values(by=[_group_col], ascending=True, inplace=True) 80 | 81 | k = x[_group_col].unique().size 82 | 83 | if p and p > k: 84 | print("Selected 'p' > number of groups:", str(p), " > ", str(k)) 85 | return (np.nan, np.nan) 86 | elif p is not None and p < 1: 87 | print("Selected 'p' < 1: ", str(p)) 88 | return (np.nan, np.nan) 89 | 90 | Rij = x[_val_col].rank() 91 | n = cast(Series, x.groupby(_group_col, observed=True)[_val_col].count()) 92 | 93 | def _fn(Ri, Rj): 94 | return np.sum(Ri.apply(lambda x: Rj[Rj > x].size)) 95 | 96 | def _ustat(Rij, g, k): 97 | levels = np.unique(g) 98 | U = np.identity(k) 99 | 100 | for i in range(k): 101 | for j in range(i): 102 | U[i, j] = _fn(Rij[x[_group_col] == levels[i]], Rij[x[_group_col] == levels[j]]) 103 | U[j, i] = _fn(Rij[x[_group_col] == levels[j]], Rij[x[_group_col] == levels[i]]) 104 | 105 | return U 106 | 107 | def _ap(p, U) -> float: 108 | tmp1 = 0.0 109 | if p > 0: 110 | for i in range(p): 111 | for j in range(i + 1, p + 1): 112 | tmp1 += U[i, j] 113 | tmp2 = 0.0 114 | if p < k: 115 | for i in range(p, k): 116 | for j in range(i + 1, k): 117 | tmp2 += U[j, i] 118 | 119 | return tmp1 + tmp2 120 | 121 | def _n1(p: int, n: Series) -> float: 122 | return np.sum(n[: p + 1]) 123 | 124 | def _n2(p: int, n: Series) -> float: 125 | return np.sum(n[p:k]) 126 | 127 | def _mean_at(p, n) -> float: 128 | N1 = _n1(p, n) 129 | N2 = _n2(p, n) 130 | return (N1**2.0 + N2**2.0 - np.sum(n**2.0) - n.iloc[p] ** 2.0) / 4.0 131 | 132 | def _var_at(p: int, n: Series) -> float: 133 | N1 = _n1(p, n) 134 | N2 = _n2(p, n) 135 | N = np.sum(n) 136 | 137 | var = ( 138 | 2.0 * (N1**3 + N2**3) 139 | + 3.0 * (N1**2 + N2**2) 140 | - np.sum(n**2 * (2 * n + 3.0)) 141 | - n.iloc[p] ** 2.0 * (2.0 * n.iloc[p] + 3.0) 142 | + 12.0 * n.iloc[p] * N1 * N2 143 | - 12.0 * n.iloc[p] ** 2.0 * N 144 | ) / 72.0 145 | return var 146 | 147 | if p: 148 | # if (x.groupby(_val_col).count() > 1).any().any(): 149 | # print("Ties are present") 150 | U = _ustat(Rij, x[_group_col], k) 151 | est = _ap(p, U) 152 | mean = _mean_at(p, n) 153 | sd = np.sqrt(_var_at(p, n)) 154 | stat = (est - mean) / sd 155 | p_value = ss.norm.sf(stat).item() 156 | else: 157 | U = _ustat(Rij, x[_group_col], k) 158 | Ap = np.array([_ap(i, U) for i in range(k)]).ravel() 159 | mean = np.array([_mean_at(i, n) for i in range(k)]).ravel() 160 | var = np.array([_var_at(i, n) for i in range(k)]).ravel() 161 | A = (Ap - mean) / np.sqrt(var) 162 | stat = float(np.max(A)) 163 | 164 | mt = [] 165 | for _ in range(n_perm): 166 | ix = Series(np.random.permutation(Rij)) 167 | uix = _ustat(ix, x[_group_col], k) 168 | apix = np.array([_ap(i, uix) for i in range(k)]) 169 | astarix = (apix - mean) / np.sqrt(var) 170 | mt.append(np.max(astarix)) 171 | 172 | mt = np.array(mt) 173 | p_value = mt[mt > stat].size / n_perm 174 | 175 | return p_value, stat 176 | 177 | 178 | def test_osrt( 179 | data: Union[List, np.ndarray, DataFrame], 180 | val_col: Optional[str] = None, 181 | group_col: Optional[str] = None, 182 | sort: bool = False, 183 | ) -> tuple[float, float, int]: 184 | """Hayter's one-sided studentised range test (OSRT) 185 | 186 | Tests a hypothesis against an ordered 
alternative for normal data with 187 | equal variances [1]_. 188 | 189 | Parameters 190 | ---------- 191 | data : Union[List, numpy.ndarray, DataFrame] 192 | An array, any object exposing the array interface or a pandas 193 | DataFrame with data values. 194 | 195 | val_col : str = None 196 | Name of a DataFrame column that contains dependent variable values 197 | (test or response variable). Values should have a non-nominal scale. 198 | Must be specified if ``a`` is a pandas DataFrame object. 199 | 200 | group_col : str = None 201 | Name of a DataFrame column that contains independent variable values 202 | (grouping or predictor variable). Values should have a nominal scale 203 | (categorical). Must be specified if `a` is a pandas DataFrame object. 204 | 205 | sort : bool = False 206 | If True, sort data by block and group columns. 207 | 208 | Returns 209 | ------- 210 | tuple[float, float, int] 211 | P value, statistic, and number of degrees of freedom. 212 | 213 | Notes 214 | ----- 215 | P values are computed from the Tukey distribution. 216 | 217 | References 218 | ---------- 219 | .. [1] Hayter, A.J.(1990) A One-Sided Studentised Range Test for Testing 220 | Against a Simple Ordered Alternative, Journal of the American 221 | Statistical Association, 85, 778-785. 222 | 223 | Examples 224 | -------- 225 | >>> import scikit_posthocs as sp 226 | >>> import pandas as pd 227 | >>> x = pd.DataFrame({"a": [1,2,3,5,1], "b": [12,31,54,62,12], "c": [10,12,6,74,11]}) 228 | >>> x = x.melt(var_name='groups', value_name='values') 229 | >>> sp.test_osrt(x, val_col='values', group_col='groups') 230 | """ 231 | x, _val_col, _group_col = __convert_to_df(data, val_col, group_col) 232 | 233 | if not sort: 234 | x[_group_col] = Categorical(x[_group_col], categories=x[_group_col].unique(), ordered=True) 235 | 236 | x.sort_values(by=[_group_col], ascending=True, inplace=True) 237 | groups = np.unique(x[_group_col]) 238 | x_grouped = x.groupby(_group_col, observed=True)[_val_col] 239 | 240 | xi = x_grouped.mean() 241 | ni = x_grouped.count() 242 | k = groups.size 243 | n = len(x.index) 244 | df = n - k 245 | 246 | sigma2 = 0 247 | c = -1 248 | 249 | for i in range(k): 250 | for j in range(ni.iloc[i]): 251 | c += 1 252 | sigma2 += (x[_val_col].iloc[c] - xi.iloc[i]) ** 2.0 / df 253 | 254 | sigma = np.sqrt(sigma2) 255 | 256 | def compare(i, j): 257 | dif = xi.loc[groups[j]] - xi.loc[groups[i]] 258 | A = sigma / np.sqrt(2.0) * np.sqrt(1.0 / ni[groups[j]] + 1.0 / ni[groups[i]]) 259 | qval = np.abs(dif) / A 260 | return qval 261 | 262 | vs = np.zeros((k, k), dtype=float) 263 | combs = it.combinations(range(k), 2) 264 | 265 | for i, j in combs: 266 | vs[i, j] = compare(i, j) 267 | 268 | stat = np.max(vs) 269 | pval = ss.studentized_range.sf(stat, k, df) 270 | return pval, stat, df 271 | 272 | 273 | def test_durbin( 274 | data: Union[List, np.ndarray, DataFrame], 275 | y_col: Optional[Union[str, int]] = None, 276 | group_col: Optional[Union[str, int]] = None, 277 | block_col: Optional[Union[str, int]] = None, 278 | block_id_col: Optional[Union[str, int]] = None, 279 | melted: bool = False, 280 | sort: bool = True, 281 | ) -> tuple[float, float, int]: 282 | """Durbin's test whether k groups (or treatments) in a two-way 283 | balanced incomplete block design (BIBD) have identical effects. See 284 | references for additional information [1]_, [2]_. 
285 |
286 |     Parameters
287 |     ----------
288 |     data : Union[List, np.ndarray, DataFrame]
289 |         An array, any object exposing the array interface or a pandas
290 |         DataFrame with data values.
291 |
292 |         If the ``melted`` argument is set to False (default), ``data`` is a
293 |         typical block design matrix, i.e. rows are blocks, and columns are groups.
294 |         In this case, you do not need to specify column arguments.
295 |
296 |         If ``data`` is an array and ``melted`` is set to True,
297 |         y_col, block_col and group_col must specify the indices of columns
298 |         containing elements of the corresponding type.
299 |
300 |         If ``data`` is a pandas DataFrame and ``melted`` is set to True,
301 |         y_col, block_col and group_col must specify column names (string).
302 |
303 |     y_col : Union[str, int] = None
304 |         Must be specified if ``data`` is a melted pandas DataFrame object.
305 |         Name of the column that contains y data.
306 |
307 |     group_col : Union[str, int] = None
308 |         Must be specified if ``data`` is a melted pandas DataFrame object.
309 |         Name of the column that contains group names.
310 |
311 |     block_col : Union[str, int] = None
312 |         Must be specified if ``data`` is a melted pandas DataFrame object.
313 |         Name of the column that contains block names.
314 |
315 |     block_id_col : Union[str, int] = None
316 |         Must be specified if ``data`` is a melted pandas DataFrame object.
317 |         Name of the column that contains identifiers of block names.
318 |         In most cases, this is the same as ``block_col`` except for those
319 |         cases when you have multiple instances of the same blocks.
320 |
321 |     melted : bool = False
322 |         Specifies if data are given as melted columns "y", "blocks", and
323 |         "groups".
324 |
325 |     sort : bool = True
326 |         If True, sort data by block and group columns.
327 |
328 |     Returns
329 |     -------
330 |     tuple[float, float, int]
331 |         P value, statistic, and number of degrees of freedom.
332 |
333 |     References
334 |     ----------
335 |     .. [1] N. A. Heckert, J. J. Filliben. (2003) NIST Handbook 148: Dataplot Reference
336 |         Manual, Volume 2: Let Subcommands and Library Functions. National Institute of
337 |         Standards and Technology Handbook Series, June 2003.
338 |     .. [2] W. J. Conover (1999), Practical Nonparametric Statistics,
339 |         3rd edition, Wiley.
340 |
341 |     Examples
342 |     --------
343 |     >>> x = np.array([[31,27,24],[31,28,31],[45,29,46],[21,18,48],[42,36,46],[32,17,40]])
344 |     >>> sp.test_durbin(x)
345 |     """
346 |     x, _y_col, _group_col, _block_col, _block_id_col = __convert_to_block_df(
347 |         data, y_col, group_col, block_col, block_id_col, melted
348 |     )
349 |
350 |     groups = x[_group_col].unique()
351 |     blocks = x[_block_id_col].unique()
352 |     if not sort:
353 |         x[_group_col] = Categorical(x[_group_col], categories=groups, ordered=True)
354 |         x[_block_col] = Categorical(x[_block_col], categories=blocks, ordered=True)
355 |     x.sort_values(by=[_block_col, _group_col], ascending=True, inplace=True)
356 |     x.dropna(inplace=True)
357 |
358 |     t = len(groups)
359 |     b = len(blocks)
360 |     r = float(b)
361 |     k = float(t)
362 |
363 |     x["y_ranks"] = x.groupby(_block_id_col, observed=True)[_y_col].rank()
364 |     rs = x.groupby(_group_col, observed=True)["y_ranks"].sum().to_numpy()
365 |
366 |     A = float(np.sum(x["y_ranks"] ** 2.0))
367 |     C = float(b * k * (k + 1) ** 2.0) / 4.0
368 |     D = float(np.sum(rs**2.0)) - r * C
369 |     T1 = (t - 1.0) / (A - C) * D
370 |     stat = T1
371 |     df = t - 1
372 |     pval = ss.chi2.sf(stat, df).item()
373 |
374 |     return pval, stat, df
375 |
--------------------------------------------------------------------------------
/docs/source/tutorial.rst:
--------------------------------------------------------------------------------
1 | Tutorial
2 | ========
3 |
4 | Parametric ANOVA with post hoc tests
5 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
6 |
7 | Here is a simple example of the one-way analysis of variance (ANOVA) with post hoc tests used to compare *sepal width* means of three groups (three iris species) in the *iris* dataset.
8 |
9 | To begin, we will import the dataset using the statsmodels ``get_rdataset()`` method.
10 |
11 | .. code:: python
12 |
13 |     >>> import statsmodels.api as sa
14 |     >>> import statsmodels.formula.api as sfa
15 |     >>> import scikit_posthocs as sp
16 |     >>> df = sa.datasets.get_rdataset('iris').data
17 |     >>> df.head()
18 |        Sepal.Length  Sepal.Width  Petal.Length  Petal.Width Species
19 |     0           5.1          3.5           1.4          0.2  setosa
20 |     1           4.9          3.0           1.4          0.2  setosa
21 |     2           4.7          3.2           1.3          0.2  setosa
22 |     3           4.6          3.1           1.5          0.2  setosa
23 |     4           5.0          3.6           1.4          0.2  setosa
24 |
25 | Now, we will build a model and run ANOVA using the statsmodels ``ols()`` and ``anova_lm()`` methods. Columns ``Species`` and ``Sepal.Width`` contain independent (predictor) and dependent (response) variable values, respectively.
26 |
27 | .. code:: python
28 |
29 |     >>> lm = sfa.ols('Sepal.Width ~ C(Species)', data=df).fit()
30 |     >>> anova = sa.stats.anova_lm(lm)
31 |     >>> print(anova)
32 |                  df     sum_sq   mean_sq         F        PR(>F)
33 |     C(Species)    2.0  11.344933  5.672467  49.16004  4.492017e-17
34 |     Residual    147.0  16.962000  0.115388       NaN           NaN
35 |
36 | The results tell us that there is a significant difference between group means (p = 4.49e-17), but they do not tell us which group pairs differ in their means. To obtain pairwise group differences, we will carry out a posteriori (post hoc) analysis using the ``scikit-posthocs`` package. Student's t test applied pairwise gives us the following p values:
37 |
38 | .. 
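code:: python

    >>> # Note: ``p_adjust`` is passed through to statsmodels'
    >>> # ``multipletests``, so any method name it supports
    >>> # ('bonferroni', 'holm', 'fdr_bh', ...) should work here;
    >>> # omitting the argument returns raw, uncorrected p values.

With the Holm step-down correction applied, we get:

.. 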
code:: python
39 |
40 |     >>> sp.posthoc_ttest(df, val_col='Sepal.Width', group_col='Species', p_adjust='holm')
41 |                   setosa    versicolor     virginica
42 |     setosa     -1.000000e+00  5.535780e-15  8.492711e-09
43 |     versicolor  5.535780e-15 -1.000000e+00  1.819100e-03
44 |     virginica   8.492711e-09  1.819100e-03 -1.000000e+00
45 |
46 | Remember to use a FWER (family-wise error rate) controlling procedure, such as the Holm procedure, when making multiple comparisons. As seen from this table, significant differences in group means are obtained for all group pairs.
47 |
48 | Non-parametric ANOVA with post hoc tests
49 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
50 |
51 | If normality and other ANOVA assumptions are violated, one can use the non-parametric Kruskal-Wallis H test (one-way non-parametric ANOVA) to test whether samples came from the same distribution.
52 |
53 | Let's use the same dataset just to demonstrate the procedure. The Kruskal-Wallis test is implemented in the SciPy package. The ``scipy.stats.kruskal`` method accepts array-like structures, but not DataFrames.
54 |
55 | .. code:: python
56 |
57 |     >>> import scipy.stats as ss
58 |     >>> import statsmodels.api as sa
59 |     >>> import scikit_posthocs as sp
60 |     >>> df = sa.datasets.get_rdataset('iris').data
61 |     >>> data = [df.loc[ids, 'Sepal.Width'].values for ids in df.groupby('Species').groups.values()]
62 |
63 | ``data`` is a list of 1D arrays containing *sepal width* values, one array per species. Now we can run the Kruskal-Wallis analysis of variance.
64 |
65 | .. code:: python
66 |
67 |     >>> H, p = ss.kruskal(*data)
68 |     >>> p
69 |     1.5692820940316782e-14
70 |
71 | The p value tells us we may reject the null hypothesis that the population medians of all of the groups are equal. To learn which groups (species) differ in their medians, we need to run post hoc tests. ``scikit-posthocs`` provides many non-parametric post hoc tests. Let's choose Conover's test.
72 |
73 | .. code:: python
74 |
75 |     >>> sp.posthoc_conover(df, val_col='Sepal.Width', group_col='Species', p_adjust='holm')
76 |                   setosa    versicolor     virginica
77 |     setosa     -1.000000e+00  2.278515e-18  1.293888e-10
78 |     versicolor  2.278515e-18 -1.000000e+00  1.881294e-03
79 |     virginica   1.293888e-10  1.881294e-03 -1.000000e+00
80 |
81 | Pairwise comparisons show that we may reject the null hypothesis (p < 0.01) for each pair of species and conclude that all groups (species) differ in their sepal widths.
82 |
83 | Block design
84 | ~~~~~~~~~~~~
85 |
86 | In the block design case, we have a primary factor (e.g. treatment) and a blocking factor (e.g. age or gender). A blocking factor is also called a *nuisance* factor, and it is usually a source of variability that needs to be accounted for.
87 |
88 | An example scenario is testing the effect of four fertilizers on crop yield in four cornfields. We can represent the results with a matrix in which rows correspond to the blocking factor (field) and columns correspond to the primary factor (fertilizer).
89 |
90 | The following dataset is artificial and created just for demonstration of the procedure:
91 |
92 | .. code:: python
93 |
94 |     >>> data = np.array([[ 8.82, 11.8 , 10.37, 12.08],
95 |                          [ 8.92,  9.58, 10.59, 11.89],
96 |                          [ 8.27, 11.46, 10.24, 11.6 ],
97 |                          [ 8.83, 13.25,  8.33, 11.51]])
98 |
99 | First, we need to perform an omnibus test — the Friedman rank sum test. It is implemented in the ``scipy.stats`` subpackage:
100 |
101 | .. 
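code:: python

    >>> # The matrix rows are blocks (fields) and the columns are
    >>> # treatments (fertilizers). ``friedmanchisquare`` expects one
    >>> # sample per treatment, which is why the matrix is transposed
    >>> # and unpacked below:
    >>> treatments = list(data.T)  # four arrays, one per fertilizer

Passing the treatments to the omnibus test:

.. 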
code:: python 102 | 103 | >>> import scipy.stats as ss 104 | >>> ss.friedmanchisquare(*data.T) 105 | FriedmanchisquareResult(statistic=8.700000000000003, pvalue=0.03355726870553798) 106 | 107 | We can reject the null hypothesis that our treatments have the same distribution, because p value is less than 0.05. A number of post hoc tests are available in ``scikit-posthocs`` package for unreplicated block design data. In the following example, Nemenyi's test is used: 108 | 109 | .. code:: python 110 | 111 | >>> import scikit_posthocs as sp 112 | >>> sp.posthoc_nemenyi_friedman(data) 113 | 0 1 2 3 114 | 0 -1.000000 0.220908 0.823993 0.031375 115 | 1 0.220908 -1.000000 0.670273 0.823993 116 | 2 0.823993 0.670273 -1.000000 0.220908 117 | 3 0.031375 0.823993 0.220908 -1.000000 118 | 119 | This function returns a DataFrame with p values obtained in pairwise comparisons between all treatments. 120 | One can also pass a DataFrame and specify the names of columns containing dependent variable values, blocking and primary factor values. The following code creates a DataFrame with the same data: 121 | 122 | .. code:: python 123 | 124 | >>> data = pd.DataFrame.from_dict({'blocks': {0: 0, 1: 1, 2: 2, 3: 3, 4: 0, 5: 1, 6: 125 | 2, 7: 3, 8: 0, 9: 1, 10: 2, 11: 3, 12: 0, 13: 1, 14: 2, 15: 3}, 'groups': {0: 126 | 0, 1: 0, 2: 0, 3: 0, 4: 1, 5: 1, 6: 1, 7: 1, 8: 2, 9: 2, 10: 2, 11: 2, 12: 3, 127 | 13: 3, 14: 3, 15: 3}, 'y': {0: 8.82, 1: 8.92, 2: 8.27, 3: 8.83, 4: 11.8, 5: 128 | 9.58, 6: 11.46, 7: 13.25, 8: 10.37, 9: 10.59, 10: 10.24, 11: 8.33, 12: 12.08, 129 | 13: 11.89, 14: 11.6, 15: 11.51}}) 130 | >>> data 131 | blocks groups y 132 | 0 0 0 8.82 133 | 1 1 0 8.92 134 | 2 2 0 8.27 135 | 3 3 0 8.83 136 | 4 0 1 11.80 137 | 5 1 1 9.58 138 | 6 2 1 11.46 139 | 7 3 1 13.25 140 | 8 0 2 10.37 141 | 9 1 2 10.59 142 | 10 2 2 10.24 143 | 11 3 2 8.33 144 | 12 0 3 12.08 145 | 13 1 3 11.89 146 | 14 2 3 11.60 147 | 15 3 3 11.51 148 | 149 | This is a *melted* and ready-to-use DataFrame. Do not forget to pass ``melted`` argument: 150 | 151 | .. code:: python 152 | 153 | >>> sp.posthoc_nemenyi_friedman(data, y_col='y', block_col='blocks', group_col='groups', melted=True) 154 | 0 1 2 3 155 | 0 -1.000000 0.220908 0.823993 0.031375 156 | 1 0.220908 -1.000000 0.670273 0.823993 157 | 2 0.823993 0.670273 -1.000000 0.220908 158 | 3 0.031375 0.823993 0.220908 -1.000000 159 | 160 | 161 | Data types 162 | ~~~~~~~~~~ 163 | 164 | Internally, ``scikit-posthocs`` uses NumPy ndarrays and pandas DataFrames to store and process data. Python lists, NumPy ndarrays, and pandas DataFrames are supported as *input* data types. Below are usage examples of various input data structures. 165 | 166 | Lists and arrays 167 | ^^^^^^^^^^^^^^^^ 168 | 169 | .. code:: python 170 | 171 | >>> x = [[1,2,1,3,1,4], [12,3,11,9,3,8,1], [10,22,12,9,8,3]] 172 | >>> # or 173 | >>> x = np.array([[1,2,1,3,1,4], [12,3,11,9,3,8,1], [10,22,12,9,8,3]]) 174 | >>> sp.posthoc_conover(x, p_adjust='holm') 175 | 1 2 3 176 | 1 -1.000000 0.057606 0.007888 177 | 2 0.057606 -1.000000 0.215761 178 | 3 0.007888 0.215761 -1.000000 179 | 180 | You can check how it is processed with a hidden function ``__convert_to_df()``: 181 | 182 | .. 
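code:: python

    >>> # Roughly, the conversion melts the samples into a long-format
    >>> # frame (an illustrative sketch, not the helper's actual code):
    >>> import pandas as pd
    >>> melted = pd.DataFrame({
    ...     'vals': np.concatenate(x),
    ...     'groups': np.repeat([1, 2, 3], [len(g) for g in x]),
    ... })

Calling the hidden helper directly produces the same long-format layout:

.. 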
code:: python 183 | 184 | >>> sp.__convert_to_df(x) 185 | ( vals groups 186 | 0 1 1 187 | 1 2 1 188 | 2 1 1 189 | 3 3 1 190 | 4 1 1 191 | 5 4 1 192 | 6 12 2 193 | 7 3 2 194 | 8 11 2 195 | 9 9 2 196 | 10 3 2 197 | 11 8 2 198 | 12 1 2 199 | 13 10 3 200 | 14 22 3 201 | 15 12 3 202 | 16 9 3 203 | 17 8 3 204 | 18 3 3, 'vals', 'groups') 205 | 206 | It returns a tuple of a DataFrame representation and names of the columns containing dependent (``vals``) and independent (``groups``) variable values. 207 | 208 | *Block design* matrix passed as a NumPy ndarray is processed with a hidden ``__convert_to_block_df()`` function: 209 | 210 | .. code:: python 211 | 212 | >>> data = np.array([[ 8.82, 11.8 , 10.37, 12.08], 213 | [ 8.92, 9.58, 10.59, 11.89], 214 | [ 8.27, 11.46, 10.24, 11.6 ], 215 | [ 8.83, 13.25, 8.33, 11.51]]) 216 | >>> sp.__convert_to_block_df(data) 217 | ( blocks groups y 218 | 0 0 0 8.82 219 | 1 1 0 8.92 220 | 2 2 0 8.27 221 | 3 3 0 8.83 222 | 4 0 1 11.80 223 | 5 1 1 9.58 224 | 6 2 1 11.46 225 | 7 3 1 13.25 226 | 8 0 2 10.37 227 | 9 1 2 10.59 228 | 10 2 2 10.24 229 | 11 3 2 8.33 230 | 12 0 3 12.08 231 | 13 1 3 11.89 232 | 14 2 3 11.60 233 | 15 3 3 11.51, 'y', 'groups', 'blocks') 234 | 235 | DataFrames 236 | ^^^^^^^^^^ 237 | 238 | If you are using DataFrames, you need to pass column names containing variable values to a post hoc function: 239 | 240 | .. code:: python 241 | 242 | >>> import statsmodels.api as sa 243 | >>> import scikit_posthocs as sp 244 | >>> df = sa.datasets.get_rdataset('iris').data 245 | >>> sp.posthoc_conover(df, val_col='Sepal.Width', group_col='Species', p_adjust = 'holm') 246 | 247 | ``val_col`` and ``group_col`` arguments specify the names of the columns containing dependent (response) and independent (grouping) variable values. 248 | 249 | Significance plots 250 | ~~~~~~~~~~~~~~~~~~ 251 | 252 | P values can be plotted using a heatmap: 253 | 254 | .. code:: python 255 | 256 | pc = sp.posthoc_conover(x, val_col='values', group_col='groups') 257 | heatmap_args = {'linewidths': 0.25, 'linecolor': '0.5', 'clip_on': False, 'square': True, 'cbar_ax_bbox': [0.80, 0.35, 0.04, 0.3]} 258 | sp.sign_plot(pc, **heatmap_args) 259 | 260 | .. image:: _static/plot-conover.png 261 | 262 | Custom colormap applied to a plot: 263 | 264 | .. code:: python 265 | 266 | pc = sp.posthoc_conover(x, val_col='values', group_col='groups') 267 | # Format: diagonal, non-significant, p<0.001, p<0.01, p<0.05 268 | cmap = ['1', '#fb6a4a', '#08306b', '#4292c6', '#c6dbef'] 269 | heatmap_args = {'cmap': cmap, 'linewidths': 0.25, 'linecolor': '0.5', 'clip_on': False, 'square': True, 'cbar_ax_bbox': [0.80, 0.35, 0.04, 0.3]} 270 | sp.sign_plot(pc, **heatmap_args) 271 | 272 | .. image:: _static/plot-conover-custom-cmap.png 273 | 274 | 275 | Critical difference diagrams 276 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 277 | 278 | Critical difference diagrams are another interesting way of visualizing post 279 | hoc test statistics. Firstly, in a block design scenario, the values within 280 | each block are ranked, and the average rank across all blocks for each 281 | treatment is plotted along the x axis. A crossbar is then drawn over each 282 | group of treatments that do not show a statistically significant difference 283 | among themselves. 284 | 285 | .. image:: _static/cd_diagram0.png 286 | 287 | As an example, suppose we have a set of 8 treatments with 30 measurements 288 | (blocks) each, as simulated below. 
It could, for instance, represent scores for 289 | eight machine learning models in a 30-fold cross-validation setting. 290 | 291 | .. code:: python 292 | 293 | >>> rng = np.random.default_rng(1) 294 | >>> dict_data = { 295 | 'model1': rng.normal(loc=0.2, scale=0.1, size=30), 296 | 'model2': rng.normal(loc=0.2, scale=0.1, size=30), 297 | 'model3': rng.normal(loc=0.4, scale=0.1, size=30), 298 | 'model4': rng.normal(loc=0.5, scale=0.1, size=30), 299 | 'model5': rng.normal(loc=0.7, scale=0.1, size=30), 300 | 'model6': rng.normal(loc=0.7, scale=0.1, size=30), 301 | 'model7': rng.normal(loc=0.8, scale=0.1, size=30), 302 | 'model8': rng.normal(loc=0.9, scale=0.1, size=30), 303 | } 304 | >>> data = ( 305 | pd.DataFrame(dict_data) 306 | .rename_axis('cv_fold') 307 | .melt( 308 | var_name='estimator', 309 | value_name='score', 310 | ignore_index=False, 311 | ) 312 | .reset_index() 313 | ) 314 | >>> data 315 | cv_fold estimator score 316 | 0 0 model1 0.234558 317 | 1 1 model1 0.282162 318 | 2 2 model1 0.233044 319 | 3 3 model1 0.069684 320 | 4 4 model1 0.290536 321 | .. ... ... ... 322 | 235 25 model8 0.925956 323 | 236 26 model8 0.758762 324 | 237 27 model8 0.977032 325 | 238 28 model8 0.829890 326 | 239 29 model8 0.787381 327 | 328 | [240 rows x 3 columns] 329 | 330 | The average (percentile) ranks could be calculated as follows: 331 | 332 | .. code:: python 333 | 334 | >>> avg_rank = data.groupby('cv_fold').score.rank(pct=True).groupby(data.estimator).mean() 335 | >>> avg_rank 336 | 337 | estimator 338 | model1 0.208333 339 | model2 0.191667 340 | model3 0.366667 341 | model4 0.495833 342 | model5 0.708333 343 | model6 0.737500 344 | model7 0.850000 345 | model8 0.941667 346 | Name: score, dtype: float64 347 | 348 | Again, the omnibus test result shows we can confidently reject the null 349 | hypothesis that all models come from the same distribution and proceed to the 350 | post hoc analysis. 351 | 352 | .. code:: python 353 | 354 | >>> import scipy.stats as ss 355 | >>> ss.friedmanchisquare(*dict_data.values()) 356 | FriedmanchisquareResult(statistic=186.9000000000001, pvalue=6.787361102785178e-37) 357 | 358 | The results of a post hoc Conover test are collected: 359 | 360 | .. code:: python 361 | 362 | >>> test_results = sp.posthoc_conover_friedman( 363 | >>> data, 364 | >>> melted=True, 365 | >>> block_col='cv_fold', 366 | >>> group_col='estimator', 367 | >>> y_col='score', 368 | >>> ) 369 | >>> sp.sign_plot(test_results) 370 | 371 | .. image:: _static/cd_diagram_example_sig_plot.png 372 | 373 | Finally, the average ranks and post hoc significance results can be passed to 374 | the ``critical_difference_diagram()`` function to plot the diagram: 375 | 376 | .. code:: python 377 | 378 | >>> plt.figure(figsize=(10, 2), dpi=100) 379 | >>> plt.title('Critical difference diagram of average score ranks') 380 | >>> sp.critical_difference_diagram(avg_rank, test_results) 381 | 382 | .. image:: _static/cd_diagram1.png 383 | 384 | The diagram shows that model 8 is significantly better ranked than all models 385 | but model 7, that models 1 and 2 are worse than the others, and that 3 and 4 386 | are also worse ranked than models 5, 6 and 7. Other comparisons, however, do 387 | not have sufficient statistical evidence to support them. 388 | 389 | Several style customization options are available: 390 | 391 | .. 
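code:: python

    >>> # Individual elements accept ordinary Matplotlib keyword
    >>> # arguments (a minimal sketch; only ``crossbar_props`` is
    >>> # changed here, assuming the keys are forwarded to Matplotlib):
    >>> sp.critical_difference_diagram(avg_rank, test_results,
    ...                                crossbar_props={'linewidth': 3})

Combining multiple options in one call:

.. 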
code:: python 392 | 393 | >>> plt.figure(figsize=(10, 2), dpi=100) 394 | >>> plt.title('Critical difference diagram of average score ranks') 395 | >>> sp.critical_difference_diagram( 396 | >>> ranks=avg_rank, 397 | >>> sig_matrix=test_results, 398 | >>> label_fmt_left='{label} [{rank:.3f}] ', 399 | >>> label_fmt_right=' [{rank:.3f}] {label}', 400 | >>> text_h_margin=0.3, 401 | >>> label_props={'color': 'black', 'fontweight': 'bold'}, 402 | >>> crossbar_props={'color': None, 'marker': 'o'}, 403 | >>> marker_props={'marker': '*', 's': 150, 'color': 'y', 'edgecolor': 'k'}, 404 | >>> elbow_props={'color': 'gray'}, 405 | >>> ) 406 | 407 | .. image:: _static/cd_diagram2.png 408 | -------------------------------------------------------------------------------- /README.rst: -------------------------------------------------------------------------------- 1 | .. image:: images/logo.png 2 | 3 | =============== 4 | 5 | .. image:: http://joss.theoj.org/papers/10.21105/joss.01169/status.svg 6 | :target: https://doi.org/10.21105/joss.01169 7 | .. image:: https://img.shields.io/github/actions/workflow/status/maximtrp/scikit-posthocs/package-test.yml?label=build 8 | :target: https://github.com/maximtrp/scikit-posthocs/actions/workflows/package-test.yml 9 | .. image:: https://img.shields.io/readthedocs/scikit-posthocs.svg 10 | :target: https://scikit-posthocs.readthedocs.io 11 | .. image:: https://img.shields.io/codacy/coverage/50d2a82a6dd84b51b515cebf931067d7/master 12 | :target: https://app.codacy.com/gh/maximtrp/scikit-posthocs/dashboard 13 | .. image:: https://img.shields.io/codacy/grade/50d2a82a6dd84b51b515cebf931067d7 14 | :target: https://www.codacy.com/gh/maximtrp/scikit-posthocs/dashboard 15 | .. image:: https://static.pepy.tech/badge/scikit-posthocs 16 | :target: https://pepy.tech/project/scikit-posthocs 17 | .. image:: https://img.shields.io/github/issues/maximtrp/scikit-posthocs.svg 18 | :target: https://github.com/maximtrp/scikit-posthocs/issues 19 | .. image:: https://img.shields.io/pypi/v/scikit-posthocs.svg 20 | :target: https://pypi.python.org/pypi/scikit-posthocs/ 21 | .. image:: https://img.shields.io/conda/vn/conda-forge/scikit-posthocs.svg 22 | :target: https://anaconda.org/conda-forge/scikit-posthocs 23 | 24 | =============== 25 | 26 | **scikit-posthocs** is a Python package that provides post hoc tests for 27 | pairwise multiple comparisons that are usually performed in statistical 28 | data analysis to assess the differences between group levels if a statistically 29 | significant result of ANOVA test has been obtained. 30 | 31 | **scikit-posthocs** is tightly integrated with Pandas DataFrames and NumPy 32 | arrays to ensure fast computations and convenient data import and storage. 33 | 34 | This package will be useful for statisticians, data analysts, and 35 | researchers who use Python in their work. 36 | 37 | 38 | Background 39 | ---------- 40 | 41 | Python statistical ecosystem comprises multiple packages. However, it 42 | still has numerous gaps and is surpassed by R packages and capabilities. 43 | 44 | `SciPy `_ (version 1.2.0) offers *Student*, *Wilcoxon*, 45 | and *Mann-Whitney* tests that are not adapted to multiple pairwise 46 | comparisons. `Statsmodels `_ (version 0.9.0) 47 | features *TukeyHSD* test that needs some extra actions to be fluently 48 | integrated into a data analysis pipeline. 
49 | `Statsmodels `_ also has good helper 50 | methods: ``allpairtest`` (adapts an external function such as 51 | ``scipy.stats.ttest_ind`` to multiple pairwise comparisons) and 52 | ``multipletests`` (adjusts *p* values to minimize type I and II errors). 53 | `PMCMRplus `_ is a very good R package that 54 | has no rivals in Python as it offers more than 40 various tests (including 55 | post hoc tests) for factorial and block design data. PMCMRplus was an 56 | inspiration and a reference for *scikit-posthocs*. 57 | 58 | **scikit-posthocs** attempts to improve Python statistical capabilities by 59 | offering a lot of parametric and nonparametric post hoc tests along with 60 | outliers detection and basic plotting methods. 61 | 62 | 63 | Features 64 | -------- 65 | 66 | .. image:: images/flowchart.png 67 | :alt: Tests Flowchart 68 | 69 | - *Omnibus* tests: 70 | 71 | - Durbin test (for balanced incomplete block design). 72 | - Mack-Wolfe test. 73 | - Hayter (OSRT) test. 74 | 75 | - *Parametric* pairwise multiple comparisons tests: 76 | 77 | - Scheffe test. 78 | - Student T test. 79 | - Tamhane T2 test. 80 | - TukeyHSD test. 81 | 82 | - *Non-parametric* tests for factorial design: 83 | 84 | - Conover test. 85 | - Dunn test. 86 | - Dwass, Steel, Critchlow, and Fligner test. 87 | - Mann-Whitney test. 88 | - Nashimoto and Wright (NPM) test. 89 | - Nemenyi test. 90 | - van Waerden test. 91 | - Wilcoxon test. 92 | 93 | - *Non-parametric* tests for block design: 94 | 95 | - Conover test. 96 | - Durbin and Conover test. 97 | - Miller test. 98 | - Nemenyi test. 99 | - Quade test. 100 | - Siegel test. 101 | 102 | - Outliers detection tests: 103 | 104 | - Simple test based on interquartile range (IQR). 105 | - Grubbs test. 106 | - Tietjen-Moore test. 107 | - Generalized Extreme Studentized Deviate test (ESD test). 108 | 109 | - Other tests: 110 | 111 | - Anderson-Darling test. 112 | 113 | - Global null hypothesis tests: 114 | 115 | - Fisher's combination test. 116 | - Simes test. 117 | 118 | - Plotting functionality (e.g. significance plots). 119 | 120 | All post hoc tests are capable of p adjustments for multiple 121 | pairwise comparisons. 122 | 123 | Dependencies 124 | ------------ 125 | 126 | - `NumPy and SciPy packages `_ 127 | - `Statsmodels `_ 128 | - `Pandas `_ 129 | - `Matplotlib `_ 130 | - `Seaborn `_ 131 | 132 | Compatibility 133 | ------------- 134 | 135 | Package is only compatible with Python 3. 136 | 137 | Install 138 | ------- 139 | 140 | You can install the package using ``pip`` (from PyPi): 141 | 142 | .. code:: bash 143 | 144 | pip install scikit-posthocs 145 | 146 | Or using ``conda`` (from conda-forge repo): 147 | 148 | .. code:: bash 149 | 150 | conda install -c conda-forge scikit-posthocs 151 | 152 | The latest version from GitHub can be installed using: 153 | 154 | .. code:: bash 155 | 156 | pip install git+https://github.com/maximtrp/scikit-posthocs.git 157 | 158 | Examples 159 | -------- 160 | 161 | Parametric ANOVA with post hoc tests 162 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 163 | 164 | Here is a simple example of the one-way analysis of variance (ANOVA) 165 | with post hoc tests used to compare *sepal width* means of three 166 | groups (three iris species) in *iris* dataset. 167 | 168 | To begin, we will import the dataset using statsmodels 169 | ``get_rdataset()`` method. 170 | 171 | .. 
code:: python
172 |
173 |     >>> import statsmodels.api as sa
174 |     >>> import statsmodels.formula.api as sfa
175 |     >>> import scikit_posthocs as sp
176 |     >>> df = sa.datasets.get_rdataset('iris').data
177 |     >>> df.columns = df.columns.str.replace('.', '')
178 |     >>> df.head()
179 |        SepalLength  SepalWidth  PetalLength  PetalWidth Species
180 |     0          5.1         3.5          1.4         0.2  setosa
181 |     1          4.9         3.0          1.4         0.2  setosa
182 |     2          4.7         3.2          1.3         0.2  setosa
183 |     3          4.6         3.1          1.5         0.2  setosa
184 |     4          5.0         3.6          1.4         0.2  setosa
185 |
186 | Now, we will build a model and run ANOVA using the statsmodels ``ols()``
187 | and ``anova_lm()`` methods. Columns ``Species`` and ``SepalWidth``
188 | contain independent (predictor) and dependent (response) variable
189 | values, respectively.
190 |
191 | .. code:: python
192 |
193 |     >>> lm = sfa.ols('SepalWidth ~ C(Species)', data=df).fit()
194 |     >>> anova = sa.stats.anova_lm(lm)
195 |     >>> print(anova)
196 |                  df     sum_sq   mean_sq         F        PR(>F)
197 |     C(Species)    2.0  11.344933  5.672467  49.16004  4.492017e-17
198 |     Residual    147.0  16.962000  0.115388       NaN           NaN
199 |
200 | The results tell us that there is a significant difference between
201 | group means (p = 4.49e-17), but they do not tell us which group pairs
202 | differ in their means. To obtain pairwise group differences, we will carry
203 | out a posteriori (post hoc) analysis using the ``scikit-posthocs`` package.
204 | Student's t test applied pairwise gives us the following p values:
205 |
206 | .. code:: python
207 |
208 |     >>> sp.posthoc_ttest(df, val_col='SepalWidth', group_col='Species', p_adjust='holm')
209 |                   setosa    versicolor     virginica
210 |     setosa     -1.000000e+00  5.535780e-15  8.492711e-09
211 |     versicolor  5.535780e-15 -1.000000e+00  1.819100e-03
212 |     virginica   8.492711e-09  1.819100e-03 -1.000000e+00
213 |
214 | Remember to use a FWER (family-wise error rate) controlling procedure,
215 | such as the Holm procedure, when making multiple comparisons. As seen from this
216 | table, significant differences in group means are obtained for all group pairs.
217 |
218 | Non-parametric ANOVA with post hoc tests
219 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
220 |
221 | If normality and other ANOVA assumptions
222 | are violated, one can use the non-parametric Kruskal-Wallis H test (one-way
223 | non-parametric ANOVA) to test whether samples came from the same distribution.
224 |
225 | Let's use the same dataset just to demonstrate the procedure. The Kruskal-Wallis
226 | test is implemented in the SciPy package. The ``scipy.stats.kruskal`` method
227 | accepts array-like structures, but not DataFrames.
228 |
229 | .. code:: python
230 |
231 |     >>> import scipy.stats as ss
232 |     >>> import statsmodels.api as sa
233 |     >>> import scikit_posthocs as sp
234 |     >>> df = sa.datasets.get_rdataset('iris').data
235 |     >>> df.columns = df.columns.str.replace('.', '')
236 |     >>> data = [df.loc[ids, 'SepalWidth'].values for ids in df.groupby('Species').groups.values()]
237 |
238 | ``data`` is a list of 1D arrays containing *sepal width* values, one array per
239 | species. Now we can run the Kruskal-Wallis analysis of variance.
240 |
241 | .. code:: python
242 |
243 |     >>> H, p = ss.kruskal(*data)
244 |     >>> p
245 |     1.5692820940316782e-14
246 |
247 | The p value tells us we may reject the null hypothesis that the population medians
248 | of all of the groups are equal. To learn which groups (species) differ in their
249 | medians, we need to run post hoc tests. ``scikit-posthocs`` provides many of the
250 | non-parametric tests listed above. Let's choose Conover's test.
251 |
252 | .. 
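code:: python

    >>> # Without ``p_adjust`` the raw, uncorrected p values are
    >>> # returned (a sketch; output omitted):
    >>> sp.posthoc_conover(df, val_col='SepalWidth', group_col='Species')

With the Holm adjustment applied:

.. 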
code:: python
253 |
254 |     >>> sp.posthoc_conover(df, val_col='SepalWidth', group_col='Species', p_adjust='holm')
255 |                   setosa    versicolor     virginica
256 |     setosa     -1.000000e+00  2.278515e-18  1.293888e-10
257 |     versicolor  2.278515e-18 -1.000000e+00  1.881294e-03
258 |     virginica   1.293888e-10  1.881294e-03 -1.000000e+00
259 |
260 | Pairwise comparisons show that we may reject the null hypothesis (p < 0.01) for
261 | each pair of species and conclude that all groups (species) differ in their
262 | sepal widths.
263 |
264 | Block design
265 | ~~~~~~~~~~~~
266 |
267 | In the block design case, we have a primary factor (e.g. treatment) and a blocking
268 | factor (e.g. age or gender). A blocking factor is also called a *nuisance*
269 | factor, and it is usually a source of variability that needs to be accounted
270 | for.
271 |
272 | An example scenario is testing the effect of four fertilizers on crop yield in
273 | four cornfields. We can represent the results with a matrix in which rows
274 | correspond to the blocking factor (field) and columns correspond to the
275 | primary factor (fertilizer).
276 |
277 | The following dataset is artificial and created just for demonstration
278 | of the procedure:
279 |
280 | .. code:: python
281 |
282 |     >>> data = np.array([[ 8.82, 11.8 , 10.37, 12.08],
283 |                          [ 8.92,  9.58, 10.59, 11.89],
284 |                          [ 8.27, 11.46, 10.24, 11.6 ],
285 |                          [ 8.83, 13.25,  8.33, 11.51]])
286 |
287 | First, we need to perform an omnibus test — the Friedman rank sum test. It is
288 | implemented in the ``scipy.stats`` subpackage:
289 |
290 | .. code:: python
291 |
292 |     >>> import scipy.stats as ss
293 |     >>> ss.friedmanchisquare(*data.T)
294 |     FriedmanchisquareResult(statistic=8.700000000000003, pvalue=0.03355726870553798)
295 |
296 | We can reject the null hypothesis that our treatments have the same
297 | distribution, because the p value is less than 0.05. A number of post hoc tests are
298 | available in the ``scikit-posthocs`` package for unreplicated block design data.
299 | In the following example, Nemenyi's test is used:
300 |
301 | .. code:: python
302 |
303 |     >>> import scikit_posthocs as sp
304 |     >>> sp.posthoc_nemenyi_friedman(data)
305 |               0         1         2         3
306 |     0 -1.000000  0.220908  0.823993  0.031375
307 |     1  0.220908 -1.000000  0.670273  0.823993
308 |     2  0.823993  0.670273 -1.000000  0.220908
309 |     3  0.031375  0.823993  0.220908 -1.000000
310 |
311 | This function returns a DataFrame with p values obtained in pairwise
312 | comparisons between all treatments.
313 | One can also pass a DataFrame and specify the names of the columns containing
314 | dependent variable values, blocking and primary factor values.
315 | The following code creates a DataFrame with the same data:
316 |
317 | .. 
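code:: python

    >>> # One way to build it from the matrix used above (a sketch; it
    >>> # should produce the same frame as the explicit dict below):
    >>> melted = (pd.DataFrame(data)
    ...             .rename_axis('blocks')
    ...             .melt(var_name='groups', value_name='y', ignore_index=False)
    ...             .reset_index())

Or, written out explicitly:

.. 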
314 | One can also pass a DataFrame and specify the names of the columns containing
315 | the dependent variable values and the blocking and primary factor values.
316 | The following code creates a DataFrame with the same data (pandas is assumed
317 | to be imported as ``pd``):
318 | 
319 | .. code:: python
320 | 
321 | >>> data = pd.DataFrame.from_dict(
322 | {'blocks': {0: 0, 1: 1, 2: 2, 3: 3, 4: 0, 5: 1, 6: 2, 7: 3, 8: 0, 9: 1, 10: 2, 11: 3, 12: 0, 13: 1, 14: 2, 15: 3},
323 | 'groups': {0: 0, 1: 0, 2: 0, 3: 0, 4: 1, 5: 1, 6: 1, 7: 1, 8: 2, 9: 2, 10: 2, 11: 2, 12: 3, 13: 3, 14: 3, 15: 3},
324 | 'y': {0: 8.82, 1: 8.92, 2: 8.27, 3: 8.83, 4: 11.8, 5: 9.58, 6: 11.46, 7: 13.25,
325 | 8: 10.37, 9: 10.59, 10: 10.24, 11: 8.33, 12: 12.08, 13: 11.89, 14: 11.6, 15: 11.51}})
326 | >>> data
327 | blocks groups y
328 | 0 0 0 8.82
329 | 1 1 0 8.92
330 | 2 2 0 8.27
331 | 3 3 0 8.83
332 | 4 0 1 11.80
333 | 5 1 1 9.58
334 | 6 2 1 11.46
335 | 7 3 1 13.25
336 | 8 0 2 10.37
337 | 9 1 2 10.59
338 | 10 2 2 10.24
339 | 11 3 2 8.33
340 | 12 0 3 12.08
341 | 13 1 3 11.89
342 | 14 2 3 11.60
343 | 15 3 3 11.51
344 | 
345 | This is a *melted* and ready-to-use DataFrame. Do not forget to pass the
346 | ``melted`` argument:
347 | 
348 | .. code:: python
349 | 
350 | >>> sp.posthoc_nemenyi_friedman(data, y_col='y', block_col='blocks', group_col='groups', melted=True)
351 | 0 1 2 3
352 | 0 -1.000000 0.220908 0.823993 0.031375
353 | 1 0.220908 -1.000000 0.670273 0.823993
354 | 2 0.823993 0.670273 -1.000000 0.220908
355 | 3 0.031375 0.823993 0.220908 -1.000000
356 | 
357 | 
358 | Data types
359 | ~~~~~~~~~~
360 | 
361 | Internally, ``scikit-posthocs`` uses NumPy ndarrays and pandas DataFrames to
362 | store and process data. Python lists, NumPy ndarrays, and pandas DataFrames
363 | are supported as *input* data types. Below are usage examples of the various
364 | input data structures.
365 | 
366 | Lists and arrays
367 | ^^^^^^^^^^^^^^^^
368 | 
369 | .. code:: python
370 | 
371 | >>> x = [[1,2,1,3,1,4], [12,3,11,9,3,8,1], [10,22,12,9,8,3]]
372 | >>> # or
373 | >>> x = np.array([[1,2,1,3,1,4], [12,3,11,9,3,8,1], [10,22,12,9,8,3]])
374 | >>> sp.posthoc_conover(x, p_adjust='holm')
375 | 1 2 3
376 | 1 -1.000000 0.057606 0.007888
377 | 2 0.057606 -1.000000 0.215761
378 | 3 0.007888 0.215761 -1.000000
379 | 
380 | You can check how the input is processed using the internal
381 | ``__convert_to_df()`` function:
382 | 
383 | .. code:: python
384 | 
385 | >>> sp.__convert_to_df(x)
386 | ( vals groups
387 | 0 1 1
388 | 1 2 1
389 | 2 1 1
390 | 3 3 1
391 | 4 1 1
392 | 5 4 1
393 | 6 12 2
394 | 7 3 2
395 | 8 11 2
396 | 9 9 2
397 | 10 3 2
398 | 11 8 2
399 | 12 1 2
400 | 13 10 3
401 | 14 22 3
402 | 15 12 3
403 | 16 9 3
404 | 17 8 3
405 | 18 3 3, 'vals', 'groups')
406 | 
407 | It returns a tuple of a DataFrame representation and the names of the columns
408 | containing the dependent (``vals``) and independent (``groups``) variable values.
409 | 
410 | A *block design* matrix passed as a NumPy ndarray is processed by the internal
411 | ``__convert_to_block_df()`` function:
412 | 
413 | .. code:: python
414 | 
415 | >>> data = np.array([[ 8.82, 11.8 , 10.37, 12.08],
416 | [ 8.92, 9.58, 10.59, 11.89],
417 | [ 8.27, 11.46, 10.24, 11.6 ],
418 | [ 8.83, 13.25, 8.33, 11.51]])
419 | >>> sp.__convert_to_block_df(data)
420 | ( blocks groups y
421 | 0 0 0 8.82
422 | 1 1 0 8.92
423 | 2 2 0 8.27
424 | 3 3 0 8.83
425 | 4 0 1 11.80
426 | 5 1 1 9.58
427 | 6 2 1 11.46
428 | 7 3 1 13.25
429 | 8 0 2 10.37
430 | 9 1 2 10.59
431 | 10 2 2 10.24
432 | 11 3 2 8.33
433 | 12 0 3 12.08
434 | 13 1 3 11.89
435 | 14 2 3 11.60
436 | 15 3 3 11.51, 'y', 'groups', 'blocks')
437 | 
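If you prefer to build such a long (melted) layout yourself rather than rely
on the internal converters, plain pandas is sufficient. A short sketch (the
column names here are arbitrary):

.. code:: python

>>> import pandas as pd
>>> wide = pd.DataFrame(data)  # rows are blocks, columns are groups
>>> wide.index.name = 'blocks'
>>> melted = wide.reset_index().melt(id_vars='blocks', var_name='groups', value_name='y')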
438 | DataFrames
439 | ^^^^^^^^^^
440 | 
441 | If you are using DataFrames, you need to pass the names of the columns
442 | containing the variable values to a post hoc function:
443 | 
444 | .. code:: python
445 | 
446 | >>> import statsmodels.api as sa
447 | >>> import scikit_posthocs as sp
448 | >>> df = sa.datasets.get_rdataset('iris').data
449 | >>> df.columns = df.columns.str.replace('.', '')
450 | >>> sp.posthoc_conover(df, val_col='SepalWidth', group_col='Species', p_adjust='holm')
451 | 
452 | The ``val_col`` and ``group_col`` arguments specify the names of the columns
453 | containing the dependent (response) and independent (grouping) variable values.
454 | 
455 | 
456 | Significance plots
457 | ------------------
458 | 
459 | P values can be plotted using a heatmap (here ``x`` is assumed to be a melted
460 | DataFrame with ``values`` and ``groups`` columns):
461 | 
462 | .. code:: python
463 | 
464 | >>> pc = sp.posthoc_conover(x, val_col='values', group_col='groups')
465 | >>> heatmap_args = {'linewidths': 0.25, 'linecolor': '0.5', 'clip_on': False, 'square': True, 'cbar_ax_bbox': [0.80, 0.35, 0.04, 0.3]}
466 | >>> sp.sign_plot(pc, **heatmap_args)
467 | 
468 | .. image:: images/plot-conover.png
469 | 
470 | A custom colormap applied to the plot:
471 | 
472 | .. code:: python
473 | 
474 | >>> pc = sp.posthoc_conover(x, val_col='values', group_col='groups')
475 | >>> # Format: diagonal, non-significant, p<0.001, p<0.01, p<0.05
476 | >>> cmap = ['1', '#fb6a4a', '#08306b', '#4292c6', '#c6dbef']
477 | >>> heatmap_args = {'cmap': cmap, 'linewidths': 0.25, 'linecolor': '0.5', 'clip_on': False, 'square': True, 'cbar_ax_bbox': [0.80, 0.35, 0.04, 0.3]}
478 | >>> sp.sign_plot(pc, **heatmap_args)
479 | 
480 | .. image:: images/plot-conover-custom-cmap.png
481 | 
482 | Citing
483 | ------
484 | 
485 | If you want to cite *scikit-posthocs*, please refer to the publication in
486 | the `Journal of Open Source Software <https://joss.theoj.org>`_:
487 | 
488 | Terpilowski, M. (2019). scikit-posthocs: Pairwise multiple comparison tests in
489 | Python. Journal of Open Source Software, 4(36), 1169, https://doi.org/10.21105/joss.01169
490 | 
491 | .. code::
492 | 
493 | @ARTICLE{Terpilowski2019,
494 | title = {scikit-posthocs: Pairwise multiple comparison tests in Python},
495 | author = {Terpilowski, Maksim},
496 | journal = {The Journal of Open Source Software},
497 | volume = {4},
498 | number = {36},
499 | pages = {1169},
500 | year = {2019},
501 | doi = {10.21105/joss.01169}
502 | }
503 | 
504 | Acknowledgement
505 | ---------------
506 | 
507 | Thorsten Pohlert, PMCMR author and maintainer
508 | 
--------------------------------------------------------------------------------
/scikit_posthocs/_plotting.py:
--------------------------------------------------------------------------------
1 | from copy import deepcopy
2 | from typing import Dict, List, Optional, Set, Tuple, Union
3 | 
4 | import numpy as np
5 | from matplotlib import colors, pyplot
6 | from matplotlib.axes import Axes
7 | from matplotlib.colorbar import Colorbar, ColorbarBase
8 | from matplotlib.colors import ListedColormap
9 | from pandas import DataFrame, Index, Series
10 | from seaborn import heatmap
11 | 
12 | 
13 | def sign_array(p_values: Union[List, np.ndarray, DataFrame], alpha: float = 0.05) -> np.ndarray:
14 | """Significance array.
15 | 
16 | Converts an array with p values to a significance array where
17 | 0 is False (not significant), 1 is True (significant),
18 | and -1 is for diagonal elements.
19 | 
20 | Parameters
21 | ----------
22 | p_values : Union[List, np.ndarray, DataFrame]
23 | Any object exposing the array interface and containing
24 | p values.
25 | 
26 | alpha : float = 0.05
27 | Significance level. Default is 0.05.
28 | 29 | Returns 30 | ------- 31 | result : numpy.ndarray 32 | Array where 0 is False (not significant), 1 is True (significant), 33 | and -1 is for diagonal elements. 34 | 35 | Examples 36 | -------- 37 | >>> p_values = np.array([[ 1. , 0.00119517, 0.00278329], 38 | [ 0.00119517, 1. , 0.18672227], 39 | [ 0.00278329, 0.18672227, 1. ]]) 40 | >>> ph.sign_array(p_values) 41 | array([[-1, 1, 1], 42 | [ 1, -1, 0], 43 | [ 1, 0, -1]]) 44 | """ 45 | sig_array = deepcopy(np.array(p_values)) 46 | sig_array[sig_array == 0] = 1e-10 47 | sig_array[sig_array > alpha] = 0 48 | sig_array[(sig_array < alpha) & (sig_array > 0)] = 1 49 | np.fill_diagonal(sig_array, -1) 50 | 51 | return sig_array 52 | 53 | 54 | def sign_table( 55 | p_values: Union[List, np.ndarray, DataFrame], lower: bool = True, upper: bool = True 56 | ) -> Union[DataFrame, np.ndarray]: 57 | """Significance table. 58 | 59 | Returns table that can be used in a publication. P values are replaced 60 | with asterisks: \\* - p < 0.05, \\*\\* - p < 0.01, \\*\\*\\* - p < 0.001. 61 | 62 | Parameters 63 | ---------- 64 | p_values : Union[List, np.ndarray, DataFrame] 65 | Any object exposing the array interface and containing 66 | p values. 67 | 68 | lower : bool 69 | Defines whether to return the lower triangle. 70 | 71 | upper : bool 72 | Defines whether to return the upper triangle. 73 | 74 | Returns 75 | ------- 76 | result : Union[DataFrame, np.ndarray] 77 | P values masked with asterisks. 78 | 79 | Examples 80 | -------- 81 | >>> p_values = np.array([[-1. , 0.00119517, 0.00278329], 82 | [ 0.00119517, -1. , 0.18672227], 83 | [ 0.00278329, 0.18672227, -1. ]]) 84 | >>> ph.sign_table(p_values) 85 | array([['-', '**', '**'], 86 | ['**', '-', 'NS'], 87 | ['**', 'NS', '-']], dtype=object) 88 | """ 89 | if not any([lower, upper]): 90 | raise ValueError("Either lower or upper triangle must be returned") 91 | 92 | pv = DataFrame(p_values, copy=True) if not isinstance(p_values, DataFrame) else p_values.copy() 93 | 94 | ns = pv > 0.05 95 | three = (pv < 0.001) & (pv >= 0) 96 | two = (pv < 0.01) & (pv >= 0.001) 97 | one = (pv < 0.05) & (pv >= 0.01) 98 | 99 | pv = pv.astype(str) 100 | pv[ns] = "NS" 101 | pv[three] = "***" 102 | pv[two] = "**" 103 | pv[one] = "*" 104 | 105 | np.fill_diagonal(pv.values, "-") 106 | if not lower: 107 | pv.values[np.tril_indices(pv.shape[0], -1)] = "" 108 | elif not upper: 109 | pv.values[np.triu_indices(pv.shape[0], 1)] = "" 110 | 111 | return pv 112 | 113 | 114 | def sign_plot( 115 | x: Union[List, np.ndarray, DataFrame], 116 | g: Union[List, np.ndarray, None] = None, 117 | flat: bool = False, 118 | labels: bool = True, 119 | cmap: Optional[List] = None, 120 | cbar_ax_bbox: Optional[Tuple[float, float, float, float]] = None, 121 | ax: Optional[Axes] = None, 122 | **kwargs, 123 | ) -> Union[Axes, Tuple[Axes, Colorbar]]: 124 | """Significance plot, a heatmap of p values (based on Seaborn). 125 | 126 | Parameters 127 | ---------- 128 | x : Union[List, np.ndarray, DataFrame] 129 | If `flat` is False (default), `x` must be a square array, any object 130 | exposing the array interface, containing p values. If `flat` is True, 131 | `x` must be a sign_array 132 | (returned by :py:meth:`scikit_posthocs.sign_array` function). 133 | 134 | g : Union[List, np.ndarray] 135 | An array, any object exposing the array interface, containing 136 | group names. 137 | 138 | flat : bool 139 | If `flat` is True, plots a significance array as a heatmap using 140 | seaborn. If `flat` is False (default), plots an array of p values. 
141 | Non-flat mode is useful if you need to differentiate significance 142 | levels visually. It is the preferred mode. 143 | 144 | labels : bool 145 | Plot axes labels (default) or not. 146 | 147 | cmap : list 148 | 1) If flat is False (default): 149 | List consisting of five elements, that will be exported to 150 | ListedColormap method of matplotlib. First is for diagonal 151 | elements, second is for non-significant elements, third is for 152 | p < 0.001, fourth is for p < 0.01, fifth is for p < 0.05. 153 | 154 | 2) If flat is True: 155 | List consisting of three elements, that will be exported to 156 | ListedColormap method of matplotlib. First is for diagonal 157 | elements, second is for non-significant elements, third is for 158 | significant ones. 159 | 3) If not defined, default colormaps will be used. 160 | 161 | cbar_ax_bbox : list 162 | Colorbar axes position rect [left, bottom, width, height] where 163 | all quantities are in fractions of figure width and height. 164 | Refer to `matplotlib.figure.Figure.add_axes` for more information. 165 | Default is [0.95, 0.35, 0.04, 0.3]. 166 | 167 | ax : SubplotBase 168 | Axes in which to draw the plot, otherwise use the currently-active 169 | Axes. 170 | 171 | kwargs 172 | Keyword arguments to be passed to seaborn heatmap method. These 173 | keyword args cannot be used: cbar, vmin, vmax, center. 174 | 175 | Returns 176 | ------- 177 | ax : matplotlib.axes._subplots.AxesSubplot 178 | Axes object with the heatmap. 179 | 180 | cbar : matplotlib.colorbar.Colorbar 181 | ColorBar object if `flat` is set to False. 182 | 183 | Examples 184 | -------- 185 | >>> x = np.array([[ 1, 1, 1], 186 | [ 1, 1, 0], 187 | [ 1, 0, 1]]) 188 | >>> ph.sign_plot(x, flat = True) 189 | """ 190 | for key in ["cbar", "vmin", "vmax", "center"]: 191 | if key in kwargs: 192 | del kwargs[key] 193 | 194 | if isinstance(x, DataFrame): 195 | df = x.copy() 196 | else: 197 | g = g or np.arange(len(x)) 198 | df = DataFrame(x, index=Index(g), columns=Index(g), copy=True) 199 | 200 | dtype = df.values.dtype 201 | 202 | if not np.issubdtype(dtype, np.integer) and flat: 203 | raise ValueError("X should be a sign_array or DataFrame of integers") 204 | elif not np.issubdtype(dtype, np.floating) and not flat: 205 | raise ValueError("X should be an array or DataFrame of float p values") 206 | 207 | if not cmap and flat: 208 | # format: diagonal, non-significant, significant 209 | cmap = ["1", "#fbd7d4", "#1a9641"] 210 | elif not cmap: 211 | # format: diagonal, non-significant, p<0.001, p<0.01, p<0.05 212 | cmap = ["1", "#fbd7d4", "#005a32", "#238b45", "#a1d99b"] 213 | 214 | if flat: 215 | np.fill_diagonal(df.values, -1) 216 | hax = heatmap(df, vmin=-1, vmax=1, cmap=ListedColormap(cmap), cbar=False, ax=ax, **kwargs) 217 | if not labels: 218 | hax.set_xlabel("") 219 | hax.set_ylabel("") 220 | return hax 221 | 222 | else: 223 | xc = df.values.copy() 224 | df[(xc < 0.001) & (xc >= 0)] = 1 225 | df[(xc < 0.01) & (xc >= 0.001)] = 2 226 | df[(xc < 0.05) & (xc >= 0.01)] = 3 227 | df[(xc >= 0.05)] = 0 228 | 229 | np.fill_diagonal(df.values, -1) 230 | 231 | if len(cmap) != 5: 232 | raise ValueError("Cmap list must contain 5 items") 233 | 234 | hax = heatmap( 235 | df, 236 | vmin=-1, 237 | vmax=3, 238 | cmap=ListedColormap(cmap), 239 | center=1, 240 | cbar=False, 241 | ax=ax, 242 | **kwargs, 243 | ) 244 | if not labels: 245 | hax.set_xlabel("") 246 | hax.set_ylabel("") 247 | 248 | cbar_ax = hax.figure.add_axes(cbar_ax_bbox or (0.95, 0.35, 0.04, 0.3)) 249 | cbar = ColorbarBase( 250 | cbar_ax, 
251 | cmap=(ListedColormap(cmap[2:] + [cmap[1]])),
252 | norm=colors.NoNorm(),
253 | boundaries=[0, 1, 2, 3, 4],
254 | )
255 | cbar.set_ticks(
256 | list(np.linspace(0, 3, 4)),
257 | labels=["p < 0.001", "p < 0.01", "p < 0.05", "NS"],
258 | )
259 | 
260 | cbar.outline.set_linewidth(1)
261 | cbar.outline.set_edgecolor("0.5")
262 | cbar.ax.tick_params(size=0)
263 | 
264 | return hax, cbar
265 | 
266 | 
267 | def _find_maximal_cliques(adj_matrix: DataFrame) -> List[Set]:
268 | """Wrapper function over the recursive Bron-Kerbosch algorithm.
269 | 
270 | Used to find points that are under the same crossbar in critical
271 | difference diagrams.
272 | 
273 | Parameters
274 | ----------
275 | adj_matrix : pandas.DataFrame
276 | Binary matrix with 1 if row item and column item do NOT significantly
277 | differ. Values in the main diagonal are not considered.
278 | 
279 | Returns
280 | -------
281 | list[set]
282 | Largest fully connected subgraphs, represented as sets of indices of
283 | adj_matrix.
284 | 
285 | Raises
286 | ------
287 | ValueError
288 | If the input matrix is empty or not symmetric.
289 | If the input matrix is not binary.
290 | 
291 | """
292 | if (adj_matrix.index != adj_matrix.columns).any():
293 | raise ValueError("adj_matrix must be symmetric, indices do not match")
294 | if not adj_matrix.isin((0, 1)).values.all():
295 | raise ValueError("Input matrix must be binary")
296 | if adj_matrix.empty or not (adj_matrix.T == adj_matrix).values.all():
297 | raise ValueError("Input matrix must be non-empty and symmetric")
298 | 
299 | result = []
300 | _bron_kerbosch(
301 | current_clique=set(),
302 | candidates=set(adj_matrix.index),
303 | visited=set(),
304 | adj_matrix=adj_matrix,
305 | result=result,
306 | )
307 | return result
308 | 
309 | 
310 | def _bron_kerbosch(
311 | current_clique: Set,
312 | candidates: Set,
313 | visited: Set,
314 | adj_matrix: DataFrame,
315 | result: List[Set],
316 | ) -> None:
317 | """Recursive algorithm to find the maximal fully connected subgraphs.
318 | 
319 | See [1]_ for more information.
320 | 
321 | Parameters
322 | ----------
323 | current_clique : set
324 | A set of vertices known to be fully connected.
325 | candidates : set
326 | Set of vertices that could potentially be added to the clique.
327 | visited : set
328 | Set of vertices already known to be part of another previously explored
329 | clique other than current_clique.
330 | adj_matrix : pandas.DataFrame
331 | Binary matrix with 1 if row item and column item do NOT significantly
332 | differ. Diagonal must be zeroed.
333 | result : list[set]
334 | List to which the maximal cliques are appended.
335 | 
336 | Returns
337 | -------
338 | None
339 | 
340 | References
341 | ----------
342 | .. [1] https://en.wikipedia.org/wiki/Bron%E2%80%93Kerbosch_algorithm
343 | """
344 | while candidates:
345 | v = candidates.pop()
346 | _bron_kerbosch(
347 | current_clique | {v},
348 | # Restrict candidate vertices to the neighbors of v
349 | {n for n in candidates if adj_matrix.loc[v, n]},
350 | # Restrict visited vertices to the neighbors of v
351 | {n for n in visited if adj_matrix.loc[v, n]},
352 | adj_matrix,
353 | result,
354 | )
355 | visited.add(v)
356 | 
357 | # We do not need to report a clique if a child call already did it.
358 | if not visited:
359 | # Terminal call: no child call reported a clique containing this one.
360 | result.append(current_clique) 361 | 362 | 363 | def critical_difference_diagram( 364 | ranks: Union[dict, Series], 365 | sig_matrix: DataFrame, 366 | *, 367 | alpha: float = 0.05, 368 | ax: Optional[Axes] = None, 369 | label_fmt_left: str = "{label} ({rank:.2g})", 370 | label_fmt_right: str = "({rank:.2g}) {label}", 371 | label_props: Optional[dict] = None, 372 | marker_props: Optional[dict] = None, 373 | elbow_props: Optional[dict] = None, 374 | crossbar_props: Optional[dict] = None, 375 | color_palette: Union[Dict[str, str], List, None] = None, 376 | text_h_margin: float = 0.01, 377 | left_only: bool = False, 378 | ) -> Dict[str, list]: 379 | """Plot a Critical Difference diagram from ranks and post-hoc results. 380 | 381 | The diagram arranges the average ranks of multiple groups on the x axis 382 | in order to facilitate performance comparisons between them. The groups 383 | that could not be statistically deemed as different are linked by a 384 | horizontal crossbar [1]_, [2]_. 385 | 386 | :: 387 | 388 | rank markers 389 | X axis ---------O----O-------------------O-O------------O--------- 390 | |----| | | | 391 | | | |---crossbar---| 392 | clf1 ----| | | | |---- clf3 393 | clf2 ---------| | |----------------- clf4 394 | |------------------- clf5 395 | |____| 396 | text_h_margin 397 | 398 | In the drawing above, the two crossbars indicate that clf1 and clf2 cannot 399 | be statistically differentiated, the same occurring between clf3, clf4 and 400 | clf5. However, clf1 and clf2 are each significantly lower ranked than clf3, 401 | clf4 and clf5. 402 | 403 | Parameters 404 | ---------- 405 | ranks : dict or Series 406 | Indicates the rank value for each sample or estimator (as keys or index). 407 | 408 | sig_matrix : DataFrame 409 | The corresponding p-value matrix outputted by post-hoc tests, with 410 | indices matching the labels in the ranks argument. 411 | 412 | alpha : float, optional = 0.05 413 | Significance level. Default is 0.05. 414 | Values below this will be considered statistically different. 415 | 416 | ax : matplotlib.SubplotBase, optional 417 | The object in which the plot will be built. Gets the current Axes 418 | by default (if None is passed). 419 | 420 | label_fmt_left : str, optional 421 | The format string to apply to the labels on the left side. The keywords 422 | label and rank can be used to specify the sample/estimator name and 423 | rank value, respectively, by default '{label} ({rank:.2g})'. 424 | 425 | label_fmt_right : str, optional 426 | The same, but for the labels on the right side of the plot. 427 | By default '({rank:.2g}) {label}'. 428 | 429 | label_props : dict, optional 430 | Parameters to be passed to pyplot.text() when creating the labels, 431 | by default None. 432 | 433 | marker_props : dict, optional 434 | Parameters to be passed to pyplot.scatter() when plotting the rank 435 | markers on the axis, by default None. 436 | 437 | elbow_props : dict, optional 438 | Parameters to be passed to pyplot.plot() when creating the elbow lines, 439 | by default None. 440 | 441 | crossbar_props : dict, optional 442 | Parameters to be passed to pyplot.plot() when creating the crossbars 443 | that indicate lack of statistically significant difference. By default 444 | None. 445 | 446 | color_palette: dict or list, optional 447 | Parameters to be passed when you need specific colors for each category 448 | 449 | text_h_margin : float, optional 450 | Space between the text labels and the nearest vertical line of an 451 | elbow, by default 0.01. 
452 | 
453 | left_only : bool, optional
454 | Set all labels in a single left-sided block instead of splitting them
455 | into two blocks, one for the left and one for the right.
456 | 
457 | 
458 | Returns
459 | -------
460 | dict[str, list[matplotlib.Artist]]
461 | Lists of Artists created.
462 | 
463 | Examples
464 | --------
465 | See the :doc:`/tutorial`.
466 | 
467 | References
468 | ----------
469 | .. [1] Demšar, J. (2006). Statistical comparisons of classifiers over multiple
470 | data sets. The Journal of Machine learning research, 7, 1-30.
471 | 
472 | .. [2] https://mirkobunse.github.io/CriticalDifferenceDiagrams.jl/stable/
473 | """
474 | # Check color_palette consistency
475 | if not color_palette or len(color_palette) == 0:
476 | pass
477 | elif isinstance(color_palette, Dict) and (
478 | (len(set(ranks.keys()) & set(color_palette.keys()))) == len(ranks)
479 | ):
480 | pass
481 | elif isinstance(color_palette, List) and (len(ranks) <= len(color_palette)):
482 | pass
483 | else:
484 | raise ValueError("color_palette keys are not consistent, or list size too small")
485 | 
486 | elbow_props = elbow_props or {}
487 | marker_props = {"zorder": 3, **(marker_props or {})}
488 | label_props = {"va": "center", **(label_props or {})}
489 | crossbar_props = {
490 | "color": "k",
491 | "zorder": 3,
492 | "linewidth": 2,
493 | **(crossbar_props or {}),
494 | }
495 | 
496 | ax = ax or pyplot.gca()
497 | ax.yaxis.set_visible(False)
498 | ax.spines["right"].set_visible(False)
499 | ax.spines["left"].set_visible(False)
500 | ax.spines["bottom"].set_visible(False)
501 | ax.xaxis.set_ticks_position("top")
502 | ax.spines["top"].set_position("zero")
503 | 
504 | # Lists of artists to be returned
505 | markers = []
506 | elbows = []
507 | labels = []
508 | crossbars = []
509 | 
510 | # True if pairwise comparison is NOT significant
511 | adj_matrix = DataFrame(
512 | 1 - sign_array(sig_matrix, alpha=alpha),
513 | index=sig_matrix.index,
514 | columns=sig_matrix.columns,
515 | dtype=bool,
516 | )
517 | 
518 | ranks = Series(ranks).sort_values()  # Standardize if ranks is dict
519 | if left_only:
520 | points_left = ranks
521 | else:
522 | points_left, points_right = (
523 | ranks.iloc[: len(ranks) // 2],
524 | ranks.iloc[len(ranks) // 2 :],
525 | )
526 | # points_left, points_right = np.array_split(ranks.sort_values(), 2)
527 | 
528 | # Sets of points under the same crossbar
529 | crossbar_sets = _find_maximal_cliques(adj_matrix)
530 | 
531 | # Sort by lowest rank and filter single-valued sets
532 | crossbar_sets = sorted(
533 | (x for x in crossbar_sets if len(x) > 1), key=lambda x: ranks[list(x)].min()
534 | )
535 | 
536 | # Create stacking of crossbars: for each level, try to fit the crossbar,
537 | # so that it does not intersect with any other in the level. If it does not
538 | # fit in any level, create a new level for it.
539 | crossbar_levels: list[list[set]] = []
540 | for bar in crossbar_sets:
541 | for level, bars_in_level in enumerate(crossbar_levels):
542 | if not any(bool(bar & bar_in_lvl) for bar_in_lvl in bars_in_level):
543 | ypos = -level - 1
544 | bars_in_level.append(bar)
545 | break
546 | else:
547 | ypos = -len(crossbar_levels) - 1
548 | crossbar_levels.append([bar])
549 | 
550 | crossbars.append(
551 | ax.plot(
552 | # Adding a separate line between each pair enables showing a
553 | # marker over each elbow with crossbar_props={'marker': 'o'}.
554 | [ranks.loc[i] for i in bar], 555 | [ypos] * len(bar), 556 | **crossbar_props, 557 | ) 558 | ) 559 | 560 | lowest_crossbar_ypos = -len(crossbar_levels) 561 | 562 | def plot_items(points, xpos, label_fmt, color_palette, label_props): 563 | """Plot each marker + elbow + label.""" 564 | ypos = lowest_crossbar_ypos - 1 565 | for idx, (label, rank) in enumerate(points.items()): 566 | if not color_palette or len(color_palette) == 0: 567 | elbow, *_ = ax.plot( 568 | [xpos, rank, rank], 569 | [ypos, ypos, 0], 570 | **elbow_props, 571 | ) 572 | else: 573 | elbow, *_ = ax.plot( 574 | [xpos, rank, rank], 575 | [ypos, ypos, 0], 576 | c=color_palette[label] 577 | if isinstance(color_palette, Dict) 578 | else color_palette[idx], 579 | **elbow_props, 580 | ) 581 | 582 | elbows.append(elbow) 583 | curr_color = elbow.get_color() 584 | markers.append(ax.scatter(rank, 0, **{"color": curr_color, **marker_props})) 585 | labels.append( 586 | ax.text( 587 | xpos, 588 | ypos, 589 | label_fmt.format(label=label, rank=rank), 590 | color=curr_color, 591 | **label_props, 592 | ) 593 | ) 594 | ypos -= 1 595 | 596 | plot_items( 597 | points_left, 598 | xpos=points_left.iloc[0] - text_h_margin, 599 | label_fmt=label_fmt_left, 600 | color_palette=color_palette, 601 | label_props={ 602 | "ha": "right", 603 | **label_props, 604 | }, 605 | ) 606 | 607 | if not left_only: 608 | plot_items( 609 | points_right[::-1], 610 | xpos=points_right.iloc[-1] + text_h_margin, 611 | label_fmt=label_fmt_right, 612 | color_palette=list(reversed(color_palette)) 613 | if isinstance(color_palette, list) 614 | else color_palette, 615 | label_props={"ha": "left", **label_props}, 616 | ) 617 | 618 | return { 619 | "markers": markers, 620 | "elbows": elbows, 621 | "labels": labels, 622 | "crossbars": crossbars, 623 | } 624 | -------------------------------------------------------------------------------- /tests/test_posthocs.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import unittest 4 | import matplotlib as mpl 5 | import scikit_posthocs._posthocs as sp 6 | import scikit_posthocs._omnibus as som 7 | import scikit_posthocs._outliers as so 8 | import scikit_posthocs._plotting as splt 9 | import scikit_posthocs._global as spg 10 | import seaborn as sb 11 | import numpy as np 12 | import matplotlib.axes as ma 13 | from pandas import DataFrame, Series 14 | 15 | if os.environ.get("DISPLAY", "") == "": 16 | print("No display found. 
Using non-interactive Agg backend") 17 | mpl.use("Agg") 18 | sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), ".."))) 19 | 20 | 21 | class TestPosthocs(unittest.TestCase): 22 | # Global tests 23 | def test_global_simes_test(self): 24 | a = np.array([0.9, 0.1, 0.01, 0.99, 1.0, 0.02, 0.04]) 25 | result = spg.global_simes_test(a) 26 | self.assertAlmostEqual(result, 0.07) 27 | 28 | def test_global_f_test(self): 29 | a = np.array([0.9, 0.1, 0.01, 0.99, 1.0, 0.02, 0.04]) 30 | result, _ = spg.global_f_test(a) 31 | self.assertAlmostEqual(result, 0.01294562) 32 | 33 | # Plotting tests 34 | def test_sign_array(self): 35 | p_values = np.array( 36 | [ 37 | [0.0, 0.00119517, 0.00278329], 38 | [0.00119517, 0.0, 0.18672227], 39 | [0.00278329, 0.18672227, 0.0], 40 | ] 41 | ) 42 | test_results = splt.sign_array(p_values) 43 | correct_results = np.array([[-1, 1, 1], [1, -1, 0], [1, 0, -1]]) 44 | self.assertTrue(np.all(test_results == correct_results)) 45 | 46 | def test_sign_table(self): 47 | p_values = np.array( 48 | [ 49 | [1.0, 0.00119517, 0.00278329], 50 | [0.00119517, 1.0, 0.18672227], 51 | [0.00278329, 0.18672227, 1.0], 52 | ] 53 | ) 54 | 55 | correct_results = np.array( 56 | [["-", "**", "**"], ["**", "-", "NS"], ["**", "NS", "-"]], dtype=object 57 | ) 58 | correct_resultsl = np.array( 59 | [["-", "", ""], ["**", "-", ""], ["**", "NS", "-"]], dtype=object 60 | ) 61 | correct_resultsu = np.array( 62 | [["-", "**", "**"], ["", "-", "NS"], ["", "", "-"]], dtype=object 63 | ) 64 | 65 | with self.assertRaises(ValueError): 66 | splt.sign_table(p_values, lower=False, upper=False) 67 | 68 | self.assertTrue( 69 | np.all(splt.sign_table(p_values, lower=False, upper=True) == correct_resultsu) 70 | ) 71 | self.assertTrue( 72 | np.all(splt.sign_table(p_values, lower=True, upper=False) == correct_resultsl) 73 | ) 74 | self.assertTrue( 75 | np.all(splt.sign_table(p_values, lower=True, upper=True) == correct_results) 76 | ) 77 | 78 | def test_sign_plot(self): 79 | x = np.array([[1, 1, 1], [1, 1, 0], [1, 0, 1]]) 80 | a = splt.sign_plot(x, flat=True, labels=False) 81 | with self.assertRaises(ValueError): 82 | splt.sign_plot(x.astype(float), flat=True, labels=False) 83 | self.assertTrue(isinstance(a, ma._axes.Axes)) 84 | 85 | def test_sign_plot_nonflat(self): 86 | x = np.array( 87 | [ 88 | [1.0, 0.00119517, 0.00278329], 89 | [0.00119517, 1.0, 0.18672227], 90 | [0.00278329, 0.18672227, 1.0], 91 | ] 92 | ) 93 | a, cbar = splt.sign_plot(x, cbar=True, labels=False) 94 | 95 | with self.assertRaises(ValueError): 96 | splt.sign_plot(x, cmap=[1, 1], labels=False) 97 | with self.assertRaises(ValueError): 98 | splt.sign_plot(x.astype(np.int64), labels=False) 99 | 100 | self.assertTrue( 101 | isinstance(a, ma._axes.Axes) and isinstance(cbar, mpl.colorbar.ColorbarBase) 102 | ) 103 | 104 | def test_find_maximal_cliques_input_validation(self): 105 | with self.assertRaisesRegex(ValueError, ".*indices do not match"): 106 | splt._find_maximal_cliques( 107 | DataFrame( 108 | [[0, 1], [1, 0]], 109 | index=["a", "b"], 110 | columns=["b", "a"], 111 | ) 112 | ) 113 | with self.assertRaises(ValueError, msg="Input matrix must be binary"): 114 | splt._find_maximal_cliques(DataFrame([[0, 3], [3, 0]])) 115 | with self.assertRaisesRegex(ValueError, ".*empty and symmetric"): 116 | splt._find_maximal_cliques(DataFrame()) 117 | with self.assertRaisesRegex(ValueError, ".*empty and symmetric"): 118 | splt._find_maximal_cliques(DataFrame([[1, 0], [1, 0]])) 119 | 120 | def test_find_maximal_cliques_1x1(self): 121 | adj_matrix = 
DataFrame([[0]], columns=["a"], index=["a"]) 122 | expected = [{"a"}] 123 | self.assertEqual(splt._find_maximal_cliques(adj_matrix), expected) 124 | 125 | def test_find_maximal_cliques_2x2(self): 126 | adj_matrix = DataFrame( 127 | [[0, 1], [1, 0]], 128 | columns=["a", "b"], 129 | index=["a", "b"], 130 | ) 131 | expected = [{"a", "b"}] 132 | self.assertEqual(splt._find_maximal_cliques(adj_matrix), expected) 133 | 134 | def test_find_maximal_cliques_3x3(self): 135 | adj_matrix = DataFrame( 136 | [[0, 0, 1], [0, 0, 0], [1, 0, 0]], 137 | columns=["a", "b", "c"], 138 | index=["a", "b", "c"], 139 | ) 140 | expected = [{"a", "c"}, {"b"}] 141 | self.assertEqual( 142 | set(map(frozenset, splt._find_maximal_cliques(adj_matrix))), 143 | set(map(frozenset, expected)), 144 | ) 145 | 146 | def test_find_maximal_cliques_6x6(self): 147 | adj_matrix = DataFrame( 148 | [ 149 | [0, 1, 0, 0, 0, 0], 150 | [1, 0, 1, 1, 1, 0], 151 | [0, 1, 0, 1, 1, 0], 152 | [0, 1, 1, 0, 1, 0], 153 | [0, 1, 1, 1, 0, 0], 154 | [0, 0, 0, 0, 0, 0], 155 | ] 156 | ) 157 | expected = [{0, 1}, {1, 2, 3, 4}, {5}] 158 | self.assertEqual( 159 | set(map(frozenset, splt._find_maximal_cliques(adj_matrix))), 160 | set(map(frozenset, expected)), 161 | ) 162 | 163 | def test_cd_diagram_number_of_artists(self): 164 | index = list("abcdef") 165 | ranks = Series([2.1, 1.2, 4.5, 3.2, 5.7, 6.5], index=index) 166 | sig_matrix = DataFrame( 167 | [ 168 | [0.08, 0.08, 0.01, 0.01, 0.01, 0.01], 169 | [0.08, 0.08, 0.08, 0.08, 0.08, 0.01], 170 | [0.01, 0.08, 0.08, 0.08, 0.08, 0.01], 171 | [0.01, 0.08, 0.08, 0.08, 0.08, 0.01], 172 | [0.01, 0.08, 0.08, 0.08, 0.08, 0.01], 173 | [0.01, 0.01, 0.01, 0.01, 0.01, 0.08], 174 | ], 175 | index=index, 176 | columns=index, 177 | ) 178 | 179 | output = splt.critical_difference_diagram(ranks, sig_matrix) 180 | self.assertEqual(len(output["markers"]), len(ranks)) 181 | self.assertEqual(len(output["elbows"]), len(ranks)) 182 | self.assertEqual(len(output["labels"]), len(ranks)) 183 | self.assertEqual(len(output["crossbars"]), 2) 184 | 185 | # Outliers tests 186 | def test_outliers_iqr(self): 187 | x = np.array([4, 5, 6, 10, 12, 4, 3, 1, 2, 3, 23, 5, 3]) 188 | 189 | x_filtered = np.array([4, 5, 6, 10, 4, 3, 1, 2, 3, 5, 3]) 190 | indices = np.delete(np.arange(13), [4, 10]) 191 | outliers_indices = np.array([4, 10]) 192 | outliers = np.array([12, 23]) 193 | 194 | test_outliers = so.outliers_iqr(x, ret="outliers") 195 | test_outliers_indices = so.outliers_iqr(x, ret="outliers_indices") 196 | test_indices = so.outliers_iqr(x, ret="indices") 197 | test_filtered = so.outliers_iqr(x, ret="filtered") 198 | 199 | self.assertTrue( 200 | np.all(test_outliers == outliers) 201 | and np.all(test_outliers_indices == outliers_indices) 202 | and np.all(test_indices == indices) 203 | and np.all(test_filtered == x_filtered) 204 | ) 205 | 206 | def test_outliers_grubbs(self): 207 | x = np.array([199.31, 199.53, 200.19, 200.82, 201.92, 201.95, 202.18, 245.57]) 208 | test_results = so.outliers_grubbs(x) 209 | correct_results = np.array([199.31, 199.53, 200.19, 200.82, 201.92, 201.95, 202.18]) 210 | self.assertTrue(so.outliers_grubbs(x, hypo=True)) 211 | self.assertTrue(np.all(test_results == correct_results)) 212 | 213 | def test_outliers_tietjen(self): 214 | x = np.array( 215 | [ 216 | -1.40, 217 | -0.44, 218 | -0.30, 219 | -0.24, 220 | -0.22, 221 | -0.13, 222 | -0.05, 223 | 0.06, 224 | 0.10, 225 | 0.18, 226 | 0.20, 227 | 0.39, 228 | 0.48, 229 | 0.63, 230 | 1.01, 231 | ] 232 | ) 233 | test_results = so.outliers_tietjen(x, 2) 234 | 
correct_results = np.array( 235 | [ 236 | -0.44, 237 | -0.3, 238 | -0.24, 239 | -0.22, 240 | -0.13, 241 | -0.05, 242 | 0.06, 243 | 0.1, 244 | 0.18, 245 | 0.2, 246 | 0.39, 247 | 0.48, 248 | 0.63, 249 | ] 250 | ) 251 | self.assertTrue(so.outliers_tietjen(x, 2, hypo=True)) 252 | self.assertTrue(np.all(test_results == correct_results)) 253 | 254 | def test_outliers_gesd(self): 255 | x = np.array( 256 | [ 257 | -0.25, 258 | 0.68, 259 | 0.94, 260 | 1.15, 261 | 1.2, 262 | 1.26, 263 | 1.26, 264 | 1.34, 265 | 1.38, 266 | 1.43, 267 | 1.49, 268 | 1.49, 269 | 1.55, 270 | 1.56, 271 | 1.58, 272 | 1.65, 273 | 1.69, 274 | 1.7, 275 | 1.76, 276 | 1.77, 277 | 1.81, 278 | 1.91, 279 | 1.94, 280 | 1.96, 281 | 1.99, 282 | 2.06, 283 | 2.09, 284 | 2.1, 285 | 2.14, 286 | 2.15, 287 | 2.23, 288 | 2.24, 289 | 2.26, 290 | 2.35, 291 | 2.37, 292 | 2.4, 293 | 2.47, 294 | 2.54, 295 | 2.62, 296 | 2.64, 297 | 2.9, 298 | 2.92, 299 | 2.92, 300 | 2.93, 301 | 3.21, 302 | 3.26, 303 | 3.3, 304 | 3.59, 305 | 3.68, 306 | 4.3, 307 | 4.64, 308 | 5.34, 309 | 5.42, 310 | 6.01, 311 | ] 312 | ) 313 | correct_mask = np.zeros_like(x, dtype=bool) 314 | correct_mask[-3:] = True 315 | test_results = so.outliers_gesd(x, 5) 316 | test_mask_results = so.outliers_gesd(x, 5, hypo=True) 317 | correct_results = np.array( 318 | [ 319 | -0.25, 320 | 0.68, 321 | 0.94, 322 | 1.15, 323 | 1.2, 324 | 1.26, 325 | 1.26, 326 | 1.34, 327 | 1.38, 328 | 1.43, 329 | 1.49, 330 | 1.49, 331 | 1.55, 332 | 1.56, 333 | 1.58, 334 | 1.65, 335 | 1.69, 336 | 1.7, 337 | 1.76, 338 | 1.77, 339 | 1.81, 340 | 1.91, 341 | 1.94, 342 | 1.96, 343 | 1.99, 344 | 2.06, 345 | 2.09, 346 | 2.1, 347 | 2.14, 348 | 2.15, 349 | 2.23, 350 | 2.24, 351 | 2.26, 352 | 2.35, 353 | 2.37, 354 | 2.4, 355 | 2.47, 356 | 2.54, 357 | 2.62, 358 | 2.64, 359 | 2.9, 360 | 2.92, 361 | 2.92, 362 | 2.93, 363 | 3.21, 364 | 3.26, 365 | 3.3, 366 | 3.59, 367 | 3.68, 368 | 4.3, 369 | 4.64, 370 | ] 371 | ) 372 | self.assertTrue(isinstance(so.outliers_gesd(x, 5, report=True), np.ndarray)) 373 | self.assertTrue(np.array_equal(test_results, correct_results)) 374 | self.assertTrue(np.array_equal(test_mask_results, correct_mask)) 375 | self.assertTrue( 376 | np.array_equal(so.outliers_gesd(correct_results, 5, hypo=False), correct_results) 377 | ) 378 | self.assertTrue( 379 | np.array_equal( 380 | so.outliers_gesd(correct_results, 5, hypo=True), 381 | np.zeros_like(correct_results, dtype=bool), 382 | ) 383 | ) 384 | 385 | # Statistical tests 386 | df = sb.load_dataset("exercise") 387 | df[df.columns[df.dtypes == "category"]] = df[df.columns[df.dtypes == "category"]].astype(object) 388 | df_bn = np.array([[4, 3, 4, 4, 5, 6, 3], [1, 2, 3, 5, 6, 7, 7], [1, 2, 6, 4, 1, 5, 1]]) 389 | 390 | # DataFrame conversion tests 391 | def test_convert_to_block_df(self): 392 | a = np.array( 393 | [ 394 | [0, 0, 0, 4], 395 | [1, 1, 0, 1], 396 | [2, 2, 0, 1], 397 | [0, 0, 1, 3], 398 | [1, 1, 1, 2], 399 | [2, 2, 1, 2], 400 | [0, 0, 2, 4], 401 | [1, 1, 2, 3], 402 | [2, 2, 2, 6], 403 | [0, 0, 3, 4], 404 | [1, 1, 3, 5], 405 | [2, 2, 3, 4], 406 | [0, 0, 4, 5], 407 | [1, 1, 4, 6], 408 | [2, 2, 4, 1], 409 | [0, 0, 5, 6], 410 | [1, 1, 5, 7], 411 | [2, 2, 5, 5], 412 | [0, 0, 6, 3], 413 | [1, 1, 6, 7], 414 | [2, 2, 6, 1], 415 | ], 416 | dtype=float, 417 | ) 418 | df_a = DataFrame(a, columns=["blk_col", "blk_id_col", "grp_col", "y_col"]) 419 | 420 | result = sp.posthoc_nemenyi_friedman( 421 | a, y_col=3, group_col=2, block_col=0, block_id_col=1, melted=True 422 | )[0].values 423 | result2 = sp.posthoc_nemenyi_friedman(self.df_bn)[0].values 424 | result3 
= sp.posthoc_nemenyi_friedman( 425 | df_a, 426 | y_col="y_col", 427 | group_col="grp_col", 428 | block_col="blk_col", 429 | block_id_col="blk_id_col", 430 | melted=True, 431 | )[0].values 432 | self.assertTrue(np.allclose(result, result2)) 433 | self.assertTrue(np.allclose(result, result3)) 434 | self.assertTrue(np.allclose(result2, result3)) 435 | 436 | # Omnibox tests 437 | def test_osrt(self): 438 | df = DataFrame(dict(zip(["a", "b", "c"], self.df_bn.tolist()))).melt() 439 | p, _, _ = som.test_osrt(df, val_col="value", group_col="variable") 440 | result = 0.3157646 441 | self.assertTrue(np.allclose(p, result, atol=1.0e-3)) 442 | 443 | def test_durbin(self): 444 | r_result = np.array([0.205758, 8.468354, 6]) 445 | result = som.test_durbin(self.df_bn) 446 | self.assertTrue(np.allclose(result, r_result)) 447 | 448 | def test_mackwolfe(self): 449 | x = [ 450 | [22, 23, 35], 451 | [60, 59, 54], 452 | [98, 78, 50], 453 | [60, 82, 59], 454 | [22, 44, 33], 455 | [23, 21, 25], 456 | ] 457 | result, _ = som.test_mackwolfe(x, p=2) 458 | self.assertEqual(som.test_mackwolfe(x, p=20), (np.nan, np.nan)) 459 | self.assertEqual(som.test_mackwolfe(x, p=0), (np.nan, np.nan)) 460 | self.assertTrue(np.allclose(result, 0.0006812725)) 461 | 462 | def test_mackwolfe_nperm(self): 463 | x = [ 464 | [22, 23, 35], 465 | [60, 59, 54], 466 | [98, 78, 50], 467 | [60, 82, 59], 468 | [22, 44, 33], 469 | [23, 21, 25], 470 | ] 471 | _, stat = som.test_mackwolfe(x, n_perm=50) 472 | self.assertTrue(np.allclose(stat, 3.2024699769846983)) 473 | 474 | # Post hoc tests 475 | def test_posthoc_anderson(self): 476 | r_results = np.array( 477 | [ 478 | [1, 1.35079e-02, 8.64418e-09], 479 | [1.35079e-02, 1, 1.644534e-05], 480 | [8.64418e-09, 1.644534e-05, 1], 481 | ] 482 | ) 483 | 484 | results = sp.posthoc_anderson(self.df, val_col="pulse", group_col="kind", p_adjust="holm") 485 | self.assertTrue(np.allclose(results.values, r_results, atol=3.0e-3)) 486 | 487 | def test_posthoc_conover(self): 488 | r_results = np.array( 489 | [ 490 | [1, 9.354690e-11, 1.131263e-02], 491 | [9.354690e-11, 1, 5.496288e-06], 492 | [1.131263e-02, 5.496288e-06, 1], 493 | ] 494 | ) 495 | 496 | results = sp.posthoc_conover( 497 | self.df, val_col="pulse", group_col="kind", p_adjust="holm" 498 | ).values 499 | self.assertTrue(np.allclose(results, r_results)) 500 | 501 | def test_posthoc_dunn(self): 502 | r_results = np.array( 503 | [ 504 | [1, 9.570998e-09, 4.390066e-02], 505 | [9.570998e-09, 1, 1.873208e-04], 506 | [4.390066e-02, 1.873208e-04, 1], 507 | ] 508 | ) 509 | 510 | results = sp.posthoc_dunn( 511 | self.df, val_col="pulse", group_col="kind", p_adjust="holm" 512 | ).values 513 | self.assertTrue(np.allclose(results, r_results)) 514 | 515 | def test_posthoc_nemenyi(self): 516 | r_results = np.array( 517 | [ 518 | [1, 2.431833e-08, 1.313107e-01], 519 | [2.431833e-08, 1, 4.855675e-04], 520 | [1.313107e-01, 4.855675e-04, 1], 521 | ] 522 | ) 523 | 524 | results = sp.posthoc_nemenyi(self.df, val_col="pulse", group_col="kind").values 525 | self.assertTrue(np.allclose(results, r_results)) 526 | 527 | def test_posthoc_nemenyi_tukey(self): 528 | r_results = np.array( 529 | [ 530 | [1, 9.793203e-09, 1.088785e-01], 531 | [9.793203e-09, 1, 0.0002789016], 532 | [1.088785e-01, 0.0002789016, 1], 533 | ] 534 | ) 535 | 536 | results = sp.posthoc_nemenyi( 537 | self.df, val_col="pulse", group_col="kind", dist="tukey" 538 | ).values 539 | self.assertTrue(np.allclose(results, r_results, atol=1.0e-3)) 540 | 541 | def test_posthoc_nemenyi_friedman(self): 542 | p_results = 
np.array( 543 | [ 544 | [ 545 | 1.0, 546 | np.nan, 547 | np.nan, 548 | np.nan, 549 | np.nan, 550 | np.nan, 551 | np.nan, 552 | ], 553 | [ 554 | 0.9999999, 555 | 1.0, 556 | np.nan, 557 | np.nan, 558 | np.nan, 559 | np.nan, 560 | np.nan, 561 | ], 562 | [ 563 | 0.8414506, 564 | 0.8833015, 565 | 1.0, 566 | np.nan, 567 | np.nan, 568 | np.nan, 569 | np.nan, 570 | ], 571 | [0.9177741, 0.9449086, 0.9999962, 1.0, np.nan, np.nan, np.nan], 572 | [0.9177741, 0.9449086, 0.9999962, 1.0000000, 1.0, np.nan, np.nan], 573 | [0.2147827, 0.2597539, 0.9449086, 0.8833015, 0.8833015, 1.0, np.nan], 574 | [0.9976902, 0.9991770, 0.9888953, 0.9976902, 0.9976902, 0.5511935, 1.0], 575 | ] 576 | ) 577 | tri_upper = np.triu_indices(p_results.shape[0], 1) 578 | p_results[tri_upper] = np.transpose(p_results)[tri_upper] 579 | results = sp.posthoc_nemenyi_friedman(self.df_bn) 580 | self.assertTrue(np.allclose(results, p_results)) 581 | 582 | def test_posthoc_conover_friedman(self): 583 | results = sp.posthoc_conover_friedman(self.df_bn, p_adjust="bonferroni") 584 | p_results = ( 585 | np.array( 586 | [ 587 | [1.0000000, np.nan, np.nan, np.nan, np.nan, np.nan, np.nan], 588 | [0.9147508, 1.00000000, np.nan, np.nan, np.nan, np.nan, np.nan], 589 | [0.1518030, 0.18071036, 1.0000000, np.nan, np.nan, np.nan, np.nan], 590 | [ 591 | 0.2140927, 592 | 0.25232845, 593 | 0.8305955, 594 | 1.000000, 595 | np.nan, 596 | np.nan, 597 | np.nan, 598 | ], 599 | [ 600 | 0.2140927, 601 | 0.25232845, 602 | 0.8305955, 603 | 1.000000, 604 | 1.000000, 605 | np.nan, 606 | np.nan, 607 | ], 608 | [ 609 | 0.0181602, 610 | 0.02222747, 611 | 0.2523284, 612 | 0.1807104, 613 | 0.1807104, 614 | 1.00009000, 615 | np.nan, 616 | ], 617 | [ 618 | 0.5242303, 619 | 0.59465124, 620 | 0.3989535, 621 | 0.5242303, 622 | 0.5242303, 623 | 0.05991984, 624 | 1.000000, 625 | ], 626 | ] 627 | ) 628 | * 21 629 | ) 630 | p_results[p_results > 1] = 1.0 631 | tri_upper = np.triu_indices(p_results.shape[0], 1) 632 | p_results[tri_upper] = np.transpose(p_results)[tri_upper] 633 | np.fill_diagonal(p_results, 1) 634 | self.assertTrue(np.allclose(results, p_results)) 635 | 636 | def test_posthoc_conover_friedman_tukey(self): 637 | results = sp.posthoc_conover_friedman(self.df_bn, p_adjust="single-step") 638 | p_results = np.array( 639 | [ 640 | [1.00000000, np.nan, np.nan, np.nan, np.nan, np.nan, np.nan], 641 | [0.99999986, 1.0000000, np.nan, np.nan, np.nan, np.nan, np.nan], 642 | [0.72638075, 0.7905289, 1.0000000, np.nan, np.nan, np.nan, np.nan], 643 | [0.84667448, 0.8934524, 0.9999910, 1.0000000, np.nan, np.nan, np.nan], 644 | [ 645 | 0.84667448, 646 | 0.8934524, 647 | 0.9999910, 648 | 1.0000000, 649 | 1.0000000, 650 | np.nan, 651 | np.nan, 652 | ], 653 | [ 654 | 0.09013677, 655 | 0.1187580, 656 | 0.8934524, 657 | 0.7905289, 658 | 0.7905289, 659 | 1.0000000, 660 | np.nan, 661 | ], 662 | [ 663 | 0.99482447, 664 | 0.9981178, 665 | 0.9763466, 666 | 0.9948245, 667 | 0.9948245, 668 | 0.3662675, 669 | 1.000000, 670 | ], 671 | ] 672 | ) 673 | tri_upper = np.triu_indices(p_results.shape[0], 1) 674 | p_results[tri_upper] = np.transpose(p_results)[tri_upper] 675 | np.fill_diagonal(p_results, 1) 676 | self.assertTrue(np.allclose(results, p_results, atol=1e-3)) 677 | 678 | def test_posthoc_conover_friedman_non_melted(self): 679 | df = DataFrame(self.df_bn) 680 | results = sp.posthoc_conover_friedman(df, melted=False) 681 | p_results = np.array( 682 | [ 683 | [1.0000000, np.nan, np.nan, np.nan, np.nan, np.nan, np.nan], 684 | [0.9147508, 1.00000000, np.nan, np.nan, np.nan, np.nan, np.nan], 685 | 
[0.1518030, 0.18071036, 1.0000000, np.nan, np.nan, np.nan, np.nan], 686 | [0.2140927, 0.25232845, 0.8305955, 1.000000, np.nan, np.nan, np.nan], 687 | [0.2140927, 0.25232845, 0.8305955, 1.000000, 1.000000, np.nan, np.nan], 688 | [ 689 | 0.0181602, 690 | 0.02222747, 691 | 0.2523284, 692 | 0.1807104, 693 | 0.1807104, 694 | 1.00009000, 695 | np.nan, 696 | ], 697 | [ 698 | 0.5242303, 699 | 0.59465124, 700 | 0.3989535, 701 | 0.5242303, 702 | 0.5242303, 703 | 0.05991984, 704 | 1.000000, 705 | ], 706 | ] 707 | ) 708 | tri_upper = np.triu_indices(p_results.shape[0], 1) 709 | p_results[tri_upper] = np.transpose(p_results)[tri_upper] 710 | np.fill_diagonal(p_results, 1) 711 | self.assertTrue(np.allclose(results, p_results)) 712 | 713 | def test_posthoc_miller_friedman(self): 714 | results = sp.posthoc_miller_friedman(self.df_bn) 715 | 716 | p_results = np.array( 717 | [ 718 | [ 719 | 1.0, 720 | 1.0, 721 | 0.9411963, 722 | 0.9724396000000001, 723 | 0.9724396000000001, 724 | 0.4717981, 725 | 0.9993864, 726 | ], 727 | [ 728 | 1.0, 729 | 1.0, 730 | 0.9588993, 731 | 0.9823818000000001, 732 | 0.9823818000000001, 733 | 0.5256257, 734 | 0.9997869, 735 | ], 736 | [ 737 | 0.9411963, 738 | 0.9588993, 739 | 1.0, 740 | 0.9999991, 741 | 0.9999991, 742 | 0.9823818000000001, 743 | 0.9968575999999999, 744 | ], 745 | [ 746 | 0.9724396000000001, 747 | 0.9823818000000001, 748 | 0.9999991, 749 | 1.0, 750 | 1.0, 751 | 0.9588993, 752 | 0.9993864, 753 | ], 754 | [ 755 | 0.9724396000000001, 756 | 0.9823818000000001, 757 | 0.9999991, 758 | 1.0, 759 | 1.0, 760 | 0.9588993, 761 | 0.9993864, 762 | ], 763 | [ 764 | 0.4717981, 765 | 0.5256257, 766 | 0.9823818000000001, 767 | 0.9588993, 768 | 0.9588993, 769 | 1.0, 770 | 0.7803545999999999, 771 | ], 772 | [ 773 | 0.9993864, 774 | 0.9997869, 775 | 0.9968575999999999, 776 | 0.9993864, 777 | 0.9993864, 778 | 0.7803545999999999, 779 | 1.0, 780 | ], 781 | ] 782 | ) 783 | 784 | self.assertTrue(np.allclose(results, p_results)) 785 | 786 | def test_posthoc_siegel_friedman(self): 787 | results = sp.posthoc_siegel_friedman(self.df_bn, p_adjust="bonferroni") 788 | 789 | p_results = ( 790 | np.array( 791 | [ 792 | [ 793 | 1.000000, 794 | 0.92471904, 795 | 0.18587673, 796 | 0.25683926, 797 | 0.25683926, 798 | 0.01816302, 799 | 0.57075039, 800 | ], 801 | [ 802 | 0.92471904, 803 | 1.0000000, 804 | 0.2193026, 805 | 0.2986177, 806 | 0.2986177, 807 | 0.0233422, 808 | 0.6366016, 809 | ], 810 | [ 811 | 0.18587673, 812 | 0.2193026, 813 | 1.0000000, 814 | 0.8501067, 815 | 0.8501067, 816 | 0.2986177, 817 | 0.4496918, 818 | ], 819 | [ 820 | 0.25683926, 821 | 0.2986177, 822 | 0.8501067, 823 | 1.000000, 824 | 1.0000000, 825 | 0.2193026, 826 | 0.5707504, 827 | ], 828 | [ 829 | 0.25683926, 830 | 0.2986177, 831 | 0.8501067, 832 | 1.0000000, 833 | 1.0000000, 834 | 0.2193026, 835 | 0.5707504, 836 | ], 837 | [ 838 | 0.01816302, 839 | 0.0233422, 840 | 0.2986177, 841 | 0.2193026, 842 | 0.2193026, 843 | 1.000000, 844 | 0.07260094, 845 | ], 846 | [ 847 | 0.57075039, 848 | 0.6366016, 849 | 0.4496918, 850 | 0.5707504, 851 | 0.5707504, 852 | 0.07260094, 853 | 1.000000, 854 | ], 855 | ] 856 | ) 857 | * 21 858 | ) 859 | p_results[p_results > 1] = 1.0 860 | 861 | self.assertTrue(np.allclose(results, p_results)) 862 | 863 | def test_posthoc_durbin(self): 864 | results = sp.posthoc_durbin(self.df_bn, p_adjust="holm") 865 | 866 | p_results = np.array( 867 | [ 868 | [1.000000, 1.000000, 1.0, 1.0, 1.0, 0.381364, 1.0], 869 | [1.000000, 1.000000, 1.0, 1.0, 1.0, 0.444549, 1.0], 870 | [1.000000, 1.000000, 1.0, 1.0, 1.0, 1.000000, 
1.0], 871 | [1.000000, 1.000000, 1.0, 1.0, 1.0, 1.000000, 1.0], 872 | [1.000000, 1.000000, 1.0, 1.0, 1.0, 1.000000, 1.0], 873 | [0.381364, 0.444549, 1.0, 1.0, 1.0, 1.000000, 1.0], 874 | [1.000000, 1.000000, 1.0, 1.0, 1.0, 1.000000, 1.0], 875 | ] 876 | ) 877 | self.assertTrue(np.allclose(results, p_results)) 878 | 879 | def test_posthoc_quade(self): 880 | results = sp.posthoc_quade(self.df_bn, p_adjust="bonferroni") 881 | 882 | p_results = ( 883 | np.array( 884 | [ 885 | [ 886 | 1.00000000, 887 | 0.67651326, 888 | 0.15432143, 889 | 0.17954686, 890 | 0.2081421, 891 | 0.02267043, 892 | 0.2081421, 893 | ], 894 | [ 895 | 0.67651326, 896 | 1.00000000, 897 | 0.29595042, 898 | 0.33809987, 899 | 0.38443835, 900 | 0.0494024, 901 | 0.38443835, 902 | ], 903 | [ 904 | 0.15432143, 905 | 0.29595042, 906 | 1.00000000, 907 | 0.92586499, 908 | 0.85245022, 909 | 0.29595042, 910 | 0.85245022, 911 | ], 912 | [ 913 | 0.17954686, 914 | 0.33809987, 915 | 0.92586499, 916 | 1.00000000, 917 | 0.92586499, 918 | 0.25789648, 919 | 0.92586499, 920 | ], 921 | [ 922 | 0.2081421, 923 | 0.38443835, 924 | 0.85245022, 925 | 0.92586499, 926 | 1.00000000, 927 | 0.22378308, 928 | 1.00000000, 929 | ], 930 | [ 931 | 0.02267043, 932 | 0.0494024, 933 | 0.29595042, 934 | 0.25789648, 935 | 0.22378308, 936 | 1.00000000, 937 | 0.22378308, 938 | ], 939 | [ 940 | 0.2081421, 941 | 0.38443835, 942 | 0.85245022, 943 | 0.92586499, 944 | 1.00000000, 945 | 0.22378308, 946 | 1.00000000, 947 | ], 948 | ] 949 | ) 950 | * 21 951 | ) 952 | p_results[p_results > 1.0] = 1.0 953 | self.assertTrue(np.allclose(results, p_results)) 954 | 955 | def test_posthoc_quade_norm(self): 956 | results = sp.posthoc_quade(self.df_bn, dist="normal") 957 | 958 | p_results = np.array( 959 | [ 960 | [1.00000000, np.nan, np.nan, np.nan, np.nan, np.nan, np.nan], 961 | [0.5693540320, 1.00000000, np.nan, np.nan, np.nan, np.nan, np.nan], 962 | [0.0430605548, 0.145913303, 1.00000000, np.nan, np.nan, np.nan, np.nan], 963 | [ 964 | 0.0578705783, 965 | 0.184285855, 966 | 0.8993796, 967 | 1.00000000, 968 | np.nan, 969 | np.nan, 970 | np.nan, 971 | ], 972 | [ 973 | 0.0766885196, 974 | 0.229662468, 975 | 0.8003530, 976 | 0.8993796, 977 | 1.00000000, 978 | np.nan, 979 | np.nan, 980 | ], 981 | [ 982 | 0.0005066018, 983 | 0.003634715, 984 | 0.1459133, 985 | 0.1139777, 986 | 0.08782032, 987 | 1.00000000, 988 | np.nan, 989 | ], 990 | [ 991 | 0.0766885196, 992 | 0.229662468, 993 | 0.8003530, 994 | 0.8993796, 995 | 1.00000000, 996 | 0.08782032, 997 | 1.00000000, 998 | ], 999 | ] 1000 | ) 1001 | tri_upper = np.triu_indices(p_results.shape[0], 1) 1002 | p_results[tri_upper] = np.transpose(p_results)[tri_upper] 1003 | self.assertTrue(np.allclose(results, p_results)) 1004 | 1005 | def test_posthoc_npm_test(self): 1006 | data = np.array( 1007 | [ 1008 | [2.4, 3, 3, 2.2, 2.2, 2.2, 2.2, 2.8, 2, 3], 1009 | [2.8, 2.2, 3.8, 9.4, 8.4, 3, 3.2, 4.4, 3.2, 7.4], 1010 | [9.8, 3.2, 5.8, 7.8, 2.6, 2.2, 6.2, 9.4, 7.8, 3.4], 1011 | [7, 9.8, 9.4, 8.8, 8.8, 3.4, 9, 8.4, 2.4, 7.8], 1012 | ] 1013 | ) 1014 | 1015 | results = sp.posthoc_npm_test(data) 1016 | 1017 | p_results = np.array( 1018 | [ 1019 | [1.0, 0.0077, 0.0020, 2e-16], 1020 | [0.0077, 1.0, 0.2884, 0.0854], 1021 | [0.0020, 0.2884, 1.0, 0.1385], 1022 | [2e-16, 0.0854, 0.1385, 1.0], 1023 | ] 1024 | ) 1025 | 1026 | self.assertTrue(np.allclose(results, p_results, rtol=4)) 1027 | 1028 | def test_posthoc_vanwaerden(self): 1029 | r_results = np.array( 1030 | [ 1031 | [1, 1.054709e-02, 6.476665e-11], 1032 | [1.054709e-02, 1, 4.433141e-06], 1033 | [6.476665e-11, 
4.433141e-06, 1], 1034 | ] 1035 | ) 1036 | 1037 | results = sp.posthoc_vanwaerden(self.df, val_col="pulse", group_col="kind", p_adjust="holm") 1038 | self.assertTrue(np.allclose(results, r_results)) 1039 | 1040 | def test_posthoc_dscf(self): 1041 | r_results = np.array( 1042 | [ 1043 | [1, 4.430682e-02, 9.828003e-08], 1044 | [4.430682e-02, 1, 5.655274e-05], 1045 | [9.828003e-08, 5.655274e-05, 1], 1046 | ] 1047 | ) 1048 | 1049 | results = sp.posthoc_dscf(self.df, val_col="pulse", group_col="kind") 1050 | self.assertTrue(np.allclose(results, r_results, atol=0.001)) 1051 | 1052 | def test_posthoc_ttest(self): 1053 | r_results = np.array( 1054 | [ 1055 | [1, 9.757069e-03, 4.100954e-07], 1056 | [9.757069e-03, 1, 1.556010e-05], 1057 | [4.100954e-07, 1.556010e-05, 1], 1058 | ] 1059 | ) 1060 | 1061 | results = sp.posthoc_ttest( 1062 | self.df, val_col="pulse", group_col="kind", equal_var=False, p_adjust="holm" 1063 | ) 1064 | self.assertTrue(np.allclose(results, r_results)) 1065 | 1066 | def test_posthoc_ttest_pooled(self): 1067 | x = [[1, 2, 3, 5, 1], [12, 31, 54, 50, 40], [10, 12, 6, 74, 11]] 1068 | r_results = np.array( 1069 | [ 1070 | [1, 0.04226866, 0.24706893], 1071 | [0.04226866, 1, 0.2482456], 1072 | [0.24706893, 0.2482456, 1], 1073 | ] 1074 | ) 1075 | 1076 | results = sp.posthoc_ttest(x, equal_var=False, p_adjust="holm", pool_sd=True) 1077 | self.assertTrue(np.allclose(results, r_results)) 1078 | 1079 | def test_posthoc_tukey_hsd(self): 1080 | x = [[1, 2, 3, 4, 5], [35, 31, 75, 40, 21], [10, 6, 9, 6, 1]] 1081 | results = sp.posthoc_tukey_hsd(x) 1082 | n_results = np.array( 1083 | [ 1084 | [1.0, 0.000991287, 0.897449027], 1085 | [0.000991287, 1.0, 0.00210909], 1086 | [0.897449027, 0.00210909, 1.0], 1087 | ] 1088 | ) 1089 | self.assertTrue(np.allclose(n_results, results)) 1090 | 1091 | def test_posthoc_mannwhitney(self): 1092 | r_results = ( 1093 | np.array( 1094 | [ 1095 | [1, 3.420508e-08, 1.714393e-02], 1096 | [3.420508e-08, 1, 1.968352e-05], 1097 | [1.714393e-02, 1.968352e-05, 1], 1098 | ] 1099 | ) 1100 | * 3 1101 | ) 1102 | np.fill_diagonal(r_results, 1) 1103 | 1104 | results = sp.posthoc_mannwhitney( 1105 | self.df, val_col="pulse", group_col="kind", p_adjust="bonferroni" 1106 | ).values 1107 | self.assertTrue(np.allclose(results, r_results)) 1108 | 1109 | def test_posthoc_mannwhitney_ndarray(self): 1110 | _x = [[1, 2, 3, 5, 1], [12, 31, 54, 50, 40], [10, 12, 6, 74, 11]] 1111 | x = np.array(_x) 1112 | g = np.repeat([0, 1, 2], 5) 1113 | nd = np.column_stack((x.ravel(), g)) 1114 | xdf = DataFrame(dict(zip(list("abc"), _x))).melt(var_name="groups", value_name="vals") 1115 | results = sp.posthoc_mannwhitney(xdf, val_col="vals", group_col="groups").values 1116 | nd_results = sp.posthoc_mannwhitney(nd, val_col=0, group_col=1).values 1117 | self.assertTrue(np.allclose(nd_results, results)) 1118 | 1119 | def test_posthoc_wilcoxon(self): 1120 | r_results = ( 1121 | np.array( 1122 | [ 1123 | [1, 2.337133e-03, 2.857818e-06], 1124 | [2.337133e-03, 1, 1.230888e-05], 1125 | [2.857818e-06, 1.230888e-05, 1], 1126 | ] 1127 | ) 1128 | * 3 1129 | ) 1130 | np.fill_diagonal(r_results, 1) 1131 | 1132 | results = sp.posthoc_wilcoxon( 1133 | self.df.sort_index(), 1134 | val_col="pulse", 1135 | group_col="kind", 1136 | p_adjust="bonferroni", 1137 | ) 1138 | self.assertTrue(np.allclose(results, r_results, atol=1e-4)) 1139 | 1140 | def test_posthoc_scheffe(self): 1141 | r_results = np.array( 1142 | [ 1143 | [1.0, 3.378449e-01, 3.047472e-10], 1144 | [3.378449e-01, 1.0, 2.173209e-07], 1145 | [3.047472e-10, 
2.173209e-07, 1.0], 1146 | ] 1147 | ) 1148 | 1149 | results = sp.posthoc_scheffe(self.df.sort_index(), val_col="pulse", group_col="kind") 1150 | self.assertTrue(np.allclose(results, r_results)) 1151 | 1152 | def test_posthoc_tamhane(self): 1153 | r_results = np.array( 1154 | [ 1155 | [1, 2.898653e-02, 4.100954e-07], 1156 | [2.898653e-02, 1, 2.333996e-05], 1157 | [4.100954e-07, 2.333996e-05, 1], 1158 | ] 1159 | ) 1160 | 1161 | results = sp.posthoc_tamhane(self.df.sort_index(), val_col="pulse", group_col="kind") 1162 | self.assertTrue(np.allclose(results, r_results)) 1163 | 1164 | def test_posthoc_tamhane_nw(self): 1165 | r_results = np.array( 1166 | [ 1167 | [1, 2.883219e-02, 4.780682e-08], 1168 | [2.883219e-02, 1, 8.643683e-06], 1169 | [4.780682e-08, 8.643683e-06, 1], 1170 | ] 1171 | ) 1172 | 1173 | results = sp.posthoc_tamhane( 1174 | self.df.sort_index(), val_col="pulse", group_col="kind", welch=False 1175 | ) 1176 | self.assertTrue(np.allclose(results, r_results)) 1177 | 1178 | def test_posthoc_tukey(self): 1179 | r_results = np.array( 1180 | [ 1181 | [1, 3.042955e-01, 4.308631e-10], 1182 | [3.042955e-01, 1, 9.946571e-08], 1183 | [4.308631e-10, 9.946571e-08, 1], 1184 | ] 1185 | ) 1186 | 1187 | results = sp.posthoc_tukey(self.df.sort_index(), val_col="pulse", group_col="kind") 1188 | self.assertTrue(np.allclose(results, r_results, atol=1.0e-3)) 1189 | 1190 | def test_posthoc_dunnett(self): 1191 | r_results = [8.125844e-11, 2.427434e-01] 1192 | 1193 | # scipy use randomized Quasi-Monte Carlo integration of the multivariate-t distribution 1194 | # to compute the p-values. The result may vary slightly from run to run. 1195 | # we run the test 1000 times (maximum absolute tolerance = 1.e-4 for example data) 1196 | is_close = [] 1197 | for i in range(100): 1198 | results = sp.posthoc_dunnett( 1199 | self.df.sort_index(), 1200 | val_col="pulse", 1201 | group_col="kind", 1202 | control="rest", 1203 | to_matrix=False, 1204 | ) 1205 | is_close.append(np.allclose(results, r_results, atol=1e-4)) 1206 | 1207 | is_close_mt = [] 1208 | for i in range(100): 1209 | df_results = sp.posthoc_dunnett( 1210 | self.df.sort_index(), 1211 | val_col="pulse", 1212 | group_col="kind", 1213 | control="rest", 1214 | to_matrix=True, 1215 | ) 1216 | results = [ 1217 | df_results.loc["rest", "running"], 1218 | df_results.loc["rest", "walking"], 1219 | ] 1220 | is_close_mt.append(np.allclose(results, r_results, atol=1e-4)) 1221 | self.assertTrue(sum(is_close) > 95) 1222 | self.assertTrue(sum(is_close_mt) > 95) 1223 | 1224 | 1225 | if __name__ == "__main__": 1226 | unittest.main() 1227 | --------------------------------------------------------------------------------