├── tests ├── __init__.py ├── conftest.py ├── test_main_functions.py ├── test_file_loader.py ├── test_content_creation.py ├── test_cli.py ├── test_document_creation.py ├── test_analysis.py ├── test_data_validation.py ├── test_bivariate_analysis.py ├── test_univariate_analysis.py └── test_plotting_functions.py ├── runtime.txt ├── MANIFEST.in ├── eda_report ├── images │ ├── icon.png │ └── background.png ├── __main__.py ├── exceptions.py ├── _read_file.py ├── __init__.py ├── _cli.py ├── _content.py ├── _validate.py ├── _analysis.py ├── bivariate.py ├── gui.py ├── univariate.py ├── document.py └── plotting.py ├── docs ├── source │ ├── _static │ │ ├── haha.png │ │ ├── report.gif │ │ ├── report.png │ │ ├── bar-plot.png │ │ ├── box-plot.png │ │ ├── kde-plot.png │ │ ├── screencast.gif │ │ ├── screencast.png │ │ ├── bar-plot-dark.png │ │ ├── box-plot-dark.png │ │ ├── kde-plot-dark.png │ │ ├── correlation-plot.png │ │ ├── probability-plot.png │ │ ├── regression-plot.png │ │ ├── correlation-plot-dark.png │ │ ├── probability-plot-dark.png │ │ └── regression-plot-dark.png │ ├── eda_report.rst │ ├── eda_report.bivariate.rst │ ├── eda_report.gui.rst │ ├── eda_report.univariate.rst │ ├── eda_report.exceptions.rst │ ├── eda_report.document.rst │ ├── modules.rst │ ├── installation.rst │ ├── quickstart.rst │ ├── index.rst │ ├── conf.py │ ├── eda_report.plotting.rst │ └── examples.txt ├── Makefile ├── make.bat └── requirements.txt ├── pyproject.toml ├── .gitignore ├── .coveragerc ├── .readthedocs.yaml ├── requirements.txt ├── requirements-dev.txt ├── .github └── workflows │ ├── code-cov.yml │ ├── publish-pypi.yml │ └── unit-tests.yml ├── LICENSE ├── setup.cfg └── README.md /tests/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /runtime.txt: -------------------------------------------------------------------------------- 1 | python-3.12 2 | 
-------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include eda_report/images/*.png 2 | exclude tests/* -------------------------------------------------------------------------------- /eda_report/images/icon.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Tim-Abwao/eda-report/HEAD/eda_report/images/icon.png -------------------------------------------------------------------------------- /docs/source/_static/haha.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Tim-Abwao/eda-report/HEAD/docs/source/_static/haha.png -------------------------------------------------------------------------------- /docs/source/_static/report.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Tim-Abwao/eda-report/HEAD/docs/source/_static/report.gif -------------------------------------------------------------------------------- /docs/source/_static/report.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Tim-Abwao/eda-report/HEAD/docs/source/_static/report.png -------------------------------------------------------------------------------- /docs/source/_static/bar-plot.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Tim-Abwao/eda-report/HEAD/docs/source/_static/bar-plot.png -------------------------------------------------------------------------------- /docs/source/_static/box-plot.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Tim-Abwao/eda-report/HEAD/docs/source/_static/box-plot.png 
-------------------------------------------------------------------------------- /docs/source/_static/kde-plot.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Tim-Abwao/eda-report/HEAD/docs/source/_static/kde-plot.png -------------------------------------------------------------------------------- /eda_report/images/background.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Tim-Abwao/eda-report/HEAD/eda_report/images/background.png -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [build-system] 2 | requires = ["setuptools>=64", "wheel"] 3 | build-backend = "setuptools.build_meta" 4 | -------------------------------------------------------------------------------- /docs/source/_static/screencast.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Tim-Abwao/eda-report/HEAD/docs/source/_static/screencast.gif -------------------------------------------------------------------------------- /docs/source/_static/screencast.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Tim-Abwao/eda-report/HEAD/docs/source/_static/screencast.png -------------------------------------------------------------------------------- /eda_report/__main__.py: -------------------------------------------------------------------------------- 1 | from eda_report._cli import run_from_cli 2 | 3 | if __name__ == "__main__": 4 | run_from_cli() 5 | -------------------------------------------------------------------------------- /docs/source/_static/bar-plot-dark.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/Tim-Abwao/eda-report/HEAD/docs/source/_static/bar-plot-dark.png -------------------------------------------------------------------------------- /docs/source/_static/box-plot-dark.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Tim-Abwao/eda-report/HEAD/docs/source/_static/box-plot-dark.png -------------------------------------------------------------------------------- /docs/source/_static/kde-plot-dark.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Tim-Abwao/eda-report/HEAD/docs/source/_static/kde-plot-dark.png -------------------------------------------------------------------------------- /docs/source/_static/correlation-plot.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Tim-Abwao/eda-report/HEAD/docs/source/_static/correlation-plot.png -------------------------------------------------------------------------------- /docs/source/_static/probability-plot.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Tim-Abwao/eda-report/HEAD/docs/source/_static/probability-plot.png -------------------------------------------------------------------------------- /docs/source/_static/regression-plot.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Tim-Abwao/eda-report/HEAD/docs/source/_static/regression-plot.png -------------------------------------------------------------------------------- /docs/source/_static/correlation-plot-dark.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Tim-Abwao/eda-report/HEAD/docs/source/_static/correlation-plot-dark.png 
-------------------------------------------------------------------------------- /docs/source/_static/probability-plot-dark.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Tim-Abwao/eda-report/HEAD/docs/source/_static/probability-plot-dark.png -------------------------------------------------------------------------------- /docs/source/_static/regression-plot-dark.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Tim-Abwao/eda-report/HEAD/docs/source/_static/regression-plot-dark.png -------------------------------------------------------------------------------- /docs/source/eda_report.rst: -------------------------------------------------------------------------------- 1 | eda_report 2 | ---------- 3 | 4 | .. automodule:: eda_report 5 | :members: 6 | :undoc-members: 7 | :show-inheritance: -------------------------------------------------------------------------------- /docs/source/eda_report.bivariate.rst: -------------------------------------------------------------------------------- 1 | eda\_report.bivariate 2 | ======================== 3 | 4 | .. automodule:: eda_report.bivariate 5 | :members: 6 | :undoc-members: 7 | -------------------------------------------------------------------------------- /docs/source/eda_report.gui.rst: -------------------------------------------------------------------------------- 1 | eda\_report.gui 2 | =============== 3 | 4 | .. automodule:: eda_report.gui 5 | :members: 6 | :undoc-members: 7 | :show-inheritance: 8 | -------------------------------------------------------------------------------- /docs/source/eda_report.univariate.rst: -------------------------------------------------------------------------------- 1 | eda\_report.univariate 2 | ====================== 3 | 4 | .. 
automodule:: eda_report.univariate 5 | :members: 6 | :undoc-members: 7 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | venv 2 | .ipynb_checkpoints 3 | __pycache__ 4 | *.docx 5 | .~lock* 6 | dist/ 7 | build/ 8 | eda_report.egg-info 9 | .coverage* 10 | !.coveragerc 11 | htmlcov/ 12 | -------------------------------------------------------------------------------- /docs/source/eda_report.exceptions.rst: -------------------------------------------------------------------------------- 1 | eda\_report.exceptions 2 | ====================== 3 | 4 | .. automodule:: eda_report.exceptions 5 | :members: 6 | :undoc-members: 7 | :show-inheritance: 8 | -------------------------------------------------------------------------------- /docs/source/eda_report.document.rst: -------------------------------------------------------------------------------- 1 | eda\_report.document 2 | ==================== 3 | 4 | .. automodule:: eda_report.document 5 | :members: 6 | :inherited-members: 7 | :undoc-members: 8 | :show-inheritance: 9 | -------------------------------------------------------------------------------- /.coveragerc: -------------------------------------------------------------------------------- 1 | [run] 2 | branch = true 3 | concurrency = multiprocessing 4 | omit = 5 | eda_report/__main__.py 6 | eda_report/gui.py 7 | parallel = true 8 | sigterm = true 9 | source = eda_report 10 | 11 | [report] 12 | fail_under = 85 13 | precision = 2 14 | -------------------------------------------------------------------------------- /docs/source/modules.rst: -------------------------------------------------------------------------------- 1 | API Reference 2 | ============= 3 | 4 | .. 
toctree:: 5 | :maxdepth: 3 6 | 7 | eda_report 8 | eda_report.bivariate 9 | eda_report.document 10 | eda_report.exceptions 11 | eda_report.gui 12 | eda_report.plotting 13 | eda_report.univariate -------------------------------------------------------------------------------- /.readthedocs.yaml: -------------------------------------------------------------------------------- 1 | # Read the Docs configuration file 2 | # See https://docs.readthedocs.io/en/stable/config-file/v2.html for details 3 | 4 | version: 2 5 | build: 6 | os: ubuntu-20.04 7 | tools: 8 | python: "3.11" 9 | 10 | # Build documentation in the docs/ directory with Sphinx 11 | sphinx: 12 | configuration: docs/source/conf.py 13 | formats: 14 | [htmlzip, pdf] 15 | python: 16 | install: 17 | - requirements: docs/requirements.txt -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | contourpy==1.3.1 2 | cycler==0.12.1 3 | et_xmlfile==2.0.0 4 | fonttools==4.55.3 5 | kiwisolver==1.4.8 6 | lxml==5.3.0 7 | matplotlib==3.10.0 8 | numpy==2.2.1 9 | openpyxl==3.1.5 10 | packaging==24.2 11 | pandas==2.2.3 12 | pillow==11.0.0 13 | pyparsing==3.2.0 14 | python-dateutil==2.9.0.post0 15 | python-docx==1.1.2 16 | pytz==2024.2 17 | scipy==1.14.1 18 | six==1.17.0 19 | tqdm==4.67.1 20 | typing_extensions==4.12.2 21 | tzdata==2024.2 22 | -------------------------------------------------------------------------------- /tests/conftest.py: -------------------------------------------------------------------------------- 1 | from shutil import rmtree 2 | 3 | import pytest 4 | from pandas import DataFrame 5 | 6 | 7 | @pytest.fixture(scope="session") 8 | def temp_data_dir(tmp_path_factory): 9 | temp_dir = tmp_path_factory.mktemp("data") 10 | sample_data = DataFrame([[1, 2, 3], [4, 5, 6]], columns=list("ABC")) 11 | sample_data.to_csv(temp_dir / "data.csv", index=False) 12 | sample_data.to_excel(temp_dir 
/ "data.xlsx", index=False) 13 | yield temp_dir 14 | rmtree(temp_dir) 15 | -------------------------------------------------------------------------------- /requirements-dev.txt: -------------------------------------------------------------------------------- 1 | contourpy==1.3.1 2 | coverage==7.6.10 3 | cycler==0.12.1 4 | et_xmlfile==2.0.0 5 | flake8==7.1.1 6 | fonttools==4.55.3 7 | iniconfig==2.0.0 8 | kiwisolver==1.4.8 9 | lxml==5.3.0 10 | matplotlib==3.10.0 11 | mccabe==0.7.0 12 | numpy==2.2.1 13 | openpyxl==3.1.5 14 | packaging==24.2 15 | pandas==2.2.3 16 | pillow==11.0.0 17 | pluggy==1.5.0 18 | pycodestyle==2.12.1 19 | pyflakes==3.2.0 20 | pyparsing==3.2.0 21 | pytest==8.3.4 22 | python-dateutil==2.9.0.post0 23 | python-docx==1.1.2 24 | pytz==2024.2 25 | scipy==1.14.1 26 | six==1.17.0 27 | tqdm==4.67.1 28 | typing_extensions==4.12.2 29 | tzdata==2024.2 30 | -------------------------------------------------------------------------------- /docs/Makefile: -------------------------------------------------------------------------------- 1 | # Minimal makefile for Sphinx documentation 2 | # 3 | 4 | # You can set these variables from the command line, and also 5 | # from the environment for the first two. 6 | SPHINXOPTS ?= 7 | SPHINXBUILD ?= sphinx-build 8 | SOURCEDIR = source 9 | BUILDDIR = build 10 | 11 | # Put it first so that "make" without argument is like "make help". 12 | help: 13 | @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 14 | 15 | .PHONY: help Makefile 16 | 17 | # Catch-all target: route all unknown targets to Sphinx using the new 18 | # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). 
19 | %: Makefile 20 | @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 21 | -------------------------------------------------------------------------------- /tests/test_main_functions.py: -------------------------------------------------------------------------------- 1 | from pandas import DataFrame 2 | 3 | from eda_report import get_word_report, summarize 4 | from eda_report.bivariate import Dataset 5 | from eda_report.document import ReportDocument 6 | from eda_report.univariate import Variable 7 | 8 | sample_data = DataFrame( 9 | { 10 | "A": range(50), 11 | "B": list("abcdef") * 8 + ["a"] * 2, 12 | "C": [True, False] * 24 + [True] * 2, 13 | "D": [1, 3, 5, 7, 9, 11, 13] * 7 + [17], 14 | } 15 | ) 16 | 17 | 18 | def test_get_word_report_function(): 19 | report = get_word_report(sample_data) 20 | assert isinstance(report, ReportDocument) 21 | 22 | 23 | def test_summarize_function(): 24 | summary_1D = summarize(range(25)) 25 | assert isinstance(summary_1D, Variable) 26 | 27 | summary_2D = summarize(sample_data) 28 | assert isinstance(summary_2D, Dataset) 29 | -------------------------------------------------------------------------------- /docs/source/installation.rst: -------------------------------------------------------------------------------- 1 | Installation 2 | ------------ 3 | 4 | .. important:: 5 | Only **Python3.10 to 3.12** are currently supported. 6 | 7 | .. tip:: 8 | Consider using a `virtual environment`_. Virtual environments are a great way to ensure that you install the right versions of dependencies, while avoiding breaking other Python packages in your system. 9 | 10 | You can install ``eda-report`` from the `Python Package Index`_ using ``pip``:: 11 | 12 | $ pip install eda-report 13 | 14 | You can also install the latest stable version right from the `GitHub repository`_ using:: 15 | 16 | $ pip install https://github.com/tim-abwao/eda-report/archive/main.tar.gz 17 | 18 | 19 | .. 
_virtual environment: https://docs.python.org/3/tutorial/venv.html#virtual-environments-and-packages 20 | .. _Python Package Index: https://pypi.org/project/eda-report/ 21 | .. _GitHub repository: https://github.com/Tim-Abwao/eda-report 22 | -------------------------------------------------------------------------------- /.github/workflows/code-cov.yml: -------------------------------------------------------------------------------- 1 | name: Codecov 2 | on: 3 | push: 4 | branches: [main, dev] 5 | pull_request: 6 | branches: [main, dev] 7 | 8 | jobs: 9 | run: 10 | runs-on: ${{ matrix.os }} 11 | strategy: 12 | matrix: 13 | os: [ubuntu-latest, macos-latest, windows-latest] 14 | env: 15 | OS: ${{ matrix.os }} 16 | PYTHON: "3.12" 17 | steps: 18 | - uses: actions/checkout@v4 19 | - name: Setup Python 20 | uses: actions/setup-python@v5 21 | with: 22 | python-version: "3.12" 23 | - name: Generate coverage report 24 | run: | 25 | pip install -r requirements-dev.txt 26 | coverage run -m pytest 27 | coverage combine 28 | - name: Upload coverage to Codecov 29 | uses: codecov/codecov-action@v4 30 | with: 31 | env_vars: OS,PYTHON 32 | flags: unittests 33 | name: codecov-umbrella 34 | verbose: true 35 | -------------------------------------------------------------------------------- /docs/make.bat: -------------------------------------------------------------------------------- 1 | @ECHO OFF 2 | 3 | pushd %~dp0 4 | 5 | REM Command file for Sphinx documentation 6 | 7 | if "%SPHINXBUILD%" == "" ( 8 | set SPHINXBUILD=sphinx-build 9 | ) 10 | set SOURCEDIR=source 11 | set BUILDDIR=build 12 | 13 | if "%1" == "" goto help 14 | 15 | %SPHINXBUILD% >NUL 2>NUL 16 | if errorlevel 9009 ( 17 | echo. 18 | echo.The 'sphinx-build' command was not found. Make sure you have Sphinx 19 | echo.installed, then set the SPHINXBUILD environment variable to point 20 | echo.to the full path of the 'sphinx-build' executable. Alternatively you 21 | echo.may add the Sphinx directory to PATH. 22 | echo. 
23 | echo.If you don't have Sphinx installed, grab it from 24 | echo.http://sphinx-doc.org/ 25 | exit /b 1 26 | ) 27 | 28 | %SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% 29 | goto end 30 | 31 | :help 32 | %SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% 33 | 34 | :end 35 | popd 36 | -------------------------------------------------------------------------------- /eda_report/exceptions.py: -------------------------------------------------------------------------------- 1 | class Error(Exception): 2 | """The base class for exceptions in this package.""" 3 | 4 | pass 5 | 6 | 7 | class InputError(Error): 8 | """*Exception* raised when a given input object is *not of the expected 9 | type* or is otherwise *invalid*. 10 | 11 | In most cases, an attempt is made to cast the erroneous input into the 12 | proper type, and this *Exception* is raised if it fails. 13 | 14 | Args: 15 | message (str): A brief description of the mishap detected. 16 | """ 17 | 18 | def __init__(self, message: str) -> None: 19 | self.message = message 20 | 21 | 22 | class EmptyDataError(InputError): 23 | """*Exception* raised when an iterable input object has length zero or has 24 | no more items to yield. 
25 | """ 26 | 27 | pass 28 | 29 | 30 | class GroupbyVariableError(InputError): 31 | """*Exception* raised when the specified group-by variable is invalid.""" 32 | 33 | pass 34 | -------------------------------------------------------------------------------- /docs/requirements.txt: -------------------------------------------------------------------------------- 1 | alabaster==1.0.0 2 | babel==2.16.0 3 | beautifulsoup4==4.12.3 4 | certifi==2024.12.14 5 | charset-normalizer==3.4.1 6 | contourpy==1.3.1 7 | cycler==0.12.1 8 | docutils==0.21.2 9 | fonttools==4.55.3 10 | furo==2024.8.6 11 | idna==3.10 12 | imagesize==1.4.1 13 | Jinja2==3.1.5 14 | kiwisolver==1.4.8 15 | lxml==5.3.0 16 | MarkupSafe==3.0.2 17 | matplotlib==3.10.0 18 | numpy==2.2.1 19 | packaging==24.2 20 | pandas==2.2.3 21 | pillow==11.0.0 22 | Pygments==2.18.0 23 | pyparsing==3.2.0 24 | python-dateutil==2.9.0.post0 25 | python-docx==1.1.2 26 | pytz==2024.2 27 | requests==2.32.3 28 | scipy==1.14.1 29 | six==1.17.0 30 | snowballstemmer==2.2.0 31 | soupsieve==2.6 32 | Sphinx==8.1.3 33 | sphinx-basic-ng==1.0.0b2 34 | sphinxcontrib-applehelp==2.0.0 35 | sphinxcontrib-devhelp==2.0.0 36 | sphinxcontrib-htmlhelp==2.1.0 37 | sphinxcontrib-jsmath==1.0.1 38 | sphinxcontrib-qthelp==2.0.0 39 | sphinxcontrib-serializinghtml==2.0.0 40 | tqdm==4.67.1 41 | typing_extensions==4.12.2 42 | tzdata==2024.2 43 | urllib3==2.3.0 44 | -------------------------------------------------------------------------------- /tests/test_file_loader.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | from pandas import DataFrame 3 | 4 | from eda_report._read_file import df_from_file 5 | from eda_report.exceptions import InputError 6 | 7 | 8 | class TestFileLoader: 9 | data = DataFrame([[1, 2, 3], [4, 5, 6]], columns=list("ABC")) 10 | 11 | def test_csv_file_load(self, temp_data_dir): 12 | # Check that a valid csv file is read as a DataFrame 13 | assert df_from_file(temp_data_dir / 
"data.csv").equals(self.data) 14 | 15 | def test_excel_file_load(self, temp_data_dir): 16 | # Check that a valid excel file is read as a DataFrame 17 | assert df_from_file(temp_data_dir / "data.xlsx").equals(self.data) 18 | 19 | def test_invalid_file(self): 20 | # Check that an invalid file format/extension raises an InputError 21 | with pytest.raises(InputError) as error: 22 | df_from_file("data.some_extension") 23 | # Check that the error message is as expected 24 | assert "Invalid input file: 'data.some_extension'" in str(error.value) 25 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2021-2025 Abwao 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /.github/workflows/publish-pypi.yml: -------------------------------------------------------------------------------- 1 | # This workflow will upload a Python Package using Twine when a release is created 2 | # For more information see: https://help.github.com/en/actions/language-and-framework-guides/using-python-with-github-actions#publishing-to-package-registries 3 | 4 | # This workflow uses actions that are not certified by GitHub. 5 | # They are provided by a third-party and are governed by 6 | # separate terms of service, privacy policy, and support 7 | # documentation. 8 | 9 | name: Upload Python Package 10 | 11 | on: 12 | release: 13 | types: [published] 14 | 15 | jobs: 16 | deploy: 17 | runs-on: ubuntu-latest 18 | steps: 19 | - uses: actions/checkout@v4 20 | - name: Set up Python 21 | uses: actions/setup-python@v5 22 | with: 23 | python-version: '3.12' 24 | - name: Install dependencies 25 | run: | 26 | python -m pip install --upgrade pip 27 | pip install build 28 | - name: Build package 29 | run: python -m build 30 | - name: Publish package 31 | uses: pypa/gh-action-pypi-publish@ec4db0b4ddc65acdf4bff5fa45ac92d78b56bdf0 32 | with: 33 | user: __token__ 34 | password: ${{ secrets.PYPI_API_TOKEN }} 35 | -------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- 1 | [metadata] 2 | name = eda_report 3 | version = attr: eda_report.__version__ 4 | description = Automate exploratory data analysis and reporting. 
5 | long_description = file: README.md 6 | long_description_content_type = text/markdown 7 | keywords = eda exploratory data analysis report 8 | author = Abwao 9 | author_email = abwaomusungu@gmail.com 10 | url = https://eda-report.readthedocs.io/ 11 | license = MIT 12 | classifiers = 13 | Development Status :: 4 - Beta 14 | Intended Audience :: Science/Research 15 | License :: OSI Approved :: MIT License 16 | Operating System :: OS Independent 17 | Programming Language :: Python :: 3.12 18 | 19 | project_urls = 20 | Source Code = https://github.com/Tim-Abwao/eda-report 21 | 22 | [options] 23 | packages = find: 24 | install_requires = 25 | matplotlib>=3.10.0 26 | openpyxl>=3.1.5 27 | pandas>=2.2.3 28 | python-docx>=1.1.2 29 | scipy>=1.14.1 30 | tqdm>=4.67.1 31 | include_package_data = True 32 | python_requires = >=3.10 33 | 34 | [options.entry_points] 35 | console_scripts = 36 | eda-report = eda_report._cli:run_from_cli 37 | 38 | [options.extras_require] 39 | dev = 40 | black>=24.10.0 41 | coverage>=7.6.10 42 | flake8>=7.1.1 43 | pytest>=8.3.4 44 | 45 | [options.package_data] 46 | eda_report = eda_report/images/*.png 47 | -------------------------------------------------------------------------------- /eda_report/_read_file.py: -------------------------------------------------------------------------------- 1 | from pathlib import Path 2 | from typing import Union 3 | 4 | import pandas as pd 5 | 6 | from eda_report.exceptions import InputError 7 | 8 | 9 | def df_from_file(filepath: Union[str, Path]) -> pd.DataFrame: 10 | """Reads a file, and loads its contents as a :class:`~pandas.DataFrame`. 11 | 12 | File formats are currently restricted to *csv* and *excel*, since these 13 | are the most often used to store data. 14 | 15 | This is basically a wrapper around ``pandas'`` input functions: 16 | 17 | * :func:`pandas.read_csv` 18 | * :func:`pandas.read_excel` 19 | 20 | 21 | Args: 22 | filepath (Union[str, Path]): The path to a file. 
23 | 24 | Raises: 25 | InputError: If the supplied filepath is invalid, for instance if the 26 | file is of an incorrect format or does not exist. 27 | 28 | Returns: 29 | pandas.DataFrame: The specified file's contents. 30 | """ 31 | file = Path(filepath) 32 | 33 | if file.suffix == ".csv": 34 | return pd.read_csv(file) 35 | elif file.suffix == ".xlsx": 36 | return pd.read_excel(file, engine="openpyxl") 37 | else: 38 | raise InputError( 39 | f"Invalid input file: '{filepath}'. Expected a CSV or Excel file." 40 | ) 41 | -------------------------------------------------------------------------------- /.github/workflows/unit-tests.yml: -------------------------------------------------------------------------------- 1 | # This workflow will install Python dependencies, run tests and lint with a variety of Python versions 2 | # For more information see: https://help.github.com/actions/language-and-framework-guides/using-python-with-github-actions 3 | 4 | name: Python 3.10-3.12 5 | 6 | on: 7 | push: 8 | branches: [main, dev] 9 | pull_request: 10 | branches: [main, dev] 11 | 12 | jobs: 13 | build: 14 | runs-on: ubuntu-latest 15 | strategy: 16 | matrix: 17 | python-version: ["3.10", "3.11", "3.12"] 18 | 19 | steps: 20 | - uses: actions/checkout@v4 21 | - name: Set up Python ${{ matrix.python-version }} 22 | uses: actions/setup-python@v5 23 | with: 24 | python-version: ${{ matrix.python-version }} 25 | - name: Install dependencies 26 | run: | 27 | pip install -U pip 28 | pip install -r requirements-dev.txt 29 | - name: Lint with flake8 30 | run: | 31 | # stop the build if there are Python syntax errors or undefined names 32 | flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics 33 | # exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide 34 | flake8 . 
--count --exit-zero --max-complexity=10 --max-line-length=127 --statistics 35 | - name: Test with pytest 36 | run: | 37 | pytest tests/ 38 | -------------------------------------------------------------------------------- /tests/test_content_creation.py: -------------------------------------------------------------------------------- 1 | from pandas import DataFrame 2 | 3 | from eda_report._analysis import _AnalysisResult 4 | from eda_report._content import _ReportContent 5 | 6 | data = DataFrame( 7 | {"A": range(48), "B": list(range(16)) * 3, "C": list("abcd") * 12} 8 | ) 9 | 10 | 11 | class TestReportContent: 12 | content = _ReportContent(data, title="Some Title") 13 | 14 | def test_general_attributes(self): 15 | assert isinstance(self.content, _AnalysisResult) 16 | assert self.content.GRAPH_COLOR == "cyan" 17 | assert self.content.TITLE == "Some Title" 18 | assert self.content.GROUPBY_DATA is None 19 | 20 | def test_intro(self): 21 | assert self.content.intro_text == ( 22 | "The dataset consists of 48 rows (observations) and 3 columns " 23 | "(features), 2 of which are numeric." 24 | ) 25 | 26 | def test_variable_descriptions(self): 27 | assert self.content.variable_descriptions == { 28 | "A": ( 29 | "A is a numeric variable with 48 unique values." 30 | " None of its values are missing." 31 | ), 32 | "B": ( 33 | "B is a numeric variable with 16 unique values." 34 | " None of its values are missing." 35 | ), 36 | "C": ( 37 | "C is a categorical variable with 4 unique values." 38 | " None of its values are missing." 39 | ), 40 | } 41 | 42 | def test_bivariate_summaries(self): 43 | assert self.content.bivariate_summaries == { 44 | ("A", "B"): "A and B have weak positive correlation (0.33)." 
45 | } 46 | 47 | 48 | def test_limiting_bivariate_summaries(): 49 | content = _ReportContent([range(12), [1, 2, 3, 4] * 3]) 50 | # content has 66 var_pairs (66 possible pairs from 12 numeric cols) 51 | # but the limit for summaries is 20 52 | assert len(content.bivariate_summaries) == 20 53 | -------------------------------------------------------------------------------- /docs/source/quickstart.rst: -------------------------------------------------------------------------------- 1 | Quickstart 2 | ========== 3 | 4 | Using the Graphical User Interface 5 | ---------------------------------- 6 | 7 | The command ``eda-report`` launches a graphical window to help select a *csv* or *excel* file to analyze:: 8 | 9 | $ eda-report 10 | 11 | .. figure:: _static/screencast.* 12 | :alt: an image of the graphical user interface 13 | 14 | A ``tkinter``-based graphical user interface to the application 15 | 16 | You will be prompted to enter your desired *title*, *groupby/target variable*, *graph color* & *output file-name*. Afterwards, a report is generated, as specified, from the contents of the selected file. 17 | 18 | .. hint:: 19 | For help with `Tk` - related issues, consider visiting `TkDocs`_. 20 | 21 | .. _`TkDocs`: https://tkdocs.com/index.html 22 | 23 | Using the Command Line Interface 24 | -------------------------------- 25 | 26 | You can specify an input file and an output file-name:: 27 | 28 | $ eda-report -i data.csv -o some_name.docx 29 | 30 | .. literalinclude:: examples.txt 31 | :lines: 106-128 32 | 33 | From an Interactive Session 34 | --------------------------- 35 | 36 | You can use the :func:`~eda_report.get_word_report` function to generate reports: 37 | 38 | .. literalinclude:: examples.txt 39 | :lines: 136-142 40 | 41 | You can use the :func:`~eda_report.summarize` function to analyze datasets: 42 | 43 | .. literalinclude:: examples.txt 44 | :lines: 146-171 45 | .. 
literalinclude:: examples.txt 46 | :lines: 172-195 47 | 48 | You can plot several statistical graphs (see :ref:`plotting-examples`): 49 | 50 | >>> import eda_report.plotting as ep 51 | >>> ax = ep.plot_correlation(mpg_data) 52 | >>> ax.figure.savefig("correlation-plot.png") 53 | 54 | .. image:: _static/correlation-plot.png 55 | :width: 80% 56 | :align: center 57 | :alt: a correlation-plot 58 | :class: only-light 59 | 60 | .. image:: _static/correlation-plot-dark.png 61 | :width: 80% 62 | :align: center 63 | :alt: a correlation-plot 64 | :class: only-dark 65 | -------------------------------------------------------------------------------- /docs/source/index.rst: -------------------------------------------------------------------------------- 1 | ``eda-report`` User Guide 2 | ========================= 3 | 4 | Speed up the `exploratory data analysis`_ and reporting process. Automatically analyze a dataset, and get: 5 | 6 | 1. Statistical properties 7 | ------------------------- 8 | 9 | Descriptive statistics, bivariate analysis, tests for normality and more: 10 | 11 | .. literalinclude:: examples.txt 12 | :lines: 146-171 13 | 14 | 2. Revealing visualizations 15 | --------------------------- 16 | 17 | - *Box-plots*, *kde-plots*, *normal-probability-plots*, *scatter-plots* and a *correlation bar-chart* for numeric variables. 18 | - *Bar-plots* for categorical variables. 19 | 20 | >>> import eda_report.plotting as ep 21 | >>> ax = ep.regression_plot(mpg_data["acceleration"], mpg_data["horsepower"], 22 | ... labels=("Acceleration", "Horsepower")) 23 | >>> ax.figure.savefig("regression-plot.png") 24 | 25 | .. image:: _static/regression-plot.png 26 | :width: 80% 27 | :align: center 28 | :alt: a regression-plot 29 | :class: only-light 30 | 31 | .. image:: _static/regression-plot-dark.png 32 | :width: 80% 33 | :align: center 34 | :alt: a regression-plot 35 | :class: only-dark 36 | 37 | 3. 
A report in *Word* (.docx) format 38 | ------------------------------------ 39 | 40 | An exploratory data analysis report document complete with variable descriptions, summary statistics, statistical plots, contingency tables and more: 41 | 42 | .. literalinclude:: examples.txt 43 | :lines: 136-142 44 | 45 | .. figure:: _static/report.* 46 | :alt: iris dataset report animation 47 | 48 | A report generated from the *iris dataset*. 49 | 50 | .. image:: https://mybinder.org/badge_logo.svg 51 | :target: https://mybinder.org/v2/gh/Tim-Abwao/eda-report/HEAD?filepath=eda-report-basics.ipynb 52 | 53 | .. _exploratory data analysis: https://en.wikipedia.org/wiki/Exploratory_data_analysis 54 | 55 | .. toctree:: 56 | :maxdepth: 2 57 | 58 | installation 59 | quickstart 60 | modules 61 | 62 | Indices and tables 63 | ================== 64 | 65 | * :ref:`genindex` 66 | * :ref:`modindex` 67 | -------------------------------------------------------------------------------- /tests/test_cli.py: -------------------------------------------------------------------------------- 1 | import sys 2 | from pathlib import Path 3 | 4 | from eda_report._cli import run_from_cli 5 | from eda_report.gui import EDAGUI 6 | 7 | 8 | class TestCLIArgumentParsing: 9 | def test_with_all_args(self, temp_data_dir, monkeypatch): 10 | # Simulate supplying all args 11 | monkeypatch.setattr( 12 | sys, 13 | "argv", 14 | [ 15 | "eda-report", 16 | "-i", 17 | f"{temp_data_dir / 'data.csv'}", 18 | "-o", 19 | f"{temp_data_dir}/cli-test-1.docx", 20 | "-t", 21 | "CLI Test", 22 | "-c", 23 | "teal", 24 | "-g", 25 | "A", 26 | ], 27 | ) 28 | run_from_cli() 29 | expected_output = temp_data_dir / "cli-test-1.docx" 30 | assert expected_output.is_file() 31 | 32 | def test_with_only_input_file(self, temp_data_dir, monkeypatch): 33 | # Supply only the input file; it has no default.
34 | monkeypatch.setattr( 35 | sys, "argv", ["eda-report", "-i", f"{temp_data_dir / 'data.xlsx'}"] 36 | ) 37 | run_from_cli() 38 | expected_output = Path("eda-report.docx") 39 | assert expected_output.is_file() 40 | 41 | Path("eda-report.docx").unlink() # Remove resultant report 42 | 43 | def test_without_optional_args(self, monkeypatch, capsys): 44 | # Simulate launching the GUI 45 | def mock_gui_init(gui): 46 | """Simulate GUI initialization.""" 47 | pass 48 | 49 | def mock_gui_mainloop(gui): 50 | """Simulate running GUI.""" 51 | print("Graphical user interface running in Tk mainloop.") 52 | 53 | monkeypatch.setattr(EDAGUI, "__init__", mock_gui_init) 54 | monkeypatch.setattr(EDAGUI, "mainloop", mock_gui_mainloop) 55 | 56 | # Simulate running with no args 57 | monkeypatch.setattr(sys, "argv", ["eda-report"]) 58 | run_from_cli() 59 | 60 | captured = capsys.readouterr() 61 | assert ( 62 | "Graphical user interface running in Tk mainloop." in captured.out 63 | ) 64 | -------------------------------------------------------------------------------- /eda_report/__init__.py: -------------------------------------------------------------------------------- 1 | from collections.abc import Iterable 2 | from typing import Union 3 | 4 | from eda_report._validate import _validate_dataset 5 | from eda_report.bivariate import Dataset 6 | from eda_report.document import ReportDocument 7 | from eda_report.univariate import Variable 8 | 9 | __version__ = "2.8.2" 10 | 11 | 12 | def get_word_report( 13 | data: Iterable, 14 | *, 15 | title: str = "Exploratory Data Analysis Report", 16 | graph_color: str = "cyan", 17 | groupby_variable: Union[str, int] = None, 18 | output_filename: str = "eda-report.docx", 19 | table_style: str = "Table Grid", 20 | ) -> ReportDocument: 21 | """Analyze `data`, and generate a report document in *Word* (*.docx*) 22 | format. 23 | 24 | Args: 25 | data (Iterable): The data to analyze. 26 | title (str, optional): The title to assign the report. 
Defaults to 27 | "Exploratory Data Analysis Report". 28 | graph_color (str, optional): The color to apply to the graphs. 29 | Defaults to "cyan". 30 | groupby_variable (Union[str, int], optional): The label/index for the 31 | column to use to group values. Defaults to None. 32 | output_filename (str, optional): The name/path to save the report 33 | document. Defaults to "eda-report.docx". 34 | table_style (str, optional): The style to apply to the tables created. 35 | Defaults to "Table Grid". 36 | 37 | Returns: 38 | ReportDocument: Document object with analysis results. 39 | 40 | Example: 41 | .. literalinclude:: examples.txt 42 | :lines: 136-142 43 | """ 44 | return ReportDocument( 45 | data, 46 | title=title, 47 | graph_color=graph_color, 48 | output_filename=output_filename, 49 | groupby_variable=groupby_variable, 50 | table_style=table_style, 51 | ) 52 | 53 | 54 | def summarize(data: Iterable) -> Union[Variable, Dataset]: 55 | """Get summary statistics for the supplied data. 56 | 57 | Args: 58 | data (Iterable): The data to analyze. 59 | 60 | Returns: 61 | Union[Variable, Dataset]: Analysis results. 62 | 63 | Example: 64 | .. 
literalinclude:: examples.txt 65 | :lines: 172-195 66 | """ 67 | data = _validate_dataset(data) 68 | if data.shape[1] == 1: 69 | return Variable(data.squeeze()) 70 | else: 71 | return Dataset(data) 72 | -------------------------------------------------------------------------------- /tests/test_document_creation.py: -------------------------------------------------------------------------------- 1 | from io import BytesIO 2 | 3 | from pandas import DataFrame 4 | 5 | from eda_report.document import ReportDocument 6 | 7 | 8 | class TestReportWithIdealInput: 9 | data = DataFrame( 10 | {"A": range(50), "B": [1, 2, 3, 4, 5] * 10, "C": list("ab") * 25} 11 | ) 12 | report = ReportDocument( 13 | data, 14 | title="Test Report", 15 | graph_color="teal", 16 | groupby_variable="C", 17 | output_filename=BytesIO(), 18 | ) 19 | 20 | def test_general_properties(self): 21 | # Largely covered in _ReportContent tests 22 | assert self.report.TITLE == "Test Report" 23 | assert self.report.GRAPH_COLOR == "teal" 24 | assert "correlation_plot" in self.report.bivariate_graphs 25 | assert "regression_plots" in self.report.bivariate_graphs 26 | assert self.report.TABLE_STYLE == "Table Grid" 27 | 28 | 29 | class TestReportWithLimitedInput: 30 | data = DataFrame( 31 | {"categorical": list("ABCDEFGHIJKL" * 2), "numeric": range(24)} 32 | ) 33 | report = ReportDocument( 34 | data, 35 | title="One Numeric One Categorical", 36 | graph_color="lime", 37 | output_filename=BytesIO(), 38 | ) 39 | 40 | def test_report_creation(self): 41 | assert isinstance(self.report, ReportDocument) 42 | assert self.report.TITLE == "One Numeric One Categorical" 43 | assert self.report.GRAPH_COLOR == "lime" 44 | 45 | def test_bivariate_analysis(self): 46 | assert self.report.bivariate_summaries is None 47 | assert self.report.bivariate_graphs is None 48 | 49 | 50 | class TestReportWithUnivariateInput: 51 | univariate_numeric_report = ReportDocument( 52 | DataFrame(range(5)), 53 | title="Univariate Numeric Report", 54 | 
output_filename=BytesIO(), 55 | ) 56 | univariate_categorical_report = ReportDocument( 57 | DataFrame(["a"]), 58 | title="Univariate Categorical Report", 59 | output_filename=BytesIO(), 60 | ) 61 | 62 | def test_bivariate_analysis(self): 63 | assert self.univariate_numeric_report.bivariate_summaries is None 64 | assert self.univariate_categorical_report.bivariate_summaries is None 65 | 66 | assert self.univariate_numeric_report.bivariate_graphs is None 67 | assert self.univariate_categorical_report.bivariate_graphs is None 68 | 69 | 70 | def test_output_file(temp_data_dir): 71 | ReportDocument(range(50), output_filename=temp_data_dir / "eda.docx") 72 | assert (temp_data_dir / "eda.docx").is_file() 73 | -------------------------------------------------------------------------------- /docs/source/conf.py: -------------------------------------------------------------------------------- 1 | # Configuration file for the Sphinx documentation builder. 2 | # 3 | # This file only contains a selection of the most common options. For a full 4 | # list see the documentation: 5 | # https://www.sphinx-doc.org/en/master/usage/configuration.html 6 | 7 | # -- Path setup -------------------------------------------------------------- 8 | 9 | # If extensions (or modules to document with autodoc) are in another directory, 10 | # add these directories to sys.path here. If the directory is relative to the 11 | # documentation root, use os.path.abspath to make it absolute, like shown here. 
12 | # 13 | import os 14 | import sys 15 | 16 | from sphinx.builders.html import StandaloneHTMLBuilder 17 | 18 | sys.path.insert(0, os.path.abspath("../../")) 19 | 20 | # Modify supported image order 21 | StandaloneHTMLBuilder.supported_image_types = [ 22 | "image/svg+xml", 23 | "image/gif", 24 | "image/png", 25 | "image/jpeg", 26 | ] 27 | 28 | # -- Project information ----------------------------------------------------- 29 | 30 | project = "eda-report" 31 | copyright = "2022, Abwao" 32 | author = "Abwao" 33 | 34 | # The full version, including alpha/beta/rc tags 35 | release = "2.8.2" 36 | 37 | # -- General configuration --------------------------------------------------- 38 | 39 | # Add any Sphinx extension module names here, as strings. They can be 40 | # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom 41 | # ones. 42 | extensions = [ 43 | "sphinx.ext.autodoc", 44 | "sphinx.ext.intersphinx", 45 | "sphinx.ext.napoleon", 46 | "sphinx.ext.viewcode", 47 | ] 48 | 49 | intersphinx_mapping = { 50 | "docx": ("https://python-docx.readthedocs.io/en/latest/", None), 51 | "pandas": ("https://pandas.pydata.org/docs/", None), 52 | "python": ("https://docs.python.org/3", None), 53 | "matplotlib": ("https://matplotlib.org/stable/", None), 54 | } 55 | 56 | 57 | # Add any paths that contain templates here, relative to this directory. 58 | templates_path = ["_templates"] 59 | 60 | # List of patterns, relative to source directory, that match files and 61 | # directories to ignore when looking for source files. 62 | # This pattern also affects html_static_path and html_extra_path. 63 | exclude_patterns = [] 64 | 65 | # -- Options for HTML output ------------------------------------------------- 66 | master_doc = "index" 67 | # The theme to use for HTML and HTML Help pages. See the documentation for 68 | # a list of builtin themes. 
69 | # 70 | html_theme = "furo" 71 | html_theme_options = { 72 | "light_css_variables": { 73 | "font-stack": "Georgia, serif", 74 | "font-stack--monospace": "Courier, monospace", 75 | }, 76 | } 77 | # Add any paths that contain custom static files (such as style sheets) here, 78 | # relative to this directory. They are copied after the builtin static files, 79 | # so a file named "default.css" will overwrite the builtin "default.css". 80 | html_static_path = ["_static"] 81 | -------------------------------------------------------------------------------- /eda_report/_cli.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | from typing import Optional 3 | 4 | from eda_report._read_file import df_from_file 5 | from eda_report.document import ReportDocument 6 | 7 | 8 | def process_cli_args() -> argparse.Namespace: 9 | """Captures and parses input from the command line interface using the 10 | :mod:`argparse` module from the Python standard library. 11 | 12 | Returns: 13 | argparse.Namespace: Object with the parsed arguments as attributes. 14 | 15 | Example: 16 | .. literalinclude:: examples.txt 17 | :lines: 106-128 18 | """ 19 | parser = argparse.ArgumentParser( 20 | prog="eda-report", 21 | description=( 22 | "Automatically analyze data and generate reports. A graphical user" 23 | " interface will be launched if none of the optional arguments is " 24 | "specified." 
25 | ), 26 | ) 27 | parser.add_argument( 28 | "-i", 29 | "--infile", 30 | type=df_from_file, 31 | help="A .csv or .xlsx file to analyze.", 32 | ) 33 | parser.add_argument( 34 | "-o", 35 | "--outfile", 36 | default="eda-report.docx", 37 | help="The output name for analysis results (default: %(default)s)", 38 | ) 39 | parser.add_argument( 40 | "-t", 41 | "--title", 42 | default="Exploratory Data Analysis Report", 43 | help="The top level heading for the report (default: %(default)s)", 44 | ) 45 | parser.add_argument( 46 | "-c", 47 | "--color", 48 | default="cyan", 49 | help="The color to apply to graphs (default: %(default)s)", 50 | ) 51 | parser.add_argument( 52 | "-g", 53 | "-T", 54 | "--groupby", 55 | "--target", 56 | help=( 57 | "The variable to use for grouping plotted values. An integer value" 58 | " is treated as a column index, whereas a string is treated as a" 59 | " column label." 60 | ), 61 | ) 62 | return parser.parse_args() 63 | 64 | 65 | def run_from_cli() -> Optional[ReportDocument]: 66 | """Creates an exploratory data analysis report in *Word* format using input 67 | from the command line interface. 68 | 69 | This is the function executed when the package is run as a script (using 70 | ``python -m eda_report``). It is also the entry point for the 71 | ``eda-report`` command (console script). 
72 | """ 73 | args = process_cli_args() 74 | if args.infile is None: 75 | from eda_report.gui import EDAGUI 76 | # Launch graphical user interface to select and analyze a file 77 | app = EDAGUI() 78 | app.mainloop() 79 | else: 80 | ReportDocument( 81 | args.infile, 82 | title=args.title, 83 | graph_color=args.color, 84 | output_filename=args.outfile, 85 | groupby_variable=args.groupby, 86 | ) 87 | -------------------------------------------------------------------------------- /eda_report/_content.py: -------------------------------------------------------------------------------- 1 | from typing import Dict, Iterable, Union 2 | 3 | from eda_report._analysis import _AnalysisResult 4 | 5 | 6 | class _ReportContent(_AnalysisResult): 7 | """Prepares textual summaries of analysis results. 8 | 9 | Args: 10 | data (Iterable): The data to analyze. 11 | title (str, optional): The title to assign the report. Defaults to 12 | "Exploratory Data Analysis Report". 13 | graph_color (str, optional): The color to apply to the graphs. 14 | Defaults to "cyan". 15 | groupby_variable (Union[str, int], optional): The column to 16 | use to group values. Defaults to None. 17 | """ 18 | 19 | def __init__( 20 | self, 21 | data: Iterable, 22 | *, 23 | title: str = "Exploratory Data Analysis Report", 24 | graph_color: str = "cyan", 25 | groupby_variable: Union[str, int] = None, 26 | ) -> None: 27 | super().__init__( 28 | data, graph_color=graph_color, groupby_variable=groupby_variable 29 | ) 30 | self.TITLE = title 31 | self.intro_text = self._get_introductory_summary() 32 | self.variable_descriptions = self._describe_variables() 33 | 34 | def _get_introductory_summary(self) -> str: 35 | """Get an overview of the number of rows and the nature of columns. 36 | 37 | Returns: 38 | str: Introduction. 
39 | """ 40 | num_rows, num_cols = self.dataset.data.shape 41 | if num_rows == 1: 42 | rows = "1 row (observation)" 43 | else: 44 | rows = f"{num_rows:,} rows (observations)" 45 | 46 | if num_cols == 1: 47 | cols = "1 column (feature)" 48 | else: 49 | cols = f"{num_cols:,} columns (features)" 50 | 51 | if self.dataset._numeric_stats is None: 52 | numeric_descr = "" 53 | else: 54 | num_numeric = self.dataset._numeric_stats.shape[0] 55 | if num_numeric == 1: 56 | numeric_descr = ", 1 of which is numeric" 57 | else: 58 | numeric_descr = f", {num_numeric} of which are numeric" 59 | 60 | return f"The dataset consists of {rows} and {cols}{numeric_descr}." 61 | 62 | def _describe_variables(self) -> Dict[str, str]: 63 | """Get summary statistics for a variable. 64 | 65 | Returns: 66 | Dict[str, str]: Summary statistics. 67 | """ 68 | descriptions = {} 69 | for name, variable in self.variables.items(): 70 | if variable.num_unique == 1: 71 | unique_vals = "1 unique value" 72 | else: 73 | unique_vals = f"{variable.num_unique:,} unique values" 74 | 75 | descriptions[name] = ( 76 | f"{variable.name.capitalize()} is a {variable.var_type} " 77 | f"variable with {unique_vals}. {variable.missing} of its " 78 | "values are missing." 79 | ) 80 | return descriptions 81 | -------------------------------------------------------------------------------- /docs/source/eda_report.plotting.rst: -------------------------------------------------------------------------------- 1 | eda\_report.plotting 2 | ==================== 3 | 4 | You can find a wealth of plotting libraries at the `PyViz`_ website. 5 | 6 | .. _PyViz: https://pyviz.org/ 7 | 8 | The plotting functions below are implemented using `matplotlib`_. In the interest of efficiency, especially for large datasets with numerous columns; these plotting functions use a *non-interactive* `matplotlib backend`_. 
This was inspired by `Embedding in a web application server`_, which says in part: 9 | 10 | 11 | When using Matplotlib in a web server [GUI application, in this case] it is strongly recommended to not use :mod:`~matplotlib.pyplot` (pyplot maintains references to the opened figures to make `show`_ work, but this will cause memory leaks unless the figures are properly closed). 12 | 13 | 14 | .. _matplotlib: https://matplotlib.org/ 15 | .. _matplotlib backend: https://matplotlib.org/stable/users/explain/backends.html#the-builtin-backends 16 | .. _Embedding in a web application server: https://matplotlib.org/stable/gallery/user_interfaces/web_application_server_sgskip.html 17 | .. _show: https://matplotlib.org/stable/api/_as_gen/matplotlib.pyplot.show.html#matplotlib.pyplot.show 18 | 19 | You can conveniently view the generated figures in a *jupyter notebook* using ``%matplotlib inline``, as shown in this `demo notebook`_. 20 | 21 | .. _demo notebook: https://mybinder.org/v2/gh/Tim-Abwao/eda-report/HEAD?filepath=eda-report-basics.ipynb 22 | 23 | .. image:: https://mybinder.org/badge_logo.svg 24 | :target: https://mybinder.org/v2/gh/Tim-Abwao/eda-report/HEAD?filepath=eda-report-basics.ipynb 25 | 26 | Otherwise, you'll probably need to export them as images. 27 | 28 | .. _plotting-examples: 29 | 30 | Plotting Examples 31 | ----------------- 32 | >>> import eda_report.plotting as ep 33 | >>> ax = ep.bar_plot(mpg_data["origin"], label="Country of Origin") 34 | >>> ax.figure.savefig("bar-plot.png") 35 | 36 | .. image:: _static/bar-plot.png 37 | :width: 80% 38 | :align: center 39 | :alt: a bar-plot 40 | :class: only-light 41 | 42 | .. image:: _static/bar-plot-dark.png 43 | :width: 80% 44 | :align: center 45 | :alt: a bar-plot 46 | :class: only-dark 47 | 48 | >>> ax = ep.box_plot(mpg_data["acceleration"], label="Acceleration", hue=mpg_data["origin"]) 49 | >>> ax.figure.savefig("box-plot.png") 50 | 51 | .. 
image:: _static/box-plot.png 52 | :width: 80% 53 | :align: center 54 | :alt: a box-plot 55 | :class: only-light 56 | 57 | .. image:: _static/box-plot-dark.png 58 | :width: 80% 59 | :align: center 60 | :alt: a box-plot 61 | :class: only-dark 62 | 63 | >>> ax = ep.kde_plot(mpg_data["mpg"], label="MPG", hue=mpg_data["cylinders"]) 64 | >>> ax.figure.savefig("kde-plot.png") 65 | 66 | .. image:: _static/kde-plot.png 67 | :width: 80% 68 | :align: center 69 | :alt: a kde-plot 70 | :class: only-light 71 | 72 | .. image:: _static/kde-plot-dark.png 73 | :width: 80% 74 | :align: center 75 | :alt: a kde-plot 76 | :class: only-dark 77 | 78 | >>> ax = ep.regression_plot(mpg_data["acceleration"], mpg_data["horsepower"], 79 | ... labels=("Acceleration", "Horsepower")) 80 | >>> ax.figure.savefig("regression-plot.png") 81 | 82 | .. image:: _static/regression-plot.png 83 | :width: 80% 84 | :align: center 85 | :alt: a regression-plot 86 | :class: only-light 87 | 88 | .. image:: _static/regression-plot-dark.png 89 | :width: 80% 90 | :align: center 91 | :alt: a regression-plot 92 | :class: only-dark 93 | 94 | >>> ax = ep.prob_plot(mpg_data["acceleration"], label="Acceleration") 95 | >>> ax.figure.savefig("probability-plot.png") 96 | 97 | .. image:: _static/probability-plot.png 98 | :width: 80% 99 | :align: center 100 | :alt: a probability-plot 101 | :class: only-light 102 | 103 | .. image:: _static/probability-plot-dark.png 104 | :width: 80% 105 | :align: center 106 | :alt: a probability-plot 107 | :class: only-dark 108 | 109 | >>> ax = ep.plot_correlation(mpg_data) 110 | >>> ax.figure.savefig("correlation-plot.png") 111 | 112 | .. image:: _static/correlation-plot.png 113 | :width: 80% 114 | :align: center 115 | :alt: a correlation-plot 116 | :class: only-light 117 | 118 | .. image:: _static/correlation-plot-dark.png 119 | :width: 80% 120 | :align: center 121 | :alt: a correlation-plot 122 | :class: only-dark 123 | 124 | .. 
automodule:: eda_report.plotting 125 | :members: 126 | :inherited-members: 127 | :undoc-members: 128 | :show-inheritance: 129 | -------------------------------------------------------------------------------- /tests/test_analysis.py: -------------------------------------------------------------------------------- 1 | from io import BytesIO 2 | 3 | from pandas import DataFrame, Series 4 | 5 | from eda_report._analysis import _AnalysisResult, _get_contingency_tables 6 | from eda_report.bivariate import Dataset 7 | 8 | data = DataFrame( 9 | {"A": range(50), "B": [1, 2, 3, 4, 5] * 10, "C": list("ab") * 25} 10 | ) 11 | 12 | 13 | class TestGetContingencyTables: 14 | data = DataFrame( 15 | [list("abc"), list("abd"), list("bcd")] * 4, columns=list("ABC") 16 | ) 17 | 18 | def test_with_empty_data(self): 19 | empty_df = self.data[[]] 20 | tables = _get_contingency_tables( 21 | categorical_df=empty_df, groupby_data=self.data["C"] 22 | ) 23 | assert tables == {} 24 | 25 | def test_with_null_groupby_data(self): 26 | tables = _get_contingency_tables( 27 | categorical_df=self.data, groupby_data=None 28 | ) 29 | assert tables == {} 30 | 31 | def test_with_valid_args(self): 32 | tables = _get_contingency_tables( 33 | categorical_df=self.data, groupby_data=self.data["C"] 34 | ) 35 | # Check that groupby_data "C" is not included 36 | assert set(tables.keys()) == {"A", "B"} 37 | assert tables["A"].to_dict() == { 38 | "c": {"a": 4, "b": 0, "Total": 4}, 39 | "d": {"a": 4, "b": 4, "Total": 8}, 40 | "Total": {"a": 8, "b": 4, "Total": 12}, 41 | } 42 | assert tables["B"].to_dict() == { 43 | "c": {"b": 4, "c": 0, "Total": 4}, 44 | "d": {"b": 4, "c": 4, "Total": 8}, 45 | "Total": {"b": 8, "c": 4, "Total": 12}, 46 | } 47 | 48 | def test_cardinality_limit(self): 49 | high_cardinality_data = DataFrame( 50 | { 51 | "A": range(50), 52 | "B": list("abcdefghijklmnopqrstuvwxy") * 2, 53 | "C": list(range(10)) * 5, 54 | } 55 | ) 56 | tables = _get_contingency_tables( 57 | 
categorical_df=high_cardinality_data, 58 | groupby_data=Series([1, 2] * 25), 59 | ) 60 | # "A" and "B" have > 20 unique values, and so are omitted 61 | assert set(tables.keys()) == {"C"} 62 | 63 | 64 | class TestAnalysisResult: 65 | results = _AnalysisResult(data, graph_color="green", groupby_variable="C") 66 | 67 | def test_general_properties(self): 68 | assert isinstance(self.results.dataset, Dataset) 69 | assert self.results.GRAPH_COLOR == "green" 70 | assert self.results.GROUPBY_DATA.equals(data["C"]) 71 | assert self.results.bivariate_summaries == { 72 | ("A", "B"): "A and B have very weak positive correlation (0.10)." 73 | } 74 | 75 | def test_univariate_analysis(self): 76 | assert set(self.results.univariate_stats) == {"A", "B", "C"} 77 | 78 | # Summary statistics for each variable should be available in a dict 79 | assert isinstance(self.results.variables["A"].summary_stats, dict) 80 | assert isinstance(self.results.variables["B"].summary_stats, dict) 81 | assert isinstance(self.results.variables["C"].summary_stats, dict) 82 | 83 | def test_univariate_graphs(self): 84 | for key, graphs in self.results.univariate_graphs.items(): 85 | assert key in set("ABC") 86 | for graph in graphs.values(): 87 | assert isinstance(graph, BytesIO) 88 | 89 | def test_normality_tests(self): 90 | assert set(self.results.normality_tests) == {"A"} 91 | 92 | for df in self.results.normality_tests.values(): 93 | assert set(df.index) == { 94 | "D'Agostino's K-squared test", 95 | "Shapiro-Wilk test", 96 | "Kolmogorov-Smirnov test", 97 | } 98 | 99 | def test_contingency_tables(self): 100 | assert set(self.results.contingency_tables) == {"B"} 101 | assert self.results.contingency_tables["B"].to_dict() == { 102 | "a": {1: 5, 2: 5, 3: 5, 4: 5, 5: 5, "Total": 25}, 103 | "b": {1: 5, 2: 5, 3: 5, 4: 5, 5: 5, "Total": 25}, 104 | "Total": {1: 10, 2: 10, 3: 10, 4: 10, 5: 10, "Total": 50}, 105 | } 106 | 107 | def test_bivariate_graphs(self): 108 | assert set(self.results.bivariate_graphs.keys()) 
== { 109 | "correlation_plot", 110 | "regression_plots", 111 | } 112 | assert isinstance( 113 | self.results.bivariate_graphs["correlation_plot"], BytesIO 114 | ) 115 | for graph in self.results.bivariate_graphs[ 116 | "regression_plots" 117 | ].values(): 118 | assert isinstance(graph, BytesIO) 119 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # `eda-report` - Automated Exploratory Data Analysis 2 | 3 | [![Binder](https://mybinder.org/badge_logo.svg)](https://mybinder.org/v2/gh/Tim-Abwao/eda-report/HEAD?filepath=eda-report-basics.ipynb) 4 | [![PyPI version](https://badge.fury.io/py/eda-report.svg)](https://badge.fury.io/py/eda-report) 5 | [![Python 3.10 - 3.12](https://github.com/Tim-Abwao/eda-report/actions/workflows/unit-tests.yml/badge.svg)](https://github.com/Tim-Abwao/eda-report/actions/workflows/unit-tests.yml) 6 | [![Documentation Status](https://readthedocs.org/projects/eda-report/badge/?version=latest)](https://eda-report.readthedocs.io/en/latest/?badge=latest) 7 | [![codecov](https://codecov.io/gh/Tim-Abwao/eda-report/branch/main/graph/badge.svg?token=KNQD8XZCWG)](https://codecov.io/gh/Tim-Abwao/eda-report) 8 | [![Code style: black](https://img.shields.io/badge/code%20style-black-000000.svg)](https://github.com/psf/black) 9 | 10 | A Python program to help automate the exploratory data analysis and reporting process. 11 | 12 | Input data is analyzed using [pandas][pandas] and [SciPy][scipy]. Graphs are plotted using [matplotlib][matplotlib]. The results are then nicely packaged as a *Word (.docx)* document using [python-docx][python-docx]. 13 | 14 | ![screencast of report document from iris dataset][report-screencast] 15 | 16 | ## Installation 17 | 18 | You can install the package from [PyPI][eda-report-pypi] using: 19 | 20 | ```bash 21 | pip install eda-report 22 | ``` 23 | 24 | ## Basic Usage 25 | 26 | ### 1. 
Graphical User Interface 27 | 28 | The `eda-report` command launches a graphical window to help select a `csv`/`excel` file to analyze: 29 | 30 | ```bash 31 | eda-report 32 | ``` 33 | 34 | ![screencast of the gui][gui-screencast] 35 | 36 | You'll be prompted to set a *report title*, *group-by/target variable (optional)*, *graph color* and *output filename*; after which the contents of the input file are analyzed, and the results saved in a *Word (.docx)* document. 37 | 38 | >**NOTE:** For help with `Tk` - related issues, consider visiting [TkDocs][tkdocs]. 39 | 40 | ### 2. Command Line Interface 41 | 42 | ```bash 43 | $ eda-report -i iris.csv -o iris-report.docx 44 | Analyze variables: 100%|███████████████████████████████████| 5/5 45 | Plot variables: 100%|███████████████████████████████████| 5/5 46 | Bivariate analysis: 100%|███████████████████████████████████| 6/6 pairs. 47 | [INFO 02:12:22.146] Done. Results saved as 'iris-report.docx' 48 | ``` 49 | 50 | ```bash 51 | $ eda-report -h 52 | usage: eda-report [-h] [-i INFILE] [-o OUTFILE] [-t TITLE] [-c COLOR] 53 | [-g GROUPBY] 54 | 55 | Automatically analyze data and generate reports. A graphical user interface 56 | will be launched if none of the optional arguments is specified. 57 | 58 | optional arguments: 59 | -h, --help show this help message and exit 60 | -i INFILE, --infile INFILE 61 | A .csv or .xlsx file to analyze. 62 | -o OUTFILE, --outfile OUTFILE 63 | The output name for analysis results (default: eda- 64 | report.docx) 65 | -t TITLE, --title TITLE 66 | The top level heading for the report (default: 67 | Exploratory Data Analysis Report) 68 | -c COLOR, --color COLOR 69 | The color to apply to graphs (default: cyan) 70 | -g GROUPBY, -T GROUPBY, --groupby GROUPBY, --target GROUPBY 71 | The variable to use for grouping plotted values. An 72 | integer value is treated as a column index, whereas a 73 | string is treated as a column label. 74 | ``` 75 | 76 | 77 | 78 | ### 3. 
Interpreter Session 79 | 80 | ```python 81 | >>> eda_report.summarize(iris_data) 82 | 83 | Summary Statistics for Numeric features (4) 84 | ------------------------------------------- 85 | count avg stddev min 25% 50% 75% max skewness kurtosis 86 | sepal_length 150 5.8433 0.8281 4.3 5.1 5.80 6.4 7.9 0.3149 -0.5521 87 | sepal_width 150 3.0573 0.4359 2.0 2.8 3.00 3.3 4.4 0.3190 0.2282 88 | petal_length 150 3.7580 1.7653 1.0 1.6 4.35 5.1 6.9 -0.2749 -1.4021 89 | petal_width 150 1.1993 0.7622 0.1 0.3 1.30 1.8 2.5 -0.1030 -1.3406 90 | 91 | Summary Statistics for Categorical features (1) 92 | ----------------------------------------------- 93 | count unique top freq relative freq 94 | species 150 3 setosa 50 33.33% 95 | 96 | 97 | Pearson's Correlation (Top 20) 98 | ------------------------------ 99 | petal_length & petal_width -> very strong positive correlation (0.96) 100 | sepal_length & petal_length -> very strong positive correlation (0.87) 101 | sepal_length & petal_width -> very strong positive correlation (0.82) 102 | sepal_width & petal_length -> moderate negative correlation (-0.43) 103 | sepal_width & petal_width -> weak negative correlation (-0.37) 104 | sepal_length & sepal_width -> very weak negative correlation (-0.12) 105 | ``` 106 | 107 | Check out the [documentation][docs] for more features and details. 
108 | 109 | [docs]: https://eda-report.readthedocs.io/ 110 | [eda-report-pypi]: https://pypi.org/project/eda-report/ 111 | [matplotlib]: https://matplotlib.org/ 112 | [pandas]: https://pandas.pydata.org/ 113 | [python-docx]: https://python-docx.readthedocs.io/ 114 | [scipy]: https://scipy.org/ 115 | [gui-screencast]: https://raw.githubusercontent.com/Tim-Abwao/eda-report/dev/docs/source/_static/screencast.gif 116 | [report-screencast]: https://raw.githubusercontent.com/Tim-Abwao/eda-report/dev/docs/source/_static/report.gif 117 | [tkdocs]: https://tkdocs.com/index.html 118 | -------------------------------------------------------------------------------- /eda_report/_validate.py: -------------------------------------------------------------------------------- 1 | import logging 2 | from collections.abc import Iterable 3 | from typing import Optional, Union 4 | 5 | from pandas import DataFrame, RangeIndex, Series 6 | from pandas.api.types import is_numeric_dtype 7 | 8 | from eda_report.exceptions import ( 9 | EmptyDataError, 10 | GroupbyVariableError, 11 | InputError, 12 | ) 13 | 14 | 15 | def _clean_column_labels(data: DataFrame) -> DataFrame: 16 | """Makes sure that columns have *meaningful* names. 17 | 18 | When creating a ``DataFrame`` from an ``Iterable``, if no column names 19 | are provided, the columns are set as a :class:`~pandas.RangeIndex` — 20 | [0, 1, 2, ...] (default). 21 | 22 | This function renames such columns to ['var_1', 'var_2, 'var_3', ...], 23 | making references and comparisons much more intuitive. 24 | 25 | It also ensures that column labels are all of similar type (``str``) to 26 | allow sorting and the use of string methods. 27 | 28 | Args: 29 | data (pandas.DataFrame): Data to inspect and perhaps edit. 30 | 31 | Returns: 32 | pandas.DataFrame: The ``data``, with human-friendly column 33 | names. 
34 | """ 35 | if isinstance(data.columns, RangeIndex): 36 | data.columns = [f"var_{i+1}" for i in data.columns] 37 | elif is_numeric_dtype(data.columns): 38 | data.columns = [f"var_{i}" for i in data.columns] 39 | else: 40 | data.columns = data.columns.map(str) 41 | return data 42 | 43 | 44 | def _check_cardinality(groupby_data: Series, *, threshold: int = 10) -> None: 45 | """Assesses whether the ``groupby_data`` has too many unique values 46 | (> ``threshold``, default 10). 47 | 48 | Args: 49 | groupby_data (pandas.Series): The data intended to group values. 50 | threshold (int, optional): Maximum allowable cardinality. Defaults to 51 | 10. 52 | 53 | Raises: 54 | GroupbyVariableError: If the `groupby_data` has cardinality outside the 55 | acceptable range. 56 | """ 57 | if groupby_data.nunique() > threshold: 58 | message = ( 59 | f"Group-by variable '{groupby_data.name}' not used to group " 60 | f"values. It has high cardinality ({groupby_data.nunique()}) " 61 | f"and would clutter graphs." 62 | ) 63 | logging.warning(message) 64 | raise GroupbyVariableError(message) 65 | 66 | 67 | def _validate_dataset(data: Iterable) -> DataFrame: 68 | """Ensures that input data is of type :class:`pandas.DataFrame`. 69 | 70 | If it isn't, this attempts to explicitly cast it as a ``DataFrame``. 71 | 72 | Columns in the data that are completely empty will be dropped. 73 | 74 | Args: 75 | data (Iterable): The data to analyze. 76 | 77 | Raises: 78 | InputError: If the ``data`` cannot be cast as a 79 | :class:`~pandas.DataFrame`. 80 | EmptyDataError: If the ``data`` has no items. 81 | 82 | Returns: 83 | pandas.DataFrame: The input data as a DataFrame. 84 | """ 85 | try: 86 | data_frame = DataFrame(data) 87 | except Exception: 88 | raise InputError( 89 | f"Expected a pandas.Dataframe object, but got {type(data)}." 
90 | ) 91 | # The data should not be empty 92 | if len(data_frame) == 0: 93 | raise EmptyDataError("No data to process.") 94 | 95 | data_frame = ( 96 | # Attempt to infer better dtypes for columns. 97 | data_frame.infer_objects() 98 | # Drop completely empty columns. 99 | .dropna(axis=1, how="all") 100 | ) 101 | return _clean_column_labels(data_frame) 102 | 103 | 104 | def _validate_univariate_input( 105 | data: Iterable, *, name: str = None 106 | ) -> Optional[Series]: 107 | """Ensures that *univariate input data* is of type :class:`pandas.Series`. 108 | 109 | If it isn't, this attempts to explicitly cast it as a ``Series``. 110 | 111 | Args: 112 | data (Iterable): The data to analyze. 113 | name (str, optional): The name to assign the data. Defaults 114 | to None. 115 | 116 | Raises: 117 | InputError: If the ``data`` cannot be cast as a 118 | :class:`~pandas.Series`. 119 | EmptyDataError: If the ``data`` has no items. 120 | 121 | Returns: 122 | Optional[pandas.Series]: The input data as a ``Series``. 123 | """ 124 | if data is None: 125 | return None 126 | else: 127 | try: 128 | series = Series(data, name=name) 129 | except Exception: 130 | raise InputError( 131 | f"Expected a one-dimensional sequence, but got {type(data)}." 132 | ) 133 | # Convert potentially mixed-type items to strings 134 | if series.dtype == "O": 135 | series = series.astype("string") 136 | 137 | if series.shape[0] == 0: 138 | raise EmptyDataError("No data to process.") 139 | else: 140 | return series 141 | 142 | 143 | def _validate_groupby_variable( 144 | *, data: DataFrame, groupby_variable: Union[int, str] 145 | ) -> Optional[Series]: 146 | """Ensures that the specified column label/index for grouping values is 147 | present in the data. 148 | 149 | Args: 150 | data (DataFrame): The data being analyzed. 151 | groupby_variable (Union[int, str]): A column label or index. 
152 | 
153 |     Raises:
154 |         GroupbyVariableError: If the supplied column label does not exist, or
155 |             the supplied column index is out of bounds.
156 | 
157 |     Returns:
158 |         Optional[pandas.Series]: The groupby variable's data.
159 |     """
160 |     if groupby_variable is None:
161 |         return None
162 |     elif f"{groupby_variable}".isdecimal():
163 |         idx = int(groupby_variable)
164 |         try:
165 |             groupby_data = data.iloc[:, idx]
166 |         except IndexError:
167 |             raise GroupbyVariableError(
168 |                 f"Column index {groupby_variable} is not in the range"
169 |                 f" [0, {data.columns.size}]."
170 |             )
171 |         _check_cardinality(groupby_data)
172 |         return groupby_data
173 |     elif isinstance(groupby_variable, str):
174 |         try:
175 |             groupby_data = data[groupby_variable]
176 |         except KeyError:
177 |             raise GroupbyVariableError(
178 |                 f"{groupby_variable!r} is not in {data.columns.to_list()}"
179 |             )
180 |         _check_cardinality(groupby_data)
181 |         return groupby_data
182 |     else:
183 |         # If groupby_variable is neither an index (int) nor a label (str)
184 |         logging.warning(
185 |             f"Group-by variable '{groupby_variable}' ignored."
186 |             " Not a valid column index or label."
187 | ) 188 | return None 189 | -------------------------------------------------------------------------------- /eda_report/_analysis.py: -------------------------------------------------------------------------------- 1 | from multiprocessing import get_context 2 | from typing import Dict, Iterable, Optional, Union 3 | 4 | import pandas as pd 5 | from tqdm import tqdm 6 | 7 | from eda_report._validate import _validate_groupby_variable 8 | from eda_report.bivariate import Dataset 9 | from eda_report.plotting import _plot_dataset, _plot_variable 10 | from eda_report.univariate import Variable, _analyze_univariate 11 | 12 | mp_context = get_context("spawn") # Use "spawn" start method 13 | 14 | 15 | def _get_contingency_tables( 16 | categorical_df: pd.DataFrame, groupby_data: pd.Series 17 | ) -> Dict[str, pd.DataFrame]: 18 | """Get contingency tables for categorical variables. 19 | 20 | Args: 21 | categorical_df (pandas.DataFrame): Categorical data. 22 | groupby_data (pandas.Series): Values to group by. 23 | 24 | Returns: 25 | Dict[str, pandas.DataFrame]: Contingency tables for each column. 26 | """ 27 | if (categorical_df.shape[1] == 0) or (groupby_data is None): 28 | return {} 29 | 30 | contingency_tables = { 31 | col: pd.crosstab( 32 | index=categorical_df[col], 33 | columns=groupby_data, 34 | margins=True, 35 | margins_name="Total", 36 | ) 37 | for col in categorical_df 38 | # Only include columns with upto 20 unique values to cut clutter 39 | if categorical_df[col].nunique() <= 20 40 | } 41 | # Exclude groupby_variable in case it is among the categorical cols 42 | contingency_tables.pop(groupby_data.name, None) 43 | return contingency_tables 44 | 45 | 46 | class _AnalysisResult: 47 | """Analyzes data, and stores the resultant summary statistics and graphs. 48 | 49 | Args: 50 | data (Iterable): The data to analyse. 51 | graph_color (str, optional): The color to apply to the graphs. 52 | Defaults to "cyan". 
53 | groupby_variable (Union[str, int], optional): The column to 54 | use to group values. Defaults to None. 55 | """ 56 | 57 | def __init__( 58 | self, 59 | data: Iterable, 60 | graph_color: str = "cyan", 61 | groupby_variable: Union[str, int] = None, 62 | ) -> None: 63 | self.GRAPH_COLOR = graph_color 64 | self.dataset = Dataset(data) 65 | self.GROUPBY_DATA = _validate_groupby_variable( 66 | data=self.dataset.data, groupby_variable=groupby_variable 67 | ) 68 | self.variables = self._analyze_variables() 69 | self.univariate_stats = self._get_univariate_statistics() 70 | self.normality_tests = self._get_normality_test_results() 71 | self.univariate_graphs = self._get_univariate_graphs() 72 | self.bivariate_graphs = _plot_dataset(self.dataset, color=graph_color) 73 | self.bivariate_summaries = self._get_bivariate_summaries() 74 | 75 | def _analyze_variables(self) -> Dict[str, Variable]: 76 | """Compute summary statistics and assess variable properties. 77 | 78 | Returns: 79 | Dict[str, Variable]: Univariate analysis results. 80 | """ 81 | data = self.dataset.data 82 | with mp_context.Pool() as p: 83 | univariate_stats = dict( 84 | tqdm( 85 | # Analyze variables concurrently 86 | p.imap(_analyze_univariate, data.items()), 87 | # Progress-bar options 88 | total=data.shape[1], 89 | bar_format=( 90 | "{desc} {percentage:3.0f}%|{bar:35}| " 91 | "{n_fmt}/{total_fmt}" 92 | ), 93 | desc="Analyze variables: ", 94 | dynamic_ncols=True, 95 | ) 96 | ) 97 | # Create contingency tables 98 | categorical_cols = [ 99 | col_name 100 | for col_name, var in univariate_stats.items() 101 | if var.var_type != "numeric" 102 | ] 103 | self.contingency_tables = _get_contingency_tables( 104 | data[categorical_cols], self.GROUPBY_DATA 105 | ) 106 | return univariate_stats 107 | 108 | def _get_univariate_statistics(self) -> Dict[str, pd.DataFrame]: 109 | """Get a dataframe of summary statistics for all variables. 110 | 111 | Returns: 112 | Dict[str, pandas.DataFrame]: Summary statistics. 
113 | """ 114 | return { 115 | name: variable.summary_stats 116 | for name, variable in self.variables.items() 117 | } 118 | 119 | def _get_normality_test_results(self) -> Dict[str, pd.DataFrame]: 120 | """Perform tests for normality. 121 | 122 | Returns: 123 | Dict[str, pandas.DataFrame]: Normality test results. 124 | """ 125 | return { 126 | name: variable._normality_test_results 127 | for name, variable in self.variables.items() 128 | if variable.var_type == "numeric" 129 | } 130 | 131 | def _get_univariate_graphs(self) -> Dict[str, Dict]: 132 | """Plot graphs for all variables present. 133 | 134 | Returns: 135 | Dict[str, Dict]: Univariate graphs. 136 | """ 137 | 138 | with mp_context.Pool() as p: 139 | data = self.dataset.data 140 | variable_data_hue_and_color = [ 141 | ( 142 | variable, 143 | data[variable.name], 144 | self.GROUPBY_DATA, 145 | self.GRAPH_COLOR, 146 | ) 147 | for variable in self.variables.values() 148 | ] 149 | univariate_graphs = dict( 150 | tqdm( 151 | # Plot variables in parallel processes 152 | p.imap(_plot_variable, variable_data_hue_and_color), 153 | # Progress-bar options 154 | total=len(self.variables), 155 | bar_format=( 156 | "{desc} {percentage:3.0f}%|{bar:35}| " 157 | "{n_fmt}/{total_fmt}" 158 | ), 159 | desc="Plot variables: ", 160 | dynamic_ncols=True, 161 | ) 162 | ) 163 | return univariate_graphs 164 | 165 | def _get_bivariate_summaries(self) -> Optional[Dict[str, str]]: 166 | """Get descriptions of the nature of correlation between numeric 167 | column pairs. 168 | 169 | Returns: 170 | Optional[Dict[str, str]]: Correlation info. 171 | """ 172 | if self.dataset._correlation_values is None: 173 | return None 174 | else: 175 | # Take the top 20 pairs by magnitude of correlation. 176 | # 20 var_pairs ≈ 10+ pages 177 | # 20 numeric columns == 190 var_pairs ≈ 95+ pages. 
178 |             pairs_to_include = [
179 |                 pair for pair, _ in self.dataset._correlation_values[:20]
180 |             ]
181 |             correlation_descriptions = self.dataset._correlation_descriptions
182 |             return {
183 |                 var_pair: (
184 |                     f"{var_pair[0].title()} and {var_pair[1].title()} have "
185 |                     f"{correlation_descriptions[var_pair]}."
186 |                 )
187 |                 for var_pair in pairs_to_include
188 |             }
--------------------------------------------------------------------------------
/tests/test_data_validation.py:
--------------------------------------------------------------------------------
 1 | import pytest
 2 | from pandas import DataFrame, Series
 3 | 
 4 | from eda_report._validate import (
 5 |     _clean_column_labels,
 6 |     _validate_dataset,
 7 |     _validate_groupby_variable,
 8 |     _validate_univariate_input,
 9 | )
10 | from eda_report.exceptions import (
11 |     EmptyDataError,
12 |     GroupbyVariableError,
13 |     InputError,
14 | )
15 | 
16 | 
17 | class TestDatasetValidation:
18 |     def test_dataframe_input(self):
19 |         # Check if a dataframe is returned as a dataframe
20 |         assert isinstance(_validate_dataset(DataFrame(range(10))), DataFrame)
21 | 
22 |     def test_series_input(self):
23 |         # Check if a series returns a dataframe
24 |         assert isinstance(_validate_dataset(Series(range(10))), DataFrame)
25 | 
26 |     def test_iterable_input(self):
27 |         # Check if a sequence returns a dataframe
28 |         assert isinstance(_validate_dataset(range(10)), DataFrame)
29 |         # Check if a generator returns a dataframe
30 |         assert isinstance(
31 |             _validate_dataset((x**2 for x in range(10))), DataFrame
32 |         )
33 | 
34 |     def test_invalid_input(self):
35 |         # Check that invalid input raises an InputError
36 |         with pytest.raises(InputError) as error:
37 |             _validate_dataset(0)
38 |         assert (
39 |             "Expected a pandas.Dataframe object, but got <class 'int'>."
40 |             in str(error.value)
41 |         )
42 | 
43 |     def test_empty_input(self):
44 |         # Check that empty input raises an EmptyDataError
45 |         with pytest.raises(EmptyDataError) as error:
46 |             _validate_dataset(DataFrame())
47 |         assert "No data to process." in str(error.value)
48 | 
49 |     def test_empty_column_is_dropped(self):
50 |         # Check that columns consisting entirely of NaN are dropped
51 |         data_with_empty_col = [[x, None] for x in range(10)]
52 |         result = _validate_dataset(data_with_empty_col)
53 |         assert result.shape == (10, 1)
54 | 
55 | 
56 | class TestUnivariateInputValidation:
57 |     def test_series_input(self):
58 |         # Check if a series is returned as a series
59 |         assert isinstance(
60 |             _validate_univariate_input(Series(range(10))), Series
61 |         )
62 | 
63 |     def test_iterable_input(self):
64 |         # Check if a sequence-like returns a series
65 |         assert isinstance(_validate_univariate_input(range(10)), Series)
66 |         # Check if a generator returns a series
67 |         assert isinstance(
68 |             _validate_univariate_input((x**2 for x in range(10))), Series
69 |         )
70 | 
71 |     def test_mixed_type_input(self):
72 |         # Check that mixed data is stored as strings, not objects
73 |         mixed_data = _validate_univariate_input([1, 3, True, "hello"])
74 |         assert mixed_data.dtype == "string"
75 | 
76 |     def test_empty_input(self):
77 |         with pytest.raises(EmptyDataError) as error:
78 |             _validate_univariate_input(x for x in [])
79 |         assert "No data to process." in str(error.value)
80 | 
81 |     def test_null_input(self):
82 |         assert _validate_univariate_input(None) is None
83 | 
84 |     def test_invalid_input(self):
85 |         # Check that invalid input raises an InputError
86 |         with pytest.raises(InputError) as error:
87 |             _validate_univariate_input(DataFrame([1, 2, 3]))
88 |         assert (
89 |             "Expected a one-dimensional sequence, but got "
90 |             "<class 'pandas.core.frame.DataFrame'>."
91 | ) in str(error.value) 92 | 93 | 94 | class TestTargetValidation: 95 | data = DataFrame([range(5)] * 3, columns=list("ABCDE")) 96 | 97 | def test_valid_column_index(self): 98 | # Check that a valid column index returns the appropriate column data. 99 | assert _validate_groupby_variable( 100 | data=self.data, groupby_variable=3 101 | ).equals(self.data.get("D")) 102 | 103 | def test_invalid_column_index(self): 104 | # Check that an error is raised for a column index that is out of 105 | # bounds. 106 | with pytest.raises(GroupbyVariableError) as error: 107 | _validate_groupby_variable(data=self.data, groupby_variable=10) 108 | assert "Column index 10 is not in the range [0, 5]." in str( 109 | error.value 110 | ) 111 | 112 | def test_valid_column_label(self): 113 | # Check that a valid column label returns the appropriate column data. 114 | assert _validate_groupby_variable( 115 | data=self.data, groupby_variable="D" 116 | ).equals(self.data.get("D")) 117 | 118 | def test_invalid_column_label(self): 119 | # Check that an invalid column label raises an error. 120 | with pytest.raises(GroupbyVariableError) as error: 121 | _validate_groupby_variable(data=self.data, groupby_variable="X") 122 | assert "'X' is not in ['A', 'B', 'C', 'D', 'E']" in str(error.value) 123 | 124 | def test_null_input(self): 125 | # Check that `groupby_variable=None` returns `None` 126 | assert ( 127 | _validate_groupby_variable(data=self.data, groupby_variable=None) 128 | is None 129 | ) 130 | 131 | def test_invalid_input_type(self, caplog: pytest.LogCaptureFixture): 132 | # Check that invalid input (i.e not in {str, int, None} logs a warning 133 | # and returns None 134 | assert ( 135 | _validate_groupby_variable(data=self.data, groupby_variable=1.0) 136 | is None 137 | ) 138 | assert ( 139 | "Group-by variable '1.0' ignored. " 140 | "Not a valid column index or label." 
141 | ) in caplog.text 142 | 143 | def test_groupby_variable_with_excess_categories( 144 | self, caplog: pytest.LogCaptureFixture 145 | ): 146 | # Check that target variables with more than 10 unique values raise an 147 | # error and log a warning that color-coding won't be applied. 148 | _data = DataFrame([range(11)] * 2, index=["X", "Y"]).T 149 | expected_message = ( 150 | "Group-by variable 'Y' not used to group values. " 151 | "It has high cardinality (11) and would clutter graphs." 152 | ) 153 | with pytest.raises(GroupbyVariableError) as error: 154 | assert _validate_groupby_variable( 155 | data=_data, groupby_variable=1 156 | ).equals(_data.iloc[:, 1]) 157 | assert expected_message in str(error.value) 158 | assert expected_message in caplog.text 159 | 160 | 161 | class TestColumnLabelCleaning: 162 | def test_cleaning_rangeindex(self): 163 | with_rangeindex = DataFrame([[0, 1], [1, 2]]) 164 | # Check if columns [0, 1] are changed to ["var_1", "var_2"] 165 | assert list(_clean_column_labels(with_rangeindex)) == [ 166 | "var_1", 167 | "var_2", 168 | ] 169 | 170 | def test_cleaning_numeric_colnames(self): 171 | with_numeric_colnames = DataFrame([[1, 2], [3, 4]], columns=[1, 5]) 172 | # Column names should be prefixed with "var_" 173 | assert list(_clean_column_labels(with_numeric_colnames)) == [ 174 | "var_1", 175 | "var_5", 176 | ] 177 | 178 | def test_cleaning_mixed_colnames(self): 179 | with_mixed_colnames = DataFrame([[1, 2], [3, 4]], columns=[1, "B"]) 180 | # Numeric column names should be converted to strings 181 | assert list(_clean_column_labels(with_mixed_colnames)) == ["1", "B"] 182 | -------------------------------------------------------------------------------- /eda_report/bivariate.py: -------------------------------------------------------------------------------- 1 | import logging 2 | from collections.abc import Iterable 3 | from itertools import combinations 4 | from textwrap import indent 5 | from typing import List 6 | 7 | from pandas import 
DataFrame 8 | 9 | from eda_report._validate import _validate_dataset 10 | 11 | 12 | def _compute_correlation(dataframe: DataFrame) -> List: 13 | """Get the Pearson correlation coefficients for numeric variables. 14 | 15 | Args: 16 | dataframe (pandas.DataFrame): A 2D array of numeric data. 17 | 18 | Returns: 19 | Optional[List]: A list of column pairs and their Pearson's correlation 20 | coefficients; sorted by magnitude in descending order. 21 | """ 22 | if dataframe is None: 23 | return None 24 | 25 | numeric_data = dataframe.select_dtypes("number") 26 | if numeric_data.shape[1] < 2: 27 | return None 28 | else: 29 | correlation_df = numeric_data.corr(method="pearson") 30 | unique_pairs = list(combinations(correlation_df.columns, r=2)) 31 | correlation_info = [ 32 | (pair, correlation_df.at[pair]) for pair in unique_pairs 33 | ] 34 | return sorted(correlation_info, key=lambda x: -abs(x[1])) 35 | 36 | 37 | def _describe_correlation(corr_value: float) -> str: 38 | """Explain the nature and magnitude of correlation. 39 | 40 | Args: 41 | corr_value (str): Pearson's correlation coefficient. 42 | 43 | Returns: 44 | str: Brief description of correlation type. 45 | """ 46 | nature = " positive" if corr_value > 0 else " negative" 47 | value = abs(corr_value) 48 | if value >= 0.8: 49 | strength = "very strong" 50 | elif value >= 0.6: 51 | strength = "strong" 52 | elif value >= 0.4: 53 | strength = "moderate" 54 | elif value >= 0.2: 55 | strength = "weak" 56 | elif value >= 0.05: 57 | strength = "very weak" 58 | else: 59 | strength = "virtually no" 60 | nature = "" 61 | return f"{strength}{ nature} correlation ({corr_value:.2f})" 62 | 63 | 64 | class Dataset: 65 | """Analyze two-dimensional datasets to obtain descriptive statistics 66 | and correlation information. 67 | 68 | Input data is stored as a :class:`pandas.DataFrame` in order to leverage 69 | pandas_' built-in statistical methods. 70 | 71 | .. 
_pandas: https://pandas.pydata.org/ 72 | 73 | Args: 74 | data (Iterable): The data to analyze. 75 | 76 | Example: 77 | .. literalinclude:: examples.txt 78 | :lines: 79-101 79 | """ 80 | 81 | def __init__(self, data: Iterable) -> None: 82 | self.data = _validate_dataset(data) 83 | self._get_summary_statistics() 84 | self._get_bivariate_analysis() 85 | 86 | def __repr__(self) -> str: 87 | """Get the string representation for a `Dataset`. 88 | 89 | Returns: 90 | str: The string representation of the `Dataset` instance. 91 | """ 92 | if self._numeric_stats is None: 93 | numeric_stats = "" 94 | else: 95 | numeric_stats_title = ( 96 | "Summary Statistics for Numeric features " 97 | f"({self._numeric_stats.shape[0]})" 98 | ) 99 | numeric_stats = "\n".join( 100 | [ 101 | f"\n\t\t {numeric_stats_title}", 102 | f"\t\t {'-' * len(numeric_stats_title)}", 103 | indent(f"{self._numeric_stats}\n", " "), 104 | ] 105 | ) 106 | 107 | if self._categorical_stats is None: 108 | categorical_stats = "" 109 | else: 110 | categorical_stats_title = ( 111 | "Summary Statistics for Categorical features " 112 | f"({self._categorical_stats.shape[0]})" 113 | ) 114 | categorical_stats = "\n".join( 115 | [ 116 | f"\t{categorical_stats_title}", 117 | f"\t{'-' * len(categorical_stats_title)}", 118 | indent(f"{self._categorical_stats}\n", " " * 4), 119 | ] 120 | ) 121 | if hasattr(self, "_correlation_descriptions"): 122 | max_pairs = min(20, len(self._correlation_descriptions)) 123 | top_20 = list(self._correlation_descriptions.items())[:max_pairs] 124 | corr_repr = "\n".join( 125 | [ 126 | f"{var_pair[0] + ' & ' + var_pair[1]:>32} -> " 127 | f"{corr_description}" 128 | for var_pair, corr_description in top_20 129 | ] 130 | ) 131 | correlation_description = "\n".join( 132 | [ 133 | "\n\t\t\tPearson's Correlation (Top 20)", 134 | f"\t\t\t{'-' * 30}", 135 | f"{corr_repr}", 136 | ] 137 | ) 138 | else: 139 | correlation_description = "" 140 | 141 | return "\n".join( 142 | [ 143 | f"{numeric_stats}", 144 
| indent(f"{categorical_stats}", "\t"), 145 | f"{correlation_description}", 146 | "\t", 147 | ] 148 | ) 149 | 150 | def _get_summary_statistics(self) -> None: 151 | """Compute descriptive statistics.""" 152 | data = self.data.copy() 153 | numeric_data = data.select_dtypes("number") 154 | # Consider numeric columns with < 11 unique values as categorical 155 | categorical_with_numbers = [ 156 | col for col in numeric_data if numeric_data[col].nunique() < 11 157 | ] 158 | numeric_data = numeric_data.drop(columns=categorical_with_numbers) 159 | if numeric_data.shape[1] < 1: 160 | self._numeric_stats = None 161 | else: 162 | numeric_stats = numeric_data.describe().T 163 | numeric_stats["count"] = numeric_stats["count"].astype("int") 164 | numeric_stats = numeric_stats.rename( 165 | columns={"mean": "avg", "std": "stddev"} 166 | ) 167 | numeric_stats["skewness"] = numeric_data.skew(numeric_only=True) 168 | numeric_stats["kurtosis"] = numeric_data.kurt(numeric_only=True) 169 | self._numeric_stats = numeric_stats.round(4) 170 | 171 | categorical_data = data.drop(columns=numeric_data.columns).copy() 172 | if categorical_data.shape[1] < 1: 173 | self._categorical_stats = None 174 | else: 175 | for col in categorical_data: 176 | # Convert categorical columns with "unique ratio" < 0.3 to 177 | # categorical dtype, which would consume much less memory. 
178 | if ( 179 | categorical_data[col].nunique() / len(categorical_data) 180 | ) < 0.3: 181 | categorical_data[col] = categorical_data[col].astype( 182 | "category" 183 | ) 184 | else: 185 | categorical_data[col] = categorical_data[col].astype( 186 | "string" 187 | ) 188 | categorical_stats = categorical_data.describe().T 189 | categorical_stats["relative freq"] = ( 190 | categorical_stats["freq"] / len(self.data) 191 | ).apply(lambda x: f"{x :.2%}") 192 | self._categorical_stats = categorical_stats 193 | 194 | def _get_bivariate_analysis(self) -> None: 195 | """Compare numeric column pairs.""" 196 | self._correlation_values = _compute_correlation(self.data) 197 | if self._correlation_values is None: 198 | logging.warning( 199 | "Skipped Bivariate Analysis: There are less than 2 numeric " 200 | "variables." 201 | ) 202 | else: 203 | self._get_correlation_descriptions() 204 | 205 | def _get_correlation_descriptions(self) -> None: 206 | """Get brief descriptions of the nature of correlation between numeric 207 | column pairs.""" 208 | self._correlation_descriptions = { 209 | pair: _describe_correlation(corr_value) 210 | for pair, corr_value in self._correlation_values 211 | } 212 | -------------------------------------------------------------------------------- /docs/source/examples.txt: -------------------------------------------------------------------------------- 1 | UNIVARIATE 2 | ========== 3 | 4 | Numeric 5 | ------- 6 | >>> from eda_report.univariate import Variable 7 | >>> Variable(range(1, 51), name="1 to 50") 8 | 9 | Name: 1 to 50 10 | Type: numeric 11 | Non-null Observations: 50 12 | Unique Values: 50 -> [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, [...] 
13 | Missing Values: None 14 | 15 | Summary Statistics 16 | ------------------ 17 | Average: 25.5000 18 | Standard Deviation: 14.5774 19 | Minimum: 1.0000 20 | Lower Quartile: 13.2500 21 | Median: 25.5000 22 | Upper Quartile: 37.7500 23 | Maximum: 50.0000 24 | Skewness: 0.0000 25 | Kurtosis: -1.2000 26 | 27 | Tests for Normality 28 | ------------------- 29 | p-value Conclusion at α = 0.05 30 | D'Agostino's K-squared test 0.0015981 Unlikely to be normal 31 | Kolmogorov-Smirnov test 0.0000000 Unlikely to be normal 32 | Shapiro-Wilk test 0.0580895 Possibly normal 33 | 34 | Categorical 35 | ----------- 36 | >>> Variable(["mango", "apple", "pear", "mango", "pear", "mango"], name="fruits") 37 | 38 | Name: fruits 39 | Type: categorical 40 | Non-null Observations: 6 41 | Unique Values: 3 -> ['apple', 'mango', 'pear'] 42 | Missing Values: None 43 | Mode (Most frequent): mango 44 | Maximum frequency: 3 45 | 46 | Most Common Items 47 | ----------------- 48 | mango: 3 (50.00%) 49 | pear: 2 (33.33%) 50 | apple: 1 (16.67%) 51 | 52 | Datetime 53 | -------- 54 | >>> import pandas as pd 55 | >>> dt = pd.date_range("2022-03-08", periods=20, freq="D") 56 | >>> Variable(dt, name="dttm") 57 | 58 | Name: dttm 59 | Type: datetime 60 | Non-null Observations: 20 61 | Unique Values: 20 -> [Timestamp('2022-03-08 00:00:00'), [...] 
62 | Missing Values: None 63 | 64 | Summary Statistics 65 | ------------------ 66 | Average: 2022-03-17 12:00:00 67 | Minimum: 2022-03-08 00:00:00 68 | Lower Quartile: 2022-03-12 18:00:00 69 | Median: 2022-03-17 12:00:00 70 | Upper Quartile: 2022-03-22 06:00:00 71 | Maximum: 2022-03-27 00:00:00 72 | 73 | 74 | BIVARIATE 75 | ========= 76 | 77 | Dataset 78 | ------- 79 | >>> Dataset(iris_data) 80 | Summary Statistics for Numeric features (4) 81 | ------------------------------------------- 82 | count avg stddev min 25% 50% 75% max skewness kurtosis 83 | sepal_length 150 5.8433 0.8281 4.3 5.1 5.80 6.4 7.9 0.3149 -0.5521 84 | sepal_width 150 3.0573 0.4359 2.0 2.8 3.00 3.3 4.4 0.3190 0.2282 85 | petal_length 150 3.7580 1.7653 1.0 1.6 4.35 5.1 6.9 -0.2749 -1.4021 86 | petal_width 150 1.1993 0.7622 0.1 0.3 1.30 1.8 2.5 -0.1030 -1.3406 87 | 88 | Summary Statistics for Categorical features (1) 89 | ----------------------------------------------- 90 | count unique top freq relative freq 91 | species 150 3 setosa 50 33.33% 92 | 93 | 94 | Pearson's Correlation (Top 20) 95 | ------------------------------ 96 | petal_length & petal_width -> very strong positive correlation (0.96) 97 | sepal_length & petal_length -> very strong positive correlation (0.87) 98 | sepal_length & petal_width -> very strong positive correlation (0.82) 99 | sepal_width & petal_length -> moderate negative correlation (-0.43) 100 | sepal_width & petal_width -> weak negative correlation (-0.37) 101 | sepal_length & sepal_width -> very weak negative correlation (-0.12) 102 | 103 | 104 | CLI 105 | === 106 | $ eda-report -h 107 | usage: eda-report [-h] [-i INFILE] [-o OUTFILE] [-t TITLE] [-c COLOR] 108 | [-g GROUPBY] 109 | 110 | Automatically analyze data and generate reports. A graphical user interface 111 | will be launched if none of the optional arguments is specified. 
112 | 113 | optional arguments: 114 | -h, --help show this help message and exit 115 | -i INFILE, --infile INFILE 116 | A .csv or .xlsx file to analyze. 117 | -o OUTFILE, --outfile OUTFILE 118 | The output name for analysis results (default: eda- 119 | report.docx) 120 | -t TITLE, --title TITLE 121 | The top level heading for the report (default: 122 | Exploratory Data Analysis Report) 123 | -c COLOR, --color COLOR 124 | The color to apply to graphs (default: cyan) 125 | -g GROUPBY, -T GROUPBY, --groupby GROUPBY, --target GROUPBY 126 | The variable to use for grouping plotted values. An 127 | integer value is treated as a column index, whereas a 128 | string is treated as a column label. 129 | 130 | 131 | TOP LEVEL 132 | ========= 133 | 134 | eda_report.get_word_report 135 | -------------------------- 136 | >>> import eda_report 137 | >>> eda_report.get_word_report(iris_data) 138 | Analyze variables: 100%|███████████████████████████████████| 5/5 139 | Plot variables: 100%|███████████████████████████████████| 5/5 140 | Bivariate analysis: 100%|███████████████████████████████████| 6/6 pairs. 141 | [INFO 16:14:53.648] Done. Results saved as 'eda-report.docx' 142 | 143 | 144 | eda_report.summarize 145 | -------------------- 146 | >>> eda_report.summarize(range(50)) 147 | 148 | Name: var_1 149 | Type: numeric 150 | Non-null Observations: 50 151 | Unique Values: 50 -> [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, [...] 
152 | Missing Values: None 153 | 154 | Summary Statistics 155 | ------------------ 156 | Average: 24.5000 157 | Standard Deviation: 14.5774 158 | Minimum: 0.0000 159 | Lower Quartile: 12.2500 160 | Median: 24.5000 161 | Upper Quartile: 36.7500 162 | Maximum: 49.0000 163 | Skewness: 0.0000 164 | Kurtosis: -1.2000 165 | 166 | Tests for Normality 167 | ------------------- 168 | p-value Conclusion at α = 0.05 169 | D'Agostino's K-squared test 0.0015981 Unlikely to be normal 170 | Kolmogorov-Smirnov test 0.0000000 Unlikely to be normal 171 | Shapiro-Wilk test 0.0580895 Possibly normal 172 | >>> eda_report.summarize(iris_data) 173 | 174 | Summary Statistics for Numeric features (4) 175 | ------------------------------------------- 176 | count avg stddev min 25% 50% 75% max skewness kurtosis 177 | sepal_length 150 5.8433 0.8281 4.3 5.1 5.80 6.4 7.9 0.3149 -0.5521 178 | sepal_width 150 3.0573 0.4359 2.0 2.8 3.00 3.3 4.4 0.3190 0.2282 179 | petal_length 150 3.7580 1.7653 1.0 1.6 4.35 5.1 6.9 -0.2749 -1.4021 180 | petal_width 150 1.1993 0.7622 0.1 0.3 1.30 1.8 2.5 -0.1030 -1.3406 181 | 182 | Summary Statistics for Categorical features (1) 183 | ----------------------------------------------- 184 | count unique top freq relative freq 185 | species 150 3 setosa 50 33.33% 186 | 187 | 188 | Pearson's Correlation (Top 20) 189 | ------------------------------ 190 | petal_length & petal_width -> very strong positive correlation (0.96) 191 | sepal_length & petal_length -> very strong positive correlation (0.87) 192 | sepal_length & petal_width -> very strong positive correlation (0.82) 193 | sepal_width & petal_length -> moderate negative correlation (-0.43) 194 | sepal_width & petal_width -> weak negative correlation (-0.37) 195 | sepal_length & sepal_width -> very weak negative correlation (-0.12) 196 | -------------------------------------------------------------------------------- /tests/test_bivariate_analysis.py: 
-------------------------------------------------------------------------------- 1 | import pytest 2 | from pandas import DataFrame 3 | 4 | from eda_report.bivariate import ( 5 | Dataset, 6 | _compute_correlation, 7 | _describe_correlation, 8 | ) 9 | 10 | sample_data = DataFrame( 11 | { 12 | "A": range(50), 13 | "B": list("abcdef") * 8 + ["a"] * 2, 14 | "C": [True, False] * 24 + [True] * 2, 15 | "D": [1, 3, 5, 7, 9, 11, 13] * 7 + [17], 16 | } 17 | ) 18 | 19 | 20 | def test_correlation_computation(): 21 | data = sample_data.copy() 22 | assert _compute_correlation(None) is None 23 | 24 | # Check that < 2 numeric cols returns None 25 | assert _compute_correlation(data[["A", "B"]]) is None 26 | 27 | # Check that only numeric columns are processed 28 | assert _compute_correlation(data) == pytest.approx( 29 | [(("A", "D"), 0.21019754169815516)] 30 | ) 31 | 32 | 33 | def test_correlation_description(): 34 | assert ( 35 | _describe_correlation(0.9) == "very strong positive correlation (0.90)" 36 | ) 37 | assert _describe_correlation(-0.7) == "strong negative correlation (-0.70)" 38 | assert _describe_correlation(0.5) == "moderate positive correlation (0.50)" 39 | assert _describe_correlation(-0.3) == "weak negative correlation (-0.30)" 40 | assert ( 41 | _describe_correlation(0.1) == "very weak positive correlation (0.10)" 42 | ) 43 | assert _describe_correlation(0.025) == "virtually no correlation (0.03)" 44 | 45 | 46 | class TestDataset: 47 | dataset = Dataset(sample_data.copy()) 48 | 49 | def test_stored_data(self): 50 | assert isinstance(self.dataset.data, DataFrame) 51 | 52 | def test_categorical_summary_statistics(self): 53 | assert self.dataset._categorical_stats.to_dict() == { 54 | "count": {"B": 50, "C": 50, "D": 50}, 55 | "unique": {"B": 6, "C": 2, "D": 8}, 56 | "top": {"B": "a", "C": True, "D": 1}, 57 | "freq": {"B": 10, "C": 26, "D": 7}, 58 | "relative freq": {"B": "20.00%", "C": "52.00%", "D": "14.00%"}, 59 | } 60 | 61 | def 
test_numeric_summary_statistics(self): 62 | assert self.dataset._numeric_stats.to_dict( 63 | orient="list" 64 | ) == pytest.approx( 65 | { 66 | "count": [50], 67 | "avg": [24.5], 68 | "stddev": [14.5774], 69 | "min": [0.0], 70 | "25%": [12.25], 71 | "50%": [24.5], 72 | "75%": [36.75], 73 | "max": [49.0], 74 | "skewness": [0.0], 75 | "kurtosis": [-1.2], 76 | } 77 | ) 78 | 79 | def test_correlation(self): 80 | assert self.dataset._correlation_values == pytest.approx( 81 | [(("A", "D"), 0.21019754169815516)] 82 | ) 83 | assert self.dataset._correlation_descriptions == { 84 | ("A", "D"): "weak positive correlation (0.21)" 85 | } 86 | 87 | def test_repr(self): 88 | assert str(self.dataset) == ( 89 | "\n\t\t Summary Statistics for Numeric features (1)\n\t\t ------" 90 | "-------------------------------------\n count avg stddev " 91 | " min 25% 50% 75% max skewness kurtosis\n A 50 " 92 | "24.5 14.5774 0.0 12.25 24.5 36.75 49.0 0.0 -1.2" 93 | "\n\n\t\tSummary Statistics for Categorical features (3)\n\t\t---" 94 | "--------------------------------------------\n\t count " 95 | "unique top freq relative freq\n\t B 50 6 a 10" 96 | " 20.00%\n\t C 50 2 True 26 52.00%\n" 97 | "\t D 50 8 1 7 14.00%\n\n\n\t\t\t" 98 | "Pearson's Correlation (Top 20)\n\t\t\t--------------------------" 99 | "----\n A & D -> weak positive " 100 | "correlation (0.21)\n\t" 101 | ) 102 | 103 | def test_numeric_only_repr(self): 104 | numeric_only = Dataset(sample_data[["A"]]) 105 | assert str(numeric_only) == ( 106 | "\n\t\t Summary Statistics for Numeric features (1)\n\t\t ------" 107 | "-------------------------------------\n count avg stddev " 108 | " min 25% 50% 75% max skewness kurtosis\n A 50 " 109 | "24.5 14.5774 0.0 12.25 24.5 36.75 49.0 0.0 -1.2" 110 | "\n\n\n\n\t" 111 | ) 112 | 113 | def test_categorical_only_repr(self, caplog: pytest.LogCaptureFixture): 114 | categorical_only = Dataset(sample_data[["B", "C"]]) 115 | assert ( 116 | "Skipped Bivariate Analysis: There are less than 2 numeric " 117 
| "variables." 118 | ) in str(caplog.text) 119 | assert str(categorical_only) == ( 120 | "\n\t\tSummary Statistics for Categorical features (2)\n\t\t-----" 121 | "------------------------------------------\n\t count unique" 122 | " top freq relative freq\n\t B 50 6 a 10 " 123 | " 20.00%\n\t C 50 2 True 26 52.00%\n\n\n\t" 124 | ) 125 | 126 | def test_correlation_info_truncation_(self): 127 | plenty_numeric = Dataset( 128 | DataFrame( 129 | { 130 | "A": range(11), 131 | "B": [0, 1, 2, 4, 5, 7, 8, 8, 9, 9, 4], 132 | "C": [0, 9, 2, 4, 5, 7, 8, 8, 9, 9, 1], 133 | "D": [2, 9, 2, 2, 4, 9, 8, 7, 9, 9, 3], 134 | "E": [2, 4, 2, 2, 4, 9, 2, 7, 4, 5, 8], 135 | "F": [9, 4, 2, 5, 3, 0, 2, 2, 4, 7, 6], 136 | "G": [9, 4, 9, 0, 3, 8, 7, 1, 9, 5, 2], 137 | } 138 | ) 139 | ) 140 | # In particular, only the top 20 correlation descriptions should be 141 | # displayed. 142 | assert len(plenty_numeric._correlation_descriptions) == 21 143 | assert str(plenty_numeric) == ( 144 | "\n\t\t Summary Statistics for Numeric features (1)\n\t\t ------" 145 | "-------------------------------------\n count avg stddev " 146 | "min 25% 50% 75% max skewness kurtosis\n A 11 5.0 " 147 | "3.3166 0.0 2.5 5.0 7.5 10.0 0.0 -1.2\n\n\t\t" 148 | "Summary Statistics for Categorical features (6)\n\t\t------------" 149 | "-----------------------------------\n\t count unique top " 150 | "freq relative freq\n\t B 11 8 4 2 18.18%" 151 | "\n\t C 11 8 9 3 27.27%\n\t D 11 " 152 | " 6 9 4 36.36%\n\t E 11 6 2 4 " 153 | " 36.36%\n\t F 11 8 2 3 27.27%\n\t G " 154 | " 11 9 9 3 27.27%\n\n\n\t\t\tPearson's " 155 | "Correlation (Top 20)\n\t\t\t------------------------------\n " 156 | " C & D -> very strong positive correlation" 157 | " (0.92)\n A & B -> strong positive " 158 | "correlation (0.78)\n B & C -> strong " 159 | "positive correlation (0.68)\n B & D ->" 160 | " strong positive correlation (0.64)\n " 161 | "A & E -> moderate positive correlation (0.57)\n " 162 | " C & F -> moderate negative correlation (-0.40)\n " 163 | 
" A & D -> weak positive correlation (0.38)\n" 164 | " D & E -> weak positive correlation " 165 | "(0.37)\n B & E -> weak positive " 166 | "correlation (0.36)\n B & F -> weak " 167 | "negative correlation (-0.35)\n D & F -" 168 | "> weak negative correlation (-0.35)\n " 169 | "A & C -> weak positive correlation (0.33)\n " 170 | " E & F -> weak negative correlation (-0.29)\n " 171 | " E & G -> weak negative correlation (-0.23)\n " 172 | " A & G -> weak negative correlation (-0.22)\n" 173 | " C & E -> very weak positive " 174 | "correlation (0.18)\n D & G -> very " 175 | "weak positive correlation (0.18)\n F &" 176 | " G -> very weak negative correlation (-0.06)\n " 177 | " A & F -> virtually no correlation (-0.05)\n " 178 | " B & G -> virtually no correlation (-0.04)\n\t" 179 | ) 180 | -------------------------------------------------------------------------------- /tests/test_univariate_analysis.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | from pandas import DataFrame, Timestamp, date_range 3 | 4 | from eda_report.univariate import Variable, _analyze_univariate 5 | 6 | 7 | class TestDtypeDetection: 8 | def test_bool_detection(self): 9 | boolean = Variable([True, False, True]) 10 | assert boolean.var_type == "boolean" 11 | 12 | def test_categorical_detection(self): 13 | categorical = Variable(list("abcdefg")) 14 | assert categorical.var_type == "categorical" 15 | 16 | def test_datetime_detection(self): 17 | datetime = Variable(date_range("2022-01-01", periods=5, freq="D")) 18 | assert datetime.var_type == "datetime" 19 | 20 | def test_numeric_detection(self): 21 | numeric = Variable(range(20)) 22 | assert numeric.var_type == "numeric" 23 | 24 | 25 | class TestGeneralVariableProperties: 26 | variable = Variable(list(range(20)) + [None], name="some-variable") 27 | unnamed_variable = Variable(list("ababdea")) 28 | 29 | def test_missing_values(self): 30 | assert self.variable.missing == "1 (4.76%)" 31 | 
assert self.variable._num_non_null == 20 32 | assert self.unnamed_variable.missing is None 33 | assert self.unnamed_variable._num_non_null == 7 34 | 35 | def test_name(self): 36 | assert self.variable.name == "some-variable" 37 | assert self.unnamed_variable.name is None 38 | 39 | def test_renaming(self): 40 | self.unnamed_variable.rename(name="new name") 41 | assert self.unnamed_variable.name == "new name" 42 | 43 | self.variable.rename("another new name") 44 | assert self.variable.name == "another new name" 45 | 46 | def test_unique_values(self): 47 | assert self.variable.num_unique == 20 48 | assert self.variable.unique_values == pytest.approx(list(range(20))) 49 | 50 | assert self.unnamed_variable.num_unique == 4 51 | assert all(self.unnamed_variable.unique_values == list("abde")) 52 | 53 | 54 | class TestCategoricalVariables: 55 | categorical_variable = Variable(["a", "b", "c", "d", None, "a"]) 56 | # Numeric variables with less than 10 unique values are treated as 57 | # categorical. 
58 | numeric_categories = Variable([1, 2, 3] * 10) 59 | 60 | def test_variable_type(self): 61 | assert self.categorical_variable.var_type == "categorical" 62 | assert self.numeric_categories.var_type == "numeric (<=10 levels)" 63 | 64 | def test_summary_statistics(self): 65 | assert self.categorical_variable.summary_stats == { 66 | "Mode (Most frequent)": "a", 67 | "Maximum frequency": 2, 68 | } 69 | assert self.categorical_variable._most_common_categories == { 70 | "a": "2 (40.00%)", 71 | "b": "1 (20.00%)", 72 | "c": "1 (20.00%)", 73 | "d": "1 (20.00%)", 74 | } 75 | 76 | def test_normality_results(self): 77 | assert self.categorical_variable._normality_test_results is None 78 | assert self.numeric_categories._normality_test_results is None 79 | 80 | def test_repr(self): 81 | assert str(self.categorical_variable) == ( 82 | "\nName: None\nType: categorical\nNon-null Observations: 5" 83 | "\nUnique Values: 4 -> ['a' 'b' 'c' 'd']\nMissing Values: " 84 | "1 (16.67%)\nMode (Most frequent): a\nMaximum frequency: 2" 85 | "\n\n\t\tMost Common Items\n\t\t-----------------\n " 86 | " a: 2 (40.00%)\n b: " 87 | "1 (20.00%)\n c: 1 (20.00%)\n " 88 | " d: 1 (20.00%)" 89 | ) 90 | 91 | 92 | class TestBooleanVariables: 93 | # Boolean variables are treated as categorical. Only the var_type differs. 
94 | boolean_variable = Variable([True, False, True, None] * 5) 95 | numeric_bool = Variable([1, 0, 1, None] * 5) 96 | str_bool_1 = Variable(["Yes", "No", "Yes"] * 5) 97 | str_bool_2 = Variable(["Y", "N", "Y"] * 5) 98 | 99 | def test_dtype(self): 100 | assert self.boolean_variable.var_type == "boolean" 101 | assert self.numeric_bool.var_type == "boolean" 102 | assert self.str_bool_1.var_type == "boolean" 103 | assert self.str_bool_2.var_type == "boolean" 104 | 105 | 106 | class TestDateTimeVariables: 107 | datetime_variable = Variable( 108 | date_range("01-01-2022", periods=10, freq="D"), name="dates" 109 | ) 110 | 111 | def test_variable_type(self): 112 | assert self.datetime_variable.var_type == "datetime" 113 | 114 | def test_summary_statistics(self): 115 | assert self.datetime_variable.summary_stats == { 116 | "Average": Timestamp("2022-01-05 12:00:00"), 117 | "Minimum": Timestamp("2022-01-01 00:00:00"), 118 | "Lower Quartile": Timestamp("2022-01-03 06:00:00"), 119 | "Median": Timestamp("2022-01-05 12:00:00"), 120 | "Upper Quartile": Timestamp("2022-01-07 18:00:00"), 121 | "Maximum": Timestamp("2022-01-10 00:00:00"), 122 | } 123 | assert self.datetime_variable._most_common_categories is None 124 | 125 | def test_normality_results(self): 126 | assert self.datetime_variable._normality_test_results is None 127 | 128 | def test_repr(self): 129 | assert str(self.datetime_variable) == ( 130 | "\nName: dates\nType: datetime\nNon-null Observations: 10\n" 131 | "Unique Values: 10 -> ['2022-01-01T00:00:00.000000000' ... 
" 132 | "]\nMissing Values: None\n\n\t\t Summary Statistics\n\t\t " 133 | " ------------------\n\tAverage: 2022-01-05 12" 134 | ":00:00\n\tMinimum: 2022-01-01 00:00:00\n\t" 135 | "Lower Quartile: 2022-01-03 06:00:00\n\tMedian: " 136 | " 2022-01-05 12:00:00\n\tUpper Quartile: 2022" 137 | "-01-07 18:00:00\n\tMaximum: 2022-01-10 00:00:00" 138 | ) 139 | 140 | 141 | class TestNumericVariable: 142 | numeric_variable = Variable(data=range(50), name="1 to 50") 143 | 144 | def test_variable_type(self): 145 | assert self.numeric_variable.var_type == "numeric" 146 | 147 | def test_summary_statistics(self): 148 | assert self.numeric_variable.summary_stats == pytest.approx( 149 | { 150 | "Average": 24.5, 151 | "Standard Deviation": 14.577379737113251, 152 | "Minimum": 0.0, 153 | "Lower Quartile": 12.25, 154 | "Median": 24.5, 155 | "Upper Quartile": 36.75, 156 | "Maximum": 49.0, 157 | "Skewness": 0.0, 158 | "Kurtosis": -1.2, 159 | } 160 | ) 161 | assert self.numeric_variable._most_common_categories is None 162 | 163 | def test_normality_results(self): 164 | assert isinstance( 165 | self.numeric_variable._normality_test_results, DataFrame 166 | ) 167 | assert self.numeric_variable._normality_test_results.to_dict() == { 168 | "p-value": { 169 | "D'Agostino's K-squared test": "0.0015981", 170 | "Kolmogorov-Smirnov test": "0.0000000", 171 | "Shapiro-Wilk test": "0.0580919", 172 | }, 173 | "Conclusion at α = 0.05": { 174 | "D'Agostino's K-squared test": "Unlikely to be normal", 175 | "Kolmogorov-Smirnov test": "Unlikely to be normal", 176 | "Shapiro-Wilk test": "Possibly normal", 177 | }, 178 | } 179 | 180 | def test_repr(self): 181 | assert str(self.numeric_variable) == ( 182 | "\nName: 1 to 50\nType: numeric\nNon-null Observations: 50" 183 | "\nUnique Values: 50 -> [ 0 1 2 3 4 5 6 7 8 9 10 11 12 13 1" 184 | "4 15 16 17 18 ... 
]\nMissing Values: None\n\n\t\t Summary" 185 | " Statistics\n\t\t ------------------\n\tAverage: " 186 | " 24.5000\n\tStandard Deviation: 14.5" 187 | "774\n\tMinimum: 0.0000\n\tLower Quar" 188 | "tile: 12.2500\n\tMedian: " 189 | " 24.5000\n\tUpper Quartile: 36.7500\n\tMax" 190 | "imum: 49.0000\n\tSkewness: " 191 | " 0.0000\n\tKurtosis: -1.2000" 192 | "\n\n\t\t Tests for Normality\n\t\t -------------------\n" 193 | " p-value Conclusion at α = 0" 194 | ".05\nD'Agostino's K-squared test 0.0015981 Unlikely to " 195 | "be normal\nKolmogorov-Smirnov test 0.0000000 Unlikel" 196 | "y to be normal\nShapiro-Wilk test 0.0580919 " 197 | " Possibly normal" 198 | ) 199 | 200 | 201 | def test_analyse_variable(): 202 | name, variable = _analyze_univariate(("wantufifty", range(50))) 203 | 204 | assert name == "wantufifty" 205 | assert isinstance(variable, Variable) 206 | -------------------------------------------------------------------------------- /eda_report/gui.py: -------------------------------------------------------------------------------- 1 | import pkgutil 2 | 3 | try: 4 | from tkinter import Button, Canvas, Frame, Label, PhotoImage, StringVar 5 | from tkinter.colorchooser import askcolor 6 | from tkinter.filedialog import askopenfilename, asksaveasfilename 7 | from tkinter.messagebox import ( 8 | askretrycancel, 9 | askyesno, 10 | showinfo, 11 | showwarning, 12 | ) 13 | from tkinter.simpledialog import askstring 14 | except (ImportError, ModuleNotFoundError) as error: 15 | print( 16 | f"Unable to lauch app window because:\n\n\t* {error}.\n\n" 17 | "Please visit https://tkdocs.com/tutorial/install.html for" 18 | " help installing it.\n\nYou can still use the eda-report command. 
" 19 | "Try 'eda-report -h' for more details.\n" 20 | ) 21 | exit() 22 | 23 | from eda_report._read_file import df_from_file 24 | from eda_report._validate import _validate_groupby_variable 25 | from eda_report.document import ReportDocument 26 | from eda_report.exceptions import GroupbyVariableError 27 | 28 | background_image = pkgutil.get_data(__name__, "images/background.png") 29 | icon = pkgutil.get_data(__name__, "images/icon.png") 30 | 31 | description = ( 32 | "Speed up exploratory data analysis & reporting.\n\n" 33 | "Automatically analyze files, and get a Word report complete with " 34 | "summary statistics and graphs." 35 | ) 36 | 37 | 38 | class EDAGUI(Frame): # pragma: no cover 39 | """The blueprint for the :mod:`tkinter` - based *graphical user 40 | interface* to the application. 41 | 42 | .. figure:: _static/screencast.* 43 | :alt: an image of the graphical user interface 44 | 45 | The "Select a file" button launches a *file-dialog* to navigate to and 46 | select a file to analyze. 47 | 48 | If a valid file is selected, *text-input widgets* and a *color-picker 49 | tool* pop up to help set the report's *title*, 50 | *target/groupby variable(optional)* and *graph color*. 51 | 52 | Afterwards, a final file-dialog appears to help set the destination 53 | for the generated report. 54 | 55 | .. tip:: 56 | For help with `Tk` - related issues, consider visiting `TkDocs`_. 57 | 58 | .. 
_`TkDocs`: https://tkdocs.com/index.html 59 | """ 60 | 61 | def __init__(self, master=None, **kwargs) -> None: 62 | super().__init__(master) 63 | self.master.title("eda-report") 64 | self.master.geometry("560x320") 65 | self.master.resizable(False, False) # Fix window size 66 | self.master.wm_iconphoto(True, PhotoImage(data=icon)) 67 | self._create_widgets() 68 | self.pack() 69 | 70 | def _create_widgets(self) -> None: 71 | """Creates the widgets for the graphical user interface: A Tk *Frame* 72 | with the *canvas(background image)*, *introductory text*, and a 73 | *button* to select files to analyze. 74 | """ 75 | self.canvas = Canvas(self, width=560, height=320) 76 | # Set background image 77 | self.bg_image = PhotoImage(data=background_image) 78 | self.canvas.create_image((0, 0), image=self.bg_image, anchor="nw") 79 | # Add title 80 | self.canvas.create_text( 81 | (70, 30), 82 | anchor="nw", 83 | fill="black", 84 | font=("Courier", 28, "bold"), 85 | text="eda-report", 86 | ) 87 | # Add description 88 | self.canvas.create_text( 89 | (40, 90), 90 | anchor="nw", 91 | fill="black", 92 | font=("Courier", 12), 93 | text=description, 94 | width=480, 95 | ) 96 | # Add a button to select input file 97 | self.button = Button( 98 | self, 99 | bg="#204060", 100 | command=self._create_report, 101 | default="active", 102 | fg="white", 103 | font=("Courier", 11), 104 | relief="flat", 105 | text="Select a file", 106 | ) 107 | self.canvas.create_window( 108 | (180, 220), anchor="nw", height=40, width=200, window=self.button 109 | ) 110 | # Display current action 111 | self.current_action = StringVar() 112 | self.display_current_action = Label( 113 | self, 114 | bg="#c0d6e3", 115 | font=("Courier", 10, "italic"), 116 | textvariable=self.current_action, 117 | ) 118 | self.canvas.create_window( 119 | (140, 280), 120 | anchor="nw", 121 | window=self.display_current_action, 122 | ) 123 | self.canvas.pack() 124 | 125 | def _create_report(self) -> None: 126 | """Collects input from the 
graphical user interface, and uses the 127 | :class:`~eda_report.document.ReportDocument` object to generate a 128 | report. 129 | """ 130 | self.current_action.set("Waiting for input file...") 131 | self._get_data_from_file() 132 | 133 | if self.data is not None: 134 | self.current_action.set("Waiting for report title...") 135 | self._get_report_title() 136 | 137 | self.current_action.set("Waiting for group-by variable...") 138 | self._get_groupby_variable() 139 | 140 | self.current_action.set("Waiting for graph color...") 141 | self._get_graph_color() 142 | 143 | self.current_action.set("Analysing data & compiling the report...") 144 | self._get_save_as_name() 145 | 146 | # Generate and save the report using the collected arguments 147 | ReportDocument( 148 | self.data, 149 | title=self.report_title, 150 | graph_color=self.graph_color, 151 | output_filename=self.save_name, 152 | groupby_variable=self.groupby_variable, 153 | ) 154 | self.current_action.set("") 155 | showinfo(message=f"Done! Report saved as {self.save_name!r}.") 156 | 157 | # Clear data to free up memory 158 | del self.data 159 | 160 | def _get_data_from_file(self, retries: int = 1) -> None: 161 | """Creates a file dialog to help navigate to and select a file to 162 | analyze. 163 | 164 | Args: 165 | retries (int, optional): Number of additional prompts, if input is 166 | invalid. 
167 | """ 168 | file_name = askopenfilename( 169 | title="Select a file to analyze", 170 | filetypes=( 171 | ("All supported formats", ("*.csv", "*.xlsx")), 172 | ("csv", "*.csv"), 173 | ("excel", "*.xlsx"), 174 | ), 175 | ) 176 | if file_name: 177 | self.data = df_from_file(file_name) 178 | elif retries > 0: 179 | if askretrycancel(message="Please select a file to continue"): 180 | self._get_data_from_file(retries - 1) 181 | else: 182 | # No data if retry prompt is cancelled 183 | self.data = None 184 | else: 185 | # No data if no file is selected and retry has been used up 186 | self.data = None 187 | 188 | def _get_report_title(self) -> None: 189 | """Capture text input for the desired report title.""" 190 | report_title = askstring( 191 | title="Report Title", 192 | prompt="Please enter your preferred title for the report:", 193 | initialvalue="Exploratory Data Analysis Report", 194 | ) 195 | self.report_title = report_title or "Exploratory Data Analysis Report" 196 | 197 | def _get_groupby_variable(self) -> None: 198 | """Inquire about the groupby variable, and create a text box to 199 | collect input. 200 | """ 201 | if askyesno( 202 | message="Would you like to specify a variable to group by?" 203 | ): 204 | self.groupby_variable = askstring( 205 | title="Select Group-by Variable", 206 | prompt="Please enter the name/index of the group-by variable:", 207 | ) 208 | try: 209 | _validate_groupby_variable( 210 | data=self.data, groupby_variable=self.groupby_variable 211 | ) 212 | except GroupbyVariableError as error: 213 | self.groupby_variable = None 214 | showwarning( 215 | title="Invalid Group-By Variable", message=error.message 216 | ) 217 | else: 218 | self.groupby_variable = None 219 | 220 | def _get_graph_color(self) -> None: 221 | """Creates a graphical color picking tool to help set the desired 222 | color for the generated graphs. 
223 | """ 224 | color = askcolor( 225 | color="cyan", title="Please select a color for the graphs" 226 | ) 227 | # Pick the hexadecimal color format. `askcolor` returns a tuple e.g 228 | # ((255.99609375, 69.26953125, 0.0), '#ff4500'). 229 | self.graph_color = color[-1] or "cyan" 230 | 231 | def _get_save_as_name(self) -> None: 232 | """Create a file dialog to set destination of the generated report.""" 233 | save_name = asksaveasfilename( 234 | initialdir=".", 235 | initialfile="eda-report.docx", 236 | filetypes=(("Word document", "*.docx"),), 237 | title="Please select Save As file name", 238 | ) 239 | self.save_name = save_name or "eda-report.docx" 240 | -------------------------------------------------------------------------------- /eda_report/univariate.py: -------------------------------------------------------------------------------- 1 | from collections.abc import Iterable 2 | from textwrap import shorten 3 | from typing import Dict, Optional, Tuple 4 | 5 | import numpy as np 6 | from pandas import DataFrame, Series 7 | from pandas.api.types import ( 8 | is_bool_dtype, 9 | is_datetime64_any_dtype, 10 | is_numeric_dtype, 11 | ) 12 | from scipy import stats 13 | 14 | from eda_report._validate import _validate_univariate_input 15 | 16 | 17 | class Variable: 18 | """Obtain summary statistics and properties such as data type, missing 19 | value info & cardinality from one-dimensional datasets. 20 | 21 | Args: 22 | data (Iterable): The data to analyze. 23 | name (str, optional): The name to assign the variable. Defaults to 24 | None. 25 | 26 | Examples: 27 | .. literalinclude:: examples.txt 28 | :lines: 6-32 29 | .. literalinclude:: examples.txt 30 | :lines: 36-50 31 | .. literalinclude:: examples.txt 32 | :lines: 54-71 33 | """ 34 | 35 | def __init__(self, data: Iterable, *, name: str = None) -> None: 36 | data = _validate_univariate_input(data, name=name) 37 | 38 | #: str: The variable's *name*. 
If no name is specified, the name will 39 | #: be set the value of the ``name`` attribute of the input data, or 40 | #: ``None``. 41 | self.name = data.name 42 | 43 | #: str: The type of variable — one of *"boolean"*, *"categorical"*, 44 | #: *"datetime"*, *"numeric"* or *"numeric (<=10 levels)"*. 45 | self.var_type = self._get_variable_type(data) 46 | 47 | #: int: The *number of unique values* present in the variable. 48 | self.num_unique = data.nunique() 49 | 50 | #: numpy.ndarray: The *unique values* present in the variable. 51 | self.unique_values = np.sort(data.dropna().unique()) 52 | 53 | #: str: The number of *missing values* in the form 54 | #: ``number (% of total count)`` e.g "4 (16.67%)". 55 | self.missing = self._get_missing_values_info(data) 56 | 57 | #: dict: Descriptive statistics 58 | self.summary_stats = self._get_summary_statistics(data) 59 | 60 | self._num_non_null = len(data.dropna()) 61 | self._normality_test_results = self._test_for_normality(data) 62 | self._most_common_categories = self._get_most_common_categories(data) 63 | 64 | def __repr__(self) -> str: 65 | """Define the string representation of a `Variable`. 66 | 67 | Returns: 68 | str: Variable summary. 69 | """ 70 | sample_values = shorten( 71 | f"{self.num_unique} -> {self.unique_values}", 72 | width=60, 73 | placeholder=" ... 
]", 74 | ) 75 | basic_details = "\n".join( 76 | [ 77 | f"\nName: {self.name}", 78 | f"Type: {self.var_type}", 79 | f"Non-null Observations: {self._num_non_null}", 80 | f"Unique Values: {sample_values}", 81 | f"Missing Values: {self.missing}", 82 | ] 83 | ) 84 | if self.var_type == "numeric": 85 | summary_stats = "\n".join( 86 | [ 87 | f"\t{key + ':':21} {value :>15.4f}" 88 | for key, value in self.summary_stats.items() 89 | ], 90 | ) 91 | return "\n".join( 92 | [ 93 | f"{basic_details}\n", 94 | "\t\t Summary Statistics", 95 | "\t\t ------------------", 96 | summary_stats, 97 | "\n\t\t Tests for Normality", 98 | "\t\t -------------------", 99 | f"{self._normality_test_results}", 100 | ] 101 | ) 102 | elif self.var_type == "datetime": 103 | summary_stats = "\n".join( 104 | [ 105 | f"\t{key + ':':18} {str(value):>22}" 106 | for key, value in self.summary_stats.items() 107 | ], 108 | ) 109 | return "\n".join( 110 | [ 111 | f"{basic_details}\n", 112 | "\t\t Summary Statistics", 113 | "\t\t ------------------", 114 | summary_stats, 115 | ] 116 | ) 117 | else: 118 | summary_stats = "\n".join( 119 | [ 120 | f"{key}: {value}" 121 | for key, value in self.summary_stats.items() 122 | ] 123 | ) 124 | most_common = "\n".join( 125 | [ 126 | f"{str(key):>24}: {value}" 127 | for key, value in self._most_common_categories.items() 128 | ] 129 | ) 130 | return "\n".join( 131 | [ 132 | basic_details, 133 | summary_stats, 134 | "\n\t\tMost Common Items", 135 | "\t\t-----------------", 136 | most_common, 137 | ] 138 | ) 139 | 140 | def _get_variable_type(self, data: Series) -> str: 141 | """Determine the variable type. 142 | 143 | Args: 144 | data (pandas.Series): The data to analyze. 145 | 146 | Returns: 147 | str: The variable type: `boolean`, `categorical`, `datetime`, 148 | `numeric` or `numeric (<10 levels)`. 
149 | """ 150 | if is_numeric_dtype(data): 151 | if is_bool_dtype(data) or set(data.dropna()) == {0, 1}: 152 | # Consider data consisting of ones and zeros as boolean 153 | return "boolean" 154 | elif data.nunique() <= 10: 155 | # Consider numeric data with cardinality <= 10 as categorical 156 | return "numeric (<=10 levels)" 157 | else: 158 | return "numeric" 159 | # Accomodate common values for boolean variables 160 | elif set(data.dropna()) in [ 161 | {False, True}, 162 | {"False", "True"}, 163 | {"No", "Yes"}, 164 | {"N", "Y"}, 165 | ]: 166 | return "boolean" 167 | elif is_datetime64_any_dtype(data): 168 | return "datetime" 169 | else: 170 | return "categorical" 171 | 172 | def _get_missing_values_info(self, data: Series) -> Optional[str]: 173 | """Get the number of missing values. 174 | 175 | Args: 176 | data (pandas.Series): The data to analyze. 177 | 178 | Returns: 179 | Optional[str]: Details about the number of missing values. 180 | """ 181 | missing_values = data.isna().sum() 182 | if missing_values == 0: 183 | return None 184 | else: 185 | return f"{missing_values:,} ({missing_values / len(data):.2%})" 186 | 187 | def _get_summary_statistics(self, data: Series) -> Dict: 188 | """Compute summary statistics for the variable based on data type. 189 | 190 | Args: 191 | data (pandas.Series): The data to analyze. 192 | 193 | Returns: 194 | Dict: Summary statistics. 
195 | """ 196 | if self.var_type == "numeric": 197 | stats = data.describe() 198 | return { 199 | "Average": stats["mean"], 200 | "Standard Deviation": stats["std"], 201 | "Minimum": stats["min"], 202 | "Lower Quartile": stats["25%"], 203 | "Median": stats["50%"], 204 | "Upper Quartile": stats["75%"], 205 | "Maximum": stats["max"], 206 | "Skewness": data.skew(), 207 | "Kurtosis": data.kurt(), 208 | } 209 | elif self.var_type == "datetime": 210 | stats = data.describe() 211 | return { 212 | "Average": stats["mean"], 213 | "Minimum": stats["min"], 214 | "Lower Quartile": stats["25%"], 215 | "Median": stats["50%"], 216 | "Upper Quartile": stats["75%"], 217 | "Maximum": stats["max"], 218 | } 219 | else: 220 | data = data.copy().astype("category") 221 | stats = data.describe() 222 | return { 223 | "Mode (Most frequent)": stats["top"], 224 | "Maximum frequency": stats["freq"], 225 | } 226 | 227 | def _test_for_normality( 228 | self, data: Series, alpha: float = 0.05 229 | ) -> DataFrame: 230 | """Perform the "D'Agostino's K-squared", "Kolmogorov-Smirnov" and 231 | "Shapiro-Wilk" tests for normality. 232 | 233 | Args: 234 | data (pandas.Series): The data to analyze. 235 | alpha (float, optional): The level of significance. Defaults to 236 | 0.05. 237 | 238 | Returns: 239 | pandas.DataFrame: Table of results. 240 | """ 241 | data = data.dropna() 242 | if self.var_type == "numeric": 243 | # The scipy implementation of the Shapiro-Wilk test reports: 244 | # "For N > 5000 the W test statistic is accurate but the p-value 245 | # may not be." 
246 | shapiro_sample = data.sample(5000) if len(data) > 5000 else data 247 | tests = [ 248 | "D'Agostino's K-squared test", 249 | "Kolmogorov-Smirnov test", 250 | "Shapiro-Wilk test", 251 | ] 252 | p_values = [ 253 | stats.normaltest(data).pvalue, 254 | stats.kstest(data, "norm", N=200).pvalue, 255 | stats.shapiro(shapiro_sample).pvalue, 256 | ] 257 | results = DataFrame(index=tests) 258 | results["p-value"] = [f"{x:.7f}" for x in p_values] 259 | results[f"Conclusion at α = {alpha}"] = [ 260 | "Possibly normal" 261 | if p_value > alpha 262 | else "Unlikely to be normal" 263 | for p_value in p_values 264 | ] 265 | return results 266 | else: 267 | return None 268 | 269 | def _get_most_common_categories(self, data: Series) -> Dict: 270 | """Get the top 10 frequently occuring categories. 271 | 272 | Args: 273 | data (pandas.Series): The data to analyze. 274 | 275 | Returns: 276 | Dict: Top 10 categories and their frequency info. 277 | """ 278 | data = data.dropna() 279 | if self.var_type in {"numeric", "datetime"}: 280 | return None 281 | else: 282 | top_10 = data.value_counts().nlargest(10) 283 | return { 284 | key: f"{val} ({val/len(data):.2%})" 285 | for key, val in top_10.items() 286 | } 287 | 288 | def rename(self, name: str) -> None: 289 | """Update the variable's name. 290 | 291 | Args: 292 | name (str): New name. 293 | """ 294 | self.name = name 295 | 296 | 297 | def _analyze_univariate(name_and_data: Tuple) -> Variable: 298 | """Helper function to concurrently analyze data with multiprocessing. 299 | 300 | Args: 301 | name_and_data (Tuple): Name and data. 302 | 303 | Returns: 304 | Variable: `Variable` instance. 
305 | """ 306 | name, data = name_and_data 307 | var = Variable(data, name=name) 308 | return name, var 309 | -------------------------------------------------------------------------------- /eda_report/document.py: -------------------------------------------------------------------------------- 1 | import logging 2 | from typing import Iterable, Sequence, Union 3 | 4 | from docx import Document 5 | from docx.enum.text import WD_ALIGN_PARAGRAPH 6 | from docx.shared import Inches, Pt 7 | from docx.text.paragraph import Paragraph 8 | from pandas import DataFrame, Series 9 | 10 | from eda_report._content import _ReportContent 11 | 12 | logging.basicConfig( 13 | format="[%(levelname)s %(asctime)s.%(msecs)03d] %(message)s", 14 | level=logging.INFO, 15 | datefmt="%H:%M:%S", 16 | ) 17 | # Set matplotlib logging level to WARNING. 18 | mpl_logger = logging.getLogger("matplotlib") 19 | mpl_logger.setLevel(logging.WARNING) 20 | 21 | 22 | class ReportDocument(_ReportContent): 23 | """Creates a report :class:`~docx.document.Document` with analysis results. 24 | 25 | The report consists of 3 main sections: 26 | 27 | #. An **Overview** of the data and its features. 28 | #. **Univariate Analysis**: Summary statistics and graphs for each feature. 29 | #. **Bivariate Analysis**: Pair-wise comparisons of numerical features. 30 | 31 | Args: 32 | data (Iterable): The data to analyze. 33 | title (str, optional): The title to assign the report. Defaults to 34 | "Exploratory Data Analysis Report". 35 | graph_color (str, optional): The color to apply to the graphs. 36 | Defaults to "cyan". 37 | groupby_variable (Union[str, int], optional): The column to 38 | use to group values. Defaults to None. 39 | output_filename (str, optional): The name/path to save the document 40 | to. Defaults to "eda-report.docx". 41 | table_style (str, optional): The style to apply to the tables created. 42 | Defaults to "Table Grid". 
43 | """ 44 | 45 | def __init__( 46 | self, 47 | data: Iterable, 48 | *, 49 | title: str = "Exploratory Data Analysis Report", 50 | graph_color: str = "cyan", 51 | groupby_variable: Union[str, int] = None, 52 | output_filename: str = "eda-report.docx", 53 | table_style: str = "Table Grid", 54 | ) -> None: 55 | super().__init__( 56 | data, 57 | title=title, 58 | graph_color=graph_color, 59 | groupby_variable=groupby_variable, 60 | ) 61 | self.OUTPUT_FILENAME = output_filename 62 | self.TABLE_STYLE = table_style 63 | self.document = Document() # Initialize report document 64 | self._create_cover_page() 65 | self._get_univariate_analysis() 66 | 67 | if self.dataset._correlation_values is not None: 68 | self._get_bivariate_analysis() 69 | 70 | self._to_file() 71 | logging.info(f"Done. Results saved as {self.OUTPUT_FILENAME!r}") 72 | 73 | def _create_cover_page(self) -> None: 74 | """Add a title and overview of the data.""" 75 | self.document.add_heading(self.TITLE, level=0) 76 | self.document.add_paragraph(self.intro_text) 77 | self._get_numeric_overview_table() 78 | self._get_categorical_overview_table() 79 | self.document.add_page_break() 80 | 81 | def _get_numeric_overview_table(self) -> None: 82 | """Create a table with an overview of the numeric features present.""" 83 | if self.dataset._numeric_stats is None: 84 | return None 85 | else: 86 | heading = self.document.add_heading( 87 | "Overview of Numeric Features", level=1 88 | ) 89 | self._format_paragraph_spacing(heading) 90 | # count | avg | stddev | min | 25% | 50% | 75% | max 91 | self._create_table( 92 | data=self.dataset._numeric_stats, 93 | header=True, 94 | column_widths=(1.2,) + (0.7,) * 8, 95 | font_size=8.5, 96 | style="Normal Table", 97 | ) 98 | 99 | def _get_categorical_overview_table(self) -> None: 100 | """Create a table with an overview of the categorical features 101 | present. 
102 | """ 103 | if self.dataset._categorical_stats is None: 104 | return None 105 | else: 106 | heading = self.document.add_heading( 107 | "Overview of Categorical Features", level=1 108 | ) 109 | self._format_paragraph_spacing(heading) 110 | # column-name | count | unique | top | freq | relative freq 111 | self._create_table( 112 | data=self.dataset._categorical_stats, 113 | header=True, 114 | column_widths=(1.2,) + (0.9,) * 5, 115 | font_size=8.5, 116 | style="Normal Table", 117 | ) 118 | 119 | def _get_univariate_analysis(self) -> None: 120 | """Get a brief introduction, summary statistics, and graphs for each 121 | individual variable. 122 | """ 123 | univariate_heading = self.document.add_heading( 124 | "1. Univariate Analysis", level=1 125 | ) 126 | self._format_paragraph_spacing(univariate_heading, before=0, after=0) 127 | for idx, variable in enumerate(self.variables.values(), start=1): 128 | var_name = variable.name 129 | description = self.variable_descriptions[var_name] 130 | summary_stats = Series(self.univariate_stats[var_name]).to_frame() 131 | graphs = self.univariate_graphs[var_name] 132 | contingency_table = self.contingency_tables.get(var_name) 133 | normality_tests = self.normality_tests.get(var_name) 134 | # Variable's title and brief description 135 | heading = self.document.add_heading( 136 | f"1.{idx} {var_name}".title(), level=2 137 | ) 138 | self._format_paragraph_spacing(heading, before=12, after=5) 139 | self.document.add_paragraph(description) 140 | # Summary statistics table 141 | stats_heading = self.document.add_heading( 142 | "Summary Statistics", level=4 143 | ) 144 | stats_heading.alignment = WD_ALIGN_PARAGRAPH.CENTER 145 | self._create_table(summary_stats, column_widths=[2.5, 2]) 146 | # Images of plotted graphs 147 | for name, image in graphs.items(): 148 | width = 3.3 if name == "prob_plot" else 4.2 149 | self.document.add_picture(image, width=Inches(width)) 150 | picture_paragraph = self.document.paragraphs[-1] 151 | 
picture_paragraph.alignment = WD_ALIGN_PARAGRAPH.CENTER 152 | 153 | if contingency_table is not None: 154 | contingency_table_heading = self.document.add_heading( 155 | "Contingency table", level=4 156 | ) 157 | contingency_table_heading.alignment = WD_ALIGN_PARAGRAPH.CENTER 158 | context = self.document.add_paragraph( 159 | f"Index = '{var_name}', " 160 | f"Columns = '{self.GROUPBY_DATA.name}' " 161 | ) 162 | context.alignment = WD_ALIGN_PARAGRAPH.CENTER 163 | context.runs[0].font.size = Pt(8) 164 | n_cols = contingency_table.shape[1] 165 | max_width = 5.2 if n_cols > 5 else 3.2 166 | col_width = max_width / n_cols 167 | self._create_table( 168 | data=contingency_table, 169 | header=True, 170 | column_widths=(1.2,) + (col_width,) * n_cols, 171 | font_size=8.5, 172 | ) 173 | 174 | if normality_tests is not None: 175 | norm_test_heading = self.document.add_heading( 176 | "Tests for Normality", level=4 177 | ) 178 | norm_test_heading.alignment = WD_ALIGN_PARAGRAPH.CENTER 179 | # type | p-value | conclusion 180 | self._create_table( 181 | data=normality_tests, 182 | header=True, 183 | column_widths=(2.2, 1, 2), 184 | font_size=8.5, 185 | style="Normal Table", 186 | ) 187 | 188 | self.document.add_page_break() 189 | 190 | def _get_bivariate_analysis(self) -> None: 191 | """Get comparisons and regression-plots for pairs of numeric 192 | variables. 193 | """ 194 | bivariate_heading = self.document.add_heading( 195 | "2. 
Bivariate Analysis", level=1 196 | ) 197 | self._format_paragraph_spacing(bivariate_heading, before=0) 198 | overview_heading = self.document.add_heading("2.1 Overview", level=2) 199 | self._format_paragraph_spacing(overview_heading) 200 | self.document.add_picture( 201 | self.bivariate_graphs["correlation_plot"], 202 | width=Inches(6.4), 203 | ) 204 | picture_paragraph = self.document.paragraphs[-1] 205 | picture_paragraph.alignment = WD_ALIGN_PARAGRAPH.CENTER 206 | self.document.add_page_break() 207 | 208 | pairwise_heading = self.document.add_heading( 209 | "2.2 Regression Plots (Top 20)", level=2 210 | ) 211 | self._format_paragraph_spacing(pairwise_heading, before=0) 212 | for idx, var_pair in enumerate(self.bivariate_summaries, start=1): 213 | heading = self.document.add_heading( 214 | f"2.2.{idx} {var_pair[0]} vs {var_pair[1]}".title(), level=3 215 | ) 216 | self._format_paragraph_spacing(heading, before=16, after=5) 217 | self.document.add_paragraph(self.bivariate_summaries[var_pair]) 218 | self.document.add_picture( 219 | self.bivariate_graphs["regression_plots"][var_pair], 220 | width=Inches(3.3), 221 | ) 222 | picture_paragraph = self.document.paragraphs[-1] 223 | picture_paragraph.alignment = WD_ALIGN_PARAGRAPH.CENTER 224 | 225 | def _format_paragraph_spacing( 226 | self, paragraph: Paragraph, before: int = 15, after: int = 7 227 | ) -> None: 228 | """Set the spacing above or below a paragraph. 229 | 230 | Args: 231 | paragraph (docx.text.paragraph.Paragraph): A paragraph. 232 | before (int, optional): Size of spacing above the paragraph in pt. 233 | Defaults to 15. 234 | after (int, optional): Size of spacing below the paragraph in pt. 235 | Defaults to 7. 
236 | """ 237 | paragraph.paragraph_format.space_before = Pt(before) 238 | paragraph.paragraph_format.space_after = Pt(after) 239 | 240 | def _create_table( 241 | self, 242 | data: DataFrame, 243 | column_widths: Sequence = (), 244 | font_face: str = "Courier New", 245 | font_size: float = 10, 246 | style: str = None, 247 | header: bool = False, 248 | ) -> None: 249 | """Generates a table for the supplied ``data``. 250 | 251 | Args: 252 | data (DataFrame): The data to tabulate. 253 | column_widths (Sequence, optional): Column dimensions in inches. 254 | Defaults to (). 255 | font_face (str, optional): Font for cell text. Defaults to 256 | "Courier New". 257 | font_size (float, optional): Font size. Defaults to 10. 258 | style (str, optional): A `Word` table style. Defaults to 259 | None. 260 | header (bool, optional): Whether or not to include column names. 261 | Defaults to False. 262 | """ 263 | table = self.document.add_table( 264 | rows=0, 265 | cols=len(column_widths), 266 | style=style or self.document.styles[self.TABLE_STYLE], 267 | ) 268 | table.alignment = WD_ALIGN_PARAGRAPH.CENTER 269 | for idx, width in enumerate(column_widths): 270 | table.columns[idx].width = Inches(width) 271 | 272 | if header: 273 | cells = table.add_row().cells 274 | header_labels = [""] + list(data.columns) 275 | for cell, value in zip(cells, header_labels): 276 | cell.text = f"{value}" 277 | # Font size and type-face have to be set at `run` level 278 | run = cell.paragraphs[0].runs[0] 279 | run.bold = True 280 | run.font.size = Pt(font_size) 281 | run.font.name = font_face 282 | 283 | # Sequentially add and populate rows 284 | for row_data in data.itertuples(): 285 | cells = table.add_row().cells 286 | for idx, (cell, value) in enumerate(zip(cells, row_data)): 287 | try: 288 | # Strip trailing zeros from float values 289 | text = f"{value:.4f}".rstrip("0").rstrip(".") 290 | except ValueError: 291 | text = f"{value}" 292 | 293 | cell.text = text 294 | # Font size and type-face have 
to be set at `run` level 295 | run = cell.paragraphs[0].runs[0] 296 | run.font.size = Pt(font_size) 297 | run.font.name = font_face 298 | # Make first column values bold if header is True 299 | if idx == 0 and header: 300 | run.bold = True 301 | 302 | # Add empty paragraph. "Spacing" for docx Table isn't yet implemented 303 | self.document.add_paragraph() 304 | 305 | def _to_file(self) -> None: 306 | """Save the report as a file.""" 307 | for section in self.document.sections: 308 | section.left_margin = Inches(1.2) 309 | section.right_margin = Inches(1.2) 310 | 311 | self.document.save(self.OUTPUT_FILENAME) 312 | -------------------------------------------------------------------------------- /tests/test_plotting_functions.py: -------------------------------------------------------------------------------- 1 | from io import BytesIO 2 | 3 | import pytest 4 | from matplotlib.axes import Axes 5 | from matplotlib.colors import to_rgb 6 | from matplotlib.figure import Figure 7 | from pandas import DataFrame, Series 8 | 9 | from eda_report.bivariate import Dataset 10 | from eda_report.plotting import ( 11 | _get_or_validate_axes, 12 | _get_color_shades_of, 13 | _plot_dataset, 14 | _plot_regression, 15 | _plot_variable, 16 | _savefig, 17 | bar_plot, 18 | box_plot, 19 | kde_plot, 20 | plot_correlation, 21 | prob_plot, 22 | ) 23 | from eda_report.univariate import Variable 24 | 25 | 26 | def test_savefig_function(): 27 | saved = _savefig(figure=Figure()) 28 | assert isinstance(saved, BytesIO) 29 | 30 | 31 | def test_get_color_shades_of(): 32 | color, num_shades = "green", 5 33 | green_shades = _get_color_shades_of(color, num_shades) 34 | assert green_shades.shape == (num_shades, 3) # each color is an rgb tuple 35 | assert green_shades[0] == pytest.approx(to_rgb(color)) 36 | 37 | 38 | class TestGetAxesFunction: 39 | def test_without_input(self): 40 | ax = _get_or_validate_axes() 41 | assert isinstance(ax, Axes) 42 | 43 | def test_with_axes_input(self): 44 | ax1 = 
class TestBoxplot:
    """Checks for :func:`eda_report.plotting.box_plot` — titles, labels,
    grouping by ``hue``, and color handling.
    """

    # 25 numeric values plus 2 NaN (the NaN should be dropped when plotting)
    data = Series(list(range(25)) + [None, None])
    hue = Series([1, 2, 3] * 9, name="hue-name")
    simple_box = box_plot(data, label="simple")
    grouped_box = box_plot(data, label="grouped", hue=hue)

    def test_return_type(self):
        assert isinstance(self.simple_box, Axes)
        assert isinstance(self.grouped_box, Axes)

    def test_plot_title(self):
        assert self.simple_box.get_title() == "Box-plot of simple"
        assert self.grouped_box.get_title() == "Box-plot of grouped"

    def test_axis_labels(self):
        assert self.simple_box.get_xlabel() == ""
        assert self.simple_box.get_ylabel() == ""
        assert self.grouped_box.get_xlabel() == ""
        assert self.grouped_box.get_ylabel() == "Hue-Name"

        # A hue without a name attribute should yield no y-axis label
        boxplot_with_nameless_hue = box_plot(
            self.data, label="grouped", hue=self.hue.to_numpy()
        )
        assert boxplot_with_nameless_hue.get_xlabel() == ""
        assert boxplot_with_nameless_hue.get_ylabel() == ""

    def test_grouping(self):
        # Simple box-plot has one patch
        assert len(self.simple_box.patches) == 1
        # Grouped box-plot has hue.nunique() patches
        assert len(self.grouped_box.patches) == self.hue.nunique()

    def test_simple_set_color(self):
        box1_color = self.simple_box.patches[0].get_facecolor()

        _color = "blue"
        simple_box_2 = box_plot(self.data, label="simple", color=_color)
        box2_color = simple_box_2.patches[0].get_facecolor()

        assert box1_color == pytest.approx(
            (*to_rgb("C0"), 0.75)  # default color 1 and alpha value
        )
        assert box2_color == pytest.approx(
            (*to_rgb(_color), 0.75)  # _color and alpha value
        )

    def test_grouped_set_color(self):
        _color = "lime"
        # Take last patch since colors are reversed (["CN", .. , "C0"])
        last_box_color = self.grouped_box.patches[-1].get_facecolor()

        grouped_box_2 = box_plot(
            self.data, hue=self.hue, label="simple", color=_color
        )
        last_box2_color = grouped_box_2.patches[-1].get_facecolor()

        assert last_box_color == pytest.approx(
            (*to_rgb("C0"), 0.75)  # default color |hue| and alpha value
        )
        assert last_box2_color == pytest.approx(
            (*to_rgb(_color), 0.75)  # _color and alpha value
        )


class TestKdeplot:
    """Checks for :func:`eda_report.plotting.kde_plot` — titles, labels,
    legends, grouping, the singular-data fallback, and color handling.
    """

    # 25 numeric values plus 2 NaN (the NaN should be dropped when plotting)
    data = Series(list(range(25)) + [None, None])
    hue = Series([1, 2, 3] * 9, name="hue-name")
    simple_kde = kde_plot(data, label="simple")
    grouped_kde = kde_plot(data, label="grouped", hue=hue)

    def test_return_type(self):
        assert isinstance(self.simple_kde, Axes)
        assert isinstance(self.grouped_kde, Axes)

    def test_plot_title(self):
        assert self.simple_kde.get_title() == "Density plot of simple"
        assert self.grouped_kde.get_title() == "Density plot of grouped"

    def test_axis_labels(self):
        assert self.simple_kde.get_xlabel() == "simple"
        assert self.simple_kde.get_ylabel() == ""
        assert self.grouped_kde.get_xlabel() == "grouped"
        assert self.grouped_kde.get_ylabel() == ""

    def test_legend(self):
        assert self.simple_kde.get_legend() is None
        assert (
            self.grouped_kde.get_legend().get_title().get_text() == "Hue-Name"
        )
        # A hue without a name attribute should yield no legend
        grouped_with_nameless_hue = kde_plot(
            self.data, label="grouped", hue=self.hue.to_numpy()
        )
        assert grouped_with_nameless_hue.get_legend() is None

    def test_grouping(self):
        # simple_kde has one line
        assert len(self.simple_kde.lines) == 1
        # grouped_kde has hue.nunique() lines
        assert len(self.grouped_kde.lines) == self.hue.nunique()

    def test_kde_small_sample(self):
        # Should plot text explaining that the input data is singular
        plot = kde_plot(self.data[:1], label="small-sample")
        assert (
            plot.texts[0].get_text()
            == "[Could not plot kernel density estimate.\n Data is singular.]"
        )

    def test_kde_zero_variance(self):
        # Should plot text explaining that the input data is singular
        plot = kde_plot(Series([1] * 25), label="constant-sample")
        assert (
            plot.texts[0].get_text()
            == "[Could not plot kernel density estimate.\n Data is singular.]"
        )

    def test_simple_set_color(self):
        kde1_color = self.simple_kde.lines[0].get_color()

        _color = "violet"
        simple_kde_2 = kde_plot(self.data, label="simple", color=_color)
        kde2_color = simple_kde_2.lines[0].get_color()

        assert to_rgb(kde1_color) == pytest.approx(to_rgb("C0"))
        assert to_rgb(kde2_color) == pytest.approx(to_rgb(_color))

    def test_grouped_set_color(self):
        first_kde_color = self.grouped_kde.lines[0].get_color()

        _color = "aqua"
        grouped_kde_2 = kde_plot(
            self.data, hue=self.hue, label="simple", color=_color
        )
        first_kde2_color = grouped_kde_2.lines[0].get_color()

        assert to_rgb(first_kde_color) == pytest.approx(to_rgb("C0"))
        assert to_rgb(first_kde2_color) == pytest.approx(to_rgb(_color))
209 | # Plot should have 2 lines (input data & normal diagonal) 210 | assert len(self.plot.lines) == 2 211 | 212 | def test_default_colors(self): 213 | markers, reg_line = self.plot.lines 214 | 215 | assert markers.get_color() == "C0" 216 | assert reg_line.get_color() == "#222" 217 | 218 | def test_set_colors(self): 219 | fig = prob_plot( 220 | self.data, 221 | label="some-more-data", 222 | marker_color="yellow", 223 | line_color="salmon", 224 | ) 225 | markers, reg_line = fig.lines 226 | 227 | assert markers.get_color() == "yellow" 228 | assert reg_line.get_color() == "salmon" 229 | 230 | 231 | class TestBarplot: 232 | low_cardinality_data = Series(list("abcdeabcdabcaba")) 233 | high_cardinality_data = Series(list("aabbccddeeffgghhiijjkkllmmnn")) 234 | simple_bar = bar_plot(low_cardinality_data, label="abcde") 235 | truncated_bar = bar_plot(high_cardinality_data, label="a_to_n") 236 | 237 | def test_return_type(self): 238 | assert isinstance(self.simple_bar, Axes) 239 | assert isinstance(self.truncated_bar, Axes) 240 | 241 | def test_plot_title(self): 242 | assert self.simple_bar.get_title() == "Bar-plot of abcde" 243 | assert ( 244 | self.truncated_bar.get_title() 245 | == "Bar-plot of a_to_n (Top 10 of 14)" 246 | ) 247 | 248 | def test_axis_labels(self): 249 | assert self.simple_bar.get_xlabel() == "" 250 | assert self.simple_bar.get_ylabel() == "Count" 251 | 252 | def test_bar_truncation(self): 253 | # Check that only the top 10 categories are plotted 254 | assert len(self.truncated_bar.patches) == 10 # only 10 of 14 255 | 256 | def test_default_color(self): 257 | bar_color = self.simple_bar.patches[0].get_facecolor() 258 | assert to_rgb(bar_color) == pytest.approx(to_rgb("C0")) 259 | 260 | def test_set_color(self): 261 | fig = bar_plot(self.low_cardinality_data, label="test", color="pink") 262 | bar_color = fig.patches[0].get_facecolor() 263 | assert to_rgb(bar_color) == pytest.approx(to_rgb("pink")) 264 | 265 | 266 | class TestPlotvariable: 267 | def 
test_numeric_plots(self): 268 | data = range(25) 269 | numeric_var = Variable(data, name="numbers") 270 | name, graphs = _plot_variable( 271 | variable_data_hue_and_color=(numeric_var, data, None, "teal") 272 | ) 273 | 274 | assert name == numeric_var.name 275 | assert set(graphs.keys()) == {"box_plot", "kde_plot", "prob_plot"} 276 | for graph in graphs.values(): 277 | assert isinstance(graph, BytesIO) 278 | 279 | def test_categorical_plots(self): 280 | data = list("abcdeabcdabcaba") 281 | categorical_var = Variable(list("abcdeabcdabcaba"), name="letters") 282 | name, graphs = _plot_variable( 283 | variable_data_hue_and_color=(categorical_var, data, None, "navy") 284 | ) 285 | 286 | assert name == categorical_var.name 287 | assert set(graphs.keys()) == {"bar_plot"} 288 | for graph in graphs.values(): 289 | assert isinstance(graph, BytesIO) 290 | 291 | 292 | class TestPlotCorrelation: 293 | def test_with_insufficient_numeric_pairs(self): 294 | # Check None is returned if there are < 2 numeric pairs 295 | single_numeric = plot_correlation(zip(range(5), list("abcde"))) 296 | no_numeric = plot_correlation(list("abcde")) 297 | assert single_numeric is None 298 | assert no_numeric is None 299 | 300 | def test_with_few_numeric_pairs(self): 301 | corr_plot = plot_correlation([[1, 2, 3], [4, 5, 6], [7, 8, 9]]) 302 | assert isinstance(corr_plot, Axes) 303 | assert corr_plot.get_title() == "Pearson Correlation (Top 3)" 304 | assert len(corr_plot.patches) == 3 # 3 unique pairs 305 | 306 | def test_with_excess_numeric_pairs(self): 307 | # Should only plot the top 20 by magnitude 308 | corr_plot = plot_correlation([range(10), [4, 5, 6, 7, 8] * 2]) 309 | 310 | assert isinstance(corr_plot, Axes) 311 | assert corr_plot.get_title() == "Pearson Correlation (Top 20)" 312 | assert len(corr_plot.patches) == 20 # Top 20 of 45 pairs 313 | 314 | def test_default_colors(self): 315 | corr_plot = plot_correlation([[1, 2, 8], [3, 5, 5], [9, 8, 1]]) 316 | bars = corr_plot.patches 317 | # bars = 
class TestRegressionPlot:
    """Checks for :func:`eda_report.plotting._plot_regression` — returned
    pair/axes, title contents, axis labels, down-sampling, and colors.
    """

    # 60,000 rows — deliberately above the 50,000 sampling threshold
    data = DataFrame({"A": range(60000), "B": [1, 2, 3] * 20000})
    var_pair, reg_plot = _plot_regression(data_and_color=(data, "lime"))

    def test_return_type(self):
        assert self.var_pair == ("A", "B")
        assert isinstance(self.reg_plot, Axes)

    def test_plot_title(self):
        title = self.reg_plot.get_title()
        assert "Slope" in title
        assert "Intercept" in title
        assert "Correlation" in title

    def test_axis_labels(self):
        var1, var2 = self.var_pair
        assert self.reg_plot.get_xlabel() == var1
        assert self.reg_plot.get_ylabel() == var2

    def test_max_sample_size(self):
        # Check that a sample of size 50000 is taken for large datasets
        points = self.reg_plot.collections[0].get_offsets().data
        assert len(points) == 50000

    def test_plot_color(self):
        assert self.reg_plot.lines[0].get_color() == "#444"  # reg line
        assert to_rgb(  # markers
            self.reg_plot.collections[0].get_facecolor()
        ) == pytest.approx(to_rgb("lime"))
Dataset(range(50)) 373 | assert _plot_dataset(data, color="red") is None 374 | 375 | def test_with_numeric_pairs(self): 376 | data = Dataset([[1, 2, 3], [4, 5, 6], [7, 8, 9]]) 377 | graphs = _plot_dataset(data, color="green") 378 | 379 | assert set(graphs.keys()) == { 380 | "correlation_plot", 381 | "regression_plots", 382 | } 383 | corr_plot = graphs["correlation_plot"] 384 | reg_plots = list(graphs["regression_plots"].values()) 385 | 386 | for graph in reg_plots + [corr_plot]: 387 | assert isinstance(graph, BytesIO) 388 | 389 | def test_limiting_numeric_pairs(self): 390 | data = Dataset([range(12), [1, 2, 3, 4] * 3]) 391 | # `data`` has 12 numeric columns, resulting in up to 66 var_pairs. 392 | # Check if only limit = 20 are plotted. 393 | graphs = _plot_dataset(data, color="green") 394 | assert len(graphs["regression_plots"]) == 20 395 | -------------------------------------------------------------------------------- /eda_report/plotting.py: -------------------------------------------------------------------------------- 1 | from io import BytesIO 2 | from multiprocessing import get_context 3 | from typing import Dict, Iterable, Optional, Sequence, Tuple, Union 4 | 5 | import matplotlib as mpl 6 | import numpy as np 7 | from matplotlib.axes import Axes 8 | from matplotlib.colors import to_rgb 9 | from matplotlib.figure import Figure 10 | from scipy.stats import gaussian_kde, probplot 11 | from tqdm import tqdm 12 | 13 | from eda_report._validate import _validate_dataset, _validate_univariate_input 14 | from eda_report.bivariate import Dataset 15 | 16 | # Matplotlib configuration 17 | GENERAL_RC_PARAMS = { 18 | "axes.spines.top": False, 19 | "axes.spines.right": False, 20 | "axes.titlesize": 12, 21 | "axes.titleweight": 500, 22 | "figure.autolayout": True, 23 | "figure.figsize": (5.6, 3.5), 24 | "font.family": "serif", 25 | "savefig.dpi": 120, 26 | } 27 | # Customize box-plots 28 | BOXPLOT_RC_PARAMS = { 29 | **GENERAL_RC_PARAMS, 30 | "boxplot.medianprops.color": 
"black", 31 | "boxplot.patchartist": True, 32 | "boxplot.vertical": False, 33 | } 34 | # Customize correlation-plots 35 | CORRPLOT_RC_PARAMS = {**GENERAL_RC_PARAMS, "figure.figsize": (7, 6.3)} 36 | # Customize regression-plots 37 | REGPLOT_RC_PARAMS = {**GENERAL_RC_PARAMS, "figure.figsize": (5.2, 5)} 38 | 39 | 40 | @mpl.rc_context(GENERAL_RC_PARAMS) 41 | def _savefig(figure: Figure) -> BytesIO: 42 | """Saves the contents of a :class:`~matplotlib.figure.Figure` in PNG 43 | format, as bytes in a file-like object. This allows rapid in-memory 44 | access when compiling the report. 45 | 46 | Args: 47 | figure (matplotlib.figure.Figure): Graph content. 48 | 49 | Returns: 50 | io.BytesIO: A graph in PNG format as bytes. 51 | """ 52 | graph = BytesIO() 53 | figure.savefig(graph, format="png") 54 | return graph 55 | 56 | 57 | def _get_or_validate_axes(ax: Axes = None) -> Axes: 58 | """Create or validate an Axes instance. 59 | 60 | Args: 61 | ax (matplotlib.axes.Axes, optional): Axes instance. Defaults to None. 62 | 63 | Raises: 64 | TypeError: If `ax` is not an Axes instance. 65 | 66 | Returns: 67 | Axes: Axes instance. 68 | """ 69 | if ax is None: 70 | return Figure().subplots() 71 | elif isinstance(ax, Axes): 72 | return ax 73 | else: 74 | raise TypeError(f"Invalid input for 'ax': {type(ax)}") 75 | 76 | 77 | def _get_color_shades_of(color: str, num: int = None) -> Sequence: 78 | """Obtain an array with `num` shades of the specified `color`. 79 | 80 | Args: 81 | color (str): The desired color. 82 | num (int): Desired number of color shades. 83 | 84 | Returns: 85 | Sequence: Array of RGB colors. 86 | """ 87 | color_rgb = to_rgb(color) 88 | return np.linspace(color_rgb, (0.25, 0.25, 0.25), num=num) 89 | 90 | 91 | @mpl.rc_context(BOXPLOT_RC_PARAMS) 92 | def box_plot( 93 | data: Iterable, 94 | *, 95 | label: str, 96 | hue: Iterable = None, 97 | color: Union[str, Sequence] = None, 98 | ax: Axes = None, 99 | ) -> Axes: 100 | """Get a box-plot from numeric values. 
@mpl.rc_context(GENERAL_RC_PARAMS)
def kde_plot(
    data: Iterable,
    *,
    label: str,
    hue: Optional[Iterable] = None,
    color: Optional[Union[str, Sequence]] = None,
    ax: Optional[Axes] = None,
) -> Axes:
    """Get a kde-plot from numeric values.

    Args:
        data (Iterable): Values to plot.
        label (str): A name for the ``data``, shown in the title.
        hue (Iterable, optional): Values for grouping the ``data``. Defaults to
            None.
        color (Union[str, Sequence]): A valid matplotlib color specifier.
        ax (matplotlib.axes.Axes, optional): Axes instance. Defaults to None.

    Returns:
        matplotlib.axes.Axes: Matplotlib axes with the kde-plot.
    """
    # Keep the pre-dropna series: its notna() mask is reused below to align
    # `hue` with the retained `data` rows.
    original_data = _validate_univariate_input(data)
    data = original_data.dropna()
    ax = _get_or_validate_axes(ax)
    # gaussian_kde fails on fewer than 2 points or zero variance; show an
    # explanatory message on the axes instead of raising.
    if len(data) < 2 or np.isclose(data.std(), 0):
        msg = "[Could not plot kernel density estimate.\n Data is singular.]"
        ax.text(x=0.08, y=0.45, s=msg, color="#f72", size=14, weight=600)
        return ax

    # Evaluate all densities on one common grid spanning the data's range.
    eval_points = np.linspace(data.min(), data.max(), num=len(data))
    if hue is None:
        kernel = gaussian_kde(data)
        density = kernel(eval_points)
        ax.plot(eval_points, density, label=label, color=color)
        ax.fill_between(eval_points, density, alpha=0.3, color=color)
    else:
        # Align `hue` with `data` by dropping entries where `data` was NaN.
        hue = _validate_univariate_input(hue)[original_data.notna()]
        if color is None:
            # Default matplotlib color cycle, one color per hue level.
            colors = [f"C{idx}" for idx in range(hue.nunique())]
        else:
            colors = _get_color_shades_of(color, hue.nunique())

        # One density curve per hue level.
        for color, (key, series) in zip(colors, data.groupby(hue)):
            kernel = gaussian_kde(series)
            density = kernel(eval_points)
            ax.plot(eval_points, density, label=key, alpha=0.75, color=color)
            ax.fill_between(eval_points, density, alpha=0.25, color=color)

        # Only show a legend when the hue data is named.
        if hue.name is not None:
            ax.legend(title=f"{hue.name}".title())

    ax.set_xlabel(label)
    ax.set_ylim(0)
    ax.set_title(f"Density plot of {label}")
    return ax
@mpl.rc_context(GENERAL_RC_PARAMS)
def bar_plot(
    data: Iterable,
    *,
    label: str,
    color: Union[str, Sequence] = None,
    ax: Axes = None,
) -> Axes:
    """Get a bar-plot from a sequence of values.

    At most the 10 most frequent categories are drawn; the title notes
    when categories were left out.

    Args:
        data (Iterable): Values to plot.
        label (str): A name for the ``data``, shown in the title.
        color (Union[str, Sequence]): A valid matplotlib color specifier.
        ax (matplotlib.axes.Axes, optional): Axes instance. Defaults to None.

    Returns:
        matplotlib.axes.Axes: Matplotlib axes with the bar-plot.
    """
    series = _validate_univariate_input(data).dropna()
    ax = _get_or_validate_axes(ax)
    # Include no more than 10 of the most common values
    top_counts = series.value_counts().nlargest(10)
    bars = ax.bar(top_counts.index.map(str), top_counts, alpha=0.8, color=color)
    count_labels = [f"{count:,.0f}" for count in top_counts]
    ax.bar_label(bars, labels=count_labels, padding=2)
    num_unique = series.nunique()
    if num_unique > 10:
        # Flag that some categories were truncated from the plot.
        ax.set_title(f"Bar-plot of {label} (Top 10 of {num_unique})")
    else:
        ax.set_title(f"Bar-plot of {label}")
    ax.set_ylabel("Count")
    # Improve visibility for long labels
    ax.tick_params(axis="x", rotation=90)
    return ax
285 | """ 286 | variable, data, hue, color = variable_data_hue_and_color 287 | if variable.var_type == "numeric": 288 | plots = { 289 | "box_plot": box_plot( 290 | data=data, hue=hue, label=variable.name, color=color 291 | ), 292 | "kde_plot": kde_plot( 293 | data=data, hue=hue, label=variable.name, color=color 294 | ), 295 | "prob_plot": prob_plot( 296 | data, label=variable.name, marker_color=color 297 | ), 298 | } 299 | else: # {"boolean", "categorical", "datetime", "numeric (<=10 levels)"} 300 | plots = {"bar_plot": bar_plot(data, label=variable.name, color=color)} 301 | 302 | graph_images = {name: _savefig(ax.figure) for name, ax in plots.items()} 303 | return variable.name, graph_images 304 | 305 | 306 | @mpl.rc_context(CORRPLOT_RC_PARAMS) 307 | def plot_correlation( 308 | variables: Iterable, 309 | max_pairs: int = 20, 310 | color_pos: Union[str, Sequence] = "orangered", 311 | color_neg: Union[str, Sequence] = "steelblue", 312 | ax: Axes = None, 313 | ) -> Axes: 314 | """Create a bar chart showing the top ``max_pairs`` most correlated 315 | variables. Bars are annotated with variable pairs and their respective 316 | Pearson correlation coefficients. 317 | 318 | Args: 319 | variables (Iterable): 2-dimensional numeric data. 320 | max_pairs (int): The maximum number of numeric pairs to include in the 321 | plot. Defaults to 20. 322 | color_pos (Union[str, Sequence]): Color for positive correlation bars. 323 | Defaults to "orangered". 324 | color_neg (Union[str, Sequence]): Color for negative correlation bars. 325 | Defaults to "steelblue". 326 | ax (matplotlib.axes.Axes, optional): Axes instance. Defaults to None. 327 | 328 | Returns: 329 | matplotlib.axes.Axes: A bar-plot of correlation data. 330 | """ 331 | if not isinstance(variables, Dataset): 332 | variables = Dataset(variables) 333 | 334 | if variables._correlation_values is None: 335 | return None 336 | 337 | # Show at most `max_pairs` numeric pairs. 
338 | pairs_to_show = variables._correlation_values[:max_pairs] 339 | # Reverse items so largest values appear at the top. 340 | corr_data = dict(reversed(pairs_to_show)) 341 | labels = [" vs ".join(pair) for pair in corr_data.keys()] 342 | ax = _get_or_validate_axes(ax) 343 | ax.barh(labels, corr_data.values(), edgecolor="#222", linewidth=0.5) 344 | ax.set_xlim(-1.1, 1.1) 345 | ax.spines["left"].set_position("zero") # Place y-axis spine at x=0 346 | ax.yaxis.set_visible(False) # Hide y-axis labels 347 | 348 | for p, label in zip(ax.patches, labels): 349 | p.set_alpha(min(1, abs(p.get_width()) + 0.1)) 350 | if p.get_width() < 0: 351 | p.set_facecolor(color_neg) 352 | ax.text( 353 | p.get_x(), 354 | p.get_y() + p.get_height() / 2, 355 | f"{p.get_width():,.2f} ({label}) ", 356 | size=8, 357 | ha="right", 358 | va="center", 359 | ) 360 | else: 361 | p.set_facecolor(color_pos) 362 | ax.text( 363 | p.get_x(), 364 | p.get_y() + p.get_height() / 2, 365 | f" {p.get_width():,.2} ({label})", 366 | size=8, 367 | ha="left", 368 | va="center", 369 | ) 370 | 371 | ax.set_title(f"Pearson Correlation (Top {len(corr_data)})") 372 | return ax 373 | 374 | 375 | @mpl.rc_context(REGPLOT_RC_PARAMS) 376 | def regression_plot( 377 | x: Iterable, 378 | y: Iterable, 379 | labels: Tuple[str, str], 380 | marker_color: Union[str, Sequence] = "C0", 381 | line_color: Union[str, Sequence] = "#444", 382 | ax: Axes = None, 383 | ) -> Axes: 384 | """Get a regression-plot from the provided pair of numeric values. 385 | 386 | Args: 387 | x (Iterable): Numeric values. 388 | y (Iterable): Numeric values. 389 | labels (Tuple[str, str]): Names for `x` and `y` respectively, shown in 390 | axis labels. 391 | marker_color (Union[str, Sequence]): Color for the plotted points. 392 | Defaults to "C0". 393 | line_color (Union[str, Sequence]): Color for the line of best fit. 394 | Defaults to "#444". 395 | ax (matplotlib.axes.Axes, optional): Axes instance. Defaults to None. 
396 | 397 | Returns: 398 | matplotlib.axes.Axes: Matplotlib axes with the regression-plot. 399 | """ 400 | var1, var2 = labels 401 | data = _validate_dataset({var1: x, var2: y}).dropna() 402 | if len(data) > 50000: 403 | data = data.sample(50000) 404 | 405 | ax = _get_or_validate_axes(ax) 406 | x = data[var1] 407 | y = data[var2] 408 | slope, intercept = np.polyfit(x, y, deg=1) 409 | ax.scatter(x, y, s=40, alpha=0.7, color=marker_color, edgecolors="#444") 410 | reg_line_x = np.linspace(x.min(), x.max(), num=20) 411 | reg_line_y = slope * reg_line_x + intercept 412 | ax.plot(reg_line_x, reg_line_y, color=line_color, lw=2) 413 | ax.set_title( 414 | f"Slope: {slope:,.4f}\nIntercept: {intercept:,.4f}\n" 415 | + f"Correlation: {x.corr(y):.4f}", 416 | size=11, 417 | ) 418 | ax.set_xlabel(var1) 419 | ax.set_ylabel(var2) 420 | return ax 421 | 422 | 423 | def _plot_regression(data_and_color: Tuple) -> Tuple: 424 | """Helper function to plot regression-plots concurrently. 425 | 426 | Args: 427 | data_and_color (Tuple): Dataframe, and desired marker-color. 428 | 429 | Returns: 430 | Tuple: Names for the variable pair, and axes with the regression 431 | plot. 432 | """ 433 | data, color = data_and_color 434 | var1, var2 = data.columns 435 | ax = regression_plot( 436 | x=data[var1], y=data[var2], labels=(var1, var2), marker_color=color 437 | ) 438 | return (var1, var2), ax 439 | 440 | 441 | def _plot_dataset(variables: Dataset, color: str = None) -> Optional[Dict]: 442 | """Concurrently plot regression-plots in a multiprocessing Pool. 443 | 444 | Args: 445 | variables (Dataset): Bi-variate analysis results. 446 | color (str, optional): The color to apply to the graphs. 447 | 448 | Returns: 449 | Optional[Dict]: A dictionary with a correlation plot and regression 450 | plots. 451 | """ 452 | if variables._correlation_values is None: 453 | return None 454 | else: 455 | # Take the top 20 pairs by magnitude of correlation. 
456 | # 20 var_pairs ≈ 10+ pages in report document 457 | # 20 numeric columns == 190 var_pairs ≈ 95+ pages. 458 | pairs_to_include = [ 459 | pair for pair, _ in variables._correlation_values[:20] 460 | ] 461 | with get_context("spawn").Pool() as p: 462 | paired_data = [ 463 | (variables.data.loc[:, pair], color) 464 | for pair in pairs_to_include 465 | ] 466 | bivariate_regression_plots = dict( 467 | tqdm( 468 | # Plot in parallel processes 469 | p.imap(_plot_regression, paired_data), 470 | # Progress-bar options 471 | total=len(pairs_to_include), 472 | bar_format=( 473 | "{desc} {percentage:3.0f}%|{bar:35}| " 474 | "{n_fmt}/{total_fmt} pairs." 475 | ), 476 | desc="Bivariate analysis:", 477 | dynamic_ncols=True, 478 | ) 479 | ) 480 | return { 481 | "correlation_plot": _savefig(plot_correlation(variables).figure), 482 | "regression_plots": { 483 | var_pair: _savefig(plot.figure) 484 | for var_pair, plot in bivariate_regression_plots.items() 485 | }, 486 | } 487 | --------------------------------------------------------------------------------