├── tests ├── __init__.py ├── conftest.py ├── test_main_functions.py ├── test_file_loader.py ├── test_content_creation.py ├── test_cli.py ├── test_document_creation.py ├── test_analysis.py ├── test_data_validation.py ├── test_bivariate_analysis.py ├── test_univariate_analysis.py └── test_plotting_functions.py ├── runtime.txt ├── MANIFEST.in ├── eda_report ├── images │ ├── icon.png │ └── background.png ├── __main__.py ├── exceptions.py ├── _read_file.py ├── __init__.py ├── _cli.py ├── _content.py ├── _validate.py ├── _analysis.py ├── bivariate.py ├── gui.py ├── univariate.py ├── document.py └── plotting.py ├── docs ├── source │ ├── _static │ │ ├── haha.png │ │ ├── report.gif │ │ ├── report.png │ │ ├── bar-plot.png │ │ ├── box-plot.png │ │ ├── kde-plot.png │ │ ├── screencast.gif │ │ ├── screencast.png │ │ ├── bar-plot-dark.png │ │ ├── box-plot-dark.png │ │ ├── kde-plot-dark.png │ │ ├── correlation-plot.png │ │ ├── probability-plot.png │ │ ├── regression-plot.png │ │ ├── correlation-plot-dark.png │ │ ├── probability-plot-dark.png │ │ └── regression-plot-dark.png │ ├── eda_report.rst │ ├── eda_report.bivariate.rst │ ├── eda_report.gui.rst │ ├── eda_report.univariate.rst │ ├── eda_report.exceptions.rst │ ├── eda_report.document.rst │ ├── modules.rst │ ├── installation.rst │ ├── quickstart.rst │ ├── index.rst │ ├── conf.py │ ├── eda_report.plotting.rst │ └── examples.txt ├── Makefile ├── make.bat └── requirements.txt ├── pyproject.toml ├── .gitignore ├── .coveragerc ├── .readthedocs.yaml ├── requirements.txt ├── requirements-dev.txt ├── .github └── workflows │ ├── code-cov.yml │ ├── publish-pypi.yml │ └── unit-tests.yml ├── LICENSE ├── setup.cfg └── README.md /tests/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /runtime.txt: -------------------------------------------------------------------------------- 1 | python-3.12 2 | 
-------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include eda_report/images/*.png 2 | exclude tests/* -------------------------------------------------------------------------------- /eda_report/images/icon.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Tim-Abwao/eda-report/HEAD/eda_report/images/icon.png -------------------------------------------------------------------------------- /docs/source/_static/haha.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Tim-Abwao/eda-report/HEAD/docs/source/_static/haha.png -------------------------------------------------------------------------------- /docs/source/_static/report.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Tim-Abwao/eda-report/HEAD/docs/source/_static/report.gif -------------------------------------------------------------------------------- /docs/source/_static/report.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Tim-Abwao/eda-report/HEAD/docs/source/_static/report.png -------------------------------------------------------------------------------- /docs/source/_static/bar-plot.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Tim-Abwao/eda-report/HEAD/docs/source/_static/bar-plot.png -------------------------------------------------------------------------------- /docs/source/_static/box-plot.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Tim-Abwao/eda-report/HEAD/docs/source/_static/box-plot.png 
-------------------------------------------------------------------------------- /docs/source/_static/kde-plot.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Tim-Abwao/eda-report/HEAD/docs/source/_static/kde-plot.png -------------------------------------------------------------------------------- /eda_report/images/background.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Tim-Abwao/eda-report/HEAD/eda_report/images/background.png -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [build-system] 2 | requires = ["setuptools>=64", "wheel"] 3 | build-backend = "setuptools.build_meta" 4 | -------------------------------------------------------------------------------- /docs/source/_static/screencast.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Tim-Abwao/eda-report/HEAD/docs/source/_static/screencast.gif -------------------------------------------------------------------------------- /docs/source/_static/screencast.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Tim-Abwao/eda-report/HEAD/docs/source/_static/screencast.png -------------------------------------------------------------------------------- /eda_report/__main__.py: -------------------------------------------------------------------------------- 1 | from eda_report._cli import run_from_cli 2 | 3 | if __name__ == "__main__": 4 | run_from_cli() 5 | -------------------------------------------------------------------------------- /docs/source/_static/bar-plot-dark.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/Tim-Abwao/eda-report/HEAD/docs/source/_static/bar-plot-dark.png -------------------------------------------------------------------------------- /docs/source/_static/box-plot-dark.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Tim-Abwao/eda-report/HEAD/docs/source/_static/box-plot-dark.png -------------------------------------------------------------------------------- /docs/source/_static/kde-plot-dark.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Tim-Abwao/eda-report/HEAD/docs/source/_static/kde-plot-dark.png -------------------------------------------------------------------------------- /docs/source/_static/correlation-plot.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Tim-Abwao/eda-report/HEAD/docs/source/_static/correlation-plot.png -------------------------------------------------------------------------------- /docs/source/_static/probability-plot.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Tim-Abwao/eda-report/HEAD/docs/source/_static/probability-plot.png -------------------------------------------------------------------------------- /docs/source/_static/regression-plot.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Tim-Abwao/eda-report/HEAD/docs/source/_static/regression-plot.png -------------------------------------------------------------------------------- /docs/source/_static/correlation-plot-dark.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Tim-Abwao/eda-report/HEAD/docs/source/_static/correlation-plot-dark.png 
-------------------------------------------------------------------------------- /docs/source/_static/probability-plot-dark.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Tim-Abwao/eda-report/HEAD/docs/source/_static/probability-plot-dark.png -------------------------------------------------------------------------------- /docs/source/_static/regression-plot-dark.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Tim-Abwao/eda-report/HEAD/docs/source/_static/regression-plot-dark.png -------------------------------------------------------------------------------- /docs/source/eda_report.rst: -------------------------------------------------------------------------------- 1 | eda_report 2 | ---------- 3 | 4 | .. automodule:: eda_report 5 | :members: 6 | :undoc-members: 7 | :show-inheritance: -------------------------------------------------------------------------------- /docs/source/eda_report.bivariate.rst: -------------------------------------------------------------------------------- 1 | eda\_report.bivariate 2 | ======================== 3 | 4 | .. automodule:: eda_report.bivariate 5 | :members: 6 | :undoc-members: 7 | -------------------------------------------------------------------------------- /docs/source/eda_report.gui.rst: -------------------------------------------------------------------------------- 1 | eda\_report.gui 2 | =============== 3 | 4 | .. automodule:: eda_report.gui 5 | :members: 6 | :undoc-members: 7 | :show-inheritance: 8 | -------------------------------------------------------------------------------- /docs/source/eda_report.univariate.rst: -------------------------------------------------------------------------------- 1 | eda\_report.univariate 2 | ====================== 3 | 4 | .. 
automodule:: eda_report.univariate 5 | :members: 6 | :undoc-members: 7 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | venv 2 | .ipynb_checkpoints 3 | __pycache__ 4 | *.docx 5 | .~lock* 6 | dist/ 7 | build/ 8 | eda_report.egg-info 9 | .coverage* 10 | !.coveragerc 11 | htmlcov/ 12 | -------------------------------------------------------------------------------- /docs/source/eda_report.exceptions.rst: -------------------------------------------------------------------------------- 1 | eda\_report.exceptions 2 | ====================== 3 | 4 | .. automodule:: eda_report.exceptions 5 | :members: 6 | :undoc-members: 7 | :show-inheritance: 8 | -------------------------------------------------------------------------------- /docs/source/eda_report.document.rst: -------------------------------------------------------------------------------- 1 | eda\_report.document 2 | ==================== 3 | 4 | .. automodule:: eda_report.document 5 | :members: 6 | :inherited-members: 7 | :undoc-members: 8 | :show-inheritance: 9 | -------------------------------------------------------------------------------- /.coveragerc: -------------------------------------------------------------------------------- 1 | [run] 2 | branch = true 3 | concurrency = multiprocessing 4 | omit = 5 | eda_report/__main__.py 6 | eda_report/gui.py 7 | parallel = true 8 | sigterm = true 9 | source = eda_report 10 | 11 | [report] 12 | fail_under = 85 13 | precision = 2 14 | -------------------------------------------------------------------------------- /docs/source/modules.rst: -------------------------------------------------------------------------------- 1 | API Reference 2 | ============= 3 | 4 | .. 
toctree:: 5 | :maxdepth: 3 6 | 7 | eda_report 8 | eda_report.bivariate 9 | eda_report.document 10 | eda_report.exceptions 11 | eda_report.gui 12 | eda_report.plotting 13 | eda_report.univariate -------------------------------------------------------------------------------- /.readthedocs.yaml: -------------------------------------------------------------------------------- 1 | # Read the Docs configuration file 2 | # See https://docs.readthedocs.io/en/stable/config-file/v2.html for details 3 | 4 | version: 2 5 | build: 6 | os: ubuntu-20.04 7 | tools: 8 | python: "3.11" 9 | 10 | # Build documentation in the docs/ directory with Sphinx 11 | sphinx: 12 | configuration: docs/source/conf.py 13 | formats: 14 | [htmlzip, pdf] 15 | python: 16 | install: 17 | - requirements: docs/requirements.txt -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | contourpy==1.3.1 2 | cycler==0.12.1 3 | et_xmlfile==2.0.0 4 | fonttools==4.55.3 5 | kiwisolver==1.4.8 6 | lxml==5.3.0 7 | matplotlib==3.10.0 8 | numpy==2.2.1 9 | openpyxl==3.1.5 10 | packaging==24.2 11 | pandas==2.2.3 12 | pillow==11.0.0 13 | pyparsing==3.2.0 14 | python-dateutil==2.9.0.post0 15 | python-docx==1.1.2 16 | pytz==2024.2 17 | scipy==1.14.1 18 | six==1.17.0 19 | tqdm==4.67.1 20 | typing_extensions==4.12.2 21 | tzdata==2024.2 22 | -------------------------------------------------------------------------------- /tests/conftest.py: -------------------------------------------------------------------------------- 1 | from shutil import rmtree 2 | 3 | import pytest 4 | from pandas import DataFrame 5 | 6 | 7 | @pytest.fixture(scope="session") 8 | def temp_data_dir(tmp_path_factory): 9 | temp_dir = tmp_path_factory.mktemp("data") 10 | sample_data = DataFrame([[1, 2, 3], [4, 5, 6]], columns=list("ABC")) 11 | sample_data.to_csv(temp_dir / "data.csv", index=False) 12 | sample_data.to_excel(temp_dir 
/ "data.xlsx", index=False) 13 | yield temp_dir 14 | rmtree(temp_dir) 15 | -------------------------------------------------------------------------------- /requirements-dev.txt: -------------------------------------------------------------------------------- 1 | contourpy==1.3.1 2 | coverage==7.6.10 3 | cycler==0.12.1 4 | et_xmlfile==2.0.0 5 | flake8==7.1.1 6 | fonttools==4.55.3 7 | iniconfig==2.0.0 8 | kiwisolver==1.4.8 9 | lxml==5.3.0 10 | matplotlib==3.10.0 11 | mccabe==0.7.0 12 | numpy==2.2.1 13 | openpyxl==3.1.5 14 | packaging==24.2 15 | pandas==2.2.3 16 | pillow==11.0.0 17 | pluggy==1.5.0 18 | pycodestyle==2.12.1 19 | pyflakes==3.2.0 20 | pyparsing==3.2.0 21 | pytest==8.3.4 22 | python-dateutil==2.9.0.post0 23 | python-docx==1.1.2 24 | pytz==2024.2 25 | scipy==1.14.1 26 | six==1.17.0 27 | tqdm==4.67.1 28 | typing_extensions==4.12.2 29 | tzdata==2024.2 30 | -------------------------------------------------------------------------------- /docs/Makefile: -------------------------------------------------------------------------------- 1 | # Minimal makefile for Sphinx documentation 2 | # 3 | 4 | # You can set these variables from the command line, and also 5 | # from the environment for the first two. 6 | SPHINXOPTS ?= 7 | SPHINXBUILD ?= sphinx-build 8 | SOURCEDIR = source 9 | BUILDDIR = build 10 | 11 | # Put it first so that "make" without argument is like "make help". 12 | help: 13 | @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 14 | 15 | .PHONY: help Makefile 16 | 17 | # Catch-all target: route all unknown targets to Sphinx using the new 18 | # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). 
19 | %: Makefile 20 | @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 21 | -------------------------------------------------------------------------------- /tests/test_main_functions.py: -------------------------------------------------------------------------------- 1 | from pandas import DataFrame 2 | 3 | from eda_report import get_word_report, summarize 4 | from eda_report.bivariate import Dataset 5 | from eda_report.document import ReportDocument 6 | from eda_report.univariate import Variable 7 | 8 | sample_data = DataFrame( 9 | { 10 | "A": range(50), 11 | "B": list("abcdef") * 8 + ["a"] * 2, 12 | "C": [True, False] * 24 + [True] * 2, 13 | "D": [1, 3, 5, 7, 9, 11, 13] * 7 + [17], 14 | } 15 | ) 16 | 17 | 18 | def test_get_word_report_function(): 19 | report = get_word_report(sample_data) 20 | assert isinstance(report, ReportDocument) 21 | 22 | 23 | def test_summarize_function(): 24 | summary_1D = summarize(range(25)) 25 | assert isinstance(summary_1D, Variable) 26 | 27 | summary_2D = summarize(sample_data) 28 | assert isinstance(summary_2D, Dataset) 29 | -------------------------------------------------------------------------------- /docs/source/installation.rst: -------------------------------------------------------------------------------- 1 | Installation 2 | ------------ 3 | 4 | .. important:: 5 | Only **Python3.10 to 3.12** are currently supported. 6 | 7 | .. tip:: 8 | Consider using a `virtual environment`_. Virtual environments are a great way to ensure that you install the right versions of dependencies, while avoiding breaking other Python packages in your system. 9 | 10 | You can install ``eda-report`` from the `Python Package Index`_ using ``pip``:: 11 | 12 | $ pip install eda-report 13 | 14 | You can also install the latest stable version right from the `GitHub repository`_ using:: 15 | 16 | $ pip install https://github.com/tim-abwao/eda-report/archive/main.tar.gz 17 | 18 | 19 | .. 
_virtual environment: https://docs.python.org/3/tutorial/venv.html#virtual-environments-and-packages 20 | .. _Python Package Index: https://pypi.org/project/eda-report/ 21 | .. _GitHub repository: https://github.com/Tim-Abwao/eda-report 22 | -------------------------------------------------------------------------------- /.github/workflows/code-cov.yml: -------------------------------------------------------------------------------- 1 | name: Codecov 2 | on: 3 | push: 4 | branches: [main, dev] 5 | pull_request: 6 | branches: [main, dev] 7 | 8 | jobs: 9 | run: 10 | runs-on: ${{ matrix.os }} 11 | strategy: 12 | matrix: 13 | os: [ubuntu-latest, macos-latest, windows-latest] 14 | env: 15 | OS: ${{ matrix.os }} 16 | PYTHON: "3.12" 17 | steps: 18 | - uses: actions/checkout@v4 19 | - name: Setup Python 20 | uses: actions/setup-python@v5 21 | with: 22 | python-version: "3.12" 23 | - name: Generate coverage report 24 | run: | 25 | pip install -r requirements-dev.txt 26 | coverage run -m pytest 27 | coverage combine 28 | - name: Upload coverage to Codecov 29 | uses: codecov/codecov-action@v4 30 | with: 31 | env_vars: OS,PYTHON 32 | flags: unittests 33 | name: codecov-umbrella 34 | verbose: true 35 | -------------------------------------------------------------------------------- /docs/make.bat: -------------------------------------------------------------------------------- 1 | @ECHO OFF 2 | 3 | pushd %~dp0 4 | 5 | REM Command file for Sphinx documentation 6 | 7 | if "%SPHINXBUILD%" == "" ( 8 | set SPHINXBUILD=sphinx-build 9 | ) 10 | set SOURCEDIR=source 11 | set BUILDDIR=build 12 | 13 | if "%1" == "" goto help 14 | 15 | %SPHINXBUILD% >NUL 2>NUL 16 | if errorlevel 9009 ( 17 | echo. 18 | echo.The 'sphinx-build' command was not found. Make sure you have Sphinx 19 | echo.installed, then set the SPHINXBUILD environment variable to point 20 | echo.to the full path of the 'sphinx-build' executable. Alternatively you 21 | echo.may add the Sphinx directory to PATH. 22 | echo. 
23 | echo.If you don't have Sphinx installed, grab it from 24 | echo.http://sphinx-doc.org/ 25 | exit /b 1 26 | ) 27 | 28 | %SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% 29 | goto end 30 | 31 | :help 32 | %SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% 33 | 34 | :end 35 | popd 36 | -------------------------------------------------------------------------------- /eda_report/exceptions.py: -------------------------------------------------------------------------------- 1 | class Error(Exception): 2 | """The base class for exceptions in this package.""" 3 | 4 | pass 5 | 6 | 7 | class InputError(Error): 8 | """*Exception* raised when a given input object is *not of the expected 9 | type* or is otherwise *invalid*. 10 | 11 | In most cases, an attempt is made to cast the erroneous input into the 12 | proper type, and this *Exception* is raised if it fails. 13 | 14 | Args: 15 | message (str): A brief description of the mishap detected. 16 | """ 17 | 18 | def __init__(self, message: str) -> None: 19 | self.message = message 20 | 21 | 22 | class EmptyDataError(InputError): 23 | """*Exception* raised when an iterable input object has length zero or has 24 | no more items to yield. 
25 | """ 26 | 27 | pass 28 | 29 | 30 | class GroupbyVariableError(InputError): 31 | """*Exception* raised when the specified group-by variable is invalid.""" 32 | 33 | pass 34 | -------------------------------------------------------------------------------- /docs/requirements.txt: -------------------------------------------------------------------------------- 1 | alabaster==1.0.0 2 | babel==2.16.0 3 | beautifulsoup4==4.12.3 4 | certifi==2024.12.14 5 | charset-normalizer==3.4.1 6 | contourpy==1.3.1 7 | cycler==0.12.1 8 | docutils==0.21.2 9 | fonttools==4.55.3 10 | furo==2024.8.6 11 | idna==3.10 12 | imagesize==1.4.1 13 | Jinja2==3.1.5 14 | kiwisolver==1.4.8 15 | lxml==5.3.0 16 | MarkupSafe==3.0.2 17 | matplotlib==3.10.0 18 | numpy==2.2.1 19 | packaging==24.2 20 | pandas==2.2.3 21 | pillow==11.0.0 22 | Pygments==2.18.0 23 | pyparsing==3.2.0 24 | python-dateutil==2.9.0.post0 25 | python-docx==1.1.2 26 | pytz==2024.2 27 | requests==2.32.3 28 | scipy==1.14.1 29 | six==1.17.0 30 | snowballstemmer==2.2.0 31 | soupsieve==2.6 32 | Sphinx==8.1.3 33 | sphinx-basic-ng==1.0.0b2 34 | sphinxcontrib-applehelp==2.0.0 35 | sphinxcontrib-devhelp==2.0.0 36 | sphinxcontrib-htmlhelp==2.1.0 37 | sphinxcontrib-jsmath==1.0.1 38 | sphinxcontrib-qthelp==2.0.0 39 | sphinxcontrib-serializinghtml==2.0.0 40 | tqdm==4.67.1 41 | typing_extensions==4.12.2 42 | tzdata==2024.2 43 | urllib3==2.3.0 44 | -------------------------------------------------------------------------------- /tests/test_file_loader.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | from pandas import DataFrame 3 | 4 | from eda_report._read_file import df_from_file 5 | from eda_report.exceptions import InputError 6 | 7 | 8 | class TestFileLoader: 9 | data = DataFrame([[1, 2, 3], [4, 5, 6]], columns=list("ABC")) 10 | 11 | def test_csv_file_load(self, temp_data_dir): 12 | # Check that a valid csv file is read as a DataFrame 13 | assert df_from_file(temp_data_dir / 
"data.csv").equals(self.data) 14 | 15 | def test_excel_file_load(self, temp_data_dir): 16 | # Check that a valid excel file is read as a DataFrame 17 | assert df_from_file(temp_data_dir / "data.xlsx").equals(self.data) 18 | 19 | def test_invalid_file(self): 20 | # Check that an invalid file format/extension raises an InputError 21 | with pytest.raises(InputError) as error: 22 | df_from_file("data.some_extension") 23 | # Check that the error message is as expected 24 | assert "Invalid input file: 'data.some_extension'" in str(error.value) 25 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2021-2025 Abwao 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /.github/workflows/publish-pypi.yml: -------------------------------------------------------------------------------- 1 | # This workflow will upload a Python Package using Twine when a release is created 2 | # For more information see: https://help.github.com/en/actions/language-and-framework-guides/using-python-with-github-actions#publishing-to-package-registries 3 | 4 | # This workflow uses actions that are not certified by GitHub. 5 | # They are provided by a third-party and are governed by 6 | # separate terms of service, privacy policy, and support 7 | # documentation. 8 | 9 | name: Upload Python Package 10 | 11 | on: 12 | release: 13 | types: [published] 14 | 15 | jobs: 16 | deploy: 17 | runs-on: ubuntu-latest 18 | steps: 19 | - uses: actions/checkout@v4 20 | - name: Set up Python 21 | uses: actions/setup-python@v5 22 | with: 23 | python-version: '3.12' 24 | - name: Install dependencies 25 | run: | 26 | python -m pip install --upgrade pip 27 | pip install build 28 | - name: Build package 29 | run: python -m build 30 | - name: Publish package 31 | uses: pypa/gh-action-pypi-publish@ec4db0b4ddc65acdf4bff5fa45ac92d78b56bdf0 32 | with: 33 | user: __token__ 34 | password: ${{ secrets.PYPI_API_TOKEN }} 35 | -------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- 1 | [metadata] 2 | name = eda_report 3 | version = attr: eda_report.__version__ 4 | description = Automate exploratory data analysis and reporting. 
5 | long_description = file: README.md 6 | long_description_content_type = text/markdown 7 | keywords = eda exploratory data analysis report 8 | author = Abwao 9 | author_email = abwaomusungu@gmail.com 10 | url = https://eda-report.readthedocs.io/ 11 | license = MIT 12 | classifiers = 13 | Development Status :: 4 - Beta 14 | Intended Audience :: Science/Research 15 | License :: OSI Approved :: MIT License 16 | Operating System :: OS Independent 17 | Programming Language :: Python :: 3.12 18 | 19 | project_urls = 20 | Source Code = https://github.com/Tim-Abwao/eda-report 21 | 22 | [options] 23 | packages = find: 24 | install_requires = 25 | matplotlib>=3.10.0 26 | openpyxl>=3.1.5 27 | pandas>=2.2.3 28 | python-docx>=1.1.2 29 | scipy>=1.14.1 30 | tqdm>=4.67.1 31 | include_package_data = True 32 | python_requires = >=3.10 33 | 34 | [options.entry_points] 35 | console_scripts = 36 | eda-report = eda_report._cli:run_from_cli 37 | 38 | [options.extras_require] 39 | dev = 40 | black>=24.10.0 41 | coverage>=7.6.10 42 | flake8>=7.1.1 43 | pytest>=8.3.4 44 | 45 | [options.package_data] 46 | eda_report = eda_report/images/*.png 47 | -------------------------------------------------------------------------------- /eda_report/_read_file.py: -------------------------------------------------------------------------------- 1 | from pathlib import Path 2 | from typing import Union 3 | 4 | import pandas as pd 5 | 6 | from eda_report.exceptions import InputError 7 | 8 | 9 | def df_from_file(filepath: Union[str, Path]) -> pd.DataFrame: 10 | """Reads a file, and loads its contents as a :class:`~pandas.DataFrame`. 11 | 12 | File formats are currently restricted to *csv* and *excel*, since these 13 | are the most often used to store data. 14 | 15 | This is basically a wrapper around ``pandas'`` input functions: 16 | 17 | * :func:`pandas.read_csv` 18 | * :func:`pandas.read_excel` 19 | 20 | 21 | Args: 22 | filepath (Union[str, Path]): The path to a file. 
23 | 24 | Raises: 25 | InputError: If the supplied filepath is invalid, for instance if the 26 | file is of an incorrect format or does not exist. 27 | 28 | Returns: 29 | pandas.DataFrame: The specified file's contents. 30 | """ 31 | file = Path(filepath) 32 | 33 | if file.suffix == ".csv": 34 | return pd.read_csv(file) 35 | elif file.suffix == ".xlsx": 36 | return pd.read_excel(file, engine="openpyxl") 37 | else: 38 | raise InputError( 39 | f"Invalid input file: '{filepath}'. Expected a CSV or Excel file." 40 | ) 41 | -------------------------------------------------------------------------------- /.github/workflows/unit-tests.yml: -------------------------------------------------------------------------------- 1 | # This workflow will install Python dependencies, run tests and lint with a variety of Python versions 2 | # For more information see: https://help.github.com/actions/language-and-framework-guides/using-python-with-github-actions 3 | 4 | name: Python 3.10-3.12 5 | 6 | on: 7 | push: 8 | branches: [main, dev] 9 | pull_request: 10 | branches: [main, dev] 11 | 12 | jobs: 13 | build: 14 | runs-on: ubuntu-latest 15 | strategy: 16 | matrix: 17 | python-version: ["3.10", "3.11", "3.12"] 18 | 19 | steps: 20 | - uses: actions/checkout@v4 21 | - name: Set up Python ${{ matrix.python-version }} 22 | uses: actions/setup-python@v5 23 | with: 24 | python-version: ${{ matrix.python-version }} 25 | - name: Install dependencies 26 | run: | 27 | pip install -U pip 28 | pip install -r requirements-dev.txt 29 | - name: Lint with flake8 30 | run: | 31 | # stop the build if there are Python syntax errors or undefined names 32 | flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics 33 | # exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide 34 | flake8 . 
--count --exit-zero --max-complexity=10 --max-line-length=127 --statistics 35 | - name: Test with pytest 36 | run: | 37 | pytest tests/ 38 | -------------------------------------------------------------------------------- /tests/test_content_creation.py: -------------------------------------------------------------------------------- 1 | from pandas import DataFrame 2 | 3 | from eda_report._analysis import _AnalysisResult 4 | from eda_report._content import _ReportContent 5 | 6 | data = DataFrame( 7 | {"A": range(48), "B": list(range(16)) * 3, "C": list("abcd") * 12} 8 | ) 9 | 10 | 11 | class TestReportContent: 12 | content = _ReportContent(data, title="Some Title") 13 | 14 | def test_general_attributes(self): 15 | assert isinstance(self.content, _AnalysisResult) 16 | assert self.content.GRAPH_COLOR == "cyan" 17 | assert self.content.TITLE == "Some Title" 18 | assert self.content.GROUPBY_DATA is None 19 | 20 | def test_intro(self): 21 | assert self.content.intro_text == ( 22 | "The dataset consists of 48 rows (observations) and 3 columns " 23 | "(features), 2 of which are numeric." 24 | ) 25 | 26 | def test_variable_descriptions(self): 27 | assert self.content.variable_descriptions == { 28 | "A": ( 29 | "A is a numeric variable with 48 unique values." 30 | " None of its values are missing." 31 | ), 32 | "B": ( 33 | "B is a numeric variable with 16 unique values." 34 | " None of its values are missing." 35 | ), 36 | "C": ( 37 | "C is a categorical variable with 4 unique values." 38 | " None of its values are missing." 39 | ), 40 | } 41 | 42 | def test_bivariate_summaries(self): 43 | assert self.content.bivariate_summaries == { 44 | ("A", "B"): "A and B have weak positive correlation (0.33)." 
45 | } 46 | 47 | 48 | def test_limiting_bivariate_summaries(): 49 | content = _ReportContent([range(12), [1, 2, 3, 4] * 3]) 50 | # content has 66 var_pairs (66 possible pairs from 12 numeric cols) 51 | # but the limit for summaries is 20 52 | assert len(content.bivariate_summaries) == 20 53 | -------------------------------------------------------------------------------- /docs/source/quickstart.rst: -------------------------------------------------------------------------------- 1 | Quickstart 2 | ========== 3 | 4 | Using the Graphical User Interface 5 | ---------------------------------- 6 | 7 | The command ``eda-report`` launches a graphical window to help select a *csv* or *excel* file to analyze:: 8 | 9 | $ eda-report 10 | 11 | .. figure:: _static/screencast.* 12 | :alt: an image of the graphical user interface 13 | 14 | A ``tkinter``-based graphical user interface to the application 15 | 16 | You will be prompted to enter your desired *title*, *groupby/target variable*, *graph color* & *output file-name*. Afterwards, a report is generated, as specified, from the contents of the selected file. 17 | 18 | .. hint:: 19 | For help with `Tk` - related issues, consider visiting `TkDocs`_. 20 | 21 | .. _`TkDocs`: https://tkdocs.com/index.html 22 | 23 | Using the Command Line Interface 24 | -------------------------------- 25 | 26 | You can specify an input file and an output file-name:: 27 | 28 | $ eda-report -i data.csv -o some_name.docx 29 | 30 | .. literalinclude:: examples.txt 31 | :lines: 106-128 32 | 33 | From an Interactive Session 34 | --------------------------- 35 | 36 | You can use the :func:`~eda_report.get_word_report` function to generate reports: 37 | 38 | .. literalinclude:: examples.txt 39 | :lines: 136-142 40 | 41 | You can use the :func:`~eda_report.summarize` function to analyze datasets: 42 | 43 | .. literalinclude:: examples.txt 44 | :lines: 146-171 45 | .. 
literalinclude:: examples.txt 46 | :lines: 172-195 47 | 48 | You can plot several statistical graphs (see :ref:`plotting-examples`): 49 | 50 | >>> import eda_report.plotting as ep 51 | >>> ax = ep.plot_correlation(mpg_data) 52 | >>> ax.figure.savefig("correlation-plot.png") 53 | 54 | .. image:: _static/correlation-plot.png 55 | :width: 80% 56 | :align: center 57 | :alt: a correlation-plot 58 | :class: only-light 59 | 60 | .. image:: _static/correlation-plot-dark.png 61 | :width: 80% 62 | :align: center 63 | :alt: a correlation-plot 64 | :class: only-dark 65 | -------------------------------------------------------------------------------- /docs/source/index.rst: -------------------------------------------------------------------------------- 1 | ``eda-report`` User Guide 2 | ========================= 3 | 4 | Speed up the `exploratory data analysis`_ and reporting process. Automatically analyze a dataset, and get: 5 | 6 | 1. Statistical properties 7 | ------------------------- 8 | 9 | Descriptive statistics, bivariate analysis, tests for normality and more: 10 | 11 | .. literalinclude:: examples.txt 12 | :lines: 146-171 13 | 14 | 2. Revealing visualizations 15 | --------------------------- 16 | 17 | - *Box-plots*, *kde-plots*, *normal-probability-plots*, *scatter-plots* and a *correlation bar-chart* for numeric variables. 18 | - *Bar-plots* for categorical variables. 19 | 20 | >>> import eda_report.plotting as ep 21 | >>> ax = ep.regression_plot(mpg_data["acceleration"], mpg_data["horsepower"], 22 | ... labels=("Acceleration", "Horsepower")) 23 | >>> ax.figure.savefig("regression-plot.png") 24 | 25 | .. image:: _static/regression-plot.png 26 | :width: 80% 27 | :align: center 28 | :alt: a regression-plot 29 | :class: only-light 30 | 31 | .. image:: _static/regression-plot-dark.png 32 | :width: 80% 33 | :align: center 34 | :alt: a regression-plot 35 | :class: only-dark 36 | 37 | 3. 
A report in *Word* (.docx) format 38 | ------------------------------------ 39 | 40 | An exploratory data analysis report document complete with variable descriptions, summary statistics, statistical plots, contingency tables and more: 41 | 42 | .. literalinclude:: examples.txt 43 | :lines: 136-142 44 | 45 | .. figure:: _static/report.* 46 | :alt: iris dataset report animation 47 | 48 | A report generated from the *iris dataset*. 49 | 50 | .. image:: https://mybinder.org/badge_logo.svg 51 | :target: https://mybinder.org/v2/gh/Tim-Abwao/eda-report/HEAD?filepath=eda-report-basics.ipynb 52 | 53 | .. _exploratory data analysis: https://en.wikipedia.org/wiki/Exploratory_data_analysis 54 | 55 | .. toctree:: 56 | :maxdepth: 2 57 | 58 | installation 59 | quickstart 60 | modules 61 | 62 | Indices and tables 63 | ================== 64 | 65 | * :ref:`genindex` 66 | * :ref:`modindex` 67 | -------------------------------------------------------------------------------- /tests/test_cli.py: -------------------------------------------------------------------------------- 1 | import sys 2 | from pathlib import Path 3 | 4 | from eda_report._cli import run_from_cli 5 | from eda_report.gui import EDAGUI 6 | 7 | 8 | class TestCLIArgumentParsing: 9 | def test_with_all_args(self, temp_data_dir, monkeypatch): 10 | # Simulate supplying all args 11 | monkeypatch.setattr( 12 | sys, 13 | "argv", 14 | [ 15 | "eda-report", 16 | "-i", 17 | f"{temp_data_dir / 'data.csv'}", 18 | "-o", 19 | f"{temp_data_dir}/cli-test-1.docx", 20 | "-t", 21 | "CLI Test", 22 | "-c", 23 | "teal", 24 | "-g", 25 | "A", 26 | ], 27 | ) 28 | run_from_cli() 29 | expected_output = temp_data_dir / "cli-test-1.docx" 30 | assert expected_output.is_file() 31 | 32 | def test_with_only_input_file(self, temp_data_dir, monkeypatch): 33 | # Supply only the input file; it has no default.
34 | monkeypatch.setattr( 35 | sys, "argv", ["eda-report", "-i", f"{temp_data_dir / 'data.xlsx'}"] 36 | ) 37 | run_from_cli() 38 | expected_output = Path("eda-report.docx") 39 | assert expected_output.is_file() 40 | 41 | Path("eda-report.docx").unlink() # Remove resultant report 42 | 43 | def test_without_optional_args(self, monkeypatch, capsys): 44 | # Simulate launching the GUI 45 | def mock_gui_init(gui): 46 | """Simulate GUI initialization.""" 47 | pass 48 | 49 | def mock_gui_mainloop(gui): 50 | """Simulate running GUI.""" 51 | print("Graphical user interface running in Tk mainloop.") 52 | 53 | monkeypatch.setattr(EDAGUI, "__init__", mock_gui_init) 54 | monkeypatch.setattr(EDAGUI, "mainloop", mock_gui_mainloop) 55 | 56 | # Simulate running with no args 57 | monkeypatch.setattr(sys, "argv", ["eda-report"]) 58 | run_from_cli() 59 | 60 | captured = capsys.readouterr() 61 | assert ( 62 | "Graphical user interface running in Tk mainloop." in captured.out 63 | ) 64 | -------------------------------------------------------------------------------- /eda_report/__init__.py: -------------------------------------------------------------------------------- 1 | from collections.abc import Iterable 2 | from typing import Union 3 | 4 | from eda_report._validate import _validate_dataset 5 | from eda_report.bivariate import Dataset 6 | from eda_report.document import ReportDocument 7 | from eda_report.univariate import Variable 8 | 9 | __version__ = "2.8.2" 10 | 11 | 12 | def get_word_report( 13 | data: Iterable, 14 | *, 15 | title: str = "Exploratory Data Analysis Report", 16 | graph_color: str = "cyan", 17 | groupby_variable: Union[str, int] = None, 18 | output_filename: str = "eda-report.docx", 19 | table_style: str = "Table Grid", 20 | ) -> ReportDocument: 21 | """Analyze `data`, and generate a report document in *Word* (*.docx*) 22 | format. 23 | 24 | Args: 25 | data (Iterable): The data to analyze. 26 | title (str, optional): The title to assign the report. 
Defaults to 27 | "Exploratory Data Analysis Report". 28 | graph_color (str, optional): The color to apply to the graphs. 29 | Defaults to "cyan". 30 | groupby_variable (Union[str, int], optional): The label/index for the 31 | column to use to group values. Defaults to None. 32 | output_filename (str, optional): The name/path to save the report 33 | document. Defaults to "eda-report.docx". 34 | table_style (str, optional): The style to apply to the tables created. 35 | Defaults to "Table Grid". 36 | 37 | Returns: 38 | ReportDocument: Document object with analysis results. 39 | 40 | Example: 41 | .. literalinclude:: examples.txt 42 | :lines: 136-142 43 | """ 44 | return ReportDocument( 45 | data, 46 | title=title, 47 | graph_color=graph_color, 48 | output_filename=output_filename, 49 | groupby_variable=groupby_variable, 50 | table_style=table_style, 51 | ) 52 | 53 | 54 | def summarize(data: Iterable) -> Union[Variable, Dataset]: 55 | """Get summary statistics for the supplied data. 56 | 57 | Args: 58 | data (Iterable): The data to analyze. 59 | 60 | Returns: 61 | Union[Variable, Dataset]: Analysis results. 62 | 63 | Example: 64 | .. 
literalinclude:: examples.txt 65 | :lines: 172-195 66 | """ 67 | data = _validate_dataset(data) 68 | if data.shape[1] == 1: 69 | return Variable(data.squeeze()) 70 | else: 71 | return Dataset(data) 72 | -------------------------------------------------------------------------------- /tests/test_document_creation.py: -------------------------------------------------------------------------------- 1 | from io import BytesIO 2 | 3 | from pandas import DataFrame 4 | 5 | from eda_report.document import ReportDocument 6 | 7 | 8 | class TestReportWithIdealInput: 9 | data = DataFrame( 10 | {"A": range(50), "B": [1, 2, 3, 4, 5] * 10, "C": list("ab") * 25} 11 | ) 12 | report = ReportDocument( 13 | data, 14 | title="Test Report", 15 | graph_color="teal", 16 | groupby_variable="C", 17 | output_filename=BytesIO(), 18 | ) 19 | 20 | def test_general_properties(self): 21 | # Largely covered in _ReportContent tests 22 | assert self.report.TITLE == "Test Report" 23 | assert self.report.GRAPH_COLOR == "teal" 24 | assert "correlation_plot" in self.report.bivariate_graphs 25 | assert "regression_plots" in self.report.bivariate_graphs 26 | assert self.report.TABLE_STYLE == "Table Grid" 27 | 28 | 29 | class TestReportWithLimitedInput: 30 | data = DataFrame( 31 | {"categorical": list("ABCDEFGHIJKL" * 2), "numeric": range(24)} 32 | ) 33 | report = ReportDocument( 34 | data, 35 | title="One Numeric One Categorical", 36 | graph_color="lime", 37 | output_filename=BytesIO(), 38 | ) 39 | 40 | def test_report_creation(self): 41 | assert isinstance(self.report, ReportDocument) 42 | assert self.report.TITLE == "One Numeric One Categorical" 43 | assert self.report.GRAPH_COLOR == "lime" 44 | 45 | def test_bivariate_analysis(self): 46 | assert self.report.bivariate_summaries is None 47 | assert self.report.bivariate_graphs is None 48 | 49 | 50 | class TestReportWithUnivariateInput: 51 | univariate_numeric_report = ReportDocument( 52 | DataFrame(range(5)), 53 | title="Univariate Numeric Report", 54 | 
output_filename=BytesIO(), 55 | ) 56 | univariate_categorical_report = ReportDocument( 57 | DataFrame(["a"]), 58 | title="Univariate Categorical Report", 59 | output_filename=BytesIO(), 60 | ) 61 | 62 | def test_bivariate_analysis(self): 63 | assert self.univariate_numeric_report.bivariate_summaries is None 64 | assert self.univariate_categorical_report.bivariate_summaries is None 65 | 66 | assert self.univariate_numeric_report.bivariate_graphs is None 67 | assert self.univariate_categorical_report.bivariate_graphs is None 68 | 69 | 70 | def test_output_file(temp_data_dir): 71 | ReportDocument(range(50), output_filename=temp_data_dir / "eda.docx") 72 | assert (temp_data_dir / "eda.docx").is_file() 73 | -------------------------------------------------------------------------------- /docs/source/conf.py: -------------------------------------------------------------------------------- 1 | # Configuration file for the Sphinx documentation builder. 2 | # 3 | # This file only contains a selection of the most common options. For a full 4 | # list see the documentation: 5 | # https://www.sphinx-doc.org/en/master/usage/configuration.html 6 | 7 | # -- Path setup -------------------------------------------------------------- 8 | 9 | # If extensions (or modules to document with autodoc) are in another directory, 10 | # add these directories to sys.path here. If the directory is relative to the 11 | # documentation root, use os.path.abspath to make it absolute, like shown here. 
12 | # 13 | import os 14 | import sys 15 | 16 | from sphinx.builders.html import StandaloneHTMLBuilder 17 | 18 | sys.path.insert(0, os.path.abspath("../../")) 19 | 20 | # Modify supported image order 21 | StandaloneHTMLBuilder.supported_image_types = [ 22 | "image/svg+xml", 23 | "image/gif", 24 | "image/png", 25 | "image/jpeg", 26 | ] 27 | 28 | # -- Project information ----------------------------------------------------- 29 | 30 | project = "eda-report" 31 | copyright = "2022, Abwao" 32 | author = "Abwao" 33 | 34 | # The full version, including alpha/beta/rc tags 35 | release = "2.8.2" 36 | 37 | # -- General configuration --------------------------------------------------- 38 | 39 | # Add any Sphinx extension module names here, as strings. They can be 40 | # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom 41 | # ones. 42 | extensions = [ 43 | "sphinx.ext.autodoc", 44 | "sphinx.ext.intersphinx", 45 | "sphinx.ext.napoleon", 46 | "sphinx.ext.viewcode", 47 | ] 48 | 49 | intersphinx_mapping = { 50 | "docx": ("https://python-docx.readthedocs.io/en/latest/", None), 51 | "pandas": ("https://pandas.pydata.org/docs/", None), 52 | "python": ("https://docs.python.org/3", None), 53 | "matplotlib": ("https://matplotlib.org/stable/", None), 54 | } 55 | 56 | 57 | # Add any paths that contain templates here, relative to this directory. 58 | templates_path = ["_templates"] 59 | 60 | # List of patterns, relative to source directory, that match files and 61 | # directories to ignore when looking for source files. 62 | # This pattern also affects html_static_path and html_extra_path. 63 | exclude_patterns = [] 64 | 65 | # -- Options for HTML output ------------------------------------------------- 66 | master_doc = "index" 67 | # The theme to use for HTML and HTML Help pages. See the documentation for 68 | # a list of builtin themes. 
69 | # 70 | html_theme = "furo" 71 | html_theme_options = { 72 | "light_css_variables": { 73 | "font-stack": "Georgia, serif", 74 | "font-stack--monospace": "Courier, monospace", 75 | }, 76 | } 77 | # Add any paths that contain custom static files (such as style sheets) here, 78 | # relative to this directory. They are copied after the builtin static files, 79 | # so a file named "default.css" will overwrite the builtin "default.css". 80 | html_static_path = ["_static"] 81 | -------------------------------------------------------------------------------- /eda_report/_cli.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | from typing import Optional 3 | 4 | from eda_report._read_file import df_from_file 5 | from eda_report.document import ReportDocument 6 | 7 | 8 | def process_cli_args() -> argparse.Namespace: 9 | """Captures and parses input from the command line interface using the 10 | :mod:`argparse` module from the Python standard library. 11 | 12 | Returns: 13 | argparse.Namespace: Object with the parsed arguments as attributes. 14 | 15 | Example: 16 | .. literalinclude:: examples.txt 17 | :lines: 106-128 18 | """ 19 | parser = argparse.ArgumentParser( 20 | prog="eda-report", 21 | description=( 22 | "Automatically analyze data and generate reports. A graphical user" 23 | " interface will be launched if none of the optional arguments is " 24 | "specified." 
25 | ), 26 | ) 27 | parser.add_argument( 28 | "-i", 29 | "--infile", 30 | type=df_from_file, 31 | help="A .csv or .xlsx file to analyze.", 32 | ) 33 | parser.add_argument( 34 | "-o", 35 | "--outfile", 36 | default="eda-report.docx", 37 | help="The output name for analysis results (default: %(default)s)", 38 | ) 39 | parser.add_argument( 40 | "-t", 41 | "--title", 42 | default="Exploratory Data Analysis Report", 43 | help="The top level heading for the report (default: %(default)s)", 44 | ) 45 | parser.add_argument( 46 | "-c", 47 | "--color", 48 | default="cyan", 49 | help="The color to apply to graphs (default: %(default)s)", 50 | ) 51 | parser.add_argument( 52 | "-g", 53 | "-T", 54 | "--groupby", 55 | "--target", 56 | help=( 57 | "The variable to use for grouping plotted values. An integer value" 58 | " is treated as a column index, whereas a string is treated as a" 59 | " column label." 60 | ), 61 | ) 62 | return parser.parse_args() 63 | 64 | 65 | def run_from_cli() -> Optional[ReportDocument]: 66 | """Creates an exploratory data analysis report in *Word* format using input 67 | from the command line interface. 68 | 69 | This is the function executed when the package is run as a script (using 70 | ``python -m eda_report``). It is also the entry point for the 71 | ``eda-report`` command (console script). 
72 | """ 73 | args = process_cli_args() 74 | if args.infile is None: 75 | from eda_report.gui import EDAGUI 76 | # Launch graphical user interface to select and analyze a file 77 | app = EDAGUI() 78 | app.mainloop() 79 | else: 80 | ReportDocument( 81 | args.infile, 82 | title=args.title, 83 | graph_color=args.color, 84 | output_filename=args.outfile, 85 | groupby_variable=args.groupby, 86 | ) 87 | -------------------------------------------------------------------------------- /eda_report/_content.py: -------------------------------------------------------------------------------- 1 | from typing import Dict, Iterable, Union 2 | 3 | from eda_report._analysis import _AnalysisResult 4 | 5 | 6 | class _ReportContent(_AnalysisResult): 7 | """Prepares textual summaries of analysis results. 8 | 9 | Args: 10 | data (Iterable): The data to analyze. 11 | title (str, optional): The title to assign the report. Defaults to 12 | "Exploratory Data Analysis Report". 13 | graph_color (str, optional): The color to apply to the graphs. 14 | Defaults to "cyan". 15 | groupby_variable (Union[str, int], optional): The column to 16 | use to group values. Defaults to None. 17 | """ 18 | 19 | def __init__( 20 | self, 21 | data: Iterable, 22 | *, 23 | title: str = "Exploratory Data Analysis Report", 24 | graph_color: str = "cyan", 25 | groupby_variable: Union[str, int] = None, 26 | ) -> None: 27 | super().__init__( 28 | data, graph_color=graph_color, groupby_variable=groupby_variable 29 | ) 30 | self.TITLE = title 31 | self.intro_text = self._get_introductory_summary() 32 | self.variable_descriptions = self._describe_variables() 33 | 34 | def _get_introductory_summary(self) -> str: 35 | """Get an overview of the number of rows and the nature of columns. 36 | 37 | Returns: 38 | str: Introduction. 
39 | """ 40 | num_rows, num_cols = self.dataset.data.shape 41 | if num_rows == 1: 42 | rows = "1 row (observation)" 43 | else: 44 | rows = f"{num_rows:,} rows (observations)" 45 | 46 | if num_cols == 1: 47 | cols = "1 column (feature)" 48 | else: 49 | cols = f"{num_cols:,} columns (features)" 50 | 51 | if self.dataset._numeric_stats is None: 52 | numeric_descr = "" 53 | else: 54 | num_numeric = self.dataset._numeric_stats.shape[0] 55 | if num_numeric == 1: 56 | numeric_descr = ", 1 of which is numeric" 57 | else: 58 | numeric_descr = f", {num_numeric} of which are numeric" 59 | 60 | return f"The dataset consists of {rows} and {cols}{numeric_descr}." 61 | 62 | def _describe_variables(self) -> Dict[str, str]: 63 | """Get summary statistics for a variable. 64 | 65 | Returns: 66 | Dict[str, str]: Summary statistics. 67 | """ 68 | descriptions = {} 69 | for name, variable in self.variables.items(): 70 | if variable.num_unique == 1: 71 | unique_vals = "1 unique value" 72 | else: 73 | unique_vals = f"{variable.num_unique:,} unique values" 74 | 75 | descriptions[name] = ( 76 | f"{variable.name.capitalize()} is a {variable.var_type} " 77 | f"variable with {unique_vals}. {variable.missing} of its " 78 | "values are missing." 79 | ) 80 | return descriptions 81 | -------------------------------------------------------------------------------- /docs/source/eda_report.plotting.rst: -------------------------------------------------------------------------------- 1 | eda\_report.plotting 2 | ==================== 3 | 4 | You can find a wealth of plotting libraries at the `PyViz`_ website. 5 | 6 | .. _PyViz: https://pyviz.org/ 7 | 8 | The plotting functions below are implemented using `matplotlib`_. In the interest of efficiency, especially for large datasets with numerous columns; these plotting functions use a *non-interactive* `matplotlib backend`_. 
This was inspired by `Embedding in a web application server`_, which says in part: 9 | 10 | 11 | When using Matplotlib in a web server [GUI application, in this case] it is strongly recommended to not use :mod:`~matplotlib.pyplot` (pyplot maintains references to the opened figures to make `show`_ work, but this will cause memory leaks unless the figures are properly closed). 12 | 13 | 14 | .. _matplotlib: https://matplotlib.org/ 15 | .. _matplotlib backend: https://matplotlib.org/stable/users/explain/backends.html#the-builtin-backends 16 | .. _Embedding in a web application server: https://matplotlib.org/stable/gallery/user_interfaces/web_application_server_sgskip.html 17 | .. _show: https://matplotlib.org/stable/api/_as_gen/matplotlib.pyplot.show.html#matplotlib.pyplot.show 18 | 19 | You can conveniently view the generated figures in a *jupyter notebook* using ``%matplotlib inline``, as shown in this `demo notebook`_. 20 | 21 | .. _demo notebook: https://mybinder.org/v2/gh/Tim-Abwao/eda-report/HEAD?filepath=eda-report-basics.ipynb 22 | 23 | .. image:: https://mybinder.org/badge_logo.svg 24 | :target: https://mybinder.org/v2/gh/Tim-Abwao/eda-report/HEAD?filepath=eda-report-basics.ipynb 25 | 26 | Otherwise, you'll probably need to export them as images. 27 | 28 | .. _plotting-examples: 29 | 30 | Plotting Examples 31 | ----------------- 32 | >>> import eda_report.plotting as ep 33 | >>> ax = ep.bar_plot(mpg_data["origin"], label="Country of Origin") 34 | >>> ax.figure.savefig("bar-plot.png") 35 | 36 | .. image:: _static/bar-plot.png 37 | :width: 80% 38 | :align: center 39 | :alt: a bar-plot 40 | :class: only-light 41 | 42 | .. image:: _static/bar-plot-dark.png 43 | :width: 80% 44 | :align: center 45 | :alt: a bar-plot 46 | :class: only-dark 47 | 48 | >>> ax = ep.box_plot(mpg_data["acceleration"], label="Acceleration", hue=mpg_data["origin"]) 49 | >>> ax.figure.savefig("box-plot.png") 50 | 51 | .. 
image:: _static/box-plot.png 52 | :width: 80% 53 | :align: center 54 | :alt: a box-plot 55 | :class: only-light 56 | 57 | .. image:: _static/box-plot-dark.png 58 | :width: 80% 59 | :align: center 60 | :alt: a box-plot 61 | :class: only-dark 62 | 63 | >>> ax = ep.kde_plot(mpg_data["mpg"], label="MPG", hue=mpg_data["cylinders"]) 64 | >>> ax.figure.savefig("kde-plot.png") 65 | 66 | .. image:: _static/kde-plot.png 67 | :width: 80% 68 | :align: center 69 | :alt: a kde-plot 70 | :class: only-light 71 | 72 | .. image:: _static/kde-plot-dark.png 73 | :width: 80% 74 | :align: center 75 | :alt: a kde-plot 76 | :class: only-dark 77 | 78 | >>> ax = ep.regression_plot(mpg_data["acceleration"], mpg_data["horsepower"], 79 | ... labels=("Acceleration", "Horsepower")) 80 | >>> ax.figure.savefig("regression-plot.png") 81 | 82 | .. image:: _static/regression-plot.png 83 | :width: 80% 84 | :align: center 85 | :alt: a regression-plot 86 | :class: only-light 87 | 88 | .. image:: _static/regression-plot-dark.png 89 | :width: 80% 90 | :align: center 91 | :alt: a regression-plot 92 | :class: only-dark 93 | 94 | >>> ax = ep.prob_plot(mpg_data["acceleration"], label="Acceleration") 95 | >>> ax.figure.savefig("probability-plot.png") 96 | 97 | .. image:: _static/probability-plot.png 98 | :width: 80% 99 | :align: center 100 | :alt: a probability-plot 101 | :class: only-light 102 | 103 | .. image:: _static/probability-plot-dark.png 104 | :width: 80% 105 | :align: center 106 | :alt: a probability-plot 107 | :class: only-dark 108 | 109 | >>> ax = ep.plot_correlation(mpg_data) 110 | >>> ax.figure.savefig("correlation-plot.png") 111 | 112 | .. image:: _static/correlation-plot.png 113 | :width: 80% 114 | :align: center 115 | :alt: a correlation-plot 116 | :class: only-light 117 | 118 | .. image:: _static/correlation-plot-dark.png 119 | :width: 80% 120 | :align: center 121 | :alt: a correlation-plot 122 | :class: only-dark 123 | 124 | .. 
automodule:: eda_report.plotting 125 | :members: 126 | :inherited-members: 127 | :undoc-members: 128 | :show-inheritance: 129 | -------------------------------------------------------------------------------- /tests/test_analysis.py: -------------------------------------------------------------------------------- 1 | from io import BytesIO 2 | 3 | from pandas import DataFrame, Series 4 | 5 | from eda_report._analysis import _AnalysisResult, _get_contingency_tables 6 | from eda_report.bivariate import Dataset 7 | 8 | data = DataFrame( 9 | {"A": range(50), "B": [1, 2, 3, 4, 5] * 10, "C": list("ab") * 25} 10 | ) 11 | 12 | 13 | class TestGetContingencyTables: 14 | data = DataFrame( 15 | [list("abc"), list("abd"), list("bcd")] * 4, columns=list("ABC") 16 | ) 17 | 18 | def test_with_empty_data(self): 19 | empty_df = self.data[[]] 20 | tables = _get_contingency_tables( 21 | categorical_df=empty_df, groupby_data=self.data["C"] 22 | ) 23 | assert tables == {} 24 | 25 | def test_with_null_groupby_data(self): 26 | tables = _get_contingency_tables( 27 | categorical_df=self.data, groupby_data=None 28 | ) 29 | assert tables == {} 30 | 31 | def test_with_valid_args(self): 32 | tables = _get_contingency_tables( 33 | categorical_df=self.data, groupby_data=self.data["C"] 34 | ) 35 | # Check that groupby_data "C" is not included 36 | assert set(tables.keys()) == {"A", "B"} 37 | assert tables["A"].to_dict() == { 38 | "c": {"a": 4, "b": 0, "Total": 4}, 39 | "d": {"a": 4, "b": 4, "Total": 8}, 40 | "Total": {"a": 8, "b": 4, "Total": 12}, 41 | } 42 | assert tables["B"].to_dict() == { 43 | "c": {"b": 4, "c": 0, "Total": 4}, 44 | "d": {"b": 4, "c": 4, "Total": 8}, 45 | "Total": {"b": 8, "c": 4, "Total": 12}, 46 | } 47 | 48 | def test_cardinality_limit(self): 49 | high_cardinality_data = DataFrame( 50 | { 51 | "A": range(50), 52 | "B": list("abcdefghijklmnopqrstuvwxy") * 2, 53 | "C": list(range(10)) * 5, 54 | } 55 | ) 56 | tables = _get_contingency_tables( 57 | 
categorical_df=high_cardinality_data, 58 | groupby_data=Series([1, 2] * 25), 59 | ) 60 | # "A" and "B" have > 20 unique values, and so are omitted 61 | assert set(tables.keys()) == {"C"} 62 | 63 | 64 | class TestAnalysisResult: 65 | results = _AnalysisResult(data, graph_color="green", groupby_variable="C") 66 | 67 | def test_general_properties(self): 68 | assert isinstance(self.results.dataset, Dataset) 69 | assert self.results.GRAPH_COLOR == "green" 70 | assert self.results.GROUPBY_DATA.equals(data["C"]) 71 | assert self.results.bivariate_summaries == { 72 | ("A", "B"): "A and B have very weak positive correlation (0.10)." 73 | } 74 | 75 | def test_univariate_analysis(self): 76 | assert set(self.results.univariate_stats) == {"A", "B", "C"} 77 | 78 | # Summary statistics for each variable should be available in a dict 79 | assert isinstance(self.results.variables["A"].summary_stats, dict) 80 | assert isinstance(self.results.variables["B"].summary_stats, dict) 81 | assert isinstance(self.results.variables["C"].summary_stats, dict) 82 | 83 | def test_univariate_graphs(self): 84 | for key, graphs in self.results.univariate_graphs.items(): 85 | assert key in set("ABC") 86 | for graph in graphs.values(): 87 | assert isinstance(graph, BytesIO) 88 | 89 | def test_normality_tests(self): 90 | assert set(self.results.normality_tests) == {"A"} 91 | 92 | for df in self.results.normality_tests.values(): 93 | assert set(df.index) == { 94 | "D'Agostino's K-squared test", 95 | "Shapiro-Wilk test", 96 | "Kolmogorov-Smirnov test", 97 | } 98 | 99 | def test_contingency_tables(self): 100 | assert set(self.results.contingency_tables) == {"B"} 101 | assert self.results.contingency_tables["B"].to_dict() == { 102 | "a": {1: 5, 2: 5, 3: 5, 4: 5, 5: 5, "Total": 25}, 103 | "b": {1: 5, 2: 5, 3: 5, 4: 5, 5: 5, "Total": 25}, 104 | "Total": {1: 10, 2: 10, 3: 10, 4: 10, 5: 10, "Total": 50}, 105 | } 106 | 107 | def test_bivariate_graphs(self): 108 | assert set(self.results.bivariate_graphs.keys()) 
== { 109 | "correlation_plot", 110 | "regression_plots", 111 | } 112 | assert isinstance( 113 | self.results.bivariate_graphs["correlation_plot"], BytesIO 114 | ) 115 | for graph in self.results.bivariate_graphs[ 116 | "regression_plots" 117 | ].values(): 118 | assert isinstance(graph, BytesIO) 119 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # `eda-report` - Automated Exploratory Data Analysis 2 | 3 | [![Binder](https://mybinder.org/badge_logo.svg)](https://mybinder.org/v2/gh/Tim-Abwao/eda-report/HEAD?filepath=eda-report-basics.ipynb) 4 | [![PyPI version](https://badge.fury.io/py/eda-report.svg)](https://badge.fury.io/py/eda-report) 5 | [![Python 3.10 - 3.12](https://github.com/Tim-Abwao/eda-report/actions/workflows/unit-tests.yml/badge.svg)](https://github.com/Tim-Abwao/eda-report/actions/workflows/unit-tests.yml) 6 | [![Documentation Status](https://readthedocs.org/projects/eda-report/badge/?version=latest)](https://eda-report.readthedocs.io/en/latest/?badge=latest) 7 | [![codecov](https://codecov.io/gh/Tim-Abwao/eda-report/branch/main/graph/badge.svg?token=KNQD8XZCWG)](https://codecov.io/gh/Tim-Abwao/eda-report) 8 | [![Code style: black](https://img.shields.io/badge/code%20style-black-000000.svg)](https://github.com/psf/black) 9 | 10 | A Python program to help automate the exploratory data analysis and reporting process. 11 | 12 | Input data is analyzed using [pandas][pandas] and [SciPy][scipy]. Graphs are plotted using [matplotlib][matplotlib]. The results are then nicely packaged as a *Word (.docx)* document using [python-docx][python-docx]. 13 | 14 | ![screencast of report document from iris dataset][report-screencast] 15 | 16 | ## Installation 17 | 18 | You can install the package from [PyPI][eda-report-pypi] using: 19 | 20 | ```bash 21 | pip install eda-report 22 | ``` 23 | 24 | ## Basic Usage 25 | 26 | ### 1. 
Graphical User Interface 27 | 28 | The `eda-report` command launches a graphical window to help select a `csv`/`excel` file to analyze: 29 | 30 | ```bash 31 | eda-report 32 | ``` 33 | 34 | ![screencast of the gui][gui-screencast] 35 | 36 | You'll be prompted to set a *report title*, *group-by/target variable (optional)*, *graph color* and *output filename*; after which the contents of the input file are analyzed, and the results saved in a *Word (.docx)* document. 37 | 38 | >**NOTE:** For help with `Tk` - related issues, consider visiting [TkDocs][tkdocs]. 39 | 40 | ### 2. Command Line Interface 41 | 42 | ```bash 43 | $ eda-report -i iris.csv -o iris-report.docx 44 | Analyze variables: 100%|███████████████████████████████████| 5/5 45 | Plot variables: 100%|███████████████████████████████████| 5/5 46 | Bivariate analysis: 100%|███████████████████████████████████| 6/6 pairs. 47 | [INFO 02:12:22.146] Done. Results saved as 'iris-report.docx' 48 | ``` 49 | 50 | ```bash 51 | $ eda-report -h 52 | usage: eda-report [-h] [-i INFILE] [-o OUTFILE] [-t TITLE] [-c COLOR] 53 | [-g GROUPBY] 54 | 55 | Automatically analyze data and generate reports. A graphical user interface 56 | will be launched if none of the optional arguments is specified. 57 | 58 | optional arguments: 59 | -h, --help show this help message and exit 60 | -i INFILE, --infile INFILE 61 | A .csv or .xlsx file to analyze. 62 | -o OUTFILE, --outfile OUTFILE 63 | The output name for analysis results (default: eda- 64 | report.docx) 65 | -t TITLE, --title TITLE 66 | The top level heading for the report (default: 67 | Exploratory Data Analysis Report) 68 | -c COLOR, --color COLOR 69 | The color to apply to graphs (default: cyan) 70 | -g GROUPBY, -T GROUPBY, --groupby GROUPBY, --target GROUPBY 71 | The variable to use for grouping plotted values. An 72 | integer value is treated as a column index, whereas a 73 | string is treated as a column label. 74 | ``` 75 | 76 | 77 | 78 | ### 3. 
Interpreter Session 79 | 80 | ```python 81 | >>> eda_report.summarize(iris_data) 82 | 83 | Summary Statistics for Numeric features (4) 84 | ------------------------------------------- 85 | count avg stddev min 25% 50% 75% max skewness kurtosis 86 | sepal_length 150 5.8433 0.8281 4.3 5.1 5.80 6.4 7.9 0.3149 -0.5521 87 | sepal_width 150 3.0573 0.4359 2.0 2.8 3.00 3.3 4.4 0.3190 0.2282 88 | petal_length 150 3.7580 1.7653 1.0 1.6 4.35 5.1 6.9 -0.2749 -1.4021 89 | petal_width 150 1.1993 0.7622 0.1 0.3 1.30 1.8 2.5 -0.1030 -1.3406 90 | 91 | Summary Statistics for Categorical features (1) 92 | ----------------------------------------------- 93 | count unique top freq relative freq 94 | species 150 3 setosa 50 33.33% 95 | 96 | 97 | Pearson's Correlation (Top 20) 98 | ------------------------------ 99 | petal_length & petal_width -> very strong positive correlation (0.96) 100 | sepal_length & petal_length -> very strong positive correlation (0.87) 101 | sepal_length & petal_width -> very strong positive correlation (0.82) 102 | sepal_width & petal_length -> moderate negative correlation (-0.43) 103 | sepal_width & petal_width -> weak negative correlation (-0.37) 104 | sepal_length & sepal_width -> very weak negative correlation (-0.12) 105 | ``` 106 | 107 | Check out the [documentation][docs] for more features and details. 
108 | 109 | [docs]: https://eda-report.readthedocs.io/ 110 | [eda-report-pypi]: https://pypi.org/project/eda-report/ 111 | [matplotlib]: https://matplotlib.org/ 112 | [pandas]: https://pandas.pydata.org/ 113 | [python-docx]: https://python-docx.readthedocs.io/ 114 | [scipy]: https://scipy.org/ 115 | [gui-screencast]: https://raw.githubusercontent.com/Tim-Abwao/eda-report/dev/docs/source/_static/screencast.gif 116 | [report-screencast]: https://raw.githubusercontent.com/Tim-Abwao/eda-report/dev/docs/source/_static/report.gif 117 | [tkdocs]: https://tkdocs.com/index.html 118 | -------------------------------------------------------------------------------- /eda_report/_validate.py: -------------------------------------------------------------------------------- 1 | import logging 2 | from collections.abc import Iterable 3 | from typing import Optional, Union 4 | 5 | from pandas import DataFrame, RangeIndex, Series 6 | from pandas.api.types import is_numeric_dtype 7 | 8 | from eda_report.exceptions import ( 9 | EmptyDataError, 10 | GroupbyVariableError, 11 | InputError, 12 | ) 13 | 14 | 15 | def _clean_column_labels(data: DataFrame) -> DataFrame: 16 | """Makes sure that columns have *meaningful* names. 17 | 18 | When creating a ``DataFrame`` from an ``Iterable``, if no column names 19 | are provided, the columns are set as a :class:`~pandas.RangeIndex` — 20 | [0, 1, 2, ...] (default). 21 | 22 | This function renames such columns to ['var_1', 'var_2, 'var_3', ...], 23 | making references and comparisons much more intuitive. 24 | 25 | It also ensures that column labels are all of similar type (``str``) to 26 | allow sorting and the use of string methods. 27 | 28 | Args: 29 | data (pandas.DataFrame): Data to inspect and perhaps edit. 30 | 31 | Returns: 32 | pandas.DataFrame: The ``data``, with human-friendly column 33 | names. 
34 | """ 35 | if isinstance(data.columns, RangeIndex): 36 | data.columns = [f"var_{i+1}" for i in data.columns] 37 | elif is_numeric_dtype(data.columns): 38 | data.columns = [f"var_{i}" for i in data.columns] 39 | else: 40 | data.columns = data.columns.map(str) 41 | return data 42 | 43 | 44 | def _check_cardinality(groupby_data: Series, *, threshold: int = 10) -> None: 45 | """Assesses whether the ``groupby_data`` has too many unique values 46 | (> ``threshold``, default 10). 47 | 48 | Args: 49 | groupby_data (pandas.Series): The data intended to group values. 50 | threshold (int, optional): Maximum allowable cardinality. Defaults to 51 | 10. 52 | 53 | Raises: 54 | GroupbyVariableError: If the `groupby_data` has cardinality outside the 55 | acceptable range. 56 | """ 57 | if groupby_data.nunique() > threshold: 58 | message = ( 59 | f"Group-by variable '{groupby_data.name}' not used to group " 60 | f"values. It has high cardinality ({groupby_data.nunique()}) " 61 | f"and would clutter graphs." 62 | ) 63 | logging.warning(message) 64 | raise GroupbyVariableError(message) 65 | 66 | 67 | def _validate_dataset(data: Iterable) -> DataFrame: 68 | """Ensures that input data is of type :class:`pandas.DataFrame`. 69 | 70 | If it isn't, this attempts to explicitly cast it as a ``DataFrame``. 71 | 72 | Columns in the data that are completely empty will be dropped. 73 | 74 | Args: 75 | data (Iterable): The data to analyze. 76 | 77 | Raises: 78 | InputError: If the ``data`` cannot be cast as a 79 | :class:`~pandas.DataFrame`. 80 | EmptyDataError: If the ``data`` has no items. 81 | 82 | Returns: 83 | pandas.DataFrame: The input data as a DataFrame. 84 | """ 85 | try: 86 | data_frame = DataFrame(data) 87 | except Exception: 88 | raise InputError( 89 | f"Expected a pandas.Dataframe object, but got {type(data)}." 
90 | ) 91 | # The data should not be empty 92 | if len(data_frame) == 0: 93 | raise EmptyDataError("No data to process.") 94 | 95 | data_frame = ( 96 | # Attempt to infer better dtypes for columns. 97 | data_frame.infer_objects() 98 | # Drop completely empty columns. 99 | .dropna(axis=1, how="all") 100 | ) 101 | return _clean_column_labels(data_frame) 102 | 103 | 104 | def _validate_univariate_input( 105 | data: Iterable, *, name: str = None 106 | ) -> Optional[Series]: 107 | """Ensures that *univariate input data* is of type :class:`pandas.Series`. 108 | 109 | If it isn't, this attempts to explicitly cast it as a ``Series``. 110 | 111 | Args: 112 | data (Iterable): The data to analyze. 113 | name (str, optional): The name to assign the data. Defaults 114 | to None. 115 | 116 | Raises: 117 | InputError: If the ``data`` cannot be cast as a 118 | :class:`~pandas.Series`. 119 | EmptyDataError: If the ``data`` has no items. 120 | 121 | Returns: 122 | Optional[pandas.Series]: The input data as a ``Series``. 123 | """ 124 | if data is None: 125 | return None 126 | else: 127 | try: 128 | series = Series(data, name=name) 129 | except Exception: 130 | raise InputError( 131 | f"Expected a one-dimensional sequence, but got {type(data)}." 132 | ) 133 | # Convert potentially mixed-type items to strings 134 | if series.dtype == "O": 135 | series = series.astype("string") 136 | 137 | if series.shape[0] == 0: 138 | raise EmptyDataError("No data to process.") 139 | else: 140 | return series 141 | 142 | 143 | def _validate_groupby_variable( 144 | *, data: DataFrame, groupby_variable: Union[int, str] 145 | ) -> Optional[Series]: 146 | """Ensures that the specified column label/index for grouping values is 147 | present in the data. 148 | 149 | Args: 150 | data (DataFrame): The data being analyzed. 151 | groupby_variable (Union[int, str]): A column label or index. 
152 | 
153 |     Raises:
154 |         GroupbyVariableError: If the supplied column label does not exist, or
155 |             the supplied column index is out of bounds.
156 | 
157 |     Returns:
158 |         Optional[pandas.Series]: The groupby variable's data.
159 |     """
160 |     if groupby_variable is None:
161 |         return None
162 |     elif f"{groupby_variable}".isdecimal():
163 |         idx = int(groupby_variable)
164 |         try:
165 |             groupby_data = data.iloc[:, idx]
166 |         except IndexError:
167 |             raise GroupbyVariableError(
168 |                 f"Column index {groupby_variable} is not in the range"
169 |                 f" [0, {data.columns.size}]."
170 |             )
171 |         _check_cardinality(groupby_data)
172 |         return groupby_data
173 |     elif isinstance(groupby_variable, str):
174 |         try:
175 |             groupby_data = data[groupby_variable]
176 |         except KeyError:
177 |             raise GroupbyVariableError(
178 |                 f"{groupby_variable!r} is not in {data.columns.to_list()}"
179 |             )
180 |         _check_cardinality(groupby_data)
181 |         return groupby_data
182 |     else:
183 |         # If groupby_variable is neither an index (int) nor a label (str)
184 |         logging.warning(
185 |             f"Group-by variable '{groupby_variable}' ignored."
186 |             " Not a valid column index or label."
187 | ) 188 | return None 189 | -------------------------------------------------------------------------------- /eda_report/_analysis.py: -------------------------------------------------------------------------------- 1 | from multiprocessing import get_context 2 | from typing import Dict, Iterable, Optional, Union 3 | 4 | import pandas as pd 5 | from tqdm import tqdm 6 | 7 | from eda_report._validate import _validate_groupby_variable 8 | from eda_report.bivariate import Dataset 9 | from eda_report.plotting import _plot_dataset, _plot_variable 10 | from eda_report.univariate import Variable, _analyze_univariate 11 | 12 | mp_context = get_context("spawn") # Use "spawn" start method 13 | 14 | 15 | def _get_contingency_tables( 16 | categorical_df: pd.DataFrame, groupby_data: pd.Series 17 | ) -> Dict[str, pd.DataFrame]: 18 | """Get contingency tables for categorical variables. 19 | 20 | Args: 21 | categorical_df (pandas.DataFrame): Categorical data. 22 | groupby_data (pandas.Series): Values to group by. 23 | 24 | Returns: 25 | Dict[str, pandas.DataFrame]: Contingency tables for each column. 26 | """ 27 | if (categorical_df.shape[1] == 0) or (groupby_data is None): 28 | return {} 29 | 30 | contingency_tables = { 31 | col: pd.crosstab( 32 | index=categorical_df[col], 33 | columns=groupby_data, 34 | margins=True, 35 | margins_name="Total", 36 | ) 37 | for col in categorical_df 38 | # Only include columns with upto 20 unique values to cut clutter 39 | if categorical_df[col].nunique() <= 20 40 | } 41 | # Exclude groupby_variable in case it is among the categorical cols 42 | contingency_tables.pop(groupby_data.name, None) 43 | return contingency_tables 44 | 45 | 46 | class _AnalysisResult: 47 | """Analyzes data, and stores the resultant summary statistics and graphs. 48 | 49 | Args: 50 | data (Iterable): The data to analyse. 51 | graph_color (str, optional): The color to apply to the graphs. 52 | Defaults to "cyan". 
53 | groupby_variable (Union[str, int], optional): The column to 54 | use to group values. Defaults to None. 55 | """ 56 | 57 | def __init__( 58 | self, 59 | data: Iterable, 60 | graph_color: str = "cyan", 61 | groupby_variable: Union[str, int] = None, 62 | ) -> None: 63 | self.GRAPH_COLOR = graph_color 64 | self.dataset = Dataset(data) 65 | self.GROUPBY_DATA = _validate_groupby_variable( 66 | data=self.dataset.data, groupby_variable=groupby_variable 67 | ) 68 | self.variables = self._analyze_variables() 69 | self.univariate_stats = self._get_univariate_statistics() 70 | self.normality_tests = self._get_normality_test_results() 71 | self.univariate_graphs = self._get_univariate_graphs() 72 | self.bivariate_graphs = _plot_dataset(self.dataset, color=graph_color) 73 | self.bivariate_summaries = self._get_bivariate_summaries() 74 | 75 | def _analyze_variables(self) -> Dict[str, Variable]: 76 | """Compute summary statistics and assess variable properties. 77 | 78 | Returns: 79 | Dict[str, Variable]: Univariate analysis results. 80 | """ 81 | data = self.dataset.data 82 | with mp_context.Pool() as p: 83 | univariate_stats = dict( 84 | tqdm( 85 | # Analyze variables concurrently 86 | p.imap(_analyze_univariate, data.items()), 87 | # Progress-bar options 88 | total=data.shape[1], 89 | bar_format=( 90 | "{desc} {percentage:3.0f}%|{bar:35}| " 91 | "{n_fmt}/{total_fmt}" 92 | ), 93 | desc="Analyze variables: ", 94 | dynamic_ncols=True, 95 | ) 96 | ) 97 | # Create contingency tables 98 | categorical_cols = [ 99 | col_name 100 | for col_name, var in univariate_stats.items() 101 | if var.var_type != "numeric" 102 | ] 103 | self.contingency_tables = _get_contingency_tables( 104 | data[categorical_cols], self.GROUPBY_DATA 105 | ) 106 | return univariate_stats 107 | 108 | def _get_univariate_statistics(self) -> Dict[str, pd.DataFrame]: 109 | """Get a dataframe of summary statistics for all variables. 110 | 111 | Returns: 112 | Dict[str, pandas.DataFrame]: Summary statistics. 
113 | """ 114 | return { 115 | name: variable.summary_stats 116 | for name, variable in self.variables.items() 117 | } 118 | 119 | def _get_normality_test_results(self) -> Dict[str, pd.DataFrame]: 120 | """Perform tests for normality. 121 | 122 | Returns: 123 | Dict[str, pandas.DataFrame]: Normality test results. 124 | """ 125 | return { 126 | name: variable._normality_test_results 127 | for name, variable in self.variables.items() 128 | if variable.var_type == "numeric" 129 | } 130 | 131 | def _get_univariate_graphs(self) -> Dict[str, Dict]: 132 | """Plot graphs for all variables present. 133 | 134 | Returns: 135 | Dict[str, Dict]: Univariate graphs. 136 | """ 137 | 138 | with mp_context.Pool() as p: 139 | data = self.dataset.data 140 | variable_data_hue_and_color = [ 141 | ( 142 | variable, 143 | data[variable.name], 144 | self.GROUPBY_DATA, 145 | self.GRAPH_COLOR, 146 | ) 147 | for variable in self.variables.values() 148 | ] 149 | univariate_graphs = dict( 150 | tqdm( 151 | # Plot variables in parallel processes 152 | p.imap(_plot_variable, variable_data_hue_and_color), 153 | # Progress-bar options 154 | total=len(self.variables), 155 | bar_format=( 156 | "{desc} {percentage:3.0f}%|{bar:35}| " 157 | "{n_fmt}/{total_fmt}" 158 | ), 159 | desc="Plot variables: ", 160 | dynamic_ncols=True, 161 | ) 162 | ) 163 | return univariate_graphs 164 | 165 | def _get_bivariate_summaries(self) -> Optional[Dict[str, str]]: 166 | """Get descriptions of the nature of correlation between numeric 167 | column pairs. 168 | 169 | Returns: 170 | Optional[Dict[str, str]]: Correlation info. 171 | """ 172 | if self.dataset._correlation_values is None: 173 | return None 174 | else: 175 | # Take the top 20 pairs by magnitude of correlation. 176 | # 20 var_pairs ≈ 10+ pages 177 | # 20 numeric columns == 190 var_pairs ≈ 95+ pages. 
178 |             pairs_to_include = [
179 |                 pair for pair, _ in self.dataset._correlation_values[:20]
180 |             ]
181 |             correlation_descriptions = self.dataset._correlation_descriptions
182 |             return {
183 |                 var_pair: (
184 |                     f"{var_pair[0].title()} and {var_pair[1].title()} have "
185 |                     f"{correlation_descriptions[var_pair]}."
186 |                 )
187 |                 for var_pair in pairs_to_include
188 |             }
--------------------------------------------------------------------------------
/tests/test_data_validation.py:
--------------------------------------------------------------------------------
 1 | import pytest
 2 | from pandas import DataFrame, Series
 3 | 
 4 | from eda_report._validate import (
 5 |     _clean_column_labels,
 6 |     _validate_dataset,
 7 |     _validate_groupby_variable,
 8 |     _validate_univariate_input,
 9 | )
10 | from eda_report.exceptions import (
11 |     EmptyDataError,
12 |     GroupbyVariableError,
13 |     InputError,
14 | )
15 | 
16 | 
17 | class TestDatasetValidation:
18 |     def test_dataframe_input(self):
19 |         # Check if a dataframe is returned as a dataframe
20 |         assert isinstance(_validate_dataset(DataFrame(range(10))), DataFrame)
21 | 
22 |     def test_series_input(self):
23 |         # Check if a series returns a dataframe
24 |         assert isinstance(_validate_dataset(Series(range(10))), DataFrame)
25 | 
26 |     def test_iterable_input(self):
27 |         # Check if a sequence returns a dataframe
28 |         assert isinstance(_validate_dataset(range(10)), DataFrame)
29 |         # Check if a generator returns a dataframe
30 |         assert isinstance(
31 |             _validate_dataset((x**2 for x in range(10))), DataFrame
32 |         )
33 | 
34 |     def test_invalid_input(self):
35 |         # Check that invalid input raises an InputError
36 |         with pytest.raises(InputError) as error:
37 |             _validate_dataset(0)
38 |         assert (
39 |             "Expected a pandas.Dataframe object, but got <class 'int'>."
40 |             in str(error.value)
41 |         )
42 | 
43 |     def test_empty_input(self):
44 |         # Check that empty input raises an EmptyDataError
45 |         with pytest.raises(EmptyDataError) as error:
46 |             _validate_dataset(DataFrame())
47 |         assert "No data to process." in str(error.value)
48 | 
49 |     def test_empty_column_is_dropped(self):
50 |         # Check that columns consisting entirely of NaN are dropped
51 |         data_with_empty_col = [[x, None] for x in range(10)]
52 |         result = _validate_dataset(data_with_empty_col)
53 |         assert result.shape == (10, 1)
54 | 
55 | 
56 | class TestUnivariateInputValidation:
57 |     def test_series_input(self):
58 |         # Check if a series is returned as a series
59 |         assert isinstance(
60 |             _validate_univariate_input(Series(range(10))), Series
61 |         )
62 | 
63 |     def test_iterable_input(self):
64 |         # Check if a sequence-like returns a series
65 |         assert isinstance(_validate_univariate_input(range(10)), Series)
66 |         # Check if a generator returns a series
67 |         assert isinstance(
68 |             _validate_univariate_input((x**2 for x in range(10))), Series
69 |         )
70 | 
71 |     def test_mixed_type_input(self):
72 |         # Check that mixed data is stored as strings, not objects
73 |         mixed_data = _validate_univariate_input([1, 3, True, "hello"])
74 |         assert mixed_data.dtype == "string"
75 | 
76 |     def test_empty_input(self):
77 |         with pytest.raises(EmptyDataError) as error:
78 |             _validate_univariate_input(x for x in [])
79 |         assert "No data to process." in str(error.value)
80 | 
81 |     def test_null_input(self):
82 |         assert _validate_univariate_input(None) is None
83 | 
84 |     def test_invalid_input(self):
85 |         # Check that invalid input raises an InputError
86 |         with pytest.raises(InputError) as error:
87 |             _validate_univariate_input(DataFrame([1, 2, 3]))
88 |         assert (
89 |             "Expected a one-dimensional sequence, but got "
90 |             "<class 'pandas.core.frame.DataFrame'>."
91 | ) in str(error.value) 92 | 93 | 94 | class TestTargetValidation: 95 | data = DataFrame([range(5)] * 3, columns=list("ABCDE")) 96 | 97 | def test_valid_column_index(self): 98 | # Check that a valid column index returns the appropriate column data. 99 | assert _validate_groupby_variable( 100 | data=self.data, groupby_variable=3 101 | ).equals(self.data.get("D")) 102 | 103 | def test_invalid_column_index(self): 104 | # Check that an error is raised for a column index that is out of 105 | # bounds. 106 | with pytest.raises(GroupbyVariableError) as error: 107 | _validate_groupby_variable(data=self.data, groupby_variable=10) 108 | assert "Column index 10 is not in the range [0, 5]." in str( 109 | error.value 110 | ) 111 | 112 | def test_valid_column_label(self): 113 | # Check that a valid column label returns the appropriate column data. 114 | assert _validate_groupby_variable( 115 | data=self.data, groupby_variable="D" 116 | ).equals(self.data.get("D")) 117 | 118 | def test_invalid_column_label(self): 119 | # Check that an invalid column label raises an error. 120 | with pytest.raises(GroupbyVariableError) as error: 121 | _validate_groupby_variable(data=self.data, groupby_variable="X") 122 | assert "'X' is not in ['A', 'B', 'C', 'D', 'E']" in str(error.value) 123 | 124 | def test_null_input(self): 125 | # Check that `groupby_variable=None` returns `None` 126 | assert ( 127 | _validate_groupby_variable(data=self.data, groupby_variable=None) 128 | is None 129 | ) 130 | 131 | def test_invalid_input_type(self, caplog: pytest.LogCaptureFixture): 132 | # Check that invalid input (i.e not in {str, int, None} logs a warning 133 | # and returns None 134 | assert ( 135 | _validate_groupby_variable(data=self.data, groupby_variable=1.0) 136 | is None 137 | ) 138 | assert ( 139 | "Group-by variable '1.0' ignored. " 140 | "Not a valid column index or label." 
141 | ) in caplog.text 142 | 143 | def test_groupby_variable_with_excess_categories( 144 | self, caplog: pytest.LogCaptureFixture 145 | ): 146 | # Check that target variables with more than 10 unique values raise an 147 | # error and log a warning that color-coding won't be applied. 148 | _data = DataFrame([range(11)] * 2, index=["X", "Y"]).T 149 | expected_message = ( 150 | "Group-by variable 'Y' not used to group values. " 151 | "It has high cardinality (11) and would clutter graphs." 152 | ) 153 | with pytest.raises(GroupbyVariableError) as error: 154 | assert _validate_groupby_variable( 155 | data=_data, groupby_variable=1 156 | ).equals(_data.iloc[:, 1]) 157 | assert expected_message in str(error.value) 158 | assert expected_message in caplog.text 159 | 160 | 161 | class TestColumnLabelCleaning: 162 | def test_cleaning_rangeindex(self): 163 | with_rangeindex = DataFrame([[0, 1], [1, 2]]) 164 | # Check if columns [0, 1] are changed to ["var_1", "var_2"] 165 | assert list(_clean_column_labels(with_rangeindex)) == [ 166 | "var_1", 167 | "var_2", 168 | ] 169 | 170 | def test_cleaning_numeric_colnames(self): 171 | with_numeric_colnames = DataFrame([[1, 2], [3, 4]], columns=[1, 5]) 172 | # Column names should be prefixed with "var_" 173 | assert list(_clean_column_labels(with_numeric_colnames)) == [ 174 | "var_1", 175 | "var_5", 176 | ] 177 | 178 | def test_cleaning_mixed_colnames(self): 179 | with_mixed_colnames = DataFrame([[1, 2], [3, 4]], columns=[1, "B"]) 180 | # Numeric column names should be converted to strings 181 | assert list(_clean_column_labels(with_mixed_colnames)) == ["1", "B"] 182 | -------------------------------------------------------------------------------- /eda_report/bivariate.py: -------------------------------------------------------------------------------- 1 | import logging 2 | from collections.abc import Iterable 3 | from itertools import combinations 4 | from textwrap import indent 5 | from typing import List 6 | 7 | from pandas import 
DataFrame 8 | 9 | from eda_report._validate import _validate_dataset 10 | 11 | 12 | def _compute_correlation(dataframe: DataFrame) -> List: 13 | """Get the Pearson correlation coefficients for numeric variables. 14 | 15 | Args: 16 | dataframe (pandas.DataFrame): A 2D array of numeric data. 17 | 18 | Returns: 19 | Optional[List]: A list of column pairs and their Pearson's correlation 20 | coefficients; sorted by magnitude in descending order. 21 | """ 22 | if dataframe is None: 23 | return None 24 | 25 | numeric_data = dataframe.select_dtypes("number") 26 | if numeric_data.shape[1] < 2: 27 | return None 28 | else: 29 | correlation_df = numeric_data.corr(method="pearson") 30 | unique_pairs = list(combinations(correlation_df.columns, r=2)) 31 | correlation_info = [ 32 | (pair, correlation_df.at[pair]) for pair in unique_pairs 33 | ] 34 | return sorted(correlation_info, key=lambda x: -abs(x[1])) 35 | 36 | 37 | def _describe_correlation(corr_value: float) -> str: 38 | """Explain the nature and magnitude of correlation. 39 | 40 | Args: 41 | corr_value (str): Pearson's correlation coefficient. 42 | 43 | Returns: 44 | str: Brief description of correlation type. 45 | """ 46 | nature = " positive" if corr_value > 0 else " negative" 47 | value = abs(corr_value) 48 | if value >= 0.8: 49 | strength = "very strong" 50 | elif value >= 0.6: 51 | strength = "strong" 52 | elif value >= 0.4: 53 | strength = "moderate" 54 | elif value >= 0.2: 55 | strength = "weak" 56 | elif value >= 0.05: 57 | strength = "very weak" 58 | else: 59 | strength = "virtually no" 60 | nature = "" 61 | return f"{strength}{ nature} correlation ({corr_value:.2f})" 62 | 63 | 64 | class Dataset: 65 | """Analyze two-dimensional datasets to obtain descriptive statistics 66 | and correlation information. 67 | 68 | Input data is stored as a :class:`pandas.DataFrame` in order to leverage 69 | pandas_' built-in statistical methods. 70 | 71 | .. 
_pandas: https://pandas.pydata.org/ 72 | 73 | Args: 74 | data (Iterable): The data to analyze. 75 | 76 | Example: 77 | .. literalinclude:: examples.txt 78 | :lines: 79-101 79 | """ 80 | 81 | def __init__(self, data: Iterable) -> None: 82 | self.data = _validate_dataset(data) 83 | self._get_summary_statistics() 84 | self._get_bivariate_analysis() 85 | 86 | def __repr__(self) -> str: 87 | """Get the string representation for a `Dataset`. 88 | 89 | Returns: 90 | str: The string representation of the `Dataset` instance. 91 | """ 92 | if self._numeric_stats is None: 93 | numeric_stats = "" 94 | else: 95 | numeric_stats_title = ( 96 | "Summary Statistics for Numeric features " 97 | f"({self._numeric_stats.shape[0]})" 98 | ) 99 | numeric_stats = "\n".join( 100 | [ 101 | f"\n\t\t {numeric_stats_title}", 102 | f"\t\t {'-' * len(numeric_stats_title)}", 103 | indent(f"{self._numeric_stats}\n", " "), 104 | ] 105 | ) 106 | 107 | if self._categorical_stats is None: 108 | categorical_stats = "" 109 | else: 110 | categorical_stats_title = ( 111 | "Summary Statistics for Categorical features " 112 | f"({self._categorical_stats.shape[0]})" 113 | ) 114 | categorical_stats = "\n".join( 115 | [ 116 | f"\t{categorical_stats_title}", 117 | f"\t{'-' * len(categorical_stats_title)}", 118 | indent(f"{self._categorical_stats}\n", " " * 4), 119 | ] 120 | ) 121 | if hasattr(self, "_correlation_descriptions"): 122 | max_pairs = min(20, len(self._correlation_descriptions)) 123 | top_20 = list(self._correlation_descriptions.items())[:max_pairs] 124 | corr_repr = "\n".join( 125 | [ 126 | f"{var_pair[0] + ' & ' + var_pair[1]:>32} -> " 127 | f"{corr_description}" 128 | for var_pair, corr_description in top_20 129 | ] 130 | ) 131 | correlation_description = "\n".join( 132 | [ 133 | "\n\t\t\tPearson's Correlation (Top 20)", 134 | f"\t\t\t{'-' * 30}", 135 | f"{corr_repr}", 136 | ] 137 | ) 138 | else: 139 | correlation_description = "" 140 | 141 | return "\n".join( 142 | [ 143 | f"{numeric_stats}", 144 
| indent(f"{categorical_stats}", "\t"), 145 | f"{correlation_description}", 146 | "\t", 147 | ] 148 | ) 149 | 150 | def _get_summary_statistics(self) -> None: 151 | """Compute descriptive statistics.""" 152 | data = self.data.copy() 153 | numeric_data = data.select_dtypes("number") 154 | # Consider numeric columns with < 11 unique values as categorical 155 | categorical_with_numbers = [ 156 | col for col in numeric_data if numeric_data[col].nunique() < 11 157 | ] 158 | numeric_data = numeric_data.drop(columns=categorical_with_numbers) 159 | if numeric_data.shape[1] < 1: 160 | self._numeric_stats = None 161 | else: 162 | numeric_stats = numeric_data.describe().T 163 | numeric_stats["count"] = numeric_stats["count"].astype("int") 164 | numeric_stats = numeric_stats.rename( 165 | columns={"mean": "avg", "std": "stddev"} 166 | ) 167 | numeric_stats["skewness"] = numeric_data.skew(numeric_only=True) 168 | numeric_stats["kurtosis"] = numeric_data.kurt(numeric_only=True) 169 | self._numeric_stats = numeric_stats.round(4) 170 | 171 | categorical_data = data.drop(columns=numeric_data.columns).copy() 172 | if categorical_data.shape[1] < 1: 173 | self._categorical_stats = None 174 | else: 175 | for col in categorical_data: 176 | # Convert categorical columns with "unique ratio" < 0.3 to 177 | # categorical dtype, which would consume much less memory. 
178 | if ( 179 | categorical_data[col].nunique() / len(categorical_data) 180 | ) < 0.3: 181 | categorical_data[col] = categorical_data[col].astype( 182 | "category" 183 | ) 184 | else: 185 | categorical_data[col] = categorical_data[col].astype( 186 | "string" 187 | ) 188 | categorical_stats = categorical_data.describe().T 189 | categorical_stats["relative freq"] = ( 190 | categorical_stats["freq"] / len(self.data) 191 | ).apply(lambda x: f"{x :.2%}") 192 | self._categorical_stats = categorical_stats 193 | 194 | def _get_bivariate_analysis(self) -> None: 195 | """Compare numeric column pairs.""" 196 | self._correlation_values = _compute_correlation(self.data) 197 | if self._correlation_values is None: 198 | logging.warning( 199 | "Skipped Bivariate Analysis: There are less than 2 numeric " 200 | "variables." 201 | ) 202 | else: 203 | self._get_correlation_descriptions() 204 | 205 | def _get_correlation_descriptions(self) -> None: 206 | """Get brief descriptions of the nature of correlation between numeric 207 | column pairs.""" 208 | self._correlation_descriptions = { 209 | pair: _describe_correlation(corr_value) 210 | for pair, corr_value in self._correlation_values 211 | } 212 | -------------------------------------------------------------------------------- /docs/source/examples.txt: -------------------------------------------------------------------------------- 1 | UNIVARIATE 2 | ========== 3 | 4 | Numeric 5 | ------- 6 | >>> from eda_report.univariate import Variable 7 | >>> Variable(range(1, 51), name="1 to 50") 8 | 9 | Name: 1 to 50 10 | Type: numeric 11 | Non-null Observations: 50 12 | Unique Values: 50 -> [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, [...] 
13 | Missing Values: None 14 | 15 | Summary Statistics 16 | ------------------ 17 | Average: 25.5000 18 | Standard Deviation: 14.5774 19 | Minimum: 1.0000 20 | Lower Quartile: 13.2500 21 | Median: 25.5000 22 | Upper Quartile: 37.7500 23 | Maximum: 50.0000 24 | Skewness: 0.0000 25 | Kurtosis: -1.2000 26 | 27 | Tests for Normality 28 | ------------------- 29 | p-value Conclusion at α = 0.05 30 | D'Agostino's K-squared test 0.0015981 Unlikely to be normal 31 | Kolmogorov-Smirnov test 0.0000000 Unlikely to be normal 32 | Shapiro-Wilk test 0.0580895 Possibly normal 33 | 34 | Categorical 35 | ----------- 36 | >>> Variable(["mango", "apple", "pear", "mango", "pear", "mango"], name="fruits") 37 | 38 | Name: fruits 39 | Type: categorical 40 | Non-null Observations: 6 41 | Unique Values: 3 -> ['apple', 'mango', 'pear'] 42 | Missing Values: None 43 | Mode (Most frequent): mango 44 | Maximum frequency: 3 45 | 46 | Most Common Items 47 | ----------------- 48 | mango: 3 (50.00%) 49 | pear: 2 (33.33%) 50 | apple: 1 (16.67%) 51 | 52 | Datetime 53 | -------- 54 | >>> import pandas as pd 55 | >>> dt = pd.date_range("2022-03-08", periods=20, freq="D") 56 | >>> Variable(dt, name="dttm") 57 | 58 | Name: dttm 59 | Type: datetime 60 | Non-null Observations: 20 61 | Unique Values: 20 -> [Timestamp('2022-03-08 00:00:00'), [...] 
62 | Missing Values: None 63 | 64 | Summary Statistics 65 | ------------------ 66 | Average: 2022-03-17 12:00:00 67 | Minimum: 2022-03-08 00:00:00 68 | Lower Quartile: 2022-03-12 18:00:00 69 | Median: 2022-03-17 12:00:00 70 | Upper Quartile: 2022-03-22 06:00:00 71 | Maximum: 2022-03-27 00:00:00 72 | 73 | 74 | BIVARIATE 75 | ========= 76 | 77 | Dataset 78 | ------- 79 | >>> Dataset(iris_data) 80 | Summary Statistics for Numeric features (4) 81 | ------------------------------------------- 82 | count avg stddev min 25% 50% 75% max skewness kurtosis 83 | sepal_length 150 5.8433 0.8281 4.3 5.1 5.80 6.4 7.9 0.3149 -0.5521 84 | sepal_width 150 3.0573 0.4359 2.0 2.8 3.00 3.3 4.4 0.3190 0.2282 85 | petal_length 150 3.7580 1.7653 1.0 1.6 4.35 5.1 6.9 -0.2749 -1.4021 86 | petal_width 150 1.1993 0.7622 0.1 0.3 1.30 1.8 2.5 -0.1030 -1.3406 87 | 88 | Summary Statistics for Categorical features (1) 89 | ----------------------------------------------- 90 | count unique top freq relative freq 91 | species 150 3 setosa 50 33.33% 92 | 93 | 94 | Pearson's Correlation (Top 20) 95 | ------------------------------ 96 | petal_length & petal_width -> very strong positive correlation (0.96) 97 | sepal_length & petal_length -> very strong positive correlation (0.87) 98 | sepal_length & petal_width -> very strong positive correlation (0.82) 99 | sepal_width & petal_length -> moderate negative correlation (-0.43) 100 | sepal_width & petal_width -> weak negative correlation (-0.37) 101 | sepal_length & sepal_width -> very weak negative correlation (-0.12) 102 | 103 | 104 | CLI 105 | === 106 | $ eda-report -h 107 | usage: eda-report [-h] [-i INFILE] [-o OUTFILE] [-t TITLE] [-c COLOR] 108 | [-g GROUPBY] 109 | 110 | Automatically analyze data and generate reports. A graphical user interface 111 | will be launched if none of the optional arguments is specified. 
112 | 113 | optional arguments: 114 | -h, --help show this help message and exit 115 | -i INFILE, --infile INFILE 116 | A .csv or .xlsx file to analyze. 117 | -o OUTFILE, --outfile OUTFILE 118 | The output name for analysis results (default: eda- 119 | report.docx) 120 | -t TITLE, --title TITLE 121 | The top level heading for the report (default: 122 | Exploratory Data Analysis Report) 123 | -c COLOR, --color COLOR 124 | The color to apply to graphs (default: cyan) 125 | -g GROUPBY, -T GROUPBY, --groupby GROUPBY, --target GROUPBY 126 | The variable to use for grouping plotted values. An 127 | integer value is treated as a column index, whereas a 128 | string is treated as a column label. 129 | 130 | 131 | TOP LEVEL 132 | ========= 133 | 134 | eda_report.get_word_report 135 | -------------------------- 136 | >>> import eda_report 137 | >>> eda_report.get_word_report(iris_data) 138 | Analyze variables: 100%|███████████████████████████████████| 5/5 139 | Plot variables: 100%|███████████████████████████████████| 5/5 140 | Bivariate analysis: 100%|███████████████████████████████████| 6/6 pairs. 141 | [INFO 16:14:53.648] Done. Results saved as 'eda-report.docx' 142 | 143 | 144 | eda_report.summarize 145 | -------------------- 146 | >>> eda_report.summarize(range(50)) 147 | 148 | Name: var_1 149 | Type: numeric 150 | Non-null Observations: 50 151 | Unique Values: 50 -> [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, [...] 
152 | Missing Values: None 153 | 154 | Summary Statistics 155 | ------------------ 156 | Average: 24.5000 157 | Standard Deviation: 14.5774 158 | Minimum: 0.0000 159 | Lower Quartile: 12.2500 160 | Median: 24.5000 161 | Upper Quartile: 36.7500 162 | Maximum: 49.0000 163 | Skewness: 0.0000 164 | Kurtosis: -1.2000 165 | 166 | Tests for Normality 167 | ------------------- 168 | p-value Conclusion at α = 0.05 169 | D'Agostino's K-squared test 0.0015981 Unlikely to be normal 170 | Kolmogorov-Smirnov test 0.0000000 Unlikely to be normal 171 | Shapiro-Wilk test 0.0580895 Possibly normal 172 | >>> eda_report.summarize(iris_data) 173 | 174 | Summary Statistics for Numeric features (4) 175 | ------------------------------------------- 176 | count avg stddev min 25% 50% 75% max skewness kurtosis 177 | sepal_length 150 5.8433 0.8281 4.3 5.1 5.80 6.4 7.9 0.3149 -0.5521 178 | sepal_width 150 3.0573 0.4359 2.0 2.8 3.00 3.3 4.4 0.3190 0.2282 179 | petal_length 150 3.7580 1.7653 1.0 1.6 4.35 5.1 6.9 -0.2749 -1.4021 180 | petal_width 150 1.1993 0.7622 0.1 0.3 1.30 1.8 2.5 -0.1030 -1.3406 181 | 182 | Summary Statistics for Categorical features (1) 183 | ----------------------------------------------- 184 | count unique top freq relative freq 185 | species 150 3 setosa 50 33.33% 186 | 187 | 188 | Pearson's Correlation (Top 20) 189 | ------------------------------ 190 | petal_length & petal_width -> very strong positive correlation (0.96) 191 | sepal_length & petal_length -> very strong positive correlation (0.87) 192 | sepal_length & petal_width -> very strong positive correlation (0.82) 193 | sepal_width & petal_length -> moderate negative correlation (-0.43) 194 | sepal_width & petal_width -> weak negative correlation (-0.37) 195 | sepal_length & sepal_width -> very weak negative correlation (-0.12) 196 | -------------------------------------------------------------------------------- /tests/test_bivariate_analysis.py: 
-------------------------------------------------------------------------------- 1 | import pytest 2 | from pandas import DataFrame 3 | 4 | from eda_report.bivariate import ( 5 | Dataset, 6 | _compute_correlation, 7 | _describe_correlation, 8 | ) 9 | 10 | sample_data = DataFrame( 11 | { 12 | "A": range(50), 13 | "B": list("abcdef") * 8 + ["a"] * 2, 14 | "C": [True, False] * 24 + [True] * 2, 15 | "D": [1, 3, 5, 7, 9, 11, 13] * 7 + [17], 16 | } 17 | ) 18 | 19 | 20 | def test_correlation_computation(): 21 | data = sample_data.copy() 22 | assert _compute_correlation(None) is None 23 | 24 | # Check that < 2 numeric cols returns None 25 | assert _compute_correlation(data[["A", "B"]]) is None 26 | 27 | # Check that only numeric columns are processed 28 | assert _compute_correlation(data) == pytest.approx( 29 | [(("A", "D"), 0.21019754169815516)] 30 | ) 31 | 32 | 33 | def test_correlation_description(): 34 | assert ( 35 | _describe_correlation(0.9) == "very strong positive correlation (0.90)" 36 | ) 37 | assert _describe_correlation(-0.7) == "strong negative correlation (-0.70)" 38 | assert _describe_correlation(0.5) == "moderate positive correlation (0.50)" 39 | assert _describe_correlation(-0.3) == "weak negative correlation (-0.30)" 40 | assert ( 41 | _describe_correlation(0.1) == "very weak positive correlation (0.10)" 42 | ) 43 | assert _describe_correlation(0.025) == "virtually no correlation (0.03)" 44 | 45 | 46 | class TestDataset: 47 | dataset = Dataset(sample_data.copy()) 48 | 49 | def test_stored_data(self): 50 | assert isinstance(self.dataset.data, DataFrame) 51 | 52 | def test_categorical_summary_statistics(self): 53 | assert self.dataset._categorical_stats.to_dict() == { 54 | "count": {"B": 50, "C": 50, "D": 50}, 55 | "unique": {"B": 6, "C": 2, "D": 8}, 56 | "top": {"B": "a", "C": True, "D": 1}, 57 | "freq": {"B": 10, "C": 26, "D": 7}, 58 | "relative freq": {"B": "20.00%", "C": "52.00%", "D": "14.00%"}, 59 | } 60 | 61 | def 
test_numeric_summary_statistics(self): 62 | assert self.dataset._numeric_stats.to_dict( 63 | orient="list" 64 | ) == pytest.approx( 65 | { 66 | "count": [50], 67 | "avg": [24.5], 68 | "stddev": [14.5774], 69 | "min": [0.0], 70 | "25%": [12.25], 71 | "50%": [24.5], 72 | "75%": [36.75], 73 | "max": [49.0], 74 | "skewness": [0.0], 75 | "kurtosis": [-1.2], 76 | } 77 | ) 78 | 79 | def test_correlation(self): 80 | assert self.dataset._correlation_values == pytest.approx( 81 | [(("A", "D"), 0.21019754169815516)] 82 | ) 83 | assert self.dataset._correlation_descriptions == { 84 | ("A", "D"): "weak positive correlation (0.21)" 85 | } 86 | 87 | def test_repr(self): 88 | assert str(self.dataset) == ( 89 | "\n\t\t Summary Statistics for Numeric features (1)\n\t\t ------" 90 | "-------------------------------------\n count avg stddev " 91 | " min 25% 50% 75% max skewness kurtosis\n A 50 " 92 | "24.5 14.5774 0.0 12.25 24.5 36.75 49.0 0.0 -1.2" 93 | "\n\n\t\tSummary Statistics for Categorical features (3)\n\t\t---" 94 | "--------------------------------------------\n\t count " 95 | "unique top freq relative freq\n\t B 50 6 a 10" 96 | " 20.00%\n\t C 50 2 True 26 52.00%\n" 97 | "\t D 50 8 1 7 14.00%\n\n\n\t\t\t" 98 | "Pearson's Correlation (Top 20)\n\t\t\t--------------------------" 99 | "----\n A & D -> weak positive " 100 | "correlation (0.21)\n\t" 101 | ) 102 | 103 | def test_numeric_only_repr(self): 104 | numeric_only = Dataset(sample_data[["A"]]) 105 | assert str(numeric_only) == ( 106 | "\n\t\t Summary Statistics for Numeric features (1)\n\t\t ------" 107 | "-------------------------------------\n count avg stddev " 108 | " min 25% 50% 75% max skewness kurtosis\n A 50 " 109 | "24.5 14.5774 0.0 12.25 24.5 36.75 49.0 0.0 -1.2" 110 | "\n\n\n\n\t" 111 | ) 112 | 113 | def test_categorical_only_repr(self, caplog: pytest.LogCaptureFixture): 114 | categorical_only = Dataset(sample_data[["B", "C"]]) 115 | assert ( 116 | "Skipped Bivariate Analysis: There are less than 2 numeric " 117 
| "variables." 118 | ) in str(caplog.text) 119 | assert str(categorical_only) == ( 120 | "\n\t\tSummary Statistics for Categorical features (2)\n\t\t-----" 121 | "------------------------------------------\n\t count unique" 122 | " top freq relative freq\n\t B 50 6 a 10 " 123 | " 20.00%\n\t C 50 2 True 26 52.00%\n\n\n\t" 124 | ) 125 | 126 | def test_correlation_info_truncation_(self): 127 | plenty_numeric = Dataset( 128 | DataFrame( 129 | { 130 | "A": range(11), 131 | "B": [0, 1, 2, 4, 5, 7, 8, 8, 9, 9, 4], 132 | "C": [0, 9, 2, 4, 5, 7, 8, 8, 9, 9, 1], 133 | "D": [2, 9, 2, 2, 4, 9, 8, 7, 9, 9, 3], 134 | "E": [2, 4, 2, 2, 4, 9, 2, 7, 4, 5, 8], 135 | "F": [9, 4, 2, 5, 3, 0, 2, 2, 4, 7, 6], 136 | "G": [9, 4, 9, 0, 3, 8, 7, 1, 9, 5, 2], 137 | } 138 | ) 139 | ) 140 | # In particular, only the top 20 correlation descriptions should be 141 | # displayed. 142 | assert len(plenty_numeric._correlation_descriptions) == 21 143 | assert str(plenty_numeric) == ( 144 | "\n\t\t Summary Statistics for Numeric features (1)\n\t\t ------" 145 | "-------------------------------------\n count avg stddev " 146 | "min 25% 50% 75% max skewness kurtosis\n A 11 5.0 " 147 | "3.3166 0.0 2.5 5.0 7.5 10.0 0.0 -1.2\n\n\t\t" 148 | "Summary Statistics for Categorical features (6)\n\t\t------------" 149 | "-----------------------------------\n\t count unique top " 150 | "freq relative freq\n\t B 11 8 4 2 18.18%" 151 | "\n\t C 11 8 9 3 27.27%\n\t D 11 " 152 | " 6 9 4 36.36%\n\t E 11 6 2 4 " 153 | " 36.36%\n\t F 11 8 2 3 27.27%\n\t G " 154 | " 11 9 9 3 27.27%\n\n\n\t\t\tPearson's " 155 | "Correlation (Top 20)\n\t\t\t------------------------------\n " 156 | " C & D -> very strong positive correlation" 157 | " (0.92)\n A & B -> strong positive " 158 | "correlation (0.78)\n B & C -> strong " 159 | "positive correlation (0.68)\n B & D ->" 160 | " strong positive correlation (0.64)\n " 161 | "A & E -> moderate positive correlation (0.57)\n " 162 | " C & F -> moderate negative correlation (-0.40)\n " 163 | 
" A & D -> weak positive correlation (0.38)\n" 164 | " D & E -> weak positive correlation " 165 | "(0.37)\n B & E -> weak positive " 166 | "correlation (0.36)\n B & F -> weak " 167 | "negative correlation (-0.35)\n D & F -" 168 | "> weak negative correlation (-0.35)\n " 169 | "A & C -> weak positive correlation (0.33)\n " 170 | " E & F -> weak negative correlation (-0.29)\n " 171 | " E & G -> weak negative correlation (-0.23)\n " 172 | " A & G -> weak negative correlation (-0.22)\n" 173 | " C & E -> very weak positive " 174 | "correlation (0.18)\n D & G -> very " 175 | "weak positive correlation (0.18)\n F &" 176 | " G -> very weak negative correlation (-0.06)\n " 177 | " A & F -> virtually no correlation (-0.05)\n " 178 | " B & G -> virtually no correlation (-0.04)\n\t" 179 | ) 180 | -------------------------------------------------------------------------------- /tests/test_univariate_analysis.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | from pandas import DataFrame, Timestamp, date_range 3 | 4 | from eda_report.univariate import Variable, _analyze_univariate 5 | 6 | 7 | class TestDtypeDetection: 8 | def test_bool_detection(self): 9 | boolean = Variable([True, False, True]) 10 | assert boolean.var_type == "boolean" 11 | 12 | def test_categorical_detection(self): 13 | categorical = Variable(list("abcdefg")) 14 | assert categorical.var_type == "categorical" 15 | 16 | def test_datetime_detection(self): 17 | datetime = Variable(date_range("2022-01-01", periods=5, freq="D")) 18 | assert datetime.var_type == "datetime" 19 | 20 | def test_numeric_detection(self): 21 | numeric = Variable(range(20)) 22 | assert numeric.var_type == "numeric" 23 | 24 | 25 | class TestGeneralVariableProperties: 26 | variable = Variable(list(range(20)) + [None], name="some-variable") 27 | unnamed_variable = Variable(list("ababdea")) 28 | 29 | def test_missing_values(self): 30 | assert self.variable.missing == "1 (4.76%)" 31 | 
assert self.variable._num_non_null == 20 32 | assert self.unnamed_variable.missing is None 33 | assert self.unnamed_variable._num_non_null == 7 34 | 35 | def test_name(self): 36 | assert self.variable.name == "some-variable" 37 | assert self.unnamed_variable.name is None 38 | 39 | def test_renaming(self): 40 | self.unnamed_variable.rename(name="new name") 41 | assert self.unnamed_variable.name == "new name" 42 | 43 | self.variable.rename("another new name") 44 | assert self.variable.name == "another new name" 45 | 46 | def test_unique_values(self): 47 | assert self.variable.num_unique == 20 48 | assert self.variable.unique_values == pytest.approx(list(range(20))) 49 | 50 | assert self.unnamed_variable.num_unique == 4 51 | assert all(self.unnamed_variable.unique_values == list("abde")) 52 | 53 | 54 | class TestCategoricalVariables: 55 | categorical_variable = Variable(["a", "b", "c", "d", None, "a"]) 56 | # Numeric variables with less than 10 unique values are treated as 57 | # categorical. 
58 | numeric_categories = Variable([1, 2, 3] * 10) 59 | 60 | def test_variable_type(self): 61 | assert self.categorical_variable.var_type == "categorical" 62 | assert self.numeric_categories.var_type == "numeric (<=10 levels)" 63 | 64 | def test_summary_statistics(self): 65 | assert self.categorical_variable.summary_stats == { 66 | "Mode (Most frequent)": "a", 67 | "Maximum frequency": 2, 68 | } 69 | assert self.categorical_variable._most_common_categories == { 70 | "a": "2 (40.00%)", 71 | "b": "1 (20.00%)", 72 | "c": "1 (20.00%)", 73 | "d": "1 (20.00%)", 74 | } 75 | 76 | def test_normality_results(self): 77 | assert self.categorical_variable._normality_test_results is None 78 | assert self.numeric_categories._normality_test_results is None 79 | 80 | def test_repr(self): 81 | assert str(self.categorical_variable) == ( 82 | "\nName: None\nType: categorical\nNon-null Observations: 5" 83 | "\nUnique Values: 4 -> ['a' 'b' 'c' 'd']\nMissing Values: " 84 | "1 (16.67%)\nMode (Most frequent): a\nMaximum frequency: 2" 85 | "\n\n\t\tMost Common Items\n\t\t-----------------\n " 86 | " a: 2 (40.00%)\n b: " 87 | "1 (20.00%)\n c: 1 (20.00%)\n " 88 | " d: 1 (20.00%)" 89 | ) 90 | 91 | 92 | class TestBooleanVariables: 93 | # Boolean variables are treated as categorical. Only the var_type differs. 
94 | boolean_variable = Variable([True, False, True, None] * 5) 95 | numeric_bool = Variable([1, 0, 1, None] * 5) 96 | str_bool_1 = Variable(["Yes", "No", "Yes"] * 5) 97 | str_bool_2 = Variable(["Y", "N", "Y"] * 5) 98 | 99 | def test_dtype(self): 100 | assert self.boolean_variable.var_type == "boolean" 101 | assert self.numeric_bool.var_type == "boolean" 102 | assert self.str_bool_1.var_type == "boolean" 103 | assert self.str_bool_2.var_type == "boolean" 104 | 105 | 106 | class TestDateTimeVariables: 107 | datetime_variable = Variable( 108 | date_range("01-01-2022", periods=10, freq="D"), name="dates" 109 | ) 110 | 111 | def test_variable_type(self): 112 | assert self.datetime_variable.var_type == "datetime" 113 | 114 | def test_summary_statistics(self): 115 | assert self.datetime_variable.summary_stats == { 116 | "Average": Timestamp("2022-01-05 12:00:00"), 117 | "Minimum": Timestamp("2022-01-01 00:00:00"), 118 | "Lower Quartile": Timestamp("2022-01-03 06:00:00"), 119 | "Median": Timestamp("2022-01-05 12:00:00"), 120 | "Upper Quartile": Timestamp("2022-01-07 18:00:00"), 121 | "Maximum": Timestamp("2022-01-10 00:00:00"), 122 | } 123 | assert self.datetime_variable._most_common_categories is None 124 | 125 | def test_normality_results(self): 126 | assert self.datetime_variable._normality_test_results is None 127 | 128 | def test_repr(self): 129 | assert str(self.datetime_variable) == ( 130 | "\nName: dates\nType: datetime\nNon-null Observations: 10\n" 131 | "Unique Values: 10 -> ['2022-01-01T00:00:00.000000000' ... 
" 132 | "]\nMissing Values: None\n\n\t\t Summary Statistics\n\t\t " 133 | " ------------------\n\tAverage: 2022-01-05 12" 134 | ":00:00\n\tMinimum: 2022-01-01 00:00:00\n\t" 135 | "Lower Quartile: 2022-01-03 06:00:00\n\tMedian: " 136 | " 2022-01-05 12:00:00\n\tUpper Quartile: 2022" 137 | "-01-07 18:00:00\n\tMaximum: 2022-01-10 00:00:00" 138 | ) 139 | 140 | 141 | class TestNumericVariable: 142 | numeric_variable = Variable(data=range(50), name="1 to 50") 143 | 144 | def test_variable_type(self): 145 | assert self.numeric_variable.var_type == "numeric" 146 | 147 | def test_summary_statistics(self): 148 | assert self.numeric_variable.summary_stats == pytest.approx( 149 | { 150 | "Average": 24.5, 151 | "Standard Deviation": 14.577379737113251, 152 | "Minimum": 0.0, 153 | "Lower Quartile": 12.25, 154 | "Median": 24.5, 155 | "Upper Quartile": 36.75, 156 | "Maximum": 49.0, 157 | "Skewness": 0.0, 158 | "Kurtosis": -1.2, 159 | } 160 | ) 161 | assert self.numeric_variable._most_common_categories is None 162 | 163 | def test_normality_results(self): 164 | assert isinstance( 165 | self.numeric_variable._normality_test_results, DataFrame 166 | ) 167 | assert self.numeric_variable._normality_test_results.to_dict() == { 168 | "p-value": { 169 | "D'Agostino's K-squared test": "0.0015981", 170 | "Kolmogorov-Smirnov test": "0.0000000", 171 | "Shapiro-Wilk test": "0.0580919", 172 | }, 173 | "Conclusion at α = 0.05": { 174 | "D'Agostino's K-squared test": "Unlikely to be normal", 175 | "Kolmogorov-Smirnov test": "Unlikely to be normal", 176 | "Shapiro-Wilk test": "Possibly normal", 177 | }, 178 | } 179 | 180 | def test_repr(self): 181 | assert str(self.numeric_variable) == ( 182 | "\nName: 1 to 50\nType: numeric\nNon-null Observations: 50" 183 | "\nUnique Values: 50 -> [ 0 1 2 3 4 5 6 7 8 9 10 11 12 13 1" 184 | "4 15 16 17 18 ... 
]\nMissing Values: None\n\n\t\t Summary" 185 | " Statistics\n\t\t ------------------\n\tAverage: " 186 | " 24.5000\n\tStandard Deviation: 14.5" 187 | "774\n\tMinimum: 0.0000\n\tLower Quar" 188 | "tile: 12.2500\n\tMedian: " 189 | " 24.5000\n\tUpper Quartile: 36.7500\n\tMax" 190 | "imum: 49.0000\n\tSkewness: " 191 | " 0.0000\n\tKurtosis: -1.2000" 192 | "\n\n\t\t Tests for Normality\n\t\t -------------------\n" 193 | " p-value Conclusion at α = 0" 194 | ".05\nD'Agostino's K-squared test 0.0015981 Unlikely to " 195 | "be normal\nKolmogorov-Smirnov test 0.0000000 Unlikel" 196 | "y to be normal\nShapiro-Wilk test 0.0580919 " 197 | " Possibly normal" 198 | ) 199 | 200 | 201 | def test_analyse_variable(): 202 | name, variable = _analyze_univariate(("wantufifty", range(50))) 203 | 204 | assert name == "wantufifty" 205 | assert isinstance(variable, Variable) 206 | -------------------------------------------------------------------------------- /eda_report/gui.py: -------------------------------------------------------------------------------- 1 | import pkgutil 2 | 3 | try: 4 | from tkinter import Button, Canvas, Frame, Label, PhotoImage, StringVar 5 | from tkinter.colorchooser import askcolor 6 | from tkinter.filedialog import askopenfilename, asksaveasfilename 7 | from tkinter.messagebox import ( 8 | askretrycancel, 9 | askyesno, 10 | showinfo, 11 | showwarning, 12 | ) 13 | from tkinter.simpledialog import askstring 14 | except (ImportError, ModuleNotFoundError) as error: 15 | print( 16 | f"Unable to lauch app window because:\n\n\t* {error}.\n\n" 17 | "Please visit https://tkdocs.com/tutorial/install.html for" 18 | " help installing it.\n\nYou can still use the eda-report command. 
" 19 | "Try 'eda-report -h' for more details.\n" 20 | ) 21 | exit() 22 | 23 | from eda_report._read_file import df_from_file 24 | from eda_report._validate import _validate_groupby_variable 25 | from eda_report.document import ReportDocument 26 | from eda_report.exceptions import GroupbyVariableError 27 | 28 | background_image = pkgutil.get_data(__name__, "images/background.png") 29 | icon = pkgutil.get_data(__name__, "images/icon.png") 30 | 31 | description = ( 32 | "Speed up exploratory data analysis & reporting.\n\n" 33 | "Automatically analyze files, and get a Word report complete with " 34 | "summary statistics and graphs." 35 | ) 36 | 37 | 38 | class EDAGUI(Frame): # pragma: no cover 39 | """The blueprint for the :mod:`tkinter` - based *graphical user 40 | interface* to the application. 41 | 42 | .. figure:: _static/screencast.* 43 | :alt: an image of the graphical user interface 44 | 45 | The "Select a file" button launches a *file-dialog* to navigate to and 46 | select a file to analyze. 47 | 48 | If a valid file is selected, *text-input widgets* and a *color-picker 49 | tool* pop up to help set the report's *title*, 50 | *target/groupby variable(optional)* and *graph color*. 51 | 52 | Afterwards, a final file-dialog appears to help set the destination 53 | for the generated report. 54 | 55 | .. tip:: 56 | For help with `Tk` - related issues, consider visiting `TkDocs`_. 57 | 58 | .. 
_`TkDocs`: https://tkdocs.com/index.html 59 | """ 60 | 61 | def __init__(self, master=None, **kwargs) -> None: 62 | super().__init__(master) 63 | self.master.title("eda-report") 64 | self.master.geometry("560x320") 65 | self.master.resizable(False, False) # Fix window size 66 | self.master.wm_iconphoto(True, PhotoImage(data=icon)) 67 | self._create_widgets() 68 | self.pack() 69 | 70 | def _create_widgets(self) -> None: 71 | """Creates the widgets for the graphical user interface: A Tk *Frame* 72 | with the *canvas(background image)*, *introductory text*, and a 73 | *button* to select files to analyze. 74 | """ 75 | self.canvas = Canvas(self, width=560, height=320) 76 | # Set background image 77 | self.bg_image = PhotoImage(data=background_image) 78 | self.canvas.create_image((0, 0), image=self.bg_image, anchor="nw") 79 | # Add title 80 | self.canvas.create_text( 81 | (70, 30), 82 | anchor="nw", 83 | fill="black", 84 | font=("Courier", 28, "bold"), 85 | text="eda-report", 86 | ) 87 | # Add description 88 | self.canvas.create_text( 89 | (40, 90), 90 | anchor="nw", 91 | fill="black", 92 | font=("Courier", 12), 93 | text=description, 94 | width=480, 95 | ) 96 | # Add a button to select input file 97 | self.button = Button( 98 | self, 99 | bg="#204060", 100 | command=self._create_report, 101 | default="active", 102 | fg="white", 103 | font=("Courier", 11), 104 | relief="flat", 105 | text="Select a file", 106 | ) 107 | self.canvas.create_window( 108 | (180, 220), anchor="nw", height=40, width=200, window=self.button 109 | ) 110 | # Display current action 111 | self.current_action = StringVar() 112 | self.display_current_action = Label( 113 | self, 114 | bg="#c0d6e3", 115 | font=("Courier", 10, "italic"), 116 | textvariable=self.current_action, 117 | ) 118 | self.canvas.create_window( 119 | (140, 280), 120 | anchor="nw", 121 | window=self.display_current_action, 122 | ) 123 | self.canvas.pack() 124 | 125 | def _create_report(self) -> None: 126 | """Collects input from the 
graphical user interface, and uses the 127 | :class:`~eda_report.document.ReportDocument` object to generate a 128 | report. 129 | """ 130 | self.current_action.set("Waiting for input file...") 131 | self._get_data_from_file() 132 | 133 | if self.data is not None: 134 | self.current_action.set("Waiting for report title...") 135 | self._get_report_title() 136 | 137 | self.current_action.set("Waiting for group-by variable...") 138 | self._get_groupby_variable() 139 | 140 | self.current_action.set("Waiting for graph color...") 141 | self._get_graph_color() 142 | 143 | self.current_action.set("Analysing data & compiling the report...") 144 | self._get_save_as_name() 145 | 146 | # Generate and save the report using the collected arguments 147 | ReportDocument( 148 | self.data, 149 | title=self.report_title, 150 | graph_color=self.graph_color, 151 | output_filename=self.save_name, 152 | groupby_variable=self.groupby_variable, 153 | ) 154 | self.current_action.set("") 155 | showinfo(message=f"Done! Report saved as {self.save_name!r}.") 156 | 157 | # Clear data to free up memory 158 | del self.data 159 | 160 | def _get_data_from_file(self, retries: int = 1) -> None: 161 | """Creates a file dialog to help navigate to and select a file to 162 | analyze. 163 | 164 | Args: 165 | retries (int, optional): Number of additional prompts, if input is 166 | invalid. 
167 | """ 168 | file_name = askopenfilename( 169 | title="Select a file to analyze", 170 | filetypes=( 171 | ("All supported formats", ("*.csv", "*.xlsx")), 172 | ("csv", "*.csv"), 173 | ("excel", "*.xlsx"), 174 | ), 175 | ) 176 | if file_name: 177 | self.data = df_from_file(file_name) 178 | elif retries > 0: 179 | if askretrycancel(message="Please select a file to continue"): 180 | self._get_data_from_file(retries - 1) 181 | else: 182 | # No data if retry prompt is cancelled 183 | self.data = None 184 | else: 185 | # No data if no file is selected and retry has been used up 186 | self.data = None 187 | 188 | def _get_report_title(self) -> None: 189 | """Capture text input for the desired report title.""" 190 | report_title = askstring( 191 | title="Report Title", 192 | prompt="Please enter your preferred title for the report:", 193 | initialvalue="Exploratory Data Analysis Report", 194 | ) 195 | self.report_title = report_title or "Exploratory Data Analysis Report" 196 | 197 | def _get_groupby_variable(self) -> None: 198 | """Inquire about the groupby variable, and create a text box to 199 | collect input. 200 | """ 201 | if askyesno( 202 | message="Would you like to specify a variable to group by?" 203 | ): 204 | self.groupby_variable = askstring( 205 | title="Select Group-by Variable", 206 | prompt="Please enter the name/index of the group-by variable:", 207 | ) 208 | try: 209 | _validate_groupby_variable( 210 | data=self.data, groupby_variable=self.groupby_variable 211 | ) 212 | except GroupbyVariableError as error: 213 | self.groupby_variable = None 214 | showwarning( 215 | title="Invalid Group-By Variable", message=error.message 216 | ) 217 | else: 218 | self.groupby_variable = None 219 | 220 | def _get_graph_color(self) -> None: 221 | """Creates a graphical color picking tool to help set the desired 222 | color for the generated graphs. 
223 | """ 224 | color = askcolor( 225 | color="cyan", title="Please select a color for the graphs" 226 | ) 227 | # Pick the hexadecimal color format. `askcolor` returns a tuple e.g 228 | # ((255.99609375, 69.26953125, 0.0), '#ff4500'). 229 | self.graph_color = color[-1] or "cyan" 230 | 231 | def _get_save_as_name(self) -> None: 232 | """Create a file dialog to set destination of the generated report.""" 233 | save_name = asksaveasfilename( 234 | initialdir=".", 235 | initialfile="eda-report.docx", 236 | filetypes=(("Word document", "*.docx"),), 237 | title="Please select Save As file name", 238 | ) 239 | self.save_name = save_name or "eda-report.docx" 240 | -------------------------------------------------------------------------------- /eda_report/univariate.py: -------------------------------------------------------------------------------- 1 | from collections.abc import Iterable 2 | from textwrap import shorten 3 | from typing import Dict, Optional, Tuple 4 | 5 | import numpy as np 6 | from pandas import DataFrame, Series 7 | from pandas.api.types import ( 8 | is_bool_dtype, 9 | is_datetime64_any_dtype, 10 | is_numeric_dtype, 11 | ) 12 | from scipy import stats 13 | 14 | from eda_report._validate import _validate_univariate_input 15 | 16 | 17 | class Variable: 18 | """Obtain summary statistics and properties such as data type, missing 19 | value info & cardinality from one-dimensional datasets. 20 | 21 | Args: 22 | data (Iterable): The data to analyze. 23 | name (str, optional): The name to assign the variable. Defaults to 24 | None. 25 | 26 | Examples: 27 | .. literalinclude:: examples.txt 28 | :lines: 6-32 29 | .. literalinclude:: examples.txt 30 | :lines: 36-50 31 | .. literalinclude:: examples.txt 32 | :lines: 54-71 33 | """ 34 | 35 | def __init__(self, data: Iterable, *, name: str = None) -> None: 36 | data = _validate_univariate_input(data, name=name) 37 | 38 | #: str: The variable's *name*. 
If no name is specified, the name will 39 | #: be set the value of the ``name`` attribute of the input data, or 40 | #: ``None``. 41 | self.name = data.name 42 | 43 | #: str: The type of variable — one of *"boolean"*, *"categorical"*, 44 | #: *"datetime"*, *"numeric"* or *"numeric (<=10 levels)"*. 45 | self.var_type = self._get_variable_type(data) 46 | 47 | #: int: The *number of unique values* present in the variable. 48 | self.num_unique = data.nunique() 49 | 50 | #: numpy.ndarray: The *unique values* present in the variable. 51 | self.unique_values = np.sort(data.dropna().unique()) 52 | 53 | #: str: The number of *missing values* in the form 54 | #: ``number (% of total count)`` e.g "4 (16.67%)". 55 | self.missing = self._get_missing_values_info(data) 56 | 57 | #: dict: Descriptive statistics 58 | self.summary_stats = self._get_summary_statistics(data) 59 | 60 | self._num_non_null = len(data.dropna()) 61 | self._normality_test_results = self._test_for_normality(data) 62 | self._most_common_categories = self._get_most_common_categories(data) 63 | 64 | def __repr__(self) -> str: 65 | """Define the string representation of a `Variable`. 66 | 67 | Returns: 68 | str: Variable summary. 69 | """ 70 | sample_values = shorten( 71 | f"{self.num_unique} -> {self.unique_values}", 72 | width=60, 73 | placeholder=" ... 
]", 74 | ) 75 | basic_details = "\n".join( 76 | [ 77 | f"\nName: {self.name}", 78 | f"Type: {self.var_type}", 79 | f"Non-null Observations: {self._num_non_null}", 80 | f"Unique Values: {sample_values}", 81 | f"Missing Values: {self.missing}", 82 | ] 83 | ) 84 | if self.var_type == "numeric": 85 | summary_stats = "\n".join( 86 | [ 87 | f"\t{key + ':':21} {value :>15.4f}" 88 | for key, value in self.summary_stats.items() 89 | ], 90 | ) 91 | return "\n".join( 92 | [ 93 | f"{basic_details}\n", 94 | "\t\t Summary Statistics", 95 | "\t\t ------------------", 96 | summary_stats, 97 | "\n\t\t Tests for Normality", 98 | "\t\t -------------------", 99 | f"{self._normality_test_results}", 100 | ] 101 | ) 102 | elif self.var_type == "datetime": 103 | summary_stats = "\n".join( 104 | [ 105 | f"\t{key + ':':18} {str(value):>22}" 106 | for key, value in self.summary_stats.items() 107 | ], 108 | ) 109 | return "\n".join( 110 | [ 111 | f"{basic_details}\n", 112 | "\t\t Summary Statistics", 113 | "\t\t ------------------", 114 | summary_stats, 115 | ] 116 | ) 117 | else: 118 | summary_stats = "\n".join( 119 | [ 120 | f"{key}: {value}" 121 | for key, value in self.summary_stats.items() 122 | ] 123 | ) 124 | most_common = "\n".join( 125 | [ 126 | f"{str(key):>24}: {value}" 127 | for key, value in self._most_common_categories.items() 128 | ] 129 | ) 130 | return "\n".join( 131 | [ 132 | basic_details, 133 | summary_stats, 134 | "\n\t\tMost Common Items", 135 | "\t\t-----------------", 136 | most_common, 137 | ] 138 | ) 139 | 140 | def _get_variable_type(self, data: Series) -> str: 141 | """Determine the variable type. 142 | 143 | Args: 144 | data (pandas.Series): The data to analyze. 145 | 146 | Returns: 147 | str: The variable type: `boolean`, `categorical`, `datetime`, 148 | `numeric` or `numeric (<10 levels)`. 
149 | """ 150 | if is_numeric_dtype(data): 151 | if is_bool_dtype(data) or set(data.dropna()) == {0, 1}: 152 | # Consider data consisting of ones and zeros as boolean 153 | return "boolean" 154 | elif data.nunique() <= 10: 155 | # Consider numeric data with cardinality <= 10 as categorical 156 | return "numeric (<=10 levels)" 157 | else: 158 | return "numeric" 159 | # Accomodate common values for boolean variables 160 | elif set(data.dropna()) in [ 161 | {False, True}, 162 | {"False", "True"}, 163 | {"No", "Yes"}, 164 | {"N", "Y"}, 165 | ]: 166 | return "boolean" 167 | elif is_datetime64_any_dtype(data): 168 | return "datetime" 169 | else: 170 | return "categorical" 171 | 172 | def _get_missing_values_info(self, data: Series) -> Optional[str]: 173 | """Get the number of missing values. 174 | 175 | Args: 176 | data (pandas.Series): The data to analyze. 177 | 178 | Returns: 179 | Optional[str]: Details about the number of missing values. 180 | """ 181 | missing_values = data.isna().sum() 182 | if missing_values == 0: 183 | return None 184 | else: 185 | return f"{missing_values:,} ({missing_values / len(data):.2%})" 186 | 187 | def _get_summary_statistics(self, data: Series) -> Dict: 188 | """Compute summary statistics for the variable based on data type. 189 | 190 | Args: 191 | data (pandas.Series): The data to analyze. 192 | 193 | Returns: 194 | Dict: Summary statistics. 
195 | """ 196 | if self.var_type == "numeric": 197 | stats = data.describe() 198 | return { 199 | "Average": stats["mean"], 200 | "Standard Deviation": stats["std"], 201 | "Minimum": stats["min"], 202 | "Lower Quartile": stats["25%"], 203 | "Median": stats["50%"], 204 | "Upper Quartile": stats["75%"], 205 | "Maximum": stats["max"], 206 | "Skewness": data.skew(), 207 | "Kurtosis": data.kurt(), 208 | } 209 | elif self.var_type == "datetime": 210 | stats = data.describe() 211 | return { 212 | "Average": stats["mean"], 213 | "Minimum": stats["min"], 214 | "Lower Quartile": stats["25%"], 215 | "Median": stats["50%"], 216 | "Upper Quartile": stats["75%"], 217 | "Maximum": stats["max"], 218 | } 219 | else: 220 | data = data.copy().astype("category") 221 | stats = data.describe() 222 | return { 223 | "Mode (Most frequent)": stats["top"], 224 | "Maximum frequency": stats["freq"], 225 | } 226 | 227 | def _test_for_normality( 228 | self, data: Series, alpha: float = 0.05 229 | ) -> DataFrame: 230 | """Perform the "D'Agostino's K-squared", "Kolmogorov-Smirnov" and 231 | "Shapiro-Wilk" tests for normality. 232 | 233 | Args: 234 | data (pandas.Series): The data to analyze. 235 | alpha (float, optional): The level of significance. Defaults to 236 | 0.05. 237 | 238 | Returns: 239 | pandas.DataFrame: Table of results. 240 | """ 241 | data = data.dropna() 242 | if self.var_type == "numeric": 243 | # The scipy implementation of the Shapiro-Wilk test reports: 244 | # "For N > 5000 the W test statistic is accurate but the p-value 245 | # may not be." 
246 | shapiro_sample = data.sample(5000) if len(data) > 5000 else data 247 | tests = [ 248 | "D'Agostino's K-squared test", 249 | "Kolmogorov-Smirnov test", 250 | "Shapiro-Wilk test", 251 | ] 252 | p_values = [ 253 | stats.normaltest(data).pvalue, 254 | stats.kstest(data, "norm", N=200).pvalue, 255 | stats.shapiro(shapiro_sample).pvalue, 256 | ] 257 | results = DataFrame(index=tests) 258 | results["p-value"] = [f"{x:.7f}" for x in p_values] 259 | results[f"Conclusion at α = {alpha}"] = [ 260 | "Possibly normal" 261 | if p_value > alpha 262 | else "Unlikely to be normal" 263 | for p_value in p_values 264 | ] 265 | return results 266 | else: 267 | return None 268 | 269 | def _get_most_common_categories(self, data: Series) -> Dict: 270 | """Get the top 10 frequently occuring categories. 271 | 272 | Args: 273 | data (pandas.Series): The data to analyze. 274 | 275 | Returns: 276 | Dict: Top 10 categories and their frequency info. 277 | """ 278 | data = data.dropna() 279 | if self.var_type in {"numeric", "datetime"}: 280 | return None 281 | else: 282 | top_10 = data.value_counts().nlargest(10) 283 | return { 284 | key: f"{val} ({val/len(data):.2%})" 285 | for key, val in top_10.items() 286 | } 287 | 288 | def rename(self, name: str) -> None: 289 | """Update the variable's name. 290 | 291 | Args: 292 | name (str): New name. 293 | """ 294 | self.name = name 295 | 296 | 297 | def _analyze_univariate(name_and_data: Tuple) -> Variable: 298 | """Helper function to concurrently analyze data with multiprocessing. 299 | 300 | Args: 301 | name_and_data (Tuple): Name and data. 302 | 303 | Returns: 304 | Variable: `Variable` instance. 
305 | """ 306 | name, data = name_and_data 307 | var = Variable(data, name=name) 308 | return name, var 309 | -------------------------------------------------------------------------------- /eda_report/document.py: -------------------------------------------------------------------------------- 1 | import logging 2 | from typing import Iterable, Sequence, Union 3 | 4 | from docx import Document 5 | from docx.enum.text import WD_ALIGN_PARAGRAPH 6 | from docx.shared import Inches, Pt 7 | from docx.text.paragraph import Paragraph 8 | from pandas import DataFrame, Series 9 | 10 | from eda_report._content import _ReportContent 11 | 12 | logging.basicConfig( 13 | format="[%(levelname)s %(asctime)s.%(msecs)03d] %(message)s", 14 | level=logging.INFO, 15 | datefmt="%H:%M:%S", 16 | ) 17 | # Set matplotlib logging level to WARNING. 18 | mpl_logger = logging.getLogger("matplotlib") 19 | mpl_logger.setLevel(logging.WARNING) 20 | 21 | 22 | class ReportDocument(_ReportContent): 23 | """Creates a report :class:`~docx.document.Document` with analysis results. 24 | 25 | The report consists of 3 main sections: 26 | 27 | #. An **Overview** of the data and its features. 28 | #. **Univariate Analysis**: Summary statistics and graphs for each feature. 29 | #. **Bivariate Analysis**: Pair-wise comparisons of numerical features. 30 | 31 | Args: 32 | data (Iterable): The data to analyze. 33 | title (str, optional): The title to assign the report. Defaults to 34 | "Exploratory Data Analysis Report". 35 | graph_color (str, optional): The color to apply to the graphs. 36 | Defaults to "cyan". 37 | groupby_variable (Union[str, int], optional): The column to 38 | use to group values. Defaults to None. 39 | output_filename (str, optional): The name/path to save the document 40 | to. Defaults to "eda-report.docx". 41 | table_style (str, optional): The style to apply to the tables created. 42 | Defaults to "Table Grid". 
43 | """ 44 | 45 | def __init__( 46 | self, 47 | data: Iterable, 48 | *, 49 | title: str = "Exploratory Data Analysis Report", 50 | graph_color: str = "cyan", 51 | groupby_variable: Union[str, int] = None, 52 | output_filename: str = "eda-report.docx", 53 | table_style: str = "Table Grid", 54 | ) -> None: 55 | super().__init__( 56 | data, 57 | title=title, 58 | graph_color=graph_color, 59 | groupby_variable=groupby_variable, 60 | ) 61 | self.OUTPUT_FILENAME = output_filename 62 | self.TABLE_STYLE = table_style 63 | self.document = Document() # Initialize report document 64 | self._create_cover_page() 65 | self._get_univariate_analysis() 66 | 67 | if self.dataset._correlation_values is not None: 68 | self._get_bivariate_analysis() 69 | 70 | self._to_file() 71 | logging.info(f"Done. Results saved as {self.OUTPUT_FILENAME!r}") 72 | 73 | def _create_cover_page(self) -> None: 74 | """Add a title and overview of the data.""" 75 | self.document.add_heading(self.TITLE, level=0) 76 | self.document.add_paragraph(self.intro_text) 77 | self._get_numeric_overview_table() 78 | self._get_categorical_overview_table() 79 | self.document.add_page_break() 80 | 81 | def _get_numeric_overview_table(self) -> None: 82 | """Create a table with an overview of the numeric features present.""" 83 | if self.dataset._numeric_stats is None: 84 | return None 85 | else: 86 | heading = self.document.add_heading( 87 | "Overview of Numeric Features", level=1 88 | ) 89 | self._format_paragraph_spacing(heading) 90 | # count | avg | stddev | min | 25% | 50% | 75% | max 91 | self._create_table( 92 | data=self.dataset._numeric_stats, 93 | header=True, 94 | column_widths=(1.2,) + (0.7,) * 8, 95 | font_size=8.5, 96 | style="Normal Table", 97 | ) 98 | 99 | def _get_categorical_overview_table(self) -> None: 100 | """Create a table with an overview of the categorical features 101 | present. 
102 | """ 103 | if self.dataset._categorical_stats is None: 104 | return None 105 | else: 106 | heading = self.document.add_heading( 107 | "Overview of Categorical Features", level=1 108 | ) 109 | self._format_paragraph_spacing(heading) 110 | # column-name | count | unique | top | freq | relative freq 111 | self._create_table( 112 | data=self.dataset._categorical_stats, 113 | header=True, 114 | column_widths=(1.2,) + (0.9,) * 5, 115 | font_size=8.5, 116 | style="Normal Table", 117 | ) 118 | 119 | def _get_univariate_analysis(self) -> None: 120 | """Get a brief introduction, summary statistics, and graphs for each 121 | individual variable. 122 | """ 123 | univariate_heading = self.document.add_heading( 124 | "1. Univariate Analysis", level=1 125 | ) 126 | self._format_paragraph_spacing(univariate_heading, before=0, after=0) 127 | for idx, variable in enumerate(self.variables.values(), start=1): 128 | var_name = variable.name 129 | description = self.variable_descriptions[var_name] 130 | summary_stats = Series(self.univariate_stats[var_name]).to_frame() 131 | graphs = self.univariate_graphs[var_name] 132 | contingency_table = self.contingency_tables.get(var_name) 133 | normality_tests = self.normality_tests.get(var_name) 134 | # Variable's title and brief description 135 | heading = self.document.add_heading( 136 | f"1.{idx} {var_name}".title(), level=2 137 | ) 138 | self._format_paragraph_spacing(heading, before=12, after=5) 139 | self.document.add_paragraph(description) 140 | # Summary statistics table 141 | stats_heading = self.document.add_heading( 142 | "Summary Statistics", level=4 143 | ) 144 | stats_heading.alignment = WD_ALIGN_PARAGRAPH.CENTER 145 | self._create_table(summary_stats, column_widths=[2.5, 2]) 146 | # Images of plotted graphs 147 | for name, image in graphs.items(): 148 | width = 3.3 if name == "prob_plot" else 4.2 149 | self.document.add_picture(image, width=Inches(width)) 150 | picture_paragraph = self.document.paragraphs[-1] 151 | 
picture_paragraph.alignment = WD_ALIGN_PARAGRAPH.CENTER 152 | 153 | if contingency_table is not None: 154 | contingency_table_heading = self.document.add_heading( 155 | "Contingency table", level=4 156 | ) 157 | contingency_table_heading.alignment = WD_ALIGN_PARAGRAPH.CENTER 158 | context = self.document.add_paragraph( 159 | f"Index = '{var_name}', " 160 | f"Columns = '{self.GROUPBY_DATA.name}' " 161 | ) 162 | context.alignment = WD_ALIGN_PARAGRAPH.CENTER 163 | context.runs[0].font.size = Pt(8) 164 | n_cols = contingency_table.shape[1] 165 | max_width = 5.2 if n_cols > 5 else 3.2 166 | col_width = max_width / n_cols 167 | self._create_table( 168 | data=contingency_table, 169 | header=True, 170 | column_widths=(1.2,) + (col_width,) * n_cols, 171 | font_size=8.5, 172 | ) 173 | 174 | if normality_tests is not None: 175 | norm_test_heading = self.document.add_heading( 176 | "Tests for Normality", level=4 177 | ) 178 | norm_test_heading.alignment = WD_ALIGN_PARAGRAPH.CENTER 179 | # type | p-value | conclusion 180 | self._create_table( 181 | data=normality_tests, 182 | header=True, 183 | column_widths=(2.2, 1, 2), 184 | font_size=8.5, 185 | style="Normal Table", 186 | ) 187 | 188 | self.document.add_page_break() 189 | 190 | def _get_bivariate_analysis(self) -> None: 191 | """Get comparisons and regression-plots for pairs of numeric 192 | variables. 193 | """ 194 | bivariate_heading = self.document.add_heading( 195 | "2. 
Bivariate Analysis", level=1 196 | ) 197 | self._format_paragraph_spacing(bivariate_heading, before=0) 198 | overview_heading = self.document.add_heading("2.1 Overview", level=2) 199 | self._format_paragraph_spacing(overview_heading) 200 | self.document.add_picture( 201 | self.bivariate_graphs["correlation_plot"], 202 | width=Inches(6.4), 203 | ) 204 | picture_paragraph = self.document.paragraphs[-1] 205 | picture_paragraph.alignment = WD_ALIGN_PARAGRAPH.CENTER 206 | self.document.add_page_break() 207 | 208 | pairwise_heading = self.document.add_heading( 209 | "2.2 Regression Plots (Top 20)", level=2 210 | ) 211 | self._format_paragraph_spacing(pairwise_heading, before=0) 212 | for idx, var_pair in enumerate(self.bivariate_summaries, start=1): 213 | heading = self.document.add_heading( 214 | f"2.2.{idx} {var_pair[0]} vs {var_pair[1]}".title(), level=3 215 | ) 216 | self._format_paragraph_spacing(heading, before=16, after=5) 217 | self.document.add_paragraph(self.bivariate_summaries[var_pair]) 218 | self.document.add_picture( 219 | self.bivariate_graphs["regression_plots"][var_pair], 220 | width=Inches(3.3), 221 | ) 222 | picture_paragraph = self.document.paragraphs[-1] 223 | picture_paragraph.alignment = WD_ALIGN_PARAGRAPH.CENTER 224 | 225 | def _format_paragraph_spacing( 226 | self, paragraph: Paragraph, before: int = 15, after: int = 7 227 | ) -> None: 228 | """Set the spacing above or below a paragraph. 229 | 230 | Args: 231 | paragraph (docx.text.paragraph.Paragraph): A paragraph. 232 | before (int, optional): Size of spacing above the paragraph in pt. 233 | Defaults to 15. 234 | after (int, optional): Size of spacing below the paragraph in pt. 235 | Defaults to 7. 
236 | """ 237 | paragraph.paragraph_format.space_before = Pt(before) 238 | paragraph.paragraph_format.space_after = Pt(after) 239 | 240 | def _create_table( 241 | self, 242 | data: DataFrame, 243 | column_widths: Sequence = (), 244 | font_face: str = "Courier New", 245 | font_size: float = 10, 246 | style: str = None, 247 | header: bool = False, 248 | ) -> None: 249 | """Generates a table for the supplied ``data``. 250 | 251 | Args: 252 | data (DataFrame): The data to tabulate. 253 | column_widths (Sequence, optional): Column dimensions in inches. 254 | Defaults to (). 255 | font_face (str, optional): Font for cell text. Defaults to 256 | "Courier New". 257 | font_size (float, optional): Font size. Defaults to 10. 258 | style (str, optional): A `Word` table style. Defaults to 259 | None. 260 | header (bool, optional): Whether or not to include column names. 261 | Defaults to False. 262 | """ 263 | table = self.document.add_table( 264 | rows=0, 265 | cols=len(column_widths), 266 | style=style or self.document.styles[self.TABLE_STYLE], 267 | ) 268 | table.alignment = WD_ALIGN_PARAGRAPH.CENTER 269 | for idx, width in enumerate(column_widths): 270 | table.columns[idx].width = Inches(width) 271 | 272 | if header: 273 | cells = table.add_row().cells 274 | header_labels = [""] + list(data.columns) 275 | for cell, value in zip(cells, header_labels): 276 | cell.text = f"{value}" 277 | # Font size and type-face have to be set at `run` level 278 | run = cell.paragraphs[0].runs[0] 279 | run.bold = True 280 | run.font.size = Pt(font_size) 281 | run.font.name = font_face 282 | 283 | # Sequentially add and populate rows 284 | for row_data in data.itertuples(): 285 | cells = table.add_row().cells 286 | for idx, (cell, value) in enumerate(zip(cells, row_data)): 287 | try: 288 | # Strip trailing zeros from float values 289 | text = f"{value:.4f}".rstrip("0").rstrip(".") 290 | except ValueError: 291 | text = f"{value}" 292 | 293 | cell.text = text 294 | # Font size and type-face have 
to be set at `run` level 295 | run = cell.paragraphs[0].runs[0] 296 | run.font.size = Pt(font_size) 297 | run.font.name = font_face 298 | # Make first column values bold if header is True 299 | if idx == 0 and header: 300 | run.bold = True 301 | 302 | # Add empty paragraph. "Spacing" for docx Table isn't yet implemented 303 | self.document.add_paragraph() 304 | 305 | def _to_file(self) -> None: 306 | """Save the report as a file.""" 307 | for section in self.document.sections: 308 | section.left_margin = Inches(1.2) 309 | section.right_margin = Inches(1.2) 310 | 311 | self.document.save(self.OUTPUT_FILENAME) 312 | -------------------------------------------------------------------------------- /tests/test_plotting_functions.py: -------------------------------------------------------------------------------- 1 | from io import BytesIO 2 | 3 | import pytest 4 | from matplotlib.axes import Axes 5 | from matplotlib.colors import to_rgb 6 | from matplotlib.figure import Figure 7 | from pandas import DataFrame, Series 8 | 9 | from eda_report.bivariate import Dataset 10 | from eda_report.plotting import ( 11 | _get_or_validate_axes, 12 | _get_color_shades_of, 13 | _plot_dataset, 14 | _plot_regression, 15 | _plot_variable, 16 | _savefig, 17 | bar_plot, 18 | box_plot, 19 | kde_plot, 20 | plot_correlation, 21 | prob_plot, 22 | ) 23 | from eda_report.univariate import Variable 24 | 25 | 26 | def test_savefig_function(): 27 | saved = _savefig(figure=Figure()) 28 | assert isinstance(saved, BytesIO) 29 | 30 | 31 | def test_get_color_shades_of(): 32 | color, num_shades = "green", 5 33 | green_shades = _get_color_shades_of(color, num_shades) 34 | assert green_shades.shape == (num_shades, 3) # each color is an rgb tuple 35 | assert green_shades[0] == pytest.approx(to_rgb(color)) 36 | 37 | 38 | class TestGetAxesFunction: 39 | def test_without_input(self): 40 | ax = _get_or_validate_axes() 41 | assert isinstance(ax, Axes) 42 | 43 | def test_with_axes_input(self): 44 | ax1 = 
class TestBoxplot:
    """Checks for :func:`eda_report.plotting.box_plot` — titles, labels,
    grouping by ``hue``, and color handling.
    """

    # 25 numeric values plus 2 NaN (the NaN should be dropped when plotting)
    data = Series(list(range(25)) + [None, None])
    hue = Series([1, 2, 3] * 9, name="hue-name")
    simple_box = box_plot(data, label="simple")
    grouped_box = box_plot(data, label="grouped", hue=hue)

    def test_return_type(self):
        assert isinstance(self.simple_box, Axes)
        assert isinstance(self.grouped_box, Axes)

    def test_plot_title(self):
        assert self.simple_box.get_title() == "Box-plot of simple"
        assert self.grouped_box.get_title() == "Box-plot of grouped"

    def test_axis_labels(self):
        assert self.simple_box.get_xlabel() == ""
        assert self.simple_box.get_ylabel() == ""
        assert self.grouped_box.get_xlabel() == ""
        assert self.grouped_box.get_ylabel() == "Hue-Name"

        # A hue without a name attribute should yield no y-axis label
        boxplot_with_nameless_hue = box_plot(
            self.data, label="grouped", hue=self.hue.to_numpy()
        )
        assert boxplot_with_nameless_hue.get_xlabel() == ""
        assert boxplot_with_nameless_hue.get_ylabel() == ""

    def test_grouping(self):
        # Simple box-plot has one patch
        assert len(self.simple_box.patches) == 1
        # Grouped box-plot has hue.nunique() patches
        assert len(self.grouped_box.patches) == self.hue.nunique()

    def test_simple_set_color(self):
        box1_color = self.simple_box.patches[0].get_facecolor()

        _color = "blue"
        simple_box_2 = box_plot(self.data, label="simple", color=_color)
        box2_color = simple_box_2.patches[0].get_facecolor()

        assert box1_color == pytest.approx(
            (*to_rgb("C0"), 0.75)  # default color 1 and alpha value
        )
        assert box2_color == pytest.approx(
            (*to_rgb(_color), 0.75)  # _color and alpha value
        )

    def test_grouped_set_color(self):
        _color = "lime"
        # Take last patch since colors are reversed (["CN", .. , "C0"])
        last_box_color = self.grouped_box.patches[-1].get_facecolor()

        grouped_box_2 = box_plot(
            self.data, hue=self.hue, label="simple", color=_color
        )
        last_box2_color = grouped_box_2.patches[-1].get_facecolor()

        assert last_box_color == pytest.approx(
            (*to_rgb("C0"), 0.75)  # default color |hue| and alpha value
        )
        assert last_box2_color == pytest.approx(
            (*to_rgb(_color), 0.75)  # _color and alpha value
        )


class TestKdeplot:
    """Checks for :func:`eda_report.plotting.kde_plot` — titles, labels,
    legends, grouping, the singular-data fallback, and color handling.
    """

    # 25 numeric values plus 2 NaN (the NaN should be dropped when plotting)
    data = Series(list(range(25)) + [None, None])
    hue = Series([1, 2, 3] * 9, name="hue-name")
    simple_kde = kde_plot(data, label="simple")
    grouped_kde = kde_plot(data, label="grouped", hue=hue)

    def test_return_type(self):
        assert isinstance(self.simple_kde, Axes)
        assert isinstance(self.grouped_kde, Axes)

    def test_plot_title(self):
        assert self.simple_kde.get_title() == "Density plot of simple"
        assert self.grouped_kde.get_title() == "Density plot of grouped"

    def test_axis_labels(self):
        assert self.simple_kde.get_xlabel() == "simple"
        assert self.simple_kde.get_ylabel() == ""
        assert self.grouped_kde.get_xlabel() == "grouped"
        assert self.grouped_kde.get_ylabel() == ""

    def test_legend(self):
        assert self.simple_kde.get_legend() is None
        assert (
            self.grouped_kde.get_legend().get_title().get_text() == "Hue-Name"
        )
        # A hue without a name attribute should yield no legend
        grouped_with_nameless_hue = kde_plot(
            self.data, label="grouped", hue=self.hue.to_numpy()
        )
        assert grouped_with_nameless_hue.get_legend() is None

    def test_grouping(self):
        # simple_kde has one line
        assert len(self.simple_kde.lines) == 1
        # grouped_kde has hue.nunique() lines
        assert len(self.grouped_kde.lines) == self.hue.nunique()

    def test_kde_small_sample(self):
        # Should plot text explaining that the input data is singular
        plot = kde_plot(self.data[:1], label="small-sample")
        assert (
            plot.texts[0].get_text()
            == "[Could not plot kernel density estimate.\n Data is singular.]"
        )

    def test_kde_zero_variance(self):
        # Should plot text explaining that the input data is singular
        plot = kde_plot(Series([1] * 25), label="constant-sample")
        assert (
            plot.texts[0].get_text()
            == "[Could not plot kernel density estimate.\n Data is singular.]"
        )

    def test_simple_set_color(self):
        kde1_color = self.simple_kde.lines[0].get_color()

        _color = "violet"
        simple_kde_2 = kde_plot(self.data, label="simple", color=_color)
        kde2_color = simple_kde_2.lines[0].get_color()

        assert to_rgb(kde1_color) == pytest.approx(to_rgb("C0"))
        assert to_rgb(kde2_color) == pytest.approx(to_rgb(_color))

    def test_grouped_set_color(self):
        first_kde_color = self.grouped_kde.lines[0].get_color()

        _color = "aqua"
        grouped_kde_2 = kde_plot(
            self.data, hue=self.hue, label="simple", color=_color
        )
        first_kde2_color = grouped_kde_2.lines[0].get_color()

        assert to_rgb(first_kde_color) == pytest.approx(to_rgb("C0"))
        assert to_rgb(first_kde2_color) == pytest.approx(to_rgb(_color))
209 | # Plot should have 2 lines (input data & normal diagonal) 210 | assert len(self.plot.lines) == 2 211 | 212 | def test_default_colors(self): 213 | markers, reg_line = self.plot.lines 214 | 215 | assert markers.get_color() == "C0" 216 | assert reg_line.get_color() == "#222" 217 | 218 | def test_set_colors(self): 219 | fig = prob_plot( 220 | self.data, 221 | label="some-more-data", 222 | marker_color="yellow", 223 | line_color="salmon", 224 | ) 225 | markers, reg_line = fig.lines 226 | 227 | assert markers.get_color() == "yellow" 228 | assert reg_line.get_color() == "salmon" 229 | 230 | 231 | class TestBarplot: 232 | low_cardinality_data = Series(list("abcdeabcdabcaba")) 233 | high_cardinality_data = Series(list("aabbccddeeffgghhiijjkkllmmnn")) 234 | simple_bar = bar_plot(low_cardinality_data, label="abcde") 235 | truncated_bar = bar_plot(high_cardinality_data, label="a_to_n") 236 | 237 | def test_return_type(self): 238 | assert isinstance(self.simple_bar, Axes) 239 | assert isinstance(self.truncated_bar, Axes) 240 | 241 | def test_plot_title(self): 242 | assert self.simple_bar.get_title() == "Bar-plot of abcde" 243 | assert ( 244 | self.truncated_bar.get_title() 245 | == "Bar-plot of a_to_n (Top 10 of 14)" 246 | ) 247 | 248 | def test_axis_labels(self): 249 | assert self.simple_bar.get_xlabel() == "" 250 | assert self.simple_bar.get_ylabel() == "Count" 251 | 252 | def test_bar_truncation(self): 253 | # Check that only the top 10 categories are plotted 254 | assert len(self.truncated_bar.patches) == 10 # only 10 of 14 255 | 256 | def test_default_color(self): 257 | bar_color = self.simple_bar.patches[0].get_facecolor() 258 | assert to_rgb(bar_color) == pytest.approx(to_rgb("C0")) 259 | 260 | def test_set_color(self): 261 | fig = bar_plot(self.low_cardinality_data, label="test", color="pink") 262 | bar_color = fig.patches[0].get_facecolor() 263 | assert to_rgb(bar_color) == pytest.approx(to_rgb("pink")) 264 | 265 | 266 | class TestPlotvariable: 267 | def 
test_numeric_plots(self): 268 | data = range(25) 269 | numeric_var = Variable(data, name="numbers") 270 | name, graphs = _plot_variable( 271 | variable_data_hue_and_color=(numeric_var, data, None, "teal") 272 | ) 273 | 274 | assert name == numeric_var.name 275 | assert set(graphs.keys()) == {"box_plot", "kde_plot", "prob_plot"} 276 | for graph in graphs.values(): 277 | assert isinstance(graph, BytesIO) 278 | 279 | def test_categorical_plots(self): 280 | data = list("abcdeabcdabcaba") 281 | categorical_var = Variable(list("abcdeabcdabcaba"), name="letters") 282 | name, graphs = _plot_variable( 283 | variable_data_hue_and_color=(categorical_var, data, None, "navy") 284 | ) 285 | 286 | assert name == categorical_var.name 287 | assert set(graphs.keys()) == {"bar_plot"} 288 | for graph in graphs.values(): 289 | assert isinstance(graph, BytesIO) 290 | 291 | 292 | class TestPlotCorrelation: 293 | def test_with_insufficient_numeric_pairs(self): 294 | # Check None is returned if there are < 2 numeric pairs 295 | single_numeric = plot_correlation(zip(range(5), list("abcde"))) 296 | no_numeric = plot_correlation(list("abcde")) 297 | assert single_numeric is None 298 | assert no_numeric is None 299 | 300 | def test_with_few_numeric_pairs(self): 301 | corr_plot = plot_correlation([[1, 2, 3], [4, 5, 6], [7, 8, 9]]) 302 | assert isinstance(corr_plot, Axes) 303 | assert corr_plot.get_title() == "Pearson Correlation (Top 3)" 304 | assert len(corr_plot.patches) == 3 # 3 unique pairs 305 | 306 | def test_with_excess_numeric_pairs(self): 307 | # Should only plot the top 20 by magnitude 308 | corr_plot = plot_correlation([range(10), [4, 5, 6, 7, 8] * 2]) 309 | 310 | assert isinstance(corr_plot, Axes) 311 | assert corr_plot.get_title() == "Pearson Correlation (Top 20)" 312 | assert len(corr_plot.patches) == 20 # Top 20 of 45 pairs 313 | 314 | def test_default_colors(self): 315 | corr_plot = plot_correlation([[1, 2, 8], [3, 5, 5], [9, 8, 1]]) 316 | bars = corr_plot.patches 317 | # bars = 
class TestRegressionPlot:
    """Checks for :func:`eda_report.plotting._plot_regression` — returned
    pair/axes, title contents, axis labels, down-sampling, and colors.
    """

    # 60,000 rows — deliberately above the 50,000 sampling threshold
    data = DataFrame({"A": range(60000), "B": [1, 2, 3] * 20000})
    var_pair, reg_plot = _plot_regression(data_and_color=(data, "lime"))

    def test_return_type(self):
        assert self.var_pair == ("A", "B")
        assert isinstance(self.reg_plot, Axes)

    def test_plot_title(self):
        title = self.reg_plot.get_title()
        assert "Slope" in title
        assert "Intercept" in title
        assert "Correlation" in title

    def test_axis_labels(self):
        var1, var2 = self.var_pair
        assert self.reg_plot.get_xlabel() == var1
        assert self.reg_plot.get_ylabel() == var2

    def test_max_sample_size(self):
        # Check that a sample of size 50000 is taken for large datasets
        points = self.reg_plot.collections[0].get_offsets().data
        assert len(points) == 50000

    def test_plot_color(self):
        assert self.reg_plot.lines[0].get_color() == "#444"  # reg line
        assert to_rgb(  # markers
            self.reg_plot.collections[0].get_facecolor()
        ) == pytest.approx(to_rgb("lime"))
Dataset(range(50)) 373 | assert _plot_dataset(data, color="red") is None 374 | 375 | def test_with_numeric_pairs(self): 376 | data = Dataset([[1, 2, 3], [4, 5, 6], [7, 8, 9]]) 377 | graphs = _plot_dataset(data, color="green") 378 | 379 | assert set(graphs.keys()) == { 380 | "correlation_plot", 381 | "regression_plots", 382 | } 383 | corr_plot = graphs["correlation_plot"] 384 | reg_plots = list(graphs["regression_plots"].values()) 385 | 386 | for graph in reg_plots + [corr_plot]: 387 | assert isinstance(graph, BytesIO) 388 | 389 | def test_limiting_numeric_pairs(self): 390 | data = Dataset([range(12), [1, 2, 3, 4] * 3]) 391 | # `data`` has 12 numeric columns, resulting in up to 66 var_pairs. 392 | # Check if only limit = 20 are plotted. 393 | graphs = _plot_dataset(data, color="green") 394 | assert len(graphs["regression_plots"]) == 20 395 | -------------------------------------------------------------------------------- /eda_report/plotting.py: -------------------------------------------------------------------------------- 1 | from io import BytesIO 2 | from multiprocessing import get_context 3 | from typing import Dict, Iterable, Optional, Sequence, Tuple, Union 4 | 5 | import matplotlib as mpl 6 | import numpy as np 7 | from matplotlib.axes import Axes 8 | from matplotlib.colors import to_rgb 9 | from matplotlib.figure import Figure 10 | from scipy.stats import gaussian_kde, probplot 11 | from tqdm import tqdm 12 | 13 | from eda_report._validate import _validate_dataset, _validate_univariate_input 14 | from eda_report.bivariate import Dataset 15 | 16 | # Matplotlib configuration 17 | GENERAL_RC_PARAMS = { 18 | "axes.spines.top": False, 19 | "axes.spines.right": False, 20 | "axes.titlesize": 12, 21 | "axes.titleweight": 500, 22 | "figure.autolayout": True, 23 | "figure.figsize": (5.6, 3.5), 24 | "font.family": "serif", 25 | "savefig.dpi": 120, 26 | } 27 | # Customize box-plots 28 | BOXPLOT_RC_PARAMS = { 29 | **GENERAL_RC_PARAMS, 30 | "boxplot.medianprops.color": 
"black", 31 | "boxplot.patchartist": True, 32 | "boxplot.vertical": False, 33 | } 34 | # Customize correlation-plots 35 | CORRPLOT_RC_PARAMS = {**GENERAL_RC_PARAMS, "figure.figsize": (7, 6.3)} 36 | # Customize regression-plots 37 | REGPLOT_RC_PARAMS = {**GENERAL_RC_PARAMS, "figure.figsize": (5.2, 5)} 38 | 39 | 40 | @mpl.rc_context(GENERAL_RC_PARAMS) 41 | def _savefig(figure: Figure) -> BytesIO: 42 | """Saves the contents of a :class:`~matplotlib.figure.Figure` in PNG 43 | format, as bytes in a file-like object. This allows rapid in-memory 44 | access when compiling the report. 45 | 46 | Args: 47 | figure (matplotlib.figure.Figure): Graph content. 48 | 49 | Returns: 50 | io.BytesIO: A graph in PNG format as bytes. 51 | """ 52 | graph = BytesIO() 53 | figure.savefig(graph, format="png") 54 | return graph 55 | 56 | 57 | def _get_or_validate_axes(ax: Axes = None) -> Axes: 58 | """Create or validate an Axes instance. 59 | 60 | Args: 61 | ax (matplotlib.axes.Axes, optional): Axes instance. Defaults to None. 62 | 63 | Raises: 64 | TypeError: If `ax` is not an Axes instance. 65 | 66 | Returns: 67 | Axes: Axes instance. 68 | """ 69 | if ax is None: 70 | return Figure().subplots() 71 | elif isinstance(ax, Axes): 72 | return ax 73 | else: 74 | raise TypeError(f"Invalid input for 'ax': {type(ax)}") 75 | 76 | 77 | def _get_color_shades_of(color: str, num: int = None) -> Sequence: 78 | """Obtain an array with `num` shades of the specified `color`. 79 | 80 | Args: 81 | color (str): The desired color. 82 | num (int): Desired number of color shades. 83 | 84 | Returns: 85 | Sequence: Array of RGB colors. 86 | """ 87 | color_rgb = to_rgb(color) 88 | return np.linspace(color_rgb, (0.25, 0.25, 0.25), num=num) 89 | 90 | 91 | @mpl.rc_context(BOXPLOT_RC_PARAMS) 92 | def box_plot( 93 | data: Iterable, 94 | *, 95 | label: str, 96 | hue: Iterable = None, 97 | color: Union[str, Sequence] = None, 98 | ax: Axes = None, 99 | ) -> Axes: 100 | """Get a box-plot from numeric values. 
@mpl.rc_context(GENERAL_RC_PARAMS)
def kde_plot(
    data: Iterable,
    *,
    label: str,
    hue: Optional[Iterable] = None,
    color: Optional[Union[str, Sequence]] = None,
    ax: Optional[Axes] = None,
) -> Axes:
    """Get a kde-plot from numeric values.

    Args:
        data (Iterable): Values to plot.
        label (str): A name for the ``data``, shown in the title.
        hue (Iterable, optional): Values for grouping the ``data``. Defaults to
            None.
        color (Union[str, Sequence]): A valid matplotlib color specifier.
        ax (matplotlib.axes.Axes, optional): Axes instance. Defaults to None.

    Returns:
        matplotlib.axes.Axes: Matplotlib axes with the kde-plot.
    """
    # Keep the pre-dropna series: its notna() mask is reused below to align
    # `hue` with the retained `data` rows.
    original_data = _validate_univariate_input(data)
    data = original_data.dropna()
    ax = _get_or_validate_axes(ax)
    # gaussian_kde fails on fewer than 2 points or zero variance; show an
    # explanatory message on the axes instead of raising.
    if len(data) < 2 or np.isclose(data.std(), 0):
        msg = "[Could not plot kernel density estimate.\n Data is singular.]"
        ax.text(x=0.08, y=0.45, s=msg, color="#f72", size=14, weight=600)
        return ax

    # Evaluate all densities on one common grid spanning the data's range.
    eval_points = np.linspace(data.min(), data.max(), num=len(data))
    if hue is None:
        kernel = gaussian_kde(data)
        density = kernel(eval_points)
        ax.plot(eval_points, density, label=label, color=color)
        ax.fill_between(eval_points, density, alpha=0.3, color=color)
    else:
        # Align `hue` with `data` by dropping entries where `data` was NaN.
        hue = _validate_univariate_input(hue)[original_data.notna()]
        if color is None:
            # Default matplotlib color cycle, one color per hue level.
            colors = [f"C{idx}" for idx in range(hue.nunique())]
        else:
            colors = _get_color_shades_of(color, hue.nunique())

        # One density curve per hue level.
        for color, (key, series) in zip(colors, data.groupby(hue)):
            kernel = gaussian_kde(series)
            density = kernel(eval_points)
            ax.plot(eval_points, density, label=key, alpha=0.75, color=color)
            ax.fill_between(eval_points, density, alpha=0.25, color=color)

        # Only show a legend when the hue data is named.
        if hue.name is not None:
            ax.legend(title=f"{hue.name}".title())

    ax.set_xlabel(label)
    ax.set_ylim(0)
    ax.set_title(f"Density plot of {label}")
    return ax
@mpl.rc_context(GENERAL_RC_PARAMS)
def bar_plot(
    data: Iterable,
    *,
    label: str,
    color: Union[str, Sequence] = None,
    ax: Axes = None,
) -> Axes:
    """Get a bar-plot from a sequence of values.

    At most the 10 most frequent categories are drawn; the title notes
    when categories were left out.

    Args:
        data (Iterable): Values to plot.
        label (str): A name for the ``data``, shown in the title.
        color (Union[str, Sequence]): A valid matplotlib color specifier.
        ax (matplotlib.axes.Axes, optional): Axes instance. Defaults to None.

    Returns:
        matplotlib.axes.Axes: Matplotlib axes with the bar-plot.
    """
    series = _validate_univariate_input(data).dropna()
    ax = _get_or_validate_axes(ax)
    # Include no more than 10 of the most common values
    top_counts = series.value_counts().nlargest(10)
    bars = ax.bar(top_counts.index.map(str), top_counts, alpha=0.8, color=color)
    count_labels = [f"{count:,.0f}" for count in top_counts]
    ax.bar_label(bars, labels=count_labels, padding=2)
    num_unique = series.nunique()
    if num_unique > 10:
        # Flag that some categories were truncated from the plot.
        ax.set_title(f"Bar-plot of {label} (Top 10 of {num_unique})")
    else:
        ax.set_title(f"Bar-plot of {label}")
    ax.set_ylabel("Count")
    # Improve visibility for long labels
    ax.tick_params(axis="x", rotation=90)
    return ax
285 | """ 286 | variable, data, hue, color = variable_data_hue_and_color 287 | if variable.var_type == "numeric": 288 | plots = { 289 | "box_plot": box_plot( 290 | data=data, hue=hue, label=variable.name, color=color 291 | ), 292 | "kde_plot": kde_plot( 293 | data=data, hue=hue, label=variable.name, color=color 294 | ), 295 | "prob_plot": prob_plot( 296 | data, label=variable.name, marker_color=color 297 | ), 298 | } 299 | else: # {"boolean", "categorical", "datetime", "numeric (<=10 levels)"} 300 | plots = {"bar_plot": bar_plot(data, label=variable.name, color=color)} 301 | 302 | graph_images = {name: _savefig(ax.figure) for name, ax in plots.items()} 303 | return variable.name, graph_images 304 | 305 | 306 | @mpl.rc_context(CORRPLOT_RC_PARAMS) 307 | def plot_correlation( 308 | variables: Iterable, 309 | max_pairs: int = 20, 310 | color_pos: Union[str, Sequence] = "orangered", 311 | color_neg: Union[str, Sequence] = "steelblue", 312 | ax: Axes = None, 313 | ) -> Axes: 314 | """Create a bar chart showing the top ``max_pairs`` most correlated 315 | variables. Bars are annotated with variable pairs and their respective 316 | Pearson correlation coefficients. 317 | 318 | Args: 319 | variables (Iterable): 2-dimensional numeric data. 320 | max_pairs (int): The maximum number of numeric pairs to include in the 321 | plot. Defaults to 20. 322 | color_pos (Union[str, Sequence]): Color for positive correlation bars. 323 | Defaults to "orangered". 324 | color_neg (Union[str, Sequence]): Color for negative correlation bars. 325 | Defaults to "steelblue". 326 | ax (matplotlib.axes.Axes, optional): Axes instance. Defaults to None. 327 | 328 | Returns: 329 | matplotlib.axes.Axes: A bar-plot of correlation data. 330 | """ 331 | if not isinstance(variables, Dataset): 332 | variables = Dataset(variables) 333 | 334 | if variables._correlation_values is None: 335 | return None 336 | 337 | # Show at most `max_pairs` numeric pairs. 
338 | pairs_to_show = variables._correlation_values[:max_pairs] 339 | # Reverse items so largest values appear at the top. 340 | corr_data = dict(reversed(pairs_to_show)) 341 | labels = [" vs ".join(pair) for pair in corr_data.keys()] 342 | ax = _get_or_validate_axes(ax) 343 | ax.barh(labels, corr_data.values(), edgecolor="#222", linewidth=0.5) 344 | ax.set_xlim(-1.1, 1.1) 345 | ax.spines["left"].set_position("zero") # Place y-axis spine at x=0 346 | ax.yaxis.set_visible(False) # Hide y-axis labels 347 | 348 | for p, label in zip(ax.patches, labels): 349 | p.set_alpha(min(1, abs(p.get_width()) + 0.1)) 350 | if p.get_width() < 0: 351 | p.set_facecolor(color_neg) 352 | ax.text( 353 | p.get_x(), 354 | p.get_y() + p.get_height() / 2, 355 | f"{p.get_width():,.2f} ({label}) ", 356 | size=8, 357 | ha="right", 358 | va="center", 359 | ) 360 | else: 361 | p.set_facecolor(color_pos) 362 | ax.text( 363 | p.get_x(), 364 | p.get_y() + p.get_height() / 2, 365 | f" {p.get_width():,.2} ({label})", 366 | size=8, 367 | ha="left", 368 | va="center", 369 | ) 370 | 371 | ax.set_title(f"Pearson Correlation (Top {len(corr_data)})") 372 | return ax 373 | 374 | 375 | @mpl.rc_context(REGPLOT_RC_PARAMS) 376 | def regression_plot( 377 | x: Iterable, 378 | y: Iterable, 379 | labels: Tuple[str, str], 380 | marker_color: Union[str, Sequence] = "C0", 381 | line_color: Union[str, Sequence] = "#444", 382 | ax: Axes = None, 383 | ) -> Axes: 384 | """Get a regression-plot from the provided pair of numeric values. 385 | 386 | Args: 387 | x (Iterable): Numeric values. 388 | y (Iterable): Numeric values. 389 | labels (Tuple[str, str]): Names for `x` and `y` respectively, shown in 390 | axis labels. 391 | marker_color (Union[str, Sequence]): Color for the plotted points. 392 | Defaults to "C0". 393 | line_color (Union[str, Sequence]): Color for the line of best fit. 394 | Defaults to "#444". 395 | ax (matplotlib.axes.Axes, optional): Axes instance. Defaults to None. 
396 | 397 | Returns: 398 | matplotlib.axes.Axes: Matplotlib axes with the regression-plot. 399 | """ 400 | var1, var2 = labels 401 | data = _validate_dataset({var1: x, var2: y}).dropna() 402 | if len(data) > 50000: 403 | data = data.sample(50000) 404 | 405 | ax = _get_or_validate_axes(ax) 406 | x = data[var1] 407 | y = data[var2] 408 | slope, intercept = np.polyfit(x, y, deg=1) 409 | ax.scatter(x, y, s=40, alpha=0.7, color=marker_color, edgecolors="#444") 410 | reg_line_x = np.linspace(x.min(), x.max(), num=20) 411 | reg_line_y = slope * reg_line_x + intercept 412 | ax.plot(reg_line_x, reg_line_y, color=line_color, lw=2) 413 | ax.set_title( 414 | f"Slope: {slope:,.4f}\nIntercept: {intercept:,.4f}\n" 415 | + f"Correlation: {x.corr(y):.4f}", 416 | size=11, 417 | ) 418 | ax.set_xlabel(var1) 419 | ax.set_ylabel(var2) 420 | return ax 421 | 422 | 423 | def _plot_regression(data_and_color: Tuple) -> Tuple: 424 | """Helper function to plot regression-plots concurrently. 425 | 426 | Args: 427 | data_and_color (Tuple): Dataframe, and desired marker-color. 428 | 429 | Returns: 430 | Tuple: Names for the variable pair, and axes with the regression 431 | plot. 432 | """ 433 | data, color = data_and_color 434 | var1, var2 = data.columns 435 | ax = regression_plot( 436 | x=data[var1], y=data[var2], labels=(var1, var2), marker_color=color 437 | ) 438 | return (var1, var2), ax 439 | 440 | 441 | def _plot_dataset(variables: Dataset, color: str = None) -> Optional[Dict]: 442 | """Concurrently plot regression-plots in a multiprocessing Pool. 443 | 444 | Args: 445 | variables (Dataset): Bi-variate analysis results. 446 | color (str, optional): The color to apply to the graphs. 447 | 448 | Returns: 449 | Optional[Dict]: A dictionary with a correlation plot and regression 450 | plots. 451 | """ 452 | if variables._correlation_values is None: 453 | return None 454 | else: 455 | # Take the top 20 pairs by magnitude of correlation. 
456 | # 20 var_pairs ≈ 10+ pages in report document 457 | # 20 numeric columns == 190 var_pairs ≈ 95+ pages. 458 | pairs_to_include = [ 459 | pair for pair, _ in variables._correlation_values[:20] 460 | ] 461 | with get_context("spawn").Pool() as p: 462 | paired_data = [ 463 | (variables.data.loc[:, pair], color) 464 | for pair in pairs_to_include 465 | ] 466 | bivariate_regression_plots = dict( 467 | tqdm( 468 | # Plot in parallel processes 469 | p.imap(_plot_regression, paired_data), 470 | # Progress-bar options 471 | total=len(pairs_to_include), 472 | bar_format=( 473 | "{desc} {percentage:3.0f}%|{bar:35}| " 474 | "{n_fmt}/{total_fmt} pairs." 475 | ), 476 | desc="Bivariate analysis:", 477 | dynamic_ncols=True, 478 | ) 479 | ) 480 | return { 481 | "correlation_plot": _savefig(plot_correlation(variables).figure), 482 | "regression_plots": { 483 | var_pair: _savefig(plot.figure) 484 | for var_pair, plot in bivariate_regression_plots.items() 485 | }, 486 | } 487 | --------------------------------------------------------------------------------