├── .gitattributes ├── .github ├── FUNDING.yml ├── ISSUE_TEMPLATE │ ├── error---bug-report.md │ ├── feature-request.md │ └── general-question.md └── workflows │ ├── pytest.yml │ └── pythonpublish.yml ├── .gitignore ├── .pre-commit-config.yaml ├── CHANGELOG.md ├── CODE_OF_CONDUCT.md ├── CONTRIBUTING.md ├── LICENSE ├── MANIFEST.in ├── README.md ├── VERSION ├── conftest.py ├── dev_requirements.txt ├── docs ├── _layouts │ └── default.html ├── getting_started │ ├── examples.md │ └── installation.md ├── images │ ├── associations_iris_example.png │ ├── associations_mushrooms_example.png │ ├── favicon.png │ ├── index_banner.png │ ├── ks_example.png │ ├── logo.png │ ├── pr_example.png │ ├── roc_example.png │ ├── social_banner.png │ └── split_hist_example.png ├── index.md ├── modules │ ├── data_utils.md │ ├── model_utils.md │ ├── nominal.md │ └── sampling.md ├── overrides │ └── main.html └── related_blogposts.md ├── dython ├── __init__.py ├── _private.py ├── data_utils.py ├── examples.py ├── model_utils.py ├── nominal.py ├── sampling.py └── typing.py ├── logos ├── README.md ├── dython_300x200.png ├── facebook_cover_photo_1.png ├── facebook_cover_photo_2.png ├── facebook_profile_image.png ├── favicon.png ├── instagram_profile_image.png ├── linkedin_banner_image_1.png ├── linkedin_banner_image_2.png ├── linkedin_profile_image.png ├── logo.png ├── logo_transparent.png ├── pinterest_board_photo.png ├── pinterest_profile_image.png ├── twitter_header_photo_1.png ├── twitter_header_photo_2.png ├── twitter_profile_image.png └── youtube_profile_image.png ├── mkdocs.yml ├── pyproject.toml ├── pytest.ini ├── requirements.txt ├── setup.py └── tests ├── test_data_utils ├── test_one_hot_encode.py └── test_split_hist.py ├── test_model_utils ├── test_ks_abc.py ├── test_metric_graph.py └── test_random_forest_feature_importance.py ├── test_nominal ├── test_associations.py ├── test_associations_parallel.py ├── test_cluster_correlation.py ├── test_correlation_ratio.py ├── test_cramers_v.py └── 
test_theils_u.py ├── test_private_helpers.py └── test_sampling.py /.gitattributes: -------------------------------------------------------------------------------- 1 | # Auto detect text files and perform LF normalization 2 | * text=auto 3 | -------------------------------------------------------------------------------- /.github/FUNDING.yml: -------------------------------------------------------------------------------- 1 | # These are supported funding model platforms 2 | 3 | github: # Replace with up to 4 GitHub Sponsors-enabled usernames e.g., [user1, user2] 4 | patreon: # Replace with a single Patreon username 5 | open_collective: # Replace with a single Open Collective username 6 | ko_fi: shakedzy 7 | tidelift: # Replace with a single Tidelift platform-name/package-name e.g., npm/babel 8 | community_bridge: # Replace with a single Community Bridge project-name e.g., cloud-foundry 9 | liberapay: # Replace with a single Liberapay username 10 | issuehunt: # Replace with a single IssueHunt username 11 | otechie: # Replace with a single Otechie username 12 | custom: # Replace with up to 4 custom sponsorship URLs e.g., ['link1', 'link2'] 13 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/error---bug-report.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Error / bug report 3 | about: How to create a report to help us improve 4 | title: '' 5 | labels: 'bug' 6 | assignees: '' 7 | 8 | --- 9 | 14 | 15 | ### Version check: 16 | 21 | Run and copy the output: 22 | ```python 23 | import sys, dython 24 | print(sys.version_info) 25 | print(dython.__version__) 26 | ``` 27 | 28 | ### Describe the bug: 29 | 33 | Code to reproduce: 34 | ```python 35 | import dython 36 | # your code goes here 37 | ``` 38 | 39 | ## Error message: 40 | 41 | Error message: 42 | ``` 43 | # your error message 44 | ``` 45 | 46 | ## Input data: 47 | 48 | 49 | 
-------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/feature-request.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: New feature request 3 | about: How to create a request for a new feature 4 | title: '' 5 | labels: 'enhancement' 6 | assignees: '' 7 | 8 | --- 9 | 13 | 14 | ### Describe the new feature: 15 | 19 | 20 | 21 | ### What is the current outcome? 22 | 23 | 24 | 25 | ### Is it backward-compatible? 26 | 29 | 30 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/general-question.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: General question 3 | about: Ask any question you'd like 4 | title: '' 5 | labels: 'question' 6 | assignees: '' 7 | 8 | --- 9 | 10 | 17 | -------------------------------------------------------------------------------- /.github/workflows/pytest.yml: -------------------------------------------------------------------------------- 1 | name: Run pytest 2 | 3 | on: 4 | push: 5 | branches: [ "master", "shakedzy:master" ] 6 | paths-ignore: 7 | - "README.md" 8 | - "CHANGELOG.md" 9 | - "CODE_OF_CONDUCT.md" 10 | - "CONTRIBUTING.md" 11 | - "VERSION" 12 | - "LICENSE" 13 | - ".gitignore" 14 | - "docs/*" 15 | pull_request: 16 | types: [opened, reopened, edited, synchronize] 17 | branches: [ "master", "shakedzy:master" ] 18 | paths-ignore: 19 | - "README.md" 20 | - "CHANGELOG.md" 21 | - "CODE_OF_CONDUCT.md" 22 | - "CONTRIBUTING.md" 23 | - "VERSION" 24 | - "LICENSE" 25 | - ".gitignore" 26 | - "docs/*" 27 | 28 | permissions: 29 | contents: read 30 | 31 | jobs: 32 | build: 33 | strategy: 34 | matrix: 35 | version: ["3.10", "3.12"] 36 | runs-on: ubuntu-latest 37 | steps: 38 | - uses: actions/checkout@v4 39 | - name: Set up Python ${{ matrix.version }} 40 | uses: actions/setup-python@v5 41 | with: 42 | python-version: ${{ matrix.version }} 43 | 
- name: Install dependencies 44 | run: | 45 | python -m pip install --upgrade pip 46 | pip install -r requirements.txt 47 | pip install -r dev_requirements.txt 48 | pip install . 49 | - name: Test with pytest 50 | run: pytest 51 | -------------------------------------------------------------------------------- /.github/workflows/pythonpublish.yml: -------------------------------------------------------------------------------- 1 | # This workflows will upload a Python Package using Twine when a release is created 2 | # For more information see: https://help.github.com/en/actions/language-and-framework-guides/using-python-with-github-actions#publishing-to-package-registries 3 | 4 | name: Upload Python Package 5 | 6 | on: 7 | workflow_dispatch: 8 | release: 9 | types: [created] 10 | 11 | jobs: 12 | test: 13 | strategy: 14 | matrix: 15 | version: [ "3.9", "3.10", "3.11", "3.12" ] 16 | runs-on: ubuntu-latest 17 | steps: 18 | - uses: actions/checkout@v4 19 | - name: Set up Python ${{ matrix.version }} 20 | uses: actions/setup-python@v5 21 | with: 22 | python-version: ${{ matrix.version }} 23 | - name: Install dependencies 24 | run: | 25 | python -m pip install --upgrade pip 26 | pip install -r requirements.txt 27 | pip install -r dev_requirements.txt 28 | pip install . 
29 | - name: Test with pytest 30 | run: pytest 31 | 32 | deploy: 33 | needs: test 34 | runs-on: ubuntu-latest 35 | steps: 36 | - uses: actions/checkout@v4 37 | - name: Set up Python 38 | uses: actions/setup-python@v5 39 | with: 40 | python-version: '3.x' 41 | - name: Install dependencies 42 | run: | 43 | python -m pip install --upgrade pip 44 | pip install setuptools wheel twine 45 | - name: Build and publish 46 | env: 47 | TWINE_USERNAME: '__token__' 48 | TWINE_PASSWORD: ${{ secrets.PYPI_TOKEN }} 49 | run: | 50 | python setup.py sdist bdist_wheel 51 | twine upload dist/* 52 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | syntax: glob 2 | .python-version 3 | .venv 4 | env/* 5 | venv/* 6 | ENV/* 7 | .idea 8 | .vscode 9 | .DS_Store 10 | dython.egg*/* 11 | *__pycache__* 12 | *run_stuff.py* 13 | build/* 14 | dist/* 15 | build_deploy.sh 16 | site/* 17 | debug.py 18 | .coverage 19 | .hypothesis 20 | .pytest_cache* -------------------------------------------------------------------------------- /.pre-commit-config.yaml: -------------------------------------------------------------------------------- 1 | # See https://pre-commit.com for more information 2 | # See https://pre-commit.com/hooks.html for more hooks 3 | repos: 4 | - repo: https://github.com/ambv/black 5 | rev: 22.8.0 6 | hooks: 7 | - id: black 8 | language: python 9 | types: [python] 10 | args: ["--line-length=80"] 11 | -------------------------------------------------------------------------------- /CHANGELOG.md: -------------------------------------------------------------------------------- 1 | # Change Log 2 | 3 | ## 0.7.9 4 | * Fixing `nominal.associations(plot=False)` not working as expected on Jupyter-based notebooks (issues [#167](https://github.com/shakedzy/dython/issues/167) & [#168](https://github.com/shakedzy/dython/issues/168)) 5 | 6 | ## 0.7.8 7 | * 
`nominal.associations` now attempts to set the figure-size automatically based on output (issue [#30](https://github.com/shakedzy/dython/issues/30), by **[@Swish78](https://github.com/Swish78)**) 8 | 9 | ## 0.7.7 10 | * _Drop support for Python 3.8 as it reaches its end-of-life date_ 11 | * Fix issue [#160](https://github.com/shakedzy/dython/issues/160) 12 | 13 | ## 0.7.6 14 | * Fix issue [#162](https://github.com/shakedzy/dython/issues/162) 15 | 16 | ## 0.7.5 17 | * Adding type hints to all functions (issue [#153](https://github.com/shakedzy/dython/issues/153)) 18 | * Dropping dependency in `scikit-plot` as it is no longer maintained (issue [#156](https://github.com/shakedzy/dython/issues/156)) 19 | * Support for Python 3.12 (issue [#155](https://github.com/shakedzy/dython/issues/155)) 20 | 21 | ## 0.7.4 22 | * Handling running plotting functions with `plot=False` in Jupyter and truly avoid plotting (issue [#147](https://github.com/shakedzy/dython/issues/147)) 23 | 24 | ## 0.7.3 25 | * _Dython now officially supports only Python 3.8 or above_ (by-product of issue [#137](https://github.com/shakedzy/dython/issues/137)) 26 | * Added `nominal.replot_last_associations`: a new method to replot `nominal.associations` heat-maps (issue [#136](https://github.com/shakedzy/dython/issues/136)) 27 | * Adding option to drop NaN values in each pair of columns independently in `nominal.associations` (issue [#130](https://github.com/shakedzy/dython/issues/130), by **[@matbb](https://github.com/matbb)**) 28 | * Fixing issues [#139](https://github.com/shakedzy/dython/issues/139) and [#140](https://github.com/shakedzy/dython/issues/140) (by **[@enrir](https://github.com/enrir)**) 29 | 30 | ## 0.7.2 31 | * `nominal.associations` supports multi-core parallel processing (issue [#117](https://github.com/shakedzy/dython/issues/117), by **[@mahieyin-rahmun](https://github.com/mahieyin-rahmun)**) 32 | * Using Black for code formatting (issue 
[#133](https://github.com/shakedzy/dython/issues/133), by **[@mahieyin-rahmun](https://github.com/mahieyin-rahmun)**) 33 | 34 | ## 0.7.1 (_post4_) 35 | * Fix floating point precision in `theils_u`, `cramer_v` and `correlation_ratio` (issue [#116](https://github.com/shakedzy/dython/issues/116)) 36 | * Fix failing conda builds (by **[@sarthakpati](https://github.com/sarthakpati)**) 37 | * Fix legend argument in `ks_abc` (by **[@lahdjirayhan](https://github.com/lahdjirayhan)**) 38 | 39 | ## 0.7.0 40 | * _License is now MIT_ 41 | * Added tests (issue [#69](https://github.com/shakedzy/dython/issues/69), by **[@lahdjirayhan](https://github.com/lahdjirayhan)**) 42 | * Added option to select which rows/columns to display/hide in `nominal.associations` (issue [#92](https://github.com/shakedzy/dython/issues/92)) 43 | * Fixed deprecation warning when using `datetime` features with `nominal.associations` (issue [#96](https://github.com/shakedzy/dython/issues/96)) 44 | * `nominal.associations` now support custom methods as measures of associations (issue [#104](https://github.com/shakedzy/dython/issues/104)) 45 | * _Important change:_ Theil's U in `nominal.associations` is now read as U(row|col) instead of U(col|row) 46 | * Remove deprecated method `compute_associations` 47 | 48 | ## 0.6.8 49 | * Bug fix in `metric_graph` (issue [#102](https://github.com/shakedzy/dython/issues/102)) 50 | * Bug fix in examples module 51 | 52 | ## 0.6.7 (_post2_) 53 | * First version supported by `conda` (issue [#90](https://github.com/shakedzy/dython/issues/90), by **[@sarthakpati](https://github.com/sarthakpati)**) 54 | * `associations` (and `compute_associations`) now supports several numerical-numerical association measures 55 | (issue [#84](https://github.com/shakedzy/dython/issues/84)) 56 | * `nominal.associations` keyword `bias_correction` is now `cramers_v_bias_correction` 57 | * Added a `numerical_columns` option to `associations` and `compute_associations` 58 | * `roc_graph` is 
officially removed (replaced with `metric_graph`) 59 | * Deprecating `compute_associations` 60 | 61 | ## 0.6.6 62 | * Fixed issue where `nan_strategy` affected input data (issue [#82](https://github.com/shakedzy/dython/issues/82)) 63 | * Added `datetime` support to `nominal.associations` (issue [#76](https://github.com/shakedzy/dython/issues/76)) 64 | 65 | ## 0.6.5 (_post1_) 66 | * Added `model_utils.ks_abc` 67 | * Fixed a bug in `model_utils.metric_graph` when using `plot=False` 68 | * Added new dependency: `scikit-plot` 69 | 70 | ## 0.6.4 (_post1_) 71 | * Adding `model_utils.metric_graph` instead of `roc_graph`, which now supports ROC curves and Precision-Recall curves 72 | * `roc_graph` is marked as deprecated 73 | 74 | ## 0.6.3 75 | * Added `data_utils.one_hot_encode` 76 | * Added `title` and `filename` options to `associations` and `roc_graph` 77 | 78 | ## 0.6.2 79 | * Added configurable `vmax` and `vmin` to `nominal.associations` (issue [#68](https://github.com/shakedzy/dython/issues/68)) 80 | 81 | ## 0.6.1 82 | * Bug fix in `model_utils.roc_graph` 83 | * `model_utils.roc_graph` now accepts also `legend` and `plot` arguments 84 | 85 | ## 0.6.0 86 | * New module: `data_utils` 87 | * `split_hist` method added, with new example 88 | * `identify_columns_by_type` and `identify_columns_with_na` moved to `data_utils` from `nominal` 89 | 90 | ## 0.5.2 91 | * Added `nominal.identify_columns_with_na` (by **[@musketeer191](https://github.com/musketeer191)**) 92 | * Added `nominal.identify_numeric_columns` (issue [#58](https://github.com/shakedzy/dython/issues/58), by **[@musketeer191](https://github.com/musketeer191)**) 93 | * Added `nominal.identify_columns_by_type` 94 | * `nominal.identify_nominal_columns` no longer accepts the `include` parameter (use `nominal.identify_columns_by_type` instead) 95 | * Fix docstring of `nominal.compute_associations` (issue [#55](https://github.com/shakedzy/dython/issues/55)) 96 | * Requires Pandas 0.23.4 or greater (was required 
before, but not specified in setup file) 97 | 98 | ## 0.5.1 99 | * Resolve issues [#48](https://github.com/shakedzy/dython/issues/48) and [#49](https://github.com/shakedzy/dython/issues/49) 100 | 101 | ## 0.5.0 (_post2_) 102 | * Fix issues [#28](https://github.com/shakedzy/dython/issues/28), [#31](https://github.com/shakedzy/dython/issues/31), [#41](https://github.com/shakedzy/dython/issues/41), [#46](https://github.com/shakedzy/dython/issues/46) 103 | * `nominal.cramers_v` can be used without bias correction 104 | * Removed `kwargs` from all methods, replaced with explicit API 105 | * `nominal.associations` and `model_utils.roc_graph` now return a dictionary of output values 106 | * `model_utils.roc_graph` can accept an `ax` 107 | * license replaced to BSD-3 108 | 109 | ## 0.4.7 110 | * `nominal.associations` now handles single-value features (issue [#38](https://github.com/shakedzy/dython/issues/38)) 111 | 112 | ## 0.4.6 113 | * Added log-base selection in `nominal.conditional_entropy` (issue [#35](https://github.com/shakedzy/dython/issues/35), by **[@ahmedsalhin](https://github.com/ahmedsalhin)**) 114 | * Added new example: `associations_mushrooms_example` 115 | * Renamed example: `associations_example` is now `associations_iris_example` 116 | 117 | ## 0.4.5 118 | * Requires Python 3.5+ 119 | * Private methods and attributes renamed 120 | * Fixed incorrect `__version__` varaible 121 | 122 | ## 0.4.4 123 | * Minor fixes 124 | * introducing `__all__` to all modules 125 | 126 | ## 0.4.3 127 | * `binary_roc_graph` is now a private method, only `roc_graph` is exposed 128 | 129 | ## 0.4.2 130 | * Added new functionality to `model_utils.roc_graph` (Plot best threshold, print class names) 131 | 132 | ## 0.4.1 133 | * Added `nominal.cluster_correlations`, and an option to cluster `nominal.associations` heatmap (by **[@benman1](https://github.com/benman1)**) 134 | 135 | ## 0.4.0 136 | * Added automatic recognition of categorical columns in `nominal.associations` (by 
**[@benman1](https://github.com/benman1)**) 137 | 138 | ## 0.3.1 139 | * `nominal.associations` can accept an exisiting Matplotlib `Axe` (issue [#24](https://github.com/shakedzy/dython/issues/24), by **[@Baukebrenninkmeijer](https://github.com/Baukebrenninkmeijer)**) 140 | 141 | ## 0.3.0 142 | * Introducing missing values handeling (`nan_strategy`) in `nominal` module (issue [#15](https://github.com/shakedzy/dython/issues/15)) 143 | 144 | ## 0.2.0 145 | * Added `sampling` module 146 | 147 | ## 0.1.1 148 | * Fixed missing `sqrt` in `nominal.correlation_ratio` (issue [#7](https://github.com/shakedzy/dython/issues/7)) 149 | 150 | ## 0.1.0 151 | * First version of Dython 152 | -------------------------------------------------------------------------------- /CODE_OF_CONDUCT.md: -------------------------------------------------------------------------------- 1 | # Contributor Covenant Code of Conduct 2 | 3 | ## Our Pledge 4 | 5 | In the interest of fostering an open and welcoming environment, we as 6 | contributors and maintainers pledge to making participation in our project and 7 | our community a harassment-free experience for everyone, regardless of age, body 8 | size, disability, ethnicity, sex characteristics, gender identity and expression, 9 | level of experience, education, socio-economic status, nationality, personal 10 | appearance, race, religion, or sexual identity and orientation. 
11 | 12 | ## Our Standards 13 | 14 | Examples of behavior that contributes to creating a positive environment 15 | include: 16 | 17 | * Using welcoming and inclusive language 18 | * Being respectful of differing viewpoints and experiences 19 | * Gracefully accepting constructive criticism 20 | * Focusing on what is best for the community 21 | * Showing empathy towards other community members 22 | 23 | Examples of unacceptable behavior by participants include: 24 | 25 | * The use of sexualized language or imagery and unwelcome sexual attention or 26 | advances 27 | * Trolling, insulting/derogatory comments, and personal or political attacks 28 | * Public or private harassment 29 | * Publishing others' private information, such as a physical or electronic 30 | address, without explicit permission 31 | * Other conduct which could reasonably be considered inappropriate in a 32 | professional setting 33 | 34 | ## Our Responsibilities 35 | 36 | Project maintainers are responsible for clarifying the standards of acceptable 37 | behavior and are expected to take appropriate and fair corrective action in 38 | response to any instances of unacceptable behavior. 39 | 40 | Project maintainers have the right and responsibility to remove, edit, or 41 | reject comments, commits, code, wiki edits, issues, and other contributions 42 | that are not aligned to this Code of Conduct, or to ban temporarily or 43 | permanently any contributor for other behaviors that they deem inappropriate, 44 | threatening, offensive, or harmful. 45 | 46 | ## Scope 47 | 48 | This Code of Conduct applies both within project spaces and in public spaces 49 | when an individual is representing the project or its community. Examples of 50 | representing a project or community include using an official project e-mail 51 | address, posting via an official social media account, or acting as an appointed 52 | representative at an online or offline event. 
Representation of a project may be 53 | further defined and clarified by project maintainers. 54 | 55 | ## Enforcement 56 | 57 | Instances of abusive, harassing, or otherwise unacceptable behavior may be 58 | reported by contacting the project team at shakedzy@gmail.com. All 59 | complaints will be reviewed and investigated and will result in a response that 60 | is deemed necessary and appropriate to the circumstances. The project team is 61 | obligated to maintain confidentiality with regard to the reporter of an incident. 62 | Further details of specific enforcement policies may be posted separately. 63 | 64 | Project maintainers who do not follow or enforce the Code of Conduct in good 65 | faith may face temporary or permanent repercussions as determined by other 66 | members of the project's leadership. 67 | 68 | ## Attribution 69 | 70 | This Code of Conduct is adapted from the [Contributor Covenant][homepage], version 1.4, 71 | available at https://www.contributor-covenant.org/version/1/4/code-of-conduct.html 72 | 73 | [homepage]: https://www.contributor-covenant.org 74 | 75 | For answers to common questions about this code of conduct, see 76 | https://www.contributor-covenant.org/faq 77 | -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | # How to Contribute to Dython 2 | If you'd like to contribute or assist - then first of all, thanks. This isn't obvious and I appreciate it. 3 | 4 | ### Reporting a bug: 5 | If you found a bug, please open an new _error/bug issue_ [here](https://github.com/shakedzy/dython/issues/new/choose). 6 | Please make sure you are using the latest version of Dython befoe reporting. 7 | 8 | ### Suggesting a new feature: 9 | New features are always welcomed. Please describe it in a _new feature request_ [here](https://github.com/shakedzy/dython/issues/new/choose). 
10 | 11 | ### Adding things yourself: 12 | If you want to take an open issue and work on it, or would like to merge something you coded yourself, please open a pull request and explain what it is you're adding. If there's an open issue about it, please state you're working on it. Contibutions are always welcomed, and are very much appreciated. Your name will forever be etched in the [change log](CHANGELOG.md). 13 | 14 | ### Anything else? 15 | If there's anything else you'd like to discuss, feel free top open a _general question_ [here](https://github.com/shakedzy/dython/issues/new/choose) on any topic. 16 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2018-2022, Shaked Zychlinski 4 | All rights reserved. 5 | 6 | Permission is hereby granted, free of charge, to any person obtaining 7 | a copy of this software and associated documentation files (the 8 | "Software"), to deal in the Software without restriction, including 9 | without limitation the rights to use, copy, modify, merge, publish, 10 | distribute, sublicense, and/or sell copies of the Software, and to 11 | permit persons to whom the Software is furnished to do so, subject to 12 | the following conditions: 13 | 14 | The above copyright notice and this permission notice shall be 15 | included in all copies or substantial portions of the Software. 16 | 17 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 18 | EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 19 | MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 20 | NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE 21 | LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION 22 | OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION 23 | WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
-------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include *.md, VERSION, dev_requirements.txt 2 | include requirements.txt 3 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | ![banner](http://shakedzy.xyz/dython/images/index_banner.png) 2 | 3 | # Dython 4 | 5 | [![PyPI Version](https://img.shields.io/pypi/v/dython.svg)](https://pypi.org/project/dython/) 6 | [![Conda Version](https://img.shields.io/conda/vn/conda-forge/dython)](https://anaconda.org/conda-forge/dython) 7 | [![Python Version](https://img.shields.io/pypi/pyversions/dython.svg)](https://pypi.org/project/dython/) 8 | [![PyPI - Downloads](https://img.shields.io/pypi/dm/dython)](https://pypistats.org/packages/dython) 9 | [![License](https://img.shields.io/pypi/l/dython)](https://github.com/shakedzy/dython/blob/master/LICENSE) 10 | [![DOI](https://img.shields.io/badge/DOI-10.5281%2Fzenodo.12698421-eb34c6)](https://zenodo.org/doi/10.5281/zenodo.12698421) 11 | 12 | A set of **D**ata analysis tools in p**YTHON** 3.x. 13 | 14 | Dython was designed with analysis usage in mind - meaning ease-of-use, functionality and readability are the core 15 | values of this library. 16 | 17 | ## Installation 18 | Dython can be installed directly using `pip`: 19 | ``` 20 | pip install dython 21 | ``` 22 | or, via the `conda` package manager: 23 | ``` 24 | conda install -c conda-forge dython 25 | ``` 26 | 27 | ## Documentation 28 | Modules documentation can be found on [shakedzy.xyz/dython](http://shakedzy.xyz/dython). 29 | You can also learn more and see examples of the main methods of this library on 30 | [these blogposts](http://shakedzy.xyz/dython/related_blogposts). 
31 | 32 | ## Contributing 33 | Contributions are always welcomed - if you found something you can fix, or have an idea for a new feature, feel free to write it and open a pull request. Please make sure to go over the [contributions guidelines](https://github.com/shakedzy/dython/blob/master/CONTRIBUTING.md). 34 | 35 | ## Citing 36 | Use this reference to cite if you use Dython in a paper: 37 | ```bibtex 38 | @software{Zychlinski_dython_2018, 39 | author = {Zychlinski, Shaked}, 40 | title = {{dython}}, 41 | year = {2018}, 42 | url = {https://github.com/shakedzy/dython}, 43 | doi = {10.5281/zenodo.12698421} 44 | } 45 | ``` 46 | -------------------------------------------------------------------------------- /VERSION: -------------------------------------------------------------------------------- 1 | 0.7.9 -------------------------------------------------------------------------------- /conftest.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | import functools 3 | import matplotlib 4 | import numpy as np 5 | import pandas as pd 6 | from sklearn import datasets 7 | 8 | 9 | @pytest.fixture(autouse=True) 10 | def disable_plot(monkeypatch): 11 | # Patch plt.show to not halt testing flow, by making it not block 12 | # function execution. 13 | # patch = functools.partial(matplotlib.pyplot.show, block=False) 14 | def patch(): 15 | pass 16 | 17 | monkeypatch.setattr(matplotlib.pyplot, "show", patch) 18 | 19 | 20 | @pytest.fixture 21 | def iris_df(): 22 | # Use iris dataset as example when needed. 23 | # Add one made-up categorical column to create a nom-nom relationship. 
24 | 25 | iris = datasets.load_iris() 26 | 27 | target = ["C{}".format(i) for i in iris.target] 28 | 29 | rng = np.random.default_rng(2207) 30 | extra = rng.choice(list("ABCDE"), size=len(target)) 31 | 32 | extra = pd.DataFrame(data=extra, columns=["extra"]) 33 | 34 | X = pd.DataFrame(data=iris.data, columns=iris.feature_names) 35 | y = pd.DataFrame(data=target, columns=["target"]) 36 | 37 | df = pd.concat([X, extra, y], axis=1) 38 | 39 | return df 40 | 41 | 42 | @pytest.fixture(autouse=True) 43 | def add_iris(doctest_namespace, iris_df): 44 | # Add iris dataset to namespace 45 | # This fixture is provided with autouse so that 46 | # the doctests can use it 47 | doctest_namespace["iris_df"] = iris_df 48 | -------------------------------------------------------------------------------- /dev_requirements.txt: -------------------------------------------------------------------------------- 1 | pytest>=8.3.2 2 | hypothesis>=6.111.0 3 | black>=24.8.0 4 | pre-commit>=3.8.0 5 | pytest-enabler>=3.1.1 -------------------------------------------------------------------------------- /docs/_layouts/default.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | {% seo %} 9 | 10 | 13 | 14 | 15 |
16 |
17 |

{{ site.title | default: site.github.repository_name }}

18 | 19 | {% if site.logo %} 20 | Logo 21 | {% endif %} 22 | 23 |

{{ site.description | default: site.github.project_tagline }}

24 | 25 | {% if site.github.is_project_page %} 26 |

View the Project on GitHub {{ site.github.repository_nwo }}

27 | {% endif %} 28 | 29 | {% if site.github.is_user_page %} 30 |

View My GitHub Profile

31 | {% endif %} 32 | 33 | {% if site.show_downloads %} 34 | 39 | {% endif %} 40 |
41 |
42 | 43 | {{ content }} 44 | 45 |
46 | 52 |
53 | 54 | {% if site.google_analytics %} 55 | 63 | {% endif %} 64 | 65 | 66 | -------------------------------------------------------------------------------- /docs/getting_started/examples.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: examples 3 | --- 4 | # Examples 5 | 6 | _Examples can be imported and executed from `dython.examples`._ 7 | 8 | #### `associations_iris_example()` 9 | 10 | Plot an example of an associations heat-map of the Iris dataset features. 11 | All features of this dataset are numerical (except for the target). 12 | 13 | **Example code:** 14 | ```python 15 | import pandas as pd 16 | from sklearn import datasets 17 | from dython.nominal import associations 18 | 19 | # Load data 20 | iris = datasets.load_iris() 21 | 22 | # Convert int classes to strings to allow associations 23 | # method to automatically recognize categorical columns 24 | target = ['C{}'.format(i) for i in iris.target] 25 | 26 | # Prepare data 27 | X = pd.DataFrame(data=iris.data, columns=iris.feature_names) 28 | y = pd.DataFrame(data=target, columns=['target']) 29 | df = pd.concat([X, y], axis=1) 30 | 31 | # Plot features associations 32 | associations(df) 33 | ``` 34 | **Output:** 35 | 36 | ![associations_iris_example](../images/associations_iris_example.png) 37 | 38 | __________________ 39 | 40 | #### `associations_mushrooms_example()` 41 | 42 | Plot an example of an associations heat-map of the UCI Mushrooms dataset features. 43 | All features of this dataset are categorical. This example will use Theil's U. 
44 | 45 | **Example code:** 46 | ```python 47 | import pandas as pd 48 | from dython.nominal import associations 49 | 50 | # Download and load data from UCI 51 | df = pd.read_csv('http://archive.ics.uci.edu/ml/machine-learning-databases/mushroom/agaricus-lepiota.data') 52 | df.columns = ['class', 'cap-shape', 'cap-surface', 'cap-color', 'bruises', 'odor', 'gill-attachment', 53 | 'gill-spacing', 'gill-size', 'gill-color', 'stalk-shape', 'stalk-root', 'stalk-surface-above-ring', 54 | 'stalk-surface-below-ring', 'stalk-color-above-ring', 'stalk-color-below-ring', 'veil-type', 55 | 'veil-color', 'ring-number', 'ring-type', 'spore-print-color', 'population', 'habitat'] 56 | 57 | # Plot features associations 58 | associations(df, nom_nom_assoc='theil', figsize=(15, 15)) 59 | ``` 60 | **Output:** 61 | 62 | ![associations_mushrooms_example](../images/associations_mushrooms_example.png) 63 | 64 | __________________ 65 | 66 | #### `ks_abc_example()` 67 | 68 | An example of KS Area Between Curve of a simple binary classifier trained over the Breast Cancer dataset. 69 | 70 | **Example code:** 71 | ```python 72 | from sklearn import datasets 73 | from sklearn.model_selection import train_test_split 74 | from sklearn.linear_model import LogisticRegression 75 | from dython.model_utils import ks_abc 76 | 77 | # Load and split data 78 | data = datasets.load_breast_cancer() 79 | X_train, X_test, y_train, y_test = train_test_split(data.data, data.target, test_size=.5, random_state=0) 80 | 81 | # Train model and predict 82 | model = LogisticRegression(solver='liblinear') 83 | model.fit(X_train, y_train) 84 | y_pred = model.predict_proba(X_test) 85 | 86 | # Perform KS test and compute area between curves 87 | ks_abc(y_test, y_pred[:,1]) 88 | ``` 89 | 90 | **Output:** 91 | 92 | ![ks_example](../images/ks_example.png) 93 | __________________ 94 | 95 | #### `pr_graph_example()` 96 | 97 | Plot an example Precision-Recall graph of an SVM model predictions over the Iris dataset. 
98 | 99 | **Example code:** 100 | 101 | ```python 102 | import numpy as np 103 | from sklearn import svm, datasets 104 | from sklearn.model_selection import train_test_split 105 | from sklearn.preprocessing import label_binarize 106 | from sklearn.multiclass import OneVsRestClassifier 107 | from dython.model_utils import metric_graph 108 | 109 | # Load data 110 | iris = datasets.load_iris() 111 | X = iris.data 112 | y = label_binarize(iris.target, classes=[0, 1, 2]) 113 | 114 | # Add noisy features 115 | random_state = np.random.RandomState(4) 116 | n_samples, n_features = X.shape 117 | X = np.c_[X, random_state.randn(n_samples, 200 * n_features)] 118 | 119 | # Train a model 120 | X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.5, random_state=0) 121 | classifier = OneVsRestClassifier(svm.SVC(kernel='linear', probability=True, random_state=0)) 122 | 123 | # Predict 124 | y_score = classifier.fit(X_train, y_train).predict_proba(X_test) 125 | 126 | # Plot PR graphs 127 | metric_graph(y_test, y_score, 'pr', class_names=iris.target_names) 128 | ``` 129 | 130 | **Output:** 131 | 132 | ![pr_example](../images/pr_example.png) 133 | 134 | __________________ 135 | 136 | #### `roc_graph_example()` 137 | 138 | Plot an example ROC graph of an SVM model predictions over the Iris dataset. 139 | 140 | Based on `sklearn` [examples](http://scikit-learn.org/stable/auto_examples/model_selection/plot_roc.html) 141 | (as was seen on April 2018). 
142 | 143 | **Example code:** 144 | 145 | ```python 146 | import numpy as np 147 | from sklearn import svm, datasets 148 | from sklearn.model_selection import train_test_split 149 | from sklearn.preprocessing import label_binarize 150 | from sklearn.multiclass import OneVsRestClassifier 151 | from dython.model_utils import metric_graph 152 | 153 | # Load data 154 | iris = datasets.load_iris() 155 | X = iris.data 156 | y = label_binarize(iris.target, classes=[0, 1, 2]) 157 | 158 | # Add noisy features 159 | random_state = np.random.RandomState(4) 160 | n_samples, n_features = X.shape 161 | X = np.c_[X, random_state.randn(n_samples, 200 * n_features)] 162 | 163 | # Train a model 164 | X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.5, random_state=0) 165 | classifier = OneVsRestClassifier(svm.SVC(kernel='linear', probability=True, random_state=0)) 166 | 167 | # Predict 168 | y_score = classifier.fit(X_train, y_train).predict_proba(X_test) 169 | 170 | # Plot ROC graphs 171 | metric_graph(y_test, y_score, 'roc', class_names=iris.target_names) 172 | ``` 173 | 174 | **Output:** 175 | 176 | ![roc_example](../images/roc_example.png) 177 | 178 | !!! warning "Note:" 179 | 180 | Due to the nature of `np.random.RandomState` which is used in this 181 | example, the output graph may vary from one machine to another. 182 | 183 | __________________ 184 | 185 | #### `split_hist_example()` 186 | 187 | Plot an example of split histogram of data from the breast-cancer dataset. 188 | 189 | While this example presents a numerical column split by a categorical one, categorical columns can also be used 190 | as the values, as well as numerical columns as the split criteria. 
191 | 192 | **Example code:** 193 | ```python 194 | import pandas as pd 195 | from sklearn import datasets 196 | from dython.data_utils import split_hist 197 | 198 | # Load data and convert to DataFrame 199 | data = datasets.load_breast_cancer() 200 | df = pd.DataFrame(data=data.data, columns=data.feature_names) 201 | df['malignant'] = [not bool(x) for x in data.target] 202 | 203 | # Plot histogram 204 | split_hist(df, 'mean radius', split_by='malignant', bins=20, figsize=(15,7)) 205 | ``` 206 | 207 | **Output:** 208 | 209 | ![split_hist_example](../images/split_hist_example.png) 210 | -------------------------------------------------------------------------------- /docs/getting_started/installation.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: installation 3 | --- 4 | 5 | # Installing Dython 6 | 7 | ## Installation 8 | 9 | The easiest way to install dython is using `pip install`: 10 | 11 | ```bash 12 | pip install dython 13 | ``` 14 | Or, via the `conda` package manager: 15 | ```bash 16 | conda install -c conda-forge dython 17 | ``` 18 | 19 | If you'd like to use the source code instead, you can install directly from it using any 20 | of the following methods: 21 | 22 | * Install source code using pip: 23 | ```bash 24 | pip install git+https://github.com/shakedzy/dython.git 25 | ``` 26 | * Download the source code as a [ZIP file](https://github.com/shakedzy/dython/zipball/master) 27 | * Download the source code as a [TAR ball](https://github.com/shakedzy/dython/tarball/master) 28 | 29 | -------------------------------------------------------------------------------- /docs/images/associations_iris_example.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/shakedzy/dython/152c3db0766e8358c7ae0ed257ecc7d47c1f2e8c/docs/images/associations_iris_example.png -------------------------------------------------------------------------------- 
/docs/images/associations_mushrooms_example.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/shakedzy/dython/152c3db0766e8358c7ae0ed257ecc7d47c1f2e8c/docs/images/associations_mushrooms_example.png -------------------------------------------------------------------------------- /docs/images/favicon.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/shakedzy/dython/152c3db0766e8358c7ae0ed257ecc7d47c1f2e8c/docs/images/favicon.png -------------------------------------------------------------------------------- /docs/images/index_banner.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/shakedzy/dython/152c3db0766e8358c7ae0ed257ecc7d47c1f2e8c/docs/images/index_banner.png -------------------------------------------------------------------------------- /docs/images/ks_example.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/shakedzy/dython/152c3db0766e8358c7ae0ed257ecc7d47c1f2e8c/docs/images/ks_example.png -------------------------------------------------------------------------------- /docs/images/logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/shakedzy/dython/152c3db0766e8358c7ae0ed257ecc7d47c1f2e8c/docs/images/logo.png -------------------------------------------------------------------------------- /docs/images/pr_example.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/shakedzy/dython/152c3db0766e8358c7ae0ed257ecc7d47c1f2e8c/docs/images/pr_example.png -------------------------------------------------------------------------------- /docs/images/roc_example.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/shakedzy/dython/152c3db0766e8358c7ae0ed257ecc7d47c1f2e8c/docs/images/roc_example.png -------------------------------------------------------------------------------- /docs/images/social_banner.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/shakedzy/dython/152c3db0766e8358c7ae0ed257ecc7d47c1f2e8c/docs/images/social_banner.png -------------------------------------------------------------------------------- /docs/images/split_hist_example.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/shakedzy/dython/152c3db0766e8358c7ae0ed257ecc7d47c1f2e8c/docs/images/split_hist_example.png -------------------------------------------------------------------------------- /docs/index.md: -------------------------------------------------------------------------------- 1 | --- 2 | is_homepage: 3 | --- 4 | 5 | # Dython 6 | 7 | [![PyPI Version](https://img.shields.io/pypi/v/dython?style=for-the-badge)](https://pypi.org/project/dython/) 8 | [![Conda Version](https://img.shields.io/conda/vn/conda-forge/dython?style=for-the-badge)](https://anaconda.org/conda-forge/dython) 9 | [![Python Version](https://img.shields.io/pypi/pyversions/dython.svg?style=for-the-badge)](https://pypi.org/project/dython/) 10 | [![PyPI - Downloads](https://img.shields.io/pypi/dm/dython?style=for-the-badge)](https://pypistats.org/packages/dython) 11 | [![License](https://img.shields.io/pypi/l/dython?style=for-the-badge)](https://github.com/shakedzy/dython/blob/master/LICENSE) 12 | [![DOI](https://img.shields.io/badge/DOI-10.5281%2Fzenodo.12698421-eb34c6?style=for-the-badge)](https://zenodo.org/doi/10.5281/zenodo.12698421) 13 | 14 | ![banner](images/index_banner.png) 15 | 16 | ## Welcome! 17 | 18 | Dython is a set of **D**ata analysis tools in p**YTHON** 3.x, which can let you get more insights about your data. 
19 | 20 | This library was designed with analysis usage in mind - meaning ease-of-use, functionality and readability are the core 21 | values of this library. Production-grade performance, on the other hand, was not considered. 22 | 23 | **Here are some cool things you can do with it:** 24 | 25 | Given a dataset, Dython will automatically find which features are categorical and which are numerical, 26 | compute a relevant measure of association between each and every feature, and plot it all as an easy-to-read 27 | heat-map. And all this is done with a single line: 28 | 29 | ```python 30 | from dython.nominal import associations 31 | associations(data) 32 | ``` 33 | The result: 34 | 35 | ![associations_iris_example](images/associations_iris_example.png) 36 | 37 | Here's another thing - given a machine-learning multi-class model's predictions, you can easily display 38 | each class' ROC curve, AUC score and find the estimated-optimal thresholds - again, with a single line of code: 39 | 40 | ```python 41 | from dython.model_utils import metric_graph 42 | 43 | metric_graph(y_true, y_pred, metric='roc') 44 | ``` 45 | The result: 46 | 47 | ![roc_example](images/roc_example.png) 48 | 49 | ## Installation 50 | Dython can be installed directly using `pip`: 51 | ```bash 52 | pip install dython 53 | ``` 54 | Other installation options are available, see the [installation page](getting_started/installation.md) 55 | for more information. 56 | 57 | ## Examples 58 | See some usage examples of `nominal.associations` and `model_utils.roc_graph` on the [examples page](getting_started/examples.md). 59 | All examples can also be imported and executed from `dython.examples`. 
60 | 61 | ## Citing 62 | Use this reference to cite if you use Dython in a paper: 63 | ```bibtex 64 | @software{Zychlinski_dython_2018, 65 | author = {Zychlinski, Shaked}, 66 | title = {{dython}}, 67 | year = {2018}, 68 | url = {https://github.com/shakedzy/dython}, 69 | doi = {10.5281/zenodo.12698421} 70 | } 71 | ``` -------------------------------------------------------------------------------- /docs/modules/data_utils.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: data_utils 3 | --- 4 | 5 | # data_utils 6 | 7 | #### `identify_columns_with_na` 8 | 9 | `identify_columns_with_na(dataset)` 10 | 11 | Given a dataset, return columns names having NA values, 12 | sorted in descending order by their number of NAs. 13 | 14 | - **`dataset`** : `np.ndarray` / `pd.DataFrame` 15 | 16 | **Returns:** A `pd.DataFrame` of two columns (`['column', 'na_count']`), consisting of only 17 | the names of columns with NA values, sorted by their number of NA values. 18 | 19 | **Example:** 20 | ```python 21 | >>> df = pd.DataFrame({'col1': ['a', np.nan, 'a', 'a'], 'col2': [3, np.nan, 2, np.nan], 'col3': [1., 2., 3., 4.]}) 22 | >>> identify_columns_with_na(df) 23 | column na_count 24 | 1 col2 2 25 | 0 col1 1 26 | ``` 27 | 28 | __________________ 29 | 30 | #### `identify_columns_by_type` 31 | 32 | `identify_columns_by_type(dataset, include)` 33 | 34 | Given a dataset, identify columns of the types requested. 35 | 36 | - **`dataset`** : `np.ndarray` / `pd.DataFrame` 37 | 38 | - **`include`** : `list` 39 | 40 | which column types to filter by. 
41 | 42 | **Returns:** list of categorical columns 43 | 44 | **Example:** 45 | ```python 46 | >>> df = pd.DataFrame({'col1': ['a', 'b', 'c', 'a'], 'col2': [3, 4, 2, 1], 'col3': [1., 2., 3., 4.]}) 47 | >>> identify_columns_by_type(df, include=['int64', 'float64']) 48 | ['col2', 'col3'] 49 | ``` 50 | 51 | __________________ 52 | 53 | #### `one_hot_encode` 54 | 55 | `one_hot_encode(arr, classes=None)` 56 | 57 | One-hot encode a 1D array. Based on this [StackOverflow answer](https://stackoverflow.com/a/29831596/5863503). 58 | 59 | - **`arr`** : array-like 60 | 61 | An array to be one-hot encoded. Must contain only non-negative integers 62 | 63 | - **`classes`** : `int` or `None` 64 | 65 | number of classes. if None, max value of the array will be used 66 | 67 | **Returns:** 2D one-hot encoded array 68 | 69 | **Example:** 70 | ```python 71 | >>> one_hot_encode([1,0,5]) 72 | [[0. 1. 0. 0. 0. 0.] 73 | [1. 0. 0. 0. 0. 0.] 74 | [0. 0. 0. 0. 0. 1.]] 75 | ``` 76 | __________________ 77 | 78 | #### `split_hist` 79 | 80 | `split_hist(dataset, values, split_by, title='', xlabel='', ylabel=None, figsize=None, legend='best', plot=True, **hist_kwargs)` 81 | 82 | Plot a histogram of values from a given dataset, split by the values of a chosen column 83 | 84 | - **`dataset`** : `pd.DataFrame` 85 | 86 | - **`values`** : `string` 87 | 88 | The column name of the values to be displayed in the histogram 89 | 90 | - **`split_by`** : `string` 91 | 92 | The column name of the values to split the histogram by 93 | 94 | - **`title`** : `string` or `None`, default = '' 95 | 96 | The plot's title. If empty string, will be '{values} by {split_by}' 97 | 98 | - **`xlabel`**: `string` or `None`, default = '' 99 | 100 | x-axis label. If empty string, will be '{values}' 101 | 102 | - **`ylabel`**: `string` or `None`, default: `None` 103 | 104 | y-axis label 105 | 106 | - **`figsize`**: (`int`,`int`) or `None`, default = `None` 107 | 108 | A Matplotlib figure-size tuple. 
If `None`, falls back to Matplotlib's default. 109 | 110 | - **`legend`**: `string` or `None`, default = 'best' 111 | 112 | A Matplotlib legend location string. See Matplotlib documentation for possible options 113 | 114 | - **`plot`**: `Boolean`, default = True 115 | 116 | Plot the histogram 117 | 118 | - **`hist_kwargs`**: key-value pairs 119 | 120 | A key-value pairs to be passed to Matplotlib hist method. See Matplotlib documentation for possible options 121 | 122 | **Returns:** A Matplotlib `Axe` 123 | 124 | **Example:** See [examples](../getting_started/examples.md). -------------------------------------------------------------------------------- /docs/modules/model_utils.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: model_utils 3 | --- 4 | 5 | # model_utils 6 | 7 | #### `ks_abc` 8 | 9 | `ks_abc(y_true, y_pred, ax=None, figsize=None, colors=('darkorange', 'b'), title=None, xlim=(0.,1.), ylim=(0.,1.), fmt='.2f', lw=2, legend='best', plot=True, filename=None)` 10 | 11 | Perform the Kolmogorov–Smirnov test over the positive and negative distributions of a binary classifier, and compute 12 | the area between curves. 13 | 14 | The KS test plots the fraction of positives and negatives predicted correctly below each threshold. It then finds 15 | the optimal threshold, being the one enabling the best class separation. 16 | 17 | The area between curves allows a better insight into separation. The higher the area is (1 being the maximum), the 18 | more the positive and negative distributions' center-of-mass are closer to 1 and 0, respectively. 19 | 20 | Based on [scikit-plot](https://github.com/reiinakano/scikit-plot) `plot_ks_statistic` method. 
21 | 22 | - **`y_true`** : array-like 23 | 24 | The true labels of the dataset 25 | 26 | - **`y_pred`** : array-like 27 | 28 | The probabilities predicted by a binary classifier 29 | 30 | - **`ax`** : matplotlib ax 31 | 32 | _Default: None_ 33 | 34 | Matplotlib Axis on which the curves will be plotted 35 | 36 | - **`figsize`** : `(int,int)` or `None` 37 | 38 | _Default: None_ 39 | 40 | a Matplotlib figure-size tuple. If `None`, falls back to Matplotlib's 41 | default. Only used if `ax=None` 42 | 43 | - **`colors`** : list of Matplotlib color strings 44 | 45 | _Default: `('darkorange', 'b')`_ 46 | 47 | List of colors to be used for the plotted curves 48 | 49 | - **`title`** : string or `None` 50 | 51 | _Default: None_ 52 | 53 | Plotted graph title. If `None`, default title is used 54 | 55 | - **`xlim`** : `(float, float)` 56 | 57 | _Default: (0.,1.)_ 58 | 59 | X-axis limits. 60 | 61 | - **`ylim`** : `(float,float)` 62 | 63 | _Default: (0.,1.)_ 64 | 65 | Y-axis limits. 66 | 67 | - **`fmt`** : `string` 68 | 69 | _Default: '.2f'_ 70 | 71 | String formatting of displayed numbers. 72 | 73 | - **`lw`** : `int` 74 | 75 | _Default: 2_ 76 | 77 | Line-width. 78 | 79 | - **`legend`**: `string` or `None` 80 | 81 | _Default: 'best'_ 82 | 83 | A Matplotlib legend location string. See Matplotlib documentation for possible options 84 | 85 | - **`plot`**: `Boolean`, default = True 86 | 87 | Plot the KS curves 88 | 89 | - **`filename`**: `string` or `None` 90 | 91 | _Default: None_ 92 | 93 | If not None, plot will be saved to the given file name. 94 | 95 | **Returns:** A dictionary of the following keys: 96 | 97 | - `abc`: area between curves 98 | 99 | - `ks_stat`: computed statistic of the KS test 100 | 101 | - `eopt`: estimated optimal threshold 102 | 103 | - `ax`: the ax used to plot the curves 104 | 105 | **Example:** See [examples](../getting_started/examples.md). 
106 | 107 | __________________ 108 | 109 | #### `metric_graph` 110 | 111 | `metric_graph(y_true, y_pred, metric, micro=True, macro=True, eoptimal_threshold=True, class_names=None, colors=None, ax=None, figsize=None, xlim=(0.,1.), ylim=(0.,1.02), lw=2, ls='-', ms=10, fmt='.2f', title=None, filename=None, force_multiclass=False)` 112 | 113 | Plot a metric graph of predictor's results (including AUC scores), where each 114 | row of y_true and y_pred represent a single example. 115 | 116 | **ROC:** 117 | Plots true-positive rate as a function of the false-positive rate of the positive label in a binary classification, 118 | where $TPR = TP / (TP + FN)$ and $FPR = FP / (FP + TN)$. A naive algorithm will display a linear line going from 119 | (0,0) to (1,1), therefore having an area under-curve (AUC) of 0.5. 120 | 121 | **Precision-Recall:** 122 | Plots precision as a function of recall of the positive label in a binary classification, where 123 | $Precision = TP / (TP + FP)$ and $Recall = TP / (TP + FN)$. A naive algorithm will display a horizontal linear 124 | line with precision of the ratio of positive examples in the dataset. 125 | 126 | Based on [scikit-learn examples](http://scikit-learn.org/stable/auto_examples/model_selection/plot_roc.html) (as was seen on April 2018): 127 | 128 | - **`y_true`** : `list / NumPy ndarray` 129 | 130 | The true classes of the predicted data. 131 | If only one or two columns exist, the data is treated as a binary 132 | classification (see input example below). 133 | If there are more than 2 columns, each column is considered a 134 | unique class, and a ROC graph and AUC score will be computed for each. 135 | 136 | - **`y_pred`** : `list / NumPy ndarray` 137 | 138 | The predicted classes. Must have the same shape as `y_true`. 139 | 140 | - **`metric`** : `string` 141 | 142 | The metric graph to plot. 
Currently supported: 'roc' for Receiver Operating Characteristic curve and 143 | 'pr' for Precision-Recall curve 144 | 145 | - **`micro`** : `Boolean` 146 | 147 | _Default: True_ 148 | 149 | Whether to calculate a Micro graph (not applicable for binary cases) 150 | 151 | - **`macro`** : `Boolean` 152 | 153 | _Default: True_ 154 | 155 | Whether to calculate a Macro graph (ROC metric only, not applicable for binary cases) 156 | 157 | - **`eopt`** : `Boolean` 158 | 159 | _Default: True_ 160 | 161 | Whether to calculate and display the estimated-optimal threshold 162 | for each metric graph. For ROC curves, the estimated-optimal threshold is the closest 163 | computed threshold with (fpr,tpr) values closest to (0,1). For PR curves, it is 164 | the closest one to (1,1) (perfect recall and precision) 165 | 166 | - **`class_names`**: `list` or `string` 167 | 168 | _Default: None_ 169 | 170 | Names of the different classes. In a multi-class classification, the 171 | order must match the order of the classes probabilities in the input 172 | data. In a binary classification, can be a string or a list. If a list, 173 | only the last element will be used. 174 | 175 | - **`colors`** : list of Matplotlib color strings or `None` 176 | 177 | _Default: None_ 178 | 179 | List of colors to be used for the plotted curves. If `None`, falls back 180 | to a predefined default. 181 | 182 | - **`ax`** : matplotlib `ax` 183 | 184 | _Default: None_ 185 | 186 | Matplotlib Axis on which the curves will be plotted 187 | 188 | - **`figsize`** : `(int,int)` or `None` 189 | 190 | _Default: None_ 191 | 192 | A Matplotlib figure-size tuple. If `None`, falls back to Matplotlib's 193 | default. Only used if `ax=None`. 194 | 195 | - **`xlim`** : `(float, float)` 196 | 197 | _Default: (0.,1.)_ 198 | 199 | X-axis limits. 200 | 201 | - **`ylim`** : `(float,float)` 202 | 203 | _Default: (0.,1.02)_ 204 | 205 | Y-axis limits. 206 | 207 | - **`lw`** : `int` 208 | 209 | _Default: 2_ 210 | 211 | Line-width. 
212 | 213 | - **`ls`** : `string` 214 | 215 | _Default: '-'_ 216 | 217 | Matplotlib line-style string 218 | 219 | - **`ms`** : `int` 220 | 221 | _Default: 10_ 222 | 223 | Marker-size. 224 | 225 | - **`fmt`** : `string` 226 | 227 | _Default: '.2f'_ 228 | 229 | String formatting of displayed AUC and threshold numbers. 230 | 231 | - **`legend`**: `string` or `None` 232 | 233 | _Default: 'best'_ 234 | 235 | A Matplotlib legend location string. See Matplotlib documentation for possible options 236 | 237 | - **`plot`**: `Boolean`, default = True 238 | 239 | Plot the histogram 240 | 241 | - **`title`**: `string` or `None` 242 | 243 | _Default: None_ 244 | 245 | Plotted graph title. If None, default title is used. 246 | 247 | - **`filename`**: `string` or `None` 248 | 249 | _Default: None_ 250 | 251 | If not None, plot will be saved to the given file name. 252 | 253 | - **`force_multiclass`**: `Boolean` 254 | 255 | _Default: False_ 256 | 257 | Only applicable if `y_true` and `y_pred` have two columns. If so, 258 | consider the data as a multiclass data rather than binary (useful when plotting 259 | curves of different models one against the other) 260 | 261 | **Returns:** A dictionary, one key for each class. Each value is another dictionary, 262 | holding AUC and eOpT values. 263 | 264 | **Example:** See [examples](../getting_started/examples.md). 265 | 266 | **Binary Classification Input Example:** 267 | Consider a data-set of two data-points where the true class of the first line 268 | is class 0, which was predicted with a probability of 0.6, and the second line's 269 | true class is 1, with predicted probability of 0.8. 
270 | ```python 271 | # First option: 272 | >>> metric_graph(y_true=[0,1], y_pred=[0.6,0.8], metric='roc') 273 | # Second option: 274 | >>> metric_graph(y_true=[[1,0],[0,1]], y_pred=[[0.6,0.4],[0.2,0.8]], metric='roc') 275 | # Both yield the same result 276 | ``` 277 | 278 | __________________ 279 | 280 | 281 | #### `random_forest_feature_importance` 282 | 283 | `random_forest_feature_importance(forest, features, precision=4)` 284 | 285 | Given a trained `sklearn.ensemble.RandomForestClassifier`, plot the different features based on their 286 | importance according to the classifier, from the most important to the least. 287 | 288 | - **`forest`** : `sklearn.ensemble.RandomForestClassifier` 289 | 290 | A trained `RandomForestClassifier` 291 | 292 | - **`features`** : `list` 293 | 294 | A list of the names of the features the classifier was trained on, ordered in the same order they appeared in the training data 295 | 296 | - **`precision`** : `int` 297 | 298 | _Default: 4_ 299 | 300 | Precision of feature importance. 
301 | -------------------------------------------------------------------------------- /docs/modules/nominal.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: nominal 3 | --- 4 | 5 | # nominal 6 | 7 | #### `associations` 8 | 9 | `associations(dataset, nominal_columns='auto', numerical_columns=None, mark_columns=False,nom_nom_assoc='cramer', num_num_assoc='pearson', nom_num_assoc='correlation_ratio', symmetric_nom_nom=True, symmetric_num_num=True, display_rows='all', display_columns='all', hide_rows=None, hide_columns=None, cramers_v_bias_correction=True, nan_strategy=_REPLACE, nan_replace_value=_DEFAULT_REPLACE_VALUE, ax=None, figsize=None, annot=True, fmt='.2f', cmap=None, sv_color='silver', cbar=True, vmax=1.0, vmin=None, plot=True, compute_only=False, clustering=False, title=None, filename=None, multiprocessing=False, max_cpu_cores=None)` 10 | 11 | Calculate the correlation/strength-of-association of features in data-set with both categorical and 12 | continuous features using: 13 | * Pearson's R for continuous-continuous cases 14 | * Correlation Ratio for categorical-continuous cases 15 | * Cramer's V or Theil's U for categorical-categorical cases 16 | 17 | - **`dataset`** : `NumPy ndarray / Pandas DataFrame` 18 | 19 | The data-set for which the features' correlation is computed 20 | 21 | - **`nominal_columns`** : `string / list / NumPy ndarray` 22 | 23 | _Default: 'auto'_ 24 | 25 | Names of columns of the data-set which hold categorical values. Can also be the string 'all' to state that all 26 | columns are categorical, 'auto' (default) to identify nominal columns automatically, or None to state none are 27 | categorical. Only used if `numerical_columns` is `None`. 28 | 29 | - **`numerical_columns`** : `string / list / NumPy ndarray` 30 | 31 | _Default: None_ 32 | 33 | To be used instead of `nominal_columns`. Names of columns of the data-set 34 | which hold numerical values. 
Can also be the string 'all' to state that 35 | all columns are numerical (equivalent to `nominal_columns=None`) or 36 | 'auto' to try to identify numerical columns (equivalent to 37 | `nominal_columns='auto'`). If `None`, `nominal_columns` is used. 38 | 39 | - **`mark_columns`** : `Boolean` 40 | 41 | _Default: False_ 42 | 43 | if True, output's columns' names will have a suffix of '(nom)' or '(con)' based on their type (nominal or 44 | continuous), as provided by nominal_columns 45 | 46 | - **`nom_nom_assoc`** : `callable / string` 47 | 48 | _Default: 'cramer'_ 49 | 50 | !!! info "Method signature change" 51 | This replaces the `theil_u` flag which was used till version 0.6.6. 52 | 53 | If callable, a function which receives two `pd.Series` and returns a single number. 54 | 55 | If string, name of nominal-nominal (categorical-categorical) association to use: 56 | 57 | * `cramer`: Cramer's V 58 | 59 | * `theil`: Theil's U. When selected, heat-map columns are the provided information (meaning: $U = U(row|col)$) 60 | 61 | - **`num_num_assoc`** : `callable / string` 62 | 63 | _Default: 'pearson'_ 64 | 65 | If callable, a function which receives two `pd.Series` and returns a single number. 66 | 67 | If string, name of numerical-numerical association to use: 68 | 69 | * `pearson`: Pearson's R 70 | 71 | * `spearman`: Spearman's R 72 | 73 | * `kendall`: Kendall's Tau 74 | 75 | - **`nom_num_assoc`** : `callable / string` 76 | 77 | _Default: 'correlation_ratio'_ 78 | 79 | If callable, a function which receives two `pd.Series` and returns a single number. 80 | 81 | If string, name of nominal-numerical association to use: 82 | 83 | * `correlation_ratio`: correlation ratio 84 | 85 | - **`symmetric_nom_nom`** : `Boolean` 86 | 87 | _Default: True_ 88 | 89 | Relevant only if `nom_nom_assoc` is a callable. If so, declare whether the function is symmetric ($f(x,y) = f(y,x)$). 90 | If False, heat-map values should be interpreted as $f(row,col)$. 
91 | 92 | - **`symmetric_num_num`** : `Boolean` 93 | 94 | _Default: True_ 95 | 96 | Relevant only if `num_num_assoc` is a callable. If so, declare whether the function is symmetric ($f(x,y) = f(y,x)$). 97 | If False, heat-map values should be interpreted as $f(row,col)$. 98 | 99 | - **`display_rows`** : `list / string` 100 | 101 | _Default: 'all'_ 102 | 103 | Choose which of the dataset's features will be displayed in the output's 104 | correlations table rows. If string, can either be a single feature's name or 'all'. 105 | Only used if `hide_rows` is `None`. 106 | 107 | - **`display_columns`** : `list / string` 108 | 109 | _Default: 'all'_ 110 | 111 | Choose which of the dataset's features will be displayed in the output's 112 | correlations table columns. If string, can either be a single feature's name or 'all'. 113 | Only used if `hide_columns` is `None`. 114 | 115 | - **`hide_rows`** : `list / string` 116 | 117 | _Default: None_ 118 | 119 | choose which of the dataset's features will not be displayed in the output's 120 | correlations table rows. If string, must be a single feature's name. If `None`, 121 | `display_rows` is used. 122 | 123 | - **`hide_columns`** : `list / string` 124 | 125 | _Default: None_ 126 | 127 | choose which of the dataset's features will not be displayed in the output's 128 | correlations table columns. If string, must be a single feature's name. If `None`, 129 | `display_columns` is used. 130 | 131 | - **`cramers_v_bias_correction`** : `Boolean` 132 | 133 | _Default: True_ 134 | 135 | !!! info "Method signature change" 136 | This replaces the `bias_correction` flag which was used till version 0.6.6. 137 | 138 | Use bias correction for Cramer's V from Bergsma and Wicher, Journal of the Korean 139 | Statistical Society 42 (2013): 323-328. 
140 | 141 | - **`nan_strategy`** : `string` 142 | 143 | _Default: 'replace'_ 144 | 145 | How to handle missing values: can be either `'drop_samples'` to remove 146 | samples with missing values, `'drop_features'` to remove features 147 | (columns) with missing values, `'replace'` to replace all missing 148 | values with the `nan_replace_value`, or `'drop_sample_pairs'` to drop each 149 | pair of missing observables separately before calculating the corresponding coefficient. 150 | Missing values are `None` and `np.nan`. 151 | 152 | - **`nan_replace_value`** : `any` 153 | 154 | _Default: 0.0_ 155 | 156 | The value used to replace missing values with. Only applicable when nan_strategy is set to 'replace' 157 | 158 | - **`ax`** : matplotlib `Axe` 159 | 160 | _Default: None_ 161 | 162 | Matplotlib Axis on which the heat-map will be plotted 163 | 164 | - **`figsize`** : `(float, float)` or `None` 165 | 166 | _Default: None_ 167 | 168 | A Matplotlib figure-size tuple. If `None`, will attempt to set the size automatically. 169 | Only used if `ax=None`. 170 | 171 | - **`annot`** : `Boolean` 172 | 173 | _Default: True_ 174 | 175 | Plot number annotations on the heat-map 176 | 177 | - **`fmt`** : `string` 178 | 179 | _Default: '.2f'_ 180 | 181 | String formatting of annotations 182 | 183 | - **`cmap`** : Matplotlib colormap or `None` 184 | 185 | _Default: None_ 186 | 187 | A colormap to be used for the heat-map. If None, falls back to Seaborn's heat-map default 188 | 189 | - **`sv_color`** : `string` 190 | 191 | _Default: 'silver'_ 192 | 193 | A Matplotlib color. The color to be used when displaying single-value features over the heat-map 194 | 195 | - **`cbar`** : `Boolean` 196 | 197 | _Default: True_ 198 | 199 | Display heat-map's color-bar 200 | 201 | - **`vmax`** : `float` 202 | 203 | _Default: 1.0_ 204 | 205 | Set heat-map `vmax` option 206 | 207 | - **`vmin`** : `float` or `None` 208 | 209 | _Default: None_ 210 | 211 | Set heat-map `vmin` option. 
If set to `None`, `vmin` will be chosen automatically 212 | between 0 and -1.0, depending on the types of associations used (-1.0 if Pearson's R 213 | is used, 0 otherwise) 214 | 215 | - **`plot`** : `Boolean` 216 | 217 | _Default: True_ 218 | 219 | Plot a heat-map of the correlation matrix. If False, heat-map will still be 220 | drawn, but not shown. The heat-map's `ax` is part of this function's output. 221 | 222 | - **`compute_only`** : `Boolean` 223 | 224 | _Default: False_ 225 | 226 | Use this flag only if you have no need of the plotting at all. This skips the entire 227 | plotting mechanism (similar to the old `compute_associations` method). 228 | 229 | - **`clustering`** : `Boolean` 230 | 231 | _Default: False_ 232 | 233 | If True, the computed associations will be sorted into groups by similar correlations 234 | 235 | - **`title`**: `string` or `None` 236 | 237 | _Default: None_ 238 | 239 | Plotted graph title. 240 | 241 | - **`filename`**: `string` or `None` 242 | 243 | _Default: None_ 244 | 245 | If not None, plot will be saved to the given file name. 246 | 247 | - **`multiprocessing`**: `Boolean` 248 | 249 | _Default: False_ 250 | 251 | If True, use multiprocessing to speed up computations. If None, falls back to single core computation 252 | 253 | - **`max_cpu_cores`**: `int` or `None` 254 | 255 | _Default_: `None` 256 | 257 | If not `None`, `ProcessPoolExecutor` will use the given number of CPU cores 258 | 259 | **Returns:** A dictionary with the following keys: 260 | 261 | - `corr`: A DataFrame of the correlation/strength-of-association between all features 262 | - `ax`: A Matplotlib `Axe` 263 | 264 | **Example:** See [examples](../getting_started/examples.md). 265 | __________________ 266 | 267 | #### `cluster_correlations` 268 | 269 | `cluster_correlations(corr_mat, indexes=None)` 270 | 271 | Apply agglomerative clustering in order to sort a correlation matrix. 
272 | Based on [this clustering example](https://github.com/TheLoneNut/CorrelationMatrixClustering/blob/master/CorrelationMatrixClustering.ipynb). 273 | 274 | - **`corr_mat`** : `Pandas DataFrame` 275 | 276 | A correlation matrix (as output from `associations`) 277 | 278 | - **`indexes`** : `list / NumPy ndarray / Pandas Series` 279 | 280 | A sequence of cluster indexes for sorting. If not present, a clustering is performed. 281 | 282 | **Returns:** 283 | 284 | - a sorted correlation matrix (`pd.DataFrame`) 285 | - cluster indexes based on the original dataset (`list`) 286 | 287 | **Example:** 288 | ```python 289 | >>> assoc = associations( 290 | customers, 291 | plot=False 292 | ) 293 | >>> correlations = assoc['corr'] 294 | >>> correlations, _ = cluster_correlations(correlations) 295 | ``` 296 | 297 | __________________ 298 | 299 | #### `compute_associations` 300 | 301 | !!! warning "Deprecated" 302 | 303 | `compute_associations` was deprecated and removed. Use `associations(compute_only=True)['corr']`. 304 | 305 | __________________ 306 | 307 | #### `conditional_entropy` 308 | 309 | `conditional_entropy(x, y, nan_strategy=REPLACE, nan_replace_value=DEFAULT_REPLACE_VALUE, log_base=math.e)` 310 | 311 | Given measurements `x` and `y` of random variables $X$ and $Y$, calculates the conditional entropy of $X$ given $Y$: 312 | 313 | $$ S(X|Y) = - \sum_{x,y} p(x,y) \log\frac{p(x,y)}{p(y)} $$ 314 | 315 | Read more on [Wikipedia](https://en.wikipedia.org/wiki/Conditional_entropy). 316 | 317 | - **`x`** : `list / NumPy ndarray / Pandas Series` 318 | 319 | A sequence of measurements 320 | 321 | - **`y`** : `list / NumPy ndarray / Pandas Series` 322 | 323 | A sequence of measurements 324 | 325 | - **`nan_strategy`** : `string` 326 | 327 | _Default: 'replace'_ 328 | 329 | How to handle missing values: can be either 'drop' to remove samples with missing values, or 'replace' to replace all missing values with the nan_replace_value. Missing values are None and np.nan. 
330 | 331 | - **`nan_replace_value`** : `any` 332 | 333 | _Default: 0.0_ 334 | 335 | The value used to replace missing values with. Only applicable when nan_strategy is set to 'replace'. 336 | 337 | - **`log_base`** : `float` 338 | 339 | _Default: `math.e`_ 340 | 341 | Specifying base for calculating entropy. 342 | 343 | **Returns:** `float` 344 | 345 | __________________ 346 | 347 | #### `correlation_ratio` 348 | 349 | `correlation_ratio(categories, measurements, nan_strategy=REPLACE, nan_replace_value=DEFAULT_REPLACE_VALUE)` 350 | 351 | Calculates the Correlation Ratio ($\eta$) for categorical-continuous association: 352 | 353 | $$ \eta = \sqrt{\frac{\sum_x{n_x (\bar{y}_x - \bar{y})^2}}{\sum_{x,i}{(y_{xi}-\bar{y})^2}}} $$ 354 | 355 | where $n_x$ is the number of observations in category $x$, and we define: 356 | 357 | $$\bar{y}_x = \frac{\sum_i{y_{xi}}}{n_x} , \bar{y} = \frac{\sum_i{n_x \bar{y}_x}}{\sum_x{n_x}}$$ 358 | 359 | Answers the question - given a continuous value of a measurement, is it possible to know which category is it 360 | associated with? 361 | Value is in the range [0,1], where 0 means a category cannot be determined by a continuous measurement, and 1 means 362 | a category can be determined with absolute certainty. 363 | Read more on [Wikipedia](https://en.wikipedia.org/wiki/Correlation_ratio). 364 | 365 | - **`categories`** : `list / NumPy ndarray / Pandas Series` 366 | 367 | A sequence of categorical measurements 368 | 369 | - **`measurements`** : `list / NumPy ndarray / Pandas Series` 370 | 371 | A sequence of continuous measurements 372 | 373 | - **`nan_strategy`** : `string` 374 | 375 | _Default: 'replace'_ 376 | 377 | How to handle missing values: can be either 'drop' to remove samples with missing values, or 'replace' to replace all missing values with the nan_replace_value. Missing values are None and np.nan. 
378 | 379 | - **`nan_replace_value`** : `any` 380 | 381 | _Default: 0.0_ 382 | 383 | The value used to replace missing values with. Only applicable when nan_strategy is set to 'replace'. 384 | 385 | **Returns:** float in the range of [0,1] 386 | 387 | __________________ 388 | 389 | #### `cramers_v` 390 | 391 | `cramers_v(x, y, bias_correction=True, nan_strategy=REPLACE, nan_replace_value=DEFAULT_REPLACE_VALUE)` 392 | 393 | Calculates Cramer's V statistic for categorical-categorical association. 394 | This is a symmetric coefficient: $V(x,y) = V(y,x)$. Read more on [Wikipedia](https://en.wikipedia.org/wiki/Cram%C3%A9r%27s_V). 395 | 396 | Original function taken from [this answer](https://stackoverflow.com/a/46498792/5863503) on StackOverflow. 397 | 398 | !!! info "Cramer's V limitations when applied on skewed or small datasets" 399 | 400 | As the Cramer's V measure of association depends directly on the counts of each samples-pair in the data, it tends to be suboptimal when applied on skewed or small datasets. 
401 | 402 | Consider each of the following cases, where we would expect Cramer's V to reach a high value, yet this only happens in the first scenario: 403 | 404 | ```python 405 | >>> x = ['a'] * 400 + ['b'] * 100 406 | >>> y = ['X'] * 400 + ['Y'] * 100 407 | >>> cramers_v(x,y) 408 | 0.9937374102534072 409 | 410 | # skewed dataset 411 | >>> x = ['a'] * 500 + ['b'] * 1 412 | >>> y = ['X'] * 500 + ['Y'] * 1 413 | >>> cramers_v(x,y) 414 | 0.4974896903293253 415 | 416 | # very small dataset 417 | >>> x = ['a'] * 4 + ['b'] * 1 418 | >>> y = ['X'] * 4 + ['Y'] * 1 419 | >>> cramers_v(x,y) 420 | 0.0 421 | ``` 422 | 423 | - **`x`** : `list / NumPy ndarray / Pandas Series` 424 | 425 | A sequence of categorical measurements 426 | 427 | - **`y`** : `list / NumPy ndarray / Pandas Series` 428 | 429 | A sequence of categorical measurements 430 | 431 | - **`bias_correction`** : `Boolean` 432 | 433 | _Default: True_ 434 | 435 | Use bias correction from Bergsma and Wicher, Journal of the Korean Statistical Society 42 (2013): 323-328. 436 | 437 | - **`nan_strategy`** : `string` 438 | 439 | _Default: 'replace'_ 440 | 441 | How to handle missing values: can be either 'drop' to remove samples with missing values, or 'replace' to replace all missing values with the nan_replace_value. Missing values are None and np.nan. 442 | 443 | - **`nan_replace_value`** : `any` 444 | 445 | _Default: 0.0_ 446 | 447 | The value used to replace missing values with. Only applicable when nan_strategy is set to 'replace'. 448 | 449 | **Returns:** float in the range of [0,1] 450 | 451 | __________________ 452 | 453 | #### `identify_nominal_columns` 454 | 455 | `identify_nominal_columns(dataset)` 456 | 457 | Given a dataset, identify categorical columns. This is used internally in `associations` and `numerical_encoding`, 458 | but can also be used directly. 459 | 460 | !!! 
info "Note:" 461 | 462 | This is a shortcut for `data_utils.identify_columns_by_type(dataset, include=['object', 'category'])` 463 | 464 | - **`dataset`** : `np.ndarray` / `pd.DataFrame` 465 | 466 | **Returns:** list of categorical columns 467 | 468 | **Example:** 469 | ```python 470 | >>> df = pd.DataFrame({'col1': ['a', 'b', 'c', 'a'], 'col2': [3, 4, 2, 1]}) 471 | >>> identify_nominal_columns(df) 472 | ['col1'] 473 | ``` 474 | 475 | __________________ 476 | 477 | #### `identify_numeric_columns` 478 | 479 | `identify_numeric_columns(dataset)` 480 | 481 | Given a dataset, identify numeric columns. 482 | 483 | !!! info "Note:" 484 | 485 | This is a shortcut for `data_utils.identify_columns_by_type(dataset, include=['int64', 'float64'])` 486 | 487 | - **`dataset`** : `np.ndarray` / `pd.DataFrame` 488 | 489 | **Returns:** list of numerical columns 490 | 491 | **Example:** 492 | ```python 493 | >>> df = pd.DataFrame({'col1': ['a', 'b', 'c', 'a'], 'col2': [3, 4, 2, 1], 'col3': [1., 2., 3., 4.]}) 494 | >>> identify_numeric_columns(df) 495 | ['col2', 'col3'] 496 | ``` 497 | 498 | __________________ 499 | 500 | #### `numerical_encoding` 501 | 502 | `numerical_encoding(dataset, nominal_columns='auto', drop_single_label=False, drop_fact_dict=True, nan_strategy=REPLACE, nan_replace_value=DEFAULT_REPLACE_VALUE)` 503 | 504 | Encoding a data-set with mixed data (numerical and categorical) to a numerical-only data-set, 505 | using the following logic: 506 | 507 | * categorical with only a single value will be marked as zero (or dropped, if requested) 508 | 509 | * categorical with two values will be replaced with the result of Pandas `factorize` 510 | 511 | * categorical with more than two values will be replaced with the result of Pandas `get_dummies` 512 | 513 | * numerical columns will not be modified 514 | 515 | - **`dataset`** : `NumPy ndarray / Pandas DataFrame` 516 | 517 | The data-set to encode 518 | 519 | - **`nominal_columns`** : `sequence / string ` 520 | 521 | 
_Default: 'auto'_ 522 | 523 | Names of columns of the data-set which hold categorical values. Can also be the string 'all' to state that all columns are categorical, 'auto' (default) to identify nominal columns automatically, or None to state none are categorical (nothing happens) 524 | 525 | - **`drop_single_label`** : `Boolean` 526 | 527 | _Default: False_ 528 | 529 | If True, nominal columns with a only a single value will be dropped. 530 | 531 | - **`drop_fact_dict`** : `Boolean` 532 | 533 | _Default: True_ 534 | 535 | If True, the return value will be the encoded DataFrame alone. If False, it will be a tuple of the DataFrame and the dictionary of the binary factorization (originating from pd.factorize) 536 | 537 | - **`nan_strategy`** : `string` 538 | 539 | _Default: 'replace'_ 540 | 541 | How to handle missing values: can be either 'drop_samples' to remove samples with missing values, 'drop_features' to remove features (columns) with missing values, or 'replace' to replace all missing values with the nan_replace_value. Missing values are None and np.nan. 542 | 543 | - **`nan_replace_value`** : `any` 544 | 545 | _Default: 0.0_ 546 | 547 | The value used to replace missing values with. Only applicable when nan_strategy is set to 'replace' 548 | 549 | **Returns:** `pd.DataFrame` or `(pd.DataFrame, dict)`. If `drop_fact_dict` is True, returns the encoded DataFrame. 550 | else, returns a tuple of the encoded DataFrame and dictionary, where each key is a two-value column, and the 551 | value is the original labels, as supplied by Pandas `factorize`. Will be empty if no two-value columns are 552 | present in the data-set 553 | 554 | __________________ 555 | 556 | #### `replot_last_associations` 557 | 558 | `replot_last_associations(ax=None, figsize=None, annot=None, fmt=None, cmap=None, sv_color=None, cbar=None, vmax=None, vmin=None, plot=True, title=None, filename=None)` 559 | 560 | Re-plot last computed associations heat-map. 
This method performs no new computations, but only allows 561 | to change the visual output of the last computed heat-map. 562 | 563 | - **`ax`** : matplotlib `Axe` 564 | 565 | _Default: `None`_ 566 | 567 | Matplotlib Axis on which the heat-map will be plotted 568 | 569 | - **`figsize`** : `(int,int)` or `None` 570 | 571 | _Default: `None`_ 572 | 573 | A Matplotlib figure-size tuple. If `None`, uses the last `associations` call value. 574 | Only used if `ax=None`. 575 | 576 | - **`annot`** : `Boolean` or `None` 577 | 578 | _Default: `None`_ 579 | 580 | Plot number annotations on the heat-map. If `None`, uses the last `associations` call value. 581 | 582 | - **`fmt`** : `string` 583 | 584 | _Default: `None`_ 585 | 586 | String formatting of annotations. If `None`, uses the last `associations` call value. 587 | 588 | - **`cmap`** : Matplotlib `colormap` or `None` 589 | 590 | _Default: `None`_ 591 | 592 | A colormap to be used for the heat-map. If `None`, uses the last `associations` call value. 593 | 594 | - **`sv_color`** : `string` 595 | 596 | _Default: `None`_ 597 | 598 | A Matplotlib color. The color to be used when displaying single-value. 599 | If `None`, uses the last `associations` call value. 600 | 601 | - **`cbar`** : `Boolean `or `None` 602 | 603 | _Default: `None`_ 604 | 605 | Display heat-map's color-bar. If `None`, uses the last `associations` call value. 606 | 607 | - **`vmax`** : `float` or `None` 608 | 609 | _Default: `None`_ 610 | 611 | Set heat-map `vmax` option. If `None`, uses the last `associations` call value. 612 | 613 | - **`vmin`** : `float` or `None` 614 | 615 | _Default: `None`_ 616 | 617 | Set heat-map `vmin` option. If `None`, uses the last `associations` call value. 618 | 619 | - **`plot`** : `Boolean` 620 | 621 | _Default: `True`_ 622 | 623 | Plot a heat-map of the correlation matrix. If False, plotting still 624 | happens, but the heat-map will not be displayed. 
625 | 626 | - **`title`** : `string` or `None` 627 | 628 | _Default: `None`_ 629 | 630 | Plotted graph title. If `None`, uses the last `associations` call value. 631 | 632 | - **`filename`** : `string` or `None` 633 | 634 | _Default: `None`_ 635 | 636 | If not `None`, plot will be saved to the given file name. Note: in order to avoid accidental file 637 | overwrites, the last `associations` call value is never used, and when filename is set to None, 638 | no writing to file occurs. 639 | 640 | **Returns:** A Matplotlib `Axe` 641 | 642 | __________________ 643 | 644 | #### `theils_u` 645 | 646 | `theils_u(x, y, nan_strategy=REPLACE, nan_replace_value=DEFAULT_REPLACE_VALUE)` 647 | 648 | Calculates Theil's U statistic (Uncertainty coefficient) for categorical-categorical association, defined as: 649 | 650 | $$ U(X|Y) = \frac{S(X) - S(X|Y)}{S(X)} $$ 651 | 652 | where $S(X)$ is the entropy of $X$ and $S(X|Y)$ is the [conditional entropy](#conditional_entropy) of $X$ given $Y$. 653 | 654 | This is the uncertainty of x given y: value is on the range of [0,1] - where 0 means y provides no information about 655 | x, and 1 means y provides full information about x. 656 | This is an asymmetric coefficient: $U(x,y) \neq U(y,x)$. Read more on 657 | [Wikipedia](https://en.wikipedia.org/wiki/Uncertainty_coefficient). 658 | 659 | - **`x`** : `list / NumPy ndarray / Pandas Series` 660 | 661 | A sequence of categorical measurements 662 | 663 | - **`y`** : `list / NumPy ndarray / Pandas Series` 664 | 665 | A sequence of categorical measurements 666 | 667 | - **`nan_strategy`** : `string` 668 | 669 | _Default: 'replace'_ 670 | 671 | How to handle missing values: can be either 'drop' to remove samples with missing values, or 'replace' to replace all missing values with the nan_replace_value. Missing values are None and np.nan. 672 | 673 | - **`nan_replace_value`** : `any` 674 | 675 | _Default: 0.0_ 676 | 677 | The value used to replace missing values with. 
Only applicable when nan_strategy is set to 'replace'. 678 | 679 | **Returns:** float in the range of [0,1] 680 | 681 | -------------------------------------------------------------------------------- /docs/modules/sampling.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: sampling 3 | --- 4 | 5 | # sampling 6 | 7 | #### `boltzmann_sampling` 8 | 9 | `boltzmann_sampling(numbers, k=1, with_replacement=False)` 10 | 11 | Return k numbers from a boltzmann-sampling over the supplied numbers 12 | 13 | - **`numbers`** : `List or np.ndarray` 14 | 15 | numbers to sample 16 | 17 | - **`k`** : `int` 18 | 19 | _Default: 1_ 20 | 21 | How many numbers to sample. Choosing `k=None` will yield a single number 22 | 23 | - **`with_replacement`** : `Boolean` `Default: False` 24 | 25 | Allow replacement or not 26 | 27 | **Returns:** `list`, `np.ndarray` or a single number (depending on the input) 28 | 29 | __________________ 30 | 31 | #### `weighted_sampling` 32 | 33 | `weighted_sampling(numbers, k=1, with_replacement=False)` 34 | 35 | Return k numbers from a weighted-sampling over the supplied numbers 36 | 37 | - **`numbers`** : `List or np.ndarray` 38 | 39 | numbers to sample 40 | 41 | - **`k`** : `int` 42 | 43 | _Default: 1_ 44 | 45 | How many numbers to sample. 
Choosing `k=None` will yield a single number 46 | 47 | - **`with_replacement`** : `Boolean` 48 | 49 | _Default: False_ 50 | 51 | Allow replacement or not 52 | 53 | **Returns:** `list`, `np.ndarray` or a single number (depending on the input) -------------------------------------------------------------------------------- /docs/overrides/main.html: -------------------------------------------------------------------------------- 1 | {% extends "base.html" %} 2 | 3 | {% block extrahead %} 4 | {% set title = config.site_name %} 5 | {% if page and page.title and not page.is_homepage %} 6 | {% set title = config.site_name ~ ":" ~ page.title | striptags %} 7 | {% endif %} 8 | {% set image = config.site_url ~ 'images/social_banner.png' %} 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | {% endblock %} -------------------------------------------------------------------------------- /docs/related_blogposts.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: related blogposts 3 | --- 4 | # Related Blogposts 5 | 6 | Here are some blogposts I wrote, explaining and using some of the methods of Dython: 7 | 8 | * Read more about the categorical tools on 9 | [The Search for Categorical Correlation](https://medium.com/@shakedzy/the-search-for-categorical-correlation-a1cf7f1888c9) 10 | * Read more about using ROC graphs on 11 | [Hard ROC: Really Understanding & Properly Using ROC and AUC](https://medium.com/@shakedzy/hard-roc-really-understanding-and-properly-using-roc-and-auc-13413cf0dc24) 12 | * Read more about KS Area Between Curves and when not to use ROC graphs (and other common metrics) on 13 | [The Metric System: How to Correctly Measure Your Model](https://shakedzy.medium.com/the-metric-system-how-to-correctly-measure-your-model-17d3feaed6ab) -------------------------------------------------------------------------------- /dython/__init__.py: 
IS_JUPYTER: bool = False


def set_is_jupyter(force_to: Optional[bool] = None) -> None:
    """
    Set the module-level IS_JUPYTER flag, which controls how un-shown
    figures are discarded in `plot_or_not`.

    Parameters:
    -----------
    force_to : Boolean or None
        If not None, force the flag to this value instead of auto-detecting.
    """
    global IS_JUPYTER
    if force_to is not None:
        IS_JUPYTER = force_to
    else:
        # Jupyter kernels launch the process via ipykernel_launcher.py, so
        # its presence in argv[0] is used as the detection heuristic.
        IS_JUPYTER = "ipykernel_launcher.py" in sys.argv[0]


def plot_or_not(plot: bool) -> None:
    """
    Show the current Matplotlib figure if requested; otherwise, when running
    inside Jupyter, close it so the cell does not implicitly render it.

    Parameters:
    -----------
    plot : Boolean
        Whether to display the current figure.
    """
    if plot:
        plt.show()
    elif not plot and IS_JUPYTER:
        fig = plt.gcf()
        if fig:
            plt.close(fig)


def convert(
    data: Union[List[Number], NDArray, pd.DataFrame],
    to: Literal["array", "list", "dataframe"],
    copy: bool = True,
) -> Union[List[Number], NDArray, pd.DataFrame]:
    """
    Convert `data` to the requested container type.

    Parameters:
    -----------
    data : list / NumPy ndarray / Pandas Series / Pandas DataFrame
        The data to convert.
    to : {'array', 'list', 'dataframe'}
        Target container type.
    copy : Boolean, default = True
        When the input is already of the target type, return a copy rather
        than the same object. Note: Series/DataFrame to array conversions
        return the underlying values regardless of `copy`.

    Returns:
    --------
    The converted data.

    Raises:
    -------
    ValueError
        If `to` is not one of the supported targets.
    TypeError
        If the input type cannot be converted to the requested target.
    """
    converted = None
    if to == "array":
        if isinstance(data, np.ndarray):
            converted = data.copy() if copy else data
        elif isinstance(data, pd.Series):
            converted = data.values
        elif isinstance(data, list):
            converted = np.array(data)
        elif isinstance(data, pd.DataFrame):
            # Bug fix: `values` is a property, not a method. The previous
            # `data.values()` raised TypeError for any DataFrame input.
            converted = data.values
    elif to == "list":
        if isinstance(data, list):
            converted = data.copy() if copy else data
        elif isinstance(data, pd.Series):
            converted = data.values.tolist()
        elif isinstance(data, np.ndarray):
            converted = data.tolist()
    elif to == "dataframe":
        if isinstance(data, pd.DataFrame):
            converted = data.copy(deep=True) if copy else data
        elif isinstance(data, np.ndarray):
            converted = pd.DataFrame(data)
    else:
        raise ValueError("Unknown data conversion: {}".format(to))
    if converted is None:
        raise TypeError(
            "cannot handle data conversion of type: {} to {}".format(
                type(data), to
            )
        )
    else:
        return converted  # type: ignore


def remove_incomplete_samples(
    x: Union[List[Any], OneDimArray], y: Union[List[Any], OneDimArray]
) -> Tuple[Union[List[Any], OneDimArray], Union[List[Any], OneDimArray]]:
    """
    Remove every (x[i], y[i]) pair in which at least one of the two values
    is missing (None or NaN).

    Parameters:
    -----------
    x : list / NumPy ndarray / Pandas Series
    y : list / NumPy ndarray / Pandas Series
        Two sequences of measurements of the same length.

    Returns:
    --------
    A tuple of the two sequences with incomplete pairs removed, as lists.

    NOTE(review): stacking via np.array and filtering with np.isnan assumes
    the values are numeric (coercible to float) - non-numeric data would
    raise. Confirm callers only pass numeric sequences here.
    """
    x = [v if v is not None else np.nan for v in x]
    y = [v if v is not None else np.nan for v in y]
    arr = np.array([x, y]).transpose()
    # Keep only rows where both values are present
    arr = arr[~np.isnan(arr).any(axis=1)].transpose()
    # NOTE(review): x was rebuilt as a list above, so this check is always
    # True and the ndarray branch below is unreachable; results are always
    # returned as lists. Left as-is to avoid changing caller-visible types.
    if isinstance(x, list):
        return arr[0].tolist(), arr[1].tolist()
    else:
        return arr[0], arr[1]


def replace_nan_with_value(
    x: Union[List[Any], OneDimArray],
    y: Union[List[Any], OneDimArray],
    value: Any,
) -> Tuple[NDArray, NDArray]:
    """
    Replace every missing value (None or NaN) in both sequences with `value`.

    Parameters:
    -----------
    x : list / NumPy ndarray / Pandas Series
    y : list / NumPy ndarray / Pandas Series
        Two sequences of measurements.
    value : any
        The replacement value.

    Returns:
    --------
    A tuple of the two sequences as NumPy arrays.
    """
    x = np.array(
        [v if v == v and v is not None else value for v in x]
    )  # NaN != NaN
    y = np.array([v if v == v and v is not None else value for v in y])
    return x, y
def one_hot_encode(
    array: Union[List[Union[Number, str]], NDArray],
    classes: Optional[int] = None,
) -> NDArray:
    """
    One-hot encode a 1D array.
    Based on this StackOverflow answer: https://stackoverflow.com/a/29831596/5863503

    Parameters:
    -----------
    array : array-like
        An array to be one-hot encoded. Must contain only non-negative integers
    classes : int or None
        Number of classes (output columns). If None, the array's maximum
        value plus one will be used

    Returns:
    --------
    2D one-hot encoded array

    Example:
    --------
    >>> one_hot_encode([1,0,5])
    array([[0., 1., 0., 0., 0., 0.],
           [1., 0., 0., 0., 0., 0.],
           [0., 0., 0., 0., 0., 1.]])
    """
    arr: NDArray = convert(array, "array").astype(int)  # type: ignore
    if not len(arr.shape) == 1:
        raise ValueError(
            f"array must have only one dimension, but has shape: {arr.shape}"
        )
    if arr.min() < 0:
        raise ValueError("array cannot contain negative values")
    # Default to the smallest class count that can represent every value
    classes = classes if classes is not None else arr.max() + 1
    h = np.zeros((arr.size, classes))  # type: ignore
    # Set a single 1 per row, at the column given by the value itself
    h[np.arange(arr.size), arr] = 1
    return h


def split_hist(
    dataset: pd.DataFrame,
    values: str,
    split_by: str,
    title: Optional[str] = "",
    xlabel: Optional[str] = "",
    ylabel: Optional[str] = None,
    figsize: Optional[Tuple[int, int]] = None,
    legend: Optional[str] = "best",
    plot: bool = True,
    **hist_kwargs,
) -> plt.Axes:
    """
    Plot a histogram of values from a given dataset, split by the values of a chosen column

    Parameters:
    -----------
    dataset : Pandas DataFrame
    values : string
        The column name of the values to be displayed in the histogram
    split_by : string
        The column name of the values to split the histogram by
    title : string or None, default = ''
        The plot's title. If empty string, will be '{values} by {split_by}'
    xlabel: string or None, default = ''
        x-axis label. If empty string, will be '{values}'
    ylabel: string or None, default: None
        y-axis label
    figsize: (int,int) or None, default = None
        A Matplotlib figure-size tuple. If `None`, falls back to Matplotlib's
        default.
    legend: string or None, default = 'best'
        A Matplotlib legend location string. See Matplotlib documentation for possible options
    plot: Boolean, default = True
        Plot the histogram
    hist_kwargs: key-value pairs
        A key-value pairs to be passed to Matplotlib hist method. See Matplotlib documentation for possible options

    Returns:
    --------
    A Matplotlib `Axes`

    Example:
    --------
    See example under `dython.examples`
    """
    plt.figure(figsize=figsize)
    # Collect one sub-series of `values` per unique value of the split column
    split_vals = dataset[split_by].unique()
    data_split = list()
    for val in split_vals:
        data_split.append(dataset[dataset[split_by] == val][values])
    # Label each histogram series by its split value
    hist_kwargs["label"] = split_vals
    plt.hist(data_split, **hist_kwargs)
    if legend:
        plt.legend(loc=legend)
    # Empty-string defaults are sentinels meaning "auto-generate the label"
    if xlabel is not None:
        if xlabel == "":
            xlabel = values
        plt.xlabel(xlabel)
    if title is not None:
        if title == "":
            title = values + " by " + split_by
        plt.title(title)
    if ylabel:
        plt.ylabel(ylabel)
    ax = plt.gca()
    # Show the figure, or (in Jupyter) close it so it is not auto-rendered
    plot_or_not(plot)
    return ax
def identify_columns_by_type(
    dataset: TwoDimArray, include: List[str]
) -> List[Any]:
    """
    Find the names of all columns in `dataset` whose dtype is one of the
    requested types.

    Parameters:
    -----------
    dataset : NumPy ndarray / Pandas DataFrame
    include : list of strings
        Desired column types

    Returns:
    --------
    A list of columns names

    Example:
    --------
    >>> df = pd.DataFrame({'col1': ['a', 'b', 'c', 'a'], 'col2': [3, 4, 2, 1], 'col3': [1., 2., 3., 4.]})
    >>> identify_columns_by_type(df, include=['int64', 'float64'])
    ['col2', 'col3']

    """
    frame: pd.DataFrame = convert(dataset, "dataframe")  # type: ignore
    return frame.select_dtypes(include=include).columns.tolist()


def identify_columns_with_na(dataset: TwoDimArray) -> pd.DataFrame:
    """
    List the columns of `dataset` that contain NA values, together with
    their NA counts, ordered from most to fewest NAs.

    Parameters:
    -----------
    dataset : NumPy ndarray / Pandas DataFrame

    Returns:
    --------
    A DataFrame of two columns (['column', 'na_count']), consisting of only the names
    of columns with NA values, sorted by their number of NA values.

    Example:
    --------
    >>> df = pd.DataFrame({'col1': ['a', np.nan, 'a', 'a'], 'col2': [3, np.nan, 2, np.nan], 'col3': [1., 2., 3., 4.]})
    >>> identify_columns_with_na(df)
      column  na_count
    1   col2         2
    0   col1         1
    """
    frame: pd.DataFrame = convert(dataset, "dataframe")  # type: ignore
    # Count NAs column by column
    na_counts = []
    for col in frame.columns:
        na_counts.append(int(frame[col].isnull().sum()))
    summary = pd.DataFrame({"column": frame.columns, "na_count": na_counts})
    # Keep only columns that actually have missing values
    summary = summary[summary["na_count"] > 0]
    return summary.sort_values("na_count", ascending=False)
def pr_graph_example():
    """
    Plot an example PR graph of an SVM model predictions over the Iris
    dataset.
    """

    # Load data
    iris = datasets.load_iris()
    X = iris.data
    # Binarize the three-class target into three one-vs-rest columns
    y = label_binarize(iris.target, classes=[0, 1, 2])

    # Add noisy features
    random_state = np.random.RandomState(4)
    n_samples, n_features = X.shape
    X = np.c_[X, random_state.randn(n_samples, 200 * n_features)]

    # Train a model
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.5, random_state=0
    )
    classifier = OneVsRestClassifier(
        svm.SVC(kernel="linear", probability=True, random_state=0)
    )

    # Predict
    y_score = classifier.fit(X_train, y_train).predict_proba(X_test)

    # Plot PR graphs
    return metric_graph(
        y_test, y_score, "pr", class_names_list=iris.target_names
    )


def associations_iris_example():
    """
    Plot an example of an associations heat-map of the Iris dataset features.
    All features of this dataset are numerical (except for the target).
    """

    # Load data
    iris = datasets.load_iris()

    # Convert int classes to strings to allow associations method
    # to automatically recognize categorical columns
    target = ["C{}".format(i) for i in iris.target]

    # Prepare data
    X = pd.DataFrame(data=iris.data, columns=iris.feature_names)
    y = pd.DataFrame(data=target, columns=["target"])
    df = pd.concat([X, y], axis=1)

    # Plot features associations
    return associations(df)


def associations_mushrooms_example():
    """
    Plot an example of an associations heat-map of the UCI Mushrooms dataset features.
    All features of this dataset are categorical. This example will use Theil's U.
    """

    # Download and load data from UCI (requires network access)
    df = pd.read_csv(
        "http://archive.ics.uci.edu/ml/machine-learning-databases/mushroom/agaricus-lepiota.data"
    )
    # The raw CSV has no header row - assign the documented column names
    df.columns = [
        "class",
        "cap-shape",
        "cap-surface",
        "cap-color",
        "bruises",
        "odor",
        "gill-attachment",
        "gill-spacing",
        "gill-size",
        "gill-color",
        "stalk-shape",
        "stalk-root",
        "stalk-surface-above-ring",
        "stalk-surface-below-ring",
        "stalk-color-above-ring",
        "stalk-color-below-ring",
        "veil-type",
        "veil-color",
        "ring-number",
        "ring-type",
        "spore-print-color",
        "population",
        "habitat",
    ]

    # Plot features associations
    return associations(df, nom_nom_assoc="theil", figsize=(15, 15))


def split_hist_example():
    """
    Plot an example of split histogram.
    While this example presents a numerical column split by a categorical one, categorical columns can also be used
    as the values, as well as numerical columns as the split criteria.
    """

    # Load data and convert to DataFrame
    data = datasets.load_breast_cancer()
    df = pd.DataFrame(data=data.data, columns=data.feature_names)
    # Label is the negation of the 0/1 target
    df["malignant"] = [not bool(x) for x in data.target]

    # Plot histogram
    return split_hist(df, "mean radius", "malignant", bins=20, figsize=(15, 7))


def ks_abc_example():
    """
    An example of KS Area Between Curve of a simple binary classifier
    trained over the Breast Cancer dataset.
    """

    # Load and split data
    data = datasets.load_breast_cancer()
    X_train, X_test, y_train, y_test = train_test_split(
        data.data, data.target, test_size=0.5, random_state=0
    )

    # Train model and predict
    model = LogisticRegression(solver="liblinear")
    model.fit(X_train, y_train)
    y_pred = model.predict_proba(X_test)

    # Perform KS test and compute area between curves, using the second
    # predict_proba column (predicted probability of class 1)
    return ks_abc(y_test, y_pred[:, 1], figsize=(7, 7))
def _draw_estimated_optimal_threshold_mark(
    metric: str,
    x_axis: OneDimArray,
    y_axis: OneDimArray,
    thresholds: OneDimArray,
    color: str,
    ms: int,
    fmt: str,
    ax: plt.Axes,
) -> Tuple[Number, Number, Number]:
    """
    Mark the threshold whose curve point lies closest to the metric's
    optimal corner, annotate it with its value, and return the tuple
    (threshold, x, y) of the marked point.
    """
    xs = np.asarray(x_axis, dtype=float)
    ys = np.asarray(y_axis, dtype=float)
    if metric == "roc":
        # ROC optimum is the top-left corner (0, 1)
        sq_dist = xs**2 + (1.0 - ys) ** 2
    else:  # metric == 'pr', optimum is the top-right corner (1, 1)
        sq_dist = (1.0 - xs) ** 2 + (1.0 - ys) ** 2
    best = int(sq_dist.argmin())
    ax.plot(x_axis[best], y_axis[best], color=color, marker="o", ms=ms)
    # Nudge the annotation text so it does not cover the marker
    dx, dy = -0.027, 0.03
    ax.annotate(
        "{th:{fmt}}".format(th=thresholds[best], fmt=fmt),
        xy=(x_axis[best], y_axis[best]),
        color=color,
        xytext=(x_axis[best] + dx, y_axis[best] + dy),
    )
    return thresholds[best], x_axis[best], y_axis[best]


def _plot_macro_metric(
    x_axis: OneDimArray,
    y_axis: OneDimArray,
    n: int,
    lw: int,
    fmt: str,
    ax: plt.Axes,
) -> None:
    """
    Plot the macro-average ROC curve of `n` per-class curves: the mean of
    all curves interpolated onto their merged x-grid.
    """
    grid = np.unique(np.concatenate([x_axis[i] for i in range(n)]))
    mean_curve = (
        sum(np.interp(grid, x_axis[i], y_axis[i]) for i in range(n)) / n
    )
    macro_auc = auc(grid, mean_curve)
    ax.plot(
        grid,
        mean_curve,
        label="ROC curve: macro (AUC = {auc:{fmt}})".format(
            auc=macro_auc, fmt=fmt
        ),
        color="navy",
        ls=":",
        lw=lw,
    )
def _binary_metric_graph(
    metric: str,
    y_true: OneDimArray,
    y_pred: OneDimArray,
    eoptimal: bool,
    class_label: Optional[str],
    color: str,
    lw: int,
    ls: str,
    ms: int,
    fmt: str,
    ax: plt.Axes,
) -> Dict[str, Any]:
    """
    Compute and plot a single ROC or PR curve on `ax`.

    `y_true`/`y_pred` are either 1-D (binary labels and positive-class
    scores) or two-column 2-D (one-hot labels and per-class scores).
    Returns a dict with the curve points ('x', 'y'), 'thresholds', 'auc',
    the estimated-optimal-threshold point ('eopt', 'eopt_x', 'eopt_y';
    None when `eoptimal` is False) and 'y_t_ratio' (positives ratio,
    used as the PR naive baseline).
    """
    y_true_array: NDArray = convert(y_true, "array")  # type: ignore
    y_pred_array: NDArray = convert(y_pred, "array")  # type: ignore
    if y_pred_array.shape != y_true_array.shape:
        raise ValueError("y_true and y_pred must have the same shape")
    elif len(y_pred_array.shape) == 1:
        y_t = y_true_array
        y_p = y_pred_array
    else:
        # Two-column binary input: true class is the argmax of each row,
        # the score is the positive-class (second) column
        y_t = y_true_array.argmax(axis=1)
        y_p = y_pred_array[:, 1]
    y_t_ratio = np.sum(y_t) / y_t.size  # type: ignore
    if metric == "roc":
        x_axis, y_axis, th = roc_curve(y_t, y_p)  # x = fpr, y = tpr
    else:  # metric == 'pr'
        y_axis, x_axis, th = precision_recall_curve(
            y_t, y_p
        )  # x = recall, y = precision
    auc_score = auc(x_axis, y_axis)
    if class_label is not None:
        class_label = ": " + class_label
    else:
        class_label = ""
    # The label's closing parenthesis is appended below, once we know
    # whether the eOpT value is part of it
    label = "{metric} curve{class_label} (AUC = {auc:{fmt}}".format(
        metric=metric.upper(), class_label=class_label, auc=auc_score, fmt=fmt
    )
    if metric == "pr":
        label += ", naive = {ytr:{fmt}}".format(ytr=y_t_ratio, fmt=fmt)
    if eoptimal:
        eopt, eopt_x, eopt_y = _draw_estimated_optimal_threshold_mark(
            metric, x_axis, y_axis, th, color, ms, fmt, ax
        )
        label += ", eOpT = {th:{fmt}})".format(th=eopt, fmt=fmt)
    else:
        eopt = None
        eopt_x = None
        eopt_y = None
        label += ")"
    ax.plot(x_axis, y_axis, color=color, lw=lw, ls=ls, label=label)
    return {
        "x": x_axis,
        "y": y_axis,
        "thresholds": th,
        "auc": auc_score,
        "eopt": eopt,
        "eopt_x": eopt_x,
        "eopt_y": eopt_y,
        "y_t_ratio": y_t_ratio,
    }


def _build_metric_graph_output_dict(
    metric: str, d: Dict[str, Any]
) -> Dict[str, Dict[str, Any]]:
    """
    Pack a curve-dict from `_binary_metric_graph` into the public per-class
    output format: AUC with its naive baseline, plus the eOpT point.
    """
    # The naive AUC is 0.5 for ROC; for PR it is the positives ratio
    naive = d["y_t_ratio"] if metric == "pr" else 0.5
    return {
        "auc": {"val": d["auc"], "naive": naive},
        "eopt": {"val": d["eopt"], "x": d["eopt_x"], "y": d["eopt_y"]},
    }


def metric_graph(
    y_true: OneDimArray,
    y_pred: OneDimArray,
    metric: str,
    *,
    micro: bool = True,
    macro: bool = True,
    eopt: bool = True,
    class_names: Optional[Union[str, List[str]]] = None,
    colors: Optional[Union[str, List[str]]] = None,
    ax: Optional[plt.Axes] = None,
    figsize: Optional[Tuple[int, int]] = None,
    xlim: Tuple[float, float] = (0.0, 1.0),
    ylim: Tuple[float, float] = (0.0, 1.02),
    lw: int = 2,
    ls: str = "-",
    ms: int = 10,
    fmt: str = ".2f",
    legend: Optional[str] = "best",
    plot: bool = True,
    title: Optional[str] = None,
    filename: Optional[str] = None,
    force_multiclass: bool = False,
) -> Dict[str, Any]:
    """
    Plot a ROC graph of predictor's results (including AUC scores), where each
    row of y_true and y_pred represent a single example.
    If there are 1 or two columns only, the data is treated as a binary
    classification (see input example below).
    If there are more then 2 columns, each column is considered a
    unique class, and a ROC graph and AUC score will be computed for each.
    A Macro-ROC and Micro-ROC are computed and plotted too by default.

    Based on sklearn examples (as was seen on April 2018):
    http://scikit-learn.org/stable/auto_examples/model_selection/plot_roc.html

    Parameters:
    -----------
    y_true : list / NumPy ndarray
        The true classes of the predicted data
    y_pred : list / NumPy ndarray
        The predicted classes
    metric : string
        The metric graph to plot. Currently supported: 'roc' for Receiver Operating Characteristic curve and
        'pr' for Precision-Recall curve
    micro : Boolean, default = True
        Whether to calculate a Micro graph (not applicable for binary cases)
    macro : Boolean, default = True
        Whether to calculate a Macro graph (ROC metric only, not applicable for binary cases)
    eopt : Boolean, default = True
        Whether to calculate and display the estimated-optimal threshold
        for each metric graph. For ROC curves, the estimated-optimal threshold is the closest
        computed threshold with (fpr,tpr) values closest to (0,1). For PR curves, it is
        the closest one to (1,1) (perfect recall and precision)
    class_names: list or string, default = None
        Names of the different classes. In a multi-class classification, the
        order must match the order of the classes probabilities in the input
        data. In a binary classification, can be a string or a list. If a list,
        only the last element will be used.
    colors : list of Matplotlib color strings or None, default = None
        List of colors to be used for the plotted curves. If `None`, falls back
        to a predefined default.
    ax : matplotlib ax, default = None
        Matplotlib Axis on which the curves will be plotted
    figsize : (int,int) or None, default = None
        a Matplotlib figure-size tuple. If `None`, falls back to Matplotlib's
        default. Only used if `ax=None`.
    xlim : (float, float), default = (0.,1.)
        X-axis limits.
    ylim : (float,float), default = (0.,1.02)
        Y-axis limits.
    lw : int, default = 2
        Line-width.
    ls : string, default = '-'
        Matplotlib line-style string
    ms : int, default = 10,
        Marker-size.
    fmt : string, default = '.2f'
        String formatting of displayed AUC and threshold numbers.
    legend : string or None, default = 'best'
        Position graph legend.
    plot : Boolean, default = True
        Display graph
    title : string or None, default = None
        Plotted graph title. If None, default title is used
    filename : string or None, default = None
        If not None, plot will be saved to the given file name
    force_multiclass : Boolean, default = False
        Only applicable if `y_true` and `y_pred` have two columns. If so,
        consider the data as a multiclass data rather than binary (useful when plotting
        curves of different models one against the other)

    Returns:
    --------
    A dictionary, one key for each class. Each value is another dictionary,
    holding AUC and eOpT values.

    Binary Classification Input Example:
    ------------------------------------
    Consider a data-set of two data-points where the true class of the first line
    is class 0, which was predicted with a probability of 0.6, and the second line's
    true class is 1, with predicted probability of 0.8.
    ```python
    # First option:
    >>> metric_graph(y_true=[0,1], y_pred=[0.6,0.8], metric='roc')
    {...}

    # Second option:
    >>> metric_graph(y_true=[[1,0],[0,1]], y_pred=[[0.6,0.4],[0.2,0.8]], metric='roc')
    {...}

    # Both yield the same result
    ```

    Example:
    --------
    See `roc_graph_example` and `pr_graph_example` under `dython.examples`
    """
    if metric is None or metric.lower() not in ["roc", "pr"]:
        raise ValueError(f"Invalid metric {metric}")
    else:
        metric = metric.lower()

    all_x_axis = list()
    all_y_axis = list()
    y_true_array: NDArray = convert(y_true, "array")  # type: ignore
    y_pred_array: NDArray = convert(y_pred, "array")  # type: ignore

    if y_pred_array.shape != y_true_array.shape:
        raise ValueError("y_true and y_pred must have the same shape")

    class_names_list: Optional[List[str]]
    if class_names is not None:
        if not isinstance(class_names, str):
            # Bug fix: this previously read `convert(class_names_list, ...)`,
            # referencing the annotated-but-unbound local and raising a
            # NameError whenever a list of class names was supplied. The
            # input argument `class_names` is the value to convert.
            class_names_list = convert(class_names, "list")  # type: ignore
        else:
            class_names_list = [class_names]
    else:
        class_names_list = None

    if ax is None:
        plt.figure(figsize=figsize)
        axis = plt.gca()
    else:
        axis = ax

    colors_list: List[str]
    if isinstance(colors, str):
        colors_list = [colors]
    else:
        colors_list = colors or _ROC_PLOT_COLORS

    output_dict = dict()
    pr_naives = list()
    if (
        len(y_pred_array.shape) == 1
        or y_pred_array.shape[1] == 1
        or (y_pred_array.shape[1] == 2 and not force_multiclass)
    ):
        # Binary case: a single curve. Only the last class name/color is used
        class_label = (
            class_names_list[-1] if class_names_list is not None else None
        )
        color = colors_list[-1]
        d = _binary_metric_graph(
            metric,
            y_true_array,
            y_pred_array,
            eoptimal=eopt,
            class_label=class_label,
            color=color,
            lw=lw,
            ls=ls,
            ms=ms,
            fmt=fmt,
            ax=axis,
        )
        class_label = class_label or "0"
        output_dict[class_label] = _build_metric_graph_output_dict(metric, d)
        pr_naives.append((0, 1, d["y_t_ratio"], d["y_t_ratio"], color))
    else:
        # Multi-class case: one curve per column, plus optional micro/macro
        n = y_pred_array.shape[1]
        if class_names_list is not None:
            if not isinstance(class_names_list, list):
                raise ValueError(
                    "class_names must be a list of items in multi-class classification."
                )
            if len(class_names_list) != n:
                raise ValueError(
                    "Number of class names does not match input data size."
                )
        for i in range(n):
            class_label = (
                class_names_list[i] if class_names_list is not None else str(i)
            )
            color = colors_list[i % len(colors_list)]
            d = _binary_metric_graph(
                metric,
                y_true_array[:, i],
                y_pred_array[:, i],
                eoptimal=eopt,
                color=color,
                class_label=class_label,
                lw=lw,
                ls=ls,
                ms=ms,
                fmt=fmt,
                ax=axis,
            )
            all_x_axis.append(d["x"])
            all_y_axis.append(d["y"])
            output_dict[class_label] = _build_metric_graph_output_dict(
                metric, d
            )
            pr_naives.append((0, 1, d["y_t_ratio"], d["y_t_ratio"], color))
        if micro:
            # Micro-average: pool all classes into one flattened curve
            _binary_metric_graph(
                metric,
                y_true_array.ravel(),
                y_pred_array.ravel(),
                eoptimal=False,
                ls=":",
                color="deeppink",
                class_label="micro",
                lw=lw,
                ms=ms,
                fmt=fmt,
                ax=axis,
            )
        if macro and metric == "roc":
            _plot_macro_metric(all_x_axis, all_y_axis, n, lw, fmt, axis)

    # Naive baselines: the diagonal for ROC, per-class positives-ratio for PR
    if metric == "roc":
        naives: List[Tuple[Number, Number, Number, Number, str]] = [
            (0, 1, 0, 1, "grey")
        ]
    elif metric == "pr":
        naives = pr_naives
    else:
        raise ValueError(f"Invalid metric {metric}")
    axis = _display_metric_plot(
        axis,
        metric,
        naives,
        xlim=xlim,
        ylim=ylim,
        legend=legend,
        title=title,
        filename=filename,
        plot=plot,
    )
    output_dict["ax"] = axis
    return output_dict


def random_forest_feature_importance(
    forest: RandomForestClassifier, features: List[str], precision: int = 4
) -> Iterable[Tuple[float, str]]:
    """
    Given a trained `sklearn.ensemble.RandomForestClassifier`, plot the
    different features based on their importance according to the classifier,
    from the most important to the least.

    Parameters:
    -----------
    forest : sklearn.ensemble.RandomForestClassifier
        A trained `RandomForestClassifier`
    features : list
        A list of the names of the features the classifier was trained on,
        ordered by the same order the appeared
        in the training data
    precision : int, default = 4
        Precision of feature importance

    Returns:
    --------
    A list of (rounded_importance, feature_name) tuples, sorted from the
    most important feature to the least.
    """
    return sorted(
        (
            (round(importance, precision), feature)
            for importance, feature in zip(
                forest.feature_importances_, features
            )
        ),
        reverse=True,
    )


def ks_abc(
    y_true: OneDimArray,
    y_pred: OneDimArray,
    *,
    ax: Optional[plt.Axes] = None,
    figsize: Optional[Tuple[int, int]] = None,
    colors: Tuple[str, str] = ("darkorange", "b"),
    title: Optional[str] = None,
    xlim: Tuple[float, float] = (0.0, 1.0),
    ylim: Tuple[float, float] = (0.0, 1.0),
    fmt: str = ".2f",
    lw: int = 2,
    legend: Optional[str] = "best",
    plot: bool = True,
    filename: Optional[str] = None,
) -> Dict[str, Any]:
    """
    Perform the Kolmogorov–Smirnov test over the positive and negative distributions of a binary classifier, and compute
    the area between curves.
    The KS test plots the fraction of positives and negatives predicted correctly below each threshold. It then finds
    the optimal threshold, being the one enabling the best class separation.
    The area between curves allows a better insight into separation. The higher the area is (1 being the maximum), the
    more the positive and negative distributions' center-of-mass are closer to 1 and 0, respectively.

    Based on scikit-plot's `plot_ks_statistic` method.

    Parameters:
    -----------
    y_true : array-like
        The true labels of the dataset
    y_pred : array-like
        The probabilities predicted by a binary classifier
    ax : matplotlib ax, default = None
        Matplotlib Axis on which the curves will be plotted
    figsize : (int,int) or None, default = None
        a Matplotlib figure-size tuple. If `None`, falls back to Matplotlib's
        default. Only used if `ax=None`.
    colors : a tuple of Matplotlib color strings, default = ('darkorange', 'b')
        Colors to be used for the plotted curves.
    title : string or None, default = None
        Plotted graph title. If None, default title is used
    xlim : (float, float), default = (0.,1.)
        X-axis limits.
    ylim : (float,float), default = (0.,1.)
        Y-axis limits.
    fmt : string, default = '.2f'
        String formatting of displayed numbers.
    lw : int, default = 2
        Line-width.
    legend : string or None, default = 'best'
        Position graph legend.
    plot : Boolean, default = True
        Display graph
    filename : string or None, default = None
        If not None, plot will be saved to the given file name

    Returns:
    --------
    A dictionary of the following keys:
    'abc': area between curves,
    'ks_stat': computed statistic of the KS test,
    'eopt': estimated optimal threshold,
    'ax': the ax used to plot the curves
    """
    y_true_arr: NDArray = convert(y_true, "array")  # type: ignore
    y_pred_arr: NDArray = convert(y_pred, "array")  # type: ignore
    if y_pred_arr.shape != y_true_arr.shape:
        raise ValueError("y_true and y_pred must have the same shape")
    elif len(y_pred_arr.shape) == 1 or y_pred_arr.shape[1] == 1:
        y_t = y_true_arr
        y_p = y_pred_arr
    elif y_pred_arr.shape[1] == 2:
        # Two-column binary input: true class from argmax, score from the
        # positive-class column
        y_t = y_true_arr.argmax(axis=1)
        y_p = y_pred_arr[:, 1]
    else:
        raise ValueError(
            "y_true and y_pred must originate from a binary classifier, but have {} columns".format(
                y_pred_arr.shape[1]
            )
        )

    thresholds, nr, pr, ks_statistic, max_distance_at, _ = _binary_ks_curve(
        y_t, y_p  # type: ignore
    )
    if ax is None:
        plt.figure(figsize=figsize)
        axis = plt.gca()
    else:
        axis = ax

    axis.plot(thresholds, pr, lw=lw, color=colors[0], label="Positive Class")
    axis.plot(thresholds, nr, lw=lw, color=colors[1], label="Negative Class")
    idx = np.where(thresholds == max_distance_at)[0][0]
    axis.axvline(
        max_distance_at,
        *sorted([nr[idx], pr[idx]]),
        label="KS Statistic: {ks:{fmt}} at {d:{fmt}}".format(
            ks=ks_statistic, d=max_distance_at, fmt=fmt
        ),
        linestyle=":",
        lw=lw,
        color="grey",
    )

    # Area between the curves, integrated piecewise over the thresholds;
    # the appended 1.001 sentinel closes the last segment
    thresholds = np.append(thresholds, 1.001)
    abc = float(np.sum((nr - pr) * np.diff(thresholds)))

    axis.set_xlim(left=xlim[0], right=xlim[1])
    axis.set_ylim(bottom=ylim[0], top=ylim[1])
    axis.set_xlabel("Threshold")
    axis.set_ylabel("Fraction below threshold")
    axis.set_title(
        "{t} [ABC = {a:{fmt}}]".format(
            t=title or "KS Statistic Plot", a=abc, fmt=fmt
        )
    )
    if legend:
        axis.legend(loc=legend)
    if filename:
        plt.savefig(filename)
    plot_or_not(plot)
    return {
        "abc": abc,
        "ks_stat": ks_statistic,
        "eopt": max_distance_at,
        "ax": axis,
    }
def _binary_ks_curve(
    y_true: OneDimArray, y_probas: OneDimArray
) -> Tuple[NDArray, NDArray, NDArray, Number, Number, NDArray]:
    """Copied from scikit-plot: https://github.com/reiinakano/scikit-plot/blob/master/scikitplot/helpers.py

    Generate the points needed to plot the KS-Statistic curve.

    Args:
        y_true (array-like, shape (n_samples)): True labels of the data.

        y_probas (array-like, shape (n_samples)): Probability predictions of
            the positive class.

    Returns:
        A tuple of (thresholds, pct1, pct2, ks_statistic, max_distance_at,
        classes): the X-axis values, the two cumulative-fraction curves,
        the maximum vertical distance between them, the X value where that
        maximum occurs, and the two class labels found in `y_true`.

    Raises:
        ValueError: If `y_true` is not composed of 2 classes. The KS Statistic
            is only relevant in binary classification.
    """
    y_true, y_probas = np.asarray(y_true), np.asarray(y_probas)
    encoder = LabelEncoder()
    encoded_labels = encoder.fit_transform(y_true)
    if len(encoder.classes_) != 2:
        raise ValueError(
            "Cannot calculate KS statistic for data with "
            "{} category/ies".format(len(encoder.classes_))
        )
    neg_mask = encoded_labels == 0
    neg_scores = np.sort(y_probas[neg_mask])
    pos_scores = np.sort(y_probas[np.logical_not(neg_mask)])

    i_neg, i_pos = 0, 0
    thresholds, pct1, pct2 = [], [], []
    # Sweep the merged, sorted score values; after consuming a value v the
    # two counters hold the number of scores <= v in each class.
    while i_neg < len(neg_scores) or i_pos < len(pos_scores):
        if i_neg >= len(neg_scores):
            current = pos_scores[i_pos]
        elif i_pos >= len(pos_scores):
            current = neg_scores[i_neg]
        else:
            current = min(neg_scores[i_neg], pos_scores[i_pos])

        while i_neg < len(neg_scores) and neg_scores[i_neg] == current:
            i_neg += 1
        while i_pos < len(pos_scores) and pos_scores[i_pos] == current:
            i_pos += 1

        thresholds.append(current)
        pct1.append(i_neg)
        pct2.append(i_pos)

    thresholds = np.asarray(thresholds)
    pct1 = np.asarray(pct1) / float(len(neg_scores))
    pct2 = np.asarray(pct2) / float(len(pos_scores))

    # Anchor both curves at (0, 0) and (1, 1) if the data does not reach them
    if thresholds[0] != 0:
        thresholds = np.insert(thresholds, 0, [0.0])  # type: ignore
        pct1 = np.insert(pct1, 0, [0.0])  # type: ignore
        pct2 = np.insert(pct2, 0, [0.0])  # type: ignore
    if thresholds[-1] != 1:
        thresholds = np.append(thresholds, [1.0])  # type: ignore
        pct1 = np.append(pct1, [1.0])  # type: ignore
        pct2 = np.append(pct2, [1.0])  # type: ignore

    differences = pct1 - pct2
    ks_statistic = np.max(differences)
    max_distance_at = thresholds[np.argmax(differences)]

    return thresholds, pct1, pct2, ks_statistic, max_distance_at, encoder.classes_  # type: ignore


# ---------------------------------------------------------------------------
# dython/nominal.py
# ---------------------------------------------------------------------------

import concurrent.futures as cf
import math
import warnings
from collections import Counter
from itertools import repeat
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scipy.cluster.hierarchy as sch
import scipy.stats as ss
import seaborn as sns
from psutil import cpu_count
from typing import (
    Union,
    Any,
    List,
    Optional,
    Callable,
    Tuple,
    Dict,
    Iterable,
    Set,
    Literal,
)
from numpy.typing import NDArray, ArrayLike
from matplotlib.colors import Colormap
from ._private import (
    convert,
    remove_incomplete_samples,
    replace_nan_with_value,
    plot_or_not,
)
from .data_utils import identify_columns_by_type
from .typing import Number, OneDimArray, TwoDimArray


__all__ = [
    "associations",
    "cluster_correlations",
    "conditional_entropy",
    "correlation_ratio",
    "cramers_v",
    "identify_nominal_columns",
    "identify_numeric_columns",
    "numerical_encoding",
    "replot_last_associations",
    "theils_u",
]

# Strategy names for missing-value handling
_REPLACE = "replace"
_DROP = "drop"
_DROP_SAMPLES = "drop_samples"
_DROP_FEATURES = "drop_features"
_DROP_SAMPLE_PAIRS = "drop_sample_pairs"
_SKIP = "skip"
_DEFAULT_REPLACE_VALUE = 0.0
# Tolerance for rounding float-precision overshoots of [0, 1] statistics
_PRECISION = 1e-13

# Parameters of the last associations plot, kept for replotting
_ASSOC_PLOT_PARAMS: Dict[str, Any] = dict()

_NO_OP = "no-op"
_SINGLE_VALUE_COLUMN_OP = "single-value-column-op" 63 | _I_EQ_J_OP = "i-equal-j-op" 64 | _ASSOC_OP = "assoc-op" 65 | 66 | NomNumAssocStr = Literal["correlation_ratio"] 67 | NumNumAssocStr = Literal["pearson", "spearman", "kendall"] 68 | NomNomAssocStr = Literal["cramer", "theil"] 69 | 70 | 71 | def _inf_nan_str(x: Union[int, float]) -> str: 72 | if np.isnan(x): 73 | return "NaN" 74 | elif abs(x) == np.inf: 75 | return "inf" 76 | else: 77 | return "" 78 | 79 | 80 | def conditional_entropy( 81 | x: Union[OneDimArray, List[str]], 82 | y: Union[OneDimArray, List[str]], 83 | nan_strategy: str = _REPLACE, 84 | nan_replace_value: Any = _DEFAULT_REPLACE_VALUE, 85 | log_base: Number = math.e, 86 | ) -> float: 87 | """ 88 | Calculates the conditional entropy of x given y: S(x|y) 89 | 90 | Wikipedia: https://en.wikipedia.org/wiki/Conditional_entropy 91 | 92 | Parameters: 93 | ----------- 94 | x : list / NumPy ndarray / Pandas Series 95 | A sequence of measurements 96 | y : list / NumPy ndarray / Pandas Series 97 | A sequence of measurements 98 | nan_strategy : string, default = 'replace' 99 | How to handle missing values: can be either 'drop' to remove samples 100 | with missing values, or 'replace' to replace all missing values with 101 | the nan_replace_value. Missing values are None and np.nan. 102 | nan_replace_value : any, default = 0.0 103 | The value used to replace missing values with. Only applicable when 104 | nan_strategy is set to 'replace'. 105 | log_base: float, default = e 106 | specifying base for calculating entropy. Default is base e. 
# nan-strategy and precision constants used below (re-stated with the same
# values as the module-level definitions above; re-binding is harmless and
# keeps this section self-contained)
_REPLACE = "replace"
_DROP = "drop"
_DEFAULT_REPLACE_VALUE = 0.0
_PRECISION = 1e-13


def cramers_v(
    x: Union[OneDimArray, List[str]],
    y: Union[OneDimArray, List[str]],
    bias_correction: bool = True,
    nan_strategy: str = _REPLACE,
    nan_replace_value: Any = _DEFAULT_REPLACE_VALUE,
) -> float:
    """
    Calculates Cramer's V statistic for categorical-categorical association.
    This is a symmetric coefficient: V(x,y) = V(y,x)

    Original function taken from: https://stackoverflow.com/a/46498792/5863503
    Wikipedia: https://en.wikipedia.org/wiki/Cram%C3%A9r%27s_V

    Parameters:
    -----------
    x : list / NumPy ndarray / Pandas Series
        A sequence of categorical measurements
    y : list / NumPy ndarray / Pandas Series
        A sequence of categorical measurements
    bias_correction : Boolean, default = True
        Use bias correction from Bergsma and Wicher,
        Journal of the Korean Statistical Society 42 (2013): 323-328.
    nan_strategy : string, default = 'replace'
        How to handle missing values: can be either 'drop' to remove samples
        with missing values, or 'replace' to replace all missing values with
        the nan_replace_value. Missing values are None and np.nan.
    nan_replace_value : any, default = 0.0
        The value used to replace missing values with. Only applicable when
        nan_strategy is set to 'replace'.

    Returns:
    --------
    float in the range of [0,1]
    """
    if nan_strategy == _REPLACE:
        x, y = replace_nan_with_value(x, y, nan_replace_value)
    elif nan_strategy == _DROP:
        x, y = remove_incomplete_samples(x, y)
    contingency = pd.crosstab(x, y)
    chi2 = ss.chi2_contingency(contingency)[0]
    n = contingency.sum().sum()
    phi2 = chi2 / n
    n_rows, n_cols = contingency.shape
    if not bias_correction:
        v = np.sqrt(phi2 / min(n_cols - 1, n_rows - 1))
    else:
        # Bergsma-Wicher bias correction of phi^2 and the table dimensions
        phi2_corrected = max(
            0, phi2 - ((n_cols - 1) * (n_rows - 1)) / (n - 1)
        )
        rows_corrected = n_rows - ((n_rows - 1) ** 2) / (n - 1)
        cols_corrected = n_cols - ((n_cols - 1) ** 2) / (n - 1)
        denominator = min((cols_corrected - 1), (rows_corrected - 1))
        if denominator == 0:
            warnings.warn(
                "Unable to calculate Cramer's V using bias correction. Consider using bias_correction=False (or cramers_v_bias_correction=False if calling from associations)",
                RuntimeWarning,
            )
            return np.nan
        v = np.sqrt(phi2_corrected / denominator)
    # Clamp tiny float-precision overshoots of the valid [0, 1] range
    if -_PRECISION <= v < 0.0 or 1.0 < v <= 1.0 + _PRECISION:
        rounded_v = 0.0 if v < 0 else 1.0
        warnings.warn(
            f"Rounded V = {v} to {rounded_v}. This is probably due to floating point precision issues.",
            RuntimeWarning,
        )
        return rounded_v
    return v
def theils_u(
    x: Union[OneDimArray, List[str]],
    y: Union[OneDimArray, List[str]],
    nan_strategy: str = _REPLACE,
    nan_replace_value: Any = _DEFAULT_REPLACE_VALUE,
) -> float:
    """
    Calculates Theil's U statistic (Uncertainty coefficient) for categorical-
    categorical association. This is the uncertainty of x given y: value is
    on the range of [0,1] - where 0 means y provides no information about
    x, and 1 means y provides full information about x.

    This is an asymmetric coefficient: U(x,y) != U(y,x)

    Wikipedia: https://en.wikipedia.org/wiki/Uncertainty_coefficient

    Parameters:
    -----------
    x : list / NumPy ndarray / Pandas Series
        A sequence of categorical measurements
    y : list / NumPy ndarray / Pandas Series
        A sequence of categorical measurements
    nan_strategy : string, default = 'replace'
        How to handle missing values: can be either 'drop' to remove samples
        with missing values, or 'replace' to replace all missing values with
        the nan_replace_value. Missing values are None and np.nan.
    nan_replace_value : any, default = 0.0
        The value used to replace missing values with. Only applicable when
        nan_strategy is set to 'replace'.

    Returns:
    --------
    float in the range of [0,1]
    """
    if nan_strategy == _REPLACE:
        x, y = replace_nan_with_value(x, y, nan_replace_value)
    elif nan_strategy == _DROP:
        x, y = remove_incomplete_samples(x, y)
    s_xy = conditional_entropy(x, y)
    x_counts = Counter(x)
    total = sum(x_counts.values())
    # Marginal entropy of x (natural log, matching conditional_entropy)
    s_x = ss.entropy([count / total for count in x_counts.values()])
    if s_x == 0:
        # x carries no information at all; y trivially determines it
        return 1.0
    u = (s_x - s_xy) / s_x  # type: ignore
    # Clamp tiny float-precision overshoots of the valid [0, 1] range
    if -_PRECISION <= u < 0.0 or 1.0 < u <= 1.0 + _PRECISION:
        rounded_u = 0.0 if u < 0 else 1.0
        warnings.warn(
            f"Rounded U = {u} to {rounded_u}. This is probably due to floating point precision issues.",
            RuntimeWarning,
        )
        return rounded_u
    return u
This is probably due to floating point precision issues.", 247 | RuntimeWarning, 248 | ) 249 | return rounded_u 250 | else: 251 | return u 252 | 253 | 254 | def correlation_ratio( 255 | categories: Union[OneDimArray, List[str]], 256 | measurements: OneDimArray, 257 | nan_strategy: str = _REPLACE, 258 | nan_replace_value: Any = _DEFAULT_REPLACE_VALUE, 259 | ) -> float: 260 | """ 261 | Calculates the Correlation Ratio (sometimes marked by the greek letter Eta) 262 | for categorical-continuous association. 263 | 264 | Answers the question - given a continuous value of a measurement, is it 265 | possible to know which category is it associated with? 266 | 267 | Value is in the range [0,1], where 0 means a category cannot be determined 268 | by a continuous measurement, and 1 means a category can be determined with 269 | absolute certainty. 270 | 271 | Wikipedia: https://en.wikipedia.org/wiki/Correlation_ratio 272 | 273 | Parameters: 274 | ----------- 275 | categories : list / NumPy ndarray / Pandas Series 276 | A sequence of categorical measurements 277 | measurements : list / NumPy ndarray / Pandas Series 278 | A sequence of continuous measurements 279 | nan_strategy : string, default = 'replace' 280 | How to handle missing values: can be either 'drop' to remove samples 281 | with missing values, or 'replace' to replace all missing values with 282 | the nan_replace_value. Missing values are None and np.nan. 283 | nan_replace_value : any, default = 0.0 284 | The value used to replace missing values with. Only applicable when 285 | nan_strategy is set to 'replace'. 
286 | 287 | Returns: 288 | -------- 289 | float in the range of [0,1] 290 | """ 291 | if nan_strategy == _REPLACE: 292 | categories, measurements = replace_nan_with_value( 293 | categories, measurements, nan_replace_value 294 | ) 295 | elif nan_strategy == _DROP: 296 | categories, measurements = remove_incomplete_samples( 297 | categories, measurements 298 | ) 299 | categories_array: NDArray = convert(categories, "array") # type: ignore 300 | measurements_array: NDArray = convert(measurements, "array") # type: ignore 301 | fcat, _ = pd.factorize(categories_array) # type: ignore 302 | cat_num = np.max(fcat) + 1 303 | y_avg_array = np.zeros(cat_num) 304 | n_array = np.zeros(cat_num) 305 | for i in range(0, cat_num): 306 | cat_measures = measurements_array[np.argwhere(fcat == i).flatten()] 307 | n_array[i] = len(cat_measures) 308 | y_avg_array[i] = np.average(cat_measures) 309 | y_total_avg = np.sum(np.multiply(y_avg_array, n_array)) / np.sum(n_array) 310 | numerator = np.sum( 311 | np.multiply(n_array, np.power(np.subtract(y_avg_array, y_total_avg), 2)) 312 | ) 313 | denominator = np.sum( 314 | np.power(np.subtract(measurements_array, y_total_avg), 2) 315 | ) 316 | if numerator == 0: 317 | return 0.0 318 | else: 319 | eta = np.sqrt(numerator / denominator) 320 | if 1.0 < eta <= 1.0 + _PRECISION: 321 | warnings.warn( 322 | f"Rounded eta = {eta} to 1. This is probably due to floating point precision issues.", 323 | RuntimeWarning, 324 | ) 325 | return 1.0 326 | else: 327 | return eta 328 | 329 | 330 | def identify_nominal_columns(dataset: TwoDimArray) -> List[Any]: 331 | """ 332 | Given a dataset, identify categorical columns. 

    Parameters:
    -----------
    dataset : NumPy ndarray / Pandas DataFrame

    Returns:
    --------
    A list of categorical columns names

    Example:
    --------
    >>> df = pd.DataFrame({'col1': ['a', 'b', 'c', 'a'], 'col2': [3, 4, 2, 1]})
    >>> identify_nominal_columns(df)
    ['col1']

    """
    return identify_columns_by_type(dataset, include=["object", "category"])


def identify_numeric_columns(dataset: TwoDimArray) -> List[Any]:
    """
    Given a dataset, identify numeric columns.

    Parameters:
    -----------
    dataset : NumPy ndarray / Pandas DataFrame

    Returns:
    --------
    A list of numerical columns names

    Example:
    --------
    >>> df = pd.DataFrame({'col1': ['a', 'b', 'c', 'a'], 'col2': [3, 4, 2, 1], 'col3': [1., 2., 3., 4.]})
    >>> identify_numeric_columns(df)
    ['col2', 'col3']

    """
    return identify_columns_by_type(dataset, include=["int64", "float64"])


def associations(
    dataset: TwoDimArray,
    nominal_columns: Optional[Union[OneDimArray, List[str], str]] = "auto",
    *,
    numerical_columns: Optional[Union[OneDimArray, List[str], str]] = None,
    mark_columns: bool = False,
    nom_nom_assoc: Union[
        NomNomAssocStr, Callable[[pd.Series, pd.Series], Number]
    ] = "cramer",
    num_num_assoc: Union[
        NumNumAssocStr, Callable[[pd.Series, pd.Series], Number]
    ] = "pearson",
    nom_num_assoc: Union[
        NomNumAssocStr, Callable[[pd.Series, pd.Series], Number]
    ] = "correlation_ratio",
    symmetric_nom_nom: bool = True,
    symmetric_num_num: bool = True,
    display_rows: Union[str, List[str]] = "all",
    display_columns: Union[str, List[str]] = "all",
    hide_rows: Optional[Union[str, List[str]]] = None,
    hide_columns: Optional[Union[str, List[str]]] = None,
    cramers_v_bias_correction: bool = True,
    nan_strategy: str = _REPLACE,
    nan_replace_value: Any = _DEFAULT_REPLACE_VALUE,
    ax: Optional[plt.Axes] = None,
    figsize: Optional[Tuple[float, float]] = None,
    annot: bool = True,
    fmt: str = ".2f",
    cmap: Optional[Colormap] = None,
    sv_color: str = "silver",
    cbar: bool = True,
    vmax: float = 1.0,
    vmin: Optional[float] = None,
    plot: bool = True,
    compute_only: bool = False,
    clustering: bool = False,
    title: Optional[str] = None,
    filename: Optional[str] = None,
    multiprocessing: bool = False,
    max_cpu_cores: Optional[int] = None,
) -> Dict[str, Union[pd.DataFrame, plt.Axes]]:
    """
    Calculate the correlation/strength-of-association of features in data-set
    with both categorical and continuous features using:
    * Pearson's R for continuous-continuous cases
    * Correlation Ratio for categorical-continuous cases
    * Cramer's V or Theil's U for categorical-categorical cases

    Parameters:
    -----------
    dataset : NumPy ndarray / Pandas DataFrame
        The data-set for which the features' correlation is computed
    nominal_columns : string / list / NumPy ndarray, default = 'auto'
        Names of columns of the data-set which hold categorical values. Can
        also be the string 'all' to state that all columns are categorical,
        'auto' (default) to try to identify nominal columns, or None to state
        none are categorical. Only used if `numerical_columns` is `None`.
    numerical_columns : string / list / NumPy ndarray, default = None
        To be used instead of `nominal_columns`. Names of columns of the data-set
        which hold numerical values. Can also be the string 'all' to state that
        all columns are numerical (equivalent to `nominal_columns=None`) or
        'auto' to try to identify numerical columns (equivalent to
        `nominal_columns=auto`). If `None`, `nominal_columns` is used.
    mark_columns : Boolean, default = False
        if True, output's columns' names will have a suffix of '(nom)' or
        '(con)' based on their type (nominal or continuous), as provided
        by nominal_columns
    nom_nom_assoc : callable / string, default = 'cramer'
        If callable, a function which receives two `pd.Series` and returns a single number.
        If string, name of nominal-nominal (categorical-categorical) association to use.
        Options are 'cramer' for Cramer's V or `theil` for Theil's U. If 'theil',
        heat-map columns are the provided information (U = U(row|col)).
    num_num_assoc : callable / string, default = 'pearson'
        If callable, a function which receives two `pd.Series` and returns a single number.
        If string, name of numerical-numerical association to use. Options are 'pearson'
        for Pearson's R, 'spearman' for Spearman's R, 'kendall' for Kendall's Tau.
    nom_num_assoc : callable / string, default = 'correlation_ratio'
        If callable, a function which receives two `pd.Series` and returns a single number.
        If string, name of nominal-numerical association to use. Options are 'correlation_ratio'
        for correlation ratio.
    symmetric_nom_nom : Boolean, default = True
        Relevant only if `nom_nom_assoc` is a callable. Declare whether the function is symmetric (f(x,y) = f(y,x)).
        If False, heat-map values should be interpreted as f(row,col)
    symmetric_num_num : Boolean, default = True
        Relevant only if `num_num_assoc` is a callable. Declare whether the function is symmetric (f(x,y) = f(y,x)).
        If False, heat-map values should be interpreted as f(row,col)
    display_rows : list / string, default = 'all'
        Choose which of the dataset's features will be displayed in the output's
        correlations table rows. If string, can either be a single feature's name or 'all'.
        Only used if `hide_rows` is `None`.
    display_columns : list / string, default = 'all'
        Choose which of the dataset's features will be displayed in the output's
        correlations table columns. If string, can either be a single feature's name or 'all'.
        Only used if `hide_columns` is `None`.
    hide_rows : list / string, default = None
        Choose which of the dataset's features will not be displayed in the output's
        correlations table rows. If string, must be a single feature's name. If `None`,
        `display_rows` is used.
    hide_columns : list / string, default = None
        Choose which of the dataset's features will not be displayed in the output's
        correlations table columns. If string, must be a single feature's name. If `None`,
        `display_columns` is used.
    cramers_v_bias_correction : Boolean, default = True
        Use bias correction for Cramer's V from Bergsma and Wicher,
        Journal of the Korean Statistical Society 42 (2013): 323-328.
    nan_strategy : string, default = 'replace'
        How to handle missing values: can be either 'drop_samples' to remove
        samples with missing values, 'drop_features' to remove features
        (columns) with missing values, 'replace' to replace all missing
        values with the nan_replace_value, or 'drop_sample_pairs' to drop each
        pair of missing observables separately before calculating the corresponding coefficient.
        Missing values are None and np.nan.
    nan_replace_value : any, default = 0.0
        The value used to replace missing values with. Only applicable when
        nan_strategy is set to 'replace'
    ax : matplotlib ax, default = None
        Matplotlib Axis on which the heat-map will be plotted
    figsize : (float, float) or None, default = None
        A Matplotlib figure-size tuple. If `None`, will attempt to set the size automatically.
        Only used if `ax=None`.
    annot : Boolean, default = True
        Plot number annotations on the heat-map
    fmt : string, default = '.2f'
        String formatting of annotations
    cmap : Matplotlib colormap or None, default = None
        A colormap to be used for the heat-map. If None, falls back to Seaborn's
        heat-map default
    sv_color : string, default = 'silver'
        A Matplotlib color. The color to be used when displaying single-value
        features over the heat-map
    cbar: Boolean, default = True
        Display heat-map's color-bar
    vmax: float, default = 1.0
        Set heat-map vmax option
    vmin: float or None, default = None
        Set heat-map vmin option. If set to None, vmin will be chosen automatically
        between 0 and -1, depending on the types of associations used (-1 if Pearson's R
        is used, 0 otherwise)
    plot : Boolean, default = True
        Plot a heat-map of the correlation matrix. If False, plotting still
        happen, but the heat-map will not be displayed.
    compute_only : Boolean, default = False
        Use this flag only if you have no need of the plotting at all. This skips the entire
        plotting mechanism.
    clustering : Boolean, default = False
        If True, hierarchical clustering is applied in order to sort
        features into meaningful groups
    title : string or None, default = None
        Plotted graph title
    filename : string or None, default = None
        If not None, plot will be saved to the given file name
    multiprocessing: Boolean, default = False
        If True, use `multiprocessing` to speed up computations. If None, falls back to single core computation
    max_cpu_cores: int or None, default = None
        If not None, ProcessPoolExecutor will use the given number of CPU cores

    Returns:
    --------
    A dictionary with the following keys:
    - `corr`: A DataFrame of the correlation/strength-of-association between
      all features
    - `ax`: A Matplotlib `Axes`

    Example:
    --------
    See examples under `dython.examples`
    """
    df: pd.DataFrame = convert(dataset, "dataframe")  # type: ignore

    # `numerical_columns`, when given, is translated into the equivalent
    # `nominal_columns` value (its complement over the dataframe's columns)
    if numerical_columns is not None:
        if numerical_columns == "auto":
            nominal_columns = "auto"
        elif numerical_columns == "all":
            nominal_columns = None
        else:
            nominal_columns = [
                c for c in df.columns if c not in numerical_columns
            ]

    # handling NaN values in data
    if nan_strategy == _REPLACE:
        # handling pandas categorical
        df = _handling_category_for_nan_imputation(df, nan_replace_value)

        df.fillna(nan_replace_value, inplace=True)
    elif nan_strategy == _DROP_SAMPLES:
        df.dropna(axis=0, inplace=True)
    elif nan_strategy == _DROP_FEATURES:
        df.dropna(axis=1, inplace=True)
    elif nan_strategy == _DROP_SAMPLE_PAIRS:
        pass  # will be handled pair-by-pair during calculations
    else:
        raise ValueError(
            "Argument nan_strategy [{:s}] is not a valid choice.".format(
                nan_strategy
            )
        )

    # identifying categorical columns
    columns = df.columns
    # auto_nominal records that nominal columns were auto-detected, so they can
    # be re-detected after datetime columns are converted to numeric below
    auto_nominal = False
    if nominal_columns is None:
        nominal_columns = list()
    elif nominal_columns == "all":
        nominal_columns = columns.tolist()
    elif nominal_columns == "auto":
        auto_nominal = True
        nominal_columns = identify_nominal_columns(df)

    # selecting rows and columns to be displayed
    if hide_rows is not None:
        if isinstance(hide_rows, str) or isinstance(hide_rows, int):
| hide_rows = [hide_rows] # type: ignore 587 | display_rows = [c for c in df.columns if c not in hide_rows] # type: ignore 588 | else: 589 | if display_rows == "all": 590 | display_rows = columns.tolist() 591 | elif isinstance(display_rows, str) or isinstance(display_rows, int): 592 | display_columns = [display_rows] # type: ignore 593 | 594 | if hide_columns is not None: 595 | if isinstance(hide_columns, str) or isinstance(hide_columns, int): 596 | hide_columns = [hide_columns] # type: ignore 597 | display_columns = [c for c in df.columns if c not in hide_columns] # type: ignore 598 | else: 599 | if display_columns == "all": 600 | display_columns = columns.tolist() 601 | elif isinstance(display_columns, str) or isinstance( 602 | display_columns, int 603 | ): 604 | display_columns = [display_columns] # type: ignore 605 | 606 | if ( 607 | display_rows is None 608 | or display_columns is None 609 | or len(display_rows) < 1 610 | or len(display_columns) < 1 611 | ): 612 | raise ValueError( 613 | "display_rows and display_columns must have at least one element" 614 | ) 615 | displayed_features_set = set.union(set(display_rows), set(display_columns)) 616 | 617 | # Adjusting figsize based on the number of features 618 | if figsize is None: 619 | BASE_SIZE = 1.5 # Size multiplier per feature 620 | num_features = len(displayed_features_set) 621 | figsize = (BASE_SIZE * num_features, BASE_SIZE * num_features) 622 | 623 | # convert timestamp columns to numerical columns, so correlation can be performed 624 | datetime_dtypes = [ 625 | str(x) for x in df.dtypes if str(x).startswith("datetime64") 626 | ] # finding all timezones 627 | if datetime_dtypes: 628 | datetime_cols = identify_columns_by_type(df, datetime_dtypes) 629 | datetime_cols = [c for c in datetime_cols if c not in nominal_columns] 630 | if datetime_cols: 631 | df[datetime_cols] = df[datetime_cols].apply( 632 | lambda col: col.view(np.int64), axis=0 633 | ) 634 | if auto_nominal: 635 | nominal_columns = 
 identify_nominal_columns(df)

    # will be used to store associations values
    corr = pd.DataFrame(index=columns, columns=columns, dtype=np.float64)

    # this dataframe is used to keep track of invalid association values, which will be placed on top
    # of the corr dataframe. It is done for visualization purposes, so the heatmap values will remain
    # between -1 and 1
    inf_nan = pd.DataFrame(
        data=np.zeros_like(corr), columns=columns, index=columns, dtype="object"
    )

    # finding single-value columns
    single_value_columns_set = set()
    for c in displayed_features_set:
        if df[c].unique().size == 1:
            single_value_columns_set.add(c)

    # find the number of physical cpu cores available
    n_cores = cpu_count(logical=False)

    # current multiprocessing implementation performs worse on 2 cores than on 1 core,
    # so we only use multiprocessing if there are more than 2 physical cores available
    if multiprocessing and n_cores > 2:
        # find out the list of cartesian products of the column indices
        number_of_columns = len(columns)
        list_of_indices_pairs_lists = [
            (i, j)
            for i in range(number_of_columns)
            for j in range(number_of_columns)
        ]

        # do not exceed 32 cores under any circumstances
        if max_cpu_cores is not None:
            max_cpu_cores = min(32, min(max_cpu_cores, n_cores))
        else:
            max_cpu_cores = min(32, n_cores)

        # submit each list of cartesian products of column indices to separate processes
        # for faster computation.
        # process 1 receives: [(0, 0), (0, 1), (0, 2), ... (0, n)]
        # process 2 receives: [(1, 0), (1, 1), (1, 2), ... (1, n)]
        # ...
        # process m receives: [(n, 0), (n, 1), (n, 2), ... (n, n)]
        # where, n = num_columns - 1
        with cf.ProcessPoolExecutor(max_workers=max_cpu_cores) as executor:
            results = executor.map(
                _compute_associations,
                list_of_indices_pairs_lists,
                repeat(df),
                repeat(displayed_features_set),
                repeat(single_value_columns_set),
                repeat(nominal_columns),
                repeat(symmetric_nom_nom),
                repeat(nom_nom_assoc),
                repeat(cramers_v_bias_correction),
                repeat(num_num_assoc),
                repeat(nom_num_assoc),
                repeat(symmetric_num_num),
                repeat(nan_strategy),
                chunksize=max(
                    1, len(list_of_indices_pairs_lists) // max_cpu_cores
                ),
            )
    else:
        results: Iterable[Tuple] = []

        # single-core path computes only the upper triangle; each result
        # carries both the (i,j) and (j,i) values
        for i in range(0, len(columns)):
            for j in range(i, len(columns)):
                results.append(
                    _compute_associations(
                        (i, j),
                        df,
                        displayed_features_set,
                        single_value_columns_set,
                        nominal_columns,
                        symmetric_nom_nom,
                        nom_nom_assoc,
                        cramers_v_bias_correction,
                        num_num_assoc,
                        nom_num_assoc,
                        symmetric_num_num,
                        nan_strategy,
                    )
                )

    # fill the correlation dataframe with the results
    for result in results:
        try:
            if result[0] == _NO_OP:
                pass
            elif result[0] == _SINGLE_VALUE_COLUMN_OP:
                i = result[1]
                corr.loc[:, columns[i]] = 0.0
                corr.loc[columns[i], :] = 0.0
            elif result[0] == _I_EQ_J_OP:
                i, j = result[1:]
                corr.loc[columns[i], columns[j]] = 1.0
            else:
                # assoc_op
                i, j, ij, ji = result[1:]
                corr.loc[columns[i], columns[j]] = (
                    ij if not np.isnan(ij) and abs(ij) < np.inf else 0.0
                )
                corr.loc[columns[j], columns[i]] = (
                    ji if not np.isnan(ji) and abs(ji) < np.inf else 0.0
                )
                inf_nan.loc[columns[i], columns[j]] = _inf_nan_str(ij)
                inf_nan.loc[columns[j], columns[i]] = _inf_nan_str(ji)
        except Exception as exception:
            raise exception

    corr.fillna(value=np.nan, inplace=True)

    if clustering:
        corr, _ = cluster_correlations(corr)  # type: ignore
        inf_nan = inf_nan.reindex(columns=corr.columns).reindex(
            index=corr.index
        )

        # rearrange displayed rows and columns according to the clustered order
        display_columns = [c for c in corr.columns if c in display_columns]
        display_rows = [c for c in corr.index if c in display_rows]

    # keep only displayed columns and rows
    corr: pd.DataFrame = corr.loc[display_rows, display_columns]  # type: ignore
    inf_nan = inf_nan.loc[display_rows, display_columns]  # type: ignore

    if mark_columns:

        def mark(col):
            return (
                "{} (nom)".format(col)
                if col in nominal_columns
                else "{} (con)".format(col)
            )

        corr.columns = [mark(col) for col in corr.columns]
        corr.index = [mark(col) for col in corr.index]  # type: ignore
        inf_nan.columns = corr.columns
        inf_nan.index = corr.index
        single_value_columns_set = {
            mark(col) for col in single_value_columns_set
        }
        display_rows = [mark(col) for col in display_rows]
        display_columns = [mark(col) for col in display_columns]

    if not compute_only:
        # stash the plotting parameters so replot_last_associations can reuse them
        for v in [
            "corr",
            "inf_nan",
            "single_value_columns_set",
            "display_rows",
            "display_columns",
            "displayed_features_set",
            "nominal_columns",
            "figsize",
            "vmin",
            "vmax",
            "cbar",
            "cmap",
            "sv_color",
            "fmt",
            "annot",
            "title",
        ]:
            _ASSOC_PLOT_PARAMS[v] = locals()[v]
        ax = _plot_associations(ax, filename, plot, **_ASSOC_PLOT_PARAMS)
    return {"corr": corr, "ax": ax}  # type: ignore


def replot_last_associations(
    ax: Optional[plt.Axes] = None,
    figsize: Optional[Tuple[int, int]] = None,
    annot: Optional[bool] = None,
    fmt: Optional[str] = None,
    cmap:
 Optional[Colormap] = None,
    sv_color: Optional[str] = None,
    cbar: Optional[bool] = None,
    vmax: Optional[float] = None,
    vmin: Optional[float] = None,
    plot: bool = True,
    title: Optional[str] = None,
    filename: Optional[str] = None,
) -> plt.Axes:
    """
    Re-plot last computed associations heat-map. This method performs no new computations, but only allows
    to change the visual output of the last computed heat-map.

    Parameters:
    -----------
    ax : matplotlib ax, default = None
        Matplotlib Axis on which the heat-map will be plotted
    figsize : (int,int) or None, default = None
        A Matplotlib figure-size tuple. If `None`, uses the last `associations` call value.
        Only used if `ax=None`.
    annot : Boolean or None, default = None
        Plot number annotations on the heat-map. If `None`, uses the last `associations` call value.
    fmt : string, default = None
        String formatting of annotations. If `None`, uses the last `associations` call value.
    cmap : Matplotlib colormap or None, default = None
        A colormap to be used for the heat-map. If `None`, uses the last `associations` call value.
    sv_color : string, default = None
        A Matplotlib color. The color to be used when displaying single-value features.
        If `None`, uses the last `associations` call value.
    cbar : Boolean or None, default = None
        Display heat-map's color-bar. If `None`, uses the last `associations` call value.
    vmax : float or None, default = None
        Set heat-map vmax option. If `None`, uses the last `associations` call value.
    vmin : float or None, default = None
        Set heat-map vmin option. If `None`, uses the last `associations` call value.
    plot : Boolean, default = True
        Plot a heat-map of the correlation matrix. If False, plotting still
        happens, but the heat-map will not be displayed.
849 | title : string or None, default = None 850 | Plotted graph title. If `None`, uses the last `associations` call value. 851 | filename : string or None, default = None 852 | If not None, plot will be saved to the given file name. Note: in order to avoid accidental file 853 | overwrites, the last `associations` call value is never used, and when filename is set to None, 854 | no writing to file occurs. 855 | 856 | Returns: 857 | -------- 858 | A Matplotlib `Axes` 859 | """ 860 | if not bool(_ASSOC_PLOT_PARAMS): 861 | raise RuntimeError("No associations found to replot.") 862 | new_vars = locals() 863 | new_vars.pop("filename") 864 | new_vars.pop("ax") 865 | new_vars.pop("plot") 866 | plot_vars = _ASSOC_PLOT_PARAMS.copy() 867 | for v in new_vars: 868 | plot_vars[v] = new_vars[v] or plot_vars[v] 869 | return _plot_associations(ax, filename, plot, **plot_vars) 870 | 871 | 872 | def _plot_associations( 873 | ax: Optional[plt.Axes], 874 | filename: Optional[str], 875 | plot: bool, 876 | corr: pd.DataFrame, 877 | inf_nan: pd.DataFrame, 878 | single_value_columns_set: Set[str], 879 | display_rows: List[str], 880 | display_columns: List[str], 881 | displayed_features_set: Set[str], 882 | nominal_columns: List[str], 883 | figsize: Tuple[int, int], 884 | vmin: Optional[Number], 885 | vmax: Number, 886 | cbar: bool, 887 | cmap: Colormap, 888 | sv_color: str, 889 | fmt: str, 890 | annot: bool, 891 | title: str, 892 | ) -> plt.Axes: 893 | if ax is None: 894 | plt.figure(figsize=figsize) 895 | if inf_nan.any(axis=None): 896 | inf_nan_mask = np.vectorize(lambda x: not bool(x))(inf_nan.values) 897 | ax = sns.heatmap( 898 | inf_nan_mask, 899 | cmap=["white"], 900 | annot=inf_nan if annot else None, 901 | fmt="", 902 | center=0, 903 | square=True, 904 | ax=ax, 905 | mask=inf_nan_mask, 906 | cbar=False, 907 | ) 908 | else: 909 | inf_nan_mask = np.ones_like(corr) 910 | if len(single_value_columns_set) > 0: 911 | sv = pd.DataFrame( 912 | data=np.zeros_like(corr), 913 | 
            columns=corr.columns,
            index=corr.index,
            dtype="object",
        )
        # mark single-value features with an "SV" cell over a blanked row/column
        for c in single_value_columns_set:
            if c in display_rows and c in display_columns:
                sv.loc[:, c] = " "
                sv.loc[c, :] = " "
                sv.loc[c, c] = "SV"
            elif c in display_rows:
                sv.loc[c, :] = " "
                sv.loc[c, sv.columns[0]] = "SV"
            else:  # c in display_columns
                sv.loc[:, c] = " "
                sv.loc[sv.index[-1], c] = "SV"
        sv_mask = np.vectorize(lambda x: not bool(x))(sv.values)
        ax = sns.heatmap(
            sv_mask,
            cmap=[sv_color],
            annot=sv if annot else None,
            fmt="",
            center=0,
            square=True,
            ax=ax,
            mask=sv_mask,
            cbar=False,
        )
    else:
        sv_mask = np.ones_like(corr)
    # cells already drawn by the inf/nan or single-value layers are masked out
    mask = np.vectorize(lambda x: not bool(x))(inf_nan_mask) + np.vectorize(
        lambda x: not bool(x)
    )(sv_mask)
    # NOTE(review): `vmin or ...` means an explicit vmin=0.0 is falsy and falls
    # back to the automatic choice - confirm this is intended
    vmin = vmin or (
        -1.0 if len(displayed_features_set) - len(nominal_columns) >= 2 else 0.0
    )
    ax = sns.heatmap(
        corr,
        cmap=cmap,
        annot=annot,
        fmt=fmt,
        center=0,
        vmax=vmax,
        vmin=vmin,
        square=True,
        mask=mask,
        ax=ax,
        cbar=cbar,
    )
    plt.title(title)
    if filename:
        plt.savefig(filename)
    plot_or_not(plot)
    return ax


def _handling_category_for_nan_imputation(
    dataset: pd.DataFrame, nan_replace_value: Any
) -> pd.DataFrame:
    """
    Make pandas `category` columns able to accept the NaN replacement value(s)
    by registering any missing replacement values as new categories.
    """
    pd_categorical_columns = identify_columns_by_type(
        dataset, include=["category"]
    )
    if pd_categorical_columns:
        for col in pd_categorical_columns:
            if isinstance(nan_replace_value, pd.DataFrame):
                values_ = nan_replace_value[col].unique().tolist()
                values = [
                    x for x in values_ if x not in dataset[col].cat.categories
                ]
                dataset[col] = dataset[col].cat.add_categories(values)
            else:
                if isinstance(nan_replace_value, dict):
                    value = nan_replace_value[col]
                else:
                    value
 = nan_replace_value
                if not value in dataset[col].cat.categories:
                    dataset[col] = dataset[col].cat.add_categories(value)
    return dataset


def _nom_num(
    nom_column: OneDimArray,
    num_column: OneDimArray,
    nom_num_assoc: Union[Callable, NomNumAssocStr],
) -> Tuple[Number, Number]:
    """
    Computes the nominal-numerical association value.

    Returns the (ij, ji) pair; both nominal-numerical options used here are
    symmetric, so the two values are equal.
    """
    if callable(nom_num_assoc):
        cell = nom_num_assoc(nom_column, num_column)
        ij = cell
        ji = cell
    elif nom_num_assoc == "correlation_ratio":
        cell = correlation_ratio(nom_column, num_column, nan_strategy=_SKIP)
        ij = cell
        ji = cell
    else:
        raise ValueError(
            f"{nom_num_assoc} is not a supported nominal-numerical association"
        )
    return ij, ji


def _compute_associations(
    indices_pair: Tuple[int, int],
    dataset: pd.DataFrame,
    displayed_features_set: Set[str],
    single_value_columns_set: Set[str],
    nominal_columns: Union[OneDimArray, List[str], str],
    symmetric_nom_nom: bool,
    nom_nom_assoc: Union[
        NomNomAssocStr, Callable[[pd.Series, pd.Series], Number]
    ],
    cramers_v_bias_correction: bool,
    num_num_assoc: Union[
        NumNumAssocStr, Callable[[pd.Series, pd.Series], Number]
    ],
    nom_num_assoc: Union[
        NomNumAssocStr, Callable[[pd.Series, pd.Series], Number]
    ],
    symmetric_num_num: bool,
    nan_strategy: str,
) -> Tuple:
    """
    Helper function of associations.

    Parameters:
    -----------
    indices_pair: Tuple[int, int]
        The tuple of indices pairs (i, j)
    dataset: pandas.Dataframe
        the pandas dataframe
    displayed_features_set: Set[str]
        The set of { display_rows } ∪ { display_columns }
    single_value_columns_set: Set[str]
        The set of single-value columns
    nominal_columns : string / list / NumPy ndarray, default = 'auto'
        Names of columns of the data-set which hold categorical values. Can
        also be the string 'all' to state that all columns are categorical,
        'auto' (default) to try to identify nominal columns, or None to state
        none are categorical. Only used if `numerical_columns` is `None`.
    symmetric_nom_nom : Boolean, default = True
        Relevant only if `nom_nom_assoc` is a callable. Declare whether the function is symmetric (f(x,y) = f(y,x)).
        If False, heat-map values should be interpreted as f(row,col)
    nom_nom_assoc : callable / string, default = 'cramer'
        If callable, a function which receives two `pd.Series` and returns a single number.
        If string, name of nominal-nominal (categorical-categorical) association to use.
        Options are 'cramer' for Cramer's V or `theil` for Theil's U. If 'theil',
        heat-map columns are the provided information (U = U(row|col)).
    num_num_assoc : callable / string, default = 'pearson'
        If callable, a function which receives two `pd.Series` and returns a single number.
        If string, name of numerical-numerical association to use. Options are 'pearson'
        for Pearson's R, 'spearman' for Spearman's R, 'kendall' for Kendall's Tau.
    nom_num_assoc : callable / string, default = 'correlation_ratio'
        If callable, a function which receives two `pd.Series` and returns a single number.
        If string, name of nominal-numerical association to use. Options are 'correlation_ratio'
        for correlation ratio.
    symmetric_num_num : Boolean, default = True
        Relevant only if `num_num_assoc` is a callable. Declare whether the function is symmetric (f(x,y) = f(y,x)).
        If False, heat-map values should be interpreted as f(row,col)
    cramers_v_bias_correction : Boolean, default = True
        Use bias correction for Cramer's V from Bergsma and Wicher,
        Journal of the Korean Statistical Society 42 (2013): 323-328.
    nan_strategy: string
        The provided nan_strategy to associations

    Returns:
    --------
    A tuple with one of the following strings in the 0-th index:
    * _NO_OP
    * _SINGLE_VALUE_COLUMN_OP
    * _I_EQ_J_OP
    * _ASSOC_OP
    Then, additionally, it can carry multiple numerical values.
    """
    columns = dataset.columns

    i, j = indices_pair
    if columns[i] not in displayed_features_set:
        return (_NO_OP, None)
    if columns[i] in single_value_columns_set:
        return (_SINGLE_VALUE_COLUMN_OP, i)

    if (
        columns[j] in single_value_columns_set
        or columns[j] not in displayed_features_set
    ):
        return (_NO_OP, None)
    elif i == j:
        return (_I_EQ_J_OP, i, j)
    else:
        if nan_strategy in [
            _DROP_SAMPLE_PAIRS,
        ]:
            # pairwise-complete observations: drop rows where either member
            # of this column pair is missing
            dataset_c_ij = dataset[[columns[i], columns[j]]].dropna(axis=0)
            c_i, c_j = dataset_c_ij[columns[i]], dataset_c_ij[columns[j]]
        else:
            c_i, c_j = dataset[columns[i]], dataset[columns[j]]
        if columns[i] in nominal_columns:
            if columns[j] in nominal_columns:
                # nominal-nominal association
                if callable(nom_nom_assoc):
                    if symmetric_nom_nom:
                        cell = nom_nom_assoc(c_i, c_j)
                        ij = cell
                        ji = cell
                    else:
                        ij = nom_nom_assoc(c_i, c_j)
                        ji = nom_nom_assoc(c_j, c_i)
                elif nom_nom_assoc == "theil":
                    ij = theils_u(
                        c_i,
                        c_j,
                        nan_strategy=_SKIP,
                    )
                    ji = theils_u(
                        c_j,
                        c_i,
                        nan_strategy=_SKIP,
                    )
                elif nom_nom_assoc == "cramer":
                    cell = cramers_v(
                        c_i,
                        c_j,
                        bias_correction=cramers_v_bias_correction,
                        nan_strategy=_SKIP,
                    )
                    ij = cell
                    ji = cell
                else:
                    raise ValueError(
                        f"{nom_nom_assoc} is not a supported nominal-nominal association"
                    )
            else:
                ij, ji = _nom_num(
                    nom_column=c_i, num_column=c_j, nom_num_assoc=nom_num_assoc
                )
        else:
            if columns[j] in nominal_columns:
                ij, ji = _nom_num(
                    nom_column=c_j, num_column=c_i, nom_num_assoc=nom_num_assoc
                )
            else:
                # numerical-numerical association
                if callable(num_num_assoc):
                    if symmetric_num_num:
                        cell = num_num_assoc(c_i, c_j)
                        ij = cell
                        ji = cell
                    else:
                        ij = num_num_assoc(c_i, c_j)
                        ji = num_num_assoc(c_j, c_i)
                else:
                    if num_num_assoc == "pearson":
                        cell, _ = ss.pearsonr(c_i, c_j)
                    elif num_num_assoc == "spearman":
                        cell, _ = ss.spearmanr(c_i, c_j)
                    elif num_num_assoc == "kendall":
                        cell, _ = ss.kendalltau(c_i, c_j)
                    else:
                        raise ValueError(
                            f"{num_num_assoc} is not a supported numerical-numerical association"
                        )
                    ij = cell
                    ji = cell

    return (_ASSOC_OP, i, j, ij, ji)


def numerical_encoding(
    dataset: TwoDimArray,
    nominal_columns: Optional[
        Union[List[str], Literal["all", "auto"]]
    ] = "auto",
    drop_single_label: bool = False,
    drop_fact_dict: bool = True,
    nan_strategy: str = _REPLACE,
    nan_replace_value: Any = _DEFAULT_REPLACE_VALUE,
) -> Union[pd.DataFrame, Tuple[pd.DataFrame, Dict]]:
    """
    Encoding a data-set with mixed data (numerical and categorical) to a
    numerical-only data-set using the following logic:
    * categorical with only a single value will be marked as zero (or dropped,
if requested) 1195 | * categorical with two values will be replaced with the result of Pandas 1196 | `factorize` 1197 | * categorical with more than two values will be replaced with the result 1198 | of Pandas `get_dummies` 1199 | * numerical columns will not be modified 1200 | 1201 | Parameters: 1202 | ----------- 1203 | dataset : NumPy ndarray / Pandas DataFrame 1204 | The data-set to encode 1205 | nominal_columns : None / sequence / string. default = 'all' 1206 | A sequence of the nominal (categorical) columns in the dataset. If 1207 | string, must be 'all' or 'auto. If 'all' to state that all columns are nominal. 1208 | If 'auto', categorical columns will be identified 1209 | based on dtype. If None, nothing happens. 1210 | drop_single_label : Boolean, default = False 1211 | If True, nominal columns with a only a single value will be dropped. 1212 | drop_fact_dict : Boolean, default = True 1213 | If True, the return value will be the encoded DataFrame alone. If 1214 | False, it will be a tuple of the DataFrame and the dictionary of the 1215 | binary factorization (originating from pd.factorize) 1216 | nan_strategy : string, default = 'replace' 1217 | How to handle missing values: can be either 'drop_samples' to remove 1218 | samples with missing values, 'drop_features' to remove features 1219 | (columns) with missing values, or 'replace' to replace all missing 1220 | values with the nan_replace_value. Missing values are None and np.nan. 1221 | nan_replace_value : any, default = 0.0 1222 | The value used to replace missing values with. Only applicable when nan 1223 | _strategy is set to 'replace' 1224 | 1225 | Returns: 1226 | -------- 1227 | DataFrame or (DataFrame, dict). If `drop_fact_dict` is True, 1228 | returns the encoded DataFrame. 1229 | else, returns a tuple of the encoded DataFrame and dictionary, where each 1230 | key is a two-value column, and the value is the original labels, as 1231 | supplied by Pandas `factorize`. 
Will be empty if no two-value columns are 1232 | present in the data-set 1233 | """ 1234 | df: pd.DataFrame = convert(dataset, "dataframe") # type: ignore 1235 | if nan_strategy == _REPLACE: 1236 | df.fillna(nan_replace_value, inplace=True) 1237 | elif nan_strategy == _DROP_SAMPLES: 1238 | df.dropna(axis=0, inplace=True) 1239 | elif nan_strategy == _DROP_FEATURES: 1240 | df.dropna(axis=1, inplace=True) 1241 | if nominal_columns is None: 1242 | return df 1243 | elif nominal_columns == "all": 1244 | nominal_columns = df.columns.tolist() 1245 | elif nominal_columns == "auto": 1246 | nominal_columns = identify_nominal_columns(df) 1247 | converted_dataset = pd.DataFrame() 1248 | binary_columns_dict = dict() 1249 | for col in df.columns: 1250 | if col not in nominal_columns: 1251 | converted_dataset.loc[:, col] = df[col] 1252 | else: 1253 | unique_values = pd.unique(df[col]) 1254 | if len(unique_values) == 1 and not drop_single_label: 1255 | converted_dataset.loc[:, col] = 0 1256 | elif len(unique_values) == 2: 1257 | ( 1258 | converted_dataset.loc[:, col], 1259 | binary_columns_dict[col], 1260 | ) = pd.factorize(df[col]) 1261 | else: 1262 | dummies = pd.get_dummies(df[col], prefix=col) 1263 | converted_dataset = pd.concat( 1264 | [converted_dataset, dummies], axis=1 1265 | ) 1266 | if drop_fact_dict: 1267 | return converted_dataset 1268 | else: 1269 | return converted_dataset, binary_columns_dict 1270 | 1271 | 1272 | def cluster_correlations( 1273 | corr_mat: TwoDimArray, indices: Optional[ArrayLike] = None 1274 | ) -> Tuple[TwoDimArray, ArrayLike]: 1275 | """ 1276 | Apply agglomerative clustering in order to sort 1277 | a correlation matrix. 

    Based on https://github.com/TheLoneNut/CorrelationMatrixClustering/blob/master/CorrelationMatrixClustering.ipynb

    Parameters:
    -----------
    - corr_mat : a square correlation matrix (pandas DataFrame)
    - indices : cluster labels [None]; if not provided we'll do
      an agglomerative clustering to get cluster labels.

    Returns:
    --------
    - corr : a sorted correlation matrix
    - indices : cluster indexes based on the original dataset

    Example:
    --------
    >>> assoc = associations(
    ...     iris_df,
    ...     plot=False
    ... )
    >>> correlations = assoc['corr']
    >>> correlations, _ = cluster_correlations(correlations)
    """
    df: pd.DataFrame = convert(corr_mat, "dataframe")  # type: ignore
    if indices is None:
        # No labels supplied: cluster the matrix rows with complete-linkage
        # agglomerative clustering, cutting the dendrogram at half the
        # maximum pairwise distance.
        X = df.values
        d = sch.distance.pdist(X)
        L = sch.linkage(d, method="complete")
        ind: ArrayLike = sch.fcluster(L, 0.5 * d.max(), "distance")  # type: ignore
    else:
        ind = indices

    # Reorder both axes so columns of the same cluster end up adjacent.
    columns = [df.columns.tolist()[i] for i in list((np.argsort(ind)))]
    df = df.reindex(columns=columns).reindex(index=columns)

    # Return the same container type the caller passed in.
    if isinstance(corr_mat, np.ndarray):
        return df.to_numpy(), ind
    else:
        return df, ind
--------------------------------------------------------------------------------
/dython/sampling.py:
--------------------------------------------------------------------------------
import numpy as np
from typing import Union, List
from numpy.typing import NDArray
from .typing import Number, OneDimArray


__all__ = ["boltzmann_sampling", "weighted_sampling"]


def _w_sampling(
    numbers: OneDimArray, k: int, with_replacement: bool, force_to_list: bool
) -> Union[Number, OneDimArray]:
    # Core sampler shared by weighted_sampling and boltzmann_sampling.
    # With k=None, np.random.choice(size=None) returns a single scalar.
    # NOTE(review): no `p=` is passed, so the draw over `numbers` is uniform;
    # the "weighting" comes from the values themselves — confirm intended.
    sampled = np.random.choice(numbers, size=k, replace=with_replacement)
    # Mirror the input container type: list in -> list out (scalars excluded).
    if (isinstance(numbers, list) or force_to_list) and k is not None:
        sampled = sampled.tolist()
    return sampled


def weighted_sampling(
    numbers: OneDimArray, k: int = 1, with_replacement: bool = False
) -> Union[Number, OneDimArray]:
    """
    Return k numbers from a weighted-sampling over the supplied numbers

    Parameters:
    -----------
    numbers : List or np.ndarray
        numbers to sample
    k : int, default = 1
        How many numbers to sample. Choosing `k=None` will yield a single
        number
    with_replacement : Boolean, default = False
        Allow replacement or not

    Returns:
    --------
    List, np.ndarray or a single number (depending on the input)
    """
    return _w_sampling(numbers, k, with_replacement, force_to_list=False)


def boltzmann_sampling(
    numbers: OneDimArray, k: int = 1, with_replacement: bool = False
) -> Union[Number, OneDimArray]:
    """
    Return k numbers from a boltzmann-sampling over the supplied numbers

    Parameters:
    -----------
    numbers : List or np.ndarray
        numbers to sample
    k : int, default = 1
        How many numbers to sample. Choosing `k=None` will yield a single
        number
    with_replacement : Boolean, default = False
        Allow replacement or not

    Returns:
    --------
    List, np.ndarray or a single number (depending on the input)
    """
    # Softmax-transform the inputs (Boltzmann distribution), then sample
    # from the transformed values.
    # NOTE(review): np.exp overflows for large magnitudes — presumably inputs
    # are small scores; confirm before using with large values.
    exp_func = np.vectorize(lambda x: np.exp(x))
    exp_numbers = exp_func(numbers)
    exp_sum = exp_numbers.sum()
    scaling_func = np.vectorize(lambda x: x / exp_sum)
    b_numbers = scaling_func(exp_numbers)
    return _w_sampling(
        b_numbers,
        k=k,
        with_replacement=with_replacement,
        force_to_list=isinstance(numbers, list),
    )
--------------------------------------------------------------------------------
/dython/typing.py:
--------------------------------------------------------------------------------
from pandas import DataFrame, Series
from typing import List, Union, Any
from numpy.typing import NDArray


# Common type aliases shared across dython's public API.
Number = Union[int, float]
OneDimArray = Union[List[Number], NDArray, Series]
TwoDimArray = Union[NDArray, DataFrame]
--------------------------------------------------------------------------------
/logos/README.md:
--------------------------------------------------------------------------------
Logos were made using [hatchful](https://hatchful.shopify.com/).
2 | -------------------------------------------------------------------------------- /logos/dython_300x200.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/shakedzy/dython/152c3db0766e8358c7ae0ed257ecc7d47c1f2e8c/logos/dython_300x200.png -------------------------------------------------------------------------------- /logos/facebook_cover_photo_1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/shakedzy/dython/152c3db0766e8358c7ae0ed257ecc7d47c1f2e8c/logos/facebook_cover_photo_1.png -------------------------------------------------------------------------------- /logos/facebook_cover_photo_2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/shakedzy/dython/152c3db0766e8358c7ae0ed257ecc7d47c1f2e8c/logos/facebook_cover_photo_2.png -------------------------------------------------------------------------------- /logos/facebook_profile_image.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/shakedzy/dython/152c3db0766e8358c7ae0ed257ecc7d47c1f2e8c/logos/facebook_profile_image.png -------------------------------------------------------------------------------- /logos/favicon.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/shakedzy/dython/152c3db0766e8358c7ae0ed257ecc7d47c1f2e8c/logos/favicon.png -------------------------------------------------------------------------------- /logos/instagram_profile_image.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/shakedzy/dython/152c3db0766e8358c7ae0ed257ecc7d47c1f2e8c/logos/instagram_profile_image.png -------------------------------------------------------------------------------- 
/logos/linkedin_banner_image_1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/shakedzy/dython/152c3db0766e8358c7ae0ed257ecc7d47c1f2e8c/logos/linkedin_banner_image_1.png -------------------------------------------------------------------------------- /logos/linkedin_banner_image_2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/shakedzy/dython/152c3db0766e8358c7ae0ed257ecc7d47c1f2e8c/logos/linkedin_banner_image_2.png -------------------------------------------------------------------------------- /logos/linkedin_profile_image.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/shakedzy/dython/152c3db0766e8358c7ae0ed257ecc7d47c1f2e8c/logos/linkedin_profile_image.png -------------------------------------------------------------------------------- /logos/logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/shakedzy/dython/152c3db0766e8358c7ae0ed257ecc7d47c1f2e8c/logos/logo.png -------------------------------------------------------------------------------- /logos/logo_transparent.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/shakedzy/dython/152c3db0766e8358c7ae0ed257ecc7d47c1f2e8c/logos/logo_transparent.png -------------------------------------------------------------------------------- /logos/pinterest_board_photo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/shakedzy/dython/152c3db0766e8358c7ae0ed257ecc7d47c1f2e8c/logos/pinterest_board_photo.png -------------------------------------------------------------------------------- /logos/pinterest_profile_image.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/shakedzy/dython/152c3db0766e8358c7ae0ed257ecc7d47c1f2e8c/logos/pinterest_profile_image.png -------------------------------------------------------------------------------- /logos/twitter_header_photo_1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/shakedzy/dython/152c3db0766e8358c7ae0ed257ecc7d47c1f2e8c/logos/twitter_header_photo_1.png -------------------------------------------------------------------------------- /logos/twitter_header_photo_2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/shakedzy/dython/152c3db0766e8358c7ae0ed257ecc7d47c1f2e8c/logos/twitter_header_photo_2.png -------------------------------------------------------------------------------- /logos/twitter_profile_image.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/shakedzy/dython/152c3db0766e8358c7ae0ed257ecc7d47c1f2e8c/logos/twitter_profile_image.png -------------------------------------------------------------------------------- /logos/youtube_profile_image.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/shakedzy/dython/152c3db0766e8358c7ae0ed257ecc7d47c1f2e8c/logos/youtube_profile_image.png -------------------------------------------------------------------------------- /mkdocs.yml: -------------------------------------------------------------------------------- 1 | site_name: dython 2 | site_description: A set of data-analysis tools for Python 3.x 3 | site_long_description: Dython is a set of data-analysis tools written in Python 3.x, which can let you get more insights about your data. Dython was designed with analysis usage in mind - meaning ease-of-use, functionality and readability are the core values of this library. 
site_author: Shaked Zychlinski
site_url: http://shakedzy.xyz/dython/
repo_name: shakedzy/dython
repo_url: https://github.com/shakedzy/dython
copyright: Copyright © Shaked Zychlinski
theme:
  name: material
  custom_dir: docs/overrides
  palette:
    - media: "(prefers-color-scheme: light)"
      scheme: default
      primary: light blue
      accent: blue
      toggle:
        icon: material/toggle-switch-off-outline
        name: Switch to dark mode
    - media: "(prefers-color-scheme: dark)"
      scheme: slate
      primary: teal
      accent: cyan
      toggle:
        icon: material/toggle-switch
        name: Switch to light mode
  logo: images/favicon.png
  favicon: images/favicon.png
nav:
  - Home: 'index.md'
  - Getting Started:
      - Installation: 'getting_started/installation.md'
      - Examples: 'getting_started/examples.md'
  - Modules:
      - data_utils: 'modules/data_utils.md'
      - nominal: 'modules/nominal.md'
      - model_utils: 'modules/model_utils.md'
      - sampling: 'modules/sampling.md'
  - Related Blogposts: 'related_blogposts.md'
google_analytics:
  - UA-141245946-2
  - auto
markdown_extensions:
  - toc:
      permalink: true
  - admonition
  - codehilite
  - pymdownx.arithmatex:
      generic: true
  - pymdownx.highlight
  - pymdownx.superfences
extra_javascript:
  - https://cdnjs.cloudflare.com/ajax/libs/mathjax/2.7.0/MathJax.js?config=TeX-MML-AM_CHTML
plugins:
  - search
extra:
  social:
    - icon: material/web
      link: https://shakedzy.xyz/
    - icon: fontawesome/brands/github
      link: https://github.com/shakedzy
    - icon: fontawesome/brands/linkedin
      link: https://www.linkedin.com/in/shakedzy/
    - icon: fontawesome/brands/medium
      link: https://shakedzy.medium.com/
--------------------------------------------------------------------------------
/pyproject.toml:
--------------------------------------------------------------------------------
[tool.pytest.ini_options]
addopts = "--doctest-modules --doctest-continue-on-failure"
testpaths = [
    "./tests",
    "./dython"
]
doctest_optionflags = "NORMALIZE_WHITESPACE IGNORE_EXCEPTION_DETAIL ELLIPSIS"

[tool.black]
line-length = 80
target-version = ['py38']
--------------------------------------------------------------------------------
/pytest.ini:
--------------------------------------------------------------------------------
[tool.pytest-enabler.black]
addopts = "--black"
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
numpy>=1.23.0
pandas>=1.4.2
seaborn>=0.12.0
scipy>=1.7.1
matplotlib>=3.6.0
scikit-learn>=0.24.2
psutil>=5.9.1
setuptools
--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
import pathlib
from setuptools import setup, find_packages

# Resolve all paths relative to this file so builds work from any CWD.
HERE = pathlib.Path(__file__).parent.resolve()

PACKAGE_NAME = "dython"
AUTHOR = "Shaked Zychlinski"
AUTHOR_EMAIL = "shakedzy@gmail.com"
URL = "http://shakedzy.xyz/dython"
DOWNLOAD_URL = "https://pypi.org/project/dython/"

LICENSE = "MIT"
# The VERSION file is the single source of truth for the package version.
VERSION = (HERE / "VERSION").read_text(encoding="utf8").strip()
DESCRIPTION = "A set of data tools in Python"
LONG_DESCRIPTION = (HERE / "README.md").read_text(encoding="utf8")
LONG_DESC_TYPE = "text/markdown"

requirements = (HERE / "requirements.txt").read_text(encoding="utf8")
INSTALL_REQUIRES = [s.strip() for s in requirements.split("\n")]

dev_requirements = (HERE / "dev_requirements.txt").read_text(encoding="utf8")
EXTRAS_REQUIRE = {"dev": [s.strip() for s in dev_requirements.split("\n")]}

# Supported CPython versions: 3.<min_minor> through 3.<max_minor>.
min_minor = 9
max_minor = 12
CLASSIFIERS = [
    f"Programming Language :: Python :: 3.{str(v)}" for v in range(min_minor, max_minor+1)
]
PYTHON_REQUIRES = f">=3.{min_minor}"

setup(
    name=PACKAGE_NAME,
    version=VERSION,
    description=DESCRIPTION,
    long_description=LONG_DESCRIPTION,
    long_description_content_type=LONG_DESC_TYPE,
    author=AUTHOR,
    license=LICENSE,
    author_email=AUTHOR_EMAIL,
    url=URL,
    download_url=DOWNLOAD_URL,
    python_requires=PYTHON_REQUIRES,
    install_requires=INSTALL_REQUIRES,
    extras_require=EXTRAS_REQUIRE,
    packages=find_packages(),
    classifiers=CLASSIFIERS,
)
--------------------------------------------------------------------------------
/tests/test_data_utils/test_one_hot_encode.py:
--------------------------------------------------------------------------------
import pytest
from dython.data_utils import one_hot_encode


def test_one_hot_encode_check():
    # The encoded matrix shape should be (n_samples, max_value + 1).
    lst = [0, 0, 2, 5]
    row = len(lst)
    col = max(lst) + 1

    result = one_hot_encode(lst)
    assert result.shape == (row, col)


def test_negative_input():
    # Negative labels cannot be one-hot encoded.
    lst = [-1, -5, 0, 3]

    with pytest.raises(ValueError, match="negative value"):
        one_hot_encode(lst)


def test_more_than_one_dimension():
    # Only 1-D inputs are accepted.
    lst = [[0, 1], [2, 3]]

    with pytest.raises(ValueError, match="must have only one dimension"):
        one_hot_encode(lst)
--------------------------------------------------------------------------------
/tests/test_data_utils/test_split_hist.py:
--------------------------------------------------------------------------------
import pytest
import matplotlib.pyplot as plt

from dython.data_utils import split_hist


def test_split_hist_check(iris_df):
    # split_hist should return a matplotlib Axes object.
    result = split_hist(iris_df, "sepal length (cm)", "target")

    assert isinstance(result, 
plt.Axes) 11 | -------------------------------------------------------------------------------- /tests/test_model_utils/test_ks_abc.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | import numpy as np 3 | import matplotlib 4 | 5 | from dython.model_utils import ks_abc 6 | 7 | 8 | @pytest.fixture 9 | def y_true(): 10 | return np.array([0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1]) 11 | 12 | 13 | @pytest.fixture 14 | def y_pred(): 15 | return np.array([0.0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0]) 16 | 17 | 18 | def test_ks_abc_check_types(y_true, y_pred): 19 | result = ks_abc(y_true, y_pred) 20 | 21 | assert isinstance(result, dict), "ks_abc should return dict" 22 | 23 | assert "abc" in result, 'ks_abc should return dict containing "abc" key' 24 | assert isinstance( 25 | result["abc"], float 26 | ), "area between curves should be a float" 27 | 28 | assert ( 29 | "ks_stat" in result 30 | ), 'ks_abc should return dict containing "ks_stat" key' 31 | assert isinstance( 32 | result["ks_stat"], float 33 | ), "ks statistic should be a float" 34 | 35 | assert "eopt" in result, 'ks_abc should return dict containing "eopt" key' 36 | assert isinstance( 37 | result["eopt"], float 38 | ), "estimated optimal threshold should be a float" 39 | 40 | assert "ax" in result, 'ks_abc should return dict containing "ax" key' 41 | assert isinstance(result["ax"], matplotlib.axes.Axes) 42 | 43 | 44 | def test_ks_abc_check_known_value(y_true, y_pred): 45 | result = ks_abc(y_true, y_pred) 46 | 47 | assert result["abc"] == pytest.approx(0.55) 48 | assert result["ks_stat"] == pytest.approx(1.0) 49 | assert result["eopt"] == pytest.approx(0.4) 50 | -------------------------------------------------------------------------------- /tests/test_model_utils/test_metric_graph.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | import numpy as np 3 | import matplotlib 4 | 5 | from 
dython.model_utils import metric_graph 6 | 7 | 8 | @pytest.fixture 9 | def y_true(): 10 | return np.array([0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1]) 11 | 12 | 13 | @pytest.fixture 14 | def y_pred(): 15 | return np.array([0.0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0]) 16 | 17 | 18 | def test_metric_graph_check_types(y_true, y_pred): 19 | result = metric_graph(y_true, y_pred, "roc") 20 | 21 | assert isinstance(result, dict), "metric_graph should return a dict" 22 | 23 | assert "ax" in result, 'metric_graph should return dict containing "ax" key' 24 | 25 | assert isinstance(result["ax"], matplotlib.axes.Axes) 26 | 27 | 28 | def test_metric_graph_bad_metric_parameter(y_true, y_pred): 29 | with pytest.raises(ValueError, match="Invalid metric"): 30 | metric_graph(y_true, y_pred, "bad_metric_param") 31 | -------------------------------------------------------------------------------- /tests/test_model_utils/test_random_forest_feature_importance.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | from sklearn.ensemble import RandomForestClassifier 3 | 4 | from dython.model_utils import random_forest_feature_importance 5 | 6 | 7 | def test_random_forest_feature_importance_check_types(iris_df): 8 | X = iris_df.drop(["target", "extra"], axis=1) 9 | y = iris_df["target"].values 10 | 11 | clf = RandomForestClassifier(n_estimators=7) 12 | clf.fit(X, y) 13 | 14 | result = random_forest_feature_importance(clf, X.columns) 15 | 16 | assert isinstance(result, list) 17 | assert isinstance(result[0], tuple) 18 | assert isinstance(result[0][0], float) 19 | assert isinstance(result[0][1], str) 20 | -------------------------------------------------------------------------------- /tests/test_nominal/test_associations.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | import matplotlib 3 | import pandas as pd 4 | import numpy as np 5 | import scipy.stats as ss 6 | from sklearn 
# -- tests/test_nominal/test_associations.py --
import pytest
import matplotlib
import pandas as pd
import numpy as np
import scipy.stats as ss
from sklearn import datasets
from datetime import datetime, timedelta

from dython.nominal import associations, correlation_ratio


def test_return_type_check(iris_df):
    """associations returns a dict with a DataFrame 'corr' and an Axes 'ax'."""
    assoc = associations(iris_df)

    assert isinstance(assoc, dict), "associations should return a dict"
    assert (
        "corr" in assoc
    ), 'associations should return a dict containing "corr" key'
    assert (
        "ax" in assoc
    ), 'associations should return a dict containing "ax" key'

    assert isinstance(
        assoc["corr"], pd.DataFrame
    ), 'assoc["corr"] should be a pandas DataFrame'
    assert isinstance(
        assoc["ax"], matplotlib.axes.Axes
    ), 'assoc["ax"] should be a matplotlib Axes'


def test_dimension_check(iris_df):
    """The association matrix must be square and match the input's width."""
    corr = associations(iris_df)["corr"]
    corr_shape = corr.shape
    iris_shape = iris_df.shape

    assert corr_shape[0] == corr_shape[1], "association matrix has wrong shape"
    assert (
        corr_shape[1] == iris_shape[1]
    ), "association matrix has different shape from input data"


def test_single_value_zero_association(iris_df):
    """A constant column must have zero association with every other column."""
    SV_COL = 1
    iris_df.iloc[:, SV_COL] = 42

    corr = associations(iris_df)["corr"]

    assert (
        corr.iloc[:, SV_COL] == 0
    ).all(), "single-value variable should have zero association value"
    assert (
        corr.iloc[SV_COL, :] == 0
    ).all(), "single-value variable should have zero association value"


def test_bad_nom_nom_assoc_parameter(iris_df):
    # An unsupported nominal-nominal measure name must be rejected.
    with pytest.raises(ValueError, match="is not a supported"):
        associations(iris_df, nom_nom_assoc="bad_parameter_name")


def test_bad_num_num_assoc_parameter(iris_df):
    # An unsupported numerical-numerical measure name must be rejected.
    with pytest.raises(ValueError, match="is not a supported"):
        associations(iris_df, num_num_assoc="bad_parameter_name")


def test_compute_only_ax_is_none(iris_df):
    # compute_only=True should skip plotting entirely.
    assoc = associations(iris_df, compute_only=True)

    assert (
        assoc["ax"] is None
    ), 'associations with compute_only should return a None value for "ax" key'


def test_mark_columns(iris_df):
    # mark_columns=True annotates continuous columns with a "(con)" suffix.
    corr = associations(iris_df, mark_columns=True)["corr"]

    assert (
        "(con)" in corr.index[0]
    ), "first column should contain (con) mark if iris_df is used"


def test_udf(iris_df):
    """User-defined association callables must match the built-in measures."""

    def pr(x, y):
        return ss.pearsonr(x, y)[0]

    corr1 = associations(
        iris_df,
        plot=False,
        num_num_assoc="pearson",
        nom_num_assoc="correlation_ratio",
    )["corr"]
    corr2 = associations(
        iris_df, plot=False, num_num_assoc=pr, nom_num_assoc=correlation_ratio
    )["corr"]
    assert corr1.compare(
        corr2
    ).empty, (
        "Computation of built-in measures of associations differs from UDFs"
    )


def test_datetime_data():
    """Datetime columns should correlate like the ordering they encode."""
    dt = datetime(2020, 12, 1)
    end = datetime(2020, 12, 2)
    step = timedelta(seconds=5)
    result = []
    while dt < end:
        result.append(dt.strftime("%Y-%m-%d %H:%M:%S"))
        dt += step

    nums = list(range(len(result)))
    df = pd.DataFrame(
        {"dates": result, "up": nums, "down": sorted(nums, reverse=True)}
    )
    df["dates"] = pd.to_datetime(
        df["dates"], format="%Y-%m-%d %H:%M:%S"
    )  # without this, this column is considered as object rather than dates

    correct_corr = pd.DataFrame(
        columns=["dates", "up", "down"],
        index=["dates", "up", "down"],
        data=[[1.0, 1.0, -1.0], [1.0, 1.0, -1.0], [-1.0, -1.0, 1.0]],
    )
    corr = associations(df, plot=False)["corr"]
    assert corr.compare(
        correct_corr
    ).empty, f"datetime associations are incorrect. Test should have returned an empty dataframe, received: {corr.head()}"


def test_category_nan_replace(iris_df):
    """nan_strategy='replace' must cope with pandas.CategoricalDtype columns."""
    iris_df["extra"] = iris_df["extra"].astype("category")
    iris_df.loc[5, "extra"] = np.nan
    try:
        associations(iris_df, nan_strategy="replace")
    except TypeError as exception:
        # pytest.fail (unlike the previous `assert False, msg`) is not stripped
        # under `python -O` and reads as an explicit, intentional test failure.
        pytest.fail(
            f"nan_strategy='replace' with a pandas.CategoricalDtype column raised an exception {exception}"
        )


# -- tests/test_nominal/test_associations_parallel.py --
import pytest
import matplotlib
import pandas as pd
import scipy.stats as ss

from psutil import cpu_count
from datetime import datetime, timedelta

from dython.nominal import associations, correlation_ratio

# Use every physical core available for the multiprocessing tests.
MAX_CORE_COUNT = cpu_count(logical=False)


def test_return_type_check(iris_df):
    """Parallel associations returns the same dict structure as serial runs."""
    assoc = associations(
        iris_df, multiprocessing=True, max_cpu_cores=MAX_CORE_COUNT
    )

    assert isinstance(assoc, dict), "associations should return a dict"
    assert (
        "corr" in assoc
    ), 'associations should return a dict containing "corr" key'
    assert (
        "ax" in assoc
    ), 'associations should return a dict containing "ax" key'

    assert isinstance(
        assoc["corr"], pd.DataFrame
    ), 'assoc["corr"] should be a pandas DataFrame'
    assert isinstance(
        assoc["ax"], matplotlib.axes.Axes
    ), 'assoc["ax"] should be a matplotlib Axes'
def test_dimension_check(iris_df):
    """The parallel association matrix must be square and match the input."""
    corr = associations(
        iris_df, multiprocessing=True, max_cpu_cores=MAX_CORE_COUNT
    )["corr"]
    corr_shape = corr.shape
    iris_shape = iris_df.shape

    assert corr_shape[0] == corr_shape[1], "association matrix has wrong shape"
    assert (
        corr_shape[1] == iris_shape[1]
    ), "association matrix has different shape from input data"


def test_single_value_zero_association(iris_df):
    """A constant column must have zero association with every other column."""
    SV_COL = 1
    iris_df.iloc[:, SV_COL] = 42

    corr = associations(
        iris_df, multiprocessing=True, max_cpu_cores=MAX_CORE_COUNT
    )["corr"]

    assert (
        corr.iloc[:, SV_COL] == 0
    ).all(), "single-value variable should have zero association value"
    assert (
        corr.iloc[SV_COL, :] == 0
    ).all(), "single-value variable should have zero association value"


def test_bad_nom_nom_assoc_parameter(iris_df):
    # An unsupported nominal-nominal measure name must be rejected.
    with pytest.raises(ValueError, match="is not a supported"):
        associations(
            iris_df,
            nom_nom_assoc="bad_parameter_name",
            multiprocessing=True,
            max_cpu_cores=MAX_CORE_COUNT,
        )


def test_bad_num_num_assoc_parameter(iris_df):
    # An unsupported numerical-numerical measure name must be rejected.
    # Fixed: this test previously omitted the multiprocessing arguments, so -
    # unlike every other test in this parallel suite - it exercised the serial
    # code path instead of the multiprocessing one.
    with pytest.raises(ValueError, match="is not a supported"):
        associations(
            iris_df,
            num_num_assoc="bad_parameter_name",
            multiprocessing=True,
            max_cpu_cores=MAX_CORE_COUNT,
        )


def test_compute_only_ax_is_none(iris_df):
    # compute_only=True should skip plotting entirely.
    assoc = associations(
        iris_df,
        compute_only=True,
        multiprocessing=True,
        max_cpu_cores=MAX_CORE_COUNT,
    )

    assert (
        assoc["ax"] is None
    ), 'associations with compute_only should return a None value for "ax" key'


def test_mark_columns(iris_df):
    # mark_columns=True annotates continuous columns with a "(con)" suffix.
    corr = associations(
        iris_df,
        mark_columns=True,
        multiprocessing=True,
        max_cpu_cores=MAX_CORE_COUNT,
    )["corr"]

    assert (
        "(con)" in corr.index[0]
    ), "first column should contain (con) mark if iris_df is used"


def pr(x, y):
    # Module-level (not nested) so it can be pickled for multiprocessing.
    return ss.pearsonr(x, y)[0]


def test_udf(iris_df):
    """User-defined association callables must match the built-in measures."""
    corr1 = associations(
        iris_df,
        plot=False,
        num_num_assoc="pearson",
        nom_num_assoc="correlation_ratio",
        multiprocessing=True,
        max_cpu_cores=MAX_CORE_COUNT,
    )["corr"]
    corr2 = associations(
        iris_df,
        plot=False,
        num_num_assoc=pr,
        nom_num_assoc=correlation_ratio,
        multiprocessing=True,
        max_cpu_cores=MAX_CORE_COUNT,
    )["corr"]
    assert corr1.compare(
        corr2
    ).empty, (
        "Computation of built-in measures of associations differs from UDFs"
    )


def test_datetime_data():
    """Datetime columns should correlate like the ordering they encode."""
    dt = datetime(2020, 12, 1)
    end = datetime(2020, 12, 2)
    step = timedelta(seconds=5)
    result = []
    while dt < end:
        result.append(dt.strftime("%Y-%m-%d %H:%M:%S"))
        dt += step

    nums = list(range(len(result)))
    df = pd.DataFrame(
        {"dates": result, "up": nums, "down": sorted(nums, reverse=True)}
    )
    # without this, this column is considered as object rather than dates
    df["dates"] = pd.to_datetime(df["dates"], format="%Y-%m-%d %H:%M:%S")

    correct_corr = pd.DataFrame(
        columns=["dates", "up", "down"],
        index=["dates", "up", "down"],
        data=[[1.0, 1.0, -1.0], [1.0, 1.0, -1.0], [-1.0, -1.0, 1.0]],
    )
    corr = associations(
        df, plot=False, multiprocessing=True, max_cpu_cores=MAX_CORE_COUNT
    )["corr"]
    assert corr.compare(
        correct_corr
    ).empty, f"datetime associations are incorrect. Test should have returned an empty dataframe, received: {corr.head()}"


# -- tests/test_nominal/test_cluster_correlation.py --
import pytest
import numpy as np
import pandas as pd

from dython.nominal import cluster_correlations


@pytest.fixture
def corr_example():
    # A small symmetric correlation matrix with unit diagonal.
    return pd.DataFrame(
        np.array(
            [
                [1, 0.5, 0.7, 0.3],
                [0.5, 1, 0.8, 0.2],
                [0.7, 0.8, 1, 0.1],
                [0.3, 0.2, 0.1, 1],
            ]
        ),
        columns=list("ABCD"),
        index=list("ABCD"),
    )


def test_cluster_correlation_check_return_values(corr_example):
    """cluster_correlations returns (sorted DataFrame, ndarray of indices)."""
    result = cluster_correlations(corr_example)

    assert isinstance(result, tuple), "should return a tuple"

    sorted_corr, indices = result

    assert isinstance(
        sorted_corr, pd.DataFrame
    ), "sorted correlation should be a pd.DataFrame correlation matrix"
    assert isinstance(indices, np.ndarray), "indices should be a np.ndarray"


# -- tests/test_nominal/test_correlation_ratio.py --
import pytest
import numpy as np
from hypothesis import given, strategies as st, assume, settings, example

from dython.nominal import correlation_ratio


# Single-character category labels drawn from A-E.
categories = st.text(alphabet=list("ABCDE"), min_size=1, max_size=1)


@st.composite
def categories_and_measurements(draw):
    # Paired, equal-length category labels and float measurements.
    n = draw(st.integers(min_value=2, max_value=30))
    category_lists = st.lists(categories, min_size=n, max_size=n)
    measurement_lists = st.lists(st.floats(), min_size=n, max_size=n)

    return draw(category_lists), draw(measurement_lists)
@given(c_m=categories_and_measurements())
def test_correlation_ratio_value_range(c_m):
    """Correlation ratio lies in [0, 1], or is NaN for degenerate input."""
    category, measurement = c_m

    corr_ratio = correlation_ratio(category, measurement)

    assert 0.0 <= corr_ratio <= 1.0 or np.isnan(corr_ratio)


# -- tests/test_nominal/test_cramers_v.py --
import pytest
import functools
import numpy as np
from hypothesis import given, strategies as st, assume, settings, example

from dython.nominal import cramers_v


# "Patch" pytest.approx to increase its tolerance range
approx = functools.partial(pytest.approx, abs=1e-6, rel=1e-6)


def test_cramers_v_check(iris_df):
    """Cramer's V of the iris fixture columns matches a known value."""
    x = iris_df["extra"]
    y = iris_df["target"]

    # Note: this measure is symmetric
    assert cramers_v(x, y) == pytest.approx(0.14201914309546954)
    assert cramers_v(y, x) == pytest.approx(0.14201914309546954)


# Single-character category labels drawn from A-E.
categories = st.text(alphabet=list("ABCDE"), min_size=1, max_size=1)


@st.composite
def two_categorical_lists(draw):
    # Two independent category lists that share the same length.
    length = draw(st.integers(min_value=2, max_value=30))
    list_strategy = st.lists(categories, min_size=length, max_size=length)

    return draw(list_strategy), draw(list_strategy)


@given(x_y=two_categorical_lists())
def test_cramers_v_value_range(x_y):
    """Cramer's V lies in [0, 1], allowing tiny floating-point overshoot."""
    x, y = x_y

    v_xy = cramers_v(x, y)

    assume(not np.isnan(v_xy))

    # 0.0 <= v_xy <= 1.0 is false when v_xy == 1.00000000000004
    # hence this weird-looking assertion, to avoid hypothesis saying it's "flaky"
    assert (
        v_xy == pytest.approx(0.0)
        or 0.0 < v_xy < 1.0
        or v_xy == pytest.approx(1.0)
    )


@given(x_y=two_categorical_lists())
@settings(deadline=1000)
def test_cramers_v_symmetry(x_y):
    """cramers_v(x, y) must equal cramers_v(y, x)."""
    x, y = x_y
    v_xy = cramers_v(x, y)
    v_yx = cramers_v(y, x)

    # Can be overridden by passing nan_ok = True to
    # pytest.approx, but this feels more appropriate
    assume(not np.isnan(v_xy) and not np.isnan(v_yx))

    assert approx(v_xy) == approx(v_yx)
# -- tests/test_nominal/test_theils_u.py --
import pytest
from hypothesis import given, strategies as st, assume

from dython.nominal import theils_u


def test_theils_u_check(iris_df):
    """Theil's U of the iris fixture columns matches known values."""
    x = iris_df["extra"]
    y = iris_df["target"]

    # Note: this measure is not symmetric
    assert theils_u(x, y) == pytest.approx(0.02907500150218738)
    assert theils_u(y, x) == pytest.approx(0.0424761859049835)


# Single-character category labels drawn from A-E.
categories = st.text(alphabet=list("ABCDE"), min_size=1, max_size=1)


@given(x=st.lists(categories, min_size=2, max_size=30))
def test_theils_u_identity(x):
    # U(x|x) should always be 1: knowing x removes all uncertainty about x.
    assert theils_u(x, x) == pytest.approx(1.0)


@st.composite
def two_categorical_lists(draw):
    # Two independent category lists that share the same length.
    length = draw(st.integers(min_value=2, max_value=30))
    list_strategy = st.lists(categories, min_size=length, max_size=length)

    return draw(list_strategy), draw(list_strategy)


@given(x_y=two_categorical_lists())
def test_theils_u_value_range(x_y):
    """Theil's U is always within [0, 1]."""
    x, y = x_y

    u_xy = theils_u(x, y)

    assert 0.0 <= u_xy <= 1.0


# -- tests/test_private_helpers.py --
import numpy as np
import pandas as pd
import pytest
from sklearn import datasets
from dython._private import (
    convert,
    remove_incomplete_samples,
    replace_nan_with_value,
)
# Make pandas not emit SettingWithCopyWarning
# SettingWithCopyWarning looks relatively safe to ignore,
# compare with DeprecationWarning that eventually needs attention.
# https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#evaluation-order-matters
pd.set_option("mode.chained_assignment", None)


@pytest.fixture
def iris_df():
    # Local iris fixture; this module does not rely on the shared conftest one.
    iris = datasets.load_iris()
    df = pd.DataFrame(data=iris.data, columns=iris.feature_names)
    df["target"] = iris.target

    return df


@pytest.fixture(params=["str", "tuple", "dict"])
def bad_input(request):
    # Sample inputs that `convert` is expected to reject.
    samples = {
        "str": "EXAMPLE STRING",
        "tuple": ("EXAMPLE", "TUPLE"),
        "dict": {1: "EXAMPLE", 2: "DICT"},
    }
    return samples[request.param]


@pytest.mark.parametrize("output_type", ["list", "array", "dataframe"])
def test_convert_good_output_bad_input(bad_input, output_type):
    # Unconvertible inputs must raise regardless of the requested output type.
    with pytest.raises(TypeError, match="cannot handle data conversion"):
        convert(bad_input, output_type)


def test_convert_bad_output(iris_df):
    # An unrecognized output-type name must be rejected.
    with pytest.raises(ValueError, match="Unknown"):
        convert(iris_df, "bad_parameter")


@pytest.fixture
def x_y(iris_df):
    # The first two iris feature columns as a (Series, Series) pair.
    first_col, second_col = iris_df.columns[0], iris_df.columns[1]
    return iris_df[first_col], iris_df[second_col]


def test_remove_incomplete_cases_one_nan_each(x_y):
    x, y = x_y
    x[0] = None
    y[1] = None

    x_, y_ = remove_incomplete_samples(x, y)

    # One NaN in each series at distinct positions drops two rows in total.
    assert len(x_) == len(y_) == len(x) - 2


def test_remove_incomplete_cases_all_nan(x_y):
    x, y = x_y
    x = [None for _ in x]

    # When one side is entirely missing, nothing survives.
    x_, y_ = remove_incomplete_samples(x, y)
    assert len(x_) == len(y_) == 0


def test_replace_nan_one_nan_each(x_y):
    x, y = x_y
    x[0] = None
    y[1] = None

    x_, y_ = replace_nan_with_value(x, y, 1_000)

    # Replacement preserves length and fills the NaN slots with the value.
    assert len(x_) == len(y_) == len(y)
    assert x_[0] == y_[1] == 1_000
def test_replace_nan_all_nan(x_y):
    """replace_nan_with_value must fill every slot when one side is all-NaN."""
    x, y = x_y
    x = [None for _ in x]

    x_, y_ = replace_nan_with_value(x, y, 1_000)

    # Generator expression instead of a throwaway list inside all() (C419).
    assert all(elem == 1_000 for elem in x_)


# -- tests/test_sampling.py --
import pytest
import numpy as np
from dython.sampling import boltzmann_sampling, weighted_sampling


@pytest.fixture(params=["list", "array"])
def population(request):
    # The same five values, once as a plain list and once as an ndarray.
    if request.param == "list":
        return [0.0, 1.0, 2.0, 3.0, 4.0]
    elif request.param == "array":
        return np.array([0.0, 1.0, 2.0, 3.0, 4.0])


parametrize_sampling_funcs = pytest.mark.parametrize(
    "func", [boltzmann_sampling, weighted_sampling]
)


@parametrize_sampling_funcs
def test_k_none(func, population):
    """k=None draws a single sample, returned as a bare numpy scalar."""
    result = func(population, k=None)
    assert type(result) is np.float64


@parametrize_sampling_funcs
@pytest.mark.parametrize("k", [1, 2])
def test_k_number(func, population, k):
    """k >= 1 returns a container of the same type as the input population."""
    result = func(population, k=k)
    # `is` rather than `==` for type comparison (flake8 E721). isinstance is
    # deliberately not used: the point is an exact type match (list stays
    # list, ndarray stays ndarray), not subclass acceptance.
    assert type(result) is type(
        population
    ), "Sampling with k != None should return same type as input"