├── .gitattributes
├── .github
│   ├── FUNDING.yml
│   ├── ISSUE_TEMPLATE
│   │   ├── error---bug-report.md
│   │   ├── feature-request.md
│   │   └── general-question.md
│   └── workflows
│       ├── pytest.yml
│       └── pythonpublish.yml
├── .gitignore
├── .pre-commit-config.yaml
├── CHANGELOG.md
├── CODE_OF_CONDUCT.md
├── CONTRIBUTING.md
├── LICENSE
├── MANIFEST.in
├── README.md
├── VERSION
├── conftest.py
├── dev_requirements.txt
├── docs
│   ├── _layouts
│   │   └── default.html
│   ├── getting_started
│   │   ├── examples.md
│   │   └── installation.md
│   ├── images
│   │   ├── associations_iris_example.png
│   │   ├── associations_mushrooms_example.png
│   │   ├── favicon.png
│   │   ├── index_banner.png
│   │   ├── ks_example.png
│   │   ├── logo.png
│   │   ├── pr_example.png
│   │   ├── roc_example.png
│   │   ├── social_banner.png
│   │   └── split_hist_example.png
│   ├── index.md
│   ├── modules
│   │   ├── data_utils.md
│   │   ├── model_utils.md
│   │   ├── nominal.md
│   │   └── sampling.md
│   ├── overrides
│   │   └── main.html
│   └── related_blogposts.md
├── dython
│   ├── __init__.py
│   ├── _private.py
│   ├── data_utils.py
│   ├── examples.py
│   ├── model_utils.py
│   ├── nominal.py
│   ├── sampling.py
│   └── typing.py
├── logos
│   ├── README.md
│   ├── dython_300x200.png
│   ├── facebook_cover_photo_1.png
│   ├── facebook_cover_photo_2.png
│   ├── facebook_profile_image.png
│   ├── favicon.png
│   ├── instagram_profile_image.png
│   ├── linkedin_banner_image_1.png
│   ├── linkedin_banner_image_2.png
│   ├── linkedin_profile_image.png
│   ├── logo.png
│   ├── logo_transparent.png
│   ├── pinterest_board_photo.png
│   ├── pinterest_profile_image.png
│   ├── twitter_header_photo_1.png
│   ├── twitter_header_photo_2.png
│   ├── twitter_profile_image.png
│   └── youtube_profile_image.png
├── mkdocs.yml
├── pyproject.toml
├── pytest.ini
├── requirements.txt
├── setup.py
└── tests
    ├── test_data_utils
    │   ├── test_one_hot_encode.py
    │   └── test_split_hist.py
    ├── test_model_utils
    │   ├── test_ks_abc.py
    │   ├── test_metric_graph.py
    │   └── test_random_forest_feature_importance.py
    ├── test_nominal
    │   ├── test_associations.py
    │   ├── test_associations_parallel.py
    │   ├── test_cluster_correlation.py
    │   ├── test_correlation_ratio.py
    │   ├── test_cramers_v.py
    │   └── test_theils_u.py
    ├── test_private_helpers.py
    └── test_sampling.py
/.gitattributes:
--------------------------------------------------------------------------------
1 | # Auto detect text files and perform LF normalization
2 | * text=auto
3 |
--------------------------------------------------------------------------------
/.github/FUNDING.yml:
--------------------------------------------------------------------------------
1 | # These are supported funding model platforms
2 |
3 | github: # Replace with up to 4 GitHub Sponsors-enabled usernames e.g., [user1, user2]
4 | patreon: # Replace with a single Patreon username
5 | open_collective: # Replace with a single Open Collective username
6 | ko_fi: shakedzy
7 | tidelift: # Replace with a single Tidelift platform-name/package-name e.g., npm/babel
8 | community_bridge: # Replace with a single Community Bridge project-name e.g., cloud-foundry
9 | liberapay: # Replace with a single Liberapay username
10 | issuehunt: # Replace with a single IssueHunt username
11 | otechie: # Replace with a single Otechie username
12 | custom: # Replace with up to 4 custom sponsorship URLs e.g., ['link1', 'link2']
13 |
--------------------------------------------------------------------------------
/.github/ISSUE_TEMPLATE/error---bug-report.md:
--------------------------------------------------------------------------------
1 | ---
2 | name: Error / bug report
3 | about: How to create a report to help us improve
4 | title: ''
5 | labels: 'bug'
6 | assignees: ''
7 |
8 | ---
9 |
14 |
15 | ### Version check:
16 |
21 | Run and copy the output:
22 | ```python
23 | import sys, dython
24 | print(sys.version_info)
25 | print(dython.__version__)
26 | ```
27 |
28 | ### Describe the bug:
29 |
33 | Code to reproduce:
34 | ```python
35 | import dython
36 | # your code goes here
37 | ```
38 |
39 | ## Error message:
40 |
41 | Error message:
42 | ```
43 | # your error message
44 | ```
45 |
46 | ## Input data:
47 |
48 |
49 |
--------------------------------------------------------------------------------
/.github/ISSUE_TEMPLATE/feature-request.md:
--------------------------------------------------------------------------------
1 | ---
2 | name: New feature request
3 | about: How to create a request for a new feature
4 | title: ''
5 | labels: 'enhancement'
6 | assignees: ''
7 |
8 | ---
9 |
13 |
14 | ### Describe the new feature:
15 |
19 |
20 |
21 | ### What is the current outcome?
22 |
23 |
24 |
25 | ### Is it backward-compatible?
26 |
29 |
30 |
--------------------------------------------------------------------------------
/.github/ISSUE_TEMPLATE/general-question.md:
--------------------------------------------------------------------------------
1 | ---
2 | name: General question
3 | about: Ask any question you'd like
4 | title: ''
5 | labels: 'question'
6 | assignees: ''
7 |
8 | ---
9 |
10 |
17 |
--------------------------------------------------------------------------------
/.github/workflows/pytest.yml:
--------------------------------------------------------------------------------
1 | name: Run pytest
2 |
3 | on:
4 | push:
5 | branches: [ "master", "shakedzy:master" ]
6 | paths-ignore:
7 | - "README.md"
8 | - "CHANGELOG.md"
9 | - "CODE_OF_CONDUCT.md"
10 | - "CONTRIBUTING.md"
11 | - "VERSION"
12 | - "LICENSE"
13 | - ".gitignore"
14 | - "docs/*"
15 | pull_request:
16 | types: [opened, reopened, edited, synchronize]
17 | branches: [ "master", "shakedzy:master" ]
18 | paths-ignore:
19 | - "README.md"
20 | - "CHANGELOG.md"
21 | - "CODE_OF_CONDUCT.md"
22 | - "CONTRIBUTING.md"
23 | - "VERSION"
24 | - "LICENSE"
25 | - ".gitignore"
26 | - "docs/*"
27 |
28 | permissions:
29 | contents: read
30 |
31 | jobs:
32 | build:
33 | strategy:
34 | matrix:
35 | version: ["3.10", "3.12"]
36 | runs-on: ubuntu-latest
37 | steps:
38 | - uses: actions/checkout@v4
39 | - name: Set up Python ${{ matrix.version }}
40 | uses: actions/setup-python@v5
41 | with:
42 | python-version: ${{ matrix.version }}
43 | - name: Install dependencies
44 | run: |
45 | python -m pip install --upgrade pip
46 | pip install -r requirements.txt
47 | pip install -r dev_requirements.txt
48 | pip install .
49 | - name: Test with pytest
50 | run: pytest
51 |
--------------------------------------------------------------------------------
/.github/workflows/pythonpublish.yml:
--------------------------------------------------------------------------------
1 | # This workflow will upload a Python Package using Twine when a release is created
2 | # For more information see: https://help.github.com/en/actions/language-and-framework-guides/using-python-with-github-actions#publishing-to-package-registries
3 |
4 | name: Upload Python Package
5 |
6 | on:
7 | workflow_dispatch:
8 | release:
9 | types: [created]
10 |
11 | jobs:
12 | test:
13 | strategy:
14 | matrix:
15 | version: [ "3.9", "3.10", "3.11", "3.12" ]
16 | runs-on: ubuntu-latest
17 | steps:
18 | - uses: actions/checkout@v4
19 | - name: Set up Python ${{ matrix.version }}
20 | uses: actions/setup-python@v5
21 | with:
22 | python-version: ${{ matrix.version }}
23 | - name: Install dependencies
24 | run: |
25 | python -m pip install --upgrade pip
26 | pip install -r requirements.txt
27 | pip install -r dev_requirements.txt
28 | pip install .
29 | - name: Test with pytest
30 | run: pytest
31 |
32 | deploy:
33 | needs: test
34 | runs-on: ubuntu-latest
35 | steps:
36 | - uses: actions/checkout@v4
37 | - name: Set up Python
38 | uses: actions/setup-python@v5
39 | with:
40 | python-version: '3.x'
41 | - name: Install dependencies
42 | run: |
43 | python -m pip install --upgrade pip
44 | pip install setuptools wheel twine
45 | - name: Build and publish
46 | env:
47 | TWINE_USERNAME: '__token__'
48 | TWINE_PASSWORD: ${{ secrets.PYPI_TOKEN }}
49 | run: |
50 | python setup.py sdist bdist_wheel
51 | twine upload dist/*
52 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | syntax: glob
2 | .python-version
3 | .venv
4 | env/*
5 | venv/*
6 | ENV/*
7 | .idea
8 | .vscode
9 | .DS_Store
10 | dython.egg*/*
11 | *__pycache__*
12 | *run_stuff.py*
13 | build/*
14 | dist/*
15 | build_deploy.sh
16 | site/*
17 | debug.py
18 | .coverage
19 | .hypothesis
20 | .pytest_cache*
--------------------------------------------------------------------------------
/.pre-commit-config.yaml:
--------------------------------------------------------------------------------
1 | # See https://pre-commit.com for more information
2 | # See https://pre-commit.com/hooks.html for more hooks
3 | repos:
4 | - repo: https://github.com/ambv/black
5 | rev: 22.8.0
6 | hooks:
7 | - id: black
8 | language: python
9 | types: [python]
10 | args: ["--line-length=80"]
11 |
--------------------------------------------------------------------------------
/CHANGELOG.md:
--------------------------------------------------------------------------------
1 | # Change Log
2 |
3 | ## 0.7.9
4 | * Fixing `nominal.associations(plot=False)` not working as expected on Jupyter-based notebooks (issues [#167](https://github.com/shakedzy/dython/issues/167) & [#168](https://github.com/shakedzy/dython/issues/168))
5 |
6 | ## 0.7.8
7 | * `nominal.associations` now attempts to set the figure-size automatically based on output (issue [#30](https://github.com/shakedzy/dython/issues/30), by **[@Swish78](https://github.com/Swish78)**)
8 |
9 | ## 0.7.7
10 | * _Drop support for Python 3.8 as it reaches its end-of-life date_
11 | * Fix issue [#160](https://github.com/shakedzy/dython/issues/160)
12 |
13 | ## 0.7.6
14 | * Fix issue [#162](https://github.com/shakedzy/dython/issues/162)
15 |
16 | ## 0.7.5
17 | * Adding type hints to all functions (issue [#153](https://github.com/shakedzy/dython/issues/153))
18 | * Dropping dependency on `scikit-plot` as it is no longer maintained (issue [#156](https://github.com/shakedzy/dython/issues/156))
19 | * Support for Python 3.12 (issue [#155](https://github.com/shakedzy/dython/issues/155))
20 |
21 | ## 0.7.4
22 | * Handling running plotting functions with `plot=False` in Jupyter and truly avoid plotting (issue [#147](https://github.com/shakedzy/dython/issues/147))
23 |
24 | ## 0.7.3
25 | * _Dython now officially supports only Python 3.8 or above_ (by-product of issue [#137](https://github.com/shakedzy/dython/issues/137))
26 | * Added `nominal.replot_last_associations`: a new method to replot `nominal.associations` heat-maps (issue [#136](https://github.com/shakedzy/dython/issues/136))
27 | * Adding option to drop NaN values in each pair of columns independently in `nominal.associations` (issue [#130](https://github.com/shakedzy/dython/issues/130), by **[@matbb](https://github.com/matbb)**)
28 | * Fixing issues [#139](https://github.com/shakedzy/dython/issues/139) and [#140](https://github.com/shakedzy/dython/issues/140) (by **[@enrir](https://github.com/enrir)**)
29 |
30 | ## 0.7.2
31 | * `nominal.associations` supports multi-core parallel processing (issue [#117](https://github.com/shakedzy/dython/issues/117), by **[@mahieyin-rahmun](https://github.com/mahieyin-rahmun)**)
32 | * Using Black for code formatting (issue [#133](https://github.com/shakedzy/dython/issues/133), by **[@mahieyin-rahmun](https://github.com/mahieyin-rahmun)**)
33 |
34 | ## 0.7.1 (_post4_)
35 | * Fix floating point precision in `theils_u`, `cramer_v` and `correlation_ratio` (issue [#116](https://github.com/shakedzy/dython/issues/116))
36 | * Fix failing conda builds (by **[@sarthakpati](https://github.com/sarthakpati)**)
37 | * Fix legend argument in `ks_abc` (by **[@lahdjirayhan](https://github.com/lahdjirayhan)**)
38 |
39 | ## 0.7.0
40 | * _License is now MIT_
41 | * Added tests (issue [#69](https://github.com/shakedzy/dython/issues/69), by **[@lahdjirayhan](https://github.com/lahdjirayhan)**)
42 | * Added option to select which rows/columns to display/hide in `nominal.associations` (issue [#92](https://github.com/shakedzy/dython/issues/92))
43 | * Fixed deprecation warning when using `datetime` features with `nominal.associations` (issue [#96](https://github.com/shakedzy/dython/issues/96))
44 | * `nominal.associations` now support custom methods as measures of associations (issue [#104](https://github.com/shakedzy/dython/issues/104))
45 | * _Important change:_ Theil's U in `nominal.associations` is now read as U(row|col) instead of U(col|row)
46 | * Remove deprecated method `compute_associations`
47 |
48 | ## 0.6.8
49 | * Bug fix in `metric_graph` (issue [#102](https://github.com/shakedzy/dython/issues/102))
50 | * Bug fix in examples module
51 |
52 | ## 0.6.7 (_post2_)
53 | * First version supported by `conda` (issue [#90](https://github.com/shakedzy/dython/issues/90), by **[@sarthakpati](https://github.com/sarthakpati)**)
54 | * `associations` (and `compute_associations`) now supports several numerical-numerical association measures
55 | (issue [#84](https://github.com/shakedzy/dython/issues/84))
56 | * `nominal.associations` keyword `bias_correction` is now `cramers_v_bias_correction`
57 | * Added a `numerical_columns` option to `associations` and `compute_associations`
58 | * `roc_graph` is officially removed (replaced with `metric_graph`)
59 | * Deprecating `compute_associations`
60 |
61 | ## 0.6.6
62 | * Fixed issue where `nan_strategy` affected input data (issue [#82](https://github.com/shakedzy/dython/issues/82))
63 | * Added `datetime` support to `nominal.associations` (issue [#76](https://github.com/shakedzy/dython/issues/76))
64 |
65 | ## 0.6.5 (_post1_)
66 | * Added `model_utils.ks_abc`
67 | * Fixed a bug in `model_utils.metric_graph` when using `plot=False`
68 | * Added new dependency: `scikit-plot`
69 |
70 | ## 0.6.4 (_post1_)
71 | * Adding `model_utils.metric_graph` instead of `roc_graph`, which now supports ROC curves and Precision-Recall curves
72 | * `roc_graph` is marked as deprecated
73 |
74 | ## 0.6.3
75 | * Added `data_utils.one_hot_encode`
76 | * Added `title` and `filename` options to `associations` and `roc_graph`
77 |
78 | ## 0.6.2
79 | * Added configurable `vmax` and `vmin` to `nominal.associations` (issue [#68](https://github.com/shakedzy/dython/issues/68))
80 |
81 | ## 0.6.1
82 | * Bug fix in `model_utils.roc_graph`
83 | * `model_utils.roc_graph` now accepts also `legend` and `plot` arguments
84 |
85 | ## 0.6.0
86 | * New module: `data_utils`
87 | * `split_hist` method added, with new example
88 | * `identify_columns_by_type` and `identify_columns_with_na` moved to `data_utils` from `nominal`
89 |
90 | ## 0.5.2
91 | * Added `nominal.identify_columns_with_na` (by **[@musketeer191](https://github.com/musketeer191)**)
92 | * Added `nominal.identify_numeric_columns` (issue [#58](https://github.com/shakedzy/dython/issues/58), by **[@musketeer191](https://github.com/musketeer191)**)
93 | * Added `nominal.identify_columns_by_type`
94 | * `nominal.identify_nominal_columns` no longer accepts the `include` parameter (use `nominal.identify_columns_by_type` instead)
95 | * Fix docstring of `nominal.compute_associations` (issue [#55](https://github.com/shakedzy/dython/issues/55))
96 | * Requires Pandas 0.23.4 or greater (was required before, but not specified in setup file)
97 |
98 | ## 0.5.1
99 | * Resolve issues [#48](https://github.com/shakedzy/dython/issues/48) and [#49](https://github.com/shakedzy/dython/issues/49)
100 |
101 | ## 0.5.0 (_post2_)
102 | * Fix issues [#28](https://github.com/shakedzy/dython/issues/28), [#31](https://github.com/shakedzy/dython/issues/31), [#41](https://github.com/shakedzy/dython/issues/41), [#46](https://github.com/shakedzy/dython/issues/46)
103 | * `nominal.cramers_v` can be used without bias correction
104 | * Removed `kwargs` from all methods, replaced with explicit API
105 | * `nominal.associations` and `model_utils.roc_graph` now return a dictionary of output values
106 | * `model_utils.roc_graph` can accept an `ax`
107 | * License changed to BSD-3
108 |
109 | ## 0.4.7
110 | * `nominal.associations` now handles single-value features (issue [#38](https://github.com/shakedzy/dython/issues/38))
111 |
112 | ## 0.4.6
113 | * Added log-base selection in `nominal.conditional_entropy` (issue [#35](https://github.com/shakedzy/dython/issues/35), by **[@ahmedsalhin](https://github.com/ahmedsalhin)**)
114 | * Added new example: `associations_mushrooms_example`
115 | * Renamed example: `associations_example` is now `associations_iris_example`
116 |
117 | ## 0.4.5
118 | * Requires Python 3.5+
119 | * Private methods and attributes renamed
120 | * Fixed incorrect `__version__` variable
121 |
122 | ## 0.4.4
123 | * Minor fixes
124 | * introducing `__all__` to all modules
125 |
126 | ## 0.4.3
127 | * `binary_roc_graph` is now a private method, only `roc_graph` is exposed
128 |
129 | ## 0.4.2
130 | * Added new functionality to `model_utils.roc_graph` (Plot best threshold, print class names)
131 |
132 | ## 0.4.1
133 | * Added `nominal.cluster_correlations`, and an option to cluster `nominal.associations` heatmap (by **[@benman1](https://github.com/benman1)**)
134 |
135 | ## 0.4.0
136 | * Added automatic recognition of categorical columns in `nominal.associations` (by **[@benman1](https://github.com/benman1)**)
137 |
138 | ## 0.3.1
139 | * `nominal.associations` can accept an existing Matplotlib `Axes` (issue [#24](https://github.com/shakedzy/dython/issues/24), by **[@Baukebrenninkmeijer](https://github.com/Baukebrenninkmeijer)**)
140 |
141 | ## 0.3.0
142 | * Introducing missing values handling (`nan_strategy`) in `nominal` module (issue [#15](https://github.com/shakedzy/dython/issues/15))
143 |
144 | ## 0.2.0
145 | * Added `sampling` module
146 |
147 | ## 0.1.1
148 | * Fixed missing `sqrt` in `nominal.correlation_ratio` (issue [#7](https://github.com/shakedzy/dython/issues/7))
149 |
150 | ## 0.1.0
151 | * First version of Dython
152 |
--------------------------------------------------------------------------------
/CODE_OF_CONDUCT.md:
--------------------------------------------------------------------------------
1 | # Contributor Covenant Code of Conduct
2 |
3 | ## Our Pledge
4 |
5 | In the interest of fostering an open and welcoming environment, we as
6 | contributors and maintainers pledge to making participation in our project and
7 | our community a harassment-free experience for everyone, regardless of age, body
8 | size, disability, ethnicity, sex characteristics, gender identity and expression,
9 | level of experience, education, socio-economic status, nationality, personal
10 | appearance, race, religion, or sexual identity and orientation.
11 |
12 | ## Our Standards
13 |
14 | Examples of behavior that contributes to creating a positive environment
15 | include:
16 |
17 | * Using welcoming and inclusive language
18 | * Being respectful of differing viewpoints and experiences
19 | * Gracefully accepting constructive criticism
20 | * Focusing on what is best for the community
21 | * Showing empathy towards other community members
22 |
23 | Examples of unacceptable behavior by participants include:
24 |
25 | * The use of sexualized language or imagery and unwelcome sexual attention or
26 | advances
27 | * Trolling, insulting/derogatory comments, and personal or political attacks
28 | * Public or private harassment
29 | * Publishing others' private information, such as a physical or electronic
30 | address, without explicit permission
31 | * Other conduct which could reasonably be considered inappropriate in a
32 | professional setting
33 |
34 | ## Our Responsibilities
35 |
36 | Project maintainers are responsible for clarifying the standards of acceptable
37 | behavior and are expected to take appropriate and fair corrective action in
38 | response to any instances of unacceptable behavior.
39 |
40 | Project maintainers have the right and responsibility to remove, edit, or
41 | reject comments, commits, code, wiki edits, issues, and other contributions
42 | that are not aligned to this Code of Conduct, or to ban temporarily or
43 | permanently any contributor for other behaviors that they deem inappropriate,
44 | threatening, offensive, or harmful.
45 |
46 | ## Scope
47 |
48 | This Code of Conduct applies both within project spaces and in public spaces
49 | when an individual is representing the project or its community. Examples of
50 | representing a project or community include using an official project e-mail
51 | address, posting via an official social media account, or acting as an appointed
52 | representative at an online or offline event. Representation of a project may be
53 | further defined and clarified by project maintainers.
54 |
55 | ## Enforcement
56 |
57 | Instances of abusive, harassing, or otherwise unacceptable behavior may be
58 | reported by contacting the project team at shakedzy@gmail.com. All
59 | complaints will be reviewed and investigated and will result in a response that
60 | is deemed necessary and appropriate to the circumstances. The project team is
61 | obligated to maintain confidentiality with regard to the reporter of an incident.
62 | Further details of specific enforcement policies may be posted separately.
63 |
64 | Project maintainers who do not follow or enforce the Code of Conduct in good
65 | faith may face temporary or permanent repercussions as determined by other
66 | members of the project's leadership.
67 |
68 | ## Attribution
69 |
70 | This Code of Conduct is adapted from the [Contributor Covenant][homepage], version 1.4,
71 | available at https://www.contributor-covenant.org/version/1/4/code-of-conduct.html
72 |
73 | [homepage]: https://www.contributor-covenant.org
74 |
75 | For answers to common questions about this code of conduct, see
76 | https://www.contributor-covenant.org/faq
77 |
--------------------------------------------------------------------------------
/CONTRIBUTING.md:
--------------------------------------------------------------------------------
1 | # How to Contribute to Dython
2 | If you'd like to contribute or assist - then first of all, thanks. It isn't taken for granted, and I appreciate it.
3 |
4 | ### Reporting a bug:
5 | If you found a bug, please open a new _error/bug issue_ [here](https://github.com/shakedzy/dython/issues/new/choose).
6 | Please make sure you are using the latest version of Dython before reporting.
7 |
8 | ### Suggesting a new feature:
9 | New features are always welcomed. Please describe it in a _new feature request_ [here](https://github.com/shakedzy/dython/issues/new/choose).
10 |
11 | ### Adding things yourself:
12 | If you want to take an open issue and work on it, or would like to merge something you coded yourself, please open a pull request and explain what it is you're adding. If there's an open issue about it, please state you're working on it. Contributions are always welcomed, and are very much appreciated. Your name will forever be etched in the [change log](CHANGELOG.md).
13 |
14 | ### Anything else?
15 | If there's anything else you'd like to discuss, feel free to open a _general question_ [here](https://github.com/shakedzy/dython/issues/new/choose) on any topic.
16 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) 2018-2022, Shaked Zychlinski
4 | All rights reserved.
5 |
6 | Permission is hereby granted, free of charge, to any person obtaining
7 | a copy of this software and associated documentation files (the
8 | "Software"), to deal in the Software without restriction, including
9 | without limitation the rights to use, copy, modify, merge, publish,
10 | distribute, sublicense, and/or sell copies of the Software, and to
11 | permit persons to whom the Software is furnished to do so, subject to
12 | the following conditions:
13 |
14 | The above copyright notice and this permission notice shall be
15 | included in all copies or substantial portions of the Software.
16 |
17 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
18 | EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
19 | MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
20 | NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
21 | LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
22 | OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
23 | WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
--------------------------------------------------------------------------------
/MANIFEST.in:
--------------------------------------------------------------------------------
1 | include *.md VERSION dev_requirements.txt
2 | include requirements.txt
3 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | 
2 |
3 | # Dython
4 |
5 | [](https://pypi.org/project/dython/)
6 | [](https://anaconda.org/conda-forge/dython)
7 | [](https://pypi.org/project/dython/)
8 | [](https://pypistats.org/packages/dython)
9 | [](https://github.com/shakedzy/dython/blob/master/LICENSE)
10 | [](https://zenodo.org/doi/10.5281/zenodo.12698421)
11 |
12 | A set of **D**ata analysis tools in p**YTHON** 3.x.
13 |
14 | Dython was designed with analysis usage in mind - meaning ease-of-use, functionality and readability are the core
15 | values of this library.
16 |
17 | ## Installation
18 | Dython can be installed directly using `pip`:
19 | ```
20 | pip install dython
21 | ```
22 | or, via the `conda` package manager:
23 | ```
24 | conda install -c conda-forge dython
25 | ```
26 |
27 | ## Documentation
28 | Modules documentation can be found on [shakedzy.xyz/dython](http://shakedzy.xyz/dython).
29 | You can also learn more and see examples of the main methods of this library on
30 | [these blogposts](http://shakedzy.xyz/dython/related_blogposts).
31 |
32 | ## Contributing
33 | Contributions are always welcomed - if you found something you can fix, or have an idea for a new feature, feel free to write it and open a pull request. Please make sure to go over the [contributions guidelines](https://github.com/shakedzy/dython/blob/master/CONTRIBUTING.md).
34 |
35 | ## Citing
36 | Use this reference to cite if you use Dython in a paper:
37 | ```bibtex
38 | @software{Zychlinski_dython_2018,
39 | author = {Zychlinski, Shaked},
40 | title = {{dython}},
41 | year = {2018},
42 | url = {https://github.com/shakedzy/dython},
43 | doi = {10.5281/zenodo.12698421}
44 | }
45 | ```
46 |
--------------------------------------------------------------------------------
/VERSION:
--------------------------------------------------------------------------------
1 | 0.7.9
--------------------------------------------------------------------------------
/conftest.py:
--------------------------------------------------------------------------------
1 | import pytest
2 | import functools
3 | import matplotlib
4 | import numpy as np
5 | import pandas as pd
6 | from sklearn import datasets
7 |
8 |
9 | @pytest.fixture(autouse=True)
10 | def disable_plot(monkeypatch):
11 | # Patch plt.show to not halt testing flow, by making it not block
12 | # function execution.
13 | # patch = functools.partial(matplotlib.pyplot.show, block=False)
14 | def patch():
15 | pass
16 |
17 | monkeypatch.setattr(matplotlib.pyplot, "show", patch)
18 |
19 |
20 | @pytest.fixture
21 | def iris_df():
22 | # Use iris dataset as example when needed.
23 | # Add one made-up categorical column to create a nom-nom relationship.
24 |
25 | iris = datasets.load_iris()
26 |
27 | target = ["C{}".format(i) for i in iris.target]
28 |
29 | rng = np.random.default_rng(2207)
30 | extra = rng.choice(list("ABCDE"), size=len(target))
31 |
32 | extra = pd.DataFrame(data=extra, columns=["extra"])
33 |
34 | X = pd.DataFrame(data=iris.data, columns=iris.feature_names)
35 | y = pd.DataFrame(data=target, columns=["target"])
36 |
37 | df = pd.concat([X, extra, y], axis=1)
38 |
39 | return df
40 |
41 |
42 | @pytest.fixture(autouse=True)
43 | def add_iris(doctest_namespace, iris_df):
44 | # Add iris dataset to namespace
45 | # This fixture is provided with autouse so that
46 | # the doctests can use it
47 | doctest_namespace["iris_df"] = iris_df
48 |
--------------------------------------------------------------------------------
/dev_requirements.txt:
--------------------------------------------------------------------------------
1 | pytest>=8.3.2
2 | hypothesis>=6.111.0
3 | black>=24.8.0
4 | pre-commit>=3.8.0
5 | pytest-enabler>=3.1.1
--------------------------------------------------------------------------------
/docs/_layouts/default.html:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
8 | {% seo %}
9 |
10 |
13 |
14 |
15 |
16 |
17 |
18 |
19 | {% if site.logo %}
20 |
21 | {% endif %}
22 |
23 | {{ site.description | default: site.github.project_tagline }}
24 |
25 | {% if site.github.is_project_page %}
26 | View the Project on GitHub {{ site.github.repository_nwo }}
27 | {% endif %}
28 |
29 | {% if site.github.is_user_page %}
30 | View My GitHub Profile
31 | {% endif %}
32 |
33 | {% if site.show_downloads %}
34 |
39 | {% endif %}
40 |
41 |
42 |
43 | {{ content }}
44 |
45 |
46 |
52 |
53 |
54 | {% if site.google_analytics %}
55 |
63 | {% endif %}
64 |
65 |
66 |
--------------------------------------------------------------------------------
/docs/getting_started/examples.md:
--------------------------------------------------------------------------------
1 | ---
2 | title: examples
3 | ---
4 | # Examples
5 |
6 | _Examples can be imported and executed from `dython.examples`._
7 |
8 | #### `associations_iris_example()`
9 |
10 | Plot an example of an associations heat-map of the Iris dataset features.
11 | All features of this dataset are numerical (except for the target).
12 |
13 | **Example code:**
14 | ```python
15 | import pandas as pd
16 | from sklearn import datasets
17 | from dython.nominal import associations
18 |
19 | # Load data
20 | iris = datasets.load_iris()
21 |
22 | # Convert int classes to strings to allow associations
23 | # method to automatically recognize categorical columns
24 | target = ['C{}'.format(i) for i in iris.target]
25 |
26 | # Prepare data
27 | X = pd.DataFrame(data=iris.data, columns=iris.feature_names)
28 | y = pd.DataFrame(data=target, columns=['target'])
29 | df = pd.concat([X, y], axis=1)
30 |
31 | # Plot features associations
32 | associations(df)
33 | ```
34 | **Output:**
35 |
36 | 
37 |
38 | __________________
39 |
40 | #### `associations_mushrooms_example()`
41 |
42 | Plot an example of an associations heat-map of the UCI Mushrooms dataset features.
43 | All features of this dataset are categorical. This example will use Theil's U.
44 |
45 | **Example code:**
46 | ```python
47 | import pandas as pd
48 | from dython.nominal import associations
49 |
50 | # Download and load data from UCI
51 | df = pd.read_csv('http://archive.ics.uci.edu/ml/machine-learning-databases/mushroom/agaricus-lepiota.data', header=None)
52 | df.columns = ['class', 'cap-shape', 'cap-surface', 'cap-color', 'bruises', 'odor', 'gill-attachment',
53 | 'gill-spacing', 'gill-size', 'gill-color', 'stalk-shape', 'stalk-root', 'stalk-surface-above-ring',
54 | 'stalk-surface-below-ring', 'stalk-color-above-ring', 'stalk-color-below-ring', 'veil-type',
55 | 'veil-color', 'ring-number', 'ring-type', 'spore-print-color', 'population', 'habitat']
56 |
57 | # Plot features associations
58 | associations(df, nom_nom_assoc='theil', figsize=(15, 15))
59 | ```
60 | **Output:**
61 |
62 | 
63 |
64 | __________________
65 |
66 | #### `ks_abc_example()`
67 |
68 | An example of a KS area-between-curves plot of a simple binary classifier trained over the Breast Cancer dataset.
69 |
70 | **Example code:**
71 | ```python
72 | from sklearn import datasets
73 | from sklearn.model_selection import train_test_split
74 | from sklearn.linear_model import LogisticRegression
75 | from dython.model_utils import ks_abc
76 |
77 | # Load and split data
78 | data = datasets.load_breast_cancer()
79 | X_train, X_test, y_train, y_test = train_test_split(data.data, data.target, test_size=.5, random_state=0)
80 |
81 | # Train model and predict
82 | model = LogisticRegression(solver='liblinear')
83 | model.fit(X_train, y_train)
84 | y_pred = model.predict_proba(X_test)
85 |
86 | # Perform KS test and compute area between curves
87 | ks_abc(y_test, y_pred[:,1])
88 | ```
89 |
90 | **Output:**
91 |
92 | 
93 | __________________
94 |
95 | #### `pr_graph_example()`
96 |
97 | Plot an example Precision-Recall graph of an SVM model's predictions over the Iris dataset.
98 |
99 | **Example code:**
100 |
101 | ```python
102 | import numpy as np
103 | from sklearn import svm, datasets
104 | from sklearn.model_selection import train_test_split
105 | from sklearn.preprocessing import label_binarize
106 | from sklearn.multiclass import OneVsRestClassifier
107 | from dython.model_utils import metric_graph
108 |
109 | # Load data
110 | iris = datasets.load_iris()
111 | X = iris.data
112 | y = label_binarize(iris.target, classes=[0, 1, 2])
113 |
114 | # Add noisy features
115 | random_state = np.random.RandomState(4)
116 | n_samples, n_features = X.shape
117 | X = np.c_[X, random_state.randn(n_samples, 200 * n_features)]
118 |
119 | # Train a model
120 | X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.5, random_state=0)
121 | classifier = OneVsRestClassifier(svm.SVC(kernel='linear', probability=True, random_state=0))
122 |
123 | # Predict
124 | y_score = classifier.fit(X_train, y_train).predict_proba(X_test)
125 |
126 | # Plot ROC graphs
127 | metric_graph(y_test, y_score, 'pr', class_names=iris.target_names)
128 | ```
129 |
130 | **Output:**
131 |
132 | 
133 |
134 | __________________
135 |
136 | #### `roc_graph_example()`
137 |
138 | Plot an example ROC graph of an SVM model's predictions over the Iris dataset.
139 |
140 | Based on `sklearn` [examples](http://scikit-learn.org/stable/auto_examples/model_selection/plot_roc.html)
141 | (as seen in April 2018).
142 |
143 | **Example code:**
144 |
145 | ```python
146 | import numpy as np
147 | from sklearn import svm, datasets
148 | from sklearn.model_selection import train_test_split
149 | from sklearn.preprocessing import label_binarize
150 | from sklearn.multiclass import OneVsRestClassifier
151 | from dython.model_utils import metric_graph
152 |
153 | # Load data
154 | iris = datasets.load_iris()
155 | X = iris.data
156 | y = label_binarize(iris.target, classes=[0, 1, 2])
157 |
158 | # Add noisy features
159 | random_state = np.random.RandomState(4)
160 | n_samples, n_features = X.shape
161 | X = np.c_[X, random_state.randn(n_samples, 200 * n_features)]
162 |
163 | # Train a model
164 | X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.5, random_state=0)
165 | classifier = OneVsRestClassifier(svm.SVC(kernel='linear', probability=True, random_state=0))
166 |
167 | # Predict
168 | y_score = classifier.fit(X_train, y_train).predict_proba(X_test)
169 |
170 | # Plot ROC graphs
171 | metric_graph(y_test, y_score, 'roc', class_names=iris.target_names)
172 | ```
173 |
174 | **Output:**
175 |
176 | 
177 |
178 | !!! warning "Note:"
179 |
180 | Due to the nature of `np.random.RandomState` which is used in this
181 | example, the output graph may vary from one machine to another.
182 |
183 | __________________
184 |
185 | #### `split_hist_example()`
186 |
187 | Plot an example of split histogram of data from the breast-cancer dataset.
188 |
189 | While this example presents a numerical column split by a categorical one, categorical columns can also be used
190 | as the values, and numerical columns as the split criteria.
191 |
192 | **Example code:**
193 | ```python
194 | import pandas as pd
195 | from sklearn import datasets
196 | from dython.data_utils import split_hist
197 |
198 | # Load data and convert to DataFrame
199 | data = datasets.load_breast_cancer()
200 | df = pd.DataFrame(data=data.data, columns=data.feature_names)
201 | df['malignant'] = [not bool(x) for x in data.target]
202 |
203 | # Plot histogram
204 | split_hist(df, 'mean radius', split_by='malignant', bins=20, figsize=(15,7))
205 | ```
206 |
207 | **Output:**
208 |
209 | 
210 |
--------------------------------------------------------------------------------
/docs/getting_started/installation.md:
--------------------------------------------------------------------------------
1 | ---
2 | title: installation
3 | ---
4 |
5 | # Installing Dython
6 |
7 | ## Installation
8 |
9 | The easiest way to install dython is using `pip install`:
10 |
11 | ```bash
12 | pip install dython
13 | ```
14 | Or, via the `conda` package manager:
15 | ```bash
16 | conda install -c conda-forge dython
17 | ```
18 |
19 | If you'd like to use the source code instead, you can install directly from it using any
20 | of the following methods:
21 |
22 | * Install source code using pip:
23 | ```bash
24 | pip install git+https://github.com/shakedzy/dython.git
25 | ```
26 | * Download the source code as a [ZIP file](https://github.com/shakedzy/dython/zipball/master)
27 | * Download the source code as a [TAR ball](https://github.com/shakedzy/dython/tarball/master)
28 |
29 |
--------------------------------------------------------------------------------
/docs/images/associations_iris_example.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/shakedzy/dython/152c3db0766e8358c7ae0ed257ecc7d47c1f2e8c/docs/images/associations_iris_example.png
--------------------------------------------------------------------------------
/docs/images/associations_mushrooms_example.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/shakedzy/dython/152c3db0766e8358c7ae0ed257ecc7d47c1f2e8c/docs/images/associations_mushrooms_example.png
--------------------------------------------------------------------------------
/docs/images/favicon.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/shakedzy/dython/152c3db0766e8358c7ae0ed257ecc7d47c1f2e8c/docs/images/favicon.png
--------------------------------------------------------------------------------
/docs/images/index_banner.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/shakedzy/dython/152c3db0766e8358c7ae0ed257ecc7d47c1f2e8c/docs/images/index_banner.png
--------------------------------------------------------------------------------
/docs/images/ks_example.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/shakedzy/dython/152c3db0766e8358c7ae0ed257ecc7d47c1f2e8c/docs/images/ks_example.png
--------------------------------------------------------------------------------
/docs/images/logo.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/shakedzy/dython/152c3db0766e8358c7ae0ed257ecc7d47c1f2e8c/docs/images/logo.png
--------------------------------------------------------------------------------
/docs/images/pr_example.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/shakedzy/dython/152c3db0766e8358c7ae0ed257ecc7d47c1f2e8c/docs/images/pr_example.png
--------------------------------------------------------------------------------
/docs/images/roc_example.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/shakedzy/dython/152c3db0766e8358c7ae0ed257ecc7d47c1f2e8c/docs/images/roc_example.png
--------------------------------------------------------------------------------
/docs/images/social_banner.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/shakedzy/dython/152c3db0766e8358c7ae0ed257ecc7d47c1f2e8c/docs/images/social_banner.png
--------------------------------------------------------------------------------
/docs/images/split_hist_example.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/shakedzy/dython/152c3db0766e8358c7ae0ed257ecc7d47c1f2e8c/docs/images/split_hist_example.png
--------------------------------------------------------------------------------
/docs/index.md:
--------------------------------------------------------------------------------
1 | ---
2 | is_homepage:
3 | ---
4 |
5 | # Dython
6 |
7 | [](https://pypi.org/project/dython/)
8 | [](https://anaconda.org/conda-forge/dython)
9 | [](https://pypi.org/project/dython/)
10 | [](https://pypistats.org/packages/dython)
11 | [](https://github.com/shakedzy/dython/blob/master/LICENSE)
12 | [](https://zenodo.org/doi/10.5281/zenodo.12698421)
13 |
14 | 
15 |
16 | ## Welcome!
17 |
18 | Dython is a set of **D**ata analysis tools in p**YTHON** 3.x, which can help you gain more insights into your data.
19 |
20 | This library was designed with analysis usage in mind - meaning ease-of-use, functionality and readability are the core
21 | values of this library. Production-grade performance, on the other hand, was not a consideration.
22 |
23 | **Here are some cool things you can do with it:**
24 |
25 | Given a dataset, Dython will automatically find which features are categorical and which are numerical,
26 | compute a relevant measure of association between each and every feature, and plot it all as an easy-to-read
27 | heat-map. And all this is done with a single line:
28 |
29 | ```python
30 | from dython.nominal import associations
31 | associations(data)
32 | ```
33 | The result:
34 |
35 | 
36 |
37 | Here's another thing - given a machine-learning multi-class model's predictions, you can easily display
38 | each class' ROC curve, AUC score and find the estimated-optimal thresholds - again, with a single line of code:
39 |
40 | ```python
41 | from dython.model_utils import metric_graph
42 |
43 | metric_graph(y_true, y_pred, metric='roc')
44 | ```
45 | The result:
46 |
47 | 
48 |
49 | ## Installation
50 | Dython can be installed directly using `pip`:
51 | ```bash
52 | pip install dython
53 | ```
54 | Other installation options are available, see the [installation page](getting_started/installation.md)
55 | for more information.
56 |
57 | ## Examples
58 | See some usage examples of `nominal.associations` and `model_utils.metric_graph` on the [examples page](getting_started/examples.md).
59 | All examples can also be imported and executed from `dython.examples`.
60 |
61 | ## Citing
62 | Use this reference to cite if you use Dython in a paper:
63 | ```bibtex
64 | @software{Zychlinski_dython_2018,
65 | author = {Zychlinski, Shaked},
66 | title = {{dython}},
67 | year = {2018},
68 | url = {https://github.com/shakedzy/dython},
69 | doi = {10.5281/zenodo.12698421}
70 | }
71 | ```
--------------------------------------------------------------------------------
/docs/modules/data_utils.md:
--------------------------------------------------------------------------------
1 | ---
2 | title: data_utils
3 | ---
4 |
5 | # data_utils
6 |
7 | #### `identify_columns_with_na`
8 |
9 | `identify_columns_with_na(dataset)`
10 |
11 | Given a dataset, return the names of columns having NA values,
12 | sorted in descending order by their number of NAs.
13 |
14 | - **`dataset`** : `np.ndarray` / `pd.DataFrame`
15 |
16 | **Returns:** A `pd.DataFrame` of two columns (`['column', 'na_count']`), consisting of only
17 | the names of columns with NA values, sorted by their number of NA values.
18 |
19 | **Example:**
20 | ```python
21 | >>> df = pd.DataFrame({'col1': ['a', np.nan, 'a', 'a'], 'col2': [3, np.nan, 2, np.nan], 'col3': [1., 2., 3., 4.]})
22 | >>> identify_columns_with_na(df)
23 | column na_count
24 | 1 col2 2
25 | 0 col1 1
26 | ```
27 |
28 | __________________
29 |
30 | #### `identify_columns_by_type`
31 |
32 | `identify_columns_by_type(dataset, include)`
33 |
34 | Given a dataset, identify columns of the types requested.
35 |
36 | - **`dataset`** : `np.ndarray` / `pd.DataFrame`
37 |
38 | - **`include`** : `list`
39 |
40 | Which column types to filter by.
41 |
42 | **Returns:** A list of the names of columns matching the requested types
43 |
44 | **Example:**
45 | ```python
46 | >>> df = pd.DataFrame({'col1': ['a', 'b', 'c', 'a'], 'col2': [3, 4, 2, 1], 'col3': [1., 2., 3., 4.]})
47 | >>> identify_columns_by_type(df, include=['int64', 'float64'])
48 | ['col2', 'col3']
49 | ```
50 |
51 | __________________
52 |
53 | #### `one_hot_encode`
54 |
55 | `one_hot_encode(arr, classes=None)`
56 |
57 | One-hot encode a 1D array. Based on this [StackOverflow answer](https://stackoverflow.com/a/29831596/5863503).
58 |
59 | - **`arr`** : array-like
60 |
61 | An array to be one-hot encoded. Must contain only non-negative integers
62 |
63 | - **`classes`** : `int` or `None`
64 |
65 | Number of classes. If None, the max value of the array will be used
66 |
67 | **Returns:** 2D one-hot encoded array
68 |
69 | **Example:**
70 | ```python
71 | >>> one_hot_encode([1,0,5])
72 | [[0. 1. 0. 0. 0. 0.]
73 | [1. 0. 0. 0. 0. 0.]
74 | [0. 0. 0. 0. 0. 1.]]
75 | ```
76 | __________________
77 |
78 | #### `split_hist`
79 |
80 | `split_hist(dataset, values, split_by, title='', xlabel='', ylabel=None, figsize=None, legend='best', plot=True, **hist_kwargs)`
81 |
82 | Plot a histogram of values from a given dataset, split by the values of a chosen column
83 |
84 | - **`dataset`** : `pd.DataFrame`
85 |
86 | - **`values`** : `string`
87 |
88 | The column name of the values to be displayed in the histogram
89 |
90 | - **`split_by`** : `string`
91 |
92 | The column name of the values to split the histogram by
93 |
94 | - **`title`** : `string` or `None`, default = ''
95 |
96 | The plot's title. If empty string, will be '{values} by {split_by}'
97 |
98 | - **`xlabel`**: `string` or `None`, default = ''
99 |
100 | x-axis label. If empty string, will be '{values}'
101 |
102 | - **`ylabel`**: `string` or `None`, default: `None`
103 |
104 | y-axis label
105 |
106 | - **`figsize`**: (`int`,`int`) or `None`, default = `None`
107 |
108 | A Matplotlib figure-size tuple. If `None`, falls back to Matplotlib's default.
109 |
110 | - **`legend`**: `string` or `None`, default = 'best'
111 |
112 | A Matplotlib legend location string. See Matplotlib documentation for possible options
113 |
114 | - **`plot`**: `Boolean`, default = True
115 |
116 | Plot the histogram
117 |
118 | - **`hist_kwargs`**: key-value pairs
119 |
120 | A key-value pairs to be passed to Matplotlib hist method. See Matplotlib documentation for possible options
121 |
122 | **Returns:** A Matplotlib `Axes`
123 |
124 | **Example:** See [examples](../getting_started/examples.md).
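
A minimal usage sketch on synthetic data (the column names `value` and `group` below are made up for illustration):

```python
import numpy as np
import pandas as pd
from dython.data_utils import split_hist

# Synthetic frame: one numerical column and one boolean column to split by
rng = np.random.default_rng(0)
df = pd.DataFrame({
    'value': rng.normal(size=200),
    'group': rng.integers(0, 2, size=200).astype(bool),
})

# Histogram of 'value' split by 'group'; `bins` is forwarded to Matplotlib's hist
split_hist(df, 'value', split_by='group', bins=20)
```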
--------------------------------------------------------------------------------
/docs/modules/model_utils.md:
--------------------------------------------------------------------------------
1 | ---
2 | title: model_utils
3 | ---
4 |
5 | # model_utils
6 |
7 | #### `ks_abc`
8 |
9 | `ks_abc(y_true, y_pred, ax=None, figsize=None, colors=('darkorange', 'b'), title=None, xlim=(0.,1.), ylim=(0.,1.), fmt='.2f', lw=2, legend='best', plot=True, filename=None)`
10 |
11 | Perform the Kolmogorov–Smirnov test over the positive and negative distributions of a binary classifier, and compute
12 | the area between curves.
13 |
14 | The KS test plots the fraction of positives and negatives predicted correctly below each threshold. It then finds
15 | the optimal threshold, i.e. the one yielding the best class separation.
16 |
17 | The area between curves gives better insight into the separation: the higher the area (1 being the maximum), the
18 | closer the positive and negative distributions' centers of mass are to 1 and 0, respectively.
19 |
20 | Based on [scikit-plot](https://github.com/reiinakano/scikit-plot) `plot_ks_statistic` method.
21 |
22 | - **`y_true`** : array-like
23 |
24 | The true labels of the dataset
25 |
26 | - **`y_pred`** : array-like
27 |
28 | The probabilities predicted by a binary classifier
29 |
30 | - **`ax`** : matplotlib ax
31 |
32 | _Default: None_
33 |
34 | Matplotlib Axis on which the curves will be plotted
35 |
36 | - **`figsize`** : `(int,int)` or `None`
37 |
38 | _Default: None_
39 |
40 | a Matplotlib figure-size tuple. If `None`, falls back to Matplotlib's
41 | default. Only used if `ax=None`
42 |
43 | - **`colors`** : list of Matplotlib color strings
44 |
45 | _Default: `('darkorange', 'b')`_
46 |
47 | List of colors to be used for the plotted curves
48 |
49 | - **`title`** : string or `None`
50 |
51 | _Default: None_
52 |
53 | Plotted graph title. If `None`, default title is used
54 |
55 | - **`xlim`** : `(float, float)`
56 |
57 | _Default: (0.,1.)_
58 |
59 | X-axis limits.
60 |
61 | - **`ylim`** : `(float,float)`
62 |
63 | _Default: (0.,1.)_
64 |
65 | Y-axis limits.
66 |
67 | - **`fmt`** : `string`
68 |
69 | _Default: '.2f'_
70 |
71 | String formatting of displayed numbers.
72 |
73 | - **`lw`** : `int`
74 |
75 | _Default: 2_
76 |
77 | Line-width.
78 |
79 | - **`legend`**: `string` or `None`
80 |
81 | _Default: 'best'_
82 |
83 | A Matplotlib legend location string. See Matplotlib documentation for possible options
84 |
85 | - **`plot`**: `Boolean`, default = True
86 |
87 | Plot the KS curves
88 |
89 | - **`filename`**: `string` or `None`
90 |
91 | _Default: None_
92 |
93 | If not None, plot will be saved to the given file name.
94 |
95 | **Returns:** A dictionary of the following keys:
96 |
97 | - `abc`: area between curves
98 |
99 | - `ks_stat`: computed statistic of the KS test
100 |
101 | - `eopt`: estimated optimal threshold
102 |
103 | - `ax`: the ax used to plot the curves
104 |
105 | **Example:** See [examples](../getting_started/examples.md).
106 |
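As a short sketch of consuming the returned dictionary (reusing the Breast Cancer setup from the examples page; `plot=False` skips drawing but the values are still computed):

```python
from sklearn.datasets import load_breast_cancer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from dython.model_utils import ks_abc

# Train a simple binary classifier
X, y = load_breast_cancer(return_X_y=True)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)
model = LogisticRegression(solver='liblinear').fit(X_train, y_train)

# Compute without plotting and read the documented keys
result = ks_abc(y_test, model.predict_proba(X_test)[:, 1], plot=False)
print(result['abc'], result['ks_stat'], result['eopt'])
```
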
107 | __________________
108 |
109 | #### `metric_graph`
110 |
111 | `metric_graph(y_true, y_pred, metric, micro=True, macro=True, eoptimal_threshold=True, class_names=None, colors=None, ax=None, figsize=None, xlim=(0.,1.), ylim=(0.,1.02), lw=2, ls='-', ms=10, fmt='.2f', legend='best', plot=True, title=None, filename=None, force_multiclass=False)`
112 |
113 | Plot a metric graph of a predictor's results (including AUC scores), where each
114 | row of y_true and y_pred represents a single example.
115 |
116 | **ROC:**
117 | Plots true-positive rate as a function of the false-positive rate of the positive label in a binary classification,
118 | where $TPR = TP / (TP + FN)$ and $FPR = FP / (FP + TN)$. A naive algorithm will display a diagonal line going from
119 | (0,0) to (1,1), therefore having an area under the curve (AUC) of 0.5.
120 |
121 | **Precision-Recall:**
122 | Plots precision as a function of recall of the positive label in a binary classification, where
123 | $Precision = TP / (TP + FP)$ and $Recall = TP / (TP + FN)$. A naive algorithm will display a horizontal line
124 | at a precision equal to the ratio of positive examples in the dataset.
125 |
126 | Based on [scikit-learn examples](http://scikit-learn.org/stable/auto_examples/model_selection/plot_roc.html) (as seen in April 2018):
127 |
128 | - **`y_true`** : `list / NumPy ndarray`
129 |
130 | The true classes of the predicted data.
131 | If only one or two columns exist, the data is treated as a binary
132 | classification (see input example below).
133 | If there are more than 2 columns, each column is considered a
134 | unique class, and a ROC graph and AUC score will be computed for each.
135 |
136 | - **`y_pred`** : `list / NumPy ndarray`
137 |
138 | The predicted classes. Must have the same shape as `y_true`.
139 |
140 | - **`metric`** : `string`
141 |
142 | The metric graph to plot. Currently supported: 'roc' for Receiver Operating Characteristic curve and
143 | 'pr' for Precision-Recall curve
144 |
145 | - **`micro`** : `Boolean`
146 |
147 | _Default: True_
148 |
149 | Whether to calculate a Micro graph (not applicable for binary cases)
150 |
151 | - **`macro`** : `Boolean`
152 |
153 | _Default: True_
154 |
155 | Whether to calculate a Macro graph (ROC metric only, not applicable for binary cases)
156 |
157 | - **`eoptimal_threshold`** : `Boolean`
158 |
159 | _Default: True_
160 |
161 | Whether to calculate and display the estimated-optimal threshold
162 | for each metric graph. For ROC curves, the estimated-optimal threshold is the closest
163 | computed threshold with (fpr,tpr) values closest to (0,1). For PR curves, it is
164 | the closest one to (1,1) (perfect recall and precision)
165 |
166 | - **`class_names`**: `list` or `string`
167 |
168 | _Default: None_
169 |
170 | Names of the different classes. In a multi-class classification, the
171 | order must match the order of the classes probabilities in the input
172 | data. In a binary classification, can be a string or a list. If a list,
173 | only the last element will be used.
174 |
175 | - **`colors`** : list of Matplotlib color strings or `None`
176 |
177 | _Default: None_
178 |
179 | List of colors to be used for the plotted curves. If `None`, falls back
180 | to a predefined default.
181 |
182 | - **`ax`** : matplotlib `ax`
183 |
184 | _Default: None_
185 |
186 | Matplotlib Axis on which the curves will be plotted
187 |
188 | - **`figsize`** : `(int,int)` or `None`
189 |
190 | _Default: None_
191 |
192 | A Matplotlib figure-size tuple. If `None`, falls back to Matplotlib's
193 | default. Only used if `ax=None`.
194 |
195 | - **`xlim`** : `(float, float)`
196 |
197 | _Default: (0.,1.)_
198 |
199 | X-axis limits.
200 |
201 | - **`ylim`** : `(float,float)`
202 |
203 | _Default: (0.,1.02)_
204 |
205 | Y-axis limits.
206 |
207 | - **`lw`** : `int`
208 |
209 | _Default: 2_
210 |
211 | Line-width.
212 |
213 | - **`ls`** : `string`
214 |
215 | _Default: '-'_
216 |
217 | Matplotlib line-style string
218 |
219 | - **`ms`** : `int`
220 |
221 | _Default: 10_
222 |
223 | Marker-size.
224 |
225 | - **`fmt`** : `string`
226 |
227 | _Default: '.2f'_
228 |
229 | String formatting of displayed AUC and threshold numbers.
230 |
231 | - **`legend`**: `string` or `None`
232 |
233 | _Default: 'best'_
234 |
235 | A Matplotlib legend location string. See Matplotlib documentation for possible options
236 |
237 | - **`plot`**: `Boolean`, default = True
238 |
239 | Plot the graph
240 |
241 | - **`title`**: `string` or `None`
242 |
243 | _Default: None_
244 |
245 | Plotted graph title. If None, default title is used.
246 |
247 | - **`filename`**: `string` or `None`
248 |
249 | _Default: None_
250 |
251 | If not None, plot will be saved to the given file name.
252 |
253 | - **`force_multiclass`**: `Boolean`
254 |
255 | _Default: False_
256 |
257 | Only applicable if `y_true` and `y_pred` have two columns. If so,
258 | consider the data as a multiclass data rather than binary (useful when plotting
259 | curves of different models one against the other)
260 |
261 | **Returns:** A dictionary, one key for each class. Each value is another dictionary,
262 | holding AUC and eOpT values.
263 |
264 | **Example:** See [examples](../getting_started/examples.md).
265 |
266 | **Binary Classification Input Example:**
267 | Consider a data-set of two data-points where the true class of the first line
268 | is class 0, which was predicted with a probability of 0.6, and the second line's
269 | true class is 1, with predicted probability of 0.8.
270 | ```python
271 | # First option:
272 | >>> metric_graph(y_true=[0,1], y_pred=[0.6,0.8], metric='roc')
273 | # Second option:
274 | >>> metric_graph(y_true=[[1,0],[0,1]], y_pred=[[0.6,0.4],[0.2,0.8]], metric='roc')
275 | # Both yield the same result
276 | ```
277 |
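As a short sketch of reading the returned dictionary for the binary case above (the inner key names are not spelled out here, so the whole dictionary is printed; `plot=False` suppresses drawing):

```python
from dython.model_utils import metric_graph

# Compute ROC AUC and eOpT for a tiny binary example without plotting
scores = metric_graph(y_true=[0, 1, 1, 0], y_pred=[0.6, 0.8, 0.7, 0.3],
                      metric='roc', plot=False)

# One entry per class, each holding its AUC and eOpT values
print(scores)
```
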
278 | __________________
279 |
280 |
281 | #### `random_forest_feature_importance`
282 |
283 | `random_forest_feature_importance(forest, features, precision=4)`
284 |
285 | Given a trained `sklearn.ensemble.RandomForestClassifier`, plot the different features based on their
286 | importance according to the classifier, from the most important to the least.
287 |
288 | - **`forest`** : `sklearn.ensemble.RandomForestClassifier`
289 |
290 | A trained `RandomForestClassifier`
291 |
292 | - **`features`** : `list`
293 |
294 | A list of the names of the features the classifier was trained on, ordered in the same order they appeared in the training data
295 |
296 | - **`precision`** : `int`
297 |
298 | _Default: 4_
299 |
300 | Precision of feature importance.
301 |
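A minimal sketch, assuming a forest trained on the Iris dataset:

```python
from sklearn.datasets import load_iris
from sklearn.ensemble import RandomForestClassifier
from dython.model_utils import random_forest_feature_importance

# Train a classifier on the Iris features
iris = load_iris()
forest = RandomForestClassifier(n_estimators=100, random_state=0)
forest.fit(iris.data, iris.target)

# Display each feature's importance, from most to least important
random_forest_feature_importance(forest, iris.feature_names)
```
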
--------------------------------------------------------------------------------
/docs/modules/nominal.md:
--------------------------------------------------------------------------------
1 | ---
2 | title: nominal
3 | ---
4 |
5 | # nominal
6 |
7 | #### `associations`
8 |
9 | `associations(dataset, nominal_columns='auto', numerical_columns=None, mark_columns=False, nom_nom_assoc='cramer', num_num_assoc='pearson', nom_num_assoc='correlation_ratio', symmetric_nom_nom=True, symmetric_num_num=True, display_rows='all', display_columns='all', hide_rows=None, hide_columns=None, cramers_v_bias_correction=True, nan_strategy=_REPLACE, nan_replace_value=_DEFAULT_REPLACE_VALUE, ax=None, figsize=None, annot=True, fmt='.2f', cmap=None, sv_color='silver', cbar=True, vmax=1.0, vmin=None, plot=True, compute_only=False, clustering=False, title=None, filename=None, multiprocessing=False, max_cpu_cores=None)`
10 |
11 | Calculate the correlation/strength-of-association of features in a data-set with both categorical and
12 | continuous features using:
13 | * Pearson's R for continuous-continuous cases
14 | * Correlation Ratio for categorical-continuous cases
15 | * Cramer's V or Theil's U for categorical-categorical cases
16 |
17 | - **`dataset`** : `NumPy ndarray / Pandas DataFrame`
18 |
19 | The data-set for which the features' correlation is computed
20 |
21 | - **`nominal_columns`** : `string / list / NumPy ndarray`
22 |
23 | _Default: 'auto'_
24 |
25 | Names of columns of the data-set which hold categorical values. Can also be the string 'all' to state that all
26 | columns are categorical, 'auto' (default) to identify nominal columns automatically, or None to state none are
27 | categorical. Only used if `numerical_columns` is `None`.
28 |
29 | - **`numerical_columns`** : `string / list / NumPy ndarray`
30 |
31 | _Default: None_
32 |
33 | To be used instead of `nominal_columns`. Names of columns of the data-set
34 | which hold numerical values. Can also be the string 'all' to state that
35 | all columns are numerical (equivalent to `nominal_columns=None`) or
36 | 'auto' to try to identify numerical columns (equivalent to
37 | `nominal_columns='auto'`). If `None`, `nominal_columns` is used.
38 |
39 | - **`mark_columns`** : `Boolean`
40 |
41 | _Default: False_
42 |
43 | If True, the output's column names will have a suffix of '(nom)' or '(con)' based on their type (nominal or
44 | continuous), as provided by `nominal_columns`
45 |
46 | - **`nom_nom_assoc`** : `callable / string`
47 |
48 | _Default: 'cramer'_
49 |
50 | !!! info "Method signature change"
51 | This replaces the `theil_u` flag which was used till version 0.6.6.
52 |
53 | If callable, a function which receives two `pd.Series` and returns a single number.
54 |
55 | If string, name of nominal-nominal (categorical-categorical) association to use:
56 |
57 | * `cramer`: Cramer's V
58 |
59 | * `theil`: Theil's U. When selected, the heat-map columns act as the given information, so each cell should be read as $U = U(row|col)$
60 |
61 | - **`num_num_assoc`** : `callable / string`
62 |
63 | _Default: 'pearson'_
64 |
65 | If callable, a function which receives two `pd.Series` and returns a single number.
66 |
67 | If string, name of numerical-numerical association to use:
68 |
69 | * `pearson`: Pearson's R
70 |
71 | * `spearman`: Spearman's R
72 |
73 | * `kendall`: Kendall's Tau
74 |
75 | - **`nom_num_assoc`** : `callable / string`
76 |
77 | _Default: 'correlation_ratio'_
78 |
79 | If callable, a function which receives two `pd.Series` and returns a single number.
80 |
81 | If string, name of nominal-numerical association to use:
82 |
83 | * `correlation_ratio`: correlation ratio
84 |
85 | - **`symmetric_nom_nom`** : `Boolean`
86 |
87 | _Default: True_
88 |
89 | Relevant only if `nom_nom_assoc` is a callable. If so, declare whether the function is symmetric ($f(x,y) = f(y,x)$).
90 | If False, heat-map values should be interpreted as $f(row,col)$.
91 |
92 | - **`symmetric_num_num`** : `Boolean`
93 |
94 | _Default: True_
95 |
96 | Relevant only if `num_num_assoc` is a callable. If so, declare whether the function is symmetric ($f(x,y) = f(y,x)$).
97 | If False, heat-map values should be interpreted as $f(row,col)$.
98 |
99 | - **`display_rows`** : `list / string`
100 |
101 | _Default: 'all'_
102 |
103 | Choose which of the dataset's features will be displayed in the output's
104 | correlations table rows. If string, can either be a single feature's name or 'all'.
105 | Only used if `hide_rows` is `None`.
106 |
107 | - **`display_columns`** : `list / string`
108 |
109 | _Default: 'all'_
110 |
111 | Choose which of the dataset's features will be displayed in the output's
112 | correlations table columns. If string, can either be a single feature's name or 'all'.
113 | Only used if `hide_columns` is `None`.
114 |
115 | - **`hide_rows`** : `list / string`
116 |
117 | _Default: None_
118 |
119 | Choose which of the dataset's features will not be displayed in the output's
120 | correlations table rows. If string, must be a single feature's name. If `None`,
121 | `display_rows` is used.
122 |
123 | - **`hide_columns`** : `list / string`
124 |
125 | _Default: None_
126 |
127 | Choose which of the dataset's features will not be displayed in the output's
128 | correlations table columns. If string, must be a single feature's name. If `None`,
129 | `display_columns` is used.
130 |
131 | - **`cramers_v_bias_correction`** : `Boolean`
132 |
133 | _Default: True_
134 |
135 | !!! info "Method signature change"
136 | This replaces the `bias_correction` flag which was used till version 0.6.6.
137 |
138 | Use bias correction for Cramer's V from Bergsma and Wicher, Journal of the Korean
139 | Statistical Society 42 (2013): 323-328.
140 |
141 | - **`nan_strategy`** : `string`
142 |
143 | _Default: 'replace'_
144 |
145 | How to handle missing values: can be either `'drop_samples'` to remove
146 | samples with missing values, `'drop_features'` to remove features
147 | (columns) with missing values, `'replace'` to replace all missing
148 | values with the `nan_replace_value`, or `'drop_sample_pairs'` to drop each
149 | pair of missing observables separately before calculating the corresponding coefficient.
150 | Missing values are `None` and `np.nan`.
151 |
152 | - **`nan_replace_value`** : `any`
153 |
154 | _Default: 0.0_
155 |
156 | The value used to replace missing values with. Only applicable when nan_strategy is set to 'replace'
157 |
158 | - **`ax`** : Matplotlib `Axes`
159 |
160 | _Default: None_
161 |
162 | Matplotlib Axis on which the heat-map will be plotted
163 |
164 | - **`figsize`** : `(float, float)` or `None`
165 |
166 | _Default: None_
167 |
168 | A Matplotlib figure-size tuple. If `None`, will attempt to set the size automatically.
169 | Only used if `ax=None`.
170 |
171 | - **`annot`** : `Boolean`
172 |
173 | _Default: True_
174 |
175 | Plot number annotations on the heat-map
176 |
177 | - **`fmt`** : `string`
178 |
179 | _Default: '.2f'_
180 |
181 | String formatting of annotations
182 |
183 | - **`cmap`** : Matplotlib colormap or `None`
184 |
185 | _Default: None_
186 |
187 | A colormap to be used for the heat-map. If None, falls back to Seaborn's heat-map default
188 |
189 | - **`sv_color`** : `string`
190 |
191 | _Default: 'silver'_
192 |
193 | A Matplotlib color. The color to be used when displaying single-value features over the heat-map
194 |
195 | - **`cbar`** : `Boolean`
196 |
197 | _Default: True_
198 |
199 | Display heat-map's color-bar
200 |
201 | - **`vmax`** : `float`
202 |
203 | _Default: 1.0_
204 |
205 | Set heat-map `vmax` option
206 |
207 | - **`vmin`** : `float` or `None`
208 |
209 | _Default: None_
210 |
211 | Set heat-map `vmin` option. If set to `None`, `vmin` will be chosen automatically
212 | between 0 and -1.0, depending on the types of associations used (-1.0 if Pearson's R
213 | is used, 0 otherwise)
214 |
215 | - **`plot`** : `Boolean`
216 |
217 | _Default: True_
218 |
219 | Plot a heat-map of the correlation matrix. If False, heat-map will still be
220 | drawn, but not shown. The heat-map's `ax` is part of this function's output.
221 |
222 | - **`compute_only`** : `Boolean`
223 |
224 | _Default: False_
225 |
226 | Use this flag only if you have no need of the plotting at all. This skips the entire
227 | plotting mechanism (similar to the old `compute_associations` method).
228 |
229 | - **`clustering`** : `Boolean`
230 |
231 | _Default: False_
232 |
233 | If True, the computed associations will be sorted into groups by similar correlations
234 |
235 | - **`title`**: `string` or `None`
236 |
237 | _Default: None_
238 |
239 | Plotted graph title.
240 |
241 | - **`filename`**: `string` or `None`
242 |
243 | _Default: None_
244 |
245 | If not None, plot will be saved to the given file name.
246 |
247 | - **`multiprocessing`**: `Boolean`
248 |
249 | _Default: False_
250 |
251 | If True, use multiprocessing to speed up computations. If False, falls back to single-core computation
252 |
253 | - **`max_cpu_cores`**: `int` or `None`
254 |
255 | _Default_: `None`
256 |
257 | If not `None`, `ProcessPoolExecutor` will use the given number of CPU cores
258 |
259 | **Returns:** A dictionary with the following keys:
260 |
261 | - `corr`: A DataFrame of the correlation/strength-of-association between all features
262 | - `ax`: A Matplotlib `Axes`
263 |
264 | **Example:** See [examples](../getting_started/examples.md).
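For a quick start, a minimal sketch (assuming `df` is a `pd.DataFrame` holding both categorical and numerical columns; all keyword arguments used are documented above):
```python
from dython.nominal import associations

result = associations(df, nom_nom_assoc='theil', figsize=(10, 10))
corr = result['corr']  # DataFrame of pairwise association strengths
ax = result['ax']      # Matplotlib Axes holding the heat-map
```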
265 | __________________
266 |
267 | #### `cluster_correlations`
268 |
269 | `cluster_correlations(corr_mat, indexes=None)`
270 |
271 | Apply agglomerative clustering in order to sort a correlation matrix.
272 | Based on [this clustering example](https://github.com/TheLoneNut/CorrelationMatrixClustering/blob/master/CorrelationMatrixClustering.ipynb).
273 |
274 | - **`corr_mat`** : `Pandas DataFrame`
275 |
276 | A correlation matrix (as output from `associations`)
277 |
278 | - **`indexes`** : `list / NumPy ndarray / Pandas Series`
279 |
280 | A sequence of cluster indexes for sorting. If not present, a clustering is performed.
281 |
282 | **Returns:**
283 |
284 | - a sorted correlation matrix (`pd.DataFrame`)
285 | - cluster indexes based on the original dataset (`list`)
286 |
287 | **Example:**
288 | ```python
289 | >>> assoc = associations(
290 | customers,
291 | plot=False
292 | )
293 | >>> correlations = assoc['corr']
294 | >>> correlations, _ = cluster_correlations(correlations)
295 | ```
296 |
297 | __________________
298 |
299 | #### `compute_associations`
300 |
301 | !!! warning "Deprecated"
302 |
303 | `compute_associations` was deprecated and removed. Use `associations(compute_only=True)['corr']`.
304 |
305 | __________________
306 |
307 | #### `conditional_entropy`
308 |
309 | `conditional_entropy(x, y, nan_strategy=REPLACE, nan_replace_value=DEFAULT_REPLACE_VALUE, log_base=math.e)`
310 |
311 | Given measurements `x` and `y` of random variables $X$ and $Y$, calculates the conditional entropy of $X$ given $Y$:
312 |
313 | $$ S(X|Y) = - \sum_{x,y} p(x,y) \log\frac{p(x,y)}{p(y)} $$
314 |
315 | Read more on [Wikipedia](https://en.wikipedia.org/wiki/Conditional_entropy).
316 |
317 | - **`x`** : `list / NumPy ndarray / Pandas Series`
318 |
319 | A sequence of measurements
320 |
321 | - **`y`** : `list / NumPy ndarray / Pandas Series`
322 |
323 | A sequence of measurements
324 |
325 | - **`nan_strategy`** : `string`
326 |
327 | _Default: 'replace'_
328 |
329 | How to handle missing values: can be either 'drop' to remove samples with missing values, or 'replace' to replace all missing values with the nan_replace_value. Missing values are None and np.nan.
330 |
331 | - **`nan_replace_value`** : `any`
332 |
333 | _Default: 0.0_
334 |
335 | The value used to replace missing values with. Only applicable when nan_strategy is set to 'replace'.
336 |
337 | - **`log_base`** : `float`
338 |
339 | _Default: `math.e`_
340 |
341 | Specifying base for calculating entropy.
342 |
343 | **Returns:** `float`
344 |
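**Example:** A small sketch of what this measures, using hypothetical toy data. When `y` fully determines `x`, $S(X|Y)$ is 0; when the conditioning variable carries no information, the conditional entropy equals the plain entropy:
```python
from dython.nominal import conditional_entropy

x = ['a', 'a', 'b', 'b']
y = ['X', 'X', 'Y', 'Y']
# y fully determines x, so S(X|Y) = 0
print(conditional_entropy(x, y))

z = ['p', 'q', 'p', 'q']
# x carries no information about z, so S(Z|X) = S(Z) = log(2)
# (using the default natural-log base)
print(conditional_entropy(z, x))
```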
345 | __________________
346 |
347 | #### `correlation_ratio`
348 |
349 | `correlation_ratio(categories, measurements, nan_strategy=REPLACE, nan_replace_value=DEFAULT_REPLACE_VALUE)`
350 |
351 | Calculates the Correlation Ratio ($\eta$) for categorical-continuous association:
352 |
353 | $$ \eta = \sqrt{\frac{\sum_x{n_x (\bar{y}_x - \bar{y})^2}}{\sum_{x,i}{(y_{xi}-\bar{y})^2}}} $$
354 |
355 | where $n_x$ is the number of observations in category $x$, and we define:
356 |
357 | $$\bar{y}_x = \frac{\sum_i{y_{xi}}}{n_x} , \quad \bar{y} = \frac{\sum_x{n_x \bar{y}_x}}{\sum_x{n_x}}$$
358 |
359 | Answers the question - given a continuous value of a measurement, is it possible to know which category it is
360 | associated with?
361 | Value is in the range [0,1], where 0 means a category cannot be determined by a continuous measurement, and 1 means
362 | a category can be determined with absolute certainty.
363 | Read more on [Wikipedia](https://en.wikipedia.org/wiki/Correlation_ratio).
364 |
365 | - **`categories`** : `list / NumPy ndarray / Pandas Series`
366 |
367 | A sequence of categorical measurements
368 |
369 | - **`measurements`** : `list / NumPy ndarray / Pandas Series`
370 |
371 | A sequence of continuous measurements
372 |
373 | - **`nan_strategy`** : `string`
374 |
375 | _Default: 'replace'_
376 |
377 | How to handle missing values: can be either 'drop' to remove samples with missing values, or 'replace' to replace all missing values with the nan_replace_value. Missing values are None and np.nan.
378 |
379 | - **`nan_replace_value`** : `any`
380 |
381 | _Default: 0.0_
382 |
383 | The value used to replace missing values with. Only applicable when nan_strategy is set to 'replace'.
384 |
385 | **Returns:** float in the range of [0,1]
386 |
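**Example:** A small sketch with hypothetical toy data. Since each category maps to a clearly separated range of measurements, $\eta$ will be close to 1:
```python
from dython.nominal import correlation_ratio

categories = ['a', 'a', 'a', 'b', 'b', 'b']
measurements = [10.1, 10.4, 9.8, 54.2, 55.0, 53.7]
print(correlation_ratio(categories, measurements))  # close to 1
```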
387 | __________________
388 |
389 | #### `cramers_v`
390 |
391 | `cramers_v(x, y, bias_correction=True, nan_strategy=REPLACE, nan_replace_value=DEFAULT_REPLACE_VALUE)`
392 |
393 | Calculates Cramer's V statistic for categorical-categorical association.
394 | This is a symmetric coefficient: $V(x,y) = V(y,x)$. Read more on [Wikipedia](https://en.wikipedia.org/wiki/Cram%C3%A9r%27s_V).
395 |
396 | Original function taken from [this answer](https://stackoverflow.com/a/46498792/5863503) on StackOverflow.
397 |
398 | !!! info "Cramer's V limitations when applied on skewed or small datasets"
399 |
400 | As the Cramer's V measure of association depends directly on the counts of each sample-pair in the data, it tends to be suboptimal when applied to skewed or small datasets.
401 |
402 | Consider each of the following cases, where we would expect Cramer's V to reach a high value, yet this only happens in the first scenario:
403 |
404 | ```python
405 | >>> x = ['a'] * 400 + ['b'] * 100
406 | >>> y = ['X'] * 400 + ['Y'] * 100
407 | >>> cramers_v(x,y)
408 | 0.9937374102534072
409 |
410 | # skewed dataset
411 | >>> x = ['a'] * 500 + ['b'] * 1
412 | >>> y = ['X'] * 500 + ['Y'] * 1
413 | >>> cramers_v(x,y)
414 | 0.4974896903293253
415 |
416 | # very small dataset
417 | >>> x = ['a'] * 4 + ['b'] * 1
418 | >>> y = ['X'] * 4 + ['Y'] * 1
419 | >>> cramers_v(x,y)
420 | 0.0
421 | ```
422 |
423 | - **`x`** : `list / NumPy ndarray / Pandas Series`
424 |
425 | A sequence of categorical measurements
426 |
427 | - **`y`** : `list / NumPy ndarray / Pandas Series`
428 |
429 | A sequence of categorical measurements
430 |
431 | - **`bias_correction`** : `Boolean`
432 |
433 | _Default: True_
434 |
435 | Use bias correction from Bergsma and Wicher, Journal of the Korean Statistical Society 42 (2013): 323-328.
436 |
437 | - **`nan_strategy`** : `string`
438 |
439 | _Default: 'replace'_
440 |
441 | How to handle missing values: can be either 'drop' to remove samples with missing values, or 'replace' to replace all missing values with the nan_replace_value. Missing values are None and np.nan.
442 |
443 | - **`nan_replace_value`** : `any`
444 |
445 | _Default: 0.0_
446 |
447 | The value used to replace missing values with. Only applicable when nan_strategy is set to 'replace'.
448 |
449 | **Returns:** float in the range of [0,1]
450 |
451 | __________________
452 |
453 | #### `identify_nominal_columns`
454 |
455 | `identify_nominal_columns(dataset)`
456 |
457 | Given a dataset, identify categorical columns. This is used internally in `associations` and `numerical_encoding`,
458 | but can also be used directly.
459 |
460 | !!! info "Note:"
461 |
462 | This is a shortcut for `data_utils.identify_columns_by_type(dataset, include=['object', 'category'])`
463 |
464 | - **`dataset`** : `np.ndarray` / `pd.DataFrame`
465 |
466 | **Returns:** list of categorical columns
467 |
468 | **Example:**
469 | ```python
470 | >>> df = pd.DataFrame({'col1': ['a', 'b', 'c', 'a'], 'col2': [3, 4, 2, 1]})
471 | >>> identify_nominal_columns(df)
472 | ['col1']
473 | ```
474 |
475 | __________________
476 |
477 | #### `identify_numeric_columns`
478 |
479 | `identify_numeric_columns(dataset)`
480 |
481 | Given a dataset, identify numeric columns.
482 |
483 | !!! info "Note:"
484 |
485 | This is a shortcut for `data_utils.identify_columns_by_type(dataset, include=['int64', 'float64'])`
486 |
487 | - **`dataset`** : `np.ndarray` / `pd.DataFrame`
488 |
489 | **Returns:** list of numerical columns
490 |
491 | **Example:**
492 | ```python
493 | >>> df = pd.DataFrame({'col1': ['a', 'b', 'c', 'a'], 'col2': [3, 4, 2, 1], 'col3': [1., 2., 3., 4.]})
494 | >>> identify_numeric_columns(df)
495 | ['col2', 'col3']
496 | ```
497 |
498 | __________________
499 |
500 | #### `numerical_encoding`
501 |
502 | `numerical_encoding(dataset, nominal_columns='auto', drop_single_label=False, drop_fact_dict=True, nan_strategy=REPLACE, nan_replace_value=DEFAULT_REPLACE_VALUE)`
503 |
504 | Encode a data-set of mixed data (numerical and categorical) into a numerical-only data-set,
505 | using the following logic:
506 |
507 | * categorical with only a single value will be marked as zero (or dropped, if requested)
508 |
509 | * categorical with two values will be replaced with the result of Pandas `factorize`
510 |
511 | * categorical with more than two values will be replaced with the result of Pandas `get_dummies`
512 |
513 | * numerical columns will not be modified
514 |
515 | - **`dataset`** : `NumPy ndarray / Pandas DataFrame`
516 |
517 | The data-set to encode
518 |
519 | - **`nominal_columns`** : `sequence / string `
520 |
521 | _Default: 'auto'_
522 |
523 | Names of columns of the data-set which hold categorical values. Can also be the string 'all' to state that all columns are categorical, 'auto' (default) to identify nominal columns automatically, or None to state none are categorical (nothing happens)
524 |
525 | - **`drop_single_label`** : `Boolean`
526 |
527 | _Default: False_
528 |
529 | If True, nominal columns with only a single value will be dropped.
530 |
531 | - **`drop_fact_dict`** : `Boolean`
532 |
533 | _Default: True_
534 |
535 | If True, the return value will be the encoded DataFrame alone. If False, it will be a tuple of the DataFrame and the dictionary of the binary factorization (originating from pd.factorize)
536 |
537 | - **`nan_strategy`** : `string`
538 |
539 | _Default: 'replace'_
540 |
541 | How to handle missing values: can be either 'drop_samples' to remove samples with missing values, 'drop_features' to remove features (columns) with missing values, or 'replace' to replace all missing values with the nan_replace_value. Missing values are None and np.nan.
542 |
543 | - **`nan_replace_value`** : `any`
544 |
545 | _Default: 0.0_
546 |
547 | The value used to replace missing values with. Only applicable when nan_strategy is set to 'replace'
548 |
549 | **Returns:** `pd.DataFrame` or `(pd.DataFrame, dict)`. If `drop_fact_dict` is True, returns the encoded DataFrame.
550 | Otherwise, returns a tuple of the encoded DataFrame and a dictionary, where each key is a two-value column, and the
551 | value is the original labels, as supplied by Pandas `factorize`. The dictionary will be empty if no two-value
552 | columns are present in the data-set.
553 |
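**Example:** A minimal sketch of the encoding logic above (column names and values are illustrative):
```python
import pandas as pd
from dython.nominal import numerical_encoding

df = pd.DataFrame({
    'binary': ['yes', 'no', 'yes'],     # two values: factorized to 0/1
    'multi': ['red', 'green', 'blue'],  # more than two values: one-hot encoded
    'num': [1.0, 2.0, 3.0],             # numerical: left unmodified
})
encoded, fact_dict = numerical_encoding(df, drop_fact_dict=False)
```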
554 | __________________
555 |
556 | #### `replot_last_associations`
557 |
558 | `replot_last_associations(ax=None, figsize=None, annot=None, fmt=None, cmap=None, sv_color=None, cbar=None, vmax=None, vmin=None, plot=True, title=None, filename=None)`
559 |
560 | Re-plot the last computed associations heat-map. This method performs no new computations, but only allows
561 | changing the visual output of the last computed heat-map.
562 |
563 | - **`ax`** : Matplotlib `Axes`
564 |
565 | _Default: `None`_
566 |
567 | Matplotlib Axis on which the heat-map will be plotted
568 |
569 | - **`figsize`** : `(int,int)` or `None`
570 |
571 | _Default: `None`_
572 |
573 | A Matplotlib figure-size tuple. If `None`, uses the last `associations` call value.
574 | Only used if `ax=None`.
575 |
576 | - **`annot`** : `Boolean` or `None`
577 |
578 | _Default: `None`_
579 |
580 | Plot number annotations on the heat-map. If `None`, uses the last `associations` call value.
581 |
582 | - **`fmt`** : `string`
583 |
584 | _Default: `None`_
585 |
586 | String formatting of annotations. If `None`, uses the last `associations` call value.
587 |
588 | - **`cmap`** : Matplotlib `colormap` or `None`
589 |
590 | _Default: `None`_
591 |
592 | A colormap to be used for the heat-map. If `None`, uses the last `associations` call value.
593 |
594 | - **`sv_color`** : `string`
595 |
596 | _Default: `None`_
597 |
598 | A Matplotlib color. The color to be used when displaying single-value features.
599 | If `None`, uses the last `associations` call value.
600 |
601 | - **`cbar`** : `Boolean` or `None`
602 |
603 | _Default: `None`_
604 |
605 | Display heat-map's color-bar. If `None`, uses the last `associations` call value.
606 |
607 | - **`vmax`** : `float` or `None`
608 |
609 | _Default: `None`_
610 |
611 | Set heat-map `vmax` option. If `None`, uses the last `associations` call value.
612 |
613 | - **`vmin`** : `float` or `None`
614 |
615 | _Default: `None`_
616 |
617 | Set heat-map `vmin` option. If `None`, uses the last `associations` call value.
618 |
619 | - **`plot`** : `Boolean`
620 |
621 | _Default: `True`_
622 |
623 | Plot a heat-map of the correlation matrix. If False, plotting still
624 | happens, but the heat-map will not be displayed.
625 |
626 | - **`title`** : `string` or `None`
627 |
628 | _Default: `None`_
629 |
630 | Plotted graph title. If `None`, uses the last `associations` call value.
631 |
632 | - **`filename`** : `string` or `None`
633 |
634 | _Default: `None`_
635 |
636 | If not `None`, plot will be saved to the given file name. Note: in order to avoid accidental file
637 | overwrites, the last `associations` call value is never reused, and when `filename` is set to `None`,
638 | no writing to file occurs.
639 |
640 | **Returns:** A Matplotlib `Axes`
641 |
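**Example:** A minimal sketch, assuming `associations` was already called on some DataFrame `df` (the colormap, title and file name here are illustrative):
```python
from dython.nominal import associations, replot_last_associations

associations(df, plot=False)  # compute once
ax = replot_last_associations(cmap='RdBu', title='My Associations',
                              filename='associations.png')
```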
642 | __________________
643 |
644 | #### `theils_u`
645 |
646 | `theils_u(x, y, nan_strategy=REPLACE, nan_replace_value=DEFAULT_REPLACE_VALUE)`
647 |
648 | Calculates Theil's U statistic (Uncertainty coefficient) for categorical-categorical association, defined as:
649 |
650 | $$ U(X|Y) = \frac{S(X) - S(X|Y)}{S(X)} $$
651 |
652 | where $S(X)$ is the entropy of $X$ and $S(X|Y)$ is the [conditional entropy](#conditional_entropy) of $X$ given $Y$.
653 |
654 | This is the uncertainty of x given y: the value is in the range [0,1], where 0 means y provides no information about
655 | x, and 1 means y provides full information about x.
656 | This is an asymmetric coefficient: $U(x,y) \neq U(y,x)$. Read more on
657 | [Wikipedia](https://en.wikipedia.org/wiki/Uncertainty_coefficient).
658 |
659 | - **`x`** : `list / NumPy ndarray / Pandas Series`
660 |
661 | A sequence of categorical measurements
662 |
663 | - **`y`** : `list / NumPy ndarray / Pandas Series`
664 |
665 | A sequence of categorical measurements
666 |
667 | - **`nan_strategy`** : `string`
668 |
669 | _Default: 'replace'_
670 |
671 | How to handle missing values: can be either 'drop' to remove samples with missing values, or 'replace' to replace all missing values with the nan_replace_value. Missing values are None and np.nan.
672 |
673 | - **`nan_replace_value`** : `any`
674 |
675 | _Default: 0.0_
676 |
677 | The value used to replace missing values with. Only applicable when nan_strategy is set to 'replace'.
678 |
679 | **Returns:** float in the range of [0,1]
680 |
681 |
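**Example:** A small sketch of the asymmetry, using hypothetical toy data. Here `y` fully determines `x`, but `x` does not determine `y`:
```python
from dython.nominal import theils_u

x = ['a', 'b', 'a', 'b', 'a', 'b']
y = ['X', 'Y', 'X', 'Y', 'Z', 'W']
print(theils_u(x, y))  # U(x|y) = 1: y provides full information about x
print(theils_u(y, x))  # U(y|x) < 1: x provides only partial information about y
```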
--------------------------------------------------------------------------------
/docs/modules/sampling.md:
--------------------------------------------------------------------------------
1 | ---
2 | title: sampling
3 | ---
4 |
5 | # sampling
6 |
7 | #### `boltzmann_sampling`
8 |
9 | `boltzmann_sampling(numbers, k=1, with_replacement=False)`
10 |
11 | Return k numbers from a Boltzmann sampling over the supplied numbers
12 |
13 | - **`numbers`** : `List or np.ndarray`
14 |
15 | numbers to sample
16 |
17 | - **`k`** : `int`
18 |
19 | _Default: 1_
20 |
21 | How many numbers to sample. Choosing `k=None` will yield a single number
22 |
23 | - **`with_replacement`** : `Boolean`
24 |
25 | _Default: False_. Allow replacement or not.
26 |
27 | **Returns:** `list`, `np.ndarray` or a single number (depending on the input)
28 |
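**Example:** A minimal sketch (assuming the Boltzmann weighting makes larger values exponentially more likely to be drawn):
```python
from dython.sampling import boltzmann_sampling

numbers = [1.0, 2.0, 3.0, 4.0]
print(boltzmann_sampling(numbers, k=2))     # a list of two sampled values
print(boltzmann_sampling(numbers, k=None))  # a single number
```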
29 | __________________
30 |
31 | #### `weighted_sampling`
32 |
33 | `weighted_sampling(numbers, k=1, with_replacement=False)`
34 |
35 | Return k numbers from a weighted sampling over the supplied numbers
36 |
37 | - **`numbers`** : `List or np.ndarray`
38 |
39 | numbers to sample
40 |
41 | - **`k`** : `int`
42 |
43 | _Default: 1_
44 |
45 | How many numbers to sample. Choosing `k=None` will yield a single number
46 |
47 | - **`with_replacement`** : `Boolean`
48 |
49 | _Default: False_
50 |
51 | Allow replacement or not
52 |
53 | **Returns:** `list`, `np.ndarray` or a single number (depending on the input)
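
**Example:** A minimal sketch (assuming sampling weights proportional to the supplied values themselves):
```python
from dython.sampling import weighted_sampling

numbers = [1.0, 2.0, 3.0, 4.0]
print(weighted_sampling(numbers, k=2, with_replacement=True))
```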
--------------------------------------------------------------------------------
/docs/overrides/main.html:
--------------------------------------------------------------------------------
1 | {% extends "base.html" %}
2 |
3 | {% block extrahead %}
4 | {% set title = config.site_name %}
5 | {% if page and page.title and not page.is_homepage %}
6 | {% set title = config.site_name ~ ":" ~ page.title | striptags %}
7 | {% endif %}
8 | {% set image = config.site_url ~ 'images/social_banner.png' %}
9 |
10 |
11 |
12 |
13 |
14 |
15 |
16 |
17 |
18 |
19 |
20 |
21 |
22 |
23 | {% endblock %}
--------------------------------------------------------------------------------
/docs/related_blogposts.md:
--------------------------------------------------------------------------------
1 | ---
2 | title: related blogposts
3 | ---
4 | # Related Blogposts
5 |
6 | Here are some blogposts I wrote, explaining and using some of the methods of Dython:
7 |
8 | * Read more about the categorical tools on
9 | [The Search for Categorical Correlation](https://medium.com/@shakedzy/the-search-for-categorical-correlation-a1cf7f1888c9)
10 | * Read more about using ROC graphs on
11 | [Hard ROC: Really Understanding & Properly Using ROC and AUC](https://medium.com/@shakedzy/hard-roc-really-understanding-and-properly-using-roc-and-auc-13413cf0dc24)
12 | * Read more about KS Area Between Curves and when not to use ROC graphs (and other common metrics) on
13 | [The Metric System: How to Correctly Measure Your Model](https://shakedzy.medium.com/the-metric-system-how-to-correctly-measure-your-model-17d3feaed6ab)
--------------------------------------------------------------------------------
/dython/__init__.py:
--------------------------------------------------------------------------------
1 | from . import nominal, model_utils, sampling, data_utils
2 | from ._private import set_is_jupyter
3 |
4 |
5 | def _get_version_from_setuptools():
6 | from pkg_resources import get_distribution
7 |
8 | return get_distribution("dython").version
9 |
10 |
11 | __all__ = ["__version__"]
12 | __version__ = _get_version_from_setuptools()
13 | set_is_jupyter()
14 |
--------------------------------------------------------------------------------
/dython/_private.py:
--------------------------------------------------------------------------------
1 | import sys
2 | import numpy as np
3 | import pandas as pd
4 | import matplotlib.pyplot as plt
5 | from numpy.typing import NDArray
6 | from typing import Optional, Any, Tuple, Union, List, Literal
7 | from .typing import Number, OneDimArray
8 |
9 |
10 | IS_JUPYTER: bool = False
11 |
12 |
13 | def set_is_jupyter(force_to: Optional[bool] = None) -> None:
14 | global IS_JUPYTER
15 | if force_to is not None:
16 | IS_JUPYTER = force_to
17 | else:
18 | IS_JUPYTER = "ipykernel_launcher.py" in sys.argv[0]
19 |
20 |
21 | def plot_or_not(plot: bool) -> None:
22 | if plot:
23 | plt.show()
24 | elif not plot and IS_JUPYTER:
25 | fig = plt.gcf()
26 | if fig:
27 | plt.close(fig)
28 |
29 |
30 | def convert(
31 |     data: Union[List[Number], NDArray, pd.Series, pd.DataFrame],
32 | to: Literal["array", "list", "dataframe"],
33 | copy: bool = True,
34 | ) -> Union[List[Number], NDArray, pd.DataFrame]:
35 | converted = None
36 | if to == "array":
37 | if isinstance(data, np.ndarray):
38 | converted = data.copy() if copy else data
39 | elif isinstance(data, pd.Series):
40 | converted = data.values
41 | elif isinstance(data, list):
42 | converted = np.array(data)
43 | elif isinstance(data, pd.DataFrame):
44 |             converted = data.values  # type: ignore
45 | elif to == "list":
46 | if isinstance(data, list):
47 | converted = data.copy() if copy else data
48 | elif isinstance(data, pd.Series):
49 | converted = data.values.tolist()
50 | elif isinstance(data, np.ndarray):
51 | converted = data.tolist()
52 | elif to == "dataframe":
53 | if isinstance(data, pd.DataFrame):
54 | converted = data.copy(deep=True) if copy else data
55 | elif isinstance(data, np.ndarray):
56 | converted = pd.DataFrame(data)
57 | else:
58 | raise ValueError("Unknown data conversion: {}".format(to))
59 | if converted is None:
60 | raise TypeError(
61 | "cannot handle data conversion of type: {} to {}".format(
62 | type(data), to
63 | )
64 | )
65 | else:
66 | return converted # type: ignore
67 |
68 |
69 | def remove_incomplete_samples(
70 | x: Union[List[Any], OneDimArray], y: Union[List[Any], OneDimArray]
71 | ) -> Tuple[Union[List[Any], OneDimArray], Union[List[Any], OneDimArray]]:
72 |     x_is_list = isinstance(x, list)  # remember the input type before converting
73 |     x = [v if v is not None else np.nan for v in x]
74 |     y = [v if v is not None else np.nan for v in y]
75 |     arr = np.array([x, y]).transpose()
76 |     arr = arr[~np.isnan(arr).any(axis=1)].transpose()
77 |     if x_is_list:
78 |         return arr[0].tolist(), arr[1].tolist()
79 |     return arr[0], arr[1]
80 |
81 |
82 | def replace_nan_with_value(
83 | x: Union[List[Any], OneDimArray],
84 | y: Union[List[Any], OneDimArray],
85 | value: Any,
86 | ) -> Tuple[NDArray, NDArray]:
87 | x = np.array(
88 | [v if v == v and v is not None else value for v in x]
89 | ) # NaN != NaN
90 | y = np.array([v if v == v and v is not None else value for v in y])
91 | return x, y
92 |
--------------------------------------------------------------------------------
/dython/data_utils.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import pandas as pd
3 | import matplotlib.pyplot as plt
4 | from typing import Optional, Tuple, List, Any, Union
5 | from numpy.typing import NDArray
6 | from .typing import Number, TwoDimArray
7 | from ._private import convert, plot_or_not
8 |
9 |
10 | __all__ = [
11 | "identify_columns_by_type",
12 | "identify_columns_with_na",
13 | "one_hot_encode",
14 | "split_hist",
15 | ]
16 |
17 |
18 | def one_hot_encode(
19 | array: Union[List[Union[Number, str]], NDArray],
20 | classes: Optional[int] = None,
21 | ) -> NDArray:
22 | """
23 | One-hot encode a 1D array.
24 | Based on this StackOverflow answer: https://stackoverflow.com/a/29831596/5863503
25 |
26 | Parameters:
27 | -----------
28 |     array : array-like
29 | An array to be one-hot encoded. Must contain only non-negative integers
30 | classes : int or None
31 | number of classes. if None, max value of the array will be used
32 |
33 | Returns:
34 | --------
35 | 2D one-hot encoded array
36 |
37 | Example:
38 | --------
39 | >>> one_hot_encode([1,0,5])
40 | array([[0., 1., 0., 0., 0., 0.],
41 | [1., 0., 0., 0., 0., 0.],
42 | [0., 0., 0., 0., 0., 1.]])
43 | """
44 | arr: NDArray = convert(array, "array").astype(int) # type: ignore
45 | if not len(arr.shape) == 1:
46 | raise ValueError(
47 | f"array must have only one dimension, but has shape: {arr.shape}"
48 | )
49 | if arr.min() < 0:
50 | raise ValueError("array cannot contain negative values")
51 | classes = classes if classes is not None else arr.max() + 1
52 | h = np.zeros((arr.size, classes)) # type: ignore
53 | h[np.arange(arr.size), arr] = 1
54 | return h
55 |
56 |
57 | def split_hist(
58 | dataset: pd.DataFrame,
59 | values: str,
60 | split_by: str,
61 | title: Optional[str] = "",
62 | xlabel: Optional[str] = "",
63 | ylabel: Optional[str] = None,
64 | figsize: Optional[Tuple[int, int]] = None,
65 | legend: Optional[str] = "best",
66 | plot: bool = True,
67 | **hist_kwargs,
68 | ) -> plt.Axes:
69 | """
70 | Plot a histogram of values from a given dataset, split by the values of a chosen column
71 |
72 | Parameters:
73 | -----------
74 | dataset : Pandas DataFrame
75 | values : string
76 | The column name of the values to be displayed in the histogram
77 | split_by : string
78 | The column name of the values to split the histogram by
79 | title : string or None, default = ''
80 | The plot's title. If empty string, will be '{values} by {split_by}'
81 | xlabel: string or None, default = ''
82 | x-axis label. If empty string, will be '{values}'
83 | ylabel: string or None, default: None
84 | y-axis label
85 | figsize: (int,int) or None, default = None
86 | A Matplotlib figure-size tuple. If `None`, falls back to Matplotlib's
87 | default.
88 | legend: string or None, default = 'best'
89 | A Matplotlib legend location string. See Matplotlib documentation for possible options
90 | plot: Boolean, default = True
91 | Plot the histogram
92 | hist_kwargs: key-value pairs
93 |         Key-value pairs to be passed to Matplotlib's hist method. See Matplotlib documentation for possible options
94 |
95 | Returns:
96 | --------
97 | A Matplotlib `Axes`
98 |
99 | Example:
100 | --------
101 | See example under `dython.examples`
102 | """
103 | plt.figure(figsize=figsize)
104 | split_vals = dataset[split_by].unique()
105 | data_split = list()
106 | for val in split_vals:
107 | data_split.append(dataset[dataset[split_by] == val][values])
108 | hist_kwargs["label"] = split_vals
109 | plt.hist(data_split, **hist_kwargs)
110 | if legend:
111 | plt.legend(loc=legend)
112 | if xlabel is not None:
113 | if xlabel == "":
114 | xlabel = values
115 | plt.xlabel(xlabel)
116 | if title is not None:
117 | if title == "":
118 | title = values + " by " + split_by
119 | plt.title(title)
120 | if ylabel:
121 | plt.ylabel(ylabel)
122 | ax = plt.gca()
123 | plot_or_not(plot)
124 | return ax
125 |
126 |
127 | def identify_columns_by_type(
128 | dataset: TwoDimArray, include: List[str]
129 | ) -> List[Any]:
130 | """
131 | Given a dataset, identify columns of the types requested.
132 |
133 | Parameters:
134 | -----------
135 | dataset : NumPy ndarray / Pandas DataFrame
136 | include : list of strings
137 | Desired column types
138 |
139 | Returns:
140 | --------
141 |     A list of column names
142 |
143 | Example:
144 | --------
145 | >>> df = pd.DataFrame({'col1': ['a', 'b', 'c', 'a'], 'col2': [3, 4, 2, 1], 'col3': [1., 2., 3., 4.]})
146 | >>> identify_columns_by_type(df, include=['int64', 'float64'])
147 | ['col2', 'col3']
148 |
149 | """
150 | df: pd.DataFrame = convert(dataset, "dataframe") # type: ignore
151 | columns = list(df.select_dtypes(include=include).columns)
152 | return columns
153 |
154 |
155 | def identify_columns_with_na(dataset: TwoDimArray) -> pd.DataFrame:
156 | """
157 |     Return column names having NA values, sorted in descending order by their number of NAs
158 |
159 | Parameters:
160 | -----------
161 | dataset : NumPy ndarray / Pandas DataFrame
162 |
163 | Returns:
164 | --------
165 | A DataFrame of two columns (['column', 'na_count']), consisting of only the names
166 | of columns with NA values, sorted by their number of NA values.
167 |
168 | Example:
169 | --------
170 | >>> df = pd.DataFrame({'col1': ['a', np.nan, 'a', 'a'], 'col2': [3, np.nan, 2, np.nan], 'col3': [1., 2., 3., 4.]})
171 | >>> identify_columns_with_na(df)
172 | column na_count
173 | 1 col2 2
174 | 0 col1 1
175 | """
176 | df: pd.DataFrame = convert(dataset, "dataframe") # type: ignore
177 | na_count = [sum(df[cc].isnull()) for cc in df.columns]
178 | return (
179 | pd.DataFrame({"column": df.columns, "na_count": na_count})
180 | .query("na_count > 0")
181 | .sort_values("na_count", ascending=False)
182 | )
183 |
--------------------------------------------------------------------------------
/dython/examples.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import pandas as pd
3 | from sklearn import svm, datasets
4 | from sklearn.model_selection import train_test_split
5 | from sklearn.preprocessing import label_binarize
6 | from sklearn.multiclass import OneVsRestClassifier
7 | from sklearn.linear_model import LogisticRegression
8 |
9 | from .data_utils import split_hist
10 | from .model_utils import metric_graph, ks_abc
11 | from .nominal import associations
12 |
13 |
14 | def roc_graph_example():
15 | """
16 | Plot an example ROC graph of an SVM model predictions over the Iris
17 | dataset.
18 |
19 | Based on sklearn examples (as was seen on April 2018):
20 | http://scikit-learn.org/stable/auto_examples/model_selection/plot_roc.html
21 | """
22 |
23 | # Load data
24 | iris = datasets.load_iris()
25 | X = iris.data
26 | y = label_binarize(iris.target, classes=[0, 1, 2])
27 |
28 | # Add noisy features
29 | random_state = np.random.RandomState(4)
30 | n_samples, n_features = X.shape
31 | X = np.c_[X, random_state.randn(n_samples, 200 * n_features)]
32 |
33 | # Train a model
34 | X_train, X_test, y_train, y_test = train_test_split(
35 | X, y, test_size=0.5, random_state=0
36 | )
37 | classifier = OneVsRestClassifier(
38 | svm.SVC(kernel="linear", probability=True, random_state=0)
39 | )
40 |
41 | # Predict
42 | y_score = classifier.fit(X_train, y_train).predict_proba(X_test)
43 |
44 | # Plot ROC graphs
45 | return metric_graph(
46 | y_test, y_score, "roc", class_names_list=iris.target_names
47 | )
48 |
49 |
50 | def pr_graph_example():
51 | """
52 | Plot an example PR graph of an SVM model predictions over the Iris
53 | dataset.
54 | """
55 |
56 | # Load data
57 | iris = datasets.load_iris()
58 | X = iris.data
59 | y = label_binarize(iris.target, classes=[0, 1, 2])
60 |
61 | # Add noisy features
62 | random_state = np.random.RandomState(4)
63 | n_samples, n_features = X.shape
64 | X = np.c_[X, random_state.randn(n_samples, 200 * n_features)]
65 |
66 | # Train a model
67 | X_train, X_test, y_train, y_test = train_test_split(
68 | X, y, test_size=0.5, random_state=0
69 | )
70 | classifier = OneVsRestClassifier(
71 | svm.SVC(kernel="linear", probability=True, random_state=0)
72 | )
73 |
74 | # Predict
75 | y_score = classifier.fit(X_train, y_train).predict_proba(X_test)
76 |
77 | # Plot PR graphs
78 | return metric_graph(
79 | y_test, y_score, "pr", class_names_list=iris.target_names
80 | )
81 |
82 |
83 | def associations_iris_example():
84 | """
85 | Plot an example of an associations heat-map of the Iris dataset features.
86 | All features of this dataset are numerical (except for the target).
87 | """
88 |
89 | # Load data
90 | iris = datasets.load_iris()
91 |
92 | # Convert int classes to strings to allow associations method
93 | # to automatically recognize categorical columns
94 | target = ["C{}".format(i) for i in iris.target]
95 |
96 | # Prepare data
97 | X = pd.DataFrame(data=iris.data, columns=iris.feature_names)
98 | y = pd.DataFrame(data=target, columns=["target"])
99 | df = pd.concat([X, y], axis=1)
100 |
101 | # Plot features associations
102 | return associations(df)
103 |
104 |
105 | def associations_mushrooms_example():
106 | """
107 | Plot an example of an associations heat-map of the UCI Mushrooms dataset features.
108 | All features of this dataset are categorical. This example will use Theil's U.
109 | """
110 |
111 | # Download and load data from UCI
112 | df = pd.read_csv(
113 | "http://archive.ics.uci.edu/ml/machine-learning-databases/mushroom/agaricus-lepiota.data"
114 | )
115 | df.columns = [
116 | "class",
117 | "cap-shape",
118 | "cap-surface",
119 | "cap-color",
120 | "bruises",
121 | "odor",
122 | "gill-attachment",
123 | "gill-spacing",
124 | "gill-size",
125 | "gill-color",
126 | "stalk-shape",
127 | "stalk-root",
128 | "stalk-surface-above-ring",
129 | "stalk-surface-below-ring",
130 | "stalk-color-above-ring",
131 | "stalk-color-below-ring",
132 | "veil-type",
133 | "veil-color",
134 | "ring-number",
135 | "ring-type",
136 | "spore-print-color",
137 | "population",
138 | "habitat",
139 | ]
140 |
141 | # Plot features associations
142 | return associations(df, nom_nom_assoc="theil", figsize=(15, 15))
143 |
144 |
145 | def split_hist_example():
146 | """
147 | Plot an example of split histogram.
148 | While this example presents a numerical column split by a categorical one, categorical columns can also be used
149 | as the values, as well as numerical columns as the split criteria.
150 | """
151 |
152 | # Load data and convert to DataFrame
153 | data = datasets.load_breast_cancer()
154 | df = pd.DataFrame(data=data.data, columns=data.feature_names)
155 | df["malignant"] = [not bool(x) for x in data.target]
156 |
157 | # Plot histogram
158 | return split_hist(df, "mean radius", "malignant", bins=20, figsize=(15, 7))
159 |
160 |
161 | def ks_abc_example():
162 | """
163 | An example of KS Area Between Curve of a simple binary classifier
164 | trained over the Breast Cancer dataset.
165 | """
166 |
167 | # Load and split data
168 | data = datasets.load_breast_cancer()
169 | X_train, X_test, y_train, y_test = train_test_split(
170 | data.data, data.target, test_size=0.5, random_state=0
171 | )
172 |
173 | # Train model and predict
174 | model = LogisticRegression(solver="liblinear")
175 | model.fit(X_train, y_train)
176 | y_pred = model.predict_proba(X_test)
177 |
178 | # Perform KS test and compute area between curves
179 | return ks_abc(y_test, y_pred[:, 1], figsize=(7, 7))
180 |
--------------------------------------------------------------------------------
/dython/model_utils.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import matplotlib.pyplot as plt
3 | from sklearn.ensemble import RandomForestClassifier
4 | from sklearn.metrics import roc_curve, precision_recall_curve, auc
5 | from sklearn.preprocessing import LabelEncoder
6 | from typing import List, Union, Optional, Tuple, Dict, Any, Iterable
7 | from numpy.typing import NDArray
8 | from .typing import Number, OneDimArray
9 | from ._private import convert, plot_or_not
10 |
11 | __all__ = ["random_forest_feature_importance", "metric_graph", "ks_abc"]
12 |
13 | _ROC_PLOT_COLORS = ["b", "g", "r", "c", "m", "y", "k", "darkorange"]
14 |
15 |
16 | def _display_metric_plot(
17 | ax: plt.Axes,
18 | metric: str,
19 | naives: List[Tuple[Number, Number, Number, Number, str]],
20 | xlim: Tuple[float, float],
21 | ylim: Tuple[float, float],
22 | legend: Optional[str],
23 | title: Optional[str],
24 | filename: Optional[str],
25 | plot: bool,
26 | ) -> plt.Axes:
27 | for n in naives:
28 | ax.plot([n[0], n[1]], [n[2], n[3]], color=n[4], lw=1, linestyle="--")
29 | ax.set_xlim(left=xlim[0], right=xlim[1])
30 | ax.set_ylim(bottom=ylim[0], top=ylim[1])
31 | if metric == "roc":
32 | ax.set_xlabel("False Positive Rate")
33 | ax.set_ylabel("True Positive Rate")
34 | ax.set_title(title or "Receiver Operating Characteristic")
35 | else: # metric == 'pr'
36 | ax.set_xlabel("Recall")
37 | ax.set_ylabel("Precision")
38 | ax.set_title(title or "Precision-Recall Curve")
39 | if legend:
40 | ax.legend(loc=legend)
41 | if filename:
42 | plt.savefig(filename)
43 | plot_or_not(plot)
44 | return ax
45 |
46 |
47 | def _draw_estimated_optimal_threshold_mark(
48 | metric: str,
49 | x_axis: OneDimArray,
50 | y_axis: OneDimArray,
51 | thresholds: OneDimArray,
52 | color: str,
53 | ms: int,
54 | fmt: str,
55 | ax: plt.Axes,
56 | ) -> Tuple[Number, Number, Number]:
57 | annotation_offset = (-0.027, 0.03)
58 | a = np.zeros((len(x_axis), 2))
59 | a[:, 0] = x_axis
60 | a[:, 1] = y_axis
61 | if metric == "roc":
62 | dist = lambda row: row[0] ** 2 + (1 - row[1]) ** 2 # optimal: (0,1)
63 | else: # metric == 'pr'
64 | dist = (
65 | lambda row: (1 - row[0]) ** 2 + (1 - row[1]) ** 2
66 | ) # optimal: (1,1)
67 | amin = np.apply_along_axis(dist, 1, a).argmin()
68 | ax.plot(x_axis[amin], y_axis[amin], color=color, marker="o", ms=ms)
69 | ax.annotate(
70 | "{th:{fmt}}".format(th=thresholds[amin], fmt=fmt),
71 | xy=(x_axis[amin], y_axis[amin]),
72 | color=color,
73 | xytext=(
74 | x_axis[amin] + annotation_offset[0],
75 | y_axis[amin] + annotation_offset[1],
76 | ),
77 | )
78 | return thresholds[amin], x_axis[amin], y_axis[amin]
79 |
80 |
81 | def _plot_macro_metric(
82 | x_axis: OneDimArray,
83 | y_axis: OneDimArray,
84 | n: int,
85 | lw: int,
86 | fmt: str,
87 | ax: plt.Axes,
88 | ) -> None:
89 | all_x_axis = np.unique(np.concatenate([x_axis[i] for i in range(n)]))
90 | mean_y_axis = np.zeros_like(all_x_axis)
91 | for i in range(n):
92 | mean_y_axis += np.interp(all_x_axis, x_axis[i], y_axis[i])
93 | mean_y_axis /= n
94 | x_axis_macro = all_x_axis
95 | y_axis_macro = mean_y_axis
96 | auc_macro = auc(x_axis_macro, y_axis_macro)
97 | label = "ROC curve: macro (AUC = {auc:{fmt}})".format(
98 | auc=auc_macro, fmt=fmt
99 | )
100 | ax.plot(
101 | x_axis_macro, y_axis_macro, label=label, color="navy", ls=":", lw=lw
102 | )
103 |
104 |
105 | def _binary_metric_graph(
106 | metric: str,
107 | y_true: OneDimArray,
108 | y_pred: OneDimArray,
109 | eoptimal: bool,
110 | class_label: Optional[str],
111 | color: str,
112 | lw: int,
113 | ls: str,
114 | ms: int,
115 | fmt: str,
116 | ax: plt.Axes,
117 | ) -> Dict[str, Any]:
118 | y_true_array: NDArray = convert(y_true, "array") # type: ignore
119 | y_pred_array: NDArray = convert(y_pred, "array") # type: ignore
120 | if y_pred_array.shape != y_true_array.shape:
121 | raise ValueError("y_true and y_pred must have the same shape")
122 | elif len(y_pred_array.shape) == 1:
123 | y_t = y_true_array
124 | y_p = y_pred_array
125 | else:
126 | y_t = np.array([np.argmax(x) for x in y_true_array])
127 | y_p = np.array([x[1] for x in y_pred_array])
128 | y_t_ratio = np.sum(y_t) / y_t.size # type: ignore
129 | if metric == "roc":
130 | x_axis, y_axis, th = roc_curve(y_t, y_p) # x = fpr, y = tpr
131 | else: # metric == 'pr'
132 | y_axis, x_axis, th = precision_recall_curve(
133 | y_t, y_p
134 | ) # x = recall, y = precision
135 | auc_score = auc(x_axis, y_axis)
136 | if class_label is not None:
137 | class_label = ": " + class_label
138 | else:
139 | class_label = ""
140 | label = "{metric} curve{class_label} (AUC = {auc:{fmt}}".format(
141 | metric=metric.upper(), class_label=class_label, auc=auc_score, fmt=fmt
142 | )
143 | if metric == "pr":
144 | label += ", naive = {ytr:{fmt}}".format(ytr=y_t_ratio, fmt=fmt)
145 | if eoptimal:
146 | eopt, eopt_x, eopt_y = _draw_estimated_optimal_threshold_mark(
147 | metric, x_axis, y_axis, th, color, ms, fmt, ax
148 | )
149 | label += ", eOpT = {th:{fmt}})".format(th=eopt, fmt=fmt)
150 | else:
151 | eopt = None
152 | eopt_x = None
153 | eopt_y = None
154 | label += ")"
155 | ax.plot(x_axis, y_axis, color=color, lw=lw, ls=ls, label=label)
156 | return {
157 | "x": x_axis,
158 | "y": y_axis,
159 | "thresholds": th,
160 | "auc": auc_score,
161 | "eopt": eopt,
162 | "eopt_x": eopt_x,
163 | "eopt_y": eopt_y,
164 | "y_t_ratio": y_t_ratio,
165 | }
166 |
167 |
168 | def _build_metric_graph_output_dict(
169 | metric: str, d: Dict[str, Any]
170 | ) -> Dict[str, Dict[str, Any]]:
171 | naive = d["y_t_ratio"] if metric == "pr" else 0.5
172 | return {
173 | "auc": {"val": d["auc"], "naive": naive},
174 | "eopt": {"val": d["eopt"], "x": d["eopt_x"], "y": d["eopt_y"]},
175 | }
176 |
177 |
178 | def metric_graph(
179 | y_true: OneDimArray,
180 | y_pred: OneDimArray,
181 | metric: str,
182 | *,
183 | micro: bool = True,
184 | macro: bool = True,
185 | eopt: bool = True,
186 | class_names: Optional[Union[str, List[str]]] = None,
187 |     colors: Optional[Union[str, List[str]]] = None,
188 | ax: Optional[plt.Axes] = None,
189 | figsize: Optional[Tuple[int, int]] = None,
190 | xlim: Tuple[float, float] = (0.0, 1.0),
191 | ylim: Tuple[float, float] = (0.0, 1.02),
192 | lw: int = 2,
193 | ls: str = "-",
194 | ms: int = 10,
195 | fmt: str = ".2f",
196 | legend: Optional[str] = "best",
197 | plot: bool = True,
198 | title: Optional[str] = None,
199 | filename: Optional[str] = None,
200 | force_multiclass: bool = False,
201 | ) -> Dict[str, Any]:
202 | """
203 |     Plot a metric graph (ROC or Precision-Recall) of a predictor's results,
204 |     including AUC scores, where each row of y_true and y_pred represents a single example.
205 |     If there are only one or two columns, the data is treated as a binary
206 |     classification (see input example below).
207 |     If there are more than two columns, each column is considered a
208 |     unique class, and a ROC graph and AUC score will be computed for each.
209 | A Macro-ROC and Micro-ROC are computed and plotted too by default.
210 |
211 | Based on sklearn examples (as was seen on April 2018):
212 | http://scikit-learn.org/stable/auto_examples/model_selection/plot_roc.html
213 |
214 | Parameters:
215 | -----------
216 | y_true : list / NumPy ndarray
217 | The true classes of the predicted data
218 | y_pred : list / NumPy ndarray
219 | The predicted classes
220 | metric : string
221 | The metric graph to plot. Currently supported: 'roc' for Receiver Operating Characteristic curve and
222 | 'pr' for Precision-Recall curve
223 | micro : Boolean, default = True
224 | Whether to calculate a Micro graph (not applicable for binary cases)
225 | macro : Boolean, default = True
226 | Whether to calculate a Macro graph (ROC metric only, not applicable for binary cases)
227 | eopt : Boolean, default = True
228 | Whether to calculate and display the estimated-optimal threshold
229 | for each metric graph. For ROC curves, the estimated-optimal threshold is the closest
230 | computed threshold with (fpr,tpr) values closest to (0,1). For PR curves, it is
231 | the closest one to (1,1) (perfect recall and precision)
232 | class_names: list or string, default = None
233 | Names of the different classes. In a multi-class classification, the
234 | order must match the order of the classes probabilities in the input
235 | data. In a binary classification, can be a string or a list. If a list,
236 | only the last element will be used.
237 | colors : list of Matplotlib color strings or None, default = None
238 | List of colors to be used for the plotted curves. If `None`, falls back
239 | to a predefined default.
240 | ax : matplotlib ax, default = None
241 | Matplotlib Axis on which the curves will be plotted
242 | figsize : (int,int) or None, default = None
243 | a Matplotlib figure-size tuple. If `None`, falls back to Matplotlib's
244 | default. Only used if `ax=None`.
245 | xlim : (float, float), default = (0.,1.)
246 | X-axis limits.
247 | ylim : (float,float), default = (0.,1.02)
248 | Y-axis limits.
249 | lw : int, default = 2
250 | Line-width.
251 | ls : string, default = '-'
252 | Matplotlib line-style string
253 | ms : int, default = 10,
254 | Marker-size.
255 | fmt : string, default = '.2f'
256 | String formatting of displayed AUC and threshold numbers.
257 | legend : string or None, default = 'best'
258 | Position graph legend.
259 | plot : Boolean, default = True
260 | Display graph
261 | title : string or None, default = None
262 | Plotted graph title. If None, default title is used
263 | filename : string or None, default = None
264 | If not None, plot will be saved to the given file name
265 | force_multiclass : Boolean, default = False
266 | Only applicable if `y_true` and `y_pred` have two columns. If so,
267 |         the data is treated as multiclass rather than binary (useful when plotting
268 |         curves of different models against one another)
269 |
270 | Returns:
271 | --------
272 | A dictionary, one key for each class. Each value is another dictionary,
273 | holding AUC and eOpT values.
274 |
275 | Binary Classification Input Example:
276 | ------------------------------------
277 | Consider a data-set of two data-points where the true class of the first line
278 | is class 0, which was predicted with a probability of 0.6, and the second line's
279 | true class is 1, with predicted probability of 0.8.
280 | ```python
281 | # First option:
282 | >>> metric_graph(y_true=[0,1], y_pred=[0.6,0.8], metric='roc')
283 | {...}
284 |
285 | # Second option:
286 | >>> metric_graph(y_true=[[1,0],[0,1]], y_pred=[[0.6,0.4],[0.2,0.8]], metric='roc')
287 | {...}
288 |
289 | # Both yield the same result
290 | ```
291 |
292 | Example:
293 | --------
294 |     See `roc_graph_example` and `pr_graph_example` under `dython.examples`
295 | """
296 | if metric is None or metric.lower() not in ["roc", "pr"]:
297 | raise ValueError(f"Invalid metric {metric}")
298 | else:
299 | metric = metric.lower()
300 |
301 | all_x_axis = list()
302 | all_y_axis = list()
303 | y_true_array: NDArray = convert(y_true, "array") # type: ignore
304 | y_pred_array: NDArray = convert(y_pred, "array") # type: ignore
305 |
306 | if y_pred_array.shape != y_true_array.shape:
307 | raise ValueError("y_true and y_pred must have the same shape")
308 |
309 | class_names_list: Optional[List[str]]
310 | if class_names is not None:
311 | if not isinstance(class_names, str):
312 |             class_names_list = convert(class_names, "list")  # type: ignore
313 | else:
314 | class_names_list = [class_names]
315 | else:
316 | class_names_list = None
317 |
318 | if ax is None:
319 | plt.figure(figsize=figsize)
320 | axis = plt.gca()
321 | else:
322 | axis = ax
323 |
324 | if isinstance(colors, str):
325 | colors_list = [colors]
326 | else:
327 | colors_list: List[str] = colors or _ROC_PLOT_COLORS
328 |
329 | output_dict = dict()
330 | pr_naives = list()
331 | if (
332 | len(y_pred_array.shape) == 1
333 | or y_pred_array.shape[1] == 1
334 | or (y_pred_array.shape[1] == 2 and not force_multiclass)
335 | ):
336 | class_label = (
337 | class_names_list[-1] if class_names_list is not None else None
338 | )
339 | color = colors_list[-1]
340 | d = _binary_metric_graph(
341 | metric,
342 | y_true_array,
343 | y_pred_array,
344 | eoptimal=eopt,
345 | class_label=class_label,
346 | color=color,
347 | lw=lw,
348 | ls=ls,
349 | ms=ms,
350 | fmt=fmt,
351 | ax=axis,
352 | )
353 | class_label = class_label or "0"
354 | output_dict[class_label] = _build_metric_graph_output_dict(metric, d)
355 | pr_naives.append([0, 1, d["y_t_ratio"], d["y_t_ratio"], color])
356 | else:
357 | n = y_pred_array.shape[1]
358 | if class_names_list is not None:
359 | if not isinstance(class_names_list, list):
360 | raise ValueError(
361 | "class_names must be a list of items in multi-class classification."
362 | )
363 | if len(class_names_list) != n:
364 | raise ValueError(
365 | "Number of class names does not match input data size."
366 | )
367 | for i in range(0, n):
368 | class_label = (
369 | class_names_list[i] if class_names_list is not None else str(i)
370 | )
371 | color = colors_list[i % len(colors_list)]
372 | d = _binary_metric_graph(
373 | metric,
374 | y_true_array[:, i],
375 | y_pred_array[:, i],
376 | eoptimal=eopt,
377 | color=color,
378 | class_label=class_label,
379 | lw=lw,
380 | ls=ls,
381 | ms=ms,
382 | fmt=fmt,
383 | ax=axis,
384 | )
385 | all_x_axis.append(d["x"])
386 | all_y_axis.append(d["y"])
387 | output_dict[class_label] = _build_metric_graph_output_dict(
388 | metric, d
389 | )
390 | pr_naives.append((0, 1, d["y_t_ratio"], d["y_t_ratio"], color))
391 | if micro:
392 | _binary_metric_graph(
393 | metric,
394 | y_true_array.ravel(),
395 | y_pred_array.ravel(),
396 | eoptimal=False,
397 | ls=":",
398 | color="deeppink",
399 | class_label="micro",
400 | lw=lw,
401 | ms=ms,
402 | fmt=fmt,
403 | ax=axis,
404 | )
405 | if macro and metric == "roc":
406 | _plot_macro_metric(all_x_axis, all_y_axis, n, lw, fmt, axis)
407 | if metric == "roc":
408 | naives: List[Tuple[Number, Number, Number, Number, str]] = [
409 | (0, 1, 0, 1, "grey")
410 | ]
411 | elif metric == "pr":
412 | naives = pr_naives
413 | else:
414 | raise ValueError(f"Invalid metric {metric}")
415 | axis = _display_metric_plot(
416 | axis,
417 | metric,
418 | naives,
419 | xlim=xlim,
420 | ylim=ylim,
421 | legend=legend,
422 | title=title,
423 | filename=filename,
424 | plot=plot,
425 | )
426 | output_dict["ax"] = axis
427 | return output_dict
428 |
429 |
430 | def random_forest_feature_importance(
431 | forest: RandomForestClassifier, features: List[str], precision: int = 4
432 | ) -> Iterable[Tuple[float, str]]:
433 | """
434 | Given a trained `sklearn.ensemble.RandomForestClassifier`, plot the
435 | different features based on their importance according to the classifier,
436 | from the most important to the least.
437 |
438 | Parameters:
439 | -----------
440 | forest : sklearn.ensemble.RandomForestClassifier
441 | A trained `RandomForestClassifier`
442 | features : list
443 | A list of the names of the features the classifier was trained on,
444 |         ordered in the same order they appeared
445 | in the training data
446 | precision : int, default = 4
447 | Precision of feature importance
448 | """
449 | return sorted(
450 | zip(
451 | map(lambda x: round(x, precision), forest.feature_importances_),
452 | features,
453 | ),
454 | reverse=True,
455 | )
456 |
457 |
458 | def ks_abc(
459 | y_true: OneDimArray,
460 | y_pred: OneDimArray,
461 | *,
462 | ax: Optional[plt.Axes] = None,
463 | figsize: Optional[Tuple[int, int]] = None,
464 | colors: Tuple[str, str] = ("darkorange", "b"),
465 | title: Optional[str] = None,
466 | xlim: Tuple[float, float] = (0.0, 1.0),
467 | ylim: Tuple[float, float] = (0.0, 1.0),
468 | fmt: str = ".2f",
469 | lw: int = 2,
470 | legend: Optional[str] = "best",
471 | plot: bool = True,
472 | filename: Optional[str] = None,
473 | ) -> Dict[str, Any]:
474 | """
475 | Perform the Kolmogorov–Smirnov test over the positive and negative distributions of a binary classifier, and compute
476 | the area between curves.
477 |     The KS test plots the fraction of positives and negatives predicted correctly below each threshold. It then finds
478 |     the optimal threshold, which is the one enabling the best class separation.
479 |     The area between curves allows a better insight into separation. The higher the area (1 being the maximum), the
480 |     closer the positive and negative distributions' centers-of-mass are to 1 and 0, respectively.
481 |
482 | Based on scikit-plot's `plot_ks_statistic` method.
483 |
484 | Parameters:
485 | -----------
486 | y_true : array-like
487 | The true labels of the dataset
488 | y_pred : array-like
489 | The probabilities predicted by a binary classifier
490 | ax : matplotlib ax, default = None
491 | Matplotlib Axis on which the curves will be plotted
492 | figsize : (int,int) or None, default = None
493 | a Matplotlib figure-size tuple. If `None`, falls back to Matplotlib's
494 | default. Only used if `ax=None`.
495 | colors : a tuple of Matplotlib color strings, default = ('darkorange', 'b')
496 | Colors to be used for the plotted curves.
497 | title : string or None, default = None
498 | Plotted graph title. If None, default title is used
499 | xlim : (float, float), default = (0.,1.)
500 | X-axis limits.
501 | ylim : (float,float), default = (0.,1.)
502 | Y-axis limits.
503 | fmt : string, default = '.2f'
504 | String formatting of displayed numbers.
505 | lw : int, default = 2
506 | Line-width.
507 | legend : string or None, default = 'best'
508 |         Position of the graph legend.
509 | plot : Boolean, default = True
510 | Display graph
511 | filename : string or None, default = None
512 | If not None, plot will be saved to the given file name
513 |
514 | Returns:
515 | --------
516 | A dictionary of the following keys:
517 | 'abc': area between curves,
518 | 'ks_stat': computed statistic of the KS test,
519 | 'eopt': estimated optimal threshold,
520 | 'ax': the ax used to plot the curves
521 | """
522 | y_true_arr: NDArray = convert(y_true, "array") # type: ignore
523 | y_pred_arr: NDArray = convert(y_pred, "array") # type: ignore
524 | if y_pred_arr.shape != y_true_arr.shape:
525 | raise ValueError("y_true and y_pred must have the same shape")
526 | elif len(y_pred_arr.shape) == 1 or y_pred_arr.shape[1] == 1:
527 | y_t = y_true_arr
528 | y_p = y_pred_arr
529 | elif y_pred_arr.shape[1] == 2:
530 | y_t = [np.argmax(x) for x in y_true_arr]
531 | y_p = [x[1] for x in y_pred_arr]
532 | else:
533 | raise ValueError(
534 | "y_true and y_pred must originate from a binary classifier, but have {} columns".format(
535 | y_pred_arr.shape[1]
536 | )
537 | )
538 |
539 | thresholds, nr, pr, ks_statistic, max_distance_at, _ = _binary_ks_curve(
540 | y_t, y_p # type: ignore
541 | )
542 | if ax is None:
543 | plt.figure(figsize=figsize)
544 | axis = plt.gca()
545 | else:
546 | axis = ax
547 |
548 | axis.plot(thresholds, pr, lw=lw, color=colors[0], label="Positive Class")
549 | axis.plot(thresholds, nr, lw=lw, color=colors[1], label="Negative Class")
550 | idx = np.where(thresholds == max_distance_at)[0][0]
551 | axis.axvline(
552 | max_distance_at,
553 | *sorted([nr[idx], pr[idx]]),
554 | label="KS Statistic: {ks:{fmt}} at {d:{fmt}}".format(
555 | ks=ks_statistic, d=max_distance_at, fmt=fmt
556 | ),
557 | linestyle=":",
558 | lw=lw,
559 | color="grey",
560 | )
561 |
562 | thresholds = np.append(thresholds, 1.001)
563 | abc = 0.0
564 | for i in range(len(pr)):
565 | abc += (nr[i] - pr[i]) * (thresholds[i + 1] - thresholds[i])
566 |
567 | axis.set_xlim(left=xlim[0], right=xlim[1])
568 | axis.set_ylim(bottom=ylim[0], top=ylim[1])
569 | axis.set_xlabel("Threshold")
570 | axis.set_ylabel("Fraction below threshold")
571 | axis.set_title(
572 | "{t} [ABC = {a:{fmt}}]".format(
573 | t=title or "KS Statistic Plot", a=abc, fmt=fmt
574 | )
575 | )
576 | if legend:
577 | axis.legend(loc=legend)
578 | if filename:
579 | plt.savefig(filename)
580 | plot_or_not(plot)
581 | return {
582 | "abc": abc,
583 | "ks_stat": ks_statistic,
584 | "eopt": max_distance_at,
585 | "ax": axis,
586 | }
587 |
588 |
589 | def _binary_ks_curve(
590 | y_true: OneDimArray, y_probas: OneDimArray
591 | ) -> Tuple[NDArray, NDArray, NDArray, Number, Number, NDArray]:
592 | """Copied from scikit-plot: https://github.com/reiinakano/scikit-plot/blob/master/scikitplot/helpers.py
593 |
594 | This function generates the points necessary to calculate the KS
595 | Statistic curve.
596 |
597 | Args:
598 | y_true (array-like, shape (n_samples)): True labels of the data.
599 |
600 | y_probas (array-like, shape (n_samples)): Probability predictions of
601 | the positive class.
602 |
603 | Returns:
604 | thresholds (numpy.ndarray): An array containing the X-axis values for
605 | plotting the KS Statistic plot.
606 |
607 | pct1 (numpy.ndarray): An array containing the Y-axis values for one
608 | curve of the KS Statistic plot.
609 |
610 | pct2 (numpy.ndarray): An array containing the Y-axis values for one
611 | curve of the KS Statistic plot.
612 |
613 | ks_statistic (float): The KS Statistic, or the maximum vertical
614 | distance between the two curves.
615 |
616 | max_distance_at (float): The X-axis value at which the maximum vertical
617 | distance between the two curves is seen.
618 |
619 | classes (np.ndarray, shape (2)): An array containing the labels of the
620 | two classes making up `y_true`.
621 |
622 | Raises:
623 | ValueError: If `y_true` is not composed of 2 classes. The KS Statistic
624 | is only relevant in binary classification.
625 | """
626 | y_true, y_probas = np.asarray(y_true), np.asarray(y_probas)
627 | lb = LabelEncoder()
628 | encoded_labels = lb.fit_transform(y_true)
629 | if len(lb.classes_) != 2:
630 | raise ValueError(
631 | "Cannot calculate KS statistic for data with "
632 | "{} category/ies".format(len(lb.classes_))
633 | )
634 | idx = encoded_labels == 0
635 | data1 = np.sort(y_probas[idx])
636 | data2 = np.sort(y_probas[np.logical_not(idx)])
637 |
638 | ctr1, ctr2 = 0, 0
639 | thresholds, pct1, pct2 = [], [], []
640 | while ctr1 < len(data1) or ctr2 < len(data2):
641 | # Check if data1 has no more elements
642 | if ctr1 >= len(data1):
643 | current = data2[ctr2]
644 | while ctr2 < len(data2) and current == data2[ctr2]:
645 | ctr2 += 1
646 |
647 | # Check if data2 has no more elements
648 | elif ctr2 >= len(data2):
649 | current = data1[ctr1]
650 | while ctr1 < len(data1) and current == data1[ctr1]:
651 | ctr1 += 1
652 |
653 | else:
654 | if data1[ctr1] > data2[ctr2]:
655 | current = data2[ctr2]
656 | while ctr2 < len(data2) and current == data2[ctr2]:
657 | ctr2 += 1
658 |
659 | elif data1[ctr1] < data2[ctr2]:
660 | current = data1[ctr1]
661 | while ctr1 < len(data1) and current == data1[ctr1]:
662 | ctr1 += 1
663 |
664 | else:
665 | current = data2[ctr2]
666 | while ctr2 < len(data2) and current == data2[ctr2]:
667 | ctr2 += 1
668 | while ctr1 < len(data1) and current == data1[ctr1]:
669 | ctr1 += 1
670 |
671 | thresholds.append(current)
672 | pct1.append(ctr1)
673 | pct2.append(ctr2)
674 |
675 | thresholds = np.asarray(thresholds)
676 | pct1 = np.asarray(pct1) / float(len(data1))
677 | pct2 = np.asarray(pct2) / float(len(data2))
678 |
679 | if thresholds[0] != 0:
680 | thresholds = np.insert(thresholds, 0, [0.0]) # type: ignore
681 | pct1 = np.insert(pct1, 0, [0.0]) # type: ignore
682 | pct2 = np.insert(pct2, 0, [0.0]) # type: ignore
683 | if thresholds[-1] != 1:
684 | thresholds = np.append(thresholds, [1.0]) # type: ignore
685 | pct1 = np.append(pct1, [1.0]) # type: ignore
686 | pct2 = np.append(pct2, [1.0]) # type: ignore
687 |
688 | differences = pct1 - pct2
689 | ks_statistic, max_distance_at = (
690 | np.max(differences),
691 | thresholds[np.argmax(differences)],
692 | )
693 |
694 | return thresholds, pct1, pct2, ks_statistic, max_distance_at, lb.classes_ # type: ignore
695 |
--------------------------------------------------------------------------------
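
A minimal usage sketch for the public functions above (illustrative only: it assumes
scikit-learn is installed, and the dataset and feature names are made up):

    from sklearn.datasets import make_classification
    from sklearn.ensemble import RandomForestClassifier
    from sklearn.model_selection import train_test_split
    from dython.model_utils import ks_abc, random_forest_feature_importance

    X, y = make_classification(n_samples=500, n_features=5, random_state=0)
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)
    forest = RandomForestClassifier(random_state=0).fit(X_train, y_train)

    # KS test over the positive-class probabilities of a binary classifier
    result = ks_abc(y_test, forest.predict_proba(X_test)[:, 1], plot=False)
    print(result["ks_stat"], result["eopt"], result["abc"])

    # (importance, feature-name) tuples, sorted from most to least important
    print(random_forest_feature_importance(forest, [f"f{i}" for i in range(5)]))
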
/dython/nominal.py:
--------------------------------------------------------------------------------
1 | import concurrent.futures as cf
2 | import math
3 | import warnings
4 | from collections import Counter
5 | from itertools import repeat
6 | import matplotlib.pyplot as plt
7 | import numpy as np
8 | import pandas as pd
9 | import scipy.cluster.hierarchy as sch
10 | import scipy.stats as ss
11 | import seaborn as sns
12 | from psutil import cpu_count
13 | from typing import (
14 | Union,
15 | Any,
16 | List,
17 | Optional,
18 | Callable,
19 | Tuple,
20 | Dict,
21 | Iterable,
22 | Set,
23 | Literal,
24 | )
25 | from numpy.typing import NDArray, ArrayLike
26 | from matplotlib.colors import Colormap
27 | from ._private import (
28 | convert,
29 | remove_incomplete_samples,
30 | replace_nan_with_value,
31 | plot_or_not,
32 | )
33 | from .data_utils import identify_columns_by_type
34 | from .typing import Number, OneDimArray, TwoDimArray
35 |
36 |
37 | __all__ = [
38 | "associations",
39 | "cluster_correlations",
40 | "conditional_entropy",
41 | "correlation_ratio",
42 | "cramers_v",
43 | "identify_nominal_columns",
44 | "identify_numeric_columns",
45 | "numerical_encoding",
46 | "replot_last_associations",
47 | "theils_u",
48 | ]
49 |
50 | _REPLACE = "replace"
51 | _DROP = "drop"
52 | _DROP_SAMPLES = "drop_samples"
53 | _DROP_FEATURES = "drop_features"
54 | _DROP_SAMPLE_PAIRS = "drop_sample_pairs"
55 | _SKIP = "skip"
56 | _DEFAULT_REPLACE_VALUE = 0.0
57 | _PRECISION = 1e-13
58 |
59 | _ASSOC_PLOT_PARAMS: Dict[str, Any] = dict()
60 |
61 | _NO_OP = "no-op"
62 | _SINGLE_VALUE_COLUMN_OP = "single-value-column-op"
63 | _I_EQ_J_OP = "i-equal-j-op"
64 | _ASSOC_OP = "assoc-op"
65 |
66 | NomNumAssocStr = Literal["correlation_ratio"]
67 | NumNumAssocStr = Literal["pearson", "spearman", "kendall"]
68 | NomNomAssocStr = Literal["cramer", "theil"]
69 |
70 |
71 | def _inf_nan_str(x: Union[int, float]) -> str:
72 | if np.isnan(x):
73 | return "NaN"
74 | elif abs(x) == np.inf:
75 | return "inf"
76 | else:
77 | return ""
78 |
79 |
80 | def conditional_entropy(
81 | x: Union[OneDimArray, List[str]],
82 | y: Union[OneDimArray, List[str]],
83 | nan_strategy: str = _REPLACE,
84 | nan_replace_value: Any = _DEFAULT_REPLACE_VALUE,
85 | log_base: Number = math.e,
86 | ) -> float:
87 | """
88 | Calculates the conditional entropy of x given y: S(x|y)
89 |
90 | Wikipedia: https://en.wikipedia.org/wiki/Conditional_entropy
91 |
92 | Parameters:
93 | -----------
94 | x : list / NumPy ndarray / Pandas Series
95 | A sequence of measurements
96 | y : list / NumPy ndarray / Pandas Series
97 | A sequence of measurements
98 | nan_strategy : string, default = 'replace'
99 | How to handle missing values: can be either 'drop' to remove samples
100 | with missing values, or 'replace' to replace all missing values with
101 | the nan_replace_value. Missing values are None and np.nan.
102 | nan_replace_value : any, default = 0.0
103 | The value used to replace missing values with. Only applicable when
104 | nan_strategy is set to 'replace'.
105 | log_base: float, default = e
106 |         The logarithm base to use when calculating entropy. Default is base e.
107 |
108 | Returns:
109 | --------
110 | float
111 | """
112 | if nan_strategy == _REPLACE:
113 | x, y = replace_nan_with_value(x, y, nan_replace_value)
114 | elif nan_strategy == _DROP:
115 | x, y = remove_incomplete_samples(x, y)
116 | y_counter = Counter(y)
117 | xy_counter = Counter(list(zip(x, y)))
118 | total_occurrences = sum(y_counter.values())
119 | entropy = 0.0
120 | for xy in xy_counter.keys():
121 | p_xy = xy_counter[xy] / total_occurrences
122 | p_y = y_counter[xy[1]] / total_occurrences
123 | entropy += p_xy * math.log(p_y / p_xy, log_base)
124 | return entropy
125 |
126 |
127 | def cramers_v(
128 | x: Union[OneDimArray, List[str]],
129 | y: Union[OneDimArray, List[str]],
130 | bias_correction: bool = True,
131 | nan_strategy: str = _REPLACE,
132 | nan_replace_value: Any = _DEFAULT_REPLACE_VALUE,
133 | ) -> float:
134 | """
135 | Calculates Cramer's V statistic for categorical-categorical association.
136 | This is a symmetric coefficient: V(x,y) = V(y,x)
137 |
138 | Original function taken from: https://stackoverflow.com/a/46498792/5863503
139 | Wikipedia: https://en.wikipedia.org/wiki/Cram%C3%A9r%27s_V
140 |
141 | Parameters:
142 | -----------
143 | x : list / NumPy ndarray / Pandas Series
144 | A sequence of categorical measurements
145 | y : list / NumPy ndarray / Pandas Series
146 | A sequence of categorical measurements
147 | bias_correction : Boolean, default = True
148 | Use bias correction from Bergsma and Wicher,
149 | Journal of the Korean Statistical Society 42 (2013): 323-328.
150 | nan_strategy : string, default = 'replace'
151 | How to handle missing values: can be either 'drop' to remove samples
152 | with missing values, or 'replace' to replace all missing values with
153 | the nan_replace_value. Missing values are None and np.nan.
154 | nan_replace_value : any, default = 0.0
155 | The value used to replace missing values with. Only applicable when
156 | nan_strategy is set to 'replace'.
157 |
158 | Returns:
159 | --------
160 | float in the range of [0,1]
161 | """
162 | if nan_strategy == _REPLACE:
163 | x, y = replace_nan_with_value(x, y, nan_replace_value)
164 | elif nan_strategy == _DROP:
165 | x, y = remove_incomplete_samples(x, y)
166 | confusion_matrix = pd.crosstab(x, y)
167 | chi2 = ss.chi2_contingency(confusion_matrix)[0]
168 | n = confusion_matrix.sum().sum()
169 | phi2 = chi2 / n
170 | r, k = confusion_matrix.shape
171 | if bias_correction:
172 | phi2corr = max(0, phi2 - ((k - 1) * (r - 1)) / (n - 1))
173 | rcorr = r - ((r - 1) ** 2) / (n - 1)
174 | kcorr = k - ((k - 1) ** 2) / (n - 1)
175 | if min((kcorr - 1), (rcorr - 1)) == 0:
176 | warnings.warn(
177 | "Unable to calculate Cramer's V using bias correction. Consider using bias_correction=False (or cramers_v_bias_correction=False if calling from associations)",
178 | RuntimeWarning,
179 | )
180 | return np.nan
181 | else:
182 | v = np.sqrt(phi2corr / min((kcorr - 1), (rcorr - 1)))
183 | else:
184 | v = np.sqrt(phi2 / min(k - 1, r - 1))
185 | if -_PRECISION <= v < 0.0 or 1.0 < v <= 1.0 + _PRECISION:
186 | rounded_v = 0.0 if v < 0 else 1.0
187 | warnings.warn(
188 | f"Rounded V = {v} to {rounded_v}. This is probably due to floating point precision issues.",
189 | RuntimeWarning,
190 | )
191 | return rounded_v
192 | else:
193 | return v
194 |
195 |
196 | def theils_u(
197 | x: Union[OneDimArray, List[str]],
198 | y: Union[OneDimArray, List[str]],
199 | nan_strategy: str = _REPLACE,
200 | nan_replace_value: Any = _DEFAULT_REPLACE_VALUE,
201 | ) -> float:
202 | """
203 | Calculates Theil's U statistic (Uncertainty coefficient) for categorical-
204 | categorical association. This is the uncertainty of x given y: value is
205 |     in the range [0,1] - where 0 means y provides no information about
206 | x, and 1 means y provides full information about x.
207 |
208 | This is an asymmetric coefficient: U(x,y) != U(y,x)
209 |
210 | Wikipedia: https://en.wikipedia.org/wiki/Uncertainty_coefficient
211 |
212 | Parameters:
213 | -----------
214 | x : list / NumPy ndarray / Pandas Series
215 | A sequence of categorical measurements
216 | y : list / NumPy ndarray / Pandas Series
217 | A sequence of categorical measurements
218 | nan_strategy : string, default = 'replace'
219 | How to handle missing values: can be either 'drop' to remove samples
220 | with missing values, or 'replace' to replace all missing values with
221 | the nan_replace_value. Missing values are None and np.nan.
222 | nan_replace_value : any, default = 0.0
223 | The value used to replace missing values with. Only applicable when
224 | nan_strategy is set to 'replace'.
225 |
226 | Returns:
227 | --------
228 | float in the range of [0,1]
229 | """
230 | if nan_strategy == _REPLACE:
231 | x, y = replace_nan_with_value(x, y, nan_replace_value)
232 | elif nan_strategy == _DROP:
233 | x, y = remove_incomplete_samples(x, y)
234 | s_xy = conditional_entropy(x, y)
235 | x_counter = Counter(x)
236 | total_occurrences = sum(x_counter.values())
237 | p_x = list(map(lambda n: n / total_occurrences, x_counter.values()))
238 | s_x = ss.entropy(p_x)
239 | if s_x == 0:
240 | return 1.0
241 | else:
242 | u = (s_x - s_xy) / s_x # type: ignore
243 | if -_PRECISION <= u < 0.0 or 1.0 < u <= 1.0 + _PRECISION:
244 | rounded_u = 0.0 if u < 0 else 1.0
245 | warnings.warn(
246 | f"Rounded U = {u} to {rounded_u}. This is probably due to floating point precision issues.",
247 | RuntimeWarning,
248 | )
249 | return rounded_u
250 | else:
251 | return u
252 |
253 |
254 | def correlation_ratio(
255 | categories: Union[OneDimArray, List[str]],
256 | measurements: OneDimArray,
257 | nan_strategy: str = _REPLACE,
258 | nan_replace_value: Any = _DEFAULT_REPLACE_VALUE,
259 | ) -> float:
260 | """
261 | Calculates the Correlation Ratio (sometimes marked by the greek letter Eta)
262 | for categorical-continuous association.
263 |
264 | Answers the question - given a continuous value of a measurement, is it
265 |     possible to know which category it is associated with?
266 |
267 | Value is in the range [0,1], where 0 means a category cannot be determined
268 | by a continuous measurement, and 1 means a category can be determined with
269 | absolute certainty.
270 |
271 | Wikipedia: https://en.wikipedia.org/wiki/Correlation_ratio
272 |
273 | Parameters:
274 | -----------
275 | categories : list / NumPy ndarray / Pandas Series
276 | A sequence of categorical measurements
277 | measurements : list / NumPy ndarray / Pandas Series
278 | A sequence of continuous measurements
279 | nan_strategy : string, default = 'replace'
280 | How to handle missing values: can be either 'drop' to remove samples
281 | with missing values, or 'replace' to replace all missing values with
282 | the nan_replace_value. Missing values are None and np.nan.
283 | nan_replace_value : any, default = 0.0
284 | The value used to replace missing values with. Only applicable when
285 | nan_strategy is set to 'replace'.
286 |
287 | Returns:
288 | --------
289 | float in the range of [0,1]
290 | """
291 | if nan_strategy == _REPLACE:
292 | categories, measurements = replace_nan_with_value(
293 | categories, measurements, nan_replace_value
294 | )
295 | elif nan_strategy == _DROP:
296 | categories, measurements = remove_incomplete_samples(
297 | categories, measurements
298 | )
299 | categories_array: NDArray = convert(categories, "array") # type: ignore
300 | measurements_array: NDArray = convert(measurements, "array") # type: ignore
301 | fcat, _ = pd.factorize(categories_array) # type: ignore
302 | cat_num = np.max(fcat) + 1
303 | y_avg_array = np.zeros(cat_num)
304 | n_array = np.zeros(cat_num)
305 | for i in range(0, cat_num):
306 | cat_measures = measurements_array[np.argwhere(fcat == i).flatten()]
307 | n_array[i] = len(cat_measures)
308 | y_avg_array[i] = np.average(cat_measures)
309 | y_total_avg = np.sum(np.multiply(y_avg_array, n_array)) / np.sum(n_array)
310 | numerator = np.sum(
311 | np.multiply(n_array, np.power(np.subtract(y_avg_array, y_total_avg), 2))
312 | )
313 | denominator = np.sum(
314 | np.power(np.subtract(measurements_array, y_total_avg), 2)
315 | )
316 | if numerator == 0:
317 | return 0.0
318 | else:
319 | eta = np.sqrt(numerator / denominator)
320 | if 1.0 < eta <= 1.0 + _PRECISION:
321 | warnings.warn(
322 | f"Rounded eta = {eta} to 1. This is probably due to floating point precision issues.",
323 | RuntimeWarning,
324 | )
325 | return 1.0
326 | else:
327 | return eta
328 |
329 |
330 | def identify_nominal_columns(dataset: TwoDimArray) -> List[Any]:
331 | """
332 | Given a dataset, identify categorical columns.
333 |
334 | Parameters:
335 | -----------
336 | dataset : NumPy ndarray / Pandas DataFrame
337 |
338 | Returns:
339 | --------
340 |     A list of categorical column names
341 |
342 | Example:
343 | --------
344 | >>> df = pd.DataFrame({'col1': ['a', 'b', 'c', 'a'], 'col2': [3, 4, 2, 1]})
345 | >>> identify_nominal_columns(df)
346 | ['col1']
347 |
348 | """
349 | return identify_columns_by_type(dataset, include=["object", "category"])
350 |
351 |
352 | def identify_numeric_columns(dataset: TwoDimArray) -> List[Any]:
353 | """
354 | Given a dataset, identify numeric columns.
355 |
356 | Parameters:
357 | -----------
358 | dataset : NumPy ndarray / Pandas DataFrame
359 |
360 | Returns:
361 | --------
362 |     A list of numerical column names
363 |
364 | Example:
365 | --------
366 | >>> df = pd.DataFrame({'col1': ['a', 'b', 'c', 'a'], 'col2': [3, 4, 2, 1], 'col3': [1., 2., 3., 4.]})
367 | >>> identify_numeric_columns(df)
368 | ['col2', 'col3']
369 |
370 | """
371 | return identify_columns_by_type(dataset, include=["int64", "float64"])
372 |
373 |
374 | def associations(
375 | dataset: TwoDimArray,
376 | nominal_columns: Optional[Union[OneDimArray, List[str], str]] = "auto",
377 | *,
378 | numerical_columns: Optional[Union[OneDimArray, List[str], str]] = None,
379 | mark_columns: bool = False,
380 | nom_nom_assoc: Union[
381 | NomNomAssocStr, Callable[[pd.Series, pd.Series], Number]
382 | ] = "cramer",
383 | num_num_assoc: Union[
384 | NumNumAssocStr, Callable[[pd.Series, pd.Series], Number]
385 | ] = "pearson",
386 | nom_num_assoc: Union[
387 | NomNumAssocStr, Callable[[pd.Series, pd.Series], Number]
388 | ] = "correlation_ratio",
389 | symmetric_nom_nom: bool = True,
390 | symmetric_num_num: bool = True,
391 | display_rows: Union[str, List[str]] = "all",
392 | display_columns: Union[str, List[str]] = "all",
393 | hide_rows: Optional[Union[str, List[str]]] = None,
394 | hide_columns: Optional[Union[str, List[str]]] = None,
395 | cramers_v_bias_correction: bool = True,
396 | nan_strategy: str = _REPLACE,
397 | nan_replace_value: Any = _DEFAULT_REPLACE_VALUE,
398 | ax: Optional[plt.Axes] = None,
399 | figsize: Optional[Tuple[float, float]] = None,
400 | annot: bool = True,
401 | fmt: str = ".2f",
402 | cmap: Optional[Colormap] = None,
403 | sv_color: str = "silver",
404 | cbar: bool = True,
405 | vmax: float = 1.0,
406 | vmin: Optional[float] = None,
407 | plot: bool = True,
408 | compute_only: bool = False,
409 | clustering: bool = False,
410 | title: Optional[str] = None,
411 | filename: Optional[str] = None,
412 | multiprocessing: bool = False,
413 | max_cpu_cores: Optional[int] = None,
414 | ) -> Dict[str, Union[pd.DataFrame, plt.Axes]]:
415 | """
416 | Calculate the correlation/strength-of-association of features in data-set
417 | with both categorical and continuous features using:
418 | * Pearson's R for continuous-continuous cases
419 | * Correlation Ratio for categorical-continuous cases
420 | * Cramer's V or Theil's U for categorical-categorical cases
421 |
422 | Parameters:
423 | -----------
424 | dataset : NumPy ndarray / Pandas DataFrame
425 | The data-set for which the features' correlation is computed
426 | nominal_columns : string / list / NumPy ndarray, default = 'auto'
427 | Names of columns of the data-set which hold categorical values. Can
428 | also be the string 'all' to state that all columns are categorical,
429 | 'auto' (default) to try to identify nominal columns, or None to state
430 | none are categorical. Only used if `numerical_columns` is `None`.
431 | numerical_columns : string / list / NumPy ndarray, default = None
432 | To be used instead of `nominal_columns`. Names of columns of the data-set
433 | which hold numerical values. Can also be the string 'all' to state that
434 | all columns are numerical (equivalent to `nominal_columns=None`) or
435 | 'auto' to try to identify numerical columns (equivalent to
436 |         `nominal_columns='auto'`). If `None`, `nominal_columns` is used.
437 | mark_columns : Boolean, default = False
438 | if True, output's columns' names will have a suffix of '(nom)' or
439 | '(con)' based on their type (nominal or continuous), as provided
440 | by nominal_columns
441 | nom_nom_assoc : callable / string, default = 'cramer'
442 | If callable, a function which receives two `pd.Series` and returns a single number.
443 | If string, name of nominal-nominal (categorical-categorical) association to use.
444 |         Options are 'cramer' for Cramer's V or 'theil' for Theil's U. If 'theil',
445 | heat-map columns are the provided information (U = U(row|col)).
446 | num_num_assoc : callable / string, default = 'pearson'
447 | If callable, a function which receives two `pd.Series` and returns a single number.
448 | If string, name of numerical-numerical association to use. Options are 'pearson'
449 | for Pearson's R, 'spearman' for Spearman's R, 'kendall' for Kendall's Tau.
450 | nom_num_assoc : callable / string, default = 'correlation_ratio'
451 | If callable, a function which receives two `pd.Series` and returns a single number.
452 | If string, name of nominal-numerical association to use. Options are 'correlation_ratio'
453 | for correlation ratio.
454 | symmetric_nom_nom : Boolean, default = True
455 | Relevant only if `nom_nom_assoc` is a callable. Declare whether the function is symmetric (f(x,y) = f(y,x)).
456 | If False, heat-map values should be interpreted as f(row,col)
457 | symmetric_num_num : Boolean, default = True
458 | Relevant only if `num_num_assoc` is a callable. Declare whether the function is symmetric (f(x,y) = f(y,x)).
459 | If False, heat-map values should be interpreted as f(row,col)
460 | display_rows : list / string, default = 'all'
461 | Choose which of the dataset's features will be displayed in the output's
462 | correlations table rows. If string, can either be a single feature's name or 'all'.
463 | Only used if `hide_rows` is `None`.
464 | display_columns : list / string, default = 'all'
465 | Choose which of the dataset's features will be displayed in the output's
466 | correlations table columns. If string, can either be a single feature's name or 'all'.
467 | Only used if `hide_columns` is `None`.
468 | hide_rows : list / string, default = None
469 | Choose which of the dataset's features will not be displayed in the output's
470 | correlations table rows. If string, must be a single feature's name. If `None`,
471 | `display_rows` is used.
472 | hide_columns : list / string, default = None
473 | Choose which of the dataset's features will not be displayed in the output's
474 | correlations table columns. If string, must be a single feature's name. If `None`,
475 | `display_columns` is used.
476 | cramers_v_bias_correction : Boolean, default = True
477 | Use bias correction for Cramer's V from Bergsma and Wicher,
478 | Journal of the Korean Statistical Society 42 (2013): 323-328.
479 | nan_strategy : string, default = 'replace'
480 | How to handle missing values: can be either 'drop_samples' to remove
481 | samples with missing values, 'drop_features' to remove features
482 | (columns) with missing values, 'replace' to replace all missing
483 |         values with the nan_replace_value, or 'drop_sample_pairs' to drop samples
484 |         with missing values in either of the two compared features, separately for each computed coefficient.
485 | Missing values are None and np.nan.
486 | nan_replace_value : any, default = 0.0
487 | The value used to replace missing values with. Only applicable when
488 | nan_strategy is set to 'replace'
489 | ax : matplotlib ax, default = None
490 | Matplotlib Axis on which the heat-map will be plotted
491 | figsize : (float, float) or None, default = None
492 | A Matplotlib figure-size tuple. If `None`, will attempt to set the size automatically.
493 | Only used if `ax=None`.
494 | annot : Boolean, default = True
495 | Plot number annotations on the heat-map
496 | fmt : string, default = '.2f'
497 | String formatting of annotations
498 | cmap : Matplotlib colormap or None, default = None
499 | A colormap to be used for the heat-map. If None, falls back to Seaborn's
500 | heat-map default
501 | sv_color : string, default = 'silver'
502 | A Matplotlib color. The color to be used when displaying single-value
503 | features over the heat-map
504 | cbar: Boolean, default = True
505 | Display heat-map's color-bar
506 | vmax: float, default = 1.0
507 | Set heat-map vmax option
508 | vmin: float or None, default = None
509 | Set heat-map vmin option. If set to None, vmin will be chosen automatically
510 | between 0 and -1, depending on the types of associations used (-1 if Pearson's R
511 | is used, 0 otherwise)
512 | plot : Boolean, default = True
513 | Plot a heat-map of the correlation matrix. If False, plotting still
514 |         happens, but the heat-map will not be displayed.
515 | compute_only : Boolean, default = False
516 | Use this flag only if you have no need of the plotting at all. This skips the entire
517 | plotting mechanism.
518 | clustering : Boolean, default = False
519 | If True, hierarchical clustering is applied in order to sort
520 | features into meaningful groups
521 | title : string or None, default = None
522 | Plotted graph title
523 | filename : string or None, default = None
524 | If not None, plot will be saved to the given file name
525 | multiprocessing: Boolean, default = False
526 |         If True, use `multiprocessing` to speed up computations. If False, falls back to single-core computation
527 | max_cpu_cores: int or None, default = None
528 | If not None, ProcessPoolExecutor will use the given number of CPU cores
529 |
530 | Returns:
531 | --------
532 | A dictionary with the following keys:
533 | - `corr`: A DataFrame of the correlation/strength-of-association between
534 | all features
535 |       - `ax`: A Matplotlib `Axes`
536 |
537 | Example:
538 | --------
539 | See examples under `dython.examples`
540 | """
541 | df: pd.DataFrame = convert(dataset, "dataframe") # type: ignore
542 |
543 | if numerical_columns is not None:
544 | if numerical_columns == "auto":
545 | nominal_columns = "auto"
546 | elif numerical_columns == "all":
547 | nominal_columns = None
548 | else:
549 | nominal_columns = [
550 | c for c in df.columns if c not in numerical_columns
551 | ]
552 |
553 | # handling NaN values in data
554 | if nan_strategy == _REPLACE:
555 | # handling pandas categorical
556 | df = _handling_category_for_nan_imputation(df, nan_replace_value)
557 |
558 | df.fillna(nan_replace_value, inplace=True)
559 | elif nan_strategy == _DROP_SAMPLES:
560 | df.dropna(axis=0, inplace=True)
561 | elif nan_strategy == _DROP_FEATURES:
562 | df.dropna(axis=1, inplace=True)
563 | elif nan_strategy == _DROP_SAMPLE_PAIRS:
564 | pass # will be handled pair-by-pair during calculations
565 | else:
566 | raise ValueError(
567 | "Argument nan_strategy [{:s}] is not a valid choice.".format(
568 | nan_strategy
569 | )
570 | )
571 |
572 | # identifying categorical columns
573 | columns = df.columns
574 | auto_nominal = False
575 | if nominal_columns is None:
576 | nominal_columns = list()
577 | elif nominal_columns == "all":
578 | nominal_columns = columns.tolist()
579 | elif nominal_columns == "auto":
580 | auto_nominal = True
581 | nominal_columns = identify_nominal_columns(df)
582 |
583 | # selecting rows and columns to be displayed
584 | if hide_rows is not None:
585 | if isinstance(hide_rows, str) or isinstance(hide_rows, int):
586 | hide_rows = [hide_rows] # type: ignore
587 | display_rows = [c for c in df.columns if c not in hide_rows] # type: ignore
588 | else:
589 | if display_rows == "all":
590 | display_rows = columns.tolist()
591 | elif isinstance(display_rows, str) or isinstance(display_rows, int):
592 |             display_rows = [display_rows]  # type: ignore
593 |
594 | if hide_columns is not None:
595 | if isinstance(hide_columns, str) or isinstance(hide_columns, int):
596 | hide_columns = [hide_columns] # type: ignore
597 | display_columns = [c for c in df.columns if c not in hide_columns] # type: ignore
598 | else:
599 | if display_columns == "all":
600 | display_columns = columns.tolist()
601 | elif isinstance(display_columns, str) or isinstance(
602 | display_columns, int
603 | ):
604 | display_columns = [display_columns] # type: ignore
605 |
606 | if (
607 | display_rows is None
608 | or display_columns is None
609 | or len(display_rows) < 1
610 | or len(display_columns) < 1
611 | ):
612 | raise ValueError(
613 | "display_rows and display_columns must have at least one element"
614 | )
615 | displayed_features_set = set.union(set(display_rows), set(display_columns))
616 |
617 | # Adjusting figsize based on the number of features
618 | if figsize is None:
619 | BASE_SIZE = 1.5 # Size multiplier per feature
620 | num_features = len(displayed_features_set)
621 | figsize = (BASE_SIZE * num_features, BASE_SIZE * num_features)
622 |
623 | # convert timestamp columns to numerical columns, so correlation can be performed
624 | datetime_dtypes = [
625 | str(x) for x in df.dtypes if str(x).startswith("datetime64")
626 |     ]  # finding all datetime dtypes (including timezone-aware ones)
627 | if datetime_dtypes:
628 | datetime_cols = identify_columns_by_type(df, datetime_dtypes)
629 | datetime_cols = [c for c in datetime_cols if c not in nominal_columns]
630 | if datetime_cols:
631 | df[datetime_cols] = df[datetime_cols].apply(
632 | lambda col: col.view(np.int64), axis=0
633 | )
634 | if auto_nominal:
635 | nominal_columns = identify_nominal_columns(df)
636 |
637 | # will be used to store associations values
638 | corr = pd.DataFrame(index=columns, columns=columns, dtype=np.float64)
639 |
640 | # this dataframe is used to keep track of invalid association values, which will be placed on top
641 | # of the corr dataframe. It is done for visualization purposes, so the heatmap values will remain
642 | # between -1 and 1
643 | inf_nan = pd.DataFrame(
644 | data=np.zeros_like(corr), columns=columns, index=columns, dtype="object"
645 | )
646 |
647 | # finding single-value columns
648 | single_value_columns_set = set()
649 | for c in displayed_features_set:
650 | if df[c].unique().size == 1:
651 | single_value_columns_set.add(c)
652 |
653 | # find the number of physical cpu cores available
654 | n_cores = cpu_count(logical=False)
655 |
656 | # current multiprocessing implementation performs worse on 2 cores than on 1 core,
657 | # so we only use multiprocessing if there are more than 2 physical cores available
658 | if multiprocessing and n_cores > 2:
659 | # find out the list of cartesian products of the column indices
660 | number_of_columns = len(columns)
661 | list_of_indices_pairs_lists = [
662 | (i, j)
663 | for i in range(number_of_columns)
664 | for j in range(number_of_columns)
665 | ]
666 |
667 | # do not exceed 32 cores under any circumstances
668 | if max_cpu_cores is not None:
669 | max_cpu_cores = min(32, min(max_cpu_cores, n_cores))
670 | else:
671 | max_cpu_cores = min(32, n_cores)
672 |
673 | # submit each list of cartesian products of column indices to separate processes
674 | # for faster computation.
675 | # process 1 receives: [(0, 0), (0, 1), (0, 2), ... (0, n)]
676 | # process 2 receives: [(1, 0), (1, 1), (1, 2), ... (1, n)]
677 | # ...
678 | # process m receives: [(n, 0), (n, 1), (n, 2), ... (n, n)]
679 | # where, n = num_columns - 1
680 | with cf.ProcessPoolExecutor(max_workers=max_cpu_cores) as executor:
681 | results = executor.map(
682 | _compute_associations,
683 | list_of_indices_pairs_lists,
684 | repeat(df),
685 | repeat(displayed_features_set),
686 | repeat(single_value_columns_set),
687 | repeat(nominal_columns),
688 | repeat(symmetric_nom_nom),
689 | repeat(nom_nom_assoc),
690 | repeat(cramers_v_bias_correction),
691 | repeat(num_num_assoc),
692 | repeat(nom_num_assoc),
693 | repeat(symmetric_num_num),
694 | repeat(nan_strategy),
695 | chunksize=max(
696 | 1, len(list_of_indices_pairs_lists) // max_cpu_cores
697 | ),
698 | )
699 | else:
700 | results: Iterable[Tuple] = []
701 |
702 | for i in range(0, len(columns)):
703 | for j in range(i, len(columns)):
704 | results.append(
705 | _compute_associations(
706 | (i, j),
707 | df,
708 | displayed_features_set,
709 | single_value_columns_set,
710 | nominal_columns,
711 | symmetric_nom_nom,
712 | nom_nom_assoc,
713 | cramers_v_bias_correction,
714 | num_num_assoc,
715 | nom_num_assoc,
716 | symmetric_num_num,
717 | nan_strategy,
718 | )
719 | )
720 |
721 | # fill the correlation dataframe with the results
722 | for result in results:
723 | try:
724 | if result[0] == _NO_OP:
725 | pass
726 | elif result[0] == _SINGLE_VALUE_COLUMN_OP:
727 | i = result[1]
728 | corr.loc[:, columns[i]] = 0.0
729 | corr.loc[columns[i], :] = 0.0
730 | elif result[0] == _I_EQ_J_OP:
731 | i, j = result[1:]
732 | corr.loc[columns[i], columns[j]] = 1.0
733 | else:
734 | # assoc_op
735 | i, j, ij, ji = result[1:]
736 | corr.loc[columns[i], columns[j]] = (
737 | ij if not np.isnan(ij) and abs(ij) < np.inf else 0.0
738 | )
739 | corr.loc[columns[j], columns[i]] = (
740 | ji if not np.isnan(ji) and abs(ji) < np.inf else 0.0
741 | )
742 | inf_nan.loc[columns[i], columns[j]] = _inf_nan_str(ij)
743 | inf_nan.loc[columns[j], columns[i]] = _inf_nan_str(ji)
744 | except Exception as exception:
745 | raise exception
746 |
747 | corr.fillna(value=np.nan, inplace=True)
748 |
749 | if clustering:
750 | corr, _ = cluster_correlations(corr) # type: ignore
751 | inf_nan = inf_nan.reindex(columns=corr.columns).reindex(
752 | index=corr.index
753 | )
754 |
755 |     # rearrange displayed rows and columns according to the clustered order
756 | display_columns = [c for c in corr.columns if c in display_columns]
757 | display_rows = [c for c in corr.index if c in display_rows]
758 |
759 | # keep only displayed columns and rows
760 | corr: pd.DataFrame = corr.loc[display_rows, display_columns] # type: ignore
761 | inf_nan = inf_nan.loc[display_rows, display_columns] # type: ignore
762 |
763 | if mark_columns:
764 |
765 | def mark(col):
766 | return (
767 | "{} (nom)".format(col)
768 | if col in nominal_columns
769 | else "{} (con)".format(col)
770 | )
771 |
772 | corr.columns = [mark(col) for col in corr.columns]
773 | corr.index = [mark(col) for col in corr.index] # type: ignore
774 | inf_nan.columns = corr.columns
775 | inf_nan.index = corr.index
776 | single_value_columns_set = {
777 | mark(col) for col in single_value_columns_set
778 | }
779 | display_rows = [mark(col) for col in display_rows]
780 | display_columns = [mark(col) for col in display_columns]
781 |
782 | if not compute_only:
783 | for v in [
784 | "corr",
785 | "inf_nan",
786 | "single_value_columns_set",
787 | "display_rows",
788 | "display_columns",
789 | "displayed_features_set",
790 | "nominal_columns",
791 | "figsize",
792 | "vmin",
793 | "vmax",
794 | "cbar",
795 | "cmap",
796 | "sv_color",
797 | "fmt",
798 | "annot",
799 | "title",
800 | ]:
801 | _ASSOC_PLOT_PARAMS[v] = locals()[v]
802 | ax = _plot_associations(ax, filename, plot, **_ASSOC_PLOT_PARAMS)
803 | return {"corr": corr, "ax": ax} # type: ignore
804 |
805 |
806 | def replot_last_associations(
807 | ax: Optional[plt.Axes] = None,
808 | figsize: Optional[Tuple[int, int]] = None,
809 | annot: Optional[bool] = None,
810 | fmt: Optional[str] = None,
811 | cmap: Optional[Colormap] = None,
812 | sv_color: Optional[str] = None,
813 | cbar: Optional[bool] = None,
814 | vmax: Optional[float] = None,
815 | vmin: Optional[float] = None,
816 | plot: bool = True,
817 | title: Optional[str] = None,
818 | filename: Optional[str] = None,
819 | ) -> plt.Axes:
820 | """
821 | Re-plot last computed associations heat-map. This method performs no new computations, but only allows
822 | to change the visual output of the last computed heat-map.
823 |
824 | Parameters:
825 | -----------
826 | ax : matplotlib ax, default = None
827 | Matplotlib Axis on which the heat-map will be plotted
828 | figsize : (int,int) or None, default = None
829 | A Matplotlib figure-size tuple. If `None`, uses the last `associations` call value.
830 | Only used if `ax=None`.
831 | annot : Boolean or None, default = None
832 | Plot number annotations on the heat-map. If `None`, uses the last `associations` call value.
833 | fmt : string, default = None
834 | String formatting of annotations. If `None`, uses the last `associations` call value.
835 | cmap : Matplotlib colormap or None, default = None
836 | A colormap to be used for the heat-map. If `None`, uses the last `associations` call value.
837 | sv_color : string, default = None
838 | A Matplotlib color. The color to be used when displaying single-value.
839 | If `None`, uses the last `associations` call value.
840 | cbar : Boolean or None, default = None
841 | Display heat-map's color-bar. If `None`, uses the last `associations` call value.
842 | vmax : float or None, default = None
843 | Set heat-map vmax option. If `None`, uses the last `associations` call value.
844 | vmin : float or None, default = None
845 | Set heat-map vmin option. If `None`, uses the last `associations` call value.
846 | plot : Boolean, default = True
847 | Plot a heat-map of the correlation matrix. If False, plotting still
848 | happens, but the heat-map will not be displayed.
849 | title : string or None, default = None
850 | Plotted graph title. If `None`, uses the last `associations` call value.
851 | filename : string or None, default = None
852 | If not None, plot will be saved to the given file name. Note: in order to avoid accidental file
853 | overwrites, the last `associations` call value is never used, and when filename is set to None,
854 | no writing to file occurs.
855 |
856 | Returns:
857 | --------
858 | A Matplotlib `Axes`
859 | """
860 | if not bool(_ASSOC_PLOT_PARAMS):
861 | raise RuntimeError("No associations found to replot.")
862 | new_vars = locals()
863 | new_vars.pop("filename")
864 | new_vars.pop("ax")
865 | new_vars.pop("plot")
866 | plot_vars = _ASSOC_PLOT_PARAMS.copy()
867 |     for v in new_vars:
868 |         plot_vars[v] = plot_vars[v] if new_vars[v] is None else new_vars[v]
869 | return _plot_associations(ax, filename, plot, **plot_vars)
870 |
871 |
872 | def _plot_associations(
873 | ax: Optional[plt.Axes],
874 | filename: Optional[str],
875 | plot: bool,
876 | corr: pd.DataFrame,
877 | inf_nan: pd.DataFrame,
878 | single_value_columns_set: Set[str],
879 | display_rows: List[str],
880 | display_columns: List[str],
881 | displayed_features_set: Set[str],
882 | nominal_columns: List[str],
883 | figsize: Tuple[int, int],
884 | vmin: Optional[Number],
885 | vmax: Number,
886 | cbar: bool,
887 | cmap: Colormap,
888 | sv_color: str,
889 | fmt: str,
890 | annot: bool,
891 | title: str,
892 | ) -> plt.Axes:
893 | if ax is None:
894 | plt.figure(figsize=figsize)
895 | if inf_nan.any(axis=None):
896 | inf_nan_mask = np.vectorize(lambda x: not bool(x))(inf_nan.values)
897 | ax = sns.heatmap(
898 | inf_nan_mask,
899 | cmap=["white"],
900 | annot=inf_nan if annot else None,
901 | fmt="",
902 | center=0,
903 | square=True,
904 | ax=ax,
905 | mask=inf_nan_mask,
906 | cbar=False,
907 | )
908 | else:
909 | inf_nan_mask = np.ones_like(corr)
910 | if len(single_value_columns_set) > 0:
911 | sv = pd.DataFrame(
912 | data=np.zeros_like(corr),
913 | columns=corr.columns,
914 | index=corr.index,
915 | dtype="object",
916 | )
917 | for c in single_value_columns_set:
918 | if c in display_rows and c in display_columns:
919 | sv.loc[:, c] = " "
920 | sv.loc[c, :] = " "
921 | sv.loc[c, c] = "SV"
922 | elif c in display_rows:
923 | sv.loc[c, :] = " "
924 | sv.loc[c, sv.columns[0]] = "SV"
925 | else: # c in display_columns
926 | sv.loc[:, c] = " "
927 | sv.loc[sv.index[-1], c] = "SV"
928 | sv_mask = np.vectorize(lambda x: not bool(x))(sv.values)
929 | ax = sns.heatmap(
930 | sv_mask,
931 | cmap=[sv_color],
932 | annot=sv if annot else None,
933 | fmt="",
934 | center=0,
935 | square=True,
936 | ax=ax,
937 | mask=sv_mask,
938 | cbar=False,
939 | )
940 | else:
941 | sv_mask = np.ones_like(corr)
942 | mask = np.vectorize(lambda x: not bool(x))(inf_nan_mask) + np.vectorize(
943 | lambda x: not bool(x)
944 | )(sv_mask)
945 | vmin = vmin or (
946 | -1.0 if len(displayed_features_set) - len(nominal_columns) >= 2 else 0.0
947 | )
948 | ax = sns.heatmap(
949 | corr,
950 | cmap=cmap,
951 | annot=annot,
952 | fmt=fmt,
953 | center=0,
954 | vmax=vmax,
955 | vmin=vmin,
956 | square=True,
957 | mask=mask,
958 | ax=ax,
959 | cbar=cbar,
960 | )
961 | plt.title(title)
962 | if filename:
963 | plt.savefig(filename)
964 | plot_or_not(plot)
965 | return ax
966 |
967 |
968 | def _handling_category_for_nan_imputation(
969 | dataset: pd.DataFrame, nan_replace_value: Any
970 | ) -> pd.DataFrame:
971 | pd_categorical_columns = identify_columns_by_type(
972 | dataset, include=["category"]
973 | )
974 | if pd_categorical_columns:
975 | for col in pd_categorical_columns:
976 | if isinstance(nan_replace_value, pd.DataFrame):
977 | values_ = nan_replace_value[col].unique().tolist()
978 | values = [
979 | x for x in values_ if x not in dataset[col].cat.categories
980 | ]
981 | dataset[col] = dataset[col].cat.add_categories(values)
982 | else:
983 | if isinstance(nan_replace_value, dict):
984 | value = nan_replace_value[col]
985 | else:
986 | value = nan_replace_value
987 |                 if value not in dataset[col].cat.categories:
988 | dataset[col] = dataset[col].cat.add_categories(value)
989 | return dataset
990 |
991 |
992 | def _nom_num(
993 | nom_column: OneDimArray,
994 | num_column: OneDimArray,
995 | nom_num_assoc: Union[Callable, NomNumAssocStr],
996 | ) -> Tuple[Number, Number]:
997 | """
998 | Computes the nominal-numerical association value.
999 | """
1000 | if callable(nom_num_assoc):
1001 | cell = nom_num_assoc(nom_column, num_column)
1002 | ij = cell
1003 | ji = cell
1004 | elif nom_num_assoc == "correlation_ratio":
1005 | cell = correlation_ratio(nom_column, num_column, nan_strategy=_SKIP)
1006 | ij = cell
1007 | ji = cell
1008 | else:
1009 | raise ValueError(
1010 | f"{nom_num_assoc} is not a supported nominal-numerical association"
1011 | )
1012 | return ij, ji
1013 |
1014 |
1015 | def _compute_associations(
1016 | indices_pair: Tuple[int, int],
1017 | dataset: pd.DataFrame,
1018 | displayed_features_set: Set[str],
1019 | single_value_columns_set: Set[str],
1020 | nominal_columns: Union[OneDimArray, List[str], str],
1021 | symmetric_nom_nom: bool,
1022 | nom_nom_assoc: Union[
1023 | NomNomAssocStr, Callable[[pd.Series, pd.Series], Number]
1024 | ],
1025 | cramers_v_bias_correction: bool,
1026 | num_num_assoc: Union[
1027 | NumNumAssocStr, Callable[[pd.Series, pd.Series], Number]
1028 | ],
1029 | nom_num_assoc: Union[
1030 | NomNumAssocStr, Callable[[pd.Series, pd.Series], Number]
1031 | ],
1032 | symmetric_num_num: bool,
1033 | nan_strategy: str,
1034 | ) -> Tuple:
1035 | """
1036 | Helper function of associations.
1037 |
1038 | Parameters:
1039 | -----------
1040 | indices_pair: Tuple[int, int]
1041 | The tuple of indices pairs (i, j)
1042 |     dataset: pandas.DataFrame
1043 |         The pandas DataFrame holding the data
1044 | displayed_features_set: Set[str]
1045 | The set of { display_rows } ∪ { display_columns }
1046 | single_value_columns_set: Set[str]
1047 | The set of single-value columns
1048 | nominal_columns : string / list / NumPy ndarray, default = 'auto'
1049 | Names of columns of the data-set which hold categorical values. Can
1050 | also be the string 'all' to state that all columns are categorical,
1051 | 'auto' (default) to try to identify nominal columns, or None to state
1052 | none are categorical. Only used if `numerical_columns` is `None`.
1053 | symmetric_nom_nom : Boolean, default = True
1054 | Relevant only if `nom_nom_assoc` is a callable. Declare whether the function is symmetric (f(x,y) = f(y,x)).
1055 | If False, heat-map values should be interpreted as f(row,col)
1056 | nom_nom_assoc : callable / string, default = 'cramer'
1057 |         If callable, a function which receives two `pd.Series` and returns a single number.
1058 | If string, name of nominal-nominal (categorical-categorical) association to use.
1059 |         Options are 'cramer' for Cramer's V or 'theil' for Theil's U. If 'theil',
1060 | heat-map columns are the provided information (U = U(row|col)).
1061 | num_num_assoc : callable / string, default = 'pearson'
1062 |         If callable, a function which receives two `pd.Series` and returns a single number.
1063 | If string, name of numerical-numerical association to use. Options are 'pearson'
1064 | for Pearson's R, 'spearman' for Spearman's R, 'kendall' for Kendall's Tau.
1065 | nom_num_assoc : callable / string, default = 'correlation_ratio'
1066 |         If callable, a function which receives two `pd.Series` and returns a single number.
1067 | If string, name of nominal-numerical association to use. Options are 'correlation_ratio'
1068 | for correlation ratio.
1069 | symmetric_num_num : Boolean, default = True
1070 | Relevant only if `num_num_assoc` is a callable. Declare whether the function is symmetric (f(x,y) = f(y,x)).
1071 | If False, heat-map values should be interpreted as f(row,col)
1072 | cramers_v_bias_correction : Boolean, default = True
1073 | Use bias correction for Cramer's V from Bergsma and Wicher,
1074 | Journal of the Korean Statistical Society 42 (2013): 323-328.
1075 | nan_strategy: string
1076 | The provided nan_strategy to associations
1077 |
1078 | Returns:
1079 | --------
1080 |     A tuple with one of the following strings in the
1081 |     0-th index:
1082 |     * _NO_OP
1083 |     * _SINGLE_VALUE_COLUMN_OP
1084 |     * _I_EQ_J_OP
1085 |     * _ASSOC_OP
1086 |     Then, additionally, it can hold the relevant indices and association values.
1087 | """
1088 | columns = dataset.columns
1089 |
1090 | i, j = indices_pair
1091 | if columns[i] not in displayed_features_set:
1092 | return (_NO_OP, None)
1093 | if columns[i] in single_value_columns_set:
1094 | return (_SINGLE_VALUE_COLUMN_OP, i)
1095 |
1096 | if (
1097 | columns[j] in single_value_columns_set
1098 | or columns[j] not in displayed_features_set
1099 | ):
1100 | return (_NO_OP, None)
1101 | elif i == j:
1102 | return (_I_EQ_J_OP, i, j)
1103 | else:
1104 | if nan_strategy in [
1105 | _DROP_SAMPLE_PAIRS,
1106 | ]:
1107 | dataset_c_ij = dataset[[columns[i], columns[j]]].dropna(axis=0)
1108 | c_i, c_j = dataset_c_ij[columns[i]], dataset_c_ij[columns[j]]
1109 | else:
1110 | c_i, c_j = dataset[columns[i]], dataset[columns[j]]
1111 | if columns[i] in nominal_columns:
1112 | if columns[j] in nominal_columns:
1113 | if callable(nom_nom_assoc):
1114 | if symmetric_nom_nom:
1115 | cell = nom_nom_assoc(c_i, c_j)
1116 | ij = cell
1117 | ji = cell
1118 | else:
1119 | ij = nom_nom_assoc(c_i, c_j)
1120 | ji = nom_nom_assoc(c_j, c_i)
1121 | elif nom_nom_assoc == "theil":
1122 | ij = theils_u(
1123 | c_i,
1124 | c_j,
1125 | nan_strategy=_SKIP,
1126 | )
1127 | ji = theils_u(
1128 | c_j,
1129 | c_i,
1130 | nan_strategy=_SKIP,
1131 | )
1132 | elif nom_nom_assoc == "cramer":
1133 | cell = cramers_v(
1134 | c_i,
1135 | c_j,
1136 | bias_correction=cramers_v_bias_correction,
1137 | nan_strategy=_SKIP,
1138 | )
1139 | ij = cell
1140 | ji = cell
1141 | else:
1142 | raise ValueError(
1143 | f"{nom_nom_assoc} is not a supported nominal-nominal association"
1144 | )
1145 | else:
1146 | ij, ji = _nom_num(
1147 | nom_column=c_i, num_column=c_j, nom_num_assoc=nom_num_assoc
1148 | )
1149 | else:
1150 | if columns[j] in nominal_columns:
1151 | ij, ji = _nom_num(
1152 | nom_column=c_j, num_column=c_i, nom_num_assoc=nom_num_assoc
1153 | )
1154 | else:
1155 | if callable(num_num_assoc):
1156 | if symmetric_num_num:
1157 | cell = num_num_assoc(c_i, c_j)
1158 | ij = cell
1159 | ji = cell
1160 | else:
1161 | ij = num_num_assoc(c_i, c_j)
1162 | ji = num_num_assoc(c_j, c_i)
1163 | else:
1164 | if num_num_assoc == "pearson":
1165 | cell, _ = ss.pearsonr(c_i, c_j)
1166 | elif num_num_assoc == "spearman":
1167 | cell, _ = ss.spearmanr(c_i, c_j)
1168 | elif num_num_assoc == "kendall":
1169 | cell, _ = ss.kendalltau(c_i, c_j)
1170 | else:
1171 | raise ValueError(
1172 | f"{num_num_assoc} is not a supported numerical-numerical association"
1173 | )
1174 | ij = cell
1175 | ji = cell
1176 |
1177 | return (_ASSOC_OP, i, j, ij, ji)
1178 |
1179 |
1180 | def numerical_encoding(
1181 | dataset: TwoDimArray,
1182 | nominal_columns: Optional[
1183 | Union[List[str], Literal["all", "auto"]]
1184 | ] = "auto",
1185 | drop_single_label: bool = False,
1186 | drop_fact_dict: bool = True,
1187 | nan_strategy: str = _REPLACE,
1188 | nan_replace_value: Any = _DEFAULT_REPLACE_VALUE,
1189 | ) -> Union[pd.DataFrame, Tuple[pd.DataFrame, Dict]]:
1190 | """
1191 | Encoding a data-set with mixed data (numerical and categorical) to a
1192 | numerical-only data-set using the following logic:
1193 | * categorical with only a single value will be marked as zero (or dropped,
1194 | if requested)
1195 | * categorical with two values will be replaced with the result of Pandas
1196 | `factorize`
1197 | * categorical with more than two values will be replaced with the result
1198 | of Pandas `get_dummies`
1199 | * numerical columns will not be modified
1200 |
1201 | Parameters:
1202 | -----------
1203 | dataset : NumPy ndarray / Pandas DataFrame
1204 | The data-set to encode
1205 |     nominal_columns : None / sequence / string, default = 'auto'
1206 |         A sequence of the nominal (categorical) columns in the dataset. If
1207 |         string, must be 'all' or 'auto'. If 'all', all columns are treated as nominal.
1208 |         If 'auto', categorical columns will be identified
1209 |         based on dtype. If None, nothing happens.
1210 | drop_single_label : Boolean, default = False
1211 |         If True, nominal columns with only a single value will be dropped.
1212 | drop_fact_dict : Boolean, default = True
1213 | If True, the return value will be the encoded DataFrame alone. If
1214 | False, it will be a tuple of the DataFrame and the dictionary of the
1215 | binary factorization (originating from pd.factorize)
1216 | nan_strategy : string, default = 'replace'
1217 | How to handle missing values: can be either 'drop_samples' to remove
1218 | samples with missing values, 'drop_features' to remove features
1219 | (columns) with missing values, or 'replace' to replace all missing
1220 | values with the nan_replace_value. Missing values are None and np.nan.
1221 | nan_replace_value : any, default = 0.0
1222 |         The value used to replace missing values with. Only applicable when
1223 |         nan_strategy is set to 'replace'
1224 |
1225 | Returns:
1226 | --------
1227 | DataFrame or (DataFrame, dict). If `drop_fact_dict` is True,
1228 | returns the encoded DataFrame.
1229 |     Else, returns a tuple of the encoded DataFrame and dictionary, where each
1230 | key is a two-value column, and the value is the original labels, as
1231 | supplied by Pandas `factorize`. Will be empty if no two-value columns are
1232 | present in the data-set
1233 | """
1234 | df: pd.DataFrame = convert(dataset, "dataframe") # type: ignore
1235 | if nan_strategy == _REPLACE:
1236 | df.fillna(nan_replace_value, inplace=True)
1237 | elif nan_strategy == _DROP_SAMPLES:
1238 | df.dropna(axis=0, inplace=True)
1239 | elif nan_strategy == _DROP_FEATURES:
1240 | df.dropna(axis=1, inplace=True)
1241 | if nominal_columns is None:
1242 | return df
1243 | elif nominal_columns == "all":
1244 | nominal_columns = df.columns.tolist()
1245 | elif nominal_columns == "auto":
1246 | nominal_columns = identify_nominal_columns(df)
1247 | converted_dataset = pd.DataFrame()
1248 | binary_columns_dict = dict()
1249 | for col in df.columns:
1250 | if col not in nominal_columns:
1251 | converted_dataset.loc[:, col] = df[col]
1252 | else:
1253 | unique_values = pd.unique(df[col])
1254 | if len(unique_values) == 1 and not drop_single_label:
1255 | converted_dataset.loc[:, col] = 0
1256 | elif len(unique_values) == 2:
1257 | (
1258 | converted_dataset.loc[:, col],
1259 | binary_columns_dict[col],
1260 | ) = pd.factorize(df[col])
1261 | else:
1262 | dummies = pd.get_dummies(df[col], prefix=col)
1263 | converted_dataset = pd.concat(
1264 | [converted_dataset, dummies], axis=1
1265 | )
1266 | if drop_fact_dict:
1267 | return converted_dataset
1268 | else:
1269 | return converted_dataset, binary_columns_dict
1270 |
1271 |
1272 | def cluster_correlations(
1273 | corr_mat: TwoDimArray, indices: Optional[ArrayLike] = None
1274 | ) -> Tuple[TwoDimArray, ArrayLike]:
1275 | """
1276 | Apply agglomerative clustering in order to sort
1277 | a correlation matrix.
1278 |
1279 | Based on https://github.com/TheLoneNut/CorrelationMatrixClustering/blob/master/CorrelationMatrixClustering.ipynb
1280 |
1281 | Parameters:
1282 | -----------
1283 | - corr_mat : a square correlation matrix (pandas DataFrame)
1284 | - indices : cluster labels [None]; if not provided, agglomerative
1285 | clustering is applied to compute the cluster labels.
1286 |
1287 | Returns:
1288 | --------
1289 | - corr : a sorted correlation matrix
1290 | - indices : cluster indexes based on the original dataset
1291 |
1292 | Example:
1293 | --------
1294 | >>> assoc = associations(
1295 | ... iris_df,
1296 | ... plot=False
1297 | ... )
1298 | >>> correlations = assoc['corr']
1299 | >>> correlations, _ = cluster_correlations(correlations)
1300 | """
1301 | df: pd.DataFrame = convert(corr_mat, "dataframe") # type: ignore
1302 | if indices is None:
1303 | X = df.values
1304 | d = sch.distance.pdist(X)
1305 | L = sch.linkage(d, method="complete")
1306 | ind: ArrayLike = sch.fcluster(L, 0.5 * d.max(), "distance") # type: ignore
1307 | else:
1308 | ind = indices
1309 |
1310 | columns = [df.columns.tolist()[i] for i in np.argsort(ind)]
1311 | df = df.reindex(columns=columns).reindex(index=columns)
1312 |
1313 | if isinstance(corr_mat, np.ndarray):
1314 | return df.to_numpy(), ind
1315 | else:
1316 | return df, ind
1317 |
--------------------------------------------------------------------------------
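
A minimal usage sketch of the two routines above, assuming the encoding function carries its public name `numerical_encoding` (per docs/modules/nominal.md) and reusing the iris setup from this repo's tests:

    import pandas as pd
    from sklearn import datasets
    from dython.nominal import numerical_encoding, cluster_correlations, associations

    # Mixed-type frame: four numerical features plus one nominal column
    iris = datasets.load_iris()
    df = pd.DataFrame(iris.data, columns=iris.feature_names)
    df["species"] = iris.target_names[iris.target]

    # 'auto' detects nominal columns by dtype; two-value columns are factorized,
    # columns with more values are one-hot encoded via pd.get_dummies
    encoded, factorization_dict = numerical_encoding(
        df, nominal_columns="auto", drop_fact_dict=False
    )

    # Sort an association matrix by agglomerative clusters
    corr = associations(df, plot=False)["corr"]
    sorted_corr, cluster_indices = cluster_correlations(corr)
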
/dython/sampling.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | from typing import Union, List, Optional
3 | from numpy.typing import NDArray
4 | from .typing import Number, OneDimArray
5 |
6 |
7 | __all__ = ["boltzmann_sampling", "weighted_sampling"]
8 |
9 |
10 | def _w_sampling(
11 | numbers: OneDimArray, k: Optional[int], with_replacement: bool, force_to_list: bool
12 | ) -> Union[Number, OneDimArray]:
13 | sampled = np.random.choice(numbers, size=k, replace=with_replacement)
14 | if (isinstance(numbers, list) or force_to_list) and k is not None:
15 | sampled = sampled.tolist()
16 | return sampled
17 |
18 |
19 | def weighted_sampling(
20 | numbers: OneDimArray, k: Optional[int] = 1, with_replacement: bool = False
21 | ) -> Union[Number, OneDimArray]:
22 | """
23 | Return k numbers from a weighted sampling over the supplied numbers
24 |
25 | Parameters:
26 | -----------
27 | numbers : List or np.ndarray
28 | numbers to sample
29 | k : int, default = 1
30 | How many numbers to sample. Choosing `k=None` will yield a single
31 | number
32 | with_replacement : Boolean, default = False
33 | Allow replacement or not
34 |
35 | Returns:
36 | --------
37 | List, np.ndarray or a single number (depending on the input)
38 | """
39 | return _w_sampling(numbers, k, with_replacement, force_to_list=False)
40 |
41 |
42 | def boltzmann_sampling(
43 | numbers: OneDimArray, k: Optional[int] = 1, with_replacement: bool = False
44 | ) -> Union[Number, OneDimArray]:
45 | """
46 | Return k numbers from a Boltzmann sampling over the supplied numbers
47 |
48 | Parameters:
49 | -----------
50 | numbers : List or np.ndarray
51 | numbers to sample
52 | k : int, default = 1
53 | How many numbers to sample. Choosing `k=None` will yield a single
54 | number
55 | with_replacement : Boolean, default = False
56 | Allow replacement or not
57 |
58 | Returns:
59 | --------
60 | List, np.ndarray or a single number (depending on the input)
61 | """
62 | # Rescale the inputs through a softmax transform:
63 | # exp(x_i) / sum_j exp(x_j)
64 | exp_numbers = np.exp(numbers)
65 | exp_sum = exp_numbers.sum()
66 | b_numbers = exp_numbers / exp_sum
67 | return _w_sampling(
68 | b_numbers,
69 | k=k,
70 | with_replacement=with_replacement,
71 | force_to_list=isinstance(numbers, list),
72 | )
73 |
--------------------------------------------------------------------------------
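
A short usage sketch of the two samplers above. `boltzmann_sampling` first rescales the inputs through a softmax, exp(x_i) / sum_j exp(x_j), and then samples from the rescaled values:

    import numpy as np
    from dython.sampling import weighted_sampling, boltzmann_sampling

    population = np.array([1.0, 2.0, 3.0, 4.0, 5.0])

    # Three values sampled without replacement
    print(weighted_sampling(population, k=3, with_replacement=False))

    # Two softmax-rescaled values; a list input would return a list
    print(boltzmann_sampling(population, k=2))

    # k=None yields a single number rather than an array
    print(weighted_sampling(population, k=None))
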
/dython/typing.py:
--------------------------------------------------------------------------------
1 | from pandas import DataFrame, Series
2 | from typing import List, Union
3 | from numpy.typing import NDArray
4 |
5 |
6 | Number = Union[int, float]
7 | OneDimArray = Union[List[Number], NDArray, Series]
8 | TwoDimArray = Union[NDArray, DataFrame]
9 |
--------------------------------------------------------------------------------
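
These aliases are what the signatures throughout this repo refer to. A tiny illustration (the `first_value` helper is hypothetical, for demonstration only):

    import numpy as np
    import pandas as pd
    from dython.typing import Number, OneDimArray

    def first_value(values: OneDimArray) -> Number:
        # OneDimArray covers a list of numbers, a NumPy array, or a pandas Series
        return values[0]

    first_value([1, 2, 3])             # List[Number]
    first_value(np.array([1.0, 2.0]))  # NDArray
    first_value(pd.Series([4, 5, 6]))  # Series
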
/logos/README.md:
--------------------------------------------------------------------------------
1 | Logos were made using [hatchful](https://hatchful.shopify.com/).
2 |
--------------------------------------------------------------------------------
/logos/dython_300x200.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/shakedzy/dython/152c3db0766e8358c7ae0ed257ecc7d47c1f2e8c/logos/dython_300x200.png
--------------------------------------------------------------------------------
/logos/facebook_cover_photo_1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/shakedzy/dython/152c3db0766e8358c7ae0ed257ecc7d47c1f2e8c/logos/facebook_cover_photo_1.png
--------------------------------------------------------------------------------
/logos/facebook_cover_photo_2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/shakedzy/dython/152c3db0766e8358c7ae0ed257ecc7d47c1f2e8c/logos/facebook_cover_photo_2.png
--------------------------------------------------------------------------------
/logos/facebook_profile_image.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/shakedzy/dython/152c3db0766e8358c7ae0ed257ecc7d47c1f2e8c/logos/facebook_profile_image.png
--------------------------------------------------------------------------------
/logos/favicon.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/shakedzy/dython/152c3db0766e8358c7ae0ed257ecc7d47c1f2e8c/logos/favicon.png
--------------------------------------------------------------------------------
/logos/instagram_profile_image.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/shakedzy/dython/152c3db0766e8358c7ae0ed257ecc7d47c1f2e8c/logos/instagram_profile_image.png
--------------------------------------------------------------------------------
/logos/linkedin_banner_image_1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/shakedzy/dython/152c3db0766e8358c7ae0ed257ecc7d47c1f2e8c/logos/linkedin_banner_image_1.png
--------------------------------------------------------------------------------
/logos/linkedin_banner_image_2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/shakedzy/dython/152c3db0766e8358c7ae0ed257ecc7d47c1f2e8c/logos/linkedin_banner_image_2.png
--------------------------------------------------------------------------------
/logos/linkedin_profile_image.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/shakedzy/dython/152c3db0766e8358c7ae0ed257ecc7d47c1f2e8c/logos/linkedin_profile_image.png
--------------------------------------------------------------------------------
/logos/logo.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/shakedzy/dython/152c3db0766e8358c7ae0ed257ecc7d47c1f2e8c/logos/logo.png
--------------------------------------------------------------------------------
/logos/logo_transparent.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/shakedzy/dython/152c3db0766e8358c7ae0ed257ecc7d47c1f2e8c/logos/logo_transparent.png
--------------------------------------------------------------------------------
/logos/pinterest_board_photo.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/shakedzy/dython/152c3db0766e8358c7ae0ed257ecc7d47c1f2e8c/logos/pinterest_board_photo.png
--------------------------------------------------------------------------------
/logos/pinterest_profile_image.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/shakedzy/dython/152c3db0766e8358c7ae0ed257ecc7d47c1f2e8c/logos/pinterest_profile_image.png
--------------------------------------------------------------------------------
/logos/twitter_header_photo_1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/shakedzy/dython/152c3db0766e8358c7ae0ed257ecc7d47c1f2e8c/logos/twitter_header_photo_1.png
--------------------------------------------------------------------------------
/logos/twitter_header_photo_2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/shakedzy/dython/152c3db0766e8358c7ae0ed257ecc7d47c1f2e8c/logos/twitter_header_photo_2.png
--------------------------------------------------------------------------------
/logos/twitter_profile_image.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/shakedzy/dython/152c3db0766e8358c7ae0ed257ecc7d47c1f2e8c/logos/twitter_profile_image.png
--------------------------------------------------------------------------------
/logos/youtube_profile_image.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/shakedzy/dython/152c3db0766e8358c7ae0ed257ecc7d47c1f2e8c/logos/youtube_profile_image.png
--------------------------------------------------------------------------------
/mkdocs.yml:
--------------------------------------------------------------------------------
1 | site_name: dython
2 | site_description: A set of data-analysis tools for Python 3.x
3 | site_long_description: Dython is a set of data-analysis tools written in Python 3.x, which can help you gain more insights into your data. Dython was designed with analysis usage in mind - meaning ease-of-use, functionality and readability are the core values of this library.
4 | site_author: Shaked Zychlinski
5 | site_url: http://shakedzy.xyz/dython/
6 | repo_name: shakedzy/dython
7 | repo_url: https://github.com/shakedzy/dython
8 | copyright: Copyright © Shaked Zychlinski
9 | theme:
10 | name: material
11 | custom_dir: docs/overrides
12 | palette:
13 | - media: "(prefers-color-scheme: light)"
14 | scheme: default
15 | primary: light blue
16 | accent: blue
17 | toggle:
18 | icon: material/toggle-switch-off-outline
19 | name: Switch to dark mode
20 | - media: "(prefers-color-scheme: dark)"
21 | scheme: slate
22 | primary: teal
23 | accent: cyan
24 | toggle:
25 | icon: material/toggle-switch
26 | name: Switch to light mode
27 | logo: images/favicon.png
28 | favicon: images/favicon.png
29 | nav:
30 | - Home: 'index.md'
31 | - Getting Started:
32 | - Installation: 'getting_started/installation.md'
33 | - Examples: 'getting_started/examples.md'
34 | - Modules:
35 | - data_utils: 'modules/data_utils.md'
36 | - nominal: 'modules/nominal.md'
37 | - model_utils: 'modules/model_utils.md'
38 | - sampling: 'modules/sampling.md'
39 | - Related Blogposts: 'related_blogposts.md'
40 | google_analytics:
41 | - UA-141245946-2
42 | - auto
43 | markdown_extensions:
44 | - toc:
45 | permalink: true
46 | - admonition
47 | - codehilite
48 | - pymdownx.arithmatex:
49 | generic: true
50 | - pymdownx.highlight
51 | - pymdownx.superfences
52 | extra_javascript:
53 | - https://cdnjs.cloudflare.com/ajax/libs/mathjax/2.7.0/MathJax.js?config=TeX-MML-AM_CHTML
54 | plugins:
55 | - search
56 | extra:
57 | social:
58 | - icon: material/web
59 | link: https://shakedzy.xyz/
60 | - icon: fontawesome/brands/github
61 | link: https://github.com/shakedzy
62 | - icon: fontawesome/brands/linkedin
63 | link: https://www.linkedin.com/in/shakedzy/
64 | - icon: fontawesome/brands/medium
65 | link: https://shakedzy.medium.com/
--------------------------------------------------------------------------------
/pyproject.toml:
--------------------------------------------------------------------------------
1 | [tool.pytest.ini_options]
2 | addopts = "--doctest-modules --doctest-continue-on-failure"
3 | testpaths = [
4 | "./tests",
5 | "./dython"
6 | ]
7 | doctest_optionflags = "NORMALIZE_WHITESPACE IGNORE_EXCEPTION_DETAIL ELLIPSIS"
8 |
9 | [tool.black]
10 | line-length = 80
11 | target-version = ['py38']
--------------------------------------------------------------------------------
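
Because `--doctest-modules` adds `./dython` to the test paths, the `Example:` blocks in docstrings (such as the one in `cluster_correlations` above) are collected and executed by pytest as doctests. A sketch of how the configured flags behave, using a hypothetical function:

    def double(x: float) -> float:
        """
        >>> double(2.0)
        4.0
        >>> [double(v) for v in (1, 2, 3)]  # ELLIPSIS permits elided output
        [2, ..., 6]
        """
        return 2 * x
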
/pytest.ini:
--------------------------------------------------------------------------------
1 | [tool.pytest-enabler.black]
2 | addopts = "--black"
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | numpy>=1.23.0
2 | pandas>=1.4.2
3 | seaborn>=0.12.0
4 | scipy>=1.7.1
5 | matplotlib>=3.6.0
6 | scikit-learn>=0.24.2
7 | psutil>=5.9.1
8 | setuptools
--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
1 | import pathlib
2 | from setuptools import setup, find_packages
3 |
4 | HERE = pathlib.Path(__file__).parent.resolve()
5 |
6 | PACKAGE_NAME = "dython"
7 | AUTHOR = "Shaked Zychlinski"
8 | AUTHOR_EMAIL = "shakedzy@gmail.com"
9 | URL = "http://shakedzy.xyz/dython"
10 | DOWNLOAD_URL = "https://pypi.org/project/dython/"
11 |
12 | LICENSE = "MIT"
13 | VERSION = (HERE / "VERSION").read_text(encoding="utf8").strip()
14 | DESCRIPTION = "A set of data tools in Python"
15 | LONG_DESCRIPTION = (HERE / "README.md").read_text(encoding="utf8")
16 | LONG_DESC_TYPE = "text/markdown"
17 |
18 | requirements = (HERE / "requirements.txt").read_text(encoding="utf8")
19 | INSTALL_REQUIRES = [s.strip() for s in requirements.split("\n")]
20 |
21 | dev_requirements = (HERE / "dev_requirements.txt").read_text(encoding="utf8")
22 | EXTRAS_REQUIRE = {"dev": [s.strip() for s in dev_requirements.split("\n")]}
23 |
24 | min_minor = 9
25 | max_minor = 12
26 | CLASSIFIERS = [
27 | f"Programming Language :: Python :: 3.{str(v)}" for v in range(min_minor, max_minor+1)
28 | ]
29 | PYTHON_REQUIRES = f">=3.{min_minor}"
30 |
31 | setup(
32 | name=PACKAGE_NAME,
33 | version=VERSION,
34 | description=DESCRIPTION,
35 | long_description=LONG_DESCRIPTION,
36 | long_description_content_type=LONG_DESC_TYPE,
37 | author=AUTHOR,
38 | license=LICENSE,
39 | author_email=AUTHOR_EMAIL,
40 | url=URL,
41 | download_url=DOWNLOAD_URL,
42 | python_requires=PYTHON_REQUIRES,
43 | install_requires=INSTALL_REQUIRES,
44 | extras_require=EXTRAS_REQUIRE,
45 | packages=find_packages(),
46 | classifiers=CLASSIFIERS,
47 | )
48 |
--------------------------------------------------------------------------------
/tests/test_data_utils/test_one_hot_encode.py:
--------------------------------------------------------------------------------
1 | import pytest
2 | from dython.data_utils import one_hot_encode
3 |
4 |
5 | def test_one_hot_encode_check():
6 | lst = [0, 0, 2, 5]
7 | row = len(lst)
8 | col = max(lst) + 1
9 |
10 | result = one_hot_encode(lst)
11 | assert result.shape == (row, col)
12 |
13 |
14 | def test_negative_input():
15 | lst = [-1, -5, 0, 3]
16 |
17 | with pytest.raises(ValueError, match="negative value"):
18 | one_hot_encode(lst)
19 |
20 |
21 | def test_more_than_one_dimension():
22 | lst = [[0, 1], [2, 3]]
23 |
24 | with pytest.raises(ValueError, match="must have only one dimension"):
25 | one_hot_encode(lst)
26 |
--------------------------------------------------------------------------------
/tests/test_data_utils/test_split_hist.py:
--------------------------------------------------------------------------------
1 | import pytest
2 | import matplotlib.pyplot as plt
3 |
4 | from dython.data_utils import split_hist
5 |
6 |
7 | def test_split_hist_check(iris_df):
8 | result = split_hist(iris_df, "sepal length (cm)", "target")
9 |
10 | assert isinstance(result, plt.Axes)
11 |
--------------------------------------------------------------------------------
/tests/test_model_utils/test_ks_abc.py:
--------------------------------------------------------------------------------
1 | import pytest
2 | import numpy as np
3 | import matplotlib
4 |
5 | from dython.model_utils import ks_abc
6 |
7 |
8 | @pytest.fixture
9 | def y_true():
10 | return np.array([0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1])
11 |
12 |
13 | @pytest.fixture
14 | def y_pred():
15 | return np.array([0.0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0])
16 |
17 |
18 | def test_ks_abc_check_types(y_true, y_pred):
19 | result = ks_abc(y_true, y_pred)
20 |
21 | assert isinstance(result, dict), "ks_abc should return dict"
22 |
23 | assert "abc" in result, 'ks_abc should return dict containing "abc" key'
24 | assert isinstance(
25 | result["abc"], float
26 | ), "area between curves should be a float"
27 |
28 | assert (
29 | "ks_stat" in result
30 | ), 'ks_abc should return dict containing "ks_stat" key'
31 | assert isinstance(
32 | result["ks_stat"], float
33 | ), "ks statistic should be a float"
34 |
35 | assert "eopt" in result, 'ks_abc should return dict containing "eopt" key'
36 | assert isinstance(
37 | result["eopt"], float
38 | ), "estimated optimal threshold should be a float"
39 |
40 | assert "ax" in result, 'ks_abc should return dict containing "ax" key'
41 | assert isinstance(result["ax"], matplotlib.axes.Axes)
42 |
43 |
44 | def test_ks_abc_check_known_value(y_true, y_pred):
45 | result = ks_abc(y_true, y_pred)
46 |
47 | assert result["abc"] == pytest.approx(0.55)
48 | assert result["ks_stat"] == pytest.approx(1.0)
49 | assert result["eopt"] == pytest.approx(0.4)
50 |
--------------------------------------------------------------------------------
/tests/test_model_utils/test_metric_graph.py:
--------------------------------------------------------------------------------
1 | import pytest
2 | import numpy as np
3 | import matplotlib
4 |
5 | from dython.model_utils import metric_graph
6 |
7 |
8 | @pytest.fixture
9 | def y_true():
10 | return np.array([0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1])
11 |
12 |
13 | @pytest.fixture
14 | def y_pred():
15 | return np.array([0.0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0])
16 |
17 |
18 | def test_metric_graph_check_types(y_true, y_pred):
19 | result = metric_graph(y_true, y_pred, "roc")
20 |
21 | assert isinstance(result, dict), "metric_graph should return a dict"
22 |
23 | assert "ax" in result, 'metric_graph should return dict containing "ax" key'
24 |
25 | assert isinstance(result["ax"], matplotlib.axes.Axes)
26 |
27 |
28 | def test_metric_graph_bad_metric_parameter(y_true, y_pred):
29 | with pytest.raises(ValueError, match="Invalid metric"):
30 | metric_graph(y_true, y_pred, "bad_metric_param")
31 |
--------------------------------------------------------------------------------
/tests/test_model_utils/test_random_forest_feature_importance.py:
--------------------------------------------------------------------------------
1 | import pytest
2 | from sklearn.ensemble import RandomForestClassifier
3 |
4 | from dython.model_utils import random_forest_feature_importance
5 |
6 |
7 | def test_random_forest_feature_importance_check_types(iris_df):
8 | X = iris_df.drop(["target", "extra"], axis=1)
9 | y = iris_df["target"].values
10 |
11 | clf = RandomForestClassifier(n_estimators=7)
12 | clf.fit(X, y)
13 |
14 | result = random_forest_feature_importance(clf, X.columns)
15 |
16 | assert isinstance(result, list)
17 | assert isinstance(result[0], tuple)
18 | assert isinstance(result[0][0], float)
19 | assert isinstance(result[0][1], str)
20 |
--------------------------------------------------------------------------------
/tests/test_nominal/test_associations.py:
--------------------------------------------------------------------------------
1 | import pytest
2 | import matplotlib
3 | import pandas as pd
4 | import numpy as np
5 | import scipy.stats as ss
6 | from sklearn import datasets
7 | from datetime import datetime, timedelta
8 |
9 | from dython.nominal import associations, correlation_ratio
10 |
11 |
12 | def test_return_type_check(iris_df):
13 | assoc = associations(iris_df)
14 |
15 | assert isinstance(assoc, dict), "associations should return a dict"
16 | assert (
17 | "corr" in assoc
18 | ), 'associations should return a dict containing "corr" key'
19 | assert (
20 | "ax" in assoc
21 | ), 'associations should return a dict containing "ax" key'
22 |
23 | assert isinstance(
24 | assoc["corr"], pd.DataFrame
25 | ), 'assoc["corr"] should be a pandas DataFrame'
26 | assert isinstance(
27 | assoc["ax"], matplotlib.axes.Axes
28 | ), 'assoc["ax"] should be a matplotlib Axes'
29 |
30 |
31 | def test_dimension_check(iris_df):
32 | corr = associations(iris_df)["corr"]
33 | corr_shape = corr.shape
34 | iris_shape = iris_df.shape
35 |
36 | assert corr_shape[0] == corr_shape[1], "association matrix has wrong shape"
37 | assert (
38 | corr_shape[1] == iris_shape[1]
39 | ), "association matrix has different shape from input data"
40 |
41 |
42 | def test_single_value_zero_association(iris_df):
43 | SV_COL = 1
44 | iris_df.iloc[:, SV_COL] = 42
45 |
46 | corr = associations(iris_df)["corr"]
47 |
48 | assert (
49 | corr.iloc[:, SV_COL] == 0
50 | ).all(), "single-value variable should have zero association value"
51 | assert (
52 | corr.iloc[SV_COL, :] == 0
53 | ).all(), "single-value variable should have zero association value"
54 |
55 |
56 | def test_bad_nom_nom_assoc_parameter(iris_df):
57 | with pytest.raises(ValueError, match="is not a supported"):
58 | associations(iris_df, nom_nom_assoc="bad_parameter_name")
59 |
60 |
61 | def test_bad_num_num_assoc_parameter(iris_df):
62 | with pytest.raises(ValueError, match="is not a supported"):
63 | associations(iris_df, num_num_assoc="bad_parameter_name")
64 |
65 |
66 | def test_compute_only_ax_is_none(iris_df):
67 | assoc = associations(iris_df, compute_only=True)
68 |
69 | assert (
70 | assoc["ax"] is None
71 | ), 'associations with compute_only should return a None value for "ax" key'
72 |
73 |
74 | def test_mark_columns(iris_df):
75 | corr = associations(iris_df, mark_columns=True)["corr"]
76 |
77 | assert (
78 | "(con)" in corr.index[0]
79 | ), "first column should contain (con) mark if iris_df is used"
80 |
81 |
82 | def test_udf(iris_df):
83 | def pr(x, y):
84 | return ss.pearsonr(x, y)[0]
85 |
86 | corr1 = associations(
87 | iris_df,
88 | plot=False,
89 | num_num_assoc="pearson",
90 | nom_num_assoc="correlation_ratio",
91 | )["corr"]
92 | corr2 = associations(
93 | iris_df, plot=False, num_num_assoc=pr, nom_num_assoc=correlation_ratio
94 | )["corr"]
95 | assert corr1.compare(
96 | corr2
97 | ).empty, (
98 | "Computation of built-in measures of associations differs from UDFs"
99 | )
100 |
101 |
102 | def test_datetime_data():
103 | dt = datetime(2020, 12, 1)
104 | end = datetime(2020, 12, 2)
105 | step = timedelta(seconds=5)
106 | result = []
107 | while dt < end:
108 | result.append(dt.strftime("%Y-%m-%d %H:%M:%S"))
109 | dt += step
110 |
111 | nums = list(range(len(result)))
112 | df = pd.DataFrame(
113 | {"dates": result, "up": nums, "down": sorted(nums, reverse=True)}
114 | )
115 | df["dates"] = pd.to_datetime(
116 | df["dates"], format="%Y-%m-%d %H:%M:%S"
117 | ) # without this, the column dtype would be object rather than datetime
118 |
119 | correct_corr = pd.DataFrame(
120 | columns=["dates", "up", "down"],
121 | index=["dates", "up", "down"],
122 | data=[[1.0, 1.0, -1.0], [1.0, 1.0, -1.0], [-1.0, -1.0, 1.0]],
123 | )
124 | corr = associations(df, plot=False)["corr"]
125 | assert corr.compare(
126 | correct_corr
127 | ).empty, f"datetime associations are incorrect. Test should have returned an empty dataframe, received: {corr.head()}"
128 |
129 |
130 | def test_category_nan_replace(iris_df):
131 | iris_df["extra"] = iris_df["extra"].astype("category")
132 | iris_df.loc[5, "extra"] = np.nan
133 | try:
134 | associations(iris_df, nan_strategy="replace")
135 | except TypeError as exception:
136 | assert (
137 | False
138 | ), f"nan_strategy='replace' with a pandas.CategoricalDtype column raised an exception {exception}"
139 |
--------------------------------------------------------------------------------
/tests/test_nominal/test_associations_parallel.py:
--------------------------------------------------------------------------------
1 | import pytest
2 | import matplotlib
3 | import pandas as pd
4 | import scipy.stats as ss
5 |
6 | from psutil import cpu_count
7 | from datetime import datetime, timedelta
8 |
9 | from dython.nominal import associations, correlation_ratio
10 |
11 | MAX_CORE_COUNT = cpu_count(logical=False)
12 |
13 |
14 | def test_return_type_check(iris_df):
15 | assoc = associations(
16 | iris_df, multiprocessing=True, max_cpu_cores=MAX_CORE_COUNT
17 | )
18 |
19 | assert isinstance(assoc, dict), "associations should return a dict"
20 | assert (
21 | "corr" in assoc
22 | ), 'associations should return a dict containing "corr" key'
23 | assert (
24 | "ax" in assoc
25 | ), 'associations should return a dict containing "ax" key'
26 |
27 | assert isinstance(
28 | assoc["corr"], pd.DataFrame
29 | ), 'assoc["corr"] should be a pandas DataFrame'
30 | assert isinstance(
31 | assoc["ax"], matplotlib.axes.Axes
32 | ), 'assoc["ax"] should be a matplotlib Axes'
33 |
34 |
35 | def test_dimension_check(iris_df):
36 | corr = associations(
37 | iris_df, multiprocessing=True, max_cpu_cores=MAX_CORE_COUNT
38 | )["corr"]
39 | corr_shape = corr.shape
40 | iris_shape = iris_df.shape
41 |
42 | assert corr_shape[0] == corr_shape[1], "association matrix has wrong shape"
43 | assert (
44 | corr_shape[1] == iris_shape[1]
45 | ), "association matrix has different shape from input data"
46 |
47 |
48 | def test_single_value_zero_association(iris_df):
49 | SV_COL = 1
50 | iris_df.iloc[:, SV_COL] = 42
51 |
52 | corr = associations(
53 | iris_df, multiprocessing=True, max_cpu_cores=MAX_CORE_COUNT
54 | )["corr"]
55 |
56 | assert (
57 | corr.iloc[:, SV_COL] == 0
58 | ).all(), "single-value variable should have zero association value"
59 | assert (
60 | corr.iloc[SV_COL, :] == 0
61 | ).all(), "single-value variable should have zero association value"
62 |
63 |
64 | def test_bad_nom_nom_assoc_parameter(iris_df):
65 | with pytest.raises(ValueError, match="is not a supported"):
66 | associations(
67 | iris_df,
68 | nom_nom_assoc="bad_parameter_name",
69 | multiprocessing=True,
70 | max_cpu_cores=MAX_CORE_COUNT,
71 | )
72 |
73 |
74 | def test_bad_num_num_assoc_parameter(iris_df):
75 | with pytest.raises(ValueError, match="is not a supported"):
76 | associations(iris_df, num_num_assoc="bad_parameter_name")
77 |
78 |
79 | def test_compute_only_ax_is_none(iris_df):
80 | assoc = associations(
81 | iris_df,
82 | compute_only=True,
83 | multiprocessing=True,
84 | max_cpu_cores=MAX_CORE_COUNT,
85 | )
86 |
87 | assert (
88 | assoc["ax"] is None
89 | ), 'associations with compute_only should return a None value for "ax" key'
90 |
91 |
92 | def test_mark_columns(iris_df):
93 | corr = associations(
94 | iris_df,
95 | mark_columns=True,
96 | multiprocessing=True,
97 | max_cpu_cores=MAX_CORE_COUNT,
98 | )["corr"]
99 |
100 | assert (
101 | "(con)" in corr.index[0]
102 | ), "first column should contain (con) mark if iris_df is used"
103 |
104 |
105 | def pr(x, y):
106 | return ss.pearsonr(x, y)[0]
107 |
108 |
109 | def test_udf(iris_df):
110 | corr1 = associations(
111 | iris_df,
112 | plot=False,
113 | num_num_assoc="pearson",
114 | nom_num_assoc="correlation_ratio",
115 | multiprocessing=True,
116 | max_cpu_cores=MAX_CORE_COUNT,
117 | )["corr"]
118 | corr2 = associations(
119 | iris_df,
120 | plot=False,
121 | num_num_assoc=pr,
122 | nom_num_assoc=correlation_ratio,
123 | multiprocessing=True,
124 | max_cpu_cores=MAX_CORE_COUNT,
125 | )["corr"]
126 | assert corr1.compare(
127 | corr2
128 | ).empty, (
129 | "Computation of built-in measures of associations differs from UDFs"
130 | )
131 |
132 |
133 | def test_datetime_data():
134 | dt = datetime(2020, 12, 1)
135 | end = datetime(2020, 12, 2)
136 | step = timedelta(seconds=5)
137 | result = []
138 | while dt < end:
139 | result.append(dt.strftime("%Y-%m-%d %H:%M:%S"))
140 | dt += step
141 |
142 | nums = list(range(len(result)))
143 | df = pd.DataFrame(
144 | {"dates": result, "up": nums, "down": sorted(nums, reverse=True)}
145 | )
146 | # without this, the column dtype would be object rather than datetime
147 | df["dates"] = pd.to_datetime(df["dates"], format="%Y-%m-%d %H:%M:%S")
148 |
149 | correct_corr = pd.DataFrame(
150 | columns=["dates", "up", "down"],
151 | index=["dates", "up", "down"],
152 | data=[[1.0, 1.0, -1.0], [1.0, 1.0, -1.0], [-1.0, -1.0, 1.0]],
153 | )
154 | corr = associations(
155 | df, plot=False, multiprocessing=True, max_cpu_cores=MAX_CORE_COUNT
156 | )["corr"]
157 | assert corr.compare(
158 | correct_corr
159 | ).empty, f"datetime associations are incorrect. Test should have returned an empty dataframe, received: {corr.head()}"
160 |
--------------------------------------------------------------------------------
/tests/test_nominal/test_cluster_correlation.py:
--------------------------------------------------------------------------------
1 | import pytest
2 | import numpy as np
3 | import pandas as pd
4 |
5 | from dython.nominal import cluster_correlations
6 |
7 |
8 | @pytest.fixture
9 | def corr_example():
10 | return pd.DataFrame(
11 | np.array(
12 | [
13 | [1, 0.5, 0.7, 0.3],
14 | [0.5, 1, 0.8, 0.2],
15 | [0.7, 0.8, 1, 0.1],
16 | [0.3, 0.2, 0.1, 1],
17 | ]
18 | ),
19 | columns=list("ABCD"),
20 | index=list("ABCD"),
21 | )
22 |
23 |
24 | def test_cluster_correlation_check_return_values(corr_example):
25 | result = cluster_correlations(corr_example)
26 |
27 | assert isinstance(result, tuple), "should return a tuple"
28 |
29 | sorted_corr, indices = result
30 |
31 | assert isinstance(
32 | sorted_corr, pd.DataFrame
33 | ), "sorted correlation should be a pd.DataFrame correlation matrix"
34 | assert isinstance(indices, np.ndarray), "indices should be a np.ndarray"
35 |
--------------------------------------------------------------------------------
/tests/test_nominal/test_correlation_ratio.py:
--------------------------------------------------------------------------------
1 | import pytest
2 | import numpy as np
3 | from hypothesis import given, strategies as st, assume, settings, example
4 |
5 | from dython.nominal import correlation_ratio
6 |
7 |
8 | categories = st.text(alphabet=list("ABCDE"), min_size=1, max_size=1)
9 |
10 |
11 | @st.composite
12 | def categories_and_measurements(draw):
13 | n = draw(st.integers(min_value=2, max_value=30))
14 | category_lists = st.lists(categories, min_size=n, max_size=n)
15 | measurement_lists = st.lists(st.floats(), min_size=n, max_size=n)
16 |
17 | return draw(category_lists), draw(measurement_lists)
18 |
19 |
20 | @given(c_m=categories_and_measurements())
21 | def test_correlation_ratio_value_range(c_m):
22 | category, measurement = c_m
23 |
24 | corr_ratio = correlation_ratio(category, measurement)
25 |
26 | assert 0.0 <= corr_ratio <= 1.0 or np.isnan(corr_ratio)
27 |
--------------------------------------------------------------------------------
/tests/test_nominal/test_cramers_v.py:
--------------------------------------------------------------------------------
1 | import pytest
2 | import functools
3 | import numpy as np
4 | from hypothesis import given, strategies as st, assume, settings, example
5 |
6 | from dython.nominal import cramers_v
7 |
8 |
9 | # "Patch" pytest.approx to increase its tolerance range
10 | approx = functools.partial(pytest.approx, abs=1e-6, rel=1e-6)
11 |
12 |
13 | def test_cramers_v_check(iris_df):
14 | x = iris_df["extra"]
15 | y = iris_df["target"]
16 |
17 | # Note: this measure is symmetric
18 | assert cramers_v(x, y) == pytest.approx(0.14201914309546954)
19 | assert cramers_v(y, x) == pytest.approx(0.14201914309546954)
20 |
21 |
22 | categories = st.text(alphabet=list("ABCDE"), min_size=1, max_size=1)
23 |
24 |
25 | @st.composite
26 | def two_categorical_lists(draw):
27 | n = draw(st.integers(min_value=2, max_value=30))
28 | categorical_lists = st.lists(categories, min_size=n, max_size=n)
29 |
30 | return draw(categorical_lists), draw(categorical_lists)
31 |
32 |
33 | @given(x_y=two_categorical_lists())
34 | def test_cramers_v_value_range(x_y):
35 | x, y = x_y
36 |
37 | v_xy = cramers_v(x, y)
38 |
39 | assume(not np.isnan(v_xy))
40 |
41 | # 0.0 <= v_xy <= 1.0 is false when v_xy == 1.00000000000004
42 | # hence this weird-looking assertion, to avoid hypothesis saying it's "flaky"
43 | assert (
44 | v_xy == pytest.approx(0.0)
45 | or 0.0 < v_xy < 1.0
46 | or v_xy == pytest.approx(1.0)
47 | )
48 |
49 |
50 | @given(x_y=two_categorical_lists())
51 | @settings(deadline=1000)
52 | def test_cramers_v_symmetry(x_y):
53 | x, y = x_y
54 | v_xy = cramers_v(x, y)
55 | v_yx = cramers_v(y, x)
56 |
57 | # Can be overridden by passing nan_ok = True to
58 | # pytest.approx, but this feels more appropriate
59 | assume(not np.isnan(v_xy) and not np.isnan(v_yx))
60 |
61 | assert approx(v_xy) == approx(v_yx)
62 |
--------------------------------------------------------------------------------
/tests/test_nominal/test_theils_u.py:
--------------------------------------------------------------------------------
1 | import pytest
2 | from hypothesis import given, strategies as st, assume
3 |
4 | from dython.nominal import theils_u
5 |
6 |
7 | def test_theils_u_check(iris_df):
8 | x = iris_df["extra"]
9 | y = iris_df["target"]
10 |
11 | # Note: this measure is not symmetric
12 | assert theils_u(x, y) == pytest.approx(0.02907500150218738)
13 | assert theils_u(y, x) == pytest.approx(0.0424761859049835)
14 |
15 |
16 | categories = st.text(alphabet=list("ABCDE"), min_size=1, max_size=1)
17 |
18 |
19 | @given(x=st.lists(categories, min_size=2, max_size=30))
20 | def test_theils_u_identity(x):
21 | assert theils_u(x, x) == pytest.approx(1.0)
22 |
23 |
24 | @st.composite
25 | def two_categorical_lists(draw):
26 | n = draw(st.integers(min_value=2, max_value=30))
27 | categorical_lists = st.lists(categories, min_size=n, max_size=n)
28 |
29 | return draw(categorical_lists), draw(categorical_lists)
30 |
31 |
32 | @given(x_y=two_categorical_lists())
33 | def test_theils_u_value_range(x_y):
34 | x, y = x_y
35 |
36 | u_xy = theils_u(x, y)
37 |
38 | assert 0.0 <= u_xy <= 1.0
39 |
--------------------------------------------------------------------------------
/tests/test_private_helpers.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import pandas as pd
3 | import pytest
4 | from sklearn import datasets
5 | from dython._private import (
6 | convert,
7 | remove_incomplete_samples,
8 | replace_nan_with_value,
9 | )
10 |
11 | # Make pandas not emit SettingWithCopyWarning
12 | # SettingWithCopyWarning looks relatively safe to ignore,
13 | # compare with DeprecationWarning that eventually needs attention.
14 | # https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#evaluation-order-matters
15 | pd.set_option("mode.chained_assignment", None)
16 |
17 |
18 | @pytest.fixture
19 | def iris_df():
20 | iris = datasets.load_iris()
21 | df = pd.DataFrame(data=iris.data, columns=iris.feature_names)
22 | df["target"] = iris.target
23 |
24 | return df
25 |
26 |
27 | @pytest.fixture(params=["str", "tuple", "dict"])
28 | def bad_input(request):
29 | if request.param == "str":
30 | return "EXAMPLE STRING"
31 |
32 | if request.param == "tuple":
33 | return "EXAMPLE", "TUPLE"
34 |
35 | if request.param == "dict":
36 | return {1: "EXAMPLE", 2: "DICT"}
37 |
38 |
39 | @pytest.mark.parametrize("output_type", ["list", "array", "dataframe"])
40 | def test_convert_good_output_bad_input(bad_input, output_type):
41 | with pytest.raises(TypeError, match="cannot handle data conversion"):
42 | convert(bad_input, output_type)
43 |
44 |
45 | def test_convert_bad_output(iris_df):
46 | with pytest.raises(ValueError, match="Unknown"):
47 | convert(iris_df, "bad_parameter")
48 |
49 |
50 | @pytest.fixture
51 | def x_y(iris_df):
52 | x = iris_df[iris_df.columns[0]]
53 | y = iris_df[iris_df.columns[1]]
54 | return x, y
55 |
56 |
57 | def test_remove_incomplete_cases_one_nan_each(x_y):
58 | x, y = x_y
59 | x[0] = None
60 | y[1] = None
61 |
62 | x_, y_ = remove_incomplete_samples(x, y)
63 |
64 | assert len(x_) == len(y_) == len(x) - 2
65 |
66 |
67 | def test_remove_incomplete_cases_all_nan(x_y):
68 | x, y = x_y
69 | x = [None for _ in x]
70 |
71 | x_, y_ = remove_incomplete_samples(x, y)
72 | assert len(x_) == len(y_) == 0
73 |
74 |
75 | def test_replace_nan_one_nan_each(x_y):
76 | x, y = x_y
77 | x[0] = None
78 | y[1] = None
79 |
80 | x_, y_ = replace_nan_with_value(x, y, 1_000)
81 |
82 | assert len(x_) == len(y_) == len(y)
83 | assert x_[0] == y_[1] == 1_000
84 |
85 |
86 | def test_replace_nan_all_nan(x_y):
87 | x, y = x_y
88 | x = [None for _ in x]
89 |
90 | x_, y_ = replace_nan_with_value(x, y, 1_000)
91 |
92 | assert all([elem == 1_000 for elem in x_])
93 |
--------------------------------------------------------------------------------
/tests/test_sampling.py:
--------------------------------------------------------------------------------
1 | import pytest
2 | import numpy as np
3 | from dython.sampling import boltzmann_sampling, weighted_sampling
4 |
5 |
6 | @pytest.fixture(params=["list", "array"])
7 | def population(request):
8 | if request.param == "list":
9 | return [0.0, 1.0, 2.0, 3.0, 4.0]
10 | elif request.param == "array":
11 | return np.array([0.0, 1.0, 2.0, 3.0, 4.0])
12 |
13 |
14 | parametrize_sampling_funcs = pytest.mark.parametrize(
15 | "func", [boltzmann_sampling, weighted_sampling]
16 | )
17 |
18 |
19 | @parametrize_sampling_funcs
20 | def test_k_none(func, population):
21 | result = func(population, k=None)
22 | assert type(result) is np.float64
23 |
24 |
25 | @parametrize_sampling_funcs
26 | @pytest.mark.parametrize("k", [1, 2])
27 | def test_k_number(func, population, k):
28 | result = func(population, k=k)
29 | assert type(result) == type(
30 | population
31 | ), "Sampling with k != None should return same type as input"
32 |
--------------------------------------------------------------------------------