├── tests
│   ├── __init__.py
│   ├── conftest.py
│   ├── README.md
│   ├── test_pandas.py
│   ├── test_markov.py
│   └── test_sklearn.py
├── docs
│   ├── authors.md
│   ├── changelog.md
│   ├── contributing.md
│   ├── license.md
│   ├── _static
│   │   ├── favicon.ico
│   │   ├── redflag_logo.png
│   │   ├── custom.css
│   │   └── redflag.svg
│   ├── redflag.rst
│   ├── Makefile
│   ├── installation.md
│   ├── pre_process_ipynb.py
│   ├── post_process_html.py
│   ├── development.md
│   ├── index.rst
│   ├── what_is_redflag.md
│   ├── conf.py
│   └── notebooks
│       ├── Using_redflag_with_Pandas.ipynb
│       └── _Pandas_accessor.ipynb
├── AUTHORS.md
├── SECURITY.md
├── .github
│   ├── ISSUE_TEMPLATE
│   │   └── bug_report.md
│   └── workflows
│       ├── publish-docs.yml
│       ├── pypi-publish.yml
│       └── build-test.yml
├── src
│   └── redflag
│       ├── __init__.py
│       ├── independence.py
│       ├── pandas.py
│       ├── importance.py
│       ├── imbalance.py
│       ├── target.py
│       ├── outliers.py
│       ├── markov.py
│       └── distributions.py
├── CONTRIBUTING.md
├── pyproject.toml
├── .gitignore
├── CODE_OF_CONDUCT.md
├── README.md
├── LICENSE
└── CHANGELOG.md
/tests/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/docs/authors.md:
--------------------------------------------------------------------------------
1 | ```{include} ../AUTHORS.md
2 | ```
--------------------------------------------------------------------------------
/docs/changelog.md:
--------------------------------------------------------------------------------
1 | ```{include} ../CHANGELOG.md
2 | ```
--------------------------------------------------------------------------------
/docs/contributing.md:
--------------------------------------------------------------------------------
1 | ```{include} ../CONTRIBUTING.md
2 | ```
--------------------------------------------------------------------------------
/docs/license.md:
--------------------------------------------------------------------------------
1 | # License
2 |
3 | ```{include} ../LICENSE
4 | ```
5 |
--------------------------------------------------------------------------------
/docs/_static/favicon.ico:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/scienxlab/redflag/HEAD/docs/_static/favicon.ico
--------------------------------------------------------------------------------
/docs/_static/redflag_logo.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/scienxlab/redflag/HEAD/docs/_static/redflag_logo.png
--------------------------------------------------------------------------------
/AUTHORS.md:
--------------------------------------------------------------------------------
1 | # Authors
2 |
3 | The following people have contributed to the project (in alphabetical order):
4 |
5 | - [Matt Hall](https://github.com/kwinkunks), Agile Scientific, Canada (ORCID: [0000-0002-4054-8295](https://orcid.org/0000-0002-4054-8295))
6 |
--------------------------------------------------------------------------------
/SECURITY.md:
--------------------------------------------------------------------------------
1 | # Security policy
2 |
3 |
4 | ## Supported versions
5 |
6 | Only the latest version of `redflag` is supported.
7 |
8 |
9 | ## Reporting a vulnerability
10 |
11 | Please do not open an issue on GitHub; instead, report vulnerabilities to hello@scienxlab.org
12 |
13 | We do not award bounties for security vulnerabilities, but will notify you if and when the report is accepted and acted upon.
14 |
--------------------------------------------------------------------------------
/docs/redflag.rst:
--------------------------------------------------------------------------------
1 | redflag package
2 | ===============
3 |
4 | Submodules
5 | ----------
6 |
7 | .. toctree::
8 | :maxdepth: 4
9 |
10 | redflag.distributions
11 | redflag.imbalance
12 | redflag.importance
13 | redflag.independence
14 | redflag.markov
15 | redflag.outliers
16 | redflag.pandas
17 | redflag.sklearn
18 | redflag.target
19 | redflag.utils
20 |
21 | Module contents
22 | ---------------
23 |
24 | .. automodule:: redflag
25 | :members:
26 | :undoc-members:
27 | :show-inheritance:
28 |
--------------------------------------------------------------------------------
/.github/ISSUE_TEMPLATE/bug_report.md:
--------------------------------------------------------------------------------
1 | ---
2 | name: Bug report
3 | about: Create a report to help us improve
4 | title: ''
5 | labels: bug
6 | assignees: ''
7 |
8 | ---
9 |
10 | **Describe the bug**
11 | A clear and concise description of what the bug is.
12 |
13 | **To reproduce**
14 | Steps to reproduce the behavior:
15 | 1. Go to '...'
16 | 2. Click on '....'
17 | 3. Scroll down to '....'
18 | 4. See error
19 |
20 | **Expected behavior**
21 | A clear and concise description of what you expected to happen.
22 |
23 | **Desktop (please complete the following information):**
24 | - OS: [e.g. iOS]
25 |
26 | **Additional context**
27 | Add any other context about the problem here.
28 |
--------------------------------------------------------------------------------
/docs/Makefile:
--------------------------------------------------------------------------------
1 |
2 | # You can set these variables from the command line, and also
3 | # from the environment for the first two.
4 | SPHINXOPTS ?=
5 | SPHINXBUILD ?= sphinx-build
6 | SOURCEDIR = .
7 | BUILDDIR = _build
8 |
9 | # Put it first so that "make" without argument is like "make help".
10 | help:
11 | @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
12 |
13 | .PHONY: help html
14 |
15 | html:
16 | python pre_process_ipynb.py $(SOURCEDIR)/notebooks
17 | $(SPHINXBUILD) -E -b html $(SPHINXOPTS) $(SOURCEDIR) $(BUILDDIR)/html
18 | python post_process_html.py $(BUILDDIR)/html
19 | @echo
20 | @echo "Build finished. The HTML pages are in $(BUILDDIR)/html."
21 |
--------------------------------------------------------------------------------
/docs/installation.md:
--------------------------------------------------------------------------------
1 | # 🚩 Installation
2 |
3 | At the command line:
4 |
5 | ```shell
6 | pip install redflag
7 | ```
8 |
9 | Or, if you use Conda environments:
10 |
11 | ```shell
12 | conda install -c conda-forge redflag
13 | ```
14 |
15 | You can add the `conda-forge` channel as a source for future installations like so:
16 |
17 | ```shell
18 | conda config --add channels conda-forge
19 | conda config --set channel_priority strict
20 | ```
21 |
22 |
23 | ## Optional dependencies
24 |
25 | For developers, there is a `dev` option that installs all the testing and documentation packages: `pip install "redflag[dev]"`.
26 |
27 | If you want to help develop `redflag`, please read [Development](development.md).
28 |
--------------------------------------------------------------------------------
/tests/conftest.py:
--------------------------------------------------------------------------------
1 | import doctest
2 | import re
3 | import platform
4 |
5 |
6 | OutputChecker = doctest.OutputChecker
7 | class CustomOutputChecker(OutputChecker):
8 | def check_output(self, want, got, optionflags):
9 | """
10 | Remove the dtype from NumPy array reprs, to avoid some doctests
11 | failing on Windows, which often uses int32 instead of int64.
12 | """
13 | pattern = re.compile(r"(array\(.+?)(, dtype=int)(32|64)(\))")
14 | want = pattern.sub(r"\1\4", want)
15 | got = pattern.sub(r"\1\4", got)
16 | return OutputChecker.check_output(self, want, got, optionflags)
17 |
18 | if platform.system() == 'Windows':
19 | doctest.OutputChecker = CustomOutputChecker
20 |
--------------------------------------------------------------------------------
/.github/workflows/publish-docs.yml:
--------------------------------------------------------------------------------
1 | name: Docs
2 |
3 | on:
4 | workflow_call:
5 | workflow_dispatch:
6 |
7 | jobs:
8 | deploy:
9 | runs-on: ubuntu-latest
10 | steps:
11 |
12 | - uses: actions/checkout@v4
13 |
14 | - name: Set up Python
15 | uses: actions/setup-python@v5
16 | with:
17 | python-version: '3.x'
18 |
19 | - name: Install package
20 | run: |
21 | python -m pip install --upgrade pip
22 | pip install .[dev]
23 |
24 | - name: Build docs
25 | run: |
26 | cd docs
27 | make html
28 |
29 | - name: Publish docs
30 | uses: JamesIves/github-pages-deploy-action@v4
31 | with:
32 | branch: gh-pages
33 | folder: docs/_build/html
34 |
--------------------------------------------------------------------------------
/src/redflag/__init__.py:
--------------------------------------------------------------------------------
1 | from .utils import *
2 | from .sklearn import *
3 | from .pandas import *
4 | from .markov import Markov_chain
5 |
6 | # Targets
7 | from .target import *
8 | from .imbalance import *
9 |
10 | # Features
11 | from .distributions import *
12 | from .independence import *
13 | from .importance import *
14 | from .outliers import *
15 |
16 | # It used to be conventional to define a __version__ attribute.
17 | # However, it is now considered best practice to get version
18 | # information from the package metadata directly, eg by using
19 | # importlib.metadata.version (see below).
20 | #
21 | # This will be deprecated in v0.6.0 but for now we do this:
22 | #
23 | from importlib.metadata import version
24 | __version__ = version(__package__ or __name__)
25 |
--------------------------------------------------------------------------------
/.github/workflows/pypi-publish.yml:
--------------------------------------------------------------------------------
1 | name: Publish to PyPI
2 |
3 | on:
4 | release:
5 | types: [published]
6 |
7 | jobs:
8 | tests:
9 | uses: ./.github/workflows/build-test.yml
10 |
11 | docs:
12 | uses: ./.github/workflows/publish-docs.yml
13 |
14 | deploy:
15 | needs: [tests, docs]
16 | runs-on: ubuntu-latest
17 | steps:
18 |
19 | - uses: actions/checkout@v4
20 |
21 | - name: Set up Python
22 | uses: actions/setup-python@v5
23 | with:
24 | python-version: '3.x'
25 |
26 | - name: Install package
27 | run: |
28 | python -m pip install --upgrade pip
29 | pip install .[dev]
30 |
31 | - name: Build package
32 | run: python -m build
33 |
34 | - name: Publish package
35 | uses: pypa/gh-action-pypi-publish@27b31702a0e7fc50959f5ad993c78deac1bdfc29
36 | with:
37 | user: __token__
38 | password: ${{ secrets.PYPI_API_TOKEN }}
39 |
--------------------------------------------------------------------------------
/.github/workflows/build-test.yml:
--------------------------------------------------------------------------------
1 | name: Tests
2 |
3 | on:
4 | workflow_call:
5 | workflow_dispatch:
6 | push:
7 | branches: [ main, develop ]
8 | pull_request:
9 | branches: [ main ]
10 |
11 | jobs:
12 | build:
13 | runs-on: ${{ matrix.os }}
14 | strategy:
15 | fail-fast: false
16 | matrix:
17 | os: [ubuntu-latest, macos-latest, windows-latest]
18 | python-version: ["3.8", "3.9", "3.10", "3.11", "3.12"]
19 |
20 | steps:
21 |
22 | - uses: actions/checkout@v4
23 |
24 | - name: Set up Python ${{ matrix.python-version }}
25 | uses: actions/setup-python@v5
26 | with:
27 | python-version: ${{ matrix.python-version }}
28 | allow-prereleases: true
29 |
30 | - name: Install dependencies
31 | run: |
32 | python -m pip install --upgrade pip
33 | pip install .[dev]
34 |
35 | - name: Test with pytest
36 | run: |
37 | pytest
38 |
--------------------------------------------------------------------------------
/tests/README.md:
--------------------------------------------------------------------------------
1 | ## Tests
2 |
3 | Note that most of the tests in `redflag` are doctests. The testing code is in the docstrings of the various functions, under the 'examples' heading.
4 |
5 | There are some pytest files in `tests` as well.
6 |
7 | The Jupyter Notebooks in `docs/notebooks` are currently not run as part of the tests, but there is an open issue to implement this.
8 |
9 | Test options are in `pyproject.toml`, so to run the tests: clone the repo, install the dev dependencies (e.g. with `pip install ".[dev]"`) and run this from the root directory:
10 |
11 | pytest
12 |
13 |
14 | ## A note about NumPy dtypes
15 |
16 | Owing to an idiosyncrasy of 64-bit Windows machines, which count a 'long' int as 32-bit not 64-bit, I have stopped `doctest` from comparing any `dtype=int64` or similar in test outputs. This is done by the custom `doctest.OutputChecker` in `tests/conftest.py`. It only runs on Windows
17 | machines (e.g. in the CI matrix).
18 |
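
As a minimal sketch of what that checker does (the regex below is the one from `tests/conftest.py`; the array reprs are made-up examples):

```python
import re

# Same pattern as in tests/conftest.py: strip ", dtype=int32" or ", dtype=int64"
# from NumPy array reprs before doctest compares expected and actual output.
pattern = re.compile(r"(array\(.+?)(, dtype=int)(32|64)(\))")

want = "array([33], dtype=int64)"  # what a docstring might show
got = "array([33], dtype=int32)"   # what a 64-bit Windows machine might print
assert pattern.sub(r"\1\4", want) == pattern.sub(r"\1\4", got) == "array([33])"
```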
--------------------------------------------------------------------------------
/docs/pre_process_ipynb.py:
--------------------------------------------------------------------------------
1 | import sys
2 | import glob
3 | import json
4 | import pathlib
5 | import shutil
6 |
7 |
8 | def change_kernel(notebook):
9 | """
10 | Vanillafy the kernelspec.
11 | """
12 | new_kernelspec = {
13 | "display_name": "Python 3 (ipykernel)",
14 | "language": "python",
15 | "name": "python3",
16 | }
17 | notebook['metadata']['kernelspec'].update(new_kernelspec)
18 | return notebook
19 |
20 |
21 | def main(path):
22 | """
23 | Process the IPYNB files in path, save in place (side-effect).
24 | """
25 | fnames = glob.glob(path.strip('/') + '/[!_]*.ipynb') # Not files with underscore.
26 | outpath = pathlib.Path('_notebooks')
27 | if outpath.exists():
28 | shutil.rmtree(outpath)
29 | outpath.mkdir(exist_ok=True)
30 |
31 | for fname in fnames:
32 | with open(fname, encoding='utf-8') as f:
33 | notebook = json.loads(f.read())
34 |
35 | new_nb = change_kernel(notebook)
36 | filepart = pathlib.Path(fname).name
37 |
38 | with open(outpath / filepart, 'w') as f:
39 | _ = f.write(json.dumps(new_nb))
40 |
41 | return
42 |
43 |
44 | if __name__ == '__main__':
45 | print(sys.argv[1])
46 | _ = main(sys.argv[1])
47 |
--------------------------------------------------------------------------------
/CONTRIBUTING.md:
--------------------------------------------------------------------------------
1 | # Contributing
2 |
3 | **🙌 Thank you for considering contributing to this project!**
4 |
5 | There are several important ways you can help; here are some examples:
6 |
7 | - Submitting bug reports and feature requests: see [Issues](https://github.com/scienxlab/redflag/issues).
8 | - Proposing code for bug fixes and new features, then [making a pull request](https://docs.github.com/en/pull-requests/collaborating-with-pull-requests/proposing-changes-to-your-work-with-pull-requests/about-pull-requests).
9 | - Fixing typos and generally improving the documentation.
10 | - Writing tutorials, examples, and how-to documents.
11 |
12 |
13 | ## Code of conduct
14 |
15 | We're fortunate to be part of a large professional community that conducts itself with mutual respect and consideration for others. Scienxlab's [Code of Conduct](https://github.com/scienxlab/community/blob/main/CODE_OF_CONDUCT.md) is part of protecting these features for everyone, everywhere. Please read it.
16 |
17 |
18 | ## Authorship
19 |
20 | If you contribute a pull request to the project and you wish to be identified as an author, please add yourself to `AUTHORS.md`.
21 |
22 |
23 | ## License
24 |
25 | By making a contribution, you agree that it shall be governed by the terms of the license unless another, specific agreement is made with Agile.
26 |
--------------------------------------------------------------------------------
/docs/post_process_html.py:
--------------------------------------------------------------------------------
1 | import sys
2 | import glob
3 | import re
4 |
5 |
6 | def simplify_credits(html):
7 | """
8 | Replace the credit part of the HTML footer. Return the new text.
9 | """
10 | s = r"""@pradyunsg's"""
11 | pattern = re.compile(s)
12 | html = pattern.sub(r'', html)
13 |
14 | s = r'Copyright © 2024, The Redflag Authors'
15 | pattern = re.compile(s)
16 | new_s = '© 2024, The Redflag Authors | CC BY'
17 | html = pattern.sub(new_s, html)
18 |
19 | return html
20 |
21 |
22 | def add_analytics(html):
23 | """
24 | Add snippet to head.
25 | """
26 | s = r''
27 | pattern = re.compile(s)
28 | new_s = ''
29 | html = pattern.sub(new_s, html)
30 |
31 | return html
32 |
33 |
34 | def main(path):
35 | """
36 | Process the HTML files in path, save in place (side-effect).
37 | """
38 | fnames = glob.glob(path.strip('/') + '/*.html')
39 | for fname in fnames:
40 | with open(fname, 'r+') as f:
41 | html = f.read()
42 |
43 | new_html = simplify_credits(html)
44 | new_html = add_analytics(new_html)
45 |
46 | f.seek(0)
47 | f.write(new_html)
48 | f.truncate()
49 | return
50 |
51 |
52 | if __name__ == '__main__':
53 | _ = main(sys.argv[1])
54 |
--------------------------------------------------------------------------------
/docs/_static/custom.css:
--------------------------------------------------------------------------------
1 | /* Removes Captions from the main page. */
2 | article p.caption {
3 | display: none;
4 | }
5 |
6 | /* Styles the 'line block' https://docutils.sourceforge.io/docs/user/rst/quickref.html#line-blocks. */
7 | blockquote {
8 | background: none;
9 | border-left-width: 0px;
10 | padding: 0em;
11 | }
12 |
13 | blockquote div.line {
14 | color: #838383;
15 | display: inline;
16 | font-style: normal !important;
17 | font-size: 150%;
18 | line-height: 125%;
19 | }
20 |
21 | /* Adds the GitHub ribbon. */
22 | #forkongithub a {
23 | background:rgb(158, 158, 158);
24 | color:#fff;
25 | text-decoration:none;
26 | font-family:arial,sans-serif;
27 | text-align:center;
28 | font-weight:bold;
29 | padding:5px 40px;
30 | font-size:1rem;
31 | line-height:2rem;
32 | position:relative;
33 | transition:0.5s;
34 | }
35 |
36 | #forkongithub a:hover {
37 | background:#14ca29;
38 | color:#fff;
39 | }
40 |
41 | #forkongithub a::after {
42 | bottom:1px;
43 | top:auto;
44 | }
45 |
46 | @media screen and (min-width:800px) {
47 | #forkongithub{
48 | position:fixed;
49 | display:block;
50 | top:0;
51 | right:0;
52 | width:200px;
53 | overflow:hidden;
54 | height:200px;
55 | z-index:9999;
56 | }
57 |
58 | #forkongithub a {
59 | width:200px;
60 | position:absolute;
61 | top:60px;
62 | right:-60px;
63 | transform:rotate(45deg);
64 | -webkit-transform:rotate(45deg);
65 | -ms-transform:rotate(45deg);
66 | -moz-transform:rotate(45deg);
67 | -o-transform:rotate(45deg);
68 | box-shadow:4px 4px 10px rgba(0,0,0,0.4);
69 | }
70 | }
--------------------------------------------------------------------------------
/tests/test_pandas.py:
--------------------------------------------------------------------------------
1 | """Test Pandas accessors."""
2 | import pytest
3 | import pandas as pd
4 | from redflag.pandas import null_decorator, SeriesAccessor
5 |
6 |
7 | c = pd.Series([1, 1, 1, 1, 1, 2, 2, 2, 3, 3])
8 | r = pd.Series([0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0, 3.0])
9 |
10 |
11 | def test_null_decorator():
12 | @null_decorator('foo')
13 | def f():
14 | return None
15 | assert f() is None
16 |
17 |
18 | def test_dummy_scores():
19 | c_scores = c.redflag.dummy_scores(random_state=42)
20 | r_scores = r.redflag.dummy_scores(random_state=42)
21 |
22 | assert abs(c_scores['roc_auc'] - 0.6801587301587301) < 1e-12
23 | assert abs(r_scores['mean_squared_error'] - 0.5710743801652893) < 1e-12
24 |
25 |
26 | def test_imbalance():
27 | assert c.redflag.is_imbalanced(threshold=0.24, method='tv')
28 |
29 | minorities = c.redflag.minority_classes()
30 | assert 2 in minorities and 3 in minorities
31 |
32 | imb_degree = c.redflag.imbalance_degree()
33 | assert abs(imb_degree - 1.25) < 1e-9
34 |
35 |
36 | def test_is_ordered():
37 | assert c.redflag.is_ordered()
38 |
39 |
40 | def test_is_ordered_warns_for_continuous_data():
41 | with pytest.raises(ValueError, match='Cannot check order of continuous data.'):
42 | r.redflag.is_ordered()
43 |
44 |
45 | def test_warnings():
46 | with pytest.warns(UserWarning, match="The Series does not seem categorical."):
47 | r.redflag.minority_classes()
48 | with pytest.warns(UserWarning, match="The Series does not seem categorical."):
49 | r.redflag.imbalance_degree()
50 |
51 |
52 | def test_series_categorical_report():
53 | report_c = c.redflag.report()
54 | assert 'Categorical' in report_c
55 |
56 |
57 | def test_series_continuous_report():
58 | report_r = r.redflag.report()
59 | assert 'Continuous' in report_r
60 |
61 |
62 | def test_feature_importances_docstring():
63 | s = pd.DataFrame([c, r]).redflag.feature_importances.__doc__
64 | assert s.strip().startswith("Estimate feature importances on a supervised task, given X and y.")
65 |
--------------------------------------------------------------------------------
/tests/test_markov.py:
--------------------------------------------------------------------------------
1 | """
2 | Tests for the Markov module. This code was originally implemented in
3 | https://github.com/agilescientific/striplog.
4 | """
5 | import numpy as np
6 |
7 | from redflag.markov import Markov_chain
8 |
9 | data = "sssmmmlllmlmlsslsllsmmllllmssssllllssmmlllllssssssmmmmsmllllssslmslmsmmmslsllll"""
10 |
11 | def test_basics():
12 | data = [[0, 37, 3, 2],
13 | [21, 0, 41, 14],
14 | [20, 25, 0, 0],
15 | [1, 14, 1, 0]]
16 |
17 | m = Markov_chain(data, states=['A', 'B', 'C', 'D'])
18 |
19 | ans = (35.73687369691601, 11.070497693516351, 0.9999989278539752)
20 | assert np.allclose(m.chi_squared(), ans)
21 |
22 | ans = np.array([[0., 31.27069125, 8.17143874, 2.55787001],
23 | [31.28238248, 0., 34.05692583, 10.66069169],
24 | [8.17137105, 34.04391563, 0., 2.78471333],
25 | [2.5579797, 10.65716447, 2.78485582, 0.]])
26 | assert np.allclose(m.expected_counts, ans)
27 |
28 |
29 | def test_sequence():
30 | m = Markov_chain.from_sequence(data, include_self=True)
31 |
32 | assert len(m._state_counts) == 3
33 |
34 | ans = np.array([[19., 5., 7.],
35 | [6., 9., 5.],
36 | [7., 6., 14.]])
37 | assert np.allclose(m.observed_counts, ans)
38 |
39 | ans = np.array([[0.49712747, 0.19796476, 0.30490777],
40 | [0.49712747, 0.19796476, 0.30490777],
41 | [0.49712747, 0.19796476, 0.30490777]])
42 | assert np.allclose(m.expected_freqs, ans)
43 |
44 | ans = np.array([[-2.24633883, -2.14054029, -2.81568096],
45 | [-1.81677174, 1.82886491, -0.94412655],
46 | [-2.68890472, -0.51627836, 0.76836845]])
47 | assert np.allclose(m.normalized_difference, ans)
48 |
49 |
50 | def test_generate():
51 | m = Markov_chain.from_sequence(data, include_self=True)
52 |
53 | assert len(m.generate_states()) == 10
54 |
55 |
56 | def test_step_2():
57 | m = Markov_chain.from_sequence(data, include_self=True, step=2)
58 |
59 | assert m.observed_freqs.ndim == 3
60 |
--------------------------------------------------------------------------------
/docs/development.md:
--------------------------------------------------------------------------------
1 | # Development
2 |
3 | If you'd like to develop `redflag`, this page should help you get started.
4 |
5 |
6 | ## Installation
7 |
8 | You can install this package with `pip` or `conda`. The `dev` option will install the packages you need for testing and building the documentation.
9 |
10 | ```shell
11 | python -m pip install "redflag[dev]"
12 | ```
13 |
14 |
15 | ## Contributing
16 |
17 | If you'd like to contribute pull requests back to the main `redflag ` project, please see [`CONTRIBUTING.md`](https://github.com/scienxlab/redflag/blob/main/CONTRIBUTING.md).
18 |
19 |
20 | ## Testing
21 |
22 | You can run the tests (requires `pytest` and `pytest-cov`) with
23 |
24 | ```shell
25 | pytest
26 | ```
27 |
28 | Most of the tests are `doctest` tests, which are contained in the docstrings of this package's functions. There are further tests in the `tests` folder.
29 |
30 |
31 | ## Building the package
32 |
33 | This repo uses PEP 518-style packaging. [Read more about this](https://setuptools.pypa.io/en/latest/build_meta.html) and [about Python packaging in general](https://packaging.python.org/en/latest/tutorials/packaging-projects/).
34 |
35 | To build `redflag` locally:
36 |
37 | ```shell
38 | python -m build
39 | ```
40 |
41 | This builds both `.tar.gz` and `.whl` files, either of which you can install with `pip`.
42 |
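If you want to check the result, you can install the freshly built wheel straight from `dist/` (the exact filename depends on the version computed by `setuptools-scm`), for example:

```shell
python -m pip install dist/redflag-*.whl
```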
43 |
44 | ## Building the docs
45 |
46 | You can build the docs with the following commands:
47 |
48 | ```shell
49 | cd docs
50 | make html
51 | ```
52 |
53 | Don't just run `sphinx-build` manually: there is other stuff happening in the `Makefile`.
54 |
55 | There is a continuous integration script to update the docs on published releases.
56 |
57 |
58 | ## Continuous integration
59 |
60 | This repo has two GitHub 'workflows' or 'actions':
61 |
62 | - Push to `main`: Run all tests on all versions of Python. This is the **Build and test** workflow.
63 | - Publish a new release: Build and upload to PyPI. This is the **Publish to PyPI** workflow. Publish using the GitHub interface, for example ([read more](https://docs.github.com/en/repositories/releasing-projects-on-github/managing-releases-in-a-repository)).
64 |
--------------------------------------------------------------------------------
/pyproject.toml:
--------------------------------------------------------------------------------
1 | [build-system]
2 | requires = ["setuptools>=68", "setuptools-scm>=8"]
3 | build-backend = "setuptools.build_meta"
4 |
5 | [project]
6 | name = "redflag"
7 | dynamic = ["version"]
8 | requires-python = ">=3.8"
9 | authors = [{ name="Matt Hall", email="kwinkunks@gmail.com" },]
10 | description = "Safety net for machine learning pipelines."
11 | license = {file = "LICENSE"}
12 | readme = "README.md"
13 | keywords = ["machine learning", "data science", "quality assurance"]
14 | classifiers = [
15 | "Intended Audience :: Science/Research",
16 | "Topic :: Scientific/Engineering :: Information Analysis",
17 | "Development Status :: 4 - Beta",
18 | "Natural Language :: English",
19 | "Programming Language :: Python :: 3.8",
20 | "Programming Language :: Python :: 3.9",
21 | "Programming Language :: Python :: 3.10",
22 | "Programming Language :: Python :: 3.11",
23 | "Programming Language :: Python :: 3.12",
24 | "License :: OSI Approved :: Apache Software License",
25 | "Operating System :: OS Independent",
26 | ]
27 |
28 | dependencies = [
29 | "numpy<2.0", # NumPy 2 will likely break some things.
30 | "scipy!=1.10.0", # Bug in stats.powerlaw.
31 | "scikit-learn",
32 | ]
33 |
34 | [project.optional-dependencies]
35 | dev = [
36 | "setuptools",
37 | "build",
38 | "mypy",
39 | "types-setuptools",
40 | "pytest",
41 | "coverage[toml]",
42 | "pytest-cov",
43 | "sphinx",
44 | "sphinxcontrib-apidoc",
45 | "sphinx_copybutton",
46 | "furo",
47 | "myst_nb",
48 | "jupyter",
49 | "pandas",
50 | "seaborn",
51 | ]
52 |
53 | [project.urls]
54 | "documentation" = "https://scienxlab.org/redflag"
55 | "repository" = "https://github.com/scienxlab/redflag"
56 |
57 | [tool.setuptools_scm]
58 | # Empty section, required for dynamic versioning.
59 |
60 | [tool.setuptools.packages.find]
61 | where = ["src"]
62 |
63 | [tool.mypy]
64 | warn_return_any = true
65 | ignore_missing_imports = true
66 | exclude = ['docs', 'tests']
67 |
68 | [tool.pytest.ini_options]
69 | # pytest configuration: http://doc.pytest.org/en/latest/customize.html
70 | addopts = "--doctest-modules --doctest-continue-on-failure --ignore=docs --cov=redflag"
71 | filterwarnings = [
72 | "ignore:pkg_resources is deprecated as an API:DeprecationWarning",
73 | "ignore:Deprecated call to `pkg_resources.declare_namespace:DeprecationWarning",
74 | ]
75 |
76 | [tool.coverage.run]
77 | # coverage configuration: https://coverage.readthedocs.io/
78 | omit = [
79 | "src/redflag/__init__.py",
80 | ]
81 |
--------------------------------------------------------------------------------
/docs/index.rst:
--------------------------------------------------------------------------------
1 | :hide-toc:
2 |
3 | .. container::
4 | :name: forkongithub
5 |
6 | `Fork on GitHub <https://github.com/scienxlab/redflag>`_
7 |
8 |
9 | Redflag: safer ML by design
10 | ===========================
11 |
12 | | ``redflag`` is a lightweight safety net for machine
13 | | learning. Given a ``DataFrame`` or ``ndarray``,
14 | | ``redflag`` will analyse the features and the target,
15 | | and warn you about class imbalance, leakage, outliers,
16 | | anomalous data patterns, threats to the IID assumption,
17 | | and more.
18 |
19 |
20 | Quick start
21 | -----------
22 |
23 | .. toctree::
24 | :caption: Quick start
25 |
26 | Install ``redflag`` with pip or with ``conda`` from the ``conda-forge`` channel:
27 |
28 | .. code-block:: shell
29 |
30 | pip install redflag
31 |
32 | Import ``redflag`` in your Python program:
33 |
34 | .. code-block:: python
35 |
36 | import redflag as rf
37 |
38 | There are three main ways to use ``redflag``:
39 |
40 | 1. ``scikit-learn`` components for your pipelines, e.g. ``rf.ImbalanceDetector().fit_transform(X, y)``.
41 | 2. ``pandas`` accessors on Series and DataFrames, e.g. ``df['target'].redflag.imbalance_degree()``.
42 | 3. As a library of standalone functions, e.g. ``rf.imbalance_degree(y)``.
43 |
44 | Carry on exploring with the user guide below.
45 |
46 |
47 | User guide
48 | ----------
49 |
50 | .. toctree::
51 | :maxdepth: 2
52 | :caption: User guide
53 |
54 | installation
55 | what_is_redflag
56 | _notebooks/Basic_usage.ipynb
57 | _notebooks/Using_redflag_with_sklearn.ipynb
58 | _notebooks/Using_redflag_with_Pandas.ipynb
59 | _notebooks/Tutorial.ipynb
60 |
61 |
62 | API reference
63 | -------------
64 |
65 | .. toctree::
66 | :maxdepth: 2
67 | :caption: API reference
68 |
69 | redflag
70 |
71 |
72 | Other resources
73 | ---------------
74 |
75 | .. toctree::
76 | :maxdepth: 1
77 | :caption: Other resources
78 |
79 | development
80 | contributing
81 | authors
82 | license
83 | changelog
84 |
85 |
86 | Indices and tables
87 | ------------------
88 |
89 | * :ref:`genindex`
90 | * :ref:`modindex`
91 | * :ref:`search`
92 |
93 |
94 | .. toctree::
95 | :caption: Project links
96 | :hidden:
97 |
98 | PyPI releases <https://pypi.org/project/redflag/>
99 | Code in GitHub <https://github.com/scienxlab/redflag>
100 | Issue tracker <https://github.com/scienxlab/redflag/issues>
101 | Community guidelines <https://github.com/scienxlab/community/blob/main/CODE_OF_CONDUCT.md>
102 | Scienxlab <https://scienxlab.org>
103 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | # Mac
2 | .DS_Store
3 |
4 | # Version file autocreated in pyproject.toml and redflag/__init__.py
5 | _version.py
6 |
7 | # Processed docs
8 | _notebooks
9 |
10 | # API docs are built
11 | docs/redflag.*.rst
12 |
13 | # Other
14 | .vscode
15 |
16 | # Byte-compiled / optimized / DLL files
17 | __pycache__/
18 | *.py[cod]
19 | *$py.class
20 |
21 | # C extensions
22 | *.so
23 |
24 | # Distribution / packaging
25 | .Python
26 | build/
27 | develop-eggs/
28 | dist/
29 | downloads/
30 | eggs/
31 | .eggs/
32 | lib/
33 | lib64/
34 | parts/
35 | sdist/
36 | var/
37 | wheels/
38 | pip-wheel-metadata/
39 | share/python-wheels/
40 | *.egg-info/
41 | .installed.cfg
42 | *.egg
43 | MANIFEST*
44 |
45 | # PyInstaller
46 | # Usually these files are written by a python script from a template
47 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
48 | *.manifest
49 | *.spec
50 |
51 | # Installer logs
52 | pip-log.txt
53 | pip-delete-this-directory.txt
54 |
55 | # Unit test / coverage reports
56 | htmlcov/
57 | .tox/
58 | .nox/
59 | .coverage
60 | .coverage.*
61 | .cache
62 | nosetests.xml
63 | coverage.xml
64 | *.cover
65 | *.py,cover
66 | .hypothesis/
67 | .pytest_cache/
68 |
69 | # Translations
70 | *.mo
71 | *.pot
72 |
73 | # Django stuff:
74 | *.log
75 | local_settings.py
76 | db.sqlite3
77 | db.sqlite3-journal
78 |
79 | # Flask stuff:
80 | instance/
81 | .webassets-cache
82 |
83 | # Scrapy stuff:
84 | .scrapy
85 |
86 | # Sphinx documentation
87 | docs/_build/
88 |
89 | # PyBuilder
90 | target/
91 |
92 | # Jupyter Notebook
93 | .ipynb_checkpoints
94 |
95 | # IPython
96 | profile_default/
97 | ipython_config.py
98 |
99 | # pyenv
100 | .python-version
101 |
102 | # pipenv
103 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
104 | # However, in case of collaboration, if having platform-specific dependencies or dependencies
105 | # having no cross-platform support, pipenv may install dependencies that don't work, or not
106 | # install all needed dependencies.
107 | #Pipfile.lock
108 |
109 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow
110 | __pypackages__/
111 |
112 | # Celery stuff
113 | celerybeat-schedule
114 | celerybeat.pid
115 |
116 | # SageMath parsed files
117 | *.sage.py
118 |
119 | # Environments
120 | .env
121 | .venv
122 | env/
123 | venv/
124 | ENV/
125 | env.bak/
126 | venv.bak/
127 |
128 | # Spyder project settings
129 | .spyderproject
130 | .spyproject
131 |
132 | # Rope project settings
133 | .ropeproject
134 |
135 | # mkdocs documentation
136 | /site
137 |
138 | # mypy
139 | .mypy_cache/
140 | .dmypy.json
141 | dmypy.json
142 |
143 | # Pyre type checker
144 | .pyre/
145 |
--------------------------------------------------------------------------------
/docs/what_is_redflag.md:
--------------------------------------------------------------------------------
1 | # 🚩 What is `redflag`?
2 |
3 | ## Overview
4 |
5 | _Redflag_ is a Python library that applies "safety by design" to machine
6 | learning. It helps researchers and practitioners in this field ensure their
7 | models are safe and reliable by alerting them to potential pitfalls. These
8 | pitfalls could lead to overconfidence in the model or wildly spurious
9 | predictions. _Redflag_ offers accessible ways for users to integrate safety
10 | checks into their workflows by providing `scikit-learn` transformers, `pandas`
11 | accessors, and standalone functions. These components can easily be
12 | incorporated into existing workflows, helping identify issues and enhance the
13 | quality and safety of predictive models.
14 |
15 |
16 | ## Safety by design
17 |
18 | _Safety by design_ means to 'design out' hazardous situations from complex
19 | machines or processes before they can do harm. The concept, also known as
20 | _prevention through design_, has been applied to civil engineering and
21 | industrial design for decades. Recently it has also been applied to software
22 | engineering and, more recently still, to machine learning
23 | [@van-gelder-etal-2021]. _Redflag_ helps machine learning researchers and
24 | practitioners design safety into their workflows.
25 |
26 | To read more about the motivation for this package, check out
27 | [the draft paper](https://github.com/scienxlab/redflag/blob/paper/paper/paper.md)
28 | submitted to [JOSS](https://joss.theoj.org).
29 |
30 |
31 | ## What's in `redflag`
32 |
33 | _Redflag_ offers three ways for users to insert safety checks into their
34 | machine learning workflows:
35 |
36 | 1. **`scikit-learn` transformers** which fit directly into the pipelines that
37 | most data scientists are already using, e.g.
38 | `redflag.ImbalanceDetector().fit_transform(X, y)`.
39 | 2. **`pandas` accessors** on Series and DataFrames, which can be called like a
40 | method on existing Pandas objects, e.g. `df['target'].redflag.is_imbalanced()`.
41 | 3. **Standalone functions** which the user can compose their own checks and
42 | tests with, e.g. `redflag.is_imbalanced(y)`.
43 |
44 | There are two kinds of `scikit-learn` transformer:
45 |
46 | - **Detectors** check every dataset they encounter. For example,
47 | `redflag.ClippingDetector` checks for clipped data during both model fitting
48 | and during prediction.
49 | - **Comparators** learn some parameter in the model fitting step, then check
50 | subsequent data against those parameters. For example,
51 | `redflag.DistributionComparator` learns the empirical univariate distributions
52 | of the training features, then checks that the features in subsequent datasets
53 | are tolerably close to these baselines.
54 |
55 | Although the `scikit-learn` components are implemented as transformers,
56 | subclassing `sklearn.base.BaseEstimator` and `sklearn.base.TransformerMixin`, they
57 | do not transform the data. They only raise warnings (or, optionally,
58 | exceptions) when a check fails. _Redflag_ does not attempt to fix any problems
59 | it encounters.
60 |
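
As a rough sketch of how these components slot into a pipeline (the scaler and classifier are only illustrative, and default constructor arguments are assumed):

```python
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

import redflag as rf

pipe = make_pipeline(
    rf.ClippingDetector(),        # detector: checks every dataset it sees
    rf.DistributionComparator(),  # comparator: learns baselines during fit
    StandardScaler(),
    LogisticRegression(),
)

# pipe.fit(X_train, y_train) emits warnings if any check fails, then fits as
# usual; pipe.predict(X_new) runs the same checks on the new data first.
```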
--------------------------------------------------------------------------------
/src/redflag/independence.py:
--------------------------------------------------------------------------------
1 | """
2 | Functions related to understanding row independence.
3 |
4 | Author: Matt Hall, scienxlab.org
5 | Licence: Apache 2.0
6 |
7 | Copyright 2024 Redflag contributors
8 |
9 | Licensed under the Apache License, Version 2.0 (the "License");
10 | you may not use this file except in compliance with the License.
11 | You may obtain a copy of the License at
12 |
13 | http://www.apache.org/licenses/LICENSE-2.0
14 |
15 | Unless required by applicable law or agreed to in writing, software
16 | distributed under the License is distributed on an "AS IS" BASIS,
17 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
18 | See the License for the specific language governing permissions and
19 | limitations under the License.
20 | """
21 | import numpy as np
22 | from numpy.typing import ArrayLike
23 |
24 |
25 | def is_correlated(a: ArrayLike, n: int=20, s: int=20, threshold: float=0.1) -> bool:
26 | """
27 | Check if a dataset is auto-correlated. This function returns True if
28 | the 1D input array `a` appears to be correlated to itself, perhaps
29 | because it consists of measurements sampled at neighbouring points
30 | in time or space, at a spacing short enough that samples are correlated.
31 |
32 | If samples are correlated in this way, then the records in your dataset
33 | may break the IID assumption implicit in much of statistics (though not
34 | in specialist geostatistics or timeseries algorithms). This is not
35 | necessarily a big problem, but it does mean you need to be careful
36 | about how you split your data, for example a random split between train
37 | and test will leak information from train to test, because neighbouring
38 | samples are correlated.
39 |
40 | This function inspects s random chunks of n samples, averaging the
41 | autocorrelation coefficients across chunks. If the mean first non-zero
42 | lag is greater than the threshold, the array may be autocorrelated.
43 |
44 | See the Tutorial in the documentation for more about how to use this
45 | function.
46 |
47 | Args:
48 | a (array): The data.
49 | n (int): The number of samples per chunk.
50 | s (int): The number of chunks.
51 | threshold (float): The auto-correlation threshold.
52 |
53 | Returns:
54 | bool: True if the data are autocorrelated.
55 |
56 | Examples:
57 | >>> is_correlated([7, 1, 6, 8, 7, 6, 2, 9, 4, 2])
58 | False
59 | >>> is_correlated([1, 2, 1, 7, 6, 8, 6, 2, 1, 1])
60 | True
61 | """
62 | a = np.asarray(a)
63 |
64 | # Split into chunks n samples long.
65 | L_chunks = min(a.size, n)
66 | chunks = np.array_split(a, a.size//L_chunks)
67 |
68 | # Choose up to s chunk indices at random.
69 | N_chunks = min(len(chunks), s)
70 | rng = np.random.default_rng()
71 | r = rng.choice(np.arange(len(chunks)), size=N_chunks, replace=False)
72 |
73 | # Loop over selected chunks and count ones with correlation.
74 | acs: list = []
75 | for chunk in [c for i, c in enumerate(chunks) if i in r]:
76 | c = chunk[:L_chunks] - np.nanmean(chunk)
77 | autocorr = np.correlate(c, c, mode='same')
78 | acs.append(autocorr / (c.size * np.nanvar(c)))
79 |
80 | # Average the autocorrelations.
81 | acs = np.sum(acs, axis=0) / N_chunks
82 |
83 | p = acs[c.size//2 - 1] # First non-zero lag.
84 | q = acs[c.size//2 - 2] # Next non-zero lag.
85 |
86 | return (p >= threshold) & (q >= 0)
87 |
--------------------------------------------------------------------------------
/docs/conf.py:
--------------------------------------------------------------------------------
1 | # Configuration file for Sphinx documentation builder.
2 |
3 | # -- Setup function ----------------------------------------------------------
4 |
5 | # Defines custom steps in the process.
6 |
7 | def autodoc_skip_member(app, what, name, obj, skip, options):
8 | """Exclude all private attributes, methods, and dunder methods from Sphinx."""
9 | import re
10 | exclude = re.findall(r'\._.*', str(obj))
11 | return skip or exclude
12 |
13 | def remove_module_docstring(app, what, name, obj, options, lines):
14 | """Remove everything after 'Author: '."""
15 | if what == "module":
16 | keep = [i for i, line in enumerate(lines) if line.startswith("Author: ")]
17 | if keep:
18 | del lines[keep[0]:]
19 | return
20 |
21 | def setup(app):
22 | app.connect('autodoc-skip-member', autodoc_skip_member)
23 | app.connect("autodoc-process-docstring", remove_module_docstring)
24 | return
25 |
26 |
27 | # -- Path setup --------------------------------------------------------------
28 |
29 | # If extensions (or modules to document with autodoc) are in another directory,
30 | # add these directories to sys.path here. If the directory is relative to the
31 | # documentation root, use os.path.abspath to make it absolute, like shown here.
32 |
33 | import os
34 | import sys
35 | sys.path.insert(0, os.path.abspath('../src'))
36 |
37 |
38 | # -- Project information -----------------------------------------------------
39 |
40 | project = 'redflag'
41 | copyright = '2024, The Redflag Authors'
42 | author = 'The Redflag Authors'
43 |
44 |
45 | # -- General configuration ---------------------------------------------------
46 |
47 | # Add any Sphinx extension module names here, as strings. They can be
48 | # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom
49 | # ones.
50 | extensions = [
51 | 'sphinxcontrib.apidoc',
52 | 'sphinx.ext.githubpages',
53 | 'sphinx.ext.napoleon',
54 | 'sphinx.ext.coverage',
55 | 'sphinx_copybutton',
56 | 'myst_nb',
57 | ]
58 |
59 | myst_enable_extensions = ["dollarmath", "amsmath"]
60 |
61 | # Apidoc automation
62 | # https://pypi.org/project/sphinxcontrib-apidoc/
63 | # The apidoc extension and this code automatically update apidoc.
64 | apidoc_module_dir = '../src/redflag'
65 | apidoc_output_dir = './'
66 | apidoc_excluded_paths = []
67 | apidoc_toc_file = False
68 | apidoc_separate_modules = True
69 |
70 | # Add any paths that contain templates here, relative to this directory.
71 | templates_path = ['_templates']
72 |
73 | # List of patterns, relative to source directory, that match files and
74 | # directories to ignore when looking for source files.
75 | # This pattern also affects html_static_path and html_extra_path.
76 | exclude_patterns = ['_build', 'notebooks']
77 |
78 |
79 | # -- Options for HTML output -------------------------------------------------
80 |
81 | # The theme to use for HTML and HTML Help pages. See the documentation for
82 | # a list of builtin themes.
83 | #
84 | # https://sphinx-themes.org/sample-sites/furo/
85 | html_theme = 'furo'
86 | html_title = ''
87 | html_theme_options = {
88 | "sidebar_hide_name": True,
89 | }
90 |
91 | # Add any paths that contain custom static files (such as style sheets) here,
92 | # relative to this directory. They are copied after the builtin static files,
93 | # so a file named "default.css" will overwrite the builtin "default.css".
94 | html_static_path = ['_static']
95 |
96 | html_css_files = [
97 | 'custom.css',
98 | ]
99 |
100 | # Branding.
101 | html_favicon = '_static/favicon.ico'
102 | html_logo = '_static/redflag_logo.png'
103 |
--------------------------------------------------------------------------------
/CODE_OF_CONDUCT.md:
--------------------------------------------------------------------------------
1 | # Contributor Covenant Code of Conduct
2 |
3 | ## Our pledge
4 |
5 | In the interest of fostering an open and welcoming environment, we as
6 | contributors and maintainers pledge to make participation in our project and
7 | our community a harassment-free experience for everyone, regardless of age, body
8 | size, disability, ethnicity, sex characteristics, gender identity and expression,
9 | level of experience, education, socio-economic status, nationality, personal
10 | appearance, race, religion, or sexual identity and orientation.
11 |
12 | ## Our standards
13 |
14 | Examples of behavior that contributes to creating a positive environment
15 | include:
16 |
17 | * Using welcoming and inclusive language
18 | * Being respectful of differing viewpoints and experiences
19 | * Gracefully accepting constructive criticism
20 | * Focusing on what is best for the community
21 | * Showing empathy towards other community members
22 |
23 | Examples of unacceptable behavior by participants include:
24 |
25 | * The use of sexualized language or imagery and unwelcome sexual attention or
26 | advances
27 | * Racist or racially biased remarks, attacks, or content.
28 | * Trolling, insulting/derogatory comments, and personal or political attacks
29 | * Public or private harassment
30 | * Publishing others' private information, such as a physical or electronic
31 | address, without explicit permission
32 | * Other conduct which could reasonably be considered inappropriate in a
33 | professional setting
34 |
35 | ## Our responsibilities
36 |
37 | Project maintainers are responsible for clarifying the standards of acceptable
38 | behavior and are expected to take appropriate and fair corrective action in
39 | response to any instances of unacceptable behavior.
40 |
41 | Project maintainers have the right and responsibility to remove, edit, or
42 | reject comments, commits, code, wiki edits, issues, and other contributions
43 | that are not aligned to this Code of Conduct, or to ban temporarily or
44 | permanently any contributor for other behaviors that they deem inappropriate,
45 | threatening, offensive, or harmful.
46 |
47 | ## Scope
48 |
49 | This Code of Conduct applies within all project spaces, and it also applies when
50 | an individual is representing the project or its community in public spaces.
51 | Examples of representing a project or community include using an official
52 | project e-mail address, posting via an official social media account, or acting
53 | as an appointed representative at an online or offline event. Representation of
54 | a project may be further defined and clarified by project maintainers.
55 |
56 | ## Enforcement
57 |
58 | Instances of abusive, harassing, or otherwise unacceptable behavior may be
59 | reported by contacting any of the following people:
60 |
61 | - Matt Hall, [kwinkunks@gmail.com](mailto:kwinkunks@gmail.com)
62 |
63 | All complaints will be reviewed and investigated and will result in a response that
64 | is deemed necessary and appropriate to the circumstances. The project team is
65 | obligated to maintain confidentiality with regard to the reporter of an incident.
66 | Further details of specific enforcement policies may be posted separately.
67 |
68 | Project maintainers who do not follow or enforce the Code of Conduct in good
69 | faith may face temporary or permanent repercussions as determined by other
70 | members of the project's leadership.
71 |
72 | ## Attribution
73 |
74 | This Code of Conduct is adapted from the [Contributor Covenant](https://www.contributor-covenant.org/), version 1.4,
75 | available at https://www.contributor-covenant.org/version/1/4/code-of-conduct.html
76 |
77 | For answers to common questions about this code of conduct, see
78 | https://www.contributor-covenant.org/faq
79 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # redflag
2 |
3 | [](https://github.com/scienxlab/redflag/actions/workflows/build-test.yml)
4 | [](https://github.com/scienxlab/redflag/actions/workflows/publish-docs.yml)
5 | [](https://pypi.org/project/redflag/)
6 | [](https://anaconda.org/conda-forge/redflag)
7 | [](https://pypi.org/project/redflag/)
8 | [](https://pypi.org/project/redflag/)
9 |
10 | 🚩 `redflag` aims to be an automatic safety net for machine learning datasets. The vision is to accept input of a Pandas `DataFrame` or NumPy `ndarray` representing the input `X` and target `y` in a machine learning task. `redflag` will provide an analysis of each feature, and of the target, including aspects such as class imbalance, leakage, outliers, anomalous data patterns, threats to the IID assumption, and so on. The goal is to complement other projects like `pandas-profiling` and `greatexpectations`.
11 |
12 |
13 | ## Installation
14 |
15 | You can install this package with `pip`:
16 |
17 | python -m pip install redflag
18 |
19 | Alternatively, you can use the `conda` package manager, pointed at the `conda-forge` channel:
20 |
21 | conda install -c conda-forge redflag
22 |
23 | For developers, there is a `pip` option for installing `dev` dependencies. Use `pip install "redflag[dev]"` to install all testing and documentation packages.
24 |
25 |
26 | ## Example with `sklearn`
27 |
28 | The most useful components of `redflag` are probably the `scikit-learn` "detectors". These sit in your pipeline, look at your training and validation data, and emit warnings if something looks like it might cause a problem. For example, we can get alerted to an imbalanced target vector `y` like so:
29 |
30 | ```python
31 | import redflag as rf
32 | from sklearn.datasets import make_classification
33 |
34 | X, y = make_classification(weights=[0.1])
35 |
36 | _ = rf.ImbalanceDetector().fit(X, y)
37 | ```
38 |
39 | This raises a warning:
40 |
41 | ```python
42 | 🚩 The labels are imbalanced by more than the threshold (0.780 > 0.400). See self.minority_classes_ for the minority classes.
43 | ```
44 |
45 | For maximum effect, put this and other detectors in your pipeline, or use the pre-built `rf.pipeline`, which contains several useful alerts.
46 |
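For instance, here is a sketch of dropping the same detector into a pipeline, re-using `X` and `y` from above (the classifier is only illustrative):

```python
from sklearn.pipeline import make_pipeline
from sklearn.svm import SVC

import redflag as rf

pipe = make_pipeline(rf.ImbalanceDetector(), SVC())
pipe.fit(X, y)  # emits the same 🚩 warning during fit, then trains as usual
```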
47 | See [the documentation](https://scienxlab.org/redflag), and specifically the notebook [Using `redflag` with `sklearn`.ipynb](https://github.com/scienxlab/redflag/blob/main/docs/notebooks/Using_redflag_with_sklearn.ipynb) for other examples.
48 |
49 |
50 | ## Example of function call
51 |
52 | `redflag` is also a collection of functions. Most of the useful ones take one or more columns of data (usually a 1D or 2D NumPy array) and run a single test. For example, we can do some outlier detection. The `get_outliers()` function returns the indices of data points that are considered outliers:
53 |
54 | ```python
55 | >>> import redflag as rf
56 | >>> data = 3 * [-3, -2, -2, -1, 0, 0, 0, 1, 2, 2, 3]
57 | >>> rf.get_outliers(data)
58 | array([], dtype=int64)
59 | ```
60 |
61 | That is, there are no outliers. But let's add a clear outlier: a new data record with a value of 100. The function returns the index position(s) of the outlier point(s):
62 |
63 | ```python
64 | >>> rf.get_outliers(data + [100])
65 | array([33])
66 | ```
67 |
68 | See [the documentation](https://scienxlab.org/redflag), and specifically the notebook [Basic_usage.ipynb](https://github.com/scienxlab/redflag/blob/main/docs/notebooks/Basic_usage.ipynb) for several other basic examples.
69 |
70 |
71 | ## Documentation
72 |
73 | [The documentation is online.](https://scienxlab.org/redflag)
74 |
75 |
76 | ## Contributing
77 |
78 | Please see [`CONTRIBUTING.md`](https://github.com/scienxlab/redflag/blob/main/CONTRIBUTING.md). There is also a section [in the documentation](https://scienxlab.org/redflag) about _Development_.
79 |
--------------------------------------------------------------------------------
/docs/_static/redflag.svg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/scienxlab/redflag/HEAD/docs/_static/redflag.svg
--------------------------------------------------------------------------------
/src/redflag/pandas.py:
--------------------------------------------------------------------------------
1 | """
2 | Pandas accessors.
3 |
4 | Author: Matt Hall, scienxlab.org
5 | Licence: Apache 2.0
6 |
7 | Copyright 2024 Redflag contributors
8 |
9 | Licensed under the Apache License, Version 2.0 (the "License");
10 | you may not use this file except in compliance with the License.
11 | You may obtain a copy of the License at
12 |
13 | http://www.apache.org/licenses/LICENSE-2.0
14 |
15 | Unless required by applicable law or agreed to in writing, software
16 | distributed under the License is distributed on an "AS IS" BASIS,
17 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
18 | See the License for the specific language governing permissions and
19 | limitations under the License.
20 | """
21 | import warnings
22 | from typing import Optional
23 |
24 | from .imbalance import imbalance_degree, minority_classes, is_imbalanced
25 | from .importance import feature_importances as feature_importances
26 | from .outliers import get_outliers
27 | from .target import *
28 | from .independence import is_correlated
29 | from .utils import docstring_from
30 |
31 |
32 | def null_decorator(arg):
33 | """
34 | Returns a decorator that does nothing but wrap the function it
35 | decorates. Need to do this to accept an argument on the decorator.
36 | """
37 | def decorator(func):
38 | return func
39 | return decorator
40 |
41 |
42 | try:
43 | from pandas.api.extensions import register_dataframe_accessor
44 | from pandas.api.extensions import register_series_accessor
45 | except ImportError:
46 | register_dataframe_accessor = null_decorator
47 | register_series_accessor = null_decorator
48 |
49 |
50 | TEMPLATES = {
51 | 'continuous': """Continuous data suitable for regression
52 | Outliers: {outliers}
53 | Correlated: {correlated}
54 | Dummy scores:{dummy_scores}
55 | """,
56 | 'categorical': """Categorical data suitable for classification
57 | Imbalance degree: {imbalance}
58 | Minority classes: {minority_classes}
59 | Dummy scores: {dummy_scores}
60 | """
61 | }
62 |
63 | @register_series_accessor("redflag")
64 | class SeriesAccessor:
65 | def __init__(self, pandas_obj):
66 | self._obj = pandas_obj
67 |
68 | @docstring_from(minority_classes)
69 | def minority_classes(self):
70 | if is_continuous(self._obj):
71 | warnings.warn('The Series does not seem categorical.')
72 | return minority_classes(self._obj)
73 |
74 | @docstring_from(imbalance_degree)
75 | def imbalance_degree(self):
76 | if is_continuous(self._obj):
77 | warnings.warn('The Series does not seem categorical.')
78 | return imbalance_degree(self._obj)
79 |
80 | @docstring_from(is_imbalanced)
81 | def is_imbalanced(self, threshold=0.4, method='tv', classes=None):
82 | if is_continuous(self._obj):
83 | warnings.warn('The Series does not seem categorical.')
84 | return is_imbalanced(self._obj,
85 | threshold=threshold,
86 | method=method,
87 | classes=classes
88 | )
89 |
90 | @docstring_from(is_ordered)
91 | def is_ordered(self, q=0.95):
92 | return is_ordered(self._obj, q=q)
93 |
94 | @docstring_from(dummy_scores)
95 | def dummy_scores(self, task='auto', random_state=None):
96 | return dummy_scores(self._obj, task=task, random_state=random_state)
97 |
98 | def report(self, random_state=None):
99 | results = {}
100 | if is_continuous(self._obj):
101 | results['outliers'] = get_outliers(self._obj)
102 | results['correlated'] = is_correlated(self._obj)
103 | results['dummy_scores'] = dummy_regression_scores(self._obj)
104 | template = TEMPLATES['continuous']
105 | else:
106 | # Categorical.
107 | results['minority_classes'] = minority_classes(self._obj)
108 | results['imbalance'] = imbalance_degree(self._obj)
109 | results['dummy_scores'] = dummy_classification_scores(self._obj, random_state=random_state)
110 | template = TEMPLATES['categorical']
111 |
112 | return template.format(**results)
113 |
114 |
115 | @register_dataframe_accessor("redflag")
116 | class DataFrameAccessor:
117 | def __init__(self, pandas_obj):
118 | self._obj = pandas_obj
119 |
120 | @docstring_from(feature_importances)
121 | def feature_importances(self, features=None, target=None,
122 | task: Optional[str]=None,
123 | random_state: Optional[int]=None,
124 | ):
125 | if target is None:
126 | raise ValueError('You must provide a target column.')
127 | else:
128 | y_ = self._obj[target]
129 | if is_continuous(y_):
130 | task = 'regression'
131 | else:
132 | task = 'classification'
133 | if len(y_.shape) > 1:
134 | raise NotImplementedError('Multilabel targets are not supported.')
135 | if features is None and target is not None:
136 | X_ = self._obj.drop(columns=target)
137 | else:
138 | X_ = self._obj[features]
139 | return feature_importances(X_, y_,
140 | task=task,
141 | random_state=random_state
142 | )
143 |
144 |
145 | def correlation_detector(self, features=None, target=None, n=20, s=20, threshold=0.1):
146 | """
147 | This is an experimental feature.
148 | """
149 | if target is not None:
150 | y_ = self._obj[target]
151 | if len(y_.shape) > 1:
152 | raise NotImplementedError('Multilabel targets are not supported.')
153 | if is_correlated(y_):
154 |                 warnings.warn('The target appears to be autocorrelated.', stacklevel=2)
155 |
156 | if features is None and target is not None:
157 | X_ = self._obj.drop(target, axis=1).values
158 | else:
159 | X_ = self._obj[features].values
160 |
161 | for i, x in enumerate(X_.T):
162 | if is_correlated(x, n=n, s=s, threshold=threshold):
163 | warnings.warn(f'🚩 Feature {i} appears to be autocorrelated.', stacklevel=2)
164 |
165 | # There is probably something more useful to return.
166 | return
167 |
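168 | 
169 | if __name__ == '__main__':
170 |     # Minimal usage sketch (e.g. run with `python -m redflag.pandas`),
171 |     # assuming pandas is installed. The accessors registered above give
172 |     # Series and DataFrame objects a `.redflag` namespace; the data below
173 |     # is made up purely for illustration.
174 |     import numpy as np
175 |     import pandas as pd
176 | 
177 |     s = pd.Series([1, 1, 1, 1, 1, 2, 2, 2, 3, 3])  # A categorical target.
178 |     print(s.redflag.minority_classes())
179 |     print(s.redflag.imbalance_degree())
180 |     print(s.redflag.report(random_state=42))
181 | 
182 |     rng = np.random.default_rng(42)
183 |     df = pd.DataFrame({'a': rng.normal(size=30), 'b': rng.normal(size=30)})
184 |     df['target'] = 3 * df['b'] + rng.normal(scale=0.1, size=30)
185 |     print(df.redflag.feature_importances(target='target', random_state=42))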
--------------------------------------------------------------------------------
/src/redflag/importance.py:
--------------------------------------------------------------------------------
1 | """
2 | Feature importance metrics.
3 |
4 | Author: Matt Hall, scienxlab.org
5 | Licence: Apache 2.0
6 |
7 | Copyright 2024 Redflag contributors
8 |
9 | Licensed under the Apache License, Version 2.0 (the "License");
10 | you may not use this file except in compliance with the License.
11 | You may obtain a copy of the License at
12 |
13 | http://www.apache.org/licenses/LICENSE-2.0
14 |
15 | Unless required by applicable law or agreed to in writing, software
16 | distributed under the License is distributed on an "AS IS" BASIS,
17 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
18 | See the License for the specific language governing permissions and
19 | limitations under the License.
20 | """
21 | from typing import Optional
22 |
23 | import numpy as np
24 | from numpy.typing import ArrayLike
25 | from sklearn.inspection import permutation_importance
26 | from sklearn.linear_model import LinearRegression
27 | from sklearn.ensemble import RandomForestRegressor
28 | from sklearn.neighbors import KNeighborsClassifier
29 | from sklearn.neighbors import KNeighborsRegressor
30 | from sklearn.linear_model import LogisticRegression
31 | from sklearn.ensemble import RandomForestClassifier
32 |
33 | from .target import is_continuous
34 | from .utils import split_and_standardize
35 | from .utils import aggregate
36 |
37 |
38 | def feature_importances(X: ArrayLike, y: ArrayLike=None,
39 | task: Optional[str]=None,
40 | random_state: Optional[int]=None,
41 | ) -> np.ndarray:
42 | """
43 | Estimate feature importances on a supervised task, given X and y.
44 |
45 | Classification tasks are assessed with logistic regression, a random
46 | forest, and KNN permutation importance. Regression tasks are assessed with
47 |     linear regression, a random forest, and KNN permutation importance.
48 |
49 | The scores from these assessments are normalized, and the normalized
50 | sum is returned.
51 |
52 | See the Tutorial in the documentation for more information.
53 |
54 | Args:
55 | X (array): an array representing the data.
56 | y (array or None): an array representing the target. If None, the task
57 | is assumed to be an unsupervised clustering task.
58 | task (str or None): either 'classification' or 'regression'. If None,
59 | the task will be inferred from the labels and a warning will show
60 | the assumption being made.
61 | random_state (int or None): the random state to use.
62 |
63 | Returns:
64 | array: The importance of the features, in the order in which they
65 | appear in X.
66 |
67 | Examples:
68 | >>> X = [[0, 0, 0], [0, 1, 1], [0, 2, 0], [0, 3, 1], [0, 4, 0], [0, 5, 1], [0, 7, 0], [0, 8, 1], [0, 8, 0]]
69 | >>> y = [5, 15, 25, 35, 45, 55, 80, 85, 90]
70 | >>> feature_importances(X, y, task='regression', random_state=42)
71 | array([0. , 0.9831828, 0.0168172])
72 | >>> y = ['a', 'a', 'a', 'b', 'b', 'b', 'c', 'c', 'c']
73 | >>> x0, x1, x2 = feature_importances(X, y, task='classification', random_state=42)
74 | >>> x1 > x2 > x0 # See Issue #49 for why this test is like this.
75 | True
76 | """
77 | if y is None:
78 | raise NotImplementedError('Unsupervised importance is not yet implemented.')
79 |
80 | if task is None:
81 | task = 'regression' if is_continuous(y) else 'classification'
82 |
83 | # Split the data and ensure it is standardized.
84 | X, X_train, X_val, y, y_train, y_val = split_and_standardize(X, y, random_state=random_state)
85 |
86 | # Train three models and gather the importances.
87 | imps: list = []
88 | if task == 'classification':
89 | imps.append(np.abs(LogisticRegression(random_state=random_state).fit(X, y).coef_.sum(axis=0)))
90 | imps.append(RandomForestClassifier(random_state=random_state).fit(X, y).feature_importances_)
91 | model = KNeighborsClassifier().fit(X_train, y_train)
92 | r = permutation_importance(model, X_val, y_val, n_repeats=8, scoring='f1_weighted', random_state=random_state)
93 | imps.append(r.importances_mean)
94 | elif task == 'regression':
95 | imps.append(np.abs(LinearRegression().fit(X, y).coef_))
96 | imps.append(RandomForestRegressor(random_state=random_state).fit(X, y).feature_importances_)
97 | model = KNeighborsRegressor().fit(X_train, y_train)
98 | r = permutation_importance(model, X_val, y_val, n_repeats=8, scoring='neg_mean_squared_error', random_state=random_state)
99 | imps.append(r.importances_mean)
100 |
101 | # Eliminate negative values and aggregate.
102 | imps = np.array(imps)
103 | imps[imps < 0] = 0
104 | return aggregate(imps, normalize_input=True, normalize_output=True)
105 |
106 |
107 | def least_important_features(importances: ArrayLike,
108 | threshold: Optional[float]=None) -> np.ndarray:
109 | """
110 |     Returns the indices of the least important features, in order of
111 |     importance (least important first). The threshold controls how many
112 |     features are returned. Set it to None to set it automatically.
113 |
114 | Args:
115 | importances (array): the importance of the features, in the order in
116 | which they appear in X.
117 | threshold (float or None): the cutoff for the importance. If None, the
118 | cutoff is set to half the expectation of the importance (i.e. 0.5/M
119 | where M is the number of features).
120 |
121 | Returns:
122 | array: The indices of the least important features.
123 |
124 | Examples:
125 | >>> least_important_features([0.05, 0.01, 0.24, 0.4, 0.3])
126 | array([1, 0])
127 | >>> least_important_features([0.2, 0.2, 0.2, 0.2, 0.2])
128 | array([], dtype=int64)
129 | """
130 | if threshold is None:
131 | threshold = 0.5 / len(importances)
132 |
133 | least_important: dict = {}
134 | for arg, imp in zip(np.argsort(importances), np.sort(importances)):
135 | if sum(least_important.values()) + imp > threshold:
136 | break
137 | least_important[arg] = imp
138 |
139 | return np.array(list(least_important)).astype(int)
140 |
141 |
142 | def most_important_features(importances: ArrayLike,
143 | threshold: Optional[float]=None) -> np.ndarray :
144 | """
145 | Returns the indices of the most important features, in reverse order of
146 | importance (most important first). The threshold controls how many features
147 | are returned. Set it to None to set it automatically.
148 |
149 | Args:
150 | importances (array): the importance of the features, in the order in
151 | which they appear in X.
152 | threshold (float or None): the cutoff for the importance. If None,
153 | the cutoff is set to (M-1)/M where M is the number of features.
154 |
155 | Returns:
156 | array: The indices of the most important features.
157 |
158 | Examples:
159 | >>> most_important_features([0.05, 0.01, 0.24, 0.4, 0.3])
160 | array([3, 4, 2])
161 | >>> most_important_features([0.2, 0.2, 0.2, 0.2, 0.2])
162 | array([4, 3, 2, 1, 0])
163 | """
164 | if threshold is None:
165 | threshold = 1 - 0.5 / len(importances)
166 |
167 | most_important: dict = {}
168 | args = np.argsort(importances)[::-1]
169 | imps = np.sort(importances)[::-1]
170 | for arg, imp in zip(args, imps):
171 | most_important[arg] = imp
172 | if sum(most_important.values()) > threshold:
173 | break
174 |
175 | return np.array(list(most_important)).astype(int)
176 |
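177 | 
178 | if __name__ == '__main__':
179 |     # Minimal usage sketch (e.g. run with `python -m redflag.importance`),
180 |     # reusing the data from the doctests above: estimate the importances,
181 |     # then split them into the most and least important feature indices.
182 |     X = [[0, 0, 0], [0, 1, 1], [0, 2, 0], [0, 3, 1], [0, 4, 0],
183 |          [0, 5, 1], [0, 7, 0], [0, 8, 1], [0, 8, 0]]
184 |     y = [5, 15, 25, 35, 45, 55, 80, 85, 90]
185 |     imps = feature_importances(X, y, task='regression', random_state=42)
186 |     print(imps)                            # Approx [0, 0.98, 0.02], as in the doctest.
187 |     print(most_important_features(imps))   # Indices, most important first.
188 |     print(least_important_features(imps))  # Indices, least important first.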
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | Apache License
2 | Version 2.0, January 2004
3 | http://www.apache.org/licenses/
4 |
5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
6 |
7 | 1. Definitions.
8 |
9 | "License" shall mean the terms and conditions for use, reproduction,
10 | and distribution as defined by Sections 1 through 9 of this document.
11 |
12 | "Licensor" shall mean the copyright owner or entity authorized by
13 | the copyright owner that is granting the License.
14 |
15 | "Legal Entity" shall mean the union of the acting entity and all
16 | other entities that control, are controlled by, or are under common
17 | control with that entity. For the purposes of this definition,
18 | "control" means (i) the power, direct or indirect, to cause the
19 | direction or management of such entity, whether by contract or
20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the
21 | outstanding shares, or (iii) beneficial ownership of such entity.
22 |
23 | "You" (or "Your") shall mean an individual or Legal Entity
24 | exercising permissions granted by this License.
25 |
26 | "Source" form shall mean the preferred form for making modifications,
27 | including but not limited to software source code, documentation
28 | source, and configuration files.
29 |
30 | "Object" form shall mean any form resulting from mechanical
31 | transformation or translation of a Source form, including but
32 | not limited to compiled object code, generated documentation,
33 | and conversions to other media types.
34 |
35 | "Work" shall mean the work of authorship, whether in Source or
36 | Object form, made available under the License, as indicated by a
37 | copyright notice that is included in or attached to the work
38 | (an example is provided in the Appendix below).
39 |
40 | "Derivative Works" shall mean any work, whether in Source or Object
41 | form, that is based on (or derived from) the Work and for which the
42 | editorial revisions, annotations, elaborations, or other modifications
43 | represent, as a whole, an original work of authorship. For the purposes
44 | of this License, Derivative Works shall not include works that remain
45 | separable from, or merely link (or bind by name) to the interfaces of,
46 | the Work and Derivative Works thereof.
47 |
48 | "Contribution" shall mean any work of authorship, including
49 | the original version of the Work and any modifications or additions
50 | to that Work or Derivative Works thereof, that is intentionally
51 | submitted to Licensor for inclusion in the Work by the copyright owner
52 | or by an individual or Legal Entity authorized to submit on behalf of
53 | the copyright owner. For the purposes of this definition, "submitted"
54 | means any form of electronic, verbal, or written communication sent
55 | to the Licensor or its representatives, including but not limited to
56 | communication on electronic mailing lists, source code control systems,
57 | and issue tracking systems that are managed by, or on behalf of, the
58 | Licensor for the purpose of discussing and improving the Work, but
59 | excluding communication that is conspicuously marked or otherwise
60 | designated in writing by the copyright owner as "Not a Contribution."
61 |
62 | "Contributor" shall mean Licensor and any individual or Legal Entity
63 | on behalf of whom a Contribution has been received by Licensor and
64 | subsequently incorporated within the Work.
65 |
66 | 2. Grant of Copyright License. Subject to the terms and conditions of
67 | this License, each Contributor hereby grants to You a perpetual,
68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable
69 | copyright license to reproduce, prepare Derivative Works of,
70 | publicly display, publicly perform, sublicense, and distribute the
71 | Work and such Derivative Works in Source or Object form.
72 |
73 | 3. Grant of Patent License. Subject to the terms and conditions of
74 | this License, each Contributor hereby grants to You a perpetual,
75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable
76 | (except as stated in this section) patent license to make, have made,
77 | use, offer to sell, sell, import, and otherwise transfer the Work,
78 | where such license applies only to those patent claims licensable
79 | by such Contributor that are necessarily infringed by their
80 | Contribution(s) alone or by combination of their Contribution(s)
81 | with the Work to which such Contribution(s) was submitted. If You
82 | institute patent litigation against any entity (including a
83 | cross-claim or counterclaim in a lawsuit) alleging that the Work
84 | or a Contribution incorporated within the Work constitutes direct
85 | or contributory patent infringement, then any patent licenses
86 | granted to You under this License for that Work shall terminate
87 | as of the date such litigation is filed.
88 |
89 | 4. Redistribution. You may reproduce and distribute copies of the
90 | Work or Derivative Works thereof in any medium, with or without
91 | modifications, and in Source or Object form, provided that You
92 | meet the following conditions:
93 |
94 | (a) You must give any other recipients of the Work or
95 | Derivative Works a copy of this License; and
96 |
97 | (b) You must cause any modified files to carry prominent notices
98 | stating that You changed the files; and
99 |
100 | (c) You must retain, in the Source form of any Derivative Works
101 | that You distribute, all copyright, patent, trademark, and
102 | attribution notices from the Source form of the Work,
103 | excluding those notices that do not pertain to any part of
104 | the Derivative Works; and
105 |
106 | (d) If the Work includes a "NOTICE" text file as part of its
107 | distribution, then any Derivative Works that You distribute must
108 | include a readable copy of the attribution notices contained
109 | within such NOTICE file, excluding those notices that do not
110 | pertain to any part of the Derivative Works, in at least one
111 | of the following places: within a NOTICE text file distributed
112 | as part of the Derivative Works; within the Source form or
113 | documentation, if provided along with the Derivative Works; or,
114 | within a display generated by the Derivative Works, if and
115 | wherever such third-party notices normally appear. The contents
116 | of the NOTICE file are for informational purposes only and
117 | do not modify the License. You may add Your own attribution
118 | notices within Derivative Works that You distribute, alongside
119 | or as an addendum to the NOTICE text from the Work, provided
120 | that such additional attribution notices cannot be construed
121 | as modifying the License.
122 |
123 | You may add Your own copyright statement to Your modifications and
124 | may provide additional or different license terms and conditions
125 | for use, reproduction, or distribution of Your modifications, or
126 | for any such Derivative Works as a whole, provided Your use,
127 | reproduction, and distribution of the Work otherwise complies with
128 | the conditions stated in this License.
129 |
130 | 5. Submission of Contributions. Unless You explicitly state otherwise,
131 | any Contribution intentionally submitted for inclusion in the Work
132 | by You to the Licensor shall be under the terms and conditions of
133 | this License, without any additional terms or conditions.
134 | Notwithstanding the above, nothing herein shall supersede or modify
135 | the terms of any separate license agreement you may have executed
136 | with Licensor regarding such Contributions.
137 |
138 | 6. Trademarks. This License does not grant permission to use the trade
139 | names, trademarks, service marks, or product names of the Licensor,
140 | except as required for reasonable and customary use in describing the
141 | origin of the Work and reproducing the content of the NOTICE file.
142 |
143 | 7. Disclaimer of Warranty. Unless required by applicable law or
144 | agreed to in writing, Licensor provides the Work (and each
145 | Contributor provides its Contributions) on an "AS IS" BASIS,
146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
147 | implied, including, without limitation, any warranties or conditions
148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
149 | PARTICULAR PURPOSE. You are solely responsible for determining the
150 | appropriateness of using or redistributing the Work and assume any
151 | risks associated with Your exercise of permissions under this License.
152 |
153 | 8. Limitation of Liability. In no event and under no legal theory,
154 | whether in tort (including negligence), contract, or otherwise,
155 | unless required by applicable law (such as deliberate and grossly
156 | negligent acts) or agreed to in writing, shall any Contributor be
157 | liable to You for damages, including any direct, indirect, special,
158 | incidental, or consequential damages of any character arising as a
159 | result of this License or out of the use or inability to use the
160 | Work (including but not limited to damages for loss of goodwill,
161 | work stoppage, computer failure or malfunction, or any and all
162 | other commercial damages or losses), even if such Contributor
163 | has been advised of the possibility of such damages.
164 |
165 | 9. Accepting Warranty or Additional Liability. While redistributing
166 | the Work or Derivative Works thereof, You may choose to offer,
167 | and charge a fee for, acceptance of support, warranty, indemnity,
168 | or other liability obligations and/or rights consistent with this
169 | License. However, in accepting such obligations, You may act only
170 | on Your own behalf and on Your sole responsibility, not on behalf
171 | of any other Contributor, and only if You agree to indemnify,
172 | defend, and hold each Contributor harmless for any liability
173 | incurred by, or claims asserted against, such Contributor by reason
174 | of your accepting any such warranty or additional liability.
175 |
176 | END OF TERMS AND CONDITIONS
177 |
--------------------------------------------------------------------------------
/tests/test_sklearn.py:
--------------------------------------------------------------------------------
1 | """Test sklearn classes."""
2 | import pytest
3 | import numpy as np
4 | from sklearn.pipeline import make_pipeline
5 | from sklearn.datasets import make_classification, make_regression
6 |
7 | import redflag as rf
8 |
9 | """
10 | NB Most of redflag is tested by its doctests, but doctest cannot test
11 | for warnings, AFAIK. Most of the tests in this file are of the sklearn API.
12 | """
13 |
14 | def test_clip_detector():
15 | """
16 | Checks for clipped data. Detects clipping by looking for multiple values
17 | of max and/or min.
18 | """
19 | pipe = make_pipeline(rf.ClipDetector())
20 | X = np.array([[2, 1], [3, 2], [4, 3], [5, 3]])
21 | with pytest.warns(UserWarning, match="Feature 1 has samples that may be clipped."):
22 | pipe.fit_transform(X)
23 |
24 | # Warns about y, but only on continuous data.
25 | rng = np.random.default_rng(0)
26 | X = rng.normal(size=(100, 2))
27 | y = rng.normal(size=100)
28 | y[:3] = y.max()
29 | with pytest.warns(UserWarning, match="Target 0 has samples that may be clipped."):
30 | pipe.fit_transform(X, y)
31 |
32 | # Raises:
33 | pipe = make_pipeline(rf.ClipDetector(warn=False))
34 | with pytest.raises(ValueError) as e:
35 | pipe.fit_transform(X, y)
36 |
37 | # Does not warn:
38 | X = np.array([[2, 1], [3, 2], [4, 3], [5, 4]])
39 | pipe.fit_transform(X)
40 |
41 |
42 | def test_correlation_detector():
43 | """
44 | Checks for data which is correlated to itself.
45 | """
46 | pipe = make_pipeline(rf.CorrelationDetector())
47 | rng = np.random.default_rng(0)
48 | X = np.stack([rng.uniform(size=20), np.sin(np.linspace(0, 1, 20))]).T
49 | with pytest.warns(UserWarning, match="Feature 1 has samples that may be correlated."):
50 | pipe.fit_transform(X)
51 |
52 |
53 | def test_insufficient_data_detector():
54 | """
55 | Checks for too few samples.
56 | """
57 | pipe = make_pipeline(rf.InsufficientDataDetector())
58 | rng = np.random.default_rng(0)
59 |
60 | # Does not warn:
61 | X = rng.normal(size=(36, 6))
62 | pipe.fit_transform(X)
63 |
64 | # Warns:
65 | X = rng.normal(size=(35, 6))
66 | with pytest.warns(UserWarning, match="Dataset contains only 35 samples"):
67 | pipe.fit_transform(X)
68 |
69 | # Raises:
70 | pipe = make_pipeline(rf.InsufficientDataDetector(warn=False))
71 | with pytest.raises(ValueError) as e:
72 | pipe.fit_transform(X)
73 |
74 |
75 | def test_multimodality_detector():
76 | """
77 | Checks for features with a multimodal distribution, considered across the
78 | entire dataset.
79 | """
80 | pipe = make_pipeline(rf.MultimodalityDetector())
81 | rng = np.random.default_rng(0)
82 | X1 = np.stack([rng.normal(size=80), rng.normal(size=80)]).T
83 | X2 = np.stack([rng.normal(size=80), 3 + rng.normal(size=80)]).T
84 | X = np.vstack([X1, X2])
85 | with pytest.warns(UserWarning, match="Feature 1 has a multimodal distribution."):
86 | pipe.fit_transform(X)
87 | y = np.hstack([np.zeros(80), np.ones(80)])
88 |
89 | # Does not warn.
90 | pipe.fit(X, y)
91 |
92 |
93 | def test_custom_detector():
94 | """
95 | Checks for data which fails a user-supplied test.
96 | """
97 | has_negative = lambda x: np.any(x < 0)
98 | pipe = rf.make_detector_pipeline({has_negative: "are negative"})
99 | X = np.array([[-2, 1], [3, 2], [4, 3], [5, 4]])
100 | with pytest.warns(UserWarning, match="Feature 0 has samples that are negative."):
101 | pipe.fit_transform(X)
102 |
103 | pipe = rf.make_detector_pipeline([has_negative])
104 | with pytest.warns(UserWarning, match="Feature 0 has samples that fail"):
105 | pipe.fit_transform(X)
106 |
107 | detector = rf.Detector(has_negative)
108 | X = np.random.random(size=(100, 2))
109 | y = np.random.random(size=100) - 0.1
110 | assert has_negative(y)
111 | assert rf.is_continuous(y)
112 | with pytest.warns(UserWarning, match="Target 0 has samples that fail"):
113 | pipe.fit_transform(X, y)
114 |
115 |
116 | def test_distribution_comparator():
117 | """
118 | Checks that the distribution of test data (i.e. transformed only) is the
119 | same as the distribution of the training data (i.e. fit and transformed).
120 | """
121 | pipe = make_pipeline(rf.DistributionComparator(threshold=0.5))
122 | rng = np.random.default_rng(0)
123 | X = rng.normal(size=(1_000, 2))
124 | pipe.fit_transform(X) # fit() never throws a warning, just learns the distribution.
125 |
126 | # Throws a warning on test data (tested against training statistics):
127 | X_test = 1 + rng.normal(size=(500, 2))
128 | with pytest.warns(UserWarning, match="Features 0, 1 have distributions that are different from training."):
129 | pipe.transform(X_test)
130 |
131 | # Does not warn if distribution is the same:
132 | X_test = rng.normal(size=(500, 2))
133 |     pipe.transform(X_test)
134 |
135 |
136 | def test_univariate_outlier_detector():
137 | # Use a factor of 0.5 to almost guarantee that this will throw a warning.
138 | pipe = make_pipeline(rf.UnivariateOutlierDetector(factor=0.5))
139 | rng = np.random.default_rng(0)
140 | X = rng.normal(size=1_000).reshape(-1, 1)
141 | with pytest.warns(UserWarning, match="Feature 0 has samples that are excess univariate outliers"):
142 | pipe.fit_transform(X)
143 |
144 | # Does not warn with factor of 2.5:
145 | pipe = make_pipeline(rf.UnivariateOutlierDetector(factor=2.5))
146 | pipe.fit_transform(X)
147 |
148 |
149 | def test_multivariate_outlier_detector():
150 | # Use a factor of 0.5 to almost guarantee that this will throw a warning.
151 | pipe = make_pipeline(rf.MultivariateOutlierDetector(factor=0.5))
152 | rng = np.random.default_rng(0)
153 | X = rng.normal(size=(1_000, 2))
154 | with pytest.warns(UserWarning, match="Dataset has more multivariate outlier samples than expected."):
155 | pipe.fit_transform(X)
156 |
157 | # Warns for y too.
158 | pipe = make_pipeline(rf.MultivariateOutlierDetector(factor=0.5, p=0.8))
159 | X = rng.uniform(size=(1_000, 2))
160 | y = rng.normal(size=1_000)
161 | # y[:100] = 10
162 | with pytest.warns(UserWarning, match="Target has more univariate outlier samples than expected."):
163 | pipe.fit_transform(X, y)
164 |
165 | # Does not warn with factor of 2.5:
166 | pipe = make_pipeline(rf.MultivariateOutlierDetector(factor=2.5))
167 | pipe.fit_transform(X)
168 |
169 | # Does not warn for y.
170 | y = rng.normal(size=1_000)
171 | pipe.fit(X, y)
172 |
173 |
174 | def test_outlier_detector():
175 | # Use a factor of 0.5 to almost guarantee that this will throw a warning.
176 | pipe = make_pipeline(rf.OutlierDetector(factor=0.5))
177 | rng = np.random.default_rng(0)
178 | X = rng.normal(size=(1_000, 2))
179 | with pytest.warns(UserWarning, match="There are more outliers than expected in the training data"):
180 | pipe.fit_transform(X)
181 |
182 | # Throws a warning on test data (tested against training statistics):
183 | X_test = rng.normal(size=(500, 2))
184 | with pytest.warns(UserWarning, match="There are more outliers than expected in the data"):
185 | pipe.transform(X_test)
186 |
187 | # Does not warn with factor of 2:
188 | pipe = make_pipeline(rf.OutlierDetector(factor=2.0))
189 | pipe.fit_transform(X)
190 |
191 |
192 | def test_imbalance_detector():
193 | pipe = make_pipeline(rf.ImbalanceDetector())
194 | rng = np.random.default_rng(0)
195 | X = rng.normal(size=(100, 1))
196 | y = rf.generate_data([20, 80])
197 | with pytest.warns(UserWarning, match="The labels are imbalanced"):
198 | pipe.fit_transform(X, y)
199 |
200 | # Check other method.
201 | pipe = make_pipeline(rf.ImbalanceDetector(method='ir', threshold=2))
202 | with pytest.warns(UserWarning, match="The labels are imbalanced"):
203 | pipe.fit_transform(X, y)
204 |
205 | # Does not warn with higher threshold (summary statistic for this y is 0.6):
206 | pipe = make_pipeline(rf.ImbalanceDetector(threshold=0.7))
207 | pipe.fit_transform(X, y)
208 |
209 | # Warns about wrong kind of y (continuous):
210 | y = rng.normal(size=100)
211 | with pytest.warns(UserWarning, match="Target y seems continuous"):
212 | pipe.fit_transform(X, y)
213 |
214 | # No warning if y is None, just skips.
215 | pipe.fit_transform(X)
216 |
217 | # Raises error because method doesn't exist:
218 | with pytest.raises(ValueError) as e:
219 | pipe = make_pipeline(rf.ImbalanceDetector(method='foo'))
220 |
221 | # Raises error because threshold is wrong.
222 | with pytest.raises(ValueError) as e:
223 | pipe = make_pipeline(rf.ImbalanceDetector(method='ir', threshold=0.5))
224 |
225 | # Raises error because threshold is wrong.
226 | with pytest.raises(ValueError) as e:
227 | pipe = make_pipeline(rf.ImbalanceDetector(method='id', threshold=2))
228 |
229 |
230 | def test_imbalance_comparator():
231 | """
232 | The 'comparator' learns the imbalance statistics of the training set,
233 | then compares subsequent sets to the learned stats.
234 | """
235 | # We need to use the special redflag pipeline object, which passes
236 | # both X and y to `transform()`.
237 | pipe = rf.make_rf_pipeline(rf.ImbalanceComparator())
238 |
239 | # The rest is standard.
240 | rng = np.random.default_rng(0)
241 | X = rng.normal(size=(200, 1))
242 | y = rf.generate_data([20, 20, 20, 140])
243 |
244 | # Does not raise a warning because we're only fitting.
245 | pipe.fit(X, y)
246 |
247 | # Warns about different number of minority classes.
248 | y = rf.generate_data([20, 20, 80, 80])
249 | with pytest.warns(UserWarning, match="There is a different number"):
250 | pipe.transform(X, y)
251 |
252 | # Warns about wrong kind of y (continuous):
253 | y = rng.normal(size=100)
254 | with pytest.warns(UserWarning, match="Target y seems continuous"):
255 | pipe.fit_transform(X, y)
256 | with pytest.warns(UserWarning, match="Target y seems continuous"):
257 | pipe.transform(X, y)
258 |
259 | # No warning if y is None, just skips:
260 | pipe.fit_transform(X)
261 |
262 | # Raises error because threshold is wrong.
263 | with pytest.raises(ValueError) as e:
264 | pipe = make_pipeline(rf.ImbalanceComparator(method='ir', threshold=0.5))
265 |
266 | # Raises error because threshold is wrong.
267 | with pytest.raises(ValueError) as e:
268 | pipe = make_pipeline(rf.ImbalanceComparator(method='id', threshold=2))
269 |
270 |
271 | def test_importance_detector():
272 | # Raises error because method doesn't exist:
273 | with pytest.raises(ValueError) as e:
274 | pipe = make_pipeline(rf.ImportanceDetector(threshold=2))
275 |
276 | pipe = make_pipeline(rf.ImportanceDetector(random_state=0))
277 |
278 | # Warns about low importance.
279 | X, y = make_classification(n_samples=200, n_features=4, n_informative=3, n_redundant=0, n_classes=2, random_state=42)
280 | with pytest.warns(UserWarning, match="Feature 3 has low importance"):
281 | pipe.fit_transform(X, y)
282 |
283 | # Warns about high importance.
284 | X, y = make_classification(n_samples=200, n_features=3, n_informative=2, n_redundant=0, n_classes=2, random_state=42)
285 | with pytest.warns(UserWarning, match="Feature 1 has very high importance"):
286 | pipe.fit_transform(X, y)
287 |
288 | # Warns about wrong kind of y.
289 | y = None
290 | with pytest.warns(UserWarning, match="Target y is None"):
291 | pipe.fit_transform(X, y)
292 |
293 |
294 | def test_dummy_predictor():
295 | """
296 | Checks that the dummy regressor and classifier work as expected.
297 | """
298 | pipe = make_pipeline(rf.DummyPredictor(random_state=42))
299 |
300 | # Regression:
301 | X, y = make_regression(random_state=42)
302 | with pytest.warns(UserWarning, match="Dummy regressor scores:"):
303 | pipe.fit_transform(X, y)
304 |
305 | # Classification:
306 | X, y = make_classification(random_state=42)
307 | with pytest.warns(UserWarning, match="Dummy classifier scores:"):
308 | pipe.fit_transform(X, y)
309 |
310 | # Warns about wrong kind of y.
311 | y = None
312 | with pytest.warns(UserWarning, match="Target y is None"):
313 | pipe.fit_transform(X, y)
314 |
--------------------------------------------------------------------------------
/src/redflag/imbalance.py:
--------------------------------------------------------------------------------
1 | """
2 | Imbalance metrics.
3 |
4 | This work is derived from the following reference work:
5 | Jonathan Ortigosa-Hernandez, Inaki Inza, and Jose A. Lozano
6 | Measuring the Class-imbalance Extent of Multi-class Problems
7 | Pattern Recognition Letters 98 (2017)
8 | https://doi.org/10.1016/j.patrec.2017.08.002
9 |
10 | Author: Matt Hall, scienxlab.org
11 | Licence: Apache 2.0
12 |
13 | Copyright 2024 Redflag contributors
14 |
15 | Licensed under the Apache License, Version 2.0 (the "License");
16 | you may not use this file except in compliance with the License.
17 | You may obtain a copy of the License at
18 |
19 | http://www.apache.org/licenses/LICENSE-2.0
20 |
21 | Unless required by applicable law or agreed to in writing, software
22 | distributed under the License is distributed on an "AS IS" BASIS,
23 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
24 | See the License for the specific language governing permissions and
25 | limitations under the License.
26 | """
27 | from __future__ import annotations
28 |
29 | from typing import Optional, Callable, Union
30 | from collections import Counter
31 | import warnings
32 |
33 | import numpy as np
34 | from numpy.typing import ArrayLike
35 |
36 | from .target import *
37 | from .utils import *
38 |
39 |
40 | def class_counts(a: ArrayLike, classes: Optional[ArrayLike]=None) -> dict:
41 | """
42 | Make a Counter of the class labels in `classes`, or in `a` if `classes`
43 | is None.
44 |
45 | Args:
46 | a (array): A list of class labels.
47 | classes (array): A list of classes, in the event that `a` does not
48 | contain all of the classes, or if you want to ignore some classes
49 | in `a` (not recommended) you can omit them from this list.
50 |
51 | Returns:
52 |         dict: The counts, in the order in which classes are encountered in
53 |         `classes` (if `classes` is not `None`) or `a`.
54 |
55 | Example:
56 | >>> class_counts([1, 3, 2, 2, 3, 3])
57 | {1: 1, 3: 3, 2: 2}
58 | """
59 | counts = Counter(a)
60 |
61 | if classes is None:
62 | classes = counts.keys()
63 |
64 | if len(counts) < len(classes):
65 | message = 'Some classes in the data are not in the list of classes.'
66 | warnings.warn(message, stacklevel=2)
67 |
68 | return {k: counts[k] for k in classes}
69 |
70 |
71 | def empirical_distribution(a: ArrayLike, classes: Optional[ArrayLike]=None) -> tuple[np.ndarray, np.ndarray]:
72 | """
73 | Compute zeta and e. Equation 5 in Ortigosa-Hernandez et al. (2017).
74 |
75 | Args:
76 | a (array): A list of class labels.
77 | classes (array): A list of classes, in the event that `a` does not
78 | contain all of the classes, or if you want to ignore some classes
79 | in `a` (not recommended) you can omit them from this list.
80 |
81 | Returns:
82 | tuple: (zeta, e). Both arrays are length K, where K is the number of
83 | classes discovered in `a` (if `classes` is None) or named in
84 | `classes` otherwise.
85 | """
86 | c = class_counts(a, classes=classes)
87 | ζ = np.array([v / sum(c.values()) for v in c.values()])
88 | e = np.array([1 / len(c) for _ in c.values()])
89 | return ζ, e
90 |
91 |
92 | def imbalance_ratio(a: ArrayLike, classes: Optional[ArrayLike]=None) -> float:
93 | """
94 | Compute the IR. Equation 6 in Ortigosa-Hernandez et al. (2017).
95 |
96 | This measure is useful for binary problems, but not for multiclass problems.
97 |
98 | Args:
99 | a (array): A list of class labels.
100 | classes (array): A list of classes, in the event that `a` does not
101 | contain all of the classes, or if you want to ignore some classes
102 | in `a` (not recommended) you can omit them from this list.
103 |
104 | Returns:
105 | float: The imbalance ratio.
106 | """
107 | ζ, _ = empirical_distribution(a, classes=classes)
108 | epsilon = 1e-12
109 | return max(ζ) / (min(ζ) + epsilon)
110 |
111 |
112 | def major_minor(a: ArrayLike, classes: Optional[ArrayLike]=None) -> tuple[int, int]:
113 | """
114 | Returns the number of majority and minority classes.
115 |
116 | Args:
117 | a (array): A list of class labels.
118 | classes (array): A list of classes, in the event that `a` does not
119 | contain all of the classes, or if you want to ignore some classes
120 | in `a` (not recommended) you can omit them from this list.
121 |
122 | Returns:
123 | tuple: (maj, min), the number of majority and minority classes.
124 |
125 | Example:
126 | >>> major_minor([1, 1, 2, 2, 3, 3, 3])
127 | (1, 2)
128 | """
129 | ζ, e = empirical_distribution(a, classes=classes)
130 | return sum(ζ >= e), sum(ζ < e)
131 |
132 |
133 | def divergence(method: str='hellinger') -> Callable:
134 | """
135 | Provides a function for computing the divergence between two discrete
136 | probability distributions. Used by `imbalance_degree()`.
137 |
138 | `method` can be a string from:
139 | - `hellinger`: Recommended by Ortigosa-Hernandez et al. (2017).
140 | - `euclidean`: Not recommended.
141 | - `manhattan`: Recommended.
142 | - `kl`: Not recommended.
143 | - `tv`: Recommended.
144 |
145 | If `method` is a function, this function just hands it back.
146 |
147 | Args:
148 |         method (str or function): The method to use (see the list above),
149 |             or a divergence function to hand back unchanged.
150 | 
151 |     Returns:
152 |         function: A divergence function, which takes the actual and
153 |             expected distributions and returns a float.
154 |
155 | Reference:
156 | Ortigosa-Hernandez et al. (2017)
157 | """
158 | functions = {
159 | 'hellinger': lambda x, y: np.sqrt(np.sum((np.sqrt(x) - np.sqrt(y))**2)) / np.sqrt(2),
160 | 'euclidean': lambda x, y: np.sqrt(np.sum((x - y)**2)),
161 | 'manhattan': lambda x, y: np.sum(np.abs(x - y)),
162 | 'kl': lambda x, y: np.sum(x * np.log((x + 1e-12) / y)), # Kullback-Leibler.
163 | 'tv': lambda x, y: np.sum(np.abs(x - y)) / 2, # Total variation.
164 | }
165 | return functions.get(method, method)
166 |
167 |
168 | def furthest_distribution(a: ArrayLike, classes: Optional[ArrayLike]=None) -> np.ndarray:
169 | """
170 | Compute the furthest distribution from `a`; used by `imbalance_degree()`.
171 | See Ortigosa-Hernandez et al. (2017).
172 |
173 | Args:
174 | a (array): A list of class labels.
175 | classes (array): A list of classes, in the event that `a` does not
176 | contain all of the classes, or if you want to ignore some classes
177 | in `a` (not recommended) you can omit them from this list.
178 |
179 | Returns:
180 | array: The furthest distribution.
181 |
182 | Example:
183 | >>> furthest_distribution([3,0,0,1,2,3,2,3,2,3,1,1,2,3,3,4,3,4,3,4,])
184 | array([0.8, 0. , 0. , 0.2, 0. ])
185 | """
186 | ζ, e = empirical_distribution(a, classes=classes)
187 | # Construct the vector according to Eq 9.
188 | i = [ei if ζi >= ei else 0 for ζi, ei in zip(ζ, e)]
189 | # Arbitrarily increase one of the non-zero probs to sum to 1.
190 | i[np.argmax(i)] += 1 - sum(i)
191 | return np.array(i)
192 |
193 |
194 | def imbalance_degree(a: ArrayLike,
195 | method: Union[str, Callable]='tv',
196 | classes: Optional[ArrayLike]=None,
197 | ) -> float:
198 | r"""
199 | The imbalance degree reflects the degree to which the distribution of
200 | classes is imbalanced. The integer part of the imbalance degree is the
201 | number of minority classes minus 1 (m - 1, below). The fractional part
202 | is the distance between the actual (empirical) and expected distributions.
203 | The distance can be defined in different ways, depending on the method.
204 |
205 |     ID is defined according to Eq 8 in Ortigosa-Hernandez et al. (2017).
206 |
207 | .. math::
208 | \mathrm{ID}(\zeta) = \frac{d_\mathrm{\Delta}(\mathbf{\zeta}, \mathbf{e})}
209 | {d_\mathrm{\Delta}(\mathbf{\iota}_m, \mathbf{e})} + (m - 1)
210 |
211 | `method` can be a string from:
212 | - 'manhattan': Manhattan distance or L1 norm
213 | - 'euclidean': Euclidean distance or L2 norm
214 | - 'hellinger': Hellinger distance, recommended by Ortigosa-Hernandez et al. (2017)
215 | - 'tv': total variation distance, recommended by Ortigosa-Hernandez et al. (2017)
216 |     - 'kl': Kullback-Leibler divergence
217 |
218 | It can also be a function returning a divergence.
219 |
220 | Args:
221 | a (array): A list of class labels.
222 | method (str or function): The method to use.
223 | classes (array): A list of classes, in the event that `a` does not
224 | contain all of the classes, or if you want to ignore some classes
225 | in `a` (not recommended) you can omit them from this list.
226 |
227 | Returns:
228 | float: The imbalance degree.
229 |
230 | Examples:
231 | >>> ID = imbalance_degree(generate_data([288, 49, 288]), 'tv')
232 | >>> round(ID, 2)
233 | 0.76
234 | >>> ID = imbalance_degree(generate_data([629, 333, 511]), 'euclidean')
235 | >>> round(ID, 2)
236 | 0.3
237 | >>> ID = imbalance_degree(generate_data([2, 81, 61, 4]), 'hellinger')
238 | >>> round(ID, 2)
239 | 1.73
240 | >>> ID = imbalance_degree(generate_data([2, 81, 61, 4]), 'kl')
241 | >>> round(ID, 2)
242 | 1.65
243 | """
244 | ζ, e = empirical_distribution(a, classes=classes)
245 | m = sum(ζ < e)
246 | i = furthest_distribution(a, classes=classes)
247 | div = divergence(method)
248 | epsilon = 1e-12
249 | return (div(ζ, e) / (epsilon + div(i, e))) + (m - 1)
250 |
251 |
252 | def minority_classes(a: ArrayLike, classes: Optional[ArrayLike]=None) -> np.ndarray:
253 | """
254 | Get the minority classes, based on the empirical distribution.
255 | The classes are listed in order of increasing frequency.
256 |
257 | Args:
258 | a (array): A list of class labels.
259 | classes (array): A list of classes, in the event that `a` does not
260 | contain all of the classes, or if you want to ignore some classes
261 | in `a` (not recommended) you can omit them from this list.
262 |
263 | Returns:
264 | array: The minority classes.
265 |
266 | Example:
267 | >>> minority_classes([1, 2, 2, 2, 3, 3, 3, 3, 4, 4])
268 | array([1, 4])
269 | """
270 | a = np.asarray(a)
271 | ζ, e = empirical_distribution(a, classes=classes)
272 |
273 | # We can suppress this warning (if any) because it would already have
274 | # been raised by `empirical_distribution`.
275 | with warnings.catch_warnings():
276 | warnings.simplefilter("ignore")
277 | classes = class_counts(a, classes=classes).keys()
278 |
279 | # Return the minority classes in order, smallest first.
280 | return np.array([c for ζi, ei, c in sorted(zip(ζ, e, classes)) if ζi < ei])
281 |
282 |
283 | def is_imbalanced(a: ArrayLike,
284 | threshold: float=0.4,
285 | method: Union[str, Callable]='tv',
286 | classes: Optional[ArrayLike]=None,
287 | ) -> bool:
288 | """
289 | Check if a dataset is imbalanced by first checking that there are minority
290 | classes, then inspecting the fractional part of the imbalance degree metric.
291 | The metric is compared to the threshold you provide (default 0.4, same as
292 | the sklearn detector ImbalanceDetector).
293 |
294 | Args:
295 | a (array): A list of class labels.
296 |         threshold (float): The threshold to use. Default: 0.4.
297 | method (str or function): The method to use.
298 | classes (array): A list of classes, in the event that `a` does not
299 | contain all of the classes, or if you want to ignore some classes
300 | in `a` (not recommended) you can omit them from this list.
301 |
302 | Returns:
303 | bool: True if the dataset is imbalanced.
304 |
305 | Example:
306 | >>> is_imbalanced(generate_data([2, 81, 61, 4]))
307 | True
308 | """
309 | if not minority_classes(a, classes=classes).size:
310 | return False
311 | im_deg = imbalance_degree(a, method, classes)
312 | return im_deg - int(im_deg) >= threshold
313 |
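314 | 
315 | if __name__ == '__main__':
316 |     # Minimal sketch (e.g. run with `python -m redflag.imbalance`) of how to
317 |     # read the imbalance degree: the integer part is the number of minority
318 |     # classes minus one, and the fractional part is the normalized distance
319 |     # between the actual and expected distributions. Data from the doctests.
320 |     y = generate_data([288, 49, 288])
321 |     ID = imbalance_degree(y, 'tv')
322 |     print(minority_classes(y))                   # The single minority class.
323 |     print(int(ID) + 1, round(ID - int(ID), 2))   # 1 minority class, distance ~0.76.
324 |     print(is_imbalanced(y, threshold=0.4))       # True, since 0.76 >= 0.4.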
--------------------------------------------------------------------------------
/src/redflag/target.py:
--------------------------------------------------------------------------------
1 | """
2 | Functions related to understanding the target and the type of task.
3 |
4 | Author: Matt Hall, scienxlab.org
5 | Licence: Apache 2.0
6 |
7 | Copyright 2024 Redflag contributors
8 |
9 | Licensed under the Apache License, Version 2.0 (the "License");
10 | you may not use this file except in compliance with the License.
11 | You may obtain a copy of the License at
12 |
13 | http://www.apache.org/licenses/LICENSE-2.0
14 |
15 | Unless required by applicable law or agreed to in writing, software
16 | distributed under the License is distributed on an "AS IS" BASIS,
17 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
18 | See the License for the specific language governing permissions and
19 | limitations under the License.
20 | """
21 | from __future__ import annotations
22 | from typing import Optional
23 |
24 | import numpy as np
25 | from numpy.typing import ArrayLike
26 | from sklearn.dummy import DummyClassifier, DummyRegressor
27 | from sklearn.metrics import f1_score, roc_auc_score
28 | from sklearn.metrics import mean_squared_error, r2_score
29 |
30 | from .utils import *
31 | from .markov import Markov_chain
32 |
33 |
34 | def is_continuous(a: ArrayLike, n: Optional[int]=None) -> bool:
35 | """
36 | Decide if this is most likely a continuous variable (and thus, if this is
37 | the target, for example, most likely a regression task).
38 |
39 | Args:
40 | a (array): A target vector.
41 | n (int): The number of potential categories. That is, if there are
42 | fewer than n unique values in the data, it is estimated to be
43 | categorical. Default: the square root of the sample size, which
44 | is all the data or 10_000 random samples, whichever is smaller.
45 |
46 | Returns:
47 |         bool: True if `a` is probably best suited to regression.
48 |
49 | Examples:
50 | >>> is_continuous(10 * ['a', 'b'])
51 | False
52 | >>> is_continuous(100 * [1, 2, 3])
53 | False
54 | >>> import numpy as np
55 | >>> is_continuous(np.random.random(size=100))
56 | True
57 | >>> is_continuous(np.random.randint(0, 15, size=200))
58 | False
59 | """
60 | arr = np.asarray(a)
61 |
62 | if not is_numeric(arr):
63 | return False
64 |
65 | # Now we are dealing with numbers that could represent categories.
66 |
67 | if is_binary(arr):
68 | return False
69 |
70 | # Starting with this and having the uplifts be 0.666 means
71 | # that at least 2 tests must trigger to get over 0.5.
72 | p = 1 / 3
73 |
74 | # Take a sample if array is large.
75 | if arr.size < 10_000:
76 | sample = arr
77 | else:
78 | sample = np.random.choice(arr, size=10_000, replace=False)
79 |
80 | if n is None:
81 | n = np.sqrt(sample.size)
82 |
83 | # Check if floats.
84 | if np.issubdtype(sample.dtype, np.floating):
85 |
86 | # If not ints in disguise.
87 | if not np.all([xi.is_integer() for xi in np.unique(sample)]):
88 | p = update_p(p, 2/3, 2/3)
89 |
90 | # If low precision.
91 | if np.all((sample.astype(int) - sample) < 1e-3):
92 | p = update_p(p, 2/3, 2/3)
93 |
94 | # If many unique values.
95 | if np.unique(sample).size > n:
96 | p = update_p(p, 2/3, 2/3)
97 |
98 | # If many sizes of gaps between numbers.
99 | many_gap_sizes = np.unique(np.diff(np.sort(sample))).size > n
100 | if many_gap_sizes:
101 | p = update_p(p, 2/3, 2/3)
102 |
103 | return p > 0.5
104 |
105 |
106 | def n_classes(y: ArrayLike) -> int:
107 | """
108 | Count the classes.
109 |
110 | Args:
111 | y (array): A list of class labels.
112 |
113 | Returns:
114 | int: The number of classes.
115 |
116 | Examples:
117 | >>> n_classes([1, 1, 1])
118 | 1
119 | >>> n_classes([0, 1, 1])
120 | 2
121 | >>> n_classes([1, 2, 3])
122 | 3
123 | """
124 | y_ = np.asanyarray(y)
125 | return np.unique(y_).size
126 |
127 |
128 | def is_multioutput(y: ArrayLike) -> bool:
129 | """
130 | Decide if a target array is multi-output.
131 |
132 | Raises TypeError if y has more than 2 dimensions.
133 |
134 | Args:
135 | y (array): A list of class labels.
136 |
137 | Returns:
138 |         bool: True if y is 2-dimensional with more than one column (output).
139 |
140 | Examples:
141 | >>> is_multioutput([1, 2, 3])
142 | False
143 | >>> is_multioutput([[1, 2], [3, 4]])
144 | True
145 | >>> is_multioutput([[1], [2]])
146 | False
147 | >>> is_multioutput([[[1], [2]],[[3], [4]]])
148 | Traceback (most recent call last):
149 | TypeError: Target array has too many dimensions.
150 | """
151 | y_ = np.asanyarray(y)
152 | if y_.ndim == 1:
153 | return False
154 | elif (y_.ndim == 2):
155 | return y_.shape[1] > 1
156 | else:
157 | message = "Target array has too many dimensions."
158 | raise TypeError(message)
159 |
160 |
161 | def is_multiclass(y: ArrayLike) -> bool:
162 | """
163 | Decide if a single target is multiclass.
164 |
165 | Args:
166 | y (array): A list of class labels.
167 |
168 | Returns:
169 | bool: True if y has more than 2 classes.
170 |
171 | Examples:
172 | >>> print(is_multiclass([1, 1, 1]))
173 | False
174 | >>> is_multiclass([0, 1, 1])
175 | False
176 | >>> is_multiclass([1, 2, 3])
177 | True
178 | """
179 | if n_classes(y) > 2:
180 | return True
181 | else:
182 | return False
183 |
184 |
185 | def is_binary(y: ArrayLike) -> bool:
186 | """
187 | Decide if a single target is binary.
188 |
189 | Args:
190 | y (array): A list of class labels.
191 |
192 | Returns:
193 | bool: True if y has exactly 2 classes.
194 |
195 | Examples:
196 | >>> print(is_binary([1, 1, 1]))
197 | False
198 | >>> is_binary([0, 1, 1])
199 | True
200 | >>> is_binary([1, 2, 3])
201 | False
202 | """
203 | return n_classes(y) == 2
204 |
205 |
206 | def dummy_classification_scores(y: ArrayLike, random_state:Optional[int]=None) -> dict:
207 | """
208 | Make dummy classifications, which can indicate a good lower-bound baseline
209 | for classification tasks. Wraps scikit-learn's `DummyClassifier`, using the
210 | `most_frequent` and `stratified` methods, and provides a dictionary of F1
211 | and ROC-AUC scores.
212 |
213 | Args:
214 | y (array): A list of class labels.
215 | random_state (int): A seed for the random number generator.
216 |
217 | Returns:
218 | dict: A dictionary of scores.
219 |
220 | Examples:
221 | >>> y = [1, 1, 1, 1, 1, 2, 2, 2, 3, 3]
222 | >>> scores = dummy_classification_scores(y, random_state=42)
223 | >>> scores['most_frequent'] # Precision issue with stratified test.
224 | {'f1': 0.3333333333333333, 'roc_auc': 0.5}
225 | """
226 | result = {'most_frequent': {}, 'stratified': {}}
227 | y = np.asanyarray(y)
228 | if y.ndim > 1:
229 | raise ValueError("Multilabel target is not supported.")
230 | X = np.ones_like(y).reshape(-1, 1) # X is not used by the model.
231 | for method, scores in result.items():
232 | model = DummyClassifier(strategy=method, random_state=random_state)
233 | _ = model.fit(X, y)
234 | scores['f1'] = f1_score(y, model.predict(X), average='weighted')
235 | y_prob = model.predict_proba(X)
236 | if is_binary(y):
237 | scores['roc_auc'] = roc_auc_score(y, y_prob[:, 1])
238 | else:
239 | scores['roc_auc'] = roc_auc_score(y, y_prob, multi_class='ovr')
240 | return result
241 |
242 |
243 | def dummy_regression_scores(y: ArrayLike) -> dict:
244 | """
245 | Make dummy predictions, which can indicate a good lower-bound baseline
246 | for regression tasks. Wraps scikit-learn's `DummyRegressor`, using the
247 | `mean` method, and provides a dictionary of MSE and R-squared scores.
248 |
249 | Args:
250 | y (array): A list of values.
251 |
252 | Returns:
253 | dict: A dictionary of scores.
254 |
255 | Examples:
256 | >>> y = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]
257 | >>> dummy_regression_scores(y)
258 | {'mean': {'mean_squared_error': 8.25, 'r2': 0.0}}
259 | """
260 | result = {'mean': {}}
261 | y = np.asanyarray(y)
262 | if y.ndim > 1:
263 | raise ValueError("Multilabel target is not supported.")
264 | X = np.ones_like(y).reshape(-1, 1) # X is not used by the model.
265 | for method, scores in result.items():
266 | model = DummyRegressor(strategy=method)
267 | _ = model.fit(X, y)
268 | y_pred = model.predict(X)
269 | scores['mean_squared_error'] = mean_squared_error(y, y_pred)
270 | scores['r2'] = r2_score(y, y_pred)
271 | return result
272 |
273 |
274 | def dummy_scores(y: ArrayLike, task='auto', random_state:Optional[int]=None) -> dict:
275 | """
276 | Provide scores from a 'dummy' (naive) model. This can be useful for
277 | understanding the difficulty of the task. For example, if the dummy
278 | model does well, then the task is probably easy and you should be
279 | suspicious of any model that does not do well.
280 |
281 | The function automatically decides whether y is continuous or categorical
282 | and calls the appropriate scoring function.
283 |
284 | Args:
285 | y (array): A list of class labels.
286 | task (str): What kind of task: 'regression' or 'classification', or 'auto'
287 | to decide automatically. In general regression tasks predict continuous
288 | variables (e.g. temperature tomorrow), while classification tasks predict
289 | categorical variables (e.g. rain, cloud or sun).
290 | random_state (int): A seed for the random number generator. Only required
291 |             for classification tasks (categorical variables).
292 |
293 | Returns:
294 | dict: A dictionary of scores.
295 |
296 | Examples:
297 | >>> y = [1, 1, 1, 1, 1, 2, 2, 2, 3, 3]
298 | >>> dummy_scores(y, random_state=42)
299 | {'f1': 0.3333333333333333, 'roc_auc': 0.5, 'strategy': 'most_frequent', 'task': 'classification'}
300 | >>> y = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]
301 | >>> dummy_scores(y, task='regression')
302 | {'mean_squared_error': 8.25, 'r2': 0.0, 'strategy': 'mean', 'task': 'regression'}
303 | """
304 | if task == 'auto':
305 | task = 'regression' if is_continuous(y) else 'classification'
306 |
307 | if task == 'classification':
308 | scores = dummy_classification_scores(y, random_state=random_state)
309 | scores_mf, scores_st = scores['most_frequent'], scores['stratified']
310 | if scores_mf['f1'] >= scores_st['f1']:
311 | scores_ = scores_mf
312 | scores_['strategy'] = 'most_frequent'
313 | else:
314 | scores_ = scores_st
315 | scores_['strategy'] = 'stratified'
316 | scores_['task'] = 'classification'
317 | elif task == 'regression':
318 | scores = dummy_regression_scores(y)
319 | scores_ = scores['mean']
320 | scores_['strategy'] = 'mean'
321 | scores_['task'] = 'regression'
322 | else:
323 | raise ValueError("`task` must be 'classification' or 'regression', or 'auto' to decide automatically.")
324 |
325 | return scores_
326 |
327 |
328 | def is_ordered(y: ArrayLike, q: float=0.95) -> bool:
329 | """
330 | Decide if a single target is ordered.
331 |
332 | Args:
333 | y (array): A list of class labels.
334 | q (float): The confidence level, as a float in the range 0 to 1.
335 | Default: 0.95.
336 |
337 | Returns:
338 | bool: True if y is ordered.
339 |
340 | Examples:
341 | >>> is_ordered(10 * ['top', 'top', 'middle', 'middle', 'bottom'])
342 | True
343 | >>> is_ordered(10 * [0, 0, 1, 1, 2, 2, 1, 1, 2, 2, 3, 3, 0, 0, 1, 1, 2, 2, 3, 3])
344 | True
345 | >>> rng = np.random.default_rng(42)
346 | >>> is_ordered(rng.integers(low=0, high=9, size=200))
347 | False
348 | """
349 | y_ = np.asanyarray(y)
350 | if is_continuous(y_):
351 | raise ValueError('Cannot check order of continuous data.')
352 | if y_.ndim > 1:
353 | raise ValueError('Cannot check order of multilabel data.')
354 | sas = isinstance(y[0], str)
355 | m = Markov_chain.from_sequence(y_, strings_are_states=sas, include_self=True)
356 | chi2, crit, perc = m.chi_squared(q=q)
357 | return chi2 > crit
358 |
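359 | 
360 | if __name__ == '__main__':
361 |     # Minimal sketch (e.g. run with `python -m redflag.target`): decide what
362 |     # kind of task a target implies, then get baseline scores from a naive
363 |     # model to compare any real model against. Data from the doctests above.
364 |     y_class = [1, 1, 1, 1, 1, 2, 2, 2, 3, 3]
365 |     y_reg = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]
366 |     print(is_continuous(y_class))            # False, so treat as classification.
367 |     print(dummy_scores(y_class, random_state=42))
368 |     print(dummy_scores(y_reg, task='regression'))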
--------------------------------------------------------------------------------
/CHANGELOG.md:
--------------------------------------------------------------------------------
1 | # Changelog
2 |
3 |
4 | ## 0.5.0, 21 April 2024
5 |
6 | - This release makes more changes to the tests and documentation in response to the review process for [the submission](https://joss.theoj.org/papers/e1ca575ec0c5344144f87176539ef547) to JOSS (see below).
7 | - In particular, see the following issue: [#97](https://github.com/scienxlab/redflag/issues/97)
8 | - Changed the method of handling dynamic versioning. For now the package `__version__` attribute is still defined, but it is deprecated and will be removed in `0.6.0`. Use `importlib.metadata.version('redflag')` to get the version information instead (see the example below).
9 | - Changed the default `get_outliers()` method from isolation forest (`'iso'`) to Mahalanobis (`'mah'`) to match other functions, eg `has_outliers()` and the `sklearn` pipeline object.
10 | - Updated `actions/setup-python` to use v5.
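11 | 
12 | For example, the installed version can be read with the standard library (a minimal sketch):
13 | 
14 | ```python
15 | from importlib.metadata import version
16 | 
17 | print(version('redflag'))  # e.g. '0.5.0'
18 | ```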
11 |
12 |
13 | ## 0.4.2, 10 December 2023
14 |
15 | - This is a minor release making changes to the tests and documentation in response to the review process for [a submission](https://joss.theoj.org/papers/e1ca575ec0c5344144f87176539ef547) to [The Journal of Open Source Software](https://joss.theoj.org) (JOSS).
16 | - See the following issues: [#89](https://github.com/scienxlab/redflag/issues/89), [#90](https://github.com/scienxlab/redflag/issues/90), [#91](https://github.com/scienxlab/redflag/issues/91), [#92](https://github.com/scienxlab/redflag/issues/92), [#93](https://github.com/scienxlab/redflag/issues/93), [#94](https://github.com/scienxlab/redflag/issues/94) and [#95](https://github.com/scienxlab/redflag/issues/95).
17 | - Now building and testing on Windows and macOS as well as Linux.
18 | - Python version `3.12` added to the package classifiers.
19 | - Python version `3.12` tested during CI.
20 |
21 |
22 | ## 0.4.1, 2 October 2023
23 |
24 | - This is a minor release intended to preview new `pandas`-related features for version 0.5.0.
25 | - Added another `pandas` Series accessor, `is_imbalanced()`.
26 | - Added two `pandas` DataFrame accessors, `feature_importances()` and `correlation_detector()`. These are experimental features.
27 |
28 |
29 | ## 0.4.0, 28 September 2023
30 |
31 | - `redflag` can now be installed by the `conda` package and environment manager. To do so, use `conda install -c conda-forge redflag`.
32 | - All of the `sklearn` components can now be instantiated with `warn=False` in order to raise a `ValueError` instead of emitting a warning. This allows you to build pipelines that will break if a detector is triggered (see the example at the end of this section).
33 | - Added `redflag.target.is_ordered()` to check if a single-label categorical target is ordered in some way. The test uses a Markov chain analysis, applying a chi-squared test to the transition matrix. In general, the Boolean result should only be used on targets with several classes, perhaps at least 10. Below that, it seems to give a lot of false positives.
34 | - You can now pass `groups` to `redflag.distributions.is_multimodal()`. If present, the modality will be checked for each group, returning a Boolean array of values (one for each group). This allows you to check a feature partitioned by target class, for example.
35 | - Added `redflag.sklearn.MultimodalityDetector` to provide a way to check for multimodal features. If `y` is passed and is categorical, it will be used to partition the data and modality will be checked for each class.
36 | - Added `redflag.sklearn.InsufficientDataDetector` which checks that there are at least M² records (rows in `X`), where M is the number of features (i.e. columns) in `X`.
37 | - Removed `RegressionMultimodalDetector`. Use `MultimodalityDetector` instead.
38 |
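A minimal sketch of the `warn=False` behaviour described above; `ClipDetector` is one of the existing detectors, and the other pipeline step is purely illustrative:

```python
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from redflag.sklearn import ClipDetector

# With warn=False, a triggered check raises ValueError, so the pipeline
# stops instead of merely emitting a warning.
pipe = make_pipeline(
    ClipDetector(warn=False),
    StandardScaler(),
)
```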
39 |
40 | ## 0.3.0, 21 September 2023
41 |
42 | - Added 'accessors' that expose some `redflag` functions directly on `pandas.Series` objects. For example, for a Series `s`, one can call `minority_classes = s.redflag.minority_classes()` instead of `redflag.minority_classes(s)`. Other functions include `imbalance_degree()` and `dummy_scores()` (see below, and the sketch after this list). Probably not very useful yet, but future releases will add some reporting functions that wrap multiple Redflag functions. **This is an experimental feature and subject to change.**
43 | - Added a Series accessor `report()` to perform a range of tests and make a small text report suitable for printing. Access for a Series `s` like `s.redflag.report()`. **This is an experimental feature and subject to change.**
44 | - Added new documentation page for the Pandas accessor.
45 | - Added `redflag.target.dummy_classification_scores()`, `redflag.target.dummy_regression_scores()`, which train a dummy (i.e. naive) model and compute various relevant scores (MSE and R2 for regression, F1 and ROC-AUC for classification tasks). Additionally, both `most_frequent` and `stratified` strategies are tested for classification tasks; only the `mean` strategy is employed for regression tasks. The helper function `redflag.target.dummy_scores()` tries to guess what kind of task suits the data and calls the appropriate function.
46 | - Moved `redflag.target.update_p()` to `redflag.utils`.
47 | - Added `is_imbalanced()` to return a Boolean depending on a threshold of imbalance degree. Default threshold is 0.5 but the best value is up for debate.
48 | - Removed `utils.has_low_distance_stdev`.
49 |
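For example, a minimal sketch of the accessor pattern on a small, made-up Series (see the Pandas notebook in `docs/notebooks` for a fuller example):

```python
import pandas as pd
import redflag as rf  # Importing redflag registers the .redflag accessor.

s = pd.Series(['sand', 'sand', 'shale', 'sand', 'limestone', 'shale', 'sand', 'shale'])
print(s.redflag.minority_classes())   # Expect something like ['limestone'].
print(s.redflag.imbalance_degree())
```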
50 |
51 | ## 0.2.0, 4 September 2023
52 |
53 | - Moved to something more closely resembling semantic versioning, which is the main reason this is version 0.2.0.
54 | - Builds and tests on Python 3.11 have been successful, so now supporting this version.
55 | - Added a custom 'alarm' `Detector`, which can be instantiated with a function and a warning to emit when the function returns True for a 1D array. You can easily write your own detectors with this class (see the sketch after this list).
56 | - Added `make_detector_pipeline()`, which can take sequences of functions and warnings (or a mapping of functions to warnings) and returns an `sklearn.pipeline.Pipeline` containing a `Detector` for each function.
57 | - Added `RegressionMultimodalDetector` to allow detection of non-unimodal distributions in features, when considered across the entire dataset. (Coming soon, a similar detector for classification tasks that will partition the data by class.)
58 | - Redefined `is_standardized` (deprecated) as `is_standard_normal`, which implements the Kolmogorov–Smirnov test. It seems more reliable than assuming the data will have a mean of almost exactly 0 and standard deviation of exactly 1, when all we really care about is that the feature is roughly normal.
59 | - Changed the wording slightly in the existing detector warning messages.
60 | - No longer warning if `y` is `None` in, eg, `ImportanceDetector`, since you most likely know this.
61 | - Some changes to `ImportanceDetector`. It now uses KNN estimators instead of SVMs as the third measure of importance; the SVMs were too unstable, causing numerical issues. It also now requires that the number of important features is less than the total number of features to be triggered. So if you have 2 features and both are important, it does not trigger.
62 | - Improved `is_continuous()` which was erroneously classifying integer arrays with many consecutive values as non-continuous.
63 | - Note that `wasserstein` no longer checks that the data are standardized; this check will probably return in the future, however.
64 | - Added a `Tutorial.ipynb` notebook to the docs.
65 | - Added a **Copy** button to code blocks in the docs.
66 |
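A hedged sketch of the custom detector idea described above, assuming (as elsewhere in this changelog) that `Detector` and `make_detector_pipeline()` live in `redflag.sklearn`; the check function is made up for illustration:

```python
import numpy as np
from redflag.sklearn import Detector, make_detector_pipeline

def has_negatives(x):
    """Return True if the 1D array contains any negative values."""
    return bool(np.any(np.asarray(x) < 0))

# A single custom detector: it emits the warning when the function returns True.
detector = Detector(has_negatives, "contains negative values")

# Or build a pipeline of detectors from a mapping of functions to warnings.
pipe = make_detector_pipeline({has_negatives: "contains negative values"})
```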
67 |
68 | ## 0.1.10, 21 November 2022
69 |
70 | - Added `redflag.importance.least_important_features()` and `redflag.importance.most_important_features()`. These functions are complementary: if the same threshold is used in each, then between them they return all of the features (see the sketch after this list). The default threshold for importance is half the expected value. E.g. if there are 5 features, then the default threshold is half of 0.2, or 0.1. Part of [Issue 2](https://github.com/scienxlab/redflag/issues/2).
71 | - Added `redflag.sklearn.ImportanceDetector` class, which warns if 1 or 2 features have anomalously high importance, or if some features have anomalously low importance. Part of [Issue 2](https://github.com/scienxlab/redflag/issues/2).
72 | - Added `redflag.sklearn.ImbalanceComparator` class, which learns the imbalance present in the training data, then compares what is observed in subsequent data (evaluation, test, or production data). If there's a difference, it throws a warning. Note: it does not warn if there is imbalance present in the training data; use `ImbalanceDetector` for that.
73 | - Added `redflag.sklearn.RfPipeline` class, which is needed to include the `ImbalanceComparator` in a pipeline (because the common-or-garden `sklearn.pipeline.Pipeline` class does not pass `y` into a transformer's `transform()` method). Also added the `redflag.sklearn.make_rf_pipeline()` function to help make pipelines with this special class. These components are straight-up forks of the code in `scikit-learn` (3-clause BSD licensed).
74 | - Added example to `docs/notebooks/Using_redflag_with_sklearn.ipynb` to show how to use these new objects.
75 | - Improved `redflag.is_continuous()`, which was buggy; see [Issue 3](https://github.com/scienxlab/redflag/issues/3). It still fails on some cases. I'm not sure a definitive test for continuousness (or, conversely, discreteness) is possible; it's just a heuristic.
76 |
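A hedged sketch of how the two complementary helpers fit together; it assumes the input is a vector of feature importances and that the functions return the qualifying features, which is an assumption about the exact signature:

```python
import redflag as rf

importances = [0.35, 0.30, 0.19, 0.09, 0.07]

# With five features the default threshold is half of 1/5, i.e. 0.1.
high = rf.importance.most_important_features(importances)   # assumed: features above the threshold
low = rf.importance.least_important_features(importances)   # assumed: the complementary set
```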
77 |
78 | ## 0.1.9, 25 August 2022
79 |
80 | - Added some experimental `sklearn` transformers that implement various `redflag` tests. These do not transform the data in any way, they just inspect the data and emit warnings if tests fail. The main ones are: `redflag.sklearn.ClipDetector`, `redflag.sklearn.OutlierDetector`, `redflag.sklearn.CorrelationDetector`, `redflag.sklearn.ImbalanceDetector`, and `redflag.sklearn.DistributionComparator`.
81 | - Added tests for the `sklearn` transformers. These are in the `redflag/tests/test_redflag.py` file, whereas all other tests are doctests. You can run all the tests at once with `pytest`; coverage is currently 94%.
82 | - Added `docs/notebooks/Using_redflag_with_sklearn.ipynb` to show how to use these new objects in an `sklearn` pipeline.
83 | - Since there's quite a bit of `sklearn` code in the `redflag` package, it is now a hard dependency. I removed the other dependencies because they are all dependencies of `sklearn`.
84 | - Added `redflag.has_outliers()` to make it easier to check for excessive outliers in a dataset (see the sketch after this list). This function only uses Mahalanobis distance and always works in a multivariate sense.
85 | - Reorganized the `redflag.features` module into new modules: `redflag.distributions`, `redflag.outliers`, and `redflag.independence`. All of the functions are still imported into the `redflag` namespace, so this doesn't affect existing code.
86 | - Added examples to `docs/notebooks/Basic_usage.ipynb`.
87 | - Removed the `class_imbalance()` function, which was confusing. Use `imbalance_ratio()` instead.
88 |
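A minimal sketch of the new check, with made-up data; the signature matches `src/redflag/outliers.py` in this repository:

```python
import numpy as np
import redflag as rf

rng = np.random.default_rng(42)
X = rng.normal(size=(1_000, 3))

# True only if there are more outliers than expected at the given confidence level.
print(rf.has_outliers(X, p=0.99))
```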
89 |
90 | ## 0.1.8, 8 July 2022
91 |
92 | - Added Wasserstein distance comparisons for univariate and multivariate distributions (see the sketch after this list). This works for either a `groups` array, or for multiple dataset splits if that's more convenient.
93 | - Improved `get_outliers()`, removing the OneClassSVM method and adding EllipticEnvelope and Mahalanobis distance.
94 | - Added a Mahalanobis distance outlier detection function to serve `get_outliers()` or be used on its own. It reproduces the results that `zscore_outliers()` used to give for univariate data, so that function has been removed.
95 | - Added `kde_peaks()` function to find peaks in a kernel density estimate. This also needed some other functions, including `fit_kde()`, `get_kde()`, `find_large_peaks()`, and the bandwidth estimators, `bw_silverman()` and `bw_scott()`.
96 | - Added `classes` argument to the class imbalance function, in case there are classes with no data, or to override the classes in the data.
97 | - Fixed a bug in the `feature_importances()` function.
98 | - Fixed a bug in the `is_continuous()` function.
99 | - Improved the `Using_redflag.ipynb` notebook.
100 | - Added `has_nans()`, `has_monotonic()`, and `has_flat()` functions to detect interpolation issues.
101 | - Moved some more helper functions into utils, eg `iter_groups()`, `ecdf()`, `flatten()`, `stdev_to_proportion()` and `proportion_to_stdev()`.
102 | - Wrote a lot more tests, coverage is now at 95%.
103 |
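For example, a minimal sketch of the group-wise comparison with made-up data (see the docstrings in `src/redflag/distributions.py` for the exact behaviour and output shape):

```python
import numpy as np
import redflag as rf

rng = np.random.default_rng(0)
data = rng.normal(size=(300, 2))    # 300 samples, 2 features
groups = np.repeat([0, 1, 2], 100)  # three groups of 100 samples

# One-vs-rest Wasserstein distances (the default method): one score per group per feature.
print(rf.wasserstein(data, groups=groups, standardize=True))
```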
104 |
105 | ## 0.1.3 to 0.1.7, 9–11 February 2022
106 |
107 | - Added `utils.has_low_distance_stdev`.
108 | - Added `utils.has_few_samples`.
109 | - Added `utils.is_standardized()` function to test if a feature or regression target appears to be a Z-score.
110 | - Changed name of `clips()` function to `clipped()` to be more predictable (it goes with `is_clipped()`).
111 | - Documentation.
112 | - CI workflow seems to be stable.
113 | - Mostly just a lot of flailing.
114 |
115 |
116 | ## 0.1.2, 1 February 2022
117 |
118 | - Early release.
119 | - Added auto-versioning.
120 |
121 |
122 | ## 0.1.1, 31 January 2022
123 |
124 | - Early release.
125 |
126 |
127 | ## 0.1.0, 30 January 2022
128 |
129 | - Early release.
130 |
--------------------------------------------------------------------------------
/src/redflag/outliers.py:
--------------------------------------------------------------------------------
1 | """
2 | Functions related to detecting and understanding outliers.
3 |
4 | Author: Matt Hall, scienxlab.org
5 | Licence: Apache 2.0
6 |
7 | Copyright 2024 Redflag contributors
8 |
9 | Licensed under the Apache License, Version 2.0 (the "License");
10 | you may not use this file except in compliance with the License.
11 | You may obtain a copy of the License at
12 |
13 | http://www.apache.org/licenses/LICENSE-2.0
14 |
15 | Unless required by applicable law or agreed to in writing, software
16 | distributed under the License is distributed on an "AS IS" BASIS,
17 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
18 | See the License for the specific language governing permissions and
19 | limitations under the License.
20 | """
21 | from __future__ import annotations
22 |
23 | from typing import Optional
24 | from functools import reduce, partial
25 | import warnings
26 |
27 | import numpy as np
28 | from numpy.typing import ArrayLike
29 | from sklearn.neighbors import LocalOutlierFactor
30 | from sklearn.ensemble import IsolationForest
31 | from sklearn.covariance import EllipticEnvelope
32 |
33 | from .utils import stdev_to_proportion, proportion_to_stdev
34 | from .utils import get_idx
35 |
36 |
37 | def mahalanobis(X: ArrayLike, correction: bool=False) -> np.ndarray:
38 | """
39 | Compute the Mahalanobis distances of every record (row) in a 2D dataset.
40 |
41 | If X has a single feature, this is equivalent to computing the Z-scores
42 | of the data. For more features, the Mahalanobis distance is the distance
43 | of each point from the centroid of the data, in units analogous to the
44 | standard deviation. It is a multivariate analog of the Z-score.
45 |
46 | The empirical covariance correction factor suggested by Rousseeuw and
47 | Van Driessen may be optionally applied by setting `correction=True`.
48 |
49 | Args:
50 | X (array): The data. Must be a 2D array, shape (n_samples, n_features).
51 | correction (bool): Whether to apply the empirical covariance correction.
52 |
53 | Returns:
54 | array: The Mahalanobis distances.
55 |
56 | Examples:
57 | >>> data = np.array([-3, -2, -2, -1, 0, 0, 0, 1, 2, 2, 3]).reshape(-1, 1)
58 | >>> mahalanobis(data)
59 | array([1.6583124, 1.1055416, 1.1055416, 0.5527708, 0. , 0. ,
60 | 0. , 0.5527708, 1.1055416, 1.1055416, 1.6583124])
61 | >>> mahalanobis(data, correction=True)
62 | array([1.01173463, 0.67448975, 0.67448975, 0.33724488, 0. ,
63 | 0. , 0. , 0.33724488, 0.67448975, 0.67448975,
64 | 1.01173463])
65 | """
66 | X = np.asarray(X)
67 |
68 | ee = EllipticEnvelope(support_fraction=1.0).fit(X)
69 |
70 | if correction:
71 | ee.correct_covariance(X)
72 |
73 | return np.sqrt(ee.dist_)
74 |
75 |
76 | def mahalanobis_outliers(X: ArrayLike,
77 | p: float=0.99,
78 | threshold: Optional[float]=None,
79 | ) -> np.ndarray:
80 | """
81 | Find outliers given samples and a threshold in multiples of stdev.
82 | Returns -1 for outliers and 1 for inliers (to match the sklearn API).
83 |
84 | For univariate data, we expect this many points outside (in units of
85 | standard deviation, and with equivalent p-values):
86 | - 1 sd: expect 31.7 points in 100 (p = 1 - 0.317 = 0.683)
87 | - 2 sd: 4.55 in 100 (p = 1 - 0.0455 = 0.9545)
88 | - 3 sd: 2.70 in 1000 (p = 1 - 0.0027 = 0.9973)
89 | - 4 sd: 6.3 in 100,000 (p = 1 - 0.000063 = 0.999937)
90 | - 4.89163847 sd: 1 in 1 million (p = 1 - 0.000001 = 0.999999)
91 | - 5 sd: 5.7 in 10 million datapoints
92 | - 6 sd: 2.0 in 1 billion points
93 |
94 | Args:
95 | X (array): The data. Can be a 2D array, shape (n_samples, n_features),
96 | or a 1D array, shape (n_samples).
97 | p (float): The probability threshold, in the range [0, 1]. This value
98 | is ignored if `threshold` is not None; in this case, `p` will be
99 | computed using `utils.stdev_to_proportion(threshold)`.
100 | threshold (float): The threshold in Mahalanobis distance, analogous to
101 | multiples of standard deviation for a single variable. If not None,
102 | the threshold will be used to compute `p`.
103 |
104 | Returns:
105 | array: Array identifying outliers; -1 for outliers and 1 for inliers.
106 |
107 | Examples:
108 | >>> data = [-3, -2, -2, -1, 0, 0, 0, 1, 2, 2, 3]
109 | >>> mahalanobis_outliers(data)
110 | array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1])
111 | >>> mahalanobis_outliers(data + [100], threshold=3)
112 | array([ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, -1])
113 | """
114 | X = np.asarray(X)
115 | if X.ndim == 1:
116 | X = X.reshape(-1, 1)
117 |
118 | _, d = X.shape
119 |
120 | # Determine the Mahalanobis distance for the given confidence level.
121 | if threshold is None:
122 | threshold = proportion_to_stdev(p=p, d=d)
123 |
124 | # Compute the Mahalanobis distance.
125 | z = mahalanobis(X)
126 |
127 | # Decide whether each point is an outlier or not.
128 | idx, = np.where((z < -threshold) | (z > threshold))
129 | outliers = np.full(z.shape, 1)
130 | outliers[idx] = -1
131 |
132 | return outliers
133 |
134 |
135 | def get_outliers(a: ArrayLike,
136 | method: Optional[str]=None, # Can change to 'mah' in 0.6.0.
137 | p: float=0.99,
138 | threshold: Optional[float]=None,
139 | ) -> np.ndarray:
140 | """
141 | Returns outliers in the data, considering all of the features. What counts
142 | as an outlier is determined by the threshold, which is in multiples of
143 | the standard deviation. (The conversion to 'contamination' is approximate.)
144 |
145 | Methods: 'iso' (isolation forest), 'lof' (local outlier factor),
146 |     'ee' (elliptic envelope), or 'mah' (Mahalanobis distance, the default), or
147 | pass a function that returns an array of outlier flags (-1 for outliers and 1
148 | for inliers, matching the `sklearn` convention). You can also pass 'any',
149 |     which will try all four outlier detection methods and return the outliers
150 | which are detected by any of them, or 'all', which will return the outliers
151 | which are common to all four methods. That is, 'all' is a rather conservative
152 | outlier detector, 'any' is rather liberal, and both of these are slower
153 | than choosing a single algorithm.
154 |
155 | Args:
156 | a (array): The data.
157 | method (str): The method to use. Can be 'mah' (the default), 'iso', 'lof',
158 |             'ee', 'any', 'all', or a function that returns an array of outlier
159 |             flags (-1 for outliers and 1 for inliers).
160 | p (float): The probability threshold, in the range [0, 1]. This value
161 | is ignored if `threshold` is not None; in this case, `p` will be
162 | computed using `utils.stdev_to_proportion(threshold)`.
163 | threshold (float): The threshold in Mahalanobis distance, analogous to
164 | multiples of standard deviation for a single variable. If not None,
165 | the threshold will be used to compute `p`.
166 |
167 | Returns:
168 | array: The indices of the outliers.
169 |
170 | Examples:
171 | >>> data = [-3, -2, -2, -1, 0, 0, 0, 1, 2, 2, 3]
172 | >>> get_outliers(3 * data)
173 | array([], dtype=int64)
174 | >>> get_outliers(3 * data + [100])
175 | array([33])
176 | >>> get_outliers(3 * data + [100], method='mah')
177 | array([33])
178 | >>> get_outliers(3 * data + [100], method='any')
179 | array([33])
180 | >>> get_outliers(3 * data + [100], method='all')
181 | array([33])
182 | """
183 | if method is None:
184 | # Was called with the default method, which changed in 0.4.3
185 | method = 'mah'
186 | warnings.warn('The default method for get_outliers has changed to "mah". '
187 | 'Please specify the method explicitly to avoid this warning.',
188 | DeprecationWarning, stacklevel=2)
189 | if p >= 1 or p < 0:
190 | raise ValueError('p must be in the range [0, 1).')
191 | a = np.asarray(a)
192 | if a.ndim == 1:
193 | a = a.reshape(-1, 1)
194 | if threshold is None:
195 | expect = 1 - p
196 | else:
197 | expect = 1 - stdev_to_proportion(threshold)
198 | p = 1 - expect
199 | methods = {
200 | 'iso': IsolationForest(contamination=expect).fit_predict,
201 | 'lof': LocalOutlierFactor(contamination=expect, novelty=False).fit_predict,
202 | 'ee': EllipticEnvelope(contamination=expect).fit_predict,
203 | 'mah': partial(mahalanobis_outliers, p=p, threshold=threshold),
204 | }
205 | if method == 'any':
206 | results = [get_idx(func(a)==-1) for func in methods.values()]
207 | outliers = reduce(np.union1d, results)
208 | elif method == 'all':
209 | results = [get_idx(func(a)==-1) for func in methods.values()]
210 | outliers = reduce(np.intersect1d, results)
211 | else:
212 | func = methods.get(method, method)
213 | outliers, = np.where(func(a)==-1)
214 | return outliers
215 |
216 |
217 | def expected_outliers(n: int,
218 | d: int=1,
219 | p: float=0.99,
220 | threshold: Optional[float]=None,
221 | ) -> int:
222 | """
223 | Expected number of outliers in a dataset, under the assumption that the
224 | data are multivariate-normally distributed. What counts as an outlier is
225 | determined by the threshold, which is in multiples of the standard
226 | deviation, or by the p-value, which is the probability of a point being
227 | an outlier. Note that passing p = 0.99 does not necessarily mean that
228 | 1% of the points will be outliers, only that 1% of the points are expected
229 | to be outliers, on average, if the data are normally distributed.
230 |
231 | Args:
232 | n (int): The number of samples.
233 | d (int): The number of features. Note that if threshold is None, this
234 | value is not used in the calculation. Default: 1.
235 | p (float): The probability threshold, in the range [0, 1]. This value
236 | is ignored if `threshold` is not None and `p` will be computed
237 | using `utils.stdev_to_proportion(threshold)`. Default: 0.99.
238 | threshold (float): The threshold in Mahalanobis distance, analogous to
239 | multiples of standard deviation for a single variable. If not None,
240 | the threshold will be used to compute `p`.
241 |
242 | Returns:
243 | int: The expected number of outliers.
244 |
245 | Example:
246 | >>> expected_outliers(10_000, 6, threshold=4)
247 | 137
248 | """
249 | if threshold is not None:
250 | p = stdev_to_proportion(threshold, d)
251 | return int(n * (1 - p))
252 |
253 |
254 | def has_outliers(a: ArrayLike,
255 | p: float=0.99,
256 | threshold: Optional[float]=None,
257 | factor: float=1.0,
258 | ) -> bool:
259 | """
260 | Use Mahalanobis distance to determine if there are more outliers than
261 | expected at the given confidence level or Mahalanobis distance threshold.
262 | A Boolean wrapper around `expected_outliers` and `get_outliers`.
263 |
264 | Args:
265 | a (array): The data. If 2D, the rows are samples and the columns are
266 | features. If 1D, the data are assumed to be univariate.
267 | p (float): The probability threshold, in the range [0, 1]. This value
268 | is ignored if `threshold` is not None and `p` will be computed
269 | using `utils.stdev_to_proportion(threshold)`. Default: 0.99.
270 | threshold (float): The threshold in Mahalanobis distance, analogous to
271 | multiples of standard deviation for a single variable. If not None,
272 | the threshold will be used to compute `p`.
273 | factor (float): The factor by which to multiply the expected number of
274 | outliers before comparing to the actual number of outliers.
275 |
276 | Returns:
277 | bool: True if there are more outliers than expected at the given
278 | confidence level.
279 | """
280 | a = np.asarray(a)
281 | if a.ndim == 1:
282 | a = a.reshape(-1, 1)
283 | n, d = a.shape
284 |
285 | if threshold is not None:
286 | p = stdev_to_proportion(threshold, d)
287 |
288 | expected = expected_outliers(n, d, p=p)
289 |
290 | return get_outliers(a, method='mah', p=p).size > factor * expected
291 |
--------------------------------------------------------------------------------
/src/redflag/markov.py:
--------------------------------------------------------------------------------
1 | """
2 | Functions related to Markov chains. This code was originally implemented in
3 | https://github.com/agilescientific/striplog.
4 |
5 | Author: Matt Hall, scienxlab.org
6 | Licence: Apache 2.0
7 |
8 | Copyright 2024 Matt Hall
9 |
10 | Licensed under the Apache License, Version 2.0 (the "License");
11 | you may not use this file except in compliance with the License.
12 | You may obtain a copy of the License at
13 |
14 | http://www.apache.org/licenses/LICENSE-2.0
15 |
16 | Unless required by applicable law or agreed to in writing, software
17 | distributed under the License is distributed on an "AS IS" BASIS,
18 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
19 | See the License for the specific language governing permissions and
20 | limitations under the License.
21 | """
22 | from collections import namedtuple
23 |
24 | import numpy as np
25 | import scipy.stats
26 |
27 |
28 | def observations(seq_of_seqs, states, step=1, include_self=False):
29 | """
30 | Compute observation matrix.
31 |
32 | Returns the matrix of transition counts between states.
33 |
34 | Args:
35 | seq_of_seqs (list-like): A list-like, or list-like of list-likes.
36 | The inner list-likes represent sequences of states.
37 | For example, can be a string or list of strings, or
38 | a list or list of lists.
39 |         states (list-like): A list or array of the names of the states;
40 |             the axes of the output matrix follow this order.
41 | step (integer): The distance to step. Default is 1: use
42 | the previous state only. If 2, then the previous-but-
43 | one state is used as well as the previous state (and
44 | the matrix has one more dimension).
45 | include_self (bool): Whether to include self-to-self
46 | transitions (default is `False`: do not include them).
47 |
48 | Returns:
49 | ndarray. The observation matrix.
50 | """
51 | O = np.zeros(tuple(states.size for _ in range(step+1)))
52 | for seq in seq_of_seqs:
53 | seq = np.array(seq)
54 | _, integer_seq = np.where(seq.reshape(-1, 1) == states)
55 | for idx in zip(*[integer_seq[n:] for n in range(step+1)]):
56 | if (not include_self) and (0 in np.diff(idx)):
57 | continue
58 | O[idx] += 1
59 | return O
60 |
61 |
62 | def hollow_matrix(M):
63 | """
64 |     Utility function to return a hollow matrix (zeros on the diagonal).
65 |
66 | Args
67 | M (ndarray): a 'square' ndarray.
68 |
69 | Returns
70 | ndarray. The same array with zeros on the diagonal.
71 | """
72 | s = M.shape[0]
73 | idx = np.unravel_index(np.arange(0, s**2, s + 1), M.shape)
74 | M[idx] = 0
75 | return M
76 |
77 |
78 | def regularize(sequence, strings_are_states=False) -> tuple:
79 | """
80 | Turn a sequence or sequence of sequences into a tuple of
81 | the unique elements in the sequence(s), plus a sequence
82 | of sequences (sort of equivalent to `np.atleast_2d()`).
83 |
84 | Args
85 | sequence (list-like): A list-like container of either
86 | states, or of list-likes of states.
87 | strings_are_states (bool): True if the strings are
88 | themselves states (i.e. words or tokens) and not
89 | sequences of one-character states. For example,
90 | set to True if you provide something like:
91 |
92 | ['sst', 'mud', 'mud', 'sst', 'lst', 'lst']
93 |
94 | Returns
95 | tuple. A tuple of the unique states, and a sequence
96 | of sequences.
97 | """
98 | if strings_are_states:
99 | if isinstance(sequence[0], str):
100 | seq_of_seqs = [sequence]
101 | else:
102 | seq_of_seqs = sequence
103 | else:
104 | # Just try to iterate over the contents of the sequence.
105 | try:
106 | seq_of_seqs = [list(i) if len(i) > 1 else i for i in sequence]
107 | except TypeError:
108 | seq_of_seqs = [list(sequence)]
109 |
110 | # Annoyingly, still have to fix case of single sequence of
111 | # strings... this seems really hacky.
112 | if len(seq_of_seqs[0]) == 1:
113 | seq_of_seqs = [seq_of_seqs]
114 |
115 | # Now we know we have a sequence of sequences.
116 | uniques = set()
117 | for seq in seq_of_seqs:
118 | for i in seq:
119 | uniques.add(i)
120 |
121 | return np.array(sorted(uniques)), seq_of_seqs
122 |
123 |
124 | class Markov_chain:
125 |
126 | def __init__(self,
127 | observed_counts,
128 | states=None,
129 | step=1,
130 | include_self=None,
131 | ):
132 | """
133 | Initialize the Markov chain instance.
134 |
135 | Args:
136 | observed_counts (ndarray): A 2-D array representing the counts
137 | of change of state in the Markov Chain.
138 | states (array-like): An array-like representing the possible states
139 |             of the Markov Chain. Must be in the same order as
140 |             `observed_counts`.
141 | step (int): The maximum step size, default 1.
142 | include_self (bool): Whether to include self-to-self transitions.
143 | """
144 | self.step = step
145 | self.observed_counts = np.atleast_2d(observed_counts).astype(int)
146 |
147 | if include_self is not None:
148 | self.include_self = include_self
149 | else:
150 | self.include_self = np.any(np.diagonal(self.observed_counts))
151 |
152 | if not self.include_self:
153 | self.observed_counts = hollow_matrix(self.observed_counts)
154 |
155 | if states is not None:
156 | self.states = np.asarray(states)
157 | else:
158 | self.states = np.arange(self.observed_counts.shape[0])
159 |
160 | if self.step > 1:
161 | self.expected_counts = self._compute_expected_mc()
162 | else:
163 | self.expected_counts = self._compute_expected()
164 |
165 | return
166 |
167 | @staticmethod
168 | def _compute_freqs(C):
169 | """
170 | Compute frequencies from counts.
171 | """
172 | epsilon = 1e-12
173 | return (C.T / (epsilon+np.sum(C.T, axis=0))).T
174 |
175 | @staticmethod
176 | def _stop_iter(a, b, tol=0.01):
177 | """
178 | Stopping criterion for Powers & Easterling method.
179 | """
180 | a_small = np.all(np.abs(a[-1] - a[-2]) < tol*a[-1])
181 | b_small = np.all(np.abs(b[-1] - b[-2]) < tol*b[-1])
182 | return (a_small and b_small)
183 |
184 | @property
185 | def _index_dict(self):
186 | """
187 | A dictionary mapping the states to their indices.
188 | """
189 | if self.states is None:
190 | return {}
191 | return {self.states[index]: index for index in range(len(self.states))}
192 |
193 | @property
194 | def _state_dict(self):
195 | """
196 | A dictionary mapping the indices to their states.
197 | """
198 | if self.states is None:
199 | return {}
200 | return {index: self.states[index] for index in range(len(self.states))}
201 |
202 | @property
203 | def observed_freqs(self):
204 | """
205 | The observed frequencies of each state, given the previous state.
206 | """
207 | return self._compute_freqs(self.observed_counts)
208 |
209 | @property
210 | def expected_freqs(self):
211 | """
212 | The expected frequencies of each state, given the previous state.
213 | """
214 | return self._compute_freqs(self.expected_counts)
215 |
216 | @property
217 | def _state_counts(self):
218 | """
219 | The number of times each state occurs.
220 | """
221 | s = self.observed_counts.copy()
222 |
223 | # Deal with more than 2 dimensions.
224 | for _ in range(self.observed_counts.ndim - 2):
225 | s = np.sum(s, axis=0)
226 |
227 | a = np.sum(s, axis=0)
228 | b = np.sum(s, axis=1)
229 | return np.maximum(a, b)
230 |
231 | @property
232 | def _state_probs(self):
233 | """
234 | The probability of each state.
235 | """
236 | return self._state_counts / np.sum(self._state_counts)
237 |
238 | @property
239 | def normalized_difference(self):
240 | """
241 | The normalized difference between observed and expected counts.
242 | """
243 | O = self.observed_counts
244 | E = self.expected_counts
245 | epsilon = 1e-12
246 | return (O - E) / np.sqrt(E + epsilon)
247 |
248 | @classmethod
249 | def from_sequence(cls,
250 | sequence,
251 | states=None,
252 | strings_are_states=False,
253 | include_self=False,
254 | step=1,
255 | ):
256 | """
257 | Parse a sequence and make the transition matrix of the specified order.
258 |
259 | You must provide sequence(s) in causal order (e.g. time order).
260 |
261 | Args:
262 | sequence (list-like): A list-like, or list-like of list-likes.
263 | The inner list-likes represent sequences of states.
264 | For example, can be a string or list of strings, or
265 | a list or list of lists.
266 | states (list-like): A list or array of the names of the states.
267 | If not provided, it will be inferred from the data.
268 | strings_are_states (bool): True if the strings are
269 | themselves states (i.e. words or tokens) and not
270 | sequences of one-character states. For example,
271 | set to True if you provide something like:
272 |
273 | ['sst', 'mud', 'mud', 'sst', 'lst', 'lst']
274 |
275 | include_self (bool): Whether to include self-to-self
276 | transitions (default is `False`: do not include them).
277 | step (integer): The distance to step. Default is 1: use
278 | the previous state only. If 2, then the previous-but-
279 | one state is used as well as the previous state (and
280 | the matrix has one more dimension).
281 | """
282 | uniques, seq_of_seqs = regularize(sequence, strings_are_states=strings_are_states)
283 |
284 | if states is None:
285 | states = uniques
286 | else:
287 | states = np.asarray(list(states))
288 |
289 | O = observations(seq_of_seqs, states=states, step=step, include_self=include_self)
290 |
291 | return cls(observed_counts=np.array(O),
292 | states=states,
293 | include_self=include_self,
294 | step=step,
295 | )
296 |
297 | def _conditional_probs(self, state):
298 | """
299 | Conditional probabilities of each state, given a
300 | current state.
301 | """
302 | return self.observed_freqs[self._index_dict[state]]
303 |
304 | def _next_state(self, current_state):
305 | """
306 | Returns the state of the random variable at the next time
307 | instance.
308 |
309 | Args:
310 | current_state (str): The current state of the system.
311 |
312 | Returns:
313 | str. One realization of the next state.
314 | """
315 | return np.random.choice(self.states,
316 | p=self._conditional_probs(current_state)
317 | )
318 |
319 | def generate_states(self, n: int = 10, current_state=None):
320 | """
321 | Generates the next states of the system.
322 |
323 | Args:
324 | n (int): The number of future states to generate.
325 | current_state (str): The state of the current random variable.
326 |
327 | Returns:
328 | list. The next n states.
329 | """
330 | if current_state is None:
331 | current_state = np.random.choice(self.states, p=self._state_probs)
332 |
333 | future_states = []
334 | for _ in range(n):
335 | next_state = self._next_state(current_state)
336 | future_states.append(next_state)
337 | current_state = next_state
338 |
339 | return future_states
340 |
341 | def _compute_expected(self):
342 | """
343 | Try to use Powers & Easterling, fall back on Monte Carlo sampling
344 | based on the proportions of states in the data.
345 | """
346 | try:
347 | E = self._compute_expected_pe()
348 | except:
349 | E = self._compute_expected_mc()
350 |
351 | return E
352 |
353 | def _compute_expected_mc(self, n: int = 100000):
354 |         If we can't use Powers & Easterling's method (it may be possible to
355 |         extend it to the higher-dimensional matrices that arise for step > 1,
356 |         but we don't currently do so), the next best thing is brute force:
357 |         generate a long random sequence of states from the observed proportions
358 |         and count its transitions. This is what P & E's method estimates iteratively.
359 | what P & E's method tries to estimate iteratively.
360 |
361 | What to do about 'self transitions' is a bit of a problem here, since
362 | there are a lot of n-grams that include at least one self-transition.
363 | """
364 | seq = np.random.choice(self.states, size=n, p=self._state_probs)
365 | E = observations(np.atleast_2d(seq), self.states, step=self.step, include_self=self.include_self)
366 | if not self.include_self:
367 | E = hollow_matrix(E)
368 | return np.sum(self.observed_counts) * E / np.sum(E)
369 |
370 | def _compute_expected_pe(self, max_iter: int = 100):
371 | """
372 | Compute the independent trials matrix, using method of
373 | Powers & Easterling 1982.
374 | """
375 | m = len(self.states)
376 | M = self.observed_counts
377 | a, b = [], []
378 |
379 | # Loop 1
380 | a.append(np.sum(M, axis=1) / (m - 1))
381 | b.append(np.sum(M, axis=0) / (np.sum(a[-1]) - a[-1]))
382 |
383 | i = 2
384 | while i < max_iter:
385 |
386 | a.append(np.sum(M, axis=1) / (np.sum(b[-1]) - b[-1]))
387 | b.append(np.sum(M, axis=0) / (np.sum(a[-1]) - a[-1]))
388 |
389 | # Check for stopping criterion.
390 | if self._stop_iter(a, b, tol=0.001):
391 | break
392 |
393 | i += 1
394 |
395 | E = a[-1] * b[-1].reshape(-1, 1)
396 |
397 | if not self.include_self:
398 | return hollow_matrix(E)
399 | else:
400 | return E
401 |
402 | @property
403 | def degrees_of_freedom(self) -> int:
404 | m = len(self.states)
405 | return (m - 1)**2 - m
406 |
407 | def _chi_squared_critical(self, q: float = 0.95, df: int = None) -> float:
408 | """
409 | The chi-squared critical value for a confidence level q
410 | and degrees of freedom df.
411 | """
412 | if df is None:
413 | df = self.degrees_of_freedom
414 | return scipy.stats.chi2.ppf(q=q, df=df)
415 |
416 | def _chi_squared_percentile(self, x: float, df: int = None) -> float:
417 | """
418 | The chi-squared percentile for a value x and degrees of
419 | freedom df.
420 | """
421 | if df is None:
422 | df = self.degrees_of_freedom
423 | return scipy.stats.chi2.cdf(x, df=df)
424 |
425 | def chi_squared(self, q: float = 0.95) -> tuple:
426 | """
427 | The chi-squared statistic for the given transition
428 | frequencies.
429 |
430 | Also returns the critical statistic at the given confidence
431 | level q (default 95%).
432 |
433 | If the first number is bigger than the second number,
434 | then you can reject the hypothesis that the sequence
435 | is randomly ordered.
436 |
437 | Args:
438 | q (float): The confidence level, as a float in the range 0 to 1.
439 | Default: 0.95.
440 |
441 | Returns:
442 |             tuple: A named tuple of the chi-squared statistic, the critical
443 |                 value at the given confidence level, and the percentile of the statistic.
443 | """
444 | # Observed and Expected matrices:
445 | O = self.observed_counts
446 | E = self.expected_counts
447 |
448 | # Adjustment for divide-by-zero
449 | epsilon = 1e-12
450 | chi2 = np.sum((O - E)**2 / (E + epsilon))
451 | crit = self._chi_squared_critical(q=q)
452 | perc = self._chi_squared_percentile(x=chi2)
453 | Chi2 = namedtuple('Chi2', ['chi2', 'crit', 'perc'])
454 |
455 | return Chi2(chi2, crit, perc)
456 |
--------------------------------------------------------------------------------
/docs/notebooks/Using_redflag_with_Pandas.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "id": "a8d12712-5c7b-4acb-bb8b-e73efcb9b5dc",
6 | "metadata": {},
7 | "source": [
8 | "# 🚩 Using `redflag` with Pandas\n",
9 | "\n",
10 |     "As well as using `redflag`'s functions directly (see `Basic_usage.ipynb`), or with `sklearn` (see `Using_redflag_with_sklearn.ipynb`), `redflag` has some Pandas 'accessors' that give you access to some `redflag` functions almost as if they were methods on Pandas objects.\n",
11 | "\n",
12 | "The best way to get the idea is to look at an example.\n",
13 | "\n",
14 | "First, even though we may not use it directly, we have to import `redflag` to get access to its functions. As long as you have `pandas` installed, it will register the accessors."
15 | ]
16 | },
17 | {
18 | "cell_type": "code",
19 | "execution_count": 4,
20 | "id": "77aa7f67-0bc7-48e9-87f4-183aa2dc2c35",
21 | "metadata": {},
22 | "outputs": [
23 | {
24 | "data": {
25 | "text/plain": [
26 | "'0.4.2rc2.dev14+g54704af.d20240421'"
27 | ]
28 | },
29 | "execution_count": 4,
30 | "metadata": {},
31 | "output_type": "execute_result"
32 | }
33 | ],
34 | "source": [
35 | "import redflag as rf\n",
36 | "\n",
37 | "rf.__version__"
38 | ]
39 | },
40 | {
41 | "cell_type": "code",
42 | "execution_count": 5,
43 | "id": "3dbcf6e1-1cb5-4ca5-b64a-bc1d9e7b174f",
44 | "metadata": {},
45 | "outputs": [
46 | {
47 | "data": {
197 | "text/plain": [
198 | " Well Name Depth Formation RelPos Marine GR ILD DeltaPHI \\\n",
199 | "0 SHRIMPLIN 851.3064 A1 SH 1.000 1 77.45 4.613176 9.9 \n",
200 | "1 SHRIMPLIN 851.4588 A1 SH 0.979 1 78.26 4.581419 14.2 \n",
201 | "2 SHRIMPLIN 851.6112 A1 SH 0.957 1 79.05 4.549881 14.8 \n",
202 | "3 SHRIMPLIN 851.7636 A1 SH 0.936 1 86.10 4.518559 13.9 \n",
203 | "4 SHRIMPLIN 851.9160 A1 SH 0.915 1 74.58 4.436086 13.5 \n",
204 | "\n",
205 | " PHIND PE Facies LATITUDE LONGITUDE ILD_log10 Lithology \\\n",
206 | "0 11.915 4.6 3.0 37.978076 -100.987305 0.664 siltstone \n",
207 | "1 12.565 4.1 3.0 37.978076 -100.987305 0.661 siltstone \n",
208 | "2 13.050 3.6 3.0 37.978076 -100.987305 0.658 siltstone \n",
209 | "3 13.115 3.5 3.0 37.978076 -100.987305 0.655 siltstone \n",
210 | "4 13.300 3.4 3.0 37.978076 -100.987305 0.647 siltstone \n",
211 | "\n",
212 | " RHOB Mineralogy Siliciclastic \n",
213 | "0 2393.499945 siliciclastic True \n",
214 | "1 2416.119814 siliciclastic True \n",
215 | "2 2404.576056 siliciclastic True \n",
216 | "3 2393.249071 siliciclastic True \n",
217 | "4 2382.602601 siliciclastic True "
218 | ]
219 | },
220 | "execution_count": 5,
221 | "metadata": {},
222 | "output_type": "execute_result"
223 | }
224 | ],
225 | "source": [
226 | "import pandas as pd\n",
227 | "\n",
228 | "df = pd.read_csv('https://raw.githubusercontent.com/scienxlab/datasets/main/kgs/panoma-training-data.csv')\n",
229 | "\n",
230 | "df.head()"
231 | ]
232 | },
233 | {
234 | "cell_type": "markdown",
235 | "id": "d77e460b-b925-4dec-b56d-d3f18ed1ecbb",
236 | "metadata": {},
237 | "source": [
238 | "## Series accessor"
239 | ]
240 | },
241 | {
242 | "cell_type": "markdown",
243 | "id": "98f5c772-a33d-43cf-82cf-54dc21535133",
244 | "metadata": {},
245 | "source": [
246 | "For the time being, there are only accessors on Pandas `Series` objects. For example:"
247 | ]
248 | },
249 | {
250 | "cell_type": "code",
251 | "execution_count": 6,
252 | "id": "1b17a7e8-1d28-4e1b-9b7d-ecdbbe750aaf",
253 | "metadata": {},
254 | "outputs": [],
255 | "source": [
256 | "# Call the Series s for simplicity:\n",
257 | "s = df['Lithology']"
258 | ]
259 | },
260 | {
261 | "cell_type": "markdown",
262 | "id": "52e5e4e1-9200-46b2-9d77-9cc0d2cbc4a8",
263 | "metadata": {},
264 | "source": [
265 | "Now we can call the `redflag` function `imbalance_degree()` as if it were a method (but notice the extra `redflag` we have to insert to access the method):"
266 | ]
267 | },
268 | {
269 | "cell_type": "code",
270 | "execution_count": 7,
271 | "id": "6af691f4-90a3-4a8e-b842-a20f70c72314",
272 | "metadata": {},
273 | "outputs": [
274 | {
275 | "data": {
276 | "text/plain": [
277 | "3.378593040846633"
278 | ]
279 | },
280 | "execution_count": 7,
281 | "metadata": {},
282 | "output_type": "execute_result"
283 | }
284 | ],
285 | "source": [
286 | "s.redflag.imbalance_degree()"
287 | ]
288 | },
289 | {
290 | "cell_type": "markdown",
291 | "id": "f2ea6821-0610-44b4-a855-653642ea089d",
292 | "metadata": {},
293 | "source": [
294 | "Or we can ask for the new 'dummy' scores:"
295 | ]
296 | },
297 | {
298 | "cell_type": "code",
299 | "execution_count": 8,
300 | "id": "5897e460-cc15-4858-939b-b91b19fafc9f",
301 | "metadata": {},
302 | "outputs": [
303 | {
304 | "data": {
305 | "text/plain": [
306 | "{'f1': 0.24566600930871996,\n",
307 | " 'roc_auc': 0.5021684735059516,\n",
308 | " 'strategy': 'stratified',\n",
309 | " 'task': 'classification'}"
310 | ]
311 | },
312 | "execution_count": 8,
313 | "metadata": {},
314 | "output_type": "execute_result"
315 | }
316 | ],
317 | "source": [
318 | "s.redflag.dummy_scores()"
319 | ]
320 | },
321 | {
322 | "cell_type": "markdown",
323 | "id": "3b9be98e-5642-4bab-80eb-f4c226422781",
324 | "metadata": {},
325 | "source": [
326 |     "Let's try that on a regression target like `df['RHOB']`:"
327 | ]
328 | },
329 | {
330 | "cell_type": "code",
331 | "execution_count": 9,
332 | "id": "f734bb50-15e9-43c3-b31f-1a078e398dc3",
333 | "metadata": {},
334 | "outputs": [
335 | {
336 | "data": {
337 | "text/plain": [
338 | "{'mean_squared_error': 47528.78263092096,\n",
339 | " 'r2': 0.0,\n",
340 | " 'strategy': 'mean',\n",
341 | " 'task': 'regression'}"
342 | ]
343 | },
344 | "execution_count": 9,
345 | "metadata": {},
346 | "output_type": "execute_result"
347 | }
348 | ],
349 | "source": [
350 | "df['RHOB'].redflag.dummy_scores()"
351 | ]
352 | },
353 | {
354 | "cell_type": "markdown",
355 | "id": "381501a1-8944-4b3f-a4cf-d80e08fbac4f",
356 | "metadata": {},
357 | "source": [
358 | "Or we can ask for a 'report' (very simple for now):"
359 | ]
360 | },
361 | {
362 | "cell_type": "code",
363 | "execution_count": 10,
364 | "id": "02380595-2b47-4718-9b58-ef6b170f29b1",
365 | "metadata": {},
366 | "outputs": [
367 | {
368 | "name": "stdout",
369 | "output_type": "stream",
370 | "text": [
371 | "Continuous data suitable for regression\n",
372 | "Outliers: [ 95 96 132 175 176 177 222 223 263 526 527 531 532 533\n",
373 | " 534 575 576 577 578 579 580 581 582 583 584 585 586 587\n",
374 | " 588 621 622 633 634 635 636 652 653 654 660 661 662 663\n",
375 | " 711 712 713 756 757 758 759 760 768 769 770 771 772 773\n",
376 | " 774 775 776 777 778 779 780 781 782 800 801 802 803 804\n",
377 | " 818 819 821 822 823 824 835 836 841 842 843 844 845 846\n",
378 | " 849 850 934 935 936 937 938 1039 1040 1044 1048 1049 1113 1114\n",
379 | " 1115 1116 1145 1146 1147 1148 1149 1150 1151 1216 1217 1218 1221 1222\n",
380 | " 1223 1224 1225 1304 1313 1314 1315 1316 1368 1369 1370 1371 1372 1373\n",
381 | " 1374 1375 1446 1447 1496 1497 1498 1499 1546 1547 1548 1549 1567 1568\n",
382 | " 1622 1623 1624 1662 1663 1664 1665 1666 1722 1723 1724 1725 1726 1735\n",
383 | " 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1753 1754 1755 1756\n",
384 | " 1757 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789\n",
385 | " 1790 1805 1806 1807 1808 1809 1810 1812 1813 1866 1868 1869 1870 1981\n",
386 | " 1982 2054 2055 2139 2327 2415 2416 2417 2418 2488 2489 2490 2867 2868\n",
387 | " 2869 2870 2871 2872 2873 2882 2883 2884 2888 2889 2921 2922 2923 2924\n",
388 | " 2925 2926 2927 2928 2929 2930 2931 2932 2933 2972 2973 2974 2975 2976\n",
389 | " 3004 3087 3088 3089 3090 3091 3092 3093 3094 3095 3096 3097 3098 3099\n",
390 | " 3100 3101 3102 3109 3110 3111 3112 3113 3114 3115 3341 3429 3430 3443\n",
391 | " 3444 3515 3516 3517 3861 3862 3863 3905 3906 3907 3931 3932 3933 3934\n",
392 | " 3935]\n",
393 | "Correlated: True\n",
394 | "Dummy scores:{'mean': {'mean_squared_error': 47528.78263092096, 'r2': 0.0}}\n",
395 | "\n"
396 | ]
397 | }
398 | ],
399 | "source": [
400 | "print(df['RHOB'].redflag.report())"
401 | ]
402 | },
403 | {
404 | "cell_type": "markdown",
405 | "id": "e007d9b8-4346-4d9a-93a5-45222a137248",
406 | "metadata": {},
407 | "source": [
408 | "This is an experimental feature; future releases will have more functions. Feedback welcome!"
409 | ]
410 | },
411 | {
412 | "cell_type": "markdown",
413 | "id": "ba98b9a2-e0a4-4ed8-a2d4-f87eb882af40",
414 | "metadata": {},
415 | "source": [
416 | "## DataFrame accessor\n",
417 | "\n",
418 | "Experimental feature: so far only `feature_importances` and `correlation_detector` are implemented."
419 | ]
420 | },
421 | {
422 | "cell_type": "code",
423 | "execution_count": 11,
424 | "id": "274cc24d-69ad-49ef-8606-cc9b77b154dc",
425 | "metadata": {},
426 | "outputs": [
427 | {
428 | "data": {
429 | "text/plain": [
430 | "array([0.29029061, 0.18151719, 0.30409475, 0.22409746])"
431 | ]
432 | },
433 | "execution_count": 11,
434 | "metadata": {},
435 | "output_type": "execute_result"
436 | }
437 | ],
438 | "source": [
439 | "features = ['GR', 'RHOB', 'PE', 'ILD_log10']\n",
440 | "df.redflag.feature_importances(features, target='Lithology')"
441 | ]
442 | },
443 | {
444 | "cell_type": "code",
445 | "execution_count": 12,
446 | "id": "12e3e4ee-e8df-47ba-810d-3bff492d5389",
447 | "metadata": {},
448 | "outputs": [
449 | {
450 | "name": "stderr",
451 | "output_type": "stream",
452 | "text": [
453 | "🚩 Feature 0 appears to be autocorrelated.\n",
454 | "🚩 Feature 1 appears to be autocorrelated.\n",
455 | "🚩 Feature 2 appears to be autocorrelated.\n",
456 | "🚩 Feature 3 appears to be autocorrelated.\n"
457 | ]
458 | }
459 | ],
460 | "source": [
461 | "df.redflag.correlation_detector(features, target=None)"
462 | ]
463 | },
464 | {
465 | "cell_type": "markdown",
466 | "id": "a3185f63-64b1-47fd-875d-2c646b84aa65",
467 | "metadata": {},
468 | "source": [
469 | "Indeed, all of these features are correlated."
470 | ]
471 | }
472 | ],
473 | "metadata": {
474 | "kernelspec": {
475 | "display_name": "redflag",
476 | "language": "python",
477 | "name": "redflag"
478 | },
479 | "language_info": {
480 | "codemirror_mode": {
481 | "name": "ipython",
482 | "version": 3
483 | },
484 | "file_extension": ".py",
485 | "mimetype": "text/x-python",
486 | "name": "python",
487 | "nbconvert_exporter": "python",
488 | "pygments_lexer": "ipython3",
489 | "version": "3.12.0"
490 | }
491 | },
492 | "nbformat": 4,
493 | "nbformat_minor": 5
494 | }
495 |
--------------------------------------------------------------------------------
/src/redflag/distributions.py:
--------------------------------------------------------------------------------
1 | """
2 | Functions related to understanding distributions.
3 |
4 | Author: Matt Hall, scienxlab.org
5 | Licence: Apache 2.0
6 |
7 | Copyright 2024 Redflag contributors
8 |
9 | Licensed under the Apache License, Version 2.0 (the "License");
10 | you may not use this file except in compliance with the License.
11 | You may obtain a copy of the License at
12 |
13 | http://www.apache.org/licenses/LICENSE-2.0
14 |
15 | Unless required by applicable law or agreed to in writing, software
16 | distributed under the License is distributed on an "AS IS" BASIS,
17 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
18 | See the License for the specific language governing permissions and
19 | limitations under the License.
20 | """
21 | from __future__ import annotations
22 |
23 | from typing import Optional, NamedTuple, Callable, Union
24 | from collections import namedtuple
25 | from itertools import combinations
26 | import warnings
27 |
28 | import numpy as np
29 | from numpy.typing import ArrayLike
30 | import scipy.stats as ss
31 | from scipy.stats import wasserstein_distance
32 | from scipy.spatial.distance import squareform
33 | from scipy.signal import find_peaks
34 | from sklearn.neighbors import KernelDensity
35 | from sklearn.model_selection import GridSearchCV
36 |
37 | from .utils import is_standard_normal
38 | from .utils import iter_groups
39 |
40 |
41 | DISTS = ['norm', 'cosine', 'expon', 'exponpow', 'gamma', 'gumbel_l', 'gumbel_r',
42 | 'powerlaw', 'triang', 'trapz', 'uniform',
43 | ]
44 |
45 | def best_distribution(a: ArrayLike, bins: Optional[int]=None) -> NamedTuple:
46 | """
47 | Model data by finding best fit distribution to data.
48 |
49 | By default, the following distributions are tried: normal, cosine,
50 | exponential, exponential power, gamma, left-skewed Gumbel, right-skewed
51 | Gumbel, power law, triangular, trapezoidal, and uniform.
52 |
53 | The best fit is determined by the sum of squared errors (SSE) between the
54 | histogram and the probability density function (PDF) of the distribution.
55 |
56 | Returns the best fit distribution and its parameters in a named tuple.
57 |
58 | Args:
59 | a (array): The data.
60 | bins (int): The number of bins to use for the histogram.
61 |
62 | Returns:
63 | tuple: The best fit distribution and its parameters.
64 |
65 | Examples:
66 | >>> a = [0, 1, 2, 2, 3, 3, 3, 3, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 7, 8]
67 | >>> best_distribution(a)
68 | Distribution(name='norm', shape=[], loc=4.0, scale=1.8771812708978117)
69 | >>> best_distribution([1, 2, 2, 3, 3, 3, 4, 4, 4, 4, 5, 5, 5, 6, 6, 7])
70 | Distribution(name='triang', shape=[0.5001419889107208], loc=0.3286356643172673, scale=7.3406453953773365)
71 | """
72 | if bins is None:
73 | bins = min(max(20, len(a) // 100), 200)
74 | n, x = np.histogram(a, bins=bins, density=True)
75 | x = (x[1:] + x[:-1]) / 2
76 |
77 | dists = [getattr(ss, d) for d in DISTS]
78 |
79 | best_dist = None
80 | best_params = None
81 | best_sse = np.inf
82 |
83 | for dist in dists:
84 | *shape, μ, σ = dist.fit(a)
85 | n_pred = dist.pdf(x, loc=μ, scale=σ, *shape)
86 | sse = np.sum((n - n_pred)**2)
87 | if 0 < sse < best_sse:
88 | best_dist = dist
89 | best_params = shape + [μ] + [σ]
90 | best_sse = sse
91 |
92 | *shape, μ, σ = best_params
93 | Distribution = namedtuple('Distribution', ['name', 'shape', 'loc', 'scale'])
94 | return Distribution(best_dist.name, shape, μ, σ)
95 |
96 |
97 | def wasserstein_ovr(a: ArrayLike, groups: ArrayLike=None, standardize: bool=False) -> np.ndarray:
98 | """
99 | First Wasserstein distance between each group in `a` vs the rest of `a`
100 | ('one vs rest' or OVR). The groups are provided by `groups`, which must be
101 | a 1D array of group labels, the same length as `a`.
102 |
103 | The Wasserstein distance is a measure of the distance between two
104 | probability distributions. It is also known as the earth mover's distance.
105 | This function uses the implementation in `scipy.stats.wasserstein_distance`.
106 |
107 | The results are in `np.unique(a)` order.
108 |
109 | Data should be standardized for results you can compare across different
110 | measurements. The function does not apply standardization by default.
111 |
112 | Returns K scores for K groups.
113 |
114 | Args:
115 | a (array): The data.
116 | groups (array): The group labels.
117 | standardize (bool): Whether to standardize the data. Default False.
118 |
119 | Returns:
120 | array: The Wasserstein distance scores in `np.unique(a)` order.
121 |
122 | Examples:
123 | >>> data = [1, 1, 1, 2, 2, 1, 1, 2, 2, 3, 2, 2, 2, 3, 3]
124 | >>> groups = [0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2]
125 | >>> wasserstein_ovr(data, groups=groups, standardize=True)
126 | array([0.97490053, 0.1392715 , 1.11417203])
127 | """
128 | if standardize:
129 | a = (a - np.nanmean(a)) / np.nanstd(a)
130 | dists = []
131 | for group in iter_groups(groups):
132 | dist = wasserstein_distance(a[group], a[~group])
133 | dists.append(dist)
134 | return np.array(dists)
135 |
136 |
137 | def wasserstein_ovo(a: ArrayLike, groups: ArrayLike=None, standardize: bool=False) -> np.ndarray:
138 | """
139 | First Wasserstein distance between each group in `a` vs each other group
140 | ('one vs one' or OVO). The groups are provided by `groups`, which must be
141 | a 1D array of group labels, the same length as `a`.
142 |
143 | The Wasserstein distance is a measure of the distance between two
144 | probability distributions. It is also known as the earth mover's distance.
145 | This function uses the implementation in `scipy.stats.wasserstein_distance`.
146 |
147 | The results are in the order given by `combinations(np.unique(groups),
148 | r=2)`, which matches the order of `scipy.spatial.distance` metrics.
149 |
150 | Data should be standardized for results you can compare across different
151 | measurements. The function does not apply standardization by default.
152 |
153 | Returns K(K-1)/2 scores for K groups.
154 |
155 | Args:
156 | a (array): The data.
157 | groups (array): The group labels.
158 | standardize (bool): Whether to standardize the data. Defaults to False.
159 |
160 | Returns:
161 | array: The Wasserstein distance scores. Note that the order is the
162 | same as you would get from `scipy.spatial.distance` metrics. You
163 | can pass the result to `scipy.spatial.distance.squareform` to
164 | get a square matrix.
165 |
166 | Examples:
167 | >>> data = [1, 1, 1, 2, 2, 1, 1, 2, 2, 3, 2, 2, 2, 3, 3]
168 | >>> groups = [0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2]
169 | >>> wasserstein_ovo(data, groups=groups, standardize=True)
170 | array([0.55708601, 1.39271504, 0.83562902])
171 | >>> squareform(wasserstein_ovo(data, groups=groups, standardize=True))
172 | array([[0. , 0.55708601, 1.39271504],
173 | [0.55708601, 0. , 0.83562902],
174 | [1.39271504, 0.83562902, 0. ]])
175 | """
176 | if standardize:
177 | a = (a - np.nanmean(a)) / np.nanstd(a)
178 | dists = []
179 | for (group_1, group_2) in combinations(np.unique(groups), r=2):
180 | dist = wasserstein_distance(a[groups==group_1], a[groups==group_2])
181 | dists.append(dist)
182 | return np.array(dists)
183 |
184 |
185 | def wasserstein(X: ArrayLike,
186 | groups: ArrayLike=None,
187 | method: str='ovr',
188 | standardize: bool=False,
189 | reducer: Callable=None) -> np.ndarray:
190 | """
191 | Step over all features and apply the distance function to the groups.
192 |
193 | Method can be 'ovr', 'ovo', or a function.
194 |
195 | The function `reducer` is applied to the ovo result to reduce it to one
196 | value per group per feature. If you want the full array of each group
197 | against each other, either pass the identity function (`lambda x: x`,
198 | which adds an axis) or use `wasserstein_ovo()` directly, one feature at
199 | a time. Default function: `np.mean`.
200 |
201 | The Wasserstein distance is a measure of the distance between two
202 | probability distributions. It is also known as the earth mover's distance.
203 | This function uses the implementation in `scipy.stats.wasserstein_distance`.
204 |
205 | Args:
206 | X (array): The data. Must be a 2D array, or a sequence of 2D arrays.
207 | If the latter, then the groups are implicitly assumed to be the
208 | datasets in the sequence and the `groups` argument is ignored.
209 | groups (array): The group labels.
210 | method (str or func): The method to use. Can be 'ovr', 'ovo', or a
211 | function.
212 | standardize (bool): Whether to standardize the data. Default False.
213 | reducer (func): The function to reduce the ovo result to one value
214 | per group. Default: `np.mean`.
215 |
216 | Returns:
217 | array: The 2D array of Wasserstein distance scores.
218 |
219 | Examples:
220 | >>> data = np.array([1, 1, 1, 2, 2, 1, 1, 2, 2, 3, 2, 2, 2, 3, 3])
221 | >>> groups = [0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2]
222 | >>> wasserstein(data.reshape(-1, 1), groups=groups, standardize=True)
223 | array([[0.97490053],
224 | [0.1392715 ],
225 | [1.11417203]])
226 | >>> wasserstein(data.reshape(-1, 1), groups=groups, method='ovo', standardize=True)
227 | array([[0.97490053],
228 | [0.69635752],
229 | [1.11417203]])
230 | >>> data = [[[1], [1.22475], [-1.22475], [0], [1], [-1], [-1]], [[1], [0], [1]], [[1], [0], [-1]]]
231 | >>> wasserstein(data, standardize=False)
232 | array([[0.39754762],
233 | [0.71161667],
234 | [0.24495 ]])
235 | """
236 | # If the data is a sequence of arrays, then assume the groups are the
237 | # datasets in the sequence and the `groups` argument is ignored.
238 | try:
239 | first = X[0]
240 | except KeyError:
241 | # Probably a DataFrame.
242 | first = np.asarray(X)[0]
243 |
244 | stacked = False
245 | first = np.asarray(first)
246 | try:
247 | if first.ndim == 2:
248 | stacked = True
249 | except AttributeError:
250 | # It's probably a 1D array or list.
251 | pass
252 |
253 | if stacked:
254 | # Not sure this test makes sense any more.
255 | # if not is_standard_normal(first.flat):
256 | # warnings.warn('First group does not appear to be standardized.', stacklevel=2)
257 | groups = np.hstack([len(dataset)*[i] for i, dataset in enumerate(X)])
258 | X = np.vstack(X)
259 |
260 | # Now we can treat X as a 2D array.
261 | X = np.asarray(X)
262 | if X.ndim != 2:
263 | raise ValueError("X must be a 2D array-like.")
264 |
265 | if groups is None:
266 | raise ValueError("Must provide a 1D array of group labels if X is a 2D array.")
267 | n_groups = np.unique(groups).size
268 |
269 | if n_groups < 2:
270 | raise ValueError("Must have 2 or more groups.")
271 |
272 | methods = {
273 | 'ovr': wasserstein_ovr,
274 | 'ovo': wasserstein_ovo,
275 | }
276 | func = methods.get(method, method)
277 |
278 | if reducer is None:
279 | reducer = np.mean
280 |
281 | dist_arrs = []
282 | for feature in X.T:
283 | dists = func(feature, groups=groups, standardize=standardize)
284 | if method == 'ovo':
285 | dists = squareform(dists)
286 | dists = dists[~np.eye(n_groups, dtype=bool)].reshape(n_groups, -1)
287 | dists = [reducer(d) for d in dists]
288 | dist_arrs.append(dists)
289 |
290 | return np.swapaxes(dist_arrs, 0, 1)
291 |
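# The `reducer` argument is only applied to 'ovo' results. A sketch (using the
# toy `data` and `groups` from the docstring examples above; not a tested
# doctest) of two possible reducers:
#
#     wasserstein(data.reshape(-1, 1), groups=groups, method='ovo',
#                 standardize=True, reducer=np.max)        # largest OVO distance per group
#     wasserstein(data.reshape(-1, 1), groups=groups, method='ovo',
#                 standardize=True, reducer=lambda x: x)   # keep all K-1 distances per group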
292 |
293 | def bw_silverman(a: ArrayLike) -> float:
294 | """
295 | Calculate the Silverman bandwidth, a popular rule of thumb for kernel
296 | density estimation bandwidth.
297 |
298 | Silverman, BW (1981), "Using kernel density estimates to investigate
299 | multimodality", Journal of the Royal Statistical Society. Series B Vol. 43,
300 | No. 1 (1981), pp. 97-99.
301 |
302 | Args:
303 | a (array): The data.
304 |
305 | Returns:
306 | float: The Silverman bandwidth.
307 |
308 | Examples:
309 | >>> data = [1, 1, 1, 2, 2, 1, 1, 2, 2, 3, 2, 2, 2, 3, 3]
310 | >>> abs(bw_silverman(data) - 0.581810759152688) < 1e-9
311 | True
312 | """
313 | n, d = np.array(a).size, 1
314 | return np.power(n, -1 / (d + 4))
315 |
316 |
317 | def bw_scott(a: ArrayLike) -> float:
318 | """
319 | Calculate the Scott bandwidth, a popular rule of thumb for kernel
320 | density estimation bandwidth.
321 |
322 | Args:
323 | a (array): The data.
324 |
325 | Returns:
326 | float: The Scott bandwidth.
327 |
328 | Examples:
329 | >>> data = [1, 1, 1, 2, 2, 1, 1, 2, 2, 3, 2, 2, 2, 3, 3]
330 | >>> abs(bw_scott(data) - 0.6162678270732356) < 1e-9
331 | True
332 | """
333 | n, d = np.array(a).size, 1
334 | return np.power(n * (d + 2) / 4, -1 / (d + 4))
335 |
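# For context, a sketch (not part of the library's API or tests): either rule
# of thumb can be passed straight to scikit-learn's KernelDensity, which is
# what `fit_kde()` below does internally.
#
#     from sklearn.neighbors import KernelDensity
#     kde = KernelDensity(bandwidth=bw_scott(data)).fit(np.reshape(data, (-1, 1)))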
336 |
337 | def cv_kde(a: ArrayLike, n_bandwidths: int=20, cv: int=10) -> float:
338 | """
339 | Run a cross validation grid search to identify the optimal bandwidth for
340 | the kernel density estimation.
341 |
342 | Searches between half the minimum of the Silverman and Scott bandwidths,
343 | and twice the maximum. Checks `n_bandwidths` bandwidths, default 20.
344 |
345 | Args:
346 | a (array): The data.
347 | n_bandwidths (int): The number of bandwidths to try. Default 20.
348 | cv (int): The number of cross validation folds. Default 10.
349 |
350 | Returns:
351 |         float: The optimal bandwidth.
352 |
353 | Example:
354 | >>> rng = np.random.default_rng(42)
355 | >>> data = rng.normal(size=100)
356 | >>> cv_kde(data, n_bandwidths=3, cv=3)
357 | 0.5212113989811242
358 | >>> cv_kde(rng.normal(size=(10, 10)))
359 | Traceback (most recent call last):
360 | ...
361 | ValueError: Data must be 1D.
362 | """
363 | a = np.asarray(a)
364 | if a.ndim >= 2:
365 | raise ValueError("Data must be 1D.")
366 | if not is_standard_normal(a):
367 | warnings.warn('Data does not appear to be standardized, the KDE may be a poor fit.', stacklevel=2)
368 | a = a.reshape(-1, 1)
369 |
370 | silverman = bw_silverman(a)
371 | scott = bw_scott(a)
372 | start = min(silverman, scott)/2
373 | stop = max(silverman, scott)*2
374 | params = {'bandwidth': np.linspace(start, stop, n_bandwidths)}
375 | model = GridSearchCV(KernelDensity(), params, cv=cv)
376 | model.fit(a)
377 | return model.best_params_['bandwidth']
378 |
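# A possible workflow (sketch, not a tested doctest): pick a bandwidth by
# cross validation and pass it explicitly to `fit_kde()`. This is equivalent
# to calling `get_kde(data, method='cv')`.
#
#     bw = cv_kde(data, n_bandwidths=10, cv=5)
#     x, kde = fit_kde(data, bandwidth=bw)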
379 |
380 | def fit_kde(a: ArrayLike, bandwidth: float=1.0, kernel: str='gaussian') -> tuple[np.ndarray, np.ndarray]:
381 | """
382 | Fit a kernel density estimation to the data.
383 |
384 | Args:
385 | a (array): The data.
386 | bandwidth (float): The bandwidth. Default 1.0.
387 | kernel (str): The kernel. Default 'gaussian'.
388 |
389 | Returns:
390 | tuple: (x, kde).
391 |
392 | Example:
393 | >>> rng = np.random.default_rng(42)
394 | >>> data = rng.normal(size=100)
395 | >>> x, kde = fit_kde(data)
396 |         >>> abs(x[0] + 3.2124714013056916) < 1e-9
397 |         True
398 |         >>> abs(kde[0] - 0.014367259502733645) < 1e-9
399 |         True
400 | >>> len(kde)
401 | 200
402 | >>> fit_kde(rng.normal(size=(10, 10)))
403 | Traceback (most recent call last):
404 | ...
405 | ValueError: Data must be 1D.
406 | """
407 | a = np.squeeze(a)
408 | if a.ndim >= 2:
409 | raise ValueError("Data must be 1D.")
410 | if not is_standard_normal(a):
411 | warnings.warn('Data does not appear to be standardized, the KDE may be a poor fit.', stacklevel=2)
412 | a = a.reshape(-1, 1)
413 | model = KernelDensity(kernel=kernel, bandwidth=bandwidth)
414 | model.fit(a)
415 | mima = 1.5 * bandwidth * np.abs(a).max()
416 | x = np.linspace(-mima, mima, 200).reshape(-1, 1)
417 | log_density = model.score_samples(x)
418 |
419 | return np.squeeze(x), np.exp(log_density)
420 |
421 |
422 | def get_kde(a: ArrayLike, method: str='scott') -> tuple[np.ndarray, np.ndarray]:
423 | """
424 | Get a kernel density estimation for the data. By default, the bandwidth is
425 | estimated using the Scott rule of thumb. Other options are the Silverman
426 | rule of thumb, or cross validation (using the `cv_kde()` function).
427 |
428 | This function is a wrapper for `fit_kde()`, with convenient options for
429 | bandwidth estimation.
430 |
431 | Args:
432 | a (array): The data.
433 | method (str): The rule of thumb for bandwidth estimation. Must be one
434 | of 'silverman', 'scott', or 'cv'. Default 'scott'.
435 |
436 | Returns:
437 | tuple: (x, kde).
438 |
439 | Examples:
440 | >>> rng = np.random.default_rng(42)
441 | >>> data = rng.normal(size=100)
442 | >>> x, kde = get_kde(data)
443 |         >>> abs(x[0] + 1.354649738246933) < 1e-9
444 |         True
445 |         >>> abs(kde[0] - 0.162332012191087) < 1e-9
446 |         True
447 | >>> len(kde)
448 | 200
449 | """
450 | methods = {'silverman': bw_silverman, 'scott': bw_scott, 'cv': cv_kde}
451 | bw = methods.get(method)(a)
452 | return fit_kde(a, bandwidth=bw)
453 |
454 |
455 | def find_large_peaks(x: ArrayLike, y: ArrayLike, threshold: float=0.1) -> tuple[np.ndarray, np.ndarray]:
456 | """
457 | Find the peaks in the array. Returns the values of x and y at the largest
458 | peaks, using threshold × max(peak amplitudes) as the cut-off. That is,
459 | peaks smaller than that are not returned.
460 |
461 | Uses `scipy.signal.find_peaks()`, with convenient options for thresholding,
462 | and returns the x and y values of the peaks in a named tuple.
463 |
464 | Args:
465 | x (array): The x values.
466 | y (array): The y values.
467 | threshold (float): The threshold for peak amplitude. Default 0.1.
468 |
469 | Returns:
470 | tuple: (x_peaks, y_peaks). Arrays representing the x and y values of
471 | the peaks.
472 |
473 | Examples:
474 | >>> x = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12]
475 | >>> y = [1, 2, 3, 2, 1, 2, 15, 40, 19, 2, 1, 1]
476 | >>> x_peaks, y_peaks = find_large_peaks(x, y)
477 | >>> x_peaks
478 | array([8.])
479 | >>> y_peaks
480 | array([40.])
481 | """
482 | x, y = np.asarray(x), np.asarray(y)
483 | pos, hts = find_peaks(y, height=y)
484 | hts = hts['peak_heights']
485 | if any(hts):
486 | z, h = np.array([(x[p].item(), h) for p, h in zip(pos, hts) if h > threshold * hts.max()]).T
487 | else:
488 | z, h = np.array([]), np.array([])
489 | Peaks = namedtuple('Peaks', ['positions', 'heights'])
490 | return Peaks(z, h)
491 |
492 |
493 | def kde_peaks(a: ArrayLike, method: str='scott', threshold: float=0.1) -> tuple[np.ndarray, np.ndarray]:
494 | """
495 | Find the peaks in the kernel density estimation. This might help you
496 | identify the modes in the data.
497 |
498 | Wraps `get_kde()` and `find_large_peaks()` to find the peaks in the
499 | kernel density estimation. By default, the bandwidth is estimated using
500 | the Scott rule of thumb. Other options are the Silverman rule of thumb, or
501 | cross validation (using the `cv_kde()` function).
502 |
503 | Args:
504 | a (array): The data.
505 | method (str): The rule of thumb for bandwidth estimation. Must be one
506 | of 'silverman', 'scott', or 'cv'. Default 'scott'.
507 | threshold (float): The threshold for peak amplitude. Default 0.1.
508 |
509 | Returns:
510 | tuple: (x_peaks, y_peaks). Arrays representing the x and y values of
511 | the peaks.
512 |
513 | Examples:
514 | >>> rng = np.random.default_rng(42)
515 | >>> data = np.concatenate([rng.normal(size=100)-2, rng.normal(size=100)+2])
516 | >>> x_peaks, y_peaks = kde_peaks(data)
517 | >>> x_peaks
518 | array([-1.67243035, 1.88998226])
519 | >>> y_peaks
520 | array([0.22014721, 0.19729456])
521 | """
522 | return find_large_peaks(*get_kde(a, method), threshold=threshold)
523 |
524 |
525 | def is_multimodal(a: ArrayLike,
526 | groups:Optional[ArrayLike]=None,
527 | method: str='scott',
528 | threshold: float=0.1) -> Union[bool, np.ndarray]:
529 | """
530 | Test if the data is multimodal by looking for peaks in the kernel density
531 | estimation. If there is more than one peak, the data are considered
532 | multimodal.
533 |
534 | If groups are passed, the data are partitioned by group and tested
535 | separately. The result is an array of booleans, one per group.
536 |
537 | Wraps `kde_peaks()` to find the peaks in the kernel density estimation.
538 |
539 | Args:
540 | a (array): The data.
541 | groups (array): Group labels, if the data is to be partitioned before
542 | testing.
543 | method (str): The rule of thumb for bandwidth estimation. Must be one
544 | of 'silverman', 'scott', or 'cv'. Default 'scott'.
545 | threshold (float): The threshold for peak amplitude. Default 0.1.
546 |
547 | Returns:
548 | bool or np.ndarray: True if the data appear to be multimodal. If groups
549 | were passed, an array with one result per group is returned.
550 |
551 | Examples:
552 | >>> rng = np.random.default_rng(42)
553 | >>> a = rng.normal(size=200)
554 | >>> is_multimodal(a)
555 | False
556 | >>> b = np.concatenate([rng.normal(size=100)-2, rng.normal(size=100)+2])
557 | >>> is_multimodal(b)
558 | True
559 | >>> c = np.concatenate([a, b])
560 | >>> is_multimodal(c, groups=[0]*200 + [1]*200)
561 | array([False, True])
562 | """
563 | a = np.asarray(a)
564 | result = []
565 | with warnings.catch_warnings(record=True) as w:
566 | for group in iter_groups(groups):
567 | x, y = kde_peaks(a[group], method=method, threshold=threshold)
568 | result.append(len(x) > 1)
569 | if w:
570 | warnings.warn('ℹ️ Multimodality detection may not have been possible for all groups.', stacklevel=2)
571 | return result[0] if len(result) == 1 else np.array(result)
572 |
--------------------------------------------------------------------------------
/docs/notebooks/_Pandas_accessor.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "id": "355e4657",
6 | "metadata": {},
7 | "source": [
8 | "# Pandas accessor for `redflag`"
9 | ]
10 | },
11 | {
12 | "cell_type": "code",
13 | "execution_count": 1,
14 | "id": "75bb8303",
15 | "metadata": {},
16 | "outputs": [
17 | {
18 | "data": {
84 | "text/plain": [
85 | " Vp Vs rho Lithology\n",
86 | "0 3045.6 1595.7 2.109121 sandstone\n",
87 | "1 3000.6 1517.1 2.090342 sandstone\n",
88 | "2 3363.6 2041.5 2.131990 sandstone\n",
89 | "3 3195.3 1606.2 2.184939 sandstone\n",
90 | "4 4237.5 2448.6 2.472231 sandstone"
91 | ]
92 | },
93 | "execution_count": 1,
94 | "metadata": {},
95 | "output_type": "execute_result"
96 | }
97 | ],
98 | "source": [
99 | "import pandas as pd\n",
100 | "\n",
101 | "df = pd.read_csv(\"https://geocomp.s3.amazonaws.com/data/RPC_simple.csv\")\n",
102 | "\n",
103 | "df.head()"
104 | ]
105 | },
106 | {
107 | "cell_type": "code",
108 | "execution_count": 2,
109 | "id": "39832c6c",
110 | "metadata": {},
111 | "outputs": [
112 | {
113 | "data": {
114 | "text/plain": [
115 | "1.4130434782602501"
116 | ]
117 | },
118 | "execution_count": 2,
119 | "metadata": {},
120 | "output_type": "execute_result"
121 | }
122 | ],
123 | "source": [
124 | "import redflag as rf\n",
125 | "\n",
126 | "rf.imbalance_degree([1,1,2,1,1,2,2,1,1,1,1,1,2,1,3,3,3,3,2,1,1,1,1])"
127 | ]
128 | },
129 | {
130 | "cell_type": "code",
131 | "execution_count": 4,
132 | "id": "372a6bf1",
133 | "metadata": {},
134 | "outputs": [],
135 | "source": [
136 | "from pandas.api.extensions import register_dataframe_accessor\n",
137 | "\n",
138 | "@register_dataframe_accessor(\"redflag\")\n",
139 | "class RedflagAccessor:\n",
140 | " def __init__(self, pandas_obj):\n",
141 | " self._obj = pandas_obj\n",
142 | "\n",
143 | " def imbalance_degree(self, target=None):\n",
144 | " return rf.imbalance_degree(self._obj[target])\n",
145 | "\n",
146 | " def minority_classes(self, target=None):\n",
147 | " return rf.minority_classes(self._obj[target])"
148 | ]
149 | },
150 | {
151 | "cell_type": "code",
152 | "execution_count": 14,
153 | "id": "b110936f",
154 | "metadata": {},
155 | "outputs": [
156 | {
157 | "data": {
158 | "text/plain": [
159 | "True"
160 | ]
161 | },
162 | "execution_count": 14,
163 | "metadata": {},
164 | "output_type": "execute_result"
165 | }
166 | ],
167 | "source": [
168 | "rf.dummy_re([0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0])"
169 | ]
170 | },
171 | {
172 | "cell_type": "code",
173 | "execution_count": 5,
174 | "id": "7c3963ec",
175 | "metadata": {},
176 | "outputs": [
177 | {
178 | "data": {
179 | "text/plain": [
180 | "-1.0"
181 | ]
182 | },
183 | "execution_count": 5,
184 | "metadata": {},
185 | "output_type": "execute_result"
186 | }
187 | ],
188 | "source": [
189 | "df.redflag.imbalance_degree(target='Lithology')"
190 | ]
191 | },
192 | {
193 | "cell_type": "markdown",
194 | "id": "ea50c3ce",
195 | "metadata": {},
196 | "source": [
197 | "Noice."
198 | ]
199 | },
200 | {
201 | "cell_type": "code",
202 | "execution_count": 6,
203 | "id": "94f7c2cd",
204 | "metadata": {},
205 | "outputs": [
206 | {
207 | "data": {
208 | "text/plain": [
209 | "array([], dtype=float64)"
210 | ]
211 | },
212 | "execution_count": 6,
213 | "metadata": {},
214 | "output_type": "execute_result"
215 | }
216 | ],
217 | "source": [
218 | "df.redflag.minority_classes(target='Lithology')"
219 | ]
220 | },
221 | {
222 | "cell_type": "code",
223 | "execution_count": 7,
224 | "id": "62ea78b5",
225 | "metadata": {},
226 | "outputs": [
227 | {
228 | "data": {
229 | "text/plain": [
230 | "array([], dtype=int64)"
231 | ]
232 | },
233 | "execution_count": 7,
234 | "metadata": {},
235 | "output_type": "execute_result"
236 | }
237 | ],
238 | "source": [
239 | "import redflag as rf\n",
240 | "data = 3 * [-3, -2, -2, -1, 0, 0, 0, 1, 2, 2, 3]\n",
241 | "rf.get_outliers(data)\n",
242 | "# array([], dtype=int64)"
243 | ]
244 | },
245 | {
246 | "cell_type": "code",
247 | "execution_count": 8,
248 | "id": "84c883db",
249 | "metadata": {},
250 | "outputs": [
251 | {
252 | "data": {
253 | "text/plain": [
254 | "array([1.8, 1. , 1.2, 1.6, 1.4, 1.5, 1.1, 1.9, 1.3, 1.7])"
255 | ]
256 | },
257 | "execution_count": 8,
258 | "metadata": {},
259 | "output_type": "execute_result"
260 | }
261 | ],
262 | "source": [
263 | "import numpy as np\n",
264 | "import redflag as rf\n",
265 | "from sklearn.linear_model import LinearRegression\n",
266 | "from sklearn.pipeline import make_pipeline\n",
267 | "\n",
268 | "X = np.arange(10).reshape(-1, 1)\n",
269 | "np.random.shuffle(X)\n",
270 | "y = np.squeeze(10 * X + 1)\n",
271 | "pipe = make_pipeline(rf.DistributionComparator(), LinearRegression())\n",
272 | "pipe.fit(X, y)\n",
273 | "pipe.predict(X / 100) # Dramatically different distribution."
274 | ]
275 | },
276 | {
277 | "cell_type": "code",
278 | "execution_count": 9,
279 | "id": "6427e5ee",
280 | "metadata": {},
281 | "outputs": [
282 | {
283 | "data": {
284 | "text/plain": [
285 | "array([1.8, 1. , 1.2, 1.6, 1.4, 1.5, 1.1, 1.9, 1.3, 1.7])"
286 | ]
287 | },
288 | "execution_count": 9,
289 | "metadata": {},
290 | "output_type": "execute_result"
291 | }
292 | ],
293 | "source": [
294 | "pipe.predict(X / 100)"
295 | ]
296 | },
297 | {
298 | "cell_type": "code",
299 | "execution_count": 10,
300 | "id": "6e912a70",
301 | "metadata": {},
302 | "outputs": [
303 | {
304 | "data": {
305 | "text/plain": [
306 | "array([[8],\n",
307 | " [0],\n",
308 | " [2],\n",
309 | " [6],\n",
310 | " [4],\n",
311 | " [5],\n",
312 | " [1],\n",
313 | " [9],\n",
314 | " [3],\n",
315 | " [7]])"
316 | ]
317 | },
318 | "execution_count": 10,
319 | "metadata": {},
320 | "output_type": "execute_result"
321 | }
322 | ],
323 | "source": [
324 | "X"
325 | ]
326 | },
327 | {
328 | "cell_type": "markdown",
329 | "id": "45185280",
330 | "metadata": {},
331 | "source": [
332 | "## Series Accessor"
333 | ]
334 | },
335 | {
336 | "cell_type": "code",
337 | "execution_count": 11,
338 | "id": "7ec28d7f",
339 | "metadata": {},
340 | "outputs": [],
341 | "source": [
342 | "from pandas.api.extensions import register_series_accessor\n",
343 | "from pandas.api.extensions import register_dataframe_accessor\n",
344 | "\n",
345 | "@register_series_accessor(\"redflag\")\n",
346 | "class SeriesAccessor:\n",
347 | " def __init__(self, pandas_obj):\n",
348 | " self._obj = pandas_obj\n",
349 | "\n",
350 | " def imbalance_degree(self):\n",
351 | " return rf.imbalance_degree(self._obj)\n",
352 | "\n",
353 | " def minority_classes(self):\n",
354 | " return rf.minority_classes(self._obj)\n",
355 | " \n",
356 | "\n",
357 | " def dummy_scores(self, task=None, random_state=None):\n",
358 | " if task is None:\n",
359 | " task = 'regression' if rf.is_continuous(self._obj) else 'classification'\n",
360 | " if task == 'classification':\n",
361 | " return rf.dummy_classification_scores(self._obj, random_state=random_state)\n",
362 | " elif task == 'regression':\n",
363 | " return rf.dummy_regression_scores(self._obj)\n",
364 | " else:\n",
365 | " raise ValueError(\"`task` must be 'classification' or 'regression', or None to decide automatically.\")\n",
366 | " "
367 | ]
368 | },
369 | {
370 | "cell_type": "code",
371 | "execution_count": 12,
372 | "id": "88447a57",
373 | "metadata": {},
374 | "outputs": [
375 | {
376 | "data": {
377 | "text/plain": [
378 | "-1.0"
379 | ]
380 | },
381 | "execution_count": 12,
382 | "metadata": {},
383 | "output_type": "execute_result"
384 | }
385 | ],
386 | "source": [
387 | "df['Lithology'].redflag.imbalance_degree()"
388 | ]
389 | },
390 | {
391 | "cell_type": "code",
392 | "execution_count": 13,
393 | "id": "5f89c66d",
394 | "metadata": {},
395 | "outputs": [
396 | {
397 | "ename": "AttributeError",
398 | "evalue": "module 'redflag' has no attribute 'dummy_classification_scores'",
399 | "output_type": "error",
400 | "traceback": [
401 | "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
402 | "\u001b[0;31mAttributeError\u001b[0m Traceback (most recent call last)",
403 | "Cell \u001b[0;32mIn [13], line 1\u001b[0m\n\u001b[0;32m----> 1\u001b[0m \u001b[43mdf\u001b[49m\u001b[43m[\u001b[49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[38;5;124;43mLithology\u001b[39;49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[43m]\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mredflag\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mdummy_scores\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\n",
404 | "Cell \u001b[0;32mIn [11], line 20\u001b[0m, in \u001b[0;36mSeriesAccessor.dummy_scores\u001b[0;34m(self, task, random_state)\u001b[0m\n\u001b[1;32m 18\u001b[0m task \u001b[38;5;241m=\u001b[39m \u001b[38;5;124m'\u001b[39m\u001b[38;5;124mregression\u001b[39m\u001b[38;5;124m'\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m rf\u001b[38;5;241m.\u001b[39mis_continuous(\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_obj) \u001b[38;5;28;01melse\u001b[39;00m \u001b[38;5;124m'\u001b[39m\u001b[38;5;124mclassification\u001b[39m\u001b[38;5;124m'\u001b[39m\n\u001b[1;32m 19\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m task \u001b[38;5;241m==\u001b[39m \u001b[38;5;124m'\u001b[39m\u001b[38;5;124mclassification\u001b[39m\u001b[38;5;124m'\u001b[39m:\n\u001b[0;32m---> 20\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mrf\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mdummy_classification_scores\u001b[49m(\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_obj, random_state\u001b[38;5;241m=\u001b[39mrandom_state)\n\u001b[1;32m 21\u001b[0m \u001b[38;5;28;01melif\u001b[39;00m task \u001b[38;5;241m==\u001b[39m \u001b[38;5;124m'\u001b[39m\u001b[38;5;124mregression\u001b[39m\u001b[38;5;124m'\u001b[39m:\n\u001b[1;32m 22\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m rf\u001b[38;5;241m.\u001b[39mdummy_regression_scores(\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_obj)\n",
405 | "\u001b[0;31mAttributeError\u001b[0m: module 'redflag' has no attribute 'dummy_classification_scores'"
406 | ]
407 | }
408 | ],
409 | "source": [
410 | "df['Lithology'].redflag.dummy_scores()"
411 | ]
412 | },
413 | {
414 | "cell_type": "markdown",
415 | "id": "369cf2f6",
416 | "metadata": {},
417 | "source": [
418 | "## Avoid depending on pandas\n",
419 | "\n",
420 | "We want to avoid importing Pandas if a person doesn't want to use the accessors.\n",
421 | "\n",
422 |     "BTW, we can't (or don't want to) avoid depending on `sklearn`, so the sklearn.py module does not need to do the same."
423 | ]
424 | },
425 | {
426 | "cell_type": "code",
427 | "execution_count": 14,
428 | "id": "e9b7c6f0",
429 | "metadata": {},
430 | "outputs": [],
431 | "source": [
432 | "def identity(arg):\n",
433 | " def decorator(func):\n",
434 | " return func \n",
435 | " return decorator\n",
436 | "\n",
437 | "@identity('foo')\n",
438 | "def hello(x):\n",
439 | " return f\"Hello {x}\""
440 | ]
441 | },
442 | {
443 | "cell_type": "code",
444 | "execution_count": 15,
445 | "id": "2dc1164b",
446 | "metadata": {},
447 | "outputs": [
448 | {
449 | "data": {
450 | "text/plain": [
451 | "'Hello Matt'"
452 | ]
453 | },
454 | "execution_count": 15,
455 | "metadata": {},
456 | "output_type": "execute_result"
457 | }
458 | ],
459 | "source": [
460 | "hello('Matt')"
461 | ]
462 | },
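  {
   "cell_type": "code",
   "execution_count": null,
   "id": "hypothetical-accessor-guard",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Sketch (added for illustration, not executed in this notebook): one way the\n",
    "# identity decorator above could guard accessor registration so that importing\n",
    "# redflag does not require pandas. The names below are illustrative only.\n",
    "try:\n",
    "    from pandas.api.extensions import register_series_accessor\n",
    "except ImportError:\n",
    "    def register_series_accessor(name):\n",
    "        def decorator(cls):\n",
    "            return cls\n",
    "        return decorator\n",
    "\n",
    "@register_series_accessor(\"redflag_demo\")\n",
    "class GuardedSeriesAccessor:\n",
    "    def __init__(self, pandas_obj):\n",
    "        self._obj = pandas_obj"
   ]
  },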
463 | {
464 | "cell_type": "markdown",
465 | "id": "7c51e1d2",
466 | "metadata": {},
467 | "source": [
468 | "Test with environment `foo`, which does not have `pandas`..."
469 | ]
470 | },
471 | {
472 | "cell_type": "code",
473 | "execution_count": 16,
474 | "id": "e6ea05f0",
475 | "metadata": {},
476 | "outputs": [
477 | {
478 | "data": {
544 | "text/plain": [
545 | " Vp Vs rho Lithology\n",
546 | "0 3045.6 1595.7 2.109121 sandstone\n",
547 | "1 3000.6 1517.1 2.090342 sandstone\n",
548 | "2 3363.6 2041.5 2.131990 sandstone\n",
549 | "3 3195.3 1606.2 2.184939 sandstone\n",
550 | "4 4237.5 2448.6 2.472231 sandstone"
551 | ]
552 | },
553 | "execution_count": 16,
554 | "metadata": {},
555 | "output_type": "execute_result"
556 | }
557 | ],
558 | "source": [
559 | "import pandas as pd\n",
560 | "\n",
561 | "df = pd.read_csv(\"https://geocomp.s3.amazonaws.com/data/RPC_simple.csv\")\n",
562 | "\n",
563 | "df.head()"
564 | ]
565 | },
566 | {
567 | "cell_type": "code",
568 | "execution_count": 17,
569 | "id": "348a0d99",
570 | "metadata": {},
571 | "outputs": [
572 | {
573 | "data": {
574 | "text/plain": [
575 | "1.4130434782602501"
576 | ]
577 | },
578 | "execution_count": 17,
579 | "metadata": {},
580 | "output_type": "execute_result"
581 | }
582 | ],
583 | "source": [
584 | "import redflag as rf\n",
585 | "\n",
586 | "rf.imbalance_degree([1,1,2,1,1,2,2,1,1,1,1,1,2,1,3,3,3,3,2,1,1,1,1])"
587 | ]
588 | },
589 | {
590 | "cell_type": "markdown",
591 | "id": "7aa17834",
592 | "metadata": {},
593 | "source": [
594 | "## Dummy models"
595 | ]
596 | },
597 | {
598 | "cell_type": "code",
599 | "execution_count": 18,
600 | "id": "f40ed815",
601 | "metadata": {},
602 | "outputs": [
603 | {
604 | "data": {
605 | "text/plain": [
606 | "0 sandstone\n",
607 | "1 sandstone\n",
608 | "2 sandstone\n",
609 | "3 sandstone\n",
610 | "4 sandstone\n",
611 | " ... \n",
612 | "395 shale\n",
613 | "396 shale\n",
614 | "397 shale\n",
615 | "398 shale\n",
616 | "399 shale\n",
617 | "Name: Lithology, Length: 400, dtype: object"
618 | ]
619 | },
620 | "execution_count": 18,
621 | "metadata": {},
622 | "output_type": "execute_result"
623 | }
624 | ],
625 | "source": [
626 | "df['Lithology']"
627 | ]
628 | },
629 | {
630 | "cell_type": "code",
631 | "execution_count": 67,
632 | "id": "ffadaf98",
633 | "metadata": {},
634 | "outputs": [
635 | {
636 | "data": {
637 | "text/plain": [
638 | "{'most_frequent': {'f1': 0.33333333333333326, 'roc_auc': 0.5},\n",
639 | " 'stratified': {'f1': 0.47233840363611357, 'roc_auc': 0.4725}}"
640 | ]
641 | },
642 | "execution_count": 67,
643 | "metadata": {},
644 | "output_type": "execute_result"
645 | }
646 | ],
647 | "source": [
648 | "from sklearn.dummy import DummyClassifier\n",
649 | "from sklearn.metrics import f1_score, roc_auc_score\n",
650 | "from sklearn.metrics import mean_squared_error, r2_score\n",
651 | "\n",
652 | "def dummy_classification_scores(y, random_state=None):\n",
653 | " result = {'most_frequent': {}, 'stratified': {}}\n",
654 | " y = np.asanyarray(y)\n",
655 | " X = np.ones_like(y).reshape(-1, 1)\n",
656 | " for method, scores in result.items():\n",
657 | " model = DummyClassifier(strategy=method, random_state=random_state)\n",
658 | " _ = model.fit(X, y)\n",
659 | " scores['f1'] = f1_score(y, model.predict(X), average='weighted')\n",
660 | " y_prob = model.predict_proba(X)\n",
661 | " if rf.is_binary(y):\n",
662 | " scores['roc_auc'] = roc_auc_score(y, y_prob[:, 1])\n",
663 | " else:\n",
664 | " scores['roc_auc'] = roc_auc_score(y, y_prob, multi_class='ovr') \n",
665 | " return result\n",
666 | "\n",
667 | "dummy_classification_scores(df['Lithology'], random_state=42)"
668 | ]
669 | },
670 | {
671 | "cell_type": "code",
672 | "execution_count": 68,
673 | "id": "b4c958c6",
674 | "metadata": {},
675 | "outputs": [
676 | {
677 | "data": {
678 | "text/plain": [
679 | "{'most_frequent': {'f1': 0.3333333333333333, 'roc_auc': 0.5},\n",
680 | " 'stratified': {'f1': 0.20000000000000004, 'roc_auc': 0.35654761904761906}}"
681 | ]
682 | },
683 | "execution_count": 68,
684 | "metadata": {},
685 | "output_type": "execute_result"
686 | }
687 | ],
688 | "source": [
689 | "y_ = [1, 1, 1, 1, 1, 2, 2, 2, 3, 3]\n",
690 | "dummy_classification_scores(y_, random_state=42)"
691 | ]
692 | },
693 | {
694 | "cell_type": "code",
695 | "execution_count": 72,
696 | "id": "2add677d",
697 | "metadata": {},
698 | "outputs": [],
699 | "source": [
700 | "from sklearn.dummy import DummyRegressor\n",
701 | "\n",
702 | "def dummy_regression_scores(y):\n",
703 | " result = {'mean': {}}\n",
704 | " y = np.asanyarray(y)\n",
705 | " if y.ndim > 1:\n",
706 | " raise ValueError(\"Multilabel target is not supported.\")\n",
707 | " X = np.ones_like(y).reshape(-1, 1) # X is not used by the model.\n",
708 | " for method, scores in result.items():\n",
709 | " model = DummyRegressor(strategy=method)\n",
710 | " _ = model.fit(X, y)\n",
711 | " y_pred = model.predict(X)\n",
712 | " scores['mean_squared_error'] = mean_squared_error(y, y_pred)\n",
713 | " scores['r2'] = r2_score(y, y_pred)\n",
714 | " return result"
715 | ]
716 | },
717 | {
718 | "cell_type": "code",
719 | "execution_count": 73,
720 | "id": "41775588",
721 | "metadata": {},
722 | "outputs": [
723 | {
724 | "data": {
725 | "text/plain": [
726 | "{'mean': {'mean_squared_error': 8.25, 'r2': 0.0}}"
727 | ]
728 | },
729 | "execution_count": 73,
730 | "metadata": {},
731 | "output_type": "execute_result"
732 | }
733 | ],
734 | "source": [
735 | "y = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]\n",
736 | "dummy_regression_scores(y)"
737 | ]
738 | },
739 | {
740 | "cell_type": "code",
741 | "execution_count": null,
742 | "id": "f537e306",
743 | "metadata": {},
744 | "outputs": [],
745 | "source": []
746 | }
747 | ],
748 | "metadata": {
749 | "kernelspec": {
750 | "display_name": "redflag",
751 | "language": "python",
752 | "name": "redflag"
753 | },
754 | "language_info": {
755 | "codemirror_mode": {
756 | "name": "ipython",
757 | "version": 3
758 | },
759 | "file_extension": ".py",
760 | "mimetype": "text/x-python",
761 | "name": "python",
762 | "nbconvert_exporter": "python",
763 | "pygments_lexer": "ipython3",
764 | "version": "3.10.8"
765 | }
766 | },
767 | "nbformat": 4,
768 | "nbformat_minor": 5
769 | }
770 |
--------------------------------------------------------------------------------