├── .github ├── ISSUE_TEMPLATE │ └── bug_report.md └── workflows │ ├── build-test.yml │ ├── publish-docs.yml │ └── pypi-publish.yml ├── .gitignore ├── AUTHORS.md ├── CHANGELOG.md ├── CODE_OF_CONDUCT.md ├── CONTRIBUTING.md ├── LICENSE ├── README.md ├── SECURITY.md ├── docs ├── Makefile ├── _static │ ├── custom.css │ ├── favicon.ico │ ├── redflag.svg │ ├── redflag_logo.png │ └── redflag_social.svg ├── authors.md ├── changelog.md ├── conf.py ├── contributing.md ├── development.md ├── index.rst ├── installation.md ├── license.md ├── notebooks │ ├── Basic_usage.ipynb │ ├── Demo.ipynb │ ├── Tutorial.ipynb │ ├── Using_redflag_with_Pandas.ipynb │ ├── Using_redflag_with_sklearn.ipynb │ └── _Pandas_accessor.ipynb ├── post_process_html.py ├── pre_process_ipynb.py ├── redflag.rst └── what_is_redflag.md ├── pyproject.toml ├── src └── redflag │ ├── __init__.py │ ├── distributions.py │ ├── imbalance.py │ ├── importance.py │ ├── independence.py │ ├── markov.py │ ├── outliers.py │ ├── pandas.py │ ├── sklearn.py │ ├── target.py │ └── utils.py └── tests ├── README.md ├── __init__.py ├── conftest.py ├── test_markov.py ├── test_pandas.py └── test_sklearn.py /.github/ISSUE_TEMPLATE/bug_report.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Bug report 3 | about: Create a report to help us improve 4 | title: '' 5 | labels: bug 6 | assignees: '' 7 | 8 | --- 9 | 10 | **Describe the bug** 11 | A clear and concise description of what the bug is. 12 | 13 | **To reproduce** 14 | Steps to reproduce the behavior: 15 | 1. Go to '...' 16 | 2. Click on '....' 17 | 3. Scroll down to '....' 18 | 4. See error 19 | 20 | **Expected behavior** 21 | A clear and concise description of what you expected to happen. 22 | 23 | **Desktop (please complete the following information):** 24 | - OS: [e.g. iOS] 25 | 26 | **Additional context** 27 | Add any other context about the problem here. 
28 | -------------------------------------------------------------------------------- /.github/workflows/build-test.yml: -------------------------------------------------------------------------------- 1 | name: Tests 2 | 3 | on: 4 | workflow_call: 5 | workflow_dispatch: 6 | push: 7 | branches: [ main, develop ] 8 | pull_request: 9 | branches: [ main ] 10 | 11 | jobs: 12 | build: 13 | runs-on: ${{ matrix.os }} 14 | strategy: 15 | fail-fast: false 16 | matrix: 17 | os: [ubuntu-latest, macos-latest, windows-latest] 18 | python-version: ["3.8", "3.9", "3.10", "3.11", "3.12"] 19 | 20 | steps: 21 | 22 | - uses: actions/checkout@v4 23 | 24 | - name: Set up Python ${{ matrix.python-version }} 25 | uses: actions/setup-python@v5 26 | with: 27 | python-version: ${{ matrix.python-version }} 28 | allow-prereleases: true 29 | 30 | - name: Install dependencies 31 | run: | 32 | python -m pip install --upgrade pip 33 | pip install .[dev] 34 | 35 | - name: Test with pytest 36 | run: | 37 | pytest 38 | -------------------------------------------------------------------------------- /.github/workflows/publish-docs.yml: -------------------------------------------------------------------------------- 1 | name: Docs 2 | 3 | on: 4 | workflow_call: 5 | workflow_dispatch: 6 | 7 | jobs: 8 | deploy: 9 | runs-on: ubuntu-latest 10 | steps: 11 | 12 | - uses: actions/checkout@v4 13 | 14 | - name: Set up Python 15 | uses: actions/setup-python@v5 16 | with: 17 | python-version: '3.x' 18 | 19 | - name: Install package 20 | run: | 21 | python -m pip install --upgrade pip 22 | pip install .[dev] 23 | 24 | - name: Build docs 25 | run: | 26 | cd docs 27 | make html 28 | 29 | - name: Publish docs 30 | uses: JamesIves/github-pages-deploy-action@v4 31 | with: 32 | branch: gh-pages 33 | folder: docs/_build/html 34 | -------------------------------------------------------------------------------- /.github/workflows/pypi-publish.yml: -------------------------------------------------------------------------------- 1 | name: Publish to PyPI 2 | 3 | on: 4 | release: 5 | types: [published] 6 | 7 | jobs: 8 | tests: 9 | uses: ./.github/workflows/build-test.yml 10 | 11 | docs: 12 | uses: ./.github/workflows/publish-docs.yml 13 | 14 | deploy: 15 | needs: [tests, docs] 16 | runs-on: ubuntu-latest 17 | steps: 18 | 19 | - uses: actions/checkout@v4 20 | 21 | - name: Set up Python 22 | uses: actions/setup-python@v5 23 | with: 24 | python-version: '3.x' 25 | 26 | - name: Install package 27 | run: | 28 | python -m pip install --upgrade pip 29 | pip install .[dev] 30 | 31 | - name: Build package 32 | run: python -m build 33 | 34 | - name: Publish package 35 | uses: pypa/gh-action-pypi-publish@27b31702a0e7fc50959f5ad993c78deac1bdfc29 36 | with: 37 | user: __token__ 38 | password: ${{ secrets.PYPI_API_TOKEN }} 39 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Mac 2 | .DS_Store 3 | 4 | # Version file autocreated in pyproject.toml and redflag/__init__.py 5 | _version.py 6 | 7 | # Processed docs 8 | _notebooks 9 | 10 | # API docs are built 11 | docs/redflag.*.rst 12 | 13 | # Other 14 | .vscode 15 | 16 | # Byte-compiled / optimized / DLL files 17 | __pycache__/ 18 | *.py[cod] 19 | *$py.class 20 | 21 | # C extensions 22 | *.so 23 | 24 | # Distribution / packaging 25 | .Python 26 | build/ 27 | develop-eggs/ 28 | dist/ 29 | downloads/ 30 | eggs/ 31 | .eggs/ 32 | lib/ 33 | lib64/ 34 | parts/ 35 | sdist/ 36 | var/ 37 | 
wheels/ 38 | pip-wheel-metadata/ 39 | share/python-wheels/ 40 | *.egg-info/ 41 | .installed.cfg 42 | *.egg 43 | MANIFEST* 44 | 45 | # PyInstaller 46 | # Usually these files are written by a python script from a template 47 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 48 | *.manifest 49 | *.spec 50 | 51 | # Installer logs 52 | pip-log.txt 53 | pip-delete-this-directory.txt 54 | 55 | # Unit test / coverage reports 56 | htmlcov/ 57 | .tox/ 58 | .nox/ 59 | .coverage 60 | .coverage.* 61 | .cache 62 | nosetests.xml 63 | coverage.xml 64 | *.cover 65 | *.py,cover 66 | .hypothesis/ 67 | .pytest_cache/ 68 | 69 | # Translations 70 | *.mo 71 | *.pot 72 | 73 | # Django stuff: 74 | *.log 75 | local_settings.py 76 | db.sqlite3 77 | db.sqlite3-journal 78 | 79 | # Flask stuff: 80 | instance/ 81 | .webassets-cache 82 | 83 | # Scrapy stuff: 84 | .scrapy 85 | 86 | # Sphinx documentation 87 | docs/_build/ 88 | 89 | # PyBuilder 90 | target/ 91 | 92 | # Jupyter Notebook 93 | .ipynb_checkpoints 94 | 95 | # IPython 96 | profile_default/ 97 | ipython_config.py 98 | 99 | # pyenv 100 | .python-version 101 | 102 | # pipenv 103 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 104 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 105 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 106 | # install all needed dependencies. 107 | #Pipfile.lock 108 | 109 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow 110 | __pypackages__/ 111 | 112 | # Celery stuff 113 | celerybeat-schedule 114 | celerybeat.pid 115 | 116 | # SageMath parsed files 117 | *.sage.py 118 | 119 | # Environments 120 | .env 121 | .venv 122 | env/ 123 | venv/ 124 | ENV/ 125 | env.bak/ 126 | venv.bak/ 127 | 128 | # Spyder project settings 129 | .spyderproject 130 | .spyproject 131 | 132 | # Rope project settings 133 | .ropeproject 134 | 135 | # mkdocs documentation 136 | /site 137 | 138 | # mypy 139 | .mypy_cache/ 140 | .dmypy.json 141 | dmypy.json 142 | 143 | # Pyre type checker 144 | .pyre/ 145 | -------------------------------------------------------------------------------- /AUTHORS.md: -------------------------------------------------------------------------------- 1 | # Authors 2 | 3 | The following people have contributed to the project (in alphabetical order): 4 | 5 | - [Matt Hall](https://github.com/kwinkunks), Agile Scientific, Canada (ORCID: [0000-0002-4054-8295](https://orcid.org/0000-0002-4054-8295)) 6 | -------------------------------------------------------------------------------- /CHANGELOG.md: -------------------------------------------------------------------------------- 1 | # Changelog 2 | 3 | 4 | ## 0.5.0, 21 April 2024 5 | 6 | - This release makes more changes to the tests and documentation in response to the review process for [the submission](https://joss.theoj.org/papers/e1ca575ec0c5344144f87176539ef547) to JOSS (see below). 7 | - In particular, see the following issue: [#97](https://github.com/scienxlab/redflag/issues/97) 8 | - Changed the method of handling dynamic versioning. For now the package `__version__` attribute is still defined, but it is deprecated and will be removed in `0.6.0`. Use `importlib.metadata.version('redflag')` to get the version information instead.
9 | - Changed the default `get_outliers()` method from isolation forest (`'iso'`) to Mahalanobis (`'mah'`) to match other functions, e.g. `has_outliers()` and the `sklearn` pipeline object. 10 | - Updated `actions/setup-python` to use v5. 11 | 12 | 13 | ## 0.4.2, 10 December 2023 14 | 15 | - This is a minor release making changes to the tests and documentation in response to the review process for [a submission](https://joss.theoj.org/papers/e1ca575ec0c5344144f87176539ef547) to [The Journal of Open Source Software](https://joss.theoj.org) (JOSS). 16 | - See the following issues: [#89](https://github.com/scienxlab/redflag/issues/89), [#90](https://github.com/scienxlab/redflag/issues/90), [#91](https://github.com/scienxlab/redflag/issues/91), [#92](https://github.com/scienxlab/redflag/issues/92), [#93](https://github.com/scienxlab/redflag/issues/93), [#94](https://github.com/scienxlab/redflag/issues/94) and [#95](https://github.com/scienxlab/redflag/issues/95). 17 | - Now building and testing on Windows and macOS as well as Linux. 18 | - Python version `3.12` added to package classifiers. 19 | - Python version `3.12` tested during CI. 20 | 21 | 22 | ## 0.4.1, 2 October 2023 23 | 24 | - This is a minor release intended to preview new `pandas`-related features for version 0.5.0. 25 | - Added another `pandas` Series accessor, `is_imbalanced()`. 26 | - Added two `pandas` DataFrame accessors, `feature_importances()` and `correlation_detector()`. These are experimental features. 27 | 28 | 29 | ## 0.4.0, 28 September 2023 30 | 31 | - `redflag` can now be installed by the `conda` package and environment manager. To do so, use `conda install -c conda-forge redflag`. 32 | - All of the `sklearn` components can now be instantiated with `warn=False` in order to raise a `ValueError` instead of a warning. This allows you to build pipelines that will break if a detector is triggered. 33 | - Added `redflag.target.is_ordered()` to check if a single-label categorical target is ordered in some way. The test uses a Markov chain analysis, applying a chi-squared test to the transition matrix. In general, the Boolean result should only be used on targets with several classes, perhaps at least 10. Below that, it seems to give a lot of false positives. 34 | - You can now pass `groups` to `redflag.distributions.is_multimodal()`. If present, the modality will be checked for each group, returning a Boolean array of values (one for each group). This allows you to check a feature partitioned by target class, for example. 35 | - Added `redflag.sklearn.MultimodalityDetector` to provide a way to check for multimodal features. If `y` is passed and is categorical, it will be used to partition the data and modality will be checked for each class. 36 | - Added `redflag.sklearn.InsufficientDataDetector` which checks that there are at least M² records (rows in `X`), where M is the number of features (i.e. columns) in `X`. 37 | - Removed `RegressionMultimodalDetector`. Use `MultimodalDetector` instead. 38 | 39 | 40 | ## 0.3.0, 21 September 2023 41 | 42 | - Added some accessors to give access to `redflag` functions directly from `pandas.Series` objects. For example, for a Series `s`, one can call `minority_classes = s.redflag.minority_classes()` instead of `redflag.minority_classes(s)`. Other functions include `imbalance_degree()`, `dummy_scores()` (see below). Probably not very useful yet, but future releases will add some reporting functions that wrap multiple Redflag functions.
**This is an experimental feature and subject to change.** 43 | - Added a Series accessor `report()` to perform a range of tests and make a small text report suitable for printing. Access for a Series `s` like `s.redflag.report()`. **This is an experimental feature and subject to change.** 44 | - Added new documentation page for the Pandas accessor. 45 | - Added `redflag.target.dummy_classification_scores()`, `redflag.target.dummy_regression_scores()`, which train a dummy (i.e. naive) model and compute various relevant scores (MSE and R2 for regression, F1 and ROC-AUC for classification tasks). Additionally, both `most_frequent` and `stratified` strategies are tested for classification tasks; only the `mean` strategy is employed for regression tasks. The helper function `redflag.target.dummy_scores()` tries to guess what kind of task suits the data and calls the appropriate function. 46 | - Moved `redflag.target.update_p()` to `redflag.utils`. 47 | - Added `is_imbalanced()` to return a Boolean depending on a threshold of imbalance degree. Default threshold is 0.5 but the best value is up for debate. 48 | - Removed `utils.has_low_distance_stdev`. 49 | 50 | 51 | ## 0.2.0, 4 September 2023 52 | 53 | - Moved to something more closely resembling semantic versioning, which is the main reason this is version 0.2.0. 54 | - Builds and tests on Python 3.11 have been successful, so now supporting this version. 55 | - Added custom 'alarm' `Detector`, which can be instantiated with a function and a warning to emit when the function returns True for a 1D array. You can easily write your own detectors with this class. 56 | - Added `make_detector_pipeline()` which can take sequences of functions and warnings (or a mapping of functions to warnings) and returns a `scikit-learn.pipeline.Pipeline` containing a `Detector` for each function. 57 | - Added `RegressionMultimodalDetector` to allow detection of non-unimodal distributions in features, when considered across the entire dataset. (Coming soon, a similar detector for classification tasks that will partition the data by class.) 58 | - Redefined `is_standardized` (deprecated) as `is_standard_normal`, which implements the Kolmogorov–Smirnov test. It seems more reliable than assuming the data will have a mean of almost exactly 0 and standard deviation of exactly 1, when all we really care about is that the feature is roughly normal. 59 | - Changed the wording slightly in the existing detector warning messages. 60 | - No longer warning if `y` is `None` in, eg, `ImportanceDetector`, since you most likely know this. 61 | - Some changes to `ImportanceDetector`. It now uses KNN estimators instead of SVMs as the third measure of importance; the SVMs were too unstable, causing numerical issues. It also now requires that the number of important features is less than the total number of features to be triggered. So if you have 2 features and both are important, it does not trigger. 62 | - Improved `is_continuous()` which was erroneously classifying integer arrays with many consecutive values as non-continuous. 63 | - Note that `wasserstein` no longer checks that the data are standardized; this check will probably return in the future, however. 64 | - Added a `Tutorial.ipynb` notebook to the docs. 65 | - Added a **Copy** button to code blocks in the docs. 66 | 67 | 68 | ## 0.1.10, 21 November 2022 69 | 70 | - Added `redflag.importance.least_important_features()` and `redflag.importance.most_important_features()`. 
These functions are complementary (in other words, if the same threshold is used in each, then between them they return all of the features). The default threshold for importance is half the expected value. E.g. if there are 5 features, then the default threshold is half of 0.2, or 0.1. Part of [Issue 2](https://github.com/scienxlab/redflag/issues/2). 71 | - Added `redflag.sklearn.ImportanceDetector` class, which warns if 1 or 2 features have anomalously high importance, or if some features have anomalously low importance. Part of [Issue 2](https://github.com/scienxlab/redflag/issues/2). 72 | - Added `redflag.sklearn.ImbalanceComparator` class, which learns the imbalance present in the training data, then compares what is observed in subsequent data (evaluation, test, or production data). If there's a difference, it throws a warning. Note: it does not warn if there is imbalance present in the training data; use `ImbalanceDetector` for that. 73 | - Added `redflag.sklearn.RfPipeline` class, which is needed to include the `ImbalanceComparator` in a pipeline (because the common-or-garden `sklearn.pipeline.Pipeline` class does not pass `y` into a transformer's `transform()` method). Also added the `redflag.sklearn.make_rf_pipeline()` function to help make pipelines with this special class. These components are straight-up forks of the code in `scikit-learn` (3-clause BSD licensed). 74 | - Added example to `docs/notebooks/Using_redflag_with_sklearn.ipynb` to show how to use these new objects. 75 | - Improved `redflag.is_continuous()`, which was buggy; see [Issue 3](https://github.com/scienxlab/redflag/issues/3). It still fails on some cases. I'm not sure a definitive test for continuousness (or, conversely, discreteness) is possible; it's just a heuristic. 76 | 77 | 78 | ## 0.1.9, 25 August 2022 79 | 80 | - Added some experimental `sklearn` transformers that implement various `redflag` tests. These do not transform the data in any way, they just inspect the data and emit warnings if tests fail. The main ones are: `redflag.sklearn.ClipDetector`, `redflag.sklearn.OutlierDetector`, `redflag.sklearn.CorrelationDetector`, `redflag.sklearn.ImbalanceDetector`, and `redflag.sklearn.DistributionComparator`. 81 | - Added tests for the `sklearn` transformers. These are in `redflag/tests/test_redflag.py` file, whereas all other tests are doctests. You can run all the tests at once with `pytest`; coverage is currently 94%. 82 | - Added `docs/notebooks/Using_redflag_with_sklearn.ipynb` to show how to use these new objects in an `sklearn` pipeline. 83 | - Since there's quite a bit of `sklearn` code in the `redflag` package, it is now a hard dependency. I removed the other dependencies because they are all dependencies of `sklearn`. 84 | - Added `redflag.has_outliers()` to make it easier to check for excessive outliers in a dataset. This function only uses Mahalanobis distance and always works in a multivariate sense. 85 | - Reorganized the `redflag.features` module into new modules: `redflag.distributions`, `redflag.outliers`, and `redflag.independence`. All of the functions are still imported into the `redflag` namespace, so this doesn't affect existing code. 86 | - Added examples to `docs/notebooks/Basic_usage.ipynb`. 87 | - Removed the `class_imbalance()` function, which was confusing. Use `imbalance_ratio()` instead. 88 | 89 | 90 | ## 0.1.8, 8 July 2022 91 | 92 | - Added Wasserstein distance comparisons for univariate and multivariate distributions. 
This works for either a `groups` array, or for multiple dataset splits if that's more convenient. 93 | - Improved `get_outliers()`, removing OneClassSVM method and adding EllipticEnvelope and Mahalanobis distance. 94 | - Added Mahalanobis distance outlier detection function to serve `get_outliers()` or be used on its own. Reproduces the results `zscore_outliers()` used to give for univariate data, so removed that. 95 | - Added `kde_peaks()` function to find peaks in a kernel density estimate. This also needed some other functions, including `fit_kde()`, `get_kde()`, `find_large_peaks()`, and the bandwidth estimators, `bw_silverman()` and `bw_scott()`. 96 | - Added `classes` argument to the class imbalance function, in case there are classes with no data, or to override the classes in the data. 97 | - Fixed a bug in the `feature_importances()` function. 98 | - Fixed a bug in the `is_continuous()` function. 99 | - Improved the `Using_redflag.ipynb` notebook. 100 | - Added `has_nans()`, `has_monotonic()`, and `has_flat()` functions to detect interpolation issues. 101 | - Moved some more helper functions into utils, eg `iter_groups()`, `ecdf()`, `flatten()`, `stdev_to_proportion()` and `proportion_to_stdev()`. 102 | - Wrote a lot more tests, coverage is now at 95%. 103 | 104 | 105 | ## 0.1.3 to 0.1.7, 9–11 February 2022 106 | 107 | - Added `utils.has_low_distance_stdev`. 108 | - Added `utils.has_few_samples`. 109 | - Added `utils.is_standardized()` function to test if a feature or regression target appears to be a Z-score. 110 | - Changed name of `clips()` function to `clipped()` to be more predictable (it goes with `is_clipped()`). 111 | - Documentation. 112 | - CI workflow seems to be stable. 113 | - Mostly just a lot of flailing. 114 | 115 | 116 | ## 0.1.2, 1 February 2022 117 | 118 | - Early release. 119 | - Added auto-versioning. 120 | 121 | 122 | ## 0.1.1, 31 January 2022 123 | 124 | - Early release. 125 | 126 | 127 | ## 0.1.0, 30 January 2022 128 | 129 | - Early release. 130 | -------------------------------------------------------------------------------- /CODE_OF_CONDUCT.md: -------------------------------------------------------------------------------- 1 | # Contributor Covenant Code of Conduct 2 | 3 | ## Our pledge 4 | 5 | In the interest of fostering an open and welcoming environment, we as 6 | contributors and maintainers pledge to make participation in our project and 7 | our community a harassment-free experience for everyone, regardless of age, body 8 | size, disability, ethnicity, sex characteristics, gender identity and expression, 9 | level of experience, education, socio-economic status, nationality, personal 10 | appearance, race, religion, or sexual identity and orientation. 11 | 12 | ## Our standards 13 | 14 | Examples of behavior that contributes to creating a positive environment 15 | include: 16 | 17 | * Using welcoming and inclusive language 18 | * Being respectful of differing viewpoints and experiences 19 | * Gracefully accepting constructive criticism 20 | * Focusing on what is best for the community 21 | * Showing empathy towards other community members 22 | 23 | Examples of unacceptable behavior by participants include: 24 | 25 | * The use of sexualized language or imagery and unwelcome sexual attention or 26 | advances 27 | * Racist or racially biased remarks, attacks, or content. 
28 | * Trolling, insulting/derogatory comments, and personal or political attacks 29 | * Public or private harassment 30 | * Publishing others' private information, such as a physical or electronic 31 | address, without explicit permission 32 | * Other conduct which could reasonably be considered inappropriate in a 33 | professional setting 34 | 35 | ## Our responsibilities 36 | 37 | Project maintainers are responsible for clarifying the standards of acceptable 38 | behavior and are expected to take appropriate and fair corrective action in 39 | response to any instances of unacceptable behavior. 40 | 41 | Project maintainers have the right and responsibility to remove, edit, or 42 | reject comments, commits, code, wiki edits, issues, and other contributions 43 | that are not aligned to this Code of Conduct, or to ban temporarily or 44 | permanently any contributor for other behaviors that they deem inappropriate, 45 | threatening, offensive, or harmful. 46 | 47 | ## Scope 48 | 49 | This Code of Conduct applies within all project spaces, and it also applies when 50 | an individual is representing the project or its community in public spaces. 51 | Examples of representing a project or community include using an official 52 | project e-mail address, posting via an official social media account, or acting 53 | as an appointed representative at an online or offline event. Representation of 54 | a project may be further defined and clarified by project maintainers. 55 | 56 | ## Enforcement 57 | 58 | Instances of abusive, harassing, or otherwise unacceptable behavior may be 59 | reported by contacting any of the following people: 60 | 61 | - Matt Hall, [kwinkunks@gmail.com](mailto:kwinkunks@gmail.com) 62 | 63 | All complaints will be reviewed and investigated and will result in a response that 64 | is deemed necessary and appropriate to the circumstances. The project team is 65 | obligated to maintain confidentiality with regard to the reporter of an incident. 66 | Further details of specific enforcement policies may be posted separately. 67 | 68 | Project maintainers who do not follow or enforce the Code of Conduct in good 69 | faith may face temporary or permanent repercussions as determined by other 70 | members of the project's leadership. 71 | 72 | ## Attribution 73 | 74 | This Code of Conduct is adapted from the [Contributor Covenant](https://www.contributor-covenant.org/), version 1.4, 75 | available at https://www.contributor-covenant.org/version/1/4/code-of-conduct.html 76 | 77 | For answers to common questions about this code of conduct, see 78 | https://www.contributor-covenant.org/faq 79 | -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | # Contributing 2 | 3 | **🙌 Thank you for considering contributing to this project!** 4 | 5 | There are several important ways you can help; here are some examples: 6 | 7 | - Submitting bug reports and feature requests: see [Issues](https://github.com/scienxlab/redflag/issues). 8 | - Proposing code for bug fixes and new features, then [making a pull request](https://docs.github.com/en/pull-requests/collaborating-with-pull-requests/proposing-changes-to-your-work-with-pull-requests/about-pull-requests). 9 | - Fixing typos and generally improving the documentation. 10 | - Writing tutorials, examples, and how-to documents. 
11 | 12 | 13 | ## Code of conduct 14 | 15 | We're fortunate to be part of a large professional community that conducts itself with mutual respect and consideration for others. Scienxlab's [Code of Conduct](https://github.com/scienxlab/community/blob/main/CODE_OF_CONDUCT.md) is part of protecting these features for everyone, everywhere. Please read it. 16 | 17 | 18 | ## Authorship 19 | 20 | If you contribute a pull request to the project and you wish to be identified as an author, please add yourself to `AUTHORS.md`. 21 | 22 | 23 | ## License 24 | 25 | By making a contribution, you agree that it shall be governed by the terms of the license unless another, specific agreement is made with Agile. 26 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. 
For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. 
You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. 
You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # redflag 2 | 3 | [![Tests](https://github.com/scienxlab/redflag/actions/workflows/build-test.yml/badge.svg)](https://github.com/scienxlab/redflag/actions/workflows/build-test.yml) 4 | [![Docs](https://github.com/scienxlab/redflag/actions/workflows/publish-docs.yml/badge.svg)](https://github.com/scienxlab/redflag/actions/workflows/publish-docs.yml) 5 | [![PyPI version](https://img.shields.io/pypi/v/redflag.svg)](https://pypi.org/project/redflag/) 6 | [![Conda Version](https://img.shields.io/conda/vn/conda-forge/redflag.svg)](https://anaconda.org/conda-forge/redflag) 7 | [![PyPI versions](https://img.shields.io/pypi/pyversions/redflag.svg)](https://pypi.org/project/redflag/) 8 | [![PyPI license](https://img.shields.io/pypi/l/redflag.svg)](https://pypi.org/project/redflag/) 9 | 10 | 🚩 `redflag` aims to be an automatic safety net for machine learning datasets. The vision is to accept input of a Pandas `DataFrame` or NumPy `ndarray` representing the input `X` and target `y` in a machine learning task. `redflag` will provide an analysis of each feature, and of the target, including aspects such as class imbalance, leakage, outliers, anomalous data patterns, threats to the IID assumption, and so on. The goal is to complement other projects like `pandas-profiling` and `greatexpectations`. 
11 | 12 | 13 | ## Installation 14 | 15 | You can install this package with `pip`: 16 | 17 | python -m pip install redflag 18 | 19 | Alternatively, you can use the `conda` package manager, pointed at the `conda-forge` channel: 20 | 21 | conda install -c conda-forge redflag 22 | 23 | For developers, there is a `pip` option for installing `dev` dependencies. Use `pip install "redflag[dev]"` to install all testing and documentation packages. 24 | 25 | 26 | ## Example with `sklearn` 27 | 28 | The most useful components of `redflag` are probably the `scikit-learn` "detectors". These sit in your pipeline, look at your training and validation data, and emit warnings if something looks like it might cause a problem. For example, we can get alerted to an imbalanced target vector `y` like so: 29 | 30 | ```python 31 | import redflag as rf 32 | from sklearn.datasets import make_classification 33 | 34 | X, y = make_classification(weights=[0.1]) 35 | 36 | _ = rf.ImbalanceDetector().fit(X, y) 37 | ``` 38 | 39 | This raises a warning: 40 | 41 | ```python 42 | 🚩 The labels are imbalanced by more than the threshold (0.780 > 0.400). See self.minority_classes_ for the minority classes. 43 | ``` 44 | 45 | For maximum effect, put this and other detectors in your pipeline, or use the pre-built `rf.pipeline`, which contains several useful alerts. 46 | 47 | See [the documentation](https://scienxlab.org/redflag), and specifically the notebook [Using `redflag` with `sklearn`.ipynb](https://github.com/scienxlab/redflag/blob/main/docs/notebooks/Using_redflag_with_sklearn.ipynb) for other examples. 48 | 49 | 50 | ## Example of function call 51 | 52 | `redflag` is also a collection of functions. Most of the useful ones take one or more columns of data (usually a 1D or 2D NumPy array) and run a single test. For example, we can do some outlier detection. The `get_outliers()` function returns the indices of data points that are considered outliers: 53 | 54 | ```python 55 | >>> import redflag as rf 56 | >>> data = 3 * [-3, -2, -2, -1, 0, 0, 0, 1, 2, 2, 3] 57 | >>> rf.get_outliers(data) 58 | array([], dtype=int64) 59 | ``` 60 | 61 | That is, there are no outliers. But let's add a clear outlier: a new data record with a value of 100. The function returns the index position(s) of the outlier point(s): 62 | 63 | ```python 64 | >>> rf.get_outliers(data + [100]) 65 | array([33]) 66 | ``` 67 | 68 | See [the documentation](https://scienxlab.org/redflag), and specifically the notebook [Basic_usage.ipynb](https://github.com/scienxlab/redflag/blob/main/docs/notebooks/Basic_usage.ipynb) for several other basic examples. 69 | 70 | 71 | ## Documentation 72 | 73 | [The documentation is online.](https://scienxlab.org/redflag) 74 | 75 | 76 | ## Contributing 77 | 78 | Please see [`CONTRIBUTING.md`](https://github.com/scienxlab/redflag/blob/main/CONTRIBUTING.md). There is also a section [in the documentation](https://scienxlab.org/redflag) about _Development_. 79 | -------------------------------------------------------------------------------- /SECURITY.md: -------------------------------------------------------------------------------- 1 | # Security policy 2 | 3 | 4 | ## Supported versions 5 | 6 | Only the latest version of `redflag` is supported. 7 | 8 | 9 | ## Reporting a vulnerability 10 | 11 | Please do not open an issue on GitHub; instead, report vulnerabilities to hello@scienxlab.org. 12 | 13 | We do not award bounties for security vulnerabilities, but will notify you if and when the report is accepted and acted upon.
14 | -------------------------------------------------------------------------------- /docs/Makefile: -------------------------------------------------------------------------------- 1 | 2 | # You can set these variables from the command line, and also 3 | # from the environment for the first two. 4 | SPHINXOPTS ?= 5 | SPHINXBUILD ?= sphinx-build 6 | SOURCEDIR = . 7 | BUILDDIR = _build 8 | 9 | # Put it first so that "make" without argument is like "make help". 10 | help: 11 | @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 12 | 13 | .PHONY: help html 14 | 15 | html: 16 | python pre_process_ipynb.py $(SOURCEDIR)/notebooks 17 | $(SPHINXBUILD) -E -b html $(SPHINXOPTS) $(SOURCEDIR) $(BUILDDIR)/html 18 | python post_process_html.py $(BUILDDIR)/html 19 | @echo 20 | @echo "Build finished. The HTML pages are in $(BUILDDIR)/html." 21 | -------------------------------------------------------------------------------- /docs/_static/custom.css: -------------------------------------------------------------------------------- 1 | /* Removes Captions from the main page. */ 2 | article p.caption { 3 | display: none; 4 | } 5 | 6 | /* Styles the 'line block' https://docutils.sourceforge.io/docs/user/rst/quickref.html#line-blocks. */ 7 | blockquote { 8 | background: none; 9 | border-left-width: 0px; 10 | padding: 0em; 11 | } 12 | 13 | blockquote div.line { 14 | color: #838383; 15 | display: inline; 16 | font-style: normal !important; 17 | font-size: 150%; 18 | line-height: 125%; 19 | } 20 | 21 | /* Adds the GitHub ribbon. */ 22 | #forkongithub a { 23 | background:rgb(158, 158, 158); 24 | color:#fff; 25 | text-decoration:none; 26 | font-family:arial,sans-serif; 27 | text-align:center; 28 | font-weight:bold; 29 | padding:5px 40px; 30 | font-size:1rem; 31 | line-height:2rem; 32 | position:relative; 33 | transition:0.5s; 34 | } 35 | 36 | #forkongithub a:hover { 37 | background:#14ca29; 38 | color:#fff; 39 | } 40 | 41 | #forkongithub a::after { 42 | bottom:1px; 43 | top:auto; 44 | } 45 | 46 | @media screen and (min-width:800px) { 47 | #forkongithub{ 48 | position:fixed; 49 | display:block; 50 | top:0; 51 | right:0; 52 | width:200px; 53 | overflow:hidden; 54 | height:200px; 55 | z-index:9999; 56 | } 57 | 58 | #forkongithub a { 59 | width:200px; 60 | position:absolute; 61 | top:60px; 62 | right:-60px; 63 | transform:rotate(45deg); 64 | -webkit-transform:rotate(45deg); 65 | -ms-transform:rotate(45deg); 66 | -moz-transform:rotate(45deg); 67 | -o-transform:rotate(45deg); 68 | box-shadow:4px 4px 10px rgba(0,0,0,0.4); 69 | } 70 | } -------------------------------------------------------------------------------- /docs/_static/favicon.ico: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/scienxlab/redflag/f495ddd0729c7ac61dc8d8f54fc706178a0d253f/docs/_static/favicon.ico -------------------------------------------------------------------------------- /docs/_static/redflag.svg: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 19 | 40 | 42 | 49 | 56 | 57 | 61 | 70 | 75 | 84 | 89 | redflag 100 | 105 | 106 | 107 | -------------------------------------------------------------------------------- /docs/_static/redflag_logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/scienxlab/redflag/f495ddd0729c7ac61dc8d8f54fc706178a0d253f/docs/_static/redflag_logo.png 
-------------------------------------------------------------------------------- /docs/authors.md: -------------------------------------------------------------------------------- 1 | ```{include} ../AUTHORS.md 2 | ``` -------------------------------------------------------------------------------- /docs/changelog.md: -------------------------------------------------------------------------------- 1 | ```{include} ../CHANGELOG.md 2 | ``` -------------------------------------------------------------------------------- /docs/conf.py: -------------------------------------------------------------------------------- 1 | # Configuration file for Sphinx documentation builder. 2 | 3 | # -- Setup function ---------------------------------------------------------- 4 | 5 | # Defines custom steps in the process. 6 | 7 | def autodoc_skip_member(app, what, name, obj, skip, options): 8 | """Exclude all private attributes, methods, and dunder methods from Sphinx.""" 9 | import re 10 | exclude = re.findall(r'\._.*', str(obj)) 11 | return skip or exclude 12 | 13 | def remove_module_docstring(app, what, name, obj, options, lines): 14 | """Remove everything after 'Author: '.""" 15 | if what == "module": 16 | keep = [i for i, line in enumerate(lines) if line.startswith("Author: ")] 17 | if keep: 18 | del lines[keep[0]:] 19 | return 20 | 21 | def setup(app): 22 | app.connect('autodoc-skip-member', autodoc_skip_member) 23 | app.connect("autodoc-process-docstring", remove_module_docstring) 24 | return 25 | 26 | 27 | # -- Path setup -------------------------------------------------------------- 28 | 29 | # If extensions (or modules to document with autodoc) are in another directory, 30 | # add these directories to sys.path here. If the directory is relative to the 31 | # documentation root, use os.path.abspath to make it absolute, like shown here. 32 | 33 | import os 34 | import sys 35 | sys.path.insert(0, os.path.abspath('../src')) 36 | 37 | 38 | # -- Project information ----------------------------------------------------- 39 | 40 | project = 'redflag' 41 | copyright = '2024, The Redflag Authors' 42 | author = 'The Redflag Authors' 43 | 44 | 45 | # -- General configuration --------------------------------------------------- 46 | 47 | # Add any Sphinx extension module names here, as strings. They can be 48 | # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom 49 | # ones. 50 | extensions = [ 51 | 'sphinxcontrib.apidoc', 52 | 'sphinx.ext.githubpages', 53 | 'sphinx.ext.napoleon', 54 | 'sphinx.ext.coverage', 55 | 'sphinx_copybutton', 56 | 'myst_nb', 57 | ] 58 | 59 | myst_enable_extensions = ["dollarmath", "amsmath"] 60 | 61 | # Apidoc automation 62 | # https://pypi.org/project/sphinxcontrib-apidoc/ 63 | # The apidoc extension and this code automatically update apidoc. 64 | apidoc_module_dir = '../src/redflag' 65 | apidoc_output_dir = './' 66 | apidoc_excluded_paths = [] 67 | apidoc_toc_file = False 68 | apidoc_separate_modules = True 69 | 70 | # Add any paths that contain templates here, relative to this directory. 71 | templates_path = ['_templates'] 72 | 73 | # List of patterns, relative to source directory, that match files and 74 | # directories to ignore when looking for source files. 75 | # This pattern also affects html_static_path and html_extra_path. 76 | exclude_patterns = ['_build', 'notebooks'] 77 | 78 | 79 | # -- Options for HTML output ------------------------------------------------- 80 | 81 | # The theme to use for HTML and HTML Help pages. 
See the documentation for 82 | # a list of builtin themes. 83 | # 84 | # https://sphinx-themes.org/sample-sites/furo/ 85 | html_theme = 'furo' 86 | html_title = '' 87 | html_theme_options = { 88 | "sidebar_hide_name": True, 89 | } 90 | 91 | # Add any paths that contain custom static files (such as style sheets) here, 92 | # relative to this directory. They are copied after the builtin static files, 93 | # so a file named "default.css" will overwrite the builtin "default.css". 94 | html_static_path = ['_static'] 95 | 96 | html_css_files = [ 97 | 'custom.css', 98 | ] 99 | 100 | # Branding. 101 | html_favicon = '_static/favicon.ico' 102 | html_logo = '_static/redflag_logo.png' 103 | -------------------------------------------------------------------------------- /docs/contributing.md: -------------------------------------------------------------------------------- 1 | ```{include} ../CONTRIBUTING.md 2 | ``` -------------------------------------------------------------------------------- /docs/development.md: -------------------------------------------------------------------------------- 1 | # Development 2 | 3 | If you'd like to develop `redflag`, this page should help you get started. 4 | 5 | 6 | ## Installation 7 | 8 | You can install this package with `pip` or `conda`. The `dev` option will install the packages you need for testing and building the documentation. 9 | 10 | ```shell 11 | python -m pip install "redflag[dev]" 12 | ``` 13 | 14 | 15 | ## Contributing 16 | 17 | If you'd like to contribute pull requests back to the main `redflag` project, please see [`CONTRIBUTING.md`](https://github.com/scienxlab/redflag/blob/main/CONTRIBUTING.md). 18 | 19 | 20 | ## Testing 21 | 22 | You can run the tests (requires `pytest` and `pytest-cov`) with: 23 | 24 | ```shell 25 | pytest 26 | ``` 27 | 28 | Most of the tests are `doctest` tests, which are contained in the docstrings of this package's functions. There are further tests in the `tests` folder. 29 | 30 | 31 | ## Building the package 32 | 33 | This repo uses PEP 518-style packaging. [Read more about this](https://setuptools.pypa.io/en/latest/build_meta.html) and [about Python packaging in general](https://packaging.python.org/en/latest/tutorials/packaging-projects/). 34 | 35 | To build `redflag` locally: 36 | 37 | ```shell 38 | python -m build 39 | ``` 40 | 41 | This builds both `.tar.gz` and `.whl` files, either of which you can install with `pip`. 42 | 43 | 44 | ## Building the docs 45 | 46 | You can build the docs with the following commands: 47 | 48 | ```shell 49 | cd docs 50 | make html 51 | ``` 52 | 53 | Don't just run `sphinx-build` manually: there is other stuff happening in the `Makefile`. 54 | 55 | There is a continuous integration script to update the docs on published releases. 56 | 57 | 58 | ## Continuous integration 59 | 60 | This repo has two GitHub 'workflows' or 'actions': 61 | 62 | - Push to `main`: Run all tests on all versions of Python. This is the **Build and test** workflow. 63 | - Publish a new release: Build and upload to PyPI. This is the **Publish to PyPI** workflow. Publish using the GitHub interface, for example ([read more](https://docs.github.com/en/repositories/releasing-projects-on-github/managing-releases-in-a-repository)). 64 | -------------------------------------------------------------------------------- /docs/index.rst: -------------------------------------------------------------------------------- 1 | :hide-toc: 2 | 3 | ..
container:: 4 | :name: forkongithub 5 | 6 | `Fork on GitHub `_ 7 | 8 | 9 | Redflag: safer ML by design 10 | =========================== 11 | 12 | | ``redflag`` is a lightweight safety net for machine 13 | | learning. Given a ``DataFrame`` or ``ndarray``, 14 | | ``redflag`` will analyse the features and the target, 15 | | and warn you about class imbalance, leakage, outliers, 16 | | anomalous data patterns, threats to the IID assumption, 17 | | and more. 18 | 19 | 20 | Quick start 21 | ----------- 22 | 23 | .. toctree:: 24 | :caption: Quick start 25 | 26 | Install ``redflag`` with pip or with ``conda`` from the ``conda-forge`` channel: 27 | 28 | .. code-block:: shell 29 | 30 | pip install redflag 31 | 32 | Import ``redflag`` in your Python program: 33 | 34 | .. code-block:: python 35 | 36 | import redflag as rf 37 | 38 | There are three main ways to use ``redflag``: 39 | 40 | 1. ``scikit-learn`` components for your pipelines, e.g. ``rf.ImbalanceDetector().fit_transform(X, y)``. 41 | 2. ``pandas`` accessors on Series and DataFrames, e.g. ``df['target'].redflag.imbalance_degree()``. 42 | 3. As a library of standalone functions, e.g. ``rf.imbalance_degree(y)``. 43 | 44 | Carry on exploring with the user guide below. 45 | 46 | 47 | User guide 48 | ---------- 49 | 50 | .. toctree:: 51 | :maxdepth: 2 52 | :caption: User guide 53 | 54 | installation 55 | what_is_redflag 56 | _notebooks/Basic_usage.ipynb 57 | _notebooks/Using_redflag_with_sklearn.ipynb 58 | _notebooks/Using_redflag_with_Pandas.ipynb 59 | _notebooks/Tutorial.ipynb 60 | 61 | 62 | API reference 63 | ------------- 64 | 65 | .. toctree:: 66 | :maxdepth: 2 67 | :caption: API reference 68 | 69 | redflag 70 | 71 | 72 | Other resources 73 | --------------- 74 | 75 | .. toctree:: 76 | :maxdepth: 1 77 | :caption: Other resources 78 | 79 | development 80 | contributing 81 | authors 82 | license 83 | changelog 84 | 85 | 86 | Indices and tables 87 | ------------------ 88 | 89 | * :ref:`genindex` 90 | * :ref:`modindex` 91 | * :ref:`search` 92 | 93 | 94 | .. toctree:: 95 | :caption: Project links 96 | :hidden: 97 | 98 | PyPI releases 99 | Code in GitHub 100 | Issue tracker 101 | Community guidelines 102 | Scienxlab 103 | -------------------------------------------------------------------------------- /docs/installation.md: -------------------------------------------------------------------------------- 1 | # 🚩 Installation 2 | 3 | At the command line: 4 | 5 | ```shell 6 | pip install redflag 7 | ``` 8 | 9 | Or, if you use Conda environments: 10 | 11 | ```shell 12 | conda install -c conda-forge redflag 13 | ``` 14 | 15 | You can add the `conda-forge` channel as a source for future installations like so: 16 | 17 | ```shell 18 | conda config --add channels conda-forge 19 | conda config --set channel_priority strict 20 | ``` 21 | 22 | 23 | ## Optional dependencies 24 | 25 | For developers, there is an option to install `dev` dependencies: `pip install "redflag[dev]"` to install all testing and documentation packages. 26 | 27 | If you want to help develop `redflag`, please read [Development](development.md). 
28 | -------------------------------------------------------------------------------- /docs/license.md: -------------------------------------------------------------------------------- 1 | # License 2 | 3 | ```{include} ../LICENSE 4 | ``` 5 | -------------------------------------------------------------------------------- /docs/notebooks/Using_redflag_with_Pandas.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "id": "a8d12712-5c7b-4acb-bb8b-e73efcb9b5dc", 6 | "metadata": {}, 7 | "source": [ 8 | "# 🚩 Using `redflag` with Pandas\n", 9 | "\n", 10 | "As well as using `redflag`'s functions directly (see `Basic_usage.ipynb`), or with `sklearn` (see `Using_redflag_with_sklearn.ipynb`), `redflag` has some Pandas 'accessors' that give you access to some `redflag` functions almost as if they were methods on Pandas objects.\n", 11 | "\n", 12 | "The best way to get the idea is to look at an example.\n", 13 | "\n", 14 | "First, even though we may not use it directly, we have to import `redflag` to get access to its functions. As long as you have `pandas` installed, it will register the accessors." 15 | ] 16 | }, 17 | { 18 | "cell_type": "code", 19 | "execution_count": 4, 20 | "id": "77aa7f67-0bc7-48e9-87f4-183aa2dc2c35", 21 | "metadata": {}, 22 | "outputs": [ 23 | { 24 | "data": { 25 | "text/plain": [ 26 | "'0.4.2rc2.dev14+g54704af.d20240421'" 27 | ] 28 | }, 29 | "execution_count": 4, 30 | "metadata": {}, 31 | "output_type": "execute_result" 32 | } 33 | ], 34 | "source": [ 35 | "import redflag as rf\n", 36 | "\n", 37 | "rf.__version__" 38 | ] 39 | }, 40 | { 41 | "cell_type": "code", 42 | "execution_count": 5, 43 | "id": "3dbcf6e1-1cb5-4ca5-b64a-bc1d9e7b174f", 44 | "metadata": {}, 45 | "outputs": [ 46 | { 47 | "data": { 48 | "text/html": [ 49 | "
\n", 50 | "\n", 63 | "\n", 64 | " \n", 65 | " \n", 66 | " \n", 67 | " \n", 68 | " \n", 69 | " \n", 70 | " \n", 71 | " \n", 72 | " \n", 73 | " \n", 74 | " \n", 75 | " \n", 76 | " \n", 77 | " \n", 78 | " \n", 79 | " \n", 80 | " \n", 81 | " \n", 82 | " \n", 83 | " \n", 84 | " \n", 85 | " \n", 86 | " \n", 87 | " \n", 88 | " \n", 89 | " \n", 90 | " \n", 91 | " \n", 92 | " \n", 93 | " \n", 94 | " \n", 95 | " \n", 96 | " \n", 97 | " \n", 98 | " \n", 99 | " \n", 100 | " \n", 101 | " \n", 102 | " \n", 103 | " \n", 104 | " \n", 105 | " \n", 106 | " \n", 107 | " \n", 108 | " \n", 109 | " \n", 110 | " \n", 111 | " \n", 112 | " \n", 113 | " \n", 114 | " \n", 115 | " \n", 116 | " \n", 117 | " \n", 118 | " \n", 119 | " \n", 120 | " \n", 121 | " \n", 122 | " \n", 123 | " \n", 124 | " \n", 125 | " \n", 126 | " \n", 127 | " \n", 128 | " \n", 129 | " \n", 130 | " \n", 131 | " \n", 132 | " \n", 133 | " \n", 134 | " \n", 135 | " \n", 136 | " \n", 137 | " \n", 138 | " \n", 139 | " \n", 140 | " \n", 141 | " \n", 142 | " \n", 143 | " \n", 144 | " \n", 145 | " \n", 146 | " \n", 147 | " \n", 148 | " \n", 149 | " \n", 150 | " \n", 151 | " \n", 152 | " \n", 153 | " \n", 154 | " \n", 155 | " \n", 156 | " \n", 157 | " \n", 158 | " \n", 159 | " \n", 160 | " \n", 161 | " \n", 162 | " \n", 163 | " \n", 164 | " \n", 165 | " \n", 166 | " \n", 167 | " \n", 168 | " \n", 169 | " \n", 170 | " \n", 171 | " \n", 172 | " \n", 173 | " \n", 174 | " \n", 175 | " \n", 176 | " \n", 177 | " \n", 178 | " \n", 179 | " \n", 180 | " \n", 181 | " \n", 182 | " \n", 183 | " \n", 184 | " \n", 185 | " \n", 186 | " \n", 187 | " \n", 188 | " \n", 189 | " \n", 190 | " \n", 191 | " \n", 192 | " \n", 193 | " \n", 194 | "
Well NameDepthFormationRelPosMarineGRILDDeltaPHIPHINDPEFaciesLATITUDELONGITUDEILD_log10LithologyRHOBMineralogySiliciclastic
0SHRIMPLIN851.3064A1 SH1.000177.454.6131769.911.9154.63.037.978076-100.9873050.664siltstone2393.499945siliciclasticTrue
1SHRIMPLIN851.4588A1 SH0.979178.264.58141914.212.5654.13.037.978076-100.9873050.661siltstone2416.119814siliciclasticTrue
2SHRIMPLIN851.6112A1 SH0.957179.054.54988114.813.0503.63.037.978076-100.9873050.658siltstone2404.576056siliciclasticTrue
3SHRIMPLIN851.7636A1 SH0.936186.104.51855913.913.1153.53.037.978076-100.9873050.655siltstone2393.249071siliciclasticTrue
4SHRIMPLIN851.9160A1 SH0.915174.584.43608613.513.3003.43.037.978076-100.9873050.647siltstone2382.602601siliciclasticTrue
\n", 195 | "
" 196 | ], 197 | "text/plain": [ 198 | " Well Name Depth Formation RelPos Marine GR ILD DeltaPHI \\\n", 199 | "0 SHRIMPLIN 851.3064 A1 SH 1.000 1 77.45 4.613176 9.9 \n", 200 | "1 SHRIMPLIN 851.4588 A1 SH 0.979 1 78.26 4.581419 14.2 \n", 201 | "2 SHRIMPLIN 851.6112 A1 SH 0.957 1 79.05 4.549881 14.8 \n", 202 | "3 SHRIMPLIN 851.7636 A1 SH 0.936 1 86.10 4.518559 13.9 \n", 203 | "4 SHRIMPLIN 851.9160 A1 SH 0.915 1 74.58 4.436086 13.5 \n", 204 | "\n", 205 | " PHIND PE Facies LATITUDE LONGITUDE ILD_log10 Lithology \\\n", 206 | "0 11.915 4.6 3.0 37.978076 -100.987305 0.664 siltstone \n", 207 | "1 12.565 4.1 3.0 37.978076 -100.987305 0.661 siltstone \n", 208 | "2 13.050 3.6 3.0 37.978076 -100.987305 0.658 siltstone \n", 209 | "3 13.115 3.5 3.0 37.978076 -100.987305 0.655 siltstone \n", 210 | "4 13.300 3.4 3.0 37.978076 -100.987305 0.647 siltstone \n", 211 | "\n", 212 | " RHOB Mineralogy Siliciclastic \n", 213 | "0 2393.499945 siliciclastic True \n", 214 | "1 2416.119814 siliciclastic True \n", 215 | "2 2404.576056 siliciclastic True \n", 216 | "3 2393.249071 siliciclastic True \n", 217 | "4 2382.602601 siliciclastic True " 218 | ] 219 | }, 220 | "execution_count": 5, 221 | "metadata": {}, 222 | "output_type": "execute_result" 223 | } 224 | ], 225 | "source": [ 226 | "import pandas as pd\n", 227 | "\n", 228 | "df = pd.read_csv('https://raw.githubusercontent.com/scienxlab/datasets/main/kgs/panoma-training-data.csv')\n", 229 | "\n", 230 | "df.head()" 231 | ] 232 | }, 233 | { 234 | "cell_type": "markdown", 235 | "id": "d77e460b-b925-4dec-b56d-d3f18ed1ecbb", 236 | "metadata": {}, 237 | "source": [ 238 | "## Series accessor" 239 | ] 240 | }, 241 | { 242 | "cell_type": "markdown", 243 | "id": "98f5c772-a33d-43cf-82cf-54dc21535133", 244 | "metadata": {}, 245 | "source": [ 246 | "For the time being, there are only accessors on Pandas `Series` objects. 
For example:" 247 | ] 248 | }, 249 | { 250 | "cell_type": "code", 251 | "execution_count": 6, 252 | "id": "1b17a7e8-1d28-4e1b-9b7d-ecdbbe750aaf", 253 | "metadata": {}, 254 | "outputs": [], 255 | "source": [ 256 | "# Call the Series s for simplicity:\n", 257 | "s = df['Lithology']" 258 | ] 259 | }, 260 | { 261 | "cell_type": "markdown", 262 | "id": "52e5e4e1-9200-46b2-9d77-9cc0d2cbc4a8", 263 | "metadata": {}, 264 | "source": [ 265 | "Now we can call the `redflag` function `imbalance_degree()` as if it were a method (but notice the extra `redflag` we have to insert to access the method):" 266 | ] 267 | }, 268 | { 269 | "cell_type": "code", 270 | "execution_count": 7, 271 | "id": "6af691f4-90a3-4a8e-b842-a20f70c72314", 272 | "metadata": {}, 273 | "outputs": [ 274 | { 275 | "data": { 276 | "text/plain": [ 277 | "3.378593040846633" 278 | ] 279 | }, 280 | "execution_count": 7, 281 | "metadata": {}, 282 | "output_type": "execute_result" 283 | } 284 | ], 285 | "source": [ 286 | "s.redflag.imbalance_degree()" 287 | ] 288 | }, 289 | { 290 | "cell_type": "markdown", 291 | "id": "f2ea6821-0610-44b4-a855-653642ea089d", 292 | "metadata": {}, 293 | "source": [ 294 | "Or we can ask for the new 'dummy' scores:" 295 | ] 296 | }, 297 | { 298 | "cell_type": "code", 299 | "execution_count": 8, 300 | "id": "5897e460-cc15-4858-939b-b91b19fafc9f", 301 | "metadata": {}, 302 | "outputs": [ 303 | { 304 | "data": { 305 | "text/plain": [ 306 | "{'f1': 0.24566600930871996,\n", 307 | " 'roc_auc': 0.5021684735059516,\n", 308 | " 'strategy': 'stratified',\n", 309 | " 'task': 'classification'}" 310 | ] 311 | }, 312 | "execution_count": 8, 313 | "metadata": {}, 314 | "output_type": "execute_result" 315 | } 316 | ], 317 | "source": [ 318 | "s.redflag.dummy_scores()" 319 | ] 320 | }, 321 | { 322 | "cell_type": "markdown", 323 | "id": "3b9be98e-5642-4bab-80eb-f4c226422781", 324 | "metadata": {}, 325 | "source": [ 326 | "Let's try that on a regression target like `df['RHOB']`" 327 | ] 328 | }, 329 | { 330 | "cell_type": "code", 331 | "execution_count": 9, 332 | "id": "f734bb50-15e9-43c3-b31f-1a078e398dc3", 333 | "metadata": {}, 334 | "outputs": [ 335 | { 336 | "data": { 337 | "text/plain": [ 338 | "{'mean_squared_error': 47528.78263092096,\n", 339 | " 'r2': 0.0,\n", 340 | " 'strategy': 'mean',\n", 341 | " 'task': 'regression'}" 342 | ] 343 | }, 344 | "execution_count": 9, 345 | "metadata": {}, 346 | "output_type": "execute_result" 347 | } 348 | ], 349 | "source": [ 350 | "df['RHOB'].redflag.dummy_scores()" 351 | ] 352 | }, 353 | { 354 | "cell_type": "markdown", 355 | "id": "381501a1-8944-4b3f-a4cf-d80e08fbac4f", 356 | "metadata": {}, 357 | "source": [ 358 | "Or we can ask for a 'report' (very simple for now):" 359 | ] 360 | }, 361 | { 362 | "cell_type": "code", 363 | "execution_count": 10, 364 | "id": "02380595-2b47-4718-9b58-ef6b170f29b1", 365 | "metadata": {}, 366 | "outputs": [ 367 | { 368 | "name": "stdout", 369 | "output_type": "stream", 370 | "text": [ 371 | "Continuous data suitable for regression\n", 372 | "Outliers: [ 95 96 132 175 176 177 222 223 263 526 527 531 532 533\n", 373 | " 534 575 576 577 578 579 580 581 582 583 584 585 586 587\n", 374 | " 588 621 622 633 634 635 636 652 653 654 660 661 662 663\n", 375 | " 711 712 713 756 757 758 759 760 768 769 770 771 772 773\n", 376 | " 774 775 776 777 778 779 780 781 782 800 801 802 803 804\n", 377 | " 818 819 821 822 823 824 835 836 841 842 843 844 845 846\n", 378 | " 849 850 934 935 936 937 938 1039 1040 1044 1048 1049 1113 1114\n", 379 | " 1115 1116 1145 1146 1147 1148 
1149 1150 1151 1216 1217 1218 1221 1222\n", 380 | " 1223 1224 1225 1304 1313 1314 1315 1316 1368 1369 1370 1371 1372 1373\n", 381 | " 1374 1375 1446 1447 1496 1497 1498 1499 1546 1547 1548 1549 1567 1568\n", 382 | " 1622 1623 1624 1662 1663 1664 1665 1666 1722 1723 1724 1725 1726 1735\n", 383 | " 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1753 1754 1755 1756\n", 384 | " 1757 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789\n", 385 | " 1790 1805 1806 1807 1808 1809 1810 1812 1813 1866 1868 1869 1870 1981\n", 386 | " 1982 2054 2055 2139 2327 2415 2416 2417 2418 2488 2489 2490 2867 2868\n", 387 | " 2869 2870 2871 2872 2873 2882 2883 2884 2888 2889 2921 2922 2923 2924\n", 388 | " 2925 2926 2927 2928 2929 2930 2931 2932 2933 2972 2973 2974 2975 2976\n", 389 | " 3004 3087 3088 3089 3090 3091 3092 3093 3094 3095 3096 3097 3098 3099\n", 390 | " 3100 3101 3102 3109 3110 3111 3112 3113 3114 3115 3341 3429 3430 3443\n", 391 | " 3444 3515 3516 3517 3861 3862 3863 3905 3906 3907 3931 3932 3933 3934\n", 392 | " 3935]\n", 393 | "Correlated: True\n", 394 | "Dummy scores:{'mean': {'mean_squared_error': 47528.78263092096, 'r2': 0.0}}\n", 395 | "\n" 396 | ] 397 | } 398 | ], 399 | "source": [ 400 | "print(df['RHOB'].redflag.report())" 401 | ] 402 | }, 403 | { 404 | "cell_type": "markdown", 405 | "id": "e007d9b8-4346-4d9a-93a5-45222a137248", 406 | "metadata": {}, 407 | "source": [ 408 | "This is an experimental feature; future releases will have more functions. Feedback welcome!" 409 | ] 410 | }, 411 | { 412 | "cell_type": "markdown", 413 | "id": "ba98b9a2-e0a4-4ed8-a2d4-f87eb882af40", 414 | "metadata": {}, 415 | "source": [ 416 | "## DataFrame accessor\n", 417 | "\n", 418 | "Experimental feature: so far only `feature_importances` and `correlation_detector` are implemented." 419 | ] 420 | }, 421 | { 422 | "cell_type": "code", 423 | "execution_count": 11, 424 | "id": "274cc24d-69ad-49ef-8606-cc9b77b154dc", 425 | "metadata": {}, 426 | "outputs": [ 427 | { 428 | "data": { 429 | "text/plain": [ 430 | "array([0.29029061, 0.18151719, 0.30409475, 0.22409746])" 431 | ] 432 | }, 433 | "execution_count": 11, 434 | "metadata": {}, 435 | "output_type": "execute_result" 436 | } 437 | ], 438 | "source": [ 439 | "features = ['GR', 'RHOB', 'PE', 'ILD_log10']\n", 440 | "df.redflag.feature_importances(features, target='Lithology')" 441 | ] 442 | }, 443 | { 444 | "cell_type": "code", 445 | "execution_count": 12, 446 | "id": "12e3e4ee-e8df-47ba-810d-3bff492d5389", 447 | "metadata": {}, 448 | "outputs": [ 449 | { 450 | "name": "stderr", 451 | "output_type": "stream", 452 | "text": [ 453 | "🚩 Feature 0 appears to be autocorrelated.\n", 454 | "🚩 Feature 1 appears to be autocorrelated.\n", 455 | "🚩 Feature 2 appears to be autocorrelated.\n", 456 | "🚩 Feature 3 appears to be autocorrelated.\n" 457 | ] 458 | } 459 | ], 460 | "source": [ 461 | "df.redflag.correlation_detector(features, target=None)" 462 | ] 463 | }, 464 | { 465 | "cell_type": "markdown", 466 | "id": "a3185f63-64b1-47fd-875d-2c646b84aa65", 467 | "metadata": {}, 468 | "source": [ 469 | "Indeed, all of these features are correlated." 
470 | ] 471 | } 472 | ], 473 | "metadata": { 474 | "kernelspec": { 475 | "display_name": "redflag", 476 | "language": "python", 477 | "name": "redflag" 478 | }, 479 | "language_info": { 480 | "codemirror_mode": { 481 | "name": "ipython", 482 | "version": 3 483 | }, 484 | "file_extension": ".py", 485 | "mimetype": "text/x-python", 486 | "name": "python", 487 | "nbconvert_exporter": "python", 488 | "pygments_lexer": "ipython3", 489 | "version": "3.12.0" 490 | } 491 | }, 492 | "nbformat": 4, 493 | "nbformat_minor": 5 494 | } 495 | -------------------------------------------------------------------------------- /docs/notebooks/_Pandas_accessor.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "id": "355e4657", 6 | "metadata": {}, 7 | "source": [ 8 | "# Pandas accessor for `redflag`" 9 | ] 10 | }, 11 | { 12 | "cell_type": "code", 13 | "execution_count": 1, 14 | "id": "75bb8303", 15 | "metadata": {}, 16 | "outputs": [ 17 | { 18 | "data": { 19 | "text/html": [ 20 | "
\n", 21 | "\n", 34 | "\n", 35 | " \n", 36 | " \n", 37 | " \n", 38 | " \n", 39 | " \n", 40 | " \n", 41 | " \n", 42 | " \n", 43 | " \n", 44 | " \n", 45 | " \n", 46 | " \n", 47 | " \n", 48 | " \n", 49 | " \n", 50 | " \n", 51 | " \n", 52 | " \n", 53 | " \n", 54 | " \n", 55 | " \n", 56 | " \n", 57 | " \n", 58 | " \n", 59 | " \n", 60 | " \n", 61 | " \n", 62 | " \n", 63 | " \n", 64 | " \n", 65 | " \n", 66 | " \n", 67 | " \n", 68 | " \n", 69 | " \n", 70 | " \n", 71 | " \n", 72 | " \n", 73 | " \n", 74 | " \n", 75 | " \n", 76 | " \n", 77 | " \n", 78 | " \n", 79 | " \n", 80 | " \n", 81 | "
VpVsrhoLithology
03045.61595.72.109121sandstone
13000.61517.12.090342sandstone
23363.62041.52.131990sandstone
33195.31606.22.184939sandstone
44237.52448.62.472231sandstone
\n", 82 | "
" 83 | ], 84 | "text/plain": [ 85 | " Vp Vs rho Lithology\n", 86 | "0 3045.6 1595.7 2.109121 sandstone\n", 87 | "1 3000.6 1517.1 2.090342 sandstone\n", 88 | "2 3363.6 2041.5 2.131990 sandstone\n", 89 | "3 3195.3 1606.2 2.184939 sandstone\n", 90 | "4 4237.5 2448.6 2.472231 sandstone" 91 | ] 92 | }, 93 | "execution_count": 1, 94 | "metadata": {}, 95 | "output_type": "execute_result" 96 | } 97 | ], 98 | "source": [ 99 | "import pandas as pd\n", 100 | "\n", 101 | "df = pd.read_csv(\"https://geocomp.s3.amazonaws.com/data/RPC_simple.csv\")\n", 102 | "\n", 103 | "df.head()" 104 | ] 105 | }, 106 | { 107 | "cell_type": "code", 108 | "execution_count": 2, 109 | "id": "39832c6c", 110 | "metadata": {}, 111 | "outputs": [ 112 | { 113 | "data": { 114 | "text/plain": [ 115 | "1.4130434782602501" 116 | ] 117 | }, 118 | "execution_count": 2, 119 | "metadata": {}, 120 | "output_type": "execute_result" 121 | } 122 | ], 123 | "source": [ 124 | "import redflag as rf\n", 125 | "\n", 126 | "rf.imbalance_degree([1,1,2,1,1,2,2,1,1,1,1,1,2,1,3,3,3,3,2,1,1,1,1])" 127 | ] 128 | }, 129 | { 130 | "cell_type": "code", 131 | "execution_count": 4, 132 | "id": "372a6bf1", 133 | "metadata": {}, 134 | "outputs": [], 135 | "source": [ 136 | "from pandas.api.extensions import register_dataframe_accessor\n", 137 | "\n", 138 | "@register_dataframe_accessor(\"redflag\")\n", 139 | "class RedflagAccessor:\n", 140 | " def __init__(self, pandas_obj):\n", 141 | " self._obj = pandas_obj\n", 142 | "\n", 143 | " def imbalance_degree(self, target=None):\n", 144 | " return rf.imbalance_degree(self._obj[target])\n", 145 | "\n", 146 | " def minority_classes(self, target=None):\n", 147 | " return rf.minority_classes(self._obj[target])" 148 | ] 149 | }, 150 | { 151 | "cell_type": "code", 152 | "execution_count": 14, 153 | "id": "b110936f", 154 | "metadata": {}, 155 | "outputs": [ 156 | { 157 | "data": { 158 | "text/plain": [ 159 | "True" 160 | ] 161 | }, 162 | "execution_count": 14, 163 | "metadata": {}, 164 | "output_type": "execute_result" 165 | } 166 | ], 167 | "source": [ 168 | "rf.dummy_re([0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0])" 169 | ] 170 | }, 171 | { 172 | "cell_type": "code", 173 | "execution_count": 5, 174 | "id": "7c3963ec", 175 | "metadata": {}, 176 | "outputs": [ 177 | { 178 | "data": { 179 | "text/plain": [ 180 | "-1.0" 181 | ] 182 | }, 183 | "execution_count": 5, 184 | "metadata": {}, 185 | "output_type": "execute_result" 186 | } 187 | ], 188 | "source": [ 189 | "df.redflag.imbalance_degree(target='Lithology')" 190 | ] 191 | }, 192 | { 193 | "cell_type": "markdown", 194 | "id": "ea50c3ce", 195 | "metadata": {}, 196 | "source": [ 197 | "Noice." 
198 | ] 199 | }, 200 | { 201 | "cell_type": "code", 202 | "execution_count": 6, 203 | "id": "94f7c2cd", 204 | "metadata": {}, 205 | "outputs": [ 206 | { 207 | "data": { 208 | "text/plain": [ 209 | "array([], dtype=float64)" 210 | ] 211 | }, 212 | "execution_count": 6, 213 | "metadata": {}, 214 | "output_type": "execute_result" 215 | } 216 | ], 217 | "source": [ 218 | "df.redflag.minority_classes(target='Lithology')" 219 | ] 220 | }, 221 | { 222 | "cell_type": "code", 223 | "execution_count": 7, 224 | "id": "62ea78b5", 225 | "metadata": {}, 226 | "outputs": [ 227 | { 228 | "data": { 229 | "text/plain": [ 230 | "array([], dtype=int64)" 231 | ] 232 | }, 233 | "execution_count": 7, 234 | "metadata": {}, 235 | "output_type": "execute_result" 236 | } 237 | ], 238 | "source": [ 239 | "import redflag as rf\n", 240 | "data = 3 * [-3, -2, -2, -1, 0, 0, 0, 1, 2, 2, 3]\n", 241 | "rf.get_outliers(data)\n", 242 | "# array([], dtype=int64)" 243 | ] 244 | }, 245 | { 246 | "cell_type": "code", 247 | "execution_count": 8, 248 | "id": "84c883db", 249 | "metadata": {}, 250 | "outputs": [ 251 | { 252 | "data": { 253 | "text/plain": [ 254 | "array([1.8, 1. , 1.2, 1.6, 1.4, 1.5, 1.1, 1.9, 1.3, 1.7])" 255 | ] 256 | }, 257 | "execution_count": 8, 258 | "metadata": {}, 259 | "output_type": "execute_result" 260 | } 261 | ], 262 | "source": [ 263 | "import numpy as np\n", 264 | "import redflag as rf\n", 265 | "from sklearn.linear_model import LinearRegression\n", 266 | "from sklearn.pipeline import make_pipeline\n", 267 | "\n", 268 | "X = np.arange(10).reshape(-1, 1)\n", 269 | "np.random.shuffle(X)\n", 270 | "y = np.squeeze(10 * X + 1)\n", 271 | "pipe = make_pipeline(rf.DistributionComparator(), LinearRegression())\n", 272 | "pipe.fit(X, y)\n", 273 | "pipe.predict(X / 100) # Dramatically different distribution." 274 | ] 275 | }, 276 | { 277 | "cell_type": "code", 278 | "execution_count": 9, 279 | "id": "6427e5ee", 280 | "metadata": {}, 281 | "outputs": [ 282 | { 283 | "data": { 284 | "text/plain": [ 285 | "array([1.8, 1. 
, 1.2, 1.6, 1.4, 1.5, 1.1, 1.9, 1.3, 1.7])" 286 | ] 287 | }, 288 | "execution_count": 9, 289 | "metadata": {}, 290 | "output_type": "execute_result" 291 | } 292 | ], 293 | "source": [ 294 | "pipe.predict(X / 100)" 295 | ] 296 | }, 297 | { 298 | "cell_type": "code", 299 | "execution_count": 10, 300 | "id": "6e912a70", 301 | "metadata": {}, 302 | "outputs": [ 303 | { 304 | "data": { 305 | "text/plain": [ 306 | "array([[8],\n", 307 | " [0],\n", 308 | " [2],\n", 309 | " [6],\n", 310 | " [4],\n", 311 | " [5],\n", 312 | " [1],\n", 313 | " [9],\n", 314 | " [3],\n", 315 | " [7]])" 316 | ] 317 | }, 318 | "execution_count": 10, 319 | "metadata": {}, 320 | "output_type": "execute_result" 321 | } 322 | ], 323 | "source": [ 324 | "X" 325 | ] 326 | }, 327 | { 328 | "cell_type": "markdown", 329 | "id": "45185280", 330 | "metadata": {}, 331 | "source": [ 332 | "## Series Accessor" 333 | ] 334 | }, 335 | { 336 | "cell_type": "code", 337 | "execution_count": 11, 338 | "id": "7ec28d7f", 339 | "metadata": {}, 340 | "outputs": [], 341 | "source": [ 342 | "from pandas.api.extensions import register_series_accessor\n", 343 | "from pandas.api.extensions import register_dataframe_accessor\n", 344 | "\n", 345 | "@register_series_accessor(\"redflag\")\n", 346 | "class SeriesAccessor:\n", 347 | " def __init__(self, pandas_obj):\n", 348 | " self._obj = pandas_obj\n", 349 | "\n", 350 | " def imbalance_degree(self):\n", 351 | " return rf.imbalance_degree(self._obj)\n", 352 | "\n", 353 | " def minority_classes(self):\n", 354 | " return rf.minority_classes(self._obj)\n", 355 | " \n", 356 | "\n", 357 | " def dummy_scores(self, task=None, random_state=None):\n", 358 | " if task is None:\n", 359 | " task = 'regression' if rf.is_continuous(self._obj) else 'classification'\n", 360 | " if task == 'classification':\n", 361 | " return rf.dummy_classification_scores(self._obj, random_state=random_state)\n", 362 | " elif task == 'regression':\n", 363 | " return rf.dummy_regression_scores(self._obj)\n", 364 | " else:\n", 365 | " raise ValueError(\"`task` must be 'classification' or 'regression', or None to decide automatically.\")\n", 366 | " " 367 | ] 368 | }, 369 | { 370 | "cell_type": "code", 371 | "execution_count": 12, 372 | "id": "88447a57", 373 | "metadata": {}, 374 | "outputs": [ 375 | { 376 | "data": { 377 | "text/plain": [ 378 | "-1.0" 379 | ] 380 | }, 381 | "execution_count": 12, 382 | "metadata": {}, 383 | "output_type": "execute_result" 384 | } 385 | ], 386 | "source": [ 387 | "df['Lithology'].redflag.imbalance_degree()" 388 | ] 389 | }, 390 | { 391 | "cell_type": "code", 392 | "execution_count": 13, 393 | "id": "5f89c66d", 394 | "metadata": {}, 395 | "outputs": [ 396 | { 397 | "ename": "AttributeError", 398 | "evalue": "module 'redflag' has no attribute 'dummy_classification_scores'", 399 | "output_type": "error", 400 | "traceback": [ 401 | "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", 402 | "\u001b[0;31mAttributeError\u001b[0m Traceback (most recent call last)", 403 | "Cell \u001b[0;32mIn [13], line 1\u001b[0m\n\u001b[0;32m----> 1\u001b[0m \u001b[43mdf\u001b[49m\u001b[43m[\u001b[49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[38;5;124;43mLithology\u001b[39;49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[43m]\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mredflag\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mdummy_scores\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\n", 404 | "Cell \u001b[0;32mIn [11], line 20\u001b[0m, in 
\u001b[0;36mSeriesAccessor.dummy_scores\u001b[0;34m(self, task, random_state)\u001b[0m\n\u001b[1;32m 18\u001b[0m task \u001b[38;5;241m=\u001b[39m \u001b[38;5;124m'\u001b[39m\u001b[38;5;124mregression\u001b[39m\u001b[38;5;124m'\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m rf\u001b[38;5;241m.\u001b[39mis_continuous(\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_obj) \u001b[38;5;28;01melse\u001b[39;00m \u001b[38;5;124m'\u001b[39m\u001b[38;5;124mclassification\u001b[39m\u001b[38;5;124m'\u001b[39m\n\u001b[1;32m 19\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m task \u001b[38;5;241m==\u001b[39m \u001b[38;5;124m'\u001b[39m\u001b[38;5;124mclassification\u001b[39m\u001b[38;5;124m'\u001b[39m:\n\u001b[0;32m---> 20\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mrf\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mdummy_classification_scores\u001b[49m(\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_obj, random_state\u001b[38;5;241m=\u001b[39mrandom_state)\n\u001b[1;32m 21\u001b[0m \u001b[38;5;28;01melif\u001b[39;00m task \u001b[38;5;241m==\u001b[39m \u001b[38;5;124m'\u001b[39m\u001b[38;5;124mregression\u001b[39m\u001b[38;5;124m'\u001b[39m:\n\u001b[1;32m 22\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m rf\u001b[38;5;241m.\u001b[39mdummy_regression_scores(\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_obj)\n", 405 | "\u001b[0;31mAttributeError\u001b[0m: module 'redflag' has no attribute 'dummy_classification_scores'" 406 | ] 407 | } 408 | ], 409 | "source": [ 410 | "df['Lithology'].redflag.dummy_scores()" 411 | ] 412 | }, 413 | { 414 | "cell_type": "markdown", 415 | "id": "369cf2f6", 416 | "metadata": {}, 417 | "source": [ 418 | "## Avoid depending on pandas\n", 419 | "\n", 420 | "We want to avoid importing Pandas if a person doesn't want to use the accessors.\n", 421 | "\n", 422 | "BTW, we can't (or don't want to) avoid depending on `sklearn` so the sklearn.py module does not need to do the same." 423 | ] 424 | }, 425 | { 426 | "cell_type": "code", 427 | "execution_count": 14, 428 | "id": "e9b7c6f0", 429 | "metadata": {}, 430 | "outputs": [], 431 | "source": [ 432 | "def identity(arg):\n", 433 | " def decorator(func):\n", 434 | " return func \n", 435 | " return decorator\n", 436 | "\n", 437 | "@identity('foo')\n", 438 | "def hello(x):\n", 439 | " return f\"Hello {x}\"" 440 | ] 441 | }, 442 | { 443 | "cell_type": "code", 444 | "execution_count": 15, 445 | "id": "2dc1164b", 446 | "metadata": {}, 447 | "outputs": [ 448 | { 449 | "data": { 450 | "text/plain": [ 451 | "'Hello Matt'" 452 | ] 453 | }, 454 | "execution_count": 15, 455 | "metadata": {}, 456 | "output_type": "execute_result" 457 | } 458 | ], 459 | "source": [ 460 | "hello('Matt')" 461 | ] 462 | }, 463 | { 464 | "cell_type": "markdown", 465 | "id": "7c51e1d2", 466 | "metadata": {}, 467 | "source": [ 468 | "Test with environment `foo`, which does not have `pandas`..." 469 | ] 470 | }, 471 | { 472 | "cell_type": "code", 473 | "execution_count": 16, 474 | "id": "e6ea05f0", 475 | "metadata": {}, 476 | "outputs": [ 477 | { 478 | "data": { 479 | "text/html": [ 480 | "
\n", 481 | "\n", 494 | "\n", 495 | " \n", 496 | " \n", 497 | " \n", 498 | " \n", 499 | " \n", 500 | " \n", 501 | " \n", 502 | " \n", 503 | " \n", 504 | " \n", 505 | " \n", 506 | " \n", 507 | " \n", 508 | " \n", 509 | " \n", 510 | " \n", 511 | " \n", 512 | " \n", 513 | " \n", 514 | " \n", 515 | " \n", 516 | " \n", 517 | " \n", 518 | " \n", 519 | " \n", 520 | " \n", 521 | " \n", 522 | " \n", 523 | " \n", 524 | " \n", 525 | " \n", 526 | " \n", 527 | " \n", 528 | " \n", 529 | " \n", 530 | " \n", 531 | " \n", 532 | " \n", 533 | " \n", 534 | " \n", 535 | " \n", 536 | " \n", 537 | " \n", 538 | " \n", 539 | " \n", 540 | " \n", 541 | "
VpVsrhoLithology
03045.61595.72.109121sandstone
13000.61517.12.090342sandstone
23363.62041.52.131990sandstone
33195.31606.22.184939sandstone
44237.52448.62.472231sandstone
\n", 542 | "
" 543 | ], 544 | "text/plain": [ 545 | " Vp Vs rho Lithology\n", 546 | "0 3045.6 1595.7 2.109121 sandstone\n", 547 | "1 3000.6 1517.1 2.090342 sandstone\n", 548 | "2 3363.6 2041.5 2.131990 sandstone\n", 549 | "3 3195.3 1606.2 2.184939 sandstone\n", 550 | "4 4237.5 2448.6 2.472231 sandstone" 551 | ] 552 | }, 553 | "execution_count": 16, 554 | "metadata": {}, 555 | "output_type": "execute_result" 556 | } 557 | ], 558 | "source": [ 559 | "import pandas as pd\n", 560 | "\n", 561 | "df = pd.read_csv(\"https://geocomp.s3.amazonaws.com/data/RPC_simple.csv\")\n", 562 | "\n", 563 | "df.head()" 564 | ] 565 | }, 566 | { 567 | "cell_type": "code", 568 | "execution_count": 17, 569 | "id": "348a0d99", 570 | "metadata": {}, 571 | "outputs": [ 572 | { 573 | "data": { 574 | "text/plain": [ 575 | "1.4130434782602501" 576 | ] 577 | }, 578 | "execution_count": 17, 579 | "metadata": {}, 580 | "output_type": "execute_result" 581 | } 582 | ], 583 | "source": [ 584 | "import redflag as rf\n", 585 | "\n", 586 | "rf.imbalance_degree([1,1,2,1,1,2,2,1,1,1,1,1,2,1,3,3,3,3,2,1,1,1,1])" 587 | ] 588 | }, 589 | { 590 | "cell_type": "markdown", 591 | "id": "7aa17834", 592 | "metadata": {}, 593 | "source": [ 594 | "## Dummy models" 595 | ] 596 | }, 597 | { 598 | "cell_type": "code", 599 | "execution_count": 18, 600 | "id": "f40ed815", 601 | "metadata": {}, 602 | "outputs": [ 603 | { 604 | "data": { 605 | "text/plain": [ 606 | "0 sandstone\n", 607 | "1 sandstone\n", 608 | "2 sandstone\n", 609 | "3 sandstone\n", 610 | "4 sandstone\n", 611 | " ... \n", 612 | "395 shale\n", 613 | "396 shale\n", 614 | "397 shale\n", 615 | "398 shale\n", 616 | "399 shale\n", 617 | "Name: Lithology, Length: 400, dtype: object" 618 | ] 619 | }, 620 | "execution_count": 18, 621 | "metadata": {}, 622 | "output_type": "execute_result" 623 | } 624 | ], 625 | "source": [ 626 | "df['Lithology']" 627 | ] 628 | }, 629 | { 630 | "cell_type": "code", 631 | "execution_count": 67, 632 | "id": "ffadaf98", 633 | "metadata": {}, 634 | "outputs": [ 635 | { 636 | "data": { 637 | "text/plain": [ 638 | "{'most_frequent': {'f1': 0.33333333333333326, 'roc_auc': 0.5},\n", 639 | " 'stratified': {'f1': 0.47233840363611357, 'roc_auc': 0.4725}}" 640 | ] 641 | }, 642 | "execution_count": 67, 643 | "metadata": {}, 644 | "output_type": "execute_result" 645 | } 646 | ], 647 | "source": [ 648 | "from sklearn.dummy import DummyClassifier\n", 649 | "from sklearn.metrics import f1_score, roc_auc_score\n", 650 | "from sklearn.metrics import mean_squared_error, r2_score\n", 651 | "\n", 652 | "def dummy_classification_scores(y, random_state=None):\n", 653 | " result = {'most_frequent': {}, 'stratified': {}}\n", 654 | " y = np.asanyarray(y)\n", 655 | " X = np.ones_like(y).reshape(-1, 1)\n", 656 | " for method, scores in result.items():\n", 657 | " model = DummyClassifier(strategy=method, random_state=random_state)\n", 658 | " _ = model.fit(X, y)\n", 659 | " scores['f1'] = f1_score(y, model.predict(X), average='weighted')\n", 660 | " y_prob = model.predict_proba(X)\n", 661 | " if rf.is_binary(y):\n", 662 | " scores['roc_auc'] = roc_auc_score(y, y_prob[:, 1])\n", 663 | " else:\n", 664 | " scores['roc_auc'] = roc_auc_score(y, y_prob, multi_class='ovr') \n", 665 | " return result\n", 666 | "\n", 667 | "dummy_classification_scores(df['Lithology'], random_state=42)" 668 | ] 669 | }, 670 | { 671 | "cell_type": "code", 672 | "execution_count": 68, 673 | "id": "b4c958c6", 674 | "metadata": {}, 675 | "outputs": [ 676 | { 677 | "data": { 678 | "text/plain": [ 679 | "{'most_frequent': {'f1': 
0.3333333333333333, 'roc_auc': 0.5},\n", 680 | " 'stratified': {'f1': 0.20000000000000004, 'roc_auc': 0.35654761904761906}}" 681 | ] 682 | }, 683 | "execution_count": 68, 684 | "metadata": {}, 685 | "output_type": "execute_result" 686 | } 687 | ], 688 | "source": [ 689 | "y_ = [1, 1, 1, 1, 1, 2, 2, 2, 3, 3]\n", 690 | "dummy_classification_scores(y_, random_state=42)" 691 | ] 692 | }, 693 | { 694 | "cell_type": "code", 695 | "execution_count": 72, 696 | "id": "2add677d", 697 | "metadata": {}, 698 | "outputs": [], 699 | "source": [ 700 | "from sklearn.dummy import DummyRegressor\n", 701 | "\n", 702 | "def dummy_regression_scores(y):\n", 703 | " result = {'mean': {}}\n", 704 | " y = np.asanyarray(y)\n", 705 | " if y.ndim > 1:\n", 706 | " raise ValueError(\"Multilabel target is not supported.\")\n", 707 | " X = np.ones_like(y).reshape(-1, 1) # X is not used by the model.\n", 708 | " for method, scores in result.items():\n", 709 | " model = DummyRegressor(strategy=method)\n", 710 | " _ = model.fit(X, y)\n", 711 | " y_pred = model.predict(X)\n", 712 | " scores['mean_squared_error'] = mean_squared_error(y, y_pred)\n", 713 | " scores['r2'] = r2_score(y, y_pred)\n", 714 | " return result" 715 | ] 716 | }, 717 | { 718 | "cell_type": "code", 719 | "execution_count": 73, 720 | "id": "41775588", 721 | "metadata": {}, 722 | "outputs": [ 723 | { 724 | "data": { 725 | "text/plain": [ 726 | "{'mean': {'mean_squared_error': 8.25, 'r2': 0.0}}" 727 | ] 728 | }, 729 | "execution_count": 73, 730 | "metadata": {}, 731 | "output_type": "execute_result" 732 | } 733 | ], 734 | "source": [ 735 | "y = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]\n", 736 | "dummy_regression_scores(y)" 737 | ] 738 | }, 739 | { 740 | "cell_type": "code", 741 | "execution_count": null, 742 | "id": "f537e306", 743 | "metadata": {}, 744 | "outputs": [], 745 | "source": [] 746 | } 747 | ], 748 | "metadata": { 749 | "kernelspec": { 750 | "display_name": "redflag", 751 | "language": "python", 752 | "name": "redflag" 753 | }, 754 | "language_info": { 755 | "codemirror_mode": { 756 | "name": "ipython", 757 | "version": 3 758 | }, 759 | "file_extension": ".py", 760 | "mimetype": "text/x-python", 761 | "name": "python", 762 | "nbconvert_exporter": "python", 763 | "pygments_lexer": "ipython3", 764 | "version": "3.10.8" 765 | } 766 | }, 767 | "nbformat": 4, 768 | "nbformat_minor": 5 769 | } 770 | -------------------------------------------------------------------------------- /docs/post_process_html.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import glob 3 | import re 4 | 5 | 6 | def simplify_credits(html): 7 | """ 8 | Replace the credit part of the HTML footer. Return the new text. 9 | """ 10 | s = r"""@pradyunsg's""" 11 | pattern = re.compile(s) 12 | html = pattern.sub(r'', html) 13 | 14 | s = r'Copyright © 2024, The Redflag Authors' 15 | pattern = re.compile(s) 16 | new_s = '© 2024, The Redflag Authors | CC BY' 17 | html = pattern.sub(new_s, html) 18 | 19 | return html 20 | 21 | 22 | def add_analytics(html): 23 | """ 24 | Add snippet to head. 25 | """ 26 | s = r'' 27 | pattern = re.compile(s) 28 | new_s = '' 29 | html = pattern.sub(new_s, html) 30 | 31 | return html 32 | 33 | 34 | def main(path): 35 | """ 36 | Process the HTML files in path, save in place (side-effect). 
37 | """ 38 | fnames = glob.glob(path.strip('/') + '/*.html') 39 | for fname in fnames: 40 | with open(fname, 'r+') as f: 41 | html = f.read() 42 | 43 | new_html = simplify_credits(html) 44 | new_html = add_analytics(html) 45 | 46 | f.seek(0) 47 | f.write(new_html) 48 | f.truncate() 49 | return 50 | 51 | 52 | if __name__ == '__main__': 53 | _ = main(sys.argv[1]) 54 | -------------------------------------------------------------------------------- /docs/pre_process_ipynb.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import glob 3 | import json 4 | import pathlib 5 | import shutil 6 | 7 | 8 | def change_kernel(notebook): 9 | """ 10 | Vanillafy the kernelspec. 11 | """ 12 | new_kernelspec = { 13 | "display_name": "Python 3 (ipykernel)", 14 | "language": "python", 15 | "name": "python3", 16 | } 17 | notebook['metadata']['kernelspec'].update(new_kernelspec) 18 | return notebook 19 | 20 | 21 | def main(path): 22 | """ 23 | Process the IPYNB files in path, save in place (side-effect). 24 | """ 25 | fnames = glob.glob(path.strip('/') + '/[!_]*.ipynb') # Not files with underscore. 26 | outpath = pathlib.Path('_notebooks') 27 | if outpath.exists(): 28 | shutil.rmtree(outpath) 29 | outpath.mkdir(exist_ok=True) 30 | 31 | for fname in fnames: 32 | with open(fname, encoding='utf-8') as f: 33 | notebook = json.loads(f.read()) 34 | 35 | new_nb = change_kernel(notebook) 36 | filepart = pathlib.Path(fname).name 37 | 38 | with open(outpath / filepart, 'w') as f: 39 | _ = f.write(json.dumps(new_nb)) 40 | 41 | return 42 | 43 | 44 | if __name__ == '__main__': 45 | print(sys.argv[1]) 46 | _ = main(sys.argv[1]) 47 | -------------------------------------------------------------------------------- /docs/redflag.rst: -------------------------------------------------------------------------------- 1 | redflag package 2 | =============== 3 | 4 | Submodules 5 | ---------- 6 | 7 | .. toctree:: 8 | :maxdepth: 4 9 | 10 | redflag.distributions 11 | redflag.imbalance 12 | redflag.importance 13 | redflag.independence 14 | redflag.markov 15 | redflag.outliers 16 | redflag.pandas 17 | redflag.sklearn 18 | redflag.target 19 | redflag.utils 20 | 21 | Module contents 22 | --------------- 23 | 24 | .. automodule:: redflag 25 | :members: 26 | :undoc-members: 27 | :show-inheritance: 28 | -------------------------------------------------------------------------------- /docs/what_is_redflag.md: -------------------------------------------------------------------------------- 1 | # 🚩 What is `redflag`? 2 | 3 | ## Overview 4 | 5 | _Redflag_ is a Python library that applies "safety by design" to machine 6 | learning. It helps researchers and practitioners in this field ensure their 7 | models are safe and reliable by alerting them to potential pitfalls. These 8 | pitfalls could lead to overconfidence in the model or wildly spurious 9 | predictions. _Redflag_ offers accessible ways for users to integrate safety 10 | checks into their workflows by providing `scikit-learn` transformers, `pandas` 11 | accessors, and standalone functions. These components can easily be 12 | incorporated into existing workflows, helping identify issues and enhance the 13 | quality and safety of predictive models. 14 | 15 | 16 | ## Safety by design 17 | 18 | _Safety by design_ means to 'design out' hazardous situations from complex 19 | machines or processes before they can do harm. 
The concept, also known as 20 | _prevention through design_, has been applied to civil engineering and 21 | industrial design for decades. Recently it has also been applied to software 22 | engineering and, more recently still, to machine learning 23 | [@van-gelder-etal-2021]. _Redflag_ helps machine learning researchers and 24 | practitioners design safety into their workflows. 25 | 26 | To read more about the motivation for this package, check out 27 | [the draft paper](https://github.com/scienxlab/redflag/blob/paper/paper/paper.md) 28 | submitted to [JOSS](https://joss.theoj.org). 29 | 30 | 31 | ## What's in `redflag` 32 | 33 | _Redflag_ offers three ways for users to insert safety checks into their 34 | machine learning workflows: 35 | 36 | 1. **`scikit-learn` transformers** which fit directly into the pipelines that 37 | most data scientists are already using, e.g. 38 | `redflag.ImbalanceDetector().fit_transform(X, y)`. 39 | 2. **`pandas` accessors** on Series and DataFrames, which can be called like a 40 | method on existing Pandas objects, e.g. `df['target'].redflag.is_imbalanced()`. 41 | 3. **Standalone functions** which the user can compose their own checks and 42 | tests with, e.g. `redflag.is_imbalanced(y)`. 43 | 44 | There are two kinds of `scikit-learn` transformer: 45 | 46 | - **Detectors** check every dataset they encounter. For example, 47 | `redflag.ClippingDetector` checks for clipped data during both model fitting 48 | and during prediction. 49 | - **Comparators** learn some parameter in the model fitting step, then check 50 | subsequent data against those parameters. For example, 51 | `redflag.DistributionComparator` learns the empirical univariate distributions 52 | of the training features, then checks that the features in subsequent datasets 53 | are tolerably close to these baselines. 54 | 55 | Although the `scikit-learn` components are implemented as transformers, 56 | subclassing `sklearn.base.BaseEstimator`, `sklearn.base.TransformerMixin`, they 57 | do not transform the data. They only raise warnings (or, optionally, 58 | exceptions) when a check fails. _Redflag_ does not attempt to fix any problems 59 | it encounters. 60 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [build-system] 2 | requires = ["setuptools>=68", "setuptools-scm>=8"] 3 | build-backend = "setuptools.build_meta" 4 | 5 | [project] 6 | name = "redflag" 7 | dynamic = ["version"] 8 | requires-python = ">=3.8" 9 | authors = [{ name="Matt Hall", email="kwinkunks@gmail.com" },] 10 | description = "Safety net for machine learning pipelines." 11 | license = {file = "LICENSE"} 12 | readme = "README.md" 13 | keywords = ["machine learning", "data science", "quality assurance"] 14 | classifiers = [ 15 | "Intended Audience :: Science/Research", 16 | "Topic :: Scientific/Engineering :: Information Analysis", 17 | "Development Status :: 4 - Beta", 18 | "Natural Language :: English", 19 | "Programming Language :: Python :: 3.8", 20 | "Programming Language :: Python :: 3.9", 21 | "Programming Language :: Python :: 3.10", 22 | "Programming Language :: Python :: 3.11", 23 | "Programming Language :: Python :: 3.12", 24 | "License :: OSI Approved :: Apache Software License", 25 | "Operating System :: OS Independent", 26 | ] 27 | 28 | dependencies = [ 29 | "numpy<2.0", # NumPy 2 will likely break some things. 30 | "scipy!=1.10.0", # Bug in stats.powerlaw. 
31 | "scikit-learn", 32 | ] 33 | 34 | [project.optional-dependencies] 35 | dev = [ 36 | "setuptools", 37 | "build", 38 | "mypy", 39 | "types-setuptools", 40 | "pytest", 41 | "coverage[toml]", 42 | "pytest-cov", 43 | "sphinx", 44 | "sphinxcontrib-apidoc", 45 | "sphinx_copybutton", 46 | "furo", 47 | "myst_nb", 48 | "jupyter", 49 | "pandas", 50 | "seaborn", 51 | ] 52 | 53 | [project.urls] 54 | "documentation" = "https://scienxlab.org/redflag" 55 | "repository" = "https://github.com/scienxlab/redflag" 56 | 57 | [tool.setuptools_scm] 58 | # Empty section, required for dynamic versioning. 59 | 60 | [tool.setuptools.packages.find] 61 | where = ["src"] 62 | 63 | [tool.mypy] 64 | warn_return_any = true 65 | ignore_missing_imports = true 66 | exclude = ['docs', 'tests'] 67 | 68 | [tool.pytest.ini_options] 69 | # pytest configuration: http://doc.pytest.org/en/latest/customize.html 70 | addopts = "--doctest-modules --doctest-continue-on-failure --ignore=docs --cov=redflag" 71 | filterwarnings = [ 72 | "ignore:pkg_resources is deprecated as an API:DeprecationWarning", 73 | "ignore:Deprecated call to `pkg_resources.declare_namespace:DeprecationWarning", 74 | ] 75 | 76 | [tool.coverage.run] 77 | # coverage configuration: https://coverage.readthedocs.io/ 78 | omit = [ 79 | "src/redflag/__init__.py", 80 | ] 81 | -------------------------------------------------------------------------------- /src/redflag/__init__.py: -------------------------------------------------------------------------------- 1 | from .utils import * 2 | from .sklearn import * 3 | from .pandas import * 4 | from .markov import Markov_chain 5 | 6 | # Targets 7 | from .target import * 8 | from .imbalance import * 9 | 10 | # Features 11 | from .distributions import * 12 | from .independence import * 13 | from .importance import * 14 | from .outliers import * 15 | 16 | # It used to be conventional to define a __version__ attribute. 17 | # However, it is now considered best practice to get version 18 | # information from the package metadata directly, eg by using 19 | # importlib.metadata.version (see below). 20 | # 21 | # This will be deprecated in v0.6.0 but for now we do this: 22 | # 23 | from importlib.metadata import version 24 | __version__ = version(__package__ or __name__) 25 | -------------------------------------------------------------------------------- /src/redflag/distributions.py: -------------------------------------------------------------------------------- 1 | """ 2 | Functions related to understanding distributions. 3 | 4 | Author: Matt Hall, scienxlab.org 5 | Licence: Apache 2.0 6 | 7 | Copyright 2024 Redflag contributors 8 | 9 | Licensed under the Apache License, Version 2.0 (the "License"); 10 | you may not use this file except in compliance with the License. 11 | You may obtain a copy of the License at 12 | 13 | http://www.apache.org/licenses/LICENSE-2.0 14 | 15 | Unless required by applicable law or agreed to in writing, software 16 | distributed under the License is distributed on an "AS IS" BASIS, 17 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 18 | See the License for the specific language governing permissions and 19 | limitations under the License. 
20 | """ 21 | from __future__ import annotations 22 | 23 | from typing import Optional, NamedTuple, Callable, Union 24 | from collections import namedtuple 25 | from itertools import combinations 26 | import warnings 27 | 28 | import numpy as np 29 | from numpy.typing import ArrayLike 30 | import scipy.stats as ss 31 | from scipy.stats import wasserstein_distance 32 | from scipy.spatial.distance import squareform 33 | from scipy.signal import find_peaks 34 | from sklearn.neighbors import KernelDensity 35 | from sklearn.model_selection import GridSearchCV 36 | 37 | from .utils import is_standard_normal 38 | from .utils import iter_groups 39 | 40 | 41 | DISTS = ['norm', 'cosine', 'expon', 'exponpow', 'gamma', 'gumbel_l', 'gumbel_r', 42 | 'powerlaw', 'triang', 'trapz', 'uniform', 43 | ] 44 | 45 | def best_distribution(a: ArrayLike, bins: Optional[int]=None) -> NamedTuple: 46 | """ 47 | Model data by finding best fit distribution to data. 48 | 49 | By default, the following distributions are tried: normal, cosine, 50 | exponential, exponential power, gamma, left-skewed Gumbel, right-skewed 51 | Gumbel, power law, triangular, trapezoidal, and uniform. 52 | 53 | The best fit is determined by the sum of squared errors (SSE) between the 54 | histogram and the probability density function (PDF) of the distribution. 55 | 56 | Returns the best fit distribution and its parameters in a named tuple. 57 | 58 | Args: 59 | a (array): The data. 60 | bins (int): The number of bins to use for the histogram. 61 | 62 | Returns: 63 | tuple: The best fit distribution and its parameters. 64 | 65 | Examples: 66 | >>> a = [0, 1, 2, 2, 3, 3, 3, 3, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 7, 8] 67 | >>> best_distribution(a) 68 | Distribution(name='norm', shape=[], loc=4.0, scale=1.8771812708978117) 69 | >>> best_distribution([1, 2, 2, 3, 3, 3, 4, 4, 4, 4, 5, 5, 5, 6, 6, 7]) 70 | Distribution(name='triang', shape=[0.5001419889107208], loc=0.3286356643172673, scale=7.3406453953773365) 71 | """ 72 | if bins is None: 73 | bins = min(max(20, len(a) // 100), 200) 74 | n, x = np.histogram(a, bins=bins, density=True) 75 | x = (x[1:] + x[:-1]) / 2 76 | 77 | dists = [getattr(ss, d) for d in DISTS] 78 | 79 | best_dist = None 80 | best_params = None 81 | best_sse = np.inf 82 | 83 | for dist in dists: 84 | *shape, μ, σ = dist.fit(a) 85 | n_pred = dist.pdf(x, loc=μ, scale=σ, *shape) 86 | sse = np.sum((n - n_pred)**2) 87 | if 0 < sse < best_sse: 88 | best_dist = dist 89 | best_params = shape + [μ] + [σ] 90 | best_sse = sse 91 | 92 | *shape, μ, σ = best_params 93 | Distribution = namedtuple('Distribution', ['name', 'shape', 'loc', 'scale']) 94 | return Distribution(best_dist.name, shape, μ, σ) 95 | 96 | 97 | def wasserstein_ovr(a: ArrayLike, groups: ArrayLike=None, standardize: bool=False) -> np.ndarray: 98 | """ 99 | First Wasserstein distance between each group in `a` vs the rest of `a` 100 | ('one vs rest' or OVR). The groups are provided by `groups`, which must be 101 | a 1D array of group labels, the same length as `a`. 102 | 103 | The Wasserstein distance is a measure of the distance between two 104 | probability distributions. It is also known as the earth mover's distance. 105 | This function uses the implementation in `scipy.stats.wasserstein_distance`. 106 | 107 | The results are in `np.unique(a)` order. 108 | 109 | Data should be standardized for results you can compare across different 110 | measurements. The function does not apply standardization by default. 111 | 112 | Returns K scores for K groups. 
113 | 114 | Args: 115 | a (array): The data. 116 | groups (array): The group labels. 117 | standardize (bool): Whether to standardize the data. Default False. 118 | 119 | Returns: 120 | array: The Wasserstein distance scores in `np.unique(a)` order. 121 | 122 | Examples: 123 | >>> data = [1, 1, 1, 2, 2, 1, 1, 2, 2, 3, 2, 2, 2, 3, 3] 124 | >>> groups = [0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2] 125 | >>> wasserstein_ovr(data, groups=groups, standardize=True) 126 | array([0.97490053, 0.1392715 , 1.11417203]) 127 | """ 128 | if standardize: 129 | a = (a - np.nanmean(a)) / np.nanstd(a) 130 | dists = [] 131 | for group in iter_groups(groups): 132 | dist = wasserstein_distance(a[group], a[~group]) 133 | dists.append(dist) 134 | return np.array(dists) 135 | 136 | 137 | def wasserstein_ovo(a: ArrayLike, groups: ArrayLike=None, standardize: bool=False) -> np.ndarray: 138 | """ 139 | First Wasserstein distance between each group in `a` vs each other group 140 | ('one vs one' or OVO). The groups are provided by `groups`, which must be 141 | a 1D array of group labels, the same length as `a`. 142 | 143 | The Wasserstein distance is a measure of the distance between two 144 | probability distributions. It is also known as the earth mover's distance. 145 | This function uses the implementation in `scipy.stats.wasserstein_distance`. 146 | 147 | The results are in the order given by `combinations(np.unique(groups), 148 | r=2)`, which matches the order of `scipy.spatial.distance` metrics. 149 | 150 | Data should be standardized for results you can compare across different 151 | measurements. The function does not apply standardization by default. 152 | 153 | Returns K(K-1)/2 scores for K groups. 154 | 155 | Args: 156 | a (array): The data. 157 | groups (array): The group labels. 158 | standardize (bool): Whether to standardize the data. Defaults to False. 159 | 160 | Returns: 161 | array: The Wasserstein distance scores. Note that the order is the 162 | same as you would get from `scipy.spatial.distance` metrics. You 163 | can pass the result to `scipy.spatial.distance.squareform` to 164 | get a square matrix. 165 | 166 | Examples: 167 | >>> data = [1, 1, 1, 2, 2, 1, 1, 2, 2, 3, 2, 2, 2, 3, 3] 168 | >>> groups = [0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2] 169 | >>> wasserstein_ovo(data, groups=groups, standardize=True) 170 | array([0.55708601, 1.39271504, 0.83562902]) 171 | >>> squareform(wasserstein_ovo(data, groups=groups, standardize=True)) 172 | array([[0. , 0.55708601, 1.39271504], 173 | [0.55708601, 0. , 0.83562902], 174 | [1.39271504, 0.83562902, 0. ]]) 175 | """ 176 | if standardize: 177 | a = (a - np.nanmean(a)) / np.nanstd(a) 178 | dists = [] 179 | for (group_1, group_2) in combinations(np.unique(groups), r=2): 180 | dist = wasserstein_distance(a[groups==group_1], a[groups==group_2]) 181 | dists.append(dist) 182 | return np.array(dists) 183 | 184 | 185 | def wasserstein(X: ArrayLike, 186 | groups: ArrayLike=None, 187 | method: str='ovr', 188 | standardize: bool=False, 189 | reducer: Callable=None) -> np.ndarray: 190 | """ 191 | Step over all features and apply the distance function to the groups. 192 | 193 | Method can be 'ovr', 'ovo', or a function. 194 | 195 | The function `reducer` is applied to the ovo result to reduce it to one 196 | value per group per feature. If you want the full array of each group 197 | against each other, either pass the identity function (`lambda x: x`, 198 | which adds an axis) or use `wasserstein_ovo()` directly, one feature at 199 | a time. 
Default function: `np.mean`. 200 | 201 | The Wasserstein distance is a measure of the distance between two 202 | probability distributions. It is also known as the earth mover's distance. 203 | This function uses the implementation in `scipy.stats.wasserstein_distance`. 204 | 205 | Args: 206 | X (array): The data. Must be a 2D array, or a sequence of 2D arrays. 207 | If the latter, then the groups are implicitly assumed to be the 208 | datasets in the sequence and the `groups` argument is ignored. 209 | groups (array): The group labels. 210 | method (str or func): The method to use. Can be 'ovr', 'ovo', or a 211 | function. 212 | standardize (bool): Whether to standardize the data. Default False. 213 | reducer (func): The function to reduce the ovo result to one value 214 | per group. Default: `np.mean`. 215 | 216 | Returns: 217 | array: The 2D array of Wasserstein distance scores. 218 | 219 | Examples: 220 | >>> data = np.array([1, 1, 1, 2, 2, 1, 1, 2, 2, 3, 2, 2, 2, 3, 3]) 221 | >>> groups = [0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2] 222 | >>> wasserstein(data.reshape(-1, 1), groups=groups, standardize=True) 223 | array([[0.97490053], 224 | [0.1392715 ], 225 | [1.11417203]]) 226 | >>> wasserstein(data.reshape(-1, 1), groups=groups, method='ovo', standardize=True) 227 | array([[0.97490053], 228 | [0.69635752], 229 | [1.11417203]]) 230 | >>> data = [[[1], [1.22475], [-1.22475], [0], [1], [-1], [-1]], [[1], [0], [1]], [[1], [0], [-1]]] 231 | >>> wasserstein(data, standardize=False) 232 | array([[0.39754762], 233 | [0.71161667], 234 | [0.24495 ]]) 235 | """ 236 | # If the data is a sequence of arrays, then assume the groups are the 237 | # datasets in the sequence and the `groups` argument is ignored. 238 | try: 239 | first = X[0] 240 | except KeyError: 241 | # Probably a DataFrame. 242 | first = np.asarray(X)[0] 243 | 244 | stacked = False 245 | first = np.asarray(first) 246 | try: 247 | if first.ndim == 2: 248 | stacked = True 249 | except AttributeError: 250 | # It's probably a 1D array or list. 251 | pass 252 | 253 | if stacked: 254 | # Not sure this test makes sense any more. 255 | # if not is_standard_normal(first.flat): 256 | # warnings.warn('First group does not appear to be standardized.', stacklevel=2) 257 | groups = np.hstack([len(dataset)*[i] for i, dataset in enumerate(X)]) 258 | X = np.vstack(X) 259 | 260 | # Now we can treat X as a 2D array. 261 | X = np.asarray(X) 262 | if X.ndim != 2: 263 | raise ValueError("X must be a 2D array-like.") 264 | 265 | if groups is None: 266 | raise ValueError("Must provide a 1D array of group labels if X is a 2D array.") 267 | n_groups = np.unique(groups).size 268 | 269 | if n_groups < 2: 270 | raise ValueError("Must have 2 or more groups.") 271 | 272 | methods = { 273 | 'ovr': wasserstein_ovr, 274 | 'ovo': wasserstein_ovo, 275 | } 276 | func = methods.get(method, method) 277 | 278 | if reducer is None: 279 | reducer = np.mean 280 | 281 | dist_arrs = [] 282 | for feature in X.T: 283 | dists = func(feature, groups=groups, standardize=standardize) 284 | if method == 'ovo': 285 | dists = squareform(dists) 286 | dists = dists[~np.eye(n_groups, dtype=bool)].reshape(n_groups, -1) 287 | dists = [reducer(d) for d in dists] 288 | dist_arrs.append(dists) 289 | 290 | return np.swapaxes(dist_arrs, 0, 1) 291 | 292 | 293 | def bw_silverman(a: ArrayLike) -> float: 294 | """ 295 | Calculate the Silverman bandwidth, a popular rule of thumb for kernel 296 | density estimation bandwidth. 
297 | 298 | Silverman, BW (1981), "Using kernel density estimates to investigate 299 | multimodality", Journal of the Royal Statistical Society. Series B Vol. 43, 300 | No. 1 (1981), pp. 97-99. 301 | 302 | Args: 303 | a (array): The data. 304 | 305 | Returns: 306 | float: The Silverman bandwidth. 307 | 308 | Examples: 309 | >>> data = [1, 1, 1, 2, 2, 1, 1, 2, 2, 3, 2, 2, 2, 3, 3] 310 | >>> abs(bw_silverman(data) - 0.581810759152688) < 1e-9 311 | True 312 | """ 313 | n, d = np.array(a).size, 1 314 | return np.power(n, -1 / (d + 4)) 315 | 316 | 317 | def bw_scott(a: ArrayLike) -> float: 318 | """ 319 | Calculate the Scott bandwidth, a popular rule of thumb for kernel 320 | density estimation bandwidth. 321 | 322 | Args: 323 | a (array): The data. 324 | 325 | Returns: 326 | float: The Scott bandwidth. 327 | 328 | Examples: 329 | >>> data = [1, 1, 1, 2, 2, 1, 1, 2, 2, 3, 2, 2, 2, 3, 3] 330 | >>> abs(bw_scott(data) - 0.6162678270732356) < 1e-9 331 | True 332 | """ 333 | n, d = np.array(a).size, 1 334 | return np.power(n * (d + 2) / 4, -1 / (d + 4)) 335 | 336 | 337 | def cv_kde(a: ArrayLike, n_bandwidths: int=20, cv: int=10) -> float: 338 | """ 339 | Run a cross validation grid search to identify the optimal bandwidth for 340 | the kernel density estimation. 341 | 342 | Searches between half the minimum of the Silverman and Scott bandwidths, 343 | and twice the maximum. Checks `n_bandwidths` bandwidths, default 20. 344 | 345 | Args: 346 | a (array): The data. 347 | n_bandwidths (int): The number of bandwidths to try. Default 20. 348 | cv (int): The number of cross validation folds. Default 10. 349 | 350 | Returns: 351 | float. The optimal bandwidth. 352 | 353 | Example: 354 | >>> rng = np.random.default_rng(42) 355 | >>> data = rng.normal(size=100) 356 | >>> cv_kde(data, n_bandwidths=3, cv=3) 357 | 0.5212113989811242 358 | >>> cv_kde(rng.normal(size=(10, 10))) 359 | Traceback (most recent call last): 360 | ... 361 | ValueError: Data must be 1D. 362 | """ 363 | a = np.asarray(a) 364 | if a.ndim >= 2: 365 | raise ValueError("Data must be 1D.") 366 | if not is_standard_normal(a): 367 | warnings.warn('Data does not appear to be standardized, the KDE may be a poor fit.', stacklevel=2) 368 | a = a.reshape(-1, 1) 369 | 370 | silverman = bw_silverman(a) 371 | scott = bw_scott(a) 372 | start = min(silverman, scott)/2 373 | stop = max(silverman, scott)*2 374 | params = {'bandwidth': np.linspace(start, stop, n_bandwidths)} 375 | model = GridSearchCV(KernelDensity(), params, cv=cv) 376 | model.fit(a) 377 | return model.best_params_['bandwidth'] 378 | 379 | 380 | def fit_kde(a: ArrayLike, bandwidth: float=1.0, kernel: str='gaussian') -> tuple[np.ndarray, np.ndarray]: 381 | """ 382 | Fit a kernel density estimation to the data. 383 | 384 | Args: 385 | a (array): The data. 386 | bandwidth (float): The bandwidth. Default 1.0. 387 | kernel (str): The kernel. Default 'gaussian'. 388 | 389 | Returns: 390 | tuple: (x, kde). 391 | 392 | Example: 393 | >>> rng = np.random.default_rng(42) 394 | >>> data = rng.normal(size=100) 395 | >>> x, kde = fit_kde(data) 396 | >>> x[0] + 3.2124714013056916 < 1e-9 397 | True 398 | >>> kde[0] - 0.014367259502733645 < 1e-9 399 | True 400 | >>> len(kde) 401 | 200 402 | >>> fit_kde(rng.normal(size=(10, 10))) 403 | Traceback (most recent call last): 404 | ... 405 | ValueError: Data must be 1D. 
406 | """ 407 | a = np.squeeze(a) 408 | if a.ndim >= 2: 409 | raise ValueError("Data must be 1D.") 410 | if not is_standard_normal(a): 411 | warnings.warn('Data does not appear to be standardized, the KDE may be a poor fit.', stacklevel=2) 412 | a = a.reshape(-1, 1) 413 | model = KernelDensity(kernel=kernel, bandwidth=bandwidth) 414 | model.fit(a) 415 | mima = 1.5 * bandwidth * np.abs(a).max() 416 | x = np.linspace(-mima, mima, 200).reshape(-1, 1) 417 | log_density = model.score_samples(x) 418 | 419 | return np.squeeze(x), np.exp(log_density) 420 | 421 | 422 | def get_kde(a: ArrayLike, method: str='scott') -> tuple[np.ndarray, np.ndarray]: 423 | """ 424 | Get a kernel density estimation for the data. By default, the bandwidth is 425 | estimated using the Scott rule of thumb. Other options are the Silverman 426 | rule of thumb, or cross validation (using the `cv_kde()` function). 427 | 428 | This function is a wrapper for `fit_kde()`, with convenient options for 429 | bandwidth estimation. 430 | 431 | Args: 432 | a (array): The data. 433 | method (str): The rule of thumb for bandwidth estimation. Must be one 434 | of 'silverman', 'scott', or 'cv'. Default 'scott'. 435 | 436 | Returns: 437 | tuple: (x, kde). 438 | 439 | Examples: 440 | >>> rng = np.random.default_rng(42) 441 | >>> data = rng.normal(size=100) 442 | >>> x, kde = get_kde(data) 443 | >>> x[0] + 1.354649738246933 < 1e-9 444 | True 445 | >>> kde[0] - 0.162332012191087 < 1e-9 446 | True 447 | >>> len(kde) 448 | 200 449 | """ 450 | methods = {'silverman': bw_silverman, 'scott': bw_scott, 'cv': cv_kde} 451 | bw = methods.get(method)(a) 452 | return fit_kde(a, bandwidth=bw) 453 | 454 | 455 | def find_large_peaks(x: ArrayLike, y: ArrayLike, threshold: float=0.1) -> tuple[np.ndarray, np.ndarray]: 456 | """ 457 | Find the peaks in the array. Returns the values of x and y at the largest 458 | peaks, using threshold × max(peak amplitudes) as the cut-off. That is, 459 | peaks smaller than that are not returned. 460 | 461 | Uses `scipy.signal.find_peaks()`, with convenient options for thresholding, 462 | and returns the x and y values of the peaks in a named tuple. 463 | 464 | Args: 465 | x (array): The x values. 466 | y (array): The y values. 467 | threshold (float): The threshold for peak amplitude. Default 0.1. 468 | 469 | Returns: 470 | tuple: (x_peaks, y_peaks). Arrays representing the x and y values of 471 | the peaks. 472 | 473 | Examples: 474 | >>> x = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12] 475 | >>> y = [1, 2, 3, 2, 1, 2, 15, 40, 19, 2, 1, 1] 476 | >>> x_peaks, y_peaks = find_large_peaks(x, y) 477 | >>> x_peaks 478 | array([8.]) 479 | >>> y_peaks 480 | array([40.]) 481 | """ 482 | x, y = np.asarray(x), np.asarray(y) 483 | pos, hts = find_peaks(y, height=y) 484 | hts = hts['peak_heights'] 485 | if any(hts): 486 | z, h = np.array([(x[p].item(), h) for p, h in zip(pos, hts) if h > threshold * hts.max()]).T 487 | else: 488 | z, h = np.array([]), np.array([]) 489 | Peaks = namedtuple('Peaks', ['positions', 'heights']) 490 | return Peaks(z, h) 491 | 492 | 493 | def kde_peaks(a: ArrayLike, method: str='scott', threshold: float=0.1) -> tuple[np.ndarray, np.ndarray]: 494 | """ 495 | Find the peaks in the kernel density estimation. This might help you 496 | identify the modes in the data. 497 | 498 | Wraps `get_kde()` and `find_large_peaks()` to find the peaks in the 499 | kernel density estimation. By default, the bandwidth is estimated using 500 | the Scott rule of thumb. 
Other options are the Silverman rule of thumb, or 501 | cross validation (using the `cv_kde()` function). 502 | 503 | Args: 504 | a (array): The data. 505 | method (str): The rule of thumb for bandwidth estimation. Must be one 506 | of 'silverman', 'scott', or 'cv'. Default 'scott'. 507 | threshold (float): The threshold for peak amplitude. Default 0.1. 508 | 509 | Returns: 510 | tuple: (x_peaks, y_peaks). Arrays representing the x and y values of 511 | the peaks. 512 | 513 | Examples: 514 | >>> rng = np.random.default_rng(42) 515 | >>> data = np.concatenate([rng.normal(size=100)-2, rng.normal(size=100)+2]) 516 | >>> x_peaks, y_peaks = kde_peaks(data) 517 | >>> x_peaks 518 | array([-1.67243035, 1.88998226]) 519 | >>> y_peaks 520 | array([0.22014721, 0.19729456]) 521 | """ 522 | return find_large_peaks(*get_kde(a, method), threshold=threshold) 523 | 524 | 525 | def is_multimodal(a: ArrayLike, 526 | groups:Optional[ArrayLike]=None, 527 | method: str='scott', 528 | threshold: float=0.1) -> Union[bool, np.ndarray]: 529 | """ 530 | Test if the data is multimodal by looking for peaks in the kernel density 531 | estimation. If there is more than one peak, the data are considered 532 | multimodal. 533 | 534 | If groups are passed, the data are partitioned by group and tested 535 | separately. The result is an array of booleans, one per group. 536 | 537 | Wraps `kde_peaks()` to find the peaks in the kernel density estimation. 538 | 539 | Args: 540 | a (array): The data. 541 | groups (array): Group labels, if the data is to be partitioned before 542 | testing. 543 | method (str): The rule of thumb for bandwidth estimation. Must be one 544 | of 'silverman', 'scott', or 'cv'. Default 'scott'. 545 | threshold (float): The threshold for peak amplitude. Default 0.1. 546 | 547 | Returns: 548 | bool or np.ndarray: True if the data appear to be multimodal. If groups 549 | were passed, an array with one result per group is returned. 550 | 551 | Examples: 552 | >>> rng = np.random.default_rng(42) 553 | >>> a = rng.normal(size=200) 554 | >>> is_multimodal(a) 555 | False 556 | >>> b = np.concatenate([rng.normal(size=100)-2, rng.normal(size=100)+2]) 557 | >>> is_multimodal(b) 558 | True 559 | >>> c = np.concatenate([a, b]) 560 | >>> is_multimodal(c, groups=[0]*200 + [1]*200) 561 | array([False, True]) 562 | """ 563 | a = np.asarray(a) 564 | result = [] 565 | with warnings.catch_warnings(record=True) as w: 566 | for group in iter_groups(groups): 567 | x, y = kde_peaks(a[group], method=method, threshold=threshold) 568 | result.append(len(x) > 1) 569 | if w: 570 | warnings.warn('ℹ️ Multimodality detection may not have been possible for all groups.', stacklevel=2) 571 | return result[0] if len(result) == 1 else np.array(result) 572 | -------------------------------------------------------------------------------- /src/redflag/imbalance.py: -------------------------------------------------------------------------------- 1 | """ 2 | Imbalance metrics. 3 | 4 | This work is derived from the following reference work: 5 | Jonathan Ortigosa-Hernandez, Inaki Inza, and Jose A. Lozano 6 | Measuring the Class-imbalance Extent of Multi-class Problems 7 | Pattern Recognition Letters 98 (2017) 8 | https://doi.org/10.1016/j.patrec.2017.08.002 9 | 10 | Author: Matt Hall, scienxlab.org 11 | Licence: Apache 2.0 12 | 13 | Copyright 2024 Redflag contributors 14 | 15 | Licensed under the Apache License, Version 2.0 (the "License"); 16 | you may not use this file except in compliance with the License. 
17 | You may obtain a copy of the License at 18 | 19 | http://www.apache.org/licenses/LICENSE-2.0 20 | 21 | Unless required by applicable law or agreed to in writing, software 22 | distributed under the License is distributed on an "AS IS" BASIS, 23 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 24 | See the License for the specific language governing permissions and 25 | limitations under the License. 26 | """ 27 | from __future__ import annotations 28 | 29 | from typing import Optional, Callable, Union 30 | from collections import Counter 31 | import warnings 32 | 33 | import numpy as np 34 | from numpy.typing import ArrayLike 35 | 36 | from .target import * 37 | from .utils import * 38 | 39 | 40 | def class_counts(a: ArrayLike, classes: Optional[ArrayLike]=None) -> dict: 41 | """ 42 | Make a Counter of the class labels in `classes`, or in `a` if `classes` 43 | is None. 44 | 45 | Args: 46 | a (array): A list of class labels. 47 | classes (array): A list of classes, in the event that `a` does not 48 | contain all of the classes, or if you want to ignore some classes 49 | in `a` (not recommended) you can omit them from this list. 50 | 51 | Returns: 52 | dict. The counts, in the order in which classes are encountered in 53 | `classes` (if `classes is not `None`) or `a`. 54 | 55 | Example: 56 | >>> class_counts([1, 3, 2, 2, 3, 3]) 57 | {1: 1, 3: 3, 2: 2} 58 | """ 59 | counts = Counter(a) 60 | 61 | if classes is None: 62 | classes = counts.keys() 63 | 64 | if len(counts) < len(classes): 65 | message = 'Some classes in the data are not in the list of classes.' 66 | warnings.warn(message, stacklevel=2) 67 | 68 | return {k: counts[k] for k in classes} 69 | 70 | 71 | def empirical_distribution(a: ArrayLike, classes: Optional[ArrayLike]=None) -> tuple[np.ndarray, np.ndarray]: 72 | """ 73 | Compute zeta and e. Equation 5 in Ortigosa-Hernandez et al. (2017). 74 | 75 | Args: 76 | a (array): A list of class labels. 77 | classes (array): A list of classes, in the event that `a` does not 78 | contain all of the classes, or if you want to ignore some classes 79 | in `a` (not recommended) you can omit them from this list. 80 | 81 | Returns: 82 | tuple: (zeta, e). Both arrays are length K, where K is the number of 83 | classes discovered in `a` (if `classes` is None) or named in 84 | `classes` otherwise. 85 | """ 86 | c = class_counts(a, classes=classes) 87 | ζ = np.array([v / sum(c.values()) for v in c.values()]) 88 | e = np.array([1 / len(c) for _ in c.values()]) 89 | return ζ, e 90 | 91 | 92 | def imbalance_ratio(a: ArrayLike, classes: Optional[ArrayLike]=None) -> float: 93 | """ 94 | Compute the IR. Equation 6 in Ortigosa-Hernandez et al. (2017). 95 | 96 | This measure is useful for binary problems, but not for multiclass problems. 97 | 98 | Args: 99 | a (array): A list of class labels. 100 | classes (array): A list of classes, in the event that `a` does not 101 | contain all of the classes, or if you want to ignore some classes 102 | in `a` (not recommended) you can omit them from this list. 103 | 104 | Returns: 105 | float: The imbalance ratio. 106 | """ 107 | ζ, _ = empirical_distribution(a, classes=classes) 108 | epsilon = 1e-12 109 | return max(ζ) / (min(ζ) + epsilon) 110 | 111 | 112 | def major_minor(a: ArrayLike, classes: Optional[ArrayLike]=None) -> tuple[int, int]: 113 | """ 114 | Returns the number of majority and minority classes. 115 | 116 | Args: 117 | a (array): A list of class labels. 
118 | classes (array): A list of classes, in the event that `a` does not 119 | contain all of the classes, or if you want to ignore some classes 120 | in `a` (not recommended) you can omit them from this list. 121 | 122 | Returns: 123 | tuple: (maj, min), the number of majority and minority classes. 124 | 125 | Example: 126 | >>> major_minor([1, 1, 2, 2, 3, 3, 3]) 127 | (1, 2) 128 | """ 129 | ζ, e = empirical_distribution(a, classes=classes) 130 | return sum(ζ >= e), sum(ζ < e) 131 | 132 | 133 | def divergence(method: str='hellinger') -> Callable: 134 | """ 135 | Provides a function for computing the divergence between two discrete 136 | probability distributions. Used by `imbalance_degree()`. 137 | 138 | `method` can be a string from: 139 | - `hellinger`: Recommended by Ortigosa-Hernandez et al. (2017). 140 | - `euclidean`: Not recommended. 141 | - `manhattan`: Recommended. 142 | - `kl`: Not recommended. 143 | - `tv`: Recommended. 144 | 145 | If `method` is a function, this function just hands it back. 146 | 147 | Args: 148 | ζ (array): The actual distribution. 149 | e (array): The expected distribution. 150 | method (str): The method to use. 151 | 152 | Returns: 153 | function: A divergence function. 154 | 155 | Reference: 156 | Ortigosa-Hernandez et al. (2017) 157 | """ 158 | functions = { 159 | 'hellinger': lambda x, y: np.sqrt(np.sum((np.sqrt(x) - np.sqrt(y))**2)) / np.sqrt(2), 160 | 'euclidean': lambda x, y: np.sqrt(np.sum((x - y)**2)), 161 | 'manhattan': lambda x, y: np.sum(np.abs(x - y)), 162 | 'kl': lambda x, y: np.sum(x * np.log((x + 1e-12) / y)), # Kullback-Leibler. 163 | 'tv': lambda x, y: np.sum(np.abs(x - y)) / 2, # Total variation. 164 | } 165 | return functions.get(method, method) 166 | 167 | 168 | def furthest_distribution(a: ArrayLike, classes: Optional[ArrayLike]=None) -> np.ndarray: 169 | """ 170 | Compute the furthest distribution from `a`; used by `imbalance_degree()`. 171 | See Ortigosa-Hernandez et al. (2017). 172 | 173 | Args: 174 | a (array): A list of class labels. 175 | classes (array): A list of classes, in the event that `a` does not 176 | contain all of the classes, or if you want to ignore some classes 177 | in `a` (not recommended) you can omit them from this list. 178 | 179 | Returns: 180 | array: The furthest distribution. 181 | 182 | Example: 183 | >>> furthest_distribution([3,0,0,1,2,3,2,3,2,3,1,1,2,3,3,4,3,4,3,4,]) 184 | array([0.8, 0. , 0. , 0.2, 0. ]) 185 | """ 186 | ζ, e = empirical_distribution(a, classes=classes) 187 | # Construct the vector according to Eq 9. 188 | i = [ei if ζi >= ei else 0 for ζi, ei in zip(ζ, e)] 189 | # Arbitrarily increase one of the non-zero probs to sum to 1. 190 | i[np.argmax(i)] += 1 - sum(i) 191 | return np.array(i) 192 | 193 | 194 | def imbalance_degree(a: ArrayLike, 195 | method: Union[str, Callable]='tv', 196 | classes: Optional[ArrayLike]=None, 197 | ) -> float: 198 | r""" 199 | The imbalance degree reflects the degree to which the distribution of 200 | classes is imbalanced. The integer part of the imbalance degree is the 201 | number of minority classes minus 1 (m - 1, below). The fractional part 202 | is the distance between the actual (empirical) and expected distributions. 203 | The distance can be defined in different ways, depending on the method. 204 | 205 | IR is defined according to Eq 8 in Ortigosa-Hernandez et al. (2017). 206 | 207 | .. 
math::
208 | \mathrm{ID}(\zeta) = \frac{d_\mathrm{\Delta}(\mathbf{\zeta}, \mathbf{e})}
209 | {d_\mathrm{\Delta}(\mathbf{\iota}_m, \mathbf{e})} + (m - 1)
210 | 
211 | `method` can be a string from:
212 | - 'manhattan': Manhattan distance or L1 norm
213 | - 'euclidean': Euclidean distance or L2 norm
214 | - 'hellinger': Hellinger distance, recommended by Ortigosa-Hernandez et al. (2017)
215 | - 'tv': total variation distance, recommended by Ortigosa-Hernandez et al. (2017)
216 | - 'kl': Kullback-Leibler divergence
217 | 
218 | It can also be a function returning a divergence.
219 | 
220 | Args:
221 | a (array): A list of class labels.
222 | method (str or function): The method to use.
223 | classes (array): A list of classes, in the event that `a` does not
224 | contain all of the classes, or if you want to ignore some classes
225 | in `a` (not recommended) you can omit them from this list.
226 | 
227 | Returns:
228 | float: The imbalance degree.
229 | 
230 | Examples:
231 | >>> ID = imbalance_degree(generate_data([288, 49, 288]), 'tv')
232 | >>> round(ID, 2)
233 | 0.76
234 | >>> ID = imbalance_degree(generate_data([629, 333, 511]), 'euclidean')
235 | >>> round(ID, 2)
236 | 0.3
237 | >>> ID = imbalance_degree(generate_data([2, 81, 61, 4]), 'hellinger')
238 | >>> round(ID, 2)
239 | 1.73
240 | >>> ID = imbalance_degree(generate_data([2, 81, 61, 4]), 'kl')
241 | >>> round(ID, 2)
242 | 1.65
243 | """
244 | ζ, e = empirical_distribution(a, classes=classes)
245 | m = sum(ζ < e)
246 | i = furthest_distribution(a, classes=classes)
247 | div = divergence(method)
248 | epsilon = 1e-12
249 | return (div(ζ, e) / (epsilon + div(i, e))) + (m - 1)
250 | 
251 | 
252 | def minority_classes(a: ArrayLike, classes: Optional[ArrayLike]=None) -> np.ndarray:
253 | """
254 | Get the minority classes, based on the empirical distribution.
255 | The classes are listed in order of increasing frequency.
256 | 
257 | Args:
258 | a (array): A list of class labels.
259 | classes (array): A list of classes, in the event that `a` does not
260 | contain all of the classes, or if you want to ignore some classes
261 | in `a` (not recommended) you can omit them from this list.
262 | 
263 | Returns:
264 | array: The minority classes.
265 | 
266 | Example:
267 | >>> minority_classes([1, 2, 2, 2, 3, 3, 3, 3, 4, 4])
268 | array([1, 4])
269 | """
270 | a = np.asarray(a)
271 | ζ, e = empirical_distribution(a, classes=classes)
272 | 
273 | # We can suppress this warning (if any) because it would already have
274 | # been raised by `empirical_distribution`.
275 | with warnings.catch_warnings():
276 | warnings.simplefilter("ignore")
277 | classes = class_counts(a, classes=classes).keys()
278 | 
279 | # Return the minority classes in order, smallest first.
280 | return np.array([c for ζi, ei, c in sorted(zip(ζ, e, classes)) if ζi < ei])
281 | 
282 | 
283 | def is_imbalanced(a: ArrayLike,
284 | threshold: float=0.4,
285 | method: Union[str, Callable]='tv',
286 | classes: Optional[ArrayLike]=None,
287 | ) -> bool:
288 | """
289 | Check if a dataset is imbalanced by first checking that there are minority
290 | classes, then inspecting the fractional part of the imbalance degree metric.
291 | The metric is compared to the threshold you provide (default 0.4, same as
292 | the sklearn detector ImbalanceDetector).
293 | 
294 | Args:
295 | a (array): A list of class labels.
296 | threshold (float): The threshold to use. Default: 0.4.
297 | method (str or function): The method to use.
298 | classes (array): A list of classes, in the event that `a` does not 299 | contain all of the classes, or if you want to ignore some classes 300 | in `a` (not recommended) you can omit them from this list. 301 | 302 | Returns: 303 | bool: True if the dataset is imbalanced. 304 | 305 | Example: 306 | >>> is_imbalanced(generate_data([2, 81, 61, 4])) 307 | True 308 | """ 309 | if not minority_classes(a, classes=classes).size: 310 | return False 311 | im_deg = imbalance_degree(a, method, classes) 312 | return im_deg - int(im_deg) >= threshold 313 | -------------------------------------------------------------------------------- /src/redflag/importance.py: -------------------------------------------------------------------------------- 1 | """ 2 | Feature importance metrics. 3 | 4 | Author: Matt Hall, scienxlab.org 5 | Licence: Apache 2.0 6 | 7 | Copyright 2024 Redflag contributors 8 | 9 | Licensed under the Apache License, Version 2.0 (the "License"); 10 | you may not use this file except in compliance with the License. 11 | You may obtain a copy of the License at 12 | 13 | http://www.apache.org/licenses/LICENSE-2.0 14 | 15 | Unless required by applicable law or agreed to in writing, software 16 | distributed under the License is distributed on an "AS IS" BASIS, 17 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 18 | See the License for the specific language governing permissions and 19 | limitations under the License. 20 | """ 21 | from typing import Optional 22 | 23 | import numpy as np 24 | from numpy.typing import ArrayLike 25 | from sklearn.inspection import permutation_importance 26 | from sklearn.linear_model import LinearRegression 27 | from sklearn.ensemble import RandomForestRegressor 28 | from sklearn.neighbors import KNeighborsClassifier 29 | from sklearn.neighbors import KNeighborsRegressor 30 | from sklearn.linear_model import LogisticRegression 31 | from sklearn.ensemble import RandomForestClassifier 32 | 33 | from .target import is_continuous 34 | from .utils import split_and_standardize 35 | from .utils import aggregate 36 | 37 | 38 | def feature_importances(X: ArrayLike, y: ArrayLike=None, 39 | task: Optional[str]=None, 40 | random_state: Optional[int]=None, 41 | ) -> np.ndarray: 42 | """ 43 | Estimate feature importances on a supervised task, given X and y. 44 | 45 | Classification tasks are assessed with logistic regression, a random 46 | forest, and KNN permutation importance. Regression tasks are assessed with 47 | lasso regression, a random forest, and KNN permutation importance. 48 | 49 | The scores from these assessments are normalized, and the normalized 50 | sum is returned. 51 | 52 | See the Tutorial in the documentation for more information. 53 | 54 | Args: 55 | X (array): an array representing the data. 56 | y (array or None): an array representing the target. If None, the task 57 | is assumed to be an unsupervised clustering task. 58 | task (str or None): either 'classification' or 'regression'. If None, 59 | the task will be inferred from the labels and a warning will show 60 | the assumption being made. 61 | random_state (int or None): the random state to use. 62 | 63 | Returns: 64 | array: The importance of the features, in the order in which they 65 | appear in X. 66 | 67 | Examples: 68 | >>> X = [[0, 0, 0], [0, 1, 1], [0, 2, 0], [0, 3, 1], [0, 4, 0], [0, 5, 1], [0, 7, 0], [0, 8, 1], [0, 8, 0]] 69 | >>> y = [5, 15, 25, 35, 45, 55, 80, 85, 90] 70 | >>> feature_importances(X, y, task='regression', random_state=42) 71 | array([0. 
, 0.9831828, 0.0168172]) 72 | >>> y = ['a', 'a', 'a', 'b', 'b', 'b', 'c', 'c', 'c'] 73 | >>> x0, x1, x2 = feature_importances(X, y, task='classification', random_state=42) 74 | >>> x1 > x2 > x0 # See Issue #49 for why this test is like this. 75 | True 76 | """ 77 | if y is None: 78 | raise NotImplementedError('Unsupervised importance is not yet implemented.') 79 | 80 | if task is None: 81 | task = 'regression' if is_continuous(y) else 'classification' 82 | 83 | # Split the data and ensure it is standardized. 84 | X, X_train, X_val, y, y_train, y_val = split_and_standardize(X, y, random_state=random_state) 85 | 86 | # Train three models and gather the importances. 87 | imps: list = [] 88 | if task == 'classification': 89 | imps.append(np.abs(LogisticRegression(random_state=random_state).fit(X, y).coef_.sum(axis=0))) 90 | imps.append(RandomForestClassifier(random_state=random_state).fit(X, y).feature_importances_) 91 | model = KNeighborsClassifier().fit(X_train, y_train) 92 | r = permutation_importance(model, X_val, y_val, n_repeats=8, scoring='f1_weighted', random_state=random_state) 93 | imps.append(r.importances_mean) 94 | elif task == 'regression': 95 | imps.append(np.abs(LinearRegression().fit(X, y).coef_)) 96 | imps.append(RandomForestRegressor(random_state=random_state).fit(X, y).feature_importances_) 97 | model = KNeighborsRegressor().fit(X_train, y_train) 98 | r = permutation_importance(model, X_val, y_val, n_repeats=8, scoring='neg_mean_squared_error', random_state=random_state) 99 | imps.append(r.importances_mean) 100 | 101 | # Eliminate negative values and aggregate. 102 | imps = np.array(imps) 103 | imps[imps < 0] = 0 104 | return aggregate(imps, normalize_input=True, normalize_output=True) 105 | 106 | 107 | def least_important_features(importances: ArrayLike, 108 | threshold: Optional[float]=None) -> np.ndarray: 109 | """ 110 | Returns the least important features, in order of importance (least 111 | important first). The threshold controls how many features are returned. 112 | Set it to None to set it automatically. 113 | 114 | Args: 115 | importances (array): the importance of the features, in the order in 116 | which they appear in X. 117 | threshold (float or None): the cutoff for the importance. If None, the 118 | cutoff is set to half the expectation of the importance (i.e. 0.5/M 119 | where M is the number of features). 120 | 121 | Returns: 122 | array: The indices of the least important features. 123 | 124 | Examples: 125 | >>> least_important_features([0.05, 0.01, 0.24, 0.4, 0.3]) 126 | array([1, 0]) 127 | >>> least_important_features([0.2, 0.2, 0.2, 0.2, 0.2]) 128 | array([], dtype=int64) 129 | """ 130 | if threshold is None: 131 | threshold = 0.5 / len(importances) 132 | 133 | least_important: dict = {} 134 | for arg, imp in zip(np.argsort(importances), np.sort(importances)): 135 | if sum(least_important.values()) + imp > threshold: 136 | break 137 | least_important[arg] = imp 138 | 139 | return np.array(list(least_important)).astype(int) 140 | 141 | 142 | def most_important_features(importances: ArrayLike, 143 | threshold: Optional[float]=None) -> np.ndarray : 144 | """ 145 | Returns the indices of the most important features, in reverse order of 146 | importance (most important first). The threshold controls how many features 147 | are returned. Set it to None to set it automatically. 148 | 149 | Args: 150 | importances (array): the importance of the features, in the order in 151 | which they appear in X. 
152 | threshold (float or None): the cutoff for the importance. If None, 153 | the cutoff is set to (M-1)/M where M is the number of features. 154 | 155 | Returns: 156 | array: The indices of the most important features. 157 | 158 | Examples: 159 | >>> most_important_features([0.05, 0.01, 0.24, 0.4, 0.3]) 160 | array([3, 4, 2]) 161 | >>> most_important_features([0.2, 0.2, 0.2, 0.2, 0.2]) 162 | array([4, 3, 2, 1, 0]) 163 | """ 164 | if threshold is None: 165 | threshold = 1 - 0.5 / len(importances) 166 | 167 | most_important: dict = {} 168 | args = np.argsort(importances)[::-1] 169 | imps = np.sort(importances)[::-1] 170 | for arg, imp in zip(args, imps): 171 | most_important[arg] = imp 172 | if sum(most_important.values()) > threshold: 173 | break 174 | 175 | return np.array(list(most_important)).astype(int) 176 | -------------------------------------------------------------------------------- /src/redflag/independence.py: -------------------------------------------------------------------------------- 1 | """ 2 | Functions related to understanding row independence. 3 | 4 | Author: Matt Hall, scienxlab.org 5 | Licence: Apache 2.0 6 | 7 | Copyright 2024 Redflag contributors 8 | 9 | Licensed under the Apache License, Version 2.0 (the "License"); 10 | you may not use this file except in compliance with the License. 11 | You may obtain a copy of the License at 12 | 13 | http://www.apache.org/licenses/LICENSE-2.0 14 | 15 | Unless required by applicable law or agreed to in writing, software 16 | distributed under the License is distributed on an "AS IS" BASIS, 17 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 18 | See the License for the specific language governing permissions and 19 | limitations under the License. 20 | """ 21 | import numpy as np 22 | from numpy.typing import ArrayLike 23 | 24 | 25 | def is_correlated(a: ArrayLike, n: int=20, s: int=20, threshold: float=0.1) -> bool: 26 | """ 27 | Check if a dataset is auto-correlated. This function returns True if 28 | the 1D input array `a` appears to be correlated to itself, perhaps 29 | because it consists of measurements sampled at neighbouring points 30 | in time or space, at a spacing short enough that samples are correlated. 31 | 32 | If samples are correlated in this way, then the records in your dataset 33 | may break the IID assumption implicit in much of statistics (though not 34 | in specialist geostatistics or timeseries algorithms). This is not 35 | necessarily a big problem, but it does mean you need to be careful 36 | about how you split your data, for example a random split between train 37 | and test will leak information from train to test, because neighbouring 38 | samples are correlated. 39 | 40 | This function inspects s random chunks of n samples, averaging the 41 | autocorrelation coefficients across chunks. If the mean first non-zero 42 | lag is greater than the threshold, the array may be autocorrelated. 43 | 44 | See the Tutorial in the documentation for more about how to use this 45 | function. 46 | 47 | Args: 48 | a (array): The data. 49 | n (int): The number of samples per chunk. 50 | s (int): The number of chunks. 51 | threshold (float): The auto-correlation threshold. 52 | 53 | Returns: 54 | bool: True if the data are autocorrelated. 55 | 56 | Examples: 57 | >>> is_correlated([7, 1, 6, 8, 7, 6, 2, 9, 4, 2]) 58 | False 59 | >>> is_correlated([1, 2, 1, 7, 6, 8, 6, 2, 1, 1]) 60 | True 61 | """ 62 | a = np.asarray(a) 63 | 64 | # Split into chunks n samples long. 
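# Strategy: draw up to `s` random chunks of roughly `n` samples each,
# mean-centre each chunk, and compute its autocorrelation, normalised by
# (chunk size * variance) so that the zero-lag value is ~1. The per-chunk
# autocorrelations are averaged; the data are flagged as correlated if the
# mean value at the first non-zero lag reaches `threshold` and the value
# at the next lag is also non-negative.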
65 | L_chunks = min(a.size, n) 66 | chunks = np.array_split(a, a.size//L_chunks) 67 | 68 | # Choose up to s chunk indices at random. 69 | N_chunks = min(len(chunks), s) 70 | rng = np.random.default_rng() 71 | r = rng.choice(np.arange(len(chunks)), size=N_chunks, replace=False) 72 | 73 | # Loop over selected chunks and count ones with correlation. 74 | acs: list = [] 75 | for chunk in [c for i, c in enumerate(chunks) if i in r]: 76 | c = chunk[:L_chunks] - np.nanmean(chunk) 77 | autocorr = np.correlate(c, c, mode='same') 78 | acs.append(autocorr / (c.size * np.nanvar(c))) 79 | 80 | # Average the autocorrelations. 81 | acs = np.sum(acs, axis=0) / N_chunks 82 | 83 | p = acs[c.size//2 - 1] # First non-zero lag. 84 | q = acs[c.size//2 - 2] # Next non-zero lag. 85 | 86 | return (p >= threshold) & (q >= 0) 87 | -------------------------------------------------------------------------------- /src/redflag/markov.py: -------------------------------------------------------------------------------- 1 | """ 2 | Functions related to Markov chains. This code was originally implemented in 3 | https://github.com/agilescientific/striplog. 4 | 5 | Author: Matt Hall, scienxlab.org 6 | Licence: Apache 2.0 7 | 8 | Copyright 2024 Matt Hall 9 | 10 | Licensed under the Apache License, Version 2.0 (the "License"); 11 | you may not use this file except in compliance with the License. 12 | You may obtain a copy of the License at 13 | 14 | http://www.apache.org/licenses/LICENSE-2.0 15 | 16 | Unless required by applicable law or agreed to in writing, software 17 | distributed under the License is distributed on an "AS IS" BASIS, 18 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 19 | See the License for the specific language governing permissions and 20 | limitations under the License. 21 | """ 22 | from collections import namedtuple 23 | 24 | import numpy as np 25 | import scipy.stats 26 | 27 | 28 | def observations(seq_of_seqs, states, step=1, include_self=False): 29 | """ 30 | Compute observation matrix. 31 | 32 | Returns the matrix of transition counts between states. 33 | 34 | Args: 35 | seq_of_seqs (list-like): A list-like, or list-like of list-likes. 36 | The inner list-likes represent sequences of states. 37 | For example, can be a string or list of strings, or 38 | a list or list of lists. 39 | states (list-like): A list or array of the names of the states. 40 | If not provided, it will be inferred from the data. 41 | step (integer): The distance to step. Default is 1: use 42 | the previous state only. If 2, then the previous-but- 43 | one state is used as well as the previous state (and 44 | the matrix has one more dimension). 45 | include_self (bool): Whether to include self-to-self 46 | transitions (default is `False`: do not include them). 47 | 48 | Returns: 49 | ndarray. The observation matrix. 50 | """ 51 | O = np.zeros(tuple(states.size for _ in range(step+1))) 52 | for seq in seq_of_seqs: 53 | seq = np.array(seq) 54 | _, integer_seq = np.where(seq.reshape(-1, 1) == states) 55 | for idx in zip(*[integer_seq[n:] for n in range(step+1)]): 56 | if (not include_self) and (0 in np.diff(idx)): 57 | continue 58 | O[idx] += 1 59 | return O 60 | 61 | 62 | def hollow_matrix(M): 63 | """ 64 | Utility funtion to return hollow matrix (zeros on diagonal). 65 | 66 | Args 67 | M (ndarray): a 'square' ndarray. 68 | 69 | Returns 70 | ndarray. The same array with zeros on the diagonal. 
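
    For example, zeroing the diagonal of a small integer matrix:

        >>> hollow_matrix(np.array([[1, 2], [3, 4]]))
        array([[0, 2],
               [3, 0]])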
71 | """ 72 | s = M.shape[0] 73 | idx = np.unravel_index(np.arange(0, s**2, s + 1), M.shape) 74 | M[idx] = 0 75 | return M 76 | 77 | 78 | def regularize(sequence, strings_are_states=False) -> tuple: 79 | """ 80 | Turn a sequence or sequence of sequences into a tuple of 81 | the unique elements in the sequence(s), plus a sequence 82 | of sequences (sort of equivalent to `np.atleast_2d()`). 83 | 84 | Args 85 | sequence (list-like): A list-like container of either 86 | states, or of list-likes of states. 87 | strings_are_states (bool): True if the strings are 88 | themselves states (i.e. words or tokens) and not 89 | sequences of one-character states. For example, 90 | set to True if you provide something like: 91 | 92 | ['sst', 'mud', 'mud', 'sst', 'lst', 'lst'] 93 | 94 | Returns 95 | tuple. A tuple of the unique states, and a sequence 96 | of sequences. 97 | """ 98 | if strings_are_states: 99 | if isinstance(sequence[0], str): 100 | seq_of_seqs = [sequence] 101 | else: 102 | seq_of_seqs = sequence 103 | else: 104 | # Just try to iterate over the contents of the sequence. 105 | try: 106 | seq_of_seqs = [list(i) if len(i) > 1 else i for i in sequence] 107 | except TypeError: 108 | seq_of_seqs = [list(sequence)] 109 | 110 | # Annoyingly, still have to fix case of single sequence of 111 | # strings... this seems really hacky. 112 | if len(seq_of_seqs[0]) == 1: 113 | seq_of_seqs = [seq_of_seqs] 114 | 115 | # Now we know we have a sequence of sequences. 116 | uniques = set() 117 | for seq in seq_of_seqs: 118 | for i in seq: 119 | uniques.add(i) 120 | 121 | return np.array(sorted(uniques)), seq_of_seqs 122 | 123 | 124 | class Markov_chain: 125 | 126 | def __init__(self, 127 | observed_counts, 128 | states=None, 129 | step=1, 130 | include_self=None, 131 | ): 132 | """ 133 | Initialize the Markov chain instance. 134 | 135 | Args: 136 | observed_counts (ndarray): A 2-D array representing the counts 137 | of change of state in the Markov Chain. 138 | states (array-like): An array-like representing the possible states 139 | of the Markov Chain. Must be in the same order as `observed 140 | counts`. 141 | step (int): The maximum step size, default 1. 142 | include_self (bool): Whether to include self-to-self transitions. 143 | """ 144 | self.step = step 145 | self.observed_counts = np.atleast_2d(observed_counts).astype(int) 146 | 147 | if include_self is not None: 148 | self.include_self = include_self 149 | else: 150 | self.include_self = np.any(np.diagonal(self.observed_counts)) 151 | 152 | if not self.include_self: 153 | self.observed_counts = hollow_matrix(self.observed_counts) 154 | 155 | if states is not None: 156 | self.states = np.asarray(states) 157 | else: 158 | self.states = np.arange(self.observed_counts.shape[0]) 159 | 160 | if self.step > 1: 161 | self.expected_counts = self._compute_expected_mc() 162 | else: 163 | self.expected_counts = self._compute_expected() 164 | 165 | return 166 | 167 | @staticmethod 168 | def _compute_freqs(C): 169 | """ 170 | Compute frequencies from counts. 171 | """ 172 | epsilon = 1e-12 173 | return (C.T / (epsilon+np.sum(C.T, axis=0))).T 174 | 175 | @staticmethod 176 | def _stop_iter(a, b, tol=0.01): 177 | """ 178 | Stopping criterion for Powers & Easterling method. 179 | """ 180 | a_small = np.all(np.abs(a[-1] - a[-2]) < tol*a[-1]) 181 | b_small = np.all(np.abs(b[-1] - b[-2]) < tol*b[-1]) 182 | return (a_small and b_small) 183 | 184 | @property 185 | def _index_dict(self): 186 | """ 187 | A dictionary mapping the states to their indices. 
188 | """ 189 | if self.states is None: 190 | return {} 191 | return {self.states[index]: index for index in range(len(self.states))} 192 | 193 | @property 194 | def _state_dict(self): 195 | """ 196 | A dictionary mapping the indices to their states. 197 | """ 198 | if self.states is None: 199 | return {} 200 | return {index: self.states[index] for index in range(len(self.states))} 201 | 202 | @property 203 | def observed_freqs(self): 204 | """ 205 | The observed frequencies of each state, given the previous state. 206 | """ 207 | return self._compute_freqs(self.observed_counts) 208 | 209 | @property 210 | def expected_freqs(self): 211 | """ 212 | The expected frequencies of each state, given the previous state. 213 | """ 214 | return self._compute_freqs(self.expected_counts) 215 | 216 | @property 217 | def _state_counts(self): 218 | """ 219 | The number of times each state occurs. 220 | """ 221 | s = self.observed_counts.copy() 222 | 223 | # Deal with more than 2 dimensions. 224 | for _ in range(self.observed_counts.ndim - 2): 225 | s = np.sum(s, axis=0) 226 | 227 | a = np.sum(s, axis=0) 228 | b = np.sum(s, axis=1) 229 | return np.maximum(a, b) 230 | 231 | @property 232 | def _state_probs(self): 233 | """ 234 | The probability of each state. 235 | """ 236 | return self._state_counts / np.sum(self._state_counts) 237 | 238 | @property 239 | def normalized_difference(self): 240 | """ 241 | The normalized difference between observed and expected counts. 242 | """ 243 | O = self.observed_counts 244 | E = self.expected_counts 245 | epsilon = 1e-12 246 | return (O - E) / np.sqrt(E + epsilon) 247 | 248 | @classmethod 249 | def from_sequence(cls, 250 | sequence, 251 | states=None, 252 | strings_are_states=False, 253 | include_self=False, 254 | step=1, 255 | ): 256 | """ 257 | Parse a sequence and make the transition matrix of the specified order. 258 | 259 | You must provide sequence(s) in causal order (e.g. time order). 260 | 261 | Args: 262 | sequence (list-like): A list-like, or list-like of list-likes. 263 | The inner list-likes represent sequences of states. 264 | For example, can be a string or list of strings, or 265 | a list or list of lists. 266 | states (list-like): A list or array of the names of the states. 267 | If not provided, it will be inferred from the data. 268 | strings_are_states (bool): True if the strings are 269 | themselves states (i.e. words or tokens) and not 270 | sequences of one-character states. For example, 271 | set to True if you provide something like: 272 | 273 | ['sst', 'mud', 'mud', 'sst', 'lst', 'lst'] 274 | 275 | include_self (bool): Whether to include self-to-self 276 | transitions (default is `False`: do not include them). 277 | step (integer): The distance to step. Default is 1: use 278 | the previous state only. If 2, then the previous-but- 279 | one state is used as well as the previous state (and 280 | the matrix has one more dimension). 281 | """ 282 | uniques, seq_of_seqs = regularize(sequence, strings_are_states=strings_are_states) 283 | 284 | if states is None: 285 | states = uniques 286 | else: 287 | states = np.asarray(list(states)) 288 | 289 | O = observations(seq_of_seqs, states=states, step=step, include_self=include_self) 290 | 291 | return cls(observed_counts=np.array(O), 292 | states=states, 293 | include_self=include_self, 294 | step=step, 295 | ) 296 | 297 | def _conditional_probs(self, state): 298 | """ 299 | Conditional probabilities of each state, given a 300 | current state. 
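
        Args:
            state (str): The current state.

        Returns:
            array. The observed frequencies of transitions from `state`
            to each of the states, in the order of `self.states`.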
301 | """ 302 | return self.observed_freqs[self._index_dict[state]] 303 | 304 | def _next_state(self, current_state): 305 | """ 306 | Returns the state of the random variable at the next time 307 | instance. 308 | 309 | Args: 310 | current_state (str): The current state of the system. 311 | 312 | Returns: 313 | str. One realization of the next state. 314 | """ 315 | return np.random.choice(self.states, 316 | p=self._conditional_probs(current_state) 317 | ) 318 | 319 | def generate_states(self, n: int = 10, current_state=None): 320 | """ 321 | Generates the next states of the system. 322 | 323 | Args: 324 | n (int): The number of future states to generate. 325 | current_state (str): The state of the current random variable. 326 | 327 | Returns: 328 | list. The next n states. 329 | """ 330 | if current_state is None: 331 | current_state = np.random.choice(self.states, p=self._state_probs) 332 | 333 | future_states = [] 334 | for _ in range(n): 335 | next_state = self._next_state(current_state) 336 | future_states.append(next_state) 337 | current_state = next_state 338 | 339 | return future_states 340 | 341 | def _compute_expected(self): 342 | """ 343 | Try to use Powers & Easterling, fall back on Monte Carlo sampling 344 | based on the proportions of states in the data. 345 | """ 346 | try: 347 | E = self._compute_expected_pe() 348 | except: 349 | E = self._compute_expected_mc() 350 | 351 | return E 352 | 353 | def _compute_expected_mc(self, n: int = 100000): 354 | """ 355 | If we can't use Powers & Easterling's method, and it's possible there's 356 | a way to extend it to higher dimensions (which we have for step > 1), 357 | the next best thing might be to use brute force and just compute a lot 358 | of random sequence transitions, given the observed proportions. This is 359 | what P & E's method tries to estimate iteratively. 360 | 361 | What to do about 'self transitions' is a bit of a problem here, since 362 | there are a lot of n-grams that include at least one self-transition. 363 | """ 364 | seq = np.random.choice(self.states, size=n, p=self._state_probs) 365 | E = observations(np.atleast_2d(seq), self.states, step=self.step, include_self=self.include_self) 366 | if not self.include_self: 367 | E = hollow_matrix(E) 368 | return np.sum(self.observed_counts) * E / np.sum(E) 369 | 370 | def _compute_expected_pe(self, max_iter: int = 100): 371 | """ 372 | Compute the independent trials matrix, using method of 373 | Powers & Easterling 1982. 374 | """ 375 | m = len(self.states) 376 | M = self.observed_counts 377 | a, b = [], [] 378 | 379 | # Loop 1 380 | a.append(np.sum(M, axis=1) / (m - 1)) 381 | b.append(np.sum(M, axis=0) / (np.sum(a[-1]) - a[-1])) 382 | 383 | i = 2 384 | while i < max_iter: 385 | 386 | a.append(np.sum(M, axis=1) / (np.sum(b[-1]) - b[-1])) 387 | b.append(np.sum(M, axis=0) / (np.sum(a[-1]) - a[-1])) 388 | 389 | # Check for stopping criterion. 390 | if self._stop_iter(a, b, tol=0.001): 391 | break 392 | 393 | i += 1 394 | 395 | E = a[-1] * b[-1].reshape(-1, 1) 396 | 397 | if not self.include_self: 398 | return hollow_matrix(E) 399 | else: 400 | return E 401 | 402 | @property 403 | def degrees_of_freedom(self) -> int: 404 | m = len(self.states) 405 | return (m - 1)**2 - m 406 | 407 | def _chi_squared_critical(self, q: float = 0.95, df: int = None) -> float: 408 | """ 409 | The chi-squared critical value for a confidence level q 410 | and degrees of freedom df. 
411 | """ 412 | if df is None: 413 | df = self.degrees_of_freedom 414 | return scipy.stats.chi2.ppf(q=q, df=df) 415 | 416 | def _chi_squared_percentile(self, x: float, df: int = None) -> float: 417 | """ 418 | The chi-squared percentile for a value x and degrees of 419 | freedom df. 420 | """ 421 | if df is None: 422 | df = self.degrees_of_freedom 423 | return scipy.stats.chi2.cdf(x, df=df) 424 | 425 | def chi_squared(self, q: float = 0.95) -> tuple: 426 | """ 427 | The chi-squared statistic for the given transition 428 | frequencies. 429 | 430 | Also returns the critical statistic at the given confidence 431 | level q (default 95%). 432 | 433 | If the first number is bigger than the second number, 434 | then you can reject the hypothesis that the sequence 435 | is randomly ordered. 436 | 437 | Args: 438 | q (float): The confidence level, as a float in the range 0 to 1. 439 | Default: 0.95. 440 | 441 | Returns: 442 | float: The chi-squared statistic. 443 | """ 444 | # Observed and Expected matrices: 445 | O = self.observed_counts 446 | E = self.expected_counts 447 | 448 | # Adjustment for divide-by-zero 449 | epsilon = 1e-12 450 | chi2 = np.sum((O - E)**2 / (E + epsilon)) 451 | crit = self._chi_squared_critical(q=q) 452 | perc = self._chi_squared_percentile(x=chi2) 453 | Chi2 = namedtuple('Chi2', ['chi2', 'crit', 'perc']) 454 | 455 | return Chi2(chi2, crit, perc) 456 | -------------------------------------------------------------------------------- /src/redflag/outliers.py: -------------------------------------------------------------------------------- 1 | """ 2 | Functions related to understanding features. 3 | 4 | Author: Matt Hall, scienxlab.org 5 | Licence: Apache 2.0 6 | 7 | Copyright 2024 Redflag contributors 8 | 9 | Licensed under the Apache License, Version 2.0 (the "License"); 10 | you may not use this file except in compliance with the License. 11 | You may obtain a copy of the License at 12 | 13 | http://www.apache.org/licenses/LICENSE-2.0 14 | 15 | Unless required by applicable law or agreed to in writing, software 16 | distributed under the License is distributed on an "AS IS" BASIS, 17 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 18 | See the License for the specific language governing permissions and 19 | limitations under the License. 20 | """ 21 | from __future__ import annotations 22 | 23 | from typing import Optional 24 | from functools import reduce, partial 25 | import warnings 26 | 27 | import numpy as np 28 | from numpy.typing import ArrayLike 29 | from sklearn.neighbors import LocalOutlierFactor 30 | from sklearn.ensemble import IsolationForest 31 | from sklearn.covariance import EllipticEnvelope 32 | 33 | from .utils import stdev_to_proportion, proportion_to_stdev 34 | from .utils import get_idx 35 | 36 | 37 | def mahalanobis(X: ArrayLike, correction: bool=False) -> np.ndarray: 38 | """ 39 | Compute the Mahalanobis distances of every record (row) in a 2D dataset. 40 | 41 | If X has a single feature, this is equivalent to computing the Z-scores 42 | of the data. For more features, the Mahalanobis distance is the distance 43 | of each point from the centroid of the data, in units analogous to the 44 | standard deviation. It is a multivariate analog of the Z-score. 45 | 46 | The empirical covariance correction factor suggested by Rousseeuw and 47 | Van Driessen may be optionally applied by setting `correction=True`. 48 | 49 | Args: 50 | X (array): The data. Must be a 2D array, shape (n_samples, n_features). 
51 | correction (bool): Whether to apply the empirical covariance correction. 52 | 53 | Returns: 54 | array: The Mahalanobis distances. 55 | 56 | Examples: 57 | >>> data = np.array([-3, -2, -2, -1, 0, 0, 0, 1, 2, 2, 3]).reshape(-1, 1) 58 | >>> mahalanobis(data) 59 | array([1.6583124, 1.1055416, 1.1055416, 0.5527708, 0. , 0. , 60 | 0. , 0.5527708, 1.1055416, 1.1055416, 1.6583124]) 61 | >>> mahalanobis(data, correction=True) 62 | array([1.01173463, 0.67448975, 0.67448975, 0.33724488, 0. , 63 | 0. , 0. , 0.33724488, 0.67448975, 0.67448975, 64 | 1.01173463]) 65 | """ 66 | X = np.asarray(X) 67 | 68 | ee = EllipticEnvelope(support_fraction=1.0).fit(X) 69 | 70 | if correction: 71 | ee.correct_covariance(X) 72 | 73 | return np.sqrt(ee.dist_) 74 | 75 | 76 | def mahalanobis_outliers(X: ArrayLike, 77 | p: float=0.99, 78 | threshold: Optional[float]=None, 79 | ) -> np.ndarray: 80 | """ 81 | Find outliers given samples and a threshold in multiples of stdev. 82 | Returns -1 for outliers and 1 for inliers (to match the sklearn API). 83 | 84 | For univariate data, we expect this many points outside (in units of 85 | standard deviation, and with equivalent p-values): 86 | - 1 sd: expect 31.7 points in 100 (p = 1 - 0.317 = 0.683) 87 | - 2 sd: 4.55 in 100 (p = 1 - 0.0455 = 0.9545) 88 | - 3 sd: 2.70 in 1000 (p = 1 - 0.0027 = 0.9973) 89 | - 4 sd: 6.3 in 100,000 (p = 1 - 0.000063 = 0.999937) 90 | - 4.89163847 sd: 1 in 1 million (p = 1 - 0.000001 = 0.999999) 91 | - 5 sd: 5.7 in 10 million datapoints 92 | - 6 sd: 2.0 in 1 billion points 93 | 94 | Args: 95 | X (array): The data. Can be a 2D array, shape (n_samples, n_features), 96 | or a 1D array, shape (n_samples). 97 | p (float): The probability threshold, in the range [0, 1]. This value 98 | is ignored if `threshold` is not None; in this case, `p` will be 99 | computed using `utils.stdev_to_proportion(threshold)`. 100 | threshold (float): The threshold in Mahalanobis distance, analogous to 101 | multiples of standard deviation for a single variable. If not None, 102 | the threshold will be used to compute `p`. 103 | 104 | Returns: 105 | array: Array identifying outliers; -1 for outliers and 1 for inliers. 106 | 107 | Examples: 108 | >>> data = [-3, -2, -2, -1, 0, 0, 0, 1, 2, 2, 3] 109 | >>> mahalanobis_outliers(data) 110 | array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]) 111 | >>> mahalanobis_outliers(data + [100], threshold=3) 112 | array([ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, -1]) 113 | """ 114 | X = np.asarray(X) 115 | if X.ndim == 1: 116 | X = X.reshape(-1, 1) 117 | 118 | _, d = X.shape 119 | 120 | # Determine the Mahalanobis distance for the given confidence level. 121 | if threshold is None: 122 | threshold = proportion_to_stdev(p=p, d=d) 123 | 124 | # Compute the Mahalanobis distance. 125 | z = mahalanobis(X) 126 | 127 | # Decide whether each point is an outlier or not. 128 | idx, = np.where((z < -threshold) | (z > threshold)) 129 | outliers = np.full(z.shape, 1) 130 | outliers[idx] = -1 131 | 132 | return outliers 133 | 134 | 135 | def get_outliers(a: ArrayLike, 136 | method: Optional[str]=None, # Can change to 'mah' in 0.6.0. 137 | p: float=0.99, 138 | threshold: Optional[float]=None, 139 | ) -> np.ndarray: 140 | """ 141 | Returns outliers in the data, considering all of the features. What counts 142 | as an outlier is determined by the threshold, which is in multiples of 143 | the standard deviation. (The conversion to 'contamination' is approximate.) 
144 | 
145 | Methods: 'iso' (isolation forest), 'lof' (local outlier factor),
146 | 'ee' (elliptic envelope), or 'mah' (Mahalanobis distance, the default), or
147 | pass a function that returns an array of outlier flags (-1 for outliers and 1
148 | for inliers, matching the `sklearn` convention). You can also pass 'any',
149 | which will try all four outlier detection methods and return the outliers
150 | which are detected by any of them, or 'all', which will return the outliers
151 | which are common to all four methods. That is, 'all' is a rather conservative
152 | outlier detector, 'any' is rather liberal, and both of these are slower
153 | than choosing a single algorithm.
154 | 
155 | Args:
156 | a (array): The data.
157 | method (str): The method to use. Can be 'mah' (the default), 'iso', 'lof',
158 | 'ee', 'any', 'all', or a function that returns an array of
159 | outlier flags (-1 for outliers, 1 for inliers).
160 | p (float): The probability threshold, in the range [0, 1]. This value
161 | is ignored if `threshold` is not None; in this case, `p` will be
162 | computed using `utils.stdev_to_proportion(threshold)`.
163 | threshold (float): The threshold in Mahalanobis distance, analogous to
164 | multiples of standard deviation for a single variable. If not None,
165 | the threshold will be used to compute `p`.
166 | 
167 | Returns:
168 | array: The indices of the outliers.
169 | 
170 | Examples:
171 | >>> data = [-3, -2, -2, -1, 0, 0, 0, 1, 2, 2, 3]
172 | >>> get_outliers(3 * data)
173 | array([], dtype=int64)
174 | >>> get_outliers(3 * data + [100])
175 | array([33])
176 | >>> get_outliers(3 * data + [100], method='mah')
177 | array([33])
178 | >>> get_outliers(3 * data + [100], method='any')
179 | array([33])
180 | >>> get_outliers(3 * data + [100], method='all')
181 | array([33])
182 | """
183 | if method is None:
184 | # Was called with the default method, which changed in 0.4.3
185 | method = 'mah'
186 | warnings.warn('The default method for get_outliers has changed to "mah". '
187 | 'Please specify the method explicitly to avoid this warning.',
188 | DeprecationWarning, stacklevel=2)
189 | if p >= 1 or p < 0:
190 | raise ValueError('p must be in the range [0, 1).')
191 | a = np.asarray(a)
192 | if a.ndim == 1:
193 | a = a.reshape(-1, 1)
194 | if threshold is None:
195 | expect = 1 - p
196 | else:
197 | expect = 1 - stdev_to_proportion(threshold)
198 | p = 1 - expect
199 | methods = {
200 | 'iso': IsolationForest(contamination=expect).fit_predict,
201 | 'lof': LocalOutlierFactor(contamination=expect, novelty=False).fit_predict,
202 | 'ee': EllipticEnvelope(contamination=expect).fit_predict,
203 | 'mah': partial(mahalanobis_outliers, p=p, threshold=threshold),
204 | }
205 | if method == 'any':
206 | results = [get_idx(func(a)==-1) for func in methods.values()]
207 | outliers = reduce(np.union1d, results)
208 | elif method == 'all':
209 | results = [get_idx(func(a)==-1) for func in methods.values()]
210 | outliers = reduce(np.intersect1d, results)
211 | else:
212 | func = methods.get(method, method)
213 | outliers, = np.where(func(a)==-1)
214 | return outliers
215 | 
216 | 
217 | def expected_outliers(n: int,
218 | d: int=1,
219 | p: float=0.99,
220 | threshold: Optional[float]=None,
221 | ) -> int:
222 | """
223 | Expected number of outliers in a dataset, under the assumption that the
224 | data are multivariate-normally distributed.
What counts as an outlier is 225 | determined by the threshold, which is in multiples of the standard 226 | deviation, or by the p-value, which is the probability of a point being 227 | an outlier. Note that passing p = 0.99 does not necessarily mean that 228 | 1% of the points will be outliers, only that 1% of the points are expected 229 | to be outliers, on average, if the data are normally distributed. 230 | 231 | Args: 232 | n (int): The number of samples. 233 | d (int): The number of features. Note that if threshold is None, this 234 | value is not used in the calculation. Default: 1. 235 | p (float): The probability threshold, in the range [0, 1]. This value 236 | is ignored if `threshold` is not None and `p` will be computed 237 | using `utils.stdev_to_proportion(threshold)`. Default: 0.99. 238 | threshold (float): The threshold in Mahalanobis distance, analogous to 239 | multiples of standard deviation for a single variable. If not None, 240 | the threshold will be used to compute `p`. 241 | 242 | Returns: 243 | int: The expected number of outliers. 244 | 245 | Example: 246 | >>> expected_outliers(10_000, 6, threshold=4) 247 | 137 248 | """ 249 | if threshold is not None: 250 | p = stdev_to_proportion(threshold, d) 251 | return int(n * (1 - p)) 252 | 253 | 254 | def has_outliers(a: ArrayLike, 255 | p: float=0.99, 256 | threshold: Optional[float]=None, 257 | factor: float=1.0, 258 | ) -> bool: 259 | """ 260 | Use Mahalanobis distance to determine if there are more outliers than 261 | expected at the given confidence level or Mahalanobis distance threshold. 262 | A Boolean wrapper around `expected_outliers` and `get_outliers`. 263 | 264 | Args: 265 | a (array): The data. If 2D, the rows are samples and the columns are 266 | features. If 1D, the data are assumed to be univariate. 267 | p (float): The probability threshold, in the range [0, 1]. This value 268 | is ignored if `threshold` is not None and `p` will be computed 269 | using `utils.stdev_to_proportion(threshold)`. Default: 0.99. 270 | threshold (float): The threshold in Mahalanobis distance, analogous to 271 | multiples of standard deviation for a single variable. If not None, 272 | the threshold will be used to compute `p`. 273 | factor (float): The factor by which to multiply the expected number of 274 | outliers before comparing to the actual number of outliers. 275 | 276 | Returns: 277 | bool: True if there are more outliers than expected at the given 278 | confidence level. 279 | """ 280 | a = np.asarray(a) 281 | if a.ndim == 1: 282 | a = a.reshape(-1, 1) 283 | n, d = a.shape 284 | 285 | if threshold is not None: 286 | p = stdev_to_proportion(threshold, d) 287 | 288 | expected = expected_outliers(n, d, p=p) 289 | 290 | return get_outliers(a, method='mah', p=p).size > factor * expected 291 | -------------------------------------------------------------------------------- /src/redflag/pandas.py: -------------------------------------------------------------------------------- 1 | """ 2 | Pandas accessors. 3 | 4 | Author: Matt Hall, scienxlab.org 5 | Licence: Apache 2.0 6 | 7 | Copyright 2024 Redflag contributors 8 | 9 | Licensed under the Apache License, Version 2.0 (the "License"); 10 | you may not use this file except in compliance with the License. 
11 | You may obtain a copy of the License at 12 | 13 | http://www.apache.org/licenses/LICENSE-2.0 14 | 15 | Unless required by applicable law or agreed to in writing, software 16 | distributed under the License is distributed on an "AS IS" BASIS, 17 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 18 | See the License for the specific language governing permissions and 19 | limitations under the License. 20 | """ 21 | import warnings 22 | from typing import Optional 23 | 24 | from .imbalance import imbalance_degree, minority_classes, is_imbalanced 25 | from .importance import feature_importances as feature_importances 26 | from .outliers import get_outliers 27 | from .target import * 28 | from .independence import is_correlated 29 | from .utils import docstring_from 30 | 31 | 32 | def null_decorator(arg): 33 | """ 34 | Returns a decorator that does nothing but wrap the function it 35 | decorates. Need to do this to accept an argument on the decorator. 36 | """ 37 | def decorator(func): 38 | return func 39 | return decorator 40 | 41 | 42 | try: 43 | from pandas.api.extensions import register_dataframe_accessor 44 | from pandas.api.extensions import register_series_accessor 45 | except: 46 | register_dataframe_accessor = null_decorator 47 | register_series_accessor = null_decorator 48 | 49 | 50 | TEMPLATES = { 51 | 'continuous': """Continuous data suitable for regression 52 | Outliers: {outliers} 53 | Correlated: {correlated} 54 | Dummy scores:{dummy_scores} 55 | """, 56 | 'categorical': """Categorical data suitable for classification 57 | Imbalance degree: {imbalance} 58 | Minority classes: {minority_classes} 59 | Dummy scores: {dummy_scores} 60 | """ 61 | } 62 | 63 | @register_series_accessor("redflag") 64 | class SeriesAccessor: 65 | def __init__(self, pandas_obj): 66 | self._obj = pandas_obj 67 | 68 | @docstring_from(minority_classes) 69 | def minority_classes(self): 70 | if is_continuous(self._obj): 71 | warnings.warn('The Series does not seem categorical.') 72 | return minority_classes(self._obj) 73 | 74 | @docstring_from(imbalance_degree) 75 | def imbalance_degree(self): 76 | if is_continuous(self._obj): 77 | warnings.warn('The Series does not seem categorical.') 78 | return imbalance_degree(self._obj) 79 | 80 | @docstring_from(is_imbalanced) 81 | def is_imbalanced(self, threshold=0.4, method='tv', classes=None): 82 | if is_continuous(self._obj): 83 | warnings.warn('The Series does not seem categorical.') 84 | return is_imbalanced(self._obj, 85 | threshold=threshold, 86 | method=method, 87 | classes=classes 88 | ) 89 | 90 | @docstring_from(is_ordered) 91 | def is_ordered(self, q=0.95): 92 | return is_ordered(self._obj, q=q) 93 | 94 | @docstring_from(dummy_scores) 95 | def dummy_scores(self, task='auto', random_state=None): 96 | return dummy_scores(self._obj, task=task, random_state=random_state) 97 | 98 | def report(self, random_state=None): 99 | results = {} 100 | if is_continuous(self._obj): 101 | results['outliers'] = get_outliers(self._obj) 102 | results['correlated'] = is_correlated(self._obj) 103 | results['dummy_scores'] = dummy_regression_scores(self._obj) 104 | template = TEMPLATES['continuous'] 105 | else: 106 | # Categorical. 
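# Report the minority classes, the imbalance degree, and dummy (baseline)
# classification scores, then fill in the categorical template.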
107 | results['minority_classes'] = minority_classes(self._obj) 108 | results['imbalance'] = imbalance_degree(self._obj) 109 | results['dummy_scores'] = dummy_classification_scores(self._obj, random_state=random_state) 110 | template = TEMPLATES['categorical'] 111 | 112 | return template.format(**results) 113 | 114 | 115 | @register_dataframe_accessor("redflag") 116 | class DataFrameAccessor: 117 | def __init__(self, pandas_obj): 118 | self._obj = pandas_obj 119 | 120 | @docstring_from(feature_importances) 121 | def feature_importances(self, features=None, target=None, 122 | task: Optional[str]=None, 123 | random_state: Optional[int]=None, 124 | ): 125 | if target is None: 126 | raise ValueError('You must provide a target column.') 127 | else: 128 | y_ = self._obj[target] 129 | if is_continuous(y_): 130 | task = 'regression' 131 | else: 132 | task = 'classification' 133 | if len(y_.shape) > 1: 134 | raise NotImplementedError('Multilabel targets are not supported.') 135 | if features is None and target is not None: 136 | X_ = self._obj.drop(columns=target) 137 | else: 138 | X_ = self._obj[features] 139 | return feature_importances(X_, y_, 140 | task=task, 141 | random_state=random_state 142 | ) 143 | 144 | 145 | def correlation_detector(self, features=None, target=None, n=20, s=20, threshold=0.1): 146 | """ 147 | This is an experimental feature. 148 | """ 149 | if target is not None: 150 | y_ = self._obj[target] 151 | if len(y_.shape) > 1: 152 | raise NotImplementedError('Multilabel targets are not supported.') 153 | if is_correlated(y_): 154 | warnings.warn('The target appears to be autocorrelated.',stacklevel=2) 155 | 156 | if features is None and target is not None: 157 | X_ = self._obj.drop(target, axis=1).values 158 | else: 159 | X_ = self._obj[features].values 160 | 161 | for i, x in enumerate(X_.T): 162 | if is_correlated(x, n=n, s=s, threshold=threshold): 163 | warnings.warn(f'🚩 Feature {i} appears to be autocorrelated.', stacklevel=2) 164 | 165 | # There is probably something more useful to return. 166 | return 167 | -------------------------------------------------------------------------------- /src/redflag/target.py: -------------------------------------------------------------------------------- 1 | """ 2 | Functions related to understanding the target and the type of task. 3 | 4 | Author: Matt Hall, scienxlab.org 5 | Licence: Apache 2.0 6 | 7 | Copyright 2024 Redflag contributors 8 | 9 | Licensed under the Apache License, Version 2.0 (the "License"); 10 | you may not use this file except in compliance with the License. 11 | You may obtain a copy of the License at 12 | 13 | http://www.apache.org/licenses/LICENSE-2.0 14 | 15 | Unless required by applicable law or agreed to in writing, software 16 | distributed under the License is distributed on an "AS IS" BASIS, 17 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 18 | See the License for the specific language governing permissions and 19 | limitations under the License. 
20 | """ 21 | from __future__ import annotations 22 | from typing import Optional 23 | 24 | import numpy as np 25 | from numpy.typing import ArrayLike 26 | from sklearn.dummy import DummyClassifier, DummyRegressor 27 | from sklearn.metrics import f1_score, roc_auc_score 28 | from sklearn.metrics import mean_squared_error, r2_score 29 | 30 | from .utils import * 31 | from .markov import Markov_chain 32 | 33 | 34 | def is_continuous(a: ArrayLike, n: Optional[int]=None) -> bool: 35 | """ 36 | Decide if this is most likely a continuous variable (and thus, if this is 37 | the target, for example, most likely a regression task). 38 | 39 | Args: 40 | a (array): A target vector. 41 | n (int): The number of potential categories. That is, if there are 42 | fewer than n unique values in the data, it is estimated to be 43 | categorical. Default: the square root of the sample size, which 44 | is all the data or 10_000 random samples, whichever is smaller. 45 | 46 | Returns: 47 | bool: True if arr is probably best suited to regression. 48 | 49 | Examples: 50 | >>> is_continuous(10 * ['a', 'b']) 51 | False 52 | >>> is_continuous(100 * [1, 2, 3]) 53 | False 54 | >>> import numpy as np 55 | >>> is_continuous(np.random.random(size=100)) 56 | True 57 | >>> is_continuous(np.random.randint(0, 15, size=200)) 58 | False 59 | """ 60 | arr = np.asarray(a) 61 | 62 | if not is_numeric(arr): 63 | return False 64 | 65 | # Now we are dealing with numbers that could represent categories. 66 | 67 | if is_binary(arr): 68 | return False 69 | 70 | # Starting with this and having the uplifts be 0.666 means 71 | # that at least 2 tests must trigger to get over 0.5. 72 | p = 1 / 3 73 | 74 | # Take a sample if array is large. 75 | if arr.size < 10_000: 76 | sample = arr 77 | else: 78 | sample = np.random.choice(arr, size=10_000, replace=False) 79 | 80 | if n is None: 81 | n = np.sqrt(sample.size) 82 | 83 | # Check if floats. 84 | if np.issubdtype(sample.dtype, np.floating): 85 | 86 | # If not ints in disguise. 87 | if not np.all([xi.is_integer() for xi in np.unique(sample)]): 88 | p = update_p(p, 2/3, 2/3) 89 | 90 | # If low precision. 91 | if np.all((sample.astype(int) - sample) < 1e-3): 92 | p = update_p(p, 2/3, 2/3) 93 | 94 | # If many unique values. 95 | if np.unique(sample).size > n: 96 | p = update_p(p, 2/3, 2/3) 97 | 98 | # If many sizes of gaps between numbers. 99 | many_gap_sizes = np.unique(np.diff(np.sort(sample))).size > n 100 | if many_gap_sizes: 101 | p = update_p(p, 2/3, 2/3) 102 | 103 | return p > 0.5 104 | 105 | 106 | def n_classes(y: ArrayLike) -> int: 107 | """ 108 | Count the classes. 109 | 110 | Args: 111 | y (array): A list of class labels. 112 | 113 | Returns: 114 | int: The number of classes. 115 | 116 | Examples: 117 | >>> n_classes([1, 1, 1]) 118 | 1 119 | >>> n_classes([0, 1, 1]) 120 | 2 121 | >>> n_classes([1, 2, 3]) 122 | 3 123 | """ 124 | y_ = np.asanyarray(y) 125 | return np.unique(y_).size 126 | 127 | 128 | def is_multioutput(y: ArrayLike) -> bool: 129 | """ 130 | Decide if a target array is multi-output. 131 | 132 | Raises TypeError if y has more than 2 dimensions. 133 | 134 | Args: 135 | y (array): A list of class labels. 136 | 137 | Returns: 138 | bool: True if y has more than 1 dimensions. 
139 | 140 | Examples: 141 | >>> is_multioutput([1, 2, 3]) 142 | False 143 | >>> is_multioutput([[1, 2], [3, 4]]) 144 | True 145 | >>> is_multioutput([[1], [2]]) 146 | False 147 | >>> is_multioutput([[[1], [2]],[[3], [4]]]) 148 | Traceback (most recent call last): 149 | TypeError: Target array has too many dimensions. 150 | """ 151 | y_ = np.asanyarray(y) 152 | if y_.ndim == 1: 153 | return False 154 | elif (y_.ndim == 2): 155 | return y_.shape[1] > 1 156 | else: 157 | message = "Target array has too many dimensions." 158 | raise TypeError(message) 159 | 160 | 161 | def is_multiclass(y: ArrayLike) -> bool: 162 | """ 163 | Decide if a single target is multiclass. 164 | 165 | Args: 166 | y (array): A list of class labels. 167 | 168 | Returns: 169 | bool: True if y has more than 2 classes. 170 | 171 | Examples: 172 | >>> print(is_multiclass([1, 1, 1])) 173 | False 174 | >>> is_multiclass([0, 1, 1]) 175 | False 176 | >>> is_multiclass([1, 2, 3]) 177 | True 178 | """ 179 | if n_classes(y) > 2: 180 | return True 181 | else: 182 | return False 183 | 184 | 185 | def is_binary(y: ArrayLike) -> bool: 186 | """ 187 | Decide if a single target is binary. 188 | 189 | Args: 190 | y (array): A list of class labels. 191 | 192 | Returns: 193 | bool: True if y has exactly 2 classes. 194 | 195 | Examples: 196 | >>> print(is_binary([1, 1, 1])) 197 | False 198 | >>> is_binary([0, 1, 1]) 199 | True 200 | >>> is_binary([1, 2, 3]) 201 | False 202 | """ 203 | return n_classes(y) == 2 204 | 205 | 206 | def dummy_classification_scores(y: ArrayLike, random_state:Optional[int]=None) -> dict: 207 | """ 208 | Make dummy classifications, which can indicate a good lower-bound baseline 209 | for classification tasks. Wraps scikit-learn's `DummyClassifier`, using the 210 | `most_frequent` and `stratified` methods, and provides a dictionary of F1 211 | and ROC-AUC scores. 212 | 213 | Args: 214 | y (array): A list of class labels. 215 | random_state (int): A seed for the random number generator. 216 | 217 | Returns: 218 | dict: A dictionary of scores. 219 | 220 | Examples: 221 | >>> y = [1, 1, 1, 1, 1, 2, 2, 2, 3, 3] 222 | >>> scores = dummy_classification_scores(y, random_state=42) 223 | >>> scores['most_frequent'] # Precision issue with stratified test. 224 | {'f1': 0.3333333333333333, 'roc_auc': 0.5} 225 | """ 226 | result = {'most_frequent': {}, 'stratified': {}} 227 | y = np.asanyarray(y) 228 | if y.ndim > 1: 229 | raise ValueError("Multilabel target is not supported.") 230 | X = np.ones_like(y).reshape(-1, 1) # X is not used by the model. 231 | for method, scores in result.items(): 232 | model = DummyClassifier(strategy=method, random_state=random_state) 233 | _ = model.fit(X, y) 234 | scores['f1'] = f1_score(y, model.predict(X), average='weighted') 235 | y_prob = model.predict_proba(X) 236 | if is_binary(y): 237 | scores['roc_auc'] = roc_auc_score(y, y_prob[:, 1]) 238 | else: 239 | scores['roc_auc'] = roc_auc_score(y, y_prob, multi_class='ovr') 240 | return result 241 | 242 | 243 | def dummy_regression_scores(y: ArrayLike) -> dict: 244 | """ 245 | Make dummy predictions, which can indicate a good lower-bound baseline 246 | for regression tasks. Wraps scikit-learn's `DummyRegressor`, using the 247 | `mean` method, and provides a dictionary of MSE and R-squared scores. 248 | 249 | Args: 250 | y (array): A list of values. 251 | 252 | Returns: 253 | dict: A dictionary of scores. 
254 | 255 | Examples: 256 | >>> y = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9] 257 | >>> dummy_regression_scores(y) 258 | {'mean': {'mean_squared_error': 8.25, 'r2': 0.0}} 259 | """ 260 | result = {'mean': {}} 261 | y = np.asanyarray(y) 262 | if y.ndim > 1: 263 | raise ValueError("Multilabel target is not supported.") 264 | X = np.ones_like(y).reshape(-1, 1) # X is not used by the model. 265 | for method, scores in result.items(): 266 | model = DummyRegressor(strategy=method) 267 | _ = model.fit(X, y) 268 | y_pred = model.predict(X) 269 | scores['mean_squared_error'] = mean_squared_error(y, y_pred) 270 | scores['r2'] = r2_score(y, y_pred) 271 | return result 272 | 273 | 274 | def dummy_scores(y: ArrayLike, task='auto', random_state:Optional[int]=None) -> dict: 275 | """ 276 | Provide scores from a 'dummy' (naive) model. This can be useful for 277 | understanding the difficulty of the task. For example, if the dummy 278 | model does well, then the task is probably easy and you should be 279 | suspicious of any model that does not do well. 280 | 281 | The function automatically decides whether y is continuous or categorical 282 | and calls the appropriate scoring function. 283 | 284 | Args: 285 | y (array): A list of class labels. 286 | task (str): What kind of task: 'regression' or 'classification', or 'auto' 287 | to decide automatically. In general regression tasks predict continuous 288 | variables (e.g. temperature tomorrow), while classification tasks predict 289 | categorical variables (e.g. rain, cloud or sun). 290 | random_state (int): A seed for the random number generator. Only required 291 | classification tasks (categorical variables). 292 | 293 | Returns: 294 | dict: A dictionary of scores. 295 | 296 | Examples: 297 | >>> y = [1, 1, 1, 1, 1, 2, 2, 2, 3, 3] 298 | >>> dummy_scores(y, random_state=42) 299 | {'f1': 0.3333333333333333, 'roc_auc': 0.5, 'strategy': 'most_frequent', 'task': 'classification'} 300 | >>> y = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9] 301 | >>> dummy_scores(y, task='regression') 302 | {'mean_squared_error': 8.25, 'r2': 0.0, 'strategy': 'mean', 'task': 'regression'} 303 | """ 304 | if task == 'auto': 305 | task = 'regression' if is_continuous(y) else 'classification' 306 | 307 | if task == 'classification': 308 | scores = dummy_classification_scores(y, random_state=random_state) 309 | scores_mf, scores_st = scores['most_frequent'], scores['stratified'] 310 | if scores_mf['f1'] >= scores_st['f1']: 311 | scores_ = scores_mf 312 | scores_['strategy'] = 'most_frequent' 313 | else: 314 | scores_ = scores_st 315 | scores_['strategy'] = 'stratified' 316 | scores_['task'] = 'classification' 317 | elif task == 'regression': 318 | scores = dummy_regression_scores(y) 319 | scores_ = scores['mean'] 320 | scores_['strategy'] = 'mean' 321 | scores_['task'] = 'regression' 322 | else: 323 | raise ValueError("`task` must be 'classification' or 'regression', or 'auto' to decide automatically.") 324 | 325 | return scores_ 326 | 327 | 328 | def is_ordered(y: ArrayLike, q: float=0.95) -> bool: 329 | """ 330 | Decide if a single target is ordered. 331 | 332 | Args: 333 | y (array): A list of class labels. 334 | q (float): The confidence level, as a float in the range 0 to 1. 335 | Default: 0.95. 336 | 337 | Returns: 338 | bool: True if y is ordered. 
339 | 340 | Examples: 341 | >>> is_ordered(10 * ['top', 'top', 'middle', 'middle', 'bottom']) 342 | True 343 | >>> is_ordered(10 * [0, 0, 1, 1, 2, 2, 1, 1, 2, 2, 3, 3, 0, 0, 1, 1, 2, 2, 3, 3]) 344 | True 345 | >>> rng = np.random.default_rng(42) 346 | >>> is_ordered(rng.integers(low=0, high=9, size=200)) 347 | False 348 | """ 349 | y_ = np.asanyarray(y) 350 | if is_continuous(y_): 351 | raise ValueError('Cannot check order of continuous data.') 352 | if y_.ndim > 1: 353 | raise ValueError('Cannot check order of multilabel data.') 354 | sas = isinstance(y[0], str) 355 | m = Markov_chain.from_sequence(y_, strings_are_states=sas, include_self=True) 356 | chi2, crit, perc = m.chi_squared(q=q) 357 | return chi2 > crit 358 | -------------------------------------------------------------------------------- /tests/README.md: -------------------------------------------------------------------------------- 1 | ## Tests 2 | 3 | Note that most of the tests in `redflag` are doctests. The testing code is in the docstrings of the various functions, under the 'Examples' heading. 4 | 5 | There are some pytest files in `tests` as well. 6 | 7 | The Jupyter Notebooks in `docs/notebooks` are currently not run as part of the tests, but there is an open issue to implement this. 8 | 9 | Test options are in `pyproject.toml`, so to run the tests: clone the repo, install the dev dependencies (e.g. with `pip install .[dev]`) and do this from the root directory: 10 | 11 | pytest 12 | 13 | 14 | ## A note about NumPy dtypes 15 | 16 | Owing to an idiosyncrasy of 64-bit Windows machines, which count a 'long' int as 32-bit not 64, I have stopped `doctest` from comparing any `dtype=int64` or similar in test outputs. This is done by the custom `doctest.OutputChecker` in `tests/conftest.py`. It only runs on Windows 17 | machines (e.g. in the CI matrix). 18 | -------------------------------------------------------------------------------- /tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/scienxlab/redflag/f495ddd0729c7ac61dc8d8f54fc706178a0d253f/tests/__init__.py -------------------------------------------------------------------------------- /tests/conftest.py: -------------------------------------------------------------------------------- 1 | import doctest 2 | import re 3 | import platform 4 | 5 | 6 | OutputChecker = doctest.OutputChecker 7 | class CustomOutputChecker(OutputChecker): 8 | def check_output(self, want, got, optionflags): 9 | """ 10 | Remove the dtype from NumPy array reprs, to avoid some doctests 11 | failing on Windows, which often uses int32 instead of int64. 12 | """ 13 | pattern = re.compile(r"(array\(.+?)(, dtype=int)(32|64)(\))") 14 | want = pattern.sub(r"\1\4", want) 15 | got = pattern.sub(r"\1\4", got) 16 | return OutputChecker.check_output(self, want, got, optionflags) 17 | 18 | if platform.system() == 'Windows': 19 | doctest.OutputChecker = CustomOutputChecker 20 | -------------------------------------------------------------------------------- /tests/test_markov.py: -------------------------------------------------------------------------------- 1 | """ 2 | Tests for the Markov module. This code was originally implemented in 3 | https://github.com/agilescientific/striplog.
4 | """ 5 | import numpy as np 6 | 7 | from redflag.markov import Markov_chain 8 | 9 | data = "sssmmmlllmlmlsslsllsmmllllmssssllllssmmlllllssssssmmmmsmllllssslmslmsmmmslsllll""" 10 | 11 | def test_basics(): 12 | data = [[0, 37, 3, 2], 13 | [21, 0, 41, 14], 14 | [20, 25, 0, 0], 15 | [1, 14, 1, 0]] 16 | 17 | m = Markov_chain(data, states=['A', 'B', 'C', 'D']) 18 | 19 | ans = (35.73687369691601, 11.070497693516351, 0.9999989278539752) 20 | assert np.allclose(m.chi_squared(), ans) 21 | 22 | ans = np.array([[0., 31.27069125, 8.17143874, 2.55787001], 23 | [31.28238248, 0., 34.05692583, 10.66069169], 24 | [8.17137105, 34.04391563, 0., 2.78471333], 25 | [2.5579797, 10.65716447, 2.78485582, 0.]]) 26 | assert np.allclose(m.expected_counts, ans) 27 | 28 | 29 | def test_sequence(): 30 | m = Markov_chain.from_sequence(data, include_self=True) 31 | 32 | assert len(m._state_counts) == 3 33 | 34 | ans = np.array([[19., 5., 7.], 35 | [6., 9., 5.], 36 | [7., 6., 14.]]) 37 | assert np.allclose(m.observed_counts, ans) 38 | 39 | ans = np.array([[0.49712747, 0.19796476, 0.30490777], 40 | [0.49712747, 0.19796476, 0.30490777], 41 | [0.49712747, 0.19796476, 0.30490777]]) 42 | assert np.allclose(m.expected_freqs, ans) 43 | 44 | ans = np.array([[-2.24633883, -2.14054029, -2.81568096], 45 | [-1.81677174, 1.82886491, -0.94412655], 46 | [-2.68890472, -0.51627836, 0.76836845]]) 47 | assert np.allclose(m.normalized_difference, ans) 48 | 49 | 50 | def test_generate(): 51 | m = Markov_chain.from_sequence(data, include_self=True) 52 | 53 | assert len(m.generate_states()) == 10 54 | 55 | 56 | def test_step_2(): 57 | m = Markov_chain.from_sequence(data, include_self=True, step=2) 58 | 59 | assert m.observed_freqs.ndim == 3 60 | -------------------------------------------------------------------------------- /tests/test_pandas.py: -------------------------------------------------------------------------------- 1 | """Test Pandas accessors.""" 2 | import pytest 3 | import pandas as pd 4 | from redflag.pandas import null_decorator, SeriesAccessor 5 | 6 | 7 | c = pd.Series([1, 1, 1, 1, 1, 2, 2, 2, 3, 3]) 8 | r = pd.Series([0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0, 3.0]) 9 | 10 | 11 | def test_null_decorator(): 12 | @null_decorator('foo') 13 | def f(): 14 | return None 15 | assert f() is None 16 | 17 | 18 | def test_dummy_scores(): 19 | c_scores = c.redflag.dummy_scores(random_state=42) 20 | r_scores = r.redflag.dummy_scores(random_state=42) 21 | 22 | assert c_scores['roc_auc'] - 0.6801587301587301 < 1e-12 23 | assert r_scores['mean_squared_error'] - 0.5710743801652893 < 1e-12 24 | 25 | 26 | def test_imbalance(): 27 | assert c.redflag.is_imbalanced(threshold=0.24, method='tv') 28 | 29 | minorities = c.redflag.minority_classes() 30 | assert 2 in minorities and 3 in minorities 31 | 32 | imb_degree = c.redflag.imbalance_degree() 33 | assert imb_degree - 1.25 < 1e-9 34 | 35 | 36 | def test_is_ordered(): 37 | assert c.redflag.is_ordered() 38 | 39 | 40 | def test_is_ordered_warns_for_continuous_data(): 41 | with pytest.raises(ValueError, match='Cannot check order of continuous data.'): 42 | r.redflag.is_ordered() 43 | 44 | 45 | def test_warnings(): 46 | with pytest.warns(UserWarning, match="The Series does not seem categorical."): 47 | r.redflag.minority_classes() 48 | with pytest.warns(UserWarning, match="The Series does not seem categorical."): 49 | r.redflag.imbalance_degree() 50 | 51 | 52 | def test_series_categorical_report(): 53 | report_c = c.redflag.report() 54 | assert 'Categorical' in report_c 55 | 56 | 57 | def 
test_series_continuous_report(): 58 | report_r = r.redflag.report() 59 | assert 'Continuous' in report_r 60 | 61 | 62 | def test_feature_importances_docstring(): 63 | s = pd.DataFrame([c, r]).redflag.feature_importances.__doc__ 64 | assert s.strip().startswith("Estimate feature importances on a supervised task, given X and y.") 65 | -------------------------------------------------------------------------------- /tests/test_sklearn.py: -------------------------------------------------------------------------------- 1 | """Test sklearn classes.""" 2 | import pytest 3 | import numpy as np 4 | from sklearn.pipeline import make_pipeline 5 | from sklearn.datasets import make_classification, make_regression 6 | 7 | import redflag as rf 8 | 9 | """ 10 | NB Most of redflag is tested by its doctests, but doctest cannot test 11 | for warnings, AFAIK. Most of the tests in this file are of the sklearn API. 12 | """ 13 | 14 | def test_clip_detector(): 15 | """ 16 | Checks for clipped data. Detects clipping by looking for multiple values 17 | of max and/or min. 18 | """ 19 | pipe = make_pipeline(rf.ClipDetector()) 20 | X = np.array([[2, 1], [3, 2], [4, 3], [5, 3]]) 21 | with pytest.warns(UserWarning, match="Feature 1 has samples that may be clipped."): 22 | pipe.fit_transform(X) 23 | 24 | # Warns about y, but only on continuous data. 25 | rng = np.random.default_rng(0) 26 | X = rng.normal(size=(100, 2)) 27 | y = rng.normal(size=100) 28 | y[:3] = y.max() 29 | with pytest.warns(UserWarning, match="Target 0 has samples that may be clipped."): 30 | pipe.fit_transform(X, y) 31 | 32 | # Raises: 33 | pipe = make_pipeline(rf.ClipDetector(warn=False)) 34 | with pytest.raises(ValueError) as e: 35 | pipe.fit_transform(X, y) 36 | 37 | # Does not warn: 38 | X = np.array([[2, 1], [3, 2], [4, 3], [5, 4]]) 39 | pipe.fit_transform(X) 40 | 41 | 42 | def test_correlation_detector(): 43 | """ 44 | Checks for data which is correlated to itself. 45 | """ 46 | pipe = make_pipeline(rf.CorrelationDetector()) 47 | rng = np.random.default_rng(0) 48 | X = np.stack([rng.uniform(size=20), np.sin(np.linspace(0, 1, 20))]).T 49 | with pytest.warns(UserWarning, match="Feature 1 has samples that may be correlated."): 50 | pipe.fit_transform(X) 51 | 52 | 53 | def test_insufficient_data_detector(): 54 | """ 55 | Checks for too few samples. 56 | """ 57 | pipe = make_pipeline(rf.InsufficientDataDetector()) 58 | rng = np.random.default_rng(0) 59 | 60 | # Does not warn: 61 | X = rng.normal(size=(36, 6)) 62 | pipe.fit_transform(X) 63 | 64 | # Warns: 65 | X = rng.normal(size=(35, 6)) 66 | with pytest.warns(UserWarning, match="Dataset contains only 35 samples"): 67 | pipe.fit_transform(X) 68 | 69 | # Raises: 70 | pipe = make_pipeline(rf.InsufficientDataDetector(warn=False)) 71 | with pytest.raises(ValueError) as e: 72 | pipe.fit_transform(X) 73 | 74 | 75 | def test_multimodality_detector(): 76 | """ 77 | Checks for features with a multimodal distribution, considered across the 78 | entire dataset. 79 | """ 80 | pipe = make_pipeline(rf.MultimodalityDetector()) 81 | rng = np.random.default_rng(0) 82 | X1 = np.stack([rng.normal(size=80), rng.normal(size=80)]).T 83 | X2 = np.stack([rng.normal(size=80), 3 + rng.normal(size=80)]).T 84 | X = np.vstack([X1, X2]) 85 | with pytest.warns(UserWarning, match="Feature 1 has a multimodal distribution."): 86 | pipe.fit_transform(X) 87 | y = np.hstack([np.zeros(80), np.ones(80)]) 88 | 89 | # Does not warn. 
90 | pipe.fit(X, y) 91 | 92 | 93 | def test_custom_detector(): 94 | """ 95 | Checks for data which fails a user-supplied test. 96 | """ 97 | has_negative = lambda x: np.any(x < 0) 98 | pipe = rf.make_detector_pipeline({has_negative: "are negative"}) 99 | X = np.array([[-2, 1], [3, 2], [4, 3], [5, 4]]) 100 | with pytest.warns(UserWarning, match="Feature 0 has samples that are negative."): 101 | pipe.fit_transform(X) 102 | 103 | pipe = rf.make_detector_pipeline([has_negative]) 104 | with pytest.warns(UserWarning, match="Feature 0 has samples that fail"): 105 | pipe.fit_transform(X) 106 | 107 | detector = rf.Detector(has_negative) 108 | X = np.random.random(size=(100, 2)) 109 | y = np.random.random(size=100) - 0.1 110 | assert has_negative(y) 111 | assert rf.is_continuous(y) 112 | with pytest.warns(UserWarning, match="Target 0 has samples that fail"): 113 | pipe.fit_transform(X, y) 114 | 115 | 116 | def test_distribution_comparator(): 117 | """ 118 | Checks that the distribution of test data (i.e. transformed only) is the 119 | same as the distribution of the training data (i.e. fit and transformed). 120 | """ 121 | pipe = make_pipeline(rf.DistributionComparator(threshold=0.5)) 122 | rng = np.random.default_rng(0) 123 | X = rng.normal(size=(1_000, 2)) 124 | pipe.fit_transform(X) # fit() never throws a warning, just learns the distribution. 125 | 126 | # Throws a warning on test data (tested against training statistics): 127 | X_test = 1 + rng.normal(size=(500, 2)) 128 | with pytest.warns(UserWarning, match="Features 0, 1 have distributions that are different from training."): 129 | pipe.transform(X_test) 130 | 131 | # Does not warn if distribution is the same: 132 | X_test = rng.normal(size=(500, 2)) 133 | pipe.fit_transform(X) 134 | 135 | 136 | def test_univariate_outlier_detector(): 137 | # Use a factor of 0.5 to almost guarantee that this will throw a warning. 138 | pipe = make_pipeline(rf.UnivariateOutlierDetector(factor=0.5)) 139 | rng = np.random.default_rng(0) 140 | X = rng.normal(size=1_000).reshape(-1, 1) 141 | with pytest.warns(UserWarning, match="Feature 0 has samples that are excess univariate outliers"): 142 | pipe.fit_transform(X) 143 | 144 | # Does not warn with factor of 2.5: 145 | pipe = make_pipeline(rf.UnivariateOutlierDetector(factor=2.5)) 146 | pipe.fit_transform(X) 147 | 148 | 149 | def test_multivariate_outlier_detector(): 150 | # Use a factor of 0.5 to almost guarantee that this will throw a warning. 151 | pipe = make_pipeline(rf.MultivariateOutlierDetector(factor=0.5)) 152 | rng = np.random.default_rng(0) 153 | X = rng.normal(size=(1_000, 2)) 154 | with pytest.warns(UserWarning, match="Dataset has more multivariate outlier samples than expected."): 155 | pipe.fit_transform(X) 156 | 157 | # Warns for y too. 158 | pipe = make_pipeline(rf.MultivariateOutlierDetector(factor=0.5, p=0.8)) 159 | X = rng.uniform(size=(1_000, 2)) 160 | y = rng.normal(size=1_000) 161 | # y[:100] = 10 162 | with pytest.warns(UserWarning, match="Target has more univariate outlier samples than expected."): 163 | pipe.fit_transform(X, y) 164 | 165 | # Does not warn with factor of 2.5: 166 | pipe = make_pipeline(rf.MultivariateOutlierDetector(factor=2.5)) 167 | pipe.fit_transform(X) 168 | 169 | # Does not warn for y. 170 | y = rng.normal(size=1_000) 171 | pipe.fit(X, y) 172 | 173 | 174 | def test_outlier_detector(): 175 | # Use a factor of 0.5 to almost guarantee that this will throw a warning. 
176 | pipe = make_pipeline(rf.OutlierDetector(factor=0.5)) 177 | rng = np.random.default_rng(0) 178 | X = rng.normal(size=(1_000, 2)) 179 | with pytest.warns(UserWarning, match="There are more outliers than expected in the training data"): 180 | pipe.fit_transform(X) 181 | 182 | # Throws a warning on test data (tested against training statistics): 183 | X_test = rng.normal(size=(500, 2)) 184 | with pytest.warns(UserWarning, match="There are more outliers than expected in the data"): 185 | pipe.transform(X_test) 186 | 187 | # Does not warn with factor of 2: 188 | pipe = make_pipeline(rf.OutlierDetector(factor=2.0)) 189 | pipe.fit_transform(X) 190 | 191 | 192 | def test_imbalance_detector(): 193 | pipe = make_pipeline(rf.ImbalanceDetector()) 194 | rng = np.random.default_rng(0) 195 | X = rng.normal(size=(100, 1)) 196 | y = rf.generate_data([20, 80]) 197 | with pytest.warns(UserWarning, match="The labels are imbalanced"): 198 | pipe.fit_transform(X, y) 199 | 200 | # Check other method. 201 | pipe = make_pipeline(rf.ImbalanceDetector(method='ir', threshold=2)) 202 | with pytest.warns(UserWarning, match="The labels are imbalanced"): 203 | pipe.fit_transform(X, y) 204 | 205 | # Does not warn with higher threshold (summary statistic for this y is 0.6): 206 | pipe = make_pipeline(rf.ImbalanceDetector(threshold=0.7)) 207 | pipe.fit_transform(X, y) 208 | 209 | # Warns about wrong kind of y (continuous): 210 | y = rng.normal(size=100) 211 | with pytest.warns(UserWarning, match="Target y seems continuous"): 212 | pipe.fit_transform(X, y) 213 | 214 | # No warning if y is None, just skips. 215 | pipe.fit_transform(X) 216 | 217 | # Raises error because method doesn't exist: 218 | with pytest.raises(ValueError) as e: 219 | pipe = make_pipeline(rf.ImbalanceDetector(method='foo')) 220 | 221 | # Raises error because threshold is wrong. 222 | with pytest.raises(ValueError) as e: 223 | pipe = make_pipeline(rf.ImbalanceDetector(method='ir', threshold=0.5)) 224 | 225 | # Raises error because threshold is wrong. 226 | with pytest.raises(ValueError) as e: 227 | pipe = make_pipeline(rf.ImbalanceDetector(method='id', threshold=2)) 228 | 229 | 230 | def test_imbalance_comparator(): 231 | """ 232 | The 'comparator' learns the imbalance statistics of the training set, 233 | then compares subsequent sets to the learned stats. 234 | """ 235 | # We need to use the special redflag pipeline object, which passes 236 | # both X and y to `transform()`. 237 | pipe = rf.make_rf_pipeline(rf.ImbalanceComparator()) 238 | 239 | # The rest is standard. 240 | rng = np.random.default_rng(0) 241 | X = rng.normal(size=(200, 1)) 242 | y = rf.generate_data([20, 20, 20, 140]) 243 | 244 | # Does not raise a warning because we're only fitting. 245 | pipe.fit(X, y) 246 | 247 | # Warns about different number of minority classes. 248 | y = rf.generate_data([20, 20, 80, 80]) 249 | with pytest.warns(UserWarning, match="There is a different number"): 250 | pipe.transform(X, y) 251 | 252 | # Warns about wrong kind of y (continuous): 253 | y = rng.normal(size=100) 254 | with pytest.warns(UserWarning, match="Target y seems continuous"): 255 | pipe.fit_transform(X, y) 256 | with pytest.warns(UserWarning, match="Target y seems continuous"): 257 | pipe.transform(X, y) 258 | 259 | # No warning if y is None, just skips: 260 | pipe.fit_transform(X) 261 | 262 | # Raises error because threshold is wrong. 
263 | with pytest.raises(ValueError) as e: 264 | pipe = make_pipeline(rf.ImbalanceComparator(method='ir', threshold=0.5)) 265 | 266 | # Raises error because threshold is wrong. 267 | with pytest.raises(ValueError) as e: 268 | pipe = make_pipeline(rf.ImbalanceComparator(method='id', threshold=2)) 269 | 270 | 271 | def test_importance_detector(): 272 | # Raises error because method doesn't exist: 273 | with pytest.raises(ValueError) as e: 274 | pipe = make_pipeline(rf.ImportanceDetector(threshold=2)) 275 | 276 | pipe = make_pipeline(rf.ImportanceDetector(random_state=0)) 277 | 278 | # Warns about low importance. 279 | X, y = make_classification(n_samples=200, n_features=4, n_informative=3, n_redundant=0, n_classes=2, random_state=42) 280 | with pytest.warns(UserWarning, match="Feature 3 has low importance"): 281 | pipe.fit_transform(X, y) 282 | 283 | # Warns about high importance. 284 | X, y = make_classification(n_samples=200, n_features=3, n_informative=2, n_redundant=0, n_classes=2, random_state=42) 285 | with pytest.warns(UserWarning, match="Feature 1 has very high importance"): 286 | pipe.fit_transform(X, y) 287 | 288 | # Warns about wrong kind of y. 289 | y = None 290 | with pytest.warns(UserWarning, match="Target y is None"): 291 | pipe.fit_transform(X, y) 292 | 293 | 294 | def test_dummy_predictor(): 295 | """ 296 | Checks that the dummy regressor and classifier work as expected. 297 | """ 298 | pipe = make_pipeline(rf.DummyPredictor(random_state=42)) 299 | 300 | # Regression: 301 | X, y = make_regression(random_state=42) 302 | with pytest.warns(UserWarning, match="Dummy regressor scores:"): 303 | pipe.fit_transform(X, y) 304 | 305 | # Classification: 306 | X, y = make_classification(random_state=42) 307 | with pytest.warns(UserWarning, match="Dummy classifier scores:"): 308 | pipe.fit_transform(X, y) 309 | 310 | # Warns about wrong kind of y. 311 | y = None 312 | with pytest.warns(UserWarning, match="Target y is None"): 313 | pipe.fit_transform(X, y) 314 | --------------------------------------------------------------------------------
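For orientation, here is a minimal sketch of how the Series accessor defined in `src/redflag/pandas.py` is used, following the same calls and toy data as `tests/test_pandas.py`; the commented return values are indicative, not exact reprs.

import pandas as pd
import redflag.pandas  # noqa: F401 -- importing this module registers the .redflag accessors

# The same toy Series as tests/test_pandas.py: three classes, clearly imbalanced.
c = pd.Series([1, 1, 1, 1, 1, 2, 2, 2, 3, 3])

print(c.redflag.minority_classes())                          # contains 2 and 3
print(c.redflag.imbalance_degree())                          # about 1.25 for this data
print(c.redflag.is_imbalanced(threshold=0.24, method='tv'))  # True
print(c.redflag.report(random_state=42))                     # fills the 'categorical' template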
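The tests above exercise each sklearn detector in isolation; in ordinary use they are meant to sit inside a pipeline ahead of the real preprocessing and model. A minimal sketch follows, assuming the detectors behave as passthrough transformers (as the pipeline tests suggest); the synthetic data, the StandardScaler and the LogisticRegression estimator are illustrative choices, not part of the repository.

import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

import redflag as rf

rng = np.random.default_rng(0)
X = rng.normal(size=(200, 3))
y = (X[:, 0] + rng.normal(scale=0.5, size=200) > 0).astype(int)  # roughly balanced binary target

# The detectors pass X through unchanged and report anything suspicious
# as a UserWarning rather than interrupting the fit.
pipe = make_pipeline(
    rf.ClipDetector(),
    rf.ImbalanceDetector(),
    StandardScaler(),
    LogisticRegression(),
)
pipe.fit(X, y)
print(pipe.score(X, y))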