├── .github
├── ISSUE_TEMPLATE
│ └── bug_report.md
└── workflows
│ ├── build-test.yml
│ ├── publish-docs.yml
│ └── pypi-publish.yml
├── .gitignore
├── AUTHORS.md
├── CHANGELOG.md
├── CODE_OF_CONDUCT.md
├── CONTRIBUTING.md
├── LICENSE
├── README.md
├── SECURITY.md
├── docs
├── Makefile
├── _static
│ ├── custom.css
│ ├── favicon.ico
│ ├── redflag.svg
│ ├── redflag_logo.png
│ └── redflag_social.svg
├── authors.md
├── changelog.md
├── conf.py
├── contributing.md
├── development.md
├── index.rst
├── installation.md
├── license.md
├── notebooks
│ ├── Basic_usage.ipynb
│ ├── Demo.ipynb
│ ├── Tutorial.ipynb
│ ├── Using_redflag_with_Pandas.ipynb
│ ├── Using_redflag_with_sklearn.ipynb
│ └── _Pandas_accessor.ipynb
├── post_process_html.py
├── pre_process_ipynb.py
├── redflag.rst
└── what_is_redflag.md
├── pyproject.toml
├── src
└── redflag
│ ├── __init__.py
│ ├── distributions.py
│ ├── imbalance.py
│ ├── importance.py
│ ├── independence.py
│ ├── markov.py
│ ├── outliers.py
│ ├── pandas.py
│ ├── sklearn.py
│ ├── target.py
│ └── utils.py
└── tests
├── README.md
├── __init__.py
├── conftest.py
├── test_markov.py
├── test_pandas.py
└── test_sklearn.py
/.github/ISSUE_TEMPLATE/bug_report.md:
--------------------------------------------------------------------------------
1 | ---
2 | name: Bug report
3 | about: Create a report to help us improve
4 | title: ''
5 | labels: bug
6 | assignees: ''
7 |
8 | ---
9 |
10 | **Describe the bug**
11 | A clear and concise description of what the bug is.
12 |
13 | **To reproduce**
14 | Steps to reproduce the behavior:
15 | 1. Go to '...'
16 | 2. Click on '....'
17 | 3. Scroll down to '....'
18 | 4. See error
19 |
20 | **Expected behavior**
21 | A clear and concise description of what you expected to happen.
22 |
23 | **Desktop (please complete the following information):**
24 | - OS: [e.g. iOS]
25 |
26 | **Additional context**
27 | Add any other context about the problem here.
28 |
--------------------------------------------------------------------------------
/.github/workflows/build-test.yml:
--------------------------------------------------------------------------------
1 | name: Tests
2 |
3 | on:
4 | workflow_call:
5 | workflow_dispatch:
6 | push:
7 | branches: [ main, develop ]
8 | pull_request:
9 | branches: [ main ]
10 |
11 | jobs:
12 | build:
13 | runs-on: ${{ matrix.os }}
14 | strategy:
15 | fail-fast: false
16 | matrix:
17 | os: [ubuntu-latest, macos-latest, windows-latest]
18 | python-version: ["3.8", "3.9", "3.10", "3.11", "3.12"]
19 |
20 | steps:
21 |
22 | - uses: actions/checkout@v4
23 |
24 | - name: Set up Python ${{ matrix.python-version }}
25 | uses: actions/setup-python@v5
26 | with:
27 | python-version: ${{ matrix.python-version }}
28 | allow-prereleases: true
29 |
30 | - name: Install dependencies
31 | run: |
32 | python -m pip install --upgrade pip
33 | pip install .[dev]
34 |
35 | - name: Test with pytest
36 | run: |
37 | pytest
38 |
--------------------------------------------------------------------------------
/.github/workflows/publish-docs.yml:
--------------------------------------------------------------------------------
1 | name: Docs
2 |
3 | on:
4 | workflow_call:
5 | workflow_dispatch:
6 |
7 | jobs:
8 | deploy:
9 | runs-on: ubuntu-latest
10 | steps:
11 |
12 | - uses: actions/checkout@v4
13 |
14 | - name: Set up Python
15 | uses: actions/setup-python@v5
16 | with:
17 | python-version: '3.x'
18 |
19 | - name: Install package
20 | run: |
21 | python -m pip install --upgrade pip
22 | pip install .[dev]
23 |
24 | - name: Build docs
25 | run: |
26 | cd docs
27 | make html
28 |
29 | - name: Publish docs
30 | uses: JamesIves/github-pages-deploy-action@v4
31 | with:
32 | branch: gh-pages
33 | folder: docs/_build/html
34 |
--------------------------------------------------------------------------------
/.github/workflows/pypi-publish.yml:
--------------------------------------------------------------------------------
1 | name: Publish to PyPI
2 |
3 | on:
4 | release:
5 | types: [published]
6 |
7 | jobs:
8 | tests:
9 | uses: ./.github/workflows/build-test.yml
10 |
11 | docs:
12 | uses: ./.github/workflows/publish-docs.yml
13 |
14 | deploy:
15 | needs: [tests, docs]
16 | runs-on: ubuntu-latest
17 | steps:
18 |
19 | - uses: actions/checkout@v4
20 |
21 | - name: Set up Python
22 | uses: actions/setup-python@v5
23 | with:
24 | python-version: '3.x'
25 |
26 | - name: Install package
27 | run: |
28 | python -m pip install --upgrade pip
29 | pip install .[dev]
30 |
31 | - name: Build package
32 | run: python -m build
33 |
34 | - name: Publish package
35 | uses: pypa/gh-action-pypi-publish@27b31702a0e7fc50959f5ad993c78deac1bdfc29
36 | with:
37 | user: __token__
38 | password: ${{ secrets.PYPI_API_TOKEN }}
39 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | # Mac
2 | .DS_Store
3 |
4 | # Version file autocreated in pyproject.toml and redflag/__init__.py
5 | _version.py
6 |
7 | # Processed docs
8 | _notebooks
9 |
10 | # API docs are built
11 | docs/redflag.*.rst
12 |
13 | # Other
14 | .vscode
15 |
16 | # Byte-compiled / optimized / DLL files
17 | __pycache__/
18 | *.py[cod]
19 | *$py.class
20 |
21 | # C extensions
22 | *.so
23 |
24 | # Distribution / packaging
25 | .Python
26 | build/
27 | develop-eggs/
28 | dist/
29 | downloads/
30 | eggs/
31 | .eggs/
32 | lib/
33 | lib64/
34 | parts/
35 | sdist/
36 | var/
37 | wheels/
38 | pip-wheel-metadata/
39 | share/python-wheels/
40 | *.egg-info/
41 | .installed.cfg
42 | *.egg
43 | MANIFEST*
44 |
45 | # PyInstaller
46 | # Usually these files are written by a python script from a template
47 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
48 | *.manifest
49 | *.spec
50 |
51 | # Installer logs
52 | pip-log.txt
53 | pip-delete-this-directory.txt
54 |
55 | # Unit test / coverage reports
56 | htmlcov/
57 | .tox/
58 | .nox/
59 | .coverage
60 | .coverage.*
61 | .cache
62 | nosetests.xml
63 | coverage.xml
64 | *.cover
65 | *.py,cover
66 | .hypothesis/
67 | .pytest_cache/
68 |
69 | # Translations
70 | *.mo
71 | *.pot
72 |
73 | # Django stuff:
74 | *.log
75 | local_settings.py
76 | db.sqlite3
77 | db.sqlite3-journal
78 |
79 | # Flask stuff:
80 | instance/
81 | .webassets-cache
82 |
83 | # Scrapy stuff:
84 | .scrapy
85 |
86 | # Sphinx documentation
87 | docs/_build/
88 |
89 | # PyBuilder
90 | target/
91 |
92 | # Jupyter Notebook
93 | .ipynb_checkpoints
94 |
95 | # IPython
96 | profile_default/
97 | ipython_config.py
98 |
99 | # pyenv
100 | .python-version
101 |
102 | # pipenv
103 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
104 | # However, in case of collaboration, if having platform-specific dependencies or dependencies
105 | # having no cross-platform support, pipenv may install dependencies that don't work, or not
106 | # install all needed dependencies.
107 | #Pipfile.lock
108 |
109 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow
110 | __pypackages__/
111 |
112 | # Celery stuff
113 | celerybeat-schedule
114 | celerybeat.pid
115 |
116 | # SageMath parsed files
117 | *.sage.py
118 |
119 | # Environments
120 | .env
121 | .venv
122 | env/
123 | venv/
124 | ENV/
125 | env.bak/
126 | venv.bak/
127 |
128 | # Spyder project settings
129 | .spyderproject
130 | .spyproject
131 |
132 | # Rope project settings
133 | .ropeproject
134 |
135 | # mkdocs documentation
136 | /site
137 |
138 | # mypy
139 | .mypy_cache/
140 | .dmypy.json
141 | dmypy.json
142 |
143 | # Pyre type checker
144 | .pyre/
145 |
--------------------------------------------------------------------------------
/AUTHORS.md:
--------------------------------------------------------------------------------
1 | # Authors
2 |
3 | The following people have contributed to the project (in alphabetical order):
4 |
5 | - [Matt Hall](https://github.com/kwinkunks), Agile Scientific, Canada (ORCID: [0000-0002-4054-8295](https://orcid.org/0000-0002-4054-8295))
6 |
--------------------------------------------------------------------------------
/CHANGELOG.md:
--------------------------------------------------------------------------------
1 | # Changelog
2 |
3 |
4 | ## 0.5.0, 21 April 2024
5 |
6 | - This release makes more changes to the tests and documentation in response to the review process for [the submission](https://joss.theoj.org/papers/e1ca575ec0c5344144f87176539ef547) to JOSS (see below).
7 | - In particular, see the following issue: [#97](https://github.com/scienxlab/redflag/issues/97)
8 | - Changed the method of handling dynamic versioning. For now the package `__version__` attribute is still defined, but it is deprecated and will be removed in `0.6.0`. Use `importlib.metadata.version('redflag')` to get the version information instead.
9 | - Changed the default `get_outliers()` method from isolation forest (`'iso'`) to Mahalanobis (`'mah'`) to match other functions, eg `has_outliers()` and the `sklearn` pipeline object.
10 | - Updated `actions/setup-python` to use v5.
11 |
12 |
13 | ## 0.4.2, 10 December 2023
14 |
15 | - This is a minor release making changes to the tests and documentation in response to the review process for [a submission](https://joss.theoj.org/papers/e1ca575ec0c5344144f87176539ef547) to [The Journal of Open Source Software](https://joss.theoj.org) (JOSS).
16 | - See the following issues: [#89](https://github.com/scienxlab/redflag/issues/89), [#90](https://github.com/scienxlab/redflag/issues/90), [#91](https://github.com/scienxlab/redflag/issues/91), [#92](https://github.com/scienxlab/redflag/issues/92), [#93](https://github.com/scienxlab/redflag/issues/93), [#94](https://github.com/scienxlab/redflag/issues/94) and [#95](https://github.com/scienxlab/redflag/issues/95).
17 | - Now building and testing on Windows and MacOS as well as Linux.
18 | - Python version `3.12` added to package classifiers
19 | - Python version `3.12` tested during CI
20 |
21 |
22 | ## 0.4.1, 2 October 2023
23 |
24 | - This is a minor release intended to preview new `pandas`-related features for version 0.5.0.
25 | - Added another `pandas` Series accessor, `is_imbalanced()`.
26 | - Added two `pandas` DataFrame accessors, `feature_importances()` and `correlation_detector()`. These are experimental features.
27 |
28 |
29 | ## 0.4.0, 28 September 2023
30 |
31 | - `redflag` can now be installed by the `conda` package and environment manager. To do so, use `conda install -c conda-forge redflag`.
32 | - All of the `sklearn` components can now be instantiated with `warn=False` in order to trigger a `ValueException` instead of a warning. This allows you to build pipelines that will break if a detector is triggered.
33 | - Added `redflag.target.is_ordered()` to check if a single-label categorical target is ordered in some way. The test uses a Markov chain analysis, applying chi-squared test to the transition matrix. In general, the Boolean result should only be used on targets with several classes, perhaps at least 10. Below that, it seems to give a lot of false positives.
34 | - You can now pass `groups` to `redflag.distributions.is_multimodal()`. If present, the modality will be checked for each group, returning a Boolean array of values (one for each group). This allows you to check a feature partitioned by target class, for example.
35 | - Added `redflag.sklearn.MultimodalityDetector` to provide a way to check for multimodal features. If `y` is passed and is categorical, it will be used to partition the data and modality will be checked for each class.
36 | - Added `redflag.sklearn.InsufficientDataDetector` which checks that there are at least M² records (rows in `X`), where M is the number of features (i.e. columns) in `X`.
37 | - Removed `RegressionMultimodalDetector`. Use `MultimodalityDetector` instead.
38 |
39 |
40 | ## 0.3.0, 21 September 2023
41 |
42 | - Added some accessors to give access to `redflag` functions directly from `pandas.Series` objects, via an 'accessor'. For example, for a Series `s`, one can call `minority_classes = s.redflag.minority_classes()` instead of `redflag.minority_classes(s)`. Other functions include `imbalance_degree()`, `dummy_scores()` (see below). Probably not very useful yet, but future releases will add some reporting functions that wrap multiple Redflag functions. **This is an experimental feature and subject to change.**
43 | - Added a Series accessor `report()` to perform a range of tests and make a small text report suitable for printing. Access for a Series `s` like `s.redflag.report()`. **This is an experimental feature and subject to change.**
44 | - Added new documentation page for the Pandas accessor.
45 | - Added `redflag.target.dummy_classification_scores()`, `redflag.target.dummy_regression_scores()`, which train a dummy (i.e. naive) model and compute various relevant scores (MSE and R2 for regression, F1 and ROC-AUC for classification tasks). Additionally, both `most_frequent` and `stratified` strategies are tested for classification tasks; only the `mean` strategy is employed for regression tasks. The helper function `redflag.target.dummy_scores()` tries to guess what kind of task suits the data and calls the appropriate function.
46 | - Moved `redflag.target.update_p()` to `redflag.utils`.
47 | - Added `is_imbalanced()` to return a Boolean depending on a threshold of imbalance degree. Default threshold is 0.5 but the best value is up for debate.
48 | - Removed `utils.has_low_distance_stdev`.
49 |
50 |
51 | ## 0.2.0, 4 September 2023
52 |
53 | - Moved to something more closely resembling semantic versioning, which is the main reason this is version 0.2.0.
54 | - Builds and tests on Python 3.11 have been successful, so now supporting this version.
55 | - Added custom 'alarm' `Detector`, which can be instantiated with a function and a warning to emit when the function returns True for a 1D array. You can easily write your own detectors with this class.
56 | - Added `make_detector_pipeline()` which can take sequences of functions and warnings (or a mapping of functions to warnings) and returns a `scikit-learn.pipeline.Pipeline` containing a `Detector` for each function.
57 | - Added `RegressionMultimodalDetector` to allow detection of non-unimodal distributions in features, when considered across the entire dataset. (Coming soon, a similar detector for classification tasks that will partition the data by class.)
58 | - Redefined `is_standardized` (deprecated) as `is_standard_normal`, which implements the Kolmogorov–Smirnov test. It seems more reliable than assuming the data will have a mean of almost exactly 0 and standard deviation of exactly 1, when all we really care about is that the feature is roughly normal.
59 | - Changed the wording slightly in the existing detector warning messages.
60 | - No longer warning if `y` is `None` in, eg, `ImportanceDetector`, since you most likely know this.
61 | - Some changes to `ImportanceDetector`. It now uses KNN estimators instead of SVMs as the third measure of importance; the SVMs were too unstable, causing numerical issues. It also now requires that the number of important features is less than the total number of features to be triggered. So if you have 2 features and both are important, it does not trigger.
62 | - Improved `is_continuous()` which was erroneously classifying integer arrays with many consecutive values as non-continuous.
63 | - Note that `wasserstein` no longer checks that the data are standardized; this check will probably return in the future, however.
64 | - Added a `Tutorial.ipynb` notebook to the docs.
65 | - Added a **Copy** button to code blocks in the docs.
66 |
67 |
68 | ## 0.1.10, 21 November 2022
69 |
70 | - Added `redflag.importance.least_important_features()` and `redflag.importance.most_important_features()`. These functions are complementary (in other words, if the same threshold is used in each, then between them they return all of the features). The default threshold for importance is half the expected value. E.g. if there are 5 features, then the default threshold is half of 0.2, or 0.1. Part of [Issue 2](https://github.com/scienxlab/redflag/issues/2).
71 | - Added `redflag.sklearn.ImportanceDetector` class, which warns if 1 or 2 features have anomalously high importance, or if some features have anomalously low importance. Part of [Issue 2](https://github.com/scienxlab/redflag/issues/2).
72 | - Added `redflag.sklearn.ImbalanceComparator` class, which learns the imbalance present in the training data, then compares what is observed in subsequent data (evaluation, test, or production data). If there's a difference, it throws a warning. Note: it does not warn if there is imbalance present in the training data; use `ImbalanceDetector` for that.
73 | - Added `redflag.sklearn.RfPipeline` class, which is needed to include the `ImbalanceComparator` in a pipeline (because the common-or-garden `sklearn.pipeline.Pipeline` class does not pass `y` into a transformer's `transform()` method). Also added the `redflag.sklearn.make_rf_pipeline()` function to help make pipelines with this special class. These components are straight-up forks of the code in `scikit-learn` (3-clause BSD licensed).
74 | - Added example to `docs/notebooks/Using_redflag_with_sklearn.ipynb` to show how to use these new objects.
75 | - Improved `redflag.is_continuous()`, which was buggy; see [Issue 3](https://github.com/scienxlab/redflag/issues/3). It still fails on some cases. I'm not sure a definitive test for continuousness (or, conversely, discreteness) is possible; it's just a heuristic.
76 |
77 |
78 | ## 0.1.9, 25 August 2022
79 |
80 | - Added some experimental `sklearn` transformers that implement various `redflag` tests. These do not transform the data in any way, they just inspect the data and emit warnings if tests fail. The main ones are: `redflag.sklearn.ClipDetector`, `redflag.sklearn.OutlierDetector`, `redflag.sklearn.CorrelationDetector`, `redflag.sklearn.ImbalanceDetector`, and `redflag.sklearn.DistributionComparator`.
81 | - Added tests for the `sklearn` transformers. These are in the `redflag/tests/test_redflag.py` file, whereas all other tests are doctests. You can run all the tests at once with `pytest`; coverage is currently 94%.
82 | - Added `docs/notebooks/Using_redflag_with_sklearn.ipynb` to show how to use these new objects in an `sklearn` pipeline.
83 | - Since there's quite a bit of `sklearn` code in the `redflag` package, it is now a hard dependency. I removed the other dependencies because they are all dependencies of `sklearn`.
84 | - Added `redflag.has_outliers()` to make it easier to check for excessive outliers in a dataset. This function only uses Mahalanobis distance and always works in a multivariate sense.
85 | - Reorganized the `redflag.features` module into new modules: `redflag.distributions`, `redflag.outliers`, and `redflag.independence`. All of the functions are still imported into the `redflag` namespace, so this doesn't affect existing code.
86 | - Added examples to `docs/notebooks/Basic_usage.ipynb`.
87 | - Removed the `class_imbalance()` function, which was confusing. Use `imbalance_ratio()` instead.
88 |
89 |
90 | ## 0.1.8, 8 July 2022
91 |
92 | - Added Wasserstein distance comparisons for univariate and multivariate distributions. This works for either a `groups` array, or for multiple dataset splits if that's more convenient.
93 | - Improved `get_outliers()`, removing OneClassSVM method and adding EllipticEnvelope and Mahalanobis distance.
94 | - Added Mahalanobis distance outlier detection function to serve `get_outliers()` or be used on its own. Reproduces the results `zscore_outliers()` used to give for univariate data, so removed that.
95 | - Added `kde_peaks()` function to find peaks in a kernel density estimate. This also needed some other functions, including `fit_kde()`, `get_kde()`, `find_large_peaks()`, and the bandwidth estimators, `bw_silverman()` and `bw_scott()`.
96 | - Added `classes` argument to the class imbalance function, in case there are classes with no data, or to override the classes in the data.
97 | - Fixed a bug in the `feature_importances()` function.
98 | - Fixed a bug in the `is_continuous()` function.
99 | - Improved the `Using_redflag.ipynb` notebook.
100 | - Added `has_nans()`, `has_monotonic()`, and `has_flat()` functions to detect interpolation issues.
101 | - Moved some more helper functions into utils, eg `iter_groups()`, `ecdf()`, `flatten()`, `stdev_to_proportion()` and `proportion_to_stdev()`.
102 | - Wrote a lot more tests, coverage is now at 95%.
103 |
104 |
105 | ## 0.1.3 to 0.1.7, 9–11 February 2022
106 |
107 | - Added `utils.has_low_distance_stdev`.
108 | - Added `utils.has_few_samples`.
109 | - Added `utils.is_standardized()` function to test if a feature or regression target appears to be a Z-score.
110 | - Changed name of `clips()` function to `clipped()` to be more predictable (it goes with `is_clipped()`).
111 | - Documentation.
112 | - CI workflow seems to be stable.
113 | - Mostly just a lot of flailing.
114 |
115 |
116 | ## 0.1.2, 1 February 2022
117 |
118 | - Early release.
119 | - Added auto-versioning.
120 |
121 |
122 | ## 0.1.1, 31 January 2022
123 |
124 | - Early release.
125 |
126 |
127 | ## 0.1.0, 30 January 2022
128 |
129 | - Early release.
130 |
--------------------------------------------------------------------------------
/CODE_OF_CONDUCT.md:
--------------------------------------------------------------------------------
1 | # Contributor Covenant Code of Conduct
2 |
3 | ## Our pledge
4 |
5 | In the interest of fostering an open and welcoming environment, we as
6 | contributors and maintainers pledge to make participation in our project and
7 | our community a harassment-free experience for everyone, regardless of age, body
8 | size, disability, ethnicity, sex characteristics, gender identity and expression,
9 | level of experience, education, socio-economic status, nationality, personal
10 | appearance, race, religion, or sexual identity and orientation.
11 |
12 | ## Our standards
13 |
14 | Examples of behavior that contributes to creating a positive environment
15 | include:
16 |
17 | * Using welcoming and inclusive language
18 | * Being respectful of differing viewpoints and experiences
19 | * Gracefully accepting constructive criticism
20 | * Focusing on what is best for the community
21 | * Showing empathy towards other community members
22 |
23 | Examples of unacceptable behavior by participants include:
24 |
25 | * The use of sexualized language or imagery and unwelcome sexual attention or
26 | advances
27 | * Racist or racially biased remarks, attacks, or content.
28 | * Trolling, insulting/derogatory comments, and personal or political attacks
29 | * Public or private harassment
30 | * Publishing others' private information, such as a physical or electronic
31 | address, without explicit permission
32 | * Other conduct which could reasonably be considered inappropriate in a
33 | professional setting
34 |
35 | ## Our responsibilities
36 |
37 | Project maintainers are responsible for clarifying the standards of acceptable
38 | behavior and are expected to take appropriate and fair corrective action in
39 | response to any instances of unacceptable behavior.
40 |
41 | Project maintainers have the right and responsibility to remove, edit, or
42 | reject comments, commits, code, wiki edits, issues, and other contributions
43 | that are not aligned to this Code of Conduct, or to ban temporarily or
44 | permanently any contributor for other behaviors that they deem inappropriate,
45 | threatening, offensive, or harmful.
46 |
47 | ## Scope
48 |
49 | This Code of Conduct applies within all project spaces, and it also applies when
50 | an individual is representing the project or its community in public spaces.
51 | Examples of representing a project or community include using an official
52 | project e-mail address, posting via an official social media account, or acting
53 | as an appointed representative at an online or offline event. Representation of
54 | a project may be further defined and clarified by project maintainers.
55 |
56 | ## Enforcement
57 |
58 | Instances of abusive, harassing, or otherwise unacceptable behavior may be
59 | reported by contacting any of the following people:
60 |
61 | - Matt Hall, [kwinkunks@gmail.com](mailto:kwinkunks@gmail.com)
62 |
63 | All complaints will be reviewed and investigated and will result in a response that
64 | is deemed necessary and appropriate to the circumstances. The project team is
65 | obligated to maintain confidentiality with regard to the reporter of an incident.
66 | Further details of specific enforcement policies may be posted separately.
67 |
68 | Project maintainers who do not follow or enforce the Code of Conduct in good
69 | faith may face temporary or permanent repercussions as determined by other
70 | members of the project's leadership.
71 |
72 | ## Attribution
73 |
74 | This Code of Conduct is adapted from the [Contributor Covenant](https://www.contributor-covenant.org/), version 1.4,
75 | available at https://www.contributor-covenant.org/version/1/4/code-of-conduct.html
76 |
77 | For answers to common questions about this code of conduct, see
78 | https://www.contributor-covenant.org/faq
79 |
--------------------------------------------------------------------------------
/CONTRIBUTING.md:
--------------------------------------------------------------------------------
1 | # Contributing
2 |
3 | **🙌 Thank you for considering contributing to this project!**
4 |
5 | There are several important ways you can help; here are some examples:
6 |
7 | - Submitting bug reports and feature requests: see [Issues](https://github.com/scienxlab/redflag/issues).
8 | - Proposing code for bug fixes and new features, then [making a pull request](https://docs.github.com/en/pull-requests/collaborating-with-pull-requests/proposing-changes-to-your-work-with-pull-requests/about-pull-requests).
9 | - Fixing typos and generally improving the documentation.
10 | - Writing tutorials, examples, and how-to documents.
11 |
12 |
13 | ## Code of conduct
14 |
15 | We're fortunate to be part of a large professional community that conducts itself with mutual respect and consideration for others. Scienxlab's [Code of Conduct](https://github.com/scienxlab/community/blob/main/CODE_OF_CONDUCT.md) is part of protecting these features for everyone, everywhere. Please read it.
16 |
17 |
18 | ## Authorship
19 |
20 | If you contribute a pull request to the project and you wish to be identified as an author, please add yourself to `AUTHORS.md`.
21 |
22 |
23 | ## License
24 |
25 | By making a contribution, you agree that it shall be governed by the terms of the license unless another, specific agreement is made with Agile.
26 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | Apache License
2 | Version 2.0, January 2004
3 | http://www.apache.org/licenses/
4 |
5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
6 |
7 | 1. Definitions.
8 |
9 | "License" shall mean the terms and conditions for use, reproduction,
10 | and distribution as defined by Sections 1 through 9 of this document.
11 |
12 | "Licensor" shall mean the copyright owner or entity authorized by
13 | the copyright owner that is granting the License.
14 |
15 | "Legal Entity" shall mean the union of the acting entity and all
16 | other entities that control, are controlled by, or are under common
17 | control with that entity. For the purposes of this definition,
18 | "control" means (i) the power, direct or indirect, to cause the
19 | direction or management of such entity, whether by contract or
20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the
21 | outstanding shares, or (iii) beneficial ownership of such entity.
22 |
23 | "You" (or "Your") shall mean an individual or Legal Entity
24 | exercising permissions granted by this License.
25 |
26 | "Source" form shall mean the preferred form for making modifications,
27 | including but not limited to software source code, documentation
28 | source, and configuration files.
29 |
30 | "Object" form shall mean any form resulting from mechanical
31 | transformation or translation of a Source form, including but
32 | not limited to compiled object code, generated documentation,
33 | and conversions to other media types.
34 |
35 | "Work" shall mean the work of authorship, whether in Source or
36 | Object form, made available under the License, as indicated by a
37 | copyright notice that is included in or attached to the work
38 | (an example is provided in the Appendix below).
39 |
40 | "Derivative Works" shall mean any work, whether in Source or Object
41 | form, that is based on (or derived from) the Work and for which the
42 | editorial revisions, annotations, elaborations, or other modifications
43 | represent, as a whole, an original work of authorship. For the purposes
44 | of this License, Derivative Works shall not include works that remain
45 | separable from, or merely link (or bind by name) to the interfaces of,
46 | the Work and Derivative Works thereof.
47 |
48 | "Contribution" shall mean any work of authorship, including
49 | the original version of the Work and any modifications or additions
50 | to that Work or Derivative Works thereof, that is intentionally
51 | submitted to Licensor for inclusion in the Work by the copyright owner
52 | or by an individual or Legal Entity authorized to submit on behalf of
53 | the copyright owner. For the purposes of this definition, "submitted"
54 | means any form of electronic, verbal, or written communication sent
55 | to the Licensor or its representatives, including but not limited to
56 | communication on electronic mailing lists, source code control systems,
57 | and issue tracking systems that are managed by, or on behalf of, the
58 | Licensor for the purpose of discussing and improving the Work, but
59 | excluding communication that is conspicuously marked or otherwise
60 | designated in writing by the copyright owner as "Not a Contribution."
61 |
62 | "Contributor" shall mean Licensor and any individual or Legal Entity
63 | on behalf of whom a Contribution has been received by Licensor and
64 | subsequently incorporated within the Work.
65 |
66 | 2. Grant of Copyright License. Subject to the terms and conditions of
67 | this License, each Contributor hereby grants to You a perpetual,
68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable
69 | copyright license to reproduce, prepare Derivative Works of,
70 | publicly display, publicly perform, sublicense, and distribute the
71 | Work and such Derivative Works in Source or Object form.
72 |
73 | 3. Grant of Patent License. Subject to the terms and conditions of
74 | this License, each Contributor hereby grants to You a perpetual,
75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable
76 | (except as stated in this section) patent license to make, have made,
77 | use, offer to sell, sell, import, and otherwise transfer the Work,
78 | where such license applies only to those patent claims licensable
79 | by such Contributor that are necessarily infringed by their
80 | Contribution(s) alone or by combination of their Contribution(s)
81 | with the Work to which such Contribution(s) was submitted. If You
82 | institute patent litigation against any entity (including a
83 | cross-claim or counterclaim in a lawsuit) alleging that the Work
84 | or a Contribution incorporated within the Work constitutes direct
85 | or contributory patent infringement, then any patent licenses
86 | granted to You under this License for that Work shall terminate
87 | as of the date such litigation is filed.
88 |
89 | 4. Redistribution. You may reproduce and distribute copies of the
90 | Work or Derivative Works thereof in any medium, with or without
91 | modifications, and in Source or Object form, provided that You
92 | meet the following conditions:
93 |
94 | (a) You must give any other recipients of the Work or
95 | Derivative Works a copy of this License; and
96 |
97 | (b) You must cause any modified files to carry prominent notices
98 | stating that You changed the files; and
99 |
100 | (c) You must retain, in the Source form of any Derivative Works
101 | that You distribute, all copyright, patent, trademark, and
102 | attribution notices from the Source form of the Work,
103 | excluding those notices that do not pertain to any part of
104 | the Derivative Works; and
105 |
106 | (d) If the Work includes a "NOTICE" text file as part of its
107 | distribution, then any Derivative Works that You distribute must
108 | include a readable copy of the attribution notices contained
109 | within such NOTICE file, excluding those notices that do not
110 | pertain to any part of the Derivative Works, in at least one
111 | of the following places: within a NOTICE text file distributed
112 | as part of the Derivative Works; within the Source form or
113 | documentation, if provided along with the Derivative Works; or,
114 | within a display generated by the Derivative Works, if and
115 | wherever such third-party notices normally appear. The contents
116 | of the NOTICE file are for informational purposes only and
117 | do not modify the License. You may add Your own attribution
118 | notices within Derivative Works that You distribute, alongside
119 | or as an addendum to the NOTICE text from the Work, provided
120 | that such additional attribution notices cannot be construed
121 | as modifying the License.
122 |
123 | You may add Your own copyright statement to Your modifications and
124 | may provide additional or different license terms and conditions
125 | for use, reproduction, or distribution of Your modifications, or
126 | for any such Derivative Works as a whole, provided Your use,
127 | reproduction, and distribution of the Work otherwise complies with
128 | the conditions stated in this License.
129 |
130 | 5. Submission of Contributions. Unless You explicitly state otherwise,
131 | any Contribution intentionally submitted for inclusion in the Work
132 | by You to the Licensor shall be under the terms and conditions of
133 | this License, without any additional terms or conditions.
134 | Notwithstanding the above, nothing herein shall supersede or modify
135 | the terms of any separate license agreement you may have executed
136 | with Licensor regarding such Contributions.
137 |
138 | 6. Trademarks. This License does not grant permission to use the trade
139 | names, trademarks, service marks, or product names of the Licensor,
140 | except as required for reasonable and customary use in describing the
141 | origin of the Work and reproducing the content of the NOTICE file.
142 |
143 | 7. Disclaimer of Warranty. Unless required by applicable law or
144 | agreed to in writing, Licensor provides the Work (and each
145 | Contributor provides its Contributions) on an "AS IS" BASIS,
146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
147 | implied, including, without limitation, any warranties or conditions
148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
149 | PARTICULAR PURPOSE. You are solely responsible for determining the
150 | appropriateness of using or redistributing the Work and assume any
151 | risks associated with Your exercise of permissions under this License.
152 |
153 | 8. Limitation of Liability. In no event and under no legal theory,
154 | whether in tort (including negligence), contract, or otherwise,
155 | unless required by applicable law (such as deliberate and grossly
156 | negligent acts) or agreed to in writing, shall any Contributor be
157 | liable to You for damages, including any direct, indirect, special,
158 | incidental, or consequential damages of any character arising as a
159 | result of this License or out of the use or inability to use the
160 | Work (including but not limited to damages for loss of goodwill,
161 | work stoppage, computer failure or malfunction, or any and all
162 | other commercial damages or losses), even if such Contributor
163 | has been advised of the possibility of such damages.
164 |
165 | 9. Accepting Warranty or Additional Liability. While redistributing
166 | the Work or Derivative Works thereof, You may choose to offer,
167 | and charge a fee for, acceptance of support, warranty, indemnity,
168 | or other liability obligations and/or rights consistent with this
169 | License. However, in accepting such obligations, You may act only
170 | on Your own behalf and on Your sole responsibility, not on behalf
171 | of any other Contributor, and only if You agree to indemnify,
172 | defend, and hold each Contributor harmless for any liability
173 | incurred by, or claims asserted against, such Contributor by reason
174 | of your accepting any such warranty or additional liability.
175 |
176 | END OF TERMS AND CONDITIONS
177 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # redflag
2 |
3 | [Build and test](https://github.com/scienxlab/redflag/actions/workflows/build-test.yml)
4 | [Publish docs](https://github.com/scienxlab/redflag/actions/workflows/publish-docs.yml)
5 | [PyPI version](https://pypi.org/project/redflag/)
6 | [Conda-forge version](https://anaconda.org/conda-forge/redflag)
7 | [Python versions](https://pypi.org/project/redflag/)
8 | [License](https://pypi.org/project/redflag/)
9 |
10 | 🚩 `redflag` aims to be an automatic safety net for machine learning datasets. The vision is to accept input of a Pandas `DataFrame` or NumPy `ndarray` representing the input `X` and target `y` in a machine learning task. `redflag` will provide an analysis of each feature, and of the target, including aspects such as class imbalance, leakage, outliers, anomalous data patterns, threats to the IID assumption, and so on. The goal is to complement other projects like `pandas-profiling` and `greatexpectations`.
11 |
12 |
13 | ## Installation
14 |
15 | You can install this package with `pip`:
16 |
17 | python -m pip install redflag
18 |
19 | Alternatively, you can use the `conda` package manager, pointed at the `conda-forge` channel:
20 |
21 | conda install -c conda-forge redflag
22 |
23 | For developers, there is a `pip` option for installing `dev` dependencies. Use `pip install "redflag[dev]"` to install all testing and documentation packages.
24 |
25 |
26 | ## Example with `sklearn`
27 |
28 | The most useful components of `redflag` are probably the `scikit-learn` "detectors". These sit in your pipeline, look at your training and validation data, and emit warnings if something looks like it might cause a problem. For example, we can get alerted to an imbalanced target vector `y` like so:
29 |
30 | ```python
31 | import redflag as rf
32 | from sklearn.datasets import make_classification
33 |
34 | X, y = make_classification(weights=[0.1])
35 |
36 | _ = rf.ImbalanceDetector().fit(X, y)
37 | ```
38 |
39 | This raises a warning:
40 |
41 | ```python
42 | 🚩 The labels are imbalanced by more than the threshold (0.780 > 0.400). See self.minority_classes_ for the minority classes.
43 | ```
44 |
45 | For maximum effect, put this and other detectors in your pipeline, or use the pre-built `rf.pipeline`, which contains several useful alerts.
46 |
47 | See [the documentation](https://scienxlab.org/redflag), and specifically the notebook [Using_redflag_with_sklearn.ipynb](https://github.com/scienxlab/redflag/blob/main/docs/notebooks/Using_redflag_with_sklearn.ipynb), for other examples.
48 |
49 |
50 | ## Example of function call
51 |
52 | `redflag` is also a collection of functions. Most of the useful ones take one or more columns of data (usually a 1D or 2D NumPy array) and run a single test. For example, we can do some outlier detection. The `get_outliers()` function returns the indices of data points that are considered outliers:
53 |
54 | ```python
55 | >>> import redflag as rf
56 | >>> data = 3 * [-3, -2, -2, -1, 0, 0, 0, 1, 2, 2, 3]
57 | >>> rf.get_outliers(data)
58 | array([], dtype=int64)
59 | ```
60 |
61 | That is, there are no outliers. But let's add a clear outlier: a new data record with a value of 100. The function returns the index position(s) of the outlier point(s):
62 |
63 | ```python
64 | >>> rf.get_outliers(data + [100])
65 | array([33])
66 | ```
67 |
68 | See [the documentation](https://scienxlab.org/redflag), and specifically the notebook [Basic_usage.ipynb](https://github.com/scienxlab/redflag/blob/main/docs/notebooks/Basic_usage.ipynb) for several other basic examples.
69 |
70 |
71 | ## Documentation
72 |
73 | [The documentation is online.](https://scienxlab.org/redflag)
74 |
75 |
76 | ## Contributing
77 |
78 | Please see [`CONTRIBUTING.md`](https://github.com/scienxlab/redflag/blob/main/CONTRIBUTING.md). There is also a section [in the documentation](https://scienxlab.org/redflag) about _Development_.
79 |
--------------------------------------------------------------------------------
/SECURITY.md:
--------------------------------------------------------------------------------
1 | # Security policy
2 |
3 |
4 | ## Supported versions
5 |
6 | Only the latest version of `redflag` is supported.
7 |
8 |
9 | ## Reporting a vulnerability
10 |
11 | Please do not report vulnerabilities in a GitHub issue; instead, email them to hello@scienxlab.org.
12 |
13 | We do not award bounties for security vulnerabilities, but will notify you if and when the report is accepted and acted upon.
14 |
--------------------------------------------------------------------------------
/docs/Makefile:
--------------------------------------------------------------------------------
1 | # Makefile for building the Sphinx documentation.
2 | # You can set these variables from the command line, and also
3 | # from the environment for the first two.
4 | SPHINXOPTS ?=
5 | SPHINXBUILD ?= sphinx-build
6 | SOURCEDIR = .
7 | BUILDDIR = _build
8 |
9 | # Put it first so that "make" without argument is like "make help".
10 | help:
11 | @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
12 |
13 | .PHONY: help html
14 | # Build the HTML docs: pre-process the notebooks, run Sphinx, then post-process the output.
15 | html:
16 | python pre_process_ipynb.py $(SOURCEDIR)/notebooks
17 | $(SPHINXBUILD) -E -b html $(SPHINXOPTS) $(SOURCEDIR) $(BUILDDIR)/html
18 | python post_process_html.py $(BUILDDIR)/html
19 | @echo
20 | @echo "Build finished. The HTML pages are in $(BUILDDIR)/html."
21 |
--------------------------------------------------------------------------------
/docs/_static/custom.css:
--------------------------------------------------------------------------------
1 | /* Removes Captions from the main page. */
2 | article p.caption {
3 | display: none;
4 | }
5 |
6 | /* Styles the 'line block' https://docutils.sourceforge.io/docs/user/rst/quickref.html#line-blocks. */
7 | blockquote {
8 | background: none;
9 | border-left-width: 0px;
10 | padding: 0em;
11 | }
12 | /* Renders each line of the line block inline, in larger grey text. */
13 | blockquote div.line {
14 | color: #838383;
15 | display: inline;
16 | font-style: normal !important;
17 | font-size: 150%;
18 | line-height: 125%;
19 | }
20 |
21 | /* Adds the GitHub ribbon. */
22 | #forkongithub a {
23 | background:rgb(158, 158, 158);
24 | color:#fff;
25 | text-decoration:none;
26 | font-family:arial,sans-serif;
27 | text-align:center;
28 | font-weight:bold;
29 | padding:5px 40px;
30 | font-size:1rem;
31 | line-height:2rem;
32 | position:relative;
33 | transition:0.5s;
34 | }
35 | /* Turns the ribbon green on hover; the 0.5s transition above eases the change. */
36 | #forkongithub a:hover {
37 | background:#14ca29;
38 | color:#fff;
39 | }
40 |
41 | #forkongithub a::after {
42 | bottom:1px;
43 | top:auto;
44 | }
45 |
46 | /* On screens at least 800px wide, pin the ribbon to the top-right corner and rotate it 45 degrees. */
47 | @media screen and (min-width:800px) {
48 | #forkongithub{
49 | position:fixed;
50 | display:block;
51 | top:0;
52 | right:0;
53 | width:200px;
54 | overflow:hidden;
55 | height:200px;
56 | z-index:9999;
57 | }
58 |
59 | #forkongithub a {
60 | width:200px;
61 | position:absolute;
62 | top:60px;
63 | right:-60px;
64 | /* Vendor-prefixed forms first; the standard property goes last so it takes precedence where supported. */
65 | -webkit-transform:rotate(45deg);
66 | -moz-transform:rotate(45deg);
67 | -ms-transform:rotate(45deg);
68 | -o-transform:rotate(45deg);
69 | transform:rotate(45deg);
70 | box-shadow:4px 4px 10px rgba(0,0,0,0.4);
71 | }
72 | }
--------------------------------------------------------------------------------
/docs/_static/favicon.ico:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/scienxlab/redflag/f495ddd0729c7ac61dc8d8f54fc706178a0d253f/docs/_static/favicon.ico
--------------------------------------------------------------------------------
/docs/_static/redflag.svg:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
107 |
--------------------------------------------------------------------------------
/docs/_static/redflag_logo.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/scienxlab/redflag/f495ddd0729c7ac61dc8d8f54fc706178a0d253f/docs/_static/redflag_logo.png
--------------------------------------------------------------------------------
/docs/authors.md:
--------------------------------------------------------------------------------
1 | ```{include} ../AUTHORS.md
2 | ```
--------------------------------------------------------------------------------
/docs/changelog.md:
--------------------------------------------------------------------------------
1 | ```{include} ../CHANGELOG.md
2 | ```
--------------------------------------------------------------------------------
/docs/conf.py:
--------------------------------------------------------------------------------
1 | # Configuration file for Sphinx documentation builder.
2 |
3 | # -- Setup function ----------------------------------------------------------
4 |
5 | # Defines custom steps in the process.
6 |
def autodoc_skip_member(app, what, name, obj, skip, options):
    """Tell autodoc to skip private and dunder members.

    Handler for the Sphinx ``autodoc-skip-member`` event. A member is
    treated as private when the string form of ``obj`` contains ``._``
    (a dot immediately followed by an underscore).

    Args:
        app: The Sphinx application (unused).
        what: The type of the object the member belongs to (unused).
        name: The name of the member (unused).
        obj: The member itself; only ``str(obj)`` is inspected.
        skip: Whether autodoc would skip this member by default.
        options: The options given to the directive (unused).

    Returns:
        bool: True if the member should be skipped, False otherwise.
    """
    import re  # Imported locally; only this handler uses it.

    # The event expects a boolean answer, so return one rather than the
    # list of matches. A single search for '._' is enough to decide.
    looks_private = re.search(r'\._', str(obj)) is not None
    return bool(skip) or looks_private
12 |
def remove_module_docstring(app, what, name, obj, options, lines):
    """Truncate a module docstring at its first ``Author: `` line.

    Handler for the Sphinx ``autodoc-process-docstring`` event. For module
    docstrings only, the first line starting with ``Author: `` and every
    line after it are deleted. Other docstrings are left untouched.

    Args:
        app: The Sphinx application (unused).
        what: The type of the documented object; only ``"module"`` is
            processed.
        name: The fully qualified name of the object (unused).
        obj: The object itself (unused).
        options: The options given to the directive (unused).
        lines: The docstring as a list of lines; modified in place.

    Returns:
        None. The result is communicated by mutating ``lines``.
    """
    if what != "module":
        return

    # Stop at the first matching line; later matches would be removed by the
    # truncation anyway.
    cut = next(
        (i for i, line in enumerate(lines) if line.startswith("Author: ")),
        None,
    )
    if cut is not None:
        del lines[cut:]
20 |
def setup(app):
    """Register this file's autodoc event handlers with Sphinx.

    Args:
        app: The Sphinx application to connect the handlers to.

    Returns:
        None.
    """
    handlers = (
        ('autodoc-skip-member', autodoc_skip_member),
        ("autodoc-process-docstring", remove_module_docstring),
    )
    for event_name, callback in handlers:
        app.connect(event_name, callback)
25 |
26 |
27 | # -- Path setup --------------------------------------------------------------
28 |
29 | # If extensions (or modules to document with autodoc) are in another directory,
30 | # add these directories to sys.path here. If the directory is relative to the
31 | # documentation root, use os.path.abspath to make it absolute, like shown here.
32 |
33 | import os
34 | import sys
35 | sys.path.insert(0, os.path.abspath('../src'))
36 |
37 |
38 | # -- Project information -----------------------------------------------------
39 |
40 | project = 'redflag'
41 | copyright = '2024, The Redflag Authors'
42 | author = 'The Redflag Authors'
43 |
44 |
45 | # -- General configuration ---------------------------------------------------
46 |
47 | # Add any Sphinx extension module names here, as strings. They can be
48 | # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom
49 | # ones.
50 | extensions = [
51 | 'sphinxcontrib.apidoc',
52 | 'sphinx.ext.githubpages',
53 | 'sphinx.ext.napoleon',
54 | 'sphinx.ext.coverage',
55 | 'sphinx_copybutton',
56 | 'myst_nb',
57 | ]
58 |
59 | myst_enable_extensions = ["dollarmath", "amsmath"]
60 |
61 | # Apidoc automation
62 | # https://pypi.org/project/sphinxcontrib-apidoc/
63 | # The apidoc extension and this code automatically update apidoc.
64 | apidoc_module_dir = '../src/redflag'
65 | apidoc_output_dir = './'
66 | apidoc_excluded_paths = []
67 | apidoc_toc_file = False
68 | apidoc_separate_modules = True
69 |
70 | # Add any paths that contain templates here, relative to this directory.
71 | templates_path = ['_templates']
72 |
73 | # List of patterns, relative to source directory, that match files and
74 | # directories to ignore when looking for source files.
75 | # This pattern also affects html_static_path and html_extra_path.
76 | exclude_patterns = ['_build', 'notebooks']
77 |
78 |
79 | # -- Options for HTML output -------------------------------------------------
80 |
81 | # The theme to use for HTML and HTML Help pages. See the documentation for
82 | # a list of builtin themes.
83 | #
84 | # https://sphinx-themes.org/sample-sites/furo/
85 | html_theme = 'furo'
86 | html_title = ''
87 | html_theme_options = {
88 | "sidebar_hide_name": True,
89 | }
90 |
91 | # Add any paths that contain custom static files (such as style sheets) here,
92 | # relative to this directory. They are copied after the builtin static files,
93 | # so a file named "default.css" will overwrite the builtin "default.css".
94 | html_static_path = ['_static']
95 |
96 | html_css_files = [
97 | 'custom.css',
98 | ]
99 |
100 | # Branding.
101 | html_favicon = '_static/favicon.ico'
102 | html_logo = '_static/redflag_logo.png'
103 |
--------------------------------------------------------------------------------
/docs/contributing.md:
--------------------------------------------------------------------------------
1 | ```{include} ../CONTRIBUTING.md
2 | ```
--------------------------------------------------------------------------------
/docs/development.md:
--------------------------------------------------------------------------------
1 | # Development
2 |
3 | If you'd like to develop `redflag`, this page should help you get started.
4 |
5 |
6 | ## Installation
7 |
8 | You can install this package with `pip` or `conda`. The `dev` option will install the packages you need for testing and building the documentation.
9 |
10 | ```shell
11 | python -m pip install "redflag[dev]"
12 | ```
13 |
14 |
15 | ## Contributing
16 |
17 | If you'd like to contribute pull requests back to the main `redflag` project, please see [`CONTRIBUTING.md`](https://github.com/scienxlab/redflag/blob/main/CONTRIBUTING.md).
18 |
19 |
20 | ## Testing
21 |
22 | You can run the tests (requires `pytest` and `pytest-cov`) with
23 |
24 | ```shell
25 | pytest
26 | ```
27 |
28 | Most of the tests are `doctest` tests, which are contained in the docstrings of this package's functions. There are further tests in the `tests` folder.
29 |
30 |
31 | ## Building the package
32 |
33 | This repo uses PEP 518-style packaging. [Read more about this](https://setuptools.pypa.io/en/latest/build_meta.html) and [about Python packaging in general](https://packaging.python.org/en/latest/tutorials/packaging-projects/).
34 |
35 | To build `redflag` locally:
36 |
37 | ```shell
38 | python -m build
39 | ```
40 |
41 | This builds both `.tar.gz` and `.whl` files, either of which you can install with `pip`.
42 |
43 |
44 | ## Building the docs
45 |
46 | You can build the docs with the following commands:
47 |
48 | ```shell
49 | cd docs
50 | make html
51 | ```
52 |
53 | Don't just run `sphinx-build` manually: there is other stuff happening in the `Makefile`.
54 |
55 | There is a continuous integration script to update the docs on published releases.
56 |
57 |
58 | ## Continuous integration
59 |
60 | This repo has three GitHub 'workflows' or 'actions':
61 |
62 | - Push to `main`: Run all tests on all supported versions of Python. This is the **Build and test** workflow.
63 | - Publish a new release: Build and upload to PyPI. This is the **Publish to PyPI** workflow. Publish using the GitHub interface, for example ([read more](https://docs.github.com/en/repositories/releasing-projects-on-github/managing-releases-in-a-repository)).
64 | - Publish a new release: Update the online documentation. This is the `publish-docs.yml` workflow.
65 |
--------------------------------------------------------------------------------
/docs/index.rst:
--------------------------------------------------------------------------------
1 | :hide-toc:
2 |
3 | .. container::
4 | :name: forkongithub
5 |
6 | `Fork on GitHub <https://github.com/scienxlab/redflag>`_
7 |
8 |
9 | Redflag: safer ML by design
10 | ===========================
11 |
12 | | ``redflag`` is a lightweight safety net for machine
13 | | learning. Given a ``DataFrame`` or ``ndarray``,
14 | | ``redflag`` will analyse the features and the target,
15 | | and warn you about class imbalance, leakage, outliers,
16 | | anomalous data patterns, threats to the IID assumption,
17 | | and more.
18 |
19 |
20 | Quick start
21 | -----------
22 |
23 | .. toctree::
24 | :caption: Quick start
25 |
26 | Install ``redflag`` with pip or with ``conda`` from the ``conda-forge`` channel:
27 |
28 | .. code-block:: shell
29 |
30 | pip install redflag
31 |
32 | Import ``redflag`` in your Python program:
33 |
34 | .. code-block:: python
35 |
36 | import redflag as rf
37 |
38 | There are three main ways to use ``redflag``:
39 |
40 | 1. ``scikit-learn`` components for your pipelines, e.g. ``rf.ImbalanceDetector().fit_transform(X, y)``.
41 | 2. ``pandas`` accessors on Series and DataFrames, e.g. ``df['target'].redflag.imbalance_degree()``.
42 | 3. As a library of standalone functions, e.g. ``rf.imbalance_degree(y)``.
43 |
44 | Carry on exploring with the user guide below.
45 |
46 |
47 | User guide
48 | ----------
49 |
50 | .. toctree::
51 | :maxdepth: 2
52 | :caption: User guide
53 |
54 | installation
55 | what_is_redflag
56 | _notebooks/Basic_usage.ipynb
57 | _notebooks/Using_redflag_with_sklearn.ipynb
58 | _notebooks/Using_redflag_with_Pandas.ipynb
59 | _notebooks/Tutorial.ipynb
60 |
61 |
62 | API reference
63 | -------------
64 |
65 | .. toctree::
66 | :maxdepth: 2
67 | :caption: API reference
68 |
69 | redflag
70 |
71 |
72 | Other resources
73 | ---------------
74 |
75 | .. toctree::
76 | :maxdepth: 1
77 | :caption: Other resources
78 |
79 | development
80 | contributing
81 | authors
82 | license
83 | changelog
84 |
85 |
86 | Indices and tables
87 | ------------------
88 |
89 | * :ref:`genindex`
90 | * :ref:`modindex`
91 | * :ref:`search`
92 |
93 |
94 | .. toctree::
95 | :caption: Project links
96 | :hidden:
97 |
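98 | PyPI releases <https://pypi.org/project/redflag/>
99 | Code in GitHub <https://github.com/scienxlab/redflag>
100 | Issue tracker <https://github.com/scienxlab/redflag/issues>
101 | Community guidelines <https://github.com/scienxlab/redflag/blob/main/CODE_OF_CONDUCT.md>
102 | Scienxlab <https://scienxlab.org>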
103 |
--------------------------------------------------------------------------------
/docs/installation.md:
--------------------------------------------------------------------------------
1 | # 🚩 Installation
2 |
3 | At the command line:
4 |
5 | ```shell
6 | pip install redflag
7 | ```
8 |
9 | Or, if you use Conda environments:
10 |
11 | ```shell
12 | conda install -c conda-forge redflag
13 | ```
14 |
15 | You can add the `conda-forge` channel as a source for future installations like so:
16 |
17 | ```shell
18 | conda config --add channels conda-forge
19 | conda config --set channel_priority strict
20 | ```
21 |
22 |
23 | ## Optional dependencies
24 |
25 | For developers, there is an option to install `dev` dependencies: `pip install "redflag[dev]"` to install all testing and documentation packages.
26 |
27 | If you want to help develop `redflag`, please read [Development](development.md).
28 |
--------------------------------------------------------------------------------
/docs/license.md:
--------------------------------------------------------------------------------
1 | # License
2 |
3 | ```{include} ../LICENSE
4 | ```
5 |
--------------------------------------------------------------------------------
/docs/notebooks/Using_redflag_with_Pandas.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "id": "a8d12712-5c7b-4acb-bb8b-e73efcb9b5dc",
6 | "metadata": {},
7 | "source": [
8 | "# 🚩 Using `redflag` with Pandas\n",
9 | "\n",
10 | "As well as using `redflag`'s functions directly (see `Basic_usage.ipynb`), or with `sklearn` (see `Using_redflag_with_sklearn.ipynb`), `redflag` has some Pandas 'accessors' that give you access to some `redflag` functions almost as if they were methods on Pandas objects.\n",
11 | "\n",
12 | "The best way to get the idea is to look at an example.\n",
13 | "\n",
14 | "First, even though we may not use it directly, we have to import `redflag` to get access to its functions. As long as you have `pandas` installed, it will register the accessors."
15 | ]
16 | },
17 | {
18 | "cell_type": "code",
19 | "execution_count": 4,
20 | "id": "77aa7f67-0bc7-48e9-87f4-183aa2dc2c35",
21 | "metadata": {},
22 | "outputs": [
23 | {
24 | "data": {
25 | "text/plain": [
26 | "'0.4.2rc2.dev14+g54704af.d20240421'"
27 | ]
28 | },
29 | "execution_count": 4,
30 | "metadata": {},
31 | "output_type": "execute_result"
32 | }
33 | ],
34 | "source": [
35 | "import redflag as rf\n",
36 | "\n",
37 | "rf.__version__"
38 | ]
39 | },
40 | {
41 | "cell_type": "code",
42 | "execution_count": 5,
43 | "id": "3dbcf6e1-1cb5-4ca5-b64a-bc1d9e7b174f",
44 | "metadata": {},
45 | "outputs": [
46 | {
47 | "data": {
48 | "text/html": [
49 | "