├── .circleci └── config.yml ├── .coveragerc ├── .github ├── ISSUE_TEMPLATE.md ├── ISSUE_TEMPLATE │ ├── bug_report.md │ ├── documentation-improvement.md │ ├── feature_request.md │ ├── other--blank-template-.md │ ├── question.md │ └── usage-question.md ├── PULL_REQUEST_TEMPLATE.md ├── check-changelog.yml ├── dependabot.yml └── workflows │ ├── circleci-artifacts-redirector.yml │ ├── linters.yml │ └── tests.yml ├── .gitignore ├── .pre-commit-config.yaml ├── AUTHORS.rst ├── CONTRIBUTING.md ├── LICENSE ├── MANIFEST.in ├── README.rst ├── build_tools └── circle │ ├── build_doc.sh │ ├── checkout_merge_commit.sh │ ├── linting.sh │ └── push_doc.sh ├── conftest.py ├── doc ├── Makefile ├── _static │ ├── css │ │ └── imbalanced-learn.css │ ├── img │ │ ├── favicon.ico │ │ ├── logo.png │ │ ├── logo.xcf │ │ ├── logo_wide.png │ │ └── logo_wide_dark.png │ ├── index_api.svg │ ├── index_examples.svg │ ├── index_getting_started.svg │ ├── index_user_guide.svg │ └── js │ │ └── copybutton.js ├── _templates │ ├── class.rst │ ├── function.rst │ ├── numpydoc_docstring.rst │ └── sidebar-search-bs.html ├── about.rst ├── bibtex │ └── refs.bib ├── combine.rst ├── common_pitfalls.rst ├── conf.py ├── datasets │ └── index.rst ├── developers_utils.rst ├── ensemble.rst ├── index.rst ├── install.rst ├── introduction.rst ├── make.bat ├── metrics.rst ├── miscellaneous.rst ├── over_sampling.rst ├── references │ ├── combine.rst │ ├── datasets.rst │ ├── ensemble.rst │ ├── index.rst │ ├── keras.rst │ ├── metrics.rst │ ├── miscellaneous.rst │ ├── over_sampling.rst │ ├── pipeline.rst │ ├── tensorflow.rst │ ├── under_sampling.rst │ └── utils.rst ├── sphinxext │ ├── LICENSE.txt │ ├── MANIFEST.in │ ├── README.txt │ ├── github_link.py │ └── sphinx_issues.py ├── under_sampling.rst ├── user_guide.rst ├── whats_new.rst ├── whats_new │ ├── 0.13.rst │ ├── 0.14.rst │ ├── v0.1.rst │ ├── v0.10.rst │ ├── v0.11.rst │ ├── v0.12.rst │ ├── v0.2.rst │ ├── v0.3.rst │ ├── v0.4.rst │ ├── v0.5.rst │ ├── v0.6.rst │ ├── v0.7.rst │ ├── v0.8.rst │ └── v0.9.rst └── zzz_references.rst ├── examples ├── README.txt ├── api │ ├── README.txt │ └── plot_sampling_strategy_usage.py ├── applications │ ├── README.txt │ ├── plot_impact_imbalanced_classes.py │ ├── plot_multi_class_under_sampling.py │ ├── plot_outlier_rejections.py │ ├── plot_over_sampling_benchmark_lfw.py │ ├── plot_topic_classication.py │ └── porto_seguro_keras_under_sampling.py ├── combine │ ├── README.txt │ └── plot_comparison_combine.py ├── datasets │ ├── README.txt │ └── plot_make_imbalance.py ├── ensemble │ ├── README.txt │ ├── plot_bagging_classifier.py │ └── plot_comparison_ensemble_classifier.py ├── evaluation │ ├── README.txt │ ├── plot_classification_report.py │ └── plot_metrics.py ├── model_selection │ ├── README.txt │ └── plot_validation_curve.py ├── over-sampling │ ├── README.txt │ ├── plot_comparison_over_sampling.py │ ├── plot_illustration_generation_sample.py │ └── plot_shrinkage_effect.py ├── pipeline │ ├── README.txt │ └── plot_pipeline_classification.py └── under-sampling │ ├── README.txt │ ├── plot_comparison_under_sampling.py │ ├── plot_illustration_nearmiss.py │ └── plot_illustration_tomek_links.py ├── imblearn ├── VERSION.txt ├── __init__.py ├── _version.py ├── base.py ├── combine │ ├── __init__.py │ ├── _smote_enn.py │ ├── _smote_tomek.py │ └── tests │ │ ├── __init__.py │ │ ├── test_smote_enn.py │ │ └── test_smote_tomek.py ├── datasets │ ├── __init__.py │ ├── _imbalance.py │ ├── _zenodo.py │ └── tests │ │ ├── __init__.py │ │ ├── test_imbalance.py │ │ └── test_zenodo.py ├── ensemble 
│ ├── __init__.py │ ├── _bagging.py │ ├── _common.py │ ├── _easy_ensemble.py │ ├── _forest.py │ ├── _weight_boosting.py │ └── tests │ │ ├── __init__.py │ │ ├── test_bagging.py │ │ ├── test_easy_ensemble.py │ │ ├── test_forest.py │ │ └── test_weight_boosting.py ├── exceptions.py ├── keras │ ├── __init__.py │ ├── _generator.py │ └── tests │ │ ├── __init__.py │ │ └── test_generator.py ├── metrics │ ├── __init__.py │ ├── _classification.py │ ├── pairwise.py │ └── tests │ │ ├── __init__.py │ │ ├── test_classification.py │ │ ├── test_pairwise.py │ │ └── test_score_objects.py ├── over_sampling │ ├── __init__.py │ ├── _adasyn.py │ ├── _random_over_sampler.py │ ├── _smote │ │ ├── __init__.py │ │ ├── base.py │ │ ├── cluster.py │ │ ├── filter.py │ │ └── tests │ │ │ ├── __init__.py │ │ │ ├── test_borderline_smote.py │ │ │ ├── test_kmeans_smote.py │ │ │ ├── test_smote.py │ │ │ ├── test_smote_nc.py │ │ │ ├── test_smoten.py │ │ │ └── test_svm_smote.py │ ├── base.py │ └── tests │ │ ├── __init__.py │ │ ├── test_adasyn.py │ │ ├── test_common.py │ │ └── test_random_over_sampler.py ├── pipeline.py ├── tensorflow │ ├── __init__.py │ ├── _generator.py │ └── tests │ │ ├── __init__.py │ │ └── test_generator.py ├── tests │ ├── __init__.py │ ├── test_base.py │ ├── test_common.py │ ├── test_docstring_parameters.py │ ├── test_exceptions.py │ ├── test_pipeline.py │ └── test_public_functions.py ├── under_sampling │ ├── __init__.py │ ├── _prototype_generation │ │ ├── __init__.py │ │ ├── _cluster_centroids.py │ │ └── tests │ │ │ ├── __init__.py │ │ │ └── test_cluster_centroids.py │ ├── _prototype_selection │ │ ├── __init__.py │ │ ├── _condensed_nearest_neighbour.py │ │ ├── _edited_nearest_neighbours.py │ │ ├── _instance_hardness_threshold.py │ │ ├── _nearmiss.py │ │ ├── _neighbourhood_cleaning_rule.py │ │ ├── _one_sided_selection.py │ │ ├── _random_under_sampler.py │ │ ├── _tomek_links.py │ │ └── tests │ │ │ ├── __init__.py │ │ │ ├── test_allknn.py │ │ │ ├── test_condensed_nearest_neighbour.py │ │ │ ├── test_edited_nearest_neighbours.py │ │ │ ├── test_instance_hardness_threshold.py │ │ │ ├── test_nearmiss.py │ │ │ ├── test_neighbourhood_cleaning_rule.py │ │ │ ├── test_one_sided_selection.py │ │ │ ├── test_random_under_sampler.py │ │ │ ├── test_repeated_edited_nearest_neighbours.py │ │ │ └── test_tomek_links.py │ └── base.py └── utils │ ├── __init__.py │ ├── _docstring.py │ ├── _show_versions.py │ ├── _sklearn_compat.py │ ├── _tags.py │ ├── _test_common │ ├── __init__.py │ └── instance_generator.py │ ├── _validation.py │ ├── deprecation.py │ ├── estimator_checks.py │ ├── testing.py │ └── tests │ ├── __init__.py │ ├── test_deprecation.py │ ├── test_docstring.py │ ├── test_estimator_checks.py │ ├── test_min_dependencies.py │ ├── test_show_versions.py │ ├── test_testing.py │ └── test_validation.py ├── maint_tools └── test_docstring.py ├── pixi.lock ├── pyproject.toml └── references.bib /.circleci/config.yml: -------------------------------------------------------------------------------- 1 | version: 2 2 | 3 | jobs: 4 | doc: 5 | docker: 6 | - image: cimg/python:3.8.12 7 | environment: 8 | - USERNAME: "glemaitre" 9 | - ORGANIZATION: "imbalanced-learn" 10 | - DOC_REPO: "imbalanced-learn.github.io" 11 | - DOC_URL: "" 12 | - EMAIL: "g.lemaitre58@gmail.com" 13 | - MINICONDA_PATH: ~/miniconda 14 | - CONDA_ENV_NAME: testenv 15 | - OMP_NUM_THREADS: 1 16 | - PYTHON_VERSION: 3 17 | - NUMPY_VERSION: 'latest' 18 | - SCIPY_VERSION: 'latest' 19 | - SKLEARN_VERSION: 'latest' 20 | - MATPLOTLIB_VERSION: 'latest' 21 | - SPHINX_VERSION: 'min' 
22 | - PANDAS_VERSION: 'latest' 23 | - SPHINX_GALLERY_VERSION: 'latest' 24 | - NUMPYDOC_VERSION: 'latest' 25 | - SPHINXCONTRIB_BIBTEX_VERSION: 'latest' 26 | - PYDATA_SPHINX_THEME_VERSION: 'latest' 27 | - SPHINX_DESIGN_VERSION: 'latest' 28 | steps: 29 | - add_ssh_keys: 30 | fingerprints: 31 | - "34:ea:b1:d9:b1:e2:5d:79:81:c4:d0:39:ca:85:e1:ef" 32 | - checkout 33 | - run: ./build_tools/circle/checkout_merge_commit.sh 34 | - run: ./build_tools/circle/build_doc.sh 35 | - store_artifacts: 36 | path: doc/_build/html 37 | destination: doc 38 | - store_artifacts: 39 | path: ~/log.txt 40 | - persist_to_workspace: 41 | root: doc/_build/html 42 | paths: . 43 | - attach_workspace: 44 | at: doc/_build/html 45 | - run: ls -ltrh doc/_build/html 46 | - deploy: 47 | command: | 48 | if [[ "${CIRCLE_BRANCH}" =~ ^master$|^[0-9]+\.[0-9]+\.X$ ]]; then 49 | bash ./build_tools/circle/push_doc.sh doc/_build/html 50 | fi 51 | filters: 52 | branches: 53 | ignore: gh-pages 54 | 55 | workflows: 56 | version: 2 57 | build-doc-and-deploy: 58 | jobs: 59 | - doc 60 | -------------------------------------------------------------------------------- /.coveragerc: -------------------------------------------------------------------------------- 1 | [run] 2 | branch = True 3 | 4 | [report] 5 | exclude_lines = 6 | if self.debug: 7 | pragma: no cover 8 | raise NotImplementedError 9 | ignore_errors = True 10 | omit = 11 | */tests/* 12 | **/setup.py 13 | **/_sklearn_compat.py 14 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE.md: -------------------------------------------------------------------------------- 1 | 5 | 6 | 7 | 8 | #### Description 9 | 10 | 11 | #### Steps/Code to Reproduce 12 | 34 | 35 | #### Expected Results 36 | 37 | 38 | #### Actual Results 39 | 40 | 41 | #### Versions 42 | 51 | 52 | 53 | 54 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/bug_report.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Bug report 3 | about: Create a report to help us reproduce and correct the bug 4 | title: "[BUG]" 5 | labels: bug 6 | assignees: '' 7 | 8 | --- 9 | 10 | #### Describe the bug 11 | A clear and concise description of what the bug is. 12 | 13 | #### Steps/Code to Reproduce 14 | 36 | 37 | ``` 38 | Sample code to reproduce the problem 39 | ``` 40 | 41 | #### Expected Results 42 | 43 | 44 | #### Actual Results 45 | 46 | 47 | #### Versions 48 | 60 | 61 | 62 | 63 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/documentation-improvement.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Documentation improvement 3 | about: Create a report to help us improve the documentation 4 | title: "[DOC]" 5 | labels: Documentation, help wanted, good first issue 6 | assignees: '' 7 | 8 | --- 9 | 10 | #### Describe the issue linked to the documentation 11 | 12 | Tell us about the confusion introduced in the documentation. 13 | 14 | #### Suggest a potential alternative/fix 15 | 16 | Tell us how we could improve the documentation in this regard. 17 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/feature_request.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Feature request 3 | about: Suggest a new algorithm, an enhancement to an existing algorithm, etc.
4 | title: "[ENH]" 5 | labels: enhancement 6 | assignees: '' 7 | 8 | --- 9 | 10 | <-- 11 | If you want to propose a new algorithm, please refer first to the scikit-learn inclusion criterion: 12 | https://scikit-learn.org/stable/faq.html#what-are-the-inclusion-criteria-for-new-algorithms 13 | --> 14 | 15 | #### Is your feature request related to a problem? Please describe 16 | 17 | #### Describe the solution you'd like 18 | 19 | #### Describe alternatives you've considered 20 | 21 | #### Additional context 22 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/other--blank-template-.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Other (blank template) 3 | about: For all other issues to reach the community... 4 | title: '' 5 | labels: '' 6 | assignees: '' 7 | 8 | --- 9 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/question.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Question 3 | about: If you have a usage question 4 | title: '' 5 | labels: '' 6 | assignees: '' 7 | 8 | --- 9 | 10 | ** 11 | If your issue is a usage question, submit it here instead: 12 | - The imbalanced learn gitter: https://gitter.im/scikit-learn-contrib/imbalanced-learn 13 | ** 14 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/usage-question.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Usage question 3 | about: If you have a usage question 4 | title: "[SO]" 5 | labels: question 6 | assignees: '' 7 | 8 | --- 9 | 10 | ** If your issue is a usage question, submit it here instead:** 11 | - **The imbalanced learn gitter: https://gitter.im/scikit-learn-contrib/imbalanced-learn** 12 | - **StackOverflow with the imblearn (or imbalanced-learn) tag:https://stackoverflow.com/questions/tagged/imblearn** 13 | 14 | We are going to automatically close this issue if this is not link to a bug or an enhancement. 15 | -------------------------------------------------------------------------------- /.github/PULL_REQUEST_TEMPLATE.md: -------------------------------------------------------------------------------- 1 | 5 | #### Reference Issue 6 | 7 | 8 | 9 | #### What does this implement/fix? Explain your changes. 10 | 11 | 12 | #### Any other comments? 13 | 14 | 15 | 25 | -------------------------------------------------------------------------------- /.github/check-changelog.yml: -------------------------------------------------------------------------------- 1 | name: Check Changelog 2 | # This check makes sure that the changelog is properly updated 3 | # when a PR introduces a change in a test file. 4 | # To bypass this check, label the PR with "No Changelog Needed". 
5 | on: 6 | pull_request: 7 | types: [opened, edited, labeled, unlabeled, synchronize] 8 | 9 | jobs: 10 | check: 11 | name: A reviewer will let you know if it is required or can be bypassed 12 | runs-on: ubuntu-latest 13 | if: ${{ contains(github.event.pull_request.labels.*.name, 'No Changelog Needed') == 0 }} 14 | steps: 15 | - name: Get PR number and milestone 16 | run: | 17 | echo "PR_NUMBER=${{ github.event.pull_request.number }}" >> $GITHUB_ENV 18 | echo "TAGGED_MILESTONE=${{ github.event.pull_request.milestone.title }}" >> $GITHUB_ENV 19 | - uses: actions/checkout@v4 20 | with: 21 | fetch-depth: '0' 22 | - name: Check the changelog entry 23 | run: | 24 | set -xe 25 | changed_files=$(git diff --name-only origin/main) 26 | # Changelog should be updated only if tests have been modified 27 | if [[ ! "$changed_files" =~ tests ]] 28 | then 29 | exit 0 30 | fi 31 | all_changelogs=$(cat ./doc/whats_new/v*.rst) 32 | if [[ "$all_changelogs" =~ :pr:\`$PR_NUMBER\` ]] 33 | then 34 | echo "Changelog has been updated." 35 | # If the pull request is milestoned, check the corresponding changelog 36 | if [ -f ./doc/whats_new/v${TAGGED_MILESTONE:0:4}.rst ] 37 | then 38 | expected_changelog=$(cat ./doc/whats_new/v${TAGGED_MILESTONE:0:4}.rst) 39 | if [[ "$expected_changelog" =~ :pr:\`$PR_NUMBER\` ]] 40 | then 41 | echo "Changelog and milestone correspond." 42 | else 43 | echo "Changelog and milestone do not correspond." 44 | echo "If you see this error make sure that the tagged milestone for the PR" 45 | echo "and the edited changelog filename properly match." 46 | exit 1 47 | fi 48 | fi 49 | else 50 | echo "A Changelog entry is missing." 51 | echo "" 52 | echo "Please add an entry to the changelog at 'doc/whats_new/v*.rst'" 53 | echo "to document your change assuming that the PR will be merged" 54 | echo "in time for the next release of imbalanced-learn." 55 | echo "" 56 | echo "Look at other entries in that file for inspiration and please" 57 | echo "reference this pull request using the ':pr:' directive and" 58 | echo "credit yourself (and other contributors if applicable) with" 59 | echo "the ':user:' directive." 60 | echo "" 61 | echo "If you see this error and there is already a changelog entry," 62 | echo "check that the PR number is correct." 63 | echo "" 64 | echo "If you believe that this PR does not warrant a changelog" 65 | echo "entry, say so in a comment so that a maintainer will label" 66 | echo "the PR with 'No Changelog Needed' to bypass this check." 67 | exit 1 68 | fi 69 | -------------------------------------------------------------------------------- /.github/dependabot.yml: -------------------------------------------------------------------------------- 1 | version: 2 2 | updates: 3 | # Maintain dependencies for GitHub Actions as recommended in SPEC8: 4 | # https://github.com/scientific-python/specs/pull/325 5 | # At the time of writing, release critical workflows such as 6 | # pypa/gh-action-pypi-publish should use hash-based versioning for security 7 | # reasons. This strategy may be generalized to all other github actions 8 | # in the future.
9 | - package-ecosystem: "github-actions" 10 | directory: "/" 11 | schedule: 12 | interval: "weekly" 13 | groups: 14 | actions: 15 | patterns: 16 | - "*" 17 | reviewers: 18 | - "glemaitre" 19 | -------------------------------------------------------------------------------- /.github/workflows/circleci-artifacts-redirector.yml: -------------------------------------------------------------------------------- 1 | name: CircleCI artifacts redirector 2 | 3 | on: [status] 4 | 5 | # Restrict the permissions granted to the use of secrets.GITHUB_TOKEN in this 6 | # github actions workflow: 7 | # https://docs.github.com/en/actions/security-guides/automatic-token-authentication 8 | permissions: 9 | statuses: write 10 | 11 | jobs: 12 | circleci_artifacts_redirector_job: 13 | runs-on: ubuntu-latest 14 | # For testing this action on a fork, remove the "github.repository =="" condition. 15 | if: "github.repository == 'scikit-learn-contrib/imbalanced-learn' && github.event.context == 'ci/circleci: doc'" 16 | name: Run CircleCI artifacts redirector 17 | steps: 18 | - name: GitHub Action step 19 | uses: scientific-python/circleci-artifacts-redirector-action@v1 20 | with: 21 | repo-token: ${{ secrets.GITHUB_TOKEN }} 22 | api-token: ${{ secrets.CIRCLE_CI }} 23 | artifact-path: 0/doc/index.html 24 | circleci-jobs: doc 25 | job-title: Check the rendered docs here! 26 | -------------------------------------------------------------------------------- /.github/workflows/linters.yml: -------------------------------------------------------------------------------- 1 | name: Run code format checks 2 | 3 | on: 4 | push: 5 | branches: 6 | - "main" 7 | pull_request: 8 | branches: 9 | - '*' 10 | 11 | jobs: 12 | run-pre-commit-checks: 13 | name: Run pre-commit checks 14 | runs-on: ubuntu-latest 15 | 16 | steps: 17 | - uses: actions/checkout@v4 18 | - uses: prefix-dev/setup-pixi@v0.8.8 19 | with: 20 | pixi-version: v0.39.2 21 | frozen: true 22 | 23 | - name: Run tests 24 | run: pixi run -e linters linters 25 | -------------------------------------------------------------------------------- /.github/workflows/tests.yml: -------------------------------------------------------------------------------- 1 | name: 'tests' 2 | 3 | on: 4 | push: 5 | branches: 6 | - "main" 7 | pull_request: 8 | branches: 9 | - '*' 10 | 11 | jobs: 12 | test: 13 | strategy: 14 | matrix: 15 | os: [windows-latest, ubuntu-latest, macos-latest] 16 | environment: [ 17 | ci-py310-min-dependencies, 18 | ci-py310-min-optional-dependencies, 19 | ci-py310-min-keras, 20 | ci-py310-min-tensorflow, 21 | ci-py311-sklearn-1-4, 22 | ci-py311-sklearn-1-5, 23 | ci-py311-latest-keras, 24 | ci-py311-latest-tensorflow, 25 | ci-py313-latest-dependencies, 26 | ci-py313-latest-optional-dependencies, 27 | ] 28 | exclude: 29 | - os: windows-latest 30 | environment: ci-py310-min-keras 31 | - os: windows-latest 32 | environment: ci-py310-min-tensorflow 33 | - os: windows-latest 34 | environment: ci-py311-latest-keras 35 | - os: windows-latest 36 | environment: ci-py311-latest-tensorflow 37 | runs-on: ${{ matrix.os }} 38 | steps: 39 | - uses: actions/checkout@v4 40 | - uses: prefix-dev/setup-pixi@v0.8.8 41 | with: 42 | pixi-version: v0.39.2 43 | environments: ${{ matrix.environment }} 44 | # we can freeze the environment and manually bump the dependencies to the 45 | # latest version time to time. 
46 | frozen: true 47 | 48 | - name: Run tests 49 | run: pixi run -e ${{ matrix.environment }} tests -n 3 50 | 51 | - name: Upload coverage reports to Codecov 52 | uses: codecov/codecov-action@v5.4.2 53 | with: 54 | token: ${{ secrets.CODECOV_TOKEN }} 55 | slug: scikit-learn-contrib/imbalanced-learn 56 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | 5 | # C extensions 6 | *.so 7 | 8 | # Distribution / packaging 9 | .Python 10 | env/ 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | *.egg-info/ 23 | .installed.cfg 24 | *.egg 25 | Pipfile 26 | Pipfile.lock 27 | 28 | # PyInstaller 29 | # Usually these files are written by a python script from a template 30 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 31 | *.manifest 32 | *.spec 33 | 34 | # Installer logs 35 | pip-log.txt 36 | pip-delete-this-directory.txt 37 | 38 | # Unit test / coverage reports 39 | htmlcov/ 40 | .tox/ 41 | .coverage 42 | .coverage.* 43 | .cache 44 | nosetests.xml 45 | coverage.xml 46 | *,cover 47 | .pytest_cache/ 48 | 49 | # Translations 50 | *.mo 51 | *.pot 52 | 53 | # Django stuff: 54 | *.log 55 | 56 | # Sphinx documentation 57 | docs/_build/ 58 | 59 | # PyBuilder 60 | target/ 61 | 62 | # vim 63 | *.swp 64 | 65 | # emacs 66 | *~ 67 | 68 | # Visual Studio 69 | *.sln 70 | *.pyproj 71 | *.suo 72 | *.vs 73 | .vscode/ 74 | 75 | # PyCharm 76 | .idea/ 77 | 78 | # Cython 79 | *.pyc 80 | *.pyo 81 | __pycache__ 82 | *.so 83 | *.o 84 | 85 | *.egg 86 | *.egg-info 87 | 88 | Cython/Compiler/*.c 89 | Cython/Plex/*.c 90 | Cython/Runtime/refnanny.c 91 | Cython/Tempita/*.c 92 | Cython/*.c 93 | 94 | Tools/*.elc 95 | 96 | /TEST_TMP/ 97 | /build/ 98 | /wheelhouse*/ 99 | !tests/build/ 100 | /dist/ 101 | .gitrev 102 | .coverage 103 | *.orig 104 | *.rej 105 | *.dep 106 | *.swp 107 | *~ 108 | 109 | .ipynb_checkpoints 110 | docs/build 111 | 112 | tags 113 | TAGS 114 | MANIFEST 115 | 116 | .tox 117 | 118 | cythonize.dat 119 | 120 | # build documentation 121 | doc/_build/ 122 | doc/auto_examples/ 123 | doc/generated/ 124 | doc/references/generated/ 125 | doc/bibtex/auto 126 | doc/min_dependency_table.rst 127 | 128 | # MacOS 129 | .DS_Store 130 | 131 | # Pixi folder 132 | .pixi/ 133 | 134 | # Generated files 135 | doc/min_dependency_substitutions.rst 136 | doc/sg_execution_times.rst 137 | -------------------------------------------------------------------------------- /.pre-commit-config.yaml: -------------------------------------------------------------------------------- 1 | repos: 2 | - repo: https://github.com/pre-commit/pre-commit-hooks 3 | rev: v4.3.0 4 | hooks: 5 | - id: check-yaml 6 | - id: end-of-file-fixer 7 | - id: trailing-whitespace 8 | - repo: https://github.com/astral-sh/ruff-pre-commit 9 | # Ruff version. 
10 | rev: v0.4.8 11 | hooks: 12 | - id: ruff 13 | args: ["--fix", "--output-format=full"] 14 | - repo: https://github.com/psf/black 15 | rev: 23.3.0 16 | hooks: 17 | - id: black 18 | -------------------------------------------------------------------------------- /AUTHORS.rst: -------------------------------------------------------------------------------- 1 | History 2 | ------- 3 | 4 | Development lead 5 | ~~~~~~~~~~~~~~~~ 6 | 7 | The project was started in August 2014 by Fernando Nogueira and initially focused on the SMOTE implementation. 8 | Together with Guillaume Lemaitre, Dayvid Victor, and Christos Aridas, additional under-sampling and over-sampling methods were implemented, along with major API changes to make the toolbox fully compatible with scikit-learn_. 9 | 10 | Contributors 11 | ------------ 12 | 13 | Refer to the GitHub contributors page_. 14 | 15 | .. _scikit-learn: http://scikit-learn.org 16 | .. _page: https://github.com/scikit-learn-contrib/imbalanced-learn/graphs/contributors 17 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | The MIT License (MIT) 2 | 3 | Copyright (c) 2014-2020 The imbalanced-learn developers. 4 | All rights reserved. 5 | 6 | Permission is hereby granted, free of charge, to any person obtaining a copy 7 | of this software and associated documentation files (the "Software"), to deal 8 | in the Software without restriction, including without limitation the rights 9 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 10 | copies of the Software, and to permit persons to whom the Software is 11 | furnished to do so, subject to the following conditions: 12 | 13 | The above copyright notice and this permission notice shall be included in all 14 | copies or substantial portions of the Software. 15 | 16 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 17 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 18 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 19 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 20 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 21 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 22 | SOFTWARE. 23 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | 2 | recursive-include doc * 3 | recursive-include examples * 4 | include AUTHORS.rst 5 | include CONTRIBUTING.md 6 | include LICENSE 7 | include README.rst 8 | -------------------------------------------------------------------------------- /build_tools/circle/build_doc.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | set -x 3 | set -e 4 | 5 | # Decide what kind of documentation build to run, and run it. 6 | # 7 | # If the last commit message has a "[doc skip]" marker, do not build 8 | # the doc. Conversely, if a "[doc build]" marker is found, build the doc 9 | # instead of relying on the subsequent rules. 10 | # 11 | # We always build the documentation for jobs that are not related to a specific 12 | # PR (e.g. a merge to master or a maintenance branch).
13 | # 14 | # If this is a PR, do a full build if there are some files in this PR that are 15 | # under the "doc/" or "examples/" folders, otherwise perform a quick build. 16 | # 17 | # If the inspection of the current commit fails for any reason, the default 18 | # behavior is to quick build the documentation. 19 | 20 | get_build_type() { 21 | if [ -z "$CIRCLE_SHA1" ] 22 | then 23 | echo SKIP: undefined CIRCLE_SHA1 24 | return 25 | fi 26 | commit_msg=$(git log --format=%B -n 1 $CIRCLE_SHA1) 27 | if [ -z "$commit_msg" ] 28 | then 29 | echo QUICK BUILD: failed to inspect commit $CIRCLE_SHA1 30 | return 31 | fi 32 | if [[ "$commit_msg" =~ \[doc\ skip\] ]] 33 | then 34 | echo SKIP: [doc skip] marker found 35 | return 36 | fi 37 | if [[ "$commit_msg" =~ \[doc\ quick\] ]] 38 | then 39 | echo QUICK: [doc quick] marker found 40 | return 41 | fi 42 | if [[ "$commit_msg" =~ \[doc\ build\] ]] 43 | then 44 | echo BUILD: [doc build] marker found 45 | return 46 | fi 47 | if [ -z "$CI_PULL_REQUEST" ] 48 | then 49 | echo BUILD: not a pull request 50 | return 51 | fi 52 | git_range="origin/master...$CIRCLE_SHA1" 53 | git fetch origin master >&2 || (echo QUICK BUILD: failed to get changed filenames for $git_range; return) 54 | filenames=$(git diff --name-only $git_range) 55 | if [ -z "$filenames" ] 56 | then 57 | echo QUICK BUILD: no changed filenames for $git_range 58 | return 59 | fi 60 | if echo "$filenames" | grep -q -e ^examples/ 61 | then 62 | echo BUILD: detected examples/ filename modified in $git_range: $(echo "$filenames" | grep -e ^examples/ | head -n1) 63 | return 64 | fi 65 | echo QUICK BUILD: no examples/ filename modified in $git_range: 66 | echo "$filenames" 67 | } 68 | 69 | build_type=$(get_build_type) 70 | if [[ "$build_type" =~ ^SKIP ]] 71 | then 72 | exit 0 73 | fi 74 | 75 | # deactivate circleci virtualenv and setup a miniconda env instead 76 | if [[ `type -t deactivate` ]]; then 77 | deactivate 78 | fi 79 | 80 | # Install pixi 81 | curl -fsSL https://pixi.sh/install.sh | bash 82 | export PATH=/home/circleci/.pixi/bin:$PATH 83 | 84 | # The pipefail is requested to propagate exit code 85 | set -o pipefail && pixi run --frozen -e docs build-docs | tee ~/log.txt 86 | set +o pipefail 87 | -------------------------------------------------------------------------------- /build_tools/circle/checkout_merge_commit.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Add `master` branch to the update list. 4 | # Otherwise CircleCI will give us a cached one. 5 | FETCH_REFS="+master:master" 6 | 7 | # Update PR refs for testing. 8 | if [[ -n "${CIRCLE_PR_NUMBER}" ]] 9 | then 10 | FETCH_REFS="${FETCH_REFS} +refs/pull/${CIRCLE_PR_NUMBER}/head:pr/${CIRCLE_PR_NUMBER}/head" 11 | FETCH_REFS="${FETCH_REFS} +refs/pull/${CIRCLE_PR_NUMBER}/merge:pr/${CIRCLE_PR_NUMBER}/merge" 12 | fi 13 | 14 | # Retrieve the refs. 15 | git fetch -u origin ${FETCH_REFS} 16 | 17 | # Checkout the PR merge ref. 18 | if [[ -n "${CIRCLE_PR_NUMBER}" ]] 19 | then 20 | git checkout -qf "pr/${CIRCLE_PR_NUMBER}/merge" || ( 21 | echo Could not fetch merge commit. >&2 22 | echo There may be conflicts in merging PR \#${CIRCLE_PR_NUMBER} with master. >&2; 23 | exit 1) 24 | fi 25 | 26 | # Check for merge conflicts. 
27 | if [[ -n "${CIRCLE_PR_NUMBER}" ]] 28 | then 29 | git branch --merged | grep master > /dev/null 30 | git branch --merged | grep "pr/${CIRCLE_PR_NUMBER}/head" > /dev/null 31 | fi 32 | -------------------------------------------------------------------------------- /build_tools/circle/push_doc.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # This script is meant to be called in the "deploy" step defined in 3 | # circle.yml. See https://circleci.com/docs/ for more details. 4 | # The behavior of the script is controlled by environment variables defined 5 | # in the circle.yml in the top level folder of the project. 6 | 7 | GENERATED_DOC_DIR=$1 8 | 9 | if [[ -z "$GENERATED_DOC_DIR" ]]; then 10 | echo "Need to pass the directory of the generated doc as argument" 11 | echo "Usage: $0 <generated_doc_dir>" 12 | exit 1 13 | fi 14 | 15 | # Absolute path needed because we use cd further down in this script 16 | GENERATED_DOC_DIR=$(readlink -f $GENERATED_DOC_DIR) 17 | 18 | if [ "$CIRCLE_BRANCH" = "master" ] 19 | then 20 | dir=dev 21 | else 22 | # Strip off .X 23 | dir="${CIRCLE_BRANCH::-2}" 24 | fi 25 | 26 | MSG="Pushing the docs to $dir/ for branch: $CIRCLE_BRANCH, commit $CIRCLE_SHA1" 27 | 28 | cd $HOME 29 | if [ ! -d $DOC_REPO ]; 30 | then git clone --depth 1 --no-checkout -b master "git@github.com:"$ORGANIZATION"/"$DOC_REPO".git"; 31 | fi 32 | cd $DOC_REPO 33 | git config core.sparseCheckout true 34 | echo $dir > .git/info/sparse-checkout 35 | git checkout master 36 | git reset --hard origin/master 37 | git rm -rf $dir/ && rm -rf $dir/ 38 | cp -R $GENERATED_DOC_DIR $dir 39 | touch $dir/.nojekyll 40 | git config --global user.email $EMAIL 41 | git config --global user.name $USERNAME 42 | git config --global push.default matching 43 | git add -f $dir/ 44 | git commit -m "$MSG" $dir 45 | git push origin master 46 | 47 | echo $MSG 48 | -------------------------------------------------------------------------------- /conftest.py: -------------------------------------------------------------------------------- 1 | # This file is here so that when running from the root folder 2 | # ./imblearn is added to sys.path by pytest. 3 | # See https://docs.pytest.org/en/latest/pythonpath.html for more details. 4 | # For example, this allows one to build extensions in place and run pytest 5 | # doc/modules/clustering.rst and use imblearn from the local folder 6 | # rather than the one from site-packages.
7 | 8 | import os 9 | 10 | import numpy as np 11 | import pytest 12 | from sklearn.utils.fixes import parse_version 13 | 14 | # use legacy numpy print options to avoid failures due to NumPy 2.+ scalar 15 | # representation 16 | if parse_version(np.__version__) > parse_version("2.0.0"): 17 | np.set_printoptions(legacy="1.25") 18 | 19 | 20 | def pytest_runtest_setup(item): 21 | fname = item.fspath.strpath 22 | if ( 23 | fname.endswith(os.path.join("keras", "_generator.py")) 24 | or fname.endswith(os.path.join("tensorflow", "_generator.py")) 25 | or fname.endswith("miscellaneous.rst") 26 | ): 27 | try: 28 | import tensorflow # noqa 29 | except ImportError: 30 | pytest.skip("The tensorflow package is not installed.") 31 | -------------------------------------------------------------------------------- /doc/_static/css/imbalanced-learn.css: -------------------------------------------------------------------------------- 1 | @import url("theme.css"); 2 | 3 | .highlight a { 4 | text-decoration: underline; 5 | } 6 | 7 | .deprecated p { 8 | padding: 10px 7px 10px 10px; 9 | color: #b94a48; 10 | background-color: #f3e5e5; 11 | border: 1px solid #eed3d7; 12 | } 13 | 14 | .deprecated p span.versionmodified { 15 | font-weight: bold; 16 | } 17 | 18 | .wy-nav-content { 19 | max-width: 1200px !important; 20 | } 21 | 22 | /* Override some aspects of the pydata-sphinx-theme */ 23 | 24 | /* Main index page overview cards */ 25 | 26 | .intro-card { 27 | padding: 30px 10px 20px 10px; 28 | } 29 | 30 | .intro-card .sd-card-img-top { 31 | margin: 10px; 32 | height: 52px; 33 | background: none !important; 34 | } 35 | 36 | .intro-card .sd-card-title { 37 | color: var(--pst-color-primary); 38 | font-size: var(--pst-font-size-h5); 39 | padding: 1rem 0rem 0.5rem 0rem; 40 | } 41 | 42 | .intro-card .sd-card-footer { 43 | border: none !important; 44 | } 45 | 46 | .intro-card .sd-card-footer p.sd-card-text { 47 | max-width: 220px; 48 | margin-left: auto; 49 | margin-right: auto; 50 | } 51 | 52 | .intro-card .sd-btn-secondary { 53 | background-color: #6c757d !important; 54 | border-color: #6c757d !important; 55 | } 56 | 57 | .intro-card .sd-btn-secondary:hover { 58 | background-color: #5a6268 !important; 59 | border-color: #545b62 !important; 60 | } 61 | 62 | .card, .card img { 63 | background-color: var(--pst-color-background); 64 | } 65 | -------------------------------------------------------------------------------- /doc/_static/img/favicon.ico: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/scikit-learn-contrib/imbalanced-learn/c975dd68f50beb88e1331cdd47780acd4fa95397/doc/_static/img/favicon.ico -------------------------------------------------------------------------------- /doc/_static/img/logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/scikit-learn-contrib/imbalanced-learn/c975dd68f50beb88e1331cdd47780acd4fa95397/doc/_static/img/logo.png -------------------------------------------------------------------------------- /doc/_static/img/logo.xcf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/scikit-learn-contrib/imbalanced-learn/c975dd68f50beb88e1331cdd47780acd4fa95397/doc/_static/img/logo.xcf -------------------------------------------------------------------------------- /doc/_static/img/logo_wide.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/scikit-learn-contrib/imbalanced-learn/c975dd68f50beb88e1331cdd47780acd4fa95397/doc/_static/img/logo_wide.png -------------------------------------------------------------------------------- /doc/_static/img/logo_wide_dark.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/scikit-learn-contrib/imbalanced-learn/c975dd68f50beb88e1331cdd47780acd4fa95397/doc/_static/img/logo_wide_dark.png -------------------------------------------------------------------------------- /doc/_static/index_api.svg: -------------------------------------------------------------------------------- (SVG icon used for the "API reference" card on the documentation landing page; the XML markup was stripped in this dump.) -------------------------------------------------------------------------------- /doc/_static/index_examples.svg: -------------------------------------------------------------------------------- (SVG icon used for the "Examples" card on the documentation landing page; the XML markup was stripped in this dump.) -------------------------------------------------------------------------------- /doc/_static/index_getting_started.svg: -------------------------------------------------------------------------------- (SVG icon used for the "Getting started" card on the documentation landing page; the XML markup was stripped in this dump.) -------------------------------------------------------------------------------- /doc/_static/js/copybutton.js: -------------------------------------------------------------------------------- 1 | $(document).ready(function() { 2 | /* Add a [>>>] button on the top-right corner of code samples to hide 3 | * the >>> and ... prompts and the output and thus make the code 4 | * copyable.
*/ 5 | var div = $('.highlight-python .highlight,' + 6 | '.highlight-python3 .highlight,' + 7 | '.highlight-pycon .highlight,' + 8 | '.highlight-default .highlight') 9 | var pre = div.find('pre'); 10 | 11 | // get the styles from the current theme 12 | pre.parent().parent().css('position', 'relative'); 13 | var hide_text = 'Hide the prompts and output'; 14 | var show_text = 'Show the prompts and output'; 15 | var border_width = pre.css('border-top-width'); 16 | var border_style = pre.css('border-top-style'); 17 | var border_color = pre.css('border-top-color'); 18 | var button_styles = { 19 | 'cursor':'pointer', 'position': 'absolute', 'top': '0', 'right': '0', 20 | 'border-color': border_color, 'border-style': border_style, 21 | 'border-width': border_width, 'color': border_color, 'text-size': '75%', 22 | 'font-family': 'monospace', 'padding-left': '0.2em', 'padding-right': '0.2em', 23 | 'border-radius': '0 3px 0 0' 24 | } 25 | 26 | // create and add the button to all the code blocks that contain >>> 27 | div.each(function(index) { 28 | var jthis = $(this); 29 | if (jthis.find('.gp').length > 0) { 30 | var button = $('<span class="copybutton">&gt;&gt;&gt;</span>'); 31 | button.css(button_styles); 32 | button.attr('title', hide_text); 33 | button.data('hidden', 'false'); 34 | jthis.prepend(button); 35 | } 36 | // tracebacks (.gt) contain bare text elements that need to be 37 | // wrapped in a span to work with .nextUntil() (see later) 38 | jthis.find('pre:has(.gt)').contents().filter(function() { 39 | return ((this.nodeType == 3) && (this.data.trim().length > 0)); 40 | }).wrap('<span>'); 41 | }); 42 | 43 | // define the behavior of the button when it's clicked 44 | $('.copybutton').click(function(e){ 45 | e.preventDefault(); 46 | var button = $(this); 47 | if (button.data('hidden') === 'false') { 48 | // hide the code output 49 | button.parent().find('.go, .gp, .gt').hide(); 50 | button.next('pre').find('.gt').nextUntil('.gp, .go').css('visibility', 'hidden'); 51 | button.css('text-decoration', 'line-through'); 52 | button.attr('title', show_text); 53 | button.data('hidden', 'true'); 54 | } else { 55 | // show the code output 56 | button.parent().find('.go, .gp, .gt').show(); 57 | button.next('pre').find('.gt').nextUntil('.gp, .go').css('visibility', 'visible'); 58 | button.css('text-decoration', 'none'); 59 | button.attr('title', hide_text); 60 | button.data('hidden', 'false'); 61 | } 62 | }); 63 | }); -------------------------------------------------------------------------------- /doc/_templates/class.rst: -------------------------------------------------------------------------------- 1 | {{objname}} 2 | {{ underline }}============== 3 | 4 | .. currentmodule:: {{ module }} 5 | 6 | .. autoclass:: {{ objname }} 7 | 8 | {% block methods %} 9 | 10 | {% if methods %} 11 | .. rubric:: Methods 12 | 13 | .. autosummary:: 14 | {% for item in methods %} 15 | {% if '__init__' not in item %} 16 | ~{{ name }}.{{ item }} 17 | {% endif %} 18 | {%- endfor %} 19 | {% endif %} 20 | {% endblock %} 21 | 22 | .. include:: {{module}}.{{objname}}.examples 23 | 24 | .. raw:: html 25 | 26 |
27 | -------------------------------------------------------------------------------- /doc/_templates/function.rst: -------------------------------------------------------------------------------- 1 | {{objname}} 2 | {{ underline }}==================== 3 | 4 | .. currentmodule:: {{ module }} 5 | 6 | .. autofunction:: {{ objname }} 7 | 8 | .. include:: {{module}}.{{objname}}.examples 9 | 10 | .. raw:: html 11 | 12 |
13 | -------------------------------------------------------------------------------- /doc/_templates/numpydoc_docstring.rst: -------------------------------------------------------------------------------- 1 | {{index}} 2 | {{summary}} 3 | {{extended_summary}} 4 | {{parameters}} 5 | {{returns}} 6 | {{yields}} 7 | {{other_parameters}} 8 | {{attributes}} 9 | {{raises}} 10 | {{warns}} 11 | {{warnings}} 12 | {{see_also}} 13 | {{notes}} 14 | {{references}} 15 | {{examples}} 16 | {{methods}} 17 | -------------------------------------------------------------------------------- /doc/_templates/sidebar-search-bs.html: -------------------------------------------------------------------------------- (Sidebar search-box template; the HTML markup was stripped in this dump.) -------------------------------------------------------------------------------- /doc/about.rst: -------------------------------------------------------------------------------- 1 | About us 2 | ======== 3 | 4 | .. include:: ../AUTHORS.rst 5 | 6 | .. _citing-imbalanced-learn: 7 | 8 | Citing imbalanced-learn 9 | ----------------------- 10 | 11 | If you use imbalanced-learn in a scientific publication, we would appreciate 12 | citations to the following paper:: 13 | 14 | @article{JMLR:v18:16-365, 15 | author = {Guillaume Lema{{\^i}}tre and Fernando Nogueira and Christos K. Aridas}, 16 | title = {Imbalanced-learn: A Python Toolbox to Tackle the Curse of Imbalanced Datasets in Machine Learning}, 17 | journal = {Journal of Machine Learning Research}, 18 | year = {2017}, 19 | volume = {18}, 20 | number = {17}, 21 | pages = {1-5}, 22 | url = {http://jmlr.org/papers/v18/16-365.html} 23 | } 24 | -------------------------------------------------------------------------------- /doc/combine.rst: -------------------------------------------------------------------------------- 1 | .. _combine: 2 | 3 | ======================================= 4 | Combination of over- and under-sampling 5 | ======================================= 6 | 7 | .. currentmodule:: imblearn.over_sampling 8 | 9 | We previously presented :class:`SMOTE` and showed that this method can generate 10 | noisy samples by interpolating new points between marginal outliers and 11 | inliers. This issue can be solved by cleaning the space resulting 12 | from over-sampling. 13 | 14 | .. currentmodule:: imblearn.combine 15 | 16 | In this regard, Tomek's link and edited nearest-neighbours are the two cleaning 17 | methods that can be applied after SMOTE over-sampling 18 | to obtain a cleaner space. The two ready-to-use classes imbalanced-learn 19 | implements for combining over- and under-sampling methods are: (i) 20 | :class:`SMOTETomek` :cite:`batista2004study` and (ii) :class:`SMOTEENN` 21 | :cite:`batista2003balancing`. 22 | 23 | Those two classes can be used like any other sampler, with parameters identical 24 | to those of their underlying samplers:: 25 | 26 | >>> from collections import Counter 27 | >>> from sklearn.datasets import make_classification 28 | >>> X, y = make_classification(n_samples=5000, n_features=2, n_informative=2, 29 | ... n_redundant=0, n_repeated=0, n_classes=3, 30 | ... n_clusters_per_class=1, 31 | ... weights=[0.01, 0.05, 0.94], 32 | ... class_sep=0.8, random_state=0) 33 | >>> print(sorted(Counter(y).items())) 34 | [(0, 64), (1, 262), (2, 4674)] 35 | >>> from imblearn.combine import SMOTEENN 36 | >>> smote_enn = SMOTEENN(random_state=0) 37 | >>> X_resampled, y_resampled = smote_enn.fit_resample(X, y) 38 | >>> print(sorted(Counter(y_resampled).items())) 39 | [(0, 4060), (1, 4381), (2, 3502)] 40 | >>> from imblearn.combine import SMOTETomek 41 | >>> smote_tomek = SMOTETomek(random_state=0) 42 | >>> X_resampled, y_resampled = smote_tomek.fit_resample(X, y) 43 | >>> print(sorted(Counter(y_resampled).items())) 44 | [(0, 4499), (1, 4566), (2, 4413)] 45 | 46 | We can also see in the example below that :class:`SMOTEENN` tends to clean more 47 | noisy samples than :class:`SMOTETomek`. 48 | 49 | .. image:: ./auto_examples/combine/images/sphx_glr_plot_comparison_combine_001.png 50 | :target: ./auto_examples/combine/plot_comparison_combine.html 51 | :scale: 60 52 | :align: center 53 | 54 | .. topic:: Examples 55 | 56 | * :ref:`sphx_glr_auto_examples_combine_plot_comparison_combine.py` 57 |
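Both classes also accept the underlying samplers as parameters (``smote`` and
``enn`` for :class:`SMOTEENN`; ``smote`` and ``tomek`` for :class:`SMOTETomek`),
so the over-sampling and cleaning steps can be tuned independently. A minimal
sketch, where the sub-sampler settings are purely illustrative::

    from imblearn.combine import SMOTEENN
    from imblearn.over_sampling import SMOTE
    from imblearn.under_sampling import EditedNearestNeighbours

    smote_enn = SMOTEENN(
        # interpolate using 3 nearest neighbours instead of the default 5
        smote=SMOTE(k_neighbors=3, random_state=0),
        # clean using a majority-vote ("mode") selection strategy in ENN
        enn=EditedNearestNeighbours(kind_sel="mode"),
        random_state=0,
    )
    X_resampled, y_resampled = smote_enn.fit_resample(X, y)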
-------------------------------------------------------------------------------- /doc/index.rst: -------------------------------------------------------------------------------- 1 | .. project-template documentation master file, created by 2 | sphinx-quickstart on Mon Jan 18 14:44:12 2016. 3 | You can adapt this file completely to your liking, but it should at least 4 | contain the root `toctree` directive. 5 | 6 | :notoc: 7 | 8 | ############################## 9 | imbalanced-learn documentation 10 | ############################## 11 | 12 | **Date**: |today| **Version**: |version| 13 | 14 | **Useful links**: 15 | `Binary Installers `__ | 16 | `Source Repository `__ | 17 | `Issues & Ideas `__ | 18 | `Q&A Support `__ 19 | 20 | Imbalanced-learn (imported as :mod:`imblearn`) is an open source, MIT-licensed 21 | library that relies on scikit-learn (imported as :mod:`sklearn`) and provides 22 | tools for dealing with classification with imbalanced classes. 23 | 24 | .. grid:: 1 2 2 2 25 | :gutter: 4 26 | :padding: 2 2 0 0 27 | :class-container: sd-text-center 28 | 29 | .. grid-item-card:: Getting started 30 | :img-top: _static/index_getting_started.svg 31 | :class-card: intro-card 32 | :shadow: md 33 | 34 | Check out the getting started guides to install `imbalanced-learn`. 35 | Some extra information to get started with a new contribution is also provided. 36 | 37 | +++ 38 | 39 | .. button-ref:: getting_started 40 | :ref-type: ref 41 | :click-parent: 42 | :color: secondary 43 | :expand: 44 | 45 | To the installation guide 46 | 47 | .. grid-item-card:: User guide 48 | :img-top: _static/index_user_guide.svg 49 | :class-card: intro-card 50 | :shadow: md 51 | 52 | The user guide provides in-depth information on the key concepts of 53 | `imbalanced-learn` with useful background information and explanation. 54 | 55 | +++ 56 | 57 | .. button-ref:: user_guide 58 | :ref-type: ref 59 | :click-parent: 60 | :color: secondary 61 | :expand: 62 | 63 | To the user guide 64 | 65 | .. grid-item-card:: API reference 66 | :img-top: _static/index_api.svg 67 | :class-card: intro-card 68 | :shadow: md 69 | 70 | The reference guide contains a detailed description of 71 | the `imbalanced-learn` API and of the parameters of each method. 72 | 73 | +++ 74 | 75 | .. button-ref:: api 76 | :ref-type: ref 77 | :click-parent: 78 | :color: secondary 79 | :expand: 80 | 81 | To the reference guide 82 | 83 | .. grid-item-card:: Examples 84 | :img-top: _static/index_examples.svg 85 | :class-card: intro-card 86 | :shadow: md 87 | 88 | The gallery of examples is a good place to see `imbalanced-learn` in action. 89 | Select an example and dive in. 90 | 91 | +++ 92 | 93 | .. button-ref:: general_examples 94 | :ref-type: ref 95 | :click-parent: 96 | :color: secondary 97 | :expand: 98 | 99 | To the gallery of examples 100 | 101 | 102 | .. toctree:: 103 | :maxdepth: 3 104 | :hidden: 105 | :titlesonly: 106 | 107 | install 108 | user_guide 109 | references/index 110 | auto_examples/index 111 | whats_new 112 | about 113 |
-------------------------------------------------------------------------------- /doc/install.rst: -------------------------------------------------------------------------------- 1 | .. _getting_started: 2 | 3 | ############### 4 | Getting Started 5 | ############### 6 | 7 | Prerequisites 8 | ============= 9 | 10 | .. |PythonMinVersion| replace:: 3.10 11 | .. |NumPyMinVersion| replace:: 1.24.3 12 | .. |SciPyMinVersion| replace:: 1.10.1 13 | .. |ScikitLearnMinVersion| replace:: 1.3.2 14 | .. |MatplotlibMinVersion| replace:: 3.7.3 15 | .. |PandasMinVersion| replace:: 1.5.3 16 | .. |TensorflowMinVersion| replace:: 2.13.1 17 | .. |KerasMinVersion| replace:: 3.0.5 18 | .. |SeabornMinVersion| replace:: 0.12.2 19 | .. |PytestMinVersion| replace:: 7.2.2 20 | 21 | `imbalanced-learn` requires the following dependencies: 22 | 23 | - Python (>= |PythonMinVersion|) 24 | - NumPy (>= |NumPyMinVersion|) 25 | - SciPy (>= |SciPyMinVersion|) 26 | - Scikit-learn (>= |ScikitLearnMinVersion|) 27 | - Pytest (>= |PytestMinVersion|) 28 | 29 | Additionally, `imbalanced-learn` has the following optional dependencies: 30 | 31 | - Pandas (>= |PandasMinVersion|) for dealing with dataframes 32 | - Tensorflow (>= |TensorflowMinVersion|) for dealing with TensorFlow models 33 | - Keras (>= |KerasMinVersion|) for dealing with Keras models 34 | 35 | The examples require the following additional dependencies: 36 | 37 | - Matplotlib (>= |MatplotlibMinVersion|) 38 | - Seaborn (>= |SeabornMinVersion|) 39 | 40 | Install 41 | ======= 42 | 43 | From PyPI or conda-forge repositories 44 | ------------------------------------- 45 | 46 | imbalanced-learn is currently available on PyPI and you can 47 | install it via `pip`:: 48 | 49 | pip install imbalanced-learn 50 | 51 | The package is also released on conda-forge and you can install 52 | it with `conda` (or `mamba`):: 53 | 54 | conda install -c conda-forge imbalanced-learn 55 | 56 | Intel optimizations via scikit-learn-intelex 57 | -------------------------------------------- 58 | 59 | Imbalanced-learn relies entirely on scikit-learn algorithms. Intel provides an 60 | optimized version of scikit-learn for Intel hardware, called scikit-learn-intelex. 61 | Installing scikit-learn-intelex and patching scikit-learn will activate the 62 | Intel optimizations. 63 | 64 | You can refer to the following 65 | `blog post `_ 66 | for some benchmarks. 67 | 68 | Refer to the following documentation for instructions: 69 | 70 | - `Installation guide `_. 71 | - `Patching guide `_. 72 | 73 | From source available on GitHub 74 | ------------------------------- 75 | 76 | If you prefer, you can clone the repository and install the package from 77 | source. Use the following commands to get a copy from GitHub and install all dependencies:: 78 | 79 | git clone https://github.com/scikit-learn-contrib/imbalanced-learn.git 80 | cd imbalanced-learn 81 | pip install . 82 | 83 | Be aware that you can install in developer mode with:: 84 | 85 | pip install --no-build-isolation --editable . 86 | 87 | If you wish to make pull-requests on GitHub, we advise you to install 88 | pre-commit:: 89 | 90 | pip install pre-commit 91 | pre-commit install 92 | 93 | Test and coverage 94 | ================= 95 | 96 | If you want to test the code before installing it:: 97 | 98 | $ make test 99 | 100 | If you wish to test the coverage of your version:: 101 | 102 | $ make coverage 103 | 104 | You can also use `pytest`:: 105 | 106 | $ pytest imblearn -v 107 | 108 | Contribute 109 | ========== 110 | 111 | You can contribute to this code through pull requests on GitHub_. Please make 112 | sure that your code comes with unit tests to ensure full coverage and a 113 | passing continuous integration. 114 | 115 | .. _GitHub: https://github.com/scikit-learn-contrib/imbalanced-learn/pulls 116 |
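To quickly verify an installation (whichever method was used), you can import
the package and print its version and environment information; a small sanity
check before running the full test suite::

    import imblearn

    print(imblearn.__version__)
    # Print the versions of imbalanced-learn and its main dependencies.
    imblearn.show_versions()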
-------------------------------------------------------------------------------- /doc/introduction.rst: -------------------------------------------------------------------------------- 1 | .. _introduction: 2 | 3 | ============ 4 | Introduction 5 | ============ 6 | 7 | .. _api_imblearn: 8 | 9 | API of the imbalanced-learn samplers 10 | ------------------------------------ 11 | 12 | The available samplers follow the 13 | `scikit-learn API `_ 14 | using the base estimator 15 | and incorporating a sampling functionality via the ``fit_resample`` method: 16 | 17 | :Estimator: 18 | 19 | The base object implements a ``fit`` method to learn from data:: 20 | 21 | estimator = obj.fit(data, targets) 22 | 23 | :Resampler: 24 | 25 | To resample a data set, each sampler implements a ``fit_resample`` method:: 26 | 27 | data_resampled, targets_resampled = obj.fit_resample(data, targets) 28 | 29 | Imbalanced-learn samplers accept the same inputs as scikit-learn estimators: 30 | 31 | * `data`, 2-dimensional array-like structures, such as: 32 | * Python's list of lists :class:`list`, 33 | * Numpy arrays :class:`numpy.ndarray`, 34 | * Pandas dataframes :class:`pandas.DataFrame`, 35 | * Scipy sparse matrices :class:`scipy.sparse.csr_matrix` or :class:`scipy.sparse.csc_matrix`; 36 | 37 | * `targets`, 1-dimensional array-like structures, such as: 38 | * Numpy arrays :class:`numpy.ndarray`, 39 | * Pandas series :class:`pandas.Series`. 40 | 41 | The output will be of the following type: 42 | 43 | * `data_resampled`, 2-dimensional array-like structures, such as: 44 | * Numpy arrays :class:`numpy.ndarray`, 45 | * Pandas dataframes :class:`pandas.DataFrame`, 46 | * Scipy sparse matrices :class:`scipy.sparse.csr_matrix` or :class:`scipy.sparse.csc_matrix`; 47 | 48 | * `targets_resampled`, 1-dimensional array-like structures, such as: 49 | * Numpy arrays :class:`numpy.ndarray`, 50 | * Pandas series :class:`pandas.Series`. 51 | 52 | .. topic:: Pandas in/out 53 | 54 | Unlike scikit-learn, imbalanced-learn provides support for pandas in/out. 55 | Therefore, providing a dataframe as input will also return a dataframe as output. 56 | 57 | .. topic:: Sparse input 58 | 59 | For sparse input the data is **converted to the Compressed Sparse Rows 60 | representation** (see ``scipy.sparse.csr_matrix``) before being fed to the 61 | sampler. To avoid unnecessary memory copies, it is recommended to choose the 62 | CSR representation upstream. 63 |
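A minimal, self-contained sketch of this API, which also illustrates the pandas
in/out behaviour described above (the dataset and column names are purely
illustrative)::

    import pandas as pd
    from sklearn.datasets import make_classification
    from imblearn.under_sampling import RandomUnderSampler

    X, y = make_classification(n_samples=100, weights=[0.9, 0.1], random_state=0)
    X = pd.DataFrame(X, columns=[f"feature_{i}" for i in range(X.shape[1])])
    y = pd.Series(y, name="target")

    sampler = RandomUnderSampler(random_state=0)
    X_resampled, y_resampled = sampler.fit_resample(X, y)

    # The container type is preserved: a dataframe in gives a dataframe out.
    print(type(X_resampled))           # pandas DataFrame
    print(y_resampled.value_counts())  # classes are now equally represented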
64 | .. _problem_statement: 65 | 66 | Problem statement regarding imbalanced data sets 67 | ------------------------------------------------ 68 | 69 | The learning and prediction phases of machine learning algorithms 70 | can be impacted by the issue of **imbalanced datasets**. This imbalance 71 | refers to the difference in the number of samples across different classes. 72 | We demonstrate the effect of training a `Logistic Regression classifier 73 | `_ 74 | with varying levels of class balancing obtained by adjusting the class weights. 75 | 76 | .. image:: ./auto_examples/over-sampling/images/sphx_glr_plot_comparison_over_sampling_001.png 77 | :target: ./auto_examples/over-sampling/plot_comparison_over_sampling.html 78 | :scale: 60 79 | :align: center 80 | 81 | As expected, the decision function of the Logistic Regression classifier varies significantly 82 | depending on how imbalanced the data is. With a greater imbalance ratio, the decision function 83 | tends to favour the class with the larger number of samples, usually referred to as the 84 | **majority class**. 85 |
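The kind of comparison shown above can be sketched in a few lines with
scikit-learn alone; the dataset parameters below are purely illustrative::

    from sklearn.datasets import make_classification
    from sklearn.linear_model import LogisticRegression

    X, y = make_classification(n_samples=1000, weights=[0.95, 0.05], random_state=0)

    # Unweighted fit: the decision function is biased towards the majority class.
    clf = LogisticRegression().fit(X, y)

    # Weighting classes inversely to their frequencies counteracts this bias.
    clf_balanced = LogisticRegression(class_weight="balanced").fit(X, y)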
toctree:: 10 | :maxdepth: 3 11 | 12 | under_sampling 13 | over_sampling 14 | combine 15 | ensemble 16 | keras 17 | tensorflow 18 | miscellaneous 19 | pipeline 20 | metrics 21 | datasets 22 | utils 23 | -------------------------------------------------------------------------------- /doc/references/keras.rst: -------------------------------------------------------------------------------- 1 | .. _keras_ref: 2 | 3 | Batch generator for Keras 4 | ========================= 5 | 6 | .. automodule:: imblearn.keras 7 | :no-members: 8 | :no-inherited-members: 9 | 10 | .. currentmodule:: imblearn 11 | 12 | .. autosummary:: 13 | :toctree: generated/ 14 | :template: class.rst 15 | 16 | keras.BalancedBatchGenerator 17 | 18 | .. autosummary:: 19 | :toctree: generated/ 20 | :template: function.rst 21 | 22 | keras.balanced_batch_generator 23 | -------------------------------------------------------------------------------- /doc/references/metrics.rst: -------------------------------------------------------------------------------- 1 | .. _metrics_ref: 2 | 3 | Metrics 4 | ======= 5 | 6 | .. automodule:: imblearn.metrics 7 | :no-members: 8 | :no-inherited-members: 9 | 10 | Classification metrics 11 | ---------------------- 12 | See the :ref:`metrics` section of the user guide for further details. 13 | 14 | .. currentmodule:: imblearn.metrics 15 | 16 | .. autosummary:: 17 | :toctree: generated/ 18 | :template: function.rst 19 | 20 | classification_report_imbalanced 21 | sensitivity_specificity_support 22 | sensitivity_score 23 | specificity_score 24 | geometric_mean_score 25 | macro_averaged_mean_absolute_error 26 | make_index_balanced_accuracy 27 | 28 | Pairwise metrics 29 | ---------------- 30 | See the :ref:`pairwise_metrics` section of the user guide for further details. 31 | 32 | .. automodule:: imblearn.metrics.pairwise 33 | :no-members: 34 | :no-inherited-members: 35 | 36 | .. currentmodule:: imblearn.metrics.pairwise 37 | 38 | .. autosummary:: 39 | :toctree: generated/ 40 | :template: class.rst 41 | 42 | ValueDifferenceMetric 43 | -------------------------------------------------------------------------------- /doc/references/miscellaneous.rst: -------------------------------------------------------------------------------- 1 | .. _misc_ref: 2 | 3 | Miscellaneous 4 | ============= 5 | 6 | Imbalanced-learn provides some fast-prototyping tools. 7 | 8 | .. currentmodule:: imblearn 9 | 10 | .. autosummary:: 11 | :toctree: generated/ 12 | :template: class.rst 13 | 14 | FunctionSampler 15 | -------------------------------------------------------------------------------- /doc/references/over_sampling.rst: -------------------------------------------------------------------------------- 1 | .. _over_sampling_ref: 2 | 3 | Over-sampling methods 4 | ===================== 5 | 6 | .. automodule:: imblearn.over_sampling 7 | :no-members: 8 | :no-inherited-members: 9 | 10 | .. currentmodule:: imblearn.over_sampling 11 | 12 | Basic over-sampling 13 | ------------------- 14 | 15 | .. autosummary:: 16 | :toctree: generated/ 17 | :template: class.rst 18 | 19 | RandomOverSampler 20 | 21 | SMOTE algorithms 22 | ---------------- 23 | 24 | .. autosummary:: 25 | :toctree: generated/ 26 | :template: class.rst 27 | 28 | SMOTE 29 | SMOTENC 30 | SMOTEN 31 | ADASYN 32 | BorderlineSMOTE 33 | KMeansSMOTE 34 | SVMSMOTE 35 |
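All these samplers expose the common ``fit_resample`` API described in the
introduction. As a minimal sketch (the toy dataset and the choice of
:class:`SMOTE` are illustrative only)::

    from sklearn.datasets import make_classification

    from imblearn.over_sampling import SMOTE

    # toy dataset with a 9:1 class ratio
    X, y = make_classification(n_samples=1_000, weights=[0.9, 0.1], random_state=0)

    # with the default strategy, the minority class is over-sampled until
    # both classes contain the same number of samples
    X_resampled, y_resampled = SMOTE(random_state=0).fit_resample(X, y)
-------------------------------------------------------------------------------- /doc/references/pipeline.rst: -------------------------------------------------------------------------------- 1 | ..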
_pipeline_ref: 2 | 3 | Pipeline 4 | ======== 5 | 6 | .. automodule:: imblearn.pipeline 7 | :no-members: 8 | :no-inherited-members: 9 | 10 | .. currentmodule:: imblearn.pipeline 11 | 12 | .. autosummary:: 13 | :toctree: generated/ 14 | :template: class.rst 15 | 16 | Pipeline 17 | 18 | .. autosummary:: 19 | :toctree: generated/ 20 | :template: function.rst 21 | 22 | make_pipeline 23 | -------------------------------------------------------------------------------- /doc/references/tensorflow.rst: -------------------------------------------------------------------------------- 1 | .. _tensorflow_ref: 2 | 3 | Batch generator for TensorFlow 4 | ============================== 5 | 6 | .. automodule:: imblearn.tensorflow 7 | :no-members: 8 | :no-inherited-members: 9 | 10 | .. currentmodule:: imblearn 11 | 12 | .. autosummary:: 13 | :toctree: generated/ 14 | :template: function.rst 15 | 16 | tensorflow.balanced_batch_generator 17 | -------------------------------------------------------------------------------- /doc/references/under_sampling.rst: -------------------------------------------------------------------------------- 1 | .. _under_sampling_ref: 2 | 3 | Under-sampling methods 4 | ====================== 5 | 6 | .. automodule:: imblearn.under_sampling 7 | :no-members: 8 | :no-inherited-members: 9 | 10 | Prototype generation 11 | -------------------- 12 | 13 | .. automodule:: imblearn.under_sampling._prototype_generation 14 | :no-members: 15 | :no-inherited-members: 16 | 17 | .. currentmodule:: imblearn.under_sampling 18 | 19 | .. autosummary:: 20 | :toctree: generated/ 21 | :template: class.rst 22 | 23 | ClusterCentroids 24 | 25 | Prototype selection 26 | ------------------- 27 | 28 | .. automodule:: imblearn.under_sampling._prototype_selection 29 | :no-members: 30 | :no-inherited-members: 31 | 32 | .. currentmodule:: imblearn.under_sampling 33 | 34 | .. autosummary:: 35 | :toctree: generated/ 36 | :template: class.rst 37 | 38 | CondensedNearestNeighbour 39 | EditedNearestNeighbours 40 | RepeatedEditedNearestNeighbours 41 | AllKNN 42 | InstanceHardnessThreshold 43 | NearMiss 44 | NeighbourhoodCleaningRule 45 | OneSidedSelection 46 | RandomUnderSampler 47 | TomekLinks 48 | -------------------------------------------------------------------------------- /doc/references/utils.rst: -------------------------------------------------------------------------------- 1 | Utilities 2 | ========= 3 | 4 | .. automodule:: imblearn.utils 5 | :no-members: 6 | :no-inherited-members: 7 | 8 | .. currentmodule:: imblearn.utils 9 | 10 | Validation checks used in samplers 11 | ---------------------------------- 12 | 13 | .. autosummary:: 14 | :toctree: generated/ 15 | :template: function.rst 16 | 17 | estimator_checks.parametrize_with_checks 18 | check_neighbors_object 19 | check_sampling_strategy 20 | check_target_type 21 | 22 | Testing compatibility of your own sampler 23 | ----------------------------------------- 24 | 25 | .. automodule:: imblearn.utils.estimator_checks 26 | :no-members: 27 | :no-inherited-members: 28 | 29 | .. currentmodule:: imblearn.utils.estimator_checks 30 | 31 | .. 
autosummary:: 32 | :toctree: generated/ 33 | :template: function.rst 34 | 35 | parametrize_with_checks 36 | -------------------------------------------------------------------------------- /doc/sphinxext/MANIFEST.in: -------------------------------------------------------------------------------- 1 | recursive-include tests *.py 2 | include *.txt 3 | -------------------------------------------------------------------------------- /doc/sphinxext/README.txt: -------------------------------------------------------------------------------- 1 | ===================================== 2 | numpydoc -- Numpy's Sphinx extensions 3 | ===================================== 4 | 5 | Numpy's documentation uses several custom extensions to Sphinx. These 6 | are shipped in this ``numpydoc`` package, in case you want to make use 7 | of them in third-party projects. 8 | 9 | The following extensions are available: 10 | 11 | - ``numpydoc``: support for the Numpy docstring format in Sphinx, and add 12 | the code description directives ``np-function``, ``np-cfunction``, etc. 13 | that support the Numpy docstring syntax. 14 | 15 | - ``numpydoc.traitsdoc``: For gathering documentation about Traits attributes. 16 | 17 | - ``numpydoc.plot_directives``: Adaptation of Matplotlib's ``plot::`` 18 | directive. Note that this implementation may still undergo severe 19 | changes or eventually be deprecated. 20 | 21 | - ``numpydoc.only_directives``: (DEPRECATED) 22 | 23 | - ``numpydoc.autosummary``: (DEPRECATED) An ``autosummary::`` directive. 24 | Available in Sphinx 0.6.2 and (to-be) 1.0 as ``sphinx.ext.autosummary``, 25 | and the Sphinx 1.0 version is recommended over the one included in 26 | Numpydoc. 27 | 28 | 29 | numpydoc 30 | ======== 31 | 32 | Numpydoc inserts a hook into Sphinx's autodoc that converts docstrings 33 | following the Numpy/Scipy format to a form palatable to Sphinx. 34 | 35 | Options 36 | ------- 37 | 38 | The following options can be set in conf.py: 39 | 40 | - numpydoc_use_plots: bool 41 | 42 | Whether to produce ``plot::`` directives for Examples sections that 43 | contain ``import matplotlib``. 44 | 45 | - numpydoc_show_class_members: bool 46 | 47 | Whether to show all members of a class in the Methods and Attributes 48 | sections automatically. 49 | 50 | - numpydoc_edit_link: bool (DEPRECATED -- edit your HTML template instead) 51 | 52 | Whether to insert an edit link after docstrings. 53 | -------------------------------------------------------------------------------- /doc/sphinxext/github_link.py: -------------------------------------------------------------------------------- 1 | import inspect 2 | import os 3 | import subprocess 4 | import sys 5 | from functools import partial 6 | from operator import attrgetter 7 | 8 | REVISION_CMD = "git rev-parse --short HEAD" 9 | 10 | 11 | def _get_git_revision(): 12 | try: 13 | revision = subprocess.check_output(REVISION_CMD.split()).strip() 14 | except (subprocess.CalledProcessError, OSError): 15 | print("Failed to execute git to get revision") 16 | return None 17 | return revision.decode("utf-8") 18 | 19 | 20 | def _linkcode_resolve(domain, info, package, url_fmt, revision): 21 | """Determine a link to online source for a class/method/function 22 | 23 | This is called by sphinx.ext.linkcode 24 | 25 | An example with a long-untouched module that everyone has 26 | >>> _linkcode_resolve('py', {'module': 'tty', 27 | ... 'fullname': 'setraw'}, 28 | ... package='tty', 29 | ... url_fmt='https://hg.python.org/cpython/file/' 30 | ...
'{revision}/Lib/{package}/{path}#L{lineno}', 31 | ... revision='xxxx') 32 | 'https://hg.python.org/cpython/file/xxxx/Lib/tty/tty.py#L18' 33 | """ 34 | 35 | if revision is None: 36 | return 37 | if domain not in ("py", "pyx"): 38 | return 39 | if not info.get("module") or not info.get("fullname"): 40 | return 41 | 42 | class_name = info["fullname"].split(".")[0] 43 | module = __import__(info["module"], fromlist=[class_name]) 44 | obj = attrgetter(info["fullname"])(module) 45 | 46 | # Unwrap the object to get the correct source 47 | # file in case that is wrapped by a decorator 48 | obj = inspect.unwrap(obj) 49 | 50 | try: 51 | fn = inspect.getsourcefile(obj) 52 | except Exception: 53 | fn = None 54 | if not fn: 55 | try: 56 | fn = inspect.getsourcefile(sys.modules[obj.__module__]) 57 | except Exception: 58 | fn = None 59 | if not fn: 60 | return 61 | 62 | fn = os.path.relpath(fn, start=os.path.dirname(__import__(package).__file__)) 63 | try: 64 | lineno = inspect.getsourcelines(obj)[1] 65 | except Exception: 66 | lineno = "" 67 | return url_fmt.format(revision=revision, package=package, path=fn, lineno=lineno) 68 | 69 | 70 | def make_linkcode_resolve(package, url_fmt): 71 | """Returns a linkcode_resolve function for the given URL format 72 | 73 | revision is a git commit reference (hash or name) 74 | 75 | package is the name of the root module of the package 76 | 77 | url_fmt is along the lines of ('https://github.com/USER/PROJECT/' 78 | 'blob/{revision}/{package}/' 79 | '{path}#L{lineno}') 80 | """ 81 | revision = _get_git_revision() 82 | return partial( 83 | _linkcode_resolve, revision=revision, package=package, url_fmt=url_fmt 84 | ) 85 | -------------------------------------------------------------------------------- /doc/user_guide.rst: -------------------------------------------------------------------------------- 1 | .. title:: User guide: contents 2 | 3 | .. _user_guide: 4 | 5 | ========== 6 | User Guide 7 | ========== 8 | 9 | .. Ensure that the references will be alphabetically collected last 10 | .. Check https://github.com/mcmtroffaes/sphinxcontrib-bibtex/issues/113 11 | 12 | .. toctree:: 13 | :numbered: 14 | 15 | introduction.rst 16 | over_sampling.rst 17 | under_sampling.rst 18 | combine.rst 19 | ensemble.rst 20 | miscellaneous.rst 21 | metrics.rst 22 | common_pitfalls.rst 23 | Dataset loading utilities <datasets/index.rst> 24 | developers_utils.rst 25 | zzz_references.rst 26 | -------------------------------------------------------------------------------- /doc/whats_new.rst: -------------------------------------------------------------------------------- 1 | .. currentmodule:: imblearn 2 | 3 | =============== 4 | Release history 5 | =============== 6 | 7 | .. include:: whats_new/v0.14.rst 8 | 9 | .. include:: whats_new/v0.13.rst 10 | 11 | .. include:: whats_new/v0.12.rst 12 | 13 | .. include:: whats_new/v0.11.rst 14 | 15 | .. include:: whats_new/v0.10.rst 16 | 17 | .. include:: whats_new/v0.9.rst 18 | 19 | .. include:: whats_new/v0.8.rst 20 | 21 | .. include:: whats_new/v0.7.rst 22 | 23 | .. include:: whats_new/v0.6.rst 24 | 25 | .. include:: whats_new/v0.5.rst 26 | 27 | .. include:: whats_new/v0.4.rst 28 | 29 | .. include:: whats_new/v0.3.rst 30 | 31 | .. include:: whats_new/v0.2.rst 32 | 33 | .. include:: whats_new/v0.1.rst 34 | -------------------------------------------------------------------------------- /doc/whats_new/0.13.rst: -------------------------------------------------------------------------------- 1 | ..
_changes_0_13: 2 | 3 | Version 0.13.0 4 | ============== 5 | 6 | **December 20, 2024** 7 | 8 | Changelog 9 | --------- 10 | 11 | Bug fixes 12 | ......... 13 | 14 | - Fix `get_metadata_routing` in :class:`~imblearn.pipeline.Pipeline` such that one 15 | can use a sampler with metadata routing. 16 | :pr:`1115` by :user:`Guillaume Lemaitre `. 17 | 18 | Compatibility 19 | ............. 20 | 21 | - Compatibility with scikit-learn 1.6. 22 | :pr:`1109` by :user:`Guillaume Lemaitre `. 23 | 24 | Deprecations 25 | ............ 26 | 27 | - :class:`~imblearn.pipeline.Pipeline` now uses 28 | :func:`~sklearn.utils.check_is_fitted` instead of 29 | :func:`~sklearn.utils.check_fitted` to check if the pipeline is fitted. In 0.15, it 30 | will raise an error instead of a warning. 31 | :pr:`1109` by :user:`Guillaume Lemaitre `. 32 | 33 | - `algorithm` parameter in :class:`~imblearn.ensemble.RUSBoostClassifier` is now 34 | deprecated and will be removed in 0.14. 35 | :pr:`1109` by :user:`Guillaume Lemaitre `. 36 | -------------------------------------------------------------------------------- /doc/whats_new/0.14.rst: -------------------------------------------------------------------------------- 1 | .. _changes_0_14: 2 | 3 | Version 0.14.0 (Under development) 4 | ================================== 5 | 6 | **TBD** 7 | 8 | Changelog 9 | --------- 10 | 11 | Bug fixes 12 | ......... 13 | 14 | Enhancements 15 | ............ 16 | 17 | Compatibility 18 | ............. 19 | 20 | Deprecations 21 | ............ 22 | -------------------------------------------------------------------------------- /doc/whats_new/v0.1.rst: -------------------------------------------------------------------------------- 1 | .. _changes_0_1: 2 | 3 | Version 0.1 4 | =========== 5 | 6 | **December 26, 2016** 7 | 8 | Changelog 9 | --------- 10 | 11 | API 12 | ~~~ 13 | 14 | - First release of the stable API. By :user:`Fernando Nogueira `, 15 | :user:`Guillaume Lemaitre `, :user:`Christos Aridas `, 16 | and :user:`Dayvid Oliveira `. 17 | 18 | New methods 19 | ~~~~~~~~~~~ 20 | 21 | * Under-sampling 22 | 1. Random majority under-sampling with replacement 23 | 2. Extraction of majority-minority Tomek links 24 | 3. Under-sampling with Cluster Centroids 25 | 4. NearMiss-(1 & 2 & 3) 26 | 5. Condensed Nearest Neighbour 27 | 6. One-Sided Selection 28 | 7. Neighbourhood Cleaning Rule 29 | 8. Edited Nearest Neighbours 30 | 9. Instance Hardness Threshold 31 | 10. Repeated Edited Nearest Neighbours 32 | 33 | * Over-sampling 34 | 1. Random minority over-sampling with replacement 35 | 2. SMOTE - Synthetic Minority Over-sampling Technique 36 | 3. bSMOTE(1 & 2) - Borderline SMOTE of types 1 and 2 37 | 4. SVM SMOTE - Support Vectors SMOTE 38 | 5. ADASYN - Adaptive synthetic sampling approach for imbalanced learning 39 | 40 | * Over-sampling followed by under-sampling 41 | 1. SMOTE + Tomek links 42 | 2. SMOTE + ENN 43 | 44 | * Ensemble sampling 45 | 1. EasyEnsemble 46 | 2. BalanceCascade 47 | -------------------------------------------------------------------------------- /doc/whats_new/v0.10.rst: -------------------------------------------------------------------------------- 1 | .. _changes_0_10: 2 | 3 | Version 0.10.1 4 | ============== 5 | 6 | **December 28, 2022** 7 | 8 | Changelog 9 | --------- 10 | 11 | Bug fixes 12 | ......... 13 | 14 | - Fix a regression in over-samplers where the string `minority` was rejected as 15 | an invalid sampling strategy. 16 | :pr:`964` by :user:`Prakhyath Bhandary `.
17 | 18 | Version 0.10.0 19 | ============== 20 | 21 | **December 9, 2022** 22 | 23 | Changelog 24 | --------- 25 | 26 | Bug fixes 27 | ......... 28 | 29 | - Make sure that :class:`~imblearn.utils._docstring.Substitution` is 30 | working with `python -OO`, which replaces `__doc__` by `None`. 31 | :pr:`953` by :user:`Guillaume Lemaitre `. 32 | 33 | Compatibility 34 | ............. 35 | 36 | - Maintenance release to be compatible with scikit-learn >= 1.0.2. 37 | :pr:`946`, :pr:`947`, :pr:`949` by :user:`Guillaume Lemaitre `. 38 | 39 | - Add support for automatic parameter validation as in scikit-learn >= 1.2. 40 | :pr:`955` by :user:`Guillaume Lemaitre `. 41 | 42 | - Add support for `feature_names_in_` as well as `get_feature_names_out` for 43 | all samplers. 44 | :pr:`959` by :user:`Guillaume Lemaitre `. 45 | 46 | Deprecation 47 | ........... 48 | 49 | - The parameter `n_jobs` has been deprecated from the classes 50 | :class:`~imblearn.over_sampling.ADASYN`, 51 | :class:`~imblearn.over_sampling.BorderlineSMOTE`, 52 | :class:`~imblearn.over_sampling.SMOTE`, 53 | :class:`~imblearn.over_sampling.SMOTENC`, 54 | :class:`~imblearn.over_sampling.SMOTEN`, and 55 | :class:`~imblearn.over_sampling.SVMSMOTE`. Instead, pass a nearest neighbors 56 | estimator where `n_jobs` is set. 57 | :pr:`887` by :user:`Guillaume Lemaitre `. 58 | 59 | - The parameter `base_estimator` is deprecated and will be removed in version 60 | 0.12. It impacts the following classes: 61 | :class:`~imblearn.ensemble.BalancedBaggingClassifier`, 62 | :class:`~imblearn.ensemble.EasyEnsembleClassifier`, 63 | :class:`~imblearn.ensemble.RUSBoostClassifier`. 64 | :pr:`946` by :user:`Guillaume Lemaitre `. 65 | 66 | 67 | Enhancements 68 | ............ 69 | 70 | - Add support to accept compatible `NearestNeighbors` objects by only 71 | duck-typing. For instance, it allows accepting cuML instances. 72 | :pr:`858` by :user:`NV-jpt ` and 73 | :user:`Guillaume Lemaitre `. 74 | -------------------------------------------------------------------------------- /doc/whats_new/v0.11.rst: -------------------------------------------------------------------------------- 1 | .. _changes_0_11: 2 | 3 | Version 0.11.0 4 | ============== 5 | 6 | **July 8, 2023** 7 | 8 | Changelog 9 | --------- 10 | 11 | Bug fixes 12 | ......... 13 | 14 | - Fix a bug in :func:`~imblearn.metrics.classification_report_imbalanced` where the 15 | parameter `target_names` was not taken into account when `output_dict=True`. 16 | :pr:`989` by :user:`AYY7 `. 17 | 18 | - :class:`~imblearn.over_sampling.SMOTENC` now handles mixed data types such as 19 | `bool` and `pd.category` by delegating the conversion to a scikit-learn encoder. 20 | :pr:`1002` by :user:`Guillaume Lemaitre `. 21 | 22 | - Handle sparse matrices in :class:`~imblearn.over_sampling.SMOTEN` and raise a warning 23 | since it requires a conversion to dense matrices. 24 | :pr:`1003` by :user:`Guillaume Lemaitre `. 25 | 26 | - Remove spurious warning raised when the minority class gets over-sampled beyond the 27 | number of samples in the majority class. 28 | :pr:`1007` by :user:`Guillaume Lemaitre `. 29 | 30 | Compatibility 31 | ............. 32 | 33 | - Maintenance release to be compatible with scikit-learn >= 1.3.0. 34 | :pr:`999` by :user:`Guillaume Lemaitre `. 35 | 36 | Deprecation 37 | ........... 38 | 39 | - The fitted attribute `ohe_` in :class:`~imblearn.over_sampling.SMOTENC` is deprecated 40 | and will be removed in version 0.13. Use `categorical_encoder_` instead.
41 | :pr:`1000` by :user:`Guillaume Lemaitre `. 42 | 43 | - The default of the parameters `sampling_strategy`, `bootstrap` and 44 | `replacement` will change in 45 | :class:`~imblearn.ensemble.BalancedRandomForestClassifier` to follow the 46 | implementation of the original paper. This change will take effect in 47 | version 0.13. 48 | :pr:`1006` by :user:`Guillaume Lemaitre `. 49 | 50 | Enhancements 51 | ............ 52 | 53 | - :class:`~imblearn.over_sampling.SMOTENC` now accepts a parameter `categorical_encoder` 54 | that allows specifying a :class:`~sklearn.preprocessing.OneHotEncoder` with custom 55 | parameters. 56 | :pr:`1000` by :user:`Guillaume Lemaitre `. 57 | 58 | - :class:`~imblearn.over_sampling.SMOTEN` now accepts a parameter `categorical_encoder` 59 | that allows specifying a :class:`~sklearn.preprocessing.OrdinalEncoder` with custom 60 | parameters. A new fitted parameter `categorical_encoder_` is exposed to access the 61 | fitted encoder. 62 | :pr:`1001` by :user:`Guillaume Lemaitre `. 63 | 64 | - :class:`~imblearn.under_sampling.RandomUnderSampler` and 65 | :class:`~imblearn.over_sampling.RandomOverSampler` (when `shrinkage is not 66 | None`) now accept any data types and will not attempt any data conversion. 67 | :pr:`1004` by :user:`Guillaume Lemaitre `. 68 | 69 | - :class:`~imblearn.over_sampling.SMOTENC` now supports passing an array-like of `str` 70 | when passing the `categorical_features` parameter. 71 | :pr:`1008` by :user:`Guillaume Lemaitre `. 72 | 73 | - :class:`~imblearn.over_sampling.SMOTENC` now supports automatic categorical inference 74 | when `categorical_features` is set to `"auto"`. 75 | :pr:`1009` by :user:`Guillaume Lemaitre `. 76 | -------------------------------------------------------------------------------- /doc/whats_new/v0.3.rst: -------------------------------------------------------------------------------- 1 | .. _changes_0_3: 2 | 3 | Version 0.3 4 | =========== 5 | 6 | **February 22, 2018** 7 | 8 | Changelog 9 | --------- 10 | 11 | Testing 12 | ~~~~~~~ 13 | - Pytest is used instead of nosetests. :issue:`321` by :user:`Joan Massich 14 | `. 15 | 16 | Documentation 17 | ~~~~~~~~~~~~~ 18 | 19 | - Added a User Guide and extended some examples. :issue:`295` by 20 | :user:`Guillaume Lemaitre `. 21 | 22 | Bug fixes 23 | ~~~~~~~~~ 24 | 25 | - Fixed a bug in :func:`utils.check_ratio` such that an error is raised when 26 | the number of samples required is negative. :issue:`312` by :user:`Guillaume 27 | Lemaitre `. 28 | 29 | - Fixed a bug in :class:`under_sampling.NearMiss` version 3. The indices 30 | returned were wrong. :issue:`312` by :user:`Guillaume Lemaitre `. 31 | 32 | - Fixed a bug for :class:`ensemble.BalanceCascade` and :class:`combine.SMOTEENN` 33 | and :class:`SMOTETomek`. :issue:`295` by :user:`Guillaume Lemaitre 34 | `. 35 | 36 | - Fixed a bug for `check_ratio` to be able to pass arguments when `ratio` is a 37 | callable. :issue:`307` by :user:`Guillaume Lemaitre `. 38 | 39 | New features 40 | ~~~~~~~~~~~~ 41 | 42 | - Turn off steps in :class:`pipeline.Pipeline` using the `None` 43 | object. By :user:`Christos Aridas `. 44 | 45 | - Add a fetching function :func:`datasets.fetch_datasets` in order to get some 46 | imbalanced datasets useful for benchmarking. :issue:`249` by :user:`Guillaume 47 | Lemaitre `. 48 | 49 | Enhancement 50 | ~~~~~~~~~~~ 51 | 52 | - All samplers accept sparse matrices, defaulting to the CSR 53 | type. :issue:`316` by :user:`Guillaume Lemaitre `. 54 | 55 | - :func:`datasets.make_imbalance` takes a ratio similarly to other samplers.
It 56 | supports multiclass. :issue:`312` by :user:`Guillaume Lemaitre `. 57 | 58 | - All the unit tests have been factorized and a :func:`utils.check_estimators` 59 | has been derived from scikit-learn. By :user:`Guillaume Lemaitre 60 | `. 61 | 62 | - Script for automatic build of conda packages and uploading. :issue:`242` by 63 | :user:`Guillaume Lemaitre ` 64 | 65 | - Remove seaborn dependence and improve the examples. :issue:`264` by 66 | :user:`Guillaume Lemaitre `. 67 | 68 | - Adapt all classes to multi-class resampling. :issue:`290` by :user:`Guillaume 69 | Lemaitre ` 70 | 71 | API changes summary 72 | ~~~~~~~~~~~~~~~~~~~ 73 | 74 | - `__init__` has been removed from the :class:`base.SamplerMixin` to create a 75 | real mixin class. :issue:`242` by :user:`Guillaume Lemaitre `. 76 | 77 | - Creation of a module :mod:`exceptions` to handle consistent raising of 78 | errors. :issue:`242` by :user:`Guillaume Lemaitre `. 79 | 80 | - Creation of a module ``utils.validation`` to centralize the checking of recurrent 81 | patterns. :issue:`242` by :user:`Guillaume Lemaitre `. 82 | 83 | - Move the under-sampling methods in the ``prototype_selection`` and 84 | ``prototype_generation`` submodules to make a clearer 85 | distinction. :issue:`277` by :user:`Guillaume Lemaitre `. 86 | 87 | - Change ``ratio`` such that it can adapt to multiple class 88 | problems. :issue:`290` by :user:`Guillaume Lemaitre `. 89 | 90 | Deprecation 91 | ~~~~~~~~~~~ 92 | 93 | - Deprecation of the use of ``min_c_`` in 94 | :func:`datasets.make_imbalance`. :issue:`312` by :user:`Guillaume Lemaitre 95 | ` 96 | 97 | - Deprecation of the use of float in :func:`datasets.make_imbalance` for the 98 | ratio parameter. :issue:`290` by :user:`Guillaume Lemaitre `. 99 | 100 | - Deprecate the use of float as ratio in favor of dictionary, string, or 101 | callable. :issue:`290` by :user:`Guillaume Lemaitre `. 102 | -------------------------------------------------------------------------------- /doc/whats_new/v0.5.rst: -------------------------------------------------------------------------------- 1 | .. _changes_0_5: 2 | 3 | Version 0.5.0 4 | ============= 5 | 6 | **June 28, 2019** 7 | 8 | Changelog 9 | --------- 10 | 11 | Changed models 12 | .............. 13 | 14 | The following models or functions might give different results even if the 15 | data ``X`` and ``y`` are the same. 16 | 17 | * :class:`imblearn.ensemble.RUSBoostClassifier` default estimator changed from 18 | :class:`sklearn.tree.DecisionTreeClassifier` with full depth to a decision 19 | stump (i.e., tree with ``max_depth=1``). 20 | 21 | Documentation 22 | ............. 23 | 24 | - Correct the definition of the ratio when using a ``float`` in sampling 25 | strategy for the over-sampling and under-sampling. 26 | :issue:`525` by :user:`Ariel Rossanigo `. 27 | 28 | - Add :class:`imblearn.over_sampling.BorderlineSMOTE` and 29 | :class:`imblearn.over_sampling.SVMSMOTE` in the API documentation. 30 | :issue:`530` by :user:`Guillaume Lemaitre `. 31 | 32 | Enhancement 33 | ........... 34 | 35 | - Add Parallelisation for SMOTEENN and SMOTETomek. 36 | :pr:`547` by :user:`Michael Hsieh `. 37 | 38 | - Add :class:`imblearn.utils._show_versions`. Updated the contribution guide 39 | and issue template showing how to print system and dependency information 40 | from the command line. :pr:`557` by :user:`Alexander L. Hayes `. 41 | 42 | - Add :class:`imblearn.over_sampling.KMeansSMOTE` which is an over-sampler 43 | clustering points before applying SMOTE. 44 | :pr:`435` by :user:`Stephan Heijl `.
45 | 46 | Maintenance 47 | ........... 48 | 49 | - Make it possible to ``import imblearn`` and access submodules. 50 | :pr:`500` by :user:`Guillaume Lemaitre `. 51 | 52 | - Remove support for Python 2, remove deprecation warning from 53 | scikit-learn 0.21. 54 | :pr:`576` by :user:`Guillaume Lemaitre `. 55 | 56 | Bug 57 | ... 58 | 59 | - Fix wrong usage of :class:`keras.layers.BatchNormalization` in 60 | ``porto_seguro_keras_under_sampling.py`` example. The batch normalization 61 | was moved before the activation function and the bias was removed from the 62 | dense layer. 63 | :pr:`531` by :user:`Guillaume Lemaitre `. 64 | 65 | - Fix a bug which converted sparse matrices to the COO format when stacking them in 66 | :class:`imblearn.over_sampling.SMOTENC`. This bug affected only old scipy versions. 67 | :pr:`539` by :user:`Guillaume Lemaitre `. 68 | 69 | - Fix a bug in :class:`imblearn.pipeline.Pipeline` where None could be the final 70 | estimator. 71 | :pr:`554` by :user:`Oliver Rausch `. 72 | 73 | - Fix a bug in :class:`imblearn.over_sampling.SVMSMOTE` and 74 | :class:`imblearn.over_sampling.BorderlineSMOTE` where the default parameter 75 | of ``n_neighbors`` was not set properly. 76 | :pr:`578` by :user:`Guillaume Lemaitre `. 77 | 78 | - Fix a bug by changing the default depth in 79 | :class:`imblearn.ensemble.RUSBoostClassifier` to get a decision stump as a 80 | weak learner as in the original paper. 81 | :pr:`545` by :user:`Christos Aridas `. 82 | 83 | - Allow importing ``keras`` directly from ``tensorflow`` in 84 | :mod:`imblearn.keras`. 85 | :pr:`531` by :user:`Guillaume Lemaitre `. 86 | -------------------------------------------------------------------------------- /doc/whats_new/v0.7.rst: -------------------------------------------------------------------------------- 1 | .. _changes_0_7: 2 | 3 | Version 0.7.0 4 | ============= 5 | 6 | **June 9, 2020** 7 | 8 | Changelog 9 | --------- 10 | 11 | Maintenance 12 | ........... 13 | 14 | - Ensure that :class:`imblearn.pipeline.Pipeline` is working when `memory` 15 | is activated and `joblib==0.11`. 16 | :pr:`687` by :user:`Christos Aridas `. 17 | 18 | - Refactor common test to use the dev tools from `scikit-learn` 0.23. 19 | :pr:`710` by :user:`Guillaume Lemaitre `. 20 | 21 | - Remove `FutureWarning` issued by `scikit-learn` 0.23. 22 | :pr:`710` by :user:`Guillaume Lemaitre `. 23 | 24 | - Impose keyword-only arguments as in `scikit-learn`. 25 | :pr:`721` by :user:`Guillaume Lemaitre `. 26 | 27 | Changed models 28 | .............. 29 | 30 | The following models might give different results due to changes: 31 | 32 | - :class:`imblearn.ensemble.BalancedRandomForestClassifier` 33 | 34 | Bug fixes 35 | ......... 36 | 37 | - Change the default value of `min_samples_leaf` to be consistent with 38 | scikit-learn. 39 | :pr:`711` by :user:`zerolfx `. 40 | 41 | - Fix a bug due to a change in `scikit-learn` 0.23 in 42 | :class:`imblearn.metrics.make_index_balanced_accuracy`. The function was 43 | unusable. 44 | :pr:`710` by :user:`Guillaume Lemaitre `. 45 | 46 | - Raise a proper error message when only numerical or categorical features 47 | are given in :class:`imblearn.over_sampling.SMOTENC`. 48 | :pr:`720` by :user:`Guillaume Lemaitre `. 49 | 50 | - Fix a bug when the median of the standard deviation is null in 51 | :class:`imblearn.over_sampling.SMOTENC`. 52 | :pr:`675` by :user:`bganglia `. 53 | 54 | Enhancements 55 | ............
56 | 57 | - The classifiers implemented in imbalanced-learn, 58 | :class:`imblearn.ensemble.BalancedBaggingClassifier`, 59 | :class:`imblearn.ensemble.BalancedRandomForestClassifier`, 60 | :class:`imblearn.ensemble.EasyEnsembleClassifier`, and 61 | :class:`imblearn.ensemble.RUSBoostClassifier`, accept `sampling_strategy` 62 | with the same keys as in `y` without the need to encode `y` in advance. 63 | :pr:`718` by :user:`Guillaume Lemaitre `. 64 | 65 | - Lazily import the `keras` module when importing `imblearn.keras`. 66 | :pr:`719` by :user:`Guillaume Lemaitre `. 67 | 68 | Deprecation 69 | ........... 70 | 71 | - Deprecation of the parameter `n_jobs` in 72 | :class:`imblearn.under_sampling.ClusterCentroids` since it was used by 73 | :class:`sklearn.cluster.KMeans` which deprecated it. 74 | :pr:`710` by :user:`Guillaume Lemaitre `. 75 | 76 | - Deprecation of passing keyword arguments by position similarly to 77 | `scikit-learn`. 78 | :pr:`721` by :user:`Guillaume Lemaitre `. 79 | -------------------------------------------------------------------------------- /doc/whats_new/v0.8.rst: -------------------------------------------------------------------------------- 1 | .. _changes_0_8: 2 | 3 | Version 0.8.1 4 | ============= 5 | 6 | **September 29, 2021** 7 | 8 | Changelog 9 | --------- 10 | 11 | Maintenance 12 | ........... 13 | 14 | - Make `imbalanced-learn` compatible with `scikit-learn` 1.0. 15 | :pr:`864` by :user:`Guillaume Lemaitre `. 16 | 17 | Version 0.8.0 18 | ============= 19 | 20 | **February 18, 2021** 21 | 22 | Changelog 23 | --------- 24 | 25 | New features 26 | ............ 27 | 28 | - Add the function 29 | :func:`imblearn.metrics.macro_averaged_mean_absolute_error` returning the 30 | average across classes of the MAE. This metric is used in ordinal 31 | classification. 32 | :pr:`780` by :user:`Aurélien Massiot `. 33 | 34 | - Add the class :class:`imblearn.metrics.pairwise.ValueDifferenceMetric` to 35 | compute pairwise distances between samples containing only categorical 36 | values. 37 | :pr:`796` by :user:`Guillaume Lemaitre `. 38 | 39 | - Add the class :class:`imblearn.over_sampling.SMOTEN` to over-sample data 40 | only containing categorical features. 41 | :pr:`802` by :user:`Guillaume Lemaitre `. 42 | 43 | - Add the possibility to pass any type of samplers in 44 | :class:`imblearn.ensemble.BalancedBaggingClassifier` unlocking the 45 | implementation of methods based on resampled bagging. 46 | :pr:`808` by :user:`Guillaume Lemaitre `. 47 | 48 | Enhancements 49 | ............ 50 | 51 | - Add option `output_dict` in 52 | :func:`imblearn.metrics.classification_report_imbalanced` to return a 53 | dictionary instead of a string. 54 | :pr:`770` by :user:`Guillaume Lemaitre `. 55 | 56 | - Added an option to generate smoothed bootstrap in 57 | :class:`imblearn.over_sampling.RandomOverSampler`. It is controlled by the 58 | parameter `shrinkage`. This method is also known as Random Over-Sampling 59 | Examples (ROSE). 60 | :pr:`754` by :user:`Andrea Lorenzon ` and 61 | :user:`Guillaume Lemaitre `. 62 | 63 | Bug fixes 64 | ......... 65 | 66 | - Fix a bug in :class:`imblearn.under_sampling.ClusterCentroids` where 67 | `voting="hard"` could have led to selecting a sample from any class instead of 68 | the targeted class. 69 | :pr:`769` by :user:`Guillaume Lemaitre `. 70 | 71 | - Fix a bug in :class:`imblearn.FunctionSampler` where validation was performed 72 | even with `validate=False` when calling `fit`. 73 | :pr:`790` by :user:`Guillaume Lemaitre `.
74 | 75 | Maintenance 76 | ........... 77 | 78 | - Remove requirements files in favour of adding the packages in the 79 | `extras_require` within the `setup.py` file. 80 | :pr:`816` by :user:`Guillaume Lemaitre `. 81 | 82 | - Change the website template to use `pydata-sphinx-theme`. 83 | :pr:`801` by :user:`Guillaume Lemaitre `. 84 | 85 | Deprecation 86 | ........... 87 | 88 | - The context manager :func:`imblearn.utils.testing.warns` is deprecated in 0.8 89 | and will be removed in 1.0. 90 | :pr:`815` by :user:`Guillaume Lemaitre `. 91 | -------------------------------------------------------------------------------- /doc/whats_new/v0.9.rst: -------------------------------------------------------------------------------- 1 | .. _changes_0_9: 2 | 3 | Version 0.9.1 4 | ============= 5 | 6 | **May 16, 2022** 7 | 8 | Changelog 9 | --------- 10 | 11 | This release provides fixes that make `imbalanced-learn` work with the 12 | latest release (`1.1.0`) of `scikit-learn`. 13 | 14 | Version 0.9.0 15 | ============= 16 | 17 | **January 11, 2022** 18 | 19 | Changelog 20 | --------- 21 | 22 | This release mainly provides fixes that make `imbalanced-learn` work 23 | with the latest release (`1.0.2`) of `scikit-learn`. 24 | -------------------------------------------------------------------------------- /doc/zzz_references.rst: -------------------------------------------------------------------------------- 1 | ========== 2 | References 3 | ========== 4 | 5 | .. bibliography:: bibtex/refs.bib 6 | -------------------------------------------------------------------------------- /examples/README.txt: -------------------------------------------------------------------------------- 1 | .. _general_examples: 2 | 3 | Examples 4 | -------- 5 | 6 | General-purpose and introductory examples for the `imbalanced-learn` toolbox. 7 | -------------------------------------------------------------------------------- /examples/api/README.txt: -------------------------------------------------------------------------------- 1 | .. _api_usage: 2 | 3 | Examples showing imbalanced-learn API usage 4 | ------------------------------------------- 5 | 6 | Examples that show some details regarding the API of imbalanced-learn. 7 | -------------------------------------------------------------------------------- /examples/applications/README.txt: -------------------------------------------------------------------------------- 1 | .. _realword_examples: 2 | 3 | Examples based on real world datasets 4 | ------------------------------------- 5 | 6 | Examples which use real-world datasets. 7 | -------------------------------------------------------------------------------- /examples/applications/plot_multi_class_under_sampling.py: -------------------------------------------------------------------------------- 1 | """ 2 | ============================================= 3 | Multiclass classification with under-sampling 4 | ============================================= 5 | 6 | Some balancing methods allow for balancing datasets with multiple classes. 7 | We provide an example to illustrate the use of those methods, which does 8 | not differ from the binary case.
9 | 10 | """ 11 | 12 | # Authors: Guillaume Lemaitre 13 | # License: MIT 14 | 15 | from collections import Counter 16 | 17 | from sklearn.datasets import load_iris 18 | from sklearn.linear_model import LogisticRegression 19 | from sklearn.model_selection import train_test_split 20 | from sklearn.preprocessing import StandardScaler 21 | 22 | from imblearn.datasets import make_imbalance 23 | from imblearn.metrics import classification_report_imbalanced 24 | from imblearn.pipeline import make_pipeline 25 | from imblearn.under_sampling import NearMiss 26 | 27 | print(__doc__) 28 | 29 | RANDOM_STATE = 42 30 | 31 | # Create a folder to fetch the dataset 32 | iris = load_iris() 33 | X, y = make_imbalance( 34 | iris.data, 35 | iris.target, 36 | sampling_strategy={0: 25, 1: 50, 2: 50}, 37 | random_state=RANDOM_STATE, 38 | ) 39 | 40 | X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=RANDOM_STATE) 41 | 42 | print(f"Training target statistics: {Counter(y_train)}") 43 | print(f"Testing target statistics: {Counter(y_test)}") 44 | 45 | # Create a pipeline 46 | pipeline = make_pipeline(NearMiss(version=2), StandardScaler(), LogisticRegression()) 47 | pipeline.fit(X_train, y_train) 48 | 49 | # Classify and report the results 50 | print(classification_report_imbalanced(y_test, pipeline.predict(X_test))) 51 | -------------------------------------------------------------------------------- /examples/applications/plot_topic_classication.py: -------------------------------------------------------------------------------- 1 | """ 2 | ================================================= 3 | Example of topic classification in text documents 4 | ================================================= 5 | 6 | This example shows how to balance the text data before to train a classifier. 7 | 8 | Note that for this example, the data are slightly imbalanced but it can happen 9 | that for some data sets, the imbalanced ratio is more significant. 10 | """ 11 | 12 | # Authors: Guillaume Lemaitre 13 | # License: MIT 14 | 15 | # %% 16 | print(__doc__) 17 | 18 | # %% [markdown] 19 | # Setting the data set 20 | # -------------------- 21 | # 22 | # We use a part of the 20 newsgroups data set by loading 4 topics. Using the 23 | # scikit-learn loader, the data are split into a training and a testing set. 24 | # 25 | # Note the class \#3 is the minority class and has almost twice less samples 26 | # than the majority class. 27 | 28 | # %% 29 | from sklearn.datasets import fetch_20newsgroups 30 | 31 | categories = [ 32 | "alt.atheism", 33 | "talk.religion.misc", 34 | "comp.graphics", 35 | "sci.space", 36 | ] 37 | newsgroups_train = fetch_20newsgroups(subset="train", categories=categories) 38 | newsgroups_test = fetch_20newsgroups(subset="test", categories=categories) 39 | 40 | X_train = newsgroups_train.data 41 | X_test = newsgroups_test.data 42 | 43 | y_train = newsgroups_train.target 44 | y_test = newsgroups_test.target 45 | 46 | # %% 47 | from collections import Counter 48 | 49 | print(f"Training class distributions summary: {Counter(y_train)}") 50 | print(f"Test class distributions summary: {Counter(y_test)}") 51 | 52 | # %% [markdown] 53 | # The usual scikit-learn pipeline 54 | # ------------------------------- 55 | # 56 | # You might usually use scikit-learn pipeline by combining the TF-IDF 57 | # vectorizer to feed a multinomial naive bayes classifier. A classification 58 | # report summarized the results on the testing set. 
59 | # 60 | # As expected, the recall of the class \#3 is low mainly due to the class 61 | # imbalance. 62 | 63 | # %% 64 | from sklearn.feature_extraction.text import TfidfVectorizer 65 | from sklearn.naive_bayes import MultinomialNB 66 | from sklearn.pipeline import make_pipeline 67 | 68 | model = make_pipeline(TfidfVectorizer(), MultinomialNB()) 69 | model.fit(X_train, y_train) 70 | y_pred = model.predict(X_test) 71 | 72 | # %% 73 | from imblearn.metrics import classification_report_imbalanced 74 | 75 | print(classification_report_imbalanced(y_test, y_pred)) 76 | 77 | # %% [markdown] 78 | # Balancing the class before classification 79 | # ----------------------------------------- 80 | # 81 | # To improve the prediction of the class \#3, it could be interesting to apply 82 | # a balancing before training the naive Bayes classifier. Therefore, we will 83 | # use a :class:`~imblearn.under_sampling.RandomUnderSampler` to equalize the 84 | # number of samples in all the classes before the training. 85 | # 86 | # It is also important to note that we are using the 87 | # :func:`~imblearn.pipeline.make_pipeline` function implemented in 88 | # imbalanced-learn to properly handle the samplers. 89 | 90 | from imblearn.pipeline import make_pipeline as make_pipeline_imb 91 | 92 | # %% 93 | from imblearn.under_sampling import RandomUnderSampler 94 | 95 | model = make_pipeline_imb(TfidfVectorizer(), RandomUnderSampler(), MultinomialNB()) 96 | 97 | model.fit(X_train, y_train) 98 | y_pred = model.predict(X_test) 99 | 100 | # %% [markdown] 101 | # Although the results are almost identical, it can be seen that the resampling 102 | # helped correct the poor recall of the class \#3 at the cost of reducing 103 | # the other metrics for the other classes. However, the overall results are 104 | # slightly better. 105 | 106 | # %% 107 | print(classification_report_imbalanced(y_test, y_pred)) 108 | -------------------------------------------------------------------------------- /examples/combine/README.txt: -------------------------------------------------------------------------------- 1 | .. _combine_examples: 2 | 3 | Examples using combine class methods 4 | ==================================== 5 | 6 | Combine methods mix over- and under-sampling methods. Generally SMOTE is used for over-sampling while some cleaning methods (e.g., ENN and Tomek links) are used to under-sample. 7 | -------------------------------------------------------------------------------- /examples/datasets/README.txt: -------------------------------------------------------------------------------- 1 | .. _dataset_examples: 2 | 3 | Dataset examples 4 | ----------------------- 5 | 6 | Examples concerning the :mod:`imblearn.datasets` module. 7 | -------------------------------------------------------------------------------- /examples/datasets/plot_make_imbalance.py: -------------------------------------------------------------------------------- 1 | """ 2 | ============================ 3 | Create an imbalanced dataset 4 | ============================ 5 | 6 | An illustration of the :func:`~imblearn.datasets.make_imbalance` function to 7 | create an imbalanced dataset from a balanced dataset. We show the ability of 8 | :func:`~imblearn.datasets.make_imbalance` to deal with a Pandas DataFrame.
9 | """ 10 | 11 | # Authors: Dayvid Oliveira 12 | # Christos Aridas 13 | # Guillaume Lemaitre 14 | # License: MIT 15 | 16 | # %% 17 | print(__doc__) 18 | 19 | import seaborn as sns 20 | 21 | sns.set_context("poster") 22 | 23 | # %% [markdown] 24 | # Generate the dataset 25 | # -------------------- 26 | # 27 | # First, we will generate a dataset and convert it to a 28 | # :class:`~pandas.DataFrame` with arbitrary column names. We will plot the 29 | # original dataset. 30 | 31 | # %% 32 | import matplotlib.pyplot as plt 33 | import pandas as pd 34 | from sklearn.datasets import make_moons 35 | 36 | X, y = make_moons(n_samples=200, shuffle=True, noise=0.5, random_state=10) 37 | X = pd.DataFrame(X, columns=["feature 1", "feature 2"]) 38 | ax = X.plot.scatter( 39 | x="feature 1", 40 | y="feature 2", 41 | c=y, 42 | colormap="viridis", 43 | colorbar=False, 44 | ) 45 | sns.despine(ax=ax, offset=10) 46 | plt.tight_layout() 47 | 48 | # %% [markdown] 49 | # Make a dataset imbalanced 50 | # ------------------------- 51 | # 52 | # Now, we will show the helpers :func:`~imblearn.datasets.make_imbalance` 53 | # that is useful to random select a subset of samples. It will impact the 54 | # class distribution as specified by the parameters. 55 | 56 | # %% 57 | from collections import Counter 58 | 59 | 60 | def ratio_func(y, multiplier, minority_class): 61 | target_stats = Counter(y) 62 | return {minority_class: int(multiplier * target_stats[minority_class])} 63 | 64 | 65 | # %% 66 | from imblearn.datasets import make_imbalance 67 | 68 | fig, axs = plt.subplots(nrows=2, ncols=3, figsize=(15, 10)) 69 | 70 | X.plot.scatter( 71 | x="feature 1", 72 | y="feature 2", 73 | c=y, 74 | ax=axs[0, 0], 75 | colormap="viridis", 76 | colorbar=False, 77 | ) 78 | axs[0, 0].set_title("Original set") 79 | sns.despine(ax=axs[0, 0], offset=10) 80 | 81 | multipliers = [0.9, 0.75, 0.5, 0.25, 0.1] 82 | for ax, multiplier in zip(axs.ravel()[1:], multipliers): 83 | X_resampled, y_resampled = make_imbalance( 84 | X, 85 | y, 86 | sampling_strategy=ratio_func, 87 | **{"multiplier": multiplier, "minority_class": 1}, 88 | ) 89 | X_resampled.plot.scatter( 90 | x="feature 1", 91 | y="feature 2", 92 | c=y_resampled, 93 | ax=ax, 94 | colormap="viridis", 95 | colorbar=False, 96 | ) 97 | ax.set_title(f"Sampling ratio = {multiplier}") 98 | sns.despine(ax=ax, offset=10) 99 | 100 | plt.tight_layout() 101 | plt.show() 102 | -------------------------------------------------------------------------------- /examples/ensemble/README.txt: -------------------------------------------------------------------------------- 1 | .. _ensemble_examples: 2 | 3 | Example using ensemble class methods 4 | ==================================== 5 | 6 | Under-sampling methods implies that samples of the majority class are lost during the balancing procedure. 7 | Ensemble methods offer an alternative to use most of the samples. 8 | In fact, an ensemble of balanced sets is created and used to later train any classifier. 9 | -------------------------------------------------------------------------------- /examples/evaluation/README.txt: -------------------------------------------------------------------------------- 1 | .. _evaluation_examples: 2 | 3 | Evaluation examples 4 | ------------------- 5 | 6 | Examples illustrating how classification using imbalanced dataset can be done. 
7 | -------------------------------------------------------------------------------- /examples/evaluation/plot_classification_report.py: -------------------------------------------------------------------------------- 1 | """ 2 | ============================================= 3 | Evaluate classification by compiling a report 4 | ============================================= 5 | 6 | Specific metrics have been developed to evaluate classifiers which have been 7 | trained using imbalanced data. :mod:`imblearn` provides a classification report 8 | similar to :mod:`sklearn`, with additional metrics specific to imbalanced 9 | learning problems. 10 | """ 11 | 12 | # Authors: Guillaume Lemaitre 13 | # License: MIT 14 | 15 | 16 | from sklearn import datasets 17 | from sklearn.linear_model import LogisticRegression 18 | from sklearn.model_selection import train_test_split 19 | from sklearn.preprocessing import StandardScaler 20 | 21 | from imblearn import over_sampling as os 22 | from imblearn import pipeline as pl 23 | from imblearn.metrics import classification_report_imbalanced 24 | 25 | print(__doc__) 26 | 27 | RANDOM_STATE = 42 28 | 29 | # Generate a dataset 30 | X, y = datasets.make_classification( 31 | n_classes=2, 32 | class_sep=2, 33 | weights=[0.1, 0.9], 34 | n_informative=10, 35 | n_redundant=1, 36 | flip_y=0, 37 | n_features=20, 38 | n_clusters_per_class=4, 39 | n_samples=5000, 40 | random_state=RANDOM_STATE, 41 | ) 42 | 43 | pipeline = pl.make_pipeline( 44 | StandardScaler(), 45 | os.SMOTE(random_state=RANDOM_STATE), 46 | LogisticRegression(max_iter=10_000), 47 | ) 48 | 49 | # Split the data 50 | X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=RANDOM_STATE) 51 | 52 | # Train the classifier with balancing 53 | pipeline.fit(X_train, y_train) 54 | 55 | # Test the classifier and get the prediction 56 | y_pred_bal = pipeline.predict(X_test) 57 | 58 | # Show the classification report 59 | print(classification_report_imbalanced(y_test, y_pred_bal)) 60 | -------------------------------------------------------------------------------- /examples/evaluation/plot_metrics.py: -------------------------------------------------------------------------------- 1 | """ 2 | ======================================= 3 | Metrics specific to imbalanced learning 4 | ======================================= 5 | 6 | Specific metrics have been developed to evaluate classifiers which 7 | have been trained using imbalanced data. :mod:`imblearn` provides mainly 8 | two additional metrics which are not implemented in :mod:`sklearn`: (i) 9 | geometric mean and (ii) index balanced accuracy. 10 | """ 11 | 12 | # Authors: Guillaume Lemaitre 13 | # License: MIT 14 | 15 | # %% 16 | print(__doc__) 17 | 18 | RANDOM_STATE = 42 19 | 20 | # %% [markdown] 21 | # First, we will generate an imbalanced dataset. 22 | 23 | # %% 24 | from sklearn.datasets import make_classification 25 | 26 | X, y = make_classification( 27 | n_classes=3, 28 | class_sep=2, 29 | weights=[0.1, 0.9], 30 | n_informative=10, 31 | n_redundant=1, 32 | flip_y=0, 33 | n_features=20, 34 | n_clusters_per_class=4, 35 | n_samples=5000, 36 | random_state=RANDOM_STATE, 37 | ) 38 | 39 | # %% [markdown] 40 | # We will split the data into a training and testing set.
41 | 42 | # %% 43 | from sklearn.model_selection import train_test_split 44 | 45 | X_train, X_test, y_train, y_test = train_test_split( 46 | X, y, stratify=y, random_state=RANDOM_STATE 47 | ) 48 | 49 | # %% [markdown] 50 | # We will create a pipeline made of a :class:`~imblearn.over_sampling.SMOTE` 51 | # over-sampler followed by a :class:`~sklearn.linear_model.LogisticRegression` 52 | # classifier. 53 | 54 | from sklearn.linear_model import LogisticRegression 55 | from sklearn.preprocessing import StandardScaler 56 | 57 | from imblearn.over_sampling import SMOTE 58 | 59 | # %% 60 | from imblearn.pipeline import make_pipeline 61 | 62 | model = make_pipeline( 63 | StandardScaler(), 64 | SMOTE(random_state=RANDOM_STATE), 65 | LogisticRegression(max_iter=10_000, random_state=RANDOM_STATE), 66 | ) 67 | 68 | # %% [markdown] 69 | # Now, we will train the model on the training set and get the prediction 70 | # associated with the testing set. Be aware that the resampling will happen 71 | # only when calling `fit`: the number of samples in `y_pred` is the same as 72 | # in `y_test`. 73 | 74 | # %% 75 | model.fit(X_train, y_train) 76 | y_pred = model.predict(X_test) 77 | 78 | # %% [markdown] 79 | # The geometric mean corresponds to the square root of the product of the 80 | # sensitivity and specificity, i.e. `sqrt(sensitivity * specificity)`. 81 | # Combining the two metrics should account for the balancing of the dataset. 82 | 83 | # %% 84 | from imblearn.metrics import geometric_mean_score 85 | 86 | print(f"The geometric mean is {geometric_mean_score(y_test, y_pred):.3f}") 87 | 88 | # %% [markdown] 89 | # The index balanced accuracy can transform any metric to be used in 90 | # imbalanced learning problems. 91 | 92 | # %% 93 | from imblearn.metrics import make_index_balanced_accuracy 94 | 95 | alpha = 0.1 96 | geo_mean = make_index_balanced_accuracy(alpha=alpha, squared=True)(geometric_mean_score) 97 | 98 | print( 99 | f"The IBA using alpha={alpha} and the geometric mean: " 100 | f"{geo_mean(y_test, y_pred):.3f}" 101 | ) 102 | 103 | # %% 104 | alpha = 0.5 105 | geo_mean = make_index_balanced_accuracy(alpha=alpha, squared=True)(geometric_mean_score) 106 | 107 | print( 108 | f"The IBA using alpha={alpha} and the geometric mean: " 109 | f"{geo_mean(y_test, y_pred):.3f}" 110 | ) 111 | -------------------------------------------------------------------------------- /examples/model_selection/README.txt: -------------------------------------------------------------------------------- 1 | .. _model_selection_examples: 2 | 3 | Model Selection 4 | --------------- 5 | 6 | Examples related to the selection of balancing methods. 7 | -------------------------------------------------------------------------------- /examples/model_selection/plot_validation_curve.py: -------------------------------------------------------------------------------- 1 | """ 2 | ========================== 3 | Plotting Validation Curves 4 | ========================== 5 | 6 | In this example the impact of the :class:`~imblearn.over_sampling.SMOTE`'s 7 | `k_neighbors` parameter is examined. In the plot you can see the validation 8 | scores of a SMOTE-CART classifier for different values of the 9 | :class:`~imblearn.over_sampling.SMOTE`'s `k_neighbors` parameter.
10 | """ 11 | 12 | # Authors: Christos Aridas 13 | # Guillaume Lemaitre 14 | # License: MIT 15 | 16 | # %% 17 | print(__doc__) 18 | 19 | import seaborn as sns 20 | 21 | sns.set_context("poster") 22 | 23 | 24 | RANDOM_STATE = 42 25 | 26 | # %% [markdown] 27 | # Let's first generate a dataset with imbalanced class distribution. 28 | 29 | # %% 30 | from sklearn.datasets import make_classification 31 | 32 | X, y = make_classification( 33 | n_classes=2, 34 | class_sep=2, 35 | weights=[0.1, 0.9], 36 | n_informative=10, 37 | n_redundant=1, 38 | flip_y=0, 39 | n_features=20, 40 | n_clusters_per_class=4, 41 | n_samples=5000, 42 | random_state=RANDOM_STATE, 43 | ) 44 | 45 | # %% [markdown] 46 | # We will use an over-sampler :class:`~imblearn.over_sampling.SMOTE` followed 47 | # by a :class:`~sklearn.tree.DecisionTreeClassifier`. The aim will be to 48 | # search which `k_neighbors` parameter is the most adequate with the dataset 49 | # that we generated. 50 | 51 | from sklearn.tree import DecisionTreeClassifier 52 | 53 | # %% 54 | from imblearn.over_sampling import SMOTE 55 | from imblearn.pipeline import make_pipeline 56 | 57 | model = make_pipeline( 58 | SMOTE(random_state=RANDOM_STATE), DecisionTreeClassifier(random_state=RANDOM_STATE) 59 | ) 60 | 61 | # %% [markdown] 62 | # We can use the :class:`~sklearn.model_selection.validation_curve` to inspect 63 | # the impact of varying the parameter `k_neighbors`. In this case, we need 64 | # to use a score to evaluate the generalization score during the 65 | # cross-validation. 66 | 67 | # %% 68 | from sklearn.metrics import cohen_kappa_score, make_scorer 69 | from sklearn.model_selection import validation_curve 70 | 71 | scorer = make_scorer(cohen_kappa_score) 72 | param_range = range(1, 11) 73 | train_scores, test_scores = validation_curve( 74 | model, 75 | X, 76 | y, 77 | param_name="smote__k_neighbors", 78 | param_range=param_range, 79 | cv=3, 80 | scoring=scorer, 81 | ) 82 | 83 | # %% 84 | train_scores_mean = train_scores.mean(axis=1) 85 | train_scores_std = train_scores.std(axis=1) 86 | test_scores_mean = test_scores.mean(axis=1) 87 | test_scores_std = test_scores.std(axis=1) 88 | 89 | # %% [markdown] 90 | # We can now plot the results of the cross-validation for the different 91 | # parameter values that we tried. 92 | 93 | # %% 94 | import matplotlib.pyplot as plt 95 | 96 | fig, ax = plt.subplots(figsize=(7, 7)) 97 | ax.plot(param_range, test_scores_mean, label="SMOTE") 98 | ax.fill_between( 99 | param_range, 100 | test_scores_mean + test_scores_std, 101 | test_scores_mean - test_scores_std, 102 | alpha=0.2, 103 | ) 104 | idx_max = test_scores_mean.argmax() 105 | ax.scatter( 106 | param_range[idx_max], 107 | test_scores_mean[idx_max], 108 | label=r"Cohen Kappa: ${:.2f}\pm{:.2f}$".format( 109 | test_scores_mean[idx_max], test_scores_std[idx_max] 110 | ), 111 | ) 112 | 113 | fig.suptitle("Validation Curve with SMOTE-CART") 114 | ax.set_xlabel("Number of neighbors") 115 | ax.set_ylabel("Cohen's kappa") 116 | 117 | # make nice plotting 118 | sns.despine(ax=ax, offset=10) 119 | ax.set_xlim([1, 10]) 120 | ax.set_ylim([0.4, 0.8]) 121 | ax.legend(loc="lower right", fontsize=16) 122 | plt.tight_layout() 123 | plt.show() 124 | -------------------------------------------------------------------------------- /examples/over-sampling/README.txt: -------------------------------------------------------------------------------- 1 | .. 
89 | # %% [markdown] 90 | # We can now plot the results of the cross-validation for the different 91 | # parameter values that we tried. 92 | 93 | # %% 94 | import matplotlib.pyplot as plt 95 | 96 | fig, ax = plt.subplots(figsize=(7, 7)) 97 | ax.plot(param_range, test_scores_mean, label="SMOTE") 98 | ax.fill_between( 99 | param_range, 100 | test_scores_mean + test_scores_std, 101 | test_scores_mean - test_scores_std, 102 | alpha=0.2, 103 | ) 104 | idx_max = test_scores_mean.argmax() 105 | ax.scatter( 106 | param_range[idx_max], 107 | test_scores_mean[idx_max], 108 | label=r"Cohen Kappa: ${:.2f}\pm{:.2f}$".format( 109 | test_scores_mean[idx_max], test_scores_std[idx_max] 110 | ), 111 | ) 112 | 113 | fig.suptitle("Validation Curve with SMOTE-CART") 114 | ax.set_xlabel("Number of neighbors") 115 | ax.set_ylabel("Cohen's kappa") 116 | 117 | # make the plot nicer 118 | sns.despine(ax=ax, offset=10) 119 | ax.set_xlim([1, 10]) 120 | ax.set_ylim([0.4, 0.8]) 121 | ax.legend(loc="lower right", fontsize=16) 122 | plt.tight_layout() 123 | plt.show() 124 | -------------------------------------------------------------------------------- /examples/over-sampling/README.txt: -------------------------------------------------------------------------------- 1 | .. _over_sampling_examples: 2 | 3 | Example using over-sampling class methods 4 | ========================================= 5 | 6 | Data balancing can be performed by over-sampling such that new samples are generated in the minority class to reach a given balancing ratio. 7 | -------------------------------------------------------------------------------- /examples/over-sampling/plot_illustration_generation_sample.py: -------------------------------------------------------------------------------- 1 | """ 2 | ============================================ 3 | Sample generator used in SMOTE-like samplers 4 | ============================================ 5 | 6 | This example illustrates how a new sample is generated taking into account the 7 | neighbourhood of this sample. A new sample is generated by randomly selecting 8 | two samples of the same class and interpolating a new point between these 9 | samples. 10 | """ 11 | 12 | # Authors: Guillaume Lemaitre 13 | # License: MIT 14 | # %% 15 | print(__doc__) 16 | 17 | import matplotlib.pyplot as plt 18 | import numpy as np 19 | import seaborn as sns 20 | 21 | sns.set_context("poster") 22 | 23 | rng = np.random.RandomState(18) 24 | 25 | f, ax = plt.subplots(figsize=(8, 8)) 26 | 27 | # generate some data points 28 | y = np.array([3.65284, 3.52623, 3.51468, 3.22199, 3.21]) 29 | z = np.array([0.43, 0.45, 0.6, 0.4, 0.211]) 30 | y_2 = np.array([3.3, 3.6]) 31 | z_2 = np.array([0.58, 0.34]) 32 | 33 | # plot the majority and minority samples 34 | ax.scatter(z, y, label="Minority class", s=100) 35 | ax.scatter(z_2, y_2, label="Majority class", s=100) 36 | 37 | idx = rng.randint(len(y), size=2) 38 | annotation = [r"$x_i$", r"$x_{zi}$"] 39 | 40 | for a, i in zip(annotation, idx): 41 | ax.annotate(a, (z[i], y[i]), xytext=tuple([z[i] + 0.01, y[i] + 0.005]), fontsize=15) 42 | 43 | # draw the circle in which the new sample will be generated 44 | radius = np.sqrt((z[idx[0]] - z[idx[1]]) ** 2 + (y[idx[0]] - y[idx[1]]) ** 2) 45 | circle = plt.Circle((z[idx[0]], y[idx[0]]), radius=radius, alpha=0.2) 46 | ax.add_artist(circle) 47 | 48 | # plot the line on which the sample will be generated 49 | ax.plot(z[idx], y[idx], "--", alpha=0.5) 50 | 51 | # create and plot the new sample 52 | step = rng.uniform() 53 | y_gen = y[idx[0]] + step * (y[idx[1]] - y[idx[0]]) 54 | z_gen = z[idx[0]] + step * (z[idx[1]] - z[idx[0]]) 55 | 56 | ax.scatter(z_gen, y_gen, s=100) 57 | ax.annotate( 58 | r"$x_{new}$", 59 | (z_gen, y_gen), 60 | xytext=tuple([z_gen + 0.01, y_gen + 0.005]), 61 | fontsize=15, 62 | ) 63 | 64 | # make the plot nicer with legend and labels 65 | sns.despine(ax=ax, offset=10) 66 | ax.set_xlim([0.2, 0.7]) 67 | ax.set_ylim([3.2, 3.7]) 68 | plt.xlabel(r"$X_1$") 69 | plt.ylabel(r"$X_2$") 70 | plt.legend() 71 | plt.tight_layout() 72 | plt.show() 73 | -------------------------------------------------------------------------------- /examples/pipeline/README.txt: -------------------------------------------------------------------------------- 1 | .. _pipeline_examples: 2 | 3 | Pipeline examples 4 | ================= 5 | 6 | Example of how to use a pipeline to include under-sampling with `scikit-learn` estimators.
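For instance, a minimal sketch (the sampler and the classifier below are
arbitrary illustrative choices, not a prescribed recipe)::

    from sklearn.linear_model import LogisticRegression

    from imblearn.pipeline import make_pipeline
    from imblearn.under_sampling import RandomUnderSampler

    # the pipeline resamples during ``fit`` only, like any imblearn pipeline
    model = make_pipeline(RandomUnderSampler(random_state=0), LogisticRegression())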
7 | -------------------------------------------------------------------------------- /examples/pipeline/plot_pipeline_classification.py: -------------------------------------------------------------------------------- 1 | """ 2 | ==================================== 3 | Usage of pipeline embedding samplers 4 | ==================================== 5 | 6 | An example of the :class:`~imblearn.pipeline.Pipeline` object (or 7 | :func:`~imblearn.pipeline.make_pipeline` helper function) working with 8 | transformers and resamplers. 9 | """ 10 | 11 | # Authors: Christos Aridas 12 | # Guillaume Lemaitre 13 | # License: MIT 14 | 15 | # %% 16 | print(__doc__) 17 | 18 | # %% [markdown] 19 | # Let's first create an imbalanced dataset and split it into two sets. 20 | 21 | # %% 22 | from sklearn.datasets import make_classification 23 | from sklearn.model_selection import train_test_split 24 | 25 | X, y = make_classification( 26 | n_classes=2, 27 | class_sep=1.25, 28 | weights=[0.3, 0.7], 29 | n_informative=3, 30 | n_redundant=1, 31 | flip_y=0, 32 | n_features=5, 33 | n_clusters_per_class=1, 34 | n_samples=5000, 35 | random_state=10, 36 | ) 37 | 38 | X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, random_state=42) 39 | 40 | # %% [markdown] 41 | # Now, we will create the individual steps that we would like to combine later. 42 | 43 | # %% 44 | from sklearn.decomposition import PCA 45 | from sklearn.neighbors import KNeighborsClassifier 46 | 47 | from imblearn.over_sampling import SMOTE 48 | from imblearn.under_sampling import EditedNearestNeighbours 49 | 50 | pca = PCA(n_components=2) 51 | enn = EditedNearestNeighbours() 52 | smote = SMOTE(random_state=0) 53 | knn = KNeighborsClassifier(n_neighbors=1) 54 | 55 | # %% [markdown] 56 | # Now, we can finally create a pipeline to specify in which order the different 57 | # transformers and samplers should be executed before providing the data to 58 | # the final classifier. 59 | 60 | # %% 61 | from imblearn.pipeline import make_pipeline 62 | 63 | model = make_pipeline(pca, enn, smote, knn) 64 | 65 | # %% [markdown] 66 | # We can now use the pipeline created as a normal classifier where resampling 67 | # will happen when calling `fit` and is disabled when calling `decision_function`, 68 | # `predict_proba`, or `predict`. 69 | 70 | # %% 71 | from sklearn.metrics import classification_report 72 | 73 | model.fit(X_train, y_train) 74 | y_pred = model.predict(X_test) 75 | print(classification_report(y_test, y_pred)) 76 |
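# %% [markdown]
# As a quick sanity check (a small illustrative sketch), the number of
# predictions matches the number of test samples: the resampling only took
# place inside `fit`.

# %%
assert len(y_pred) == len(y_test)
print(f"{len(y_pred)} predictions for {len(y_test)} test samples")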
7 | """ 8 | 9 | # Authors: Guillaume Lemaitre 10 | # License: MIT 11 | 12 | # %% 13 | print(__doc__) 14 | 15 | import matplotlib.pyplot as plt 16 | import seaborn as sns 17 | 18 | sns.set_context("poster") 19 | 20 | # %% [markdown] 21 | # This function allows to make nice plotting 22 | 23 | # %% 24 | 25 | 26 | def make_plot_despine(ax): 27 | sns.despine(ax=ax, offset=10) 28 | ax.set_xlim([0, 3]) 29 | ax.set_ylim([0, 3]) 30 | ax.set_xlabel(r"$X_1$") 31 | ax.set_ylabel(r"$X_2$") 32 | ax.legend(loc="lower right") 33 | 34 | 35 | # %% [markdown] 36 | # We will generate some toy data that illustrates how 37 | # :class:`~imblearn.under_sampling.TomekLinks` is used to clean a dataset. 38 | 39 | # %% 40 | import numpy as np 41 | 42 | rng = np.random.RandomState(18) 43 | 44 | X_minority = np.transpose( 45 | [[1.1, 1.3, 1.15, 0.8, 0.55, 2.1], [1.0, 1.5, 1.7, 2.5, 0.55, 1.9]] 46 | ) 47 | X_majority = np.transpose( 48 | [ 49 | [2.1, 2.12, 2.13, 2.14, 2.2, 2.3, 2.5, 2.45], 50 | [1.5, 2.1, 2.7, 0.9, 1.0, 1.4, 2.4, 2.9], 51 | ] 52 | ) 53 | 54 | # %% [markdown] 55 | # In the figure above, the samples highlighted in green form a Tomek link since 56 | # they are of different classes and are nearest neighbors of each other. 57 | 58 | fig, ax = plt.subplots(figsize=(8, 8)) 59 | ax.scatter( 60 | X_minority[:, 0], 61 | X_minority[:, 1], 62 | label="Minority class", 63 | s=200, 64 | marker="_", 65 | ) 66 | ax.scatter( 67 | X_majority[:, 0], 68 | X_majority[:, 1], 69 | label="Majority class", 70 | s=200, 71 | marker="+", 72 | ) 73 | 74 | # highlight the samples of interest 75 | ax.scatter( 76 | [X_minority[-1, 0], X_majority[1, 0]], 77 | [X_minority[-1, 1], X_majority[1, 1]], 78 | label="Tomek link", 79 | s=200, 80 | alpha=0.3, 81 | ) 82 | make_plot_despine(ax) 83 | fig.suptitle("Illustration of a Tomek link") 84 | fig.tight_layout() 85 | 86 | # %% [markdown] 87 | # We can run the :class:`~imblearn.under_sampling.TomekLinks` sampling to 88 | # remove the corresponding samples. If `sampling_strategy='auto'` only the 89 | # sample from the majority class will be removed. If `sampling_strategy='all'` 90 | # both samples will be removed. 
91 | 92 | # %% 93 | from imblearn.under_sampling import TomekLinks 94 | 95 | fig, axs = plt.subplots(nrows=1, ncols=2, figsize=(16, 8)) 96 | 97 | samplers = { 98 | "Removing only majority samples": TomekLinks(sampling_strategy="auto"), 99 | "Removing all samples": TomekLinks(sampling_strategy="all"), 100 | } 101 | 102 | for ax, (title, sampler) in zip(axs, samplers.items()): 103 | X_res, y_res = sampler.fit_resample( 104 | np.vstack((X_minority, X_majority)), 105 | np.array([0] * X_minority.shape[0] + [1] * X_majority.shape[0]), 106 | ) 107 | ax.scatter( 108 | X_res[y_res == 0][:, 0], 109 | X_res[y_res == 0][:, 1], 110 | label="Minority class", 111 | s=200, 112 | marker="_", 113 | ) 114 | ax.scatter( 115 | X_res[y_res == 1][:, 0], 116 | X_res[y_res == 1][:, 1], 117 | label="Majority class", 118 | s=200, 119 | marker="+", 120 | ) 121 | 122 | # highlight the samples of interest 123 | ax.scatter( 124 | [X_minority[-1, 0], X_majority[1, 0]], 125 | [X_minority[-1, 1], X_majority[1, 1]], 126 | label="Tomek link", 127 | s=200, 128 | alpha=0.3, 129 | ) 130 | 131 | ax.set_title(title) 132 | make_plot_despine(ax) 133 | fig.tight_layout() 134 | 135 | plt.show() 136 | -------------------------------------------------------------------------------- /imblearn/VERSION.txt: -------------------------------------------------------------------------------- 1 | 0.14.dev0 2 | -------------------------------------------------------------------------------- /imblearn/_version.py: -------------------------------------------------------------------------------- 1 | """ 2 | ``imbalanced-learn`` is a set of Python methods to deal with imbalanced 3 | datasets in machine learning and pattern recognition. 4 | """ 5 | # Based on NiLearn package 6 | # License: simplified BSD 7 | 8 | # PEP0440 compatible formatted version, see: 9 | # https://www.python.org/dev/peps/pep-0440/ 10 | # 11 | # Generic release markers: 12 | # X.Y 13 | # X.Y.Z # For bugfix releases 14 | # 15 | # Admissible pre-release markers: 16 | # X.YaN # Alpha release 17 | # X.YbN # Beta release 18 | # X.YrcN # Release Candidate 19 | # X.Y # Final release 20 | # 21 | # Dev branch marker is: 'X.Y.dev' or 'X.Y.devN' where N is an integer. 22 | # 'X.Y.dev0' is the canonical version of 'X.Y.dev' 23 |
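# For illustration only (not part of the library), this PEP 440 ordering can
# be checked with the third-party ``packaging`` module:
#
#   >>> from packaging.version import Version
#   >>> Version("0.14.dev0") < Version("0.14a1") < Version("0.14")
#   True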
4 | """ 5 | 6 | from ._imbalance import make_imbalance 7 | from ._zenodo import fetch_datasets 8 | 9 | __all__ = ["make_imbalance", "fetch_datasets"] 10 | -------------------------------------------------------------------------------- /imblearn/datasets/tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/scikit-learn-contrib/imbalanced-learn/c975dd68f50beb88e1331cdd47780acd4fa95397/imblearn/datasets/tests/__init__.py -------------------------------------------------------------------------------- /imblearn/datasets/tests/test_imbalance.py: -------------------------------------------------------------------------------- 1 | """Test the module easy ensemble.""" 2 | # Authors: Guillaume Lemaitre 3 | # Christos Aridas 4 | # License: MIT 5 | 6 | from collections import Counter 7 | 8 | import numpy as np 9 | import pytest 10 | from sklearn.datasets import load_iris 11 | 12 | from imblearn.datasets import make_imbalance 13 | 14 | 15 | @pytest.fixture 16 | def iris(): 17 | return load_iris(return_X_y=True) 18 | 19 | 20 | @pytest.mark.parametrize( 21 | "sampling_strategy, err_msg", 22 | [ 23 | ({0: -100, 1: 50, 2: 50}, "in a class cannot be negative"), 24 | ({0: 10, 1: 70}, "should be less or equal to the original"), 25 | ], 26 | ) 27 | def test_make_imbalance_error(iris, sampling_strategy, err_msg): 28 | # we are reusing part of utils.check_sampling_strategy, however this is not 29 | # cover in the common tests so we will repeat it here 30 | X, y = iris 31 | with pytest.raises(ValueError, match=err_msg): 32 | make_imbalance(X, y, sampling_strategy=sampling_strategy) 33 | 34 | 35 | def test_make_imbalance_error_single_class(iris): 36 | X, y = iris 37 | y = np.zeros_like(y) 38 | with pytest.raises(ValueError, match="needs to have more than 1 class."): 39 | make_imbalance(X, y, sampling_strategy={0: 10}) 40 | 41 | 42 | @pytest.mark.parametrize( 43 | "sampling_strategy, expected_counts", 44 | [ 45 | ({0: 10, 1: 20, 2: 30}, {0: 10, 1: 20, 2: 30}), 46 | ({0: 10, 1: 20}, {0: 10, 1: 20, 2: 50}), 47 | ], 48 | ) 49 | def test_make_imbalance_dict(iris, sampling_strategy, expected_counts): 50 | X, y = iris 51 | _, y_ = make_imbalance(X, y, sampling_strategy=sampling_strategy) 52 | assert Counter(y_) == expected_counts 53 | 54 | 55 | @pytest.mark.parametrize("as_frame", [True, False], ids=["dataframe", "array"]) 56 | @pytest.mark.parametrize( 57 | "sampling_strategy, expected_counts", 58 | [ 59 | ( 60 | {"setosa": 10, "versicolor": 20, "virginica": 30}, 61 | {"setosa": 10, "versicolor": 20, "virginica": 30}, 62 | ), 63 | ( 64 | {"setosa": 10, "versicolor": 20}, 65 | {"setosa": 10, "versicolor": 20, "virginica": 50}, 66 | ), 67 | ], 68 | ) 69 | def test_make_imbalanced_iris(as_frame, sampling_strategy, expected_counts): 70 | pd = pytest.importorskip("pandas") 71 | iris = load_iris(as_frame=as_frame) 72 | X, y = iris.data, iris.target 73 | y = iris.target_names[iris.target] 74 | if as_frame: 75 | y = pd.Series(iris.target_names[iris.target], name="target") 76 | X_res, y_res = make_imbalance(X, y, sampling_strategy=sampling_strategy) 77 | if as_frame: 78 | assert hasattr(X_res, "loc") 79 | pd.testing.assert_index_equal(X_res.index, y_res.index) 80 | assert Counter(y_res) == expected_counts 81 | -------------------------------------------------------------------------------- /imblearn/datasets/tests/test_zenodo.py: -------------------------------------------------------------------------------- 1 | """Test the datasets loader. 
2 | 3 | Skipped if the datasets are not already downloaded to data_home. 4 | """ 5 | # Authors: Guillaume Lemaitre 6 | # Christos Aridas 7 | # License: MIT 8 | 9 | import pytest 10 | from sklearn.utils._testing import SkipTest 11 | 12 | from imblearn.datasets import fetch_datasets 13 | 14 | DATASET_SHAPE = { 15 | "ecoli": (336, 7), 16 | "optical_digits": (5620, 64), 17 | "satimage": (6435, 36), 18 | "pen_digits": (10992, 16), 19 | "abalone": (4177, 10), 20 | "sick_euthyroid": (3163, 42), 21 | "spectrometer": (531, 93), 22 | "car_eval_34": (1728, 21), 23 | "isolet": (7797, 617), 24 | "us_crime": (1994, 100), 25 | "yeast_ml8": (2417, 103), 26 | "scene": (2407, 294), 27 | "libras_move": (360, 90), 28 | "thyroid_sick": (3772, 52), 29 | "coil_2000": (9822, 85), 30 | "arrhythmia": (452, 278), 31 | "solar_flare_m0": (1389, 32), 32 | "oil": (937, 49), 33 | "car_eval_4": (1728, 21), 34 | "wine_quality": (4898, 11), 35 | "letter_img": (20000, 16), 36 | "yeast_me2": (1484, 8), 37 | "webpage": (34780, 300), 38 | "ozone_level": (2536, 72), 39 | "mammography": (11183, 6), 40 | "protein_homo": (145751, 74), 41 | "abalone_19": (4177, 10), 42 | } 43 | 44 | 45 | def fetch(*args, **kwargs): 46 | return fetch_datasets(*args, download_if_missing=True, **kwargs) 47 | 48 | 49 | @pytest.mark.xfail 50 | def test_fetch(): 51 | try: 52 | datasets1 = fetch(shuffle=True, random_state=42) 53 | except IOError: 54 | raise SkipTest("Zenodo dataset cannot be loaded.") 55 | 56 | datasets2 = fetch(shuffle=True, random_state=37) 57 | 58 | for k in DATASET_SHAPE.keys(): 59 | X1, X2 = datasets1[k].data, datasets2[k].data 60 | assert DATASET_SHAPE[k] == X1.shape 61 | assert X1.shape == X2.shape 62 | 63 | y1, y2 = datasets1[k].target, datasets2[k].target 64 | assert (X1.shape[0],) == y1.shape 65 | assert (X1.shape[0],) == y2.shape 66 | 67 | 68 | def test_fetch_filter(): 69 | try: 70 | datasets1 = fetch(filter_data=tuple([1]), shuffle=True, random_state=42) 71 | except IOError: 72 | raise SkipTest("Zenodo dataset cannot be loaded.") 73 | 74 | datasets2 = fetch(filter_data=tuple(["ecoli"]), shuffle=True, random_state=37) 75 | 76 | X1, X2 = datasets1["ecoli"].data, datasets2["ecoli"].data 77 | assert DATASET_SHAPE["ecoli"] == X1.shape 78 | assert X1.shape == X2.shape 79 | 80 | assert X1.sum() == pytest.approx(X2.sum()) 81 | 82 | y1, y2 = datasets1["ecoli"].target, datasets2["ecoli"].target 83 | assert (X1.shape[0],) == y1.shape 84 | assert (X1.shape[0],) == y2.shape 85 | 86 | 87 | @pytest.mark.parametrize( 88 | "filter_data, err_msg", 89 | [ 90 | (("rnf",), "is not a dataset available"), 91 | ((-1,), "dataset with the ID="), 92 | ((100,), "dataset with the ID="), 93 | ((1.00,), "value in the tuple"), 94 | ], 95 | ) 96 | def test_fetch_error(filter_data, err_msg): 97 | with pytest.raises(ValueError, match=err_msg): 98 | fetch_datasets(filter_data=filter_data) 99 | -------------------------------------------------------------------------------- /imblearn/ensemble/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | The :mod:`imblearn.ensemble` module includes methods generating 3 | under-sampled subsets combined inside an ensemble. 
4 | """ 5 | 6 | from ._bagging import BalancedBaggingClassifier 7 | from ._easy_ensemble import EasyEnsembleClassifier 8 | from ._forest import BalancedRandomForestClassifier 9 | from ._weight_boosting import RUSBoostClassifier 10 | 11 | __all__ = [ 12 | "BalancedBaggingClassifier", 13 | "BalancedRandomForestClassifier", 14 | "EasyEnsembleClassifier", 15 | "RUSBoostClassifier", 16 | ] 17 | -------------------------------------------------------------------------------- /imblearn/ensemble/_common.py: -------------------------------------------------------------------------------- 1 | from numbers import Integral, Real 2 | 3 | from sklearn.tree._criterion import Criterion 4 | from sklearn.utils._param_validation import ( 5 | HasMethods, 6 | Hidden, 7 | Interval, 8 | RealNotInt, 9 | StrOptions, 10 | ) 11 | 12 | 13 | def _estimator_has(attr): 14 | """Check if we can delegate a method to the underlying estimator. 15 | First, we check the first fitted estimator if available, otherwise we 16 | check the estimator attribute. 17 | """ 18 | 19 | def check(self): 20 | if hasattr(self, "estimators_"): 21 | return hasattr(self.estimators_[0], attr) 22 | elif self.estimator is not None: 23 | return hasattr(self.estimator, attr) 24 | else: # TODO(1.4): Remove when the base_estimator deprecation cycle ends 25 | return hasattr(self.base_estimator, attr) 26 | 27 | return check 28 | 29 | 30 | _bagging_parameter_constraints = { 31 | "estimator": [HasMethods(["fit", "predict"]), None], 32 | "n_estimators": [Interval(Integral, 1, None, closed="left")], 33 | "max_samples": [ 34 | Interval(Integral, 1, None, closed="left"), 35 | Interval(RealNotInt, 0, 1, closed="right"), 36 | ], 37 | "max_features": [ 38 | Interval(Integral, 1, None, closed="left"), 39 | Interval(RealNotInt, 0, 1, closed="right"), 40 | ], 41 | "bootstrap": ["boolean"], 42 | "bootstrap_features": ["boolean"], 43 | "oob_score": ["boolean"], 44 | "warm_start": ["boolean"], 45 | "n_jobs": [None, Integral], 46 | "random_state": ["random_state"], 47 | "verbose": ["verbose"], 48 | "base_estimator": [ 49 | HasMethods(["fit", "predict"]), 50 | StrOptions({"deprecated"}), 51 | None, 52 | ], 53 | } 54 | 55 | _adaboost_classifier_parameter_constraints = { 56 | "estimator": [HasMethods(["fit", "predict"]), None], 57 | "n_estimators": [Interval(Integral, 1, None, closed="left")], 58 | "learning_rate": [Interval(Real, 0, None, closed="neither")], 59 | "random_state": ["random_state"], 60 | "base_estimator": [HasMethods(["fit", "predict"]), StrOptions({"deprecated"})], 61 | "algorithm": [StrOptions({"SAMME", "SAMME.R"})], 62 | } 63 | 64 | _random_forest_classifier_parameter_constraints = { 65 | "n_estimators": [Interval(Integral, 1, None, closed="left")], 66 | "bootstrap": ["boolean"], 67 | "oob_score": ["boolean"], 68 | "n_jobs": [Integral, None], 69 | "random_state": ["random_state"], 70 | "verbose": ["verbose"], 71 | "warm_start": ["boolean"], 72 | "criterion": [StrOptions({"gini", "entropy", "log_loss"}), Hidden(Criterion)], 73 | "max_samples": [ 74 | None, 75 | Interval(Real, 0.0, 1.0, closed="right"), 76 | Interval(Integral, 1, None, closed="left"), 77 | ], 78 | "max_depth": [Interval(Integral, 1, None, closed="left"), None], 79 | "min_samples_split": [ 80 | Interval(Integral, 2, None, closed="left"), 81 | Interval(RealNotInt, 0.0, 1.0, closed="right"), 82 | ], 83 | "min_samples_leaf": [ 84 | Interval(Integral, 1, None, closed="left"), 85 | Interval(RealNotInt, 0.0, 1.0, closed="neither"), 86 | ], 87 | "min_weight_fraction_leaf": [Interval(Real, 0.0, 0.5, 
closed="both")], 88 | "max_features": [ 89 | Interval(Integral, 1, None, closed="left"), 90 | Interval(RealNotInt, 0.0, 1.0, closed="right"), 91 | StrOptions({"sqrt", "log2"}), 92 | None, 93 | ], 94 | "max_leaf_nodes": [Interval(Integral, 2, None, closed="left"), None], 95 | "min_impurity_decrease": [Interval(Real, 0.0, None, closed="left")], 96 | "ccp_alpha": [Interval(Real, 0.0, None, closed="left")], 97 | "class_weight": [ 98 | StrOptions({"balanced_subsample", "balanced"}), 99 | dict, 100 | list, 101 | None, 102 | ], 103 | "monotonic_cst": ["array-like", None], 104 | } 105 | -------------------------------------------------------------------------------- /imblearn/ensemble/tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/scikit-learn-contrib/imbalanced-learn/c975dd68f50beb88e1331cdd47780acd4fa95397/imblearn/ensemble/tests/__init__.py -------------------------------------------------------------------------------- /imblearn/ensemble/tests/test_weight_boosting.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pytest 3 | from sklearn.datasets import make_classification 4 | from sklearn.model_selection import train_test_split 5 | from sklearn.utils._testing import assert_array_equal 6 | 7 | from imblearn.ensemble import RUSBoostClassifier 8 | 9 | 10 | @pytest.fixture 11 | def imbalanced_dataset(): 12 | return make_classification( 13 | n_samples=10000, 14 | n_features=3, 15 | n_informative=2, 16 | n_redundant=0, 17 | n_repeated=0, 18 | n_classes=3, 19 | n_clusters_per_class=1, 20 | weights=[0.01, 0.05, 0.94], 21 | class_sep=0.8, 22 | random_state=0, 23 | ) 24 | 25 | 26 | def test_rusboost(imbalanced_dataset): 27 | X, y = imbalanced_dataset 28 | X_train, X_test, y_train, y_test = train_test_split( 29 | X, y, stratify=y, random_state=1 30 | ) 31 | classes = np.unique(y) 32 | 33 | n_estimators = 500 34 | rusboost = RUSBoostClassifier(n_estimators=n_estimators, random_state=0) 35 | rusboost.fit(X_train, y_train) 36 | assert_array_equal(classes, rusboost.classes_) 37 | 38 | # check that we have an ensemble of samplers and estimators with a 39 | # consistent size 40 | assert len(rusboost.estimators_) > 1 41 | assert len(rusboost.estimators_) == len(rusboost.samplers_) 42 | assert len(rusboost.pipelines_) == len(rusboost.samplers_) 43 | 44 | # each sampler in the ensemble should have different random state 45 | assert len({sampler.random_state for sampler in rusboost.samplers_}) == len( 46 | rusboost.samplers_ 47 | ) 48 | # each estimator in the ensemble should have different random state 49 | assert len({est.random_state for est in rusboost.estimators_}) == len( 50 | rusboost.estimators_ 51 | ) 52 | 53 | # check the consistency of the feature importances 54 | assert len(rusboost.feature_importances_) == imbalanced_dataset[0].shape[1] 55 | 56 | # check the consistency of the prediction outpus 57 | y_pred = rusboost.predict_proba(X_test) 58 | assert y_pred.shape[1] == len(classes) 59 | assert rusboost.decision_function(X_test).shape[1] == len(classes) 60 | 61 | score = rusboost.score(X_test, y_test) 62 | assert score > 0.6, f"Failed with score {score}" 63 | 64 | y_pred = rusboost.predict(X_test) 65 | assert y_pred.shape == y_test.shape 66 | 67 | 68 | def test_rusboost_sample_weight(imbalanced_dataset): 69 | X, y = imbalanced_dataset 70 | sample_weight = np.ones_like(y) 71 | rusboost = RUSBoostClassifier(random_state=0) 72 | 73 | # Predictions 
should be the same when sample_weight are all ones 74 | y_pred_sample_weight = rusboost.fit(X, y, sample_weight).predict(X) 75 | y_pred_no_sample_weight = rusboost.fit(X, y).predict(X) 76 | 77 | assert_array_equal(y_pred_sample_weight, y_pred_no_sample_weight) 78 | 79 | rng = np.random.RandomState(42) 80 | sample_weight = rng.rand(y.shape[0]) 81 | y_pred_sample_weight = rusboost.fit(X, y, sample_weight).predict(X) 82 | 83 | with pytest.raises(AssertionError): 84 | assert_array_equal(y_pred_no_sample_weight, y_pred_sample_weight) 85 | 86 | 87 | @pytest.mark.parametrize("algorithm", ["SAMME", "SAMME.R"]) 88 | def test_rusboost_algorithm(imbalanced_dataset, algorithm): 89 | X, y = imbalanced_dataset 90 | 91 | rusboost = RUSBoostClassifier(algorithm=algorithm) 92 | warn_msg = "`algorithm` parameter is deprecated in 0.12 and will be removed" 93 | with pytest.warns(FutureWarning, match=warn_msg): 94 | rusboost.fit(X, y) 95 | -------------------------------------------------------------------------------- /imblearn/exceptions.py: -------------------------------------------------------------------------------- 1 | """ 2 | The :mod:`imblearn.exceptions` module includes all custom warnings and error 3 | classes and functions used across imbalanced-learn. 4 | """ 5 | 6 | # Authors: Guillaume Lemaitre 7 | # License: MIT 8 | 9 | 10 | def raise_isinstance_error(variable_name, possible_type, variable): 11 | """Raise consistent error message for isinstance() function. 12 | 13 | Parameters 14 | ---------- 15 | variable_name : str 16 | The name of the variable. 17 | 18 | possible_type : type 19 | The possible type of the variable. 20 | 21 | variable : object 22 | The variable to check. 23 | 24 | Raises 25 | ------ 26 | ValueError 27 | If the instance is not of the possible type. 28 | """ 29 | raise ValueError( 30 | f"{variable_name} has to be one of {possible_type}. " 31 | f"Got {type(variable)} instead." 32 | ) 33 | -------------------------------------------------------------------------------- /imblearn/keras/__init__.py: -------------------------------------------------------------------------------- 1 | """The :mod:`imblearn.keras` provides utilities to deal with imbalanced dataset 2 | in keras.""" 3 | 4 | from ._generator import BalancedBatchGenerator, balanced_batch_generator 5 | 6 | __all__ = ["BalancedBatchGenerator", "balanced_batch_generator"] 7 | -------------------------------------------------------------------------------- /imblearn/keras/tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/scikit-learn-contrib/imbalanced-learn/c975dd68f50beb88e1331cdd47780acd4fa95397/imblearn/keras/tests/__init__.py -------------------------------------------------------------------------------- /imblearn/metrics/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | The :mod:`imblearn.metrics` module includes score functions, performance 3 | metrics and pairwise metrics and distance computations. 
4 | """ 5 | 6 | from ._classification import ( 7 | classification_report_imbalanced, 8 | geometric_mean_score, 9 | macro_averaged_mean_absolute_error, 10 | make_index_balanced_accuracy, 11 | sensitivity_score, 12 | sensitivity_specificity_support, 13 | specificity_score, 14 | ) 15 | 16 | __all__ = [ 17 | "sensitivity_specificity_support", 18 | "sensitivity_score", 19 | "specificity_score", 20 | "geometric_mean_score", 21 | "make_index_balanced_accuracy", 22 | "classification_report_imbalanced", 23 | "macro_averaged_mean_absolute_error", 24 | ] 25 | -------------------------------------------------------------------------------- /imblearn/metrics/tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/scikit-learn-contrib/imbalanced-learn/c975dd68f50beb88e1331cdd47780acd4fa95397/imblearn/metrics/tests/__init__.py -------------------------------------------------------------------------------- /imblearn/metrics/tests/test_score_objects.py: -------------------------------------------------------------------------------- 1 | """Test for score""" 2 | # Authors: Guillaume Lemaitre 3 | # Christos Aridas 4 | # License: MIT 5 | 6 | import pytest 7 | from sklearn.datasets import make_blobs 8 | from sklearn.linear_model import LogisticRegression 9 | from sklearn.metrics import make_scorer 10 | from sklearn.model_selection import GridSearchCV, train_test_split 11 | 12 | from imblearn.metrics import ( 13 | geometric_mean_score, 14 | make_index_balanced_accuracy, 15 | sensitivity_score, 16 | specificity_score, 17 | ) 18 | 19 | R_TOL = 1e-2 20 | 21 | 22 | @pytest.fixture 23 | def data(): 24 | X, y = make_blobs(random_state=0, centers=2) 25 | return train_test_split(X, y, random_state=0) 26 | 27 | 28 | @pytest.mark.parametrize( 29 | "score, expected_score", 30 | [ 31 | (sensitivity_score, 0.90), 32 | (specificity_score, 0.90), 33 | (geometric_mean_score, 0.90), 34 | (make_index_balanced_accuracy()(geometric_mean_score), 0.82), 35 | ], 36 | ) 37 | @pytest.mark.parametrize("average", ["macro", "weighted", "micro"]) 38 | def test_scorer_common_average(data, score, expected_score, average): 39 | X_train, X_test, y_train, _ = data 40 | 41 | scorer = make_scorer(score, pos_label=None, average=average) 42 | grid = GridSearchCV( 43 | LogisticRegression(), 44 | param_grid={"C": [1, 10]}, 45 | scoring=scorer, 46 | cv=3, 47 | ) 48 | grid.fit(X_train, y_train).predict(X_test) 49 | 50 | assert grid.best_score_ >= expected_score 51 | 52 | 53 | @pytest.mark.parametrize( 54 | "score, average, expected_score", 55 | [ 56 | (sensitivity_score, "binary", 0.94), 57 | (specificity_score, "binary", 0.89), 58 | (geometric_mean_score, "multiclass", 0.90), 59 | ( 60 | make_index_balanced_accuracy()(geometric_mean_score), 61 | "multiclass", 62 | 0.82, 63 | ), 64 | ], 65 | ) 66 | def test_scorer_default_average(data, score, average, expected_score): 67 | X_train, X_test, y_train, _ = data 68 | 69 | scorer = make_scorer(score, pos_label=1, average=average) 70 | grid = GridSearchCV( 71 | LogisticRegression(), 72 | param_grid={"C": [1, 10]}, 73 | scoring=scorer, 74 | cv=3, 75 | ) 76 | grid.fit(X_train, y_train).predict(X_test) 77 | 78 | assert grid.best_score_ >= expected_score 79 | -------------------------------------------------------------------------------- /imblearn/over_sampling/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | The :mod:`imblearn.over_sampling` provides a set of method to 3 | 
perform over-sampling. 4 | """ 5 | 6 | from ._adasyn import ADASYN 7 | from ._random_over_sampler import RandomOverSampler 8 | from ._smote import SMOTE, SMOTEN, SMOTENC, SVMSMOTE, BorderlineSMOTE, KMeansSMOTE 9 | 10 | __all__ = [ 11 | "ADASYN", 12 | "RandomOverSampler", 13 | "KMeansSMOTE", 14 | "SMOTE", 15 | "BorderlineSMOTE", 16 | "SVMSMOTE", 17 | "SMOTENC", 18 | "SMOTEN", 19 | ] 20 | -------------------------------------------------------------------------------- /imblearn/over_sampling/_smote/__init__.py: -------------------------------------------------------------------------------- 1 | from .base import SMOTE, SMOTEN, SMOTENC 2 | from .cluster import KMeansSMOTE 3 | from .filter import SVMSMOTE, BorderlineSMOTE 4 | 5 | __all__ = [ 6 | "SMOTE", 7 | "SMOTEN", 8 | "SMOTENC", 9 | "KMeansSMOTE", 10 | "BorderlineSMOTE", 11 | "SVMSMOTE", 12 | ] 13 | -------------------------------------------------------------------------------- /imblearn/over_sampling/_smote/tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/scikit-learn-contrib/imbalanced-learn/c975dd68f50beb88e1331cdd47780acd4fa95397/imblearn/over_sampling/_smote/tests/__init__.py -------------------------------------------------------------------------------- /imblearn/over_sampling/_smote/tests/test_borderline_smote.py: -------------------------------------------------------------------------------- 1 | from collections import Counter 2 | 3 | import pytest 4 | from sklearn.datasets import make_classification 5 | from sklearn.linear_model import LogisticRegression 6 | from sklearn.utils._testing import assert_allclose, assert_array_equal 7 | 8 | from imblearn.over_sampling import BorderlineSMOTE 9 | 10 | 11 | @pytest.mark.parametrize("kind", ["borderline-1", "borderline-2"]) 12 | def test_borderline_smote_no_in_danger_samples(kind): 13 | """Check that the algorithm behave properly even on a dataset without any sample 14 | in danger. 15 | """ 16 | X, y = make_classification( 17 | n_samples=500, 18 | n_features=2, 19 | n_informative=2, 20 | n_redundant=0, 21 | n_repeated=0, 22 | n_clusters_per_class=1, 23 | n_classes=3, 24 | weights=[0.1, 0.2, 0.7], 25 | class_sep=1.5, 26 | random_state=1, 27 | ) 28 | smote = BorderlineSMOTE(kind=kind, m_neighbors=3, k_neighbors=5, random_state=0) 29 | X_res, y_res = smote.fit_resample(X, y) 30 | 31 | assert_allclose(X, X_res) 32 | assert_allclose(y, y_res) 33 | assert not smote.in_danger_indices 34 | 35 | 36 | def test_borderline_smote_kind(): 37 | """Check the behaviour of the `kind` parameter. 38 | 39 | In short, "borderline-2" generates sample closer to the boundary decision than 40 | "borderline-1". We generate an example where a logistic regression will perform 41 | worse on "borderline-2" than on "borderline-1". 
42 | """ 43 | X, y = make_classification( 44 | n_samples=500, 45 | n_features=2, 46 | n_informative=2, 47 | n_redundant=0, 48 | n_repeated=0, 49 | n_clusters_per_class=1, 50 | n_classes=3, 51 | weights=[0.1, 0.2, 0.7], 52 | class_sep=1.0, 53 | random_state=1, 54 | ) 55 | smote = BorderlineSMOTE( 56 | kind="borderline-1", m_neighbors=9, k_neighbors=5, random_state=0 57 | ) 58 | X_res_borderline_1, y_res_borderline_1 = smote.fit_resample(X, y) 59 | smote.set_params(kind="borderline-2") 60 | X_res_borderline_2, y_res_borderline_2 = smote.fit_resample(X, y) 61 | 62 | score_borderline_1 = ( 63 | LogisticRegression() 64 | .fit(X_res_borderline_1, y_res_borderline_1) 65 | .score(X_res_borderline_1, y_res_borderline_1) 66 | ) 67 | score_borderline_2 = ( 68 | LogisticRegression() 69 | .fit(X_res_borderline_2, y_res_borderline_2) 70 | .score(X_res_borderline_2, y_res_borderline_2) 71 | ) 72 | assert score_borderline_1 > score_borderline_2 73 | 74 | 75 | def test_borderline_smote_in_danger(): 76 | X, y = make_classification( 77 | n_samples=500, 78 | n_features=2, 79 | n_informative=2, 80 | n_redundant=0, 81 | n_repeated=0, 82 | n_clusters_per_class=1, 83 | n_classes=3, 84 | weights=[0.1, 0.2, 0.7], 85 | class_sep=0.8, 86 | random_state=1, 87 | ) 88 | smote = BorderlineSMOTE( 89 | kind="borderline-1", 90 | m_neighbors=9, 91 | k_neighbors=5, 92 | random_state=0, 93 | ) 94 | _, y_res_1 = smote.fit_resample(X, y) 95 | in_danger_indices_borderline_1 = smote.in_danger_indices 96 | smote.set_params(kind="borderline-2") 97 | _, y_res_2 = smote.fit_resample(X, y) 98 | in_danger_indices_borderline_2 = smote.in_danger_indices 99 | 100 | for key1, key2 in zip( 101 | in_danger_indices_borderline_1, in_danger_indices_borderline_2 102 | ): 103 | assert_array_equal( 104 | in_danger_indices_borderline_1[key1], in_danger_indices_borderline_2[key2] 105 | ) 106 | assert len(in_danger_indices_borderline_1) == len(in_danger_indices_borderline_2) 107 | counter = Counter(y_res_1) 108 | assert counter[0] == counter[1] == counter[2] 109 | counter = Counter(y_res_2) 110 | assert counter[0] == counter[1] == counter[2] 111 | -------------------------------------------------------------------------------- /imblearn/over_sampling/_smote/tests/test_kmeans_smote.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pytest 3 | from sklearn.cluster import KMeans, MiniBatchKMeans 4 | from sklearn.datasets import make_classification 5 | from sklearn.neighbors import NearestNeighbors 6 | from sklearn.utils._testing import assert_allclose, assert_array_equal 7 | 8 | from imblearn.over_sampling import SMOTE, KMeansSMOTE 9 | 10 | 11 | @pytest.fixture 12 | def data(): 13 | X = np.array( 14 | [ 15 | [0.11622591, -0.0317206], 16 | [0.77481731, 0.60935141], 17 | [1.25192108, -0.22367336], 18 | [0.53366841, -0.30312976], 19 | [1.52091956, -0.49283504], 20 | [-0.28162401, -2.10400981], 21 | [0.83680821, 1.72827342], 22 | [0.3084254, 0.33299982], 23 | [0.70472253, -0.73309052], 24 | [0.28893132, -0.38761769], 25 | [1.15514042, 0.0129463], 26 | [0.88407872, 0.35454207], 27 | [1.31301027, -0.92648734], 28 | [-1.11515198, -0.93689695], 29 | [-0.18410027, -0.45194484], 30 | [0.9281014, 0.53085498], 31 | [-0.14374509, 0.27370049], 32 | [-0.41635887, -0.38299653], 33 | [0.08711622, 0.93259929], 34 | [1.70580611, -0.11219234], 35 | ] 36 | ) 37 | y = np.array([0, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0]) 38 | return X, y 39 | 40 | 41 | @pytest.mark.filterwarnings("ignore:The 
default value of `n_init` will change") 42 | def test_kmeans_smote(data): 43 | X, y = data 44 | kmeans_smote = KMeansSMOTE( 45 | kmeans_estimator=1, 46 | random_state=42, 47 | cluster_balance_threshold=0.0, 48 | k_neighbors=5, 49 | ) 50 | smote = SMOTE(random_state=42) 51 | 52 | X_res_1, y_res_1 = kmeans_smote.fit_resample(X, y) 53 | X_res_2, y_res_2 = smote.fit_resample(X, y) 54 | 55 | assert_allclose(X_res_1, X_res_2) 56 | assert_array_equal(y_res_1, y_res_2) 57 | 58 | assert kmeans_smote.nn_k_.n_neighbors == 6 59 | assert kmeans_smote.kmeans_estimator_.n_clusters == 1 60 | assert "batch_size" in kmeans_smote.kmeans_estimator_.get_params() 61 | 62 | 63 | @pytest.mark.filterwarnings("ignore:The default value of `n_init` will change") 64 | @pytest.mark.parametrize("k_neighbors", [2, NearestNeighbors(n_neighbors=3)]) 65 | @pytest.mark.parametrize( 66 | "kmeans_estimator", 67 | [ 68 | 3, 69 | KMeans(n_clusters=3, n_init=1, random_state=42), 70 | MiniBatchKMeans(n_clusters=3, n_init=1, random_state=42), 71 | ], 72 | ) 73 | def test_sample_kmeans_custom(data, k_neighbors, kmeans_estimator): 74 | X, y = data 75 | kmeans_smote = KMeansSMOTE( 76 | random_state=42, 77 | kmeans_estimator=kmeans_estimator, 78 | k_neighbors=k_neighbors, 79 | ) 80 | X_resampled, y_resampled = kmeans_smote.fit_resample(X, y) 81 | assert X_resampled.shape == (24, 2) 82 | assert y_resampled.shape == (24,) 83 | 84 | assert kmeans_smote.nn_k_.n_neighbors == 3 85 | assert kmeans_smote.kmeans_estimator_.n_clusters == 3 86 | 87 | 88 | @pytest.mark.filterwarnings("ignore:The default value of `n_init` will change") 89 | def test_sample_kmeans_not_enough_clusters(data): 90 | X, y = data 91 | smote = KMeansSMOTE(cluster_balance_threshold=10, random_state=42) 92 | with pytest.raises(RuntimeError): 93 | smote.fit_resample(X, y) 94 | 95 | 96 | @pytest.mark.parametrize("density_exponent", ["auto", 10]) 97 | @pytest.mark.parametrize("cluster_balance_threshold", ["auto", 0.1]) 98 | def test_sample_kmeans_density_estimation(density_exponent, cluster_balance_threshold): 99 | X, y = make_classification( 100 | n_samples=10_000, n_classes=2, weights=[0.3, 0.7], random_state=42 101 | ) 102 | smote = KMeansSMOTE( 103 | kmeans_estimator=MiniBatchKMeans(n_init=1, random_state=42), 104 | random_state=0, 105 | density_exponent=density_exponent, 106 | cluster_balance_threshold=cluster_balance_threshold, 107 | ) 108 | smote.fit_resample(X, y) 109 | -------------------------------------------------------------------------------- /imblearn/over_sampling/_smote/tests/test_smoten.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pytest 3 | from sklearn.exceptions import DataConversionWarning 4 | from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder 5 | from sklearn.utils._testing import _convert_container 6 | 7 | from imblearn.over_sampling import SMOTEN 8 | 9 | 10 | @pytest.fixture 11 | def data(): 12 | rng = np.random.RandomState(0) 13 | 14 | feature_1 = ["A"] * 10 + ["B"] * 20 + ["C"] * 30 15 | feature_2 = ["A"] * 40 + ["B"] * 20 16 | feature_3 = ["A"] * 20 + ["B"] * 20 + ["C"] * 10 + ["D"] * 10 17 | X = np.array([feature_1, feature_2, feature_3], dtype=object).T 18 | rng.shuffle(X) 19 | y = np.array([0] * 20 + [1] * 40, dtype=np.int32) 20 | y_labels = np.array(["not apple", "apple"], dtype=object) 21 | y = y_labels[y] 22 | return X, y 23 | 24 | 25 | def test_smoten(data): 26 | # overall check for SMOTEN 27 | X, y = data 28 | sampler = SMOTEN(random_state=0) 29 | X_res, 
y_res = sampler.fit_resample(X, y) 30 | 31 | assert X_res.shape == (80, 3) 32 | assert y_res.shape == (80,) 33 | assert isinstance(sampler.categorical_encoder_, OrdinalEncoder) 34 | 35 | 36 | def test_smoten_resampling(): 37 | # check if the SMOTEN resample data as expected 38 | # we generate data such that "not apple" will be the minority class and 39 | # samples from this class will be generated. We will force the "blue" 40 | # category to be associated with this class. Therefore, the new generated 41 | # samples should as well be from the "blue" category. 42 | X = np.array(["green"] * 5 + ["red"] * 10 + ["blue"] * 7, dtype=object).reshape( 43 | -1, 1 44 | ) 45 | y = np.array( 46 | ["apple"] * 5 47 | + ["not apple"] * 3 48 | + ["apple"] * 7 49 | + ["not apple"] * 5 50 | + ["apple"] * 2, 51 | dtype=object, 52 | ) 53 | sampler = SMOTEN(random_state=0) 54 | X_res, y_res = sampler.fit_resample(X, y) 55 | 56 | X_generated, y_generated = X_res[X.shape[0] :], y_res[X.shape[0] :] 57 | np.testing.assert_array_equal(X_generated, "blue") 58 | np.testing.assert_array_equal(y_generated, "not apple") 59 | 60 | 61 | @pytest.mark.parametrize("sparse_format", ["sparse_csr", "sparse_csc"]) 62 | def test_smoten_sparse_input(data, sparse_format): 63 | """Check that we handle sparse input in SMOTEN even if it is not efficient. 64 | 65 | Non-regression test for: 66 | https://github.com/scikit-learn-contrib/imbalanced-learn/issues/971 67 | """ 68 | X, y = data 69 | X = OneHotEncoder().fit_transform(X).toarray() 70 | X = _convert_container(X, sparse_format) 71 | 72 | with pytest.warns(DataConversionWarning, match="is not really efficient"): 73 | X_res, y_res = SMOTEN(random_state=0).fit_resample(X, y) 74 | 75 | assert X_res.format == X.format 76 | assert X_res.shape[0] == len(y_res) 77 | 78 | 79 | def test_smoten_categorical_encoder(data): 80 | """Check that `categorical_encoder` is used when provided.""" 81 | 82 | X, y = data 83 | sampler = SMOTEN(random_state=0) 84 | sampler.fit_resample(X, y) 85 | 86 | assert isinstance(sampler.categorical_encoder_, OrdinalEncoder) 87 | assert sampler.categorical_encoder_.dtype == np.int32 88 | 89 | encoder = OrdinalEncoder(dtype=np.int64) 90 | sampler.set_params(categorical_encoder=encoder).fit_resample(X, y) 91 | 92 | assert isinstance(sampler.categorical_encoder_, OrdinalEncoder) 93 | assert sampler.categorical_encoder is encoder 94 | assert sampler.categorical_encoder_ is not encoder 95 | assert sampler.categorical_encoder_.dtype == np.int64 96 | -------------------------------------------------------------------------------- /imblearn/over_sampling/_smote/tests/test_svm_smote.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pytest 3 | from sklearn.datasets import make_classification 4 | from sklearn.linear_model import LogisticRegression 5 | from sklearn.neighbors import NearestNeighbors 6 | from sklearn.svm import SVC 7 | from sklearn.utils._testing import assert_allclose, assert_array_equal 8 | 9 | from imblearn.over_sampling import SVMSMOTE 10 | 11 | 12 | @pytest.fixture 13 | def data(): 14 | X = np.array( 15 | [ 16 | [0.11622591, -0.0317206], 17 | [0.77481731, 0.60935141], 18 | [1.25192108, -0.22367336], 19 | [0.53366841, -0.30312976], 20 | [1.52091956, -0.49283504], 21 | [-0.28162401, -2.10400981], 22 | [0.83680821, 1.72827342], 23 | [0.3084254, 0.33299982], 24 | [0.70472253, -0.73309052], 25 | [0.28893132, -0.38761769], 26 | [1.15514042, 0.0129463], 27 | [0.88407872, 0.35454207], 28 | 
[1.31301027, -0.92648734], 29 | [-1.11515198, -0.93689695], 30 | [-0.18410027, -0.45194484], 31 | [0.9281014, 0.53085498], 32 | [-0.14374509, 0.27370049], 33 | [-0.41635887, -0.38299653], 34 | [0.08711622, 0.93259929], 35 | [1.70580611, -0.11219234], 36 | ] 37 | ) 38 | y = np.array([0, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0]) 39 | return X, y 40 | 41 | 42 | def test_svm_smote(data): 43 | svm_smote = SVMSMOTE(random_state=42) 44 | svm_smote_nn = SVMSMOTE( 45 | random_state=42, 46 | k_neighbors=NearestNeighbors(n_neighbors=6), 47 | m_neighbors=NearestNeighbors(n_neighbors=11), 48 | svm_estimator=SVC(gamma="scale", random_state=42), 49 | ) 50 | 51 | X_res_1, y_res_1 = svm_smote.fit_resample(*data) 52 | X_res_2, y_res_2 = svm_smote_nn.fit_resample(*data) 53 | 54 | assert_allclose(X_res_1, X_res_2) 55 | assert_array_equal(y_res_1, y_res_2) 56 | 57 | 58 | def test_svm_smote_not_svm(data): 59 | """Check that we raise a proper error if passing an estimator that does not 60 | expose a `support_` fitted attribute.""" 61 | 62 | err_msg = "`svm_estimator` is required to exposed a `support_` fitted attribute." 63 | with pytest.raises(RuntimeError, match=err_msg): 64 | SVMSMOTE(svm_estimator=LogisticRegression()).fit_resample(*data) 65 | 66 | 67 | def test_svm_smote_all_noise(data): 68 | """Check that we raise a proper error message when all support vectors are 69 | detected as noise and there is nothing that we can do. 70 | 71 | Non-regression test for: 72 | https://github.com/scikit-learn-contrib/imbalanced-learn/issues/742 73 | """ 74 | X, y = make_classification( 75 | n_classes=3, 76 | class_sep=0.001, 77 | weights=[0.004, 0.451, 0.545], 78 | n_informative=3, 79 | n_redundant=0, 80 | flip_y=0, 81 | n_features=3, 82 | n_clusters_per_class=2, 83 | n_samples=1000, 84 | random_state=10, 85 | ) 86 | 87 | with pytest.raises(ValueError, match="SVM-SMOTE is not adapted to your dataset"): 88 | SVMSMOTE(k_neighbors=4, random_state=42).fit_resample(X, y) 89 | -------------------------------------------------------------------------------- /imblearn/over_sampling/base.py: -------------------------------------------------------------------------------- 1 | """ 2 | Base class for the over-sampling method. 3 | """ 4 | # Authors: Guillaume Lemaitre 5 | # Christos Aridas 6 | # License: MIT 7 | 8 | import numbers 9 | from collections.abc import Mapping 10 | 11 | from sklearn.utils._param_validation import Interval, StrOptions 12 | 13 | from ..base import BaseSampler 14 | 15 | 16 | class BaseOverSampler(BaseSampler): 17 | """Base class for over-sampling algorithms. 18 | 19 | Warning: This class should not be used directly. Use the derive classes 20 | instead. 21 | """ 22 | 23 | _sampling_type = "over-sampling" 24 | 25 | _sampling_strategy_docstring = ( 26 | """sampling_strategy : float, str, dict or callable, default='auto' 27 | Sampling information to resample the data set. 28 | 29 | - When ``float``, it corresponds to the desired ratio of the number of 30 | samples in the minority class over the number of samples in the 31 | majority class after resampling. Therefore, the ratio is expressed as 32 | :math:`\\alpha_{os} = N_{rm} / N_{M}` where :math:`N_{rm}` is the 33 | number of samples in the minority class after resampling and 34 | :math:`N_{M}` is the number of samples in the majority class. 35 | 36 | .. warning:: 37 | ``float`` is only available for **binary** classification. An 38 | error is raised for multi-class classification. 
39 | 40 | - When ``str``, specify the class targeted by the resampling. The 41 | number of samples in the different classes will be equalized. 42 | Possible choices are: 43 | 44 | ``'minority'``: resample only the minority class; 45 | 46 | ``'not minority'``: resample all classes but the minority class; 47 | 48 | ``'not majority'``: resample all classes but the majority class; 49 | 50 | ``'all'``: resample all classes; 51 | 52 | ``'auto'``: equivalent to ``'not majority'``. 53 | 54 | - When ``dict``, the keys correspond to the targeted classes. The 55 | values correspond to the desired number of samples for each targeted 56 | class. 57 | 58 | - When callable, function taking ``y`` and returns a ``dict``. The keys 59 | correspond to the targeted classes. The values correspond to the 60 | desired number of samples for each class. 61 | """.strip() 62 | ) # noqa: E501 63 | 64 | _parameter_constraints: dict = { 65 | "sampling_strategy": [ 66 | Interval(numbers.Real, 0, 1, closed="right"), 67 | StrOptions({"auto", "minority", "not minority", "not majority", "all"}), 68 | Mapping, 69 | callable, 70 | ], 71 | "random_state": ["random_state"], 72 | } 73 | -------------------------------------------------------------------------------- /imblearn/over_sampling/tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/scikit-learn-contrib/imbalanced-learn/c975dd68f50beb88e1331cdd47780acd4fa95397/imblearn/over_sampling/tests/__init__.py -------------------------------------------------------------------------------- /imblearn/over_sampling/tests/test_common.py: -------------------------------------------------------------------------------- 1 | from collections import Counter 2 | 3 | import numpy as np 4 | import pytest 5 | from sklearn.cluster import MiniBatchKMeans 6 | 7 | from imblearn.over_sampling import ( 8 | ADASYN, 9 | SMOTE, 10 | SMOTEN, 11 | SMOTENC, 12 | SVMSMOTE, 13 | BorderlineSMOTE, 14 | KMeansSMOTE, 15 | ) 16 | from imblearn.utils.testing import _CustomNearestNeighbors 17 | 18 | 19 | @pytest.fixture 20 | def numerical_data(): 21 | rng = np.random.RandomState(0) 22 | X = rng.randn(100, 2) 23 | y = np.repeat([0, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0], 5) 24 | 25 | return X, y 26 | 27 | 28 | @pytest.fixture 29 | def categorical_data(): 30 | rng = np.random.RandomState(0) 31 | 32 | feature_1 = ["A"] * 10 + ["B"] * 20 + ["C"] * 30 33 | feature_2 = ["A"] * 40 + ["B"] * 20 34 | feature_3 = ["A"] * 20 + ["B"] * 20 + ["C"] * 10 + ["D"] * 10 35 | X = np.array([feature_1, feature_2, feature_3], dtype=object).T 36 | rng.shuffle(X) 37 | y = np.array([0] * 20 + [1] * 40, dtype=np.int32) 38 | y_labels = np.array(["not apple", "apple"], dtype=object) 39 | y = y_labels[y] 40 | return X, y 41 | 42 | 43 | @pytest.fixture 44 | def heterogeneous_data(): 45 | rng = np.random.RandomState(42) 46 | X = np.empty((30, 4), dtype=object) 47 | X[:, :2] = rng.randn(30, 2) 48 | X[:, 2] = rng.choice(["a", "b", "c"], size=30).astype(object) 49 | X[:, 3] = rng.randint(3, size=30) 50 | y = np.array([0] * 10 + [1] * 20) 51 | return X, y, [2, 3] 52 | 53 | 54 | @pytest.mark.parametrize( 55 | "smote", [BorderlineSMOTE(), SVMSMOTE()], ids=["borderline", "svm"] 56 | ) 57 | def test_smote_m_neighbors(numerical_data, smote): 58 | # check that m_neighbors is properly set. 
Regression test for: 59 | # https://github.com/scikit-learn-contrib/imbalanced-learn/issues/568 60 | X, y = numerical_data 61 | _ = smote.fit_resample(X, y) 62 | assert smote.nn_k_.n_neighbors == 6 63 | assert smote.nn_m_.n_neighbors == 11 64 | 65 | 66 | @pytest.mark.parametrize( 67 | "smote, neighbor_estimator_name", 68 | [ 69 | (ADASYN(random_state=0), "n_neighbors"), 70 | (BorderlineSMOTE(random_state=0), "k_neighbors"), 71 | ( 72 | KMeansSMOTE( 73 | kmeans_estimator=MiniBatchKMeans(n_init=1, random_state=0), 74 | random_state=1, 75 | ), 76 | "k_neighbors", 77 | ), 78 | (SMOTE(random_state=0), "k_neighbors"), 79 | (SVMSMOTE(random_state=0), "k_neighbors"), 80 | ], 81 | ids=["adasyn", "borderline", "kmeans", "smote", "svm"], 82 | ) 83 | def test_numerical_smote_custom_nn(numerical_data, smote, neighbor_estimator_name): 84 | X, y = numerical_data 85 | params = { 86 | neighbor_estimator_name: _CustomNearestNeighbors(n_neighbors=5), 87 | } 88 | smote.set_params(**params) 89 | X_res, _ = smote.fit_resample(X, y) 90 | 91 | assert X_res.shape[0] >= 120 92 | 93 | 94 | def test_categorical_smote_k_custom_nn(categorical_data): 95 | X, y = categorical_data 96 | smote = SMOTEN(k_neighbors=_CustomNearestNeighbors(n_neighbors=5)) 97 | X_res, y_res = smote.fit_resample(X, y) 98 | 99 | assert X_res.shape == (80, 3) 100 | assert Counter(y_res) == {"apple": 40, "not apple": 40} 101 | 102 | 103 | def test_heterogeneous_smote_k_custom_nn(heterogeneous_data): 104 | X, y, categorical_features = heterogeneous_data 105 | smote = SMOTENC( 106 | categorical_features, k_neighbors=_CustomNearestNeighbors(n_neighbors=5) 107 | ) 108 | X_res, y_res = smote.fit_resample(X, y) 109 | 110 | assert X_res.shape == (40, 4) 111 | assert Counter(y_res) == {0: 20, 1: 20} 112 | 113 | 114 | @pytest.mark.parametrize( 115 | "smote", 116 | [BorderlineSMOTE(random_state=0), SVMSMOTE(random_state=0)], 117 | ids=["borderline", "svm"], 118 | ) 119 | def test_numerical_smote_extra_custom_nn(numerical_data, smote): 120 | X, y = numerical_data 121 | smote.set_params(m_neighbors=_CustomNearestNeighbors(n_neighbors=5)) 122 | X_res, y_res = smote.fit_resample(X, y) 123 | 124 | assert X_res.shape == (120, 2) 125 | assert Counter(y_res) == {0: 60, 1: 60} 126 | -------------------------------------------------------------------------------- /imblearn/tensorflow/__init__.py: -------------------------------------------------------------------------------- 1 | """The :mod:`imblearn.tensorflow` provides utilities to deal with imbalanced 2 | dataset in tensorflow.""" 3 | 4 | from ._generator import balanced_batch_generator 5 | 6 | __all__ = ["balanced_batch_generator"] 7 | -------------------------------------------------------------------------------- /imblearn/tensorflow/_generator.py: -------------------------------------------------------------------------------- 1 | """Implement generators for ``tensorflow`` which will balance the data.""" 2 | 3 | from scipy.sparse import issparse 4 | from sklearn.base import clone 5 | from sklearn.utils import _safe_indexing, check_random_state 6 | 7 | from ..under_sampling import RandomUnderSampler 8 | from ..utils import Substitution 9 | from ..utils._docstring import _random_state_docstring 10 | 11 | 12 | @Substitution(random_state=_random_state_docstring) 13 | def balanced_batch_generator( 14 | X, 15 | y, 16 | *, 17 | sample_weight=None, 18 | sampler=None, 19 | batch_size=32, 20 | keep_sparse=False, 21 | random_state=None, 22 | ): 23 | """Create a balanced batch generator to train tensorflow model. 
24 | 25 | Returns a generator --- as well as the number of steps per epoch --- to 26 | iterate over to get the mini-batches. The sampler defines the sampling strategy 27 | used to balance the dataset ahead of creating the batch. The sampler should 28 | have an attribute ``sample_indices_``. 29 | 30 | .. versionadded:: 0.4 31 | 32 | Parameters 33 | ---------- 34 | X : ndarray of shape (n_samples, n_features) 35 | Original imbalanced dataset. 36 | 37 | y : ndarray of shape (n_samples,) or (n_samples, n_classes) 38 | Associated targets. 39 | 40 | sample_weight : ndarray of shape (n_samples,), default=None 41 | Sample weight. 42 | 43 | sampler : sampler object, default=None 44 | A sampler instance which has an attribute ``sample_indices_``. 45 | By default, the sampler used is a 46 | :class:`~imblearn.under_sampling.RandomUnderSampler`. 47 | 48 | batch_size : int, default=32 49 | Number of samples per gradient update. 50 | 51 | keep_sparse : bool, default=False 52 | Whether or not to conserve the sparsity of the input ``X``. By 53 | default, the returned batches will be dense. 54 | 55 | {random_state} 56 | 57 | Returns 58 | ------- 59 | generator : generator of tuple 60 | Generates batches of data. The tuples generated are either (X_batch, 61 | y_batch) or (X_batch, y_batch, sample_weight_batch). 62 | 63 | steps_per_epoch : int 64 | The number of batches (steps) per epoch.
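    Examples
    --------
    A minimal usage sketch (illustrative only, on random data):

    >>> import numpy as np
    >>> from imblearn.tensorflow import balanced_batch_generator
    >>> X, y = np.random.randn(100, 2), np.array([0] * 90 + [1] * 10)
    >>> training_generator, steps_per_epoch = balanced_batch_generator(
    ...     X, y, batch_size=10, random_state=42)
    >>> steps_per_epoch  # 20 balanced samples split into batches of 10
    2
    >>> X_batch, y_batch = next(training_generator)
    >>> X_batch.shape
    (10, 2)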
-------------------------------------------------------------------------------- /imblearn/tensorflow/tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/scikit-learn-contrib/imbalanced-learn/c975dd68f50beb88e1331cdd47780acd4fa95397/imblearn/tensorflow/tests/__init__.py -------------------------------------------------------------------------------- /imblearn/tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/scikit-learn-contrib/imbalanced-learn/c975dd68f50beb88e1331cdd47780acd4fa95397/imblearn/tests/__init__.py -------------------------------------------------------------------------------- /imblearn/tests/test_base.py: -------------------------------------------------------------------------------- 1 | """Test for miscellaneous samplers objects.""" 2 | 3 | # Authors: Guillaume Lemaitre 4 | # License: MIT 5 | 6 | import numpy as np 7 | import pytest 8 | from scipy import sparse 9 | from sklearn.datasets import load_iris, make_regression 10 | from sklearn.linear_model import LinearRegression 11 | from sklearn.utils import _safe_indexing 12 | from sklearn.utils._testing import assert_allclose_dense_sparse, assert_array_equal 13 | from sklearn.utils.multiclass import type_of_target 14 | 15 | from imblearn import FunctionSampler 16 | from imblearn.datasets import make_imbalance 17 | from imblearn.pipeline import make_pipeline 18 | from imblearn.under_sampling import RandomUnderSampler 19 | 20 | iris = load_iris() 21 | X, y = make_imbalance( 22 | iris.data, iris.target, sampling_strategy={0: 10, 1: 25}, random_state=0 23 | ) 24 | 25 | 26 | def test_function_sampler_reject_sparse(): 27 | X_sparse = sparse.csr_matrix(X) 28 | sampler = FunctionSampler(accept_sparse=False) 29 | err_msg = "dense data is required" 30 | with pytest.raises( 31 | TypeError, 32 | match=err_msg, 33 | ): 34 | sampler.fit_resample(X_sparse, y) 35 | 36 | 37 | @pytest.mark.parametrize( 38 | "X, y", [(X, y), (sparse.csr_matrix(X), y), (sparse.csc_matrix(X), y)] 39 | ) 40 | def test_function_sampler_identity(X, y): 41 | sampler = FunctionSampler() 42 | X_res, y_res = sampler.fit_resample(X, y) 43 | assert_allclose_dense_sparse(X_res, X) 44 | assert_array_equal(y_res, y) 45 | 46 | 47 | @pytest.mark.parametrize( 48 | "X, y", [(X, y), (sparse.csr_matrix(X), y), (sparse.csc_matrix(X), y)] 49 | ) 50 | def test_function_sampler_func(X, y): 51 | def func(X, y): 52 | return X[:10], y[:10] 53 | 54 | sampler = FunctionSampler(func=func) 55 | X_res, y_res = sampler.fit_resample(X, y) 56 | assert_allclose_dense_sparse(X_res, X[:10]) 57 | assert_array_equal(y_res, y[:10]) 58 | 59 | 60 | @pytest.mark.parametrize( 61 | "X, y", [(X, y), (sparse.csr_matrix(X), y), (sparse.csc_matrix(X), y)] 62 | ) 63 | def test_function_sampler_func_kwargs(X, y): 64 | def func(X, y, sampling_strategy, random_state): 65 | rus = RandomUnderSampler( 66 | sampling_strategy=sampling_strategy, random_state=random_state 67 | ) 68 | return rus.fit_resample(X, y) 69 | 70 | sampler = FunctionSampler( 71 | func=func, kw_args={"sampling_strategy": "auto", "random_state": 0} 72 | ) 73 | X_res, y_res = sampler.fit_resample(X, y) 74 | X_res_2, y_res_2 = RandomUnderSampler(random_state=0).fit_resample(X, y) 75 | assert_allclose_dense_sparse(X_res, X_res_2) 76 | assert_array_equal(y_res, y_res_2) 77 | 78 | 79 | def test_function_sampler_validate(): 80 | # check that we can pass a regression target by turning off the 81 | # validation 82 | X, y = make_regression() 83 | 84 | def dummy_sampler(X, y): 85 | indices = np.random.choice(np.arange(X.shape[0]), size=100) 86 | return _safe_indexing(X, indices), _safe_indexing(y, indices) 87 | 88 | sampler = FunctionSampler(func=dummy_sampler, validate=False) 89 | pipeline = make_pipeline(sampler, LinearRegression()) 90 | y_pred = pipeline.fit(X, y).predict(X) 91 | 92 | assert type_of_target(y_pred) == "continuous" 93 | 94 | 95 | def test_function_resampler_fit(): 96 | # Check that the validation is bypassed when calling `fit` 97 | # Non-regression test for: 98 | # https://github.com/scikit-learn-contrib/imbalanced-learn/issues/782 99 | X = np.array([[1, np.nan], [2, 3], [np.inf, 4]]) 100 | y = np.array([0, 1, 1]) 101 | 102 | def func(X, y): 103 | return X[:1], y[:1] 104 | 105 | sampler = FunctionSampler(func=func, validate=False) 106 | sampler.fit(X, y) 107 | sampler.fit_resample(X, y) 108 |
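# Illustrative sketch (not part of the test module): ``FunctionSampler`` turns
# any resampling callable into a sampler that can sit inside an imblearn
# pipeline; the toy ``halve_majority`` resampler below is hypothetical.
import numpy as np
from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression

from imblearn import FunctionSampler
from imblearn.pipeline import make_pipeline


def halve_majority(X, y):
    # keep every other sample of the majority class and all remaining samples
    majority = np.bincount(y).argmax()
    keep = np.ones(len(y), dtype=bool)
    keep[np.flatnonzero(y == majority)[::2]] = False
    return X[keep], y[keep]


X_toy, y_toy = make_classification(weights=[0.8, 0.2], random_state=0)
model = make_pipeline(FunctionSampler(func=halve_majority), LogisticRegression())
model.fit(X_toy, y_toy)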
-------------------------------------------------------------------------------- /imblearn/tests/test_common.py: -------------------------------------------------------------------------------- 1 | """Common tests""" 2 | 3 | # Authors: Guillaume Lemaitre 4 | # Christos Aridas 5 | # License: MIT 6 | 7 | import warnings 8 | from collections import OrderedDict 9 | 10 | import numpy as np 11 | import pytest 12 | from sklearn.exceptions import ConvergenceWarning 13 | from sklearn.utils._testing import ignore_warnings 14 | 15 | from imblearn.over_sampling import RandomOverSampler 16 | from imblearn.under_sampling import RandomUnderSampler 17 | from imblearn.utils._sklearn_compat import ( 18 | parametrize_with_checks as parametrize_with_checks_sklearn, 19 | ) 20 | from imblearn.utils._test_common.instance_generator import ( 21 | _get_check_estimator_ids, 22 | _get_expected_failed_checks, 23 | _tested_estimators, 24 | ) 25 | from imblearn.utils.estimator_checks import ( 26 | _set_checking_parameters, 27 | check_dataframe_column_names_consistency, 28 | check_param_validation, 29 | parametrize_with_checks, 30 | ) 31 | from imblearn.utils.testing import all_estimators 32 | 33 | 34 | @pytest.mark.parametrize("name, Estimator", all_estimators()) 35 | def test_all_estimator_no_base_class(name, Estimator): 36 | # test that all_estimators doesn't find abstract classes. 37 | msg = f"Base estimators such as {name} should not be included in all_estimators" 38 | assert not name.lower().startswith("base"), msg 39 | 40 | 41 | @parametrize_with_checks_sklearn( 42 | list(_tested_estimators()), expected_failed_checks=_get_expected_failed_checks 43 | ) 44 | def test_estimators_compatibility_sklearn(estimator, check, request): 45 | _set_checking_parameters(estimator) 46 | check(estimator) 47 | 48 | 49 | @parametrize_with_checks( 50 | list(_tested_estimators()), expected_failed_checks=_get_expected_failed_checks 51 | ) 52 | def test_estimators_imblearn(estimator, check, request): 53 | # Common tests for estimator instances 54 | with ignore_warnings( 55 | category=( 56 | FutureWarning, 57 | ConvergenceWarning, 58 | UserWarning, 59 | FutureWarning, 60 | ) 61 | ): 62 | _set_checking_parameters(estimator) 63 | check(estimator) 64 | 65 | 66 | @pytest.mark.parametrize( 67 | "estimator", _tested_estimators(), ids=_get_check_estimator_ids 68 | ) 69 | def test_check_param_validation(estimator): 70 | name = estimator.__class__.__name__ 71 | _set_checking_parameters(estimator) 72 | check_param_validation(name, estimator) 73 | 74 | 75 | @pytest.mark.parametrize("Sampler", [RandomOverSampler, RandomUnderSampler]) 76 | def test_strategy_as_ordered_dict(Sampler): 77 | """Check that it is possible to pass an `OrderedDict` as strategy.""" 78 | rng = np.random.RandomState(42) 79 | X, y = rng.randn(30, 2), np.array([0] * 10 + [1] * 20) 80 | sampler = Sampler(random_state=42) 81 | if isinstance(sampler, RandomOverSampler): 82 | strategy = OrderedDict({0: 20, 1: 20}) 83 | else: 84 | strategy = OrderedDict({0: 10, 1: 10}) 85 | sampler.set_params(sampling_strategy=strategy) 86 | X_res, y_res = sampler.fit_resample(X, y) 87 | assert X_res.shape[0] == sum(strategy.values()) 88 | assert y_res.shape[0] == sum(strategy.values()) 89 | 90 | 91 | @pytest.mark.parametrize( 92 | "estimator", _tested_estimators(), ids=_get_check_estimator_ids 93 | ) 94 | def test_pandas_column_name_consistency(estimator): 95 | _set_checking_parameters(estimator) 96 | with ignore_warnings(category=(FutureWarning)): 97 | with warnings.catch_warnings(record=True) as 
record: 98 | check_dataframe_column_names_consistency( 99 | estimator.__class__.__name__, estimator 100 | ) 101 | for warning in record: 102 | assert "was fitted without feature names" not in str(warning.message) 103 | -------------------------------------------------------------------------------- /imblearn/tests/test_exceptions.py: -------------------------------------------------------------------------------- 1 | """Test for the exceptions modules""" 2 | # Authors: Guillaume Lemaitre 3 | # Christos Aridas 4 | # License: MIT 5 | 6 | from pytest import raises 7 | 8 | from imblearn.exceptions import raise_isinstance_error 9 | 10 | 11 | def test_raise_isinstance_error(): 12 | var = 10.0 13 | with raises(ValueError, match="has to be one of"): 14 | raise_isinstance_error("var", [int], var) 15 | -------------------------------------------------------------------------------- /imblearn/under_sampling/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | The :mod:`imblearn.under_sampling` provides methods to under-sample 3 | a dataset. 4 | """ 5 | 6 | from ._prototype_generation import ClusterCentroids 7 | from ._prototype_selection import ( 8 | AllKNN, 9 | CondensedNearestNeighbour, 10 | EditedNearestNeighbours, 11 | InstanceHardnessThreshold, 12 | NearMiss, 13 | NeighbourhoodCleaningRule, 14 | OneSidedSelection, 15 | RandomUnderSampler, 16 | RepeatedEditedNearestNeighbours, 17 | TomekLinks, 18 | ) 19 | 20 | __all__ = [ 21 | "ClusterCentroids", 22 | "RandomUnderSampler", 23 | "InstanceHardnessThreshold", 24 | "NearMiss", 25 | "TomekLinks", 26 | "EditedNearestNeighbours", 27 | "RepeatedEditedNearestNeighbours", 28 | "AllKNN", 29 | "OneSidedSelection", 30 | "CondensedNearestNeighbour", 31 | "NeighbourhoodCleaningRule", 32 | ] 33 | -------------------------------------------------------------------------------- /imblearn/under_sampling/_prototype_generation/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | The :mod:`imblearn.under_sampling.prototype_generation` submodule contains 3 | methods that generate new samples in order to balance the dataset. 4 | """ 5 | 6 | from ._cluster_centroids import ClusterCentroids 7 | 8 | __all__ = ["ClusterCentroids"] 9 | -------------------------------------------------------------------------------- /imblearn/under_sampling/_prototype_generation/tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/scikit-learn-contrib/imbalanced-learn/c975dd68f50beb88e1331cdd47780acd4fa95397/imblearn/under_sampling/_prototype_generation/tests/__init__.py -------------------------------------------------------------------------------- /imblearn/under_sampling/_prototype_selection/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | The :mod:`imblearn.under_sampling.prototype_selection` submodule contains 3 | methods that select samples in order to balance the dataset. 
4 | """ 5 | 6 | from ._condensed_nearest_neighbour import CondensedNearestNeighbour 7 | from ._edited_nearest_neighbours import ( 8 | AllKNN, 9 | EditedNearestNeighbours, 10 | RepeatedEditedNearestNeighbours, 11 | ) 12 | from ._instance_hardness_threshold import InstanceHardnessThreshold 13 | from ._nearmiss import NearMiss 14 | from ._neighbourhood_cleaning_rule import NeighbourhoodCleaningRule 15 | from ._one_sided_selection import OneSidedSelection 16 | from ._random_under_sampler import RandomUnderSampler 17 | from ._tomek_links import TomekLinks 18 | 19 | __all__ = [ 20 | "RandomUnderSampler", 21 | "InstanceHardnessThreshold", 22 | "NearMiss", 23 | "TomekLinks", 24 | "EditedNearestNeighbours", 25 | "RepeatedEditedNearestNeighbours", 26 | "AllKNN", 27 | "OneSidedSelection", 28 | "CondensedNearestNeighbour", 29 | "NeighbourhoodCleaningRule", 30 | ] 31 | -------------------------------------------------------------------------------- /imblearn/under_sampling/_prototype_selection/tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/scikit-learn-contrib/imbalanced-learn/c975dd68f50beb88e1331cdd47780acd4fa95397/imblearn/under_sampling/_prototype_selection/tests/__init__.py -------------------------------------------------------------------------------- /imblearn/under_sampling/_prototype_selection/tests/test_instance_hardness_threshold.py: -------------------------------------------------------------------------------- 1 | """Test the module .""" 2 | # Authors: Guillaume Lemaitre 3 | # Christos Aridas 4 | # License: MIT 5 | 6 | import numpy as np 7 | from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier 8 | from sklearn.naive_bayes import GaussianNB as NB 9 | from sklearn.pipeline import make_pipeline 10 | from sklearn.utils._testing import assert_array_equal 11 | 12 | from imblearn.under_sampling import InstanceHardnessThreshold 13 | 14 | RND_SEED = 0 15 | X = np.array( 16 | [ 17 | [-0.3879569, 0.6894251], 18 | [-0.09322739, 1.28177189], 19 | [-0.77740357, 0.74097941], 20 | [0.91542919, -0.65453327], 21 | [-0.03852113, 0.40910479], 22 | [-0.43877303, 1.07366684], 23 | [-0.85795321, 0.82980738], 24 | [-0.18430329, 0.52328473], 25 | [-0.30126957, -0.66268378], 26 | [-0.65571327, 0.42412021], 27 | [-0.28305528, 0.30284991], 28 | [0.20246714, -0.34727125], 29 | [1.06446472, -1.09279772], 30 | [0.30543283, -0.02589502], 31 | [-0.00717161, 0.00318087], 32 | ] 33 | ) 34 | Y = np.array([0, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0]) 35 | ESTIMATOR = GradientBoostingClassifier(random_state=RND_SEED) 36 | 37 | 38 | def test_iht_init(): 39 | sampling_strategy = "auto" 40 | iht = InstanceHardnessThreshold( 41 | estimator=ESTIMATOR, 42 | sampling_strategy=sampling_strategy, 43 | random_state=RND_SEED, 44 | ) 45 | 46 | assert iht.sampling_strategy == sampling_strategy 47 | assert iht.random_state == RND_SEED 48 | 49 | 50 | def test_iht_fit_resample(): 51 | iht = InstanceHardnessThreshold(estimator=ESTIMATOR, random_state=RND_SEED) 52 | X_resampled, y_resampled = iht.fit_resample(X, Y) 53 | assert X_resampled.shape == (12, 2) 54 | assert y_resampled.shape == (12,) 55 | 56 | 57 | def test_iht_fit_resample_half(): 58 | sampling_strategy = {0: 3, 1: 3} 59 | iht = InstanceHardnessThreshold( 60 | estimator=NB(), 61 | sampling_strategy=sampling_strategy, 62 | random_state=RND_SEED, 63 | ) 64 | X_resampled, y_resampled = iht.fit_resample(X, Y) 65 | assert X_resampled.shape == (6, 2) 66 | assert 
69 | def test_iht_fit_resample_class_obj(): 70 | est = GradientBoostingClassifier(random_state=RND_SEED) 71 | iht = InstanceHardnessThreshold(estimator=est, random_state=RND_SEED) 72 | X_resampled, y_resampled = iht.fit_resample(X, Y) 73 | assert X_resampled.shape == (12, 2) 74 | assert y_resampled.shape == (12,) 75 | 76 | 77 | def test_iht_reproducibility(): 78 | from sklearn.datasets import load_digits 79 | 80 | X_digits, y_digits = load_digits(return_X_y=True) 81 | idx_sampled = [] 82 | for seed in range(5): 83 | est = RandomForestClassifier(n_estimators=10, random_state=seed) 84 | iht = InstanceHardnessThreshold(estimator=est, random_state=RND_SEED) 85 | iht.fit_resample(X_digits, y_digits) 86 | idx_sampled.append(iht.sample_indices_.copy()) 87 | for idx_1, idx_2 in zip(idx_sampled, idx_sampled[1:]): 88 | assert_array_equal(idx_1, idx_2) 89 | 90 | 91 | def test_iht_fit_resample_default_estimator(): 92 | iht = InstanceHardnessThreshold(estimator=None, random_state=RND_SEED) 93 | X_resampled, y_resampled = iht.fit_resample(X, Y) 94 | assert isinstance(iht.estimator_, RandomForestClassifier) 95 | assert X_resampled.shape == (12, 2) 96 | assert y_resampled.shape == (12,) 97 | 98 | 99 | def test_iht_estimator_pipeline(): 100 | """Check that we can pass a pipeline containing a classifier. 101 | 102 | Checking if we have a classifier should not be based on inheriting from 103 | `ClassifierMixin`. 104 | 105 | Non-regression test for: 106 | https://github.com/scikit-learn-contrib/imbalanced-learn/pull/1049 107 | """ 108 | model = make_pipeline(GradientBoostingClassifier(random_state=RND_SEED)) 109 | iht = InstanceHardnessThreshold(estimator=model, random_state=RND_SEED) 110 | X_resampled, y_resampled = iht.fit_resample(X, Y) 111 | assert X_resampled.shape == (12, 2) 112 | assert y_resampled.shape == (12,) 113 | -------------------------------------------------------------------------------- /imblearn/under_sampling/_prototype_selection/tests/test_neighbourhood_cleaning_rule.py: -------------------------------------------------------------------------------- 1 | """Test the module neighbourhood cleaning rule.""" 2 | # Authors: Guillaume Lemaitre 3 | # Christos Aridas 4 | # License: MIT 5 | 6 | from collections import Counter 7 | 8 | import numpy as np 9 | import pytest 10 | from sklearn.datasets import make_classification 11 | from sklearn.utils._testing import assert_array_equal 12 | 13 | from imblearn.under_sampling import EditedNearestNeighbours, NeighbourhoodCleaningRule 14 | 15 | 16 | @pytest.fixture(scope="module") 17 | def data(): 18 | return make_classification( 19 | n_samples=200, 20 | n_features=2, 21 | n_informative=2, 22 | n_redundant=0, 23 | n_repeated=0, 24 | n_clusters_per_class=1, 25 | n_classes=3, 26 | weights=[0.1, 0.3, 0.6], 27 | random_state=0, 28 | ) 29 | 30 | 31 | def test_ncr_threshold_cleaning(data): 32 | """Test the effect of the `threshold_cleaning` parameter.""" 33 | X, y = data 34 | # with a large `threshold_cleaning`, the algorithm is equivalent to ENN 35 | enn = EditedNearestNeighbours() 36 | ncr = NeighbourhoodCleaningRule( 37 | edited_nearest_neighbours=enn, n_neighbors=10, threshold_cleaning=10 38 | ) 39 | 40 | enn.fit_resample(X, y) 41 | ncr.fit_resample(X, y) 42 | 43 | assert_array_equal(np.sort(enn.sample_indices_), np.sort(ncr.sample_indices_)) 44 | assert ncr.classes_to_clean_ == [] 45 | 46 | # set a threshold such that only class #2 should be considered for cleaning 47 | counter = Counter(y) 48 | threshold = counter[1] / counter[0] 49 |
ncr.set_params(threshold_cleaning=threshold) 50 | ncr.fit_resample(X, y) 51 | 52 | assert set(ncr.classes_to_clean_) == {2} 53 | 54 | # making the threshold slightly smaller to take into account class #1 55 | ncr.set_params(threshold_cleaning=threshold - np.finfo(np.float32).eps) 56 | ncr.fit_resample(X, y) 57 | 58 | assert set(ncr.classes_to_clean_) == {1, 2} 59 | 60 | 61 | def test_ncr_n_neighbors(data): 62 | """Check the effect of the NN on the cleaning of the second phase.""" 63 | X, y = data 64 | 65 | enn = EditedNearestNeighbours() 66 | ncr = NeighbourhoodCleaningRule(edited_nearest_neighbours=enn, n_neighbors=3) 67 | 68 | ncr.fit_resample(X, y) 69 | sample_indices_3_nn = ncr.sample_indices_ 70 | 71 | ncr.set_params(n_neighbors=10).fit_resample(X, y) 72 | sample_indices_10_nn = ncr.sample_indices_ 73 | 74 | # we should have a more aggressive cleaning when n_neighbors is larger 75 | assert len(sample_indices_3_nn) > len(sample_indices_10_nn) 76 | 77 | 78 | # TODO: remove in 0.14 79 | @pytest.mark.parametrize("kind_sel", ["all", "mode"]) 80 | def test_ncr_deprecate_kind_sel(data, kind_sel): 81 | X, y = data 82 | 83 | with pytest.warns(FutureWarning, match="`kind_sel` is deprecated"): 84 | NeighbourhoodCleaningRule(kind_sel=kind_sel).fit_resample(X, y) 85 | -------------------------------------------------------------------------------- /imblearn/under_sampling/_prototype_selection/tests/test_tomek_links.py: -------------------------------------------------------------------------------- 1 | """Test the module Tomek's links.""" 2 | # Authors: Guillaume Lemaitre 3 | # Christos Aridas 4 | # License: MIT 5 | 6 | import numpy as np 7 | import pytest 8 | from sklearn.datasets import make_classification 9 | from sklearn.utils._testing import assert_array_equal 10 | 11 | from imblearn.under_sampling import TomekLinks 12 | 13 | X = np.array( 14 | [ 15 | [0.31230513, 0.1216318], 16 | [0.68481731, 0.51935141], 17 | [1.34192108, -0.13367336], 18 | [0.62366841, -0.21312976], 19 | [1.61091956, -0.40283504], 20 | [-0.37162401, -2.19400981], 21 | [0.74680821, 1.63827342], 22 | [0.2184254, 0.24299982], 23 | [0.61472253, -0.82309052], 24 | [0.19893132, -0.47761769], 25 | [1.06514042, -0.0770537], 26 | [0.97407872, 0.44454207], 27 | [1.40301027, -0.83648734], 28 | [-1.20515198, -1.02689695], 29 | [-0.27410027, -0.54194484], 30 | [0.8381014, 0.44085498], 31 | [-0.23374509, 0.18370049], 32 | [-0.32635887, -0.29299653], 33 | [-0.00288378, 0.84259929], 34 | [1.79580611, -0.02219234], 35 | ] 36 | ) 37 | Y = np.array([1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0]) 38 | 39 | 40 | def test_tl_init(): 41 | tl = TomekLinks() 42 | assert tl.n_jobs is None 43 | 44 | 45 | def test_tl_fit_resample(): 46 | tl = TomekLinks() 47 | X_resampled, y_resampled = tl.fit_resample(X, Y) 48 | 49 | X_gt = np.array( 50 | [ 51 | [0.31230513, 0.1216318], 52 | [0.68481731, 0.51935141], 53 | [1.34192108, -0.13367336], 54 | [0.62366841, -0.21312976], 55 | [1.61091956, -0.40283504], 56 | [-0.37162401, -2.19400981], 57 | [0.74680821, 1.63827342], 58 | [0.2184254, 0.24299982], 59 | [0.61472253, -0.82309052], 60 | [0.19893132, -0.47761769], 61 | [0.97407872, 0.44454207], 62 | [1.40301027, -0.83648734], 63 | [-1.20515198, -1.02689695], 64 | [-0.23374509, 0.18370049], 65 | [-0.32635887, -0.29299653], 66 | [-0.00288378, 0.84259929], 67 | [1.79580611, -0.02219234], 68 | ] 69 | ) 70 | y_gt = np.array([1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 1, 0]) 71 | assert_array_equal(X_resampled, X_gt) 72 | assert_array_equal(y_resampled, y_gt) 73 | 74 |
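# Illustrative sketch (not part of the test module): a Tomek link is a pair of
# samples from opposite classes that are each other's nearest neighbour. The
# hypothetical helper below flags such pairs.
import numpy as np
from sklearn.neighbors import NearestNeighbors


def tomek_link_mask(X, y):
    # column 0 of the neighbour indices is each sample itself, column 1 is its
    # nearest other sample
    nn = NearestNeighbors(n_neighbors=2).fit(X)
    nearest = nn.kneighbors(X, return_distance=False)[:, 1]
    mask = np.zeros(len(y), dtype=bool)
    for i, j in enumerate(nearest):
        # mutual nearest neighbours with different labels form a Tomek link
        if y[i] != y[j] and nearest[j] == i:
            mask[i] = True
    return mask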
75 | @pytest.mark.parametrize( 76 | "sampling_strategy", ["auto", "majority", "not minority", "not majority", "all"] 77 | ) 78 | def test_tomek_links_strings(sampling_strategy): 79 | """Check that we support all expected strings as `sampling_strategy` in 80 | a sampler inheriting from `BaseCleaningSampler`.""" 81 | 82 | X, y = make_classification( 83 | n_samples=100, 84 | n_clusters_per_class=1, 85 | n_classes=3, 86 | weights=[0.1, 0.3, 0.6], 87 | random_state=0, 88 | ) 89 | TomekLinks(sampling_strategy=sampling_strategy).fit_resample(X, y) 90 | -------------------------------------------------------------------------------- /imblearn/utils/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | The :mod:`imblearn.utils` module includes various utilities. 3 | """ 4 | 5 | from ._docstring import Substitution 6 | from ._validation import ( 7 | check_neighbors_object, 8 | check_sampling_strategy, 9 | check_target_type, 10 | ) 11 | 12 | __all__ = [ 13 | "check_neighbors_object", 14 | "check_sampling_strategy", 15 | "check_target_type", 16 | "Substitution", 17 | ] 18 | -------------------------------------------------------------------------------- /imblearn/utils/_docstring.py: -------------------------------------------------------------------------------- 1 | """Utilities for docstring in imbalanced-learn.""" 2 | 3 | # Authors: Guillaume Lemaitre 4 | # License: MIT 5 | 6 | 7 | class Substitution: 8 | """Decorate a function's or a class' docstring to perform string 9 | substitution on it. 10 | 11 | This decorator should be robust even if obj.__doc__ is None 12 | (for example, if -OO was passed to the interpreter) 13 | """ 14 | 15 | def __init__(self, *args, **kwargs): 16 | if args and kwargs: 17 | raise AssertionError("Only positional or keyword args are allowed") 18 | 19 | self.params = args or kwargs 20 | 21 | def __call__(self, obj): 22 | if obj.__doc__: 23 | obj.__doc__ = obj.__doc__.format(**self.params) 24 | return obj 25 | 26 | 27 | _random_state_docstring = """random_state : int, RandomState instance, default=None 28 | Control the randomization of the algorithm. 29 | 30 | - If int, ``random_state`` is the seed used by the random number 31 | generator; 32 | - If ``RandomState`` instance, random_state is the random number 33 | generator; 34 | - If ``None``, the random number generator is the ``RandomState`` 35 | instance used by ``np.random``. 36 | """.rstrip() 37 | 38 | _n_jobs_docstring = """n_jobs : int, default=None 39 | Number of CPU cores used during the cross-validation loop. 40 | ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context. 41 | ``-1`` means using all processors. See 42 | `Glossary <https://scikit-learn.org/stable/glossary.html#term-n_jobs>`_ 43 | for more details. 44 | """.rstrip() 45 | -------------------------------------------------------------------------------- /imblearn/utils/_show_versions.py: -------------------------------------------------------------------------------- 1 | """ 2 | Utility method which prints system info to help with debugging, 3 | and filing issues on GitHub. 4 | Adapted from :func:`sklearn.show_versions`, 5 | which was adapted from :func:`pandas.show_versions` 6 | """ 7 | 8 | # Author: Alexander L. Hayes 9 | # License: MIT 10 | 11 | from ..
import __version__ 12 | 13 | 14 | def _get_deps_info(): 15 | """Overview of the installed version of main dependencies 16 | Returns 17 | ------- 18 | deps_info: dict 19 | version information on relevant Python libraries 20 | """ 21 | deps = [ 22 | "imbalanced-learn", 23 | "pip", 24 | "setuptools", 25 | "numpy", 26 | "scipy", 27 | "scikit-learn", 28 | "Cython", 29 | "pandas", 30 | "keras", 31 | "tensorflow", 32 | "joblib", 33 | ] 34 | 35 | deps_info = { 36 | "imbalanced-learn": __version__, 37 | } 38 | 39 | from importlib.metadata import PackageNotFoundError, version 40 | 41 | for modname in deps: 42 | try: 43 | deps_info[modname] = version(modname) 44 | except PackageNotFoundError: 45 | deps_info[modname] = None 46 | return deps_info 47 | 48 | 49 | def show_versions(github=False): 50 | """Print debugging information. 51 | 52 | .. versionadded:: 0.5 53 | 54 | Parameters 55 | ---------- 56 | github : bool, 57 | If true, wrap system info with GitHub markup. 58 | """ 59 | 60 | from sklearn.utils._show_versions import _get_sys_info 61 | 62 | _sys_info = _get_sys_info() 63 | _deps_info = _get_deps_info() 64 | _github_markup = ( 65 | "
" 66 | "System, Dependency Information\n\n" 67 | "**System Information**\n\n" 68 | "{0}\n" 69 | "**Python Dependencies**\n\n" 70 | "{1}\n" 71 | "
" 72 | ) 73 | 74 | if github: 75 | _sys_markup = "" 76 | _deps_markup = "" 77 | 78 | for k, stat in _sys_info.items(): 79 | _sys_markup += f"* {k:<10}: `{stat}`\n" 80 | for k, stat in _deps_info.items(): 81 | _deps_markup += f"* {k:<10}: `{stat}`\n" 82 | 83 | print(_github_markup.format(_sys_markup, _deps_markup)) 84 | 85 | else: 86 | print("\nSystem:") 87 | for k, stat in _sys_info.items(): 88 | print(f"{k:>11}: {stat}") 89 | 90 | print("\nPython dependencies:") 91 | for k, stat in _deps_info.items(): 92 | print(f"{k:>11}: {stat}") 93 | -------------------------------------------------------------------------------- /imblearn/utils/_tags.py: -------------------------------------------------------------------------------- 1 | from ._sklearn_compat import InputTags, SamplerTags, Tags # noqa: F401 2 | -------------------------------------------------------------------------------- /imblearn/utils/_test_common/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/scikit-learn-contrib/imbalanced-learn/c975dd68f50beb88e1331cdd47780acd4fa95397/imblearn/utils/_test_common/__init__.py -------------------------------------------------------------------------------- /imblearn/utils/deprecation.py: -------------------------------------------------------------------------------- 1 | """Utilities for deprecation""" 2 | 3 | # Authors: Guillaume Lemaitre 4 | # License: MIT 5 | 6 | import warnings 7 | 8 | 9 | def deprecate_parameter(sampler, version_deprecation, param_deprecated, new_param=None): 10 | """Helper to deprecate a parameter by another one. 11 | 12 | Parameters 13 | ---------- 14 | sampler : sampler object, 15 | The object which will be inspected. 16 | 17 | version_deprecation : str, 18 | The version from which the parameter will be deprecated. The format 19 | should be ``'x.y'``. 20 | 21 | param_deprecated : str, 22 | The parameter being deprecated. 23 | 24 | new_param : str, 25 | The parameter used instead of the deprecated parameter. By default, no 26 | parameter is expected. 27 | """ 28 | x, y = version_deprecation.split(".") 29 | version_removed = x + "." + str(int(y) + 2) 30 | if new_param is None: 31 | if getattr(sampler, param_deprecated) is not None: 32 | warnings.warn( 33 | ( 34 | f"'{param_deprecated}' is deprecated from {version_deprecation} and" 35 | f" will be removed in {version_removed} for the estimator" 36 | f" {sampler.__class__}." 37 | ), 38 | category=FutureWarning, 39 | ) 40 | else: 41 | if getattr(sampler, param_deprecated) is not None: 42 | warnings.warn( 43 | ( 44 | f"'{param_deprecated}' is deprecated from {version_deprecation} and" 45 | f" will be removed in {version_removed} for the estimator" 46 | f" {sampler.__class__}. Use '{new_param}' instead." 
47 | ), 48 | category=FutureWarning, 49 | ) 50 | setattr(sampler, new_param, getattr(sampler, param_deprecated)) 51 | -------------------------------------------------------------------------------- /imblearn/utils/tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/scikit-learn-contrib/imbalanced-learn/c975dd68f50beb88e1331cdd47780acd4fa95397/imblearn/utils/tests/__init__.py -------------------------------------------------------------------------------- /imblearn/utils/tests/test_deprecation.py: -------------------------------------------------------------------------------- 1 | """Test for the deprecation helper""" 2 | 3 | # Authors: Guillaume Lemaitre 4 | # License: MIT 5 | 6 | import pytest 7 | 8 | from imblearn.utils.deprecation import deprecate_parameter 9 | 10 | 11 | class Sampler: 12 | def __init__(self): 13 | self.a = "something" 14 | self.b = "something" 15 | 16 | 17 | def test_deprecate_parameter(): 18 | with pytest.warns(FutureWarning, match="is deprecated from"): 19 | deprecate_parameter(Sampler(), "0.2", "a") 20 | with pytest.warns(FutureWarning, match="Use 'b' instead."): 21 | deprecate_parameter(Sampler(), "0.2", "a", "b") 22 | -------------------------------------------------------------------------------- /imblearn/utils/tests/test_docstring.py: -------------------------------------------------------------------------------- 1 | """Test utilities for docstring.""" 2 | 3 | # Authors: Guillaume Lemaitre 4 | # License: MIT 5 | 6 | import sys 7 | import textwrap 8 | 9 | import pytest 10 | 11 | from imblearn.utils import Substitution 12 | from imblearn.utils._docstring import _n_jobs_docstring, _random_state_docstring 13 | 14 | 15 | def _dedent_docstring(docstring): 16 | """Compatibility with Python 3.13+. 17 | 18 | xref: https://github.com/python/cpython/issues/81283 19 | """ 20 | return "\n".join([textwrap.dedent(line) for line in docstring.split("\n")]) 21 | 22 | 23 | func_docstring = """A function. 24 | 25 | Parameters 26 | ---------- 27 | xxx 28 | 29 | yyy 30 | """ 31 | 32 | 33 | def func(param_1, param_2): 34 | """A function. 35 | 36 | Parameters 37 | ---------- 38 | {param_1} 39 | 40 | {param_2} 41 | """ 42 | return param_1, param_2 43 | 44 | 45 | cls_docstring = """A class. 46 | 47 | Parameters 48 | ---------- 49 | xxx 50 | 51 | yyy 52 | """ 53 | 54 | 55 | class cls: 56 | """A class. 57 | 58 | Parameters 59 | ---------- 60 | {param_1} 61 | 62 | {param_2} 63 | """ 64 | 65 | def __init__(self, param_1, param_2): 66 | self.param_1 = param_1 67 | self.param_2 = param_2 68 | 69 | 70 | if sys.version_info >= (3, 13): 71 | func_docstring = _dedent_docstring(func_docstring) 72 | cls_docstring = _dedent_docstring(cls_docstring) 73 | 74 | 75 | @pytest.mark.parametrize( 76 | "obj, obj_docstring", [(func, func_docstring), (cls, cls_docstring)] 77 | ) 78 | def test_docstring_inject(obj, obj_docstring): 79 | obj_injected_docstring = Substitution(param_1="xxx", param_2="yyy")(obj) 80 | assert obj_injected_docstring.__doc__ == obj_docstring 81 | 82 | 83 | def test_docstring_template(): 84 | assert "random_state" in _random_state_docstring 85 | assert "n_jobs" in _n_jobs_docstring 86 | 87 | 88 | def test_docstring_with_python_OO(): 89 | """Check that we don't raise a warning if the code is executed with -OO. 
90 | 91 | Non-regression test for: 92 | https://github.com/scikit-learn-contrib/imbalanced-learn/issues/945 93 | """ 94 | instance = cls(param_1="xxx", param_2="yyy") 95 | instance.__doc__ = None # simulate -OO 96 | 97 | instance = Substitution(param_1="xxx", param_2="yyy")(instance) 98 | 99 | assert instance.__doc__ is None 100 | -------------------------------------------------------------------------------- /imblearn/utils/tests/test_estimator_checks.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pytest 3 | from sklearn.base import BaseEstimator 4 | from sklearn.utils.multiclass import check_classification_targets 5 | 6 | from imblearn.base import BaseSampler 7 | from imblearn.over_sampling.base import BaseOverSampler 8 | from imblearn.utils import check_target_type as target_check 9 | from imblearn.utils._sklearn_compat import validate_data 10 | from imblearn.utils.estimator_checks import ( 11 | check_samplers_fit, 12 | check_samplers_nan, 13 | check_samplers_one_label, 14 | check_samplers_preserve_dtype, 15 | check_samplers_sparse, 16 | check_samplers_string, 17 | check_target_type, 18 | ) 19 | 20 | 21 | class BaseBadSampler(BaseEstimator): 22 | """Sampler without inputs checking.""" 23 | 24 | _sampling_type = "bypass" 25 | 26 | def fit(self, X, y): 27 | return self 28 | 29 | def fit_resample(self, X, y): 30 | check_classification_targets(y) 31 | self.fit(X, y) 32 | return X, y 33 | 34 | 35 | class SamplerSingleClass(BaseSampler): 36 | """Sampler that would sample even with a single class.""" 37 | 38 | _sampling_type = "bypass" 39 | 40 | def fit_resample(self, X, y): 41 | return self._fit_resample(X, y) 42 | 43 | def _fit_resample(self, X, y): 44 | return X, y 45 | 46 | 47 | class NotFittedSampler(BaseBadSampler): 48 | """Sampler without target checking.""" 49 | 50 | def fit(self, X, y): 51 | X, y = validate_data(self, X=X, y=y) 52 | return self 53 | 54 | 55 | class NoAcceptingSparseSampler(BaseBadSampler): 56 | """Sampler which does not accept sparse matrix.""" 57 | 58 | def fit(self, X, y): 59 | X, y = validate_data(self, X=X, y=y) 60 | self.sampling_strategy_ = "sampling_strategy_" 61 | return self 62 | 63 | 64 | class NotPreservingDtypeSampler(BaseSampler): 65 | _sampling_type = "bypass" 66 | 67 | _parameter_constraints: dict = {"sampling_strategy": "no_validation"} 68 | 69 | def _fit_resample(self, X, y): 70 | return X.astype(np.float64), y.astype(np.int64) 71 | 72 | 73 | class IndicesSampler(BaseOverSampler): 74 | def _check_X_y(self, X, y): 75 | y, binarize_y = target_check(y, indicate_one_vs_all=True) 76 | X, y = validate_data( 77 | self, 78 | X=X, 79 | y=y, 80 | reset=True, 81 | dtype=None, 82 | ensure_all_finite=False, 83 | ) 84 | return X, y, binarize_y 85 | 86 | def _fit_resample(self, X, y): 87 | n_max_count_class = np.bincount(y).max() 88 | indices = np.random.choice(np.arange(X.shape[0]), size=n_max_count_class * 2) 89 | return X[indices], y[indices] 90 | 91 | 92 | def test_check_samplers_string(): 93 | sampler = IndicesSampler() 94 | check_samplers_string(sampler.__class__.__name__, sampler) 95 | 96 | 97 | def test_check_samplers_nan(): 98 | sampler = IndicesSampler() 99 | check_samplers_nan(sampler.__class__.__name__, sampler) 100 | 101 | 102 | mapping_estimator_error = { 103 | "BaseBadSampler": (AssertionError, None), 104 | "SamplerSingleClass": (AssertionError, "Sampler can't balance when only"), 105 | "NotFittedSampler": (AssertionError, "No fitted attribute"), 106 | "NoAcceptingSparseSampler": 
(TypeError, "dense data is required"), 107 | "NotPreservingDtypeSampler": (AssertionError, "X dtype is not preserved"), 108 | } 109 | 110 | 111 | def _test_single_check(Estimator, check): 112 | estimator = Estimator() 113 | name = estimator.__class__.__name__ 114 | err_type, err_msg = mapping_estimator_error[name] 115 | with pytest.raises(err_type, match=err_msg): 116 | check(name, estimator) 117 | 118 | 119 | def test_all_checks(): 120 | _test_single_check(BaseBadSampler, check_target_type) 121 | _test_single_check(SamplerSingleClass, check_samplers_one_label) 122 | _test_single_check(NotFittedSampler, check_samplers_fit) 123 | _test_single_check(NoAcceptingSparseSampler, check_samplers_sparse) 124 | _test_single_check(NotPreservingDtypeSampler, check_samplers_preserve_dtype) 125 | -------------------------------------------------------------------------------- /imblearn/utils/tests/test_min_dependencies.py: -------------------------------------------------------------------------------- 1 | """Tests for the minimum dependencies in the README.rst file.""" 2 | 3 | import os 4 | import platform 5 | import re 6 | from pathlib import Path 7 | 8 | import pytest 9 | from packaging.requirements import Requirement 10 | from packaging.version import parse 11 | 12 | import imblearn 13 | 14 | 15 | @pytest.mark.skipif( 16 | platform.system() == "Windows" or parse(platform.python_version()) < parse("3.11"), 17 | reason="This test is enough on unix system and requires Python >= 3.11", 18 | ) 19 | def test_min_dependencies_readme(): 20 | # local import to not import the file with Python < 3.11 21 | import tomllib 22 | 23 | # Test that the minimum dependencies in the README.rst file are 24 | # consistent with the minimum dependencies defined at the file: 25 | # pyproject.toml 26 | 27 | pyproject_path = Path(imblearn.__path__[0]).parents[0] / "pyproject.toml" 28 | with open(pyproject_path, "rb") as f: 29 | pyproject_data = tomllib.load(f) 30 | 31 | def process_requirements(requirements): 32 | result = {} 33 | for req in requirements: 34 | req = Requirement(req) 35 | for specifier in req.specifier: 36 | if specifier.operator == ">=": 37 | result[req.name] = parse(specifier.version) 38 | return result 39 | 40 | min_dependencies = process_requirements( 41 | [f"python{pyproject_data['project']['requires-python']}"] 42 | ) 43 | min_dependencies.update( 44 | process_requirements(pyproject_data["project"]["dependencies"]) 45 | ) 46 | 47 | markers = ["docs", "optional", "tensorflow", "keras", "tests"] 48 | for marker_name in markers: 49 | min_dependencies.update( 50 | process_requirements( 51 | pyproject_data["project"]["optional-dependencies"][marker_name] 52 | ) 53 | ) 54 | 55 | pattern = re.compile( 56 | r"(\.\. \|)" 57 | + r"(([A-Za-z]+\-?)+)" 58 | + r"(MinVersion\| replace::)" 59 | + r"( [0-9]+\.[0-9]+(\.[0-9]+)?)" 60 | ) 61 | 62 | readme_path = Path(imblearn.__path__[0]).parents[0] 63 | readme_file = readme_path / "README.rst" 64 | 65 | if not os.path.exists(readme_file): 66 | # Skip the test if the README.rst file is not available. 
67 | # For instance, when installing imbalanced-learn from wheels 68 | pytest.skip("The README.rst file is not available.") 69 | 70 | with readme_file.open("r") as f: 71 | for line in f: 72 | matched = pattern.match(line) 73 | 74 | if not matched: 75 | continue 76 | 77 | package, version = matched.group(2), matched.group(5) 78 | package = package.lower() 79 | if package == "scikitlearn": 80 | package = "scikit-learn" 81 | 82 | if package in min_dependencies: 83 | version = parse(version) 84 | min_version = min_dependencies[package] 85 | 86 | assert version == min_version, f"{package} has a mismatched version" 87 | -------------------------------------------------------------------------------- /imblearn/utils/tests/test_show_versions.py: -------------------------------------------------------------------------------- 1 | """Test for the show_versions helper. Based on the sklearn tests.""" 2 | # Author: Alexander L. Hayes 3 | # License: MIT 4 | 5 | from imblearn.utils._show_versions import _get_deps_info, show_versions 6 | 7 | 8 | def test_get_deps_info(): 9 | _deps_info = _get_deps_info() 10 | assert "pip" in _deps_info 11 | assert "setuptools" in _deps_info 12 | assert "imbalanced-learn" in _deps_info 13 | assert "scikit-learn" in _deps_info 14 | assert "numpy" in _deps_info 15 | assert "scipy" in _deps_info 16 | assert "Cython" in _deps_info 17 | assert "pandas" in _deps_info 18 | assert "joblib" in _deps_info 19 | 20 | 21 | def test_show_versions_default(capsys): 22 | show_versions() 23 | out, err = capsys.readouterr() 24 | assert "python" in out 25 | assert "executable" in out 26 | assert "machine" in out 27 | assert "pip" in out 28 | assert "setuptools" in out 29 | assert "imbalanced-learn" in out 30 | assert "scikit-learn" in out 31 | assert "numpy" in out 32 | assert "scipy" in out 33 | assert "Cython" in out 34 | assert "pandas" in out 35 | assert "keras" in out 36 | assert "tensorflow" in out 37 | assert "joblib" in out 38 | 39 | 40 | def test_show_versions_github(capsys): 41 | show_versions(github=True) 42 | out, err = capsys.readouterr() 43 | assert "
<details><summary>System, Dependency Information</summary>" in out 44 | assert "**System Information**" in out 45 | assert "* python" in out 46 | assert "* executable" in out 47 | assert "* machine" in out 48 | assert "**Python Dependencies**" in out 49 | assert "* pip" in out 50 | assert "* setuptools" in out 51 | assert "* imbalanced-learn" in out 52 | assert "* scikit-learn" in out 53 | assert "* numpy" in out 54 | assert "* scipy" in out 55 | assert "* Cython" in out 56 | assert "* pandas" in out 57 | assert "* keras" in out 58 | assert "* tensorflow" in out 59 | assert "* joblib" in out 60 | assert "</details>
" in out 61 | -------------------------------------------------------------------------------- /imblearn/utils/tests/test_testing.py: -------------------------------------------------------------------------------- 1 | """Test for the testing module""" 2 | # Authors: Guillaume Lemaitre 3 | # Christos Aridas 4 | # License: MIT 5 | 6 | import numpy as np 7 | import pytest 8 | from sklearn.neighbors._base import KNeighborsMixin 9 | 10 | from imblearn.base import SamplerMixin 11 | from imblearn.utils.testing import _CustomNearestNeighbors, all_estimators 12 | 13 | 14 | def test_all_estimators(): 15 | # check if the filtering is working with a list or a single string 16 | type_filter = "sampler" 17 | all_estimators(type_filter=type_filter) 18 | type_filter = ["sampler"] 19 | estimators = all_estimators(type_filter=type_filter) 20 | for estimator in estimators: 21 | # check that all estimators are sampler 22 | assert issubclass(estimator[1], SamplerMixin) 23 | 24 | # check that an error is raised when the type is unknown 25 | type_filter = "rnd" 26 | with pytest.raises(ValueError, match="Parameter type_filter must be 'sampler'"): 27 | all_estimators(type_filter=type_filter) 28 | 29 | 30 | def test_custom_nearest_neighbors(): 31 | """Check that our custom nearest neighbors can be used for our internal 32 | duck-typing.""" 33 | 34 | neareat_neighbors = _CustomNearestNeighbors(n_neighbors=3) 35 | 36 | assert not isinstance(neareat_neighbors, KNeighborsMixin) 37 | assert hasattr(neareat_neighbors, "kneighbors") 38 | assert hasattr(neareat_neighbors, "kneighbors_graph") 39 | 40 | rng = np.random.RandomState(42) 41 | X = rng.randn(150, 3) 42 | y = rng.randint(0, 2, 150) 43 | neareat_neighbors.fit(X, y) 44 | 45 | distances, indices = neareat_neighbors.kneighbors(X) 46 | assert distances.shape == (150, 3) 47 | assert indices.shape == (150, 3) 48 | np.testing.assert_allclose(distances[:, 0], 0.0) 49 | np.testing.assert_allclose(indices[:, 0], np.arange(150)) 50 | --------------------------------------------------------------------------------