├── .circleci └── config.yml ├── .coveragerc ├── .github ├── ISSUE_TEMPLATE.md ├── ISSUE_TEMPLATE │ ├── bug_report.md │ ├── documentation-improvement.md │ ├── feature_request.md │ ├── other--blank-template-.md │ ├── question.md │ └── usage-question.md ├── PULL_REQUEST_TEMPLATE.md ├── check-changelog.yml ├── dependabot.yml └── workflows │ ├── circleci-artifacts-redirector.yml │ ├── linters.yml │ └── tests.yml ├── .gitignore ├── .pre-commit-config.yaml ├── AUTHORS.rst ├── CONTRIBUTING.md ├── LICENSE ├── MANIFEST.in ├── README.rst ├── build_tools └── circle │ ├── build_doc.sh │ ├── checkout_merge_commit.sh │ ├── linting.sh │ └── push_doc.sh ├── conftest.py ├── doc ├── Makefile ├── _static │ ├── css │ │ └── imbalanced-learn.css │ ├── img │ │ ├── favicon.ico │ │ ├── logo.png │ │ ├── logo.xcf │ │ ├── logo_wide.png │ │ └── logo_wide_dark.png │ ├── index_api.svg │ ├── index_examples.svg │ ├── index_getting_started.svg │ ├── index_user_guide.svg │ └── js │ │ └── copybutton.js ├── _templates │ ├── class.rst │ ├── function.rst │ ├── numpydoc_docstring.rst │ └── sidebar-search-bs.html ├── about.rst ├── bibtex │ └── refs.bib ├── combine.rst ├── common_pitfalls.rst ├── conf.py ├── datasets │ └── index.rst ├── developers_utils.rst ├── ensemble.rst ├── index.rst ├── install.rst ├── introduction.rst ├── make.bat ├── metrics.rst ├── miscellaneous.rst ├── over_sampling.rst ├── references │ ├── combine.rst │ ├── datasets.rst │ ├── ensemble.rst │ ├── index.rst │ ├── keras.rst │ ├── metrics.rst │ ├── miscellaneous.rst │ ├── over_sampling.rst │ ├── pipeline.rst │ ├── tensorflow.rst │ ├── under_sampling.rst │ └── utils.rst ├── sphinxext │ ├── LICENSE.txt │ ├── MANIFEST.in │ ├── README.txt │ ├── github_link.py │ └── sphinx_issues.py ├── under_sampling.rst ├── user_guide.rst ├── whats_new.rst ├── whats_new │ ├── 0.13.rst │ ├── 0.14.rst │ ├── v0.1.rst │ ├── v0.10.rst │ ├── v0.11.rst │ ├── v0.12.rst │ ├── v0.2.rst │ ├── v0.3.rst │ ├── v0.4.rst │ ├── v0.5.rst │ ├── v0.6.rst │ ├── v0.7.rst │ ├── v0.8.rst │ └── v0.9.rst └── zzz_references.rst ├── examples ├── README.txt ├── api │ ├── README.txt │ └── plot_sampling_strategy_usage.py ├── applications │ ├── README.txt │ ├── plot_impact_imbalanced_classes.py │ ├── plot_multi_class_under_sampling.py │ ├── plot_outlier_rejections.py │ ├── plot_over_sampling_benchmark_lfw.py │ ├── plot_topic_classication.py │ └── porto_seguro_keras_under_sampling.py ├── combine │ ├── README.txt │ └── plot_comparison_combine.py ├── datasets │ ├── README.txt │ └── plot_make_imbalance.py ├── ensemble │ ├── README.txt │ ├── plot_bagging_classifier.py │ └── plot_comparison_ensemble_classifier.py ├── evaluation │ ├── README.txt │ ├── plot_classification_report.py │ └── plot_metrics.py ├── model_selection │ ├── README.txt │ └── plot_validation_curve.py ├── over-sampling │ ├── README.txt │ ├── plot_comparison_over_sampling.py │ ├── plot_illustration_generation_sample.py │ └── plot_shrinkage_effect.py ├── pipeline │ ├── README.txt │ └── plot_pipeline_classification.py └── under-sampling │ ├── README.txt │ ├── plot_comparison_under_sampling.py │ ├── plot_illustration_nearmiss.py │ └── plot_illustration_tomek_links.py ├── imblearn ├── VERSION.txt ├── __init__.py ├── _version.py ├── base.py ├── combine │ ├── __init__.py │ ├── _smote_enn.py │ ├── _smote_tomek.py │ └── tests │ │ ├── __init__.py │ │ ├── test_smote_enn.py │ │ └── test_smote_tomek.py ├── datasets │ ├── __init__.py │ ├── _imbalance.py │ ├── _zenodo.py │ └── tests │ │ ├── __init__.py │ │ ├── test_imbalance.py │ │ └── test_zenodo.py ├── ensemble 
│ ├── __init__.py │ ├── _bagging.py │ ├── _common.py │ ├── _easy_ensemble.py │ ├── _forest.py │ ├── _weight_boosting.py │ └── tests │ │ ├── __init__.py │ │ ├── test_bagging.py │ │ ├── test_easy_ensemble.py │ │ ├── test_forest.py │ │ └── test_weight_boosting.py ├── exceptions.py ├── keras │ ├── __init__.py │ ├── _generator.py │ └── tests │ │ ├── __init__.py │ │ └── test_generator.py ├── metrics │ ├── __init__.py │ ├── _classification.py │ ├── pairwise.py │ └── tests │ │ ├── __init__.py │ │ ├── test_classification.py │ │ ├── test_pairwise.py │ │ └── test_score_objects.py ├── over_sampling │ ├── __init__.py │ ├── _adasyn.py │ ├── _random_over_sampler.py │ ├── _smote │ │ ├── __init__.py │ │ ├── base.py │ │ ├── cluster.py │ │ ├── filter.py │ │ └── tests │ │ │ ├── __init__.py │ │ │ ├── test_borderline_smote.py │ │ │ ├── test_kmeans_smote.py │ │ │ ├── test_smote.py │ │ │ ├── test_smote_nc.py │ │ │ ├── test_smoten.py │ │ │ └── test_svm_smote.py │ ├── base.py │ └── tests │ │ ├── __init__.py │ │ ├── test_adasyn.py │ │ ├── test_common.py │ │ └── test_random_over_sampler.py ├── pipeline.py ├── tensorflow │ ├── __init__.py │ ├── _generator.py │ └── tests │ │ ├── __init__.py │ │ └── test_generator.py ├── tests │ ├── __init__.py │ ├── test_base.py │ ├── test_common.py │ ├── test_docstring_parameters.py │ ├── test_exceptions.py │ ├── test_pipeline.py │ └── test_public_functions.py ├── under_sampling │ ├── __init__.py │ ├── _prototype_generation │ │ ├── __init__.py │ │ ├── _cluster_centroids.py │ │ └── tests │ │ │ ├── __init__.py │ │ │ └── test_cluster_centroids.py │ ├── _prototype_selection │ │ ├── __init__.py │ │ ├── _condensed_nearest_neighbour.py │ │ ├── _edited_nearest_neighbours.py │ │ ├── _instance_hardness_threshold.py │ │ ├── _nearmiss.py │ │ ├── _neighbourhood_cleaning_rule.py │ │ ├── _one_sided_selection.py │ │ ├── _random_under_sampler.py │ │ ├── _tomek_links.py │ │ └── tests │ │ │ ├── __init__.py │ │ │ ├── test_allknn.py │ │ │ ├── test_condensed_nearest_neighbour.py │ │ │ ├── test_edited_nearest_neighbours.py │ │ │ ├── test_instance_hardness_threshold.py │ │ │ ├── test_nearmiss.py │ │ │ ├── test_neighbourhood_cleaning_rule.py │ │ │ ├── test_one_sided_selection.py │ │ │ ├── test_random_under_sampler.py │ │ │ ├── test_repeated_edited_nearest_neighbours.py │ │ │ └── test_tomek_links.py │ └── base.py └── utils │ ├── __init__.py │ ├── _docstring.py │ ├── _show_versions.py │ ├── _sklearn_compat.py │ ├── _tags.py │ ├── _test_common │ ├── __init__.py │ └── instance_generator.py │ ├── _validation.py │ ├── deprecation.py │ ├── estimator_checks.py │ ├── testing.py │ └── tests │ ├── __init__.py │ ├── test_deprecation.py │ ├── test_docstring.py │ ├── test_estimator_checks.py │ ├── test_min_dependencies.py │ ├── test_show_versions.py │ ├── test_testing.py │ └── test_validation.py ├── maint_tools └── test_docstring.py ├── pixi.lock ├── pyproject.toml └── references.bib /.circleci/config.yml: -------------------------------------------------------------------------------- 1 | version: 2 2 | 3 | jobs: 4 | doc: 5 | docker: 6 | - image: cimg/python:3.8.12 7 | environment: 8 | - USERNAME: "glemaitre" 9 | - ORGANIZATION: "imbalanced-learn" 10 | - DOC_REPO: "imbalanced-learn.github.io" 11 | - DOC_URL: "" 12 | - EMAIL: "g.lemaitre58@gmail.com" 13 | - MINICONDA_PATH: ~/miniconda 14 | - CONDA_ENV_NAME: testenv 15 | - OMP_NUM_THREADS: 1 16 | - PYTHON_VERSION: 3 17 | - NUMPY_VERSION: 'latest' 18 | - SCIPY_VERSION: 'latest' 19 | - SKLEARN_VERSION: 'latest' 20 | - MATPLOTLIB_VERSION: 'latest' 21 | - SPHINX_VERSION: 'min' 
22 | - PANDAS_VERSION: 'latest' 23 | - SPHINX_GALLERY_VERSION: 'latest' 24 | - NUMPYDOC_VERSION: 'latest' 25 | - SPHINXCONTRIB_BIBTEX_VERSION: 'latest' 26 | - PYDATA_SPHINX_THEME_VERSION: 'latest' 27 | - SPHINX_DESIGN_VERSION: 'latest' 28 | steps: 29 | - add_ssh_keys: 30 | fingerprints: 31 | - "34:ea:b1:d9:b1:e2:5d:79:81:c4:d0:39:ca:85:e1:ef" 32 | - checkout 33 | - run: ./build_tools/circle/checkout_merge_commit.sh 34 | - run: ./build_tools/circle/build_doc.sh 35 | - store_artifacts: 36 | path: doc/_build/html 37 | destination: doc 38 | - store_artifacts: 39 | path: ~/log.txt 40 | - persist_to_workspace: 41 | root: doc/_build/html 42 | paths: . 43 | - attach_workspace: 44 | at: doc/_build/html 45 | - run: ls -ltrh doc/_build/html 46 | - deploy: 47 | command: | 48 | if [[ "${CIRCLE_BRANCH}" =~ ^master$|^[0-9]+\.[0-9]+\.X$ ]]; then 49 | bash ./build_tools/circle/push_doc.sh doc/_build/html 50 | fi 51 | filters: 52 | branches: 53 | ignore: gh-pages 54 | 55 | workflows: 56 | version: 2 57 | build-doc-and-deploy: 58 | jobs: 59 | - doc 60 | -------------------------------------------------------------------------------- /.coveragerc: -------------------------------------------------------------------------------- 1 | [run] 2 | branch = True 3 | 4 | [report] 5 | exclude_lines = 6 | if self.debug: 7 | pragma: no cover 8 | raise NotImplementedError 9 | ignore_errors = True 10 | omit = 11 | */tests/* 12 | **/setup.py 13 | **/_sklearn_compat.py 14 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE.md: -------------------------------------------------------------------------------- 1 | 5 | 6 | 7 | 8 | #### Description 9 | 10 | 11 | #### Steps/Code to Reproduce 12 | 34 | 35 | #### Expected Results 36 | 37 | 38 | #### Actual Results 39 | 40 | 41 | #### Versions 42 | 51 | 52 | 53 | 54 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/bug_report.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Bug report 3 | about: Create a report to help us reproduce and correct the bug 4 | title: "[BUG]" 5 | labels: bug 6 | assignees: '' 7 | 8 | --- 9 | 10 | #### Describe the bug 11 | A clear and concise description of what the bug is. 12 | 13 | #### Steps/Code to Reproduce 14 | 36 | 37 | ``` 38 | Sample code to reproduce the problem 39 | ``` 40 | 41 | #### Expected Results 42 | 43 | 44 | #### Actual Results 45 | 46 | 47 | #### Versions 48 | 60 | 61 | 62 | 63 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/documentation-improvement.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Documentation improvement 3 | about: Create a report to help us improve the documentation 4 | title: "[DOC]" 5 | labels: Documentation, help wanted, good first issue 6 | assignees: '' 7 | 8 | --- 9 | 10 | #### Describe the issue linked to the documentation 11 | 12 | Tell us about the confusion introduced in the documentation. 13 | 14 | #### Suggest a potential alternative/fix 15 | 16 | Tell us how we could improve the documentation in this regard. 17 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/feature_request.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Feature request 3 | about: Suggest a new algorithm, an enhancement to an existing algorithm, etc.
4 | title: "[ENH]" 5 | labels: enhancement 6 | assignees: '' 7 | 8 | --- 9 | 10 | <-- 11 | If you want to propose a new algorithm, please refer first to the scikit-learn inclusion criterion: 12 | https://scikit-learn.org/stable/faq.html#what-are-the-inclusion-criteria-for-new-algorithms 13 | --> 14 | 15 | #### Is your feature request related to a problem? Please describe 16 | 17 | #### Describe the solution you'd like 18 | 19 | #### Describe alternatives you've considered 20 | 21 | #### Additional context 22 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/other--blank-template-.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Other (blank template) 3 | about: For all other issues to reach the community... 4 | title: '' 5 | labels: '' 6 | assignees: '' 7 | 8 | --- 9 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/question.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Question 3 | about: If you have a usage question 4 | title: '' 5 | labels: '' 6 | assignees: '' 7 | 8 | --- 9 | 10 | ** 11 | If your issue is a usage question, submit it here instead: 12 | - The imbalanced learn gitter: https://gitter.im/scikit-learn-contrib/imbalanced-learn 13 | ** 14 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/usage-question.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Usage question 3 | about: If you have a usage question 4 | title: "[SO]" 5 | labels: question 6 | assignees: '' 7 | 8 | --- 9 | 10 | ** If your issue is a usage question, submit it here instead:** 11 | - **The imbalanced learn gitter: https://gitter.im/scikit-learn-contrib/imbalanced-learn** 12 | - **StackOverflow with the imblearn (or imbalanced-learn) tag:https://stackoverflow.com/questions/tagged/imblearn** 13 | 14 | We are going to automatically close this issue if this is not link to a bug or an enhancement. 15 | -------------------------------------------------------------------------------- /.github/PULL_REQUEST_TEMPLATE.md: -------------------------------------------------------------------------------- 1 | 5 | #### Reference Issue 6 | 7 | 8 | 9 | #### What does this implement/fix? Explain your changes. 10 | 11 | 12 | #### Any other comments? 13 | 14 | 15 | 25 | -------------------------------------------------------------------------------- /.github/check-changelog.yml: -------------------------------------------------------------------------------- 1 | name: Check Changelog 2 | # This check makes sure that the changelog is properly updated 3 | # when a PR introduces a change in a test file. 4 | # To bypass this check, label the PR with "No Changelog Needed". 
5 | on: 6 | pull_request: 7 | types: [opened, edited, labeled, unlabeled, synchronize] 8 | 9 | jobs: 10 | check: 11 | name: A reviewer will let you know if it is required or can be bypassed 12 | runs-on: ubuntu-latest 13 | if: ${{ contains(github.event.pull_request.labels.*.name, 'No Changelog Needed') == 0 }} 14 | steps: 15 | - name: Get PR number and milestone 16 | run: | 17 | echo "PR_NUMBER=${{ github.event.pull_request.number }}" >> $GITHUB_ENV 18 | echo "TAGGED_MILESTONE=${{ github.event.pull_request.milestone.title }}" >> $GITHUB_ENV 19 | - uses: actions/checkout@v4 20 | with: 21 | fetch-depth: '0' 22 | - name: Check the changelog entry 23 | run: | 24 | set -xe 25 | changed_files=$(git diff --name-only origin/main) 26 | # Changelog should be updated only if tests have been modified 27 | if [[ ! "$changed_files" =~ tests ]] 28 | then 29 | exit 0 30 | fi 31 | all_changelogs=$(cat ./doc/whats_new/v*.rst) 32 | if [[ "$all_changelogs" =~ :pr:\`$PR_NUMBER\` ]] 33 | then 34 | echo "Changelog has been updated." 35 | # If the pull request is milestoned, check the corresponding changelog 36 | if [ -f ./doc/whats_new/v${TAGGED_MILESTONE:0:4}.rst ] 37 | then 38 | expected_changelog=$(cat ./doc/whats_new/v${TAGGED_MILESTONE:0:4}.rst) 39 | if [[ "$expected_changelog" =~ :pr:\`$PR_NUMBER\` ]] 40 | then 41 | echo "Changelog and milestone correspond." 42 | else 43 | echo "Changelog and milestone do not correspond." 44 | echo "If you see this error make sure that the tagged milestone for the PR" 45 | echo "and the edited changelog filename properly match." 46 | exit 1 47 | fi 48 | fi 49 | else 50 | echo "A Changelog entry is missing." 51 | echo "" 52 | echo "Please add an entry to the changelog at 'doc/whats_new/v*.rst'" 53 | echo "to document your change assuming that the PR will be merged" 54 | echo "in time for the next release of imbalanced-learn." 55 | echo "" 56 | echo "Look at other entries in that file for inspiration and please" 57 | echo "reference this pull request using the ':pr:' directive and" 58 | echo "credit yourself (and other contributors if applicable) with" 59 | echo "the ':user:' directive." 60 | echo "" 61 | echo "If you see this error and there is already a changelog entry," 62 | echo "check that the PR number is correct." 63 | echo "" 64 | echo "If you believe that this PR does not warrant a changelog" 65 | echo "entry, say so in a comment so that a maintainer will label" 66 | echo "the PR with 'No Changelog Needed' to bypass this check." 67 | exit 1 68 | fi 69 | -------------------------------------------------------------------------------- /.github/dependabot.yml: -------------------------------------------------------------------------------- 1 | version: 2 2 | updates: 3 | # Maintain dependencies for GitHub Actions as recommended in SPEC8: 4 | # https://github.com/scientific-python/specs/pull/325 5 | # At the time of writing, release critical workflows such as 6 | # pypa/gh-action-pypi-publish should use hash-based versioning for security 7 | # reasons. This strategy may be generalized to all other github actions 8 | # in the future.
9 | - package-ecosystem: "github-actions" 10 | directory: "/" 11 | schedule: 12 | interval: "weekly" 13 | groups: 14 | actions: 15 | patterns: 16 | - "*" 17 | reviewers: 18 | - "glemaitre" 19 | -------------------------------------------------------------------------------- /.github/workflows/circleci-artifacts-redirector.yml: -------------------------------------------------------------------------------- 1 | name: CircleCI artifacts redirector 2 | 3 | on: [status] 4 | 5 | # Restrict the permissions granted to the use of secrets.GITHUB_TOKEN in this 6 | # github actions workflow: 7 | # https://docs.github.com/en/actions/security-guides/automatic-token-authentication 8 | permissions: 9 | statuses: write 10 | 11 | jobs: 12 | circleci_artifacts_redirector_job: 13 | runs-on: ubuntu-latest 14 | # For testing this action on a fork, remove the "github.repository =="" condition. 15 | if: "github.repository == 'scikit-learn-contrib/imbalanced-learn' && github.event.context == 'ci/circleci: doc'" 16 | name: Run CircleCI artifacts redirector 17 | steps: 18 | - name: GitHub Action step 19 | uses: scientific-python/circleci-artifacts-redirector-action@v1 20 | with: 21 | repo-token: ${{ secrets.GITHUB_TOKEN }} 22 | api-token: ${{ secrets.CIRCLE_CI }} 23 | artifact-path: 0/doc/index.html 24 | circleci-jobs: doc 25 | job-title: Check the rendered docs here! 26 | -------------------------------------------------------------------------------- /.github/workflows/linters.yml: -------------------------------------------------------------------------------- 1 | name: Run code format checks 2 | 3 | on: 4 | push: 5 | branches: 6 | - "main" 7 | pull_request: 8 | branches: 9 | - '*' 10 | 11 | jobs: 12 | run-pre-commit-checks: 13 | name: Run pre-commit checks 14 | runs-on: ubuntu-latest 15 | 16 | steps: 17 | - uses: actions/checkout@v4 18 | - uses: prefix-dev/setup-pixi@v0.8.8 19 | with: 20 | pixi-version: v0.39.2 21 | frozen: true 22 | 23 | - name: Run tests 24 | run: pixi run -e linters linters 25 | -------------------------------------------------------------------------------- /.github/workflows/tests.yml: -------------------------------------------------------------------------------- 1 | name: 'tests' 2 | 3 | on: 4 | push: 5 | branches: 6 | - "main" 7 | pull_request: 8 | branches: 9 | - '*' 10 | 11 | jobs: 12 | test: 13 | strategy: 14 | matrix: 15 | os: [windows-latest, ubuntu-latest, macos-latest] 16 | environment: [ 17 | ci-py310-min-dependencies, 18 | ci-py310-min-optional-dependencies, 19 | ci-py310-min-keras, 20 | ci-py310-min-tensorflow, 21 | ci-py311-sklearn-1-4, 22 | ci-py311-sklearn-1-5, 23 | ci-py311-latest-keras, 24 | ci-py311-latest-tensorflow, 25 | ci-py313-latest-dependencies, 26 | ci-py313-latest-optional-dependencies, 27 | ] 28 | exclude: 29 | - os: windows-latest 30 | environment: ci-py310-min-keras 31 | - os: windows-latest 32 | environment: ci-py310-min-tensorflow 33 | - os: windows-latest 34 | environment: ci-py311-latest-keras 35 | - os: windows-latest 36 | environment: ci-py311-latest-tensorflow 37 | runs-on: ${{ matrix.os }} 38 | steps: 39 | - uses: actions/checkout@v4 40 | - uses: prefix-dev/setup-pixi@v0.8.8 41 | with: 42 | pixi-version: v0.39.2 43 | environments: ${{ matrix.environment }} 44 | # we can freeze the environment and manually bump the dependencies to the 45 | # latest version time to time. 
46 | frozen: true 47 | 48 | - name: Run tests 49 | run: pixi run -e ${{ matrix.environment }} tests -n 3 50 | 51 | - name: Upload coverage reports to Codecov 52 | uses: codecov/codecov-action@v5.4.2 53 | with: 54 | token: ${{ secrets.CODECOV_TOKEN }} 55 | slug: scikit-learn-contrib/imbalanced-learn 56 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | 5 | # C extensions 6 | *.so 7 | 8 | # Distribution / packaging 9 | .Python 10 | env/ 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | *.egg-info/ 23 | .installed.cfg 24 | *.egg 25 | Pipfile 26 | Pipfile.lock 27 | 28 | # PyInstaller 29 | # Usually these files are written by a python script from a template 30 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 31 | *.manifest 32 | *.spec 33 | 34 | # Installer logs 35 | pip-log.txt 36 | pip-delete-this-directory.txt 37 | 38 | # Unit test / coverage reports 39 | htmlcov/ 40 | .tox/ 41 | .coverage 42 | .coverage.* 43 | .cache 44 | nosetests.xml 45 | coverage.xml 46 | *,cover 47 | .pytest_cache/ 48 | 49 | # Translations 50 | *.mo 51 | *.pot 52 | 53 | # Django stuff: 54 | *.log 55 | 56 | # Sphinx documentation 57 | docs/_build/ 58 | 59 | # PyBuilder 60 | target/ 61 | 62 | # vim 63 | *.swp 64 | 65 | # emacs 66 | *~ 67 | 68 | # Visual Studio 69 | *.sln 70 | *.pyproj 71 | *.suo 72 | *.vs 73 | .vscode/ 74 | 75 | # PyCharm 76 | .idea/ 77 | 78 | # Cython 79 | *.pyc 80 | *.pyo 81 | __pycache__ 82 | *.so 83 | *.o 84 | 85 | *.egg 86 | *.egg-info 87 | 88 | Cython/Compiler/*.c 89 | Cython/Plex/*.c 90 | Cython/Runtime/refnanny.c 91 | Cython/Tempita/*.c 92 | Cython/*.c 93 | 94 | Tools/*.elc 95 | 96 | /TEST_TMP/ 97 | /build/ 98 | /wheelhouse*/ 99 | !tests/build/ 100 | /dist/ 101 | .gitrev 102 | .coverage 103 | *.orig 104 | *.rej 105 | *.dep 106 | *.swp 107 | *~ 108 | 109 | .ipynb_checkpoints 110 | docs/build 111 | 112 | tags 113 | TAGS 114 | MANIFEST 115 | 116 | .tox 117 | 118 | cythonize.dat 119 | 120 | # build documentation 121 | doc/_build/ 122 | doc/auto_examples/ 123 | doc/generated/ 124 | doc/references/generated/ 125 | doc/bibtex/auto 126 | doc/min_dependency_table.rst 127 | 128 | # MacOS 129 | .DS_Store 130 | 131 | # Pixi folder 132 | .pixi/ 133 | 134 | # Generated files 135 | doc/min_dependency_substitutions.rst 136 | doc/sg_execution_times.rst 137 | -------------------------------------------------------------------------------- /.pre-commit-config.yaml: -------------------------------------------------------------------------------- 1 | repos: 2 | - repo: https://github.com/pre-commit/pre-commit-hooks 3 | rev: v4.3.0 4 | hooks: 5 | - id: check-yaml 6 | - id: end-of-file-fixer 7 | - id: trailing-whitespace 8 | - repo: https://github.com/astral-sh/ruff-pre-commit 9 | # Ruff version. 
10 | rev: v0.4.8 11 | hooks: 12 | - id: ruff 13 | args: ["--fix", "--output-format=full"] 14 | - repo: https://github.com/psf/black 15 | rev: 23.3.0 16 | hooks: 17 | - id: black 18 | -------------------------------------------------------------------------------- /AUTHORS.rst: -------------------------------------------------------------------------------- 1 | History 2 | ------- 3 | 4 | Development lead 5 | ~~~~~~~~~~~~~~~~ 6 | 7 | The project was started in August 2014 by Fernando Nogueira and initially focused on the SMOTE implementation. 8 | Together with Guillaume Lemaitre, Dayvid Victor, and Christos Aridas, additional under-sampling and over-sampling methods were implemented, along with major API changes to make the toolbox fully compatible with scikit-learn_. 9 | 10 | Contributors 11 | ------------ 12 | 13 | Refer to the GitHub contributors page_. 14 | 15 | .. _scikit-learn: http://scikit-learn.org 16 | .. _page: https://github.com/scikit-learn-contrib/imbalanced-learn/graphs/contributors 17 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | The MIT License (MIT) 2 | 3 | Copyright (c) 2014-2020 The imbalanced-learn developers. 4 | All rights reserved. 5 | 6 | Permission is hereby granted, free of charge, to any person obtaining a copy 7 | of this software and associated documentation files (the "Software"), to deal 8 | in the Software without restriction, including without limitation the rights 9 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 10 | copies of the Software, and to permit persons to whom the Software is 11 | furnished to do so, subject to the following conditions: 12 | 13 | The above copyright notice and this permission notice shall be included in all 14 | copies or substantial portions of the Software. 15 | 16 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 17 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 18 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 19 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 20 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 21 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 22 | SOFTWARE. 23 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | 2 | recursive-include doc * 3 | recursive-include examples * 4 | include AUTHORS.rst 5 | include CONTRIBUTING.md 6 | include LICENSE 7 | include README.rst 8 | -------------------------------------------------------------------------------- /build_tools/circle/build_doc.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | set -x 3 | set -e 4 | 5 | # Decide what kind of documentation build to run, and run it. 6 | # 7 | # If the last commit message has a "[doc skip]" marker, do not build 8 | # the doc. Conversely, if a "[doc build]" marker is found, build the doc 9 | # instead of relying on the subsequent rules. 10 | # 11 | # We always build the documentation for jobs that are not related to a specific 12 | # PR (e.g. a merge to master or a maintenance branch).
13 | # 14 | # If this is a PR, do a full build if there are some files in this PR that are 15 | # under the "doc/" or "examples/" folders, otherwise perform a quick build. 16 | # 17 | # If the inspection of the current commit fails for any reason, the default 18 | # behavior is to quick build the documentation. 19 | 20 | get_build_type() { 21 | if [ -z "$CIRCLE_SHA1" ] 22 | then 23 | echo SKIP: undefined CIRCLE_SHA1 24 | return 25 | fi 26 | commit_msg=$(git log --format=%B -n 1 $CIRCLE_SHA1) 27 | if [ -z "$commit_msg" ] 28 | then 29 | echo QUICK BUILD: failed to inspect commit $CIRCLE_SHA1 30 | return 31 | fi 32 | if [[ "$commit_msg" =~ \[doc\ skip\] ]] 33 | then 34 | echo SKIP: [doc skip] marker found 35 | return 36 | fi 37 | if [[ "$commit_msg" =~ \[doc\ quick\] ]] 38 | then 39 | echo QUICK: [doc quick] marker found 40 | return 41 | fi 42 | if [[ "$commit_msg" =~ \[doc\ build\] ]] 43 | then 44 | echo BUILD: [doc build] marker found 45 | return 46 | fi 47 | if [ -z "$CI_PULL_REQUEST" ] 48 | then 49 | echo BUILD: not a pull request 50 | return 51 | fi 52 | git_range="origin/master...$CIRCLE_SHA1" 53 | git fetch origin master >&2 || (echo QUICK BUILD: failed to get changed filenames for $git_range; return) 54 | filenames=$(git diff --name-only $git_range) 55 | if [ -z "$filenames" ] 56 | then 57 | echo QUICK BUILD: no changed filenames for $git_range 58 | return 59 | fi 60 | if echo "$filenames" | grep -q -e ^examples/ 61 | then 62 | echo BUILD: detected examples/ filename modified in $git_range: $(echo "$filenames" | grep -e ^examples/ | head -n1) 63 | return 64 | fi 65 | echo QUICK BUILD: no examples/ filename modified in $git_range: 66 | echo "$filenames" 67 | } 68 | 69 | build_type=$(get_build_type) 70 | if [[ "$build_type" =~ ^SKIP ]] 71 | then 72 | exit 0 73 | fi 74 | 75 | # deactivate circleci virtualenv and setup a miniconda env instead 76 | if [[ `type -t deactivate` ]]; then 77 | deactivate 78 | fi 79 | 80 | # Install pixi 81 | curl -fsSL https://pixi.sh/install.sh | bash 82 | export PATH=/home/circleci/.pixi/bin:$PATH 83 | 84 | # The pipefail is requested to propagate exit code 85 | set -o pipefail && pixi run --frozen -e docs build-docs | tee ~/log.txt 86 | set +o pipefail 87 | -------------------------------------------------------------------------------- /build_tools/circle/checkout_merge_commit.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Add `master` branch to the update list. 4 | # Otherwise CircleCI will give us a cached one. 5 | FETCH_REFS="+master:master" 6 | 7 | # Update PR refs for testing. 8 | if [[ -n "${CIRCLE_PR_NUMBER}" ]] 9 | then 10 | FETCH_REFS="${FETCH_REFS} +refs/pull/${CIRCLE_PR_NUMBER}/head:pr/${CIRCLE_PR_NUMBER}/head" 11 | FETCH_REFS="${FETCH_REFS} +refs/pull/${CIRCLE_PR_NUMBER}/merge:pr/${CIRCLE_PR_NUMBER}/merge" 12 | fi 13 | 14 | # Retrieve the refs. 15 | git fetch -u origin ${FETCH_REFS} 16 | 17 | # Checkout the PR merge ref. 18 | if [[ -n "${CIRCLE_PR_NUMBER}" ]] 19 | then 20 | git checkout -qf "pr/${CIRCLE_PR_NUMBER}/merge" || ( 21 | echo Could not fetch merge commit. >&2 22 | echo There may be conflicts in merging PR \#${CIRCLE_PR_NUMBER} with master. >&2; 23 | exit 1) 24 | fi 25 | 26 | # Check for merge conflicts. 
27 | if [[ -n "${CIRCLE_PR_NUMBER}" ]] 28 | then 29 | git branch --merged | grep master > /dev/null 30 | git branch --merged | grep "pr/${CIRCLE_PR_NUMBER}/head" > /dev/null 31 | fi 32 | -------------------------------------------------------------------------------- /build_tools/circle/push_doc.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # This script is meant to be called in the "deploy" step defined in 3 | # circle.yml. See https://circleci.com/docs/ for more details. 4 | # The behavior of the script is controlled by environment variables defined 5 | # in the circle.yml in the top level folder of the project. 6 | 7 | GENERATED_DOC_DIR=$1 8 | 9 | if [[ -z "$GENERATED_DOC_DIR" ]]; then 10 | echo "Need to pass the directory of the generated doc as argument" 11 | echo "Usage: $0 <generated_doc_dir>" 12 | exit 1 13 | fi 14 | 15 | # Absolute path needed because we use cd further down in this script 16 | GENERATED_DOC_DIR=$(readlink -f $GENERATED_DOC_DIR) 17 | 18 | if [ "$CIRCLE_BRANCH" = "master" ] 19 | then 20 | dir=dev 21 | else 22 | # Strip off .X 23 | dir="${CIRCLE_BRANCH::-2}" 24 | fi 25 | 26 | MSG="Pushing the docs to $dir/ for branch: $CIRCLE_BRANCH, commit $CIRCLE_SHA1" 27 | 28 | cd $HOME 29 | if [ ! -d $DOC_REPO ]; 30 | then git clone --depth 1 --no-checkout -b master "git@github.com:"$ORGANIZATION"/"$DOC_REPO".git"; 31 | fi 32 | cd $DOC_REPO 33 | git config core.sparseCheckout true 34 | echo $dir > .git/info/sparse-checkout 35 | git checkout master 36 | git reset --hard origin/master 37 | git rm -rf $dir/ && rm -rf $dir/ 38 | cp -R $GENERATED_DOC_DIR $dir 39 | touch $dir/.nojekyll 40 | git config --global user.email $EMAIL 41 | git config --global user.name $USERNAME 42 | git config --global push.default matching 43 | git add -f $dir/ 44 | git commit -m "$MSG" $dir 45 | git push origin master 46 | 47 | echo $MSG 48 | -------------------------------------------------------------------------------- /conftest.py: -------------------------------------------------------------------------------- 1 | # This file is here so that when running from the root folder 2 | # ./imblearn is added to sys.path by pytest. 3 | # See https://docs.pytest.org/en/latest/pythonpath.html for more details. 4 | # For example, this allows one to build extensions in place and run pytest 5 | # doc/modules/clustering.rst and use imblearn from the local folder 6 | # rather than the one from site-packages.
7 | 8 | import os 9 | 10 | import numpy as np 11 | import pytest 12 | from sklearn.utils.fixes import parse_version 13 | 14 | # use legacy numpy print options to avoid failures due to NumPy 2.+ scalar 15 | # representation 16 | if parse_version(np.__version__) > parse_version("2.0.0"): 17 | np.set_printoptions(legacy="1.25") 18 | 19 | 20 | def pytest_runtest_setup(item): 21 | fname = item.fspath.strpath 22 | if ( 23 | fname.endswith(os.path.join("keras", "_generator.py")) 24 | or fname.endswith(os.path.join("tensorflow", "_generator.py")) 25 | or fname.endswith("miscellaneous.rst") 26 | ): 27 | try: 28 | import tensorflow # noqa 29 | except ImportError: 30 | pytest.skip("The tensorflow package is not installed.") 31 | -------------------------------------------------------------------------------- /doc/_static/css/imbalanced-learn.css: -------------------------------------------------------------------------------- 1 | @import url("theme.css"); 2 | 3 | .highlight a { 4 | text-decoration: underline; 5 | } 6 | 7 | .deprecated p { 8 | padding: 10px 7px 10px 10px; 9 | color: #b94a48; 10 | background-color: #f3e5e5; 11 | border: 1px solid #eed3d7; 12 | } 13 | 14 | .deprecated p span.versionmodified { 15 | font-weight: bold; 16 | } 17 | 18 | .wy-nav-content { 19 | max-width: 1200px !important; 20 | } 21 | 22 | /* Override some aspects of the pydata-sphinx-theme */ 23 | 24 | /* Main index page overview cards */ 25 | 26 | .intro-card { 27 | padding: 30px 10px 20px 10px; 28 | } 29 | 30 | .intro-card .sd-card-img-top { 31 | margin: 10px; 32 | height: 52px; 33 | background: none !important; 34 | } 35 | 36 | .intro-card .sd-card-title { 37 | color: var(--pst-color-primary); 38 | font-size: var(--pst-font-size-h5); 39 | padding: 1rem 0rem 0.5rem 0rem; 40 | } 41 | 42 | .intro-card .sd-card-footer { 43 | border: none !important; 44 | } 45 | 46 | .intro-card .sd-card-footer p.sd-card-text { 47 | max-width: 220px; 48 | margin-left: auto; 49 | margin-right: auto; 50 | } 51 | 52 | .intro-card .sd-btn-secondary { 53 | background-color: #6c757d !important; 54 | border-color: #6c757d !important; 55 | } 56 | 57 | .intro-card .sd-btn-secondary:hover { 58 | background-color: #5a6268 !important; 59 | border-color: #545b62 !important; 60 | } 61 | 62 | .card, .card img { 63 | background-color: var(--pst-color-background); 64 | } 65 | -------------------------------------------------------------------------------- /doc/_static/img/favicon.ico: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/scikit-learn-contrib/imbalanced-learn/c975dd68f50beb88e1331cdd47780acd4fa95397/doc/_static/img/favicon.ico -------------------------------------------------------------------------------- /doc/_static/img/logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/scikit-learn-contrib/imbalanced-learn/c975dd68f50beb88e1331cdd47780acd4fa95397/doc/_static/img/logo.png -------------------------------------------------------------------------------- /doc/_static/img/logo.xcf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/scikit-learn-contrib/imbalanced-learn/c975dd68f50beb88e1331cdd47780acd4fa95397/doc/_static/img/logo.xcf -------------------------------------------------------------------------------- /doc/_static/img/logo_wide.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/scikit-learn-contrib/imbalanced-learn/c975dd68f50beb88e1331cdd47780acd4fa95397/doc/_static/img/logo_wide.png -------------------------------------------------------------------------------- /doc/_static/img/logo_wide_dark.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/scikit-learn-contrib/imbalanced-learn/c975dd68f50beb88e1331cdd47780acd4fa95397/doc/_static/img/logo_wide_dark.png -------------------------------------------------------------------------------- /doc/_static/index_api.svg: -------------------------------------------------------------------------------- (SVG icon used for the "API reference" card on the documentation landing page; the XML markup was stripped in this dump.) -------------------------------------------------------------------------------- /doc/_static/index_examples.svg: -------------------------------------------------------------------------------- (SVG icon used for the "Examples" card on the documentation landing page; the XML markup was stripped in this dump.) -------------------------------------------------------------------------------- /doc/_static/index_getting_started.svg: -------------------------------------------------------------------------------- (SVG icon used for the "Getting started" card on the documentation landing page; the XML markup was stripped in this dump.) -------------------------------------------------------------------------------- /doc/_static/js/copybutton.js: -------------------------------------------------------------------------------- 1 | $(document).ready(function() { 2 | /* Add a [>>>] button on the top-right corner of code samples to hide 3 | * the >>> and ... prompts and the output and thus make the code 4 | * copyable.
*/ 5 | var div = $('.highlight-python .highlight,' + 6 | '.highlight-python3 .highlight,' + 7 | '.highlight-pycon .highlight,' + 8 | '.highlight-default .highlight') 9 | var pre = div.find('pre'); 10 | 11 | // get the styles from the current theme 12 | pre.parent().parent().css('position', 'relative'); 13 | var hide_text = 'Hide the prompts and output'; 14 | var show_text = 'Show the prompts and output'; 15 | var border_width = pre.css('border-top-width'); 16 | var border_style = pre.css('border-top-style'); 17 | var border_color = pre.css('border-top-color'); 18 | var button_styles = { 19 | 'cursor':'pointer', 'position': 'absolute', 'top': '0', 'right': '0', 20 | 'border-color': border_color, 'border-style': border_style, 21 | 'border-width': border_width, 'color': border_color, 'text-size': '75%', 22 | 'font-family': 'monospace', 'padding-left': '0.2em', 'padding-right': '0.2em', 23 | 'border-radius': '0 3px 0 0' 24 | } 25 | 26 | // create and add the button to all the code blocks that contain >>> 27 | div.each(function(index) { 28 | var jthis = $(this); 29 | if (jthis.find('.gp').length > 0) { 30 | var button = $('<span class="copybutton">&gt;&gt;&gt;</span>'); 31 | button.css(button_styles); 32 | button.attr('title', hide_text); 33 | button.data('hidden', 'false'); 34 | jthis.prepend(button); 35 | } 36 | // tracebacks (.gt) contain bare text elements that need to be 37 | // wrapped in a span to work with .nextUntil() (see later) 38 | jthis.find('pre:has(.gt)').contents().filter(function() { 39 | return ((this.nodeType == 3) && (this.data.trim().length > 0)); 40 | }).wrap('<span>'); 41 | }); 42 | 43 | // define the behavior of the button when it's clicked 44 | $('.copybutton').click(function(e){ 45 | e.preventDefault(); 46 | var button = $(this); 47 | if (button.data('hidden') === 'false') { 48 | // hide the code output 49 | button.parent().find('.go, .gp, .gt').hide(); 50 | button.next('pre').find('.gt').nextUntil('.gp, .go').css('visibility', 'hidden'); 51 | button.css('text-decoration', 'line-through'); 52 | button.attr('title', show_text); 53 | button.data('hidden', 'true'); 54 | } else { 55 | // show the code output 56 | button.parent().find('.go, .gp, .gt').show(); 57 | button.next('pre').find('.gt').nextUntil('.gp, .go').css('visibility', 'visible'); 58 | button.css('text-decoration', 'none'); 59 | button.attr('title', hide_text); 60 | button.data('hidden', 'false'); 61 | } 62 | }); 63 | }); -------------------------------------------------------------------------------- /doc/_templates/class.rst: -------------------------------------------------------------------------------- 1 | {{objname}} 2 | {{ underline }}============== 3 | 4 | .. currentmodule:: {{ module }} 5 | 6 | .. autoclass:: {{ objname }} 7 | 8 | {% block methods %} 9 | 10 | {% if methods %} 11 | .. rubric:: Methods 12 | 13 | .. autosummary:: 14 | {% for item in methods %} 15 | {% if '__init__' not in item %} 16 | ~{{ name }}.{{ item }} 17 | {% endif %} 18 | {%- endfor %} 19 | {% endif %} 20 | {% endblock %} 21 | 22 | .. include:: {{module}}.{{objname}}.examples 23 | 24 | .. raw:: html 25 | 26 |
27 | -------------------------------------------------------------------------------- /doc/_templates/function.rst: -------------------------------------------------------------------------------- 1 | {{objname}} 2 | {{ underline }}==================== 3 | 4 | .. currentmodule:: {{ module }} 5 | 6 | .. autofunction:: {{ objname }} 7 | 8 | .. include:: {{module}}.{{objname}}.examples 9 | 10 | .. raw:: html 11 | 12 |
13 | -------------------------------------------------------------------------------- /doc/_templates/numpydoc_docstring.rst: -------------------------------------------------------------------------------- 1 | {{index}} 2 | {{summary}} 3 | {{extended_summary}} 4 | {{parameters}} 5 | {{returns}} 6 | {{yields}} 7 | {{other_parameters}} 8 | {{attributes}} 9 | {{raises}} 10 | {{warns}} 11 | {{warnings}} 12 | {{see_also}} 13 | {{notes}} 14 | {{references}} 15 | {{examples}} 16 | {{methods}} 17 | -------------------------------------------------------------------------------- /doc/_templates/sidebar-search-bs.html: -------------------------------------------------------------------------------- (Sidebar search-box template; the HTML markup was stripped in this dump.) -------------------------------------------------------------------------------- /doc/about.rst: -------------------------------------------------------------------------------- 1 | About us 2 | ======== 3 | 4 | .. include:: ../AUTHORS.rst 5 | 6 | .. _citing-imbalanced-learn: 7 | 8 | Citing imbalanced-learn 9 | ----------------------- 10 | 11 | If you use imbalanced-learn in a scientific publication, we would appreciate 12 | citations to the following paper:: 13 | 14 | @article{JMLR:v18:16-365, 15 | author = {Guillaume Lema{{\^i}}tre and Fernando Nogueira and Christos K. Aridas}, 16 | title = {Imbalanced-learn: A Python Toolbox to Tackle the Curse of Imbalanced Datasets in Machine Learning}, 17 | journal = {Journal of Machine Learning Research}, 18 | year = {2017}, 19 | volume = {18}, 20 | number = {17}, 21 | pages = {1-5}, 22 | url = {http://jmlr.org/papers/v18/16-365.html} 23 | } 24 | -------------------------------------------------------------------------------- /doc/combine.rst: -------------------------------------------------------------------------------- 1 | .. _combine: 2 | 3 | ======================================= 4 | Combination of over- and under-sampling 5 | ======================================= 6 | 7 | .. currentmodule:: imblearn.over_sampling 8 | 9 | We previously presented :class:`SMOTE` and showed that this method can generate 10 | noisy samples by interpolating new points between marginal outliers and 11 | inliers. This issue can be solved by cleaning the space resulting 12 | from over-sampling. 13 | 14 | .. currentmodule:: imblearn.combine 15 | 16 | In this regard, Tomek's link and edited nearest-neighbours are the two cleaning 17 | methods that can be applied after SMOTE over-sampling 18 | to obtain a cleaner space. The two ready-to-use classes imbalanced-learn 19 | implements for combining over- and under-sampling methods are: (i) 20 | :class:`SMOTETomek` :cite:`batista2004study` and (ii) :class:`SMOTEENN` 21 | :cite:`batista2003balancing`. 22 | 23 | Those two classes can be used like any other sampler, with parameters identical 24 | to those of their underlying samplers:: 25 | 26 | >>> from collections import Counter 27 | >>> from sklearn.datasets import make_classification 28 | >>> X, y = make_classification(n_samples=5000, n_features=2, n_informative=2, 29 | ... n_redundant=0, n_repeated=0, n_classes=3, 30 | ... n_clusters_per_class=1, 31 | ... weights=[0.01, 0.05, 0.94], 32 | ... class_sep=0.8, random_state=0) 33 | >>> print(sorted(Counter(y).items())) 34 | [(0, 64), (1, 262), (2, 4674)] 35 | >>> from imblearn.combine import SMOTEENN 36 | >>> smote_enn = SMOTEENN(random_state=0) 37 | >>> X_resampled, y_resampled = smote_enn.fit_resample(X, y) 38 | >>> print(sorted(Counter(y_resampled).items())) 39 | [(0, 4060), (1, 4381), (2, 3502)] 40 | >>> from imblearn.combine import SMOTETomek 41 | >>> smote_tomek = SMOTETomek(random_state=0) 42 | >>> X_resampled, y_resampled = smote_tomek.fit_resample(X, y) 43 | >>> print(sorted(Counter(y_resampled).items())) 44 | [(0, 4499), (1, 4566), (2, 4413)] 45 | 46 | We can also see in the example below that :class:`SMOTEENN` tends to clean more 47 | noisy samples than :class:`SMOTETomek`. 48 | 49 | .. image:: ./auto_examples/combine/images/sphx_glr_plot_comparison_combine_001.png 50 | :target: ./auto_examples/combine/plot_comparison_combine.html 51 | :scale: 60 52 | :align: center 53 | 54 | .. topic:: Examples 55 | 56 | * :ref:`sphx_glr_auto_examples_combine_plot_comparison_combine.py` 57 |
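Both classes also accept the underlying samplers as parameters (``smote`` and
``enn`` for :class:`SMOTEENN`; ``smote`` and ``tomek`` for :class:`SMOTETomek`),
so the over-sampling and cleaning steps can be tuned independently. A minimal
sketch, where the sub-sampler settings are purely illustrative::

    from imblearn.combine import SMOTEENN
    from imblearn.over_sampling import SMOTE
    from imblearn.under_sampling import EditedNearestNeighbours

    smote_enn = SMOTEENN(
        # interpolate using 3 nearest neighbours instead of the default 5
        smote=SMOTE(k_neighbors=3, random_state=0),
        # clean using a majority-vote ("mode") selection strategy in ENN
        enn=EditedNearestNeighbours(kind_sel="mode"),
        random_state=0,
    )
    X_resampled, y_resampled = smote_enn.fit_resample(X, y)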
-------------------------------------------------------------------------------- /doc/index.rst: -------------------------------------------------------------------------------- 1 | .. project-template documentation master file, created by 2 | sphinx-quickstart on Mon Jan 18 14:44:12 2016. 3 | You can adapt this file completely to your liking, but it should at least 4 | contain the root `toctree` directive. 5 | 6 | :notoc: 7 | 8 | ############################## 9 | imbalanced-learn documentation 10 | ############################## 11 | 12 | **Date**: |today| **Version**: |version| 13 | 14 | **Useful links**: 15 | `Binary Installers `__ | 16 | `Source Repository `__ | 17 | `Issues & Ideas `__ | 18 | `Q&A Support `__ 19 | 20 | Imbalanced-learn (imported as :mod:`imblearn`) is an open source, MIT-licensed 21 | library that relies on scikit-learn (imported as :mod:`sklearn`) and provides 22 | tools for dealing with classification with imbalanced classes. 23 | 24 | .. grid:: 1 2 2 2 25 | :gutter: 4 26 | :padding: 2 2 0 0 27 | :class-container: sd-text-center 28 | 29 | .. grid-item-card:: Getting started 30 | :img-top: _static/index_getting_started.svg 31 | :class-card: intro-card 32 | :shadow: md 33 | 34 | Check out the getting started guides to install `imbalanced-learn`. 35 | Some extra information to get started with a new contribution is also provided. 36 | 37 | +++ 38 | 39 | .. button-ref:: getting_started 40 | :ref-type: ref 41 | :click-parent: 42 | :color: secondary 43 | :expand: 44 | 45 | To the installation guide 46 | 47 | .. grid-item-card:: User guide 48 | :img-top: _static/index_user_guide.svg 49 | :class-card: intro-card 50 | :shadow: md 51 | 52 | The user guide provides in-depth information on the key concepts of 53 | `imbalanced-learn` with useful background information and explanation. 54 | 55 | +++ 56 | 57 | .. button-ref:: user_guide 58 | :ref-type: ref 59 | :click-parent: 60 | :color: secondary 61 | :expand: 62 | 63 | To the user guide 64 | 65 | .. grid-item-card:: API reference 66 | :img-top: _static/index_api.svg 67 | :class-card: intro-card 68 | :shadow: md 69 | 70 | The reference guide contains a detailed description of 71 | the `imbalanced-learn` API and of the parameters of each method. 72 | 73 | +++ 74 | 75 | .. button-ref:: api 76 | :ref-type: ref 77 | :click-parent: 78 | :color: secondary 79 | :expand: 80 | 81 | To the reference guide 82 | 83 | .. grid-item-card:: Examples 84 | :img-top: _static/index_examples.svg 85 | :class-card: intro-card 86 | :shadow: md 87 | 88 | The gallery of examples is a good place to see `imbalanced-learn` in action. 89 | Select an example and dive in. 90 | 91 | +++ 92 | 93 | .. button-ref:: general_examples 94 | :ref-type: ref 95 | :click-parent: 96 | :color: secondary 97 | :expand: 98 | 99 | To the gallery of examples 100 | 101 | 102 | .. toctree:: 103 | :maxdepth: 3 104 | :hidden: 105 | :titlesonly: 106 | 107 | install 108 | user_guide 109 | references/index 110 | auto_examples/index 111 | whats_new 112 | about 113 |
-------------------------------------------------------------------------------- /doc/install.rst: -------------------------------------------------------------------------------- 1 | .. _getting_started: 2 | 3 | ############### 4 | Getting Started 5 | ############### 6 | 7 | Prerequisites 8 | ============= 9 | 10 | .. |PythonMinVersion| replace:: 3.10 11 | .. |NumPyMinVersion| replace:: 1.24.3 12 | .. |SciPyMinVersion| replace:: 1.10.1 13 | .. |ScikitLearnMinVersion| replace:: 1.3.2 14 | .. |MatplotlibMinVersion| replace:: 3.7.3 15 | .. |PandasMinVersion| replace:: 1.5.3 16 | .. |TensorflowMinVersion| replace:: 2.13.1 17 | .. |KerasMinVersion| replace:: 3.0.5 18 | .. |SeabornMinVersion| replace:: 0.12.2 19 | .. |PytestMinVersion| replace:: 7.2.2 20 | 21 | `imbalanced-learn` requires the following dependencies: 22 | 23 | - Python (>= |PythonMinVersion|) 24 | - NumPy (>= |NumPyMinVersion|) 25 | - SciPy (>= |SciPyMinVersion|) 26 | - Scikit-learn (>= |ScikitLearnMinVersion|) 27 | - Pytest (>= |PytestMinVersion|) 28 | 29 | Additionally, `imbalanced-learn` has the following optional dependencies: 30 | 31 | - Pandas (>= |PandasMinVersion|) for dealing with dataframes 32 | - Tensorflow (>= |TensorflowMinVersion|) for dealing with TensorFlow models 33 | - Keras (>= |KerasMinVersion|) for dealing with Keras models 34 | 35 | The examples require the following additional dependencies: 36 | 37 | - Matplotlib (>= |MatplotlibMinVersion|) 38 | - Seaborn (>= |SeabornMinVersion|) 39 | 40 | Install 41 | ======= 42 | 43 | From PyPI or conda-forge repositories 44 | ------------------------------------- 45 | 46 | imbalanced-learn is currently available on PyPI and you can 47 | install it via `pip`:: 48 | 49 | pip install imbalanced-learn 50 | 51 | The package is also released on conda-forge and you can install 52 | it with `conda` (or `mamba`):: 53 | 54 | conda install -c conda-forge imbalanced-learn 55 | 56 | Intel optimizations via scikit-learn-intelex 57 | -------------------------------------------- 58 | 59 | Imbalanced-learn relies entirely on scikit-learn algorithms. Intel provides an 60 | optimized version of scikit-learn for Intel hardware, called scikit-learn-intelex. 61 | Installing scikit-learn-intelex and patching scikit-learn will activate the 62 | Intel optimizations. 63 | 64 | You can refer to the following 65 | `blog post `_ 66 | for some benchmarks. 67 | 68 | Refer to the following documentation for instructions: 69 | 70 | - `Installation guide `_. 71 | - `Patching guide `_. 72 | 73 | From source available on GitHub 74 | ------------------------------- 75 | 76 | If you prefer, you can clone the repository and install the package from 77 | source. Use the following commands to get a copy from GitHub and install all dependencies:: 78 | 79 | git clone https://github.com/scikit-learn-contrib/imbalanced-learn.git 80 | cd imbalanced-learn 81 | pip install . 82 | 83 | Be aware that you can install in developer mode with:: 84 | 85 | pip install --no-build-isolation --editable . 86 | 87 | If you wish to make pull-requests on GitHub, we advise you to install 88 | pre-commit:: 89 | 90 | pip install pre-commit 91 | pre-commit install 92 | 93 | Test and coverage 94 | ================= 95 | 96 | If you want to test the code before installing it:: 97 | 98 | $ make test 99 | 100 | If you wish to test the coverage of your version:: 101 | 102 | $ make coverage 103 | 104 | You can also use `pytest`:: 105 | 106 | $ pytest imblearn -v 107 | 108 | Contribute 109 | ========== 110 | 111 | You can contribute to this code through pull requests on GitHub_. Please make 112 | sure that your code comes with unit tests to ensure full coverage and a 113 | passing continuous integration. 114 | 115 | .. _GitHub: https://github.com/scikit-learn-contrib/imbalanced-learn/pulls 116 |
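To quickly verify an installation (whichever method was used), you can import
the package and print its version and environment information; a small sanity
check before running the full test suite::

    import imblearn

    print(imblearn.__version__)
    # Print the versions of imbalanced-learn and its main dependencies.
    imblearn.show_versions()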
-------------------------------------------------------------------------------- /doc/introduction.rst: -------------------------------------------------------------------------------- 1 | .. _introduction: 2 | 3 | ============ 4 | Introduction 5 | ============ 6 | 7 | .. _api_imblearn: 8 | 9 | API of the imbalanced-learn samplers 10 | ------------------------------------ 11 | 12 | The available samplers follow the 13 | `scikit-learn API `_ 14 | using the base estimator 15 | and incorporating a sampling functionality via the ``fit_resample`` method: 16 | 17 | :Estimator: 18 | 19 | The base object implements a ``fit`` method to learn from data:: 20 | 21 | estimator = obj.fit(data, targets) 22 | 23 | :Resampler: 24 | 25 | To resample a data set, each sampler implements a ``fit_resample`` method:: 26 | 27 | data_resampled, targets_resampled = obj.fit_resample(data, targets) 28 | 29 | Imbalanced-learn samplers accept the same inputs as scikit-learn estimators: 30 | 31 | * `data`, 2-dimensional array-like structures, such as: 32 | * Python's list of lists :class:`list`, 33 | * Numpy arrays :class:`numpy.ndarray`, 34 | * Pandas dataframes :class:`pandas.DataFrame`, 35 | * Scipy sparse matrices :class:`scipy.sparse.csr_matrix` or :class:`scipy.sparse.csc_matrix`; 36 | 37 | * `targets`, 1-dimensional array-like structures, such as: 38 | * Numpy arrays :class:`numpy.ndarray`, 39 | * Pandas series :class:`pandas.Series`. 40 | 41 | The output will be of the following type: 42 | 43 | * `data_resampled`, 2-dimensional array-like structures, such as: 44 | * Numpy arrays :class:`numpy.ndarray`, 45 | * Pandas dataframes :class:`pandas.DataFrame`, 46 | * Scipy sparse matrices :class:`scipy.sparse.csr_matrix` or :class:`scipy.sparse.csc_matrix`; 47 | 48 | * `targets_resampled`, 1-dimensional array-like structures, such as: 49 | * Numpy arrays :class:`numpy.ndarray`, 50 | * Pandas series :class:`pandas.Series`. 51 | 52 | .. topic:: Pandas in/out 53 | 54 | Unlike scikit-learn, imbalanced-learn provides support for pandas in/out. 55 | Therefore, providing a dataframe as input will also return a dataframe as output. 56 | 57 | .. topic:: Sparse input 58 | 59 | For sparse input the data is **converted to the Compressed Sparse Rows 60 | representation** (see ``scipy.sparse.csr_matrix``) before being fed to the 61 | sampler. To avoid unnecessary memory copies, it is recommended to choose the 62 | CSR representation upstream. 63 |
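A minimal, self-contained sketch of this API, which also illustrates the pandas
in/out behaviour described above (the dataset and column names are purely
illustrative)::

    import pandas as pd
    from sklearn.datasets import make_classification
    from imblearn.under_sampling import RandomUnderSampler

    X, y = make_classification(n_samples=100, weights=[0.9, 0.1], random_state=0)
    X = pd.DataFrame(X, columns=[f"feature_{i}" for i in range(X.shape[1])])
    y = pd.Series(y, name="target")

    sampler = RandomUnderSampler(random_state=0)
    X_resampled, y_resampled = sampler.fit_resample(X, y)

    # The container type is preserved: a dataframe in gives a dataframe out.
    print(type(X_resampled))           # pandas DataFrame
    print(y_resampled.value_counts())  # classes are now equally represented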
64 | .. _problem_statement: 65 | 66 | Problem statement regarding imbalanced data sets 67 | ------------------------------------------------ 68 | 69 | The learning and prediction phases of machine learning algorithms 70 | can be impacted by the issue of **imbalanced datasets**. This imbalance 71 | refers to the difference in the number of samples across different classes. 72 | We demonstrate the effect of training a `Logistic Regression classifier 73 | `_ 74 | with varying levels of class balancing obtained by adjusting the class weights. 75 | 76 | .. image:: ./auto_examples/over-sampling/images/sphx_glr_plot_comparison_over_sampling_001.png 77 | :target: ./auto_examples/over-sampling/plot_comparison_over_sampling.html 78 | :scale: 60 79 | :align: center 80 | 81 | As expected, the decision function of the Logistic Regression classifier varies significantly 82 | depending on how imbalanced the data is. With a greater imbalance ratio, the decision function 83 | tends to favour the class with the larger number of samples, usually referred to as the 84 | **majority class**. 85 |
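The kind of comparison shown above can be sketched in a few lines with
scikit-learn alone; the dataset parameters below are purely illustrative::

    from sklearn.datasets import make_classification
    from sklearn.linear_model import LogisticRegression

    X, y = make_classification(n_samples=1000, weights=[0.95, 0.05], random_state=0)

    # Unweighted fit: the decision function is biased towards the majority class.
    clf = LogisticRegression().fit(X, y)

    # Weighting classes inversely to their frequencies counteracts this bias.
    clf_balanced = LogisticRegression(class_weight="balanced").fit(X, y)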
toctree:: 10 | :maxdepth: 3 11 | 12 | under_sampling 13 | over_sampling 14 | combine 15 | ensemble 16 | keras 17 | tensorflow 18 | miscellaneous 19 | pipeline 20 | metrics 21 | datasets 22 | utils 23 | -------------------------------------------------------------------------------- /doc/references/keras.rst: -------------------------------------------------------------------------------- 1 | .. _keras_ref: 2 | 3 | Batch generator for Keras 4 | ========================= 5 | 6 | .. automodule:: imblearn.keras 7 | :no-members: 8 | :no-inherited-members: 9 | 10 | .. currentmodule:: imblearn 11 | 12 | .. autosummary:: 13 | :toctree: generated/ 14 | :template: class.rst 15 | 16 | keras.BalancedBatchGenerator 17 | 18 | .. autosummary:: 19 | :toctree: generated/ 20 | :template: function.rst 21 | 22 | keras.balanced_batch_generator 23 | -------------------------------------------------------------------------------- /doc/references/metrics.rst: -------------------------------------------------------------------------------- 1 | .. _metrics_ref: 2 | 3 | Metrics 4 | ======= 5 | 6 | .. automodule:: imblearn.metrics 7 | :no-members: 8 | :no-inherited-members: 9 | 10 | Classification metrics 11 | ---------------------- 12 | See the :ref:`metrics` section of the user guide for further details. 13 | 14 | .. currentmodule:: imblearn.metrics 15 | 16 | .. autosummary:: 17 | :toctree: generated/ 18 | :template: function.rst 19 | 20 | classification_report_imbalanced 21 | sensitivity_specificity_support 22 | sensitivity_score 23 | specificity_score 24 | geometric_mean_score 25 | macro_averaged_mean_absolute_error 26 | make_index_balanced_accuracy 27 | 28 | Pairwise metrics 29 | ---------------- 30 | See the :ref:`pairwise_metrics` section of the user guide for further details. 31 | 32 | .. automodule:: imblearn.metrics.pairwise 33 | :no-members: 34 | :no-inherited-members: 35 | 36 | .. currentmodule:: imblearn.metrics.pairwise 37 | 38 | .. autosummary:: 39 | :toctree: generated/ 40 | :template: class.rst 41 | 42 | ValueDifferenceMetric 43 | -------------------------------------------------------------------------------- /doc/references/miscellaneous.rst: -------------------------------------------------------------------------------- 1 | .. _misc_ref: 2 | 3 | Miscellaneous 4 | ============= 5 | 6 | Imbalanced-learn provides some fast-prototyping tools. 7 | 8 | .. currentmodule:: imblearn 9 | 10 | .. autosummary:: 11 | :toctree: generated/ 12 | :template: class.rst 13 | 14 | FunctionSampler 15 | -------------------------------------------------------------------------------- /doc/references/over_sampling.rst: -------------------------------------------------------------------------------- 1 | .. _over_sampling_ref: 2 | 3 | Over-sampling methods 4 | ===================== 5 | 6 | .. automodule:: imblearn.over_sampling 7 | :no-members: 8 | :no-inherited-members: 9 | 10 | .. currentmodule:: imblearn.over_sampling 11 | 12 | Basic over-sampling 13 | ------------------- 14 | 15 | .. autosummary:: 16 | :toctree: generated/ 17 | :template: class.rst 18 | 19 | RandomOverSampler 20 | 21 | SMOTE algorithms 22 | ---------------- 23 | 24 | .. autosummary:: 25 | :toctree: generated/ 26 | :template: class.rst 27 | 28 | SMOTE 29 | SMOTENC 30 | SMOTEN 31 | ADASYN 32 | BorderlineSMOTE 33 | KMeansSMOTE 34 | SVMSMOTE 35 |
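All these samplers expose the common ``fit_resample`` API described in the
introduction. As a minimal sketch (the toy dataset and the choice of
:class:`SMOTE` are illustrative only)::

    from sklearn.datasets import make_classification

    from imblearn.over_sampling import SMOTE

    # toy dataset with a 9:1 class ratio
    X, y = make_classification(n_samples=1_000, weights=[0.9, 0.1], random_state=0)

    # with the default strategy, the minority class is over-sampled until
    # both classes contain the same number of samples
    X_resampled, y_resampled = SMOTE(random_state=0).fit_resample(X, y)
-------------------------------------------------------------------------------- /doc/references/pipeline.rst: -------------------------------------------------------------------------------- 1 | ..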
_pipeline_ref: 2 | 3 | Pipeline 4 | ======== 5 | 6 | .. automodule:: imblearn.pipeline 7 | :no-members: 8 | :no-inherited-members: 9 | 10 | .. currentmodule:: imblearn.pipeline 11 | 12 | .. autosummary:: 13 | :toctree: generated/ 14 | :template: class.rst 15 | 16 | Pipeline 17 | 18 | .. autosummary:: 19 | :toctree: generated/ 20 | :template: function.rst 21 | 22 | make_pipeline 23 | -------------------------------------------------------------------------------- /doc/references/tensorflow.rst: -------------------------------------------------------------------------------- 1 | .. _tensorflow_ref: 2 | 3 | Batch generator for TensorFlow 4 | ============================== 5 | 6 | .. automodule:: imblearn.tensorflow 7 | :no-members: 8 | :no-inherited-members: 9 | 10 | .. currentmodule:: imblearn 11 | 12 | .. autosummary:: 13 | :toctree: generated/ 14 | :template: function.rst 15 | 16 | tensorflow.balanced_batch_generator 17 | -------------------------------------------------------------------------------- /doc/references/under_sampling.rst: -------------------------------------------------------------------------------- 1 | .. _under_sampling_ref: 2 | 3 | Under-sampling methods 4 | ====================== 5 | 6 | .. automodule:: imblearn.under_sampling 7 | :no-members: 8 | :no-inherited-members: 9 | 10 | Prototype generation 11 | -------------------- 12 | 13 | .. automodule:: imblearn.under_sampling._prototype_generation 14 | :no-members: 15 | :no-inherited-members: 16 | 17 | .. currentmodule:: imblearn.under_sampling 18 | 19 | .. autosummary:: 20 | :toctree: generated/ 21 | :template: class.rst 22 | 23 | ClusterCentroids 24 | 25 | Prototype selection 26 | ------------------- 27 | 28 | .. automodule:: imblearn.under_sampling._prototype_selection 29 | :no-members: 30 | :no-inherited-members: 31 | 32 | .. currentmodule:: imblearn.under_sampling 33 | 34 | .. autosummary:: 35 | :toctree: generated/ 36 | :template: class.rst 37 | 38 | CondensedNearestNeighbour 39 | EditedNearestNeighbours 40 | RepeatedEditedNearestNeighbours 41 | AllKNN 42 | InstanceHardnessThreshold 43 | NearMiss 44 | NeighbourhoodCleaningRule 45 | OneSidedSelection 46 | RandomUnderSampler 47 | TomekLinks 48 | -------------------------------------------------------------------------------- /doc/references/utils.rst: -------------------------------------------------------------------------------- 1 | Utilities 2 | ========= 3 | 4 | .. automodule:: imblearn.utils 5 | :no-members: 6 | :no-inherited-members: 7 | 8 | .. currentmodule:: imblearn.utils 9 | 10 | Validation checks used in samplers 11 | ---------------------------------- 12 | 13 | .. autosummary:: 14 | :toctree: generated/ 15 | :template: function.rst 16 | 17 | estimator_checks.parametrize_with_checks 18 | check_neighbors_object 19 | check_sampling_strategy 20 | check_target_type 21 | 22 | Testing compatibility of your own sampler 23 | ----------------------------------------- 24 | 25 | .. automodule:: imblearn.utils.estimator_checks 26 | :no-members: 27 | :no-inherited-members: 28 | 29 | .. currentmodule:: imblearn.utils.estimator_checks 30 | 31 | .. 
autosummary:: 32 | :toctree: generated/ 33 | :template: function.rst 34 | 35 | parametrize_with_checks 36 | -------------------------------------------------------------------------------- /doc/sphinxext/MANIFEST.in: -------------------------------------------------------------------------------- 1 | recursive-include tests *.py 2 | include *.txt 3 | -------------------------------------------------------------------------------- /doc/sphinxext/README.txt: -------------------------------------------------------------------------------- 1 | ===================================== 2 | numpydoc -- Numpy's Sphinx extensions 3 | ===================================== 4 | 5 | Numpy's documentation uses several custom extensions to Sphinx. These 6 | are shipped in this ``numpydoc`` package, in case you want to make use 7 | of them in third-party projects. 8 | 9 | The following extensions are available: 10 | 11 | - ``numpydoc``: support for the Numpy docstring format in Sphinx, and add 12 | the code description directives ``np-function``, ``np-cfunction``, etc. 13 | that support the Numpy docstring syntax. 14 | 15 | - ``numpydoc.traitsdoc``: For gathering documentation about Traits attributes. 16 | 17 | - ``numpydoc.plot_directives``: Adaptation of Matplotlib's ``plot::`` 18 | directive. Note that this implementation may still undergo severe 19 | changes or eventually be deprecated. 20 | 21 | - ``numpydoc.only_directives``: (DEPRECATED) 22 | 23 | - ``numpydoc.autosummary``: (DEPRECATED) An ``autosummary::`` directive. 24 | Available in Sphinx 0.6.2 and (to-be) 1.0 as ``sphinx.ext.autosummary``, 25 | and the Sphinx 1.0 version is recommended over the one included in 26 | Numpydoc. 27 | 28 | 29 | numpydoc 30 | ======== 31 | 32 | Numpydoc inserts a hook into Sphinx's autodoc that converts docstrings 33 | following the Numpy/Scipy format to a form palatable to Sphinx. 34 | 35 | Options 36 | ------- 37 | 38 | The following options can be set in conf.py: 39 | 40 | - numpydoc_use_plots: bool 41 | 42 | Whether to produce ``plot::`` directives for Examples sections that 43 | contain ``import matplotlib``. 44 | 45 | - numpydoc_show_class_members: bool 46 | 47 | Whether to show all members of a class in the Methods and Attributes 48 | sections automatically. 49 | 50 | - numpydoc_edit_link: bool (DEPRECATED -- edit your HTML template instead) 51 | 52 | Whether to insert an edit link after docstrings. 53 | -------------------------------------------------------------------------------- /doc/sphinxext/github_link.py: -------------------------------------------------------------------------------- 1 | import inspect 2 | import os 3 | import subprocess 4 | import sys 5 | from functools import partial 6 | from operator import attrgetter 7 | 8 | REVISION_CMD = "git rev-parse --short HEAD" 9 | 10 | 11 | def _get_git_revision(): 12 | try: 13 | revision = subprocess.check_output(REVISION_CMD.split()).strip() 14 | except (subprocess.CalledProcessError, OSError): 15 | print("Failed to execute git to get revision") 16 | return None 17 | return revision.decode("utf-8") 18 | 19 | 20 | def _linkcode_resolve(domain, info, package, url_fmt, revision): 21 | """Determine a link to online source for a class/method/function 22 | 23 | This is called by sphinx.ext.linkcode 24 | 25 | An example with a long-untouched module that everyone has 26 | >>> _linkcode_resolve('py', {'module': 'tty', 27 | ... 'fullname': 'setraw'}, 28 | ... package='tty', 29 | ... url_fmt='https://hg.python.org/cpython/file/' 30 | ...
'{revision}/Lib/{package}/{path}#L{lineno}', 31 | ... revision='xxxx') 32 | 'https://hg.python.org/cpython/file/xxxx/Lib/tty/tty.py#L18' 33 | """ 34 | 35 | if revision is None: 36 | return 37 | if domain not in ("py", "pyx"): 38 | return 39 | if not info.get("module") or not info.get("fullname"): 40 | return 41 | 42 | class_name = info["fullname"].split(".")[0] 43 | module = __import__(info["module"], fromlist=[class_name]) 44 | obj = attrgetter(info["fullname"])(module) 45 | 46 | # Unwrap the object to get the correct source 47 | # file in case that is wrapped by a decorator 48 | obj = inspect.unwrap(obj) 49 | 50 | try: 51 | fn = inspect.getsourcefile(obj) 52 | except Exception: 53 | fn = None 54 | if not fn: 55 | try: 56 | fn = inspect.getsourcefile(sys.modules[obj.__module__]) 57 | except Exception: 58 | fn = None 59 | if not fn: 60 | return 61 | 62 | fn = os.path.relpath(fn, start=os.path.dirname(__import__(package).__file__)) 63 | try: 64 | lineno = inspect.getsourcelines(obj)[1] 65 | except Exception: 66 | lineno = "" 67 | return url_fmt.format(revision=revision, package=package, path=fn, lineno=lineno) 68 | 69 | 70 | def make_linkcode_resolve(package, url_fmt): 71 | """Returns a linkcode_resolve function for the given URL format 72 | 73 | revision is a git commit reference (hash or name) 74 | 75 | package is the name of the root module of the package 76 | 77 | url_fmt is along the lines of ('https://github.com/USER/PROJECT/' 78 | 'blob/{revision}/{package}/' 79 | '{path}#L{lineno}') 80 | """ 81 | revision = _get_git_revision() 82 | return partial( 83 | _linkcode_resolve, revision=revision, package=package, url_fmt=url_fmt 84 | ) 85 | -------------------------------------------------------------------------------- /doc/user_guide.rst: -------------------------------------------------------------------------------- 1 | .. title:: User guide: contents 2 | 3 | .. _user_guide: 4 | 5 | ========== 6 | User Guide 7 | ========== 8 | 9 | .. Ensure that the references will be alphabetically collected last 10 | .. Check https://github.com/mcmtroffaes/sphinxcontrib-bibtex/issues/113 11 | 12 | .. toctree:: 13 | :numbered: 14 | 15 | introduction.rst 16 | over_sampling.rst 17 | under_sampling.rst 18 | combine.rst 19 | ensemble.rst 20 | miscellaneous.rst 21 | metrics.rst 22 | common_pitfalls.rst 23 | Dataset loading utilities <datasets/index.rst> 24 | developers_utils.rst 25 | zzz_references.rst 26 | -------------------------------------------------------------------------------- /doc/whats_new.rst: -------------------------------------------------------------------------------- 1 | .. currentmodule:: imblearn 2 | 3 | =============== 4 | Release history 5 | =============== 6 | 7 | .. include:: whats_new/v0.14.rst 8 | 9 | .. include:: whats_new/v0.13.rst 10 | 11 | .. include:: whats_new/v0.12.rst 12 | 13 | .. include:: whats_new/v0.11.rst 14 | 15 | .. include:: whats_new/v0.10.rst 16 | 17 | .. include:: whats_new/v0.9.rst 18 | 19 | .. include:: whats_new/v0.8.rst 20 | 21 | .. include:: whats_new/v0.7.rst 22 | 23 | .. include:: whats_new/v0.6.rst 24 | 25 | .. include:: whats_new/v0.5.rst 26 | 27 | .. include:: whats_new/v0.4.rst 28 | 29 | .. include:: whats_new/v0.3.rst 30 | 31 | .. include:: whats_new/v0.2.rst 32 | 33 | .. include:: whats_new/v0.1.rst 34 | -------------------------------------------------------------------------------- /doc/whats_new/0.13.rst: -------------------------------------------------------------------------------- 1 | ..
_changes_0_13: 2 | 3 | Version 0.13.0 4 | ============== 5 | 6 | **December 20, 2024** 7 | 8 | Changelog 9 | --------- 10 | 11 | Bug fixes 12 | ......... 13 | 14 | - Fix `get_metadata_routing` in :class:`~imblearn.pipeline.Pipeline` such that one 15 | can use a sampler with metadata routing. 16 | :pr:`1115` by :user:`Guillaume Lemaitre `. 17 | 18 | Compatibility 19 | ............. 20 | 21 | - Compatibility with scikit-learn 1.6. 22 | :pr:`1109` by :user:`Guillaume Lemaitre `. 23 | 24 | Deprecations 25 | ............ 26 | 27 | - :class:`~imblearn.pipeline.Pipeline` now uses 28 | :func:`~sklearn.utils.check_is_fitted` instead of 29 | :func:`~sklearn.utils.check_fitted` to check if the pipeline is fitted. In 0.15, it 30 | will raise an error instead of a warning. 31 | :pr:`1109` by :user:`Guillaume Lemaitre `. 32 | 33 | - `algorithm` parameter in :class:`~imblearn.ensemble.RUSBoostClassifier` is now 34 | deprecated and will be removed in 0.14. 35 | :pr:`1109` by :user:`Guillaume Lemaitre `. 36 | -------------------------------------------------------------------------------- /doc/whats_new/0.14.rst: -------------------------------------------------------------------------------- 1 | .. _changes_0_14: 2 | 3 | Version 0.14.0 (Under development) 4 | ================================== 5 | 6 | **TBD** 7 | 8 | Changelog 9 | --------- 10 | 11 | Bug fixes 12 | ......... 13 | 14 | Enhancements 15 | ............ 16 | 17 | Compatibility 18 | ............. 19 | 20 | Deprecations 21 | ............ 22 | -------------------------------------------------------------------------------- /doc/whats_new/v0.1.rst: -------------------------------------------------------------------------------- 1 | .. _changes_0_1: 2 | 3 | Version 0.1 4 | =========== 5 | 6 | **December 26, 2016** 7 | 8 | Changelog 9 | --------- 10 | 11 | API 12 | ~~~ 13 | 14 | - First release of the stable API. By :user:`Fernando Nogueira `, 15 | :user:`Guillaume Lemaitre `, :user:`Christos Aridas `, 16 | and :user:`Dayvid Oliveira `. 17 | 18 | New methods 19 | ~~~~~~~~~~~ 20 | 21 | * Under-sampling 22 | 1. Random majority under-sampling with replacement 23 | 2. Extraction of majority-minority Tomek links 24 | 3. Under-sampling with Cluster Centroids 25 | 4. NearMiss-(1 & 2 & 3) 26 | 5. Condensed Nearest Neighbour 27 | 6. One-Sided Selection 28 | 7. Neighbourhood Cleaning Rule 29 | 8. Edited Nearest Neighbours 30 | 9. Instance Hardness Threshold 31 | 10. Repeated Edited Nearest Neighbours 32 | 33 | * Over-sampling 34 | 1. Random minority over-sampling with replacement 35 | 2. SMOTE - Synthetic Minority Over-sampling Technique 36 | 3. bSMOTE(1 & 2) - Borderline SMOTE of types 1 and 2 37 | 4. SVM SMOTE - Support Vectors SMOTE 38 | 5. ADASYN - Adaptive synthetic sampling approach for imbalanced learning 39 | 40 | * Over-sampling followed by under-sampling 41 | 1. SMOTE + Tomek links 42 | 2. SMOTE + ENN 43 | 44 | * Ensemble sampling 45 | 1. EasyEnsemble 46 | 2. BalanceCascade 47 | -------------------------------------------------------------------------------- /doc/whats_new/v0.10.rst: -------------------------------------------------------------------------------- 1 | .. _changes_0_10: 2 | 3 | Version 0.10.1 4 | ============== 5 | 6 | **December 28, 2022** 7 | 8 | Changelog 9 | --------- 10 | 11 | Bug fixes 12 | ......... 13 | 14 | - Fix a regression in over-samplers where the string `minority` was rejected as 15 | an invalid sampling strategy. 16 | :pr:`964` by :user:`Prakhyath Bhandary `.
17 | 18 | Version 0.10.0 19 | ============== 20 | 21 | **December 9, 2022** 22 | 23 | Changelog 24 | --------- 25 | 26 | Bug fixes 27 | ......... 28 | 29 | - Make sure that :class:`~imblearn.utils._docstring.Substitution` is 30 | working with `python -OO`, which replaces `__doc__` by `None`. 31 | :pr:`953` by :user:`Guillaume Lemaitre `. 32 | 33 | Compatibility 34 | ............. 35 | 36 | - Maintenance release to be compatible with scikit-learn >= 1.0.2. 37 | :pr:`946`, :pr:`947`, :pr:`949` by :user:`Guillaume Lemaitre `. 38 | 39 | - Add support for automatic parameter validation as in scikit-learn >= 1.2. 40 | :pr:`955` by :user:`Guillaume Lemaitre `. 41 | 42 | - Add support for `feature_names_in_` as well as `get_feature_names_out` for 43 | all samplers. 44 | :pr:`959` by :user:`Guillaume Lemaitre `. 45 | 46 | Deprecation 47 | ........... 48 | 49 | - The parameter `n_jobs` has been deprecated from the classes 50 | :class:`~imblearn.over_sampling.ADASYN`, 51 | :class:`~imblearn.over_sampling.BorderlineSMOTE`, 52 | :class:`~imblearn.over_sampling.SMOTE`, 53 | :class:`~imblearn.over_sampling.SMOTENC`, 54 | :class:`~imblearn.over_sampling.SMOTEN`, and 55 | :class:`~imblearn.over_sampling.SVMSMOTE`. Instead, pass a nearest neighbors 56 | estimator where `n_jobs` is set. 57 | :pr:`887` by :user:`Guillaume Lemaitre `. 58 | 59 | - The parameter `base_estimator` is deprecated and will be removed in version 60 | 0.12. It impacts the following classes: 61 | :class:`~imblearn.ensemble.BalancedBaggingClassifier`, 62 | :class:`~imblearn.ensemble.EasyEnsembleClassifier`, 63 | :class:`~imblearn.ensemble.RUSBoostClassifier`. 64 | :pr:`946` by :user:`Guillaume Lemaitre `. 65 | 66 | 67 | Enhancements 68 | ............ 69 | 70 | - Add support to accept compatible `NearestNeighbors` objects by only 71 | duck-typing. For instance, it allows accepting cuML instances. 72 | :pr:`858` by :user:`NV-jpt ` and 73 | :user:`Guillaume Lemaitre `. 74 | -------------------------------------------------------------------------------- /doc/whats_new/v0.11.rst: -------------------------------------------------------------------------------- 1 | .. _changes_0_11: 2 | 3 | Version 0.11.0 4 | ============== 5 | 6 | **July 8, 2023** 7 | 8 | Changelog 9 | --------- 10 | 11 | Bug fixes 12 | ......... 13 | 14 | - Fix a bug in :func:`~imblearn.metrics.classification_report_imbalanced` where the 15 | parameter `target_names` was not taken into account when `output_dict=True`. 16 | :pr:`989` by :user:`AYY7 `. 17 | 18 | - :class:`~imblearn.over_sampling.SMOTENC` now handles mixed data types such as 19 | `bool` and `pd.category` by delegating the conversion to a scikit-learn encoder. 20 | :pr:`1002` by :user:`Guillaume Lemaitre `. 21 | 22 | - Handle sparse matrices in :class:`~imblearn.over_sampling.SMOTEN` and raise a warning 23 | since it requires a conversion to dense matrices. 24 | :pr:`1003` by :user:`Guillaume Lemaitre `. 25 | 26 | - Remove spurious warning raised when the minority class gets over-sampled beyond the 27 | number of samples in the majority class. 28 | :pr:`1007` by :user:`Guillaume Lemaitre `. 29 | 30 | Compatibility 31 | ............. 32 | 33 | - Maintenance release to be compatible with scikit-learn >= 1.3.0. 34 | :pr:`999` by :user:`Guillaume Lemaitre `. 35 | 36 | Deprecation 37 | ........... 38 | 39 | - The fitted attribute `ohe_` in :class:`~imblearn.over_sampling.SMOTENC` is deprecated 40 | and will be removed in version 0.13. Use `categorical_encoder_` instead.
41 | :pr:`1000` by :user:`Guillaume Lemaitre `. 42 | 43 | - The default of the parameters `sampling_strategy`, `bootstrap` and 44 | `replacement` will change in 45 | :class:`~imblearn.ensemble.BalancedRandomForestClassifier` to follow the 46 | implementation of the original paper. This change will take effect in 47 | version 0.13. 48 | :pr:`1006` by :user:`Guillaume Lemaitre `. 49 | 50 | Enhancements 51 | ............ 52 | 53 | - :class:`~imblearn.over_sampling.SMOTENC` now accepts a parameter `categorical_encoder` 54 | that allows specifying a :class:`~sklearn.preprocessing.OneHotEncoder` with custom 55 | parameters. 56 | :pr:`1000` by :user:`Guillaume Lemaitre `. 57 | 58 | - :class:`~imblearn.over_sampling.SMOTEN` now accepts a parameter `categorical_encoder` 59 | that allows specifying a :class:`~sklearn.preprocessing.OrdinalEncoder` with custom 60 | parameters. A new fitted parameter `categorical_encoder_` is exposed to access the 61 | fitted encoder. 62 | :pr:`1001` by :user:`Guillaume Lemaitre `. 63 | 64 | - :class:`~imblearn.under_sampling.RandomUnderSampler` and 65 | :class:`~imblearn.over_sampling.RandomOverSampler` (when `shrinkage is not 66 | None`) now accept any data types and will not attempt any data conversion. 67 | :pr:`1004` by :user:`Guillaume Lemaitre `. 68 | 69 | - :class:`~imblearn.over_sampling.SMOTENC` now supports passing an array-like of `str` 70 | when passing the `categorical_features` parameter. 71 | :pr:`1008` by :user:`Guillaume Lemaitre `. 72 | 73 | - :class:`~imblearn.over_sampling.SMOTENC` now supports automatic categorical inference 74 | when `categorical_features` is set to `"auto"`. 75 | :pr:`1009` by :user:`Guillaume Lemaitre `. 76 | -------------------------------------------------------------------------------- /doc/whats_new/v0.3.rst: -------------------------------------------------------------------------------- 1 | .. _changes_0_3: 2 | 3 | Version 0.3 4 | =========== 5 | 6 | **February 22, 2018** 7 | 8 | Changelog 9 | --------- 10 | 11 | Testing 12 | ~~~~~~~ 13 | - Pytest is used instead of nosetests. :issue:`321` by :user:`Joan Massich 14 | `. 15 | 16 | Documentation 17 | ~~~~~~~~~~~~~ 18 | 19 | - Added a User Guide and extended some examples. :issue:`295` by 20 | :user:`Guillaume Lemaitre `. 21 | 22 | Bug fixes 23 | ~~~~~~~~~ 24 | 25 | - Fixed a bug in :func:`utils.check_ratio` such that an error is raised when 26 | the number of samples required is negative. :issue:`312` by :user:`Guillaume 27 | Lemaitre `. 28 | 29 | - Fixed a bug in :class:`under_sampling.NearMiss` version 3. The indices 30 | returned were wrong. :issue:`312` by :user:`Guillaume Lemaitre `. 31 | 32 | - Fixed a bug for :class:`ensemble.BalanceCascade` and :class:`combine.SMOTEENN` 33 | and :class:`SMOTETomek`. :issue:`295` by :user:`Guillaume Lemaitre 34 | `. 35 | 36 | - Fixed a bug for `check_ratio` to be able to pass arguments when `ratio` is a 37 | callable. :issue:`307` by :user:`Guillaume Lemaitre `. 38 | 39 | New features 40 | ~~~~~~~~~~~~ 41 | 42 | - Turn off steps in :class:`pipeline.Pipeline` using the `None` 43 | object. By :user:`Christos Aridas `. 44 | 45 | - Add a fetching function :func:`datasets.fetch_datasets` in order to get some 46 | imbalanced datasets useful for benchmarking. :issue:`249` by :user:`Guillaume 47 | Lemaitre `. 48 | 49 | Enhancement 50 | ~~~~~~~~~~~ 51 | 52 | - All samplers accept sparse matrices, defaulting to the CSR 53 | type. :issue:`316` by :user:`Guillaume Lemaitre `. 54 | 55 | - :func:`datasets.make_imbalance` takes a ratio similarly to other samplers.
It 56 | supports multiclass. :issue:`312` by :user:`Guillaume Lemaitre `. 57 | 58 | - All the unit tests have been factorized and a :func:`utils.check_estimators` 59 | has been derived from scikit-learn. By :user:`Guillaume Lemaitre 60 | `. 61 | 62 | - Script for automatic build of conda packages and uploading. :issue:`242` by 63 | :user:`Guillaume Lemaitre ` 64 | 65 | - Remove seaborn dependence and improve the examples. :issue:`264` by 66 | :user:`Guillaume Lemaitre `. 67 | 68 | - Adapt all classes to multi-class resampling. :issue:`290` by :user:`Guillaume 69 | Lemaitre ` 70 | 71 | API changes summary 72 | ~~~~~~~~~~~~~~~~~~~ 73 | 74 | - `__init__` has been removed from the :class:`base.SamplerMixin` to create a 75 | real mixin class. :issue:`242` by :user:`Guillaume Lemaitre `. 76 | 77 | - Creation of a module :mod:`exceptions` to handle consistent raising of 78 | errors. :issue:`242` by :user:`Guillaume Lemaitre `. 79 | 80 | - Creation of a module ``utils.validation`` to centralize the checking of recurrent 81 | patterns. :issue:`242` by :user:`Guillaume Lemaitre `. 82 | 83 | - Move the under-sampling methods in the ``prototype_selection`` and 84 | ``prototype_generation`` submodules to make a clearer 85 | distinction. :issue:`277` by :user:`Guillaume Lemaitre `. 86 | 87 | - Change ``ratio`` such that it can adapt to multiple class 88 | problems. :issue:`290` by :user:`Guillaume Lemaitre `. 89 | 90 | Deprecation 91 | ~~~~~~~~~~~ 92 | 93 | - Deprecation of the use of ``min_c_`` in 94 | :func:`datasets.make_imbalance`. :issue:`312` by :user:`Guillaume Lemaitre 95 | ` 96 | 97 | - Deprecation of the use of float in :func:`datasets.make_imbalance` for the 98 | ratio parameter. :issue:`290` by :user:`Guillaume Lemaitre `. 99 | 100 | - Deprecate the use of float as ratio in favor of dictionary, string, or 101 | callable. :issue:`290` by :user:`Guillaume Lemaitre `. 102 | -------------------------------------------------------------------------------- /doc/whats_new/v0.5.rst: -------------------------------------------------------------------------------- 1 | .. _changes_0_5: 2 | 3 | Version 0.5.0 4 | ============= 5 | 6 | **June 28, 2019** 7 | 8 | Changelog 9 | --------- 10 | 11 | Changed models 12 | .............. 13 | 14 | The following models or functions might give different results even if the 15 | data ``X`` and ``y`` are the same. 16 | 17 | * :class:`imblearn.ensemble.RUSBoostClassifier` default estimator changed from 18 | :class:`sklearn.tree.DecisionTreeClassifier` with full depth to a decision 19 | stump (i.e., tree with ``max_depth=1``). 20 | 21 | Documentation 22 | ............. 23 | 24 | - Correct the definition of the ratio when using a ``float`` in sampling 25 | strategy for the over-sampling and under-sampling. 26 | :issue:`525` by :user:`Ariel Rossanigo `. 27 | 28 | - Add :class:`imblearn.over_sampling.BorderlineSMOTE` and 29 | :class:`imblearn.over_sampling.SVMSMOTE` in the API documentation. 30 | :issue:`530` by :user:`Guillaume Lemaitre `. 31 | 32 | Enhancement 33 | ........... 34 | 35 | - Add Parallelisation for SMOTEENN and SMOTETomek. 36 | :pr:`547` by :user:`Michael Hsieh `. 37 | 38 | - Add :class:`imblearn.utils._show_versions`. Updated the contribution guide 39 | and issue template showing how to print system and dependency information 40 | from the command line. :pr:`557` by :user:`Alexander L. Hayes `. 41 | 42 | - Add :class:`imblearn.over_sampling.KMeansSMOTE` which is an over-sampler 43 | clustering points before applying SMOTE. 44 | :pr:`435` by :user:`Stephan Heijl `.
45 | 46 | Maintenance 47 | ........... 48 | 49 | - Make it possible to ``import imblearn`` and access submodules. 50 | :pr:`500` by :user:`Guillaume Lemaitre `. 51 | 52 | - Remove support for Python 2, remove deprecation warning from 53 | scikit-learn 0.21. 54 | :pr:`576` by :user:`Guillaume Lemaitre `. 55 | 56 | Bug 57 | ... 58 | 59 | - Fix wrong usage of :class:`keras.layers.BatchNormalization` in 60 | ``porto_seguro_keras_under_sampling.py`` example. The batch normalization 61 | was moved before the activation function and the bias was removed from the 62 | dense layer. 63 | :pr:`531` by :user:`Guillaume Lemaitre `. 64 | 65 | - Fix a bug which converted sparse matrices to the COO format when stacking them in 66 | :class:`imblearn.over_sampling.SMOTENC`. This bug affected only old scipy versions. 67 | :pr:`539` by :user:`Guillaume Lemaitre `. 68 | 69 | - Fix a bug in :class:`imblearn.pipeline.Pipeline` where None could be the final 70 | estimator. 71 | :pr:`554` by :user:`Oliver Rausch `. 72 | 73 | - Fix a bug in :class:`imblearn.over_sampling.SVMSMOTE` and 74 | :class:`imblearn.over_sampling.BorderlineSMOTE` where the default parameter 75 | of ``n_neighbors`` was not set properly. 76 | :pr:`578` by :user:`Guillaume Lemaitre `. 77 | 78 | - Fix a bug by changing the default depth in 79 | :class:`imblearn.ensemble.RUSBoostClassifier` to get a decision stump as a 80 | weak learner as in the original paper. 81 | :pr:`545` by :user:`Christos Aridas `. 82 | 83 | - Allow importing ``keras`` directly from ``tensorflow`` in 84 | :mod:`imblearn.keras`. 85 | :pr:`531` by :user:`Guillaume Lemaitre `. 86 | -------------------------------------------------------------------------------- /doc/whats_new/v0.7.rst: -------------------------------------------------------------------------------- 1 | .. _changes_0_7: 2 | 3 | Version 0.7.0 4 | ============= 5 | 6 | **June 9, 2020** 7 | 8 | Changelog 9 | --------- 10 | 11 | Maintenance 12 | ........... 13 | 14 | - Ensure that :class:`imblearn.pipeline.Pipeline` is working when `memory` 15 | is activated and `joblib==0.11`. 16 | :pr:`687` by :user:`Christos Aridas `. 17 | 18 | - Refactor common test to use the dev tools from `scikit-learn` 0.23. 19 | :pr:`710` by :user:`Guillaume Lemaitre `. 20 | 21 | - Remove `FutureWarning` issued by `scikit-learn` 0.23. 22 | :pr:`710` by :user:`Guillaume Lemaitre `. 23 | 24 | - Impose keyword-only arguments as in `scikit-learn`. 25 | :pr:`721` by :user:`Guillaume Lemaitre `. 26 | 27 | Changed models 28 | .............. 29 | 30 | The following models might give different results due to changes: 31 | 32 | - :class:`imblearn.ensemble.BalancedRandomForestClassifier` 33 | 34 | Bug fixes 35 | ......... 36 | 37 | - Change the default value of `min_samples_leaf` to be consistent with 38 | scikit-learn. 39 | :pr:`711` by :user:`zerolfx `. 40 | 41 | - Fix a bug due to a change in `scikit-learn` 0.23 in 42 | :class:`imblearn.metrics.make_index_balanced_accuracy`. The function was 43 | unusable. 44 | :pr:`710` by :user:`Guillaume Lemaitre `. 45 | 46 | - Raise a proper error message when only numerical or categorical features 47 | are given in :class:`imblearn.over_sampling.SMOTENC`. 48 | :pr:`720` by :user:`Guillaume Lemaitre `. 49 | 50 | - Fix a bug when the median of the standard deviation is null in 51 | :class:`imblearn.over_sampling.SMOTENC`. 52 | :pr:`675` by :user:`bganglia `. 53 | 54 | Enhancements 55 | ............
56 | 57 | - The classifiers implemented in imbalanced-learn, 58 | :class:`imblearn.ensemble.BalancedBaggingClassifier`, 59 | :class:`imblearn.ensemble.BalancedRandomForestClassifier`, 60 | :class:`imblearn.ensemble.EasyEnsembleClassifier`, and 61 | :class:`imblearn.ensemble.RUSBoostClassifier`, accept `sampling_strategy` 62 | with the same keys as in `y` without the need to encode `y` in advance. 63 | :pr:`718` by :user:`Guillaume Lemaitre `. 64 | 65 | - Lazily import the `keras` module when importing `imblearn.keras`. 66 | :pr:`719` by :user:`Guillaume Lemaitre `. 67 | 68 | Deprecation 69 | ........... 70 | 71 | - Deprecation of the parameter `n_jobs` in 72 | :class:`imblearn.under_sampling.ClusterCentroids` since it was used by 73 | :class:`sklearn.cluster.KMeans` which deprecated it. 74 | :pr:`710` by :user:`Guillaume Lemaitre `. 75 | 76 | - Deprecation of passing keyword arguments by position similarly to 77 | `scikit-learn`. 78 | :pr:`721` by :user:`Guillaume Lemaitre `. 79 | -------------------------------------------------------------------------------- /doc/whats_new/v0.8.rst: -------------------------------------------------------------------------------- 1 | .. _changes_0_8: 2 | 3 | Version 0.8.1 4 | ============= 5 | 6 | **September 29, 2021** 7 | 8 | Changelog 9 | --------- 10 | 11 | Maintenance 12 | ........... 13 | 14 | - Make `imbalanced-learn` compatible with `scikit-learn` 1.0. 15 | :pr:`864` by :user:`Guillaume Lemaitre `. 16 | 17 | Version 0.8.0 18 | ============= 19 | 20 | **February 18, 2021** 21 | 22 | Changelog 23 | --------- 24 | 25 | New features 26 | ............ 27 | 28 | - Add the function 29 | :func:`imblearn.metrics.macro_averaged_mean_absolute_error` returning the 30 | average across classes of the MAE. This metric is used in ordinal 31 | classification. 32 | :pr:`780` by :user:`Aurélien Massiot `. 33 | 34 | - Add the class :class:`imblearn.metrics.pairwise.ValueDifferenceMetric` to 35 | compute pairwise distances between samples containing only categorical 36 | values. 37 | :pr:`796` by :user:`Guillaume Lemaitre `. 38 | 39 | - Add the class :class:`imblearn.over_sampling.SMOTEN` to over-sample data 40 | only containing categorical features. 41 | :pr:`802` by :user:`Guillaume Lemaitre `. 42 | 43 | - Add the possibility to pass any type of samplers in 44 | :class:`imblearn.ensemble.BalancedBaggingClassifier` unlocking the 45 | implementation of methods based on resampled bagging. 46 | :pr:`808` by :user:`Guillaume Lemaitre `. 47 | 48 | Enhancements 49 | ............ 50 | 51 | - Add option `output_dict` in 52 | :func:`imblearn.metrics.classification_report_imbalanced` to return a 53 | dictionary instead of a string. 54 | :pr:`770` by :user:`Guillaume Lemaitre `. 55 | 56 | - Added an option to generate smoothed bootstrap in 57 | :class:`imblearn.over_sampling.RandomOverSampler`. It is controlled by the 58 | parameter `shrinkage`. This method is also known as Random Over-Sampling 59 | Examples (ROSE). 60 | :pr:`754` by :user:`Andrea Lorenzon ` and 61 | :user:`Guillaume Lemaitre `. 62 | 63 | Bug fixes 64 | ......... 65 | 66 | - Fix a bug in :class:`imblearn.under_sampling.ClusterCentroids` where 67 | `voting="hard"` could have led to selecting a sample from any class instead of 68 | the targeted class. 69 | :pr:`769` by :user:`Guillaume Lemaitre `. 70 | 71 | - Fix a bug in :class:`imblearn.FunctionSampler` where validation was performed 72 | even with `validate=False` when calling `fit`. 73 | :pr:`790` by :user:`Guillaume Lemaitre `.
74 | 75 | Maintenance 76 | ........... 77 | 78 | - Remove requirements files in favour of adding the packages in the 79 | `extras_require` within the `setup.py` file. 80 | :pr:`816` by :user:`Guillaume Lemaitre `. 81 | 82 | - Change the website template to use `pydata-sphinx-theme`. 83 | :pr:`801` by :user:`Guillaume Lemaitre `. 84 | 85 | Deprecation 86 | ........... 87 | 88 | - The context manager :func:`imblearn.utils.testing.warns` is deprecated in 0.8 89 | and will be removed in 1.0. 90 | :pr:`815` by :user:`Guillaume Lemaitre `. 91 | -------------------------------------------------------------------------------- /doc/whats_new/v0.9.rst: -------------------------------------------------------------------------------- 1 | .. _changes_0_9: 2 | 3 | Version 0.9.1 4 | ============= 5 | 6 | **May 16, 2022** 7 | 8 | Changelog 9 | --------- 10 | 11 | This release provides fixes that make `imbalanced-learn` work with the 12 | latest release (`1.1.0`) of `scikit-learn`. 13 | 14 | Version 0.9.0 15 | ============= 16 | 17 | **January 11, 2022** 18 | 19 | Changelog 20 | --------- 21 | 22 | This release mainly provides fixes that make `imbalanced-learn` work 23 | with the latest release (`1.0.2`) of `scikit-learn`. 24 | -------------------------------------------------------------------------------- /doc/zzz_references.rst: -------------------------------------------------------------------------------- 1 | ========== 2 | References 3 | ========== 4 | 5 | .. bibliography:: bibtex/refs.bib 6 | -------------------------------------------------------------------------------- /examples/README.txt: -------------------------------------------------------------------------------- 1 | .. _general_examples: 2 | 3 | Examples 4 | -------- 5 | 6 | General-purpose and introductory examples for the `imbalanced-learn` toolbox. 7 | -------------------------------------------------------------------------------- /examples/api/README.txt: -------------------------------------------------------------------------------- 1 | .. _api_usage: 2 | 3 | Examples showing imbalanced-learn API usage 4 | ------------------------------------------- 5 | 6 | Examples that show some details regarding the API of imbalanced-learn. 7 | -------------------------------------------------------------------------------- /examples/applications/README.txt: -------------------------------------------------------------------------------- 1 | .. _realword_examples: 2 | 3 | Examples based on real world datasets 4 | ------------------------------------- 5 | 6 | Examples which use real-world datasets. 7 | -------------------------------------------------------------------------------- /examples/applications/plot_multi_class_under_sampling.py: -------------------------------------------------------------------------------- 1 | """ 2 | ============================================= 3 | Multiclass classification with under-sampling 4 | ============================================= 5 | 6 | Some balancing methods allow for balancing datasets with multiple classes. 7 | We provide an example to illustrate the use of those methods, which does 8 | not differ from the binary case.
9 | 10 | """ 11 | 12 | # Authors: Guillaume Lemaitre 13 | # License: MIT 14 | 15 | from collections import Counter 16 | 17 | from sklearn.datasets import load_iris 18 | from sklearn.linear_model import LogisticRegression 19 | from sklearn.model_selection import train_test_split 20 | from sklearn.preprocessing import StandardScaler 21 | 22 | from imblearn.datasets import make_imbalance 23 | from imblearn.metrics import classification_report_imbalanced 24 | from imblearn.pipeline import make_pipeline 25 | from imblearn.under_sampling import NearMiss 26 | 27 | print(__doc__) 28 | 29 | RANDOM_STATE = 42 30 | 31 | # Create a folder to fetch the dataset 32 | iris = load_iris() 33 | X, y = make_imbalance( 34 | iris.data, 35 | iris.target, 36 | sampling_strategy={0: 25, 1: 50, 2: 50}, 37 | random_state=RANDOM_STATE, 38 | ) 39 | 40 | X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=RANDOM_STATE) 41 | 42 | print(f"Training target statistics: {Counter(y_train)}") 43 | print(f"Testing target statistics: {Counter(y_test)}") 44 | 45 | # Create a pipeline 46 | pipeline = make_pipeline(NearMiss(version=2), StandardScaler(), LogisticRegression()) 47 | pipeline.fit(X_train, y_train) 48 | 49 | # Classify and report the results 50 | print(classification_report_imbalanced(y_test, pipeline.predict(X_test))) 51 | -------------------------------------------------------------------------------- /examples/applications/plot_topic_classication.py: -------------------------------------------------------------------------------- 1 | """ 2 | ================================================= 3 | Example of topic classification in text documents 4 | ================================================= 5 | 6 | This example shows how to balance the text data before to train a classifier. 7 | 8 | Note that for this example, the data are slightly imbalanced but it can happen 9 | that for some data sets, the imbalanced ratio is more significant. 10 | """ 11 | 12 | # Authors: Guillaume Lemaitre 13 | # License: MIT 14 | 15 | # %% 16 | print(__doc__) 17 | 18 | # %% [markdown] 19 | # Setting the data set 20 | # -------------------- 21 | # 22 | # We use a part of the 20 newsgroups data set by loading 4 topics. Using the 23 | # scikit-learn loader, the data are split into a training and a testing set. 24 | # 25 | # Note the class \#3 is the minority class and has almost twice less samples 26 | # than the majority class. 27 | 28 | # %% 29 | from sklearn.datasets import fetch_20newsgroups 30 | 31 | categories = [ 32 | "alt.atheism", 33 | "talk.religion.misc", 34 | "comp.graphics", 35 | "sci.space", 36 | ] 37 | newsgroups_train = fetch_20newsgroups(subset="train", categories=categories) 38 | newsgroups_test = fetch_20newsgroups(subset="test", categories=categories) 39 | 40 | X_train = newsgroups_train.data 41 | X_test = newsgroups_test.data 42 | 43 | y_train = newsgroups_train.target 44 | y_test = newsgroups_test.target 45 | 46 | # %% 47 | from collections import Counter 48 | 49 | print(f"Training class distributions summary: {Counter(y_train)}") 50 | print(f"Test class distributions summary: {Counter(y_test)}") 51 | 52 | # %% [markdown] 53 | # The usual scikit-learn pipeline 54 | # ------------------------------- 55 | # 56 | # You might usually use scikit-learn pipeline by combining the TF-IDF 57 | # vectorizer to feed a multinomial naive bayes classifier. A classification 58 | # report summarized the results on the testing set. 
59 | # 60 | # As expected, the recall of the class \#3 is low mainly due to the class 61 | # imbalance. 62 | 63 | # %% 64 | from sklearn.feature_extraction.text import TfidfVectorizer 65 | from sklearn.naive_bayes import MultinomialNB 66 | from sklearn.pipeline import make_pipeline 67 | 68 | model = make_pipeline(TfidfVectorizer(), MultinomialNB()) 69 | model.fit(X_train, y_train) 70 | y_pred = model.predict(X_test) 71 | 72 | # %% 73 | from imblearn.metrics import classification_report_imbalanced 74 | 75 | print(classification_report_imbalanced(y_test, y_pred)) 76 | 77 | # %% [markdown] 78 | # Balancing the class before classification 79 | # ----------------------------------------- 80 | # 81 | # To improve the prediction of the class \#3, it could be interesting to apply 82 | # a balancing before training the naive Bayes classifier. Therefore, we will 83 | # use a :class:`~imblearn.under_sampling.RandomUnderSampler` to equalize the 84 | # number of samples in all the classes before the training. 85 | # 86 | # It is also important to note that we are using the 87 | # :func:`~imblearn.pipeline.make_pipeline` function implemented in 88 | # imbalanced-learn to properly handle the samplers. 89 | 90 | from imblearn.pipeline import make_pipeline as make_pipeline_imb 91 | 92 | # %% 93 | from imblearn.under_sampling import RandomUnderSampler 94 | 95 | model = make_pipeline_imb(TfidfVectorizer(), RandomUnderSampler(), MultinomialNB()) 96 | 97 | model.fit(X_train, y_train) 98 | y_pred = model.predict(X_test) 99 | 100 | # %% [markdown] 101 | # Although the results are almost identical, it can be seen that the resampling 102 | # helped correct the poor recall of the class \#3 at the cost of reducing 103 | # the other metrics for the other classes. However, the overall results are 104 | # slightly better. 105 | 106 | # %% 107 | print(classification_report_imbalanced(y_test, y_pred)) 108 | -------------------------------------------------------------------------------- /examples/combine/README.txt: -------------------------------------------------------------------------------- 1 | .. _combine_examples: 2 | 3 | Examples using combine class methods 4 | ==================================== 5 | 6 | Combine methods mix over- and under-sampling methods. Generally SMOTE is used for over-sampling while some cleaning methods (e.g., ENN and Tomek links) are used to under-sample. 7 | -------------------------------------------------------------------------------- /examples/datasets/README.txt: -------------------------------------------------------------------------------- 1 | .. _dataset_examples: 2 | 3 | Dataset examples 4 | ----------------------- 5 | 6 | Examples concerning the :mod:`imblearn.datasets` module. 7 | -------------------------------------------------------------------------------- /examples/datasets/plot_make_imbalance.py: -------------------------------------------------------------------------------- 1 | """ 2 | ============================ 3 | Create an imbalanced dataset 4 | ============================ 5 | 6 | An illustration of the :func:`~imblearn.datasets.make_imbalance` function to 7 | create an imbalanced dataset from a balanced dataset. We show the ability of 8 | :func:`~imblearn.datasets.make_imbalance` to deal with a Pandas DataFrame.
9 | """ 10 | 11 | # Authors: Dayvid Oliveira 12 | # Christos Aridas 13 | # Guillaume Lemaitre 14 | # License: MIT 15 | 16 | # %% 17 | print(__doc__) 18 | 19 | import seaborn as sns 20 | 21 | sns.set_context("poster") 22 | 23 | # %% [markdown] 24 | # Generate the dataset 25 | # -------------------- 26 | # 27 | # First, we will generate a dataset and convert it to a 28 | # :class:`~pandas.DataFrame` with arbitrary column names. We will plot the 29 | # original dataset. 30 | 31 | # %% 32 | import matplotlib.pyplot as plt 33 | import pandas as pd 34 | from sklearn.datasets import make_moons 35 | 36 | X, y = make_moons(n_samples=200, shuffle=True, noise=0.5, random_state=10) 37 | X = pd.DataFrame(X, columns=["feature 1", "feature 2"]) 38 | ax = X.plot.scatter( 39 | x="feature 1", 40 | y="feature 2", 41 | c=y, 42 | colormap="viridis", 43 | colorbar=False, 44 | ) 45 | sns.despine(ax=ax, offset=10) 46 | plt.tight_layout() 47 | 48 | # %% [markdown] 49 | # Make a dataset imbalanced 50 | # ------------------------- 51 | # 52 | # Now, we will show the helpers :func:`~imblearn.datasets.make_imbalance` 53 | # that is useful to random select a subset of samples. It will impact the 54 | # class distribution as specified by the parameters. 55 | 56 | # %% 57 | from collections import Counter 58 | 59 | 60 | def ratio_func(y, multiplier, minority_class): 61 | target_stats = Counter(y) 62 | return {minority_class: int(multiplier * target_stats[minority_class])} 63 | 64 | 65 | # %% 66 | from imblearn.datasets import make_imbalance 67 | 68 | fig, axs = plt.subplots(nrows=2, ncols=3, figsize=(15, 10)) 69 | 70 | X.plot.scatter( 71 | x="feature 1", 72 | y="feature 2", 73 | c=y, 74 | ax=axs[0, 0], 75 | colormap="viridis", 76 | colorbar=False, 77 | ) 78 | axs[0, 0].set_title("Original set") 79 | sns.despine(ax=axs[0, 0], offset=10) 80 | 81 | multipliers = [0.9, 0.75, 0.5, 0.25, 0.1] 82 | for ax, multiplier in zip(axs.ravel()[1:], multipliers): 83 | X_resampled, y_resampled = make_imbalance( 84 | X, 85 | y, 86 | sampling_strategy=ratio_func, 87 | **{"multiplier": multiplier, "minority_class": 1}, 88 | ) 89 | X_resampled.plot.scatter( 90 | x="feature 1", 91 | y="feature 2", 92 | c=y_resampled, 93 | ax=ax, 94 | colormap="viridis", 95 | colorbar=False, 96 | ) 97 | ax.set_title(f"Sampling ratio = {multiplier}") 98 | sns.despine(ax=ax, offset=10) 99 | 100 | plt.tight_layout() 101 | plt.show() 102 | -------------------------------------------------------------------------------- /examples/ensemble/README.txt: -------------------------------------------------------------------------------- 1 | .. _ensemble_examples: 2 | 3 | Example using ensemble class methods 4 | ==================================== 5 | 6 | Under-sampling methods implies that samples of the majority class are lost during the balancing procedure. 7 | Ensemble methods offer an alternative to use most of the samples. 8 | In fact, an ensemble of balanced sets is created and used to later train any classifier. 9 | -------------------------------------------------------------------------------- /examples/evaluation/README.txt: -------------------------------------------------------------------------------- 1 | .. _evaluation_examples: 2 | 3 | Evaluation examples 4 | ------------------- 5 | 6 | Examples illustrating how classification using imbalanced dataset can be done. 
7 | -------------------------------------------------------------------------------- /examples/evaluation/plot_classification_report.py: -------------------------------------------------------------------------------- 1 | """ 2 | ============================================= 3 | Evaluate classification by compiling a report 4 | ============================================= 5 | 6 | Specific metrics have been developed to evaluate classifiers which have been 7 | trained using imbalanced data. :mod:`imblearn` provides a classification report 8 | similar to :mod:`sklearn`, with additional metrics specific to imbalanced 9 | learning problems. 10 | """ 11 | 12 | # Authors: Guillaume Lemaitre 13 | # License: MIT 14 | 15 | 16 | from sklearn import datasets 17 | from sklearn.linear_model import LogisticRegression 18 | from sklearn.model_selection import train_test_split 19 | from sklearn.preprocessing import StandardScaler 20 | 21 | from imblearn import over_sampling as os 22 | from imblearn import pipeline as pl 23 | from imblearn.metrics import classification_report_imbalanced 24 | 25 | print(__doc__) 26 | 27 | RANDOM_STATE = 42 28 | 29 | # Generate a dataset 30 | X, y = datasets.make_classification( 31 | n_classes=2, 32 | class_sep=2, 33 | weights=[0.1, 0.9], 34 | n_informative=10, 35 | n_redundant=1, 36 | flip_y=0, 37 | n_features=20, 38 | n_clusters_per_class=4, 39 | n_samples=5000, 40 | random_state=RANDOM_STATE, 41 | ) 42 | 43 | pipeline = pl.make_pipeline( 44 | StandardScaler(), 45 | os.SMOTE(random_state=RANDOM_STATE), 46 | LogisticRegression(max_iter=10_000), 47 | ) 48 | 49 | # Split the data 50 | X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=RANDOM_STATE) 51 | 52 | # Train the classifier with balancing 53 | pipeline.fit(X_train, y_train) 54 | 55 | # Test the classifier and get the prediction 56 | y_pred_bal = pipeline.predict(X_test) 57 | 58 | # Show the classification report 59 | print(classification_report_imbalanced(y_test, y_pred_bal)) 60 | -------------------------------------------------------------------------------- /examples/evaluation/plot_metrics.py: -------------------------------------------------------------------------------- 1 | """ 2 | ======================================= 3 | Metrics specific to imbalanced learning 4 | ======================================= 5 | 6 | Specific metrics have been developed to evaluate classifiers which 7 | have been trained using imbalanced data. :mod:`imblearn` provides mainly 8 | two additional metrics which are not implemented in :mod:`sklearn`: (i) 9 | geometric mean and (ii) index balanced accuracy. 10 | """ 11 | 12 | # Authors: Guillaume Lemaitre 13 | # License: MIT 14 | 15 | # %% 16 | print(__doc__) 17 | 18 | RANDOM_STATE = 42 19 | 20 | # %% [markdown] 21 | # First, we will generate an imbalanced dataset. 22 | 23 | # %% 24 | from sklearn.datasets import make_classification 25 | 26 | X, y = make_classification( 27 | n_classes=3, 28 | class_sep=2, 29 | weights=[0.1, 0.9], 30 | n_informative=10, 31 | n_redundant=1, 32 | flip_y=0, 33 | n_features=20, 34 | n_clusters_per_class=4, 35 | n_samples=5000, 36 | random_state=RANDOM_STATE, 37 | ) 38 | 39 | # %% [markdown] 40 | # We will split the data into a training and testing set.
41 | 42 | # %% 43 | from sklearn.model_selection import train_test_split 44 | 45 | X_train, X_test, y_train, y_test = train_test_split( 46 | X, y, stratify=y, random_state=RANDOM_STATE 47 | ) 48 | 49 | # %% [markdown] 50 | # We will create a pipeline made of a :class:`~imblearn.over_sampling.SMOTE` 51 | # over-sampler followed by a :class:`~sklearn.linear_model.LogisticRegression` 52 | # classifier. 53 | 54 | from sklearn.linear_model import LogisticRegression 55 | from sklearn.preprocessing import StandardScaler 56 | 57 | from imblearn.over_sampling import SMOTE 58 | 59 | # %% 60 | from imblearn.pipeline import make_pipeline 61 | 62 | model = make_pipeline( 63 | StandardScaler(), 64 | SMOTE(random_state=RANDOM_STATE), 65 | LogisticRegression(max_iter=10_000, random_state=RANDOM_STATE), 66 | ) 67 | 68 | # %% [markdown] 69 | # Now, we will train the model on the training set and get the prediction 70 | # associated with the testing set. Be aware that the resampling will happen 71 | # only when calling `fit`: the number of samples in `y_pred` is the same as 72 | # in `y_test`. 73 | 74 | # %% 75 | model.fit(X_train, y_train) 76 | y_pred = model.predict(X_test) 77 | 78 | # %% [markdown] 79 | # The geometric mean corresponds to the square root of the product of the 80 | # sensitivity and specificity, i.e. `sqrt(sensitivity * specificity)`. 81 | # Combining the two metrics should account for the balancing of the dataset. 82 | 83 | # %% 84 | from imblearn.metrics import geometric_mean_score 85 | 86 | print(f"The geometric mean is {geometric_mean_score(y_test, y_pred):.3f}") 87 | 88 | # %% [markdown] 89 | # The index balanced accuracy can transform any metric to be used in 90 | # imbalanced learning problems. 91 | 92 | # %% 93 | from imblearn.metrics import make_index_balanced_accuracy 94 | 95 | alpha = 0.1 96 | geo_mean = make_index_balanced_accuracy(alpha=alpha, squared=True)(geometric_mean_score) 97 | 98 | print( 99 | f"The IBA using alpha={alpha} and the geometric mean: " 100 | f"{geo_mean(y_test, y_pred):.3f}" 101 | ) 102 | 103 | # %% 104 | alpha = 0.5 105 | geo_mean = make_index_balanced_accuracy(alpha=alpha, squared=True)(geometric_mean_score) 106 | 107 | print( 108 | f"The IBA using alpha={alpha} and the geometric mean: " 109 | f"{geo_mean(y_test, y_pred):.3f}" 110 | ) 111 | -------------------------------------------------------------------------------- /examples/model_selection/README.txt: -------------------------------------------------------------------------------- 1 | .. _model_selection_examples: 2 | 3 | Model Selection 4 | --------------- 5 | 6 | Examples related to the selection of balancing methods. 7 | -------------------------------------------------------------------------------- /examples/model_selection/plot_validation_curve.py: -------------------------------------------------------------------------------- 1 | """ 2 | ========================== 3 | Plotting Validation Curves 4 | ========================== 5 | 6 | In this example the impact of the :class:`~imblearn.over_sampling.SMOTE`'s 7 | `k_neighbors` parameter is examined. In the plot you can see the validation 8 | scores of a SMOTE-CART classifier for different values of the 9 | :class:`~imblearn.over_sampling.SMOTE`'s `k_neighbors` parameter.
10 | """ 11 | 12 | # Authors: Christos Aridas 13 | # Guillaume Lemaitre 14 | # License: MIT 15 | 16 | # %% 17 | print(__doc__) 18 | 19 | import seaborn as sns 20 | 21 | sns.set_context("poster") 22 | 23 | 24 | RANDOM_STATE = 42 25 | 26 | # %% [markdown] 27 | # Let's first generate a dataset with imbalanced class distribution. 28 | 29 | # %% 30 | from sklearn.datasets import make_classification 31 | 32 | X, y = make_classification( 33 | n_classes=2, 34 | class_sep=2, 35 | weights=[0.1, 0.9], 36 | n_informative=10, 37 | n_redundant=1, 38 | flip_y=0, 39 | n_features=20, 40 | n_clusters_per_class=4, 41 | n_samples=5000, 42 | random_state=RANDOM_STATE, 43 | ) 44 | 45 | # %% [markdown] 46 | # We will use an over-sampler :class:`~imblearn.over_sampling.SMOTE` followed 47 | # by a :class:`~sklearn.tree.DecisionTreeClassifier`. The aim will be to 48 | # search which `k_neighbors` parameter is the most adequate with the dataset 49 | # that we generated. 50 | 51 | from sklearn.tree import DecisionTreeClassifier 52 | 53 | # %% 54 | from imblearn.over_sampling import SMOTE 55 | from imblearn.pipeline import make_pipeline 56 | 57 | model = make_pipeline( 58 | SMOTE(random_state=RANDOM_STATE), DecisionTreeClassifier(random_state=RANDOM_STATE) 59 | ) 60 | 61 | # %% [markdown] 62 | # We can use the :class:`~sklearn.model_selection.validation_curve` to inspect 63 | # the impact of varying the parameter `k_neighbors`. In this case, we need 64 | # to use a score to evaluate the generalization score during the 65 | # cross-validation. 66 | 67 | # %% 68 | from sklearn.metrics import cohen_kappa_score, make_scorer 69 | from sklearn.model_selection import validation_curve 70 | 71 | scorer = make_scorer(cohen_kappa_score) 72 | param_range = range(1, 11) 73 | train_scores, test_scores = validation_curve( 74 | model, 75 | X, 76 | y, 77 | param_name="smote__k_neighbors", 78 | param_range=param_range, 79 | cv=3, 80 | scoring=scorer, 81 | ) 82 | 83 | # %% 84 | train_scores_mean = train_scores.mean(axis=1) 85 | train_scores_std = train_scores.std(axis=1) 86 | test_scores_mean = test_scores.mean(axis=1) 87 | test_scores_std = test_scores.std(axis=1) 88 | 89 | # %% [markdown] 90 | # We can now plot the results of the cross-validation for the different 91 | # parameter values that we tried. 92 | 93 | # %% 94 | import matplotlib.pyplot as plt 95 | 96 | fig, ax = plt.subplots(figsize=(7, 7)) 97 | ax.plot(param_range, test_scores_mean, label="SMOTE") 98 | ax.fill_between( 99 | param_range, 100 | test_scores_mean + test_scores_std, 101 | test_scores_mean - test_scores_std, 102 | alpha=0.2, 103 | ) 104 | idx_max = test_scores_mean.argmax() 105 | ax.scatter( 106 | param_range[idx_max], 107 | test_scores_mean[idx_max], 108 | label=r"Cohen Kappa: ${:.2f}\pm{:.2f}$".format( 109 | test_scores_mean[idx_max], test_scores_std[idx_max] 110 | ), 111 | ) 112 | 113 | fig.suptitle("Validation Curve with SMOTE-CART") 114 | ax.set_xlabel("Number of neighbors") 115 | ax.set_ylabel("Cohen's kappa") 116 | 117 | # make nice plotting 118 | sns.despine(ax=ax, offset=10) 119 | ax.set_xlim([1, 10]) 120 | ax.set_ylim([0.4, 0.8]) 121 | ax.legend(loc="lower right", fontsize=16) 122 | plt.tight_layout() 123 | plt.show() 124 | -------------------------------------------------------------------------------- /examples/over-sampling/README.txt: -------------------------------------------------------------------------------- 1 | .. 
89 | # %% [markdown] 90 | # We can now plot the results of the cross-validation for the different 91 | # parameter values that we tried. 92 | 93 | # %% 94 | import matplotlib.pyplot as plt 95 | 96 | fig, ax = plt.subplots(figsize=(7, 7)) 97 | ax.plot(param_range, test_scores_mean, label="SMOTE") 98 | ax.fill_between( 99 | param_range, 100 | test_scores_mean + test_scores_std, 101 | test_scores_mean - test_scores_std, 102 | alpha=0.2, 103 | ) 104 | idx_max = test_scores_mean.argmax() 105 | ax.scatter( 106 | param_range[idx_max], 107 | test_scores_mean[idx_max], 108 | label=r"Cohen Kappa: ${:.2f}\pm{:.2f}$".format( 109 | test_scores_mean[idx_max], test_scores_std[idx_max] 110 | ), 111 | ) 112 | 113 | fig.suptitle("Validation Curve with SMOTE-CART") 114 | ax.set_xlabel("Number of neighbors") 115 | ax.set_ylabel("Cohen's kappa") 116 | 117 | # make the plot nicer 118 | sns.despine(ax=ax, offset=10) 119 | ax.set_xlim([1, 10]) 120 | ax.set_ylim([0.4, 0.8]) 121 | ax.legend(loc="lower right", fontsize=16) 122 | plt.tight_layout() 123 | plt.show() 124 | -------------------------------------------------------------------------------- /examples/over-sampling/README.txt: -------------------------------------------------------------------------------- 1 | .. _over_sampling_examples: 2 | 3 | Example using over-sampling class methods 4 | ========================================= 5 | 6 | Data balancing can be performed by over-sampling such that new samples are generated in the minority class to reach a given balancing ratio. 7 | -------------------------------------------------------------------------------- /examples/over-sampling/plot_illustration_generation_sample.py: -------------------------------------------------------------------------------- 1 | """ 2 | ============================================ 3 | Sample generator used in SMOTE-like samplers 4 | ============================================ 5 | 6 | This example illustrates how a new sample is generated taking into account the 7 | neighbourhood of this sample. A new sample is generated by randomly selecting 8 | two samples of the same class and interpolating a new point between these 9 | samples. 10 | """ 11 | 12 | # Authors: Guillaume Lemaitre 13 | # License: MIT 14 | # %% 15 | print(__doc__) 16 | 17 | import matplotlib.pyplot as plt 18 | import numpy as np 19 | import seaborn as sns 20 | 21 | sns.set_context("poster") 22 | 23 | rng = np.random.RandomState(18) 24 | 25 | f, ax = plt.subplots(figsize=(8, 8)) 26 | 27 | # generate some data points 28 | y = np.array([3.65284, 3.52623, 3.51468, 3.22199, 3.21]) 29 | z = np.array([0.43, 0.45, 0.6, 0.4, 0.211]) 30 | y_2 = np.array([3.3, 3.6]) 31 | z_2 = np.array([0.58, 0.34]) 32 | 33 | # plot the majority and minority samples 34 | ax.scatter(z, y, label="Minority class", s=100) 35 | ax.scatter(z_2, y_2, label="Majority class", s=100) 36 | 37 | idx = rng.randint(len(y), size=2) 38 | annotation = [r"$x_i$", r"$x_{zi}$"] 39 | 40 | for a, i in zip(annotation, idx): 41 | ax.annotate(a, (z[i], y[i]), xytext=tuple([z[i] + 0.01, y[i] + 0.005]), fontsize=15) 42 | 43 | # draw the circle in which the new sample will be generated 44 | radius = np.sqrt((z[idx[0]] - z[idx[1]]) ** 2 + (y[idx[0]] - y[idx[1]]) ** 2) 45 | circle = plt.Circle((z[idx[0]], y[idx[0]]), radius=radius, alpha=0.2) 46 | ax.add_artist(circle) 47 | 48 | # plot the line on which the sample will be generated 49 | ax.plot(z[idx], y[idx], "--", alpha=0.5) 50 | 51 | # create and plot the new sample 52 | step = rng.uniform() 53 | y_gen = y[idx[0]] + step * (y[idx[1]] - y[idx[0]]) 54 | z_gen = z[idx[0]] + step * (z[idx[1]] - z[idx[0]]) 55 | 56 | ax.scatter(z_gen, y_gen, s=100) 57 | ax.annotate( 58 | r"$x_{new}$", 59 | (z_gen, y_gen), 60 | xytext=tuple([z_gen + 0.01, y_gen + 0.005]), 61 | fontsize=15, 62 | ) 63 | 64 | # make the plot nicer with legend and labels 65 | sns.despine(ax=ax, offset=10) 66 | ax.set_xlim([0.2, 0.7]) 67 | ax.set_ylim([3.2, 3.7]) 68 | plt.xlabel(r"$X_1$") 69 | plt.ylabel(r"$X_2$") 70 | plt.legend() 71 | plt.tight_layout() 72 | plt.show() 73 | -------------------------------------------------------------------------------- /examples/pipeline/README.txt: -------------------------------------------------------------------------------- 1 | .. _pipeline_examples: 2 | 3 | Pipeline examples 4 | ================= 5 | 6 | Example of how to use a pipeline to include under-sampling with `scikit-learn` estimators.
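For instance, a minimal sketch (the sampler and the classifier below are
arbitrary illustrative choices, not a prescribed recipe)::

    from sklearn.linear_model import LogisticRegression

    from imblearn.pipeline import make_pipeline
    from imblearn.under_sampling import RandomUnderSampler

    # the pipeline resamples during ``fit`` only, like any imblearn pipeline
    model = make_pipeline(RandomUnderSampler(random_state=0), LogisticRegression())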
7 | -------------------------------------------------------------------------------- /examples/pipeline/plot_pipeline_classification.py: -------------------------------------------------------------------------------- 1 | """ 2 | ==================================== 3 | Usage of pipeline embedding samplers 4 | ==================================== 5 | 6 | An example of the :class:`~imblearn.pipeline.Pipeline` object (or 7 | :func:`~imblearn.pipeline.make_pipeline` helper function) working with 8 | transformers and resamplers. 9 | """ 10 | 11 | # Authors: Christos Aridas 12 | # Guillaume Lemaitre 13 | # License: MIT 14 | 15 | # %% 16 | print(__doc__) 17 | 18 | # %% [markdown] 19 | # Let's first create an imbalanced dataset and split it into two sets. 20 | 21 | # %% 22 | from sklearn.datasets import make_classification 23 | from sklearn.model_selection import train_test_split 24 | 25 | X, y = make_classification( 26 | n_classes=2, 27 | class_sep=1.25, 28 | weights=[0.3, 0.7], 29 | n_informative=3, 30 | n_redundant=1, 31 | flip_y=0, 32 | n_features=5, 33 | n_clusters_per_class=1, 34 | n_samples=5000, 35 | random_state=10, 36 | ) 37 | 38 | X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, random_state=42) 39 | 40 | # %% [markdown] 41 | # Now, we will create the individual steps that we would like to combine later. 42 | 43 | # %% 44 | from sklearn.decomposition import PCA 45 | from sklearn.neighbors import KNeighborsClassifier 46 | 47 | from imblearn.over_sampling import SMOTE 48 | from imblearn.under_sampling import EditedNearestNeighbours 49 | 50 | pca = PCA(n_components=2) 51 | enn = EditedNearestNeighbours() 52 | smote = SMOTE(random_state=0) 53 | knn = KNeighborsClassifier(n_neighbors=1) 54 | 55 | # %% [markdown] 56 | # Now, we can finally create a pipeline to specify in which order the different 57 | # transformers and samplers should be executed before providing the data to 58 | # the final classifier. 59 | 60 | # %% 61 | from imblearn.pipeline import make_pipeline 62 | 63 | model = make_pipeline(pca, enn, smote, knn) 64 | 65 | # %% [markdown] 66 | # We can now use the pipeline created as a normal classifier where resampling 67 | # will happen when calling `fit` and is disabled when calling `decision_function`, 68 | # `predict_proba`, or `predict`. 69 | 70 | # %% 71 | from sklearn.metrics import classification_report 72 | 73 | model.fit(X_train, y_train) 74 | y_pred = model.predict(X_test) 75 | print(classification_report(y_test, y_pred)) 76 |
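# %% [markdown]
# As a quick sanity check (a small illustrative sketch), the number of
# predictions matches the number of test samples: the resampling only took
# place inside `fit`.

# %%
assert len(y_pred) == len(y_test)
print(f"{len(y_pred)} predictions for {len(y_test)} test samples")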
7 | """ 8 | 9 | # Authors: Guillaume Lemaitre 10 | # License: MIT 11 | 12 | # %% 13 | print(__doc__) 14 | 15 | import matplotlib.pyplot as plt 16 | import seaborn as sns 17 | 18 | sns.set_context("poster") 19 | 20 | # %% [markdown] 21 | # This function allows to make nice plotting 22 | 23 | # %% 24 | 25 | 26 | def make_plot_despine(ax): 27 | sns.despine(ax=ax, offset=10) 28 | ax.set_xlim([0, 3]) 29 | ax.set_ylim([0, 3]) 30 | ax.set_xlabel(r"$X_1$") 31 | ax.set_ylabel(r"$X_2$") 32 | ax.legend(loc="lower right") 33 | 34 | 35 | # %% [markdown] 36 | # We will generate some toy data that illustrates how 37 | # :class:`~imblearn.under_sampling.TomekLinks` is used to clean a dataset. 38 | 39 | # %% 40 | import numpy as np 41 | 42 | rng = np.random.RandomState(18) 43 | 44 | X_minority = np.transpose( 45 | [[1.1, 1.3, 1.15, 0.8, 0.55, 2.1], [1.0, 1.5, 1.7, 2.5, 0.55, 1.9]] 46 | ) 47 | X_majority = np.transpose( 48 | [ 49 | [2.1, 2.12, 2.13, 2.14, 2.2, 2.3, 2.5, 2.45], 50 | [1.5, 2.1, 2.7, 0.9, 1.0, 1.4, 2.4, 2.9], 51 | ] 52 | ) 53 | 54 | # %% [markdown] 55 | # In the figure above, the samples highlighted in green form a Tomek link since 56 | # they are of different classes and are nearest neighbors of each other. 57 | 58 | fig, ax = plt.subplots(figsize=(8, 8)) 59 | ax.scatter( 60 | X_minority[:, 0], 61 | X_minority[:, 1], 62 | label="Minority class", 63 | s=200, 64 | marker="_", 65 | ) 66 | ax.scatter( 67 | X_majority[:, 0], 68 | X_majority[:, 1], 69 | label="Majority class", 70 | s=200, 71 | marker="+", 72 | ) 73 | 74 | # highlight the samples of interest 75 | ax.scatter( 76 | [X_minority[-1, 0], X_majority[1, 0]], 77 | [X_minority[-1, 1], X_majority[1, 1]], 78 | label="Tomek link", 79 | s=200, 80 | alpha=0.3, 81 | ) 82 | make_plot_despine(ax) 83 | fig.suptitle("Illustration of a Tomek link") 84 | fig.tight_layout() 85 | 86 | # %% [markdown] 87 | # We can run the :class:`~imblearn.under_sampling.TomekLinks` sampling to 88 | # remove the corresponding samples. If `sampling_strategy='auto'` only the 89 | # sample from the majority class will be removed. If `sampling_strategy='all'` 90 | # both samples will be removed. 
91 | 92 | # %% 93 | from imblearn.under_sampling import TomekLinks 94 | 95 | fig, axs = plt.subplots(nrows=1, ncols=2, figsize=(16, 8)) 96 | 97 | samplers = { 98 | "Removing only majority samples": TomekLinks(sampling_strategy="auto"), 99 | "Removing all samples": TomekLinks(sampling_strategy="all"), 100 | } 101 | 102 | for ax, (title, sampler) in zip(axs, samplers.items()): 103 | X_res, y_res = sampler.fit_resample( 104 | np.vstack((X_minority, X_majority)), 105 | np.array([0] * X_minority.shape[0] + [1] * X_majority.shape[0]), 106 | ) 107 | ax.scatter( 108 | X_res[y_res == 0][:, 0], 109 | X_res[y_res == 0][:, 1], 110 | label="Minority class", 111 | s=200, 112 | marker="_", 113 | ) 114 | ax.scatter( 115 | X_res[y_res == 1][:, 0], 116 | X_res[y_res == 1][:, 1], 117 | label="Majority class", 118 | s=200, 119 | marker="+", 120 | ) 121 | 122 | # highlight the samples of interest 123 | ax.scatter( 124 | [X_minority[-1, 0], X_majority[1, 0]], 125 | [X_minority[-1, 1], X_majority[1, 1]], 126 | label="Tomek link", 127 | s=200, 128 | alpha=0.3, 129 | ) 130 | 131 | ax.set_title(title) 132 | make_plot_despine(ax) 133 | fig.tight_layout() 134 | 135 | plt.show() 136 | -------------------------------------------------------------------------------- /imblearn/VERSION.txt: -------------------------------------------------------------------------------- 1 | 0.14.dev0 2 | -------------------------------------------------------------------------------- /imblearn/_version.py: -------------------------------------------------------------------------------- 1 | """ 2 | ``imbalanced-learn`` is a set of Python methods to deal with imbalanced 3 | datasets in machine learning and pattern recognition. 4 | """ 5 | # Based on NiLearn package 6 | # License: simplified BSD 7 | 8 | # PEP0440 compatible formatted version, see: 9 | # https://www.python.org/dev/peps/pep-0440/ 10 | # 11 | # Generic release markers: 12 | # X.Y 13 | # X.Y.Z # For bugfix releases 14 | # 15 | # Admissible pre-release markers: 16 | # X.YaN # Alpha release 17 | # X.YbN # Beta release 18 | # X.YrcN # Release Candidate 19 | # X.Y # Final release 20 | # 21 | # Dev branch marker is: 'X.Y.dev' or 'X.Y.devN' where N is an integer. 22 | # 'X.Y.dev0' is the canonical version of 'X.Y.dev' 23 |
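# For illustration only (not part of the library), this PEP 440 ordering can
# be checked with the third-party ``packaging`` module:
#
#   >>> from packaging.version import Version
#   >>> Version("0.14.dev0") < Version("0.14a1") < Version("0.14")
#   True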
4 | """ 5 | 6 | from ._imbalance import make_imbalance 7 | from ._zenodo import fetch_datasets 8 | 9 | __all__ = ["make_imbalance", "fetch_datasets"] 10 | -------------------------------------------------------------------------------- /imblearn/datasets/tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/scikit-learn-contrib/imbalanced-learn/c975dd68f50beb88e1331cdd47780acd4fa95397/imblearn/datasets/tests/__init__.py -------------------------------------------------------------------------------- /imblearn/datasets/tests/test_imbalance.py: -------------------------------------------------------------------------------- 1 | """Test the module easy ensemble.""" 2 | # Authors: Guillaume Lemaitre 3 | # Christos Aridas 4 | # License: MIT 5 | 6 | from collections import Counter 7 | 8 | import numpy as np 9 | import pytest 10 | from sklearn.datasets import load_iris 11 | 12 | from imblearn.datasets import make_imbalance 13 | 14 | 15 | @pytest.fixture 16 | def iris(): 17 | return load_iris(return_X_y=True) 18 | 19 | 20 | @pytest.mark.parametrize( 21 | "sampling_strategy, err_msg", 22 | [ 23 | ({0: -100, 1: 50, 2: 50}, "in a class cannot be negative"), 24 | ({0: 10, 1: 70}, "should be less or equal to the original"), 25 | ], 26 | ) 27 | def test_make_imbalance_error(iris, sampling_strategy, err_msg): 28 | # we are reusing part of utils.check_sampling_strategy, however this is not 29 | # cover in the common tests so we will repeat it here 30 | X, y = iris 31 | with pytest.raises(ValueError, match=err_msg): 32 | make_imbalance(X, y, sampling_strategy=sampling_strategy) 33 | 34 | 35 | def test_make_imbalance_error_single_class(iris): 36 | X, y = iris 37 | y = np.zeros_like(y) 38 | with pytest.raises(ValueError, match="needs to have more than 1 class."): 39 | make_imbalance(X, y, sampling_strategy={0: 10}) 40 | 41 | 42 | @pytest.mark.parametrize( 43 | "sampling_strategy, expected_counts", 44 | [ 45 | ({0: 10, 1: 20, 2: 30}, {0: 10, 1: 20, 2: 30}), 46 | ({0: 10, 1: 20}, {0: 10, 1: 20, 2: 50}), 47 | ], 48 | ) 49 | def test_make_imbalance_dict(iris, sampling_strategy, expected_counts): 50 | X, y = iris 51 | _, y_ = make_imbalance(X, y, sampling_strategy=sampling_strategy) 52 | assert Counter(y_) == expected_counts 53 | 54 | 55 | @pytest.mark.parametrize("as_frame", [True, False], ids=["dataframe", "array"]) 56 | @pytest.mark.parametrize( 57 | "sampling_strategy, expected_counts", 58 | [ 59 | ( 60 | {"setosa": 10, "versicolor": 20, "virginica": 30}, 61 | {"setosa": 10, "versicolor": 20, "virginica": 30}, 62 | ), 63 | ( 64 | {"setosa": 10, "versicolor": 20}, 65 | {"setosa": 10, "versicolor": 20, "virginica": 50}, 66 | ), 67 | ], 68 | ) 69 | def test_make_imbalanced_iris(as_frame, sampling_strategy, expected_counts): 70 | pd = pytest.importorskip("pandas") 71 | iris = load_iris(as_frame=as_frame) 72 | X, y = iris.data, iris.target 73 | y = iris.target_names[iris.target] 74 | if as_frame: 75 | y = pd.Series(iris.target_names[iris.target], name="target") 76 | X_res, y_res = make_imbalance(X, y, sampling_strategy=sampling_strategy) 77 | if as_frame: 78 | assert hasattr(X_res, "loc") 79 | pd.testing.assert_index_equal(X_res.index, y_res.index) 80 | assert Counter(y_res) == expected_counts 81 | -------------------------------------------------------------------------------- /imblearn/datasets/tests/test_zenodo.py: -------------------------------------------------------------------------------- 1 | """Test the datasets loader. 
2 | 3 | Skipped if the datasets are not already downloaded to data_home. 4 | """ 5 | # Authors: Guillaume Lemaitre 6 | # Christos Aridas 7 | # License: MIT 8 | 9 | import pytest 10 | from sklearn.utils._testing import SkipTest 11 | 12 | from imblearn.datasets import fetch_datasets 13 | 14 | DATASET_SHAPE = { 15 | "ecoli": (336, 7), 16 | "optical_digits": (5620, 64), 17 | "satimage": (6435, 36), 18 | "pen_digits": (10992, 16), 19 | "abalone": (4177, 10), 20 | "sick_euthyroid": (3163, 42), 21 | "spectrometer": (531, 93), 22 | "car_eval_34": (1728, 21), 23 | "isolet": (7797, 617), 24 | "us_crime": (1994, 100), 25 | "yeast_ml8": (2417, 103), 26 | "scene": (2407, 294), 27 | "libras_move": (360, 90), 28 | "thyroid_sick": (3772, 52), 29 | "coil_2000": (9822, 85), 30 | "arrhythmia": (452, 278), 31 | "solar_flare_m0": (1389, 32), 32 | "oil": (937, 49), 33 | "car_eval_4": (1728, 21), 34 | "wine_quality": (4898, 11), 35 | "letter_img": (20000, 16), 36 | "yeast_me2": (1484, 8), 37 | "webpage": (34780, 300), 38 | "ozone_level": (2536, 72), 39 | "mammography": (11183, 6), 40 | "protein_homo": (145751, 74), 41 | "abalone_19": (4177, 10), 42 | } 43 | 44 | 45 | def fetch(*args, **kwargs): 46 | return fetch_datasets(*args, download_if_missing=True, **kwargs) 47 | 48 | 49 | @pytest.mark.xfail 50 | def test_fetch(): 51 | try: 52 | datasets1 = fetch(shuffle=True, random_state=42) 53 | except IOError: 54 | raise SkipTest("Zenodo dataset cannot be loaded.") 55 | 56 | datasets2 = fetch(shuffle=True, random_state=37) 57 | 58 | for k in DATASET_SHAPE.keys(): 59 | X1, X2 = datasets1[k].data, datasets2[k].data 60 | assert DATASET_SHAPE[k] == X1.shape 61 | assert X1.shape == X2.shape 62 | 63 | y1, y2 = datasets1[k].target, datasets2[k].target 64 | assert (X1.shape[0],) == y1.shape 65 | assert (X1.shape[0],) == y2.shape 66 | 67 | 68 | def test_fetch_filter(): 69 | try: 70 | datasets1 = fetch(filter_data=tuple([1]), shuffle=True, random_state=42) 71 | except IOError: 72 | raise SkipTest("Zenodo dataset cannot be loaded.") 73 | 74 | datasets2 = fetch(filter_data=tuple(["ecoli"]), shuffle=True, random_state=37) 75 | 76 | X1, X2 = datasets1["ecoli"].data, datasets2["ecoli"].data 77 | assert DATASET_SHAPE["ecoli"] == X1.shape 78 | assert X1.shape == X2.shape 79 | 80 | assert X1.sum() == pytest.approx(X2.sum()) 81 | 82 | y1, y2 = datasets1["ecoli"].target, datasets2["ecoli"].target 83 | assert (X1.shape[0],) == y1.shape 84 | assert (X1.shape[0],) == y2.shape 85 | 86 | 87 | @pytest.mark.parametrize( 88 | "filter_data, err_msg", 89 | [ 90 | (("rnf",), "is not a dataset available"), 91 | ((-1,), "dataset with the ID="), 92 | ((100,), "dataset with the ID="), 93 | ((1.00,), "value in the tuple"), 94 | ], 95 | ) 96 | def test_fetch_error(filter_data, err_msg): 97 | with pytest.raises(ValueError, match=err_msg): 98 | fetch_datasets(filter_data=filter_data) 99 | -------------------------------------------------------------------------------- /imblearn/ensemble/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | The :mod:`imblearn.ensemble` module includes methods generating 3 | under-sampled subsets combined inside an ensemble. 
4 | """ 5 | 6 | from ._bagging import BalancedBaggingClassifier 7 | from ._easy_ensemble import EasyEnsembleClassifier 8 | from ._forest import BalancedRandomForestClassifier 9 | from ._weight_boosting import RUSBoostClassifier 10 | 11 | __all__ = [ 12 | "BalancedBaggingClassifier", 13 | "BalancedRandomForestClassifier", 14 | "EasyEnsembleClassifier", 15 | "RUSBoostClassifier", 16 | ] 17 | -------------------------------------------------------------------------------- /imblearn/ensemble/_common.py: -------------------------------------------------------------------------------- 1 | from numbers import Integral, Real 2 | 3 | from sklearn.tree._criterion import Criterion 4 | from sklearn.utils._param_validation import ( 5 | HasMethods, 6 | Hidden, 7 | Interval, 8 | RealNotInt, 9 | StrOptions, 10 | ) 11 | 12 | 13 | def _estimator_has(attr): 14 | """Check if we can delegate a method to the underlying estimator. 15 | First, we check the first fitted estimator if available, otherwise we 16 | check the estimator attribute. 17 | """ 18 | 19 | def check(self): 20 | if hasattr(self, "estimators_"): 21 | return hasattr(self.estimators_[0], attr) 22 | elif self.estimator is not None: 23 | return hasattr(self.estimator, attr) 24 | else: # TODO(1.4): Remove when the base_estimator deprecation cycle ends 25 | return hasattr(self.base_estimator, attr) 26 | 27 | return check 28 | 29 | 30 | _bagging_parameter_constraints = { 31 | "estimator": [HasMethods(["fit", "predict"]), None], 32 | "n_estimators": [Interval(Integral, 1, None, closed="left")], 33 | "max_samples": [ 34 | Interval(Integral, 1, None, closed="left"), 35 | Interval(RealNotInt, 0, 1, closed="right"), 36 | ], 37 | "max_features": [ 38 | Interval(Integral, 1, None, closed="left"), 39 | Interval(RealNotInt, 0, 1, closed="right"), 40 | ], 41 | "bootstrap": ["boolean"], 42 | "bootstrap_features": ["boolean"], 43 | "oob_score": ["boolean"], 44 | "warm_start": ["boolean"], 45 | "n_jobs": [None, Integral], 46 | "random_state": ["random_state"], 47 | "verbose": ["verbose"], 48 | "base_estimator": [ 49 | HasMethods(["fit", "predict"]), 50 | StrOptions({"deprecated"}), 51 | None, 52 | ], 53 | } 54 | 55 | _adaboost_classifier_parameter_constraints = { 56 | "estimator": [HasMethods(["fit", "predict"]), None], 57 | "n_estimators": [Interval(Integral, 1, None, closed="left")], 58 | "learning_rate": [Interval(Real, 0, None, closed="neither")], 59 | "random_state": ["random_state"], 60 | "base_estimator": [HasMethods(["fit", "predict"]), StrOptions({"deprecated"})], 61 | "algorithm": [StrOptions({"SAMME", "SAMME.R"})], 62 | } 63 | 64 | _random_forest_classifier_parameter_constraints = { 65 | "n_estimators": [Interval(Integral, 1, None, closed="left")], 66 | "bootstrap": ["boolean"], 67 | "oob_score": ["boolean"], 68 | "n_jobs": [Integral, None], 69 | "random_state": ["random_state"], 70 | "verbose": ["verbose"], 71 | "warm_start": ["boolean"], 72 | "criterion": [StrOptions({"gini", "entropy", "log_loss"}), Hidden(Criterion)], 73 | "max_samples": [ 74 | None, 75 | Interval(Real, 0.0, 1.0, closed="right"), 76 | Interval(Integral, 1, None, closed="left"), 77 | ], 78 | "max_depth": [Interval(Integral, 1, None, closed="left"), None], 79 | "min_samples_split": [ 80 | Interval(Integral, 2, None, closed="left"), 81 | Interval(RealNotInt, 0.0, 1.0, closed="right"), 82 | ], 83 | "min_samples_leaf": [ 84 | Interval(Integral, 1, None, closed="left"), 85 | Interval(RealNotInt, 0.0, 1.0, closed="neither"), 86 | ], 87 | "min_weight_fraction_leaf": [Interval(Real, 0.0, 0.5, 
closed="both")], 88 | "max_features": [ 89 | Interval(Integral, 1, None, closed="left"), 90 | Interval(RealNotInt, 0.0, 1.0, closed="right"), 91 | StrOptions({"sqrt", "log2"}), 92 | None, 93 | ], 94 | "max_leaf_nodes": [Interval(Integral, 2, None, closed="left"), None], 95 | "min_impurity_decrease": [Interval(Real, 0.0, None, closed="left")], 96 | "ccp_alpha": [Interval(Real, 0.0, None, closed="left")], 97 | "class_weight": [ 98 | StrOptions({"balanced_subsample", "balanced"}), 99 | dict, 100 | list, 101 | None, 102 | ], 103 | "monotonic_cst": ["array-like", None], 104 | } 105 | -------------------------------------------------------------------------------- /imblearn/ensemble/tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/scikit-learn-contrib/imbalanced-learn/c975dd68f50beb88e1331cdd47780acd4fa95397/imblearn/ensemble/tests/__init__.py -------------------------------------------------------------------------------- /imblearn/ensemble/tests/test_weight_boosting.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pytest 3 | from sklearn.datasets import make_classification 4 | from sklearn.model_selection import train_test_split 5 | from sklearn.utils._testing import assert_array_equal 6 | 7 | from imblearn.ensemble import RUSBoostClassifier 8 | 9 | 10 | @pytest.fixture 11 | def imbalanced_dataset(): 12 | return make_classification( 13 | n_samples=10000, 14 | n_features=3, 15 | n_informative=2, 16 | n_redundant=0, 17 | n_repeated=0, 18 | n_classes=3, 19 | n_clusters_per_class=1, 20 | weights=[0.01, 0.05, 0.94], 21 | class_sep=0.8, 22 | random_state=0, 23 | ) 24 | 25 | 26 | def test_rusboost(imbalanced_dataset): 27 | X, y = imbalanced_dataset 28 | X_train, X_test, y_train, y_test = train_test_split( 29 | X, y, stratify=y, random_state=1 30 | ) 31 | classes = np.unique(y) 32 | 33 | n_estimators = 500 34 | rusboost = RUSBoostClassifier(n_estimators=n_estimators, random_state=0) 35 | rusboost.fit(X_train, y_train) 36 | assert_array_equal(classes, rusboost.classes_) 37 | 38 | # check that we have an ensemble of samplers and estimators with a 39 | # consistent size 40 | assert len(rusboost.estimators_) > 1 41 | assert len(rusboost.estimators_) == len(rusboost.samplers_) 42 | assert len(rusboost.pipelines_) == len(rusboost.samplers_) 43 | 44 | # each sampler in the ensemble should have different random state 45 | assert len({sampler.random_state for sampler in rusboost.samplers_}) == len( 46 | rusboost.samplers_ 47 | ) 48 | # each estimator in the ensemble should have different random state 49 | assert len({est.random_state for est in rusboost.estimators_}) == len( 50 | rusboost.estimators_ 51 | ) 52 | 53 | # check the consistency of the feature importances 54 | assert len(rusboost.feature_importances_) == imbalanced_dataset[0].shape[1] 55 | 56 | # check the consistency of the prediction outpus 57 | y_pred = rusboost.predict_proba(X_test) 58 | assert y_pred.shape[1] == len(classes) 59 | assert rusboost.decision_function(X_test).shape[1] == len(classes) 60 | 61 | score = rusboost.score(X_test, y_test) 62 | assert score > 0.6, f"Failed with score {score}" 63 | 64 | y_pred = rusboost.predict(X_test) 65 | assert y_pred.shape == y_test.shape 66 | 67 | 68 | def test_rusboost_sample_weight(imbalanced_dataset): 69 | X, y = imbalanced_dataset 70 | sample_weight = np.ones_like(y) 71 | rusboost = RUSBoostClassifier(random_state=0) 72 | 73 | # Predictions 
should be the same when sample_weight are all ones 74 | y_pred_sample_weight = rusboost.fit(X, y, sample_weight).predict(X) 75 | y_pred_no_sample_weight = rusboost.fit(X, y).predict(X) 76 | 77 | assert_array_equal(y_pred_sample_weight, y_pred_no_sample_weight) 78 | 79 | rng = np.random.RandomState(42) 80 | sample_weight = rng.rand(y.shape[0]) 81 | y_pred_sample_weight = rusboost.fit(X, y, sample_weight).predict(X) 82 | 83 | with pytest.raises(AssertionError): 84 | assert_array_equal(y_pred_no_sample_weight, y_pred_sample_weight) 85 | 86 | 87 | @pytest.mark.parametrize("algorithm", ["SAMME", "SAMME.R"]) 88 | def test_rusboost_algorithm(imbalanced_dataset, algorithm): 89 | X, y = imbalanced_dataset 90 | 91 | rusboost = RUSBoostClassifier(algorithm=algorithm) 92 | warn_msg = "`algorithm` parameter is deprecated in 0.12 and will be removed" 93 | with pytest.warns(FutureWarning, match=warn_msg): 94 | rusboost.fit(X, y) 95 | -------------------------------------------------------------------------------- /imblearn/exceptions.py: -------------------------------------------------------------------------------- 1 | """ 2 | The :mod:`imblearn.exceptions` module includes all custom warnings and error 3 | classes and functions used across imbalanced-learn. 4 | """ 5 | 6 | # Authors: Guillaume Lemaitre 7 | # License: MIT 8 | 9 | 10 | def raise_isinstance_error(variable_name, possible_type, variable): 11 | """Raise consistent error message for isinstance() function. 12 | 13 | Parameters 14 | ---------- 15 | variable_name : str 16 | The name of the variable. 17 | 18 | possible_type : type 19 | The possible type of the variable. 20 | 21 | variable : object 22 | The variable to check. 23 | 24 | Raises 25 | ------ 26 | ValueError 27 | If the instance is not of the possible type. 28 | """ 29 | raise ValueError( 30 | f"{variable_name} has to be one of {possible_type}. " 31 | f"Got {type(variable)} instead." 32 | ) 33 | -------------------------------------------------------------------------------- /imblearn/keras/__init__.py: -------------------------------------------------------------------------------- 1 | """The :mod:`imblearn.keras` provides utilities to deal with imbalanced dataset 2 | in keras.""" 3 | 4 | from ._generator import BalancedBatchGenerator, balanced_batch_generator 5 | 6 | __all__ = ["BalancedBatchGenerator", "balanced_batch_generator"] 7 | -------------------------------------------------------------------------------- /imblearn/keras/tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/scikit-learn-contrib/imbalanced-learn/c975dd68f50beb88e1331cdd47780acd4fa95397/imblearn/keras/tests/__init__.py -------------------------------------------------------------------------------- /imblearn/metrics/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | The :mod:`imblearn.metrics` module includes score functions, performance 3 | metrics and pairwise metrics and distance computations. 
4 | """ 5 | 6 | from ._classification import ( 7 | classification_report_imbalanced, 8 | geometric_mean_score, 9 | macro_averaged_mean_absolute_error, 10 | make_index_balanced_accuracy, 11 | sensitivity_score, 12 | sensitivity_specificity_support, 13 | specificity_score, 14 | ) 15 | 16 | __all__ = [ 17 | "sensitivity_specificity_support", 18 | "sensitivity_score", 19 | "specificity_score", 20 | "geometric_mean_score", 21 | "make_index_balanced_accuracy", 22 | "classification_report_imbalanced", 23 | "macro_averaged_mean_absolute_error", 24 | ] 25 | -------------------------------------------------------------------------------- /imblearn/metrics/tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/scikit-learn-contrib/imbalanced-learn/c975dd68f50beb88e1331cdd47780acd4fa95397/imblearn/metrics/tests/__init__.py -------------------------------------------------------------------------------- /imblearn/metrics/tests/test_score_objects.py: -------------------------------------------------------------------------------- 1 | """Test for score""" 2 | # Authors: Guillaume Lemaitre 3 | # Christos Aridas 4 | # License: MIT 5 | 6 | import pytest 7 | from sklearn.datasets import make_blobs 8 | from sklearn.linear_model import LogisticRegression 9 | from sklearn.metrics import make_scorer 10 | from sklearn.model_selection import GridSearchCV, train_test_split 11 | 12 | from imblearn.metrics import ( 13 | geometric_mean_score, 14 | make_index_balanced_accuracy, 15 | sensitivity_score, 16 | specificity_score, 17 | ) 18 | 19 | R_TOL = 1e-2 20 | 21 | 22 | @pytest.fixture 23 | def data(): 24 | X, y = make_blobs(random_state=0, centers=2) 25 | return train_test_split(X, y, random_state=0) 26 | 27 | 28 | @pytest.mark.parametrize( 29 | "score, expected_score", 30 | [ 31 | (sensitivity_score, 0.90), 32 | (specificity_score, 0.90), 33 | (geometric_mean_score, 0.90), 34 | (make_index_balanced_accuracy()(geometric_mean_score), 0.82), 35 | ], 36 | ) 37 | @pytest.mark.parametrize("average", ["macro", "weighted", "micro"]) 38 | def test_scorer_common_average(data, score, expected_score, average): 39 | X_train, X_test, y_train, _ = data 40 | 41 | scorer = make_scorer(score, pos_label=None, average=average) 42 | grid = GridSearchCV( 43 | LogisticRegression(), 44 | param_grid={"C": [1, 10]}, 45 | scoring=scorer, 46 | cv=3, 47 | ) 48 | grid.fit(X_train, y_train).predict(X_test) 49 | 50 | assert grid.best_score_ >= expected_score 51 | 52 | 53 | @pytest.mark.parametrize( 54 | "score, average, expected_score", 55 | [ 56 | (sensitivity_score, "binary", 0.94), 57 | (specificity_score, "binary", 0.89), 58 | (geometric_mean_score, "multiclass", 0.90), 59 | ( 60 | make_index_balanced_accuracy()(geometric_mean_score), 61 | "multiclass", 62 | 0.82, 63 | ), 64 | ], 65 | ) 66 | def test_scorer_default_average(data, score, average, expected_score): 67 | X_train, X_test, y_train, _ = data 68 | 69 | scorer = make_scorer(score, pos_label=1, average=average) 70 | grid = GridSearchCV( 71 | LogisticRegression(), 72 | param_grid={"C": [1, 10]}, 73 | scoring=scorer, 74 | cv=3, 75 | ) 76 | grid.fit(X_train, y_train).predict(X_test) 77 | 78 | assert grid.best_score_ >= expected_score 79 | -------------------------------------------------------------------------------- /imblearn/over_sampling/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | The :mod:`imblearn.over_sampling` provides a set of method to 3 | 
perform over-sampling. 4 | """ 5 | 6 | from ._adasyn import ADASYN 7 | from ._random_over_sampler import RandomOverSampler 8 | from ._smote import SMOTE, SMOTEN, SMOTENC, SVMSMOTE, BorderlineSMOTE, KMeansSMOTE 9 | 10 | __all__ = [ 11 | "ADASYN", 12 | "RandomOverSampler", 13 | "KMeansSMOTE", 14 | "SMOTE", 15 | "BorderlineSMOTE", 16 | "SVMSMOTE", 17 | "SMOTENC", 18 | "SMOTEN", 19 | ] 20 | -------------------------------------------------------------------------------- /imblearn/over_sampling/_smote/__init__.py: -------------------------------------------------------------------------------- 1 | from .base import SMOTE, SMOTEN, SMOTENC 2 | from .cluster import KMeansSMOTE 3 | from .filter import SVMSMOTE, BorderlineSMOTE 4 | 5 | __all__ = [ 6 | "SMOTE", 7 | "SMOTEN", 8 | "SMOTENC", 9 | "KMeansSMOTE", 10 | "BorderlineSMOTE", 11 | "SVMSMOTE", 12 | ] 13 | -------------------------------------------------------------------------------- /imblearn/over_sampling/_smote/tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/scikit-learn-contrib/imbalanced-learn/c975dd68f50beb88e1331cdd47780acd4fa95397/imblearn/over_sampling/_smote/tests/__init__.py -------------------------------------------------------------------------------- /imblearn/over_sampling/_smote/tests/test_borderline_smote.py: -------------------------------------------------------------------------------- 1 | from collections import Counter 2 | 3 | import pytest 4 | from sklearn.datasets import make_classification 5 | from sklearn.linear_model import LogisticRegression 6 | from sklearn.utils._testing import assert_allclose, assert_array_equal 7 | 8 | from imblearn.over_sampling import BorderlineSMOTE 9 | 10 | 11 | @pytest.mark.parametrize("kind", ["borderline-1", "borderline-2"]) 12 | def test_borderline_smote_no_in_danger_samples(kind): 13 | """Check that the algorithm behave properly even on a dataset without any sample 14 | in danger. 15 | """ 16 | X, y = make_classification( 17 | n_samples=500, 18 | n_features=2, 19 | n_informative=2, 20 | n_redundant=0, 21 | n_repeated=0, 22 | n_clusters_per_class=1, 23 | n_classes=3, 24 | weights=[0.1, 0.2, 0.7], 25 | class_sep=1.5, 26 | random_state=1, 27 | ) 28 | smote = BorderlineSMOTE(kind=kind, m_neighbors=3, k_neighbors=5, random_state=0) 29 | X_res, y_res = smote.fit_resample(X, y) 30 | 31 | assert_allclose(X, X_res) 32 | assert_allclose(y, y_res) 33 | assert not smote.in_danger_indices 34 | 35 | 36 | def test_borderline_smote_kind(): 37 | """Check the behaviour of the `kind` parameter. 38 | 39 | In short, "borderline-2" generates sample closer to the boundary decision than 40 | "borderline-1". We generate an example where a logistic regression will perform 41 | worse on "borderline-2" than on "borderline-1". 
42 | """ 43 | X, y = make_classification( 44 | n_samples=500, 45 | n_features=2, 46 | n_informative=2, 47 | n_redundant=0, 48 | n_repeated=0, 49 | n_clusters_per_class=1, 50 | n_classes=3, 51 | weights=[0.1, 0.2, 0.7], 52 | class_sep=1.0, 53 | random_state=1, 54 | ) 55 | smote = BorderlineSMOTE( 56 | kind="borderline-1", m_neighbors=9, k_neighbors=5, random_state=0 57 | ) 58 | X_res_borderline_1, y_res_borderline_1 = smote.fit_resample(X, y) 59 | smote.set_params(kind="borderline-2") 60 | X_res_borderline_2, y_res_borderline_2 = smote.fit_resample(X, y) 61 | 62 | score_borderline_1 = ( 63 | LogisticRegression() 64 | .fit(X_res_borderline_1, y_res_borderline_1) 65 | .score(X_res_borderline_1, y_res_borderline_1) 66 | ) 67 | score_borderline_2 = ( 68 | LogisticRegression() 69 | .fit(X_res_borderline_2, y_res_borderline_2) 70 | .score(X_res_borderline_2, y_res_borderline_2) 71 | ) 72 | assert score_borderline_1 > score_borderline_2 73 | 74 | 75 | def test_borderline_smote_in_danger(): 76 | X, y = make_classification( 77 | n_samples=500, 78 | n_features=2, 79 | n_informative=2, 80 | n_redundant=0, 81 | n_repeated=0, 82 | n_clusters_per_class=1, 83 | n_classes=3, 84 | weights=[0.1, 0.2, 0.7], 85 | class_sep=0.8, 86 | random_state=1, 87 | ) 88 | smote = BorderlineSMOTE( 89 | kind="borderline-1", 90 | m_neighbors=9, 91 | k_neighbors=5, 92 | random_state=0, 93 | ) 94 | _, y_res_1 = smote.fit_resample(X, y) 95 | in_danger_indices_borderline_1 = smote.in_danger_indices 96 | smote.set_params(kind="borderline-2") 97 | _, y_res_2 = smote.fit_resample(X, y) 98 | in_danger_indices_borderline_2 = smote.in_danger_indices 99 | 100 | for key1, key2 in zip( 101 | in_danger_indices_borderline_1, in_danger_indices_borderline_2 102 | ): 103 | assert_array_equal( 104 | in_danger_indices_borderline_1[key1], in_danger_indices_borderline_2[key2] 105 | ) 106 | assert len(in_danger_indices_borderline_1) == len(in_danger_indices_borderline_2) 107 | counter = Counter(y_res_1) 108 | assert counter[0] == counter[1] == counter[2] 109 | counter = Counter(y_res_2) 110 | assert counter[0] == counter[1] == counter[2] 111 | -------------------------------------------------------------------------------- /imblearn/over_sampling/_smote/tests/test_kmeans_smote.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pytest 3 | from sklearn.cluster import KMeans, MiniBatchKMeans 4 | from sklearn.datasets import make_classification 5 | from sklearn.neighbors import NearestNeighbors 6 | from sklearn.utils._testing import assert_allclose, assert_array_equal 7 | 8 | from imblearn.over_sampling import SMOTE, KMeansSMOTE 9 | 10 | 11 | @pytest.fixture 12 | def data(): 13 | X = np.array( 14 | [ 15 | [0.11622591, -0.0317206], 16 | [0.77481731, 0.60935141], 17 | [1.25192108, -0.22367336], 18 | [0.53366841, -0.30312976], 19 | [1.52091956, -0.49283504], 20 | [-0.28162401, -2.10400981], 21 | [0.83680821, 1.72827342], 22 | [0.3084254, 0.33299982], 23 | [0.70472253, -0.73309052], 24 | [0.28893132, -0.38761769], 25 | [1.15514042, 0.0129463], 26 | [0.88407872, 0.35454207], 27 | [1.31301027, -0.92648734], 28 | [-1.11515198, -0.93689695], 29 | [-0.18410027, -0.45194484], 30 | [0.9281014, 0.53085498], 31 | [-0.14374509, 0.27370049], 32 | [-0.41635887, -0.38299653], 33 | [0.08711622, 0.93259929], 34 | [1.70580611, -0.11219234], 35 | ] 36 | ) 37 | y = np.array([0, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0]) 38 | return X, y 39 | 40 | 41 | @pytest.mark.filterwarnings("ignore:The 
default value of `n_init` will change") 42 | def test_kmeans_smote(data): 43 | X, y = data 44 | kmeans_smote = KMeansSMOTE( 45 | kmeans_estimator=1, 46 | random_state=42, 47 | cluster_balance_threshold=0.0, 48 | k_neighbors=5, 49 | ) 50 | smote = SMOTE(random_state=42) 51 | 52 | X_res_1, y_res_1 = kmeans_smote.fit_resample(X, y) 53 | X_res_2, y_res_2 = smote.fit_resample(X, y) 54 | 55 | assert_allclose(X_res_1, X_res_2) 56 | assert_array_equal(y_res_1, y_res_2) 57 | 58 | assert kmeans_smote.nn_k_.n_neighbors == 6 59 | assert kmeans_smote.kmeans_estimator_.n_clusters == 1 60 | assert "batch_size" in kmeans_smote.kmeans_estimator_.get_params() 61 | 62 | 63 | @pytest.mark.filterwarnings("ignore:The default value of `n_init` will change") 64 | @pytest.mark.parametrize("k_neighbors", [2, NearestNeighbors(n_neighbors=3)]) 65 | @pytest.mark.parametrize( 66 | "kmeans_estimator", 67 | [ 68 | 3, 69 | KMeans(n_clusters=3, n_init=1, random_state=42), 70 | MiniBatchKMeans(n_clusters=3, n_init=1, random_state=42), 71 | ], 72 | ) 73 | def test_sample_kmeans_custom(data, k_neighbors, kmeans_estimator): 74 | X, y = data 75 | kmeans_smote = KMeansSMOTE( 76 | random_state=42, 77 | kmeans_estimator=kmeans_estimator, 78 | k_neighbors=k_neighbors, 79 | ) 80 | X_resampled, y_resampled = kmeans_smote.fit_resample(X, y) 81 | assert X_resampled.shape == (24, 2) 82 | assert y_resampled.shape == (24,) 83 | 84 | assert kmeans_smote.nn_k_.n_neighbors == 3 85 | assert kmeans_smote.kmeans_estimator_.n_clusters == 3 86 | 87 | 88 | @pytest.mark.filterwarnings("ignore:The default value of `n_init` will change") 89 | def test_sample_kmeans_not_enough_clusters(data): 90 | X, y = data 91 | smote = KMeansSMOTE(cluster_balance_threshold=10, random_state=42) 92 | with pytest.raises(RuntimeError): 93 | smote.fit_resample(X, y) 94 | 95 | 96 | @pytest.mark.parametrize("density_exponent", ["auto", 10]) 97 | @pytest.mark.parametrize("cluster_balance_threshold", ["auto", 0.1]) 98 | def test_sample_kmeans_density_estimation(density_exponent, cluster_balance_threshold): 99 | X, y = make_classification( 100 | n_samples=10_000, n_classes=2, weights=[0.3, 0.7], random_state=42 101 | ) 102 | smote = KMeansSMOTE( 103 | kmeans_estimator=MiniBatchKMeans(n_init=1, random_state=42), 104 | random_state=0, 105 | density_exponent=density_exponent, 106 | cluster_balance_threshold=cluster_balance_threshold, 107 | ) 108 | smote.fit_resample(X, y) 109 | -------------------------------------------------------------------------------- /imblearn/over_sampling/_smote/tests/test_smoten.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pytest 3 | from sklearn.exceptions import DataConversionWarning 4 | from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder 5 | from sklearn.utils._testing import _convert_container 6 | 7 | from imblearn.over_sampling import SMOTEN 8 | 9 | 10 | @pytest.fixture 11 | def data(): 12 | rng = np.random.RandomState(0) 13 | 14 | feature_1 = ["A"] * 10 + ["B"] * 20 + ["C"] * 30 15 | feature_2 = ["A"] * 40 + ["B"] * 20 16 | feature_3 = ["A"] * 20 + ["B"] * 20 + ["C"] * 10 + ["D"] * 10 17 | X = np.array([feature_1, feature_2, feature_3], dtype=object).T 18 | rng.shuffle(X) 19 | y = np.array([0] * 20 + [1] * 40, dtype=np.int32) 20 | y_labels = np.array(["not apple", "apple"], dtype=object) 21 | y = y_labels[y] 22 | return X, y 23 | 24 | 25 | def test_smoten(data): 26 | # overall check for SMOTEN 27 | X, y = data 28 | sampler = SMOTEN(random_state=0) 29 | X_res, 
y_res = sampler.fit_resample(X, y) 30 | 31 | assert X_res.shape == (80, 3) 32 | assert y_res.shape == (80,) 33 | assert isinstance(sampler.categorical_encoder_, OrdinalEncoder) 34 | 35 | 36 | def test_smoten_resampling(): 37 | # check if the SMOTEN resample data as expected 38 | # we generate data such that "not apple" will be the minority class and 39 | # samples from this class will be generated. We will force the "blue" 40 | # category to be associated with this class. Therefore, the new generated 41 | # samples should as well be from the "blue" category. 42 | X = np.array(["green"] * 5 + ["red"] * 10 + ["blue"] * 7, dtype=object).reshape( 43 | -1, 1 44 | ) 45 | y = np.array( 46 | ["apple"] * 5 47 | + ["not apple"] * 3 48 | + ["apple"] * 7 49 | + ["not apple"] * 5 50 | + ["apple"] * 2, 51 | dtype=object, 52 | ) 53 | sampler = SMOTEN(random_state=0) 54 | X_res, y_res = sampler.fit_resample(X, y) 55 | 56 | X_generated, y_generated = X_res[X.shape[0] :], y_res[X.shape[0] :] 57 | np.testing.assert_array_equal(X_generated, "blue") 58 | np.testing.assert_array_equal(y_generated, "not apple") 59 | 60 | 61 | @pytest.mark.parametrize("sparse_format", ["sparse_csr", "sparse_csc"]) 62 | def test_smoten_sparse_input(data, sparse_format): 63 | """Check that we handle sparse input in SMOTEN even if it is not efficient. 64 | 65 | Non-regression test for: 66 | https://github.com/scikit-learn-contrib/imbalanced-learn/issues/971 67 | """ 68 | X, y = data 69 | X = OneHotEncoder().fit_transform(X).toarray() 70 | X = _convert_container(X, sparse_format) 71 | 72 | with pytest.warns(DataConversionWarning, match="is not really efficient"): 73 | X_res, y_res = SMOTEN(random_state=0).fit_resample(X, y) 74 | 75 | assert X_res.format == X.format 76 | assert X_res.shape[0] == len(y_res) 77 | 78 | 79 | def test_smoten_categorical_encoder(data): 80 | """Check that `categorical_encoder` is used when provided.""" 81 | 82 | X, y = data 83 | sampler = SMOTEN(random_state=0) 84 | sampler.fit_resample(X, y) 85 | 86 | assert isinstance(sampler.categorical_encoder_, OrdinalEncoder) 87 | assert sampler.categorical_encoder_.dtype == np.int32 88 | 89 | encoder = OrdinalEncoder(dtype=np.int64) 90 | sampler.set_params(categorical_encoder=encoder).fit_resample(X, y) 91 | 92 | assert isinstance(sampler.categorical_encoder_, OrdinalEncoder) 93 | assert sampler.categorical_encoder is encoder 94 | assert sampler.categorical_encoder_ is not encoder 95 | assert sampler.categorical_encoder_.dtype == np.int64 96 | -------------------------------------------------------------------------------- /imblearn/over_sampling/_smote/tests/test_svm_smote.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pytest 3 | from sklearn.datasets import make_classification 4 | from sklearn.linear_model import LogisticRegression 5 | from sklearn.neighbors import NearestNeighbors 6 | from sklearn.svm import SVC 7 | from sklearn.utils._testing import assert_allclose, assert_array_equal 8 | 9 | from imblearn.over_sampling import SVMSMOTE 10 | 11 | 12 | @pytest.fixture 13 | def data(): 14 | X = np.array( 15 | [ 16 | [0.11622591, -0.0317206], 17 | [0.77481731, 0.60935141], 18 | [1.25192108, -0.22367336], 19 | [0.53366841, -0.30312976], 20 | [1.52091956, -0.49283504], 21 | [-0.28162401, -2.10400981], 22 | [0.83680821, 1.72827342], 23 | [0.3084254, 0.33299982], 24 | [0.70472253, -0.73309052], 25 | [0.28893132, -0.38761769], 26 | [1.15514042, 0.0129463], 27 | [0.88407872, 0.35454207], 28 | 
[1.31301027, -0.92648734], 29 | [-1.11515198, -0.93689695], 30 | [-0.18410027, -0.45194484], 31 | [0.9281014, 0.53085498], 32 | [-0.14374509, 0.27370049], 33 | [-0.41635887, -0.38299653], 34 | [0.08711622, 0.93259929], 35 | [1.70580611, -0.11219234], 36 | ] 37 | ) 38 | y = np.array([0, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0]) 39 | return X, y 40 | 41 | 42 | def test_svm_smote(data): 43 | svm_smote = SVMSMOTE(random_state=42) 44 | svm_smote_nn = SVMSMOTE( 45 | random_state=42, 46 | k_neighbors=NearestNeighbors(n_neighbors=6), 47 | m_neighbors=NearestNeighbors(n_neighbors=11), 48 | svm_estimator=SVC(gamma="scale", random_state=42), 49 | ) 50 | 51 | X_res_1, y_res_1 = svm_smote.fit_resample(*data) 52 | X_res_2, y_res_2 = svm_smote_nn.fit_resample(*data) 53 | 54 | assert_allclose(X_res_1, X_res_2) 55 | assert_array_equal(y_res_1, y_res_2) 56 | 57 | 58 | def test_svm_smote_not_svm(data): 59 | """Check that we raise a proper error if passing an estimator that does not 60 | expose a `support_` fitted attribute.""" 61 | 62 | err_msg = "`svm_estimator` is required to exposed a `support_` fitted attribute." 63 | with pytest.raises(RuntimeError, match=err_msg): 64 | SVMSMOTE(svm_estimator=LogisticRegression()).fit_resample(*data) 65 | 66 | 67 | def test_svm_smote_all_noise(data): 68 | """Check that we raise a proper error message when all support vectors are 69 | detected as noise and there is nothing that we can do. 70 | 71 | Non-regression test for: 72 | https://github.com/scikit-learn-contrib/imbalanced-learn/issues/742 73 | """ 74 | X, y = make_classification( 75 | n_classes=3, 76 | class_sep=0.001, 77 | weights=[0.004, 0.451, 0.545], 78 | n_informative=3, 79 | n_redundant=0, 80 | flip_y=0, 81 | n_features=3, 82 | n_clusters_per_class=2, 83 | n_samples=1000, 84 | random_state=10, 85 | ) 86 | 87 | with pytest.raises(ValueError, match="SVM-SMOTE is not adapted to your dataset"): 88 | SVMSMOTE(k_neighbors=4, random_state=42).fit_resample(X, y) 89 | -------------------------------------------------------------------------------- /imblearn/over_sampling/base.py: -------------------------------------------------------------------------------- 1 | """ 2 | Base class for the over-sampling method. 3 | """ 4 | # Authors: Guillaume Lemaitre 5 | # Christos Aridas 6 | # License: MIT 7 | 8 | import numbers 9 | from collections.abc import Mapping 10 | 11 | from sklearn.utils._param_validation import Interval, StrOptions 12 | 13 | from ..base import BaseSampler 14 | 15 | 16 | class BaseOverSampler(BaseSampler): 17 | """Base class for over-sampling algorithms. 18 | 19 | Warning: This class should not be used directly. Use the derive classes 20 | instead. 21 | """ 22 | 23 | _sampling_type = "over-sampling" 24 | 25 | _sampling_strategy_docstring = ( 26 | """sampling_strategy : float, str, dict or callable, default='auto' 27 | Sampling information to resample the data set. 28 | 29 | - When ``float``, it corresponds to the desired ratio of the number of 30 | samples in the minority class over the number of samples in the 31 | majority class after resampling. Therefore, the ratio is expressed as 32 | :math:`\\alpha_{os} = N_{rm} / N_{M}` where :math:`N_{rm}` is the 33 | number of samples in the minority class after resampling and 34 | :math:`N_{M}` is the number of samples in the majority class. 35 | 36 | .. warning:: 37 | ``float`` is only available for **binary** classification. An 38 | error is raised for multi-class classification. 
39 | 40 | - When ``str``, specify the class targeted by the resampling. The 41 | number of samples in the different classes will be equalized. 42 | Possible choices are: 43 | 44 | ``'minority'``: resample only the minority class; 45 | 46 | ``'not minority'``: resample all classes but the minority class; 47 | 48 | ``'not majority'``: resample all classes but the majority class; 49 | 50 | ``'all'``: resample all classes; 51 | 52 | ``'auto'``: equivalent to ``'not majority'``. 53 | 54 | - When ``dict``, the keys correspond to the targeted classes. The 55 | values correspond to the desired number of samples for each targeted 56 | class. 57 | 58 | - When callable, function taking ``y`` and returns a ``dict``. The keys 59 | correspond to the targeted classes. The values correspond to the 60 | desired number of samples for each class. 61 | """.strip() 62 | ) # noqa: E501 63 | 64 | _parameter_constraints: dict = { 65 | "sampling_strategy": [ 66 | Interval(numbers.Real, 0, 1, closed="right"), 67 | StrOptions({"auto", "minority", "not minority", "not majority", "all"}), 68 | Mapping, 69 | callable, 70 | ], 71 | "random_state": ["random_state"], 72 | } 73 | -------------------------------------------------------------------------------- /imblearn/over_sampling/tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/scikit-learn-contrib/imbalanced-learn/c975dd68f50beb88e1331cdd47780acd4fa95397/imblearn/over_sampling/tests/__init__.py -------------------------------------------------------------------------------- /imblearn/over_sampling/tests/test_common.py: -------------------------------------------------------------------------------- 1 | from collections import Counter 2 | 3 | import numpy as np 4 | import pytest 5 | from sklearn.cluster import MiniBatchKMeans 6 | 7 | from imblearn.over_sampling import ( 8 | ADASYN, 9 | SMOTE, 10 | SMOTEN, 11 | SMOTENC, 12 | SVMSMOTE, 13 | BorderlineSMOTE, 14 | KMeansSMOTE, 15 | ) 16 | from imblearn.utils.testing import _CustomNearestNeighbors 17 | 18 | 19 | @pytest.fixture 20 | def numerical_data(): 21 | rng = np.random.RandomState(0) 22 | X = rng.randn(100, 2) 23 | y = np.repeat([0, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0], 5) 24 | 25 | return X, y 26 | 27 | 28 | @pytest.fixture 29 | def categorical_data(): 30 | rng = np.random.RandomState(0) 31 | 32 | feature_1 = ["A"] * 10 + ["B"] * 20 + ["C"] * 30 33 | feature_2 = ["A"] * 40 + ["B"] * 20 34 | feature_3 = ["A"] * 20 + ["B"] * 20 + ["C"] * 10 + ["D"] * 10 35 | X = np.array([feature_1, feature_2, feature_3], dtype=object).T 36 | rng.shuffle(X) 37 | y = np.array([0] * 20 + [1] * 40, dtype=np.int32) 38 | y_labels = np.array(["not apple", "apple"], dtype=object) 39 | y = y_labels[y] 40 | return X, y 41 | 42 | 43 | @pytest.fixture 44 | def heterogeneous_data(): 45 | rng = np.random.RandomState(42) 46 | X = np.empty((30, 4), dtype=object) 47 | X[:, :2] = rng.randn(30, 2) 48 | X[:, 2] = rng.choice(["a", "b", "c"], size=30).astype(object) 49 | X[:, 3] = rng.randint(3, size=30) 50 | y = np.array([0] * 10 + [1] * 20) 51 | return X, y, [2, 3] 52 | 53 | 54 | @pytest.mark.parametrize( 55 | "smote", [BorderlineSMOTE(), SVMSMOTE()], ids=["borderline", "svm"] 56 | ) 57 | def test_smote_m_neighbors(numerical_data, smote): 58 | # check that m_neighbors is properly set. 
Regression test for: 59 | # https://github.com/scikit-learn-contrib/imbalanced-learn/issues/568 60 | X, y = numerical_data 61 | _ = smote.fit_resample(X, y) 62 | assert smote.nn_k_.n_neighbors == 6 63 | assert smote.nn_m_.n_neighbors == 11 64 | 65 | 66 | @pytest.mark.parametrize( 67 | "smote, neighbor_estimator_name", 68 | [ 69 | (ADASYN(random_state=0), "n_neighbors"), 70 | (BorderlineSMOTE(random_state=0), "k_neighbors"), 71 | ( 72 | KMeansSMOTE( 73 | kmeans_estimator=MiniBatchKMeans(n_init=1, random_state=0), 74 | random_state=1, 75 | ), 76 | "k_neighbors", 77 | ), 78 | (SMOTE(random_state=0), "k_neighbors"), 79 | (SVMSMOTE(random_state=0), "k_neighbors"), 80 | ], 81 | ids=["adasyn", "borderline", "kmeans", "smote", "svm"], 82 | ) 83 | def test_numerical_smote_custom_nn(numerical_data, smote, neighbor_estimator_name): 84 | X, y = numerical_data 85 | params = { 86 | neighbor_estimator_name: _CustomNearestNeighbors(n_neighbors=5), 87 | } 88 | smote.set_params(**params) 89 | X_res, _ = smote.fit_resample(X, y) 90 | 91 | assert X_res.shape[0] >= 120 92 | 93 | 94 | def test_categorical_smote_k_custom_nn(categorical_data): 95 | X, y = categorical_data 96 | smote = SMOTEN(k_neighbors=_CustomNearestNeighbors(n_neighbors=5)) 97 | X_res, y_res = smote.fit_resample(X, y) 98 | 99 | assert X_res.shape == (80, 3) 100 | assert Counter(y_res) == {"apple": 40, "not apple": 40} 101 | 102 | 103 | def test_heterogeneous_smote_k_custom_nn(heterogeneous_data): 104 | X, y, categorical_features = heterogeneous_data 105 | smote = SMOTENC( 106 | categorical_features, k_neighbors=_CustomNearestNeighbors(n_neighbors=5) 107 | ) 108 | X_res, y_res = smote.fit_resample(X, y) 109 | 110 | assert X_res.shape == (40, 4) 111 | assert Counter(y_res) == {0: 20, 1: 20} 112 | 113 | 114 | @pytest.mark.parametrize( 115 | "smote", 116 | [BorderlineSMOTE(random_state=0), SVMSMOTE(random_state=0)], 117 | ids=["borderline", "svm"], 118 | ) 119 | def test_numerical_smote_extra_custom_nn(numerical_data, smote): 120 | X, y = numerical_data 121 | smote.set_params(m_neighbors=_CustomNearestNeighbors(n_neighbors=5)) 122 | X_res, y_res = smote.fit_resample(X, y) 123 | 124 | assert X_res.shape == (120, 2) 125 | assert Counter(y_res) == {0: 60, 1: 60} 126 | -------------------------------------------------------------------------------- /imblearn/tensorflow/__init__.py: -------------------------------------------------------------------------------- 1 | """The :mod:`imblearn.tensorflow` provides utilities to deal with imbalanced 2 | dataset in tensorflow.""" 3 | 4 | from ._generator import balanced_batch_generator 5 | 6 | __all__ = ["balanced_batch_generator"] 7 | -------------------------------------------------------------------------------- /imblearn/tensorflow/_generator.py: -------------------------------------------------------------------------------- 1 | """Implement generators for ``tensorflow`` which will balance the data.""" 2 | 3 | from scipy.sparse import issparse 4 | from sklearn.base import clone 5 | from sklearn.utils import _safe_indexing, check_random_state 6 | 7 | from ..under_sampling import RandomUnderSampler 8 | from ..utils import Substitution 9 | from ..utils._docstring import _random_state_docstring 10 | 11 | 12 | @Substitution(random_state=_random_state_docstring) 13 | def balanced_batch_generator( 14 | X, 15 | y, 16 | *, 17 | sample_weight=None, 18 | sampler=None, 19 | batch_size=32, 20 | keep_sparse=False, 21 | random_state=None, 22 | ): 23 | """Create a balanced batch generator to train tensorflow model. 
24 | 25 | Returns a generator --- as well as the number of steps per epoch --- to 26 | iterate over to get the mini-batches. The sampler defines the sampling strategy 27 | used to balance the dataset ahead of creating the batch. The sampler should 28 | have an attribute ``sample_indices_``. 29 | 30 | .. versionadded:: 0.4 31 | 32 | Parameters 33 | ---------- 34 | X : ndarray of shape (n_samples, n_features) 35 | Original imbalanced dataset. 36 | 37 | y : ndarray of shape (n_samples,) or (n_samples, n_classes) 38 | Associated targets. 39 | 40 | sample_weight : ndarray of shape (n_samples,), default=None 41 | Sample weight. 42 | 43 | sampler : sampler object, default=None 44 | A sampler instance which has an attribute ``sample_indices_``. 45 | By default, the sampler used is a 46 | :class:`~imblearn.under_sampling.RandomUnderSampler`. 47 | 48 | batch_size : int, default=32 49 | Number of samples per gradient update. 50 | 51 | keep_sparse : bool, default=False 52 | Whether or not to conserve the sparsity of the input ``X``. By 53 | default, the returned batches will be dense. 54 | 55 | {random_state} 56 | 57 | Returns 58 | ------- 59 | generator : generator of tuple 60 | Generates batches of data. The tuples generated are either (X_batch, 61 | y_batch) or (X_batch, y_batch, sample_weight_batch). 62 | 63 | steps_per_epoch : int 64 | The number of batches (steps) per epoch.
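    Examples
    --------
    A minimal usage sketch (illustrative only, on random data):

    >>> import numpy as np
    >>> from imblearn.tensorflow import balanced_batch_generator
    >>> X, y = np.random.randn(100, 2), np.array([0] * 90 + [1] * 10)
    >>> training_generator, steps_per_epoch = balanced_batch_generator(
    ...     X, y, batch_size=10, random_state=42)
    >>> steps_per_epoch  # 20 balanced samples split into batches of 10
    2
    >>> X_batch, y_batch = next(training_generator)
    >>> X_batch.shape
    (10, 2)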
-------------------------------------------------------------------------------- /imblearn/tensorflow/tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/scikit-learn-contrib/imbalanced-learn/c975dd68f50beb88e1331cdd47780acd4fa95397/imblearn/tensorflow/tests/__init__.py -------------------------------------------------------------------------------- /imblearn/tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/scikit-learn-contrib/imbalanced-learn/c975dd68f50beb88e1331cdd47780acd4fa95397/imblearn/tests/__init__.py -------------------------------------------------------------------------------- /imblearn/tests/test_base.py: -------------------------------------------------------------------------------- 1 | """Test for miscellaneous samplers objects.""" 2 | 3 | # Authors: Guillaume Lemaitre 4 | # License: MIT 5 | 6 | import numpy as np 7 | import pytest 8 | from scipy import sparse 9 | from sklearn.datasets import load_iris, make_regression 10 | from sklearn.linear_model import LinearRegression 11 | from sklearn.utils import _safe_indexing 12 | from sklearn.utils._testing import assert_allclose_dense_sparse, assert_array_equal 13 | from sklearn.utils.multiclass import type_of_target 14 | 15 | from imblearn import FunctionSampler 16 | from imblearn.datasets import make_imbalance 17 | from imblearn.pipeline import make_pipeline 18 | from imblearn.under_sampling import RandomUnderSampler 19 | 20 | iris = load_iris() 21 | X, y = make_imbalance( 22 | iris.data, iris.target, sampling_strategy={0: 10, 1: 25}, random_state=0 23 | ) 24 | 25 | 26 | def test_function_sampler_reject_sparse(): 27 | X_sparse = sparse.csr_matrix(X) 28 | sampler = FunctionSampler(accept_sparse=False) 29 | err_msg = "dense data is required" 30 | with pytest.raises( 31 | TypeError, 32 | match=err_msg, 33 | ): 34 | sampler.fit_resample(X_sparse, y) 35 | 36 | 37 | @pytest.mark.parametrize( 38 | "X, y", [(X, y), (sparse.csr_matrix(X), y), (sparse.csc_matrix(X), y)] 39 | ) 40 | def test_function_sampler_identity(X, y): 41 | sampler = FunctionSampler() 42 | X_res, y_res = sampler.fit_resample(X, y) 43 | assert_allclose_dense_sparse(X_res, X) 44 | assert_array_equal(y_res, y) 45 | 46 | 47 | @pytest.mark.parametrize( 48 | "X, y", [(X, y), (sparse.csr_matrix(X), y), (sparse.csc_matrix(X), y)] 49 | ) 50 | def test_function_sampler_func(X, y): 51 | def func(X, y): 52 | return X[:10], y[:10] 53 | 54 | sampler = FunctionSampler(func=func) 55 | X_res, y_res = sampler.fit_resample(X, y) 56 | assert_allclose_dense_sparse(X_res, X[:10]) 57 | assert_array_equal(y_res, y[:10]) 58 | 59 | 60 | @pytest.mark.parametrize( 61 | "X, y", [(X, y), (sparse.csr_matrix(X), y), (sparse.csc_matrix(X), y)] 62 | ) 63 | def test_function_sampler_func_kwargs(X, y): 64 | def func(X, y, sampling_strategy, random_state): 65 | rus = RandomUnderSampler( 66 | sampling_strategy=sampling_strategy, random_state=random_state 67 | ) 68 | return rus.fit_resample(X, y) 69 | 70 | sampler = FunctionSampler( 71 | func=func, kw_args={"sampling_strategy": "auto", "random_state": 0} 72 | ) 73 | X_res, y_res = sampler.fit_resample(X, y) 74 | X_res_2, y_res_2 = RandomUnderSampler(random_state=0).fit_resample(X, y) 75 | assert_allclose_dense_sparse(X_res, X_res_2) 76 | assert_array_equal(y_res, y_res_2) 77 | 78 | 79 | def test_function_sampler_validate(): 80 | # check that we can pass a regression target by turning off the 81 | # validation 82 | X, y = make_regression() 83 | 84 | def dummy_sampler(X, y): 85 | indices = np.random.choice(np.arange(X.shape[0]), size=100) 86 | return _safe_indexing(X, indices), _safe_indexing(y, indices) 87 | 88 | sampler = FunctionSampler(func=dummy_sampler, validate=False) 89 | pipeline = make_pipeline(sampler, LinearRegression()) 90 | y_pred = pipeline.fit(X, y).predict(X) 91 | 92 | assert type_of_target(y_pred) == "continuous" 93 | 94 | 95 | def test_function_resampler_fit(): 96 | # Check that the validation is bypassed when calling `fit` 97 | # Non-regression test for: 98 | # https://github.com/scikit-learn-contrib/imbalanced-learn/issues/782 99 | X = np.array([[1, np.nan], [2, 3], [np.inf, 4]]) 100 | y = np.array([0, 1, 1]) 101 | 102 | def func(X, y): 103 | return X[:1], y[:1] 104 | 105 | sampler = FunctionSampler(func=func, validate=False) 106 | sampler.fit(X, y) 107 | sampler.fit_resample(X, y) 108 |
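# Illustrative sketch (not part of the test module): ``FunctionSampler`` turns
# any resampling callable into a sampler that can sit inside an imblearn
# pipeline; the toy ``halve_majority`` resampler below is hypothetical.
import numpy as np
from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression

from imblearn import FunctionSampler
from imblearn.pipeline import make_pipeline


def halve_majority(X, y):
    # keep every other sample of the majority class and all remaining samples
    majority = np.bincount(y).argmax()
    keep = np.ones(len(y), dtype=bool)
    keep[np.flatnonzero(y == majority)[::2]] = False
    return X[keep], y[keep]


X_toy, y_toy = make_classification(weights=[0.8, 0.2], random_state=0)
model = make_pipeline(FunctionSampler(func=halve_majority), LogisticRegression())
model.fit(X_toy, y_toy)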
-------------------------------------------------------------------------------- /imblearn/tests/test_common.py: -------------------------------------------------------------------------------- 1 | """Common tests""" 2 | 3 | # Authors: Guillaume Lemaitre 4 | # Christos Aridas 5 | # License: MIT 6 | 7 | import warnings 8 | from collections import OrderedDict 9 | 10 | import numpy as np 11 | import pytest 12 | from sklearn.exceptions import ConvergenceWarning 13 | from sklearn.utils._testing import ignore_warnings 14 | 15 | from imblearn.over_sampling import RandomOverSampler 16 | from imblearn.under_sampling import RandomUnderSampler 17 | from imblearn.utils._sklearn_compat import ( 18 | parametrize_with_checks as parametrize_with_checks_sklearn, 19 | ) 20 | from imblearn.utils._test_common.instance_generator import ( 21 | _get_check_estimator_ids, 22 | _get_expected_failed_checks, 23 | _tested_estimators, 24 | ) 25 | from imblearn.utils.estimator_checks import ( 26 | _set_checking_parameters, 27 | check_dataframe_column_names_consistency, 28 | check_param_validation, 29 | parametrize_with_checks, 30 | ) 31 | from imblearn.utils.testing import all_estimators 32 | 33 | 34 | @pytest.mark.parametrize("name, Estimator", all_estimators()) 35 | def test_all_estimator_no_base_class(name, Estimator): 36 | # test that all_estimators doesn't find abstract classes. 37 | msg = f"Base estimators such as {name} should not be included in all_estimators" 38 | assert not name.lower().startswith("base"), msg 39 | 40 | 41 | @parametrize_with_checks_sklearn( 42 | list(_tested_estimators()), expected_failed_checks=_get_expected_failed_checks 43 | ) 44 | def test_estimators_compatibility_sklearn(estimator, check, request): 45 | _set_checking_parameters(estimator) 46 | check(estimator) 47 | 48 | 49 | @parametrize_with_checks( 50 | list(_tested_estimators()), expected_failed_checks=_get_expected_failed_checks 51 | ) 52 | def test_estimators_imblearn(estimator, check, request): 53 | # Common tests for estimator instances 54 | with ignore_warnings( 55 | category=( 56 | FutureWarning, 57 | ConvergenceWarning, 58 | UserWarning, 59 | FutureWarning, 60 | ) 61 | ): 62 | _set_checking_parameters(estimator) 63 | check(estimator) 64 | 65 | 66 | @pytest.mark.parametrize( 67 | "estimator", _tested_estimators(), ids=_get_check_estimator_ids 68 | ) 69 | def test_check_param_validation(estimator): 70 | name = estimator.__class__.__name__ 71 | _set_checking_parameters(estimator) 72 | check_param_validation(name, estimator) 73 | 74 | 75 | @pytest.mark.parametrize("Sampler", [RandomOverSampler, RandomUnderSampler]) 76 | def test_strategy_as_ordered_dict(Sampler): 77 | """Check that it is possible to pass an `OrderedDict` as strategy.""" 78 | rng = np.random.RandomState(42) 79 | X, y = rng.randn(30, 2), np.array([0] * 10 + [1] * 20) 80 | sampler = Sampler(random_state=42) 81 | if isinstance(sampler, RandomOverSampler): 82 | strategy = OrderedDict({0: 20, 1: 20}) 83 | else: 84 | strategy = OrderedDict({0: 10, 1: 10}) 85 | sampler.set_params(sampling_strategy=strategy) 86 | X_res, y_res = sampler.fit_resample(X, y) 87 | assert X_res.shape[0] == sum(strategy.values()) 88 | assert y_res.shape[0] == sum(strategy.values()) 89 | 90 | 91 | @pytest.mark.parametrize( 92 | "estimator", _tested_estimators(), ids=_get_check_estimator_ids 93 | ) 94 | def test_pandas_column_name_consistency(estimator): 95 | _set_checking_parameters(estimator) 96 | with ignore_warnings(category=(FutureWarning)): 97 | with warnings.catch_warnings(record=True) as 
record: 98 | check_dataframe_column_names_consistency( 99 | estimator.__class__.__name__, estimator 100 | ) 101 | for warning in record: 102 | assert "was fitted without feature names" not in str(warning.message) 103 | -------------------------------------------------------------------------------- /imblearn/tests/test_exceptions.py: -------------------------------------------------------------------------------- 1 | """Test for the exceptions modules""" 2 | # Authors: Guillaume Lemaitre 3 | # Christos Aridas 4 | # License: MIT 5 | 6 | from pytest import raises 7 | 8 | from imblearn.exceptions import raise_isinstance_error 9 | 10 | 11 | def test_raise_isinstance_error(): 12 | var = 10.0 13 | with raises(ValueError, match="has to be one of"): 14 | raise_isinstance_error("var", [int], var) 15 | -------------------------------------------------------------------------------- /imblearn/under_sampling/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | The :mod:`imblearn.under_sampling` provides methods to under-sample 3 | a dataset. 4 | """ 5 | 6 | from ._prototype_generation import ClusterCentroids 7 | from ._prototype_selection import ( 8 | AllKNN, 9 | CondensedNearestNeighbour, 10 | EditedNearestNeighbours, 11 | InstanceHardnessThreshold, 12 | NearMiss, 13 | NeighbourhoodCleaningRule, 14 | OneSidedSelection, 15 | RandomUnderSampler, 16 | RepeatedEditedNearestNeighbours, 17 | TomekLinks, 18 | ) 19 | 20 | __all__ = [ 21 | "ClusterCentroids", 22 | "RandomUnderSampler", 23 | "InstanceHardnessThreshold", 24 | "NearMiss", 25 | "TomekLinks", 26 | "EditedNearestNeighbours", 27 | "RepeatedEditedNearestNeighbours", 28 | "AllKNN", 29 | "OneSidedSelection", 30 | "CondensedNearestNeighbour", 31 | "NeighbourhoodCleaningRule", 32 | ] 33 | -------------------------------------------------------------------------------- /imblearn/under_sampling/_prototype_generation/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | The :mod:`imblearn.under_sampling.prototype_generation` submodule contains 3 | methods that generate new samples in order to balance the dataset. 4 | """ 5 | 6 | from ._cluster_centroids import ClusterCentroids 7 | 8 | __all__ = ["ClusterCentroids"] 9 | -------------------------------------------------------------------------------- /imblearn/under_sampling/_prototype_generation/tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/scikit-learn-contrib/imbalanced-learn/c975dd68f50beb88e1331cdd47780acd4fa95397/imblearn/under_sampling/_prototype_generation/tests/__init__.py -------------------------------------------------------------------------------- /imblearn/under_sampling/_prototype_selection/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | The :mod:`imblearn.under_sampling.prototype_selection` submodule contains 3 | methods that select samples in order to balance the dataset. 
4 | """ 5 | 6 | from ._condensed_nearest_neighbour import CondensedNearestNeighbour 7 | from ._edited_nearest_neighbours import ( 8 | AllKNN, 9 | EditedNearestNeighbours, 10 | RepeatedEditedNearestNeighbours, 11 | ) 12 | from ._instance_hardness_threshold import InstanceHardnessThreshold 13 | from ._nearmiss import NearMiss 14 | from ._neighbourhood_cleaning_rule import NeighbourhoodCleaningRule 15 | from ._one_sided_selection import OneSidedSelection 16 | from ._random_under_sampler import RandomUnderSampler 17 | from ._tomek_links import TomekLinks 18 | 19 | __all__ = [ 20 | "RandomUnderSampler", 21 | "InstanceHardnessThreshold", 22 | "NearMiss", 23 | "TomekLinks", 24 | "EditedNearestNeighbours", 25 | "RepeatedEditedNearestNeighbours", 26 | "AllKNN", 27 | "OneSidedSelection", 28 | "CondensedNearestNeighbour", 29 | "NeighbourhoodCleaningRule", 30 | ] 31 | -------------------------------------------------------------------------------- /imblearn/under_sampling/_prototype_selection/tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/scikit-learn-contrib/imbalanced-learn/c975dd68f50beb88e1331cdd47780acd4fa95397/imblearn/under_sampling/_prototype_selection/tests/__init__.py -------------------------------------------------------------------------------- /imblearn/under_sampling/_prototype_selection/tests/test_instance_hardness_threshold.py: -------------------------------------------------------------------------------- 1 | """Test the module .""" 2 | # Authors: Guillaume Lemaitre 3 | # Christos Aridas 4 | # License: MIT 5 | 6 | import numpy as np 7 | from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier 8 | from sklearn.naive_bayes import GaussianNB as NB 9 | from sklearn.pipeline import make_pipeline 10 | from sklearn.utils._testing import assert_array_equal 11 | 12 | from imblearn.under_sampling import InstanceHardnessThreshold 13 | 14 | RND_SEED = 0 15 | X = np.array( 16 | [ 17 | [-0.3879569, 0.6894251], 18 | [-0.09322739, 1.28177189], 19 | [-0.77740357, 0.74097941], 20 | [0.91542919, -0.65453327], 21 | [-0.03852113, 0.40910479], 22 | [-0.43877303, 1.07366684], 23 | [-0.85795321, 0.82980738], 24 | [-0.18430329, 0.52328473], 25 | [-0.30126957, -0.66268378], 26 | [-0.65571327, 0.42412021], 27 | [-0.28305528, 0.30284991], 28 | [0.20246714, -0.34727125], 29 | [1.06446472, -1.09279772], 30 | [0.30543283, -0.02589502], 31 | [-0.00717161, 0.00318087], 32 | ] 33 | ) 34 | Y = np.array([0, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0]) 35 | ESTIMATOR = GradientBoostingClassifier(random_state=RND_SEED) 36 | 37 | 38 | def test_iht_init(): 39 | sampling_strategy = "auto" 40 | iht = InstanceHardnessThreshold( 41 | estimator=ESTIMATOR, 42 | sampling_strategy=sampling_strategy, 43 | random_state=RND_SEED, 44 | ) 45 | 46 | assert iht.sampling_strategy == sampling_strategy 47 | assert iht.random_state == RND_SEED 48 | 49 | 50 | def test_iht_fit_resample(): 51 | iht = InstanceHardnessThreshold(estimator=ESTIMATOR, random_state=RND_SEED) 52 | X_resampled, y_resampled = iht.fit_resample(X, Y) 53 | assert X_resampled.shape == (12, 2) 54 | assert y_resampled.shape == (12,) 55 | 56 | 57 | def test_iht_fit_resample_half(): 58 | sampling_strategy = {0: 3, 1: 3} 59 | iht = InstanceHardnessThreshold( 60 | estimator=NB(), 61 | sampling_strategy=sampling_strategy, 62 | random_state=RND_SEED, 63 | ) 64 | X_resampled, y_resampled = iht.fit_resample(X, Y) 65 | assert X_resampled.shape == (6, 2) 66 | assert 
69 | def test_iht_fit_resample_class_obj(): 70 | est = GradientBoostingClassifier(random_state=RND_SEED) 71 | iht = InstanceHardnessThreshold(estimator=est, random_state=RND_SEED) 72 | X_resampled, y_resampled = iht.fit_resample(X, Y) 73 | assert X_resampled.shape == (12, 2) 74 | assert y_resampled.shape == (12,) 75 | 76 | 77 | def test_iht_reproducibility(): 78 | from sklearn.datasets import load_digits 79 | 80 | X_digits, y_digits = load_digits(return_X_y=True) 81 | idx_sampled = [] 82 | for seed in range(5): 83 | est = RandomForestClassifier(n_estimators=10, random_state=seed) 84 | iht = InstanceHardnessThreshold(estimator=est, random_state=RND_SEED) 85 | iht.fit_resample(X_digits, y_digits) 86 | idx_sampled.append(iht.sample_indices_.copy()) 87 | for idx_1, idx_2 in zip(idx_sampled, idx_sampled[1:]): 88 | assert_array_equal(idx_1, idx_2) 89 | 90 | 91 | def test_iht_fit_resample_default_estimator(): 92 | iht = InstanceHardnessThreshold(estimator=None, random_state=RND_SEED) 93 | X_resampled, y_resampled = iht.fit_resample(X, Y) 94 | assert isinstance(iht.estimator_, RandomForestClassifier) 95 | assert X_resampled.shape == (12, 2) 96 | assert y_resampled.shape == (12,) 97 | 98 | 99 | def test_iht_estimator_pipeline(): 100 | """Check that we can pass a pipeline containing a classifier. 101 | 102 | Checking if we have a classifier should not be based on inheriting from 103 | `ClassifierMixin`. 104 | 105 | Non-regression test for: 106 | https://github.com/scikit-learn-contrib/imbalanced-learn/pull/1049 107 | """ 108 | model = make_pipeline(GradientBoostingClassifier(random_state=RND_SEED)) 109 | iht = InstanceHardnessThreshold(estimator=model, random_state=RND_SEED) 110 | X_resampled, y_resampled = iht.fit_resample(X, Y) 111 | assert X_resampled.shape == (12, 2) 112 | assert y_resampled.shape == (12,) 113 | -------------------------------------------------------------------------------- /imblearn/under_sampling/_prototype_selection/tests/test_neighbourhood_cleaning_rule.py: -------------------------------------------------------------------------------- 1 | """Test the module neighbourhood cleaning rule.""" 2 | # Authors: Guillaume Lemaitre 3 | # Christos Aridas 4 | # License: MIT 5 | 6 | from collections import Counter 7 | 8 | import numpy as np 9 | import pytest 10 | from sklearn.datasets import make_classification 11 | from sklearn.utils._testing import assert_array_equal 12 | 13 | from imblearn.under_sampling import EditedNearestNeighbours, NeighbourhoodCleaningRule 14 | 15 | 16 | @pytest.fixture(scope="module") 17 | def data(): 18 | return make_classification( 19 | n_samples=200, 20 | n_features=2, 21 | n_informative=2, 22 | n_redundant=0, 23 | n_repeated=0, 24 | n_clusters_per_class=1, 25 | n_classes=3, 26 | weights=[0.1, 0.3, 0.6], 27 | random_state=0, 28 | ) 29 | 30 | 31 | def test_ncr_threshold_cleaning(data): 32 | """Test the effect of the `threshold_cleaning` parameter.""" 33 | X, y = data 34 | # with a large `threshold_cleaning`, the algorithm is equivalent to ENN 35 | enn = EditedNearestNeighbours() 36 | ncr = NeighbourhoodCleaningRule( 37 | edited_nearest_neighbours=enn, n_neighbors=10, threshold_cleaning=10 38 | ) 39 | 40 | enn.fit_resample(X, y) 41 | ncr.fit_resample(X, y) 42 | 43 | assert_array_equal(np.sort(enn.sample_indices_), np.sort(ncr.sample_indices_)) 44 | assert ncr.classes_to_clean_ == [] 45 | 46 | # set a threshold such that only class #2 should be considered for cleaning 47 | counter = Counter(y) 48 | threshold = counter[1] / counter[0] 49 |
ncr.set_params(threshold_cleaning=threshold) 50 | ncr.fit_resample(X, y) 51 | 52 | assert set(ncr.classes_to_clean_) == {2} 53 | 54 | # making the threshold slightly smaller to take into account class #1 55 | ncr.set_params(threshold_cleaning=threshold - np.finfo(np.float32).eps) 56 | ncr.fit_resample(X, y) 57 | 58 | assert set(ncr.classes_to_clean_) == {1, 2} 59 | 60 | 61 | def test_ncr_n_neighbors(data): 62 | """Check the effect of the NN on the cleaning of the second phase.""" 63 | X, y = data 64 | 65 | enn = EditedNearestNeighbours() 66 | ncr = NeighbourhoodCleaningRule(edited_nearest_neighbours=enn, n_neighbors=3) 67 | 68 | ncr.fit_resample(X, y) 69 | sample_indices_3_nn = ncr.sample_indices_ 70 | 71 | ncr.set_params(n_neighbors=10).fit_resample(X, y) 72 | sample_indices_10_nn = ncr.sample_indices_ 73 | 74 | # we should have a more aggressive cleaning when n_neighbors is larger 75 | assert len(sample_indices_3_nn) > len(sample_indices_10_nn) 76 | 77 | 78 | # TODO: remove in 0.14 79 | @pytest.mark.parametrize("kind_sel", ["all", "mode"]) 80 | def test_ncr_deprecate_kind_sel(data, kind_sel): 81 | X, y = data 82 | 83 | with pytest.warns(FutureWarning, match="`kind_sel` is deprecated"): 84 | NeighbourhoodCleaningRule(kind_sel=kind_sel).fit_resample(X, y) 85 | -------------------------------------------------------------------------------- /imblearn/under_sampling/_prototype_selection/tests/test_tomek_links.py: -------------------------------------------------------------------------------- 1 | """Test the module Tomek's links.""" 2 | # Authors: Guillaume Lemaitre 3 | # Christos Aridas 4 | # License: MIT 5 | 6 | import numpy as np 7 | import pytest 8 | from sklearn.datasets import make_classification 9 | from sklearn.utils._testing import assert_array_equal 10 | 11 | from imblearn.under_sampling import TomekLinks 12 | 13 | X = np.array( 14 | [ 15 | [0.31230513, 0.1216318], 16 | [0.68481731, 0.51935141], 17 | [1.34192108, -0.13367336], 18 | [0.62366841, -0.21312976], 19 | [1.61091956, -0.40283504], 20 | [-0.37162401, -2.19400981], 21 | [0.74680821, 1.63827342], 22 | [0.2184254, 0.24299982], 23 | [0.61472253, -0.82309052], 24 | [0.19893132, -0.47761769], 25 | [1.06514042, -0.0770537], 26 | [0.97407872, 0.44454207], 27 | [1.40301027, -0.83648734], 28 | [-1.20515198, -1.02689695], 29 | [-0.27410027, -0.54194484], 30 | [0.8381014, 0.44085498], 31 | [-0.23374509, 0.18370049], 32 | [-0.32635887, -0.29299653], 33 | [-0.00288378, 0.84259929], 34 | [1.79580611, -0.02219234], 35 | ] 36 | ) 37 | Y = np.array([1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0]) 38 | 39 | 40 | def test_tl_init(): 41 | tl = TomekLinks() 42 | assert tl.n_jobs is None 43 | 44 | 45 | def test_tl_fit_resample(): 46 | tl = TomekLinks() 47 | X_resampled, y_resampled = tl.fit_resample(X, Y) 48 | 49 | X_gt = np.array( 50 | [ 51 | [0.31230513, 0.1216318], 52 | [0.68481731, 0.51935141], 53 | [1.34192108, -0.13367336], 54 | [0.62366841, -0.21312976], 55 | [1.61091956, -0.40283504], 56 | [-0.37162401, -2.19400981], 57 | [0.74680821, 1.63827342], 58 | [0.2184254, 0.24299982], 59 | [0.61472253, -0.82309052], 60 | [0.19893132, -0.47761769], 61 | [0.97407872, 0.44454207], 62 | [1.40301027, -0.83648734], 63 | [-1.20515198, -1.02689695], 64 | [-0.23374509, 0.18370049], 65 | [-0.32635887, -0.29299653], 66 | [-0.00288378, 0.84259929], 67 | [1.79580611, -0.02219234], 68 | ] 69 | ) 70 | y_gt = np.array([1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 1, 0]) 71 | assert_array_equal(X_resampled, X_gt) 72 | assert_array_equal(y_resampled, y_gt) 73 | 74 |
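# Illustrative sketch (not part of the test module): a Tomek link is a pair of
# samples from opposite classes that are each other's nearest neighbour. The
# hypothetical helper below flags such pairs.
import numpy as np
from sklearn.neighbors import NearestNeighbors


def tomek_link_mask(X, y):
    # column 0 of the neighbour indices is each sample itself, column 1 is its
    # nearest other sample
    nn = NearestNeighbors(n_neighbors=2).fit(X)
    nearest = nn.kneighbors(X, return_distance=False)[:, 1]
    mask = np.zeros(len(y), dtype=bool)
    for i, j in enumerate(nearest):
        # mutual nearest neighbours with different labels form a Tomek link
        if y[i] != y[j] and nearest[j] == i:
            mask[i] = True
    return mask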
75 | @pytest.mark.parametrize( 76 | "sampling_strategy", ["auto", "majority", "not minority", "not majority", "all"] 77 | ) 78 | def test_tomek_links_strings(sampling_strategy): 79 | """Check that we support all expected strings as `sampling_strategy` in 80 | a sampler inheriting from `BaseCleaningSampler`.""" 81 | 82 | X, y = make_classification( 83 | n_samples=100, 84 | n_clusters_per_class=1, 85 | n_classes=3, 86 | weights=[0.1, 0.3, 0.6], 87 | random_state=0, 88 | ) 89 | TomekLinks(sampling_strategy=sampling_strategy).fit_resample(X, y) 90 | -------------------------------------------------------------------------------- /imblearn/utils/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | The :mod:`imblearn.utils` module includes various utilities. 3 | """ 4 | 5 | from ._docstring import Substitution 6 | from ._validation import ( 7 | check_neighbors_object, 8 | check_sampling_strategy, 9 | check_target_type, 10 | ) 11 | 12 | __all__ = [ 13 | "check_neighbors_object", 14 | "check_sampling_strategy", 15 | "check_target_type", 16 | "Substitution", 17 | ] 18 | -------------------------------------------------------------------------------- /imblearn/utils/_docstring.py: -------------------------------------------------------------------------------- 1 | """Utilities for docstring in imbalanced-learn.""" 2 | 3 | # Authors: Guillaume Lemaitre 4 | # License: MIT 5 | 6 | 7 | class Substitution: 8 | """Decorate a function's or a class' docstring to perform string 9 | substitution on it. 10 | 11 | This decorator should be robust even if obj.__doc__ is None 12 | (for example, if -OO was passed to the interpreter) 13 | """ 14 | 15 | def __init__(self, *args, **kwargs): 16 | if args and kwargs: 17 | raise AssertionError("Only positional or keyword args are allowed") 18 | 19 | self.params = args or kwargs 20 | 21 | def __call__(self, obj): 22 | if obj.__doc__: 23 | obj.__doc__ = obj.__doc__.format(**self.params) 24 | return obj 25 | 26 | 27 | _random_state_docstring = """random_state : int, RandomState instance, default=None 28 | Control the randomization of the algorithm. 29 | 30 | - If int, ``random_state`` is the seed used by the random number 31 | generator; 32 | - If ``RandomState`` instance, random_state is the random number 33 | generator; 34 | - If ``None``, the random number generator is the ``RandomState`` 35 | instance used by ``np.random``. 36 | """.rstrip() 37 | 38 | _n_jobs_docstring = """n_jobs : int, default=None 39 | Number of CPU cores used during the cross-validation loop. 40 | ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context. 41 | ``-1`` means using all processors. See 42 | `Glossary <https://scikit-learn.org/stable/glossary.html#term-n_jobs>`_ 43 | for more details. 44 | """.rstrip() 45 | -------------------------------------------------------------------------------- /imblearn/utils/_show_versions.py: -------------------------------------------------------------------------------- 1 | """ 2 | Utility method which prints system info to help with debugging, 3 | and filing issues on GitHub. 4 | Adapted from :func:`sklearn.show_versions`, 5 | which was adapted from :func:`pandas.show_versions` 6 | """ 7 | 8 | # Author: Alexander L. Hayes 9 | # License: MIT 10 | 11 | from ..
import __version__ 12 | 13 | 14 | def _get_deps_info(): 15 | """Overview of the installed version of main dependencies 16 | Returns 17 | ------- 18 | deps_info: dict 19 | version information on relevant Python libraries 20 | """ 21 | deps = [ 22 | "imbalanced-learn", 23 | "pip", 24 | "setuptools", 25 | "numpy", 26 | "scipy", 27 | "scikit-learn", 28 | "Cython", 29 | "pandas", 30 | "keras", 31 | "tensorflow", 32 | "joblib", 33 | ] 34 | 35 | deps_info = { 36 | "imbalanced-learn": __version__, 37 | } 38 | 39 | from importlib.metadata import PackageNotFoundError, version 40 | 41 | for modname in deps: 42 | try: 43 | deps_info[modname] = version(modname) 44 | except PackageNotFoundError: 45 | deps_info[modname] = None 46 | return deps_info 47 | 48 | 49 | def show_versions(github=False): 50 | """Print debugging information. 51 | 52 | .. versionadded:: 0.5 53 | 54 | Parameters 55 | ---------- 56 | github : bool, 57 | If true, wrap system info with GitHub markup. 58 | """ 59 | 60 | from sklearn.utils._show_versions import _get_sys_info 61 | 62 | _sys_info = _get_sys_info() 63 | _deps_info = _get_deps_info() 64 | _github_markup = ( 65 | "
" 66 | "System, Dependency Information\n\n" 67 | "**System Information**\n\n" 68 | "{0}\n" 69 | "**Python Dependencies**\n\n" 70 | "{1}\n" 71 | "
" 72 | ) 73 | 74 | if github: 75 | _sys_markup = "" 76 | _deps_markup = "" 77 | 78 | for k, stat in _sys_info.items(): 79 | _sys_markup += f"* {k:<10}: `{stat}`\n" 80 | for k, stat in _deps_info.items(): 81 | _deps_markup += f"* {k:<10}: `{stat}`\n" 82 | 83 | print(_github_markup.format(_sys_markup, _deps_markup)) 84 | 85 | else: 86 | print("\nSystem:") 87 | for k, stat in _sys_info.items(): 88 | print(f"{k:>11}: {stat}") 89 | 90 | print("\nPython dependencies:") 91 | for k, stat in _deps_info.items(): 92 | print(f"{k:>11}: {stat}") 93 | -------------------------------------------------------------------------------- /imblearn/utils/_tags.py: -------------------------------------------------------------------------------- 1 | from ._sklearn_compat import InputTags, SamplerTags, Tags # noqa: F401 2 | -------------------------------------------------------------------------------- /imblearn/utils/_test_common/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/scikit-learn-contrib/imbalanced-learn/c975dd68f50beb88e1331cdd47780acd4fa95397/imblearn/utils/_test_common/__init__.py -------------------------------------------------------------------------------- /imblearn/utils/deprecation.py: -------------------------------------------------------------------------------- 1 | """Utilities for deprecation""" 2 | 3 | # Authors: Guillaume Lemaitre 4 | # License: MIT 5 | 6 | import warnings 7 | 8 | 9 | def deprecate_parameter(sampler, version_deprecation, param_deprecated, new_param=None): 10 | """Helper to deprecate a parameter by another one. 11 | 12 | Parameters 13 | ---------- 14 | sampler : sampler object, 15 | The object which will be inspected. 16 | 17 | version_deprecation : str, 18 | The version from which the parameter will be deprecated. The format 19 | should be ``'x.y'``. 20 | 21 | param_deprecated : str, 22 | The parameter being deprecated. 23 | 24 | new_param : str, 25 | The parameter used instead of the deprecated parameter. By default, no 26 | parameter is expected. 27 | """ 28 | x, y = version_deprecation.split(".") 29 | version_removed = x + "." + str(int(y) + 2) 30 | if new_param is None: 31 | if getattr(sampler, param_deprecated) is not None: 32 | warnings.warn( 33 | ( 34 | f"'{param_deprecated}' is deprecated from {version_deprecation} and" 35 | f" will be removed in {version_removed} for the estimator" 36 | f" {sampler.__class__}." 37 | ), 38 | category=FutureWarning, 39 | ) 40 | else: 41 | if getattr(sampler, param_deprecated) is not None: 42 | warnings.warn( 43 | ( 44 | f"'{param_deprecated}' is deprecated from {version_deprecation} and" 45 | f" will be removed in {version_removed} for the estimator" 46 | f" {sampler.__class__}. Use '{new_param}' instead." 
47 | ), 48 | category=FutureWarning, 49 | ) 50 | setattr(sampler, new_param, getattr(sampler, param_deprecated)) 51 | -------------------------------------------------------------------------------- /imblearn/utils/tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/scikit-learn-contrib/imbalanced-learn/c975dd68f50beb88e1331cdd47780acd4fa95397/imblearn/utils/tests/__init__.py -------------------------------------------------------------------------------- /imblearn/utils/tests/test_deprecation.py: -------------------------------------------------------------------------------- 1 | """Test for the deprecation helper""" 2 | 3 | # Authors: Guillaume Lemaitre 4 | # License: MIT 5 | 6 | import pytest 7 | 8 | from imblearn.utils.deprecation import deprecate_parameter 9 | 10 | 11 | class Sampler: 12 | def __init__(self): 13 | self.a = "something" 14 | self.b = "something" 15 | 16 | 17 | def test_deprecate_parameter(): 18 | with pytest.warns(FutureWarning, match="is deprecated from"): 19 | deprecate_parameter(Sampler(), "0.2", "a") 20 | with pytest.warns(FutureWarning, match="Use 'b' instead."): 21 | deprecate_parameter(Sampler(), "0.2", "a", "b") 22 | -------------------------------------------------------------------------------- /imblearn/utils/tests/test_docstring.py: -------------------------------------------------------------------------------- 1 | """Test utilities for docstring.""" 2 | 3 | # Authors: Guillaume Lemaitre 4 | # License: MIT 5 | 6 | import sys 7 | import textwrap 8 | 9 | import pytest 10 | 11 | from imblearn.utils import Substitution 12 | from imblearn.utils._docstring import _n_jobs_docstring, _random_state_docstring 13 | 14 | 15 | def _dedent_docstring(docstring): 16 | """Compatibility with Python 3.13+. 17 | 18 | xref: https://github.com/python/cpython/issues/81283 19 | """ 20 | return "\n".join([textwrap.dedent(line) for line in docstring.split("\n")]) 21 | 22 | 23 | func_docstring = """A function. 24 | 25 | Parameters 26 | ---------- 27 | xxx 28 | 29 | yyy 30 | """ 31 | 32 | 33 | def func(param_1, param_2): 34 | """A function. 35 | 36 | Parameters 37 | ---------- 38 | {param_1} 39 | 40 | {param_2} 41 | """ 42 | return param_1, param_2 43 | 44 | 45 | cls_docstring = """A class. 46 | 47 | Parameters 48 | ---------- 49 | xxx 50 | 51 | yyy 52 | """ 53 | 54 | 55 | class cls: 56 | """A class. 57 | 58 | Parameters 59 | ---------- 60 | {param_1} 61 | 62 | {param_2} 63 | """ 64 | 65 | def __init__(self, param_1, param_2): 66 | self.param_1 = param_1 67 | self.param_2 = param_2 68 | 69 | 70 | if sys.version_info >= (3, 13): 71 | func_docstring = _dedent_docstring(func_docstring) 72 | cls_docstring = _dedent_docstring(cls_docstring) 73 | 74 | 75 | @pytest.mark.parametrize( 76 | "obj, obj_docstring", [(func, func_docstring), (cls, cls_docstring)] 77 | ) 78 | def test_docstring_inject(obj, obj_docstring): 79 | obj_injected_docstring = Substitution(param_1="xxx", param_2="yyy")(obj) 80 | assert obj_injected_docstring.__doc__ == obj_docstring 81 | 82 | 83 | def test_docstring_template(): 84 | assert "random_state" in _random_state_docstring 85 | assert "n_jobs" in _n_jobs_docstring 86 | 87 | 88 | def test_docstring_with_python_OO(): 89 | """Check that we don't raise a warning if the code is executed with -OO. 
90 | 91 | Non-regression test for: 92 | https://github.com/scikit-learn-contrib/imbalanced-learn/issues/945 93 | """ 94 | instance = cls(param_1="xxx", param_2="yyy") 95 | instance.__doc__ = None # simulate -OO 96 | 97 | instance = Substitution(param_1="xxx", param_2="yyy")(instance) 98 | 99 | assert instance.__doc__ is None 100 | -------------------------------------------------------------------------------- /imblearn/utils/tests/test_estimator_checks.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pytest 3 | from sklearn.base import BaseEstimator 4 | from sklearn.utils.multiclass import check_classification_targets 5 | 6 | from imblearn.base import BaseSampler 7 | from imblearn.over_sampling.base import BaseOverSampler 8 | from imblearn.utils import check_target_type as target_check 9 | from imblearn.utils._sklearn_compat import validate_data 10 | from imblearn.utils.estimator_checks import ( 11 | check_samplers_fit, 12 | check_samplers_nan, 13 | check_samplers_one_label, 14 | check_samplers_preserve_dtype, 15 | check_samplers_sparse, 16 | check_samplers_string, 17 | check_target_type, 18 | ) 19 | 20 | 21 | class BaseBadSampler(BaseEstimator): 22 | """Sampler without inputs checking.""" 23 | 24 | _sampling_type = "bypass" 25 | 26 | def fit(self, X, y): 27 | return self 28 | 29 | def fit_resample(self, X, y): 30 | check_classification_targets(y) 31 | self.fit(X, y) 32 | return X, y 33 | 34 | 35 | class SamplerSingleClass(BaseSampler): 36 | """Sampler that would sample even with a single class.""" 37 | 38 | _sampling_type = "bypass" 39 | 40 | def fit_resample(self, X, y): 41 | return self._fit_resample(X, y) 42 | 43 | def _fit_resample(self, X, y): 44 | return X, y 45 | 46 | 47 | class NotFittedSampler(BaseBadSampler): 48 | """Sampler without target checking.""" 49 | 50 | def fit(self, X, y): 51 | X, y = validate_data(self, X=X, y=y) 52 | return self 53 | 54 | 55 | class NoAcceptingSparseSampler(BaseBadSampler): 56 | """Sampler which does not accept sparse matrix.""" 57 | 58 | def fit(self, X, y): 59 | X, y = validate_data(self, X=X, y=y) 60 | self.sampling_strategy_ = "sampling_strategy_" 61 | return self 62 | 63 | 64 | class NotPreservingDtypeSampler(BaseSampler): 65 | _sampling_type = "bypass" 66 | 67 | _parameter_constraints: dict = {"sampling_strategy": "no_validation"} 68 | 69 | def _fit_resample(self, X, y): 70 | return X.astype(np.float64), y.astype(np.int64) 71 | 72 | 73 | class IndicesSampler(BaseOverSampler): 74 | def _check_X_y(self, X, y): 75 | y, binarize_y = target_check(y, indicate_one_vs_all=True) 76 | X, y = validate_data( 77 | self, 78 | X=X, 79 | y=y, 80 | reset=True, 81 | dtype=None, 82 | ensure_all_finite=False, 83 | ) 84 | return X, y, binarize_y 85 | 86 | def _fit_resample(self, X, y): 87 | n_max_count_class = np.bincount(y).max() 88 | indices = np.random.choice(np.arange(X.shape[0]), size=n_max_count_class * 2) 89 | return X[indices], y[indices] 90 | 91 | 92 | def test_check_samplers_string(): 93 | sampler = IndicesSampler() 94 | check_samplers_string(sampler.__class__.__name__, sampler) 95 | 96 | 97 | def test_check_samplers_nan(): 98 | sampler = IndicesSampler() 99 | check_samplers_nan(sampler.__class__.__name__, sampler) 100 | 101 | 102 | mapping_estimator_error = { 103 | "BaseBadSampler": (AssertionError, None), 104 | "SamplerSingleClass": (AssertionError, "Sampler can't balance when only"), 105 | "NotFittedSampler": (AssertionError, "No fitted attribute"), 106 | "NoAcceptingSparseSampler": 
(TypeError, "dense data is required"), 107 | "NotPreservingDtypeSampler": (AssertionError, "X dtype is not preserved"), 108 | } 109 | 110 | 111 | def _test_single_check(Estimator, check): 112 | estimator = Estimator() 113 | name = estimator.__class__.__name__ 114 | err_type, err_msg = mapping_estimator_error[name] 115 | with pytest.raises(err_type, match=err_msg): 116 | check(name, estimator) 117 | 118 | 119 | def test_all_checks(): 120 | _test_single_check(BaseBadSampler, check_target_type) 121 | _test_single_check(SamplerSingleClass, check_samplers_one_label) 122 | _test_single_check(NotFittedSampler, check_samplers_fit) 123 | _test_single_check(NoAcceptingSparseSampler, check_samplers_sparse) 124 | _test_single_check(NotPreservingDtypeSampler, check_samplers_preserve_dtype) 125 | -------------------------------------------------------------------------------- /imblearn/utils/tests/test_min_dependencies.py: -------------------------------------------------------------------------------- 1 | """Tests for the minimum dependencies in the README.rst file.""" 2 | 3 | import os 4 | import platform 5 | import re 6 | from pathlib import Path 7 | 8 | import pytest 9 | from packaging.requirements import Requirement 10 | from packaging.version import parse 11 | 12 | import imblearn 13 | 14 | 15 | @pytest.mark.skipif( 16 | platform.system() == "Windows" or parse(platform.python_version()) < parse("3.11"), 17 | reason="This test is enough on unix system and requires Python >= 3.11", 18 | ) 19 | def test_min_dependencies_readme(): 20 | # local import to not import the file with Python < 3.11 21 | import tomllib 22 | 23 | # Test that the minimum dependencies in the README.rst file are 24 | # consistent with the minimum dependencies defined at the file: 25 | # pyproject.toml 26 | 27 | pyproject_path = Path(imblearn.__path__[0]).parents[0] / "pyproject.toml" 28 | with open(pyproject_path, "rb") as f: 29 | pyproject_data = tomllib.load(f) 30 | 31 | def process_requirements(requirements): 32 | result = {} 33 | for req in requirements: 34 | req = Requirement(req) 35 | for specifier in req.specifier: 36 | if specifier.operator == ">=": 37 | result[req.name] = parse(specifier.version) 38 | return result 39 | 40 | min_dependencies = process_requirements( 41 | [f"python{pyproject_data['project']['requires-python']}"] 42 | ) 43 | min_dependencies.update( 44 | process_requirements(pyproject_data["project"]["dependencies"]) 45 | ) 46 | 47 | markers = ["docs", "optional", "tensorflow", "keras", "tests"] 48 | for marker_name in markers: 49 | min_dependencies.update( 50 | process_requirements( 51 | pyproject_data["project"]["optional-dependencies"][marker_name] 52 | ) 53 | ) 54 | 55 | pattern = re.compile( 56 | r"(\.\. \|)" 57 | + r"(([A-Za-z]+\-?)+)" 58 | + r"(MinVersion\| replace::)" 59 | + r"( [0-9]+\.[0-9]+(\.[0-9]+)?)" 60 | ) 61 | 62 | readme_path = Path(imblearn.__path__[0]).parents[0] 63 | readme_file = readme_path / "README.rst" 64 | 65 | if not os.path.exists(readme_file): 66 | # Skip the test if the README.rst file is not available. 
67 | # For instance, when installing imbalanced-learn from wheels 68 | pytest.skip("The README.rst file is not available.") 69 | 70 | with readme_file.open("r") as f: 71 | for line in f: 72 | matched = pattern.match(line) 73 | 74 | if not matched: 75 | continue 76 | 77 | package, version = matched.group(2), matched.group(5) 78 | package = package.lower() 79 | if package == "scikitlearn": 80 | package = "scikit-learn" 81 | 82 | if package in min_dependencies: 83 | version = parse(version) 84 | min_version = min_dependencies[package] 85 | 86 | assert version == min_version, f"{package} has a mismatched version" 87 | -------------------------------------------------------------------------------- /imblearn/utils/tests/test_show_versions.py: -------------------------------------------------------------------------------- 1 | """Test for the show_versions helper. Based on the sklearn tests.""" 2 | # Author: Alexander L. Hayes 3 | # License: MIT 4 | 5 | from imblearn.utils._show_versions import _get_deps_info, show_versions 6 | 7 | 8 | def test_get_deps_info(): 9 | _deps_info = _get_deps_info() 10 | assert "pip" in _deps_info 11 | assert "setuptools" in _deps_info 12 | assert "imbalanced-learn" in _deps_info 13 | assert "scikit-learn" in _deps_info 14 | assert "numpy" in _deps_info 15 | assert "scipy" in _deps_info 16 | assert "Cython" in _deps_info 17 | assert "pandas" in _deps_info 18 | assert "joblib" in _deps_info 19 | 20 | 21 | def test_show_versions_default(capsys): 22 | show_versions() 23 | out, err = capsys.readouterr() 24 | assert "python" in out 25 | assert "executable" in out 26 | assert "machine" in out 27 | assert "pip" in out 28 | assert "setuptools" in out 29 | assert "imbalanced-learn" in out 30 | assert "scikit-learn" in out 31 | assert "numpy" in out 32 | assert "scipy" in out 33 | assert "Cython" in out 34 | assert "pandas" in out 35 | assert "keras" in out 36 | assert "tensorflow" in out 37 | assert "joblib" in out 38 | 39 | 40 | def test_show_versions_github(capsys): 41 | show_versions(github=True) 42 | out, err = capsys.readouterr() 43 | assert "
<details><summary>System, Dependency Information</summary>" in out 44 | assert "**System Information**" in out 45 | assert "* python" in out 46 | assert "* executable" in out 47 | assert "* machine" in out 48 | assert "**Python Dependencies**" in out 49 | assert "* pip" in out 50 | assert "* setuptools" in out 51 | assert "* imbalanced-learn" in out 52 | assert "* scikit-learn" in out 53 | assert "* numpy" in out 54 | assert "* scipy" in out 55 | assert "* Cython" in out 56 | assert "* pandas" in out 57 | assert "* keras" in out 58 | assert "* tensorflow" in out 59 | assert "* joblib" in out 60 | assert "</details>
" in out 61 | -------------------------------------------------------------------------------- /imblearn/utils/tests/test_testing.py: -------------------------------------------------------------------------------- 1 | """Test for the testing module""" 2 | # Authors: Guillaume Lemaitre 3 | # Christos Aridas 4 | # License: MIT 5 | 6 | import numpy as np 7 | import pytest 8 | from sklearn.neighbors._base import KNeighborsMixin 9 | 10 | from imblearn.base import SamplerMixin 11 | from imblearn.utils.testing import _CustomNearestNeighbors, all_estimators 12 | 13 | 14 | def test_all_estimators(): 15 | # check if the filtering is working with a list or a single string 16 | type_filter = "sampler" 17 | all_estimators(type_filter=type_filter) 18 | type_filter = ["sampler"] 19 | estimators = all_estimators(type_filter=type_filter) 20 | for estimator in estimators: 21 | # check that all estimators are sampler 22 | assert issubclass(estimator[1], SamplerMixin) 23 | 24 | # check that an error is raised when the type is unknown 25 | type_filter = "rnd" 26 | with pytest.raises(ValueError, match="Parameter type_filter must be 'sampler'"): 27 | all_estimators(type_filter=type_filter) 28 | 29 | 30 | def test_custom_nearest_neighbors(): 31 | """Check that our custom nearest neighbors can be used for our internal 32 | duck-typing.""" 33 | 34 | neareat_neighbors = _CustomNearestNeighbors(n_neighbors=3) 35 | 36 | assert not isinstance(neareat_neighbors, KNeighborsMixin) 37 | assert hasattr(neareat_neighbors, "kneighbors") 38 | assert hasattr(neareat_neighbors, "kneighbors_graph") 39 | 40 | rng = np.random.RandomState(42) 41 | X = rng.randn(150, 3) 42 | y = rng.randint(0, 2, 150) 43 | neareat_neighbors.fit(X, y) 44 | 45 | distances, indices = neareat_neighbors.kneighbors(X) 46 | assert distances.shape == (150, 3) 47 | assert indices.shape == (150, 3) 48 | np.testing.assert_allclose(distances[:, 0], 0.0) 49 | np.testing.assert_allclose(indices[:, 0], np.arange(150)) 50 | --------------------------------------------------------------------------------