├── .circleci
    └── config.yml
├── .dvc
    ├── .gitignore
    ├── config
    └── plots
    │   ├── confusion.json
    │   ├── default.json
    │   ├── scatter.json
    │   └── smooth.json
├── .dvcignore
├── .github
    ├── .codecov.yml
    ├── ISSUE_TEMPLATE
    │   ├── add-method.md
    │   ├── bug_report.md
    │   └── feature-request.md
    ├── PULL_REQUEST_TEMPLATE.md
    └── workflows
    │   ├── codeql-analysis.yml
    │   └── pythonpublish.yml
├── .gitignore
├── CODE_OF_CONDUCT.md
├── ITMO_FS
    ├── __about__.py
    ├── __init__.py
    ├── embedded
    │   ├── MOS.py
    │   └── __init__.py
    ├── ensembles
    │   ├── __init__.py
    │   ├── measure_based
    │   │   ├── WeightBased.py
    │   │   ├── __init__.py
    │   │   └── fusion_functions.py
    │   ├── model_based
    │   │   ├── __init__.py
    │   │   └── best_sum.py
    │   └── ranking_based
    │   │   ├── Mixed.py
    │   │   ├── __init__.py
    │   │   └── fusion_functions.py
    ├── filters
    │   ├── __init__.py
    │   ├── multivariate
    │   │   ├── DISRwithMassive.py
    │   │   ├── FCBF.py
    │   │   ├── MultivariateFilter.py
    │   │   ├── STIR.py
    │   │   ├── TraceRatioFisher.py
    │   │   ├── __init__.py
    │   │   ├── measures.py
    │   │   └── mimaga.py
    │   ├── univariate
    │   │   ├── NDFS.py
    │   │   ├── RFS.py
    │   │   ├── SPEC.py
    │   │   ├── UnivariateFilter.py
    │   │   ├── VDM.py
    │   │   ├── __init__.py
    │   │   └── measures.py
    │   └── unsupervised
    │   │   ├── MCFS.py
    │   │   ├── UDFS.py
    │   │   ├── __init__.py
    │   │   └── trace_ratio_laplacian.py
    ├── hybrid
    │   ├── IWSSr_SFLA.py
    │   ├── Melif.py
    │   ├── __init__.py
    │   └── filter_wrapper_hybrid.py
    ├── utils
    │   ├── __init__.py
    │   ├── base_transformer.py
    │   ├── base_wrapper.py
    │   ├── data_check.py
    │   ├── functions.py
    │   ├── information_theory.py
    │   └── qpfs_body.py
    └── wrappers
    │   ├── __init__.py
    │   ├── deterministic
    │       ├── AddDelWrapper.py
    │       ├── BackwardSelection.py
    │       ├── RecursiveElimination.py
    │       ├── SequentialForwardSelection.py
    │       ├── __init__.py
    │       └── qpfs_wrapper.py
    │   └── randomized
    │       ├── HillClimbing.py
    │       ├── SimulatedAnnealing.py
    │       ├── TPhMGWO.py
    │       └── __init__.py
├── LICENSE
├── README.rst
├── _config.yml
├── docs
    ├── Makefile
    ├── _templates
    │   ├── class.rst
    │   └── function.rst
    ├── api.rst
    ├── conf.py
    ├── index.rst
    ├── install.rst
    ├── introduction.rst
    ├── logos
    │   └── logo_itmo_fs_itog_colour.jpg
    ├── make.bat
    └── user_guide.rst
├── meta.yml
├── requirements.txt
├── setup.cfg
├── setup.py
└── test
    ├── Melif_test.py
    ├── datasets
        ├── .gitignore
        ├── arcene.csv.dvc
        ├── dexter.csv.dvc
        ├── dorothea.csv.dvc
        ├── gisette.csv.dvc
        └── madelon.csv.dvc
    ├── embedded_test.py
    ├── ensemble_test.py
    ├── hybrid_test.py
    ├── multivariate_filters_test.py
    ├── univariate_filters_test.py
    ├── univariate_measures_test.py
    ├── unsupervised_filters_test.py
    ├── utils.py
    └── wrapper_test.py


/.circleci/config.yml:
--------------------------------------------------------------------------------
 1 | version: 2.1
 2 | 
 3 | orbs:
 4 |   python: circleci/python@1.2.0
 5 | 
 6 | jobs:
 7 |   build-and-test:
 8 |     executor: python/default
 9 |     steps:
10 |       - checkout
11 | 
12 |       # Download and cache dependencies
13 |       - restore_cache:
14 |           keys:
15 |             - v1-dependencies-{{ checksum "requirements.txt" }}
16 |             # fallback to using the latest cache if no exact match is found
17 |             - v1-dependencies-
18 | 
19 |       - run:
20 |           name: install dependencies
21 |           command: |
22 |             wget https://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh -O miniconda.sh
23 |             chmod +x miniconda.sh && ./miniconda.sh -b -p ~/miniconda
24 |             export PATH="~/miniconda/bin:$PATH"
25 |             conda update --yes --quiet conda
26 |             conda create -n testenv --yes --quiet python=3
27 |             source activate testenv
28 |             pip install -r requirements.txt
29 |             pip install pandas sphinx_rtd_theme dvc
30 |             pip install sphinx-gallery
31 |             pip install dvc
32 |             pip install pydrive2
33 |             pip install .
34 |             cd docs
35 |             make html
36 | 
37 |       - save_cache:
38 |           paths:
39 |             - ./venv
40 |           key: v1-dependencies-{{ checksum "requirements.txt" }}
41 | 
42 |       # run tests!
43 |       # this example uses Django's built-in test-runner
44 |       # other common Python testing frameworks include pytest and nose
45 |       # https://pytest.org
46 |       # https://nose.readthedocs.io
47 |       - run:
48 |           name: run tests
49 |           paths:
50 |             - ./venv
51 |           command: |
52 |             pip install -r requirements.txt
53 |             pip install pandas sphinx_rtd_theme
54 |             pip install pandas pandas
55 |             pip install dvc
56 |             pip install pytest
57 |             pytest --cov=ITMO_FS test/
58 | 
59 |       - store_artifacts:
60 |           path: test-reports
61 |           destination: test-reports
62 | 
63 | workflows:
64 |   main:
65 |     jobs:
66 |       - build-and-test


--------------------------------------------------------------------------------
/.dvc/.gitignore:
--------------------------------------------------------------------------------
1 | /config.local
2 | /tmp
3 | /cache
4 | 


--------------------------------------------------------------------------------
/.dvc/config:
--------------------------------------------------------------------------------
1 | [core]
2 |     remote = gdrive
3 | ['remote "gdrive"']
4 |     url = gdrive://1T6v5zENAgNdIXQWZ01xRGpqVOvDNAo7w
5 | 


--------------------------------------------------------------------------------
/.dvc/plots/confusion.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "$schema": "https://vega.github.io/schema/vega-lite/v4.json",
 3 |     "data": {
 4 |         "values": "<DVC_METRIC_DATA>"
 5 |     },
 6 |     "title": "<DVC_METRIC_TITLE>",
 7 |     "mark": "rect",
 8 |     "encoding": {
 9 |         "x": {
10 |             "field": "<DVC_METRIC_X>",
11 |             "type": "nominal",
12 |             "sort": "ascending",
13 |             "title": "<DVC_METRIC_X_LABEL>"
14 |         },
15 |         "y": {
16 |             "field": "<DVC_METRIC_Y>",
17 |             "type": "nominal",
18 |             "sort": "ascending",
19 |             "title": "<DVC_METRIC_Y_LABEL>"
20 |         },
21 |         "color": {
22 |             "aggregate": "count",
23 |             "type": "quantitative"
24 |         },
25 |         "facet": {
26 |             "field": "rev",
27 |             "type": "nominal"
28 |         }
29 |     }
30 | }
31 | 


--------------------------------------------------------------------------------
/.dvc/plots/default.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "$schema": "https://vega.github.io/schema/vega-lite/v4.json",
 3 |     "data": {
 4 |         "values": "<DVC_METRIC_DATA>"
 5 |     },
 6 |     "title": "<DVC_METRIC_TITLE>",
 7 |     "mark": {
 8 |         "type": "line"
 9 |     },
10 |     "encoding": {
11 |         "x": {
12 |             "field": "<DVC_METRIC_X>",
13 |             "type": "quantitative",
14 |             "title": "<DVC_METRIC_X_LABEL>"
15 |         },
16 |         "y": {
17 |             "field": "<DVC_METRIC_Y>",
18 |             "type": "quantitative",
19 |             "title": "<DVC_METRIC_Y_LABEL>",
20 |             "scale": {
21 |                 "zero": false
22 |             }
23 |         },
24 |         "color": {
25 |             "field": "rev",
26 |             "type": "nominal"
27 |         }
28 |     }
29 | }
30 | 


--------------------------------------------------------------------------------
/.dvc/plots/scatter.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "$schema": "https://vega.github.io/schema/vega-lite/v4.json",
 3 |     "data": {
 4 |         "values": "<DVC_METRIC_DATA>"
 5 |     },
 6 |     "title": "<DVC_METRIC_TITLE>",
 7 |     "mark": "point",
 8 |     "encoding": {
 9 |         "x": {
10 |             "field": "<DVC_METRIC_X>",
11 |             "type": "quantitative",
12 |             "title": "<DVC_METRIC_X_LABEL>"
13 |         },
14 |         "y": {
15 |             "field": "<DVC_METRIC_Y>",
16 |             "type": "quantitative",
17 |             "title": "<DVC_METRIC_Y_LABEL>",
18 |             "scale": {
19 |                 "zero": false
20 |             }
21 |         },
22 |         "color": {
23 |             "field": "rev",
24 |             "type": "nominal"
25 |         }
26 |     }
27 | }
28 | 


--------------------------------------------------------------------------------
/.dvc/plots/smooth.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "$schema": "https://vega.github.io/schema/vega-lite/v4.json",
 3 |     "data": {
 4 |         "values": "<DVC_METRIC_DATA>"
 5 |     },
 6 |     "title": "<DVC_METRIC_TITLE>",
 7 |     "mark": {
 8 |         "type": "line"
 9 |     },
10 |     "encoding": {
11 |         "x": {
12 |             "field": "<DVC_METRIC_X>",
13 |             "type": "quantitative",
14 |             "title": "<DVC_METRIC_X_LABEL>"
15 |         },
16 |         "y": {
17 |             "field": "<DVC_METRIC_Y>",
18 |             "type": "quantitative",
19 |             "title": "<DVC_METRIC_Y_LABEL>",
20 |             "scale": {
21 |                 "zero": false
22 |             }
23 |         },
24 |         "color": {
25 |             "field": "rev",
26 |             "type": "nominal"
27 |         }
28 |     },
29 |     "transform": [
30 |         {
31 |             "loess": "<DVC_METRIC_Y>",
32 |             "on": "<DVC_METRIC_X>",
33 |             "groupby": [
34 |                 "rev"
35 |             ],
36 |             "bandwidth": 0.3
37 |         }
38 |     ]
39 | }
40 | 


--------------------------------------------------------------------------------
/.dvcignore:
--------------------------------------------------------------------------------
1 | # Add patterns of files dvc should ignore, which could improve
2 | # the performance. Learn more at
3 | # https://dvc.org/doc/user-guide/dvcignore
4 | 


--------------------------------------------------------------------------------
/.github/.codecov.yml:
--------------------------------------------------------------------------------
 1 | codecov:
 2 |   require_ci_to_pass: yes
 3 |   token: f8baf545-d745-4d99-81e7-ef7e019a1d1c
 4 | coverage:
 5 |   precision: 2
 6 |   round: down
 7 |   range: "70...100"
 8 | 
 9 | parsers:
10 |   gcov:
11 |     branch_detection:
12 |       conditional: yes
13 |       loop: yes
14 |       method: no
15 |       macro: no
16 | 
17 | install:
18 |   pip:
19 |     requirements.txt
20 | 
21 | comment:
22 |   layout: "reach,diff,flags,tree"
23 |   behavior: default
24 |   require_changes: no


--------------------------------------------------------------------------------
/.github/ISSUE_TEMPLATE/add-method.md:
--------------------------------------------------------------------------------
 1 | ---
 2 | name: Add method
 3 | about: Suggest an idea for this project
 4 | title: 'New Method:'
 5 | labels: ''
 6 | assignees: ''
 7 | 
 8 | ---
 9 | 
10 | **Describe a method you want to be added**
11 | Brief description of the method.
12 | 
13 | **Link to paper**
14 | A link to the paper with the method if you have it.
15 | 
16 | **Additional context**
17 | Add any other context or screenshots about the feature request here.
18 | 


--------------------------------------------------------------------------------
/.github/ISSUE_TEMPLATE/bug_report.md:
--------------------------------------------------------------------------------
 1 | ---
 2 | name: Bug report
 3 | about: Create a report to help us improve
 4 | title: 'BUG : '
 5 | labels: ''
 6 | assignees: LastShekel
 7 | 
 8 | ---
 9 | 
10 | **Describe the bug**
11 | A clear and concise description of what the bug is.
12 | 
13 | **To Reproduce**
14 | Steps to reproduce the behaviour:
15 | 1. Go to '...'
16 | 2. Click on '....'
17 | 3. Scroll down to '....'
18 | 4. See an error
19 | 
20 | **Expected behaviour**
21 | A clear and concise description of what you expected to happen.
22 | 
23 | **Screenshots**
24 | If applicable, add screenshots to help explain your problem.
25 | 
26 | **Desktop (please complete the following information):**
27 |  - OS: [e.g. iOS]
28 | 
29 | **Additional context**
30 | Add any other context about the problem here.
31 | 


--------------------------------------------------------------------------------
/.github/ISSUE_TEMPLATE/feature-request.md:
--------------------------------------------------------------------------------
 1 | ---
 2 | name: Feature request
 3 | about: Suggest an idea for this project
 4 | title: 'ADD : '
 5 | labels: ''
 6 | assignees: LastShekel
 7 | 
 8 | ---
 9 | 
10 | **Is your feature request related to a problem? Please describe.**
11 | A clear and concise description of what the problem is. Ex. I'm always frustrated when [...]
12 | 
13 | **Describe the solution you'd like**
14 | A clear and concise description of what you want to happen.
15 | 
16 | **Describe alternatives you've considered**
17 | A clear and concise description of any alternative solutions or features you've considered.
18 | 
19 | **Additional context**
20 | Add any other context or screenshots about the feature request here.
21 | 


--------------------------------------------------------------------------------
/.github/PULL_REQUEST_TEMPLATE.md:
--------------------------------------------------------------------------------
 1 | # Pull Request Template
 2 | 
 3 | ## Description
 4 | 
 5 | Please include a summary of the change and which issue is fixed. Please also include relevant motivation and context. List any dependencies that are required for this change.
 6 | 
 7 | Fixes # (issue)
 8 | 
 9 | ## Type of change
10 | 
11 | Please delete options that are not relevant.
12 | 
13 | - [ ] Bug fix (non-breaking change which fixes an issue)
14 | - [ ] New feature (non-breaking change which adds functionality)
15 | - [ ] New method (set a link to paper then)
16 | - [ ] Breaking change (fix or feature that would cause existing functionality to not work as expected)
17 | - [ ] This change requires a documentation update
18 | 
19 | ## How Has This Been Tested?
20 | 
21 | Please describe the tests or set link to that you ran to verify your changes. Provide instructions so we can reproduce. Please also list any relevant details for your test configuration
22 | 
23 | 
24 | 
25 | ## Checklist:
26 | 
27 | - [ ] My code follows the style guidelines of this project
28 | - [ ] I have performed a self-review of my own code
29 | - [ ] I have commented my code, particularly in hard-to-understand areas
30 | - [ ] I have made corresponding changes to the documentation
31 | - [ ] My changes generate no new warnings
32 | - [ ] I have added tests that prove my fix is effective or that my feature works
33 | - [ ] New and existing unit tests pass locally with my changes
34 | - [ ] Any dependent changes have been merged and published in downstream modules
35 | - [ ] I have checked my code and corrected any misspellings


--------------------------------------------------------------------------------
/.github/workflows/codeql-analysis.yml:
--------------------------------------------------------------------------------
 1 | # For most projects, this workflow file will not need changing; you simply need
 2 | # to commit it to your repository.
 3 | #
 4 | # You may wish to alter this file to override the set of languages analyzed,
 5 | # or to provide custom queries or build logic.
 6 | #
 7 | # ******** NOTE ********
 8 | # We have attempted to detect the languages in your repository. Please check
 9 | # the `language` matrix defined below to confirm you have the correct set of
10 | # supported CodeQL languages.
11 | #
12 | name: "CodeQL"
13 | 
14 | on:
15 |   push:
16 |     branches: [ master ]
17 |   pull_request:
18 |     # The branches below must be a subset of the branches above
19 |     branches: [ master ]
20 |   schedule:
21 |     - cron: '21 23 * * 1'
22 | 
23 | jobs:
24 |   analyze:
25 |     name: Analyze
26 |     runs-on: ubuntu-latest
27 |     permissions:
28 |       actions: read
29 |       contents: read
30 |       security-events: write
31 | 
32 |     strategy:
33 |       fail-fast: false
34 |       matrix:
35 |         language: [ 'python' ]
36 |         # CodeQL supports [ 'cpp', 'csharp', 'go', 'java', 'javascript', 'python' ]
37 |         # Learn more:
38 |         # https://docs.github.com/en/free-pro-team@latest/github/finding-security-vulnerabilities-and-errors-in-your-code/configuring-code-scanning#changing-the-languages-that-are-analyzed
39 | 
40 |     steps:
41 |     - name: Checkout repository
42 |       uses: actions/checkout@v2
43 | 
44 |     # Initializes the CodeQL tools for scanning.
45 |     - name: Initialize CodeQL
46 |       uses: github/codeql-action/init@v1
47 |       with:
48 |         languages: ${{ matrix.language }}
49 |         # If you wish to specify custom queries, you can do so here or in a config file.
50 |         # By default, queries listed here will override any specified in a config file.
51 |         # Prefix the list here with "+" to use these queries and those in the config file.
52 |         # queries: ./path/to/local/query, your-org/your-repo/queries@main
53 | 
54 |     # Autobuild attempts to build any compiled languages  (C/C++, C#, or Java).
55 |     # If this step fails, then you should remove it and run the build manually (see below)
56 |     - name: Autobuild
57 |       uses: github/codeql-action/autobuild@v1
58 | 
59 |     # ℹ️ Command-line programs to run using the OS shell.
60 |     # 📚 https://git.io/JvXDl
61 | 
62 |     # ✏️ If the Autobuild fails above, remove it and uncomment the following three lines
63 |     #    and modify them (or add more) to build your code if your project
64 |     #    uses a compiled language
65 | 
66 |     #- run: |
67 |     #   make bootstrap
68 |     #   make release
69 | 
70 |     - name: Perform CodeQL Analysis
71 |       uses: github/codeql-action/analyze@v1
72 | 


--------------------------------------------------------------------------------
/.github/workflows/pythonpublish.yml:
--------------------------------------------------------------------------------
 1 | name: Upload Python Package
 2 | 
 3 | on:
 4 |   release:
 5 |     types: [created]
 6 | 
 7 | jobs:
 8 |   deploy:
 9 |     runs-on: ubuntu-latest
10 |     steps:
11 |     - uses: actions/checkout@v1
12 |     - name: Set up Python
13 |       uses: actions/setup-python@v1
14 |       with:
15 |         python-version: '3.x'
16 |     - name: Install dependencies
17 |       run: |
18 |         python -m pip install --upgrade pip
19 |         pip install setuptools wheel twine
20 |     - name: Build and publish
21 |       env:
22 |         TWINE_USERNAME: ${{ secrets.PYPI_USERNAME }}
23 |         TWINE_PASSWORD: ${{ secrets.PYPI_PASSWORD }}
24 |       run: |
25 |         python setup.py sdist bdist_wheel
26 |         twine upload dist/*
27 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | .idea
2 | ITMO_FS.egg-info
3 | dist
4 | build
5 | common


--------------------------------------------------------------------------------
/CODE_OF_CONDUCT.md:
--------------------------------------------------------------------------------
 1 | # Contributor Covenant Code of Conduct
 2 | 
 3 | ## Our Pledge
 4 | 
 5 | In the interest of fostering an open and welcoming environment, we as
 6 | contributors and maintainers pledge to making participation in our project and
 7 | our community a harassment-free experience for everyone, regardless of age, body
 8 | size, disability, ethnicity, sex characteristics, gender identity and expression,
 9 | level of experience, education, socio-economic status, nationality, personal
10 | appearance, race, religion, or sexual identity and orientation.
11 | 
12 | ## Our Standards
13 | 
14 | Examples of behavior that contributes to creating a positive environment
15 | include:
16 | 
17 | * Using welcoming and inclusive language
18 | * Being respectful of differing viewpoints and experiences
19 | * Gracefully accepting constructive criticism
20 | * Focusing on what is best for the community
21 | * Showing empathy towards other community members
22 | 
23 | Examples of unacceptable behavior by participants include:
24 | 
25 | * The use of sexualized language or imagery and unwelcome sexual attention or
26 |  advances
27 | * Trolling, insulting/derogatory comments, and personal or political attacks
28 | * Public or private harassment
29 | * Publishing others' private information, such as a physical or electronic
30 |  address, without explicit permission
31 | * Other conduct which could reasonably be considered inappropriate in a
32 |  professional setting
33 | 
34 | ## Our Responsibilities
35 | 
36 | Project maintainers are responsible for clarifying the standards of acceptable
37 | behavior and are expected to take appropriate and fair corrective action in
38 | response to any instances of unacceptable behavior.
39 | 
40 | Project maintainers have the right and responsibility to remove, edit, or
41 | reject comments, commits, code, wiki edits, issues, and other contributions
42 | that are not aligned to this Code of Conduct, or to ban temporarily or
43 | permanently any contributor for other behaviors that they deem inappropriate,
44 | threatening, offensive, or harmful.
45 | 
46 | ## Scope
47 | 
48 | This Code of Conduct applies both within project spaces and in public spaces
49 | when an individual is representing the project or its community. Examples of
50 | representing a project or community include using an official project e-mail
51 | address, posting via an official social media account, or acting as an appointed
52 | representative at an online or offline event. Representation of a project may be
53 | further defined and clarified by project maintainers.
54 | 
55 | ## Enforcement
56 | 
57 | Instances of abusive, harassing, or otherwise unacceptable behavior may be
58 | reported by contacting the project team at SomaCruz@bk.ru. All
59 | complaints will be reviewed and investigated and will result in a response that
60 | is deemed necessary and appropriate to the circumstances. The project team is
61 | obligated to maintain confidentiality with regard to the reporter of an incident.
62 | Further details of specific enforcement policies may be posted separately.
63 | 
64 | Project maintainers who do not follow or enforce the Code of Conduct in good
65 | faith may face temporary or permanent repercussions as determined by other
66 | members of the project's leadership.
67 | 
68 | ## Attribution
69 | 
70 | This Code of Conduct is adapted from the [Contributor Covenant][homepage], version 1.4,
71 | available at https://www.contributor-covenant.org/version/1/4/code-of-conduct.html
72 | 
73 | [homepage]: https://www.contributor-covenant.org
74 | 
75 | For answers to common questions about this code of conduct, see
76 | https://www.contributor-covenant.org/faq
77 | 


--------------------------------------------------------------------------------
/ITMO_FS/__about__.py:
--------------------------------------------------------------------------------
1 | __all__ = ["__title__", "__uri__", "__version__"]
2 | 
3 | __title__ = "ITMO_FS"
4 | __uri__ = "https://github.com/ctlab/ITMO_FS"
5 | 
6 | __version__ = "0.3.5"
7 | 


--------------------------------------------------------------------------------
/ITMO_FS/__init__.py:
--------------------------------------------------------------------------------
1 | from .embedded import *
2 | from .ensembles import *
3 | from .filters import *
4 | from .hybrid import *
5 | from .wrappers import *
6 | 


--------------------------------------------------------------------------------
/ITMO_FS/embedded/MOS.py:
--------------------------------------------------------------------------------
  1 | from logging import getLogger
  2 | 
  3 | import numpy as np
  4 | from imblearn.over_sampling import SMOTE
  5 | from sklearn.base import clone
  6 | 
  7 | from ..utils import augmented_rvalue, BaseTransformer
  8 | 
  9 | 
 10 | class MOS(BaseTransformer):
 11 |     """Perform Minimizing Overlapping Selection under SMOTE (MOSS) or under
 12 |     No-Sampling (MOSNS) algorithm.
 13 | 
 14 |     Parameters
 15 |     ----------
 16 |     model : object
 17 |         The model that should have a fit(X, y) method and a field corresponding
 18 |         to feature weights. Currently only SGDClassifier should be passed,
 19 |         other models would not work.
 20 |     weight_func : callable
 21 |         The function to extract weights from the model.
 22 |     loss : str, 'log' or 'hinge'
 23 |         Loss function to use in the algorithm. 'log' gives a logistic
 24 |         regression, while 'hinge' gives a support vector machine.
 25 |     seed : int, optional
 26 |         Seed for python random.
 27 |     l1_ratio : float
 28 |         The value used to balance the L1 and L2 penalties in elastic-net.
 29 |     threshold : float
 30 |         The threshold value for feature dropout. Instead of comparing them to
 31 |         zero, they are normalized and values with absolute value lower than the
 32 |         threshold are dropped out.
 33 |     epochs : int
 34 |         The number of epochs to perform in the algorithm.
 35 |     alphas : array-like, shape (n_alphas,), optional
 36 |         The range of lambdas that should form the regularization path.
 37 |     sampling : bool
 38 |         Bool value that control whether MOSS (True) or MOSNS (False) should be
 39 |         executed.
 40 |     k_neighbors : int
 41 |         Amount of nearest neighbors to use in SMOTE if MOSS is used.
 42 | 
 43 |     Notes
 44 |     -----
 45 |     For more details see `this paper
 46 |     <https://www.sciencedirect.com/science/article/pii/S0169743919306070/>`_.
 47 | 
 48 |     Examples
 49 |     --------
 50 |     >>> from ITMO_FS.embedded import MOS
 51 |     >>> from sklearn.linear_model import SGDClassifier
 52 |     >>> import numpy as np
 53 |     >>> from sklearn.datasets import make_classification
 54 |     >>> from sklearn.linear_model import LogisticRegression
 55 |     >>> dataset = make_classification(n_samples=100, n_features=10,
 56 |     ... n_informative=5, n_redundant=0, weights=[0.85, 0.15], random_state=42,
 57 |     ... shuffle=False)
 58 |     >>> X, y = np.array(dataset[0]), np.array(dataset[1])
 59 |     >>> m = MOS(model=SGDClassifier(),
 60 |     ... weight_func=lambda model: np.square(model.coef_).sum(axis=0)).fit(X, y)
 61 |     >>> m.selected_features_
 62 |     array([1, 3, 4], dtype=int64)
 63 |     >>> m = MOS(model=SGDClassifier(), sampling=True,
 64 |     ... weight_func=lambda model: np.square(model.coef_).sum(axis=0)).fit(X, y)
 65 |     >>> m.selected_features_
 66 |     array([1, 3, 4, 6], dtype=int64)
 67 |     """
 68 |     def __init__(self, model, weight_func, loss='log', seed=42, l1_ratio=0.5,
 69 |                  threshold=1e-3, epochs=1000, alphas=np.arange(0.01, 0.2, 0.01),
 70 |                  sampling=False, k_neighbors=2):
 71 |         self.model = model
 72 |         self.weight_func = weight_func
 73 |         self.loss = loss
 74 |         self.seed = seed
 75 |         self.l1_ratio = l1_ratio
 76 |         self.threshold = threshold
 77 |         self.epochs = epochs
 78 |         self.alphas = alphas
 79 |         self.sampling = sampling
 80 |         self.k_neighbors = k_neighbors
 81 | 
 82 |     def _fit(self, X, y):
 83 |         """Run the MOS algorithm on the specified dataset.
 84 | 
 85 |         Parameters
 86 |         ----------
 87 |         X : array-like, shape (n_samples, n_features)
 88 |             The input samples.
 89 |         y : array-like, shape (n_samples,)
 90 |             The classes for the samples.
 91 | 
 92 |         Returns
 93 |         -------
 94 |         None
 95 |         """
 96 |         if self.loss not in ['hinge', 'log']:
 97 |             getLogger(__name__).error(
 98 |                 "Loss should be 'hinge' or 'log', %s was passed", self.loss)
 99 |             raise KeyError(
100 |                 "Loss should be 'hinge' or 'log', %s was passed" % self.loss)
101 | 
102 |         if self.sampling:
103 |             try:
104 |                 X, y = SMOTE(
105 |                     random_state=self.seed,
106 |                     k_neighbors=self.k_neighbors).fit_resample(X, y)
107 |             except ValueError:
108 |                 getLogger(__name__).warning(
109 |                     "Couldn't perform SMOTE because k_neighbors is bigger "
110 |                     "than amount of instances in one of the classes; MOSNS "
111 |                     "would be performed instead")
112 | 
113 |         min_rvalue = 1
114 |         min_b = []
115 |         model = clone(self.model)
116 |         for a in self.alphas:  # TODO: do a little more
117 |             # research on the range of lambdas
118 |             model = model.set_params(
119 |                 loss=self.loss, random_state=self.seed, penalty='elasticnet',
120 |                 alpha=a, l1_ratio=self.l1_ratio, max_iter=self.epochs)
121 |             model.fit(X, y)
122 |             b = self.weight_func(model)
123 |             rvalue = augmented_rvalue(
124 |                 X[:, np.flatnonzero(np.abs(b) > self.threshold)], y)
125 |             getLogger(__name__).info(
126 |                 "For alpha %f: rvalue = %f, weight vector = %s", a, rvalue, b)
127 |             if min_rvalue > rvalue:
128 |                 min_rvalue = rvalue
129 |                 min_b = b
130 |                 getLogger(__name__).info("New minimum rvalue: %f", rvalue)
131 |                 getLogger(__name__).info("New weight vector: %s", b)
132 |         self.selected_features_ = np.flatnonzero(np.abs(min_b) > self.threshold)
133 | 


--------------------------------------------------------------------------------
/ITMO_FS/embedded/__init__.py:
--------------------------------------------------------------------------------
1 | from .MOS import MOS
2 | 


--------------------------------------------------------------------------------
/ITMO_FS/ensembles/__init__.py:
--------------------------------------------------------------------------------
1 | from .measure_based import *
2 | from .model_based import *
3 | from .ranking_based import *
4 | 


--------------------------------------------------------------------------------
/ITMO_FS/ensembles/measure_based/WeightBased.py:
--------------------------------------------------------------------------------
  1 | from logging import getLogger
  2 | 
  3 | import numpy as np
  4 | from sklearn.base import clone
  5 | 
  6 | from .fusion_functions import *
  7 | from ...utils import BaseTransformer, apply_cr, check_filters
  8 | 
  9 | 
 10 | class WeightBased(BaseTransformer):
 11 |     """Weight-based filter ensemble. The ensemble first computes all filter
 12 |     scores for the dataset and then aggregates them using a selected fusion
 13 |     function.
 14 | 
 15 |     Parameters
 16 |     ----------
 17 |     filters : collection
 18 |         Collection of filter objects. Filters should have a fit(X, y) method
 19 |         and a feature_scores_ field that contains scores for all features.
 20 |     cutting_rule : string or callable
 21 |         A cutting rule name defined in GLOB_CR or a callable with signature
 22 |         cutting_rule (features), which should return a list features ranked by
 23 |         some rule.
 24 |     fusion_function : callable
 25 |         A function with signature (filter_scores (array-like, shape
 26 |         (n_filters, n_features)), weights (array-like, shape (n_filters,)))
 27 |         that should return the aggregated weights for all features.
 28 |     weights : array-like
 29 |         An array of shape (n_filters,) defining the weights for input filters.
 30 | 
 31 |     See Also
 32 |     --------
 33 | 
 34 |     Examples
 35 |     --------
 36 |     >>> from ITMO_FS.ensembles import WeightBased
 37 |     >>> from ITMO_FS.filters.univariate import UnivariateFilter
 38 |     >>> import numpy as np
 39 |     >>> filters = [UnivariateFilter('GiniIndex'),
 40 |     ... UnivariateFilter('FechnerCorr'),
 41 |     ... UnivariateFilter('SpearmanCorr'),
 42 |     ... UnivariateFilter('PearsonCorr')]
 43 |     >>> x = np.array([[3, 3, 3, 2, 2], [3, 3, 1, 2, 3], [1, 3, 5, 1, 1],
 44 |     ... [3, 1, 4, 3, 1], [3, 1, 2, 3, 1]])
 45 |     >>> y = np.array([1, 3, 2, 1, 2])
 46 |     >>> wb = WeightBased(filters, ("K best", 2)).fit(x, y)
 47 |     >>> wb.selected_features_
 48 |     array([4, 1], dtype=int64)
 49 |     """
 50 |     def __init__(self, filters, cutting_rule=("K best", 2),
 51 |                  fusion_function=weight_fusion, weights=None):
 52 |         self.filters = filters
 53 |         self.cutting_rule = cutting_rule
 54 |         self.fusion_function = fusion_function
 55 |         self.weights = weights
 56 | 
 57 |     def get_scores(self, X, y):
 58 |         """Return the normalized feature scores for all filters.
 59 | 
 60 |         Parameters
 61 |         ----------
 62 |         X : array-like, shape (n_samples, n_features)
 63 |             The training input samples.
 64 |         y : array-like, shape (n_samples,)
 65 |             The target values.
 66 | 
 67 |         Returns
 68 |         -------
 69 |         array-like, shape (n_filters, n_features) : feature scores
 70 |         """
 71 |         scores = np.vectorize(
 72 |             lambda f: clone(f).fit(X, y).feature_scores_,
 73 |             signature='()->(1)')(self.filters)
 74 |         getLogger(__name__).info("Scores for all filters: %s", scores)
 75 |         mins = np.min(scores, axis=1).reshape(-1, 1)
 76 |         maxs = np.max(scores, axis=1).reshape(-1, 1)
 77 |         return (scores - mins) / (maxs - mins)
 78 | 
 79 |     def __len__(self):
 80 |         """Return the number of filters used in the ensemble.
 81 | 
 82 |         Parameters
 83 |         ----------
 84 | 
 85 |         Returns
 86 |         -------
 87 |         int : number of filters
 88 |         """
 89 |         return len(self.filters)
 90 | 
 91 |     def _fit(self, X, y):
 92 |         """Fit the ensemble.
 93 | 
 94 |         Parameters
 95 |         ----------
 96 |         X : array-like, shape (n_samples, n_features)
 97 |             The training input samples.
 98 |         y : array-like, shape (n_samples,)
 99 |             The target values.
100 | 
101 |         Returns
102 |         -------
103 |         None
104 |         """
105 |         check_filters(self.filters)
106 |         getLogger(__name__).info(
107 |             "Running WeightBased with filters: %s", self.filters)
108 |         filter_scores = self.get_scores(X, y)
109 |         getLogger(__name__).info(
110 |             "Normalized scores for all filters: %s", filter_scores)
111 |         if self.weights is None:
112 |             weights = np.ones(len(self.filters)) / len(self.filters)
113 |         else:
114 |             weights = self.weights
115 |         getLogger(__name__).info("Weights vector: %s", weights)
116 |         self.feature_scores_ = self.fusion_function(filter_scores, weights)
117 |         getLogger(__name__).info("Feature scores: %s", self.feature_scores_)
118 |         self.selected_features_ = apply_cr(self.cutting_rule)(
119 |             self.feature_scores_)
120 | 


--------------------------------------------------------------------------------
/ITMO_FS/ensembles/measure_based/__init__.py:
--------------------------------------------------------------------------------
1 | from .WeightBased import *
2 | from .fusion_functions import *
3 | 


--------------------------------------------------------------------------------
/ITMO_FS/ensembles/measure_based/fusion_functions.py:
--------------------------------------------------------------------------------
 1 | from numpy import dot
 2 | 
 3 | 
 4 | def weight_fusion(filter_scores, weights):
 5 |     """Calculate the weighted score of each feature.
 6 | 
 7 |     Parameters
 8 |     ----------
 9 |     filter_scores : array-like, shape (n_filters, n_features)
10 |         Scores for all filters.
11 |     weights : array-like, shape (n_filters,)
12 |         Filter weights.
13 | 
14 |     Returns
15 |     -------
16 |     array-like, shape (n_features,) : feature scores
17 |     """
18 |     return filter_scores.T.dot(weights)
19 | 


--------------------------------------------------------------------------------
/ITMO_FS/ensembles/model_based/__init__.py:
--------------------------------------------------------------------------------
1 | from .best_sum import BestSum
2 | 


--------------------------------------------------------------------------------
/ITMO_FS/ensembles/model_based/best_sum.py:
--------------------------------------------------------------------------------
  1 | import numpy as np
  2 | from sklearn.base import clone
  3 | from sklearn.model_selection import cross_val_score
  4 | from logging import getLogger
  5 | from ...utils import BaseTransformer, apply_cr
  6 | 
  7 | 
  8 | class BestSum(BaseTransformer):
  9 |     """Best weighted sum ensemble. The ensemble fits the input models and
 10 |     computes the feature scores as the weighted sum of the models' feature
 11 |     scores and some performance metric (e.g. accuracy)
 12 | 
 13 |     Parameters
 14 |     ----------
 15 |     models : collection
 16 |         Collection of model objects. Models should have a fit(X, y) method and
 17 |         a field corresponding to feature weights.
 18 |     cutting_rule : string or callable
 19 |         A cutting rule name defined in GLOB_CR or a callable with signature
 20 |         cutting_rule (features), which should return a list features ranked by
 21 |         some rule.
 22 |     weight_func : callable
 23 |         The function to extract weights from the model.
 24 |     metric : string or callable
 25 |         A standard estimator metric (e.g. 'f1' or 'roc_auc') or a callable
 26 |         object / function with signature measure(estimator, X, y) which
 27 |         should return only a single value.
 28 |     cv : int
 29 |         Number of folds in cross-validation.
 30 | 
 31 |     See Also
 32 |     --------
 33 |     Jeon, H.; Oh, S. Hybrid-Recursive Feature Elimination for Efficient
 34 |     Feature Selection. Appl. Sci. 2020, 10, 3211.
 35 | 
 36 |         Examples
 37 |         --------
 38 |         >>> from ITMO_FS.ensembles import BestSum
 39 |         >>> from sklearn.svm import SVC
 40 |         >>> from sklearn.linear_model import LogisticRegression
 41 |         >>> from sklearn.linear_model import RidgeClassifier
 42 |         >>> import numpy as np
 43 |         >>> models = [SVC(kernel='linear'),
 44 |         ... LogisticRegression(),
 45 |         ... RidgeClassifier()]
 46 |         >>> x = np.array([[3, 3, 3, 2, 2],
 47 |         ...               [3, 3, 1, 2, 3],
 48 |         ...               [1, 3, 5, 1, 1],
 49 |         ...               [3, 1, 4, 3, 1],
 50 |         ...               [3, 1, 2, 3, 1]])
 51 |         >>> y = np.array([1, 2, 2, 1, 2])
 52 |         >>> bs = BestSum(models, ("K best", 2),
 53 |         ... lambda model: np.square(model.coef_).sum(axis=0), cv=2).fit(x, y)
 54 |         >>> bs.selected_features_
 55 |         array([0, 2], dtype=int64)
 56 |     """
 57 | 
 58 |     def __init__(self, models, cutting_rule, weight_func, metric='f1_micro',
 59 |                  cv=3):
 60 |         super().__init__()
 61 |         self.models = models
 62 |         self.cutting_rule = cutting_rule
 63 |         self.weight_func = weight_func
 64 |         self.metric = metric
 65 |         self.cv = cv
 66 | 
 67 |     def _fit(self, X, y):
 68 |         """
 69 |             Fits the ensemble.
 70 | 
 71 |             Parameters
 72 |             ----------
 73 |             X : array-like, shape (n_samples, n_features)
 74 |                 The training input samples.
 75 |             y : array-like, shape (n_samples, )
 76 |                 The target values.
 77 | 
 78 |         Returns
 79 |         -------
 80 |         None
 81 |         """
 82 | 
 83 |         def __get_weights(model):
 84 |             _model = clone(model).fit(X, y)
 85 |             weights = self.weight_func(_model)
 86 |             perf = cross_val_score(_model, X, y, cv=self.cv,
 87 |                                    scoring=self.metric).mean()
 88 |             return weights * perf
 89 | 
 90 |         if len(self.models) == 0:
 91 |             getLogger(__name__).error("No models are set")
 92 |             raise ValueError("No models are set")
 93 | 
 94 |         model_scores = np.vectorize(
 95 |             lambda model: __get_weights(model),
 96 |             signature='()->(1)')(self.models)
 97 |         getLogger(__name__).info("Weighted model scores: %s", model_scores)
 98 |         self.feature_scores_ = model_scores.sum(axis=0)
 99 |         getLogger(__name__).info("Feature scores: %s", self.feature_scores_)
100 |         self.selected_features_ = apply_cr(self.cutting_rule)(
101 |             self.feature_scores_)
102 | 


--------------------------------------------------------------------------------
/ITMO_FS/ensembles/ranking_based/Mixed.py:
--------------------------------------------------------------------------------
 1 | from logging import getLogger
 2 | 
 3 | import numpy as np
 4 | 
 5 | from .fusion_functions import *
 6 | from ...utils import BaseTransformer
 7 | 
 8 | 
 9 | class Mixed(BaseTransformer):
10 |     """Perform feature selection based on several filters, selecting features
11 |     this way:
12 |         Get ranks from every filter from input.
13 |         Then loops through, on every iteration=i
14 |             selects features on i position on every filter
15 |             then shuffles them, then adds to result list without
16 |             duplication,
17 |         continues until specified number of features
18 | 
19 |     Parameters
20 |     ----------
21 |     filters : collection
22 |         Collection of measure functions with signature measure(X, y) that
23 |         should return an array of importance values for each feature.
24 |     n_features : int
25 |         Amount of features to select.
26 |     fusion_function : callable
27 |         A function with signature (filter_ranks (array-like, shape
28 |         (n_filters, n_features), k (int)) that should return the indices of k
29 |         selected features based on the filter rankings.
30 | 
31 |     Examples
32 |     --------
33 |     >>> from ITMO_FS.filters.univariate.measures import *
34 |     >>> from ITMO_FS.ensembles.ranking_based.Mixed import Mixed
35 |     >>> import numpy as np
36 |     >>> x = np.array([[3, 3, 3, 2, 2], [3, 3, 1, 2, 3], [1, 3, 5, 1, 1],
37 |     ... [3, 1, 4, 3, 1], [3, 1, 2, 3, 1]])
38 |     >>> y = np.array([1, 3, 2, 1, 2])
39 |     >>> mixed = Mixed([gini_index, chi2_measure], 2).fit(x, y)
40 |     >>> mixed.selected_features_
41 |     array([2, 4], dtype=int64)
42 |     """
43 |     def __init__(self, filters, n_features,
44 |                  fusion_function=best_goes_first_fusion):
45 |         self.filters = filters
46 |         self.n_features = n_features
47 |         self.fusion_function = fusion_function
48 | 
49 |     def _fit(self, X, y):
50 |         """Fit the ensemble.
51 | 
52 |         Parameters
53 |         ----------
54 |         X : array-like, shape (n_samples, n_features)
55 |             The training input samples.
56 |         y : array-like, shape (n_samples,)
57 |             The target values.
58 | 
59 |         Returns
60 |         -------
61 |         None
62 |         """
63 |         #TODO: some measures are 'lower is better', a simple argsort would not
64 |         #work there - need to call a different ranking function
65 |         self.filter_ranks_ = np.vectorize(
66 |             lambda f: np.argsort(f(X, y))[::-1],
67 |             signature='()->(1)')(self.filters)
68 |         getLogger(__name__).info("Filter ranks: %s", self.filter_ranks_)
69 |         self.selected_features_ = self.fusion_function(
70 |             self.filter_ranks_, self.n_features)
71 | 


--------------------------------------------------------------------------------
/ITMO_FS/ensembles/ranking_based/__init__.py:
--------------------------------------------------------------------------------
1 | from .Mixed import *
2 | 


--------------------------------------------------------------------------------
/ITMO_FS/ensembles/ranking_based/fusion_functions.py:
--------------------------------------------------------------------------------
 1 | import random
 2 | import numpy as np
 3 | 
 4 | 
 5 | def best_goes_first_fusion(filter_ranks, k):
 6 |     """
 7 |         Fusion function mixes filter results according feature appearance in
 8 |         range of each filter. Selects first k of them.
 9 | 
10 |     Parameters
11 |     ----------
12 |     filter_ranks : array-like, shape (n_filters, n_features)
13 |         Feature ranking for all filters.
14 |     k : int
15 |         Amount of features to select.
16 | 
17 |     Returns
18 |     -------
19 |     array-like, shape (k,) : selected features
20 |     """
21 |     result = np.array([], dtype='int')
22 |     place = 0
23 |     while len(result) < k:
24 |         placed_features = np.setdiff1d(filter_ranks[:, place], result)
25 |         random.shuffle(placed_features)
26 |         result = np.append(result, placed_features)
27 |         place += 1
28 |     return result[:k]
29 | 
30 | 
31 | def borda_fusion(filter_ranks, k):
32 |     """Select features according to borda function.
33 | 
34 |     Parameters
35 |     ----------
36 |     filter_ranks : array-like, shape (n_filters, n_features)
37 |         Feature ranking for all filters.
38 |     k : int
39 |         Amount of features to select.
40 | 
41 |     Returns
42 |     -------
43 |     array-like, shape (k,) : selected features
44 |     """
45 |     n_features = filter_ranks.shape[1]
46 |     scores = np.zeros(n_features)
47 |     for f in filter_ranks:
48 |         scores[f] += np.arange(1, n_features + 1)
49 |     return np.argsort(scores)[:k]
50 | 


--------------------------------------------------------------------------------
/ITMO_FS/filters/__init__.py:
--------------------------------------------------------------------------------
1 | from .multivariate import *
2 | from .univariate import *
3 | from .unsupervised import *
4 | 


--------------------------------------------------------------------------------
/ITMO_FS/filters/multivariate/DISRwithMassive.py:
--------------------------------------------------------------------------------
  1 | from logging import getLogger
  2 | 
  3 | import numpy as np
  4 | from sklearn.metrics import pairwise_distances
  5 | 
  6 | from ...utils import BaseTransformer, generate_features
  7 | from ...utils.information_theory import (entropy, joint_entropy,
  8 |                                          mutual_information)
  9 | 
 10 | 
 11 | def _complementarity(x_i, x_j, y):
 12 |     return (entropy(x_i) + entropy(x_j) + entropy(y) - joint_entropy(x_i, x_j)
 13 |             - joint_entropy(x_i, y) - joint_entropy(x_j, y)
 14 |             + joint_entropy(x_i, x_j, y))
 15 | 
 16 | 
 17 | def _chained_information(x_i, x_j, y):
 18 |     return (mutual_information(x_i, y) + mutual_information(x_j, y)
 19 |             + _complementarity(x_i, x_j, y))
 20 | 
 21 | 
 22 | class DISRWithMassive(BaseTransformer):
 23 |     """Create DISR (Double Input Symmetric Relevance) feature selection filter
 24 |     based on kASSI criterin for feature selection which aims at maximizing the
 25 |     mutual information avoiding, meanwhile, large multivariate density
 26 |     estimation. Its a kASSI criterion with approximation of the information of
 27 |     a set of variables by counting average information of subset on combination
 28 |     of two features. This formulation thus deals with feature complementarity
 29 |     up to order two by preserving the same computational complexity of the
 30 |     MRMR and CMIM criteria The DISR calculation is done using graph based
 31 |     solution.
 32 | 
 33 |     Parameters
 34 |     ----------
 35 |     n_features : int
 36 |         Number of features to select.
 37 | 
 38 |     Notes
 39 |     -----
 40 |     For more details see `this paper
 41 |     <http://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.318.6576&rep=rep1&type=pdf/>`_.
 42 | 
 43 |     Examples
 44 |     --------
 45 |     >>> from ITMO_FS.filters.multivariate import DISRWithMassive
 46 |     >>> import numpy as np
 47 |     >>> X = np.array([[1, 2, 3, 3, 1], [2, 2, 3, 3, 2], [1, 3, 3, 1, 3],
 48 |     ... [3, 1, 3, 1, 4], [4, 4, 3, 1, 5]])
 49 |     >>> y = np.array([1, 2, 3, 4, 5])
 50 |     >>> disr = DISRWithMassive(3).fit(X, y)
 51 |     >>> disr.selected_features_
 52 |     array([0, 1, 4], dtype=int64)
 53 |     """
 54 |     def __init__(self, n_features):
 55 |         self.n_features = n_features
 56 | 
 57 |     def _fit(self, x, y):
 58 |         """Fit the filter.
 59 | 
 60 |         Parameters
 61 |         ----------
 62 |         x : array-like, shape (n_samples, n_features)
 63 |             The training input samples.
 64 |         y : array-like, shape (n_samples,)
 65 |             The target values.
 66 | 
 67 |         Returns
 68 |         -------
 69 |         None
 70 |         """
 71 |         free_features = np.array([], dtype='int')
 72 |         self.selected_features_ = generate_features(x)
 73 |         self._edges = pairwise_distances(
 74 |             x.T, x.T, lambda xi, xj: (_chained_information(xi, xj, y)
 75 |                                       / (joint_entropy(xi, xj) + 1e-15)))
 76 |         np.fill_diagonal(self._edges, 0)
 77 |         getLogger(__name__).info("Graph weights: %s", self._edges)
 78 |     
 79 |         while len(self.selected_features_) != self.n_features:
 80 |             min_index = np.argmin(
 81 |                 np.sum(self._edges[np.ix_(self.selected_features_,
 82 |                                           self.selected_features_)], axis=0))
 83 |             getLogger(__name__).info(
 84 |                 "Removing feature %d from selected set",
 85 |                 self.selected_features_[min_index])
 86 |             free_features = np.append(
 87 |                 free_features, self.selected_features_[min_index])
 88 |             self.selected_features_ = np.delete(
 89 |                 self.selected_features_, min_index)
 90 | 
 91 |         getLogger(__name__).info(
 92 |             "Selected set: %s, free set: %s", self.selected_features_,
 93 |             free_features)
 94 | 
 95 |         while True:
 96 |             selected_weights = np.sum(
 97 |                 self._edges[np.ix_(self.selected_features_,
 98 |                                    self.selected_features_)], axis=0)
 99 |             getLogger(__name__).info(
100 |                 "Graph of selected set: %s", selected_weights)
101 | 
102 |             free_weights = np.sum(self._edges[np.ix_(self.selected_features_,
103 |                                                      free_features)], axis=0)
104 |             getLogger(__name__).info(
105 |                 "Free weights that would be added: %s", free_weights)
106 | 
107 |             difference = (
108 |                 free_weights.reshape(-1, 1)
109 |                 - self._edges[np.ix_(free_features, self.selected_features_)]
110 |                 - selected_weights)
111 |             getLogger(__name__).info("Difference matrix: %s", difference)
112 | 
113 |             if np.all(difference <= 0):
114 |                 getLogger(__name__).info(
115 |                     "All differences are non-positive, terminating")
116 |                 break
117 |             index_add, index_del = np.unravel_index(
118 |                 np.argmax(difference), difference.shape)
119 |             getLogger(__name__).info(
120 |                 "Maximum difference found at index (%d, %d), swapping those "
121 |                 "features", index_add, index_del)
122 | 
123 |             self.selected_features_[index_del], free_features[index_add] = (
124 |                 free_features[index_add], self.selected_features_[index_del])
125 | 


--------------------------------------------------------------------------------
/ITMO_FS/filters/multivariate/FCBF.py:
--------------------------------------------------------------------------------
 1 | from logging import getLogger
 2 | 
 3 | import numpy as np
 4 | 
 5 | from ...utils import BaseTransformer, generate_features
 6 | from ...utils.information_theory import entropy, conditional_entropy
 7 | 
 8 | 
 9 | class FCBFDiscreteFilter(BaseTransformer):
10 |     """Create FCBF (Fast Correlation Based filter) feature selection filter
11 |     based on mutual information criteria for data with discrete features. This
12 |     filter finds best set of features by searching for a feature, which
13 |     provides the most information about classification problem on given dataset
14 |     at each step and then eliminating features which are less relevant than
15 |     redundant.
16 | 
17 |     Parameters
18 |     ----------
19 |     delta : float
20 |         Symmetric uncertainty value threshold.
21 | 
22 |     Notes
23 |     -----
24 |     For more details see `this paper
25 |     <https://www.aaai.org/Papers/ICML/2003/ICML03-111.pdf/>`_.
26 | 
27 |     Examples
28 |     --------
29 |     >>> from ITMO_FS.filters.multivariate import FCBFDiscreteFilter
30 |     >>> import numpy as np
31 |     >>> X = np.array([[1, 2, 3, 3, 1], [2, 2, 3, 3, 2], [1, 3, 3, 1, 3],
32 |     ... [3, 1, 3, 1, 4], [4, 4, 3, 1, 5]])
33 |     >>> y = np.array([1, 2, 3, 4, 5])
34 |     >>> fcbf = FCBFDiscreteFilter().fit(X, y)
35 |     >>> fcbf.selected_features_
36 |     array([4], dtype=int64)
37 |     """
38 |     def __init__(self, delta=0.1):
39 |         self.delta = delta
40 | 
41 |     def _fit(self, x, y):
42 |         """Fit the filter.
43 | 
44 |         Parameters
45 |         ----------
46 |         x : array-like, shape (n_samples, n_features)
47 |             The training input samples.
48 |         y : array-like, shape (n_samples,)
49 |             The target values.
50 | 
51 |         Returns
52 |         -------
53 |         None
54 |         """
55 |         def __SU(x, y, entropy_y):
56 |             entropy_x = entropy(x)
57 |             return 2 * ((entropy_x - conditional_entropy(y, x))
58 |                         / (entropy_x + entropy_y))
59 | 
60 |         free_features = generate_features(x)
61 |         self.selected_features_ = np.array([], dtype='int')
62 |         entropy_y = entropy(y)
63 |         getLogger(__name__).info("Entropy of y: %f", entropy_y)
64 | 
65 |         su_class = np.apply_along_axis(__SU, 0, x, y, entropy_y)
66 |         getLogger(__name__).info("SU values against y: %s", su_class)
67 |         self.selected_features_ = np.argsort(su_class)[::-1][:
68 |             np.count_nonzero(su_class > self.delta)]
69 |         getLogger(__name__).info("Selected set: %s", self.selected_features_)
70 | 
71 |         index = 1
72 |         while index < self.selected_features_.shape[0]:
73 |             feature = self.selected_features_[index - 1]
74 |             getLogger(__name__).info("Leading feature: %d", feature)
75 |             entropy_feature = entropy(x[:, feature])
76 |             getLogger(__name__).info(
77 |                 "Leading feature entropy: %f", entropy_feature)
78 |             su_classes = su_class[self.selected_features_[index:]]
79 |             getLogger(__name__).info(
80 |                 "SU values against y for the remaining features: %s",
81 |                 su_classes)
82 |             su_feature = np.apply_along_axis(
83 |                 __SU, 0, x[:, self.selected_features_[index:]], x[:, feature],
84 |                 entropy_feature)
85 |             getLogger(__name__).info(
86 |                 "SU values against leading feature for the remaining features: "
87 |                 "%s", su_feature)
88 |             to_delete = np.flatnonzero(su_feature >= su_classes) + index
89 |             getLogger(__name__).info(
90 |                 "Deleting those features from the selected set: %s",
91 |                 self.selected_features_[to_delete])
92 |             self.selected_features_ = np.delete(
93 |                 self.selected_features_, to_delete)
94 |             index += 1
95 | 


--------------------------------------------------------------------------------
/ITMO_FS/filters/multivariate/MultivariateFilter.py:
--------------------------------------------------------------------------------
  1 | from logging import getLogger
  2 | 
  3 | import numpy as np
  4 | from sklearn.base import TransformerMixin
  5 | 
  6 | from .measures import (MEASURE_NAMES, mutual_information,
  7 |                        matrix_mutual_information)
  8 | from ...utils import BaseTransformer, generate_features
  9 | 
 10 | 
 11 | class MultivariateFilter(BaseTransformer):
 12 |     """Provides basic functionality for multivariate filters.
 13 | 
 14 |     Parameters
 15 |     ----------
 16 |     measure : string or callable
 17 |         A metric name defined in GLOB_MEASURE or a callable with signature
 18 |         measure(selected_features, free_features, dataset, labels) which
 19 |         should return a list of metric values for each feature in the dataset.
 20 |     n_features : int
 21 |         Number of features to select.
 22 |     beta : float, optional
 23 |         Initialize only in case you run MIFS or generalizedCriteria metrics.
 24 |     gamma : float, optional
 25 |         Initialize only in case you run generalizedCriteria metric.
 26 | 
 27 |     See Also
 28 |     --------
 29 | 
 30 |     Examples
 31 |     --------
 32 |     >>> from ITMO_FS.filters.multivariate import MultivariateFilter
 33 |     >>> from sklearn.preprocessing import KBinsDiscretizer
 34 |     >>> import numpy as np
 35 |     >>> est = KBinsDiscretizer(n_bins=10, encode='ordinal')
 36 |     >>> x = np.array([[1, 2, 3, 3, 1], [2, 2, 3, 3, 2], [1, 3, 3, 1, 3],
 37 |     ... [3, 1, 3, 1, 4], [4, 4, 3, 1, 5]])
 38 |     >>> y = np.array([1, 2, 3, 4, 5])
 39 |     >>> data = est.fit_transform(x)
 40 |     >>> model = MultivariateFilter('JMI', 3).fit(x, y)
 41 |     >>> model.selected_features_
 42 |     array([4, 0, 1], dtype=int64)
 43 |     """
 44 |     def __init__(self, measure, n_features, beta=None, gamma=None):
 45 |         self.measure = measure
 46 |         self.n_features = n_features
 47 |         self.beta = beta
 48 |         self.gamma = gamma
 49 | 
 50 |     def _fit(self, X, y, **kwargs):
 51 |         """Fit the filter.
 52 | 
 53 |         Parameters
 54 |         ----------
 55 |         X : array-like, shape (n_samples, n_features)
 56 |             The training input samples.
 57 |         y : array-like, shape (n_samples,)
 58 |             The target values.
 59 |         **kwargs
 60 | 
 61 |         Returns
 62 |         -------
 63 |         None
 64 |         """
 65 |         if isinstance(self.measure, str):
 66 |             try:
 67 |                 measure = MEASURE_NAMES[self.measure]
 68 |             except KeyError:
 69 |                 getLogger(__name__).error("No %r measure yet", self.measure)
 70 |                 raise KeyError("No %r measure yet" % self.measure)
 71 | 
 72 |         getLogger(__name__).info(
 73 |             "Using MultivariateFilter with %s measure", measure)
 74 |         free_features = generate_features(X)
 75 |         self.selected_features_ = np.array([], dtype='int')
 76 | 
 77 |         relevance = np.apply_along_axis(
 78 |             mutual_information, 0, X[:, free_features], y)
 79 |         getLogger(__name__).info("Relevance vector: %s", relevance)
 80 | 
 81 |         redundancy = np.vectorize(
 82 |             lambda free_feature: matrix_mutual_information(
 83 |                 X[:, free_features], X[:, free_feature]),
 84 |             signature='()->(1)')(free_features)
 85 |         getLogger(__name__).info("Redundancy vector: %s", redundancy)
 86 | 
 87 |         while len(self.selected_features_) != self.n_features:
 88 |             if self.beta is None:
 89 |                 values = measure(
 90 |                     self.selected_features_, free_features, X, y,
 91 |                     relevance=relevance[free_features],
 92 |                     redundancy=np.sum(
 93 |                         redundancy[self.selected_features_],
 94 |                         axis=0)[free_features])
 95 |             else:
 96 |                 if self.gamma is not None:
 97 |                     values = measure(
 98 |                         self.selected_features_, free_features, X, y, self.beta,
 99 |                         self.gamma, relevance=relevance[free_features],
100 |                         redundancy=np.sum(
101 |                             redundancy[self.selected_features_],
102 |                             axis=0)[free_features])
103 |                 else:
104 |                     values = measure(
105 |                         self.selected_features_,free_features, X, y, self.beta,
106 |                         relevance=relevance[free_features],
107 |                         redundancy=np.sum(
108 |                             redundancy[self.selected_features_],
109 |                             axis=0)[free_features])
110 | 
111 |             getLogger(__name__).info("Free features: %s", free_features)
112 |             getLogger(__name__).info("Measure values: %s", values)
113 |             to_add = np.argmax(values)
114 |             getLogger(__name__).info(
115 |                 "Adding feature %d to the selected set", free_features[to_add])
116 |             self.selected_features_ = np.append(
117 |                 self.selected_features_, free_features[to_add])
118 |             free_features = np.delete(free_features, to_add)
119 | 


--------------------------------------------------------------------------------
/ITMO_FS/filters/multivariate/STIR.py:
--------------------------------------------------------------------------------
  1 | from logging import getLogger
  2 | 
  3 | import numpy as np
  4 | from sklearn.metrics import pairwise_distances
  5 | from sklearn.preprocessing import MinMaxScaler
  6 | 
  7 | from ...utils import knn_from_class, BaseTransformer
  8 | 
  9 | 
 10 | class STIR(BaseTransformer):
 11 |     """Feature selection using STIR algorithm.
 12 | 
 13 |     Parameters
 14 |     ----------
 15 |     n_features : int
 16 |         Number of features to select.
 17 |     metric : str or callable
 18 |         Distance metric to use in kNN. If str, should be one of the standard
 19 |         distance metrics (e.g. 'euclidean' or 'manhattan'). If callable, should
 20 |         have the signature metric(x1 (array-like, shape (n,)), x2 (array-like,
 21 |         shape (n,))) that should return the distance between two vectors.
 22 |     k : int
 23 |         Number of constant nearest hits/misses.
 24 | 
 25 |     Notes
 26 |     -----
 27 |     For more details see `this paper <https://academic.oup.com/bioinformatics/article/35/8/1358/5100883>`_.
 28 | 
 29 |     Examples
 30 |     --------
 31 |     >>> from ITMO_FS.filters.multivariate import STIR
 32 |     >>> import numpy as np
 33 |     >>> X = np.array([[3, 3, 3, 2, 2], [3, 3, 1, 2, 3], [1, 3, 5, 1, 1],
 34 |     ... [3, 1, 4, 3, 1], [3, 1, 2, 3, 1]])
 35 |     >>> y = np.array([1, 2, 2, 1, 2])
 36 |     >>> model = STIR(2).fit(X, y)
 37 |     >>> model.selected_features_
 38 |     array([2, 0], dtype=int64)
 39 |     """
 40 |     def __init__(self, n_features, metric='manhattan', k=1):
 41 |         self.n_features = n_features
 42 |         self.metric = metric
 43 |         self.k = k
 44 | 
 45 |     def _fit(self, X, y):
 46 |         """Fit the filter.
 47 | 
 48 |         Parameters
 49 |         ----------
 50 |         X : array-like, shape (n_samples, n_features)
 51 |             The input samples.
 52 |         y : array-like, shape (n_samples,)
 53 |             The classes for the samples.
 54 | 
 55 |         Returns
 56 |         -------
 57 |         None
 58 |         """
 59 |         n_samples = X.shape[0]
 60 |         classes, counts = np.unique(y, return_counts=True)
 61 | 
 62 |         if np.any(counts <= self.k):
 63 |             getLogger(__name__).error(
 64 |                 "Cannot select %d nearest neighbors because one of the classes "
 65 |                 "has less than %d samples", self.k, self.k + 1)
 66 |             raise ValueError(
 67 |                 "Cannot select %d nearest neighbors because one of the classes "
 68 |                 "has less than %d samples" % (self.k, self.k + 1))
 69 | 
 70 |         x_normalized = MinMaxScaler().fit_transform(X)
 71 |         dm = pairwise_distances(x_normalized, x_normalized, self.metric)
 72 |         getLogger(__name__).info("Distance matrix: %s", dm)
 73 | 
 74 |         indices = np.arange(n_samples)
 75 |         hits_diffs = np.abs(
 76 |             np.vectorize(
 77 |                 lambda index: (
 78 |                     x_normalized[index]
 79 |                     - x_normalized[knn_from_class(
 80 |                         dm, y, index, self.k, y[index])]),
 81 |                 signature='()->(n,m)')(indices))
 82 |         getLogger(__name__).info("Hit differences matrix: %s", hits_diffs)
 83 |         misses_diffs = np.abs(
 84 |             np.vectorize(
 85 |                 lambda index: (
 86 |                     x_normalized[index]
 87 |                     - x_normalized[knn_from_class(
 88 |                         dm, y, index, self.k, y[index], anyOtherClass=True)]),
 89 |                 signature='()->(n,m)')(indices))
 90 |         getLogger(__name__).info("Miss differences matrix: %s", misses_diffs)
 91 | 
 92 |         H = np.mean(hits_diffs, axis=(0,1))
 93 |         getLogger(__name__).info("H: %s", H)
 94 |         M = np.mean(misses_diffs, axis=(0,1))
 95 |         getLogger(__name__).info("M: %s", M)
 96 |         var_H = np.var(hits_diffs, axis=(0,1))
 97 |         var_M = np.var(misses_diffs, axis=(0,1))
 98 | 
 99 |         # the 1 / (1 / |M| + 1 / |H|) ^ (1/2) multiplier is constant, we omit it
100 |         self.feature_scores_ = (
101 |             (M - H) * np.sqrt(2 * self.k * n_samples - 2)
102 |             / (np.sqrt((self.k * n_samples - 1) * (var_H + var_M)) + 1e-15))
103 |         getLogger(__name__).info("Feature scores: %s", self.feature_scores_)
104 |         self.selected_features_ = np.argsort(self.feature_scores_)[::-1][
105 |             :self.n_features]
106 | 
107 | 


--------------------------------------------------------------------------------
/ITMO_FS/filters/multivariate/TraceRatioFisher.py:
--------------------------------------------------------------------------------
 1 | from logging import getLogger
 2 | 
 3 | import numpy as np
 4 | from sklearn.metrics.pairwise import pairwise_distances
 5 | 
 6 | from ...utils import BaseTransformer, generate_features
 7 | 
 8 | class TraceRatioFisher(BaseTransformer):
 9 |     """Creates TraceRatio(similarity based) feature selection filter
10 |     performed in supervised way, i.e. fisher version
11 | 
12 |     Parameters
13 |     ----------
14 |     n_features : int
15 |         Number of features to select.
16 |     epsilon : float
17 |         Lambda change threshold.
18 | 
19 |     Notes
20 |     -----
21 |     For more details see `this paper
22 |     <https://www.aaai.org/Papers/AAAI/2008/AAAI08-107.pdf/>`_.
23 | 
24 |     Examples
25 |     --------
26 |     >>> from ITMO_FS.filters.multivariate import TraceRatioFisher
27 |     >>> x = np.array([[1, 2, 3, 3, 1], [2, 2, 3, 3, 2], [1, 3, 3, 1, 3],
28 |     ... [3, 1, 3, 1, 4], [4, 4, 3, 1, 5]])
29 |     >>> y = np.array([1, 2, 1, 1, 2])
30 |     >>> tracer = TraceRatioFisher(3).fit(x, y)
31 |     >>> tracer.selected_features_
32 |     array([0, 1, 3], dtype=int64)
33 |     """
34 |     def __init__(self, n_features, epsilon=1e-3):
35 |         self.n_features = n_features
36 |         self.epsilon = epsilon
37 | 
38 |     def _fit(self, X, y):
39 |         """Fit the filter.
40 | 
41 |         Parameters
42 |         ----------
43 |         X : array-like, shape (n_samples, n_features)
44 |             The training input samples
45 |         y : array-like, shape (n_samples,)
46 |             The target values
47 | 
48 |         Returns
49 |         -------
50 |         None
51 |         """
52 |         n_samples = X.shape[0]
53 |         classes, counts = np.unique(y, return_counts=True)
54 |         counts_d = {cl: counts[idx] for idx, cl in enumerate(classes)}
55 |         getLogger(__name__).info("Class counts: %s", counts_d)
56 | 
57 |         A_within = pairwise_distances(
58 |             y.reshape(-1, 1), metric=lambda x, y: (
59 |                 (x[0] == y[0]) / counts_d[x[0]]))
60 |         L_within = np.eye(n_samples) - A_within
61 |         getLogger(__name__).info("A_w: %s", A_within)
62 |         getLogger(__name__).info("L_w: %s", L_within)
63 | 
64 |         L_between = A_within - np.ones((n_samples, n_samples)) / n_samples
65 |         getLogger(__name__).info("L_b: %s", L_between)
66 | 
67 |         E = X.T.dot(L_within).dot(X)
68 |         B = X.T.dot(L_between).dot(X)
69 | 
70 |         # we need only diagonal elements for trace calculation
71 |         e = np.array(np.diag(E))
72 |         b = np.array(np.diag(B))
73 |         getLogger(__name__).info("E: %s", e)
74 |         getLogger(__name__).info("B: %s", b)
75 |         lam = 0
76 |         prev_lam = -1
77 |         while (lam - prev_lam >= self.epsilon):  # TODO: optimize
78 |             score = b - lam * e
79 |             getLogger(__name__).info("Score: %s", score)
80 |             self.selected_features_ = np.argsort(score)[::-1][:self.n_features]
81 |             getLogger(__name__).info(
82 |                 "New selected set: %s", self.selected_features_)
83 |             prev_lam = lam
84 |             lam = (np.sum(b[self.selected_features_])
85 |                    / np.sum(e[self.selected_features_]))
86 |             getLogger(__name__).info("New lambda: %d", lam)
87 |         self.score_ = score
88 |         self.lam_ = lam
89 | 


--------------------------------------------------------------------------------
/ITMO_FS/filters/multivariate/__init__.py:
--------------------------------------------------------------------------------
1 | from .DISRwithMassive import *
2 | from .FCBF import *
3 | from .MultivariateFilter import MultivariateFilter
4 | from .measures import *
5 | from .TraceRatioFisher import TraceRatioFisher
6 | from .STIR import STIR
7 | from .mimaga import MIMAGA
8 | 


--------------------------------------------------------------------------------
/ITMO_FS/filters/univariate/NDFS.py:
--------------------------------------------------------------------------------
  1 | from logging import getLogger
  2 | 
  3 | import numpy as np
  4 | from sklearn.cluster import KMeans
  5 | from sklearn.metrics.pairwise import pairwise_distances
  6 | from sklearn.neighbors import NearestNeighbors
  7 | from sklearn.preprocessing import OneHotEncoder
  8 | 
  9 | from ...utils import l21_norm, matrix_norm, power_neg_half, BaseTransformer
 10 | 
 11 | 
 12 | class NDFS(BaseTransformer):
 13 |     """Nonnegative Discriminative Feature Selection algorithm.
 14 | 
 15 |     Parameters
 16 |     ----------
 17 |     n_features : int
 18 |         Number of features to select.
 19 |     c : int
 20 |         Amount of clusters to find.
 21 |     k : int
 22 |         Amount of nearest neighbors to use while building the graph.
 23 |     alpha : float
 24 |         Parameter in the objective function.
 25 |     beta : float
 26 |         Regularization parameter in the objective function.
 27 |     gamma : float
 28 |         Parameter in the objective function that controls the orthogonality
 29 |         condition.
 30 |     sigma : float
 31 |         Parameter for the weighting scheme.
 32 |     max_iterations : int
 33 |         Maximum amount of iterations to perform.
 34 |     epsilon : positive float
 35 |         Specifies the needed residual between the target functions from
 36 |         consecutive iterations. If the residual is smaller than epsilon, the
 37 |         algorithm is considered to have converged.
 38 | 
 39 |     See Also
 40 |     --------
 41 |     http://www.nlpr.ia.ac.cn/2012papers/gjhy/gh27.pdf
 42 | 
 43 |     Examples
 44 |     --------
 45 |     >>> from ITMO_FS.filters.univariate import NDFS
 46 |     >>> import numpy as np
 47 |     >>> X = np.array([[1, 2, 3, 3, 1], [2, 2, 3, 3, 2], [1, 3, 3, 1, 3],
 48 |     ... [1, 1, 3, 1, 4], [2, 4, 3, 1, 5]])
 49 |     >>> y = np.array([1, 2, 1, 1, 2])
 50 |     >>> model = NDFS(3).fit(X, y)
 51 |     >>> model.selected_features_
 52 |     array([0, 3, 4], dtype=int64)
 53 |     >>> model = NDFS(3).fit(X)
 54 |     >>> model.selected_features_
 55 |     array([3, 4, 1], dtype=int64)
 56 |     """
 57 |     def __init__(self, n_features, c=2, k=3, alpha=1, beta=1, gamma=10e8,
 58 |                  sigma=1, max_iterations=1000, epsilon=1e-5):
 59 |         self.n_features = n_features
 60 |         self.c = c
 61 |         self.k = k
 62 |         self.alpha = alpha
 63 |         self.beta = beta
 64 |         self.gamma = gamma
 65 |         self.sigma = sigma
 66 |         self.max_iterations = max_iterations
 67 |         self.epsilon = epsilon
 68 | 
 69 |     def __scheme(self, x1, x2):
 70 |         return np.exp(-np.linalg.norm(x1 - x2) ** 2 / (self.sigma ** 2))
 71 | 
 72 |     def _fit(self, X, y, **kwargs):
 73 |         """Fit the filter.
 74 | 
 75 |         Parameters
 76 |         ----------
 77 |         X : array-like, shape (n_samples, n_features)
 78 |             The training input samples.
 79 |         y : array-like, shape (n_samples,) or (n_samples, n_classes)
 80 |             The target values or their one-hot encoding that are used to
 81 |             compute F. If not present, a k-means clusterization algorithm
 82 |             is used. If present, n_classes should be equal to c.
 83 | 
 84 |         Returns
 85 |         -------
 86 |         None
 87 |         """
 88 |         n_samples = X.shape[0]
 89 | 
 90 |         if self.k >= n_samples:
 91 |             getLogger(__name__).error(
 92 |                 "Cannot select %d nearest neighbors with n_samples = %d",
 93 |                 self.k, n_samples)
 94 |             raise ValueError(
 95 |                 "Cannot select %d nearest neighbors with n_samples = %d"
 96 |                 % (self.k, n_samples))
 97 | 
 98 |         graph = NearestNeighbors(
 99 |             n_neighbors=self.k,
100 |             algorithm='ball_tree').fit(X).kneighbors_graph().toarray()
101 |         graph = np.minimum(1, graph + graph.T)
102 |         getLogger(__name__).info("Nearest neighbors graph: %s", graph)
103 | 
104 |         S = graph * pairwise_distances(
105 |             X, metric=lambda x, y: self.__scheme(x, y))
106 |         getLogger(__name__).info("S: %s", S)
107 |         A = np.diag(S.sum(axis=0))
108 |         getLogger(__name__).info("A: %s", A)
109 |         L = power_neg_half(A).dot(A - S).dot(power_neg_half(A))
110 |         getLogger(__name__).info("L: %s", L)
111 | 
112 |         if y is not None:
113 |             if len(y.shape) == 2:
114 |                 Y = y
115 |             else:
116 |                 Y = OneHotEncoder().fit_transform(y.reshape(-1, 1)).toarray()
117 |         else:
118 |             if self.c > n_samples:
119 |                 getLogger(__name__).error(
120 |                     "Cannot find %d clusters with n_samples = %d", self.c,
121 |                     n_samples)
122 |                 raise ValueError(
123 |                     "Cannot find %d clusters with n_samples = %d"
124 |                     % (self.c, n_samples))
125 |             Y = self.__run_kmeans(X)
126 |         getLogger(__name__).info("Transformed Y: %s", Y)
127 |         F = Y.dot(power_neg_half(Y.T.dot(Y)))
128 |         getLogger(__name__).info("F: %s", F)
129 |         D = np.eye(self.n_features_)
130 |         In = np.eye(n_samples)
131 |         Ic = np.eye(Y.shape[1])
132 | 
133 |         previous_target = -1
134 |         for _ in range(self.max_iterations):
135 |             M = (L + self.alpha
136 |                 * (In - X.dot(
137 |                     np.linalg.inv(X.T.dot(X) + self.beta * D)).dot(X.T)))
138 |             getLogger(__name__).info("M: %s", M)
139 |             F = (F * ((self.gamma * F)
140 |                        / (M.dot(F) + self.gamma * F.dot(F.T).dot(F))))
141 |             getLogger(__name__).info("F: %s", F)
142 |             W = np.linalg.inv(X.T.dot(X) + self.beta * D).dot(X.T.dot(F))
143 |             getLogger(__name__).info("W: %s", W)
144 |             diag = 2 * matrix_norm(W)
145 |             diag[diag < 1e-10] = 1e-10  # prevents division by zero
146 |             D = np.diag(1 / diag)
147 |             getLogger(__name__).info("D: %s", D)
148 | 
149 |             target = (np.trace(F.T.dot(L).dot(F))
150 |                 + self.alpha * (np.linalg.norm(X.dot(W) - F) ** 2
151 |                     + self.beta * l21_norm(W))
152 |                 + self.gamma * (np.linalg.norm(F.T.dot(F) - Ic) ** 2) / 2)
153 |             getLogger(__name__).info("New target value: %d", target)
154 |             if abs(target - previous_target) < self.epsilon:
155 |                 break
156 |             previous_target = target
157 | 
158 |         getLogger(__name__).info("Ended up with W: %s", W)
159 |         self.feature_scores_ = matrix_norm(W)
160 |         getLogger(__name__).info("Feature scores: %s", self.feature_scores_)
161 |         ranking = np.argsort(self.feature_scores_)[::-1]
162 |         self.selected_features_ = ranking[:self.n_features]
163 | 
164 |     def __run_kmeans(self, X):
165 |         kmeans = KMeans(n_clusters=self.c, copy_x=True)
166 |         kmeans.fit(X)
167 |         labels = kmeans.labels_
168 |         getLogger(__name__).info("Labels from KMeans: %s", labels)
169 |         return OneHotEncoder().fit_transform(labels.reshape(-1, 1)).toarray()
170 | 


--------------------------------------------------------------------------------
/ITMO_FS/filters/univariate/RFS.py:
--------------------------------------------------------------------------------
 1 | from logging import getLogger
 2 | 
 3 | import numpy as np
 4 | from sklearn.preprocessing import OneHotEncoder
 5 | 
 6 | from ...utils import l21_norm, matrix_norm, BaseTransformer
 7 | 
 8 | 
 9 | class RFS(BaseTransformer):
10 |     """Robust Feature Selection via Joint L2,1-Norms Minimization algorithm.
11 | 
12 |     Parameters
13 |     ----------
14 |     n_features : int
15 |         Number of features to select.
16 |     gamma : float
17 |         Regularization parameter.
18 |     max_iterations : int
19 |         Maximum amount of iterations to perform.
20 |     epsilon : positive float
21 |         Specifies the needed residual between the target functions from
22 |         consecutive iterations. If the residual is smaller than epsilon, the
23 |         algorithm is considered to have converged.
24 | 
25 |     Notes
26 |     -----
27 |     For more details see `this paper
28 |     <https://papers.nips.cc/paper/3988-efficient-and-robust-feature-selection-via-joint-l21-norms-minimization.pdf/>`_.
29 | 
30 |     Examples
31 |     --------
32 |     >>> from ITMO_FS.filters.univariate import RFS
33 |     >>> import numpy as np
34 |     >>> X = np.array([[1, 2, 3, 3, 1], [2, 2, 3, 3, 2], [1, 3, 3, 1, 3],
35 |     ... [1, 1, 3, 1, 4], [2, 4, 3, 1, 5]])
36 |     >>> y = np.array([1, 2, 1, 1, 2])
37 |     >>> model = RFS(2).fit(X, y)
38 |     >>> model.selected_features_
39 |     array([0, 3], dtype=int64)
40 |     """
41 |     def __init__(self, n_features, gamma=1, max_iterations=1000, epsilon=1e-5):
42 |         self.n_features = n_features
43 |         self.gamma = gamma
44 |         self.max_iterations = max_iterations
45 |         self.epsilon = epsilon
46 | 
47 |     def _fit(self, X, y):
48 |         """Fit the filter.
49 | 
50 |         Parameters
51 |         ----------
52 |         X : array-like, shape (n_samples, n_features)
53 |             The training input samples.
54 |         y : array-like, shape (n_samples,) or (n_samples, n_classes)
55 |             The target values or their one-hot encoding.
56 | 
57 |         Returns
58 |         -------
59 |         None
60 |         """
61 |         if len(y.shape) == 2:
62 |             Y = y
63 |         else:
64 |             Y = OneHotEncoder().fit_transform(y.reshape(-1, 1)).toarray()
65 | 
66 |         getLogger(__name__).info("Transformed Y: %s", Y)
67 |         n_samples = X.shape[0]
68 |         A = np.append(X, self.gamma * np.eye(n_samples), axis=1)
69 |         getLogger(__name__).info("A: %s", A)
70 |         D = np.eye(n_samples + self.n_features_)
71 | 
72 |         previous_target = -1
73 |         for _ in range(self.max_iterations):
74 |             D_inv = np.linalg.inv(D)
75 |             U = D_inv.dot(A.T).dot(np.linalg.inv(A.dot(D_inv).dot(A.T))).dot(Y)
76 |             getLogger(__name__).info("U: %s", U)
77 |             diag = 2 * matrix_norm(U)
78 |             diag[diag < 1e-10] = 1e-10  # prevents division by zero
79 |             D = np.diag(1 / diag)
80 |             getLogger(__name__).info("D: %s", D)
81 | 
82 |             target = l21_norm(U)
83 |             getLogger(__name__).info("New target value: %d", target)
84 |             if abs(target - previous_target) < self.epsilon:
85 |                 break
86 |             previous_target = target
87 | 
88 |         getLogger(__name__).info("Ended up with U: %s", U)
89 |         self.feature_scores_ = matrix_norm(U[:self.n_features_])
90 |         getLogger(__name__).info("Feature scores: %s", self.feature_scores_)
91 |         ranking = np.argsort(self.feature_scores_)[::-1]
92 |         self.selected_features_ = ranking[:self.n_features]
93 | 


--------------------------------------------------------------------------------
/ITMO_FS/filters/univariate/SPEC.py:
--------------------------------------------------------------------------------
  1 | from logging import getLogger
  2 | 
  3 | import numpy as np
  4 | from scipy.linalg import eigh
  5 | from sklearn.metrics.pairwise import pairwise_distances
  6 | 
  7 | from ...utils import l21_norm, matrix_norm, power_neg_half, BaseTransformer
  8 | 
  9 | 
 10 | class SPEC(BaseTransformer):
 11 |     """Spectral Feature Selection algorithm.
 12 | 
 13 |     Parameters
 14 |     ----------
 15 |     n_features : int
 16 |         Number of features to select.
 17 |     k : int
 18 |         Amount of clusters to find.
 19 |     gamma : callable
 20 |         An "increasing function that penalizes high frequency components".
 21 |         Default is gamma(x) = x^2.
 22 |     sigma : float
 23 |         Parameter for the weighting scheme.
 24 |     phi_type : int (1, 2 or 3)
 25 |         Type of feature ranking function to use.
 26 | 
 27 |     Notes
 28 |     -----
 29 |     For more details see `this paper <http://www.public.asu.edu/~huanliu/papers/icml07.pdf/>`_.
 30 | 
 31 |     Examples
 32 |     --------
 33 |     >>> from ITMO_FS.filters.univariate import SPEC
 34 |     >>> import numpy as np
 35 |     >>> X = np.array([[1, 2, 3, 3, 1], [2, 2, 3, 3, 2], [1, 3, 3, 1, 3],
 36 |     ... [1, 1, 3, 1, 4], [2, 4, 3, 1, 5]])
 37 |     >>> y = np.array([1, 2, 1, 1, 2])
 38 |     >>> model = SPEC(3).fit(X, y)
 39 |     >>> model.selected_features_
 40 |     array([0, 1, 4], dtype=int64)
 41 |     >>> model = SPEC(3).fit(X)
 42 |     >>> model.selected_features_
 43 |     array([3, 4, 1], dtype=int64)
 44 |     """
 45 |     def __init__(self, n_features, k=2, gamma=(lambda x: x ** 2), sigma=0.5,
 46 |                  phi_type=3):
 47 |         self.n_features = n_features
 48 |         self.k = k
 49 |         self.gamma = gamma
 50 |         self.sigma = sigma
 51 |         self.phi_type = phi_type
 52 | 
 53 |     def __scheme(self, x1, x2):
 54 |         return np.exp(-np.linalg.norm(x1 - x2) ** 2 / (2 * self.sigma ** 2))
 55 | 
 56 |     def __phi1(self, cosines, eigvals, k):
 57 |         return np.sum(cosines * cosines * self.gamma(eigvals))
 58 | 
 59 |     def __phi2(self, cosines, eigvals, k):
 60 |         return (np.sum(cosines[1:] * cosines[1:] * self.gamma(eigvals[1:]))
 61 |                 / np.sum(cosines[1:] * cosines[1:]))
 62 | 
 63 |     def __phi3(self, cosines, eigvals, k):
 64 |         return np.sum(cosines[1:k] * cosines[1:k]
 65 |                       * (self.gamma(2) - self.gamma(eigvals[1:k])))
 66 | 
 67 |     def _fit(self, X, y):
 68 |         """Fit the filter.
 69 | 
 70 |         Parameters
 71 |         ----------
 72 |         X : array-like, shape (n_samples, n_features)
 73 |             The training input samples.
 74 |         y : array-like, shape (n_samples,), optional
 75 |             The target values. If present, label values are used to
 76 |             construct the similarity graph and the amount of classes
 77 |             overrides k.
 78 | 
 79 |         Returns
 80 |         -------
 81 |         None
 82 |         """
 83 |         def calc_weight(f):
 84 |             f_norm = np.sqrt(D).dot(f)
 85 |             f_norm /= np.linalg.norm(f_norm)
 86 | 
 87 |             cosines = np.apply_along_axis(
 88 |                 lambda vec: np.dot(vec / np.linalg.norm(vec), f_norm), 0,
 89 |                 eigvectors)
 90 |             return phi(cosines, eigvals, k)
 91 | 
 92 |         if self.phi_type == 1:
 93 |             phi = self.__phi1
 94 |         elif self.phi_type == 2:
 95 |             phi = self.__phi2
 96 |         elif self.phi_type == 3:
 97 |             phi = self.__phi3
 98 |         else:
 99 |             getLogger(__name__).error(
100 |                 "phi_type should be 1, 2 or 3, %d passed", self.phi_type)
101 |             raise ValueError(
102 |                 "phi_type should be 1, 2 or 3, %d passed" % self.phi_type)
103 | 
104 |         n_samples = X.shape[0]
105 | 
106 |         if y is None:
107 |             if self.k > n_samples:
108 |                 getLogger(__name__).error(
109 |                     "Cannot find %d clusters with n_samples = %d",
110 |                     self.k, n_samples)
111 |                 raise ValueError(
112 |                     "Cannot find %d clusters with n_samples = %d"
113 |                     % (self.k, n_samples))
114 |             k = self.k
115 |             graph = np.ones((n_samples, n_samples))
116 |             W = graph * pairwise_distances(
117 |                 X, metric=lambda x, y: self.__scheme(x, y))
118 |         else:
119 |             values, counts = np.unique(y, return_counts=True)
120 |             values_dict = dict(zip(values, counts))
121 |             k = len(values)
122 |             W = pairwise_distances(
123 |                 y.reshape(-1, 1),
124 |                 metric=lambda x, y: (x[0] == y[0]) / values_dict[x[0]])
125 | 
126 |         getLogger(__name__).info("W: %s", W)
127 | 
128 |         D = np.diag(W.sum(axis=1))
129 |         getLogger(__name__).info("D: %s", D)
130 |         L = D - W
131 |         getLogger(__name__).info("L: %s", L)
132 |         L_norm = power_neg_half(D).dot(L).dot(power_neg_half(D))
133 |         getLogger(__name__).info("Normalized L: %s", L_norm)
134 |         eigvals, eigvectors = eigh(a=L_norm)
135 |         getLogger(__name__).info(
136 |             "Eigenvalues for normalized L: %s, eigenvectors: %s",
137 |             eigvals, eigvectors)
138 | 
139 |         self.feature_scores_ = np.apply_along_axis(
140 |             lambda f: calc_weight(f), 0, X)
141 |         getLogger(__name__).info("Feature scores: %s", self.feature_scores_)
142 |         ranking = np.argsort(self.feature_scores_)
143 |         if self.phi_type == 3:
144 |             ranking = ranking[::-1]
145 |         self.selected_features_ = ranking[:self.n_features]
146 | 


--------------------------------------------------------------------------------
/ITMO_FS/filters/univariate/UnivariateFilter.py:
--------------------------------------------------------------------------------
 1 | from logging import getLogger
 2 | 
 3 | import numpy as np
 4 | 
 5 | from .measures import CR_NAMES, MEASURE_NAMES
 6 | from ...utils import (BaseTransformer, generate_features, check_restrictions,
 7 |                       apply_cr)
 8 | 
 9 | 
10 | class UnivariateFilter(BaseTransformer):
11 |     """Basic interface for using univariate measures for feature selection.
12 |     List of available measures is in ITMO_FS.filters.univariate.measures, also
13 |     you can provide your own measure but it should suit the argument scheme for
14 |     measures, i.e. take two arguments x,y and return scores for all the
15 |     features in dataset x. Same applies to cutting rules.
16 | 
17 |     Parameters
18 |     ----------
19 |     measure : string or callable
20 |         A metric name defined in GLOB_MEASURE or a callable with signature
21 |         measure (sample dataset, labels of dataset samples) which should
22 |         return a list of metric values for each feature in the dataset.
23 |     cutting_rule : string or callables
24 |         A cutting rule name defined in GLOB_CR or a callable with signature
25 |         cutting_rule (features) which should return a list of features ranked by
26 |         some rule.
27 | 
28 |     See Also
29 |     --------
30 | 
31 |     Examples
32 |     --------
33 | 
34 |     >>> import numpy as np
35 |     >>> from ITMO_FS.filters.univariate import select_k_best
36 |     >>> from ITMO_FS.filters.univariate import UnivariateFilter
37 |     >>> from ITMO_FS.filters.univariate import f_ratio_measure
38 |     >>> x = np.array([[3, 3, 3, 2, 2], [3, 3, 1, 2, 3], [1, 3, 5, 1, 1],
39 |     ... [3, 1, 4, 3, 1], [3, 1, 2, 3, 1]])
40 |     >>> y = np.array([1, 3, 2, 1, 2])
41 |     >>> filter = UnivariateFilter(f_ratio_measure,
42 |     ... select_k_best(2)).fit(x, y)
43 |     >>> filter.selected_features_
44 |     array([4, 2], dtype=int64)
45 |     >>> filter.feature_scores_
46 |     array([0.6 , 0.2 , 1.  , 0.12, 5.4 ])
47 |     """
48 |     def __init__(self, measure, cutting_rule=("Best by percentage", 1.0)):
49 |         self.measure = measure
50 |         self.cutting_rule = cutting_rule
51 | 
52 |     def __apply_ms(self):
53 |         if isinstance(self.measure, str):
54 |             try:
55 |                 measure = MEASURE_NAMES[self.measure]
56 |             except KeyError:
57 |                 getLogger(__name__).error("No %s measure yet", self.measure)
58 |                 raise KeyError("No %s measure yet" % self.measure)
59 |         elif hasattr(self.measure, '__call__'):
60 |             measure = self.measure
61 |         else:
62 |             getLogger(__name__).error(
63 |                 "%s isn't a measure function or string", self.measure)
64 |             raise KeyError(
65 |                 "%s isn't a measure function or string" % self.measure)
66 |         return measure
67 | 
68 |     def _fit(self, X, y, store_scores=True):
69 |         """Fit the filter.
70 | 
71 |         Parameters
72 |         ----------
73 |         X : array-like, shape (n_samples, n_features)
74 |             The training input samples.
75 |         y : array-like, shape (n_samples,)
76 |             The target values.
77 |         store_scores : boolean, optional
78 |             In case you want to store the scores of features
79 |             for future calls to Univariate filter; default True
80 | 
81 |         Returns
82 |         -------
83 |         None
84 |         """
85 |         measure = self.__apply_ms()
86 |         cutting_rule = apply_cr(self.cutting_rule)
87 |         getLogger(__name__).info(
88 |             "Using UnivariateFilter with measure %s and cutting rule %s",
89 |             measure, cutting_rule)
90 | 
91 |         check_restrictions(measure.__name__, cutting_rule.__name__)
92 | 
93 |         feature_scores = measure(X, y)
94 |         getLogger(__name__).info("Feature scores: %s", feature_scores)
95 | 
96 |         if store_scores:
97 |             self.feature_scores_ = feature_scores
98 |         self.selected_features_ = cutting_rule(feature_scores)
99 | 


--------------------------------------------------------------------------------
/ITMO_FS/filters/univariate/__init__.py:
--------------------------------------------------------------------------------
 1 | from .UnivariateFilter import UnivariateFilter
 2 | from .VDM import VDM
 3 | from .measures import anova, fit_criterion_measure, f_ratio_measure, \
 4 |     gini_index, su_measure, modified_t_score, fechner_corr, \
 5 |     information_gain, relief_measure, reliefF_measure, chi2_measure, \
 6 |     spearman_corr, pearson_corr, laplacian_score, qpfs_filter, \
 7 |     kendall_corr, select_k_best, select_k_worst, select_worst_by_value, \
 8 |     select_best_by_value, select_best_percentage, \
 9 |     select_worst_percentage
10 | from .NDFS import NDFS
11 | from .RFS import RFS
12 | from .SPEC import SPEC
13 | 


--------------------------------------------------------------------------------
/ITMO_FS/filters/unsupervised/MCFS.py:
--------------------------------------------------------------------------------
  1 | from logging import getLogger
  2 | 
  3 | import numpy as np
  4 | from scipy.linalg import eigh
  5 | from sklearn.linear_model import Lars
  6 | from sklearn.neighbors import NearestNeighbors
  7 | from sklearn.metrics.pairwise import pairwise_distances
  8 | 
  9 | from ...utils import BaseTransformer
 10 | 
 11 | 
 12 | class MCFS(BaseTransformer):
 13 |     """Unsupervised Feature Selection for Multi-Cluster Data algorithm.
 14 | 
 15 |     Parameters
 16 |     ----------
 17 |     n_features : int
 18 |         Number of features to select.
 19 |     k : int
 20 |         Amount of clusters to find.
 21 |     p : int
 22 |         Amount of nearest neighbors to use while building the graph.
 23 |     scheme : str, either '0-1', 'heat' or 'dot'
 24 |         Weighting scheme to use while building the graph.
 25 |     sigma : float
 26 |         Parameter for heat weighting scheme. Ignored if scheme is not 'heat'.
 27 |     full_graph : boolean
 28 |         If True, connect all vertices in the graph to each other instead of
 29 |         running the k-nearest neighbors algorithm. Use with 'heat' or 'dot'
 30 |         schemes.
 31 | 
 32 |     Notes
 33 |     -----
 34 |     For more details see `this paper
 35 |     <http://www.cad.zju.edu.cn/home/dengcai/Publication/Conference/2010_KDD-MCFS.pdf/>`_.
 36 | 
 37 |     Examples
 38 |     --------
 39 |     >>> from ITMO_FS.filters.unsupervised import MCFS
 40 |     >>> from sklearn.datasets import make_classification
 41 |     >>> import numpy as np
 42 |     >>> dataset = make_classification(n_samples=500, n_features=100,
 43 |     ... n_informative=5, n_redundant=0, random_state=42, shuffle=False)
 44 |     >>> X, y = np.array(dataset[0]), np.array(dataset[1])
 45 |     >>> model = MCFS(5).fit(X)
 46 |     >>> model.selected_features_
 47 |     array([0, 2, 4, 1, 3], dtype=int64)
 48 |     """
 49 |     def __init__(self, n_features, k=2, p=3, scheme='dot', sigma=1,
 50 |                  full_graph=False):
 51 |         self.n_features = n_features
 52 |         self.k = k
 53 |         self.p = p
 54 |         self.scheme = scheme
 55 |         self.sigma = sigma
 56 |         self.full_graph = full_graph
 57 | 
 58 |     def __scheme_01(self, x1, x2):
 59 |         return 1
 60 | 
 61 |     def __scheme_heat(self, x1, x2):
 62 |         return np.exp(-np.linalg.norm(x1 - x2) ** 2 / self.sigma)
 63 | 
 64 |     def __scheme_dot(self, x1, x2):
 65 |         return (x1 / np.linalg.norm(x1 + 1e-10)).dot(
 66 |             x2 / np.linalg.norm(x2 + 1e-10))
 67 | 
 68 |     def _fit(self, X, y):
 69 |         """
 70 |             Fits the filter.
 71 | 
 72 |             Parameters
 73 |             ----------
 74 |             X : array-like, shape (n_samples, n_features)
 75 |                 The training input samples.
 76 |             y : array-like
 77 |                 The target values (ignored).
 78 | 
 79 |             Returns
 80 |             ----------
 81 |             None
 82 |         """
 83 |         if self.scheme == '0-1':
 84 |             scheme = self.__scheme_01
 85 |         elif self.scheme == 'heat':
 86 |             scheme = self.__scheme_heat
 87 |         elif self.scheme == 'dot':
 88 |             scheme = self.__scheme_dot
 89 |         else:
 90 |             getLogger(__name__).error(
 91 |                 "scheme should be either '0-1', 'heat' or 'dot'; %s passed",
 92 |                 self.scheme)
 93 |             raise KeyError(
 94 |                 "scheme should be either '0-1', 'heat' or 'dot'; %s passed"
 95 |                 % self.scheme)
 96 | 
 97 |         n_samples = X.shape[0]
 98 | 
 99 | 
100 |         if self.k > n_samples:
101 |             getLogger(__name__).error(
102 |                 "Cannot find %d clusters with n_samples = %d",
103 |                 self.k, n_samples)
104 |             raise ValueError(
105 |                 "Cannot find %d clusters with n_samples = %d"
106 |                 % (self.k, n_samples))
107 | 
108 |         if self.p >= n_samples:
109 |             getLogger(__name__).error(
110 |                 "Cannot select %d nearest neighbors with n_samples = %d",
111 |                 self.p, n_samples)
112 |             raise ValueError(
113 |                 "Cannot select %d nearest neighbors with n_samples = %d"
114 |                 % (self.p, n_samples))
115 | 
116 |         if self.full_graph:
117 |             graph = np.ones((n_samples, n_samples))
118 |         else:
119 |             graph = NearestNeighbors(n_neighbors=self.p,
120 |                 algorithm='ball_tree').fit(X).kneighbors_graph().toarray()
121 |             graph = np.minimum(1, graph + graph.T)
122 | 
123 |         getLogger(__name__).info("Nearest neighbors graph: %s", graph)
124 | 
125 |         W = graph * pairwise_distances(X, metric=lambda x, y: scheme(x, y))
126 |         getLogger(__name__).info("W: %s", W)
127 |         D = np.diag(W.sum(axis=0))
128 |         getLogger(__name__).info("D: %s", D)
129 |         L = D - W
130 |         getLogger(__name__).info("L: %s", L)
131 |         eigvals, Y = eigh(type=1, a=L, b=D, subset_by_index=[1, self.k])
132 |         getLogger(__name__).info("Eigenvalues: %s, classes: %s", eigvals, Y)
133 | 
134 |         weights = np.zeros((self.n_features_, self.k))
135 |         for i in range(self.k):
136 |             clf = Lars(n_nonzero_coefs=self.n_features)
137 |             clf.fit(X, Y[:, i])
138 |             weights[:, i] = np.abs(clf.coef_)
139 |             getLogger(__name__).info(
140 |                 "Weights for eigenvalue %d: %s", i, weights[:, i])
141 | 
142 |         self.feature_scores_ = weights.max(axis=1)
143 |         getLogger(__name__).info("Feature scores: %s", self.feature_scores_)
144 |         ranking = np.argsort(self.feature_scores_)[::-1]
145 |         self.selected_features_ = ranking[:self.n_features]
146 | 


--------------------------------------------------------------------------------
/ITMO_FS/filters/unsupervised/UDFS.py:
--------------------------------------------------------------------------------
  1 | from logging import getLogger
  2 | 
  3 | import numpy as np
  4 | from scipy.linalg import eigh
  5 | from sklearn.neighbors import NearestNeighbors
  6 | 
  7 | from ...utils import l21_norm, matrix_norm, BaseTransformer
  8 | 
  9 | 
 10 | class UDFS(BaseTransformer):
 11 |     """Unsupervised Discriminative Feature Selection algorithm.
 12 | 
 13 |     Parameters
 14 |     ----------
 15 |     n_features : int
 16 |         Number of features to select.
 17 |     c : int
 18 |         Amount of clusters to find.
 19 |     k : int
 20 |         Amount of nearest neighbors to use while building the graph.
 21 |     gamma : float
 22 |         Regularization term in the target function.
 23 |     l : float
 24 |         Parameter that controls the invertibility of the matrix used in
 25 |         computing of B.
 26 |     max_iterations : int
 27 |         Maximum amount of iterations to perform.
 28 |     epsilon : positive float
 29 |         Specifies the needed residual between the target functions from
 30 |         consecutive iterations. If the residual is smaller than epsilon, the
 31 |         algorithm is considered to have converged.
 32 | 
 33 |     Notes
 34 |     -----
 35 |     For more details see `this paper <https://www.ijcai.org/Proceedings/11/Papers/267.pdf/>`_.
 36 | 
 37 |     Examples
 38 |     --------
 39 |     >>> from ITMO_FS.filters.unsupervised import UDFS
 40 |     >>> from sklearn.datasets import make_classification
 41 |     >>> import numpy as np
 42 |     >>> dataset = make_classification(n_samples=500, n_features=100,
 43 |     ... n_informative=5, n_redundant=0, random_state=42, shuffle=False,
 44 |     ... n_clusters_per_class=1)
 45 |     >>> X, y = np.array(dataset[0]), np.array(dataset[1])
 46 |     >>> model = UDFS(5).fit(X)
 47 |     >>> model.selected_features_
 48 |     array([ 2,  3, 19, 90, 92], dtype=int64)
 49 |     """
 50 |     def __init__(self, n_features, c=2, k=3, gamma=1, l=1e-6,
 51 |                  max_iterations=1000, epsilon=1e-5):
 52 |         self.n_features = n_features
 53 |         self.c = c
 54 |         self.k = k
 55 |         self.gamma = gamma
 56 |         self.l = l
 57 |         self.max_iterations = max_iterations
 58 |         self.epsilon = epsilon
 59 | 
 60 |     def _fit(self, X, y):
 61 |         """Fit the filter.
 62 | 
 63 |         Parameters
 64 |         ----------
 65 |         X : array-like, shape (n_samples, n_features)
 66 |             The training input samples.
 67 |         y : array-like
 68 |             The target values (ignored).
 69 | 
 70 |         Returns
 71 |         -------
 72 |         None
 73 |         """
 74 |         def construct_S(arr):
 75 |             S = np.zeros((n_samples, self.k + 1))
 76 |             for idx in range(self.k + 1):
 77 |                 S[arr[idx], idx] = 1
 78 |             return S
 79 | 
 80 |         n_samples = X.shape[0]
 81 | 
 82 |         if self.c > n_samples:
 83 |             getLogger(__name__).error(
 84 |                 "Cannot find %d clusters with n_samples = %d",
 85 |                 self.c, n_samples)
 86 |             raise ValueError(
 87 |                 "Cannot find %d clusters with n_samples = %d"
 88 |                 % (self.c, n_samples))
 89 | 
 90 |         if self.k >= n_samples:
 91 |             getLogger(__name__).error(
 92 |                 "Cannot select %d nearest neighbors with n_samples = %d",
 93 |                 self.k, n_samples)
 94 |             raise ValueError(
 95 |                 "Cannot select %d nearest neighbors with n_samples = %d"
 96 |                 % (self.k, n_samples))
 97 | 
 98 |         indices = list(range(n_samples))
 99 |         I = np.eye(self.k + 1)
100 |         H = I - np.ones((self.k + 1, self.k + 1)) / (self.k + 1)
101 | 
102 |         neighbors = NearestNeighbors(
103 |             n_neighbors=self.k + 1,
104 |             algorithm='ball_tree').fit(X).kneighbors(X, return_distance=False)
105 |         getLogger(__name__).info("Neighbors graph: %s", neighbors)
106 |         X_centered = np.apply_along_axis(
107 |             lambda arr: X[arr].T.dot(H), 1, neighbors)
108 | 
109 |         S = np.apply_along_axis(lambda arr: construct_S(arr), 1, neighbors)
110 |         getLogger(__name__).info("S: %s", S)
111 |         B = np.vectorize(
112 |             lambda idx: np.linalg.inv(X_centered[idx].T.dot(X_centered[idx])
113 |                         + self.l * I),
114 |             signature='()->(1,1)')(indices)
115 |         getLogger(__name__).info("B: %s", B)
116 |         Mi = np.vectorize(
117 |             lambda idx: S[idx].dot(H).dot(B[idx]).dot(H).dot(S[idx].T),
118 |             signature='()->(1,1)')(indices)
119 |         M = X.T.dot(Mi.sum(axis=0)).dot(X)
120 |         getLogger(__name__).info("M: %s", M)
121 | 
122 |         D = np.eye(self.n_features_)
123 |         previous_target = -1
124 |         for step in range(self.max_iterations):
125 |             P = M + self.gamma * D
126 |             getLogger(__name__).info("P: %s", P)
127 |             _, W = eigh(a=P, subset_by_index=[0, self.c - 1])
128 |             getLogger(__name__).info("W: %s", W)
129 |             diag = 2 * matrix_norm(W)
130 |             diag[diag < 1e-10] = 1e-10  # prevents division by zero
131 |             D = np.diag(1 / diag)
132 |             getLogger(__name__).info("D: %s", D)
133 | 
134 |             target = np.trace(W.T.dot(M).dot(W)) + self.gamma * l21_norm(W)
135 |             getLogger(__name__).info("New target value: %d", target)
136 |             if abs(target - previous_target) < self.epsilon:
137 |                 break
138 |             previous_target = target
139 | 
140 |         getLogger(__name__).info("Ended up with W = %s", W)
141 |         self.feature_scores_ = matrix_norm(W)
142 |         getLogger(__name__).info("Feature scores: %s", self.feature_scores_)
143 |         ranking = np.argsort(self.feature_scores_)[::-1]
144 |         self.selected_features_ = ranking[:self.n_features]
145 | 


--------------------------------------------------------------------------------
/ITMO_FS/filters/unsupervised/__init__.py:
--------------------------------------------------------------------------------
1 | from .MCFS import MCFS
2 | from .UDFS import UDFS
3 | from .trace_ratio_laplacian import TraceRatioLaplacian
4 | 


--------------------------------------------------------------------------------
/ITMO_FS/filters/unsupervised/trace_ratio_laplacian.py:
--------------------------------------------------------------------------------
  1 | from logging import getLogger
  2 | 
  3 | import numpy as np
  4 | from sklearn.metrics.pairwise import pairwise_distances
  5 | from sklearn.neighbors import NearestNeighbors
  6 | 
  7 | from ...utils import BaseTransformer
  8 | 
  9 | class TraceRatioLaplacian(BaseTransformer):
 10 |     """TraceRatio(similarity based) feature selection filter performed in
 11 |     unsupervised way, i.e laplacian version
 12 | 
 13 |     Parameters
 14 |     ----------
 15 |     n_features : int
 16 |         Amount of features to select.
 17 |     k : int
 18 |         Amount of nearest neighbors to use while building the graph.
 19 |     t : int
 20 |         constant for kernel function calculation
 21 |     epsilon : float
 22 |         Lambda change threshold.
 23 | 
 24 |     Notes
 25 |     -----
 26 |     For more details see `this paper <https://aaai.org/Papers/AAAI/2008/AAAI08-107.pdf/>`_.
 27 | 
 28 |     Examples
 29 |     --------
 30 |     >>> from ITMO_FS.filters.unsupervised import TraceRatioLaplacian
 31 |     >>> import numpy as np
 32 |     >>> X = np.array([[1, 2, 3, 3, 1], [2, 2, 3, 3, 2], [1, 3, 3, 1, 3],
 33 |     ... [1, 1, 3, 1, 4], [2, 4, 3, 1, 5]])
 34 |     >>> y = np.array([1, 2, 1, 1, 2])
 35 |     >>> tracer = TraceRatioLaplacian(2, k=2).fit(X)
 36 |     >>> tracer.selected_features_
 37 |     array([3, 1], dtype=int64)
 38 |     """
 39 |     def __init__(self, n_features, k=5, t=1, epsilon=1e-3):
 40 |         self.n_features = n_features
 41 |         self.k = k
 42 |         self.t = t
 43 |         self.epsilon = epsilon
 44 | 
 45 |     def _fit(self, X, y):
 46 |         """Fit the filter.
 47 | 
 48 |         Parameters
 49 |         ----------
 50 |         X : array-likey, shape (n_samples, n_features)
 51 |             The training input samples
 52 |         y : array-like, shape (n_samples,)
 53 |             The target values
 54 | 
 55 |         Returns
 56 |         -------
 57 |         None
 58 |         """
 59 |         n_samples = X.shape[0]
 60 | 
 61 |         if self.k >= n_samples:
 62 |             getLogger(__name__).error(
 63 |                 "Cannot select %d nearest neighbors with n_samples = %d",
 64 |                 self.k, n_samples)
 65 |             raise ValueError(
 66 |                 "Cannot select %d nearest neighbors with n_samples = %d"
 67 |                 % (self.k, n_samples))
 68 | 
 69 |         graph = NearestNeighbors(
 70 |             n_neighbors=self.n_features,
 71 |             algorithm='ball_tree').fit(X).kneighbors_graph().toarray()
 72 |         graph = np.minimum(1, graph + graph.T)
 73 |         getLogger(__name__).info("Nearest neighbors graph: %s", graph)
 74 | 
 75 |         A_within = graph * pairwise_distances(
 76 |             X, metric=lambda x, y: np.exp(-np.linalg.norm(x - y) ** 2 / self.t))
 77 |         getLogger(__name__).info("A_within: %s", A_within)
 78 |         D_within = np.diag(A_within.sum(axis=1))
 79 |         getLogger(__name__).info("D_within: %s", D_within)
 80 |         L_within = D_within - A_within
 81 |         getLogger(__name__).info("L_within: %s", L_within)
 82 |         A_between = (D_within.dot(np.ones((n_samples, n_samples))).dot(D_within)
 83 |                      / np.sum(D_within))
 84 |         getLogger(__name__).info("A_between: %s", A_between)
 85 |         D_between = np.diag(A_between.sum(axis=1))
 86 |         getLogger(__name__).info("D_between: %s", D_between)
 87 |         L_between = D_between - A_between
 88 |         getLogger(__name__).info("L_between: %s", L_between)
 89 | 
 90 |         E = X.T.dot(L_within).dot(X)
 91 |         B = X.T.dot(L_between).dot(X)
 92 | 
 93 |         # we need only diagonal elements for trace calculation
 94 |         e = np.array(np.diag(E))
 95 |         b = np.array(np.diag(B))
 96 |         getLogger(__name__).info("E: %s", e)
 97 |         getLogger(__name__).info("B: %s", b)
 98 |         lam = 0
 99 |         prev_lam = -1
100 |         while lam - prev_lam >= self.epsilon:  # TODO: optimize
101 |             score = b - lam * e
102 |             getLogger(__name__).info("Score: %s", score)
103 |             self.selected_features_ = np.argsort(score)[::-1][:self.n_features]
104 |             getLogger(__name__).info(
105 |                 "New selected set: %s", self.selected_features_)
106 |             prev_lam = lam
107 |             lam = (np.sum(b[self.selected_features_])
108 |                    / np.sum(e[self.selected_features_]))
109 |             getLogger(__name__).info("New lambda: %d", lam)
110 |         self.score_ = score
111 |         self.lam_ = lam
112 | 


--------------------------------------------------------------------------------
/ITMO_FS/hybrid/Melif.py:
--------------------------------------------------------------------------------
  1 | from logging import getLogger
  2 | 
  3 | import numpy as np
  4 | from sklearn.base import clone
  5 | from sklearn.model_selection import cross_val_score
  6 | 
  7 | from ITMO_FS.ensembles import WeightBased
  8 | from ITMO_FS.utils import BaseWrapper, apply_cr
  9 | from ITMO_FS.utils.data_check import *
 10 | 
 11 | 
 12 | class Melif(BaseWrapper):
 13 |     """MeLiF algorithm.
 14 | 
 15 |     Parameters
 16 |     ----------
 17 |     estimator : object
 18 |         A supervised learning estimator that should have a fit(X, y) method and
 19 |         a predict(X) method.
 20 |     measure : string or callable
 21 |         A standard estimator metric (e.g. 'f1' or 'roc_auc') or a callable with
 22 |         signature measure(estimator, X, y) which should return only a single
 23 |         value.
 24 |     cutting_rule : string or callable
 25 |         A cutting rule name defined in GLOB_CR or a callable with signature
 26 |         cutting_rule (features), which should return a list features ranked by
 27 |         some rule.
 28 |     filter_ensemble : object
 29 |         A filter ensemble (e.g. WeightBased) or a list of filters that will be
 30 |         used to create a WeightBased ensemble.
 31 |     delta : float
 32 |         The step in coordinate descent.
 33 |     points : array-like
 34 |         An array of starting points in the search.
 35 |     seed : int
 36 |         Random seed used to initialize np.random.default_rng().
 37 |     cv : int
 38 |         Number of folds in cross-validation.
 39 | 
 40 |     See Also
 41 |     --------
 42 |     For more details see `this paper <https://www.researchgate.net/publication/317201206_MeLiF_Filter_Ensemble_Learning_Algorithm_for_Gene_Selection>`_.
 43 | 
 44 |     Examples
 45 |     --------
 46 |     >>> from ITMO_FS.hybrid import Melif
 47 |     >>> from ITMO_FS.filters.univariate import UnivariateFilter
 48 |     >>> from sklearn.datasets import make_classification
 49 |     >>> from sklearn.preprocessing import KBinsDiscretizer
 50 |     >>> from sklearn.linear_model import LogisticRegression
 51 |     >>> dataset = make_classification(n_samples=100, n_features=20,
 52 |     ... n_informative=5, n_redundant=0, shuffle=False, random_state=42)
 53 |     >>> x, y = np.array(dataset[0]), np.array(dataset[1])
 54 |     >>> x = KBinsDiscretizer(n_bins=10, encode='ordinal',
 55 |     ... strategy='uniform').fit_transform(x)
 56 |     >>> filters = [UnivariateFilter('GiniIndex'),
 57 |     ... UnivariateFilter('FechnerCorr'),
 58 |     ... UnivariateFilter('SpearmanCorr'),
 59 |     ... UnivariateFilter('PearsonCorr')]
 60 |     >>> algo = Melif(LogisticRegression(), 'f1_macro', ("K best", 5),
 61 |     ... filters, delta=0.5).fit(x, y)
 62 |     >>> algo.selected_features_
 63 |     array([ 3,  4,  1, 13, 16], dtype=int64)
 64 |     """
 65 |     def __init__(self, estimator, measure, cutting_rule, filter_ensemble,
 66 |                  delta=0.5, points=None, seed=42, cv=3):
 67 |         self.estimator = estimator
 68 |         self.measure = measure
 69 |         self.cutting_rule = cutting_rule
 70 |         self.filter_ensemble = filter_ensemble
 71 |         self.delta = delta
 72 |         self.points = points
 73 |         self.seed = seed
 74 |         self.cv = cv
 75 | 
 76 |     def _fit(self, X, y):
 77 |         """Run the MeLiF algorithm on the specified dataset.
 78 | 
 79 |         Parameters
 80 |         ----------
 81 |         X : array-like, shape (n_samples, n_features)
 82 |             The input samples.
 83 |         y : array-like, shape (n_samples,)
 84 |             The classes for the samples.
 85 | 
 86 |         Returns
 87 |         -------
 88 |         None
 89 |         """
 90 |         self._rng = np.random.default_rng(self.seed)
 91 |         if type(self.filter_ensemble) is list:
 92 |             self.__ensemble = WeightBased(self.filter_ensemble)
 93 |         else:
 94 |             self.__ensemble = clone(self.filter_ensemble)
 95 | 
 96 |         self.n_filters = len(self.__ensemble)
 97 |         self.__filter_weights = np.ones(self.n_filters) / self.n_filters
 98 | 
 99 |         check_cutting_rule(self.cutting_rule)
100 |         cutting_rule = apply_cr(self.cutting_rule)
101 |         getLogger(__name__).info(
102 |             "Using MeLiF with ensemble: %s and cutting rule: %s",
103 |             self.__ensemble, cutting_rule)
104 |         scores = self.__ensemble.get_scores(X, y)
105 | 
106 |         if self.points is None:
107 |             points = np.vstack((self.__filter_weights, np.eye(self.n_filters)))
108 |         else:
109 |             points = self.points
110 |         best_point_ = points[0]
111 | 
112 |         self.best_score_ = 0
113 |         for point in points:
114 |             getLogger(__name__).info(
115 |                 "Running coordinate descent from point %s", point)
116 |             new_point, new_score = self.__search(
117 |                 X, y, point, scores, cutting_rule)
118 |             getLogger(__name__).info(
119 |                 "Ended up in point %s with score %d", new_point, new_score)
120 |             if new_score > self.best_score_:
121 |                 self.best_score_ = new_score
122 |                 self.best_point_ = new_point
123 |         getLogger(__name__).info(
124 |             "Final best point: %s with score %d",
125 |             best_point_, self.best_score_)
126 |         self.selected_features_ = cutting_rule(
127 |             np.dot(scores.T, self.best_point_))
128 |         self._estimator.fit(X[:, self.selected_features_], y)
129 | 
130 |     def __search(self, X, y, point, scores, cutting_rule):
131 |         """Perform a coordinate descent from the given point.
132 | 
133 |         Parameters
134 |         ----------
135 |         X : array-like, shape (n_samples, n_features)
136 |             The input samples.
137 |         y : array-like, shape (n_samples,)
138 |             The classes for the samples.
139 |         point : array-like, shape (n_filters,)
140 |             The starting point.
141 |         scores : array-like, shape (n_filters, n_features)
142 |             The scores for the features from all filters.
143 |         cutting_rule : callable
144 |             The cutting rule to use.
145 | 
146 |         Returns
147 |         -------
148 |         tuple (array-like, float) : the optimal point and its score
149 |         """
150 |         best_point = point
151 |         selected_features = cutting_rule(np.dot(scores.T, point))
152 |         best_score = cross_val_score(
153 |             self._estimator, X[:, selected_features], y, cv=self.cv,
154 |             scoring=self.measure).mean()
155 |         delta = np.eye(self.n_filters) * self.delta
156 |         changed = True
157 |         while changed:
158 |             #the original paper descends starting from the first filter;
159 |             #we randomize the order instead to avoid local maximas
160 |             getLogger(__name__).info(
161 |                 "Current optimal point: %s with score = %d",
162 |                 best_point, best_score)
163 |             order = self._rng.permutation(self.n_filters)
164 |             changed = False
165 |             for f in order:
166 |                 iteration_point_plus = best_point + delta[f]
167 |                 selected_features = cutting_rule(
168 |                     np.dot(scores.T, iteration_point_plus))
169 |                 score = cross_val_score(
170 |                     self._estimator, X[:, selected_features], y, cv=self.cv,
171 |                     scoring=self.measure).mean()
172 |                 getLogger(__name__).info(
173 |                     "Trying to move to point %s: score = %d",
174 |                     iteration_point_plus, score)
175 |                 if score > best_score:
176 |                     best_score = score
177 |                     best_point = iteration_point_plus
178 |                     changed = True
179 |                     break
180 | 
181 |                 iteration_point_minus = best_point - delta[f]
182 |                 selected_features = cutting_rule(
183 |                     np.dot(scores.T, iteration_point_minus))
184 |                 score = cross_val_score(
185 |                     self._estimator, X[:, selected_features], y, cv=self.cv,
186 |                     scoring=self.measure).mean()
187 |                 getLogger(__name__).info(
188 |                     "Trying to move to point %s: score = %d",
189 |                     iteration_point_minus, score)
190 |                 if score > best_score:
191 |                     best_score = score
192 |                     best_point = iteration_point_minus
193 |                     changed = True
194 |                     break
195 |         return best_point, best_score
196 | 


--------------------------------------------------------------------------------
/ITMO_FS/hybrid/__init__.py:
--------------------------------------------------------------------------------
1 | from .filter_wrapper_hybrid import *
2 | from .Melif import Melif
3 | from .IWSSr_SFLA import IWSSr_SFLA
4 | 


--------------------------------------------------------------------------------
/ITMO_FS/hybrid/filter_wrapper_hybrid.py:
--------------------------------------------------------------------------------
 1 | from logging import getLogger
 2 | 
 3 | from sklearn.base import clone
 4 | 
 5 | from ..utils import BaseTransformer
 6 | 
 7 | class FilterWrapperHybrid(BaseTransformer):
 8 |     """Perform the filter + wrapper hybrid algorithm by first running the
 9 |     filter algorithm on the full dataset, leaving the selected features and
10 |     running the wrapper algorithm on the cut dataset.
11 | 
12 |     Parameters
13 |     ----------
14 |     filter_ : object
15 |         A feature selection model that should have a fit(X, y) method and a
16 |         selected_features_ attribute available after fitting.
17 |     wrapper : object
18 |         A feature selection model that should have a fit(X, y) method,
19 |         selected_features_ and best_score_ attributes available after fitting
20 |         and a predict(X) method.
21 | 
22 |     Notes
23 |     -----
24 |     This class doesn't require the first algorithm to be a filter (the only
25 |     requirements are a fit(X, y) method and a selected_features_ attribute)
26 |     but it is recommended to use a fast algorithm first to remove a lot of
27 |     unnecessary features before processing the resulting dataset with a more
28 |     time-consuming algorithm (e.g. a wrapper).
29 | 
30 |     Examples
31 |     --------
32 |     >>> import numpy as np
33 |     >>> from sklearn.linear_model import LogisticRegression
34 |     >>> from ITMO_FS.wrappers.deterministic import BackwardSelection
35 |     >>> from ITMO_FS.filters.univariate import UnivariateFilter
36 |     >>> from ITMO_FS.hybrid import FilterWrapperHybrid
37 |     >>> from sklearn.datasets import make_classification
38 |     >>> dataset = make_classification(n_samples=100, n_features=20,
39 |     ... n_informative=5, n_redundant=0, shuffle=False, random_state=42)
40 |     >>> x, y = np.array(dataset[0]), np.array(dataset[1])
41 |     >>> filter_ = UnivariateFilter('FRatio', ("K best", 10))
42 |     >>> wrapper = BackwardSelection(LogisticRegression(), 5, measure='f1_macro')
43 |     >>> model = FilterWrapperHybrid(filter_, wrapper).fit(x, y)
44 |     >>> model.selected_features_
45 |     array([ 1,  3,  4, 10,  7], dtype=int64)
46 |     """
47 |     def __init__(self, filter_, wrapper):
48 |         self.filter_ = filter_
49 |         self.wrapper = wrapper
50 | 
51 |     def _fit(self, X, y):
52 |         """Fit the model.
53 | 
54 |         Parameters
55 |         ----------
56 |         X : array-like, shape (n_samples, n_features)
57 |             The input samples.
58 |         y : array-like, shape (n_samples,)
59 |             The classes for the samples.
60 | 
61 |         Returns
62 |         -------
63 |         None
64 |         """
65 |         self._filter = clone(self.filter_)
66 |         self._wrapper = clone(self.wrapper)
67 |         getLogger(__name__).info(
68 |             "Running FilterWrapper with filter = %s, wrapper = %s",
69 |             self._filter, self._wrapper)
70 | 
71 |         selected_filter = self._filter.fit(X, y).selected_features_
72 |         getLogger(__name__).info(
73 |             "Features selected by filter: %s", selected_filter)
74 |         self.selected_features_ = selected_filter[self._wrapper.fit(
75 |             X[:, selected_filter], y).selected_features_]
76 |         self.best_score_ = self._wrapper.best_score_
77 | 
78 |     def predict(self, X):
79 |         """Predict class labels for the input data.
80 | 
81 |         Parameters
82 |         ----------
83 |         X : array-like, shape (n_samples, n_features)
84 |             The input samples.
85 | 
86 |         Returns
87 |         ------
88 |         array-like, shape (n_samples,) : class labels
89 |         """
90 |         return self._wrapper.predict(X)
91 | 


--------------------------------------------------------------------------------
/ITMO_FS/utils/__init__.py:
--------------------------------------------------------------------------------
1 | from .data_check import *
2 | from .functions import *
3 | from .information_theory import *
4 | from .qpfs_body import qpfs_body
5 | from .base_transformer import BaseTransformer
6 | from .base_wrapper import BaseWrapper
7 | 


--------------------------------------------------------------------------------
/ITMO_FS/utils/base_transformer.py:
--------------------------------------------------------------------------------
  1 | from abc import abstractmethod
  2 | from logging import getLogger
  3 | 
  4 | import numpy as np
  5 | import pandas as pd
  6 | from sklearn.base import BaseEstimator, TransformerMixin
  7 | from sklearn.feature_selection import VarianceThreshold
  8 | from sklearn.utils import check_X_y, check_array
  9 | from sklearn.utils.validation import check_is_fitted
 10 | 
 11 | 
 12 | class BaseTransformer(TransformerMixin, BaseEstimator):
 13 |     def __init__(self):
 14 |         pass
 15 | 
 16 |     def fit(self, X, y=None, **fit_params):
 17 |         """Fit the algorithm.
 18 | 
 19 |         Parameters
 20 |         ----------
 21 |         X : array-like, shape (n_samples, n_features)
 22 |             The training input samples.
 23 |         y : array-like, shape (n_samples,), optional
 24 |             The class labels.
 25 |         fit_params : dict, optional
 26 |             Additional parameters to pass to underlying _fit function.
 27 | 
 28 |         Returns
 29 |         -------
 30 |         Self, i.e. the transformer object.
 31 |         """
 32 |         if y is not None:
 33 |             X, y = check_X_y(X, y, dtype='numeric')
 34 |             if y.dtype.kind == 'O':
 35 |                 y = y.astype('int')
 36 |         else:
 37 |             X = check_array(X, dtype='float64', accept_large_sparse=False)
 38 | 
 39 |         self.n_total_features_ = X.shape[1]
 40 |         nonconst_features = VarianceThreshold().fit(X).get_support(indices=True)
 41 |         self.n_features_ = nonconst_features.shape[0]
 42 | 
 43 |         if self.n_features_ != self.n_total_features_:
 44 |             getLogger(__name__).warning(
 45 |                 "Found %d constant features; they would not be used in fit")
 46 | 
 47 |         if hasattr(self, 'n_features'):
 48 |             if self.n_features > self.n_features_:
 49 |                 getLogger(__name__).error(
 50 |                     "Cannot select %d features with n_features = %d",
 51 |                     self.n_features, self.n_features_)
 52 |                 raise ValueError(
 53 |                     "Cannot select %d features with n_features = %d"
 54 |                     % (self.n_features, self.n_features_))
 55 | 
 56 |         if hasattr(self, 'epsilon'):
 57 |             if self.epsilon <= 0:
 58 |                 getLogger(__name__).error(
 59 |                     "Epsilon should be positive, %d passed", self.epsilon)
 60 |                 raise ValueError(
 61 |                     "Epsilon should be positive, %d passed" % self.epsilon)
 62 | 
 63 | 
 64 |         self._fit(X[:, nonconst_features], y, **fit_params)
 65 | 
 66 |         if hasattr(self, 'feature_scores_'):
 67 |             scores = np.empty(self.n_total_features_)
 68 |             scores.fill(np.nan)
 69 |             scores[nonconst_features] = self.feature_scores_
 70 |             self.feature_scores_ = scores
 71 |         self.selected_features_ = nonconst_features[self.selected_features_]
 72 | 
 73 |         return self
 74 | 
 75 |     def transform(self, X):
 76 |         """
 77 |             Transform given data by slicing it with selected features.
 78 | 
 79 |             Parameters
 80 |             ----------
 81 |             X : array-like, shape (n_samples, n_features)
 82 |                 The training input samples.
 83 | 
 84 |             Returns
 85 |             ------
 86 |             Transformed 2D numpy array
 87 |         """
 88 |         check_is_fitted(self, 'selected_features_')
 89 |         X_ = check_array(X, dtype='numeric', accept_large_sparse=False)
 90 |         if X_.shape[1] != self.n_total_features_:
 91 |             getLogger(__name__).error(
 92 |                 "Shape of input is different from what was seen in 'fit'")
 93 |             raise ValueError(
 94 |                 "Shape of input is different from what was seen in 'fit'")
 95 |         if isinstance(X, pd.DataFrame):
 96 |             return X[X.columns[self.selected_features_]]
 97 |         else:
 98 |             return X_[:, self.selected_features_]
 99 | 
100 |     @abstractmethod
101 |     def _fit(self, X, y):
102 |         pass
103 | 


--------------------------------------------------------------------------------
/ITMO_FS/utils/base_wrapper.py:
--------------------------------------------------------------------------------
 1 | from logging import getLogger
 2 | 
 3 | from sklearn.base import clone
 4 | from sklearn.utils import check_array
 5 | from sklearn.utils.validation import check_is_fitted
 6 | 
 7 | from . import BaseTransformer
 8 | 
 9 | class BaseWrapper(BaseTransformer):
10 |     def __init__(self):
11 |         pass
12 | 
13 |     def fit(self, X, y=None, **fit_params):
14 |         """Fit the algorithm.
15 | 
16 |         Parameters
17 |         ----------
18 |         X : array-like, shape (n_samples, n_features)
19 |             The training input samples.
20 |         y : array-like, shape (n_samples,), optional
21 |             The class labels.
22 |         fit_params : dict, optional
23 |             Additional parameters to pass to underlying _fit function.
24 | 
25 |         Returns
26 |         -------
27 |         Self, i.e. the transformer object.
28 |         """
29 |         if not hasattr(self.estimator, 'fit'):
30 |             getLogger(__name__).error(
31 |                 "estimator should be an estimator implementing "
32 |                 "'fit' method, %s was passed", self.estimator)
33 |             raise TypeError(
34 |                 "estimator should be an estimator implementing "
35 |                 "'fit' method, %s was passed" % self.estimator)
36 |         if not hasattr(self.estimator, 'predict'):
37 |             getLogger(__name__).error(
38 |                 "estimator should be an estimator implementing "
39 |                 "'predict' method, %s was passed", self.estimator)
40 |             raise TypeError(
41 |                 "estimator should be an estimator implementing "
42 |                 "'predict' method, %s was passed" % self.estimator)
43 |         self._estimator = clone(self.estimator)
44 | 
45 |         return super().fit(X, y, **fit_params)
46 | 
47 |     def predict(self, X):
48 |         """Predict class labels for the input data.
49 | 
50 |         Parameters
51 |         ----------
52 |         X : array-like, shape (n_samples, n_features)
53 |             The input samples.
54 | 
55 |         Returns
56 |         -------
57 |         array-like, shape (n_samples,) : class labels
58 |         """
59 |         check_is_fitted(self, 'selected_features_')
60 |         X_ = check_array(X, dtype='float64', accept_large_sparse=False)
61 |         if X_.shape[1] != self.n_features_:
62 |             getLogger(__name__).error(
63 |                 "Shape of input is different from what was seen in 'fit'")
64 |             raise ValueError(
65 |                 "Shape of input is different from what was seen in 'fit'")
66 | 
67 |         return self._estimator.predict(X_[:, self.selected_features_])
68 | 


--------------------------------------------------------------------------------
/ITMO_FS/utils/data_check.py:
--------------------------------------------------------------------------------
 1 | from numpy import array
 2 | 
 3 | 
 4 | def generate_features(X, features=None):
 5 |     if features is None:
 6 |         try:
 7 |             if X.columns is list:
 8 |                 features = X.columns
 9 |             else:
10 |                 features = list(X.columns)
11 |         except AttributeError:
12 |             features = [i for i in range(X.shape[1])]
13 |     return array(features)
14 | 
15 | 
16 | def check_filters(filters):
17 |     for filter_ in filters:
18 |         attr = None
19 |         if not hasattr(filter_, 'fit'):
20 |             attr = 'fit'
21 |         if not hasattr(filter_, 'transform'):
22 |             attr = 'transform'
23 |         if not hasattr(filter_, 'fit_transform'):
24 |             attr = 'fit_transform'
25 |         if not (attr is None):
26 |             raise TypeError(
27 |                 "filters should be a list of filters each implementing {0} "
28 |                 "method, {1} was passed".format(attr, filter_))
29 | 
30 | 
31 | def check_cutting_rule(cutting_rule):
32 |     pass  # todo check cutting rule
33 | 
34 | 
35 | RESTRICTIONS = {'qpfs_filter': {'__select_k'}}
36 | 
37 | 
38 | def check_restrictions(measure_name, cutting_rule_name):
39 |     if (measure_name in RESTRICTIONS.keys() and
40 |             cutting_rule_name not in RESTRICTIONS[measure_name]):
41 |         raise KeyError(
42 |             "This measure %s doesn't support this cutting rule %s"
43 |             % (measure_name, cutting_rule_name))
44 | 


--------------------------------------------------------------------------------
/ITMO_FS/utils/functions.py:
--------------------------------------------------------------------------------
  1 | import numpy as np
  2 | from sklearn.metrics import f1_score
  3 | from sklearn.metrics.pairwise import euclidean_distances
  4 | 
  5 | 
  6 | def cartesian(rw, cl):  # returns cartesian product for passed numpy arrays as two paired numpy array
  7 |     tmp = np.array(np.meshgrid(rw, cl)).T.reshape(len(rw) * len(cl), 2)
  8 |     return tmp.T[0], tmp.T[1]
  9 | 
 10 | def weight_func(model):  # weight function used in MOS testing
 11 |     return model.coef_[0]
 12 | 
 13 | def f1_scorer(y_true, y_pred):
 14 |     return f1_score(y_true, y_pred, average='micro')
 15 | 
 16 | def augmented_rvalue(X, y, k=7, theta=3):
 17 |     """Calculate the augmented R-value for a dataset with two classes.
 18 | 
 19 |     Parameters
 20 |     ----------
 21 |     X : array-like, shape (n_samples, n_features)
 22 |         The input samples.
 23 |     y : array-like, shape (n_samples,)
 24 |         The classes for the samples.
 25 |     k : int
 26 |         The amount of nearest neighbors used in the calculation.
 27 |     theta : int
 28 |         The threshold value: if from k nearest neighbors of an object more than
 29 |         theta of them are of a different class, then this object is in the
 30 |         overlap region.
 31 | 
 32 |     Returns
 33 |     -------
 34 |     float - the augmented R-value for the dataset; the value is in the range
 35 |     [-1, 1].
 36 | 
 37 |     Notes
 38 |     -----
 39 |     For more details see `this paper <https://www.sciencedirect.com/science/article/pii/S0169743919306070>`_.
 40 |     """
 41 |     unique, counts = np.unique(y, return_counts=True)
 42 |     freq = sorted(list(zip(unique, counts)), key=lambda x: x[1], reverse=True)
 43 |     dm = euclidean_distances(X, X)
 44 |     Rs = []
 45 |     Cs = []
 46 | 
 47 |     for label, frequency in freq:
 48 |         Cs.append(frequency)
 49 |         count = 0
 50 |         for elem in [i for i, x in enumerate(y) if x == label]:
 51 |             nearest = knn_from_class(dm, y, elem, k, 1, anyClass=True)
 52 |             count += np.sign(
 53 |                 k
 54 |                 - list(map(lambda x: y[x], nearest)).count(label)
 55 |                 - theta)
 56 |         Rs.append(count / frequency)
 57 |     Cs = Cs[::-1]
 58 |     return np.dot(Rs, Cs) / len(X)
 59 | 
 60 | 
 61 | def knn_from_class(distances, y, index, k, cl, anyOtherClass=False,
 62 |                    anyClass=False):
 63 |     """Return the indices of k nearest neighbors of X[index] from the selected
 64 |     class.
 65 | 
 66 |     Parameters
 67 |     ----------
 68 |     distances : array-like, shape (n_samples, n_samples)
 69 |         The distance matrix of the input samples.
 70 |     y : array-like, shape (n_samples,)
 71 |         The classes for the samples.
 72 |     index : int
 73 |         The index of an element.
 74 |     k : int
 75 |         The amount of nearest neighbors to return.
 76 |     cl : int
 77 |         The class label for the nearest neighbors.
 78 |     anyClass : bool
 79 |         If True, returns neighbors not belonging to the same class as X[index].
 80 | 
 81 |     Returns
 82 |     -------
 83 |     array-like, shape (k,) - the indices of the nearest neighbors
 84 |     """
 85 |     y_c = np.copy(y)
 86 |     if anyOtherClass:
 87 |         cl = y_c[index] + 1
 88 |         y_c[y_c != y_c[index]] = cl
 89 |     if anyClass:
 90 |         y_c.fill(cl)
 91 |     class_indices = np.nonzero(y_c == cl)[0]
 92 |     distances_class = distances[index][class_indices]
 93 |     nearest = np.argsort(distances_class)
 94 |     if y_c[index] == cl:
 95 |         nearest = nearest[1:]
 96 | 
 97 |     return class_indices[nearest[:k]]
 98 | 
 99 | def matrix_norm(M):
100 |     """Calculate the norm of all rows in the matrix.
101 | 
102 |     Parameters
103 |     ----------
104 |     M : array-like, shape (n, m)
105 |         The matrix.
106 | 
107 |     Returns
108 |     -------
109 |     array-like, shape (n,) : the norms for each row in the matrix
110 |     """
111 |     return np.sqrt((M * M).sum(axis=1))
112 | 
113 | def l21_norm(M):
114 |     """Calculate the L2,1 norm of a matrix.
115 | 
116 |     Parameters
117 |     ----------
118 |     M : array-like, shape (n, m)
119 |         The matrix.
120 | 
121 |     Returns
122 |     -------
123 |     float : the L2,1 norm of this matrix
124 |     """
125 |     return matrix_norm(M).sum()
126 | 
127 | def power_neg_half(M):
128 |     """Calculate M ^ (-1/2).
129 | 
130 |     Parameters
131 |     ----------
132 |     M : array-like, shape (n, m)
133 |         The matrix.
134 | 
135 |     Returns
136 |     -------
137 |     array-like, shape (n, m) : M ^ (-1/2)
138 |     """
139 |     return np.sqrt(np.linalg.inv(M))
140 | 
141 | def apply_cr(cutting_rule):
142 |     """Extract the cutting rule from a tuple or callable.
143 | 
144 |     Parameters
145 |     ----------
146 |     cutting_rule : tuple or callable
147 |         A (str, float) tuple describing a cutting rule or a callable with
148 |         signature cutting_rule (features) which should return a list of features
149 |         ranked by some rule.
150 | 
151 |     Returns
152 |     -------
153 |     callable : a cutting rule callable
154 |     """
155 |     from ..filters.univariate.measures import CR_NAMES, MEASURE_NAMES
156 |     if type(cutting_rule) is tuple:
157 |         cutting_rule_name = cutting_rule[0]
158 |         cutting_rule_value = cutting_rule[1]
159 |         try:
160 |             cr = CR_NAMES[cutting_rule_name](cutting_rule_value)
161 |         except KeyError:
162 |             raise KeyError("No %s cutting rule yet" % cutting_rule_name)
163 |     elif hasattr(cutting_rule, '__call__'):
164 |         cr = cutting_rule
165 |     else:
166 |         raise KeyError(
167 |             "%s isn't a cutting rule function or string" % cutting_rule)
168 |     return cr
169 | 


--------------------------------------------------------------------------------
/ITMO_FS/utils/qpfs_body.py:
--------------------------------------------------------------------------------
 1 | import math
 2 | from functools import partial
 3 | 
 4 | import numpy as np
 5 | from qpsolvers import solve_qp
 6 | from scipy.linalg import sqrtm
 7 | 
 8 | 
 9 | def qpfs_body(X, y, fn, alpha=None, r=None, sigma=None, solv='quadprog',
10 |               metric_for_complex=complex.__abs__):
11 |     # TODO understand why complex double appears
12 |     # TODO find suitable r parameter value
13 |     # TODO find suitable sigma parameter value
14 |     if r is None:
15 |         r = X.shape[1] - 1
16 |     if r >= X.shape[1]:
17 |         raise TypeError("r parameter should be less than the number of features")
18 |     F = np.zeros(X.shape[1], dtype=np.double)  # F vector represents how each variable is correlated class
19 |     class_size = max(
20 |         y) + 1  # Count the number of classes, we assume that class labels would be numbers from 1 to max(y)
21 |     priors = np.histogram(y, bins=max(y))[0]  # Count prior probabilities of classes
22 |     for i in range(1, class_size):  # Loop through classes
23 |         Ck = np.where(y == i, 1, 0)  # Get array C(i) where C(k) is 1 when i = k and 0 otherwise
24 |         F += priors[i - 1] * fn(X, Ck)  # Counting F vector
25 |     Q = np.apply_along_axis(partial(fn, X), 0, X).reshape(X.shape[1], X.shape[1])
26 |     indices = np.random.random_integers(0, Q.shape[0] - 1,
27 |                                         r)  # Taking random r indices according to Nystrom approximation
28 |     A = Q[indices][:, :r]  # A matrix for Nystrom(matrix of real numbers with size of [r, r])
29 |     B = Q[indices][:, r:]  # B matrix for Nystrom(matrix of real numbers with size of [r, M - r])
30 |     if alpha is None:
31 |         alpha = __countAlpha(A, B, F)  # Only in filter method, in wrapper we should adapt it based on performance
32 |     AInvSqrt = sqrtm(np.linalg.pinv(A))  # Calculate squared root of inverted matrix A
33 |     S = np.add(A, AInvSqrt.dot(B).dot(B.T).dot(AInvSqrt))  # Caluclate S matrix
34 |     eigvals, EVect = np.linalg.eig(S)  # eigenvalues and eigenvectors of S
35 |     U = np.append(A, B.T, axis=0).dot(AInvSqrt).dot(EVect).dot(
36 |         sqrtm(np.linalg.pinv(EVect)))  # Eigenvectors of Q matrix using [A B]
37 |     eigvalsFilt, UFilt = __filterBy(sigma, eigvals,
38 |                                     U)  # Take onyl eigenvalues greater than threshold and corresponding eigenvectors
39 |     LFilt = np.zeros((len(eigvalsFilt), len(eigvalsFilt)), dtype=complex)  # initialize diagonal matrix of eigenvalues
40 |     for i in range(len(eigvalsFilt)):  # Loop through eigenvalues
41 |         LFilt[i][i] = eigvalsFilt[i]  # Init diagonal values
42 |     UFilt = np.array([list(map(metric_for_complex, t)) for t in UFilt])
43 |     LFilt = np.array([list(map(metric_for_complex, t)) for t in LFilt])
44 |     yf = solve_qp((1 - alpha) * LFilt, alpha * F.dot(UFilt), UFilt, np.zeros(UFilt.shape[0]),
45 |                   solver=solv)  # perform qp on stated problem
46 |     xSolution = UFilt.dot(yf)  # Find x - weights of features
47 |     forRanks = list(zip(xSolution, F, [x for x in range(len(F))]))  # Zip into array of tuple for proper sort
48 |     forRanks.sort(reverse=True)
49 |     ranks = np.zeros(len(F))
50 |     rankIndex = 1
51 |     for i in forRanks:
52 |         ranks[int(i[2])] = rankIndex
53 |         rankIndex += 1
54 |     return ranks
55 | 
56 | 
57 | def __filterBy(sigma, eigvals, U):
58 |     if sigma is None:
59 |         return eigvals, U
60 |     y = np.where(eigvals > sigma)[0]
61 |     return eigvals[y], U[:, y]
62 | 
63 | 
64 | def __countAlpha(A, B, F):
65 |     Comb = B.T.dot(np.linalg.pinv(A)).dot(B)
66 |     sumQ = np.sum(A) + 2 * np.sum(B) + np.sum(Comb)
67 |     sumQ /= (A.shape[1] + B.shape[1]) ** 2
68 |     sumF = np.sum(F)
69 |     sumF /= len(F)
70 |     return sumQ / (sumQ + sumF)
71 | 


--------------------------------------------------------------------------------
/ITMO_FS/wrappers/__init__.py:
--------------------------------------------------------------------------------
1 | from .deterministic import *
2 | from .randomized import *
3 | 


--------------------------------------------------------------------------------
/ITMO_FS/wrappers/deterministic/AddDelWrapper.py:
--------------------------------------------------------------------------------
  1 | from logging import getLogger
  2 | import random as rnd
  3 | 
  4 | import numpy as np
  5 | from sklearn.model_selection import cross_val_score
  6 | 
  7 | from ...utils import BaseWrapper, generate_features
  8 | 
  9 | 
 10 | class AddDelWrapper(BaseWrapper):
 11 |     """Add-Del feature wrapper.
 12 | 
 13 |     Parameters
 14 |     ----------
 15 |     estimator : object
 16 |         A supervised learning estimator that should have a fit(X, y) method and
 17 |         a predict(X) method.
 18 |     measure : string or callable
 19 |         A standard estimator metric (e.g. 'f1' or 'roc_auc') or a callable with
 20 |         signature measure(estimator, X, y) which should return only a single
 21 |         value.
 22 |     cv : int
 23 |         Number of folds in cross-validation.
 24 |     seed : int
 25 |         Seed for python random.
 26 |     d : int
 27 |         Amount of consecutive iterations for add ana del procedures that can
 28 |         have decreasing objective function before the algorithm terminates.
 29 | 
 30 |     See Also
 31 |     --------
 32 |     Lecture about feature selection (ru), p.13 -
 33 |     http://www.ccas.ru/voron/download/Modeling.pdf
 34 | 
 35 |     Examples
 36 |     --------
 37 |     >>> from sklearn.datasets import make_classification
 38 |     >>> from sklearn.linear_model import LogisticRegression
 39 |     >>> dataset = make_classification(n_samples=100, n_features=20,
 40 |     ... n_informative=5, n_redundant=0, shuffle=False, random_state=42)
 41 |     >>> x, y = np.array(dataset[0]), np.array(dataset[1])
 42 |     >>> lg = LogisticRegression(solver='lbfgs')
 43 |     >>> add_del = AddDelWrapper(lg, 'accuracy').fit(x, y)
 44 |     >>> add_del.selected_features_
 45 |     array([1, 4, 3], dtype=int64)
 46 |     """
 47 |     def __init__(self, estimator, measure, cv=3, seed=42, d=1):
 48 |         self.estimator = estimator
 49 |         self.measure = measure
 50 |         self.cv = cv
 51 |         self.seed = seed
 52 |         self.d = d
 53 | 
 54 |     def __add(self, X, y, free_features):
 55 |         """Add features to the selected set one by one until either all of
 56 |         the features are added or more than d iterations pass without
 57 |         increasing the objective function.
 58 | 
 59 |         Parameters
 60 |         ----------
 61 |         X : array-like, shape (n_samples, n_features)
 62 |             The training input samples.
 63 |         y : array-like, shape (n_samples,)
 64 |             The target values.
 65 |         free_features : array-like, shape (n_not_selected_features,)
 66 |             The array of current free features.
 67 | 
 68 |         Returns
 69 |         -------
 70 |         array-like, shape (n_new_selected_features,) : selected features;
 71 |         array-like, shape (n_new_not_selected_features,) : new free features
 72 |         """
 73 |         best_score = self.best_score_
 74 |         iteration_features = self.selected_features_
 75 |         iteration_free_features = free_features
 76 |         selected_features = self.selected_features_
 77 |         getLogger(__name__).info(
 78 |             "Trying to add features from free set %s to selected set %s",
 79 |             free_features, selected_features)
 80 | 
 81 |         while (iteration_features.shape[0] - selected_features.shape[0] <=
 82 |                    self.d) & (iteration_free_features.shape[0] != 0):
 83 |             getLogger(__name__).info(
 84 |                 "Current selected set: %s, best score: %d",
 85 |                 selected_features, best_score)
 86 |             scores = np.vectorize(
 87 |                 lambda f: cross_val_score(
 88 |                     self._estimator, X[:, np.append(iteration_features, f)], y,
 89 |                     cv=self.cv, scoring=self.measure).mean())(
 90 |             iteration_free_features)
 91 |             getLogger(__name__).info("Scores for all free features: %s", scores)
 92 | 
 93 |             to_add = np.argmax(scores)
 94 |             iteration_score = scores[to_add]
 95 |             getLogger(__name__).info(
 96 |                 "Adding feature %d, new score: %d",
 97 |                 iteration_free_features[to_add], iteration_score)
 98 |             iteration_features = np.append(
 99 |                 iteration_features, iteration_free_features[to_add])
100 |             iteration_free_features = np.delete(iteration_free_features, to_add)
101 | 
102 |             if iteration_score > best_score:
103 |                 selected_features = iteration_features
104 |                 free_features = iteration_free_features
105 |                 best_score = iteration_score
106 | 
107 |         return selected_features, free_features
108 | 
109 |     def __del(self, X, y, selected_features, free_features):
110 |         """Delete features from the selected set one by one until either only
111 |         one feature is left or more than d iterations pass without
112 |         increasing the objective function.
113 | 
114 |         Parameters
115 |         ----------
116 |         X : array-like, shape (n_samples, n_features)
117 |             The training input samples.
118 |         y : array-like, shape (n_samples,)
119 |             The target values.
120 |         selected_features : array-like, shape (n_selected_features,)
121 |             The array of current selected features.
122 |         free_features : array-like, shape (n_not_selected_features,)
123 |             The array of current free features.
124 | 
125 |         Returns
126 |         -------
127 |         array-like, shape (n_new_selected_features,) : new selected features;
128 |         array-like, shape (n_new_not_selected_features,) : new free features;
129 |         float : score for the selected feature set
130 |         """
131 |         best_score = cross_val_score(
132 |             self._estimator, X[:, selected_features], y, scoring=self.measure,
133 |             cv=self.cv).mean()
134 |         iteration_features = selected_features
135 |         iteration_free_features = free_features
136 |         getLogger(__name__).info(
137 |             "Trying to delete features from selected set %s", selected_features)
138 | 
139 |         while (selected_features.shape[0] - iteration_features.shape[0] <=
140 |                    self.d) & (iteration_features.shape[0] != 1):
141 |             getLogger(__name__).info(
142 |                 "Current selected set: %s, best score: %d",
143 |                 selected_features, best_score)
144 |             scores = np.vectorize(
145 |                 lambda i: cross_val_score(
146 |                     self._estimator, X[:, np.delete(iteration_features, i)], y,
147 |                     cv=self.cv, scoring=self.measure).mean())(
148 |             np.arange(0, iteration_features.shape[0]))
149 |             getLogger(__name__).info(
150 |                 "Scores for all selected features: %s", scores)
151 | 
152 |             to_delete = np.argmax(scores)
153 |             iteration_score = scores[to_delete]
154 |             getLogger(__name__).info(
155 |                 "Deleting feature %d, new score: %d",
156 |                 iteration_features[to_delete], iteration_score)
157 |             iteration_free_features = np.append(
158 |                 iteration_free_features, iteration_features[to_delete])
159 |             iteration_features = np.delete(iteration_features, to_delete)
160 | 
161 |             if iteration_score > best_score:
162 |                 selected_features = iteration_features
163 |                 free_features = iteration_free_features
164 |                 best_score = iteration_score
165 | 
166 |         return selected_features, free_features, best_score
167 | 
168 |     def _fit(self, X, y):
169 |         """Fit the wrapper.
170 | 
171 |         Parameters
172 |         ----------
173 |         X : array-like, shape (n_samples, n_features)
174 |             The training input samples.
175 |         y : array-like, shape (n_samples,)
176 |             The target values.
177 | 
178 |         Returns
179 |         -------
180 |         None
181 |         """
182 |         self.selected_features_ = np.array([], dtype='int')
183 |         free_features = generate_features(X)
184 |         self.best_score_ = 0
185 |         while True:
186 |             selected_features, free_features = self.__add(X, y, free_features)
187 |             getLogger(__name__).info(
188 |                 "After add: selected set = %s, free set = %s",
189 |                 selected_features, free_features)
190 |             selected_features, free_features, iteration_score = self.__del(
191 |                 X, y, selected_features, free_features)
192 |             getLogger(__name__).info(
193 |                 "After del: selected set = %s, free set = %s, score = %d",
194 |                 selected_features, free_features, iteration_score)
195 | 
196 |             if iteration_score > self.best_score_:
197 |                 self.best_score_ = iteration_score
198 |                 self.selected_features_ = selected_features
199 |             else:
200 |                 break
201 |         self._estimator.fit(X[:, self.selected_features_], y)
202 | 


--------------------------------------------------------------------------------
/ITMO_FS/wrappers/deterministic/BackwardSelection.py:
--------------------------------------------------------------------------------
 1 | from logging import getLogger
 2 | 
 3 | import numpy as np
 4 | from sklearn.model_selection import cross_val_score
 5 | 
 6 | from ...utils import generate_features, BaseWrapper
 7 | 
 8 | 
 9 | class BackwardSelection(BaseWrapper):
10 |     """Backward Selection removes one feature at a time until the number of
11 |     features to be removed is reached. On each step, the best n-1 features
12 |     out of n are chosen (according to some estimator metric) and the last one
13 |     is removed.
14 | 
15 |     Parameters
16 |     ----------
17 |     estimator : object
18 |         A supervised learning estimator that should have a fit(X, y) method and
19 |         a predict(X) method.
20 |     n_features : int
21 |         Number of features to select.
22 |     measure : string or callable
23 |         A standard estimator metric (e.g. 'f1' or 'roc_auc') or a callable with
24 |         signature measure(estimator, X, y) which should return only a single
25 |         value.
26 |     cv : int
27 |         Number of folds in cross-validation.
28 | 
29 |     See Also
30 |     --------
31 | 
32 |     Examples
33 |     --------
34 |     >>> from ITMO_FS.wrappers import BackwardSelection
35 |     >>> from sklearn.linear_model import LogisticRegression
36 |     >>> from sklearn.datasets import make_classification
37 |     >>> import numpy as np
38 |     >>> dataset = make_classification(n_samples=100, n_features=20,
39 |     ... n_informative=5, n_redundant=0, shuffle=False, random_state=42)
40 |     >>> x, y = np.array(dataset[0]), np.array(dataset[1])
41 |     >>> model = BackwardSelection(LogisticRegression(), 5,
42 |     ... measure='f1_macro').fit(x, y)
43 |     >>> model.selected_features_
44 |     array([ 0,  1,  2,  3, 13], dtype=int64)
45 |     """
46 |     def __init__(self, estimator, n_features, measure, cv=3):
47 |         self.estimator = estimator
48 |         self.n_features = n_features
49 |         self.measure = measure
50 |         self.cv = cv
51 | 
52 |     def _fit(self, X, y):
53 |         """Fit the wrapper.
54 | 
55 |         Parameters
56 |         ----------
57 |         X : array-like, shape (n_samples, n_features)
58 |             The training input samples.
59 |         y : array-like, shape (n_samples,)
60 |             The target values.
61 | 
62 |         Returns
63 |         -------
64 |         None
65 |         """
66 |         self.selected_features_ = generate_features(X)
67 | 
68 |         while self.selected_features_.shape[0] != self.n_features:
69 |             getLogger(__name__).info(
70 |                 "Current selected set: %s", self.selected_features_)
71 |             scores = np.vectorize(
72 |                 lambda i: cross_val_score(
73 |                     self._estimator,
74 |                     X[:, np.delete(self.selected_features_, i)], y, cv=self.cv,
75 |                     scoring=self.measure).mean())(
76 |             np.arange(0, self.selected_features_.shape[0]))
77 |             getLogger(__name__).info(
78 |                 "Scores for all selected features: %s", scores)
79 |             to_delete = np.argmax(scores)
80 |             getLogger(__name__).info(
81 |                 "Deleting feature %d", self.selected_features_[to_delete])
82 | 
83 |             self.selected_features_ = np.delete(
84 |                 self.selected_features_, to_delete)
85 |         self.best_score_ = cross_val_score(
86 |             self._estimator, X[:, self.selected_features_], y, cv=self.cv,
87 |             scoring=self.measure).mean()
88 |         self._estimator.fit(X[:, self.selected_features_], y)
89 | 


--------------------------------------------------------------------------------
/ITMO_FS/wrappers/deterministic/RecursiveElimination.py:
--------------------------------------------------------------------------------
 1 | from logging import getLogger
 2 | 
 3 | import numpy as np
 4 | from sklearn.model_selection import cross_val_score
 5 | 
 6 | from ...utils import generate_features, BaseWrapper
 7 | 
 8 | 
 9 | class RecursiveElimination(BaseWrapper):
10 |     """Recursive feature elimination algorithm.
11 | 
12 |     Parameters
13 |     ----------
14 |     estimator : object
15 |         A supervised learning estimator that should have a fit(X, y) method, a
16 |         predict(X) method and a field corresponding to feature weights.
17 |     n_features : int
18 |         Number of features to leave.
19 |     measure : string or callable
20 |         A standard estimator metric (e.g. 'f1' or 'roc_auc') or a callable with
21 |         signature measure(estimator, X, y) which should return only a single
22 |         value.
23 |     weight_func : callable
24 |         A function to extract weights from the model.
25 |     cv : int
26 |         Number of folds in cross-validation.
27 |             
28 |     See Also
29 |     --------
30 |     Guyon, I., Weston, J., Barnhill, S., & Vapnik, V., “Gene selection for
31 |     cancer classification using support vector machines”, Mach. Learn.,
32 |     46(1-3), 389–422, 2002.
33 |     https://link.springer.com/article/10.1023/A:1012487302797
34 | 
35 |     Examples
36 |     --------
37 |     >>> from sklearn.datasets import make_classification
38 |     >>> from ITMO_FS.wrappers import RecursiveElimination
39 |     >>> from sklearn.svm import SVC
40 |     >>> import numpy as np
41 |     >>> dataset = make_classification(n_samples=100, n_features=20,
42 |     ... n_informative=4, n_redundant=0, shuffle=False, random_state=42)
43 |     >>> x, y = np.array(dataset[0]), np.array(dataset[1])
44 |     >>> model = SVC(kernel='linear')
45 |     >>> rfe = RecursiveElimination(model, 5, measure='f1_macro',
46 |     ... weight_func=lambda model: np.square(model.coef_).sum(axis=0)).fit(x, y)
47 |     >>> rfe.selected_features_
48 |     array([ 0,  1,  2, 11, 19], dtype=int64)
49 |     """
50 |     def __init__(self, estimator, n_features, measure, weight_func, cv=3):
51 |         self.estimator = estimator
52 |         self.n_features = n_features
53 |         self.measure = measure
54 |         self.weight_func = weight_func
55 |         self.cv = cv
56 | 
57 |     def _fit(self, X, y):
58 |         """Fit the wrapper.
59 | 
60 |         Parameters
61 |         ----------
62 |         X : array-like, shape (n_samples, n_features)
63 |             The training input samples.
64 |         y : array-like, shape (n_samples,)
65 |             the target values.
66 | 
67 |         Returns
68 |         -------
69 |         None
70 |         """
71 |         self.selected_features_ = generate_features(X)
72 | 
73 |         while self.selected_features_.shape[0] != self.n_features:
74 |             getLogger(__name__).info(
75 |                 "Current selected set: %s", self.selected_features_)
76 |             self._estimator.fit(X[:, self.selected_features_], y)
77 |             weights = self.weight_func(self._estimator)
78 |             getLogger(__name__).info(
79 |                 "Weights for all selected features: %s", weights)
80 |             least_important = np.argmin(weights)
81 |             getLogger(__name__).info(
82 |                 "Deleting the least important feature %d",
83 |                 self.selected_features_[least_important])
84 |             self.selected_features_ = np.delete(self.selected_features_,
85 |                 least_important)
86 | 
87 |         self.best_score_ = cross_val_score(self._estimator,
88 |             X[:, self.selected_features_], y, cv=self.cv,
89 |             scoring=self.measure).mean()
90 |         self._estimator.fit(X[:, self.selected_features_], y)
91 | 


--------------------------------------------------------------------------------
/ITMO_FS/wrappers/deterministic/SequentialForwardSelection.py:
--------------------------------------------------------------------------------
 1 | from logging import getLogger
 2 | 
 3 | import numpy as np
 4 | from sklearn.model_selection import cross_val_score
 5 | 
 6 | from ...utils import generate_features, BaseWrapper
 7 | 
 8 | 
 9 | class SequentialForwardSelection(BaseWrapper):
10 |     """Sequentially add features that maximize the classifying function when
11 |     combined with the features already used.
12 |     #TODO add theory about this method
13 | 
14 |     Parameters
15 |     ----------
16 |     estimator: object
17 |         A supervised learning estimator that should have a fit(X, y) method and
18 |         a predict(X) method.
19 |     n_features : int
20 |         Number of features to select.
21 |     measure : string or callable
22 |         A standard estimator metric (e.g. 'f1' or 'roc_auc') or a callable with
23 |         signature measure(estimator, X, y) which should return only a single
24 |         value.
25 |     cv : int
26 |         Number of folds in cross-validation.
27 | 
28 |     See Also
29 |     --------
30 | 
31 |     Examples
32 |     --------
33 |     >>> from ITMO_FS.wrappers import SequentialForwardSelection
34 |     >>> from sklearn.linear_model import LogisticRegression
35 |     >>> from sklearn.datasets import make_classification
36 |     >>> import numpy as np
37 |     >>> dataset = make_classification(n_samples=100, n_features=20,
38 |     ... n_informative=5, n_redundant=0, shuffle=False, random_state=42)
39 |     >>> x, y = np.array(dataset[0]), np.array(dataset[1])
40 |     >>> model = SequentialForwardSelection(LogisticRegression(), 5,
41 |     ... measure='f1_macro').fit(x, y)
42 |     >>> model.selected_features_
43 |     array([ 1,  4,  3,  5, 19], dtype=int64)
44 |     """
45 |     def __init__(self, estimator, n_features, measure, cv=3):
46 |         self.estimator = estimator
47 |         self.n_features = n_features
48 |         self.measure = measure
49 |         self.cv = cv
50 | 
51 |     def _fit(self, X, y):
52 |         """Fit the wrapper.
53 | 
54 |         Parameters
55 |         ----------
56 |         X : array-like, shape (n_samples, n_features)
57 |             The training input samples.
58 |         y : array-like, shape (n_samples,)
59 |             The target values.
60 | 
61 |         Returns
62 |         -------
63 |         None
64 |         """
65 |         self.selected_features_ = np.array([], dtype=int)
66 |         free_features = generate_features(X)
67 | 
68 |         while self.selected_features_.shape[0] != self.n_features:
69 |             getLogger(__name__).info(
70 |                 "Current selected set: %s", self.selected_features_)
71 |             scores = np.vectorize(
72 |                 lambda f: cross_val_score(
73 |                     self._estimator,
74 |                     X[:, np.append(self.selected_features_, f)], y, cv=self.cv,
75 |                     scoring=self.measure).mean())(free_features)
76 |             getLogger(__name__).info("Scores for all free features: %s", scores)
77 |             to_add = np.argmax(scores)
78 |             getLogger(__name__).info("Adding feature %d", free_features[to_add])
79 |             self.selected_features_ = np.append(self.selected_features_,
80 |                 free_features[to_add])
81 |             free_features = np.delete(free_features, to_add)
82 | 
83 |         self.best_score_ = cross_val_score(self._estimator,
84 |             X[:, self.selected_features_], y, cv=self.cv,
85 |             scoring=self.measure).mean()
86 |         self._estimator.fit(X[:, self.selected_features_], y)
87 | 


--------------------------------------------------------------------------------
/ITMO_FS/wrappers/deterministic/__init__.py:
--------------------------------------------------------------------------------
1 | from .AddDelWrapper import AddDelWrapper
2 | from .BackwardSelection import BackwardSelection
3 | from .RecursiveElimination import RecursiveElimination
4 | from .SequentialForwardSelection import SequentialForwardSelection
5 | from .qpfs_wrapper import QPFSWrapper
6 | 


--------------------------------------------------------------------------------
/ITMO_FS/wrappers/deterministic/qpfs_wrapper.py:
--------------------------------------------------------------------------------
 1 | from ITMO_FS.filters.univariate.measures import pearson_corr
 2 | from ITMO_FS.utils.qpfs_body import qpfs_body
 3 | from ...utils import BaseWrapper
 4 | 
 5 | class QPFSWrapper(BaseWrapper):
 6 |     """
 7 |     #TODO rewrite to the proper notation
 8 |     Performs Quadratic Programming Feature Selection algorithm.
 9 |     Note that this realization requires labels to start from 1 and be numberical.
10 |     This is function for wrapper based on qpfs so alpha parameter must be specified, in case you don't know alpha parameter
11 |     it is suggested to use qpfs_filter
12 | 
13 |     Parameters
14 |     ----------
15 |     alpha : double value
16 |         That represents balance between relevance and redundancy of features.
17 |     r : int
18 |         The number of samples to be used in Nystrom optimization.
19 |     sigma : double
20 |         The threshold for eigenvalues to be used in solving QP optimization.
21 |     solv : string
22 |         The name of qp solver according to qpsolvers(https://pypi.org/project/qpsolvers/) naming.
23 |         Note quadprog is used by default.
24 |     fn : function(array, array)
25 |         The function to count correlation, for example pearson correlation or  mutual information.
26 |         Note pearson_corr from ITMO_FS measures is used by default.
27 |     Returns
28 |     ------
29 |     array-like, shape (n_features) : the ranks of features in dataset, with rank increase, feature relevance increases and redundancy decreases.
30 |     
31 |     See Also
32 |     --------
33 |     http://www.jmlr.org/papers/volume11/rodriguez-lujan10a/rodriguez-lujan10a.pdf
34 | 
35 |     Examples
36 |     --------
37 |     >>> import numpy as np
38 |     >>> x = np.array([[3, 3, 3, 2, 2], [3, 3, 1, 2, 3], [1, 3, 5, 1, 1], [3, 1, 4, 3, 1], [3, 1, 2, 3, 1]])
39 |     >>> y = np.array([1, 3, 2, 1, 2])
40 |     >>> alpha = 0.5
41 |     >>> ranks = qpfs_wrapper(x, y, alpha)
42 |     >>> print(ranks)
43 | 
44 |     """
45 |     def __init__(self, alpha, r=None, sigma=None, solv='quadprog', fn=pearson_corr):
46 |         self.alpha = alpha
47 |         self.r = r
48 |         self.sigma = sigma
49 |         self.solv = solv
50 |         self.fn = fn
51 | 
52 |     def _fit(X, y):
53 |         """
54 |             Fits wrapper.
55 | 
56 |             Parameters
57 |             ----------
58 |             X : array-like, shape (n_samples,n_features)
59 |                 The training input samples.
60 |             y : array-like, shape (n_samples,)
61 |                 The target values.
62 |             Returns
63 |             ------
64 |             None
65 |         """        
66 |         return qpfs_body(X, y, fn, alpha=alpha, r=r, sigma=sigma, solv=solv)
67 | 


--------------------------------------------------------------------------------
/ITMO_FS/wrappers/randomized/HillClimbing.py:
--------------------------------------------------------------------------------
 1 | from logging import getLogger
 2 | 
 3 | import numpy as np
 4 | from sklearn.base import clone
 5 | from sklearn.model_selection import cross_val_score
 6 | 
 7 | from ...utils import generate_features, BaseWrapper
 8 | 
 9 | class HillClimbingWrapper(BaseWrapper):
10 |     """Hill Climbing algorithm.
11 | 
12 |     Parameters
13 |     ----------
14 |     estimator : object
15 |         A supervised learning estimator that should have a fit(X, y) method and
16 |         a predict(X) method.
17 |     measure : string or callable
18 |         A standard estimator metric (e.g. 'f1' or 'roc_auc') or a callable with
19 |         signature measure(estimator, X, y) which should return only a single
20 |         value.
21 |     seed : int
22 |         Random seed used to initialize np.random.default_rng().
23 |     cv : int
24 |         Number of folds in cross-validation.
25 | 
26 |     See Also
27 |     --------
28 | 
29 |     Examples
30 |     --------
31 |     >>> from ITMO_FS.wrappers import HillClimbingWrapper
32 |     >>> from sklearn.linear_model import LogisticRegression
33 |     >>> from sklearn.datasets import make_classification
34 |     >>> import numpy as np
35 |     >>> dataset = make_classification(n_samples=100, n_features=20,
36 |     ... n_informative=5, n_redundant=0, shuffle=False, random_state=42)
37 |     >>> x, y = np.array(dataset[0]), np.array(dataset[1])
38 |     >>> model = HillClimbingWrapper(LogisticRegression(),
39 |     ... measure='f1_macro').fit(x, y)
40 |     >>> model.selected_features_
41 |     array([ 0,  1,  2,  3,  4,  6,  7,  9, 11, 13, 14, 15], dtype=int64)
42 |     """
43 |     def __init__(self, estimator, measure, seed=42, cv=3):
44 |         self.estimator = estimator
45 |         self.measure = measure
46 |         self.seed = seed
47 |         self.cv = cv
48 | 
49 |     def _fit(self, X, y):
50 |         """Fit the wrapper.
51 | 
52 |         Parameters
53 |         ----------
54 |         X : array-like, shape (n_samples, n_features)
55 |             The training input samples.
56 |         y : array-like, shape (n_samples,)
57 |             The target values.
58 | 
59 |         Returns
60 |         -------
61 |         None
62 |         """
63 |         rng = np.random.default_rng(self.seed)
64 | 
65 |         features = generate_features(X)
66 |         mask = rng.choice([True, False], self.n_features_)
67 |         getLogger(__name__).info("Initial feature mask: %s", mask)
68 |         score = cross_val_score(
69 |             self._estimator, X[:, features[mask]], y, cv=self.cv,
70 |             scoring=self.measure).mean()
71 | 
72 |         while True:
73 |             getLogger(__name__).info("Current best score: %d", score)
74 |             old_score = score
75 |             order = rng.permutation(self.n_features_)
76 |             for feature in order:
77 |                 getLogger(__name__).info("Trying to change feature %d", feature)
78 |                 mask[feature] = not(mask[feature])
79 |                 new_score = cross_val_score(self._estimator,
80 |                     X[:, features[mask]], y, cv=self.cv,
81 |                     scoring=self.measure).mean()
82 |                 getLogger(__name__).info("New score: %d", new_score)
83 |                 if new_score > score:
84 |                     score = new_score
85 |                     break
86 |                 mask[feature] = not(mask[feature])
87 |             if old_score == score:
88 |                 break
89 | 
90 |         self.selected_features_ = features[mask]
91 |         self.best_score_ = score
92 |         self._estimator.fit(X[:, self.selected_features_], y)
93 | 


--------------------------------------------------------------------------------
/ITMO_FS/wrappers/randomized/SimulatedAnnealing.py:
--------------------------------------------------------------------------------
  1 | from logging import getLogger
  2 | 
  3 | import numpy as np
  4 | from sklearn.model_selection import cross_val_score
  5 | 
  6 | from ...utils import BaseWrapper, generate_features
  7 | 
  8 | 
  9 | class SimulatedAnnealing(BaseWrapper):
 10 |     """Simulated Annealing algorithm.
 11 | 
 12 |     Parameters
 13 |     ----------
 14 |     estimator : object
 15 |         A supervised learning estimator that should have a fit(X, y) method and
 16 |         a predict(X) method.
 17 |     measure : string or callable
 18 |         A standard estimator metric (e.g. 'f1' or 'roc_auc') or a callable with
 19 |         signature measure(estimator, X, y) which should return only a single
 20 |         value.
 21 |     seed : int
 22 |         Random seed used to initialize np.random.default_rng().
 23 |     iteration_number : int
 24 |         Number of iterations of the algorithm.
 25 |     c : int
 26 |         A constant that is used to control the rate of feature perturbation.
 27 |     init_number_of_features : int
 28 |         The number of features to initialize start features subset with, by
 29 |         default 5-10 percents of features is used.
 30 |     cv : int
 31 |         Number of folds in cross-validation.
 32 |         
 33 |     Notes
 34 |     -----
 35 |     For more details see `this paper <http://www.feat.engineering/simulated-annealing.html/>`_.
 36 | 
 37 |     Examples
 38 |     --------
 39 |     >>> from sklearn.datasets import make_classification
 40 |     >>> from sklearn.linear_model import LogisticRegression
 41 |     >>> from ITMO_FS.wrappers.randomized import SimulatedAnnealing
 42 |     >>> dataset = make_classification(n_samples=100, n_features=20,
 43 |     ... n_informative=5, n_redundant=0, shuffle=False, random_state=42)
 44 |     >>> x, y = np.array(dataset[0]), np.array(dataset[1])
 45 |     >>> sa = SimulatedAnnealing(LogisticRegression(), measure='f1_macro',
 46 |     ... iteration_number=50).fit(x, y)
 47 |     >>> sa.selected_features_
 48 |     array([ 1,  4,  3, 17, 10, 16, 11, 14,  5], dtype=int64)
 49 |     """
 50 |     def __init__(self, estimator, measure, seed=42, iteration_number=100, c=1,
 51 |                  init_number_of_features=None, cv=3):
 52 |         self.estimator = estimator
 53 |         self.measure = measure
 54 |         self.seed = seed
 55 |         self.iteration_number = iteration_number
 56 |         self.c = c
 57 |         self.init_number_of_features = init_number_of_features
 58 |         self.cv = cv
 59 | 
 60 |     def __acceptance(self, i, prev_score, cur_score):
 61 |         return np.exp((i + 1) / self.c * (cur_score - prev_score) / prev_score)
 62 | 
 63 |     def _fit(self, X, y):
 64 |         """Fit the wrapper.
 65 | 
 66 |         Parameters
 67 |         ----------
 68 |         X : array-like, shape (n_samples, n_features)
 69 |             The training input samples.
 70 |         y : array-like, shape (n_samples,)
 71 |             the target values.
 72 | 
 73 |         Returns
 74 |         -------
 75 |         None
 76 |         """
 77 |         rng = np.random.default_rng(self.seed)
 78 |         features = generate_features(X)
 79 | 
 80 |         if self.init_number_of_features is None:
 81 |             percentage = rng.integers(5, 11)
 82 |             init_number_of_features = int(
 83 |                 self.n_features_ * percentage / 100) + 1
 84 |         elif self.init_number_of_features == 0:
 85 |             getLogger(__name__).warning(
 86 |                 "Initial number of features was set to zero; would use one "
 87 |                 "instead")
 88 |             init_number_of_features = 1
 89 |         else:
 90 |             init_number_of_features = self.init_number_of_features
 91 | 
 92 |         feature_subset = np.unique(
 93 |             rng.integers(0, self.n_features_, init_number_of_features))
 94 |         getLogger(__name__).info("Initial selected set: %s", feature_subset)
 95 |         prev_score = cross_val_score(
 96 |             self._estimator, X[:, feature_subset], y, cv=self.cv,
 97 |             scoring=self.measure).mean()
 98 |         getLogger(__name__).info("Initial score: %d", prev_score)
 99 | 
100 |         for i in range(self.iteration_number):
101 |             getLogger(__name__).info("Current best score: %d", prev_score)
102 |             operation = rng.integers(0, 2)
103 |             percentage = rng.integers(1, 5)
104 |             if operation == 1 & feature_subset.shape[0] != self.n_features_:
105 |                 # inc
106 |                 not_included_features = np.setdiff1d(features, feature_subset)
107 |                 include_number = min(
108 |                     not_included_features.shape[0],
109 |                     int(self.n_features_ * (percentage / 100)) + 1)
110 |                 to_add = rng.choice(
111 |                     not_included_features, size=include_number, replace=False)
112 |                 getLogger(__name__).info(
113 |                     "Trying to add features %s into the selected set", to_add)
114 |                 cur_subset = np.append(feature_subset, to_add)
115 |             else:
116 |                 # exc
117 |                 exclude_number = min(
118 |                     feature_subset.shape[0] - 1,
119 |                     int(self.n_features_ * (percentage / 100)) + 1)
120 |                 to_delete = rng.choice(
121 |                     np.arange(feature_subset.shape[0]), size=exclude_number,
122 |                     replace=False)
123 |                 getLogger(__name__).info(
124 |                     "Trying to delete features %s from the selected set",
125 |                     feature_subset[to_delete])
126 |                 cur_subset = np.delete(feature_subset, to_delete)
127 |             cur_score = cross_val_score(
128 |                 self._estimator, X[:, cur_subset], y, cv=self.cv,
129 |                 scoring=self.measure).mean()
130 |             getLogger(__name__).info("New score: %d", cur_score)
131 |             if cur_score > prev_score:
132 |                 feature_subset = cur_subset
133 |                 prev_score = cur_score
134 |             else:
135 |                 getLogger(__name__).info(
136 |                     "Score has not improved; trying to accept the new subset "
137 |                     "anyway")
138 |                 ruv = rng.random()
139 |                 acceptance = self.__acceptance(i, prev_score, cur_score)
140 |                 getLogger(__name__).info(
141 |                     "Random value = %d, acceptance = %d", ruv, acceptance)
142 |                 if ruv < acceptance:
143 |                     getLogger(__name__).info("Accepting the new subset")
144 |                     feature_subset = cur_subset
145 |                     prev_score = cur_score
146 | 
147 |         self.selected_features_ = feature_subset
148 |         self.best_score_ = prev_score
149 |         self._estimator.fit(X[:, self.selected_features_], y)
150 | 


--------------------------------------------------------------------------------
/ITMO_FS/wrappers/randomized/__init__.py:
--------------------------------------------------------------------------------
1 | from .HillClimbing import HillClimbingWrapper
2 | from .TPhMGWO import TPhMGWO
3 | from .SimulatedAnnealing import SimulatedAnnealing


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | BSD 3-Clause License
 2 | 
 3 | Copyright (c) 2020, ITMO University,Nikita Pilnenskiy
 4 | All rights reserved.
 5 | 
 6 | Redistribution and use in source and binary forms, with or without
 7 | modification, are permitted provided that the following conditions are met:
 8 | 
 9 | 1. Redistributions of source code must retain the above copyright notice, this
10 |    list of conditions and the following disclaimer.
11 | 
12 | 2. Redistributions in binary form must reproduce the above copyright notice,
13 |    this list of conditions and the following disclaimer in the documentation
14 |    and/or other materials provided with the distribution.
15 | 
16 | 3. Neither the name of the copyright holder nor the names of its
17 |    contributors may be used to endorse or promote products derived from
18 |    this software without specific prior written permission.
19 | 
20 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
21 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
22 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
23 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
24 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
25 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
26 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
27 | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
28 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
29 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
30 | 


--------------------------------------------------------------------------------
/_config.yml:
--------------------------------------------------------------------------------
1 | theme: jekyll-theme-minimal


--------------------------------------------------------------------------------
/docs/Makefile:
--------------------------------------------------------------------------------
 1 | # Minimal makefile for Sphinx documentation
 2 | #
 3 | 
 4 | # You can set these variables from the command line, and also
 5 | # from the environment for the first two.
 6 | SPHINXOPTS    ?=
 7 | SPHINXBUILD   ?= sphinx-build
 8 | SOURCEDIR     = .
 9 | BUILDDIR      = _build
10 | 
11 | # Put it first so that "make" without argument is like "make help".
12 | help:
13 | 	@$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
14 | 
15 | .PHONY: help Makefile
16 | 
17 | # Catch-all target: route all unknown targets to Sphinx using the new
18 | # "make mode" option.  $(O) is meant as a shortcut for $(SPHINXOPTS).
19 | %: Makefile
20 | 	@$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
21 | 


--------------------------------------------------------------------------------
/docs/_templates/class.rst:
--------------------------------------------------------------------------------
 1 | :mod:`{{module}}`.{{objname}}
 2 | {{ underline }}==============
 3 | 
 4 | .. currentmodule:: {{ module }}
 5 | 
 6 | .. autoclass:: {{ objname }}
 7 | 
 8 |    {% block methods %}
 9 |    .. automethod:: __init__
10 |    {% endblock %}
11 | 
12 | 
13 | .. include:: {{module}}.{{objname}}.examples
14 | 
15 | .. raw:: html
16 | 
17 |     <div style='clear:both'></div>
18 | 


--------------------------------------------------------------------------------
/docs/_templates/function.rst:
--------------------------------------------------------------------------------
 1 | :mod:`{{module}}`.{{objname}}
 2 | {{ underline }}====================
 3 | 
 4 | .. currentmodule:: {{ module }}
 5 | 
 6 | .. autofunction:: {{ objname }}
 7 | 
 8 | .. include:: {{module}}.{{objname}}.examples
 9 | 
10 | .. raw:: html
11 | 
12 |     <div style='clear:both'></div>


--------------------------------------------------------------------------------
/docs/api.rst:
--------------------------------------------------------------------------------
  1 | ######################
  2 | ITMO_FS API
  3 | ######################
  4 | 
  5 | This is the full API documentation of the `ITMO_FS` toolbox.
  6 | 
  7 | .. _filters_ref:
  8 | 
  9 | :mod:`ITMO_FS.filters`: Filter methods
 10 | ======================================
 11 | 
 12 | .. automodule:: filters
 13 |     :no-members:
 14 |     :no-inherited-members:
 15 | 
 16 | .. currentmodule:: ITMO_FS
 17 | 
 18 | :mod:`ITMO_FS.filters.univariate`: Univariate filter methods
 19 | ------------------------------------------------------------
 20 | 
 21 | .. automodule:: filters.univariate
 22 |     :no-members:
 23 |     :no-inherited-members:
 24 | 
 25 | .. currentmodule:: ITMO_FS
 26 | 
 27 | .. autosummary::
 28 |    :toctree: generated/
 29 |    :template: class.rst
 30 | 
 31 |     filters.univariate.VDM
 32 |     filters.univariate.UnivariateFilter
 33 | 
 34 | Measures for univariate filters
 35 | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
 36 | 
 37 | .. automodule:: filters.univariate.measures
 38 |     :no-members:
 39 |     :no-inherited-members:
 40 | 
 41 | .. currentmodule:: ITMO_FS
 42 | 
 43 | 
 44 | .. autosummary::
 45 |    :toctree: generated/
 46 |    :template: function.rst
 47 | 
 48 |     filters.univariate.fit_criterion_measure
 49 |     filters.univariate.f_ratio_measure
 50 |     filters.univariate.gini_index
 51 |     filters.univariate.su_measure
 52 |     filters.univariate.spearman_corr
 53 |     filters.univariate.pearson_corr
 54 |     filters.univariate.fechner_corr
 55 |     filters.univariate.kendall_corr
 56 |     filters.univariate.reliefF_measure
 57 |     filters.univariate.chi2_measure
 58 |     filters.univariate.information_gain
 59 | 
 60 | Cutting rules for univariate filters
 61 | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
 62 | 
 63 | .. automodule:: filters.univariate.measures
 64 |     :no-members:
 65 |     :no-inherited-members:
 66 | 
 67 | .. currentmodule:: ITMO_FS
 68 | 
 69 | 
 70 | .. autosummary::
 71 |    :toctree: generated/
 72 |    :template: function.rst
 73 | 
 74 |     filters.univariate.select_best_by_value
 75 |     filters.univariate.select_worst_by_value
 76 |     filters.univariate.select_k_best
 77 |     filters.univariate.select_k_worst
 78 |     filters.univariate.select_best_percentage
 79 |     filters.univariate.select_worst_percentage
 80 | 
 81 | 
 82 | :mod:`ITMO_FS.filters.multivariate`: Multivariate filter methods
 83 | ----------------------------------------------------------------
 84 | 
 85 | .. automodule:: filters.multivariate
 86 |     :no-members:
 87 |     :no-inherited-members:
 88 | 
 89 | .. currentmodule:: ITMO_FS
 90 | 
 91 | .. autosummary::
 92 |    :toctree: generated/
 93 |    :template: class.rst
 94 | 
 95 |     filters.multivariate.DISRWithMassive
 96 |     filters.multivariate.FCBFDiscreteFilter
 97 |     filters.multivariate.MultivariateFilter
 98 |     filters.multivariate.STIR
 99 |     filters.multivariate.TraceRatioFisher
100 |     filters.multivariate.MIMAGA
101 | 
102 | 
103 | Measures for multivariate filters
104 | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
105 | 
106 | .. automodule:: filters.multivariate.measures
107 |     :no-members:
108 |     :no-inherited-members:
109 | 
110 | .. currentmodule:: ITMO_FS
111 | 
112 | 
113 | .. autosummary::
114 |    :toctree: generated/
115 |    :template: function.rst
116 | 
117 |     filters.multivariate.MIM
118 |     filters.multivariate.MRMR
119 |     filters.multivariate.JMI
120 |     filters.multivariate.CIFE
121 |     filters.multivariate.MIFS
122 |     filters.multivariate.CMIM
123 |     filters.multivariate.ICAP
124 |     filters.multivariate.DCSF
125 |     filters.multivariate.CFR
126 |     filters.multivariate.MRI
127 |     filters.multivariate.IWFS
128 |     filters.multivariate.generalizedCriteria
129 | 
130 | 
131 | :mod:`ITMO_FS.filters.unsupervised`: Unsupervised filter methods
132 | ----------------------------------------------------------------
133 | 
134 | .. automodule:: filters.unsupervised
135 |     :no-members:
136 |     :no-inherited-members:
137 | 
138 | .. currentmodule:: ITMO_FS
139 | 
140 | 
141 | .. autosummary::
142 |    :toctree: generated/
143 |    :template: class.rst
144 | 
145 |     filters.unsupervised.TraceRatioLaplacian
146 | 
147 | 
148 | :mod:`ITMO_FS.filters.sparse`: Sparse filter methods
149 | ----------------------------------------------------------------
150 | 
151 | .. automodule:: filters.sparse
152 |     :no-members:
153 |     :no-inherited-members:
154 | 
155 | .. currentmodule:: ITMO_FS
156 | 
157 | 
158 | .. autosummary::
159 |    :toctree: generated/
160 |    :template: class.rst
161 | 
162 |     filters.sparse.MCFS
163 |     filters.sparse.NDFS
164 |     filters.sparse.RFS
165 |     filters.sparse.SPEC
166 |     filters.sparse.UDFS
167 | 
168 | 
169 | 
170 | .. _ensembles_ref:
171 | 
172 | :mod:`ITMO_FS.ensembles`: Ensemble methods
173 | ==========================================
174 | 
175 | .. automodule:: ensembles
176 |     :no-members:
177 |     :no-inherited-members:
178 | 
179 | .. currentmodule:: ITMO_FS
180 | 
181 | :mod:`ITMO_FS.ensembles.measure_based`: Measure based ensemble methods
182 | -----------------------------------------------------------------------
183 | 
184 | .. automodule:: ensembles.measure_based
185 |     :no-members:
186 |     :no-inherited-members:
187 | 
188 | .. currentmodule:: ITMO_FS
189 | 
190 | .. autosummary::
191 |    :toctree: generated/
192 |    :template: class.rst
193 | 
194 |     ensembles.measure_based.WeightBased
195 | 
196 | 
197 | :mod:`ITMO_FS.ensembles.model_based`: Model based ensemble methods
198 | ------------------------------------------------------------------
199 | 
200 | .. automodule:: ensembles.model_based
201 |     :no-members:
202 |     :no-inherited-members:
203 | 
204 | .. currentmodule:: ITMO_FS
205 | 
206 | .. autosummary::
207 |    :toctree: generated/
208 |    :template: class.rst
209 | 
210 |     ensembles.model_based.BestSum
211 | 
212 | 
213 | :mod:`ITMO_FS.ensembles.ranking_based`: Ranking based ensemble methods
214 | ----------------------------------------------------------------------
215 | 
216 | .. automodule:: ensembles.ranking_based
217 |     :no-members:
218 |     :no-inherited-members:
219 | 
220 | .. currentmodule:: ITMO_FS
221 | 
222 | .. autosummary::
223 |    :toctree: generated/
224 |    :template: class.rst
225 | 
226 |     ensembles.ranking_based.Mixed
227 | 
228 | 
229 | .. _embedded_ref:
230 | 
231 | :mod:`ITMO_FS.embedded`: Embedded methods
232 | =========================================
233 | 
234 | .. automodule:: embedded
235 |     :no-members:
236 |     :no-inherited-members:
237 | 
238 | .. currentmodule:: ITMO_FS
239 | 
240 | .. autosummary::
241 |    :toctree: generated/
242 |    :template: class.rst
243 | 
244 |     embedded.MOS
245 | 
246 | 
247 | .. _hybrid_ref:
248 | 
249 | :mod:`ITMO_FS.hybrid`: Hybrid methods
250 | =========================================
251 | 
252 | .. automodule:: hybrid
253 |     :no-members:
254 |     :no-inherited-members:
255 | 
256 | .. currentmodule:: ITMO_FS
257 | 
258 | .. autosummary::
259 |    :toctree: generated/
260 |    :template: class.rst
261 | 
262 |     hybrid.FilterWrapperHybrid
263 |     hybrid.Melif
264 | 
265 | 
266 | .. _wrappers_ref:
267 | 
268 | :mod:`ITMO_FS.wrappers`: Wrapper methods
269 | ========================================
270 | 
271 | .. automodule:: wrappers
272 |     :no-members:
273 |     :no-inherited-members:
274 | 
275 | .. currentmodule:: ITMO_FS
276 | 
277 | :mod:`ITMO_FS.wrappers.deterministic`: Deterministic wrapper methods
278 | --------------------------------------------------------------------
279 | 
280 | .. automodule:: wrappers.deterministic
281 |     :no-members:
282 |     :no-inherited-members:
283 | 
284 | .. currentmodule:: ITMO_FS
285 | 
286 | .. autosummary::
287 |    :toctree: generated/
288 |    :template: class.rst
289 | 
290 |     wrappers.deterministic.AddDelWrapper
291 |     wrappers.deterministic.BackwardSelection
292 |     wrappers.deterministic.RecursiveElimination
293 |     wrappers.deterministic.SequentialForwardSelection
294 | 
295 | Deterministic wrapper function
296 | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
297 | 
298 | .. autosummary::
299 |    :toctree: generated/
300 |    :template: function.rst
301 | 
302 |     wrappers.deterministic.qpfs_wrapper
303 | 
304 | 
305 | 
306 | :mod:`ITMO_FS.wrappers.randomized`: Randomized wrapper methods
307 | ------------------------------------------------------------------
308 | 
309 | .. automodule:: wrappers.randomized
310 |     :no-members:
311 |     :no-inherited-members:
312 | 
313 | .. currentmodule:: ITMO_FS
314 | 
315 | .. autosummary::
316 |    :toctree: generated/
317 |    :template: class.rst
318 | 
319 |     wrappers.randomized.HillClimbingWrapper
320 |     wrappers.randomized.SimulatedAnnealing
321 |     wrappers.randomized.TPhMGWO
322 | 


--------------------------------------------------------------------------------
/docs/conf.py:
--------------------------------------------------------------------------------
 1 | # Configuration file for the Sphinx documentation builder.
 2 | #
 3 | # This file only contains a selection of the most common options. For a full
 4 | # list see the documentation:
 5 | # https://www.sphinx-doc.org/en/master/usage/configuration.html
 6 | 
 7 | # -- Path setup --------------------------------------------------------------
 8 | 
 9 | # If extensions (or modules to document with autodoc) are in another directory,
10 | # add these directories to sys.path here. If the directory is relative to the
11 | # documentation root, use os.path.abspath to make it absolute, like shown here.
12 | #
13 | import os
14 | import sys
15 | import sphinx_rtd_theme
16 | sys.path.insert(0, os.path.abspath('../ITMO_FS'))
17 | sys.path.insert(1, os.path.abspath('..'))
18 | 
19 | 
20 | # -- Project information -----------------------------------------------------
21 | 
22 | project = 'ITMO_FS'
23 | copyright = '2020, ITMO University,Nikita Pilnenskiy'
24 | author = 'Nikita Pilnenskiy'
25 | 
26 | # The full version, including alpha/beta/rc tags
27 | release = '0.3.2'
28 | 
29 | 
30 | # -- General configuration ---------------------------------------------------
31 | 
32 | # Add any Sphinx extension module names here, as strings. They can be
33 | # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom
34 | # ones.
35 | extensions = ['sphinx.ext.autodoc', 'sphinx.ext.coverage', 'sphinx.ext.napoleon', 'sphinx.ext.autosummary']
36 | 
37 | 
38 | autodoc_default_flags = ['members', 'inherited-members']
39 | 
40 | # Add any paths that contain templates here, relative to this directory.
41 | templates_path = ['_templates']
42 | 
43 | # generate autosummary even if no references
44 | autosummary_generate = True
45 | 
46 | # The language for content autogenerated by Sphinx. Refer to documentation
47 | # for a list of supported languages.
48 | #
49 | # This is also used if you do content translation via gettext catalogs.
50 | # Usually you set "language" from the command line for these cases.
51 | language = 'ru'
52 | master_doc = 'index'
53 | # List of patterns, relative to source directory, that match files and
54 | # directories to ignore when looking for source files.
55 | # This pattern also affects html_static_path and html_extra_path.
56 | exclude_patterns = ['_build', 'Thumbs.db', '.DS_Store']
57 | 
58 | 
59 | # -- Options for HTML output -------------------------------------------------
60 | 
61 | # The theme to use for HTML and HTML Help pages.  See the documentation for
62 | # a list of builtin themes.
63 | #
64 | html_theme = 'sphinx_rtd_theme'
65 | 
66 | 
67 | # Add any paths that contain custom static files (such as style sheets) here,
68 | # relative to this directory. They are copied after the builtin static files,
69 | # so a file named "default.css" will overwrite the builtin "default.css".
70 | html_theme_path = [sphinx_rtd_theme.get_html_theme_path()]
71 | 


--------------------------------------------------------------------------------
/docs/index.rst:
--------------------------------------------------------------------------------
 1 | .. project-template documentation master file, created by
 2 |    sphinx-quickstart on Mon Jan 18 14:44:12 2016.
 3 |    You can adapt this file completely to your liking, but it should at least
 4 |    contain the root `toctree` directive.
 5 | 
 6 | ##########################################
 7 | Welcome to ITMO_FS!
 8 | ##########################################
 9 | 
10 | .. toctree::
11 |    :maxdepth: 2
12 |    :hidden:
13 |    :caption: Getting Started
14 | 
15 |    install
16 | 
17 | .. toctree::
18 |    :maxdepth: 2
19 |    :hidden:
20 |    :caption: Documentation
21 | 
22 |    user_guide
23 |    api
24 | 
25 | 
26 | `Getting started <install.html>`_
27 | ---------------------------------
28 | 
29 | Information to install, test, and contribute to the package.
30 | 
31 | `User Guide <user_guide.html>`_
32 | -------------------------------
33 | 
34 | User guide of ITMO_FS
35 | 
36 | `API <api.html>`_
37 | -------------------------------
38 | 
39 | The main documentation. This contains an in-depth description of all
40 | algorithms and how to apply them.
41 | 
42 | `API Documentation <api.html>`_
43 | -------------------------------
44 | 
45 | The exact API of all functions and classes, as given in the
46 | doctring. The API documents expected types and allowed features for
47 | all functions, and all parameters available for the algorithms.
48 | 
49 | 
50 | 


--------------------------------------------------------------------------------
/docs/install.rst:
--------------------------------------------------------------------------------
 1 | ########################
 2 | Install and contribution
 3 | ########################
 4 | 
 5 | Prerequisites
 6 | =============
 7 | 
 8 | The feature selection library requires the following dependencies:
 9 | 
10 | * python (>=3.6)
11 | * numpy (>=1.13.3)
12 | * scipy (>=0.19.1)
13 | * scikit-learn (>=0.22)
14 | * imblearn (>=0.0)
15 | * qpsolvers (>=1.0.1)
16 | 
17 | Install
18 | =======
19 | 
20 | ITMO_FS is currently available on the PyPi's repositories and you can
21 | install it via `pip`::
22 | 
23 |   pip install -U ITMO_FS
24 | 
25 | If you prefer, you can clone it and run the setup.py file. Use the following
26 | commands to get a copy from Github and install all dependencies::
27 | 
28 |   git clone https://github.com/LastShekel/ITMO_FS.git
29 |   cd ITMO_FS
30 |   pip install .
31 | 
32 | Or install using pip and GitHub::
33 | 
34 |   pip install -U git+https://github.com/LastShekel/ITMO_FS.git
35 | 
36 | Test and coverage
37 | =================
38 | 
39 | You want to test the code before to install::
40 | 
41 |   $ make test
42 | 
43 | You wish to test the coverage of your version::
44 | 
45 |   $ make coverage
46 | 
47 | You can also use `pytest`::
48 | 
49 |   $ pytest ITMO_FS -v
50 | 
51 | Contribute
52 | ==========
53 | 
54 | You can contribute to this code through Pull Request on GitHub_. Please, make
55 | sure that your code is coming with unit tests to ensure full coverage and
56 | continuous integration in the API.
57 | 
58 | .. _GitHub: https://github.com/LastShekel/ITMO_FS/pulls
59 | 


--------------------------------------------------------------------------------
/docs/introduction.rst:
--------------------------------------------------------------------------------
 1 | .. _introduction:
 2 | 
 3 | ============
 4 | Introduction
 5 | ============
 6 | 
 7 | .. _api_ITMO_FS:
 8 | 
 9 | API's of feature selectors
10 | ----------------------------------
11 | 
12 | Available selectors follow the scikit-learn API using the base estimator
13 | and selector mixin:
14 | 
15 | :Transformer:
16 | 
17 |     The base object, implements a ``fit`` method to learn from data, either::
18 | 
19 |       selector.fit(data, targets)
20 | 
21 |     To select features from a data set after learning, each selector implements::
22 | 
23 |       data_selected = selector.transform(data)
24 | 
25 |     To learn from data and select features from the same data set at once, each selector implements::
26 | 
27 |       data_selected = selector.fit_transform(data, targets)
28 | 
29 |     To reverse the selection operation, each selector implements::
30 | 
31 |       data_reversed = selector.fit_transform(data)
32 | 
33 | Feature selectors accept the same inputs that in scikit-learn:
34 | 
35 | * ``data``: array-like (2-D list, pandas.Dataframe, numpy.array) or sparse
36 |   matrices;
37 | * ``targets``: array-like (1-D list, pandas.Series, numpy.array).
38 | 
39 | The output will be of the following type:
40 | 
41 | * ``data_selected``: array-like (2-D list, pandas.Dataframe, numpy.array) or
42 |    sparse matrices;
43 | * ``data_reversed``: array-like (2-D list, pandas.Dataframe, numpy.array) or
44 |    sparse matrices;
45 | 
46 | .. topic:: Sparse input
47 | 
48 |    For sparse input the data is **converted to the Compressed Sparse Rows
49 |    representation** (see ``scipy.sparse.csr_matrix``) before being fed to the
50 |    sampler. To avoid unnecessary memory copies, it is recommended to choose the
51 |    CSR representation upstream.
52 | 
53 | .. _problem_statement:
54 | 
55 | Problem statement regarding data sets with redundant features
56 | -------------------------------------------------------------
57 | 
58 | Feature selection methods can be used to identify and remove unneeded,
59 | irrelevant and redundant attributes from data that do not contribute
60 | to the accuracy of a predictive model or may in fact decrease the
61 | accuracy of the model. Fewer attributes is desirable because it reduces
62 | the complexity of the model, and a simpler model is simpler to understand
63 | and explain.
64 | 
65 | Here is one of examples of feature selection improving the classification quality::
66 | 
67 |     >>> from sklearn.datasets import make_classification
68 |     >>> from sklearn.linear_model import SGDClassifier
69 |     >>> from ITMO_FS.embedded import MOS
70 | 
71 |     >>> X, y = make_classification(n_samples=300, n_features=10, random_state=0, n_informative=2)
72 |     >>> sel = MOS()
73 |     >>> trX = sel.fit_transform(X, y, smote=False)
74 | 
75 |     >>> cl1 = SGDClassifier()
76 |     >>> cl1.fit(X, y)
77 |     >>> cl1.score(X, y)
78 |     0.9033333333333333
79 | 
80 |     >>> cl2 = SGDClassifier()
81 |     >>> cl2.fit(trX, y)
82 |     >>> cl2.score(trX, y)
83 |     0.9433333333333334
84 | 
85 | As expected, the quality of the SVGClassifier's results is impacted by the presence of redundant features in data set.
86 | We can see that after using of feature selection the mean accuracy increases from 0.903 to 0.943.
87 | 


--------------------------------------------------------------------------------
/docs/logos/logo_itmo_fs_itog_colour.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ctlab/ITMO_FS/a2e61e2fabb9dfb34d90a1130fc7f5f162a2c921/docs/logos/logo_itmo_fs_itog_colour.jpg


--------------------------------------------------------------------------------
/docs/make.bat:
--------------------------------------------------------------------------------
 1 | @ECHO OFF
 2 | 
 3 | pushd %~dp0
 4 | 
 5 | REM Command file for Sphinx documentation
 6 | 
 7 | if "%SPHINXBUILD%" == "" (
 8 | 	set SPHINXBUILD=sphinx-build
 9 | )
10 | set SOURCEDIR=.
11 | set BUILDDIR=_build
12 | 
13 | if "%1" == "" goto help
14 | 
15 | %SPHINXBUILD% >NUL 2>NUL
16 | if errorlevel 9009 (
17 | 	echo.
18 | 	echo.The 'sphinx-build' command was not found. Make sure you have Sphinx
19 | 	echo.installed, then set the SPHINXBUILD environment variable to point
20 | 	echo.to the full path of the 'sphinx-build' executable. Alternatively you
21 | 	echo.may add the Sphinx directory to PATH.
22 | 	echo.
23 | 	echo.If you don't have Sphinx installed, grab it from
24 | 	echo.http://sphinx-doc.org/
25 | 	exit /b 1
26 | )
27 | 
28 | %SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O%
29 | goto end
30 | 
31 | :help
32 | %SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O%
33 | 
34 | :end
35 | popd
36 | 


--------------------------------------------------------------------------------
/docs/user_guide.rst:
--------------------------------------------------------------------------------
 1 | .. title:: User guide: contents
 2 | 
 3 | .. _user_guide:
 4 | 
 5 | ==========
 6 | User Guide
 7 | ==========
 8 | 
 9 | .. toctree::
10 |    :numbered:
11 | 
12 |    introduction.rst


--------------------------------------------------------------------------------
/meta.yml:
--------------------------------------------------------------------------------
 1 | {% set name = "itmo_fs" %}
 2 | {% set version = "0.3.3" %}
 3 | 
 4 | package:
 5 |   name: "{{ name|lower }}"
 6 |   version: "{{ load_setup_py_data().version }}"
 7 | 
 8 | source:
 9 |   git_rev:
10 |   git_url:
11 |   url: https://pypi.io/packages/source/{{ name[0] }}/{{ name }}/{{ name }}-{{ version }}.tar.gz
12 |       sha256: 5b94b49521f6456670fdb30cd82a4eca9412788a93fa6dd6df72c94d5a8ff2d7
13 | 
14 | build:
15 |   number: 0
16 |   script: "{{ PYTHON }} -m pip install . --no-deps --ignore-installed -vv "
17 | 
18 | 
19 | requirements:
20 |   build:
21 |     - pip
22 |     - python
23 |     - setuptools
24 | 
25 |   run:
26 |     - python
27 | 
28 | test:
29 |   imports:
30 |     -itmo_fs
31 |     -pandas
32 |     -pytest
33 | 
34 | about:
35 |   home:


--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
 1 | imbalanced-learn
 2 | numpy~=1.22
 3 | scipy~=1.5.2
 4 | scikit-learn~=0.23.2
 5 | qpsolvers
 6 | 
 7 | dvc~=1.11.16
 8 | pandas~=1.1.3
 9 | imblearn~=0.0
10 | setuptools~=50.3.1


--------------------------------------------------------------------------------
/setup.cfg:
--------------------------------------------------------------------------------
1 | [metadata]
2 | description-file = README.rst
3 | 
4 | [aliases]
5 | test = pytest
6 | 
7 | [tool:pytest]
8 | addopts = --doctest-modules


--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
 1 | import codecs
 2 | 
 3 | from setuptools import find_packages, setup
 4 | import os
 5 | base_dir = os.path.dirname(__file__)
 6 | 
 7 | about = {}
 8 | with open(os.path.join(base_dir, "ITMO_FS", "__about__.py")) as f:
 9 |     exec(f.read(), about)
10 | 
11 | DISTNAME = 'ITMO_FS'
12 | DESCRIPTION = 'Python Feature Selection library from ITMO University.'
13 | with codecs.open('README.rst') as f:
14 |     LONG_DESCRIPTION = f.read()
15 | MAINTAINER = 'N. Pilnenskiy'
16 | MAINTAINER_EMAIL = 'somacruz@bk.ru'
17 | URL = 'https://github.com/ctlab/ITMO_FS'
18 | LICENSE = 'new BSD'
19 | DOWNLOAD_URL = 'https://github.com/ctlab/ITMO_FS'
20 | VERSION = about["__version__"],
21 | INSTALL_REQUIRES = ['numpy', 'scipy', 'scikit-learn', 'imbalanced-learn', 'qpsolvers']
22 | CLASSIFIERS = ['Intended Audience :: Science/Research',
23 |                'Intended Audience :: Developers',
24 |                'License :: OSI Approved',
25 |                'Programming Language :: Python',
26 |                'Topic :: Software Development',
27 |                'Topic :: Scientific/Engineering',
28 |                'Operating System :: Microsoft :: Windows',
29 |                'Operating System :: POSIX',
30 |                'Operating System :: Unix',
31 |                'Operating System :: MacOS',
32 |                'Programming Language :: Python :: 2.7',
33 |                'Programming Language :: Python :: 3.5',
34 |                'Programming Language :: Python :: 3.6',
35 |                'Programming Language :: Python :: 3.7',
36 |                'Programming Language :: Python :: 3.8']
37 | EXTRAS_REQUIRE = {
38 |     'tests': [
39 |         'pytest',
40 |         'pytest-cov'],
41 |     'docs': [
42 |         'sphinx',
43 |         'sphinx-gallery',
44 |         'sphinx_rtd_theme',
45 |         'numpydoc',
46 |         'matplotlib'
47 |     ]
48 | }
49 | 
50 | setup(name=DISTNAME,
51 |       maintainer=MAINTAINER,
52 |       maintainer_email=MAINTAINER_EMAIL,
53 |       description=DESCRIPTION,
54 |       license=LICENSE,
55 |       url=URL,
56 |       version=VERSION,
57 |       download_url=DOWNLOAD_URL,
58 |       long_description=LONG_DESCRIPTION,
59 |       zip_safe=False,  # the package can run out of an .egg file
60 |       classifiers=CLASSIFIERS,
61 |       packages=find_packages(),
62 |       install_requires=INSTALL_REQUIRES,
63 |       extras_require=EXTRAS_REQUIRE)
64 | 


--------------------------------------------------------------------------------
/test/Melif_test.py:
--------------------------------------------------------------------------------
 1 | import datetime
 2 | import unittest
 3 | 
 4 | import pandas as pd
 5 | from sklearn.datasets import make_classification, make_regression
 6 | from sklearn.metrics import f1_score
 7 | from sklearn.model_selection import train_test_split
 8 | from sklearn.svm import SVC
 9 | from sklearn.utils.estimator_checks import check_estimator
10 | from ITMO_FS.ensembles import WeightBased
11 | from ITMO_FS.filters import *
12 | from ITMO_FS.hybrid.Melif import Melif
13 | from ITMO_FS.utils import f1_scorer
14 | 
15 | 
16 | class MyTestCase(unittest.TestCase):
17 |     wide_classification = make_classification(n_features=2000, n_informative=100, n_redundant=500)
18 |     tall_classification = make_classification(n_samples=50000, n_features=100, n_informative=23, n_redundant=30)
19 |     wide_regression = make_regression(n_features=2000, n_informative=100)
20 |     tall_regression = make_regression(n_samples=50000, n_features=200, n_informative=50)
21 |     filters = [UnivariateFilter(gini_index),
22 |                UnivariateFilter(pearson_corr),
23 |                UnivariateFilter(spearman_corr)]
24 | 
25 |     estimator = SVC(random_state=42)
26 |     ensemble = WeightBased(filters, cutting_rule=select_k_best(50))
27 | 
28 |     melif = Melif(estimator, select_k_best(1500), ensemble, scorer=f1_score, verbose=True)
29 | 
30 | 
31 | 
32 |     def test_wide(self):
33 |         data, target = self.wide_classification[0], self.wide_classification[1]
34 | 
35 |         train_data, test_data, train_target, test_target = train_test_split(data, target)
36 |         self.melif.fit(train_data, train_target)
37 | 
38 |         print(f1_score(test_target, self.melif.predict(test_data)))
39 | 
40 |     def test_wide_pd(self):
41 |         data, target = pd.DataFrame(self.wide_classification[0]), pd.DataFrame(self.wide_classification[1])
42 |         train_data, test_data, train_target, test_target = train_test_split(data, target)
43 |         self.melif.fit(train_data, train_target)
44 |         print(f1_score(test_target, self.melif.predict(test_data)))
45 | 
46 |     def test_R(self):
47 |         data = pd.read_csv('C:\\Users\\SomaC\\PycharmProjects\\machinka\\mlrcheck\\boston_corrected.csv')
48 |         target = 'class'
49 |         features = data.loc[:, data.columns != 'b'].columns
50 |         # data[target]=data[target].apply(lambda x: 0 if x<=0 else 1)
51 |         ks = [int(i * 500) for i in [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1]]
52 |         print()
53 |         for j in ks:
54 |             print('|' + str(j) + '|')
55 |             start = datetime.datetime.now()
56 |             f = UnivariateFilter(pearson_corr, select_k_best(j))
57 |             f.fit(data[features], data[target])
58 |             print('|', datetime.datetime.now() - start, '|')
59 |             start = datetime.datetime.now()
60 |             f = UnivariateFilter(spearman_corr, select_k_best(j))
61 |             f.fit(data[features], data[target])
62 |             print('|', datetime.datetime.now() - start, '|')
63 |             # start = datetime.datetime.now()
64 |             # f = UnivariateFilter(chi2_measure, select_k_best(j))
65 |             # f.fit(data[features], data[target])
66 |             # print('|', datetime.datetime.now() - start, '|')
67 |             start = datetime.datetime.now()
68 |             f = UnivariateFilter(information_gain, select_k_best(j))
69 |             f.fit(data[features], data[target])
70 |             print('|', datetime.datetime.now() - start, '|')
71 | 
72 |     def test_est(self):
73 |         melif = Melif(self.estimator, select_k_best(2), self.ensemble, scorer=f1_scorer)
74 |         check_estimator(melif)
75 | 
76 | 
77 | if __name__ == '__main__':
78 |     unittest.main()
79 | 


--------------------------------------------------------------------------------
/test/datasets/.gitignore:
--------------------------------------------------------------------------------
1 | /gisette.csv
2 | /madelon.csv
3 | /arcene.csv
4 | /dexter.csv
5 | /dorothea.csv
6 | 


--------------------------------------------------------------------------------
/test/datasets/arcene.csv.dvc:
--------------------------------------------------------------------------------
1 | outs:
2 | - md5: 13c55f4366f0bc89c4bc6c4bc826b0bc
3 |   size: 2715738
4 |   path: arcene.csv
5 | 


--------------------------------------------------------------------------------
/test/datasets/dexter.csv.dvc:
--------------------------------------------------------------------------------
1 | outs:
2 | - md5: cca2fe50c6a4869948fba36d56f0ef86
3 |   size: 24038795
4 |   path: dexter.csv
5 | 


--------------------------------------------------------------------------------
/test/datasets/dorothea.csv.dvc:
--------------------------------------------------------------------------------
1 | outs:
2 | - md5: 9ae080330bbe3e419b415fbbb5761252
3 |   size: 320002322
4 |   path: dorothea.csv
5 | 


--------------------------------------------------------------------------------
/test/datasets/gisette.csv.dvc:
--------------------------------------------------------------------------------
1 | outs:
2 | - md5: 2a1efe73ab9eace5947d2d5cb62d16d6
3 |   size: 67797724
4 |   path: gisette.csv
5 | 


--------------------------------------------------------------------------------
/test/datasets/madelon.csv.dvc:
--------------------------------------------------------------------------------
1 | outs:
2 | - md5: 44cce2dfb74b4ade59fcf63be7c59610
3 |   size: 4004995
4 |   path: madelon.csv
5 | 


--------------------------------------------------------------------------------
/test/embedded_test.py:
--------------------------------------------------------------------------------
  1 | import unittest
  2 | 
  3 | import pandas as pd
  4 | from sklearn.linear_model import LogisticRegression, SGDClassifier
  5 | from sklearn.pipeline import Pipeline
  6 | import numpy as np
  7 | 
  8 | from sklearn.utils.estimator_checks import check_estimator
  9 | 
 10 | from ITMO_FS.embedded import *
 11 | from ITMO_FS.utils import weight_func
 12 | 
 13 | np.random.seed(42)
 14 | 
 15 | 
 16 | class TestCases(unittest.TestCase):
 17 |     data, target = np.random.randint(
 18 |         10, size=(
 19 |             100, 20)), np.random.randint(
 20 |         10, size=(
 21 |             100,))
 22 | 
 23 |     def test_MOS_err_loss(self):
 24 |         with self.assertRaises(KeyError):
 25 |             MOS(model=SGDClassifier(), weight_func=weight_func,
 26 |                 sampling=True, loss="err").fit(self.data,
 27 |                                                self.target)
 28 | 
 29 |     def test_MOS_no_sampling(self):
 30 |         # MOSS
 31 |         res = MOS(
 32 |             model=SGDClassifier(),
 33 |             weight_func=weight_func).fit_transform(
 34 |             self.data,
 35 |             self.target)
 36 |         assert self.data.shape[0] == res.shape[0]
 37 |         print("MOSS:", self.data.shape, '--->', res.shape)
 38 | 
 39 |     def test_MOSS(self):
 40 |         # MOSS
 41 |         res = MOS(
 42 |             model=SGDClassifier(),
 43 |             weight_func=weight_func,
 44 |             sampling=True).fit_transform(
 45 |             self.data,
 46 |             self.target)
 47 |         assert self.data.shape[0] == res.shape[0]
 48 |         print("MOSS:", self.data.shape, '--->', res.shape)
 49 | 
 50 |     def test_MOSS_n_naigbours_err(self):
 51 |         # MOSS
 52 |         with self.assertRaises(ValueError):
 53 |             MOS(
 54 |                 model=SGDClassifier(),
 55 |                 weight_func=weight_func,
 56 |                 sampling=True, k_neighbors=1000).fit_transform(
 57 |                 self.data,
 58 |                 self.target)
 59 | 
 60 |     def test_MOSS_hinge(self):
 61 |         # MOSS
 62 |         res = MOS(
 63 |             model=SGDClassifier(),
 64 |             weight_func=weight_func,
 65 |             sampling=True, loss="hinge").fit_transform(
 66 |             self.data,
 67 |             self.target)
 68 |         assert self.data.shape[0] == res.shape[0]
 69 |         print("MOSS:", self.data.shape, '--->', res.shape)
 70 | 
 71 |     def test_MOSNS(self):
 72 |         # MOSNS
 73 |         res = MOS(
 74 |             model=SGDClassifier(),
 75 |             weight_func=weight_func,
 76 |             sampling=False).fit_transform(
 77 |             self.data,
 78 |             self.target)
 79 |         assert self.data.shape[0] == res.shape[0]
 80 |         print("MOSNS:", self.data.shape, '--->', res.shape)
 81 | 
 82 |     def test_losses(self):
 83 |         for loss in ['log', 'hinge']:
 84 |             res = MOS(
 85 |                 model=SGDClassifier(),
 86 |                 weight_func=weight_func,
 87 |                 loss=loss).fit_transform(
 88 |                 self.data,
 89 |                 self.target)
 90 |             assert self.data.shape[0] == res.shape[0]
 91 | 
 92 |     def test_df(self):
 93 |         f = MOS(model=SGDClassifier(), weight_func=weight_func, sampling=True)
 94 | 
 95 |         df = f.fit_transform(
 96 |             pd.DataFrame(
 97 |                 self.data), pd.DataFrame(
 98 |                 self.target))
 99 |         arr = f.fit_transform(self.data, self.target)
100 |         np.testing.assert_array_equal(df, arr)
101 | 
102 |         f = MOS(model=SGDClassifier(), weight_func=weight_func, sampling=False)
103 | 
104 |         df = f.fit_transform(
105 |             pd.DataFrame(
106 |                 self.data), pd.DataFrame(
107 |                 self.target))
108 |         arr = f.fit_transform(self.data, self.target)
109 |         np.testing.assert_array_equal(df, arr)
110 | 
111 |     def test_pipeline(self):
112 |         # FS
113 |         p = Pipeline(
114 |             [('FS1', MOS(model=SGDClassifier(), weight_func=weight_func))])
115 |         p.fit(self.data, self.target)
116 |         res = p.transform(self.data)
117 |         assert self.data.shape[0] == res.shape[0]
118 | 
119 |         # FS - estim
120 |         p = Pipeline([('FS1', MOS(model=SGDClassifier(),
121 |                                   weight_func=weight_func)),
122 |                       ('E1', LogisticRegression())])
123 |         p.fit(self.data, self.target)
124 |         assert 0 <= p.score(self.data, self.target) <= 1
125 | 
126 |         # FS - FS
127 |         p = Pipeline([('FS1',
128 |                        MOS(model=SGDClassifier(),
129 |                            weight_func=weight_func,
130 |                            loss='log')),
131 |                       ('FS2',
132 |                        MOS(model=SGDClassifier(),
133 |                            weight_func=weight_func,
134 |                            loss='hinge'))])
135 |         p.fit(self.data, self.target)
136 |         res = p.transform(self.data)
137 |         assert self.data.shape[0] == res.shape[0]
138 | 
139 |         # FS - FS - estim
140 |         p = Pipeline([('FS1',
141 |                        MOS(model=SGDClassifier(), weight_func=weight_func,
142 |                            loss='log')), ('FS2', MOS(
143 |                                model=SGDClassifier(), weight_func=weight_func, loss='hinge')),
144 |                       ('E1', LogisticRegression())])
145 |         p.fit(self.data, self.target)
146 |         assert 0 <= p.score(self.data, self.target) <= 1
147 | 
148 |     def test_est(self):
149 |         moss = MOS(
150 |             model=SGDClassifier(),
151 |             weight_func=weight_func,
152 |             sampling=True)
153 |         mosns = MOS(
154 |             model=SGDClassifier(),
155 |             weight_func=weight_func,
156 |             sampling=False)
157 | 
158 |         # for some reason using local weight_func or lambda here causes it to fail with pickle errors
159 |         # so we're using an imported weight_func
160 |         check_estimator(moss)
161 |         check_estimator(mosns)
162 | 
163 | 
164 | if __name__ == "__main__":
165 |     unittest.main()
166 | 


--------------------------------------------------------------------------------
/test/hybrid_test.py:
--------------------------------------------------------------------------------
 1 | import unittest
 2 | 
 3 | from sklearn.linear_model import LogisticRegression
 4 | from sklearn.utils.estimator_checks import check_estimator
 5 | from sklearn.metrics import make_scorer
 6 | from ITMO_FS.filters import *
 7 | from ITMO_FS.wrappers import BackwardSelection
 8 | from ITMO_FS.utils import f1_scorer
 9 | from ITMO_FS.hybrid import FilterWrapperHybrid
10 | 
11 | 
12 | class MyTestCase(unittest.TestCase):
13 | 
14 |     def test_est(self):
15 |         classifier = LogisticRegression(max_iter=1000)
16 |         back_selection = BackwardSelection(classifier, 2, make_scorer(f1_scorer))
17 |         fw = FilterWrapperHybrid(UnivariateFilter(spearman_corr, cutting_rule=('K best', 2)), back_selection)
18 |         check_estimator(fw)
19 | 
20 | 
21 | if __name__ == '__main__':
22 |     unittest.main()
23 | 


--------------------------------------------------------------------------------
/test/multivariate_filters_test.py:
--------------------------------------------------------------------------------
  1 | import unittest
  2 | 
  3 | import pandas as pd
  4 | from sklearn.linear_model import LogisticRegression
  5 | from sklearn.pipeline import Pipeline
  6 | import numpy as np
  7 | 
  8 | from sklearn.utils.estimator_checks import check_estimator
  9 | 
 10 | from ITMO_FS.filters.multivariate import *
 11 | from utils import load_dataset
 12 | 
 13 | np.random.seed(42)
 14 | 
 15 | 
 16 | class TestCases(unittest.TestCase):
 17 |     data, target = np.random.randint(10, size=(100, 20)), np.random.randint(10,
 18 |                                                                             size=(
 19 |                                                                                 100,))
 20 |     madelon = load_dataset("madelon.csv")
 21 | 
 22 |     def test_FCBF(self):
 23 |         # FCBF
 24 |         res = FCBFDiscreteFilter().fit_transform(self.data, self.target)
 25 |         assert self.data.shape[0] == res.shape[0]
 26 |         print("Fast Correlation Based filter:", self.data.shape, '--->',
 27 |               res.shape)
 28 | 
 29 |     def test_DISR(self):
 30 |         # DISR
 31 |         res = DISRWithMassive(10).fit_transform(self.data, self.target)
 32 |         assert self.data.shape[0] == res.shape[0]
 33 |         print("Double Input Symmetric Relevance:", self.data.shape, '--->',
 34 |               res.shape)
 35 | 
 36 |     def test_JMIM_error(self):
 37 |         # JMIM
 38 |         data, target = self.madelon.drop(['target'], axis=1), self.madelon[
 39 |             "target"]
 40 |         with self.assertRaises(ValueError):
 41 |             JMIM(10000).fit_transform(data, target)
 42 | 
 43 |     def test_JMIM(self):
 44 |         # JMIM
 45 |         res = JMIM(10).fit_transform(self.data, self.target)
 46 |         assert self.data.shape[0] == res.shape[0]
 47 |         print("Joint Mutual Information Maximisation:", self.data.shape,
 48 |               '--->', res.shape)
 49 | 
 50 |     def test_JMIM_normalised(self):
 51 |         # JMIM
 52 |         res = JMIM(10, normalized=True).fit_transform(self.data, self.target)
 53 |         assert self.data.shape[0] == res.shape[0]
 54 |         print("Joint Mutual Information Maximisation:", self.data.shape,
 55 |               '--->', res.shape)
 56 | 
 57 |     def test_multivariate_interface_error_features(self):
 58 |         with self.assertRaises(ValueError):
 59 |             filter = MultivariateFilter("MRMR", 5000000)
 60 |             filter.fit(self.data, self.target)
 61 | 
 62 |     def test_multivariate_interface_error_name(self):
 63 |         with self.assertRaises(KeyError):
 64 |             filter = MultivariateFilter("asoikfhlkjfdslfh", 5)
 65 |             filter.fit(self.data, self.target)
 66 | 
 67 |     def test_trace_ratio(self):
 68 |         # TraceRatioFisher
 69 |         res = TraceRatioFisher(10).fit_transform(self.data, self.target)
 70 |         assert self.data.shape[0] == res.shape[0]
 71 |         print("TraceRatio:", self.data.shape, '--->', res.shape)
 72 | 
 73 |     def test_stir(self):
 74 |         # STIR
 75 |         res = STIR(10).fit_transform(self.data, self.target)
 76 |         assert self.data.shape[0] == res.shape[0]
 77 |         print("Statistical Inference Relief:", self.data.shape, '--->',
 78 |               res.shape)
 79 | 
 80 |     def test_base_multivariate(self):
 81 |         # Multivariate with callable
 82 |         f = MultivariateFilter(MIM, 10)
 83 |         f.fit(self.data, self.target)
 84 |         res = f.transform(self.data)
 85 |         assert self.data.shape[0] == res.shape[0]
 86 |         print("Multivariate with callable:", self.data.shape, '--->',
 87 |               res.shape)
 88 | 
 89 |         # Multivariate with string
 90 |         f = MultivariateFilter('MRMR', 10)
 91 |         f.fit(self.data, self.target)
 92 |         res = f.transform(self.data)
 93 |         assert self.data.shape[0] == res.shape[0]
 94 |         print("Multivariate with string:", self.data.shape, '--->', res.shape)
 95 | 
 96 |     def test_k_best(self):
 97 |         for i in [5, 10, 20]:
 98 |             res = DISRWithMassive(i).fit_transform(self.data, self.target)
 99 |             assert i == res.shape[1]
100 | 
101 |         for i in [5, 10, 20]:
102 |             res = JMIM(i).fit_transform(self.data, self.target)
103 |             assert i == res.shape[1]
104 | 
105 |         for i in [5, 10, 20]:
106 |             f = MultivariateFilter(MIM, i)
107 |             f.fit(self.data, self.target)
108 |             res = f.transform(self.data)
109 |             assert i == res.shape[1]
110 | 
111 |         for i in [5, 10, 20]:
112 |             res = TraceRatioFisher(i).fit_transform(self.data, self.target)
113 |             assert i == res.shape[1]
114 | 
115 |         for i in [5, 10, 20]:
116 |             res = STIR(i).fit_transform(self.data, self.target)
117 |             assert i == res.shape[1]
118 | 
119 |     def test_measures(self):
120 |         # Multivariate
121 |         for measure in MEASURE_NAMES:
122 |             beta = 0.3 if measure in ['MIFS', 'generalizedCriteria'] else None
123 |             gamma = 0.4 if measure == 'generalizedCriteria' else None
124 |             f = MultivariateFilter(measure, 10, beta, gamma)
125 |             f.fit(self.data, self.target)
126 |             res = f.transform(self.data)
127 |             assert self.data.shape[0] == res.shape[0] and res.shape[1] == 10
128 | 
129 |     def test_df(self):
130 |         for f in [FCBFDiscreteFilter(), DISRWithMassive(10), JMIM(10),
131 |                   MultivariateFilter(MIM, 10), \
132 |                   TraceRatioFisher(10), STIR(10)]:
133 |             df = f.fit_transform(pd.DataFrame(self.data),
134 |                                  pd.DataFrame(self.target))
135 |             arr = f.fit_transform(self.data, self.target)
136 |             np.testing.assert_array_equal(df, arr)
137 | 
138 |     def test_pipeline(self):
139 |         # FS
140 |         p = Pipeline([('FS1', MultivariateFilter(MIM, 10))])
141 |         p.fit(self.data, self.target)
142 |         res = p.transform(self.data)
143 |         assert self.data.shape[0] == res.shape[0] and res.shape[1] == 10
144 | 
145 |         # FS - estim
146 |         p = Pipeline([('FS1', FCBFDiscreteFilter()),
147 |                       ('E1', LogisticRegression(max_iter=10000))])
148 |         p.fit(self.data, self.target)
149 |         assert 0 <= p.score(self.data, self.target) <= 1
150 | 
151 |         # FS - FS
152 |         p = Pipeline([('FS1', MultivariateFilter(MIM, 10)), ('FS2', STIR(5))])
153 |         p.fit(self.data, self.target)
154 |         res = p.transform(self.data)
155 |         assert self.data.shape[0] == res.shape[0] and res.shape[1] == 5
156 | 
157 |         # FS - FS - estim
158 |         p = Pipeline(
159 |             [('FS1', TraceRatioFisher(10)), ('FS2', DISRWithMassive(5)),
160 |              ('E1', LogisticRegression(max_iter=10000))])
161 |         p.fit(self.data, self.target)
162 |         assert 0 <= p.score(self.data, self.target) <= 1
163 | 
164 |     def test_est(self):
165 |         for f in [
166 |             FCBFDiscreteFilter(),
167 |             DISRWithMassive(2),
168 |             MultivariateFilter(MIM, 2),
169 |             TraceRatioFisher(2),
170 |             STIR(2)]:
171 |             check_estimator(f)
172 | 
173 | 
174 | if __name__ == "__main__":
175 |     unittest.main()
176 | 


--------------------------------------------------------------------------------
/test/univariate_measures_test.py:
--------------------------------------------------------------------------------
  1 | import unittest
  2 | from ITMO_FS.filters.univariate.measures import *
  3 | from utils import load_dataset
  4 | 
  5 | 
  6 | class UnivariateMeasuresTest(unittest.TestCase):
  7 |     madelon = load_dataset("madelon.csv")
  8 | 
  9 |     def test_measures(self):
 10 |         data = self.madelon.drop(['target'], axis=1).values
 11 |         for f, answer in zip(
 12 |                 [su_measure,
 13 |                  laplacian_score],
 14 |                 [1,
 15 |                  1,
 16 |                  1]):
 17 |             np.testing.assert_allclose(
 18 |                 f(data[0].reshape((-1, 1)), data[0]),
 19 |                 answer, atol=1e-05)
 20 | 
 21 |     def test_information_gain(self):
 22 |         data = self.madelon.drop(['target'], axis=1).values
 23 |         np.testing.assert_allclose(
 24 |             information_gain(data[:, 0].reshape((-1, 1)), data[:, 0]),
 25 |             0, atol=1e-05)
 26 | 
 27 |     def test_pearson_correlation(self):
 28 |         data = self.madelon.drop(['target'], axis=1).values
 29 |         np.testing.assert_allclose(
 30 |             pearson_corr(data[:, 0].reshape((-1, 1)), data[:, 0]),
 31 |             1, atol=1e-05)
 32 | 
 33 |     def test_pearson_correlation_1d(self):
 34 |         data = self.madelon.drop(['target'], axis=1).values
 35 |         np.testing.assert_allclose(
 36 |             pearson_corr(data[0], data[0]),
 37 |             1, atol=1e-05)
 38 | 
 39 |     def test_spearman_measure(self):
 40 |         data = self.madelon.drop(['target'], axis=1).values
 41 |         np.testing.assert_allclose(
 42 |             spearman_corr(data[:, 0].reshape((-1, 1)), data[:, 0]),
 43 |             1, atol=1e-05)
 44 | 
 45 |     def test_spearman_measure_1d(self):
 46 |         data = self.madelon.drop(['target'], axis=1).values
 47 |         np.testing.assert_allclose(
 48 |             spearman_corr(data[:, 0], data[:, 0]),
 49 |             1, atol=1e-05)
 50 | 
 51 |     def test_spearman_measure_error(self):
 52 |         with self.assertRaises(ValueError):
 53 |             spearman_corr(np.array([-1]), [-1])
 54 | 
 55 |     def test_chi2_measure(self):
 56 |         data = self.madelon.drop(['target'], axis=1).values
 57 |         np.testing.assert_allclose(
 58 |             chi2_measure(data[:, 0].reshape((-1, 1)), data[:, 0]),
 59 |             1, atol=1e-05)
 60 | 
 61 |     def test_chi2_measure_error(self):
 62 |         with self.assertRaises(ValueError):
 63 |             chi2_measure(np.array([-1]), [-1])
 64 | 
 65 |     def test_gini_index(self):
 66 |         data = self.madelon.drop(['target'], axis=1).values
 67 |         assert gini_index(data[0].reshape((-1, 1)), data[0]), 0
 68 |         with self.assertRaises(ValueError):
 69 |             gini_index(data[0, :1], data[0])
 70 | 
 71 |     def test_relief_error(self):
 72 |         data, target = (self.madelon.drop(['target'], axis=1).values,
 73 |                         self.madelon["target"].values)
 74 |         with self.assertRaises(ValueError):
 75 |             relief_measure(data, target[target > 0])
 76 | 
 77 |     def test_relief(self):
 78 |         data, target = (self.madelon.drop(['target'], axis=1).values,
 79 |                         self.madelon["target"].values)
 80 |         relief_measure(data, target)
 81 | 
 82 |     def test_reliefF_measure(self):
 83 |         data, target = (self.madelon.drop(['target'], axis=1).values,
 84 |                         self.madelon["target"].values)
 85 |         reliefF_measure(data, target)
 86 | 
 87 |     def test_laplacian_score(self):
 88 |         data, target = (self.madelon.drop(['target'], axis=1).values,
 89 |                         self.madelon["target"].values)
 90 |         laplacian_score(data, target)
 91 | 
 92 |     def test_cutting_rules(self):
 93 |         data = dict(
 94 |             zip(['f1', 'f2', 'f3', 'f4', 'f5', 'f6', 'f7', 'f8', 'f9', 'f10'],
 95 |                 [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]))
 96 |         assert select_k_best(5)(data), ['f10', 'f9', 'f8', 'f7', 'f6']
 97 |         assert select_k_worst(5)(data), ['f1', 'f2', 'f3', 'f4', 'f5']
 98 | 
 99 |         with self.assertRaises(TypeError):
100 |             select_k_best(0.5)(data)
101 | 
102 |         with self.assertRaises(ValueError):
103 |             select_k_best(100)(data)
104 | 
105 |         assert select_best_by_value(5)(data), ['f10', 'f9', 'f8', 'f7', 'f6']
106 |         assert select_worst_by_value(5)(data), ['f1', 'f2', 'f3', 'f4', 'f5']
107 | 
108 |         assert select_best_percentage(0.5)(data), ['f10', 'f9', 'f8', 'f7',
109 |                                                    'f6']
110 |         assert select_worst_percentage(0.5)(data), ['f1', 'f2', 'f3', 'f4',
111 |                                                     'f5']
112 | 
113 |     def test_fit_criterion(self):
114 |         data, target = (self.madelon.drop(['target'], axis=1).values,
115 |                         self.madelon["target"].values)
116 |         fit_criterion_measure(data, target)
117 | 
118 |     def test_anova(self):
119 |         data, target = (self.madelon.drop(['target'], axis=1).values,
120 |                         self.madelon["target"].values)
121 |         anova(data, target)
122 | 
123 |     def test_modified_t_score(self):
124 |         data, target = (self.madelon.drop(['target'], axis=1).values,
125 |                         self.madelon["target"].values)
126 |         modified_t_score(data, target)
127 | 
128 |     def test_f_ratio(self):
129 |         data, target = (self.madelon.drop(['target'], axis=1).values,
130 |                         self.madelon["target"].values)
131 |         f_ratio_measure(data, target)
132 | 
133 |     def test_kendall(self):
134 |         data, target = (self.madelon.drop(['target'], axis=1).values,
135 |                         self.madelon["target"].values)
136 |         kendall_corr(data[:, 0], target)
137 |         kendall_corr(data, target)
138 | 
139 |     def test_fechner(self):
140 |         data, target = (self.madelon.drop(['target'], axis=1).values,
141 |                         self.madelon["target"].values)
142 |         fechner_corr(data[:, 0], target)
143 |         fechner_corr(data, target)
144 | 
145 | 
146 | if __name__ == '__main__':
147 |     unittest.main()
148 | 


--------------------------------------------------------------------------------
/test/unsupervised_filters_test.py:
--------------------------------------------------------------------------------
 1 | import unittest
 2 | 
 3 | import pandas as pd
 4 | from sklearn.linear_model import LogisticRegression
 5 | from sklearn.pipeline import Pipeline
 6 | import numpy as np
 7 | 
 8 | from sklearn.utils.estimator_checks import check_estimator
 9 | 
10 | from ITMO_FS.filters.unsupervised import *
11 | from ITMO_FS.filters.univariate import *
12 | 
13 | np.random.seed(42)
14 | 
15 | 
16 | class TestCases(unittest.TestCase):  # TODO: add TraceRatioLaplacian tests and tests without target
17 |     data, target = np.random.randint(10, size=(100, 20)), np.random.randint(10, size=(100,))
18 | 
19 |     def test_MCFS(self):
20 |         # MCFS
21 |         res = MCFS(10).fit_transform(self.data, self.target)
22 |         assert self.data.shape[0] == res.shape[0]
23 |         print("MCFS:", self.data.shape, '--->', res.shape)
24 | 
25 |     def test_UDFS(self):
26 |         # UDFS
27 |         res = UDFS(10).fit_transform(self.data, self.target)
28 |         assert self.data.shape[0] == res.shape[0]
29 |         print("UDFS:", self.data.shape, '--->', res.shape)
30 | 
31 |     def test_df(self):
32 |         for f in [MCFS(10), UDFS(10)]:
33 |             df = f.fit_transform(pd.DataFrame(self.data), pd.DataFrame(self.target))
34 |             arr = f.fit_transform(self.data, self.target)
35 |             np.testing.assert_array_equal(df, arr)
36 | 
37 |     def test_pipeline(self):
38 |         # FS
39 |         p = Pipeline([('FS1', MCFS(10))])
40 |         p.fit(self.data, self.target)
41 |         res = p.transform(self.data)
42 |         assert self.data.shape[0] == res.shape[0] and res.shape[1] == 10
43 | 
44 |         # FS - estim
45 |         p = Pipeline([('FS1', UDFS(10)), ('E1', LogisticRegression())])
46 |         p.fit(self.data, self.target)
47 |         assert 0 <= p.score(self.data, self.target) <= 1
48 | 
49 |         # FS - FS
50 |         p = Pipeline([('FS1', MCFS(10)), ('FS2', UDFS(5))])
51 |         p.fit(self.data, self.target)
52 |         res = p.transform(self.data)
53 |         assert self.data.shape[0] == res.shape[0] and res.shape[1] == 5
54 | 
55 |         # FS - FS - estim
56 |         p = Pipeline([('FS1', UDFS(10)), ('FS2', MCFS(5)), ('E1', LogisticRegression())])
57 |         p.fit(self.data, self.target)
58 |         assert 0 <= p.score(self.data, self.target) <= 1
59 | 
60 |     def test_est(self):
61 |         for f in [MCFS(2), UDFS(2)]:
62 |             check_estimator(f)
63 | 
64 | 
65 | if __name__ == "__main__":
66 |     unittest.main()
67 | 


--------------------------------------------------------------------------------
/test/utils.py:
--------------------------------------------------------------------------------
 1 | import dvc.api
 2 | import pandas as pd
 3 | 
 4 | datasets = ["arcene.csv",
 5 |             "dexter.csv",
 6 |             "dorothea.csv",
 7 |             "gisette.csv",
 8 |             "madelon.csv"]
 9 | 
10 | 
11 | def load_dataset(name):  # todo fails to hold header
12 |     with dvc.api.open(
13 |             'test/datasets/' + name) as fd:
14 |         df = pd.read_csv(fd, header=None)
15 |         features = ['v' + str(i) for i in range(df.shape[1] - 1)] + ["target"]
16 |         df.columns = features
17 |         return df
18 | 
19 | 
20 | def load_datasets():
21 |     data = []
22 |     for d in datasets:
23 |         data.append(load_dataset(d))
24 |     return data
25 | 


--------------------------------------------------------------------------------
/test/wrapper_test.py:
--------------------------------------------------------------------------------
  1 | import unittest
  2 | 
  3 | import pandas as pd
  4 | from math import sqrt
  5 | from scipy import stats
  6 | from sklearn.datasets import load_iris
  7 | from sklearn.datasets import make_classification, make_regression
  8 | from sklearn.linear_model import LinearRegression, LogisticRegression
  9 | from sklearn.metrics import f1_score
 10 | from sklearn.model_selection import cross_val_score, KFold
 11 | from sklearn.neighbors import KNeighborsClassifier
 12 | from sklearn.svm import SVC
 13 | from sklearn.utils.estimator_checks import check_estimator
 14 | from sklearn.metrics import make_scorer
 15 | 
 16 | from ITMO_FS import RecursiveElimination, BackwardSelection, AddDelWrapper, SequentialForwardSelection, \
 17 |     HillClimbingWrapper, SimulatedAnnealing, TPhMGWO
 18 | from ITMO_FS.utils.information_theory import *
 19 | from ITMO_FS.utils import test_scorer
 20 | 
 21 | np.random.seed(42)
 22 | 
 23 | 
 24 | class TestCases(unittest.TestCase):
 25 |     wide_classification = make_classification(n_features=2000, n_informative=100, n_redundant=500)
 26 |     tall_classification = make_classification(n_samples=50000, n_features=100, n_informative=23, n_redundant=30)
 27 |     wide_regression = make_regression(n_features=2000, n_informative=100)
 28 |     tall_regression = make_regression(n_samples=50000, n_features=200, n_informative=50)
 29 | 
 30 |     # def test_rec_elim(self):
 31 |     #     classifier = LogisticRegression(max_iter=1000)
 32 |     #     rec_elimination = RecursiveElimination(classifier, 10, 'f1')
 33 |     #     X, y = self.wide_classification
 34 |     #
 35 |     #     default_score = cross_val_score(classifier, X, y, cv=5, scoring='f1').mean()
 36 |     #
 37 |     #     rec_elimination.fit(X, y)
 38 |     #     features = rec_elimination.selected_features_
 39 |     #     assert len(features) == 10
 40 |     #
 41 |     #     wrapper_score = cross_val_score(classifier, X[:, features], y, cv=5, scoring='f1').mean()
 42 |     #
 43 |     #     assert default_score < wrapper_score
 44 | 
 45 |     # def test_back_sel(self):
 46 |     #     classifier = LogisticRegression(max_iter=1000)
 47 |     #     back_selection = BackwardSelection(classifier, 10, 'f1')
 48 |     #     X, y = self.wide_classification
 49 |     #
 50 |     #     print('start calculating the default score')
 51 |     #     default_score = cross_val_score(classifier, X, y, cv=5, scoring='f1').mean()
 52 |     #     print('finish calculating the default score')
 53 |     #
 54 |     #     print('start backward selection')
 55 |     #     # TODO backward selection works for too long
 56 |     #     back_selection.fit(X, y)
 57 |     #     print('finish backward selection')
 58 |     #
 59 |     #     features = back_selection.selected_features_
 60 |     #     assert len(features) == 10
 61 |     #
 62 |     #     wrapper_score = cross_val_score(classifier, X[:, features], y, cv=5, scoring='f1').mean()
 63 |     #
 64 |     #     assert default_score < wrapper_score
 65 | 
 66 |     # def test_add_del_wrapper(self):
 67 |     #     classifier = LogisticRegression(max_iter=1000)
 68 |     #     add_del_wrapper = AddDelWrapper(classifier, f1_score)
 69 |     #     X, y = self.wide_classification
 70 |     #
 71 |     #     default_score = cross_val_score(classifier, X, y, cv=5, scoring='f1').mean()
 72 |     #
 73 |     #     add_del_wrapper.fit(X, y)
 74 |     #     features = add_del_wrapper.selected_features_
 75 |     #
 76 |     #     wrapper_score = cross_val_score(classifier, X[:, features], y, cv=5, scoring='f1').mean()
 77 |     #
 78 |     #     assert default_score < wrapper_score
 79 |     #
 80 |     # def test_seq_forw_sel(self):
 81 |     #     classifier = LogisticRegression(max_iter=1000)
 82 |     #     seq_forw_sel = SequentialForwardSelection(classifier, 10, 'f1')
 83 |     #     X, y = self.wide_classification
 84 |     #
 85 |     #     default_score = cross_val_score(classifier, X, y, cv=5, scoring='f1').mean()
 86 |     #
 87 |     #     seq_forw_sel.fit(X, y)
 88 |     #     features = seq_forw_sel.selected_features_
 89 |     #     assert len(features) == 10
 90 |     #
 91 |     #     wrapper_score = cross_val_score(classifier, X[:, features], y, cv=5, scoring='f1').mean()
 92 |     #
 93 |     #     assert default_score < wrapper_score
 94 | 
 95 |     # def test_qpfs_wrapper(self):
 96 |     #     classifier = LogisticRegression(max_iter=1000)
 97 |     #     seq_forw_sel = SequentialForwardSelection(LogisticRegression(), 10, 'f1')
 98 |     #     X, y = self.wide_classification
 99 |     #
100 |     #     default_score = cross_val_score(classifier, X, y, cv=5, scoring='f1')
101 |     #
102 |     #     seq_forw_sel.fit(X, y)
103 |     #     features = seq_forw_sel.selected_features
104 |     #     assert len(features) == 10
105 |     #
106 |     #     wrapper_score = cross_val_score(classifier, X[:, features], y, cv=5, scoring='f1')
107 |     #
108 |     #     assert all(default_score < wrapper_score)
109 | 
110 |     # def test_hill_climbing(self):
111 |     #     classifier = LogisticRegression(max_iter=1000)
112 |     #     hill_climbing = HillClimbingWrapper(classifier, f1_score)
113 |     #     X, y = self.wide_classification
114 |     #
115 |     #     default_score = cross_val_score(classifier, X, y, cv=5, scoring='f1').mean()
116 |     #
117 |     #     hill_climbing.fit(X, y)
118 |     #     features = hill_climbing.selected_features_
119 |     #     # assert len(features) == 10
120 |     #
121 |     #     wrapper_score = cross_val_score(classifier, X[:, features], y, cv=5, scoring='f1').mean()
122 |     #
123 |     #     assert default_score < wrapper_score
124 |     #
125 |     # def test_sim_annealing(self):
126 |     #     classifier = LogisticRegression(max_iter=1000)
127 |     #     sim_annealing = SimulatedAnnealing(classifier, f1_score)
128 |     #     X, y = self.wide_classification
129 |     #
130 |     #     sim_annealing.fit(X, y)
131 |     #     default_score = cross_val_score(classifier, X, y, cv=5, scoring='f1').mean()
132 |     #
133 |     #     features = sim_annealing.selected_features_
134 |     #     # assert len(features) == 10
135 |     #
136 |     #     wrapper_score = cross_val_score(classifier, X[:, features], y, cv=5, scoring='f1').mean()
137 |     #
138 |     #     assert default_score < wrapper_score
139 |     #
140 |     # def test_wolves(self):
141 |     #     classifier = LogisticRegression(max_iter=1000)
142 |     #     tphmgwo = TPhMGWO()
143 |     #     X, y = self.wide_classification
144 |     #
145 |     #     default_score = cross_val_score(classifier, X, y, cv=5, scoring='f1').mean()
146 |     #
147 |     #     tphmgwo.run(X, y)
148 |     #     features = tphmgwo.selected_features_
149 |     #     # assert len(features) == 10
150 |     #
151 |     #     wrapper_score = cross_val_score(classifier, X[:, features], y, cv=5, scoring='f1').mean()
152 |     #
153 |     #     assert default_score < wrapper_score
154 |     #
155 |     # def test_est(self):
156 |     #     classifier = LogisticRegression(max_iter=1000)
157 |     #     for f in [RecursiveElimination(classifier, 2, make_scorer(test_scorer)), BackwardSelection(classifier, 2, make_scorer(test_scorer)),
158 |     #     AddDelWrapper(classifier, test_scorer), SequentialForwardSelection(classifier, 2, make_scorer(test_scorer)),
159 |     #     HillClimbingWrapper(classifier, test_scorer), SimulatedAnnealing(classifier, test_scorer), TPhMGWO()]:
160 |     #         check_estimator(f)
161 | 
162 | if __name__ == "__main__":
163 |     unittest.main()
164 | 


--------------------------------------------------------------------------------