├── .circleci └── config.yml ├── .dvc ├── .gitignore ├── config └── plots │ ├── confusion.json │ ├── default.json │ ├── scatter.json │ └── smooth.json ├── .dvcignore ├── .github ├── .codecov.yml ├── ISSUE_TEMPLATE │ ├── add-method.md │ ├── bug_report.md │ └── feature-request.md ├── PULL_REQUEST_TEMPLATE.md └── workflows │ ├── codeql-analysis.yml │ └── pythonpublish.yml ├── .gitignore ├── CODE_OF_CONDUCT.md ├── ITMO_FS ├── __about__.py ├── __init__.py ├── embedded │ ├── MOS.py │ └── __init__.py ├── ensembles │ ├── __init__.py │ ├── measure_based │ │ ├── WeightBased.py │ │ ├── __init__.py │ │ └── fusion_functions.py │ ├── model_based │ │ ├── __init__.py │ │ └── best_sum.py │ └── ranking_based │ │ ├── Mixed.py │ │ ├── __init__.py │ │ └── fusion_functions.py ├── filters │ ├── __init__.py │ ├── multivariate │ │ ├── DISRwithMassive.py │ │ ├── FCBF.py │ │ ├── MultivariateFilter.py │ │ ├── STIR.py │ │ ├── TraceRatioFisher.py │ │ ├── __init__.py │ │ ├── measures.py │ │ └── mimaga.py │ ├── univariate │ │ ├── NDFS.py │ │ ├── RFS.py │ │ ├── SPEC.py │ │ ├── UnivariateFilter.py │ │ ├── VDM.py │ │ ├── __init__.py │ │ └── measures.py │ └── unsupervised │ │ ├── MCFS.py │ │ ├── UDFS.py │ │ ├── __init__.py │ │ └── trace_ratio_laplacian.py ├── hybrid │ ├── IWSSr_SFLA.py │ ├── Melif.py │ ├── __init__.py │ └── filter_wrapper_hybrid.py ├── utils │ ├── __init__.py │ ├── base_transformer.py │ ├── base_wrapper.py │ ├── data_check.py │ ├── functions.py │ ├── information_theory.py │ └── qpfs_body.py └── wrappers │ ├── __init__.py │ ├── deterministic │ ├── AddDelWrapper.py │ ├── BackwardSelection.py │ ├── RecursiveElimination.py │ ├── SequentialForwardSelection.py │ ├── __init__.py │ └── qpfs_wrapper.py │ └── randomized │ ├── HillClimbing.py │ ├── SimulatedAnnealing.py │ ├── TPhMGWO.py │ └── __init__.py ├── LICENSE ├── README.rst ├── _config.yml ├── docs ├── Makefile ├── _templates │ ├── class.rst │ └── function.rst ├── api.rst ├── conf.py ├── index.rst ├── install.rst ├── introduction.rst ├── logos │ └── logo_itmo_fs_itog_colour.jpg ├── make.bat └── user_guide.rst ├── meta.yml ├── requirements.txt ├── setup.cfg ├── setup.py └── test ├── Melif_test.py ├── datasets ├── .gitignore ├── arcene.csv.dvc ├── dexter.csv.dvc ├── dorothea.csv.dvc ├── gisette.csv.dvc └── madelon.csv.dvc ├── embedded_test.py ├── ensemble_test.py ├── hybrid_test.py ├── multivariate_filters_test.py ├── univariate_filters_test.py ├── univariate_measures_test.py ├── unsupervised_filters_test.py ├── utils.py └── wrapper_test.py /.circleci/config.yml: -------------------------------------------------------------------------------- 1 | version: 2.1 2 | 3 | orbs: 4 | python: circleci/python@1.2.0 5 | 6 | jobs: 7 | build-and-test: 8 | executor: python/default 9 | steps: 10 | - checkout 11 | 12 | # Download and cache dependencies 13 | - restore_cache: 14 | keys: 15 | - v1-dependencies-{{ checksum "requirements.txt" }} 16 | # fallback to using the latest cache if no exact match is found 17 | - v1-dependencies- 18 | 19 | - run: 20 | name: install dependencies 21 | command: | 22 | wget https://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh -O miniconda.sh 23 | chmod +x miniconda.sh && ./miniconda.sh -b -p ~/miniconda 24 | export PATH="~/miniconda/bin:$PATH" 25 | conda update --yes --quiet conda 26 | conda create -n testenv --yes --quiet python=3 27 | source activate testenv 28 | pip install -r requirements.txt 29 | pip install pandas sphinx_rtd_theme dvc 30 | pip install sphinx-gallery 31 | pip install dvc 32 | pip install pydrive2 33 | pip 
install .
34 |             cd docs
35 |             make html
36 | 
37 |       - save_cache:
38 |           paths:
39 |             - ./venv
40 |           key: v1-dependencies-{{ checksum "requirements.txt" }}
41 | 
42 |       # run tests with pytest
43 |       # https://pytest.org
44 |       - run:
45 |           name: run tests
46 |           command: |
47 |             pip install -r requirements.txt
48 |             pip install pandas
49 |             pip install dvc
50 |             pip install pytest pytest-cov
51 |             pytest --cov=ITMO_FS test/
52 | 
53 |       - store_artifacts:
54 |           path: test-reports
55 |           destination: test-reports
56 | 
57 | workflows:
58 |   main:
59 |     jobs:
60 |       - build-and-test
--------------------------------------------------------------------------------
/.dvc/.gitignore:
--------------------------------------------------------------------------------
1 | /config.local
2 | /tmp
3 | /cache
--------------------------------------------------------------------------------
/.dvc/config:
--------------------------------------------------------------------------------
1 | [core]
2 |     remote = gdrive
3 | ['remote "gdrive"']
4 |     url = gdrive://1T6v5zENAgNdIXQWZ01xRGpqVOvDNAo7w
--------------------------------------------------------------------------------
/.dvc/plots/confusion.json:
--------------------------------------------------------------------------------
1 | {
2 |     "$schema": "https://vega.github.io/schema/vega-lite/v4.json",
3 |     "data": {
4 |         "values": ""
5 |     },
6 |     "title": "",
7 |     "mark": "rect",
8 |     "encoding": {
9 |         "x": {
10 |             "field": "",
11 |             "type": "nominal",
12 |             "sort": "ascending",
13 |             "title": ""
14 |         },
15 |         "y": {
16 |             "field": "",
17 |             "type": "nominal",
18 |             "sort": "ascending",
19 |             "title": ""
20 |         },
21 |         "color": {
22 |             "aggregate": "count",
23 |             "type": "quantitative"
24 |         },
25 |         "facet": {
26 |             "field": "rev",
27 |             "type": "nominal"
28 |         }
29 |     }
30 | }
31 | 
--------------------------------------------------------------------------------
/.dvc/plots/default.json:
--------------------------------------------------------------------------------
1 | {
2 |     "$schema": "https://vega.github.io/schema/vega-lite/v4.json",
3 |     "data": {
4 |         "values": ""
5 |     },
6 |     "title": "",
7 |     "mark": {
8 |         "type": "line"
9 |     },
10 |     "encoding": {
11 |         "x": {
12 |             "field": "",
13 |             "type": "quantitative",
14 |             "title": ""
15 |         },
16 |         "y": {
17 |             "field": "",
18 |             "type": "quantitative",
19 |             "title": "",
20 |             "scale": {
21 |                 "zero": false
22 |             }
23 |         },
24 |         "color": {
25 |             "field": "rev",
26 |             "type": "nominal"
27 |         }
28 |     }
29 | }
30 | 
--------------------------------------------------------------------------------
/.dvc/plots/scatter.json:
--------------------------------------------------------------------------------
1 | {
2 |     "$schema": "https://vega.github.io/schema/vega-lite/v4.json",
3 |     "data": {
4 |         "values": ""
5 |     },
6 |     "title": "",
7 |     "mark": "point",
8 |     "encoding": {
9 |         "x": {
10 |             "field": "",
11 |             "type": "quantitative",
12 |             "title": ""
13 |         },
14 |         "y": {
15 |             "field": "",
16 |             "type": "quantitative",
17 |             "title": "",
18 |             "scale": {
19 |                 "zero": false
20 |             }
21 |         },
22 |         "color": {
23 |             "field": "rev",
24 |             "type": "nominal"
25 |         }
26 |     }
27 | }
28 | 
--------------------------------------------------------------------------------
/.dvc/plots/smooth.json:
--------------------------------------------------------------------------------
1 | {
2 |     "$schema": 
"https://vega.github.io/schema/vega-lite/v4.json", 3 | "data": { 4 | "values": "" 5 | }, 6 | "title": "", 7 | "mark": { 8 | "type": "line" 9 | }, 10 | "encoding": { 11 | "x": { 12 | "field": "", 13 | "type": "quantitative", 14 | "title": "" 15 | }, 16 | "y": { 17 | "field": "", 18 | "type": "quantitative", 19 | "title": "", 20 | "scale": { 21 | "zero": false 22 | } 23 | }, 24 | "color": { 25 | "field": "rev", 26 | "type": "nominal" 27 | } 28 | }, 29 | "transform": [ 30 | { 31 | "loess": "", 32 | "on": "", 33 | "groupby": [ 34 | "rev" 35 | ], 36 | "bandwidth": 0.3 37 | } 38 | ] 39 | } 40 | -------------------------------------------------------------------------------- /.dvcignore: -------------------------------------------------------------------------------- 1 | # Add patterns of files dvc should ignore, which could improve 2 | # the performance. Learn more at 3 | # https://dvc.org/doc/user-guide/dvcignore 4 | -------------------------------------------------------------------------------- /.github/.codecov.yml: -------------------------------------------------------------------------------- 1 | codecov: 2 | require_ci_to_pass: yes 3 | token: f8baf545-d745-4d99-81e7-ef7e019a1d1c 4 | coverage: 5 | precision: 2 6 | round: down 7 | range: "70...100" 8 | 9 | parsers: 10 | gcov: 11 | branch_detection: 12 | conditional: yes 13 | loop: yes 14 | method: no 15 | macro: no 16 | 17 | install: 18 | pip: 19 | requirements.txt 20 | 21 | comment: 22 | layout: "reach,diff,flags,tree" 23 | behavior: default 24 | require_changes: no -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/add-method.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Add method 3 | about: Suggest an idea for this project 4 | title: 'New Method:' 5 | labels: '' 6 | assignees: '' 7 | 8 | --- 9 | 10 | **Describe a method you want to be added** 11 | Brief description of the method. 12 | 13 | **Link to paper** 14 | A link to the paper with the method if you have it. 15 | 16 | **Additional context** 17 | Add any other context or screenshots about the feature request here. 18 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/bug_report.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Bug report 3 | about: Create a report to help us improve 4 | title: 'BUG : ' 5 | labels: '' 6 | assignees: LastShekel 7 | 8 | --- 9 | 10 | **Describe the bug** 11 | A clear and concise description of what the bug is. 12 | 13 | **To Reproduce** 14 | Steps to reproduce the behaviour: 15 | 1. Go to '...' 16 | 2. Click on '....' 17 | 3. Scroll down to '....' 18 | 4. See an error 19 | 20 | **Expected behaviour** 21 | A clear and concise description of what you expected to happen. 22 | 23 | **Screenshots** 24 | If applicable, add screenshots to help explain your problem. 25 | 26 | **Desktop (please complete the following information):** 27 | - OS: [e.g. iOS] 28 | 29 | **Additional context** 30 | Add any other context about the problem here. 
31 | 
--------------------------------------------------------------------------------
/.github/ISSUE_TEMPLATE/feature-request.md:
--------------------------------------------------------------------------------
1 | ---
2 | name: Feature request
3 | about: Suggest an idea for this project
4 | title: 'ADD : '
5 | labels: ''
6 | assignees: LastShekel
7 | 
8 | ---
9 | 
10 | **Is your feature request related to a problem? Please describe.**
11 | A clear and concise description of what the problem is. Ex. I'm always frustrated when [...]
12 | 
13 | **Describe the solution you'd like**
14 | A clear and concise description of what you want to happen.
15 | 
16 | **Describe alternatives you've considered**
17 | A clear and concise description of any alternative solutions or features you've considered.
18 | 
19 | **Additional context**
20 | Add any other context or screenshots about the feature request here.
21 | 
--------------------------------------------------------------------------------
/.github/PULL_REQUEST_TEMPLATE.md:
--------------------------------------------------------------------------------
1 | # Pull Request Template
2 | 
3 | ## Description
4 | 
5 | Please include a summary of the change and which issue is fixed. Please also include relevant motivation and context. List any dependencies that are required for this change.
6 | 
7 | Fixes # (issue)
8 | 
9 | ## Type of change
10 | 
11 | Please delete options that are not relevant.
12 | 
13 | - [ ] Bug fix (non-breaking change which fixes an issue)
14 | - [ ] New feature (non-breaking change which adds functionality)
15 | - [ ] New method (include a link to the paper)
16 | - [ ] Breaking change (fix or feature that would cause existing functionality to not work as expected)
17 | - [ ] This change requires a documentation update
18 | 
19 | ## How Has This Been Tested?
20 | 
21 | Please describe the tests that you ran to verify your changes, or link to them. Provide instructions so we can reproduce the results. Please also list any relevant details of your test configuration.
22 | 
23 | 
24 | 
25 | ## Checklist:
26 | 
27 | - [ ] My code follows the style guidelines of this project
28 | - [ ] I have performed a self-review of my own code
29 | - [ ] I have commented my code, particularly in hard-to-understand areas
30 | - [ ] I have made corresponding changes to the documentation
31 | - [ ] My changes generate no new warnings
32 | - [ ] I have added tests that prove my fix is effective or that my feature works
33 | - [ ] New and existing unit tests pass locally with my changes
34 | - [ ] Any dependent changes have been merged and published in downstream modules
35 | - [ ] I have checked my code and corrected any misspellings
--------------------------------------------------------------------------------
/.github/workflows/codeql-analysis.yml:
--------------------------------------------------------------------------------
1 | # For most projects, this workflow file will not need changing; you simply need
2 | # to commit it to your repository.
3 | #
4 | # You may wish to alter this file to override the set of languages analyzed,
5 | # or to provide custom queries or build logic.
6 | #
7 | # ******** NOTE ********
8 | # We have attempted to detect the languages in your repository. Please check
9 | # the `language` matrix defined below to confirm you have the correct set of
10 | # supported CodeQL languages. 
11 | # 12 | name: "CodeQL" 13 | 14 | on: 15 | push: 16 | branches: [ master ] 17 | pull_request: 18 | # The branches below must be a subset of the branches above 19 | branches: [ master ] 20 | schedule: 21 | - cron: '21 23 * * 1' 22 | 23 | jobs: 24 | analyze: 25 | name: Analyze 26 | runs-on: ubuntu-latest 27 | permissions: 28 | actions: read 29 | contents: read 30 | security-events: write 31 | 32 | strategy: 33 | fail-fast: false 34 | matrix: 35 | language: [ 'python' ] 36 | # CodeQL supports [ 'cpp', 'csharp', 'go', 'java', 'javascript', 'python' ] 37 | # Learn more: 38 | # https://docs.github.com/en/free-pro-team@latest/github/finding-security-vulnerabilities-and-errors-in-your-code/configuring-code-scanning#changing-the-languages-that-are-analyzed 39 | 40 | steps: 41 | - name: Checkout repository 42 | uses: actions/checkout@v2 43 | 44 | # Initializes the CodeQL tools for scanning. 45 | - name: Initialize CodeQL 46 | uses: github/codeql-action/init@v1 47 | with: 48 | languages: ${{ matrix.language }} 49 | # If you wish to specify custom queries, you can do so here or in a config file. 50 | # By default, queries listed here will override any specified in a config file. 51 | # Prefix the list here with "+" to use these queries and those in the config file. 52 | # queries: ./path/to/local/query, your-org/your-repo/queries@main 53 | 54 | # Autobuild attempts to build any compiled languages (C/C++, C#, or Java). 55 | # If this step fails, then you should remove it and run the build manually (see below) 56 | - name: Autobuild 57 | uses: github/codeql-action/autobuild@v1 58 | 59 | # ℹ️ Command-line programs to run using the OS shell. 60 | # 📚 https://git.io/JvXDl 61 | 62 | # ✏️ If the Autobuild fails above, remove it and uncomment the following three lines 63 | # and modify them (or add more) to build your code if your project 64 | # uses a compiled language 65 | 66 | #- run: | 67 | # make bootstrap 68 | # make release 69 | 70 | - name: Perform CodeQL Analysis 71 | uses: github/codeql-action/analyze@v1 72 | -------------------------------------------------------------------------------- /.github/workflows/pythonpublish.yml: -------------------------------------------------------------------------------- 1 | name: Upload Python Package 2 | 3 | on: 4 | release: 5 | types: [created] 6 | 7 | jobs: 8 | deploy: 9 | runs-on: ubuntu-latest 10 | steps: 11 | - uses: actions/checkout@v1 12 | - name: Set up Python 13 | uses: actions/setup-python@v1 14 | with: 15 | python-version: '3.x' 16 | - name: Install dependencies 17 | run: | 18 | python -m pip install --upgrade pip 19 | pip install setuptools wheel twine 20 | - name: Build and publish 21 | env: 22 | TWINE_USERNAME: ${{ secrets.PYPI_USERNAME }} 23 | TWINE_PASSWORD: ${{ secrets.PYPI_PASSWORD }} 24 | run: | 25 | python setup.py sdist bdist_wheel 26 | twine upload dist/* 27 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .idea 2 | ITMO_FS.egg-info 3 | dist 4 | build 5 | common -------------------------------------------------------------------------------- /CODE_OF_CONDUCT.md: -------------------------------------------------------------------------------- 1 | # Contributor Covenant Code of Conduct 2 | 3 | ## Our Pledge 4 | 5 | In the interest of fostering an open and welcoming environment, we as 6 | contributors and maintainers pledge to making participation in our project and 7 | our community a harassment-free experience 
for everyone, regardless of age, body 8 | size, disability, ethnicity, sex characteristics, gender identity and expression, 9 | level of experience, education, socio-economic status, nationality, personal 10 | appearance, race, religion, or sexual identity and orientation. 11 | 12 | ## Our Standards 13 | 14 | Examples of behavior that contributes to creating a positive environment 15 | include: 16 | 17 | * Using welcoming and inclusive language 18 | * Being respectful of differing viewpoints and experiences 19 | * Gracefully accepting constructive criticism 20 | * Focusing on what is best for the community 21 | * Showing empathy towards other community members 22 | 23 | Examples of unacceptable behavior by participants include: 24 | 25 | * The use of sexualized language or imagery and unwelcome sexual attention or 26 | advances 27 | * Trolling, insulting/derogatory comments, and personal or political attacks 28 | * Public or private harassment 29 | * Publishing others' private information, such as a physical or electronic 30 | address, without explicit permission 31 | * Other conduct which could reasonably be considered inappropriate in a 32 | professional setting 33 | 34 | ## Our Responsibilities 35 | 36 | Project maintainers are responsible for clarifying the standards of acceptable 37 | behavior and are expected to take appropriate and fair corrective action in 38 | response to any instances of unacceptable behavior. 39 | 40 | Project maintainers have the right and responsibility to remove, edit, or 41 | reject comments, commits, code, wiki edits, issues, and other contributions 42 | that are not aligned to this Code of Conduct, or to ban temporarily or 43 | permanently any contributor for other behaviors that they deem inappropriate, 44 | threatening, offensive, or harmful. 45 | 46 | ## Scope 47 | 48 | This Code of Conduct applies both within project spaces and in public spaces 49 | when an individual is representing the project or its community. Examples of 50 | representing a project or community include using an official project e-mail 51 | address, posting via an official social media account, or acting as an appointed 52 | representative at an online or offline event. Representation of a project may be 53 | further defined and clarified by project maintainers. 54 | 55 | ## Enforcement 56 | 57 | Instances of abusive, harassing, or otherwise unacceptable behavior may be 58 | reported by contacting the project team at SomaCruz@bk.ru. All 59 | complaints will be reviewed and investigated and will result in a response that 60 | is deemed necessary and appropriate to the circumstances. The project team is 61 | obligated to maintain confidentiality with regard to the reporter of an incident. 62 | Further details of specific enforcement policies may be posted separately. 63 | 64 | Project maintainers who do not follow or enforce the Code of Conduct in good 65 | faith may face temporary or permanent repercussions as determined by other 66 | members of the project's leadership. 
67 | 
68 | ## Attribution
69 | 
70 | This Code of Conduct is adapted from the [Contributor Covenant][homepage], version 1.4,
71 | available at https://www.contributor-covenant.org/version/1/4/code-of-conduct.html
72 | 
73 | [homepage]: https://www.contributor-covenant.org
74 | 
75 | For answers to common questions about this code of conduct, see
76 | https://www.contributor-covenant.org/faq
77 | 
--------------------------------------------------------------------------------
/ITMO_FS/__about__.py:
--------------------------------------------------------------------------------
1 | __all__ = ["__title__", "__uri__", "__version__"]
2 | 
3 | __title__ = "ITMO_FS"
4 | __uri__ = "https://github.com/ctlab/ITMO_FS"
5 | 
6 | __version__ = "0.3.5"
7 | 
--------------------------------------------------------------------------------
/ITMO_FS/__init__.py:
--------------------------------------------------------------------------------
1 | from .embedded import *
2 | from .ensembles import *
3 | from .filters import *
4 | from .hybrid import *
5 | from .wrappers import *
6 | 
--------------------------------------------------------------------------------
/ITMO_FS/embedded/MOS.py:
--------------------------------------------------------------------------------
1 | from logging import getLogger
2 | 
3 | import numpy as np
4 | from imblearn.over_sampling import SMOTE
5 | from sklearn.base import clone
6 | 
7 | from ..utils import augmented_rvalue, BaseTransformer
8 | 
9 | 
10 | class MOS(BaseTransformer):
11 |     """Perform the Minimizing Overlapping Selection under SMOTE (MOSS) or
12 |     under No-Sampling (MOSNS) algorithm.
13 | 
14 |     Parameters
15 |     ----------
16 |     model : object
17 |         The model that should have a fit(X, y) method and a field corresponding
18 |         to feature weights. Currently only SGDClassifier should be passed;
19 |         other models will not work.
20 |     weight_func : callable
21 |         The function to extract weights from the model.
22 |     loss : str, 'log' or 'hinge'
23 |         Loss function to use in the algorithm. 'log' gives a logistic
24 |         regression, while 'hinge' gives a support vector machine.
25 |     seed : int, optional
26 |         Seed for random number generation.
27 |     l1_ratio : float
28 |         The value used to balance the L1 and L2 penalties in elastic-net.
29 |     threshold : float
30 |         The threshold value for feature dropout. Instead of comparing the
31 |         weights to zero, they are normalized, and weights with an absolute
32 |         value lower than the threshold are dropped out.
33 |     epochs : int
34 |         The number of epochs to perform in the algorithm.
35 |     alphas : array-like, shape (n_alphas,), optional
36 |         The range of lambdas that should form the regularization path.
37 |     sampling : bool
38 |         Bool value that controls whether MOSS (True) or MOSNS (False) should
39 |         be executed.
40 |     k_neighbors : int
41 |         Number of nearest neighbors to use in SMOTE if MOSS is used.
42 | 
43 |     Notes
44 |     -----
45 |     For more details see `this paper
46 |     `_.
47 | 
48 |     Examples
49 |     --------
50 |     >>> from ITMO_FS.embedded import MOS
51 |     >>> from sklearn.linear_model import SGDClassifier
52 |     >>> import numpy as np
53 |     >>> from sklearn.datasets import make_classification
54 |     >>> from sklearn.linear_model import LogisticRegression
55 |     >>> dataset = make_classification(n_samples=100, n_features=10,
56 |     ... n_informative=5, n_redundant=0, weights=[0.85, 0.15], random_state=42,
57 |     ... shuffle=False)
58 |     >>> X, y = np.array(dataset[0]), np.array(dataset[1])
59 |     >>> m = MOS(model=SGDClassifier(),
60 |     ... 
weight_func=lambda model: np.square(model.coef_).sum(axis=0)).fit(X, y) 61 | >>> m.selected_features_ 62 | array([1, 3, 4], dtype=int64) 63 | >>> m = MOS(model=SGDClassifier(), sampling=True, 64 | ... weight_func=lambda model: np.square(model.coef_).sum(axis=0)).fit(X, y) 65 | >>> m.selected_features_ 66 | array([1, 3, 4, 6], dtype=int64) 67 | """ 68 | def __init__(self, model, weight_func, loss='log', seed=42, l1_ratio=0.5, 69 | threshold=1e-3, epochs=1000, alphas=np.arange(0.01, 0.2, 0.01), 70 | sampling=False, k_neighbors=2): 71 | self.model = model 72 | self.weight_func = weight_func 73 | self.loss = loss 74 | self.seed = seed 75 | self.l1_ratio = l1_ratio 76 | self.threshold = threshold 77 | self.epochs = epochs 78 | self.alphas = alphas 79 | self.sampling = sampling 80 | self.k_neighbors = k_neighbors 81 | 82 | def _fit(self, X, y): 83 | """Run the MOS algorithm on the specified dataset. 84 | 85 | Parameters 86 | ---------- 87 | X : array-like, shape (n_samples, n_features) 88 | The input samples. 89 | y : array-like, shape (n_samples,) 90 | The classes for the samples. 91 | 92 | Returns 93 | ------- 94 | None 95 | """ 96 | if self.loss not in ['hinge', 'log']: 97 | getLogger(__name__).error( 98 | "Loss should be 'hinge' or 'log', %s was passed", self.loss) 99 | raise KeyError( 100 | "Loss should be 'hinge' or 'log', %s was passed" % self.loss) 101 | 102 | if self.sampling: 103 | try: 104 | X, y = SMOTE( 105 | random_state=self.seed, 106 | k_neighbors=self.k_neighbors).fit_resample(X, y) 107 | except ValueError: 108 | getLogger(__name__).warning( 109 | "Couldn't perform SMOTE because k_neighbors is bigger " 110 | "than amount of instances in one of the classes; MOSNS " 111 | "would be performed instead") 112 | 113 | min_rvalue = 1 114 | min_b = [] 115 | model = clone(self.model) 116 | for a in self.alphas: # TODO: do a little more 117 | # research on the range of lambdas 118 | model = model.set_params( 119 | loss=self.loss, random_state=self.seed, penalty='elasticnet', 120 | alpha=a, l1_ratio=self.l1_ratio, max_iter=self.epochs) 121 | model.fit(X, y) 122 | b = self.weight_func(model) 123 | rvalue = augmented_rvalue( 124 | X[:, np.flatnonzero(np.abs(b) > self.threshold)], y) 125 | getLogger(__name__).info( 126 | "For alpha %f: rvalue = %f, weight vector = %s", a, rvalue, b) 127 | if min_rvalue > rvalue: 128 | min_rvalue = rvalue 129 | min_b = b 130 | getLogger(__name__).info("New minimum rvalue: %f", rvalue) 131 | getLogger(__name__).info("New weight vector: %s", b) 132 | self.selected_features_ = np.flatnonzero(np.abs(min_b) > self.threshold) 133 | -------------------------------------------------------------------------------- /ITMO_FS/embedded/__init__.py: -------------------------------------------------------------------------------- 1 | from .MOS import MOS 2 | -------------------------------------------------------------------------------- /ITMO_FS/ensembles/__init__.py: -------------------------------------------------------------------------------- 1 | from .measure_based import * 2 | from .model_based import * 3 | from .ranking_based import * 4 | -------------------------------------------------------------------------------- /ITMO_FS/ensembles/measure_based/WeightBased.py: -------------------------------------------------------------------------------- 1 | from logging import getLogger 2 | 3 | import numpy as np 4 | from sklearn.base import clone 5 | 6 | from .fusion_functions import * 7 | from ...utils import BaseTransformer, apply_cr, check_filters 8 | 9 | 10 | class 
WeightBased(BaseTransformer):
11 |     """Weight-based filter ensemble. The ensemble first computes all filter
12 |     scores for the dataset and then aggregates them using a selected fusion
13 |     function.
14 | 
15 |     Parameters
16 |     ----------
17 |     filters : collection
18 |         Collection of filter objects. Filters should have a fit(X, y) method
19 |         and a feature_scores_ field that contains scores for all features.
20 |     cutting_rule : string or callable
21 |         A cutting rule name defined in GLOB_CR or a callable with signature
22 |         cutting_rule (features), which should return a list of features ranked
23 |         by some rule.
24 |     fusion_function : callable
25 |         A function with signature (filter_scores (array-like, shape
26 |         (n_filters, n_features)), weights (array-like, shape (n_filters,)))
27 |         that should return the aggregated weights for all features.
28 |     weights : array-like
29 |         An array of shape (n_filters,) defining the weights for input filters.
30 | 
31 |     See Also
32 |     --------
33 | 
34 |     Examples
35 |     --------
36 |     >>> from ITMO_FS.ensembles import WeightBased
37 |     >>> from ITMO_FS.filters.univariate import UnivariateFilter
38 |     >>> import numpy as np
39 |     >>> filters = [UnivariateFilter('GiniIndex'),
40 |     ...            UnivariateFilter('FechnerCorr'),
41 |     ...            UnivariateFilter('SpearmanCorr'),
42 |     ...            UnivariateFilter('PearsonCorr')]
43 |     >>> x = np.array([[3, 3, 3, 2, 2], [3, 3, 1, 2, 3], [1, 3, 5, 1, 1],
44 |     ...               [3, 1, 4, 3, 1], [3, 1, 2, 3, 1]])
45 |     >>> y = np.array([1, 3, 2, 1, 2])
46 |     >>> wb = WeightBased(filters, ("K best", 2)).fit(x, y)
47 |     >>> wb.selected_features_
48 |     array([4, 1], dtype=int64)
49 |     """
50 |     def __init__(self, filters, cutting_rule=("K best", 2),
51 |                  fusion_function=weight_fusion, weights=None):
52 |         self.filters = filters
53 |         self.cutting_rule = cutting_rule
54 |         self.fusion_function = fusion_function
55 |         self.weights = weights
56 | 
57 |     def get_scores(self, X, y):
58 |         """Return the normalized feature scores for all filters.
59 | 
60 |         Parameters
61 |         ----------
62 |         X : array-like, shape (n_samples, n_features)
63 |             The training input samples.
64 |         y : array-like, shape (n_samples,)
65 |             The target values.
66 | 
67 |         Returns
68 |         -------
69 |         array-like, shape (n_filters, n_features) : feature scores
70 |         """
71 |         scores = np.vectorize(
72 |             lambda f: clone(f).fit(X, y).feature_scores_,
73 |             signature='()->(1)')(self.filters)
74 |         getLogger(__name__).info("Scores for all filters: %s", scores)
75 |         mins = np.min(scores, axis=1).reshape(-1, 1)
76 |         maxs = np.max(scores, axis=1).reshape(-1, 1)
77 |         return (scores - mins) / (maxs - mins + 1e-15)  # guard against constant scores
78 | 
79 |     def __len__(self):
80 |         """Return the number of filters used in the ensemble.
81 | 
82 |         Parameters
83 |         ----------
84 | 
85 |         Returns
86 |         -------
87 |         int : number of filters
88 |         """
89 |         return len(self.filters)
90 | 
91 |     def _fit(self, X, y):
92 |         """Fit the ensemble.
93 | 
94 |         Parameters
95 |         ----------
96 |         X : array-like, shape (n_samples, n_features)
97 |             The training input samples.
98 |         y : array-like, shape (n_samples,)
99 |             The target values. 
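As a side note, the aggregation this method feeds into can be sketched
standalone; the numbers below are made up for illustration:

    >>> import numpy as np
    >>> filter_scores = np.array([[0.0, 0.5, 1.0], [1.0, 0.0, 0.5]])
    >>> weights = np.array([0.5, 0.5])
    >>> filter_scores.T.dot(weights)
    array([0.5 , 0.25, 0.75])

Each filter's scores are min-max normalized first, so that differently
scaled filters contribute comparably to the weighted sum.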
100 | 
101 |         Returns
102 |         -------
103 |         None
104 |         """
105 |         check_filters(self.filters)
106 |         getLogger(__name__).info(
107 |             "Running WeightBased with filters: %s", self.filters)
108 |         filter_scores = self.get_scores(X, y)
109 |         getLogger(__name__).info(
110 |             "Normalized scores for all filters: %s", filter_scores)
111 |         if self.weights is None:
112 |             weights = np.ones(len(self.filters)) / len(self.filters)
113 |         else:
114 |             weights = self.weights
115 |         getLogger(__name__).info("Weights vector: %s", weights)
116 |         self.feature_scores_ = self.fusion_function(filter_scores, weights)
117 |         getLogger(__name__).info("Feature scores: %s", self.feature_scores_)
118 |         self.selected_features_ = apply_cr(self.cutting_rule)(
119 |             self.feature_scores_)
--------------------------------------------------------------------------------
/ITMO_FS/ensembles/measure_based/__init__.py:
--------------------------------------------------------------------------------
1 | from .WeightBased import *
2 | from .fusion_functions import *
--------------------------------------------------------------------------------
/ITMO_FS/ensembles/measure_based/fusion_functions.py:
--------------------------------------------------------------------------------
1 | def weight_fusion(filter_scores, weights):
2 |     """Calculate the weighted score of each feature.
3 | 
4 |     Parameters
5 |     ----------
6 |     filter_scores : array-like, shape (n_filters, n_features)
7 |         Scores for all filters.
8 |     weights : array-like, shape (n_filters,)
9 |         Filter weights.
10 | 
11 |     Returns
12 |     -------
13 |     array-like, shape (n_features,) : feature scores
14 |     """
15 |     return filter_scores.T.dot(weights)
--------------------------------------------------------------------------------
/ITMO_FS/ensembles/model_based/__init__.py:
--------------------------------------------------------------------------------
1 | from .best_sum import BestSum
--------------------------------------------------------------------------------
/ITMO_FS/ensembles/model_based/best_sum.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | from sklearn.base import clone
3 | from sklearn.model_selection import cross_val_score
4 | from logging import getLogger
5 | from ...utils import BaseTransformer, apply_cr
6 | 
7 | 
8 | class BestSum(BaseTransformer):
9 |     """Best weighted sum ensemble. The ensemble fits the input models and
10 |     computes the feature scores as the sum of the models' feature weights,
11 |     each scaled by a performance metric (e.g. accuracy).
12 | 
13 |     Parameters
14 |     ----------
15 |     models : collection
16 |         Collection of model objects. Models should have a fit(X, y) method and
17 |         a field corresponding to feature weights.
18 |     cutting_rule : string or callable
19 |         A cutting rule name defined in GLOB_CR or a callable with signature
20 |         cutting_rule (features), which should return a list of features ranked
21 |         by some rule.
22 |     weight_func : callable
23 |         The function to extract weights from the model.
24 |     metric : string or callable
25 |         A standard estimator metric (e.g. 'f1' or 'roc_auc') or a callable
26 |         object / function with signature measure(estimator, X, y) which
27 |         should return only a single value.
28 |     cv : int
29 |         Number of folds in cross-validation.
30 | 
31 |     See Also
32 |     --------
33 |     Jeon, H.; Oh, S. Hybrid-Recursive Feature Elimination for Efficient
34 |     Feature Selection. Appl. Sci. 2020, 10, 3211. 
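Informally, the resulting score of feature j reduces to

    score[j] = sum over models m of weight_func(m)[j] * cv_metric(m)

so models that cross-validate better contribute proportionally more of
their feature weights. A minimal sketch of that sum, where `fitted_models`
and `perfs` are hypothetical stand-ins for the fitted models and their
mean cross-validation scores:

    import numpy as np
    feature_scores = np.sum(
        [weight_func(m) * p for m, p in zip(fitted_models, perfs)], axis=0)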
35 | 
36 |     Examples
37 |     --------
38 |     >>> from ITMO_FS.ensembles import BestSum
39 |     >>> from sklearn.svm import SVC
40 |     >>> from sklearn.linear_model import LogisticRegression
41 |     >>> from sklearn.linear_model import RidgeClassifier
42 |     >>> import numpy as np
43 |     >>> models = [SVC(kernel='linear'),
44 |     ...           LogisticRegression(),
45 |     ...           RidgeClassifier()]
46 |     >>> x = np.array([[3, 3, 3, 2, 2],
47 |     ...               [3, 3, 1, 2, 3],
48 |     ...               [1, 3, 5, 1, 1],
49 |     ...               [3, 1, 4, 3, 1],
50 |     ...               [3, 1, 2, 3, 1]])
51 |     >>> y = np.array([1, 2, 2, 1, 2])
52 |     >>> bs = BestSum(models, ("K best", 2),
53 |     ...              lambda model: np.square(model.coef_).sum(axis=0), cv=2).fit(x, y)
54 |     >>> bs.selected_features_
55 |     array([0, 2], dtype=int64)
56 |     """
57 | 
58 |     def __init__(self, models, cutting_rule, weight_func, metric='f1_micro',
59 |                  cv=3):
60 |         super().__init__()
61 |         self.models = models
62 |         self.cutting_rule = cutting_rule
63 |         self.weight_func = weight_func
64 |         self.metric = metric
65 |         self.cv = cv
66 | 
67 |     def _fit(self, X, y):
68 |         """
69 |         Fit the ensemble.
70 | 
71 |         Parameters
72 |         ----------
73 |         X : array-like, shape (n_samples, n_features)
74 |             The training input samples.
75 |         y : array-like, shape (n_samples,)
76 |             The target values.
77 | 
78 |         Returns
79 |         -------
80 |         None
81 |         """
82 | 
83 |         def __get_weights(model):
84 |             _model = clone(model).fit(X, y)
85 |             weights = self.weight_func(_model)
86 |             perf = cross_val_score(_model, X, y, cv=self.cv,
87 |                                    scoring=self.metric).mean()
88 |             return weights * perf
89 | 
90 |         if len(self.models) == 0:
91 |             getLogger(__name__).error("No models are set")
92 |             raise ValueError("No models are set")
93 | 
94 |         model_scores = np.vectorize(
95 |             lambda model: __get_weights(model),
96 |             signature='()->(1)')(self.models)
97 |         getLogger(__name__).info("Weighted model scores: %s", model_scores)
98 |         self.feature_scores_ = model_scores.sum(axis=0)
99 |         getLogger(__name__).info("Feature scores: %s", self.feature_scores_)
100 |         self.selected_features_ = apply_cr(self.cutting_rule)(
101 |             self.feature_scores_)
102 | 
--------------------------------------------------------------------------------
/ITMO_FS/ensembles/ranking_based/Mixed.py:
--------------------------------------------------------------------------------
1 | from logging import getLogger
2 | 
3 | import numpy as np
4 | 
5 | from .fusion_functions import *
6 | from ...utils import BaseTransformer
7 | 
8 | 
9 | class Mixed(BaseTransformer):
10 |     """Perform feature selection based on several filters, selecting
11 |     features as follows:
12 |     get feature ranks from every input filter;
13 |     then, on every iteration i,
14 |     take the features at position i in every filter's ranking,
15 |     shuffle them, and add them to the result list without
16 |     duplicates;
17 |     continue until the specified number of features is selected.
18 | 
19 |     Parameters
20 |     ----------
21 |     filters : collection
22 |         Collection of measure functions with signature measure(X, y) that
23 |         should return an array of importance values for each feature.
24 |     n_features : int
25 |         Amount of features to select.
26 |     fusion_function : callable
27 |         A function with signature (filter_ranks (array-like, shape
28 |         (n_filters, n_features), k (int)) that should return the indices of k
29 |         selected features based on the filter rankings. 
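An informal trace of the default best_goes_first_fusion (the ranks here
are made up for illustration):

    import numpy as np
    filter_ranks = np.array([[0, 2, 1],   # filter 1, best feature first
                             [2, 0, 1]])  # filter 2, best feature first
    # Rank position 0 contributes features {0, 2} in shuffled order, which
    # already covers k=2, so the selection is [0, 2] or [2, 0].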
30 | 
31 |     Examples
32 |     --------
33 |     >>> from ITMO_FS.filters.univariate.measures import *
34 |     >>> from ITMO_FS.ensembles.ranking_based.Mixed import Mixed
35 |     >>> import numpy as np
36 |     >>> x = np.array([[3, 3, 3, 2, 2], [3, 3, 1, 2, 3], [1, 3, 5, 1, 1],
37 |     ...               [3, 1, 4, 3, 1], [3, 1, 2, 3, 1]])
38 |     >>> y = np.array([1, 3, 2, 1, 2])
39 |     >>> mixed = Mixed([gini_index, chi2_measure], 2).fit(x, y)
40 |     >>> mixed.selected_features_
41 |     array([2, 4], dtype=int64)
42 |     """
43 |     def __init__(self, filters, n_features,
44 |                  fusion_function=best_goes_first_fusion):
45 |         self.filters = filters
46 |         self.n_features = n_features
47 |         self.fusion_function = fusion_function
48 | 
49 |     def _fit(self, X, y):
50 |         """Fit the ensemble.
51 | 
52 |         Parameters
53 |         ----------
54 |         X : array-like, shape (n_samples, n_features)
55 |             The training input samples.
56 |         y : array-like, shape (n_samples,)
57 |             The target values.
58 | 
59 |         Returns
60 |         -------
61 |         None
62 |         """
63 |         #TODO: some measures are 'lower is better', a simple argsort would not
64 |         #work there - need to call a different ranking function
65 |         self.filter_ranks_ = np.vectorize(
66 |             lambda f: np.argsort(f(X, y))[::-1],
67 |             signature='()->(1)')(self.filters)
68 |         getLogger(__name__).info("Filter ranks: %s", self.filter_ranks_)
69 |         self.selected_features_ = self.fusion_function(
70 |             self.filter_ranks_, self.n_features)
71 | 
--------------------------------------------------------------------------------
/ITMO_FS/ensembles/ranking_based/__init__.py:
--------------------------------------------------------------------------------
1 | from .Mixed import *
--------------------------------------------------------------------------------
/ITMO_FS/ensembles/ranking_based/fusion_functions.py:
--------------------------------------------------------------------------------
1 | import random
2 | import numpy as np
3 | 
4 | 
5 | def best_goes_first_fusion(filter_ranks, k):
6 |     """
7 |     Fusion function that mixes filter results according to each feature's
8 |     position in the filters' rankings. Selects the first k of them.
9 | 
10 |     Parameters
11 |     ----------
12 |     filter_ranks : array-like, shape (n_filters, n_features)
13 |         Feature ranking for all filters.
14 |     k : int
15 |         Amount of features to select.
16 | 
17 |     Returns
18 |     -------
19 |     array-like, shape (k,) : selected features
20 |     """
21 |     result = np.array([], dtype='int')
22 |     place = 0
23 |     while len(result) < k:
24 |         placed_features = np.setdiff1d(filter_ranks[:, place], result)
25 |         random.shuffle(placed_features)
26 |         result = np.append(result, placed_features)
27 |         place += 1
28 |     return result[:k]
29 | 
30 | 
31 | def borda_fusion(filter_ranks, k):
32 |     """Select features according to the Borda count.
33 | 
34 |     Parameters
35 |     ----------
36 |     filter_ranks : array-like, shape (n_filters, n_features)
37 |         Feature ranking for all filters.
38 |     k : int
39 |         Amount of features to select. 
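A small worked example of this Borda scheme (rank positions are summed,
so a lower total is better; the ranks are made up for illustration):

    import numpy as np
    filter_ranks = np.array([[2, 0, 1],   # filter 1: feature 2 ranked best
                             [2, 1, 0]])  # filter 2: feature 2 ranked best
    # Totals: feature 0 -> 2 + 3 = 5, feature 1 -> 3 + 2 = 5,
    # feature 2 -> 1 + 1 = 2, so k=1 selects feature 2.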
40 | 
41 |     Returns
42 |     -------
43 |     array-like, shape (k,) : selected features
44 |     """
45 |     n_features = filter_ranks.shape[1]
46 |     scores = np.zeros(n_features)
47 |     for f in filter_ranks:
48 |         scores[f] += np.arange(1, n_features + 1)
49 |     return np.argsort(scores)[:k]
50 | 
--------------------------------------------------------------------------------
/ITMO_FS/filters/__init__.py:
--------------------------------------------------------------------------------
1 | from .multivariate import *
2 | from .univariate import *
3 | from .unsupervised import *
--------------------------------------------------------------------------------
/ITMO_FS/filters/multivariate/DISRwithMassive.py:
--------------------------------------------------------------------------------
1 | from logging import getLogger
2 | 
3 | import numpy as np
4 | from sklearn.metrics import pairwise_distances
5 | 
6 | from ...utils import BaseTransformer, generate_features
7 | from ...utils.information_theory import (entropy, joint_entropy,
8 |                                          mutual_information)
9 | 
10 | 
11 | def _complementarity(x_i, x_j, y):
12 |     return (entropy(x_i) + entropy(x_j) + entropy(y) - joint_entropy(x_i, x_j)
13 |             - joint_entropy(x_i, y) - joint_entropy(x_j, y)
14 |             + joint_entropy(x_i, x_j, y))
15 | 
16 | 
17 | def _chained_information(x_i, x_j, y):
18 |     return (mutual_information(x_i, y) + mutual_information(x_j, y)
19 |             + _complementarity(x_i, x_j, y))
20 | 
21 | 
22 | class DISRWithMassive(BaseTransformer):
23 |     """Create DISR (Double Input Symmetric Relevance) feature selection filter
24 |     based on the kASSI criterion, which aims at maximizing the
25 |     mutual information while avoiding large multivariate density
26 |     estimation. It approximates the information of
27 |     a set of variables by averaging the information of its subsets over
28 |     combinations of two features. This formulation thus deals with feature
29 |     complementarity up to order two while preserving the computational
30 |     complexity of the MRMR and CMIM criteria. The DISR calculation is done
31 |     using a graph-based solution.
32 | 
33 |     Parameters
34 |     ----------
35 |     n_features : int
36 |         Number of features to select.
37 | 
38 |     Notes
39 |     -----
40 |     For more details see `this paper
41 |     `_.
42 | 
43 |     Examples
44 |     --------
45 |     >>> from ITMO_FS.filters.multivariate import DISRWithMassive
46 |     >>> import numpy as np
47 |     >>> X = np.array([[1, 2, 3, 3, 1], [2, 2, 3, 3, 2], [1, 3, 3, 1, 3],
48 |     ...               [3, 1, 3, 1, 4], [4, 4, 3, 1, 5]])
49 |     >>> y = np.array([1, 2, 3, 4, 5])
50 |     >>> disr = DISRWithMassive(3).fit(X, y)
51 |     >>> disr.selected_features_
52 |     array([0, 1, 4], dtype=int64)
53 |     """
54 |     def __init__(self, n_features):
55 |         self.n_features = n_features
56 | 
57 |     def _fit(self, x, y):
58 |         """Fit the filter.
59 | 
60 |         Parameters
61 |         ----------
62 |         x : array-like, shape (n_samples, n_features)
63 |             The training input samples.
64 |         y : array-like, shape (n_samples,)
65 |             The target values. 
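For reference, the pairwise graph weight computed below is

    weight(i, j) = _chained_information(x_i, x_j, y) / H(x_i, x_j)

which equals I((x_i, x_j); y) / H(x_i, x_j), since the complementarity
term completes the pairwise mutual information. A toy evaluation using
the helpers defined above (the data is made up):

    import numpy as np
    x_i = np.array([1, 1, 2, 2])
    x_j = np.array([1, 2, 1, 2])
    y = np.array([1, 2, 2, 1])
    w = _chained_information(x_i, x_j, y) / joint_entropy(x_i, x_j)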
66 | 
67 |         Returns
68 |         -------
69 |         None
70 |         """
71 |         free_features = np.array([], dtype='int')
72 |         self.selected_features_ = generate_features(x)
73 |         self._edges = pairwise_distances(
74 |             x.T, x.T, lambda xi, xj: (_chained_information(xi, xj, y)
75 |                                       / (joint_entropy(xi, xj) + 1e-15)))
76 |         np.fill_diagonal(self._edges, 0)
77 |         getLogger(__name__).info("Graph weights: %s", self._edges)
78 | 
79 |         while len(self.selected_features_) != self.n_features:
80 |             min_index = np.argmin(
81 |                 np.sum(self._edges[np.ix_(self.selected_features_,
82 |                                           self.selected_features_)], axis=0))
83 |             getLogger(__name__).info(
84 |                 "Removing feature %d from selected set",
85 |                 self.selected_features_[min_index])
86 |             free_features = np.append(
87 |                 free_features, self.selected_features_[min_index])
88 |             self.selected_features_ = np.delete(
89 |                 self.selected_features_, min_index)
90 | 
91 |         getLogger(__name__).info(
92 |             "Selected set: %s, free set: %s", self.selected_features_,
93 |             free_features)
94 | 
95 |         while True:
96 |             selected_weights = np.sum(
97 |                 self._edges[np.ix_(self.selected_features_,
98 |                                    self.selected_features_)], axis=0)
99 |             getLogger(__name__).info(
100 |                 "Graph of selected set: %s", selected_weights)
101 | 
102 |             free_weights = np.sum(self._edges[np.ix_(self.selected_features_,
103 |                                                      free_features)], axis=0)
104 |             getLogger(__name__).info(
105 |                 "Free weights that would be added: %s", free_weights)
106 | 
107 |             difference = (
108 |                 free_weights.reshape(-1, 1)
109 |                 - self._edges[np.ix_(free_features, self.selected_features_)]
110 |                 - selected_weights)
111 |             getLogger(__name__).info("Difference matrix: %s", difference)
112 | 
113 |             if np.all(difference <= 0):
114 |                 getLogger(__name__).info(
115 |                     "All differences are non-positive, terminating")
116 |                 break
117 |             index_add, index_del = np.unravel_index(
118 |                 np.argmax(difference), difference.shape)
119 |             getLogger(__name__).info(
120 |                 "Maximum difference found at index (%d, %d), swapping those "
121 |                 "features", index_add, index_del)
122 | 
123 |             self.selected_features_[index_del], free_features[index_add] = (
124 |                 free_features[index_add], self.selected_features_[index_del])
--------------------------------------------------------------------------------
/ITMO_FS/filters/multivariate/FCBF.py:
--------------------------------------------------------------------------------
1 | from logging import getLogger
2 | 
3 | import numpy as np
4 | 
5 | from ...utils import BaseTransformer, generate_features
6 | from ...utils.information_theory import entropy, conditional_entropy
7 | 
8 | 
9 | class FCBFDiscreteFilter(BaseTransformer):
10 |     """Create FCBF (Fast Correlation Based Filter) feature selection filter
11 |     based on mutual information criteria for data with discrete features. At
12 |     each step this filter finds the feature that provides the most
13 |     information about the classification problem on the given dataset and
14 |     then eliminates the features that are more redundant than they are
15 |     relevant.
16 | 
17 |     Parameters
18 |     ----------
19 |     delta : float
20 |         Symmetric uncertainty value threshold.
21 | 
22 |     Notes
23 |     -----
24 |     For more details see `this paper
25 |     `_.
26 | 
27 |     Examples
28 |     --------
29 |     >>> from ITMO_FS.filters.multivariate import FCBFDiscreteFilter
30 |     >>> import numpy as np
31 |     >>> X = np.array([[1, 2, 3, 3, 1], [2, 2, 3, 3, 2], [1, 3, 3, 1, 3],
32 |     ... 
[3, 1, 3, 1, 4], [4, 4, 3, 1, 5]]) 33 | >>> y = np.array([1, 2, 3, 4, 5]) 34 | >>> fcbf = FCBFDiscreteFilter().fit(X, y) 35 | >>> fcbf.selected_features_ 36 | array([4], dtype=int64) 37 | """ 38 | def __init__(self, delta=0.1): 39 | self.delta = delta 40 | 41 | def _fit(self, x, y): 42 | """Fit the filter. 43 | 44 | Parameters 45 | ---------- 46 | x : array-like, shape (n_samples, n_features) 47 | The training input samples. 48 | y : array-like, shape (n_samples,) 49 | The target values. 50 | 51 | Returns 52 | ------- 53 | None 54 | """ 55 | def __SU(x, y, entropy_y): 56 | entropy_x = entropy(x) 57 | return 2 * ((entropy_x - conditional_entropy(y, x)) 58 | / (entropy_x + entropy_y)) 59 | 60 | free_features = generate_features(x) 61 | self.selected_features_ = np.array([], dtype='int') 62 | entropy_y = entropy(y) 63 | getLogger(__name__).info("Entropy of y: %f", entropy_y) 64 | 65 | su_class = np.apply_along_axis(__SU, 0, x, y, entropy_y) 66 | getLogger(__name__).info("SU values against y: %s", su_class) 67 | self.selected_features_ = np.argsort(su_class)[::-1][: 68 | np.count_nonzero(su_class > self.delta)] 69 | getLogger(__name__).info("Selected set: %s", self.selected_features_) 70 | 71 | index = 1 72 | while index < self.selected_features_.shape[0]: 73 | feature = self.selected_features_[index - 1] 74 | getLogger(__name__).info("Leading feature: %d", feature) 75 | entropy_feature = entropy(x[:, feature]) 76 | getLogger(__name__).info( 77 | "Leading feature entropy: %f", entropy_feature) 78 | su_classes = su_class[self.selected_features_[index:]] 79 | getLogger(__name__).info( 80 | "SU values against y for the remaining features: %s", 81 | su_classes) 82 | su_feature = np.apply_along_axis( 83 | __SU, 0, x[:, self.selected_features_[index:]], x[:, feature], 84 | entropy_feature) 85 | getLogger(__name__).info( 86 | "SU values against leading feature for the remaining features: " 87 | "%s", su_feature) 88 | to_delete = np.flatnonzero(su_feature >= su_classes) + index 89 | getLogger(__name__).info( 90 | "Deleting those features from the selected set: %s", 91 | self.selected_features_[to_delete]) 92 | self.selected_features_ = np.delete( 93 | self.selected_features_, to_delete) 94 | index += 1 95 | -------------------------------------------------------------------------------- /ITMO_FS/filters/multivariate/MultivariateFilter.py: -------------------------------------------------------------------------------- 1 | from logging import getLogger 2 | 3 | import numpy as np 4 | from sklearn.base import TransformerMixin 5 | 6 | from .measures import (MEASURE_NAMES, mutual_information, 7 | matrix_mutual_information) 8 | from ...utils import BaseTransformer, generate_features 9 | 10 | 11 | class MultivariateFilter(BaseTransformer): 12 | """Provides basic functionality for multivariate filters. 13 | 14 | Parameters 15 | ---------- 16 | measure : string or callable 17 | A metric name defined in GLOB_MEASURE or a callable with signature 18 | measure(selected_features, free_features, dataset, labels) which 19 | should return a list of metric values for each feature in the dataset. 20 | n_features : int 21 | Number of features to select. 22 | beta : float, optional 23 | Initialize only in case you run MIFS or generalizedCriteria metrics. 24 | gamma : float, optional 25 | Initialize only in case you run generalizedCriteria metric. 
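In short, the greedy loop in _fit evaluates, for every free feature f,

    value(f) = measure(selected, free, X, y,
                       relevance=I(f; y),
                       redundancy=sum of I(f; s) over selected features s)

and moves the argmax into the selected set, repeating until n_features
are chosen; this is a summary of the code below, not additional API.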
26 | 
27 |     See Also
28 |     --------
29 | 
30 |     Examples
31 |     --------
32 |     >>> from ITMO_FS.filters.multivariate import MultivariateFilter
33 |     >>> from sklearn.preprocessing import KBinsDiscretizer
34 |     >>> import numpy as np
35 |     >>> est = KBinsDiscretizer(n_bins=10, encode='ordinal')
36 |     >>> x = np.array([[1, 2, 3, 3, 1], [2, 2, 3, 3, 2], [1, 3, 3, 1, 3],
37 |     ...               [3, 1, 3, 1, 4], [4, 4, 3, 1, 5]])
38 |     >>> y = np.array([1, 2, 3, 4, 5])
39 |     >>> data = est.fit_transform(x)
40 |     >>> model = MultivariateFilter('JMI', 3).fit(data, y)
41 |     >>> model.selected_features_
42 |     array([4, 0, 1], dtype=int64)
43 |     """
44 |     def __init__(self, measure, n_features, beta=None, gamma=None):
45 |         self.measure = measure
46 |         self.n_features = n_features
47 |         self.beta = beta
48 |         self.gamma = gamma
49 | 
50 |     def _fit(self, X, y, **kwargs):
51 |         """Fit the filter.
52 | 
53 |         Parameters
54 |         ----------
55 |         X : array-like, shape (n_samples, n_features)
56 |             The training input samples.
57 |         y : array-like, shape (n_samples,)
58 |             The target values.
59 |         **kwargs
60 | 
61 |         Returns
62 |         -------
63 |         None
64 |         """
65 |         if isinstance(self.measure, str):
66 |             try:
67 |                 measure = MEASURE_NAMES[self.measure]
68 |             except KeyError:
69 |                 getLogger(__name__).error("No %r measure yet", self.measure)
70 |                 raise KeyError("No %r measure yet" % self.measure)
71 |         else:
72 |             measure = self.measure  # a callable measure was passed directly
73 | 
74 |         getLogger(__name__).info(
75 |             "Using MultivariateFilter with %s measure", measure)
76 |         free_features = generate_features(X)
77 |         self.selected_features_ = np.array([], dtype='int')
78 | 
79 |         relevance = np.apply_along_axis(
80 |             mutual_information, 0, X[:, free_features], y)
81 |         getLogger(__name__).info("Relevance vector: %s", relevance)
82 | 
83 |         redundancy = np.vectorize(
84 |             lambda free_feature: matrix_mutual_information(
85 |                 X[:, free_features], X[:, free_feature]),
86 |             signature='()->(1)')(free_features)
87 |         getLogger(__name__).info("Redundancy vector: %s", redundancy)
88 | 
89 |         while len(self.selected_features_) != self.n_features:
90 |             if self.beta is None:
91 |                 values = measure(
92 |                     self.selected_features_, free_features, X, y,
93 |                     relevance=relevance[free_features],
94 |                     redundancy=np.sum(
95 |                         redundancy[self.selected_features_],
96 |                         axis=0)[free_features])
97 |             else:
98 |                 if self.gamma is not None:
99 |                     values = measure(
100 |                         self.selected_features_, free_features, X, y, self.beta,
101 |                         self.gamma, relevance=relevance[free_features],
102 |                         redundancy=np.sum(
103 |                             redundancy[self.selected_features_],
104 |                             axis=0)[free_features])
105 |                 else:
106 |                     values = measure(
107 |                         self.selected_features_, free_features, X, y, self.beta,
108 |                         relevance=relevance[free_features],
109 |                         redundancy=np.sum(
110 |                             redundancy[self.selected_features_],
111 |                             axis=0)[free_features])
112 | 
113 |             getLogger(__name__).info("Free features: %s", free_features)
114 |             getLogger(__name__).info("Measure values: %s", values)
115 |             to_add = np.argmax(values)
116 |             getLogger(__name__).info(
117 |                 "Adding feature %d to the selected set", free_features[to_add])
118 |             self.selected_features_ = np.append(
119 |                 self.selected_features_, free_features[to_add])
120 |             free_features = np.delete(free_features, to_add)
--------------------------------------------------------------------------------
/ITMO_FS/filters/multivariate/STIR.py:
--------------------------------------------------------------------------------
1 | from logging import getLogger
2 | 
3 | import numpy as np
4 | from sklearn.metrics import pairwise_distances
5 | from sklearn.preprocessing import MinMaxScaler
6 | 
7 
| from ...utils import knn_from_class, BaseTransformer 8 | 9 | 10 | class STIR(BaseTransformer): 11 | """Feature selection using STIR algorithm. 12 | 13 | Parameters 14 | ---------- 15 | n_features : int 16 | Number of features to select. 17 | metric : str or callable 18 | Distance metric to use in kNN. If str, should be one of the standard 19 | distance metrics (e.g. 'euclidean' or 'manhattan'). If callable, should 20 | have the signature metric(x1 (array-like, shape (n,)), x2 (array-like, 21 | shape (n,))) that should return the distance between two vectors. 22 | k : int 23 | Number of constant nearest hits/misses. 24 | 25 | Notes 26 | ----- 27 | For more details see `this paper `_. 28 | 29 | Examples 30 | -------- 31 | >>> from ITMO_FS.filters.multivariate import STIR 32 | >>> import numpy as np 33 | >>> X = np.array([[3, 3, 3, 2, 2], [3, 3, 1, 2, 3], [1, 3, 5, 1, 1], 34 | ... [3, 1, 4, 3, 1], [3, 1, 2, 3, 1]]) 35 | >>> y = np.array([1, 2, 2, 1, 2]) 36 | >>> model = STIR(2).fit(X, y) 37 | >>> model.selected_features_ 38 | array([2, 0], dtype=int64) 39 | """ 40 | def __init__(self, n_features, metric='manhattan', k=1): 41 | self.n_features = n_features 42 | self.metric = metric 43 | self.k = k 44 | 45 | def _fit(self, X, y): 46 | """Fit the filter. 47 | 48 | Parameters 49 | ---------- 50 | X : array-like, shape (n_samples, n_features) 51 | The input samples. 52 | y : array-like, shape (n_samples,) 53 | The classes for the samples. 54 | 55 | Returns 56 | ------- 57 | None 58 | """ 59 | n_samples = X.shape[0] 60 | classes, counts = np.unique(y, return_counts=True) 61 | 62 | if np.any(counts <= self.k): 63 | getLogger(__name__).error( 64 | "Cannot select %d nearest neighbors because one of the classes " 65 | "has less than %d samples", self.k, self.k + 1) 66 | raise ValueError( 67 | "Cannot select %d nearest neighbors because one of the classes " 68 | "has less than %d samples" % (self.k, self.k + 1)) 69 | 70 | x_normalized = MinMaxScaler().fit_transform(X) 71 | dm = pairwise_distances(x_normalized, x_normalized, self.metric) 72 | getLogger(__name__).info("Distance matrix: %s", dm) 73 | 74 | indices = np.arange(n_samples) 75 | hits_diffs = np.abs( 76 | np.vectorize( 77 | lambda index: ( 78 | x_normalized[index] 79 | - x_normalized[knn_from_class( 80 | dm, y, index, self.k, y[index])]), 81 | signature='()->(n,m)')(indices)) 82 | getLogger(__name__).info("Hit differences matrix: %s", hits_diffs) 83 | misses_diffs = np.abs( 84 | np.vectorize( 85 | lambda index: ( 86 | x_normalized[index] 87 | - x_normalized[knn_from_class( 88 | dm, y, index, self.k, y[index], anyOtherClass=True)]), 89 | signature='()->(n,m)')(indices)) 90 | getLogger(__name__).info("Miss differences matrix: %s", misses_diffs) 91 | 92 | H = np.mean(hits_diffs, axis=(0,1)) 93 | getLogger(__name__).info("H: %s", H) 94 | M = np.mean(misses_diffs, axis=(0,1)) 95 | getLogger(__name__).info("M: %s", M) 96 | var_H = np.var(hits_diffs, axis=(0,1)) 97 | var_M = np.var(misses_diffs, axis=(0,1)) 98 | 99 | # the 1 / (1 / |M| + 1 / |H|) ^ (1/2) multiplier is constant, we omit it 100 | self.feature_scores_ = ( 101 | (M - H) * np.sqrt(2 * self.k * n_samples - 2) 102 | / (np.sqrt((self.k * n_samples - 1) * (var_H + var_M)) + 1e-15)) 103 | getLogger(__name__).info("Feature scores: %s", self.feature_scores_) 104 | self.selected_features_ = np.argsort(self.feature_scores_)[::-1][ 105 | :self.n_features] 106 | 107 | -------------------------------------------------------------------------------- /ITMO_FS/filters/multivariate/TraceRatioFisher.py: 
-------------------------------------------------------------------------------- 1 | from logging import getLogger 2 | 3 | import numpy as np 4 | from sklearn.metrics.pairwise import pairwise_distances 5 | 6 | from ...utils import BaseTransformer, generate_features 7 | 8 | class TraceRatioFisher(BaseTransformer): 9 | """TraceRatio (similarity-based) feature selection filter performed in a 10 | supervised way, i.e. the Fisher version. 11 | 12 | Parameters 13 | ---------- 14 | n_features : int 15 | Number of features to select. 16 | epsilon : float 17 | Lambda change threshold. 18 | 19 | Notes 20 | ----- 21 | For more details see `this paper 22 | `_. 23 | 24 | Examples 25 | -------- 26 | >>> from ITMO_FS.filters.multivariate import TraceRatioFisher | >>> import numpy as np 27 | >>> x = np.array([[1, 2, 3, 3, 1], [2, 2, 3, 3, 2], [1, 3, 3, 1, 3], 28 | ... [3, 1, 3, 1, 4], [4, 4, 3, 1, 5]]) 29 | >>> y = np.array([1, 2, 1, 1, 2]) 30 | >>> tracer = TraceRatioFisher(3).fit(x, y) 31 | >>> tracer.selected_features_ 32 | array([0, 1, 3], dtype=int64) 33 | """ 34 | def __init__(self, n_features, epsilon=1e-3): 35 | self.n_features = n_features 36 | self.epsilon = epsilon 37 | 38 | def _fit(self, X, y): 39 | """Fit the filter. 40 | 41 | Parameters 42 | ---------- 43 | X : array-like, shape (n_samples, n_features) 44 | The training input samples 45 | y : array-like, shape (n_samples,) 46 | The target values 47 | 48 | Returns 49 | ------- 50 | None 51 | """ 52 | n_samples = X.shape[0] 53 | classes, counts = np.unique(y, return_counts=True) 54 | counts_d = {cl: counts[idx] for idx, cl in enumerate(classes)} 55 | getLogger(__name__).info("Class counts: %s", counts_d) 56 | 57 | A_within = pairwise_distances( 58 | y.reshape(-1, 1), metric=lambda x, y: ( 59 | (x[0] == y[0]) / counts_d[x[0]])) 60 | L_within = np.eye(n_samples) - A_within 61 | getLogger(__name__).info("A_w: %s", A_within) 62 | getLogger(__name__).info("L_w: %s", L_within) 63 | 64 | L_between = A_within - np.ones((n_samples, n_samples)) / n_samples 65 | getLogger(__name__).info("L_b: %s", L_between) 66 | 67 | E = X.T.dot(L_within).dot(X) 68 | B = X.T.dot(L_between).dot(X) 69 | 70 | # we need only diagonal elements for trace calculation 71 | e = np.array(np.diag(E)) 72 | b = np.array(np.diag(B)) 73 | getLogger(__name__).info("E: %s", e) 74 | getLogger(__name__).info("B: %s", b) 75 | lam = 0 76 | prev_lam = -1 77 | while (lam - prev_lam >= self.epsilon): # TODO: optimize 78 | score = b - lam * e 79 | getLogger(__name__).info("Score: %s", score) 80 | self.selected_features_ = np.argsort(score)[::-1][:self.n_features] 81 | getLogger(__name__).info( 82 | "New selected set: %s", self.selected_features_) 83 | prev_lam = lam 84 | lam = (np.sum(b[self.selected_features_]) 85 | / np.sum(e[self.selected_features_])) 86 | getLogger(__name__).info("New lambda: %f", lam) 87 | self.score_ = score 88 | self.lam_ = lam 89 | -------------------------------------------------------------------------------- /ITMO_FS/filters/multivariate/__init__.py: -------------------------------------------------------------------------------- 1 | from .DISRwithMassive import * 2 | from .FCBF import * 3 | from .MultivariateFilter import MultivariateFilter 4 | from .measures import * 5 | from .TraceRatioFisher import TraceRatioFisher 6 | from .STIR import STIR 7 | from .mimaga import MIMAGA 8 | -------------------------------------------------------------------------------- /ITMO_FS/filters/univariate/NDFS.py: -------------------------------------------------------------------------------- 1 | from
logging import getLogger 2 | 3 | import numpy as np 4 | from sklearn.cluster import KMeans 5 | from sklearn.metrics.pairwise import pairwise_distances 6 | from sklearn.neighbors import NearestNeighbors 7 | from sklearn.preprocessing import OneHotEncoder 8 | 9 | from ...utils import l21_norm, matrix_norm, power_neg_half, BaseTransformer 10 | 11 | 12 | class NDFS(BaseTransformer): 13 | """Nonnegative Discriminative Feature Selection algorithm. 14 | 15 | Parameters 16 | ---------- 17 | n_features : int 18 | Number of features to select. 19 | c : int 20 | Amount of clusters to find. 21 | k : int 22 | Amount of nearest neighbors to use while building the graph. 23 | alpha : float 24 | Parameter in the objective function. 25 | beta : float 26 | Regularization parameter in the objective function. 27 | gamma : float 28 | Parameter in the objective function that controls the orthogonality 29 | condition. 30 | sigma : float 31 | Parameter for the weighting scheme. 32 | max_iterations : int 33 | Maximum amount of iterations to perform. 34 | epsilon : positive float 35 | Specifies the needed residual between the target functions from 36 | consecutive iterations. If the residual is smaller than epsilon, the 37 | algorithm is considered to have converged. 38 | 39 | See Also 40 | -------- 41 | http://www.nlpr.ia.ac.cn/2012papers/gjhy/gh27.pdf 42 | 43 | Examples 44 | -------- 45 | >>> from ITMO_FS.filters.univariate import NDFS 46 | >>> import numpy as np 47 | >>> X = np.array([[1, 2, 3, 3, 1], [2, 2, 3, 3, 2], [1, 3, 3, 1, 3], 48 | ... [1, 1, 3, 1, 4], [2, 4, 3, 1, 5]]) 49 | >>> y = np.array([1, 2, 1, 1, 2]) 50 | >>> model = NDFS(3).fit(X, y) 51 | >>> model.selected_features_ 52 | array([0, 3, 4], dtype=int64) 53 | >>> model = NDFS(3).fit(X) 54 | >>> model.selected_features_ 55 | array([3, 4, 1], dtype=int64) 56 | """ 57 | def __init__(self, n_features, c=2, k=3, alpha=1, beta=1, gamma=10e8, 58 | sigma=1, max_iterations=1000, epsilon=1e-5): 59 | self.n_features = n_features 60 | self.c = c 61 | self.k = k 62 | self.alpha = alpha 63 | self.beta = beta 64 | self.gamma = gamma 65 | self.sigma = sigma 66 | self.max_iterations = max_iterations 67 | self.epsilon = epsilon 68 | 69 | def __scheme(self, x1, x2): 70 | return np.exp(-np.linalg.norm(x1 - x2) ** 2 / (self.sigma ** 2)) 71 | 72 | def _fit(self, X, y, **kwargs): 73 | """Fit the filter. 74 | 75 | Parameters 76 | ---------- 77 | X : array-like, shape (n_samples, n_features) 78 | The training input samples. 79 | y : array-like, shape (n_samples,) or (n_samples, n_classes) 80 | The target values or their one-hot encoding that are used to 81 | compute F. If not present, a k-means clusterization algorithm 82 | is used. If present, n_classes should be equal to c. 
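| For example (a sketch, assuming OneHotEncoder is imported from | sklearn.preprocessing), a label vector such as np.array([1, 2, 1]) | corresponds to the one-hot encoding that is built internally: | >>> OneHotEncoder().fit_transform( | ... np.array([1, 2, 1]).reshape(-1, 1)).toarray() # doctest: +SKIP | array([[1., 0.], | [0., 1.], | [1., 0.]])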
83 | 84 | Returns 85 | ------- 86 | None 87 | """ 88 | n_samples = X.shape[0] 89 | 90 | if self.k >= n_samples: 91 | getLogger(__name__).error( 92 | "Cannot select %d nearest neighbors with n_samples = %d", 93 | self.k, n_samples) 94 | raise ValueError( 95 | "Cannot select %d nearest neighbors with n_samples = %d" 96 | % (self.k, n_samples)) 97 | 98 | graph = NearestNeighbors( 99 | n_neighbors=self.k, 100 | algorithm='ball_tree').fit(X).kneighbors_graph().toarray() 101 | graph = np.minimum(1, graph + graph.T) 102 | getLogger(__name__).info("Nearest neighbors graph: %s", graph) 103 | 104 | S = graph * pairwise_distances( 105 | X, metric=lambda x, y: self.__scheme(x, y)) 106 | getLogger(__name__).info("S: %s", S) 107 | A = np.diag(S.sum(axis=0)) 108 | getLogger(__name__).info("A: %s", A) 109 | L = power_neg_half(A).dot(A - S).dot(power_neg_half(A)) 110 | getLogger(__name__).info("L: %s", L) 111 | 112 | if y is not None: 113 | if len(y.shape) == 2: 114 | Y = y 115 | else: 116 | Y = OneHotEncoder().fit_transform(y.reshape(-1, 1)).toarray() 117 | else: 118 | if self.c > n_samples: 119 | getLogger(__name__).error( 120 | "Cannot find %d clusters with n_samples = %d", self.c, 121 | n_samples) 122 | raise ValueError( 123 | "Cannot find %d clusters with n_samples = %d" 124 | % (self.c, n_samples)) 125 | Y = self.__run_kmeans(X) 126 | getLogger(__name__).info("Transformed Y: %s", Y) 127 | F = Y.dot(power_neg_half(Y.T.dot(Y))) 128 | getLogger(__name__).info("F: %s", F) 129 | D = np.eye(self.n_features_) 130 | In = np.eye(n_samples) 131 | Ic = np.eye(Y.shape[1]) 132 | 133 | previous_target = -1 134 | for _ in range(self.max_iterations): 135 | M = (L + self.alpha 136 | * (In - X.dot( 137 | np.linalg.inv(X.T.dot(X) + self.beta * D)).dot(X.T))) 138 | getLogger(__name__).info("M: %s", M) 139 | F = (F * ((self.gamma * F) 140 | / (M.dot(F) + self.gamma * F.dot(F.T).dot(F)))) 141 | getLogger(__name__).info("F: %s", F) 142 | W = np.linalg.inv(X.T.dot(X) + self.beta * D).dot(X.T.dot(F)) 143 | getLogger(__name__).info("W: %s", W) 144 | diag = 2 * matrix_norm(W) 145 | diag[diag < 1e-10] = 1e-10 # prevents division by zero 146 | D = np.diag(1 / diag) 147 | getLogger(__name__).info("D: %s", D) 148 | 149 | target = (np.trace(F.T.dot(L).dot(F)) 150 | + self.alpha * (np.linalg.norm(X.dot(W) - F) ** 2 151 | + self.beta * l21_norm(W)) 152 | + self.gamma * (np.linalg.norm(F.T.dot(F) - Ic) ** 2) / 2) 153 | getLogger(__name__).info("New target value: %f", target) 154 | if abs(target - previous_target) < self.epsilon: 155 | break 156 | previous_target = target 157 | 158 | getLogger(__name__).info("Ended up with W: %s", W) 159 | self.feature_scores_ = matrix_norm(W) 160 | getLogger(__name__).info("Feature scores: %s", self.feature_scores_) 161 | ranking = np.argsort(self.feature_scores_)[::-1] 162 | self.selected_features_ = ranking[:self.n_features] 163 | 164 | def __run_kmeans(self, X): 165 | kmeans = KMeans(n_clusters=self.c, copy_x=True) 166 | kmeans.fit(X) 167 | labels = kmeans.labels_ 168 | getLogger(__name__).info("Labels from KMeans: %s", labels) 169 | return OneHotEncoder().fit_transform(labels.reshape(-1, 1)).toarray() 170 | -------------------------------------------------------------------------------- /ITMO_FS/filters/univariate/RFS.py: -------------------------------------------------------------------------------- 1 | from logging import getLogger 2 | 3 | import numpy as np 4 | from sklearn.preprocessing import OneHotEncoder 5 | 6 | from ...utils import l21_norm, matrix_norm, BaseTransformer 7 | 8 | 9 | class
RFS(BaseTransformer): 10 | """Robust Feature Selection via Joint L2,1-Norms Minimization algorithm. 11 | 12 | Parameters 13 | ---------- 14 | n_features : int 15 | Number of features to select. 16 | gamma : float 17 | Regularization parameter. 18 | max_iterations : int 19 | Maximum amount of iterations to perform. 20 | epsilon : positive float 21 | Specifies the needed residual between the target functions from 22 | consecutive iterations. If the residual is smaller than epsilon, the 23 | algorithm is considered to have converged. 24 | 25 | Notes 26 | ----- 27 | For more details see `this paper 28 | `_. 29 | 30 | Examples 31 | -------- 32 | >>> from ITMO_FS.filters.univariate import RFS 33 | >>> import numpy as np 34 | >>> X = np.array([[1, 2, 3, 3, 1], [2, 2, 3, 3, 2], [1, 3, 3, 1, 3], 35 | ... [1, 1, 3, 1, 4], [2, 4, 3, 1, 5]]) 36 | >>> y = np.array([1, 2, 1, 1, 2]) 37 | >>> model = RFS(2).fit(X, y) 38 | >>> model.selected_features_ 39 | array([0, 3], dtype=int64) 40 | """ 41 | def __init__(self, n_features, gamma=1, max_iterations=1000, epsilon=1e-5): 42 | self.n_features = n_features 43 | self.gamma = gamma 44 | self.max_iterations = max_iterations 45 | self.epsilon = epsilon 46 | 47 | def _fit(self, X, y): 48 | """Fit the filter. 49 | 50 | Parameters 51 | ---------- 52 | X : array-like, shape (n_samples, n_features) 53 | The training input samples. 54 | y : array-like, shape (n_samples,) or (n_samples, n_classes) 55 | The target values or their one-hot encoding. 56 | 57 | Returns 58 | ------- 59 | None 60 | """ 61 | if len(y.shape) == 2: 62 | Y = y 63 | else: 64 | Y = OneHotEncoder().fit_transform(y.reshape(-1, 1)).toarray() 65 | 66 | getLogger(__name__).info("Transformed Y: %s", Y) 67 | n_samples = X.shape[0] 68 | A = np.append(X, self.gamma * np.eye(n_samples), axis=1) 69 | getLogger(__name__).info("A: %s", A) 70 | D = np.eye(n_samples + self.n_features_) 71 | 72 | previous_target = -1 73 | for _ in range(self.max_iterations): 74 | D_inv = np.linalg.inv(D) 75 | U = D_inv.dot(A.T).dot(np.linalg.inv(A.dot(D_inv).dot(A.T))).dot(Y) 76 | getLogger(__name__).info("U: %s", U) 77 | diag = 2 * matrix_norm(U) 78 | diag[diag < 1e-10] = 1e-10 # prevents division by zero 79 | D = np.diag(1 / diag) 80 | getLogger(__name__).info("D: %s", D) 81 | 82 | target = l21_norm(U) 83 | getLogger(__name__).info("New target value: %f", target) 84 | if abs(target - previous_target) < self.epsilon: 85 | break 86 | previous_target = target 87 | 88 | getLogger(__name__).info("Ended up with U: %s", U) 89 | self.feature_scores_ = matrix_norm(U[:self.n_features_]) 90 | getLogger(__name__).info("Feature scores: %s", self.feature_scores_) 91 | ranking = np.argsort(self.feature_scores_)[::-1] 92 | self.selected_features_ = ranking[:self.n_features] 93 | -------------------------------------------------------------------------------- /ITMO_FS/filters/univariate/SPEC.py: -------------------------------------------------------------------------------- 1 | from logging import getLogger 2 | 3 | import numpy as np 4 | from scipy.linalg import eigh 5 | from sklearn.metrics.pairwise import pairwise_distances 6 | 7 | from ...utils import l21_norm, matrix_norm, power_neg_half, BaseTransformer 8 | 9 | 10 | class SPEC(BaseTransformer): 11 | """Spectral Feature Selection algorithm. 12 | 13 | Parameters 14 | ---------- 15 | n_features : int 16 | Number of features to select. 17 | k : int 18 | Amount of clusters to find. 19 | gamma : callable 20 | An "increasing function that penalizes high frequency components".
21 | Default is gamma(x) = x^2. 22 | sigma : float 23 | Parameter for the weighting scheme. 24 | phi_type : int (1, 2 or 3) 25 | Type of feature ranking function to use. 26 | 27 | Notes 28 | ----- 29 | For more details see `this paper `_. 30 | 31 | Examples 32 | -------- 33 | >>> from ITMO_FS.filters.univariate import SPEC 34 | >>> import numpy as np 35 | >>> X = np.array([[1, 2, 3, 3, 1], [2, 2, 3, 3, 2], [1, 3, 3, 1, 3], 36 | ... [1, 1, 3, 1, 4], [2, 4, 3, 1, 5]]) 37 | >>> y = np.array([1, 2, 1, 1, 2]) 38 | >>> model = SPEC(3).fit(X, y) 39 | >>> model.selected_features_ 40 | array([0, 1, 4], dtype=int64) 41 | >>> model = SPEC(3).fit(X) 42 | >>> model.selected_features_ 43 | array([3, 4, 1], dtype=int64) 44 | """ 45 | def __init__(self, n_features, k=2, gamma=(lambda x: x ** 2), sigma=0.5, 46 | phi_type=3): 47 | self.n_features = n_features 48 | self.k = k 49 | self.gamma = gamma 50 | self.sigma = sigma 51 | self.phi_type = phi_type 52 | 53 | def __scheme(self, x1, x2): 54 | return np.exp(-np.linalg.norm(x1 - x2) ** 2 / (2 * self.sigma ** 2)) 55 | 56 | def __phi1(self, cosines, eigvals, k): 57 | return np.sum(cosines * cosines * self.gamma(eigvals)) 58 | 59 | def __phi2(self, cosines, eigvals, k): 60 | return (np.sum(cosines[1:] * cosines[1:] * self.gamma(eigvals[1:])) 61 | / np.sum(cosines[1:] * cosines[1:])) 62 | 63 | def __phi3(self, cosines, eigvals, k): 64 | return np.sum(cosines[1:k] * cosines[1:k] 65 | * (self.gamma(2) - self.gamma(eigvals[1:k]))) 66 | 67 | def _fit(self, X, y): 68 | """Fit the filter. 69 | 70 | Parameters 71 | ---------- 72 | X : array-like, shape (n_samples, n_features) 73 | The training input samples. 74 | y : array-like, shape (n_samples,), optional 75 | The target values. If present, label values are used to 76 | construct the similarity graph and the amount of classes 77 | overrides k. 
78 | 79 | Returns 80 | ------- 81 | None 82 | """ 83 | def calc_weight(f): 84 | f_norm = np.sqrt(D).dot(f) 85 | f_norm /= np.linalg.norm(f_norm) 86 | 87 | cosines = np.apply_along_axis( 88 | lambda vec: np.dot(vec / np.linalg.norm(vec), f_norm), 0, 89 | eigvectors) 90 | return phi(cosines, eigvals, k) 91 | 92 | if self.phi_type == 1: 93 | phi = self.__phi1 94 | elif self.phi_type == 2: 95 | phi = self.__phi2 96 | elif self.phi_type == 3: 97 | phi = self.__phi3 98 | else: 99 | getLogger(__name__).error( 100 | "phi_type should be 1, 2 or 3, %d passed", self.phi_type) 101 | raise ValueError( 102 | "phi_type should be 1, 2 or 3, %d passed" % self.phi_type) 103 | 104 | n_samples = X.shape[0] 105 | 106 | if y is None: 107 | if self.k > n_samples: 108 | getLogger(__name__).error( 109 | "Cannot find %d clusters with n_samples = %d", 110 | self.k, n_samples) 111 | raise ValueError( 112 | "Cannot find %d clusters with n_samples = %d" 113 | % (self.k, n_samples)) 114 | k = self.k 115 | graph = np.ones((n_samples, n_samples)) 116 | W = graph * pairwise_distances( 117 | X, metric=lambda x, y: self.__scheme(x, y)) 118 | else: 119 | values, counts = np.unique(y, return_counts=True) 120 | values_dict = dict(zip(values, counts)) 121 | k = len(values) 122 | W = pairwise_distances( 123 | y.reshape(-1, 1), 124 | metric=lambda x, y: (x[0] == y[0]) / values_dict[x[0]]) 125 | 126 | getLogger(__name__).info("W: %s", W) 127 | 128 | D = np.diag(W.sum(axis=1)) 129 | getLogger(__name__).info("D: %s", D) 130 | L = D - W 131 | getLogger(__name__).info("L: %s", L) 132 | L_norm = power_neg_half(D).dot(L).dot(power_neg_half(D)) 133 | getLogger(__name__).info("Normalized L: %s", L_norm) 134 | eigvals, eigvectors = eigh(a=L_norm) 135 | getLogger(__name__).info( 136 | "Eigenvalues for normalized L: %s, eigenvectors: %s", 137 | eigvals, eigvectors) 138 | 139 | self.feature_scores_ = np.apply_along_axis( 140 | lambda f: calc_weight(f), 0, X) 141 | getLogger(__name__).info("Feature scores: %s", self.feature_scores_) 142 | ranking = np.argsort(self.feature_scores_) 143 | if self.phi_type == 3: 144 | ranking = ranking[::-1] 145 | self.selected_features_ = ranking[:self.n_features] 146 | -------------------------------------------------------------------------------- /ITMO_FS/filters/univariate/UnivariateFilter.py: -------------------------------------------------------------------------------- 1 | from logging import getLogger 2 | 3 | import numpy as np 4 | 5 | from .measures import CR_NAMES, MEASURE_NAMES 6 | from ...utils import (BaseTransformer, generate_features, check_restrictions, 7 | apply_cr) 8 | 9 | 10 | class UnivariateFilter(BaseTransformer): 11 | """Basic interface for using univariate measures for feature selection. 12 | The list of available measures is in ITMO_FS.filters.univariate.measures; 13 | you can also provide your own measure, but it should follow the argument 14 | scheme for measures, i.e. take two arguments (x, y) and return scores for 15 | all the features in dataset x. The same applies to cutting rules. 16 | 17 | Parameters 18 | ---------- 19 | measure : string or callable 20 | A measure name defined in MEASURE_NAMES or a callable with signature 21 | measure(X, y) (sample dataset, labels of dataset samples) which should 22 | return a list of metric values for each feature in the dataset. 23 | cutting_rule : string or callable 24 | A cutting rule name defined in CR_NAMES or a callable with signature 25 | cutting_rule(feature_scores) which should return the indices of the 26 | selected features.
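| For example (a sketch, not a built-in rule), a callable roughly | equivalent to the ("K best", k) rule could be written as | ``lambda scores: np.argsort(scores)[::-1][:k]`` for a fixed ``k``.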
27 | 31 | Examples 32 | -------- 33 | 34 | >>> import numpy as np 35 | >>> from ITMO_FS.filters.univariate import select_k_best 36 | >>> from ITMO_FS.filters.univariate import UnivariateFilter 37 | >>> from ITMO_FS.filters.univariate import f_ratio_measure 38 | >>> x = np.array([[3, 3, 3, 2, 2], [3, 3, 1, 2, 3], [1, 3, 5, 1, 1], 39 | ... [3, 1, 4, 3, 1], [3, 1, 2, 3, 1]]) 40 | >>> y = np.array([1, 3, 2, 1, 2]) 41 | >>> filter = UnivariateFilter(f_ratio_measure, 42 | ... select_k_best(2)).fit(x, y) 43 | >>> filter.selected_features_ 44 | array([4, 2], dtype=int64) 45 | >>> filter.feature_scores_ 46 | array([0.6 , 0.2 , 1. , 0.12, 5.4 ]) 47 | """ 48 | def __init__(self, measure, cutting_rule=("Best by percentage", 1.0)): 49 | self.measure = measure 50 | self.cutting_rule = cutting_rule 51 | 52 | def __apply_ms(self): 53 | if isinstance(self.measure, str): 54 | try: 55 | measure = MEASURE_NAMES[self.measure] 56 | except KeyError: 57 | getLogger(__name__).error("No %s measure yet", self.measure) 58 | raise KeyError("No %s measure yet" % self.measure) 59 | elif hasattr(self.measure, '__call__'): 60 | measure = self.measure 61 | else: 62 | getLogger(__name__).error( 63 | "%s isn't a measure function or string", self.measure) 64 | raise KeyError( 65 | "%s isn't a measure function or string" % self.measure) 66 | return measure 67 | 68 | def _fit(self, X, y, store_scores=True): 69 | """Fit the filter. 70 | 71 | Parameters 72 | ---------- 73 | X : array-like, shape (n_samples, n_features) 74 | The training input samples. 75 | y : array-like, shape (n_samples,) 76 | The target values. 77 | store_scores : boolean, optional 78 | Whether to keep the computed feature scores in the 79 | feature_scores_ attribute for later use; default True. 80 | 81 | Returns 82 | ------- 83 | None 84 | """ 85 | measure = self.__apply_ms() 86 | cutting_rule = apply_cr(self.cutting_rule) 87 | getLogger(__name__).info( 88 | "Using UnivariateFilter with measure %s and cutting rule %s", 89 | measure, cutting_rule) 90 | 91 | check_restrictions(measure.__name__, cutting_rule.__name__) 92 | 93 | feature_scores = measure(X, y) 94 | getLogger(__name__).info("Feature scores: %s", feature_scores) 95 | 96 | if store_scores: 97 | self.feature_scores_ = feature_scores 98 | self.selected_features_ = cutting_rule(feature_scores) 99 | -------------------------------------------------------------------------------- /ITMO_FS/filters/univariate/__init__.py: -------------------------------------------------------------------------------- 1 | from .UnivariateFilter import UnivariateFilter 2 | from .VDM import VDM 3 | from .measures import anova, fit_criterion_measure, f_ratio_measure, \ 4 | gini_index, su_measure, modified_t_score, fechner_corr, \ 5 | information_gain, relief_measure, reliefF_measure, chi2_measure, \ 6 | spearman_corr, pearson_corr, laplacian_score, qpfs_filter, \ 7 | kendall_corr, select_k_best, select_k_worst, select_worst_by_value, \ 8 | select_best_by_value, select_best_percentage, \ 9 | select_worst_percentage 10 | from .NDFS import NDFS 11 | from .RFS import RFS 12 | from .SPEC import SPEC 13 | -------------------------------------------------------------------------------- /ITMO_FS/filters/unsupervised/MCFS.py: -------------------------------------------------------------------------------- 1 | from logging import getLogger 2 | 3 | import numpy as np 4 | from scipy.linalg import eigh 5 | from sklearn.linear_model import Lars 6 | from sklearn.neighbors import NearestNeighbors 7 | from
sklearn.metrics.pairwise import pairwise_distances 8 | 9 | from ...utils import BaseTransformer 10 | 11 | 12 | class MCFS(BaseTransformer): 13 | """Unsupervised Feature Selection for Multi-Cluster Data algorithm. 14 | 15 | Parameters 16 | ---------- 17 | n_features : int 18 | Number of features to select. 19 | k : int 20 | Amount of clusters to find. 21 | p : int 22 | Amount of nearest neighbors to use while building the graph. 23 | scheme : str, either '0-1', 'heat' or 'dot' 24 | Weighting scheme to use while building the graph. 25 | sigma : float 26 | Parameter for heat weighting scheme. Ignored if scheme is not 'heat'. 27 | full_graph : boolean 28 | If True, connect all vertices in the graph to each other instead of 29 | running the k-nearest neighbors algorithm. Use with 'heat' or 'dot' 30 | schemes. 31 | 32 | Notes 33 | ----- 34 | For more details see `this paper 35 | `_. 36 | 37 | Examples 38 | -------- 39 | >>> from ITMO_FS.filters.unsupervised import MCFS 40 | >>> from sklearn.datasets import make_classification 41 | >>> import numpy as np 42 | >>> dataset = make_classification(n_samples=500, n_features=100, 43 | ... n_informative=5, n_redundant=0, random_state=42, shuffle=False) 44 | >>> X, y = np.array(dataset[0]), np.array(dataset[1]) 45 | >>> model = MCFS(5).fit(X) 46 | >>> model.selected_features_ 47 | array([0, 2, 4, 1, 3], dtype=int64) 48 | """ 49 | def __init__(self, n_features, k=2, p=3, scheme='dot', sigma=1, 50 | full_graph=False): 51 | self.n_features = n_features 52 | self.k = k 53 | self.p = p 54 | self.scheme = scheme 55 | self.sigma = sigma 56 | self.full_graph = full_graph 57 | 58 | def __scheme_01(self, x1, x2): 59 | return 1 60 | 61 | def __scheme_heat(self, x1, x2): 62 | return np.exp(-np.linalg.norm(x1 - x2) ** 2 / self.sigma) 63 | 64 | def __scheme_dot(self, x1, x2): 65 | return (x1 / np.linalg.norm(x1 + 1e-10)).dot( 66 | x2 / np.linalg.norm(x2 + 1e-10)) 67 | 68 | def _fit(self, X, y): 69 | """ 70 | Fits the filter. 71 | 72 | Parameters 73 | ---------- 74 | X : array-like, shape (n_samples, n_features) 75 | The training input samples. 76 | y : array-like 77 | The target values (ignored). 
78 | 79 | Returns 80 | ---------- 81 | None 82 | """ 83 | if self.scheme == '0-1': 84 | scheme = self.__scheme_01 85 | elif self.scheme == 'heat': 86 | scheme = self.__scheme_heat 87 | elif self.scheme == 'dot': 88 | scheme = self.__scheme_dot 89 | else: 90 | getLogger(__name__).error( 91 | "scheme should be either '0-1', 'heat' or 'dot'; %s passed", 92 | self.scheme) 93 | raise KeyError( 94 | "scheme should be either '0-1', 'heat' or 'dot'; %s passed" 95 | % self.scheme) 96 | 97 | n_samples = X.shape[0] 98 | 99 | 100 | if self.k > n_samples: 101 | getLogger(__name__).error( 102 | "Cannot find %d clusters with n_samples = %d", 103 | self.k, n_samples) 104 | raise ValueError( 105 | "Cannot find %d clusters with n_samples = %d" 106 | % (self.k, n_samples)) 107 | 108 | if self.p >= n_samples: 109 | getLogger(__name__).error( 110 | "Cannot select %d nearest neighbors with n_samples = %d", 111 | self.p, n_samples) 112 | raise ValueError( 113 | "Cannot select %d nearest neighbors with n_samples = %d" 114 | % (self.p, n_samples)) 115 | 116 | if self.full_graph: 117 | graph = np.ones((n_samples, n_samples)) 118 | else: 119 | graph = NearestNeighbors(n_neighbors=self.p, 120 | algorithm='ball_tree').fit(X).kneighbors_graph().toarray() 121 | graph = np.minimum(1, graph + graph.T) 122 | 123 | getLogger(__name__).info("Nearest neighbors graph: %s", graph) 124 | 125 | W = graph * pairwise_distances(X, metric=lambda x, y: scheme(x, y)) 126 | getLogger(__name__).info("W: %s", W) 127 | D = np.diag(W.sum(axis=0)) 128 | getLogger(__name__).info("D: %s", D) 129 | L = D - W 130 | getLogger(__name__).info("L: %s", L) 131 | eigvals, Y = eigh(type=1, a=L, b=D, subset_by_index=[1, self.k]) 132 | getLogger(__name__).info("Eigenvalues: %s, classes: %s", eigvals, Y) 133 | 134 | weights = np.zeros((self.n_features_, self.k)) 135 | for i in range(self.k): 136 | clf = Lars(n_nonzero_coefs=self.n_features) 137 | clf.fit(X, Y[:, i]) 138 | weights[:, i] = np.abs(clf.coef_) 139 | getLogger(__name__).info( 140 | "Weights for eigenvalue %d: %s", i, weights[:, i]) 141 | 142 | self.feature_scores_ = weights.max(axis=1) 143 | getLogger(__name__).info("Feature scores: %s", self.feature_scores_) 144 | ranking = np.argsort(self.feature_scores_)[::-1] 145 | self.selected_features_ = ranking[:self.n_features] 146 | -------------------------------------------------------------------------------- /ITMO_FS/filters/unsupervised/UDFS.py: -------------------------------------------------------------------------------- 1 | from logging import getLogger 2 | 3 | import numpy as np 4 | from scipy.linalg import eigh 5 | from sklearn.neighbors import NearestNeighbors 6 | 7 | from ...utils import l21_norm, matrix_norm, BaseTransformer 8 | 9 | 10 | class UDFS(BaseTransformer): 11 | """Unsupervised Discriminative Feature Selection algorithm. 12 | 13 | Parameters 14 | ---------- 15 | n_features : int 16 | Number of features to select. 17 | c : int 18 | Amount of clusters to find. 19 | k : int 20 | Amount of nearest neighbors to use while building the graph. 21 | gamma : float 22 | Regularization term in the target function. 23 | l : float 24 | Parameter that controls the invertibility of the matrix used in 25 | computing of B. 26 | max_iterations : int 27 | Maximum amount of iterations to perform. 28 | epsilon : positive float 29 | Specifies the needed residual between the target functions from 30 | consecutive iterations. If the residual is smaller than epsilon, the 31 | algorithm is considered to have converged. 
32 | 33 | Notes 34 | ----- 35 | For more details see `this paper `_. 36 | 37 | Examples 38 | -------- 39 | >>> from ITMO_FS.filters.unsupervised import UDFS 40 | >>> from sklearn.datasets import make_classification 41 | >>> import numpy as np 42 | >>> dataset = make_classification(n_samples=500, n_features=100, 43 | ... n_informative=5, n_redundant=0, random_state=42, shuffle=False, 44 | ... n_clusters_per_class=1) 45 | >>> X, y = np.array(dataset[0]), np.array(dataset[1]) 46 | >>> model = UDFS(5).fit(X) 47 | >>> model.selected_features_ 48 | array([ 2, 3, 19, 90, 92], dtype=int64) 49 | """ 50 | def __init__(self, n_features, c=2, k=3, gamma=1, l=1e-6, 51 | max_iterations=1000, epsilon=1e-5): 52 | self.n_features = n_features 53 | self.c = c 54 | self.k = k 55 | self.gamma = gamma 56 | self.l = l 57 | self.max_iterations = max_iterations 58 | self.epsilon = epsilon 59 | 60 | def _fit(self, X, y): 61 | """Fit the filter. 62 | 63 | Parameters 64 | ---------- 65 | X : array-like, shape (n_samples, n_features) 66 | The training input samples. 67 | y : array-like 68 | The target values (ignored). 69 | 70 | Returns 71 | ------- 72 | None 73 | """ 74 | def construct_S(arr): 75 | S = np.zeros((n_samples, self.k + 1)) 76 | for idx in range(self.k + 1): 77 | S[arr[idx], idx] = 1 78 | return S 79 | 80 | n_samples = X.shape[0] 81 | 82 | if self.c > n_samples: 83 | getLogger(__name__).error( 84 | "Cannot find %d clusters with n_samples = %d", 85 | self.c, n_samples) 86 | raise ValueError( 87 | "Cannot find %d clusters with n_samples = %d" 88 | % (self.c, n_samples)) 89 | 90 | if self.k >= n_samples: 91 | getLogger(__name__).error( 92 | "Cannot select %d nearest neighbors with n_samples = %d", 93 | self.k, n_samples) 94 | raise ValueError( 95 | "Cannot select %d nearest neighbors with n_samples = %d" 96 | % (self.k, n_samples)) 97 | 98 | indices = list(range(n_samples)) 99 | I = np.eye(self.k + 1) 100 | H = I - np.ones((self.k + 1, self.k + 1)) / (self.k + 1) 101 | 102 | neighbors = NearestNeighbors( 103 | n_neighbors=self.k + 1, 104 | algorithm='ball_tree').fit(X).kneighbors(X, return_distance=False) 105 | getLogger(__name__).info("Neighbors graph: %s", neighbors) 106 | X_centered = np.apply_along_axis( 107 | lambda arr: X[arr].T.dot(H), 1, neighbors) 108 | 109 | S = np.apply_along_axis(lambda arr: construct_S(arr), 1, neighbors) 110 | getLogger(__name__).info("S: %s", S) 111 | B = np.vectorize( 112 | lambda idx: np.linalg.inv(X_centered[idx].T.dot(X_centered[idx]) 113 | + self.l * I), 114 | signature='()->(1,1)')(indices) 115 | getLogger(__name__).info("B: %s", B) 116 | Mi = np.vectorize( 117 | lambda idx: S[idx].dot(H).dot(B[idx]).dot(H).dot(S[idx].T), 118 | signature='()->(1,1)')(indices) 119 | M = X.T.dot(Mi.sum(axis=0)).dot(X) 120 | getLogger(__name__).info("M: %s", M) 121 | 122 | D = np.eye(self.n_features_) 123 | previous_target = -1 124 | for step in range(self.max_iterations): 125 | P = M + self.gamma * D 126 | getLogger(__name__).info("P: %s", P) 127 | _, W = eigh(a=P, subset_by_index=[0, self.c - 1]) 128 | getLogger(__name__).info("W: %s", W) 129 | diag = 2 * matrix_norm(W) 130 | diag[diag < 1e-10] = 1e-10 # prevents division by zero 131 | D = np.diag(1 / diag) 132 | getLogger(__name__).info("D: %s", D) 133 | 134 | target = np.trace(W.T.dot(M).dot(W)) + self.gamma * l21_norm(W) 135 | getLogger(__name__).info("New target value: %f", target) 136 | if abs(target - previous_target) < self.epsilon: 137 | break 138 | previous_target = target 139 | 140 | getLogger(__name__).info("Ended up with W = 
%s", W) 141 | self.feature_scores_ = matrix_norm(W) 142 | getLogger(__name__).info("Feature scores: %s", self.feature_scores_) 143 | ranking = np.argsort(self.feature_scores_)[::-1] 144 | self.selected_features_ = ranking[:self.n_features] 145 | -------------------------------------------------------------------------------- /ITMO_FS/filters/unsupervised/__init__.py: -------------------------------------------------------------------------------- 1 | from .MCFS import MCFS 2 | from .UDFS import UDFS 3 | from .trace_ratio_laplacian import TraceRatioLaplacian 4 | -------------------------------------------------------------------------------- /ITMO_FS/filters/unsupervised/trace_ratio_laplacian.py: -------------------------------------------------------------------------------- 1 | from logging import getLogger 2 | 3 | import numpy as np 4 | from sklearn.metrics.pairwise import pairwise_distances 5 | from sklearn.neighbors import NearestNeighbors 6 | 7 | from ...utils import BaseTransformer 8 | 9 | class TraceRatioLaplacian(BaseTransformer): 10 | """TraceRatio (similarity-based) feature selection filter performed in an 11 | unsupervised way, i.e. the Laplacian version. 12 | 13 | Parameters 14 | ---------- 15 | n_features : int 16 | Amount of features to select. 17 | k : int 18 | Amount of nearest neighbors to use while building the graph. 19 | t : int 20 | Constant for the kernel function calculation. 21 | epsilon : float 22 | Lambda change threshold. 23 | 24 | Notes 25 | ----- 26 | For more details see `this paper `_. 27 | 28 | Examples 29 | -------- 30 | >>> from ITMO_FS.filters.unsupervised import TraceRatioLaplacian 31 | >>> import numpy as np 32 | >>> X = np.array([[1, 2, 3, 3, 1], [2, 2, 3, 3, 2], [1, 3, 3, 1, 3], 33 | ... [1, 1, 3, 1, 4], [2, 4, 3, 1, 5]]) 34 | >>> y = np.array([1, 2, 1, 1, 2]) 35 | >>> tracer = TraceRatioLaplacian(2, k=2).fit(X) 36 | >>> tracer.selected_features_ 37 | array([3, 1], dtype=int64) 38 | """ 39 | def __init__(self, n_features, k=5, t=1, epsilon=1e-3): 40 | self.n_features = n_features 41 | self.k = k 42 | self.t = t 43 | self.epsilon = epsilon 44 | 45 | def _fit(self, X, y): 46 | """Fit the filter. 
47 | 48 | Parameters 49 | ---------- 50 | X : array-like, shape (n_samples, n_features) 51 | The training input samples 52 | y : array-like, shape (n_samples,) 53 | The target values (ignored) 54 | 55 | Returns 56 | ------- 57 | None 58 | """ 59 | n_samples = X.shape[0] 60 | 61 | if self.k >= n_samples: 62 | getLogger(__name__).error( 63 | "Cannot select %d nearest neighbors with n_samples = %d", 64 | self.k, n_samples) 65 | raise ValueError( 66 | "Cannot select %d nearest neighbors with n_samples = %d" 67 | % (self.k, n_samples)) 68 | 69 | graph = NearestNeighbors( 70 | n_neighbors=self.k, 71 | algorithm='ball_tree').fit(X).kneighbors_graph().toarray() 72 | graph = np.minimum(1, graph + graph.T) 73 | getLogger(__name__).info("Nearest neighbors graph: %s", graph) 74 | 75 | A_within = graph * pairwise_distances( 76 | X, metric=lambda x, y: np.exp(-np.linalg.norm(x - y) ** 2 / self.t)) 77 | getLogger(__name__).info("A_within: %s", A_within) 78 | D_within = np.diag(A_within.sum(axis=1)) 79 | getLogger(__name__).info("D_within: %s", D_within) 80 | L_within = D_within - A_within 81 | getLogger(__name__).info("L_within: %s", L_within) 82 | A_between = (D_within.dot(np.ones((n_samples, n_samples))).dot(D_within) 83 | / np.sum(D_within)) 84 | getLogger(__name__).info("A_between: %s", A_between) 85 | D_between = np.diag(A_between.sum(axis=1)) 86 | getLogger(__name__).info("D_between: %s", D_between) 87 | L_between = D_between - A_between 88 | getLogger(__name__).info("L_between: %s", L_between) 89 | 90 | E = X.T.dot(L_within).dot(X) 91 | B = X.T.dot(L_between).dot(X) 92 | 93 | # we need only diagonal elements for trace calculation 94 | e = np.array(np.diag(E)) 95 | b = np.array(np.diag(B)) 96 | getLogger(__name__).info("E: %s", e) 97 | getLogger(__name__).info("B: %s", b) 98 | lam = 0 99 | prev_lam = -1 100 | while lam - prev_lam >= self.epsilon: # TODO: optimize 101 | score = b - lam * e 102 | getLogger(__name__).info("Score: %s", score) 103 | self.selected_features_ = np.argsort(score)[::-1][:self.n_features] 104 | getLogger(__name__).info( 105 | "New selected set: %s", self.selected_features_) 106 | prev_lam = lam 107 | lam = (np.sum(b[self.selected_features_]) 108 | / np.sum(e[self.selected_features_])) 109 | getLogger(__name__).info("New lambda: %f", lam) 110 | self.score_ = score 111 | self.lam_ = lam 112 | -------------------------------------------------------------------------------- /ITMO_FS/hybrid/Melif.py: -------------------------------------------------------------------------------- 1 | from logging import getLogger 2 | 3 | import numpy as np 4 | from sklearn.base import clone 5 | from sklearn.model_selection import cross_val_score 6 | 7 | from ITMO_FS.ensembles import WeightBased 8 | from ITMO_FS.utils import BaseWrapper, apply_cr 9 | from ITMO_FS.utils.data_check import * 10 | 11 | 12 | class Melif(BaseWrapper): 13 | """MeLiF algorithm. 14 | 15 | Parameters 16 | ---------- 17 | estimator : object 18 | A supervised learning estimator that should have a fit(X, y) method and 19 | a predict(X) method. 20 | measure : string or callable 21 | A standard estimator metric (e.g. 'f1' or 'roc_auc') or a callable with 22 | signature measure(estimator, X, y) which should return only a single 23 | value. 24 | cutting_rule : string or callable 25 | A cutting rule name defined in CR_NAMES or a callable with signature 26 | cutting_rule(feature_scores), which should return the indices of the 27 | selected features. 28 | filter_ensemble : object 29 | A filter ensemble (e.g. 
WeightBased) or a list of filters that will be 30 | used to create a WeightBased ensemble. 31 | delta : float 32 | The step in coordinate descent. 33 | points : array-like 34 | An array of starting points in the search. 35 | seed : int 36 | Random seed used to initialize np.random.default_rng(). 37 | cv : int 38 | Number of folds in cross-validation. 39 | 40 | Notes 41 | ----- 42 | For more details see `this paper `_. 43 | 44 | Examples 45 | -------- 46 | >>> from ITMO_FS.hybrid import Melif 47 | >>> from ITMO_FS.filters.univariate import UnivariateFilter 48 | >>> from sklearn.datasets import make_classification 49 | >>> from sklearn.preprocessing import KBinsDiscretizer 50 | >>> from sklearn.linear_model import LogisticRegression | >>> import numpy as np 51 | >>> dataset = make_classification(n_samples=100, n_features=20, 52 | ... n_informative=5, n_redundant=0, shuffle=False, random_state=42) 53 | >>> x, y = np.array(dataset[0]), np.array(dataset[1]) 54 | >>> x = KBinsDiscretizer(n_bins=10, encode='ordinal', 55 | ... strategy='uniform').fit_transform(x) 56 | >>> filters = [UnivariateFilter('GiniIndex'), 57 | ... UnivariateFilter('FechnerCorr'), 58 | ... UnivariateFilter('SpearmanCorr'), 59 | ... UnivariateFilter('PearsonCorr')] 60 | >>> algo = Melif(LogisticRegression(), 'f1_macro', ("K best", 5), 61 | ... filters, delta=0.5).fit(x, y) 62 | >>> algo.selected_features_ 63 | array([ 3, 4, 1, 13, 16], dtype=int64) 64 | """ 65 | def __init__(self, estimator, measure, cutting_rule, filter_ensemble, 66 | delta=0.5, points=None, seed=42, cv=3): 67 | self.estimator = estimator 68 | self.measure = measure 69 | self.cutting_rule = cutting_rule 70 | self.filter_ensemble = filter_ensemble 71 | self.delta = delta 72 | self.points = points 73 | self.seed = seed 74 | self.cv = cv 75 | 76 | def _fit(self, X, y): 77 | """Run the MeLiF algorithm on the specified dataset. 78 | 79 | Parameters 80 | ---------- 81 | X : array-like, shape (n_samples, n_features) 82 | The input samples. 83 | y : array-like, shape (n_samples,) 84 | The classes for the samples. 
85 | 86 | Returns 87 | ------- 88 | None 89 | """ 90 | self._rng = np.random.default_rng(self.seed) 91 | if type(self.filter_ensemble) is list: 92 | self.__ensemble = WeightBased(self.filter_ensemble) 93 | else: 94 | self.__ensemble = clone(self.filter_ensemble) 95 | 96 | self.n_filters = len(self.__ensemble) 97 | self.__filter_weights = np.ones(self.n_filters) / self.n_filters 98 | 99 | check_cutting_rule(self.cutting_rule) 100 | cutting_rule = apply_cr(self.cutting_rule) 101 | getLogger(__name__).info( 102 | "Using MeLiF with ensemble: %s and cutting rule: %s", 103 | self.__ensemble, cutting_rule) 104 | scores = self.__ensemble.get_scores(X, y) 105 | 106 | if self.points is None: 107 | points = np.vstack((self.__filter_weights, np.eye(self.n_filters))) 108 | else: 109 | points = self.points 110 | self.best_point_ = points[0] 111 | 112 | self.best_score_ = 0 113 | for point in points: 114 | getLogger(__name__).info( 115 | "Running coordinate descent from point %s", point) 116 | new_point, new_score = self.__search( 117 | X, y, point, scores, cutting_rule) 118 | getLogger(__name__).info( 119 | "Ended up in point %s with score %f", new_point, new_score) 120 | if new_score > self.best_score_: 121 | self.best_score_ = new_score 122 | self.best_point_ = new_point 123 | getLogger(__name__).info( 124 | "Final best point: %s with score %f", 125 | self.best_point_, self.best_score_) 126 | self.selected_features_ = cutting_rule( 127 | np.dot(scores.T, self.best_point_)) 128 | self._estimator.fit(X[:, self.selected_features_], y) 129 | 130 | def __search(self, X, y, point, scores, cutting_rule): 131 | """Perform a coordinate descent from the given point. 132 | 133 | Parameters 134 | ---------- 135 | X : array-like, shape (n_samples, n_features) 136 | The input samples. 137 | y : array-like, shape (n_samples,) 138 | The classes for the samples. 139 | point : array-like, shape (n_filters,) 140 | The starting point. 141 | scores : array-like, shape (n_filters, n_features) 142 | The scores for the features from all filters. 143 | cutting_rule : callable 144 | The cutting rule to use. 
145 | 146 | Returns 147 | ------- 148 | tuple (array-like, float) : the optimal point and its score 149 | """ 150 | best_point = point 151 | selected_features = cutting_rule(np.dot(scores.T, point)) 152 | best_score = cross_val_score( 153 | self._estimator, X[:, selected_features], y, cv=self.cv, 154 | scoring=self.measure).mean() 155 | delta = np.eye(self.n_filters) * self.delta 156 | changed = True 157 | while changed: 158 | # the original paper descends starting from the first filter; 159 | # we randomize the order instead to avoid local maxima 160 | getLogger(__name__).info( 161 | "Current optimal point: %s with score = %f", 162 | best_point, best_score) 163 | order = self._rng.permutation(self.n_filters) 164 | changed = False 165 | for f in order: 166 | iteration_point_plus = best_point + delta[f] 167 | selected_features = cutting_rule( 168 | np.dot(scores.T, iteration_point_plus)) 169 | score = cross_val_score( 170 | self._estimator, X[:, selected_features], y, cv=self.cv, 171 | scoring=self.measure).mean() 172 | getLogger(__name__).info( 173 | "Trying to move to point %s: score = %f", 174 | iteration_point_plus, score) 175 | if score > best_score: 176 | best_score = score 177 | best_point = iteration_point_plus 178 | changed = True 179 | break 180 | 181 | iteration_point_minus = best_point - delta[f] 182 | selected_features = cutting_rule( 183 | np.dot(scores.T, iteration_point_minus)) 184 | score = cross_val_score( 185 | self._estimator, X[:, selected_features], y, cv=self.cv, 186 | scoring=self.measure).mean() 187 | getLogger(__name__).info( 188 | "Trying to move to point %s: score = %f", 189 | iteration_point_minus, score) 190 | if score > best_score: 191 | best_score = score 192 | best_point = iteration_point_minus 193 | changed = True 194 | break 195 | return best_point, best_score 196 | -------------------------------------------------------------------------------- /ITMO_FS/hybrid/__init__.py: -------------------------------------------------------------------------------- 1 | from .filter_wrapper_hybrid import * 2 | from .Melif import Melif 3 | from .IWSSr_SFLA import IWSSr_SFLA 4 | -------------------------------------------------------------------------------- /ITMO_FS/hybrid/filter_wrapper_hybrid.py: -------------------------------------------------------------------------------- 1 | from logging import getLogger 2 | 3 | from sklearn.base import clone 4 | 5 | from ..utils import BaseTransformer 6 | 7 | class FilterWrapperHybrid(BaseTransformer): 8 | """Perform the filter + wrapper hybrid algorithm by first running the 9 | filter algorithm on the full dataset, leaving the selected features and 10 | running the wrapper algorithm on the cut dataset. 11 | 12 | Parameters 13 | ---------- 14 | filter_ : object 15 | A feature selection model that should have a fit(X, y) method and a 16 | selected_features_ attribute available after fitting. 17 | wrapper : object 18 | A feature selection model that should have a fit(X, y) method, 19 | selected_features_ and best_score_ attributes available after fitting 20 | and a predict(X) method. 21 | 22 | Notes 23 | ----- 24 | This class doesn't require the first algorithm to be a filter (the only 25 | requirements are a fit(X, y) method and a selected_features_ attribute) 26 | but it is recommended to use a fast algorithm first to remove a lot of 27 | unnecessary features before processing the resulting dataset with a more 28 | time-consuming algorithm (e.g. a wrapper). 
29 | 30 | Examples 31 | -------- 32 | >>> import numpy as np 33 | >>> from sklearn.linear_model import LogisticRegression 34 | >>> from ITMO_FS.wrappers.deterministic import BackwardSelection 35 | >>> from ITMO_FS.filters.univariate import UnivariateFilter 36 | >>> from ITMO_FS.hybrid import FilterWrapperHybrid 37 | >>> from sklearn.datasets import make_classification 38 | >>> dataset = make_classification(n_samples=100, n_features=20, 39 | ... n_informative=5, n_redundant=0, shuffle=False, random_state=42) 40 | >>> x, y = np.array(dataset[0]), np.array(dataset[1]) 41 | >>> filter_ = UnivariateFilter('FRatio', ("K best", 10)) 42 | >>> wrapper = BackwardSelection(LogisticRegression(), 5, measure='f1_macro') 43 | >>> model = FilterWrapperHybrid(filter_, wrapper).fit(x, y) 44 | >>> model.selected_features_ 45 | array([ 1, 3, 4, 10, 7], dtype=int64) 46 | """ 47 | def __init__(self, filter_, wrapper): 48 | self.filter_ = filter_ 49 | self.wrapper = wrapper 50 | 51 | def _fit(self, X, y): 52 | """Fit the model. 53 | 54 | Parameters 55 | ---------- 56 | X : array-like, shape (n_samples, n_features) 57 | The input samples. 58 | y : array-like, shape (n_samples,) 59 | The classes for the samples. 60 | 61 | Returns 62 | ------- 63 | None 64 | """ 65 | self._filter = clone(self.filter_) 66 | self._wrapper = clone(self.wrapper) 67 | getLogger(__name__).info( 68 | "Running FilterWrapper with filter = %s, wrapper = %s", 69 | self._filter, self._wrapper) 70 | 71 | selected_filter = self._filter.fit(X, y).selected_features_ 72 | getLogger(__name__).info( 73 | "Features selected by filter: %s", selected_filter) 74 | self.selected_features_ = selected_filter[self._wrapper.fit( 75 | X[:, selected_filter], y).selected_features_] 76 | self.best_score_ = self._wrapper.best_score_ 77 | 78 | def predict(self, X): 79 | """Predict class labels for the input data. 80 | 81 | Parameters 82 | ---------- 83 | X : array-like, shape (n_samples, n_features) 84 | The input samples. 85 | 86 | Returns 87 | ------ 88 | array-like, shape (n_samples,) : class labels 89 | """ 90 | return self._wrapper.predict(X) 91 | -------------------------------------------------------------------------------- /ITMO_FS/utils/__init__.py: -------------------------------------------------------------------------------- 1 | from .data_check import * 2 | from .functions import * 3 | from .information_theory import * 4 | from .qpfs_body import qpfs_body 5 | from .base_transformer import BaseTransformer 6 | from .base_wrapper import BaseWrapper 7 | -------------------------------------------------------------------------------- /ITMO_FS/utils/base_transformer.py: -------------------------------------------------------------------------------- 1 | from abc import abstractmethod 2 | from logging import getLogger 3 | 4 | import numpy as np 5 | import pandas as pd 6 | from sklearn.base import BaseEstimator, TransformerMixin 7 | from sklearn.feature_selection import VarianceThreshold 8 | from sklearn.utils import check_X_y, check_array 9 | from sklearn.utils.validation import check_is_fitted 10 | 11 | 12 | class BaseTransformer(TransformerMixin, BaseEstimator): 13 | def __init__(self): 14 | pass 15 | 16 | def fit(self, X, y=None, **fit_params): 17 | """Fit the algorithm. 18 | 19 | Parameters 20 | ---------- 21 | X : array-like, shape (n_samples, n_features) 22 | The training input samples. 23 | y : array-like, shape (n_samples,), optional 24 | The class labels. 
25 | fit_params : dict, optional 26 | Additional parameters to pass to the underlying _fit function. 27 | 28 | Returns 29 | ------- 30 | Self, i.e. the transformer object. 31 | """ 32 | if y is not None: 33 | X, y = check_X_y(X, y, dtype='numeric') 34 | if y.dtype.kind == 'O': 35 | y = y.astype('int') 36 | else: 37 | X = check_array(X, dtype='float64', accept_large_sparse=False) 38 | 39 | self.n_total_features_ = X.shape[1] 40 | nonconst_features = VarianceThreshold().fit(X).get_support(indices=True) 41 | self.n_features_ = nonconst_features.shape[0] 42 | 43 | if self.n_features_ != self.n_total_features_: 44 | getLogger(__name__).warning( 45 | "Found %d constant features; they will not be used in fit", | self.n_total_features_ - self.n_features_) 46 | 47 | if hasattr(self, 'n_features'): 48 | if self.n_features > self.n_features_: 49 | getLogger(__name__).error( 50 | "Cannot select %d features with n_features = %d", 51 | self.n_features, self.n_features_) 52 | raise ValueError( 53 | "Cannot select %d features with n_features = %d" 54 | % (self.n_features, self.n_features_)) 55 | 56 | if hasattr(self, 'epsilon'): 57 | if self.epsilon <= 0: 58 | getLogger(__name__).error( 59 | "Epsilon should be positive, %f passed", self.epsilon) 60 | raise ValueError( 61 | "Epsilon should be positive, %f passed" % self.epsilon) 62 | 63 | 64 | self._fit(X[:, nonconst_features], y, **fit_params) 65 | 66 | if hasattr(self, 'feature_scores_'): 67 | scores = np.empty(self.n_total_features_) 68 | scores.fill(np.nan) 69 | scores[nonconst_features] = self.feature_scores_ 70 | self.feature_scores_ = scores 71 | self.selected_features_ = nonconst_features[self.selected_features_] 72 | 73 | return self 74 | 75 | def transform(self, X): 76 | """ 77 | Transform given data by slicing it with selected features. 78 | 79 | Parameters 80 | ---------- 81 | X : array-like, shape (n_samples, n_features) 82 | The training input samples. 83 | 84 | Returns 85 | ------- 86 | Transformed 2D numpy array 87 | """ 88 | check_is_fitted(self, 'selected_features_') 89 | X_ = check_array(X, dtype='numeric', accept_large_sparse=False) 90 | if X_.shape[1] != self.n_total_features_: 91 | getLogger(__name__).error( 92 | "Shape of input is different from what was seen in 'fit'") 93 | raise ValueError( 94 | "Shape of input is different from what was seen in 'fit'") 95 | if isinstance(X, pd.DataFrame): 96 | return X[X.columns[self.selected_features_]] 97 | else: 98 | return X_[:, self.selected_features_] 99 | 100 | @abstractmethod 101 | def _fit(self, X, y): 102 | pass 103 | -------------------------------------------------------------------------------- /ITMO_FS/utils/base_wrapper.py: -------------------------------------------------------------------------------- 1 | from logging import getLogger 2 | 3 | from sklearn.base import clone 4 | from sklearn.utils import check_array 5 | from sklearn.utils.validation import check_is_fitted 6 | 7 | from . import BaseTransformer 8 | 9 | class BaseWrapper(BaseTransformer): 10 | def __init__(self): 11 | pass 12 | 13 | def fit(self, X, y=None, **fit_params): 14 | """Fit the algorithm. 15 | 16 | Parameters 17 | ---------- 18 | X : array-like, shape (n_samples, n_features) 19 | The training input samples. 20 | y : array-like, shape (n_samples,), optional 21 | The class labels. 22 | fit_params : dict, optional 23 | Additional parameters to pass to the underlying _fit function. 24 | 25 | Returns 26 | ------- 27 | Self, i.e. the transformer object. 
28 | """ 29 | if not hasattr(self.estimator, 'fit'): 30 | getLogger(__name__).error( 31 | "estimator should be an estimator implementing " 32 | "'fit' method, %s was passed", self.estimator) 33 | raise TypeError( 34 | "estimator should be an estimator implementing " 35 | "'fit' method, %s was passed" % self.estimator) 36 | if not hasattr(self.estimator, 'predict'): 37 | getLogger(__name__).error( 38 | "estimator should be an estimator implementing " 39 | "'predict' method, %s was passed", self.estimator) 40 | raise TypeError( 41 | "estimator should be an estimator implementing " 42 | "'predict' method, %s was passed" % self.estimator) 43 | self._estimator = clone(self.estimator) 44 | 45 | return super().fit(X, y, **fit_params) 46 | 47 | def predict(self, X): 48 | """Predict class labels for the input data. 49 | 50 | Parameters 51 | ---------- 52 | X : array-like, shape (n_samples, n_features) 53 | The input samples. 54 | 55 | Returns 56 | ------- 57 | array-like, shape (n_samples,) : class labels 58 | """ 59 | check_is_fitted(self, 'selected_features_') 60 | X_ = check_array(X, dtype='float64', accept_large_sparse=False) 61 | if X_.shape[1] != self.n_features_: 62 | getLogger(__name__).error( 63 | "Shape of input is different from what was seen in 'fit'") 64 | raise ValueError( 65 | "Shape of input is different from what was seen in 'fit'") 66 | 67 | return self._estimator.predict(X_[:, self.selected_features_]) 68 | -------------------------------------------------------------------------------- /ITMO_FS/utils/data_check.py: -------------------------------------------------------------------------------- 1 | from numpy import array 2 | 3 | 4 | def generate_features(X, features=None): 5 | if features is None: 6 | try: | # pandas DataFrame: use its column labels as the feature names 10 | features = list(X.columns) 11 | except AttributeError: | # plain array: fall back to positional indices 12 | features = [i for i in range(X.shape[1])] 13 | return array(features) 14 | 15 | 16 | def check_filters(filters): 17 | for filter_ in filters: 18 | attr = None 19 | if not hasattr(filter_, 'fit'): 20 | attr = 'fit' 21 | if not hasattr(filter_, 'transform'): 22 | attr = 'transform' 23 | if not hasattr(filter_, 'fit_transform'): 24 | attr = 'fit_transform' 25 | if attr is not None: 26 | raise TypeError( 27 | "filters should be a list of filters each implementing {0} " 28 | "method, {1} was passed".format(attr, filter_)) 29 | 30 | 31 | def check_cutting_rule(cutting_rule): 32 | pass # todo check cutting rule 33 | 34 | 35 | RESTRICTIONS = {'qpfs_filter': {'__select_k'}} 36 | 37 | 38 | def check_restrictions(measure_name, cutting_rule_name): 39 | if (measure_name in RESTRICTIONS.keys() and 40 | cutting_rule_name not in RESTRICTIONS[measure_name]): 41 | raise KeyError( 42 | "This measure %s doesn't support this cutting rule %s" 43 | % (measure_name, cutting_rule_name)) 44 | -------------------------------------------------------------------------------- /ITMO_FS/utils/functions.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from sklearn.metrics import f1_score 3 | from sklearn.metrics.pairwise import euclidean_distances 4 | 5 | 6 | def cartesian(rw, cl): # returns the Cartesian product of two numpy arrays as a pair of aligned arrays 7 | tmp = np.array(np.meshgrid(rw, cl)).T.reshape(len(rw) * len(cl), 2) 8 | return tmp.T[0], tmp.T[1] 9 | 10 | def weight_func(model): # weight function used in MOS testing 11 | return model.coef_[0] 12 | 13 | def f1_scorer(y_true, y_pred): 14 | return 
f1_score(y_true, y_pred, average='micro') 15 | 16 | def augmented_rvalue(X, y, k=7, theta=3): 17 | """Calculate the augmented R-value for a dataset. 18 | 19 | Parameters 20 | ---------- 21 | X : array-like, shape (n_samples, n_features) 22 | The input samples. 23 | y : array-like, shape (n_samples,) 24 | The classes for the samples. 25 | k : int 26 | The amount of nearest neighbors used in the calculation. 27 | theta : int 28 | The threshold value: if from k nearest neighbors of an object more than 29 | theta of them are of a different class, then this object is in the 30 | overlap region. 31 | 32 | Returns 33 | ------- 34 | float - the augmented R-value for the dataset; the value is in the range 35 | [-1, 1]. 36 | 37 | Notes 38 | ----- 39 | For more details see `this paper `_. 40 | """ 41 | unique, counts = np.unique(y, return_counts=True) 42 | freq = sorted(list(zip(unique, counts)), key=lambda x: x[1], reverse=True) 43 | dm = euclidean_distances(X, X) 44 | Rs = [] 45 | Cs = [] 46 | 47 | for label, frequency in freq: 48 | Cs.append(frequency) 49 | count = 0 50 | for elem in [i for i, x in enumerate(y) if x == label]: 51 | nearest = knn_from_class(dm, y, elem, k, 1, anyClass=True) 52 | count += np.sign( 53 | k 54 | - list(map(lambda x: y[x], nearest)).count(label) 55 | - theta) 56 | Rs.append(count / frequency) 57 | Cs = Cs[::-1] 58 | return np.dot(Rs, Cs) / len(X) 59 | 60 | 61 | def knn_from_class(distances, y, index, k, cl, anyOtherClass=False, 62 | anyClass=False): 63 | """Return the indices of k nearest neighbors of X[index] from the selected 64 | class. 65 | 66 | Parameters 67 | ---------- 68 | distances : array-like, shape (n_samples, n_samples) 69 | The distance matrix of the input samples. 70 | y : array-like, shape (n_samples,) 71 | The classes for the samples. 72 | index : int 73 | The index of an element. 74 | k : int 75 | The amount of nearest neighbors to return. 76 | cl : int 77 | The class label for the nearest neighbors. 78 | anyOtherClass : bool | If True, returns neighbors belonging to any class different from | the class of X[index]; the value of cl is then irrelevant. | anyClass : bool 79 | If True, returns neighbors from any class; the value of cl is then 
 irrelevant. 80 | 81 | Returns 82 | ------- 83 | array-like, shape (k,) - the indices of the nearest neighbors 84 | """ 85 | y_c = np.copy(y) 86 | if anyOtherClass: 87 | cl = y_c[index] + 1 88 | y_c[y_c != y_c[index]] = cl 89 | if anyClass: 90 | y_c.fill(cl) 91 | class_indices = np.nonzero(y_c == cl)[0] 92 | distances_class = distances[index][class_indices] 93 | nearest = np.argsort(distances_class) 94 | if y_c[index] == cl: | # X[index] itself is among the candidates (distance 0); drop it 95 | nearest = nearest[1:] 96 | 97 | return class_indices[nearest[:k]] 98 | 99 | def matrix_norm(M): 100 | """Calculate the norm of all rows in the matrix. 101 | 102 | Parameters 103 | ---------- 104 | M : array-like, shape (n, m) 105 | The matrix. 106 | 107 | Returns 108 | ------- 109 | array-like, shape (n,) : the norms for each row in the matrix 110 | """ 111 | return np.sqrt((M * M).sum(axis=1)) 112 | 113 | def l21_norm(M): 114 | """Calculate the L2,1 norm of a matrix. 115 | 116 | Parameters 117 | ---------- 118 | M : array-like, shape (n, m) 119 | The matrix. 120 | 121 | Returns 122 | ------- 123 | float : the L2,1 norm of this matrix 124 | """ 125 | return matrix_norm(M).sum() 126 | 127 | def power_neg_half(M): 128 | """Calculate M ^ (-1/2). 129 | 130 | Parameters 131 | ---------- 132 | M : array-like, shape (n, m) 133 | The matrix. 
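| Note: this helper takes the element-wise square root of the inverse, | which equals the true matrix inverse square root only for diagonal | matrices; within this package it is only applied to diagonal matrices | (degree matrices of similarity graphs). A minimal sketch under that | assumption: | >>> import numpy as np | >>> power_neg_half(np.diag([4., 9.])) # doctest: +SKIP | array([[0.5 , 0. ], | [0. , 0.33333333]])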
134 | 135 | Returns 136 | ------- 137 | array-like, shape (n, m) : M ^ (-1/2) 138 | """ 139 | # element-wise square root of the inverse: this equals the true M ** (-1/2) 140 | # only when M is diagonal (e.g. a degree matrix) 141 | return np.sqrt(np.linalg.inv(M)) 142 | 143 | def apply_cr(cutting_rule): 144 | """Extract the cutting rule from a tuple or callable. 145 | 146 | Parameters 147 | ---------- 148 | cutting_rule : tuple or callable 149 | A (str, float) tuple describing a cutting rule or a callable with 150 | signature cutting_rule(features) which should return a list of features 151 | ranked by some rule. 152 | 153 | Returns 154 | ------- 155 | callable : a cutting rule callable 156 | """ 157 | from ..filters.univariate.measures import CR_NAMES, MEASURE_NAMES 158 | if type(cutting_rule) is tuple: 159 | cutting_rule_name = cutting_rule[0] 160 | cutting_rule_value = cutting_rule[1] 161 | try: 162 | cr = CR_NAMES[cutting_rule_name](cutting_rule_value) 163 | except KeyError: 164 | raise KeyError("No %s cutting rule yet" % cutting_rule_name) 165 | elif hasattr(cutting_rule, '__call__'): 166 | cr = cutting_rule 167 | else: 168 | raise KeyError( 169 | "%s isn't a cutting rule function or string" % cutting_rule) 170 | return cr 171 | -------------------------------------------------------------------------------- /ITMO_FS/utils/qpfs_body.py: -------------------------------------------------------------------------------- 1 | from functools import partial 2 | 3 | import numpy as np 4 | from qpsolvers import solve_qp 5 | from scipy.linalg import sqrtm 6 | 7 | 8 | def qpfs_body(X, y, fn, alpha=None, r=None, sigma=None, solv='quadprog', 9 | metric_for_complex=complex.__abs__): 10 | # TODO understand why complex double appears 11 | # TODO find a suitable value for the r parameter 12 | # TODO find a suitable value for the sigma parameter 13 | if r is None: 14 | r = X.shape[1] - 1 15 | if r >= X.shape[1]: 16 | raise TypeError("r parameter should be less than the number of features") 17 | F = np.zeros(X.shape[1], dtype=np.double)  # the F vector represents how each variable is correlated with the class 18 | class_size = max(y) + 1  # Count the number of classes; we assume that class labels are numbers from 1 to max(y) 19 | priors = np.histogram(y, bins=max(y))[0]  # Count prior probabilities of classes 20 | for i in range(1, class_size):  # Loop through classes 21 | Ck = np.where(y == i, 1, 0)  # Get array C(i) where C(k) is 1 when i = k and 0 otherwise 22 | F += priors[i - 1] * fn(X, Ck)  # Accumulate the F vector 23 | Q = np.apply_along_axis(partial(fn, X), 0, X).reshape(X.shape[1], X.shape[1]) 24 | indices = np.random.randint(0, Q.shape[0], r)  # Take r random indices for the Nystrom approximation (np.random.random_integers is deprecated) 25 | A = Q[indices][:, :r]  # A matrix for Nystrom (real matrix of size [r, r]) 26 | B = Q[indices][:, r:]  # B matrix for Nystrom (real matrix of size [r, M - r]) 27 | if alpha is None: 28 | alpha = __countAlpha(A, B, F)  # Only in the filter method; in the wrapper it should be adapted based on performance 29 | AInvSqrt = sqrtm(np.linalg.pinv(A))  # Calculate the square root of the pseudo-inverse of A 30 | S = np.add(A, AInvSqrt.dot(B).dot(B.T).dot(AInvSqrt))  # Calculate the S matrix 31 | eigvals, EVect = np.linalg.eig(S)  # eigenvalues and eigenvectors of S 32 | U = np.append(A, B.T, axis=0).dot(AInvSqrt).dot(EVect).dot( 33 | sqrtm(np.linalg.pinv(EVect)))  # Eigenvectors of the Q matrix using [A B] 34 | eigvalsFilt, UFilt = __filterBy(sigma, eigvals, 35 | U)  # Keep only eigenvalues greater than the threshold and the corresponding eigenvectors 36 | LFilt = np.zeros((len(eigvalsFilt), len(eigvalsFilt)), dtype=complex)  # initialize the diagonal matrix of eigenvalues 37 | for i in range(len(eigvalsFilt)):  # Loop through eigenvalues 38 | LFilt[i][i] = eigvalsFilt[i]  # Init diagonal values 39 | UFilt = np.array([list(map(metric_for_complex, t)) for t in UFilt]) 40 | LFilt = np.array([list(map(metric_for_complex, t)) for t in LFilt]) 41 | yf = solve_qp((1 - alpha) * LFilt, alpha * F.dot(UFilt), UFilt, np.zeros(UFilt.shape[0]), 42 | solver=solv)  # solve the stated QP problem 43 | xSolution = UFilt.dot(yf)  # Find x - the weights of the features 44 | forRanks = list(zip(xSolution, F, [x for x in range(len(F))]))  # Zip into tuples for sorting 45 | forRanks.sort(reverse=True) 46 | ranks = np.zeros(len(F)) 47 | rankIndex = 1 48 | for i in forRanks: 49 | ranks[int(i[2])] = rankIndex 50 | rankIndex += 1 51 | return ranks 52 | 53 | 54 | def __filterBy(sigma, eigvals, U): 55 | if sigma is None: 56 | return eigvals, U 57 | y = np.where(eigvals > sigma)[0] 58 | return eigvals[y], U[:, y] 59 | 60 | 61 | def __countAlpha(A, B, F): 62 | Comb = B.T.dot(np.linalg.pinv(A)).dot(B) 63 | sumQ = np.sum(A) + 2 * np.sum(B) + np.sum(Comb) 64 | sumQ /= (A.shape[1] + B.shape[1]) ** 2 65 | sumF = np.sum(F) 66 | sumF /= len(F) 67 | return sumQ / (sumQ + sumF)
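68 | 69 | 70 | # Example (illustrative sketch, not a tested code path): rank the features of 71 | # a small dataset with Pearson correlation as the similarity measure. Class 72 | # labels are assumed to be numerical and to start from 1. 73 | # 74 | #     import numpy as np 75 | #     from ITMO_FS.filters.univariate.measures import pearson_corr 76 | #     from ITMO_FS.utils.qpfs_body import qpfs_body 77 | # 78 | #     X = np.array([[3, 3, 3, 2, 2], [3, 3, 1, 2, 3], [1, 3, 5, 1, 1], 79 | #                   [3, 1, 4, 3, 1], [3, 1, 2, 3, 1]]) 80 | #     y = np.array([1, 3, 2, 1, 2]) 81 | #     ranks = qpfs_body(X, y, pearson_corr, alpha=0.5)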
-------------------------------------------------------------------------------- /ITMO_FS/wrappers/__init__.py: -------------------------------------------------------------------------------- 1 | from .deterministic import * 2 | from .randomized import * 3 | -------------------------------------------------------------------------------- /ITMO_FS/wrappers/deterministic/AddDelWrapper.py: -------------------------------------------------------------------------------- 1 | from logging import getLogger 2 | import random as rnd 3 | 4 | import numpy as np 5 | from sklearn.model_selection import cross_val_score 6 | 7 | from ...utils import BaseWrapper, generate_features 8 | 9 | 10 | class AddDelWrapper(BaseWrapper): 11 | """Add-Del feature wrapper. 12 | 13 | Parameters 14 | ---------- 15 | estimator : object 16 | A supervised learning estimator that should have a fit(X, y) method and 17 | a predict(X) method. 18 | measure : string or callable 19 | A standard estimator metric (e.g. 'f1' or 'roc_auc') or a callable with 20 | signature measure(estimator, X, y) which should return only a single 21 | value. 22 | cv : int 23 | Number of folds in cross-validation. 24 | seed : int 25 | Seed for python random. 26 | d : int 27 | Amount of consecutive add and del iterations that can 28 | have a decreasing objective function before the algorithm terminates. 29 | 30 | See Also 31 | -------- 32 | Lecture about feature selection (ru), p.13 - 33 | http://www.ccas.ru/voron/download/Modeling.pdf 34 | 35 | Examples 36 | -------- 37 | >>> from sklearn.datasets import make_classification 38 | >>> from sklearn.linear_model import LogisticRegression 39 | >>> dataset = make_classification(n_samples=100, n_features=20, 40 | ...
n_informative=5, n_redundant=0, shuffle=False, random_state=42) 41 | >>> x, y = np.array(dataset[0]), np.array(dataset[1]) 42 | >>> lg = LogisticRegression(solver='lbfgs') 43 | >>> add_del = AddDelWrapper(lg, 'accuracy').fit(x, y) 44 | >>> add_del.selected_features_ 45 | array([1, 4, 3], dtype=int64) 46 | """ 47 | def __init__(self, estimator, measure, cv=3, seed=42, d=1): 48 | self.estimator = estimator 49 | self.measure = measure 50 | self.cv = cv 51 | self.seed = seed 52 | self.d = d 53 | 54 | def __add(self, X, y, free_features): 55 | """Add features to the selected set one by one until either all of 56 | the features are added or more than d iterations pass without 57 | increasing the objective function. 58 | 59 | Parameters 60 | ---------- 61 | X : array-like, shape (n_samples, n_features) 62 | The training input samples. 63 | y : array-like, shape (n_samples,) 64 | The target values. 65 | free_features : array-like, shape (n_not_selected_features,) 66 | The array of current free features. 67 | 68 | Returns 69 | ------- 70 | array-like, shape (n_new_selected_features,) : selected features; 71 | array-like, shape (n_new_not_selected_features,) : new free features 72 | """ 73 | best_score = self.best_score_ 74 | iteration_features = self.selected_features_ 75 | iteration_free_features = free_features 76 | selected_features = self.selected_features_ 77 | getLogger(__name__).info( 78 | "Trying to add features from free set %s to selected set %s", 79 | free_features, selected_features) 80 | 81 | while (iteration_features.shape[0] - selected_features.shape[0] <= 82 | self.d) & (iteration_free_features.shape[0] != 0): 83 | getLogger(__name__).info( 84 | "Current selected set: %s, best score: %d", 85 | selected_features, best_score) 86 | scores = np.vectorize( 87 | lambda f: cross_val_score( 88 | self._estimator, X[:, np.append(iteration_features, f)], y, 89 | cv=self.cv, scoring=self.measure).mean())( 90 | iteration_free_features) 91 | getLogger(__name__).info("Scores for all free features: %s", scores) 92 | 93 | to_add = np.argmax(scores) 94 | iteration_score = scores[to_add] 95 | getLogger(__name__).info( 96 | "Adding feature %d, new score: %d", 97 | iteration_free_features[to_add], iteration_score) 98 | iteration_features = np.append( 99 | iteration_features, iteration_free_features[to_add]) 100 | iteration_free_features = np.delete(iteration_free_features, to_add) 101 | 102 | if iteration_score > best_score: 103 | selected_features = iteration_features 104 | free_features = iteration_free_features 105 | best_score = iteration_score 106 | 107 | return selected_features, free_features 108 | 109 | def __del(self, X, y, selected_features, free_features): 110 | """Delete features from the selected set one by one until either only 111 | one feature is left or more than d iterations pass without 112 | increasing the objective function. 113 | 114 | Parameters 115 | ---------- 116 | X : array-like, shape (n_samples, n_features) 117 | The training input samples. 118 | y : array-like, shape (n_samples,) 119 | The target values. 120 | selected_features : array-like, shape (n_selected_features,) 121 | The array of current selected features. 122 | free_features : array-like, shape (n_not_selected_features,) 123 | The array of current free features. 
124 | 125 | Returns 126 | ------- 127 | array-like, shape (n_new_selected_features,) : new selected features; 128 | array-like, shape (n_new_not_selected_features,) : new free features; 129 | float : score for the selected feature set 130 | """ 131 | best_score = cross_val_score( 132 | self._estimator, X[:, selected_features], y, scoring=self.measure, 133 | cv=self.cv).mean() 134 | iteration_features = selected_features 135 | iteration_free_features = free_features 136 | getLogger(__name__).info( 137 | "Trying to delete features from selected set %s", selected_features) 138 | 139 | while (selected_features.shape[0] - iteration_features.shape[0] <= 140 | self.d) & (iteration_features.shape[0] != 1): 141 | getLogger(__name__).info( 142 | "Current selected set: %s, best score: %d", 143 | selected_features, best_score) 144 | scores = np.vectorize( 145 | lambda i: cross_val_score( 146 | self._estimator, X[:, np.delete(iteration_features, i)], y, 147 | cv=self.cv, scoring=self.measure).mean())( 148 | np.arange(0, iteration_features.shape[0])) 149 | getLogger(__name__).info( 150 | "Scores for all selected features: %s", scores) 151 | 152 | to_delete = np.argmax(scores) 153 | iteration_score = scores[to_delete] 154 | getLogger(__name__).info( 155 | "Deleting feature %d, new score: %d", 156 | iteration_features[to_delete], iteration_score) 157 | iteration_free_features = np.append( 158 | iteration_free_features, iteration_features[to_delete]) 159 | iteration_features = np.delete(iteration_features, to_delete) 160 | 161 | if iteration_score > best_score: 162 | selected_features = iteration_features 163 | free_features = iteration_free_features 164 | best_score = iteration_score 165 | 166 | return selected_features, free_features, best_score 167 | 168 | def _fit(self, X, y): 169 | """Fit the wrapper. 170 | 171 | Parameters 172 | ---------- 173 | X : array-like, shape (n_samples, n_features) 174 | The training input samples. 175 | y : array-like, shape (n_samples,) 176 | The target values. 177 | 178 | Returns 179 | ------- 180 | None 181 | """ 182 | self.selected_features_ = np.array([], dtype='int') 183 | free_features = generate_features(X) 184 | self.best_score_ = 0 185 | while True: 186 | selected_features, free_features = self.__add(X, y, free_features) 187 | getLogger(__name__).info( 188 | "After add: selected set = %s, free set = %s", 189 | selected_features, free_features) 190 | selected_features, free_features, iteration_score = self.__del( 191 | X, y, selected_features, free_features) 192 | getLogger(__name__).info( 193 | "After del: selected set = %s, free set = %s, score = %d", 194 | selected_features, free_features, iteration_score) 195 | 196 | if iteration_score > self.best_score_: 197 | self.best_score_ = iteration_score 198 | self.selected_features_ = selected_features 199 | else: 200 | break 201 | self._estimator.fit(X[:, self.selected_features_], y) 202 | -------------------------------------------------------------------------------- /ITMO_FS/wrappers/deterministic/BackwardSelection.py: -------------------------------------------------------------------------------- 1 | from logging import getLogger 2 | 3 | import numpy as np 4 | from sklearn.model_selection import cross_val_score 5 | 6 | from ...utils import generate_features, BaseWrapper 7 | 8 | 9 | class BackwardSelection(BaseWrapper): 10 | """Backward Selection removes one feature at a time until the number of 11 | features to be removed is reached. 
On each step, the best n-1 features 12 | out of n are chosen (according to some estimator metric) and the last one 13 | is removed. 14 | 15 | Parameters 16 | ---------- 17 | estimator : object 18 | A supervised learning estimator that should have a fit(X, y) method and 19 | a predict(X) method. 20 | n_features : int 21 | Number of features to select. 22 | measure : string or callable 23 | A standard estimator metric (e.g. 'f1' or 'roc_auc') or a callable with 24 | signature measure(estimator, X, y) which should return only a single 25 | value. 26 | cv : int 27 | Number of folds in cross-validation. 28 | 29 | See Also 30 | -------- 31 | 32 | Examples 33 | -------- 34 | >>> from ITMO_FS.wrappers import BackwardSelection 35 | >>> from sklearn.linear_model import LogisticRegression 36 | >>> from sklearn.datasets import make_classification 37 | >>> import numpy as np 38 | >>> dataset = make_classification(n_samples=100, n_features=20, 39 | ... n_informative=5, n_redundant=0, shuffle=False, random_state=42) 40 | >>> x, y = np.array(dataset[0]), np.array(dataset[1]) 41 | >>> model = BackwardSelection(LogisticRegression(), 5, 42 | ... measure='f1_macro').fit(x, y) 43 | >>> model.selected_features_ 44 | array([ 0, 1, 2, 3, 13], dtype=int64) 45 | """ 46 | def __init__(self, estimator, n_features, measure, cv=3): 47 | self.estimator = estimator 48 | self.n_features = n_features 49 | self.measure = measure 50 | self.cv = cv 51 | 52 | def _fit(self, X, y): 53 | """Fit the wrapper. 54 | 55 | Parameters 56 | ---------- 57 | X : array-like, shape (n_samples, n_features) 58 | The training input samples. 59 | y : array-like, shape (n_samples,) 60 | The target values. 61 | 62 | Returns 63 | ------- 64 | None 65 | """ 66 | self.selected_features_ = generate_features(X) 67 | 68 | while self.selected_features_.shape[0] != self.n_features: 69 | getLogger(__name__).info( 70 | "Current selected set: %s", self.selected_features_) 71 | scores = np.vectorize( 72 | lambda i: cross_val_score( 73 | self._estimator, 74 | X[:, np.delete(self.selected_features_, i)], y, cv=self.cv, 75 | scoring=self.measure).mean())( 76 | np.arange(0, self.selected_features_.shape[0])) 77 | getLogger(__name__).info( 78 | "Scores for all selected features: %s", scores) 79 | to_delete = np.argmax(scores) 80 | getLogger(__name__).info( 81 | "Deleting feature %d", self.selected_features_[to_delete]) 82 | 83 | self.selected_features_ = np.delete( 84 | self.selected_features_, to_delete) 85 | self.best_score_ = cross_val_score( 86 | self._estimator, X[:, self.selected_features_], y, cv=self.cv, 87 | scoring=self.measure).mean() 88 | self._estimator.fit(X[:, self.selected_features_], y) 89 | -------------------------------------------------------------------------------- /ITMO_FS/wrappers/deterministic/RecursiveElimination.py: -------------------------------------------------------------------------------- 1 | from logging import getLogger 2 | 3 | import numpy as np 4 | from sklearn.model_selection import cross_val_score 5 | 6 | from ...utils import generate_features, BaseWrapper 7 | 8 | 9 | class RecursiveElimination(BaseWrapper): 10 | """Recursive feature elimination algorithm. 11 | 12 | Parameters 13 | ---------- 14 | estimator : object 15 | A supervised learning estimator that should have a fit(X, y) method, a 16 | predict(X) method and a field corresponding to feature weights. 17 | n_features : int 18 | Number of features to leave. 19 | measure : string or callable 20 | A standard estimator metric (e.g. 
'f1' or 'roc_auc') or a callable with 21 | signature measure(estimator, X, y) which should return only a single 22 | value. 23 | weight_func : callable 24 | A function to extract weights from the model. 25 | cv : int 26 | Number of folds in cross-validation. 27 | 28 | See Also 29 | -------- 30 | Guyon, I., Weston, J., Barnhill, S., & Vapnik, V., “Gene selection for 31 | cancer classification using support vector machines”, Mach. Learn., 32 | 46(1-3), 389–422, 2002. 33 | https://link.springer.com/article/10.1023/A:1012487302797 34 | 35 | Examples 36 | -------- 37 | >>> from sklearn.datasets import make_classification 38 | >>> from ITMO_FS.wrappers import RecursiveElimination 39 | >>> from sklearn.svm import SVC 40 | >>> import numpy as np 41 | >>> dataset = make_classification(n_samples=100, n_features=20, 42 | ... n_informative=4, n_redundant=0, shuffle=False, random_state=42) 43 | >>> x, y = np.array(dataset[0]), np.array(dataset[1]) 44 | >>> model = SVC(kernel='linear') 45 | >>> rfe = RecursiveElimination(model, 5, measure='f1_macro', 46 | ... weight_func=lambda model: np.square(model.coef_).sum(axis=0)).fit(x, y) 47 | >>> rfe.selected_features_ 48 | array([ 0, 1, 2, 11, 19], dtype=int64) 49 | """ 50 | def __init__(self, estimator, n_features, measure, weight_func, cv=3): 51 | self.estimator = estimator 52 | self.n_features = n_features 53 | self.measure = measure 54 | self.weight_func = weight_func 55 | self.cv = cv 56 | 57 | def _fit(self, X, y): 58 | """Fit the wrapper. 59 | 60 | Parameters 61 | ---------- 62 | X : array-like, shape (n_samples, n_features) 63 | The training input samples. 64 | y : array-like, shape (n_samples,) 65 | the target values. 66 | 67 | Returns 68 | ------- 69 | None 70 | """ 71 | self.selected_features_ = generate_features(X) 72 | 73 | while self.selected_features_.shape[0] != self.n_features: 74 | getLogger(__name__).info( 75 | "Current selected set: %s", self.selected_features_) 76 | self._estimator.fit(X[:, self.selected_features_], y) 77 | weights = self.weight_func(self._estimator) 78 | getLogger(__name__).info( 79 | "Weights for all selected features: %s", weights) 80 | least_important = np.argmin(weights) 81 | getLogger(__name__).info( 82 | "Deleting the least important feature %d", 83 | self.selected_features_[least_important]) 84 | self.selected_features_ = np.delete(self.selected_features_, 85 | least_important) 86 | 87 | self.best_score_ = cross_val_score(self._estimator, 88 | X[:, self.selected_features_], y, cv=self.cv, 89 | scoring=self.measure).mean() 90 | self._estimator.fit(X[:, self.selected_features_], y) 91 | -------------------------------------------------------------------------------- /ITMO_FS/wrappers/deterministic/SequentialForwardSelection.py: -------------------------------------------------------------------------------- 1 | from logging import getLogger 2 | 3 | import numpy as np 4 | from sklearn.model_selection import cross_val_score 5 | 6 | from ...utils import generate_features, BaseWrapper 7 | 8 | 9 | class SequentialForwardSelection(BaseWrapper): 10 | """Sequentially add features that maximize the classifying function when 11 | combined with the features already used. 12 | #TODO add theory about this method 13 | 14 | Parameters 15 | ---------- 16 | estimator: object 17 | A supervised learning estimator that should have a fit(X, y) method and 18 | a predict(X) method. 19 | n_features : int 20 | Number of features to select. 21 | measure : string or callable 22 | A standard estimator metric (e.g. 
'f1' or 'roc_auc') or a callable with 23 | signature measure(estimator, X, y) which should return only a single 24 | value. 25 | cv : int 26 | Number of folds in cross-validation. 27 | 28 | See Also 29 | -------- 30 | 31 | Examples 32 | -------- 33 | >>> from ITMO_FS.wrappers import SequentialForwardSelection 34 | >>> from sklearn.linear_model import LogisticRegression 35 | >>> from sklearn.datasets import make_classification 36 | >>> import numpy as np 37 | >>> dataset = make_classification(n_samples=100, n_features=20, 38 | ... n_informative=5, n_redundant=0, shuffle=False, random_state=42) 39 | >>> x, y = np.array(dataset[0]), np.array(dataset[1]) 40 | >>> model = SequentialForwardSelection(LogisticRegression(), 5, 41 | ... measure='f1_macro').fit(x, y) 42 | >>> model.selected_features_ 43 | array([ 1, 4, 3, 5, 19], dtype=int64) 44 | """ 45 | def __init__(self, estimator, n_features, measure, cv=3): 46 | self.estimator = estimator 47 | self.n_features = n_features 48 | self.measure = measure 49 | self.cv = cv 50 | 51 | def _fit(self, X, y): 52 | """Fit the wrapper. 53 | 54 | Parameters 55 | ---------- 56 | X : array-like, shape (n_samples, n_features) 57 | The training input samples. 58 | y : array-like, shape (n_samples,) 59 | The target values. 60 | 61 | Returns 62 | ------- 63 | None 64 | """ 65 | self.selected_features_ = np.array([], dtype=int) 66 | free_features = generate_features(X) 67 | 68 | while self.selected_features_.shape[0] != self.n_features: 69 | getLogger(__name__).info( 70 | "Current selected set: %s", self.selected_features_) 71 | scores = np.vectorize( 72 | lambda f: cross_val_score( 73 | self._estimator, 74 | X[:, np.append(self.selected_features_, f)], y, cv=self.cv, 75 | scoring=self.measure).mean())(free_features) 76 | getLogger(__name__).info("Scores for all free features: %s", scores) 77 | to_add = np.argmax(scores) 78 | getLogger(__name__).info("Adding feature %d", free_features[to_add]) 79 | self.selected_features_ = np.append(self.selected_features_, 80 | free_features[to_add]) 81 | free_features = np.delete(free_features, to_add) 82 | 83 | self.best_score_ = cross_val_score(self._estimator, 84 | X[:, self.selected_features_], y, cv=self.cv, 85 | scoring=self.measure).mean() 86 | self._estimator.fit(X[:, self.selected_features_], y) 87 | -------------------------------------------------------------------------------- /ITMO_FS/wrappers/deterministic/__init__.py: -------------------------------------------------------------------------------- 1 | from .AddDelWrapper import AddDelWrapper 2 | from .BackwardSelection import BackwardSelection 3 | from .RecursiveElimination import RecursiveElimination 4 | from .SequentialForwardSelection import SequentialForwardSelection 5 | from .qpfs_wrapper import QPFSWrapper 6 | -------------------------------------------------------------------------------- /ITMO_FS/wrappers/deterministic/qpfs_wrapper.py: -------------------------------------------------------------------------------- 1 | from ITMO_FS.filters.univariate.measures import pearson_corr 2 | from ITMO_FS.utils.qpfs_body import qpfs_body 3 | from ...utils import BaseWrapper 4 | 5 | class QPFSWrapper(BaseWrapper): 6 | """ 7 | #TODO rewrite to the proper notation 8 | Performs Quadratic Programming Feature Selection algorithm. 9 | Note that this realization requires labels to start from 1 and be numberical. 
10 | This is the wrapper version of QPFS, so the alpha parameter must be specified; 11 | if you don't know a suitable alpha value, it is suggested to use qpfs_filter instead. 12 | 13 | Parameters 14 | ---------- 15 | alpha : double 16 | The balance between relevance and redundancy of features. 17 | r : int 18 | The number of samples to be used in Nystrom optimization. 19 | sigma : double 20 | The threshold for eigenvalues to be used in solving QP optimization. 21 | solv : string 22 | The name of the qp solver according to qpsolvers (https://pypi.org/project/qpsolvers/) naming. 23 | Note that quadprog is used by default. 24 | fn : function(array, array) 25 | The function used to count correlation, for example Pearson correlation or mutual information. 26 | Note that pearson_corr from ITMO_FS measures is used by default. 27 | 28 | Attributes 29 | ---------- 30 | ranks_ : array-like, shape (n_features,) 31 | The ranks of the features in the dataset; as the rank increases, 32 | feature relevance increases and redundancy decreases. 33 | 34 | See Also 35 | -------- 36 | http://www.jmlr.org/papers/volume11/rodriguez-lujan10a/rodriguez-lujan10a.pdf 37 | 38 | Examples 39 | -------- 40 | >>> import numpy as np 41 | >>> from ITMO_FS.wrappers import QPFSWrapper 42 | >>> x = np.array([[3, 3, 3, 2, 2], [3, 3, 1, 2, 3], [1, 3, 5, 1, 1], [3, 1, 4, 3, 1], [3, 1, 2, 3, 1]]) 43 | >>> y = np.array([1, 3, 2, 1, 2]) 44 | >>> ranks = QPFSWrapper(alpha=0.5).fit(x, y).ranks_ 45 | >>> print(ranks) 46 | """ 47 | def __init__(self, alpha, r=None, sigma=None, solv='quadprog', fn=pearson_corr): 48 | self.alpha = alpha 49 | self.r = r 50 | self.sigma = sigma 51 | self.solv = solv 52 | self.fn = fn 53 | 54 | def _fit(self, X, y): 55 | """Fit the wrapper. 56 | 57 | Parameters 58 | ---------- 59 | X : array-like, shape (n_samples, n_features) 60 | The training input samples. 61 | y : array-like, shape (n_samples,) 62 | The target values. 63 | 64 | Returns 65 | ------- 66 | None 67 | """ 68 | # all QPFS parameters are taken from the instance; the resulting 69 | # feature ranking is stored in the ranks_ attribute 70 | self.ranks_ = qpfs_body(X, y, self.fn, alpha=self.alpha, r=self.r, 71 | sigma=self.sigma, solv=self.solv) 72 | -------------------------------------------------------------------------------- /ITMO_FS/wrappers/randomized/HillClimbing.py: -------------------------------------------------------------------------------- 1 | from logging import getLogger 2 | 3 | import numpy as np 4 | from sklearn.base import clone 5 | from sklearn.model_selection import cross_val_score 6 | 7 | from ...utils import generate_features, BaseWrapper 8 | 9 | class HillClimbingWrapper(BaseWrapper): 10 | """Hill Climbing algorithm. 11 | 12 | Parameters 13 | ---------- 14 | estimator : object 15 | A supervised learning estimator that should have a fit(X, y) method and 16 | a predict(X) method. 17 | measure : string or callable 18 | A standard estimator metric (e.g. 'f1' or 'roc_auc') or a callable with 19 | signature measure(estimator, X, y) which should return only a single 20 | value. 21 | seed : int 22 | Random seed used to initialize np.random.default_rng(). 23 | cv : int 24 | Number of folds in cross-validation. 25 | 26 | See Also 27 | -------- 28 | 29 | Examples 30 | -------- 31 | >>> from ITMO_FS.wrappers import HillClimbingWrapper 32 | >>> from sklearn.linear_model import LogisticRegression 33 | >>> from sklearn.datasets import make_classification 34 | >>> import numpy as np 35 | >>> dataset = make_classification(n_samples=100, n_features=20, 36 | ... n_informative=5, n_redundant=0, shuffle=False, random_state=42) 37 | >>> x, y = np.array(dataset[0]), np.array(dataset[1]) 38 | >>> model = HillClimbingWrapper(LogisticRegression(), 39 | ...
measure='f1_macro').fit(x, y) 40 | >>> model.selected_features_ 41 | array([ 0, 1, 2, 3, 4, 6, 7, 9, 11, 13, 14, 15], dtype=int64) 42 | """ 43 | def __init__(self, estimator, measure, seed=42, cv=3): 44 | self.estimator = estimator 45 | self.measure = measure 46 | self.seed = seed 47 | self.cv = cv 48 | 49 | def _fit(self, X, y): 50 | """Fit the wrapper. 51 | 52 | Parameters 53 | ---------- 54 | X : array-like, shape (n_samples, n_features) 55 | The training input samples. 56 | y : array-like, shape (n_samples,) 57 | The target values. 58 | 59 | Returns 60 | ------- 61 | None 62 | """ 63 | rng = np.random.default_rng(self.seed) 64 | 65 | features = generate_features(X) 66 | mask = rng.choice([True, False], self.n_features_) 67 | getLogger(__name__).info("Initial feature mask: %s", mask) 68 | score = cross_val_score( 69 | self._estimator, X[:, features[mask]], y, cv=self.cv, 70 | scoring=self.measure).mean() 71 | 72 | while True: 73 | getLogger(__name__).info("Current best score: %d", score) 74 | old_score = score 75 | order = rng.permutation(self.n_features_) 76 | for feature in order: 77 | getLogger(__name__).info("Trying to change feature %d", feature) 78 | mask[feature] = not(mask[feature]) 79 | new_score = cross_val_score(self._estimator, 80 | X[:, features[mask]], y, cv=self.cv, 81 | scoring=self.measure).mean() 82 | getLogger(__name__).info("New score: %d", new_score) 83 | if new_score > score: 84 | score = new_score 85 | break 86 | mask[feature] = not(mask[feature]) 87 | if old_score == score: 88 | break 89 | 90 | self.selected_features_ = features[mask] 91 | self.best_score_ = score 92 | self._estimator.fit(X[:, self.selected_features_], y) 93 | -------------------------------------------------------------------------------- /ITMO_FS/wrappers/randomized/SimulatedAnnealing.py: -------------------------------------------------------------------------------- 1 | from logging import getLogger 2 | 3 | import numpy as np 4 | from sklearn.model_selection import cross_val_score 5 | 6 | from ...utils import BaseWrapper, generate_features 7 | 8 | 9 | class SimulatedAnnealing(BaseWrapper): 10 | """Simulated Annealing algorithm. 11 | 12 | Parameters 13 | ---------- 14 | estimator : object 15 | A supervised learning estimator that should have a fit(X, y) method and 16 | a predict(X) method. 17 | measure : string or callable 18 | A standard estimator metric (e.g. 'f1' or 'roc_auc') or a callable with 19 | signature measure(estimator, X, y) which should return only a single 20 | value. 21 | seed : int 22 | Random seed used to initialize np.random.default_rng(). 23 | iteration_number : int 24 | Number of iterations of the algorithm. 25 | c : int 26 | A constant that is used to control the rate of feature perturbation. 27 | init_number_of_features : int 28 | The number of features to initialize start features subset with, by 29 | default 5-10 percents of features is used. 30 | cv : int 31 | Number of folds in cross-validation. 32 | 33 | Notes 34 | ----- 35 | For more details see `this paper `_. 36 | 37 | Examples 38 | -------- 39 | >>> from sklearn.datasets import make_classification 40 | >>> from sklearn.linear_model import LogisticRegression 41 | >>> from ITMO_FS.wrappers.randomized import SimulatedAnnealing 42 | >>> dataset = make_classification(n_samples=100, n_features=20, 43 | ... n_informative=5, n_redundant=0, shuffle=False, random_state=42) 44 | >>> x, y = np.array(dataset[0]), np.array(dataset[1]) 45 | >>> sa = SimulatedAnnealing(LogisticRegression(), measure='f1_macro', 46 | ... 
iteration_number=50).fit(x, y) 47 | >>> sa.selected_features_ 48 | array([ 1, 4, 3, 17, 10, 16, 11, 14, 5], dtype=int64) 49 | """ 50 | def __init__(self, estimator, measure, seed=42, iteration_number=100, c=1, 51 | init_number_of_features=None, cv=3): 52 | self.estimator = estimator 53 | self.measure = measure 54 | self.seed = seed 55 | self.iteration_number = iteration_number 56 | self.c = c 57 | self.init_number_of_features = init_number_of_features 58 | self.cv = cv 59 | 60 | def __acceptance(self, i, prev_score, cur_score): 61 | return np.exp((i + 1) / self.c * (cur_score - prev_score) / prev_score) 62 | 63 | def _fit(self, X, y): 64 | """Fit the wrapper. 65 | 66 | Parameters 67 | ---------- 68 | X : array-like, shape (n_samples, n_features) 69 | The training input samples. 70 | y : array-like, shape (n_samples,) 71 | The target values. 72 | 73 | Returns 74 | ------- 75 | None 76 | """ 77 | rng = np.random.default_rng(self.seed) 78 | features = generate_features(X) 79 | 80 | if self.init_number_of_features is None: 81 | percentage = rng.integers(5, 11) 82 | init_number_of_features = int( 83 | self.n_features_ * percentage / 100) + 1 84 | elif self.init_number_of_features == 0: 85 | getLogger(__name__).warning( 86 | "Initial number of features was set to zero; would use one " 87 | "instead") 88 | init_number_of_features = 1 89 | else: 90 | init_number_of_features = self.init_number_of_features 91 | 92 | feature_subset = np.unique( 93 | rng.integers(0, self.n_features_, init_number_of_features)) 94 | getLogger(__name__).info("Initial selected set: %s", feature_subset) 95 | prev_score = cross_val_score( 96 | self._estimator, X[:, feature_subset], y, cv=self.cv, 97 | scoring=self.measure).mean() 98 | getLogger(__name__).info("Initial score: %f", prev_score) 99 | 100 | for i in range(self.iteration_number): 101 | getLogger(__name__).info("Current best score: %f", prev_score) 102 | operation = rng.integers(0, 2) 103 | percentage = rng.integers(1, 5) 104 | if operation == 1 and feature_subset.shape[0] != self.n_features_: 105 | # include some new features in the subset 106 | not_included_features = np.setdiff1d(features, feature_subset) 107 | include_number = min( 108 | not_included_features.shape[0], 109 | int(self.n_features_ * (percentage / 100)) + 1) 110 | to_add = rng.choice( 111 | not_included_features, size=include_number, replace=False) 112 | getLogger(__name__).info( 113 | "Trying to add features %s into the selected set", to_add) 114 | cur_subset = np.append(feature_subset, to_add) 115 | else: 116 | # exclude some features from the subset 117 | exclude_number = min( 118 | feature_subset.shape[0] - 1, 119 | int(self.n_features_ * (percentage / 100)) + 1) 120 | to_delete = rng.choice( 121 | np.arange(feature_subset.shape[0]), size=exclude_number, 122 | replace=False) 123 | getLogger(__name__).info( 124 | "Trying to delete features %s from the selected set", 125 | feature_subset[to_delete]) 126 | cur_subset = np.delete(feature_subset, to_delete) 127 | cur_score = cross_val_score( 128 | self._estimator, X[:, cur_subset], y, cv=self.cv, 129 | scoring=self.measure).mean() 130 | getLogger(__name__).info("New score: %f", cur_score) 131 | if cur_score > prev_score: 132 | feature_subset = cur_subset 133 | prev_score = cur_score 134 | else: 135 | getLogger(__name__).info( 136 | "Score has not improved; trying to accept the new subset " 137 | "anyway") 138 | ruv = rng.random() 139 | acceptance = self.__acceptance(i, prev_score, cur_score) 140 | getLogger(__name__).info( 141 | "Random value = %f, acceptance = %f", ruv, acceptance) 142 | if ruv < acceptance: 143 |
getLogger(__name__).info("Accepting the new subset") 144 | feature_subset = cur_subset 145 | prev_score = cur_score 146 | 147 | self.selected_features_ = feature_subset 148 | self.best_score_ = prev_score 149 | self._estimator.fit(X[:, self.selected_features_], y) 150 | -------------------------------------------------------------------------------- /ITMO_FS/wrappers/randomized/__init__.py: -------------------------------------------------------------------------------- 1 | from .HillClimbing import HillClimbingWrapper 2 | from .TPhMGWO import TPhMGWO 3 | from .SimulatedAnnealing import SimulatedAnnealing -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | BSD 3-Clause License 2 | 3 | Copyright (c) 2020, ITMO University,Nikita Pilnenskiy 4 | All rights reserved. 5 | 6 | Redistribution and use in source and binary forms, with or without 7 | modification, are permitted provided that the following conditions are met: 8 | 9 | 1. Redistributions of source code must retain the above copyright notice, this 10 | list of conditions and the following disclaimer. 11 | 12 | 2. Redistributions in binary form must reproduce the above copyright notice, 13 | this list of conditions and the following disclaimer in the documentation 14 | and/or other materials provided with the distribution. 15 | 16 | 3. Neither the name of the copyright holder nor the names of its 17 | contributors may be used to endorse or promote products derived from 18 | this software without specific prior written permission. 19 | 20 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 21 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 22 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 23 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE 24 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 25 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR 26 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 27 | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 28 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 29 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 30 | -------------------------------------------------------------------------------- /_config.yml: -------------------------------------------------------------------------------- 1 | theme: jekyll-theme-minimal -------------------------------------------------------------------------------- /docs/Makefile: -------------------------------------------------------------------------------- 1 | # Minimal makefile for Sphinx documentation 2 | # 3 | 4 | # You can set these variables from the command line, and also 5 | # from the environment for the first two. 6 | SPHINXOPTS ?= 7 | SPHINXBUILD ?= sphinx-build 8 | SOURCEDIR = . 9 | BUILDDIR = _build 10 | 11 | # Put it first so that "make" without argument is like "make help". 12 | help: 13 | @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 14 | 15 | .PHONY: help Makefile 16 | 17 | # Catch-all target: route all unknown targets to Sphinx using the new 18 | # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). 
19 | %: Makefile 20 | @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 21 | -------------------------------------------------------------------------------- /docs/_templates/class.rst: -------------------------------------------------------------------------------- 1 | :mod:`{{module}}`.{{objname}} 2 | {{ underline }}============== 3 | 4 | .. currentmodule:: {{ module }} 5 | 6 | .. autoclass:: {{ objname }} 7 | 8 | {% block methods %} 9 | .. automethod:: __init__ 10 | {% endblock %} 11 | 12 | 13 | .. include:: {{module}}.{{objname}}.examples 14 | 15 | .. raw:: html 16 | 17 |
18 | -------------------------------------------------------------------------------- /docs/_templates/function.rst: -------------------------------------------------------------------------------- 1 | :mod:`{{module}}`.{{objname}} 2 | {{ underline }}==================== 3 | 4 | .. currentmodule:: {{ module }} 5 | 6 | .. autofunction:: {{ objname }} 7 | 8 | .. include:: {{module}}.{{objname}}.examples 9 | 10 | .. raw:: html 11 | 12 |
-------------------------------------------------------------------------------- /docs/api.rst: -------------------------------------------------------------------------------- 1 | ###################### 2 | ITMO_FS API 3 | ###################### 4 | 5 | This is the full API documentation of the `ITMO_FS` toolbox. 6 | 7 | .. _filters_ref: 8 | 9 | :mod:`ITMO_FS.filters`: Filter methods 10 | ====================================== 11 | 12 | .. automodule:: filters 13 | :no-members: 14 | :no-inherited-members: 15 | 16 | .. currentmodule:: ITMO_FS 17 | 18 | :mod:`ITMO_FS.filters.univariate`: Univariate filter methods 19 | ------------------------------------------------------------ 20 | 21 | .. automodule:: filters.univariate 22 | :no-members: 23 | :no-inherited-members: 24 | 25 | .. currentmodule:: ITMO_FS 26 | 27 | .. autosummary:: 28 | :toctree: generated/ 29 | :template: class.rst 30 | 31 | filters.univariate.VDM 32 | filters.univariate.UnivariateFilter 33 | 34 | Measures for univariate filters 35 | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ 36 | 37 | .. automodule:: filters.univariate.measures 38 | :no-members: 39 | :no-inherited-members: 40 | 41 | .. currentmodule:: ITMO_FS 42 | 43 | 44 | .. autosummary:: 45 | :toctree: generated/ 46 | :template: function.rst 47 | 48 | filters.univariate.fit_criterion_measure 49 | filters.univariate.f_ratio_measure 50 | filters.univariate.gini_index 51 | filters.univariate.su_measure 52 | filters.univariate.spearman_corr 53 | filters.univariate.pearson_corr 54 | filters.univariate.fechner_corr 55 | filters.univariate.kendall_corr 56 | filters.univariate.reliefF_measure 57 | filters.univariate.chi2_measure 58 | filters.univariate.information_gain 59 | 60 | Cutting rules for univariate filters 61 | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ 62 | 63 | .. automodule:: filters.univariate.measures 64 | :no-members: 65 | :no-inherited-members: 66 | 67 | .. currentmodule:: ITMO_FS 68 | 69 | 70 | .. autosummary:: 71 | :toctree: generated/ 72 | :template: function.rst 73 | 74 | filters.univariate.select_best_by_value 75 | filters.univariate.select_worst_by_value 76 | filters.univariate.select_k_best 77 | filters.univariate.select_k_worst 78 | filters.univariate.select_best_percentage 79 | filters.univariate.select_worst_percentage 80 | 81 | 82 | :mod:`ITMO_FS.filters.multivariate`: Multivariate filter methods 83 | ---------------------------------------------------------------- 84 | 85 | .. automodule:: filters.multivariate 86 | :no-members: 87 | :no-inherited-members: 88 | 89 | .. currentmodule:: ITMO_FS 90 | 91 | .. autosummary:: 92 | :toctree: generated/ 93 | :template: class.rst 94 | 95 | filters.multivariate.DISRWithMassive 96 | filters.multivariate.FCBFDiscreteFilter 97 | filters.multivariate.MultivariateFilter 98 | filters.multivariate.STIR 99 | filters.multivariate.TraceRatioFisher 100 | filters.multivariate.MIMAGA 101 | 102 | 103 | Measures for multivariate filters 104 | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ 105 | 106 | .. automodule:: filters.multivariate.measures 107 | :no-members: 108 | :no-inherited-members: 109 | 110 | .. currentmodule:: ITMO_FS 111 | 112 | 113 | .. 
autosummary:: 114 | :toctree: generated/ 115 | :template: function.rst 116 | 117 | filters.multivariate.MIM 118 | filters.multivariate.MRMR 119 | filters.multivariate.JMI 120 | filters.multivariate.CIFE 121 | filters.multivariate.MIFS 122 | filters.multivariate.CMIM 123 | filters.multivariate.ICAP 124 | filters.multivariate.DCSF 125 | filters.multivariate.CFR 126 | filters.multivariate.MRI 127 | filters.multivariate.IWFS 128 | filters.multivariate.generalizedCriteria 129 | 130 | 131 | :mod:`ITMO_FS.filters.unsupervised`: Unsupervised filter methods 132 | ---------------------------------------------------------------- 133 | 134 | .. automodule:: filters.unsupervised 135 | :no-members: 136 | :no-inherited-members: 137 | 138 | .. currentmodule:: ITMO_FS 139 | 140 | 141 | .. autosummary:: 142 | :toctree: generated/ 143 | :template: class.rst 144 | 145 | filters.unsupervised.TraceRatioLaplacian 146 | 147 | 148 | :mod:`ITMO_FS.filters.sparse`: Sparse filter methods 149 | ---------------------------------------------------------------- 150 | 151 | .. automodule:: filters.sparse 152 | :no-members: 153 | :no-inherited-members: 154 | 155 | .. currentmodule:: ITMO_FS 156 | 157 | 158 | .. autosummary:: 159 | :toctree: generated/ 160 | :template: class.rst 161 | 162 | filters.sparse.MCFS 163 | filters.sparse.NDFS 164 | filters.sparse.RFS 165 | filters.sparse.SPEC 166 | filters.sparse.UDFS 167 | 168 | 169 | 170 | .. _ensembles_ref: 171 | 172 | :mod:`ITMO_FS.ensembles`: Ensemble methods 173 | ========================================== 174 | 175 | .. automodule:: ensembles 176 | :no-members: 177 | :no-inherited-members: 178 | 179 | .. currentmodule:: ITMO_FS 180 | 181 | :mod:`ITMO_FS.ensembles.measure_based`: Measure based ensemble methods 182 | ----------------------------------------------------------------------- 183 | 184 | .. automodule:: ensembles.measure_based 185 | :no-members: 186 | :no-inherited-members: 187 | 188 | .. currentmodule:: ITMO_FS 189 | 190 | .. autosummary:: 191 | :toctree: generated/ 192 | :template: class.rst 193 | 194 | ensembles.measure_based.WeightBased 195 | 196 | 197 | :mod:`ITMO_FS.ensembles.model_based`: Model based ensemble methods 198 | ------------------------------------------------------------------ 199 | 200 | .. automodule:: ensembles.model_based 201 | :no-members: 202 | :no-inherited-members: 203 | 204 | .. currentmodule:: ITMO_FS 205 | 206 | .. autosummary:: 207 | :toctree: generated/ 208 | :template: class.rst 209 | 210 | ensembles.model_based.BestSum 211 | 212 | 213 | :mod:`ITMO_FS.ensembles.ranking_based`: Ranking based ensemble methods 214 | ---------------------------------------------------------------------- 215 | 216 | .. automodule:: ensembles.ranking_based 217 | :no-members: 218 | :no-inherited-members: 219 | 220 | .. currentmodule:: ITMO_FS 221 | 222 | .. autosummary:: 223 | :toctree: generated/ 224 | :template: class.rst 225 | 226 | ensembles.ranking_based.Mixed 227 | 228 | 229 | .. _embedded_ref: 230 | 231 | :mod:`ITMO_FS.embedded`: Embedded methods 232 | ========================================= 233 | 234 | .. automodule:: embedded 235 | :no-members: 236 | :no-inherited-members: 237 | 238 | .. currentmodule:: ITMO_FS 239 | 240 | .. autosummary:: 241 | :toctree: generated/ 242 | :template: class.rst 243 | 244 | embedded.MOS 245 | 246 | 247 | .. _hybrid_ref: 248 | 249 | :mod:`ITMO_FS.hybrid`: Hybrid methods 250 | ========================================= 251 | 252 | .. automodule:: hybrid 253 | :no-members: 254 | :no-inherited-members: 255 | 256 | .. 
currentmodule:: ITMO_FS 257 | 258 | .. autosummary:: 259 | :toctree: generated/ 260 | :template: class.rst 261 | 262 | hybrid.FilterWrapperHybrid 263 | hybrid.Melif 264 | 265 | 266 | .. _wrappers_ref: 267 | 268 | :mod:`ITMO_FS.wrappers`: Wrapper methods 269 | ======================================== 270 | 271 | .. automodule:: wrappers 272 | :no-members: 273 | :no-inherited-members: 274 | 275 | .. currentmodule:: ITMO_FS 276 | 277 | :mod:`ITMO_FS.wrappers.deterministic`: Deterministic wrapper methods 278 | -------------------------------------------------------------------- 279 | 280 | .. automodule:: wrappers.deterministic 281 | :no-members: 282 | :no-inherited-members: 283 | 284 | .. currentmodule:: ITMO_FS 285 | 286 | .. autosummary:: 287 | :toctree: generated/ 288 | :template: class.rst 289 | 290 | wrappers.deterministic.AddDelWrapper 291 | wrappers.deterministic.BackwardSelection 292 | wrappers.deterministic.RecursiveElimination 293 | wrappers.deterministic.SequentialForwardSelection 294 | 295 | Deterministic wrapper function 296 | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ 297 | 298 | .. autosummary:: 299 | :toctree: generated/ 300 | :template: function.rst 301 | 302 | wrappers.deterministic.qpfs_wrapper 303 | 304 | 305 | 306 | :mod:`ITMO_FS.wrappers.randomized`: Randomized wrapper methods 307 | ------------------------------------------------------------------ 308 | 309 | .. automodule:: wrappers.randomized 310 | :no-members: 311 | :no-inherited-members: 312 | 313 | .. currentmodule:: ITMO_FS 314 | 315 | .. autosummary:: 316 | :toctree: generated/ 317 | :template: class.rst 318 | 319 | wrappers.randomized.HillClimbingWrapper 320 | wrappers.randomized.SimulatedAnnealing 321 | wrappers.randomized.TPhMGWO 322 | -------------------------------------------------------------------------------- /docs/conf.py: -------------------------------------------------------------------------------- 1 | # Configuration file for the Sphinx documentation builder. 2 | # 3 | # This file only contains a selection of the most common options. For a full 4 | # list see the documentation: 5 | # https://www.sphinx-doc.org/en/master/usage/configuration.html 6 | 7 | # -- Path setup -------------------------------------------------------------- 8 | 9 | # If extensions (or modules to document with autodoc) are in another directory, 10 | # add these directories to sys.path here. If the directory is relative to the 11 | # documentation root, use os.path.abspath to make it absolute, like shown here. 12 | # 13 | import os 14 | import sys 15 | import sphinx_rtd_theme 16 | sys.path.insert(0, os.path.abspath('../ITMO_FS')) 17 | sys.path.insert(1, os.path.abspath('..')) 18 | 19 | 20 | # -- Project information ----------------------------------------------------- 21 | 22 | project = 'ITMO_FS' 23 | copyright = '2020, ITMO University,Nikita Pilnenskiy' 24 | author = 'Nikita Pilnenskiy' 25 | 26 | # The full version, including alpha/beta/rc tags 27 | release = '0.3.2' 28 | 29 | 30 | # -- General configuration --------------------------------------------------- 31 | 32 | # Add any Sphinx extension module names here, as strings. They can be 33 | # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom 34 | # ones. 35 | extensions = ['sphinx.ext.autodoc', 'sphinx.ext.coverage', 'sphinx.ext.napoleon', 'sphinx.ext.autosummary'] 36 | 37 | 38 | autodoc_default_flags = ['members', 'inherited-members'] 39 | 40 | # Add any paths that contain templates here, relative to this directory. 
41 | templates_path = ['_templates'] 42 | 43 | # generate autosummary even if no references 44 | autosummary_generate = True 45 | 46 | # The language for content autogenerated by Sphinx. Refer to documentation 47 | # for a list of supported languages. 48 | # 49 | # This is also used if you do content translation via gettext catalogs. 50 | # Usually you set "language" from the command line for these cases. 51 | language = 'ru' 52 | master_doc = 'index' 53 | # List of patterns, relative to source directory, that match files and 54 | # directories to ignore when looking for source files. 55 | # This pattern also affects html_static_path and html_extra_path. 56 | exclude_patterns = ['_build', 'Thumbs.db', '.DS_Store'] 57 | 58 | 59 | # -- Options for HTML output ------------------------------------------------- 60 | 61 | # The theme to use for HTML and HTML Help pages. See the documentation for 62 | # a list of builtin themes. 63 | # 64 | html_theme = 'sphinx_rtd_theme' 65 | 66 | 67 | # Add any paths that contain custom static files (such as style sheets) here, 68 | # relative to this directory. They are copied after the builtin static files, 69 | # so a file named "default.css" will overwrite the builtin "default.css". 70 | html_theme_path = [sphinx_rtd_theme.get_html_theme_path()] 71 | -------------------------------------------------------------------------------- /docs/index.rst: -------------------------------------------------------------------------------- 1 | .. project-template documentation master file, created by 2 | sphinx-quickstart on Mon Jan 18 14:44:12 2016. 3 | You can adapt this file completely to your liking, but it should at least 4 | contain the root `toctree` directive. 5 | 6 | ########################################## 7 | Welcome to ITMO_FS! 8 | ########################################## 9 | 10 | .. toctree:: 11 | :maxdepth: 2 12 | :hidden: 13 | :caption: Getting Started 14 | 15 | install 16 | 17 | .. toctree:: 18 | :maxdepth: 2 19 | :hidden: 20 | :caption: Documentation 21 | 22 | user_guide 23 | api 24 | 25 | 26 | `Getting started `_ 27 | --------------------------------- 28 | 29 | Information to install, test, and contribute to the package. 30 | 31 | `User Guide `_ 32 | ------------------------------- 33 | 34 | User guide of ITMO_FS 35 | 36 | `API `_ 37 | ------------------------------- 38 | 39 | The main documentation. This contains an in-depth description of all 40 | algorithms and how to apply them. 41 | 42 | `API Documentation `_ 43 | ------------------------------- 44 | 45 | The exact API of all functions and classes, as given in the 46 | doctring. The API documents expected types and allowed features for 47 | all functions, and all parameters available for the algorithms. 48 | 49 | 50 | -------------------------------------------------------------------------------- /docs/install.rst: -------------------------------------------------------------------------------- 1 | ######################## 2 | Install and contribution 3 | ######################## 4 | 5 | Prerequisites 6 | ============= 7 | 8 | The feature selection library requires the following dependencies: 9 | 10 | * python (>=3.6) 11 | * numpy (>=1.13.3) 12 | * scipy (>=0.19.1) 13 | * scikit-learn (>=0.22) 14 | * imblearn (>=0.0) 15 | * qpsolvers (>=1.0.1) 16 | 17 | Install 18 | ======= 19 | 20 | ITMO_FS is currently available on the PyPi's repositories and you can 21 | install it via `pip`:: 22 | 23 | pip install -U ITMO_FS 24 | 25 | If you prefer, you can clone it and run the setup.py file. 
Use the following 26 | commands to get a copy from GitHub and install all dependencies:: 27 | 28 | git clone https://github.com/LastShekel/ITMO_FS.git 29 | cd ITMO_FS 30 | pip install . 31 | 32 | Or install using pip and GitHub:: 33 | 34 | pip install -U git+https://github.com/LastShekel/ITMO_FS.git 35 | 36 | Test and coverage 37 | ================= 38 | 39 | If you want to test the code before installing it:: 40 | 41 | $ make test 42 | 43 | If you wish to check the test coverage of your version:: 44 | 45 | $ make coverage 46 | 47 | You can also use `pytest`:: 48 | 49 | $ pytest ITMO_FS -v 50 | 51 | Contribute 52 | ========== 53 | 54 | You can contribute to this code through Pull Requests on GitHub_. Please make 55 | sure that your code comes with unit tests to ensure full coverage and 56 | continuous integration in the API. 57 | 58 | .. _GitHub: https://github.com/LastShekel/ITMO_FS/pulls 59 | -------------------------------------------------------------------------------- /docs/introduction.rst: -------------------------------------------------------------------------------- 1 | .. _introduction: 2 | 3 | ============ 4 | Introduction 5 | ============ 6 | 7 | .. _api_ITMO_FS: 8 | 9 | APIs of feature selectors 10 | ---------------------------------- 11 | 12 | Available selectors follow the scikit-learn API using the base estimator 13 | and selector mixin: 14 | 15 | :Transformer: 16 | 17 | The base object implements a ``fit`` method to learn from data:: 18 | 19 | selector.fit(data, targets) 20 | 21 | To select features from a data set after learning, each selector implements:: 22 | 23 | data_selected = selector.transform(data) 24 | 25 | To learn from data and select features from the same data set at once, each selector implements:: 26 | 27 | data_selected = selector.fit_transform(data, targets) 28 | 29 | To reverse the selection operation, each selector implements:: 30 | 31 | data_reversed = selector.inverse_transform(data_selected) 32 | 33 | Feature selectors accept the same inputs as in scikit-learn: 34 | 35 | * ``data``: array-like (2-D list, pandas.Dataframe, numpy.array) or sparse 36 | matrices; 37 | * ``targets``: array-like (1-D list, pandas.Series, numpy.array). 38 | 39 | The output will be of the following type: 40 | 41 | * ``data_selected``: array-like (2-D list, pandas.Dataframe, numpy.array) or 42 | sparse matrices; 43 | * ``data_reversed``: array-like (2-D list, pandas.Dataframe, numpy.array) or 44 | sparse matrices. 45 | 46 | .. topic:: Sparse input 47 | 48 | For sparse input the data is **converted to the Compressed Sparse Rows 49 | representation** (see ``scipy.sparse.csr_matrix``) before being fed to the 50 | selector. To avoid unnecessary memory copies, it is recommended to choose the 51 | CSR representation upstream. 52 | 53 | .. _problem_statement: 54 | 55 | Problem statement regarding data sets with redundant features 56 | ------------------------------------------------------------- 57 | 58 | Feature selection methods can be used to identify and remove unneeded, 59 | irrelevant and redundant attributes from data that do not contribute 60 | to the accuracy of a predictive model or may in fact decrease the 61 | accuracy of the model. Fewer attributes are desirable because they reduce 62 | the complexity of the model, and a simpler model is easier to understand 63 | and explain.
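64 | 65 | Since every selector follows the scikit-learn transformer API, selectors can 66 | also be placed inside a ``sklearn.pipeline.Pipeline``. The snippet below is a 67 | minimal sketch of such usage; the particular measure (``pearson_corr``) and 68 | cutting rule (``select_k_best``) are chosen purely for illustration:: 69 | 70 | >>> from sklearn.datasets import make_classification 71 | >>> from sklearn.linear_model import LogisticRegression 72 | >>> from sklearn.pipeline import Pipeline 73 | >>> from ITMO_FS.filters.univariate import UnivariateFilter, pearson_corr, select_k_best 74 | 75 | >>> X, y = make_classification(n_samples=300, n_features=10, random_state=0, n_informative=2) 76 | >>> pipe = Pipeline([('select', UnivariateFilter(pearson_corr, select_k_best(5))), 77 | ...                  ('clf', LogisticRegression())]) 78 | >>> _ = pipe.fit(X, y)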
79 | 80 | Here is a fuller example of feature selection improving the classification quality:: 81 | 82 | >>> from sklearn.datasets import make_classification 83 | >>> from sklearn.linear_model import SGDClassifier 84 | >>> from ITMO_FS.embedded import MOS 85 | 86 | >>> X, y = make_classification(n_samples=300, n_features=10, random_state=0, n_informative=2) 87 | >>> sel = MOS() 88 | >>> trX = sel.fit_transform(X, y, smote=False) 89 | 90 | >>> cl1 = SGDClassifier() 91 | >>> cl1.fit(X, y) 92 | >>> cl1.score(X, y) 93 | 0.9033333333333333 94 | 95 | >>> cl2 = SGDClassifier() 96 | >>> cl2.fit(trX, y) 97 | >>> cl2.score(trX, y) 98 | 0.9433333333333334 99 | 100 | As expected, the quality of the SGDClassifier's results is impacted by the presence of redundant 101 | features in the data set. We can see that after applying feature selection the mean accuracy 102 | increases from 0.903 to 0.943. 103 | -------------------------------------------------------------------------------- /docs/logos/logo_itmo_fs_itog_colour.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ctlab/ITMO_FS/a2e61e2fabb9dfb34d90a1130fc7f5f162a2c921/docs/logos/logo_itmo_fs_itog_colour.jpg -------------------------------------------------------------------------------- /docs/make.bat: -------------------------------------------------------------------------------- 1 | @ECHO OFF 2 | 3 | pushd %~dp0 4 | 5 | REM Command file for Sphinx documentation 6 | 7 | if "%SPHINXBUILD%" == "" ( 8 | set SPHINXBUILD=sphinx-build 9 | ) 10 | set SOURCEDIR=. 11 | set BUILDDIR=_build 12 | 13 | if "%1" == "" goto help 14 | 15 | %SPHINXBUILD% >NUL 2>NUL 16 | if errorlevel 9009 ( 17 | echo. 18 | echo.The 'sphinx-build' command was not found. Make sure you have Sphinx 19 | echo.installed, then set the SPHINXBUILD environment variable to point 20 | echo.to the full path of the 'sphinx-build' executable. Alternatively you 21 | echo.may add the Sphinx directory to PATH. 22 | echo. 23 | echo.If you don't have Sphinx installed, grab it from 24 | echo.http://sphinx-doc.org/ 25 | exit /b 1 26 | ) 27 | 28 | %SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% 29 | goto end 30 | 31 | :help 32 | %SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% 33 | 34 | :end 35 | popd 36 | -------------------------------------------------------------------------------- /docs/user_guide.rst: -------------------------------------------------------------------------------- 1 | .. title:: User guide: contents 2 | 3 | .. _user_guide: 4 | 5 | ========== 6 | User Guide 7 | ========== 8 | 9 | .. toctree:: 10 | :numbered: 11 | 12 | introduction.rst -------------------------------------------------------------------------------- /meta.yml: -------------------------------------------------------------------------------- 1 | {% set name = "itmo_fs" %} 2 | {% set version = "0.3.3" %} 3 | 4 | package: 5 | name: "{{ name|lower }}" 6 | version: "{{ load_setup_py_data().version }}" 7 | 8 | source: 9 | git_rev: 10 | git_url: 11 | url: https://pypi.io/packages/source/{{ name[0] }}/{{ name }}/{{ name }}-{{ version }}.tar.gz 12 | sha256: 5b94b49521f6456670fdb30cd82a4eca9412788a93fa6dd6df72c94d5a8ff2d7 13 | 14 | build: 15 | number: 0 16 | script: "{{ PYTHON }} -m pip install . 
16 | script: "{{ PYTHON }} -m pip install . --no-deps --ignore-installed -vv "
17 | 
18 | 
19 | requirements:
20 | build:
21 | - pip
22 | - python
23 | - setuptools
24 | 
25 | run:
26 | - python
27 | 
28 | test:
29 | imports:
30 | - itmo_fs
31 | - pandas
32 | - pytest
33 | 
34 | about:
35 | home:
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | imbalanced-learn
2 | numpy~=1.22
3 | scipy~=1.5.2
4 | scikit-learn~=0.23.2
5 | qpsolvers
6 | 
7 | dvc~=1.11.16
8 | pandas~=1.1.3
9 | imblearn~=0.0
10 | setuptools~=50.3.1
--------------------------------------------------------------------------------
/setup.cfg:
--------------------------------------------------------------------------------
1 | [metadata]
2 | description-file = README.rst
3 | 
4 | [aliases]
5 | test = pytest
6 | 
7 | [tool:pytest]
8 | addopts = --doctest-modules
--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
1 | import codecs
2 | 
3 | from setuptools import find_packages, setup
4 | import os
5 | base_dir = os.path.dirname(__file__)
6 | 
7 | about = {}
8 | with open(os.path.join(base_dir, "ITMO_FS", "__about__.py")) as f:
9 | exec(f.read(), about)
10 | 
11 | DISTNAME = 'ITMO_FS'
12 | DESCRIPTION = 'Python Feature Selection library from ITMO University.'
13 | with codecs.open('README.rst') as f:
14 | LONG_DESCRIPTION = f.read()
15 | MAINTAINER = 'N. Pilnenskiy'
16 | MAINTAINER_EMAIL = 'somacruz@bk.ru'
17 | URL = 'https://github.com/ctlab/ITMO_FS'
18 | LICENSE = 'new BSD'
19 | DOWNLOAD_URL = 'https://github.com/ctlab/ITMO_FS'
20 | VERSION = about["__version__"]
21 | INSTALL_REQUIRES = ['numpy', 'scipy', 'scikit-learn', 'imbalanced-learn', 'qpsolvers']
22 | CLASSIFIERS = ['Intended Audience :: Science/Research',
23 | 'Intended Audience :: Developers',
24 | 'License :: OSI Approved',
25 | 'Programming Language :: Python',
26 | 'Topic :: Software Development',
27 | 'Topic :: Scientific/Engineering',
28 | 'Operating System :: Microsoft :: Windows',
29 | 'Operating System :: POSIX',
30 | 'Operating System :: Unix',
31 | 'Operating System :: MacOS',
32 | 'Programming Language :: Python :: 2.7',
33 | 'Programming Language :: Python :: 3.5',
34 | 'Programming Language :: Python :: 3.6',
35 | 'Programming Language :: Python :: 3.7',
36 | 'Programming Language :: Python :: 3.8']
37 | EXTRAS_REQUIRE = {
38 | 'tests': [
39 | 'pytest',
40 | 'pytest-cov'],
41 | 'docs': [
42 | 'sphinx',
43 | 'sphinx-gallery',
44 | 'sphinx_rtd_theme',
45 | 'numpydoc',
46 | 'matplotlib'
47 | ]
48 | }
49 | 
50 | setup(name=DISTNAME,
51 | maintainer=MAINTAINER,
52 | maintainer_email=MAINTAINER_EMAIL,
53 | description=DESCRIPTION,
54 | license=LICENSE,
55 | url=URL,
56 | version=VERSION,
57 | download_url=DOWNLOAD_URL,
58 | long_description=LONG_DESCRIPTION,
59 | zip_safe=False,  # do not install as a zipped .egg file
60 | classifiers=CLASSIFIERS,
61 | packages=find_packages(),
62 | install_requires=INSTALL_REQUIRES,
63 | extras_require=EXTRAS_REQUIRE)
64 | 
--------------------------------------------------------------------------------
/test/Melif_test.py:
--------------------------------------------------------------------------------
1 | import datetime
2 | import unittest
3 | 
4 | import pandas as pd
5 | from sklearn.datasets import make_classification, make_regression
6 | from sklearn.metrics import f1_score
7 | from sklearn.model_selection import train_test_split
8 | from sklearn.svm import SVC 9 | from sklearn.utils.estimator_checks import check_estimator 10 | from ITMO_FS.ensembles import WeightBased 11 | from ITMO_FS.filters import * 12 | from ITMO_FS.hybrid.Melif import Melif 13 | from ITMO_FS.utils import f1_scorer 14 | 15 | 16 | class MyTestCase(unittest.TestCase): 17 | wide_classification = make_classification(n_features=2000, n_informative=100, n_redundant=500) 18 | tall_classification = make_classification(n_samples=50000, n_features=100, n_informative=23, n_redundant=30) 19 | wide_regression = make_regression(n_features=2000, n_informative=100) 20 | tall_regression = make_regression(n_samples=50000, n_features=200, n_informative=50) 21 | filters = [UnivariateFilter(gini_index), 22 | UnivariateFilter(pearson_corr), 23 | UnivariateFilter(spearman_corr)] 24 | 25 | estimator = SVC(random_state=42) 26 | ensemble = WeightBased(filters, cutting_rule=select_k_best(50)) 27 | 28 | melif = Melif(estimator, select_k_best(1500), ensemble, scorer=f1_score, verbose=True) 29 | 30 | 31 | 32 | def test_wide(self): 33 | data, target = self.wide_classification[0], self.wide_classification[1] 34 | 35 | train_data, test_data, train_target, test_target = train_test_split(data, target) 36 | self.melif.fit(train_data, train_target) 37 | 38 | print(f1_score(test_target, self.melif.predict(test_data))) 39 | 40 | def test_wide_pd(self): 41 | data, target = pd.DataFrame(self.wide_classification[0]), pd.DataFrame(self.wide_classification[1]) 42 | train_data, test_data, train_target, test_target = train_test_split(data, target) 43 | self.melif.fit(train_data, train_target) 44 | print(f1_score(test_target, self.melif.predict(test_data))) 45 | 46 | def test_R(self): 47 | data = pd.read_csv('C:\\Users\\SomaC\\PycharmProjects\\machinka\\mlrcheck\\boston_corrected.csv') 48 | target = 'class' 49 | features = data.loc[:, data.columns != 'b'].columns 50 | # data[target]=data[target].apply(lambda x: 0 if x<=0 else 1) 51 | ks = [int(i * 500) for i in [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1]] 52 | print() 53 | for j in ks: 54 | print('|' + str(j) + '|') 55 | start = datetime.datetime.now() 56 | f = UnivariateFilter(pearson_corr, select_k_best(j)) 57 | f.fit(data[features], data[target]) 58 | print('|', datetime.datetime.now() - start, '|') 59 | start = datetime.datetime.now() 60 | f = UnivariateFilter(spearman_corr, select_k_best(j)) 61 | f.fit(data[features], data[target]) 62 | print('|', datetime.datetime.now() - start, '|') 63 | # start = datetime.datetime.now() 64 | # f = UnivariateFilter(chi2_measure, select_k_best(j)) 65 | # f.fit(data[features], data[target]) 66 | # print('|', datetime.datetime.now() - start, '|') 67 | start = datetime.datetime.now() 68 | f = UnivariateFilter(information_gain, select_k_best(j)) 69 | f.fit(data[features], data[target]) 70 | print('|', datetime.datetime.now() - start, '|') 71 | 72 | def test_est(self): 73 | melif = Melif(self.estimator, select_k_best(2), self.ensemble, scorer=f1_scorer) 74 | check_estimator(melif) 75 | 76 | 77 | if __name__ == '__main__': 78 | unittest.main() 79 | -------------------------------------------------------------------------------- /test/datasets/.gitignore: -------------------------------------------------------------------------------- 1 | /gisette.csv 2 | /madelon.csv 3 | /arcene.csv 4 | /dexter.csv 5 | /dorothea.csv 6 | -------------------------------------------------------------------------------- /test/datasets/arcene.csv.dvc: 
-------------------------------------------------------------------------------- 1 | outs: 2 | - md5: 13c55f4366f0bc89c4bc6c4bc826b0bc 3 | size: 2715738 4 | path: arcene.csv 5 | -------------------------------------------------------------------------------- /test/datasets/dexter.csv.dvc: -------------------------------------------------------------------------------- 1 | outs: 2 | - md5: cca2fe50c6a4869948fba36d56f0ef86 3 | size: 24038795 4 | path: dexter.csv 5 | -------------------------------------------------------------------------------- /test/datasets/dorothea.csv.dvc: -------------------------------------------------------------------------------- 1 | outs: 2 | - md5: 9ae080330bbe3e419b415fbbb5761252 3 | size: 320002322 4 | path: dorothea.csv 5 | -------------------------------------------------------------------------------- /test/datasets/gisette.csv.dvc: -------------------------------------------------------------------------------- 1 | outs: 2 | - md5: 2a1efe73ab9eace5947d2d5cb62d16d6 3 | size: 67797724 4 | path: gisette.csv 5 | -------------------------------------------------------------------------------- /test/datasets/madelon.csv.dvc: -------------------------------------------------------------------------------- 1 | outs: 2 | - md5: 44cce2dfb74b4ade59fcf63be7c59610 3 | size: 4004995 4 | path: madelon.csv 5 | -------------------------------------------------------------------------------- /test/embedded_test.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | 3 | import pandas as pd 4 | from sklearn.linear_model import LogisticRegression, SGDClassifier 5 | from sklearn.pipeline import Pipeline 6 | import numpy as np 7 | 8 | from sklearn.utils.estimator_checks import check_estimator 9 | 10 | from ITMO_FS.embedded import * 11 | from ITMO_FS.utils import weight_func 12 | 13 | np.random.seed(42) 14 | 15 | 16 | class TestCases(unittest.TestCase): 17 | data, target = np.random.randint( 18 | 10, size=( 19 | 100, 20)), np.random.randint( 20 | 10, size=( 21 | 100,)) 22 | 23 | def test_MOS_err_loss(self): 24 | with self.assertRaises(KeyError): 25 | MOS(model=SGDClassifier(), weight_func=weight_func, 26 | sampling=True, loss="err").fit(self.data, 27 | self.target) 28 | 29 | def test_MOS_no_sampling(self): 30 | # MOSS 31 | res = MOS( 32 | model=SGDClassifier(), 33 | weight_func=weight_func).fit_transform( 34 | self.data, 35 | self.target) 36 | assert self.data.shape[0] == res.shape[0] 37 | print("MOSS:", self.data.shape, '--->', res.shape) 38 | 39 | def test_MOSS(self): 40 | # MOSS 41 | res = MOS( 42 | model=SGDClassifier(), 43 | weight_func=weight_func, 44 | sampling=True).fit_transform( 45 | self.data, 46 | self.target) 47 | assert self.data.shape[0] == res.shape[0] 48 | print("MOSS:", self.data.shape, '--->', res.shape) 49 | 50 | def test_MOSS_n_naigbours_err(self): 51 | # MOSS 52 | with self.assertRaises(ValueError): 53 | MOS( 54 | model=SGDClassifier(), 55 | weight_func=weight_func, 56 | sampling=True, k_neighbors=1000).fit_transform( 57 | self.data, 58 | self.target) 59 | 60 | def test_MOSS_hinge(self): 61 | # MOSS 62 | res = MOS( 63 | model=SGDClassifier(), 64 | weight_func=weight_func, 65 | sampling=True, loss="hinge").fit_transform( 66 | self.data, 67 | self.target) 68 | assert self.data.shape[0] == res.shape[0] 69 | print("MOSS:", self.data.shape, '--->', res.shape) 70 | 71 | def test_MOSNS(self): 72 | # MOSNS 73 | res = MOS( 74 | model=SGDClassifier(), 75 | weight_func=weight_func, 76 | 
sampling=False).fit_transform( 77 | self.data, 78 | self.target) 79 | assert self.data.shape[0] == res.shape[0] 80 | print("MOSNS:", self.data.shape, '--->', res.shape) 81 | 82 | def test_losses(self): 83 | for loss in ['log', 'hinge']: 84 | res = MOS( 85 | model=SGDClassifier(), 86 | weight_func=weight_func, 87 | loss=loss).fit_transform( 88 | self.data, 89 | self.target) 90 | assert self.data.shape[0] == res.shape[0] 91 | 92 | def test_df(self): 93 | f = MOS(model=SGDClassifier(), weight_func=weight_func, sampling=True) 94 | 95 | df = f.fit_transform( 96 | pd.DataFrame( 97 | self.data), pd.DataFrame( 98 | self.target)) 99 | arr = f.fit_transform(self.data, self.target) 100 | np.testing.assert_array_equal(df, arr) 101 | 102 | f = MOS(model=SGDClassifier(), weight_func=weight_func, sampling=False) 103 | 104 | df = f.fit_transform( 105 | pd.DataFrame( 106 | self.data), pd.DataFrame( 107 | self.target)) 108 | arr = f.fit_transform(self.data, self.target) 109 | np.testing.assert_array_equal(df, arr) 110 | 111 | def test_pipeline(self): 112 | # FS 113 | p = Pipeline( 114 | [('FS1', MOS(model=SGDClassifier(), weight_func=weight_func))]) 115 | p.fit(self.data, self.target) 116 | res = p.transform(self.data) 117 | assert self.data.shape[0] == res.shape[0] 118 | 119 | # FS - estim 120 | p = Pipeline([('FS1', MOS(model=SGDClassifier(), 121 | weight_func=weight_func)), 122 | ('E1', LogisticRegression())]) 123 | p.fit(self.data, self.target) 124 | assert 0 <= p.score(self.data, self.target) <= 1 125 | 126 | # FS - FS 127 | p = Pipeline([('FS1', 128 | MOS(model=SGDClassifier(), 129 | weight_func=weight_func, 130 | loss='log')), 131 | ('FS2', 132 | MOS(model=SGDClassifier(), 133 | weight_func=weight_func, 134 | loss='hinge'))]) 135 | p.fit(self.data, self.target) 136 | res = p.transform(self.data) 137 | assert self.data.shape[0] == res.shape[0] 138 | 139 | # FS - FS - estim 140 | p = Pipeline([('FS1', 141 | MOS(model=SGDClassifier(), weight_func=weight_func, 142 | loss='log')), ('FS2', MOS( 143 | model=SGDClassifier(), weight_func=weight_func, loss='hinge')), 144 | ('E1', LogisticRegression())]) 145 | p.fit(self.data, self.target) 146 | assert 0 <= p.score(self.data, self.target) <= 1 147 | 148 | def test_est(self): 149 | moss = MOS( 150 | model=SGDClassifier(), 151 | weight_func=weight_func, 152 | sampling=True) 153 | mosns = MOS( 154 | model=SGDClassifier(), 155 | weight_func=weight_func, 156 | sampling=False) 157 | 158 | # for some reason using local weight_func or lambda here causes it to fail with pickle errors 159 | # so we're using an imported weight_func 160 | check_estimator(moss) 161 | check_estimator(mosns) 162 | 163 | 164 | if __name__ == "__main__": 165 | unittest.main() 166 | -------------------------------------------------------------------------------- /test/hybrid_test.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | 3 | from sklearn.linear_model import LogisticRegression 4 | from sklearn.utils.estimator_checks import check_estimator 5 | from sklearn.metrics import make_scorer 6 | from ITMO_FS.filters import * 7 | from ITMO_FS.wrappers import BackwardSelection 8 | from ITMO_FS.utils import f1_scorer 9 | from ITMO_FS.hybrid import FilterWrapperHybrid 10 | 11 | 12 | class MyTestCase(unittest.TestCase): 13 | 14 | def test_est(self): 15 | classifier = LogisticRegression(max_iter=1000) 16 | back_selection = BackwardSelection(classifier, 2, make_scorer(f1_scorer)) 17 | fw = FilterWrapperHybrid(UnivariateFilter(spearman_corr, 
cutting_rule=('K best', 2)), back_selection) 18 | check_estimator(fw) 19 | 20 | 21 | if __name__ == '__main__': 22 | unittest.main() 23 | -------------------------------------------------------------------------------- /test/multivariate_filters_test.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | 3 | import pandas as pd 4 | from sklearn.linear_model import LogisticRegression 5 | from sklearn.pipeline import Pipeline 6 | import numpy as np 7 | 8 | from sklearn.utils.estimator_checks import check_estimator 9 | 10 | from ITMO_FS.filters.multivariate import * 11 | from utils import load_dataset 12 | 13 | np.random.seed(42) 14 | 15 | 16 | class TestCases(unittest.TestCase): 17 | data, target = np.random.randint(10, size=(100, 20)), np.random.randint(10, 18 | size=( 19 | 100,)) 20 | madelon = load_dataset("madelon.csv") 21 | 22 | def test_FCBF(self): 23 | # FCBF 24 | res = FCBFDiscreteFilter().fit_transform(self.data, self.target) 25 | assert self.data.shape[0] == res.shape[0] 26 | print("Fast Correlation Based filter:", self.data.shape, '--->', 27 | res.shape) 28 | 29 | def test_DISR(self): 30 | # DISR 31 | res = DISRWithMassive(10).fit_transform(self.data, self.target) 32 | assert self.data.shape[0] == res.shape[0] 33 | print("Double Input Symmetric Relevance:", self.data.shape, '--->', 34 | res.shape) 35 | 36 | def test_JMIM_error(self): 37 | # JMIM 38 | data, target = self.madelon.drop(['target'], axis=1), self.madelon[ 39 | "target"] 40 | with self.assertRaises(ValueError): 41 | JMIM(10000).fit_transform(data, target) 42 | 43 | def test_JMIM(self): 44 | # JMIM 45 | res = JMIM(10).fit_transform(self.data, self.target) 46 | assert self.data.shape[0] == res.shape[0] 47 | print("Joint Mutual Information Maximisation:", self.data.shape, 48 | '--->', res.shape) 49 | 50 | def test_JMIM_normalised(self): 51 | # JMIM 52 | res = JMIM(10, normalized=True).fit_transform(self.data, self.target) 53 | assert self.data.shape[0] == res.shape[0] 54 | print("Joint Mutual Information Maximisation:", self.data.shape, 55 | '--->', res.shape) 56 | 57 | def test_multivariate_interface_error_features(self): 58 | with self.assertRaises(ValueError): 59 | filter = MultivariateFilter("MRMR", 5000000) 60 | filter.fit(self.data, self.target) 61 | 62 | def test_multivariate_interface_error_name(self): 63 | with self.assertRaises(KeyError): 64 | filter = MultivariateFilter("asoikfhlkjfdslfh", 5) 65 | filter.fit(self.data, self.target) 66 | 67 | def test_trace_ratio(self): 68 | # TraceRatioFisher 69 | res = TraceRatioFisher(10).fit_transform(self.data, self.target) 70 | assert self.data.shape[0] == res.shape[0] 71 | print("TraceRatio:", self.data.shape, '--->', res.shape) 72 | 73 | def test_stir(self): 74 | # STIR 75 | res = STIR(10).fit_transform(self.data, self.target) 76 | assert self.data.shape[0] == res.shape[0] 77 | print("Statistical Inference Relief:", self.data.shape, '--->', 78 | res.shape) 79 | 80 | def test_base_multivariate(self): 81 | # Multivariate with callable 82 | f = MultivariateFilter(MIM, 10) 83 | f.fit(self.data, self.target) 84 | res = f.transform(self.data) 85 | assert self.data.shape[0] == res.shape[0] 86 | print("Multivariate with callable:", self.data.shape, '--->', 87 | res.shape) 88 | 89 | # Multivariate with string 90 | f = MultivariateFilter('MRMR', 10) 91 | f.fit(self.data, self.target) 92 | res = f.transform(self.data) 93 | assert self.data.shape[0] == res.shape[0] 94 | print("Multivariate with string:", self.data.shape, '--->', 
res.shape) 95 | 96 | def test_k_best(self): 97 | for i in [5, 10, 20]: 98 | res = DISRWithMassive(i).fit_transform(self.data, self.target) 99 | assert i == res.shape[1] 100 | 101 | for i in [5, 10, 20]: 102 | res = JMIM(i).fit_transform(self.data, self.target) 103 | assert i == res.shape[1] 104 | 105 | for i in [5, 10, 20]: 106 | f = MultivariateFilter(MIM, i) 107 | f.fit(self.data, self.target) 108 | res = f.transform(self.data) 109 | assert i == res.shape[1] 110 | 111 | for i in [5, 10, 20]: 112 | res = TraceRatioFisher(i).fit_transform(self.data, self.target) 113 | assert i == res.shape[1] 114 | 115 | for i in [5, 10, 20]: 116 | res = STIR(i).fit_transform(self.data, self.target) 117 | assert i == res.shape[1] 118 | 119 | def test_measures(self): 120 | # Multivariate 121 | for measure in MEASURE_NAMES: 122 | beta = 0.3 if measure in ['MIFS', 'generalizedCriteria'] else None 123 | gamma = 0.4 if measure == 'generalizedCriteria' else None 124 | f = MultivariateFilter(measure, 10, beta, gamma) 125 | f.fit(self.data, self.target) 126 | res = f.transform(self.data) 127 | assert self.data.shape[0] == res.shape[0] and res.shape[1] == 10 128 | 129 | def test_df(self): 130 | for f in [FCBFDiscreteFilter(), DISRWithMassive(10), JMIM(10), 131 | MultivariateFilter(MIM, 10), \ 132 | TraceRatioFisher(10), STIR(10)]: 133 | df = f.fit_transform(pd.DataFrame(self.data), 134 | pd.DataFrame(self.target)) 135 | arr = f.fit_transform(self.data, self.target) 136 | np.testing.assert_array_equal(df, arr) 137 | 138 | def test_pipeline(self): 139 | # FS 140 | p = Pipeline([('FS1', MultivariateFilter(MIM, 10))]) 141 | p.fit(self.data, self.target) 142 | res = p.transform(self.data) 143 | assert self.data.shape[0] == res.shape[0] and res.shape[1] == 10 144 | 145 | # FS - estim 146 | p = Pipeline([('FS1', FCBFDiscreteFilter()), 147 | ('E1', LogisticRegression(max_iter=10000))]) 148 | p.fit(self.data, self.target) 149 | assert 0 <= p.score(self.data, self.target) <= 1 150 | 151 | # FS - FS 152 | p = Pipeline([('FS1', MultivariateFilter(MIM, 10)), ('FS2', STIR(5))]) 153 | p.fit(self.data, self.target) 154 | res = p.transform(self.data) 155 | assert self.data.shape[0] == res.shape[0] and res.shape[1] == 5 156 | 157 | # FS - FS - estim 158 | p = Pipeline( 159 | [('FS1', TraceRatioFisher(10)), ('FS2', DISRWithMassive(5)), 160 | ('E1', LogisticRegression(max_iter=10000))]) 161 | p.fit(self.data, self.target) 162 | assert 0 <= p.score(self.data, self.target) <= 1 163 | 164 | def test_est(self): 165 | for f in [ 166 | FCBFDiscreteFilter(), 167 | DISRWithMassive(2), 168 | MultivariateFilter(MIM, 2), 169 | TraceRatioFisher(2), 170 | STIR(2)]: 171 | check_estimator(f) 172 | 173 | 174 | if __name__ == "__main__": 175 | unittest.main() 176 | -------------------------------------------------------------------------------- /test/univariate_measures_test.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | from ITMO_FS.filters.univariate.measures import * 3 | from utils import load_dataset 4 | 5 | 6 | class UnivariateMeasuresTest(unittest.TestCase): 7 | madelon = load_dataset("madelon.csv") 8 | 9 | def test_measures(self): 10 | data = self.madelon.drop(['target'], axis=1).values 11 | for f, answer in zip( 12 | [su_measure, 13 | laplacian_score], 14 | [1, 15 | 1, 16 | 1]): 17 | np.testing.assert_allclose( 18 | f(data[0].reshape((-1, 1)), data[0]), 19 | answer, atol=1e-05) 20 | 21 | def test_information_gain(self): 22 | data = self.madelon.drop(['target'], axis=1).values 23 | 
np.testing.assert_allclose(
24 | information_gain(data[:, 0].reshape((-1, 1)), data[:, 0]),
25 | 0, atol=1e-05)
26 | 
27 | def test_pearson_correlation(self):
28 | data = self.madelon.drop(['target'], axis=1).values
29 | np.testing.assert_allclose(
30 | pearson_corr(data[:, 0].reshape((-1, 1)), data[:, 0]),
31 | 1, atol=1e-05)
32 | 
33 | def test_pearson_correlation_1d(self):
34 | data = self.madelon.drop(['target'], axis=1).values
35 | np.testing.assert_allclose(
36 | pearson_corr(data[0], data[0]),
37 | 1, atol=1e-05)
38 | 
39 | def test_spearman_measure(self):
40 | data = self.madelon.drop(['target'], axis=1).values
41 | np.testing.assert_allclose(
42 | spearman_corr(data[:, 0].reshape((-1, 1)), data[:, 0]),
43 | 1, atol=1e-05)
44 | 
45 | def test_spearman_measure_1d(self):
46 | data = self.madelon.drop(['target'], axis=1).values
47 | np.testing.assert_allclose(
48 | spearman_corr(data[:, 0], data[:, 0]),
49 | 1, atol=1e-05)
50 | 
51 | def test_spearman_measure_error(self):
52 | with self.assertRaises(ValueError):
53 | spearman_corr(np.array([-1]), [-1])
54 | 
55 | def test_chi2_measure(self):
56 | data = self.madelon.drop(['target'], axis=1).values
57 | np.testing.assert_allclose(
58 | chi2_measure(data[:, 0].reshape((-1, 1)), data[:, 0]),
59 | 1, atol=1e-05)
60 | 
61 | def test_chi2_measure_error(self):
62 | with self.assertRaises(ValueError):
63 | chi2_measure(np.array([-1]), [-1])
64 | 
65 | def test_gini_index(self):
66 | data = self.madelon.drop(['target'], axis=1).values
67 | gini_index(data[0].reshape((-1, 1)), data[0])  # smoke check: the measure computes without error
68 | with self.assertRaises(ValueError):
69 | gini_index(data[0, :1], data[0])
70 | 
71 | def test_relief_error(self):
72 | data, target = (self.madelon.drop(['target'], axis=1).values,
73 | self.madelon["target"].values)
74 | with self.assertRaises(ValueError):
75 | relief_measure(data, target[target > 0])
76 | 
77 | def test_relief(self):
78 | data, target = (self.madelon.drop(['target'], axis=1).values,
79 | self.madelon["target"].values)
80 | relief_measure(data, target)
81 | 
82 | def test_reliefF_measure(self):
83 | data, target = (self.madelon.drop(['target'], axis=1).values,
84 | self.madelon["target"].values)
85 | reliefF_measure(data, target)
86 | 
87 | def test_laplacian_score(self):
88 | data, target = (self.madelon.drop(['target'], axis=1).values,
89 | self.madelon["target"].values)
90 | laplacian_score(data, target)
91 | 
92 | def test_cutting_rules(self):
93 | data = dict(
94 | zip(['f1', 'f2', 'f3', 'f4', 'f5', 'f6', 'f7', 'f8', 'f9', 'f10'],
95 | [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]))
96 | self.assertEqual(set(select_k_best(5)(data)), {'f6', 'f7', 'f8', 'f9', 'f10'})  # order-agnostic comparison
97 | self.assertEqual(set(select_k_worst(5)(data)), {'f1', 'f2', 'f3', 'f4', 'f5'})
98 | 
99 | with self.assertRaises(TypeError):
100 | select_k_best(0.5)(data)
101 | 
102 | with self.assertRaises(ValueError):
103 | select_k_best(100)(data)
104 | 
105 | self.assertEqual(set(select_best_by_value(5)(data)), {'f6', 'f7', 'f8', 'f9', 'f10'})
106 | self.assertEqual(set(select_worst_by_value(5)(data)), {'f1', 'f2', 'f3', 'f4', 'f5'})
107 | 
108 | self.assertEqual(set(select_best_percentage(0.5)(data)),
109 | {'f6', 'f7', 'f8', 'f9', 'f10'})
110 | self.assertEqual(set(select_worst_percentage(0.5)(data)),
111 | {'f1', 'f2', 'f3', 'f4', 'f5'})
112 | 
113 | def test_fit_criterion(self):
114 | data, target = (self.madelon.drop(['target'], axis=1).values,
115 | self.madelon["target"].values)
116 | fit_criterion_measure(data, target)
117 | 
118 | def test_anova(self):
119 | data, target = (self.madelon.drop(['target'], axis=1).values,
120 | self.madelon["target"].values)
121 | anova(data, target)
122 | 
123 | def test_modified_t_score(self): 124 | data, target = (self.madelon.drop(['target'], axis=1).values, 125 | self.madelon["target"].values) 126 | modified_t_score(data, target) 127 | 128 | def test_f_ratio(self): 129 | data, target = (self.madelon.drop(['target'], axis=1).values, 130 | self.madelon["target"].values) 131 | f_ratio_measure(data, target) 132 | 133 | def test_kendall(self): 134 | data, target = (self.madelon.drop(['target'], axis=1).values, 135 | self.madelon["target"].values) 136 | kendall_corr(data[:, 0], target) 137 | kendall_corr(data, target) 138 | 139 | def test_fechner(self): 140 | data, target = (self.madelon.drop(['target'], axis=1).values, 141 | self.madelon["target"].values) 142 | fechner_corr(data[:, 0], target) 143 | fechner_corr(data, target) 144 | 145 | 146 | if __name__ == '__main__': 147 | unittest.main() 148 | -------------------------------------------------------------------------------- /test/unsupervised_filters_test.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | 3 | import pandas as pd 4 | from sklearn.linear_model import LogisticRegression 5 | from sklearn.pipeline import Pipeline 6 | import numpy as np 7 | 8 | from sklearn.utils.estimator_checks import check_estimator 9 | 10 | from ITMO_FS.filters.unsupervised import * 11 | from ITMO_FS.filters.univariate import * 12 | 13 | np.random.seed(42) 14 | 15 | 16 | class TestCases(unittest.TestCase): # TODO: add TraceRatioLaplacian tests and tests without target 17 | data, target = np.random.randint(10, size=(100, 20)), np.random.randint(10, size=(100,)) 18 | 19 | def test_MCFS(self): 20 | # MCFS 21 | res = MCFS(10).fit_transform(self.data, self.target) 22 | assert self.data.shape[0] == res.shape[0] 23 | print("MCFS:", self.data.shape, '--->', res.shape) 24 | 25 | def test_UDFS(self): 26 | # UDFS 27 | res = UDFS(10).fit_transform(self.data, self.target) 28 | assert self.data.shape[0] == res.shape[0] 29 | print("UDFS:", self.data.shape, '--->', res.shape) 30 | 31 | def test_df(self): 32 | for f in [MCFS(10), UDFS(10)]: 33 | df = f.fit_transform(pd.DataFrame(self.data), pd.DataFrame(self.target)) 34 | arr = f.fit_transform(self.data, self.target) 35 | np.testing.assert_array_equal(df, arr) 36 | 37 | def test_pipeline(self): 38 | # FS 39 | p = Pipeline([('FS1', MCFS(10))]) 40 | p.fit(self.data, self.target) 41 | res = p.transform(self.data) 42 | assert self.data.shape[0] == res.shape[0] and res.shape[1] == 10 43 | 44 | # FS - estim 45 | p = Pipeline([('FS1', UDFS(10)), ('E1', LogisticRegression())]) 46 | p.fit(self.data, self.target) 47 | assert 0 <= p.score(self.data, self.target) <= 1 48 | 49 | # FS - FS 50 | p = Pipeline([('FS1', MCFS(10)), ('FS2', UDFS(5))]) 51 | p.fit(self.data, self.target) 52 | res = p.transform(self.data) 53 | assert self.data.shape[0] == res.shape[0] and res.shape[1] == 5 54 | 55 | # FS - FS - estim 56 | p = Pipeline([('FS1', UDFS(10)), ('FS2', MCFS(5)), ('E1', LogisticRegression())]) 57 | p.fit(self.data, self.target) 58 | assert 0 <= p.score(self.data, self.target) <= 1 59 | 60 | def test_est(self): 61 | for f in [MCFS(2), UDFS(2)]: 62 | check_estimator(f) 63 | 64 | 65 | if __name__ == "__main__": 66 | unittest.main() 67 | -------------------------------------------------------------------------------- /test/utils.py: -------------------------------------------------------------------------------- 1 | import dvc.api 2 | import pandas as pd 3 | 4 | datasets = ["arcene.csv", 5 | "dexter.csv", 6 | "dorothea.csv", 7 | 
"gisette.csv", 8 | "madelon.csv"] 9 | 10 | 11 | def load_dataset(name): # todo fails to hold header 12 | with dvc.api.open( 13 | 'test/datasets/' + name) as fd: 14 | df = pd.read_csv(fd, header=None) 15 | features = ['v' + str(i) for i in range(df.shape[1] - 1)] + ["target"] 16 | df.columns = features 17 | return df 18 | 19 | 20 | def load_datasets(): 21 | data = [] 22 | for d in datasets: 23 | data.append(load_dataset(d)) 24 | return data 25 | -------------------------------------------------------------------------------- /test/wrapper_test.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | 3 | import pandas as pd 4 | from math import sqrt 5 | from scipy import stats 6 | from sklearn.datasets import load_iris 7 | from sklearn.datasets import make_classification, make_regression 8 | from sklearn.linear_model import LinearRegression, LogisticRegression 9 | from sklearn.metrics import f1_score 10 | from sklearn.model_selection import cross_val_score, KFold 11 | from sklearn.neighbors import KNeighborsClassifier 12 | from sklearn.svm import SVC 13 | from sklearn.utils.estimator_checks import check_estimator 14 | from sklearn.metrics import make_scorer 15 | 16 | from ITMO_FS import RecursiveElimination, BackwardSelection, AddDelWrapper, SequentialForwardSelection, \ 17 | HillClimbingWrapper, SimulatedAnnealing, TPhMGWO 18 | from ITMO_FS.utils.information_theory import * 19 | from ITMO_FS.utils import test_scorer 20 | 21 | np.random.seed(42) 22 | 23 | 24 | class TestCases(unittest.TestCase): 25 | wide_classification = make_classification(n_features=2000, n_informative=100, n_redundant=500) 26 | tall_classification = make_classification(n_samples=50000, n_features=100, n_informative=23, n_redundant=30) 27 | wide_regression = make_regression(n_features=2000, n_informative=100) 28 | tall_regression = make_regression(n_samples=50000, n_features=200, n_informative=50) 29 | 30 | # def test_rec_elim(self): 31 | # classifier = LogisticRegression(max_iter=1000) 32 | # rec_elimination = RecursiveElimination(classifier, 10, 'f1') 33 | # X, y = self.wide_classification 34 | # 35 | # default_score = cross_val_score(classifier, X, y, cv=5, scoring='f1').mean() 36 | # 37 | # rec_elimination.fit(X, y) 38 | # features = rec_elimination.selected_features_ 39 | # assert len(features) == 10 40 | # 41 | # wrapper_score = cross_val_score(classifier, X[:, features], y, cv=5, scoring='f1').mean() 42 | # 43 | # assert default_score < wrapper_score 44 | 45 | # def test_back_sel(self): 46 | # classifier = LogisticRegression(max_iter=1000) 47 | # back_selection = BackwardSelection(classifier, 10, 'f1') 48 | # X, y = self.wide_classification 49 | # 50 | # print('start calculating the default score') 51 | # default_score = cross_val_score(classifier, X, y, cv=5, scoring='f1').mean() 52 | # print('finish calculating the default score') 53 | # 54 | # print('start backward selection') 55 | # # TODO backward selection works for too long 56 | # back_selection.fit(X, y) 57 | # print('finish backward selection') 58 | # 59 | # features = back_selection.selected_features_ 60 | # assert len(features) == 10 61 | # 62 | # wrapper_score = cross_val_score(classifier, X[:, features], y, cv=5, scoring='f1').mean() 63 | # 64 | # assert default_score < wrapper_score 65 | 66 | # def test_add_del_wrapper(self): 67 | # classifier = LogisticRegression(max_iter=1000) 68 | # add_del_wrapper = AddDelWrapper(classifier, f1_score) 69 | # X, y = self.wide_classification 70 | # 71 | # 
default_score = cross_val_score(classifier, X, y, cv=5, scoring='f1').mean() 72 | # 73 | # add_del_wrapper.fit(X, y) 74 | # features = add_del_wrapper.selected_features_ 75 | # 76 | # wrapper_score = cross_val_score(classifier, X[:, features], y, cv=5, scoring='f1').mean() 77 | # 78 | # assert default_score < wrapper_score 79 | # 80 | # def test_seq_forw_sel(self): 81 | # classifier = LogisticRegression(max_iter=1000) 82 | # seq_forw_sel = SequentialForwardSelection(classifier, 10, 'f1') 83 | # X, y = self.wide_classification 84 | # 85 | # default_score = cross_val_score(classifier, X, y, cv=5, scoring='f1').mean() 86 | # 87 | # seq_forw_sel.fit(X, y) 88 | # features = seq_forw_sel.selected_features_ 89 | # assert len(features) == 10 90 | # 91 | # wrapper_score = cross_val_score(classifier, X[:, features], y, cv=5, scoring='f1').mean() 92 | # 93 | # assert default_score < wrapper_score 94 | 95 | # def test_qpfs_wrapper(self): 96 | # classifier = LogisticRegression(max_iter=1000) 97 | # seq_forw_sel = SequentialForwardSelection(LogisticRegression(), 10, 'f1') 98 | # X, y = self.wide_classification 99 | # 100 | # default_score = cross_val_score(classifier, X, y, cv=5, scoring='f1') 101 | # 102 | # seq_forw_sel.fit(X, y) 103 | # features = seq_forw_sel.selected_features 104 | # assert len(features) == 10 105 | # 106 | # wrapper_score = cross_val_score(classifier, X[:, features], y, cv=5, scoring='f1') 107 | # 108 | # assert all(default_score < wrapper_score) 109 | 110 | # def test_hill_climbing(self): 111 | # classifier = LogisticRegression(max_iter=1000) 112 | # hill_climbing = HillClimbingWrapper(classifier, f1_score) 113 | # X, y = self.wide_classification 114 | # 115 | # default_score = cross_val_score(classifier, X, y, cv=5, scoring='f1').mean() 116 | # 117 | # hill_climbing.fit(X, y) 118 | # features = hill_climbing.selected_features_ 119 | # # assert len(features) == 10 120 | # 121 | # wrapper_score = cross_val_score(classifier, X[:, features], y, cv=5, scoring='f1').mean() 122 | # 123 | # assert default_score < wrapper_score 124 | # 125 | # def test_sim_annealing(self): 126 | # classifier = LogisticRegression(max_iter=1000) 127 | # sim_annealing = SimulatedAnnealing(classifier, f1_score) 128 | # X, y = self.wide_classification 129 | # 130 | # sim_annealing.fit(X, y) 131 | # default_score = cross_val_score(classifier, X, y, cv=5, scoring='f1').mean() 132 | # 133 | # features = sim_annealing.selected_features_ 134 | # # assert len(features) == 10 135 | # 136 | # wrapper_score = cross_val_score(classifier, X[:, features], y, cv=5, scoring='f1').mean() 137 | # 138 | # assert default_score < wrapper_score 139 | # 140 | # def test_wolves(self): 141 | # classifier = LogisticRegression(max_iter=1000) 142 | # tphmgwo = TPhMGWO() 143 | # X, y = self.wide_classification 144 | # 145 | # default_score = cross_val_score(classifier, X, y, cv=5, scoring='f1').mean() 146 | # 147 | # tphmgwo.run(X, y) 148 | # features = tphmgwo.selected_features_ 149 | # # assert len(features) == 10 150 | # 151 | # wrapper_score = cross_val_score(classifier, X[:, features], y, cv=5, scoring='f1').mean() 152 | # 153 | # assert default_score < wrapper_score 154 | # 155 | # def test_est(self): 156 | # classifier = LogisticRegression(max_iter=1000) 157 | # for f in [RecursiveElimination(classifier, 2, make_scorer(test_scorer)), BackwardSelection(classifier, 2, make_scorer(test_scorer)), 158 | # AddDelWrapper(classifier, test_scorer), SequentialForwardSelection(classifier, 2, make_scorer(test_scorer)), 159 | # 
HillClimbingWrapper(classifier, test_scorer), SimulatedAnnealing(classifier, test_scorer), TPhMGWO()]: 160 | # check_estimator(f) 161 | 162 | if __name__ == "__main__": 163 | unittest.main() 164 | --------------------------------------------------------------------------------