├── .github ├── FUNDING.yml ├── dependabot.yml └── workflows │ └── ruff.yml ├── .gitignore ├── .readthedocs.yml ├── .vs ├── VSWorkspaceState.json ├── allrelevantfs │ └── v17 │ │ └── .suo └── slnx.sqlite ├── CHANGELOG.md ├── CITATION.cff ├── LICENSE.md ├── README.md ├── docs ├── Introduction.rst ├── Makefile ├── Methods overview.rst ├── arfs.feature_selection.rst ├── arfs.rst ├── boostaroota.png ├── boruta.png ├── conf.py ├── grootcv.png ├── index.rst ├── logo.png ├── make.bat ├── modules.rst ├── notebooks │ ├── arfs_boruta_borutaShap_comparison.ipynb │ ├── arfs_classification.ipynb │ ├── arfs_grootcv_custom_params.ipynb │ ├── arfs_large_data_sampling.ipynb │ ├── arfs_non_normal_loss_and_sample_weight.ipynb │ ├── arfs_on_GPU.ipynb │ ├── arfs_regression.ipynb │ ├── arfs_shap_vs_fastshap.ipynb │ ├── arfs_timeseries.ipynb │ ├── association_and_feature_selection.ipynb │ ├── basic_feature_selection.ipynb │ ├── bender_hex_mini.png │ ├── issue_categoricals.ipynb │ ├── issue_collinearity.ipynb │ ├── lasso_feature_selection.ipynb │ ├── mrmr_feature_selection.ipynb │ ├── mrmr_fs_VS_arfs.ipynb │ └── preprocessing.ipynb └── requirements.txt ├── images ├── boostagroota-boston-lgb.png ├── grootcv-boston.png ├── leshy-boston.png ├── leshy-titanic-catboost-shap.png ├── leshy-titanic-lgbm-shap.png └── leshy-titanic-rndforest-shap.png ├── logo.png ├── pyproject.toml ├── src └── arfs │ ├── .gitignore │ ├── __init__.py │ ├── association.py │ ├── benchmark.py │ ├── dataset │ ├── data │ │ ├── boston_bunch.joblib │ │ └── housing.zip │ └── descr │ │ └── housing.rst │ ├── feature_selection │ ├── __init__.py │ ├── allrelevant.py │ ├── base.py │ ├── lasso.py │ ├── mrmr.py │ ├── summary.py │ ├── unsupervised.py │ └── variable_importance.py │ ├── gbm.py │ ├── parallel.py │ ├── preprocessing.py │ ├── sampling.py │ └── utils.py ├── tests ├── __init__.py ├── test_allrelevant.py └── test_featselect.py └── uv.lock /.github/FUNDING.yml: -------------------------------------------------------------------------------- 1 | # These are supported funding model platforms 2 | 3 | github: # Replace with up to 4 GitHub Sponsors-enabled usernames e.g., [user1, user2] 4 | patreon: # Replace with a single Patreon username 5 | open_collective: # Replace with a single Open Collective username 6 | ko_fi: V7V72SOHX 7 | tidelift: # Replace with a single Tidelift platform-name/package-name e.g., npm/babel 8 | community_bridge: # Replace with a single Community Bridge project-name e.g., cloud-foundry 9 | liberapay: # Replace with a single Liberapay username 10 | issuehunt: # Replace with a single IssueHunt username 11 | otechie: # Replace with a single Otechie username 12 | custom: # Replace with up to 4 custom sponsorship URLs e.g., ['link1', 'link2'] 13 | -------------------------------------------------------------------------------- /.github/dependabot.yml: -------------------------------------------------------------------------------- 1 | # To get started with Dependabot version updates, you'll need to specify which 2 | # package ecosystems to update and where the package manifests are located. 
3 | # Please see the documentation for all configuration options: 4 | # https://help.github.com/github/administering-a-repository/configuration-options-for-dependency-updates 5 | 6 | version: 2 7 | updates: 8 | - package-ecosystem: "pip" 9 | directory: "/" 10 | schedule: 11 | interval: "weekly" 12 | allow: 13 | # Allow only direct updates for 14 | # Django and any packages starting "django" 15 | - dependency-name: "django*" 16 | dependency-type: "direct" 17 | # Allow only production updates for Sphinx 18 | - dependency-name: "sphinx" 19 | dependency-type: "production" 20 | -------------------------------------------------------------------------------- /.github/workflows/ruff.yml: -------------------------------------------------------------------------------- 1 | name: Ruff 2 | on: [ push, pull_request ] 3 | jobs: 4 | ruff: 5 | runs-on: ubuntu-latest 6 | steps: 7 | - uses: actions/checkout@v4 8 | - uses: astral-sh/ruff-action@v3 -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | ### Python template 2 | # example NB 3 | examples/catboost_info/ 4 | examples/.ipynb_checkpoints/ 5 | examples/cb_model.json 6 | 7 | # Byte-compiled / optimized / DLL files 8 | __pycache__/ 9 | *.py[cod] 10 | *$py.class 11 | 12 | # C extensions 13 | *.so 14 | 15 | # ide 16 | .idea/ 17 | .vscode/ 18 | 19 | # Distribution / packaging 20 | .Python 21 | build/ 22 | develop-eggs/ 23 | dist/ 24 | downloads/ 25 | eggs/ 26 | .eggs/ 27 | lib/ 28 | lib64/ 29 | parts/ 30 | sdist/ 31 | var/ 32 | wheels/ 33 | *.egg-info/ 34 | .installed.cfg 35 | *.egg 36 | MANIFEST 37 | 38 | # PyInstaller 39 | # Usually these files are written by a python script from a template 40 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
41 | *.manifest 42 | *.spec 43 | 44 | # Installer logs 45 | pip-log.txt 46 | pip-delete-this-directory.txt 47 | 48 | # Unit test / coverage reports 49 | htmlcov/ 50 | .tox/ 51 | .coverage 52 | .coverage.* 53 | .cache 54 | nosetests.xml 55 | coverage.xml 56 | *.cover 57 | .hypothesis/ 58 | .pytest_cache/ 59 | 60 | # Translations 61 | *.mo 62 | *.pot 63 | 64 | # Django stuff: 65 | *.log 66 | local_settings.py 67 | db.sqlite3 68 | 69 | # Flask stuff: 70 | instance/ 71 | .webassets-cache 72 | 73 | # Scrapy stuff: 74 | .scrapy 75 | 76 | # Sphinx documentation 77 | docs/_build/ 78 | 79 | # PyBuilder 80 | target/ 81 | 82 | # Jupyter Notebook 83 | .ipynb_checkpoints 84 | 85 | # pyenv 86 | .python-version 87 | 88 | # celery beat schedule file 89 | celerybeat-schedule 90 | 91 | # SageMath parsed files 92 | *.sage.py 93 | 94 | # Environments 95 | .env 96 | .venv 97 | env/ 98 | venv/ 99 | ENV/ 100 | env.bak/ 101 | venv.bak/ 102 | 103 | # Spyder project settings 104 | .spyderproject 105 | .spyproject 106 | 107 | # Rope project settings 108 | .ropeproject 109 | 110 | # mkdocs documentation 111 | /site 112 | 113 | # mypy 114 | .mypy_cache/ 115 | 116 | # Covers JetBrains IDEs: IntelliJ, RubyMine, PhpStorm, AppCode, PyCharm, CLion, Android Studio and WebStorm 117 | # Reference: https://intellij-support.jetbrains.com/hc/en-us/articles/206544839 118 | 119 | # User-specific stuff 120 | .idea/**/workspace.xml 121 | .idea/**/tasks.xml 122 | .idea/**/dictionaries 123 | .idea/**/shelf 124 | 125 | # Sensitive or high-churn files 126 | .idea/**/dataSources/ 127 | .idea/**/dataSources.ids 128 | .idea/**/dataSources.local.xml 129 | .idea/**/sqlDataSources.xml 130 | .idea/**/dynamic.xml 131 | .idea/**/uiDesigner.xml 132 | .idea/**/dbnavigator.xml 133 | 134 | # Gradle 135 | .idea/**/gradle.xml 136 | .idea/**/libraries 137 | 138 | # CMake 139 | cmake-build-debug/ 140 | cmake-build-release/ 141 | 142 | # Mongo Explorer plugin 143 | .idea/**/mongoSettings.xml 144 | 145 | # File-based project format 146 | *.iws 147 | 148 | # IntelliJ 149 | out/ 150 | 151 | # mpeltonen/sbt-idea plugin 152 | .idea_modules/ 153 | 154 | # JIRA plugin 155 | atlassian-ide-plugin.xml 156 | 157 | # Cursive Clojure plugin 158 | .idea/replstate.xml 159 | 160 | # Crashlytics plugin (for Android Studio and IntelliJ) 161 | com_crashlytics_export_strings.xml 162 | crashlytics.properties 163 | crashlytics-build.properties 164 | fabric.properties 165 | 166 | # Editor-based Rest Client 167 | .idea/httpRequests 168 | -------------------------------------------------------------------------------- /.readthedocs.yml: -------------------------------------------------------------------------------- 1 | # Read the Docs configuration file for Sphinx projects 2 | # See https://docs.readthedocs.io/en/stable/config-file/v2.html for details 3 | 4 | # Required 5 | version: 2 6 | 7 | # Set the OS, Python version and other tools you might need 8 | build: 9 | os: "ubuntu-22.04" 10 | tools: 11 | python: "3.10" 12 | jobs: 13 | post_install: 14 | - pip uninstall -y sphinx-rtd-theme 15 | 16 | # Build documentation in the "docs/" directory with Sphinx 17 | sphinx: 18 | configuration: docs/conf.py 19 | 20 | # We recommend specifying your dependencies to enable reproducible builds: 21 | # https://docs.readthedocs.io/en/stable/guides/reproducible-builds.html 22 | python: 23 | install: 24 | - requirements: docs/requirements.txt 25 | - method: pip 26 | path: . 
27 | extra_requirements: 28 | - docs 29 | -------------------------------------------------------------------------------- /.vs/VSWorkspaceState.json: -------------------------------------------------------------------------------- 1 | { 2 | "ExpandedNodes": [ 3 | "", 4 | "\\arfs" 5 | ], 6 | "SelectedNode": "\\arfs\\allrelevant.py", 7 | "PreviewInSolutionExplorer": false 8 | } -------------------------------------------------------------------------------- /.vs/allrelevantfs/v17/.suo: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ThomasBury/arfs/03f67d0a54b69fac5ddbb83e306c8e8e72e2d3a2/.vs/allrelevantfs/v17/.suo -------------------------------------------------------------------------------- /.vs/slnx.sqlite: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ThomasBury/arfs/03f67d0a54b69fac5ddbb83e306c8e8e72e2d3a2/.vs/slnx.sqlite -------------------------------------------------------------------------------- /CHANGELOG.md: -------------------------------------------------------------------------------- 1 | # Changes 2 | 3 | # 3.0.0 4 | 5 | - [ENHANCEMENT] Upgrade to newer SHAP and lightgbm version 6 | - [ENHANCEMENT] Migrate project management to `uv` 7 | 8 | # 2.4.0 9 | - [BUG] Add a safety for the array size in the weighted correlation ratio 10 | - [DOC] Contribution for better documentation, typos and fixing docstrings 11 | 12 | # 2.3.3 13 | - [BUG] Fix range, which should run from 1 to `max_iter` 14 | 15 | # 2.3.2 16 | - [BUG] Fix errors generated when updating dependencies with different naming for arguments 17 | 18 | # 2.3.1 19 | - [BUG] replace np.Inf by np.inf for compatibility purpose 20 | 21 | # 2.3.0 22 | - [BUG] corrected the column names for the GrootCV scheme, setting the shadow var in last position to guarantee the real names are used 23 | - [ENHANCEMENT] support user defined cross-validation scheme for time series applications for GrootCV 24 | 25 | # 2.2.6 26 | - [BUG] fix the calculation of the SHAP feature importance for multi-class 27 | - [ENHANCEMENT] Update pandas aggregation to get rid of the future deprecation warnings 28 | 29 | # 2.2.5 30 | - [BUG] fix the calculation of the SHAP feature importance for multi-class 31 | - [ENHANCEMENT] return the feature for the importance 32 | 33 | # 2.2.4 34 | - [BUG] add axis=1 to compute the max on the right dimension in _reduce_vars_sklearn 35 | - [BUG] remove merge causing duplication of the feature importance in _reduce_vars_sklearn 36 | 37 | # 2.2.3 38 | - [BUG] change the default of the weighted correlation for consistency with existing doc 39 | - [ENHANCEMENTS] speedup the correlation feature selector 40 | # 2.2.1 41 | - [BUG] add copy() to prevent modifying the input pandas DF in the mrmr when fitting the mrmr selector 42 | # 2.2.0 43 | - [BUG] fix the collinearity feature elimination 44 | - [BUG] fix the feature importance if fasttreeshap not installed 45 | - [REFACTORING] refactor the association module for removing redundancy and faster computation 46 | # 2.1.3 47 | - [BUG] fix the hardcoded threshold in collinearity elimination, closes #33 48 | # 2.1.2 49 | - [BUG] fix a bug in computing the association matrix when a single column of a specific dtype is passed in the sub_matrix (nom-nom, num-num) calculators. 
50 | # 2.1.1 51 | - Refactor TreeDiscretizer 52 | # 2.1.0 53 | - Add a mechanism to the TreeDiscretizer that restricts the length of combined strings for categorical columns, preventing excessively lengthy entries. 54 | # 2.0.7 55 | - implement link for the lasso feature selection, e.g. log for ensuring positivity 56 | # 2.0.6 57 | - downgrade the lightgbm version to 3.3.1 for compatibility reasons (with optuna for instance) 58 | ## 2.0.5 59 | - Fix: strictly greater than threshold rather than geq in the base threshold transformer 60 | - Update: due to a change in the lightgbm train API (v4), update the code for GBM 61 | ## 2.0.4 62 | - Documentation: fix the format of some docstrings and remove old sphinx generated files 63 | ## 2.0.3 64 | - Fix: remove unnecessary `__all__` in the preprocessing module and improve the consistency of the module docstrings 65 | ## 2.0.2 66 | - Fix: when the L1 == 0 in fit_regularized, statsmodels returns the regularized wrapper without refit, which breaks the class (statistics not available) 67 | ## 2.0.1 68 | - Build: remove explicit dependencies on holoviews and panel 69 | ## 2.0.0 70 | - Add fasttreeshap implementation as an option to compute shap importance (fasttreeshap does not work with XGBoost though) 71 | - New feature: lasso feature selection, especially useful for models without interactions (LM, GLM, GAM) 72 | - New feature: pass lightgbm parameters to GrootCV 73 | - Bug: fix sample weight shape in mrMR 74 | - Documentation: update and upgrade tuto NB 75 | ## 1.1.4 76 | - update the required python version >= 3.9 77 | ## 1.1.3 78 | - Change tqdm to auto for better rendering in NB for variable importance selector 79 | - User defined n_jobs for association matrix computation 80 | ## 1.1 81 | 82 | - Corrected an issue in Leshy that occurred when using categorical variables. The use of NumPy functions and methods instead of Pandas ones resulted in the modification of original data types. 83 | 84 | ## 1.0.7 85 | 86 | - Patch preventing zero division in the conditional entropy calculation 87 | 88 | ## 1.0.6 89 | 90 | - Return self in mrmr, fixing error when in scikit-learn pipeline 91 | 92 | ## 1.0.5 93 | 94 | - Patching classes where old unused argument was causing an error 95 | 96 | ## 1.0.2 97 | 98 | - Distribute a toy dataset for regression by modifying the Boston dataset adding noise and made up columns 99 | 100 | ## 1.0.1 101 | 102 | - Fix pkg data distribution 103 | 104 | ## 1.0.0 105 | 106 | - Parallelization of functions applied on pandas data frame 107 | - Faster and more modular association measures 108 | - Removing dependencies (e.g. 
dython) 109 | - Better static and interactive visualization 110 | - Sklearn selectors rather than a big class 111 | - Discretization of continuous and categorical predictors 112 | - Minimal redundancy maximal relevance feature selection added (a subset of all relevant predictors), based on Uber's MRmr flavor 113 | - architecture closer to the scikit-learn one 114 | 115 | ## 0.3.8 116 | 117 | - Fix bug when compute shap importance for classifier in GrootCV 118 | 119 | ## 0.3.7 120 | 121 | - Add defensive check if no categorical found in the subsampling of the dataset 122 | - Re-run the notebooks with the new version 123 | ## 0.3.6 124 | 125 | - Fix clustering when plotting only strongly correlated predictors 126 | - Remove palettable dependencies for plotting 127 | - Add default colormap but implement the user defined option 128 | ## 0.3.5 129 | 130 | - Enable clustering before plotting the correlation/association matrix, optional 131 | - Decrease fontsize for the lables of the correlation matrix 132 | 133 | ## 0.3.4 134 | 135 | - Update requirements 136 | 137 | ## 0.3.3 138 | 139 | - Upgrade documentation 140 | 141 | ## 0.3.2 142 | 143 | - Fix typo for distributing the dataset and pinned the dependencies 144 | ## 0.3.1 145 | 146 | - Update the syntax for computing associations using the latest version of dython 147 | 148 | ## 0.3.0 149 | 150 | - Fix the Boruta_py feature counts, now adds up to n_features 151 | - Fix the boxplot colours, when only rejected and accepted (no tentative) the background color was the tentative color 152 | - Numpy docstring style 153 | - Implement the new lightGBM callbacks. The new lgbm version (>3.3.0) implements the early stopping using a callback rather than an argument 154 | - Fix a bug for computing the shap importance when the estimator is lightGBM and the task is classification 155 | - Add ranking and absolute ranking attributes for all the classes 156 | - Fix future pandas TypeError when computing numerical values on a dataframe containing non-numerical columns 157 | - Add housing data to the distribution 158 | - Add "extreme" sampling methods 159 | - Re-run the NBs 160 | - reindex to keep the original columns order 161 | 162 | ## 0.2.3 163 | 164 | - Update syntax to stick to the new argument names in Dython 165 | 166 | ## 0.2.2 167 | 168 | - Check if no feature selected, warn rather than throw error 169 | 170 | ## 0.2.1 171 | 172 | - Fix a bug when removing collinear columns 173 | 174 | ## 0.2.0 175 | 176 | - Prefilters now support the filtering of continuous and nominal (categorical) collinear variables 177 | 178 | ## 0.1.6 179 | 180 | - improve the plot_y_vs_X function 181 | - remove gc.collect() 182 | 183 | ## 0.1.5 184 | 185 | - fix readme (typos) 186 | - move utilities in utils sub-package 187 | - make unit tests lighter 188 | 189 | ## 0.1.4 190 | 191 | - fix bug when using catboost, clone estimator (avoid error and be sure to use a non-fitted estimator) 192 | 193 | ## 0.1.3 194 | 195 | - change the defaut for categorical encoding in pre-filters (pd.cat to integers as default) 196 | - fix the unit tests with new defaults and names 197 | 198 | ## 0.1.2 199 | 200 | - change arguments name in pre-filters 201 | 202 | ## 0.1.1 203 | 204 | - remove old attribute names in unit-tests 205 | 206 | ## 0.1.0 207 | 208 | - Fix lightGBM warnings 209 | - Typo in repr 210 | - Provide load_data utility 211 | - Enhance jupyter NB examples 212 | - highlighting synthetic random predictors 213 | - Benchmark using sklearn permutation importance 214 | - Harmonization of 
the attributes and parameters 215 | - Fix categoricals handling 216 | 217 | ## 0.0.4 218 | 219 | - setting optimal number of features (according to "Elements of statistical learning") when using lightGBM random forest boosting. 220 | - Providing random forest, lightgbm implementation, estimators 221 | 222 | ## 0.0.3 223 | 224 | - Adding examples and expanding documentation 225 | 226 | ## 0.0.2 227 | 228 | - fix bug: relative import removed -------------------------------------------------------------------------------- /CITATION.cff: -------------------------------------------------------------------------------- 1 | cff-version: 1.2.0 2 | abstract: >- 3 | All relevant feature selection means trying to find all features carrying information usable for prediction, rather than finding a possibly compact subset of features on which some particular model has a minimal error. This might include redundant predictors. 4 | title: All relevant feature selection 5 | message: >- 6 | If you use this software, please cite it using the metadata from this file. 7 | type: software 8 | authors: 9 | - given-names: Thomas 10 | family-names: Bury 11 | orcid: 'https://orcid.org/0000-0003-1421-4184' 12 | keywords: 13 | - "Feature Selection" 14 | - "All Relevant Feature Selection" 15 | - "Machine Learning" 16 | license: MIT License 17 | url: 'https://github.com/ThomasBury/arfs' 18 | version: 3.0.0 19 | date-released: 2021-12-18 20 | -------------------------------------------------------------------------------- /LICENSE.md: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) [2020] [Thomas Bury] 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
-------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | drawing 2 | 3 | [buy me caffeine](https://ko-fi.com/V7V72SOHX) 4 | 5 | [![PyPI version](https://badge.fury.io/py/arfs.svg)](https://badge.fury.io/py/arfs) [![Downloads](https://static.pepy.tech/personalized-badge/arfs?period=total&units=international_system&left_color=grey&right_color=yellow&left_text=Downloads)](https://pepy.tech/project/arfs) [![Documentation Status](https://readthedocs.org/projects/arfs/badge/?version=latest)](https://arfs.readthedocs.io/en/latest/?badge=latest) [![Code Style](https://img.shields.io/badge/code%20style-black-black)](https://img.shields.io/badge/code%20style-black-black) 6 | 7 | 8 | [ARFS readthedocs](https://arfs.readthedocs.io/en/latest/#) 9 | 10 | # All relevant feature selection 11 | 12 | All relevant feature selection means trying to find all features carrying information usable for prediction, rather than finding a possibly compact subset of features on which some particular model has a minimal error. This might include redundant predictors. All relevant feature selection is model agnostic in the sense that it doesn't optimize a scoring function for a *specific* model but rather tries to select all the predictors which are related to the response. 13 | 14 | This package implements 3 different methods (Leshy is an evolution of Boruta, BoostAGroota is an evolution of BoostARoota, and GrootCV is a new method). They are sklearn compatible. See below for details about these methods. You can use any sklearn-compatible estimator with Leshy and BoostAGroota, but I recommend lightGBM: it is fast, accurate, and has built-in SHAP values. The package also provides a module for preprocessing and basic feature selection (autobinning, removing columns with too many missing values, zero variance, high cardinality, high correlation, etc.). Examples and detailed methods are provided below. 15 | 16 | Moreover, as an alternative to the all-relevant problem, the ARFS package provides an MRmr feature selection which, theoretically, returns a subset of the predictors selected by an ARFS method. ARFS also provides a `LASSO` feature selection which works especially well for (G)LMs and GAMs. You can combine Lasso with the `TreeDiscretizer` to introduce non-linearities into linear models and perform feature selection. 17 | 18 | Please note that one limitation of the lasso is that it treats the levels of a categorical predictor individually. However, this issue can be addressed by utilizing the `TreeDiscretizer`, which automatically bins numerical variables and groups the levels of categorical variables. 19 | 20 | ## Installation 21 | 22 | `$ pip install arfs` 23 | 24 | REM: If you're interested in using the `fastshap` option, you'll need to install [fasttreeshap](https://github.com/linkedin/FastTreeSHAP) first. For a smooth installation process, I suggest using `conda install -c conda-forge fasttreeshap`, since the C++ source code requires compilation. Using pip may involve additional dependencies, such as requiring Visual Studio (VS) to compile the C++ code.
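A minimal quick-start, adapted from the bundled tutorial notebooks (e.g. `docs/notebooks/arfs_on_GPU.ipynb`); the toy dataset loader and the parameter values mirror those notebooks, see the API reference for the full list of options:

```python
import matplotlib.pyplot as plt
from arfs.feature_selection import GrootCV
from arfs.utils import load_data

# Toy regression data shipped with the package (a modified Boston housing set
# with synthetic noisy and genuine columns added)
boston = load_data(name="Boston")
X, y = boston.data, boston.target

# Cross-validated, SHAP-based all-relevant feature selection
feat_selector = GrootCV(
    objective="rmse",
    cutoff=1,
    n_folds=5,
    n_iter=5,
    silent=True,
    fastshap=False,  # set to True only if fasttreeshap is installed
)
feat_selector.fit(X, y, sample_weight=None)

print(f"Selected features: {feat_selector.get_feature_names_out()}")
fig = feat_selector.plot_importance(n_feat_per_inch=5)
plt.show()
```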
25 | 26 | ## Example 27 | 28 | Working examples for: 29 | 30 | - [Preprocessing](./docs/notebooks/preprocessing.ipynb) 31 | - [Basic FS (best before ARFS)](./docs/notebooks/basic_feature_selection.ipynb) 32 | - [Regression](./docs/notebooks/arfs_regression.ipynb) 33 | - [Classification](./docs/notebooks/arfs_classification.ipynb) 34 | - [LASSO and (G)LM feature selection](./docs/notebooks/lasso_feature_selection.ipynb) 35 | - [Passing custom params](./docs/notebooks/arfs_grootcv_custom_params.ipynb) 36 | - [Non-normal loss and sample weights](./docs/notebooks/arfs_non_normal_loss_and_sample_weight.ipynb) 37 | - [ARFS on GPU](./docs/notebooks/arfs_on_GPU.ipynb) 38 | - [Fast Shap](./docs/notebooks/arfs_shap_vs_fastshap.ipynb) 39 | - [Categoricals](./docs/notebooks/issue_categoricals.ipynb) 40 | - [Collinearity](./docs/notebooks/issue_collinearity.ipynb) 41 | - [Reducing run time for large data](./docs/notebooks/arfs_large_data_sampling.ipynb) 42 | - [Comparison to Boruta and BorutaShap](./docs/notebooks/arfs_boruta_borutaShap_comparison.ipynb) 43 | - [MRmr alternative](./docs/notebooks/mrmr_feature_selection.ipynb) 44 | - [MRmr vs ARFS](./docs/notebooks/mrmr_fs_VS_arfs.ipynb) 45 | 46 | For imbalanced classification: 47 | - GrootCV will automatically detect imbalanced data and set the lightGBM `'is_unbalance' = True` 48 | - For Leshy and BoostAGroota, you can pass the estimator with the relevant parameter (e.g. `class_weight = 'balanced'`) 49 | 50 | 51 | 52 | ## Boruta 53 | 54 | The Boruta algorithm tries to capture all the important features you might have in your dataset with respect to an outcome variable. The procedure is as follows: 55 | 56 | * Create duplicate copies of all independent variables. When the number of independent variables in the original data is less than 5, create at least 5 copies using existing variables. 57 | * Shuffle the values of the added duplicate copies to remove their correlations with the target variable. These are called shadow features or permuted copies. 58 | * Combine the original variables with their shuffled copies. 59 | * Run a random forest classifier on the combined dataset and compute a variable importance measure (the default is Mean Decrease Accuracy) to evaluate the importance of each variable, where higher means more important. 60 | * A Z score is then computed: the mean of the accuracy loss divided by its standard deviation. 61 | * Find the maximum Z score among shadow attributes (MZSA). 62 | * Tag the variables as 'unimportant' when they have importance significantly lower than MZSA. Then we permanently remove them from the process. 63 | * Tag the variables as 'important' when they have importance significantly higher than MZSA. 64 | * Repeat the above steps for a predefined number of iterations (random forest runs), or until all attributes are either tagged 'unimportant' or 'important', whichever comes first. 65 | 66 | At every iteration, the algorithm compares the Z-scores of the shuffled copies of the features and the original features to see if the latter performed better than the former. If so, the algorithm marks the feature as important. In essence, the algorithm validates the importance of a feature by comparing it with randomly shuffled copies, which increases the robustness. This is done by simply comparing, using a binomial distribution, the number of times a feature did better than the shadow features.
Since the whole process is done on the same train-test split, the variance of the variable importance comes only from the different re-fits of the model over the iterations. 67 | 68 | 69 | drawing 70 | 71 | ## BoostARoota 72 | 73 | BoostARoota follows the Boruta method closely but modifies a few things: 74 | 75 | * One-hot-encode the feature set 76 | * Double the width of the data set, making a copy of all features in the original dataset 77 | * Randomly shuffle the new features created in (2). These duplicated and shuffled features are referred to as "shadow features" 78 | * Run an XGBoost classifier on the entire data set ten times. Running it ten times allows random noise to be smoothed out, resulting in more robust estimates of importance. The number of repeats is a parameter that can be changed. 79 | * Obtain importance values for each feature. This is a simple importance metric that sums up how many times the particular feature was split on in the XGBoost algorithm. 80 | * Compute the "cutoff": the average feature importance value of all shadow features, divided by four. Shadow importance values are divided by four (the divisor is a parameter that can be changed) to make it more difficult for the variables to be removed; with a lower cutoff, features are removed at too high a rate. 81 | * Remove features whose average importance across the ten iterations is less than the cutoff specified in (6) 82 | * Go back to (2) until the number of features removed is less than ten per cent of the total. 83 | * The method returns the features remaining once completed. 84 | 85 | In spirit, this is the same heuristic as Boruta but using boosting (originally Boruta supported only random forest). The importance is validated by comparing it to the maximum of the median variable importance of the shadow predictors (in Boruta, a statistical test is performed using the Z-score). Since the whole process is done on the same train-test split, the variance of the variable importance comes only from the different re-fits of the model over the iterations. 86 | 87 | drawing 88 | 89 | ## Modifications to Boruta and BoostARoota 90 | 91 | I forked both Boruta and BoostARoota and made the following changes (under PR): 92 | 93 | **Boruta --> Leshy**: 94 | 95 | - Categorical features are detected and encoded (tree-based models work better with integer encoding than with OHE, which leads to deep and unstable trees). If CatBoost is used, the categorical predictors (if any) are set up accordingly 96 | - Using lightGBM as the default speeds up the running time by an order of magnitude 97 | - Works with CatBoost, sklearn API 98 | - Allows using sample_weight, for applications like Poisson regression or any requiring weights 99 | - Supports 3 different feature importances: native, SHAP and permutation. Native is the least consistent (because the importance is biased towards numerical and large-cardinality categorical features) but the fastest of the 3. Indeed, the impurity-based variable importance is biased and sensitive to large cardinality (see [scikit demo](https://scikit-learn.org/stable/auto_examples/inspection/plot_permutation_importance.html#sphx-glr-auto-examples-inspection-plot-permutation-importance-py)) 100 | 101 | **BoostARoota --> BoostAGroota**: 102 | 103 | - Replace XGBoost with LightGBM; you can still use tree-based scikit-learn models 104 | - Replace the native variable importance by SHAP importance. Indeed, the impurity-based variable importance is biased and sensitive to large cardinality (see [scikit demo](https://scikit-learn.org/stable/auto_examples/inspection/plot_permutation_importance.html#sphx-glr-auto-examples-inspection-plot-permutation-importance-py)). Moreover, the native variable importance is computed on the train set; here the data are split (internally) into train and test sets, and the variable importance is computed on the test set. 105 | - Handling of categorical predictors. Categorical predictors should NOT be one-hot encoded, as it leads to deep, unstable trees. Instead, it is better to use the native method of lightGBM or CatBoost. A preprocessing step is needed to encode them (lightGBM and CatBoost use integer encoding and a reference to the categorical columns; the splitting strategies are then different, see the official docs). 106 | - Works with sample_weight, for Poisson or any application requiring a weighting. 107 | 108 | ## GrootCV, a new method 109 | 110 | **New: GrootCV**: 111 | 112 | - Cross-validated feature importance to smooth out the noise, based on lightGBM only (which is, most of the time, the fastest and most accurate boosting implementation). 113 | - The feature importance is derived using SHAP importance 114 | - Takes the max of the median of the shadow variable importance over folds; otherwise the threshold is not conservative enough, and this also improves convergence (fewer evaluations are needed to find a threshold) 115 | - Not based on a given percentage of columns that need to be deleted 116 | - Plot method for the variable importance 117 | 118 | drawing 119 | 120 | ## References 121 | 122 | **Theory** 123 | 124 | - [Consistent feature selection for pattern recognition in polynomial time](https://www.jmlr.org/papers/volume8/nilsson07a/nilsson07a.pdf) 125 | - [Maximum Relevance and Minimum Redundancy Feature Selection Methods for a Marketing Machine Learning Platform](https://eng.uber.com/research/maximum-relevance-and-minimum-redundancy-feature-selection-methods-for-a-marketing-machine-learning-platform/) 126 | 127 | **Applications** 128 | 129 | - [The Boruta paper](https://www.jstatsoft.org/article/view/v036i11/v36i11.pdf) 130 | - [The python implementation](https://github.com/scikit-learn-contrib/boruta_py) 131 | - [BoostARoota](https://github.com/chasedehan/BoostARoota) 132 | 133 | 134 | 135 | -------------------------------------------------------------------------------- /docs/Introduction.rst: -------------------------------------------------------------------------------- 1 | Introduction 2 | ============ 3 | 4 | All relevant feature selection means trying to find all features carrying information usable for prediction, 5 | rather than finding a possibly compact subset of features on which some particular model has a minimal error. 6 | This might include redundant predictors. All relevant feature selection is model agnostic in the sense that it 7 | doesn't optimize a scoring function for a *specific* model but rather tries to select all the predictors which are related to the response. 8 | This package implements 3 different methods (Leshy is an evolution of Boruta, BoostAGroota is an evolution of BoostARoota and GrootCV is a new one). 9 | They are sklearn compatible. See below for details about these methods. You can use any sklearn-compatible estimator 10 | with Leshy and BoostAGroota, but I recommend lightGBM: it is fast, accurate and has built-in SHAP values. 11 | 12 | The package also provides a module for preprocessing and basic feature selection 13 | (autobinning, removing columns with too many missing values, zero variance, high cardinality, high correlation, etc.).
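For illustration, here is a minimal sketch of the sklearn-style API using Leshy with a lightGBM estimator.
The toy dataset loader comes from the tutorial notebooks; the constructor arguments shown (e.g. ``importance``, ``max_iter``)
are indicative and may differ between versions, so please check the API reference for the authoritative signatures.

.. code-block:: python

    from lightgbm import LGBMRegressor
    from arfs.feature_selection import Leshy
    from arfs.utils import load_data

    # toy regression data shipped with the package
    boston = load_data(name="Boston")
    X, y = boston.data, boston.target

    # any sklearn-compatible estimator works; lightGBM is recommended
    model = LGBMRegressor(n_estimators=100)

    # SHAP importance is the most consistent of the supported importances
    # (argument names are indicative, see the API reference)
    selector = Leshy(model, max_iter=10, importance="shap", verbose=0)
    selector.fit(X, y, sample_weight=None)

    print(selector.get_feature_names_out())  # the all-relevant features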
14 | 15 | Moreover, as an alternative to the all relevant problem, the ARFS package provides a MRmr feature selection which, 16 | theoretically, returns a subset of the predictors selected by an arfs method. ARFS also provides a `LASSO` feature 17 | selection which works especially well for (G)LMs and GAMs. You can combine Lasso with the `TreeDiscretizer` for introducing 18 | non-linearities into linear models and perform feature selection. 19 | Please note that one limitation of the lasso is that it treats the levels of a categorical predictor individually. 20 | However, this issue can be addressed by utilizing the `TreeDiscretizer`, which automatically bins numerical variables and 21 | groups the levels of categorical variables. 22 | 23 | -------------------------------------------------------------------------------- /docs/Makefile: -------------------------------------------------------------------------------- 1 | # Minimal makefile for Sphinx documentation 2 | # 3 | 4 | # You can set these variables from the command line, and also 5 | # from the environment for the first two. 6 | SPHINXOPTS ?= 7 | SPHINXBUILD ?= sphinx-build 8 | SOURCEDIR = source 9 | BUILDDIR = build 10 | 11 | # Put it first so that "make" without argument is like "make help". 12 | help: 13 | @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 14 | 15 | .PHONY: help Makefile 16 | 17 | # Catch-all target: route all unknown targets to Sphinx using the new 18 | # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). 19 | %: Makefile 20 | @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 21 | -------------------------------------------------------------------------------- /docs/Methods overview.rst: -------------------------------------------------------------------------------- 1 | Methods overview 2 | ================ 3 | 4 | Boruta 5 | ------ 6 | 7 | The Boruta algorithm tries to capture all the important features you might have in your dataset with respect to an outcome variable. The procedure is as follows: 8 | 9 | * Create duplicate copies of all independent variables. When the number of independent variables in the original data is less than 5, create at least 5 copies using existing variables. 10 | * Shuffle the values of added duplicate copies to remove their correlations with the target variable. It is called shadow features or permuted copies. 11 | * Combine the original ones with shuffled copies. 12 | * Run a random forest classifier on the combined dataset and perform a variable importance measure (the default is Mean Decrease Accuracy) to evaluate the importance of each variable where higher means more important. 13 | * Then Z score is computed. It means the mean of accuracy loss divided by the standard deviation of accuracy loss. 14 | * Find the maximum Z score among shadow attributes (MZSA). 15 | * Tag the variables as 'unimportant' when they have importance significantly lower than MZSA. Then we permanently remove them from the process. 16 | * Tag the variables as 'important' when they have importance significantly higher than MZSA. 17 | * Repeat the above steps for a predefined number of iterations (random forest runs), or until all attributes are either tagged 'unimportant' or 'important', whichever comes first. 18 | 19 | At every iteration, the algorithm compares the Z-scores of the shuffled copies of the features and the original features to see if the latter performed better than the former. 
If it does, the algorithm will mark the feature as important. In essence, the algorithm is trying to validate the importance of the feature by comparing with randomly shuffled copies, which increases the robustness. This is done by simply comparing the number of times a feature did better with the shadow features using a binomial distribution. Since the whole process is done on the same train-test split, the variance of the variable importance comes only from the different re-fit of the model over the different iterations. 20 | 21 | 22 | BoostARoota 23 | ----------- 24 | 25 | BoostARoota follows closely the Boruta method but modifies a few things: 26 | 27 | * One-Hot-Encode the feature set. 28 | * Double width of the data set, making a copy of all features in the original dataset. 29 | * Randomly shuffle the new features created in (2). These duplicated and shuffled features are referred to as "shadow features." 30 | * Run XGBoost classifier on the entire data set ten times. Running it ten times allows for random noise to be smoothed, resulting in more robust estimates of importance. The number of repeats is a parameter that can be changed. 31 | * Obtain importance values for each feature. This is a simple importance metric that sums up how many times the particular feature was split in the XGBoost algorithm. 32 | * Compute "cutoff": the average feature importance value for all shadow features and divide by four. Shadow importance values are divided by four (parameters can be changed) to make it more difficult for the variables to be removed. With values lower than this, features are removed at too high of a rate. 33 | * Remove features with average importance across the ten iterations that are less than the cutoff specified in (6). 34 | * Go back to (2) until the number of features removed is less than ten percent of the total. 35 | * The method returns the features remaining once completed. 36 | 37 | Modifications to Boruta 38 | ----------------------- 39 | 40 | Boruta --> Leshy: 41 | 42 | For chronological development, see https://github.com/scikit-learn-contrib/boruta_py/pull/77 and https://github.com/scikit-learn-contrib/boruta_py/pull/100 43 | 44 | Leshy vs. BorutaPy: 45 | To summarize, this PR solves/enhances: 46 | * The categorical features (they are detected, encoded. The tree-based models are working better with integer encoding rather than with OHE, which leads to deep and unstable trees). If Catboost is used, then the cat.pred (if any) are set up. 47 | * Work with Catboost sklearn API. 48 | * Allow using sample_weight, for applications like Poisson regression or any requiring weights. 49 | * 3 different feature importances: native, SHAP, and permutation. Native being the least consistent (because of the imp. biased towards numerical and large cardinality categorical) but the fastest of the 3. 50 | * Using LightGBM as default speed up by an order of magnitude the running time. 51 | * Visualization like in the R package. 52 | 53 | BorutaPy vs. Boruta R: 54 | The improvements of this implementation include: 55 | * Faster run times: Thanks to scikit-learn's fast implementation of the ensemble methods. 56 | * Scikit-learn like interface: Use BorutaPy just like any other scikit-learn: fit, fit_transform, and transform are all implemented in a similar fashion. 57 | * Modularity: Any ensemble method could be used: random forest, extra trees classifier, even gradient boosted trees. 
58 | * Two-step correction: The original Boruta code corrects for multiple testing in an overly conservative way. In this implementation, the Benjamini-Hochberg FDR is used to correct in each iteration across active features. This means that only features still in the selection process are included in the correction. Following this, each feature that passed goes through a regular Bonferroni correction to check for the repeated testing over the iterations. 59 | * Percentile: Instead of using the max values of the shadow features, the user can specify which percentile to use. This gives finer control over this crucial parameter. For more info, please read about the perc parameter. 60 | * Automatic tree number: Setting n_estimators to 'auto' will calculate the number of trees in each iteration based on the number of features under investigation. This way more trees are used when the training data has many features and fewer when most of the features have been rejected. 61 | * Ranking of features: After fitting BorutaPy, it provides the user with a ranking of features. Confirmed ones are 1, tentatives are 2, and the rejected are ranked starting from 3, based on their feature importance history through the iterations. 62 | * Using either the native variable importance, scikit-learn permutation importance, or SHAP importance. 63 | 64 | We highly recommend using pruned trees with a depth between 3-7. For more, see the docs of these functions, and the examples below. Original code and method by: Miron B Kursa, https://m2.icm.edu.pl/boruta/ 65 | 66 | GrootCV, a new method 67 | --------------------- 68 | 69 | New: GrootCV: 70 | - Cross-validated feature importance to smooth out the noise, based on lightGBM only (which is, most of the time, the fastest and most accurate boosting implementation). 71 | - The feature importance is derived using SHAP importance. 72 | - Takes the max of the median of the shadow variable importance over folds; otherwise the threshold is not conservative enough, and this also improves convergence (fewer evaluations are needed to find a threshold). 73 | - Not based on a given percentage of columns that need to be deleted. 74 | - Plot method for the variable importance. 75 | 76 | MRmr 77 | ---- 78 | 79 | Re-implementing the Uber MRmr scheme using associations for handling continuous and categorical predictors: 80 | - Theil's U statistic for the categorical-categorical association (correlation). 81 | - Variance ratio for the continuous-categorical association. 82 | - Pearson or Spearman correlation for the continuous-continuous association. 83 | 84 | Lasso 85 | ----- 86 | 87 | Perform a simple grid search with an enforced lasso regularization. 88 | The best model is chosen based on the minimum BIC or deviance score, and all predictors with non-zero coefficients are selected. 89 | The loss function can belong to the exponential family, as in the statsmodels GLM documentation. 90 | Using the BIC metric is faster since it is evaluated on the training data (no held-out data is involved), whereas the deviance score is cross-validated. 91 | 92 | This approach can be combined with the TreeDiscretizer transformer to introduce univariate non-linearities (tree-GAM) before feature selection. 93 | This serves as a workaround to compensate for the absence of fused and grouped lasso regularization.
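To make the selection rule above concrete, here is a conceptual sketch of the BIC-driven grid search written directly
against statsmodels rather than against the ARFS implementation (the selector in ``arfs.feature_selection.lasso`` may
use different defaults, and the exact behaviour of ``fit_regularized(..., refit=True)``, e.g. the shape of the returned
coefficient vector, should be checked in the statsmodels documentation):

.. code-block:: python

    import numpy as np
    import pandas as pd
    import statsmodels.api as sm

    def lasso_bic_selection(X: pd.DataFrame, y, family=sm.families.Gaussian(),
                            alphas=np.logspace(-3, 1, 20)):
        """Grid search over the L1 penalty; keep the fit with the lowest BIC
        and return the predictors with non-zero coefficients."""
        X_ = sm.add_constant(X)
        best_bic, best_params = np.inf, None
        for alpha in alphas:
            # pure lasso penalty (L1_wt=1.0); refit=True re-estimates the model
            # on the active set so that the log-likelihood is available
            res = sm.GLM(y, X_, family=family).fit_regularized(
                alpha=alpha, L1_wt=1.0, refit=True
            )
            k = int(np.sum(np.abs(res.params.to_numpy()) > 1e-8))
            bic = -2.0 * res.llf + k * np.log(len(y))  # BIC = -2 log L + k log(n)
            if bic < best_bic:
                best_bic, best_params = bic, res.params
        mask = (np.abs(best_params.to_numpy()) > 1e-8) & (best_params.index != "const")
        return best_params.index[mask].tolist()

Any exponential-family loss can be plugged in through ``family`` (e.g. ``sm.families.Poisson()``); the cross-validated
deviance criterion mentioned above replaces this in-sample BIC with an out-of-fold deviance.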
94 | 95 | References 96 | ---------- 97 | 98 | **Theory** 99 | - [Consistent feature selection for pattern recognition in polynomial time](http://compmed.se/files/6914/2107/3475/pub_2007_5.pdf) 100 | - [Maximum Relevance and Minimum Redundancy Feature Selection Methods for a Marketing Machine Learning Platform](https://www.uber.com/blog/research/maximum-relevance-and-minimum-redundancy-feature-selection-methods-for-a-marketing-machine-learning-platform) 101 | 102 | **Applications** 103 | - [The Boruta paper](https://www.jstatsoft.org/article/view/v036i11/v36i11.pdf) 104 | - [The python implementation](https://github.com/scikit-learn-contrib/boruta_py) 105 | - [BoostARoota](https://github.com/chasedehan/BoostARoota) -------------------------------------------------------------------------------- /docs/arfs.feature_selection.rst: -------------------------------------------------------------------------------- 1 | arfs.feature\_selection package 2 | =============================== 3 | 4 | Submodules 5 | ---------- 6 | 7 | arfs.feature\_selection.allrelevant module 8 | ------------------------------------------ 9 | 10 | .. automodule:: arfs.feature_selection.allrelevant 11 | :members: 12 | :undoc-members: 13 | :show-inheritance: 14 | 15 | arfs.feature\_selection.base module 16 | ----------------------------------- 17 | 18 | .. automodule:: arfs.feature_selection.base 19 | :members: 20 | :undoc-members: 21 | :show-inheritance: 22 | 23 | arfs.feature\_selection.lasso module 24 | ------------------------------------ 25 | 26 | .. automodule:: arfs.feature_selection.lasso 27 | :members: 28 | :undoc-members: 29 | :show-inheritance: 30 | 31 | arfs.feature\_selection.mrmr module 32 | ----------------------------------- 33 | 34 | .. automodule:: arfs.feature_selection.mrmr 35 | :members: 36 | :undoc-members: 37 | :show-inheritance: 38 | 39 | arfs.feature\_selection.summary module 40 | -------------------------------------- 41 | 42 | .. automodule:: arfs.feature_selection.summary 43 | :members: 44 | :undoc-members: 45 | :show-inheritance: 46 | 47 | arfs.feature\_selection.unsupervised module 48 | ------------------------------------------- 49 | 50 | .. automodule:: arfs.feature_selection.unsupervised 51 | :members: 52 | :undoc-members: 53 | :show-inheritance: 54 | 55 | arfs.feature\_selection.variable\_importance module 56 | --------------------------------------------------- 57 | 58 | .. automodule:: arfs.feature_selection.variable_importance 59 | :members: 60 | :undoc-members: 61 | :show-inheritance: 62 | 63 | Module contents 64 | --------------- 65 | 66 | .. automodule:: arfs.feature_selection 67 | :members: 68 | :undoc-members: 69 | :show-inheritance: 70 | -------------------------------------------------------------------------------- /docs/arfs.rst: -------------------------------------------------------------------------------- 1 | arfs package 2 | ============ 3 | 4 | Subpackages 5 | ----------- 6 | 7 | .. toctree:: 8 | :maxdepth: 4 9 | 10 | arfs.feature_selection 11 | 12 | Submodules 13 | ---------- 14 | 15 | arfs.association module 16 | ----------------------- 17 | 18 | .. automodule:: arfs.association 19 | :members: 20 | :undoc-members: 21 | :show-inheritance: 22 | 23 | arfs.benchmark module 24 | --------------------- 25 | 26 | .. automodule:: arfs.benchmark 27 | :members: 28 | :undoc-members: 29 | :show-inheritance: 30 | 31 | arfs.gbm module 32 | --------------- 33 | 34 | .. 
automodule:: arfs.gbm 35 | :members: 36 | :undoc-members: 37 | :show-inheritance: 38 | 39 | arfs.parallel module 40 | -------------------- 41 | 42 | .. automodule:: arfs.parallel 43 | :members: 44 | :undoc-members: 45 | :show-inheritance: 46 | 47 | arfs.preprocessing module 48 | ------------------------- 49 | 50 | .. automodule:: arfs.preprocessing 51 | :members: 52 | :undoc-members: 53 | :show-inheritance: 54 | 55 | arfs.sampling module 56 | -------------------- 57 | 58 | .. automodule:: arfs.sampling 59 | :members: 60 | :undoc-members: 61 | :show-inheritance: 62 | 63 | arfs.utils module 64 | ----------------- 65 | 66 | .. automodule:: arfs.utils 67 | :members: 68 | :undoc-members: 69 | :show-inheritance: 70 | 71 | Module contents 72 | --------------- 73 | 74 | .. automodule:: arfs 75 | :members: 76 | :undoc-members: 77 | :show-inheritance: 78 | -------------------------------------------------------------------------------- /docs/boostaroota.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ThomasBury/arfs/03f67d0a54b69fac5ddbb83e306c8e8e72e2d3a2/docs/boostaroota.png -------------------------------------------------------------------------------- /docs/boruta.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ThomasBury/arfs/03f67d0a54b69fac5ddbb83e306c8e8e72e2d3a2/docs/boruta.png -------------------------------------------------------------------------------- /docs/conf.py: -------------------------------------------------------------------------------- 1 | # Configuration file for the Sphinx documentation builder. 2 | # 3 | # This file only contains a selection of the most common options. For a full 4 | # list see the documentation: 5 | # https://www.sphinx-doc.org/en/master/usage/configuration.html 6 | 7 | # -- Path setup -------------------------------------------------------------- 8 | 9 | # If extensions (or modules to document with autodoc) are in another directory, 10 | # add these directories to sys.path here. If the directory is relative to the 11 | # documentation root, use os.path.abspath to make it absolute, like shown here. 12 | # 13 | # import os 14 | # import sys 15 | # sys.path.insert(0, os.path.abspath('.')) 16 | import sys 17 | import os 18 | import datetime 19 | 20 | sys.path.insert(0, os.path.abspath("../../arfs")) 21 | # -- Project information ----------------------------------------------------- 22 | 23 | project = "arfs" 24 | copyright = "2024, Thomas Bury" 25 | author = "Thomas Bury" 26 | 27 | # The full version, including alpha/beta/rc tags 28 | release = "3.0.0" 29 | 30 | # If extensions (or modules to document with autodoc) are in another 31 | # directory, add these directories to sys.path here. If the directory is 32 | # relative to the documentation root, use os.path.abspath to make it 33 | # absolute, like shown here. 
34 | # sys.path.append(os.path.join(os.path.abspath(os.pardir))) 35 | 36 | # Don't add the same path again, remove the following line: 37 | # sys.path.insert(0, os.path.abspath("..")) 38 | 39 | sys.path.append(os.path.abspath(os.path.join(__file__, "../../src"))) 40 | autodoc_mock_imports = ["_tkinter", "sphinx_tabs.tabs"] 41 | 42 | # Get the project root dir, which is the parent dir of this 43 | cwd = os.getcwd() 44 | project_root = os.path.dirname(cwd) 45 | 46 | 47 | # -- General configuration --------------------------------------------------- 48 | 49 | # Add any Sphinx extension module names here, as strings. They can be 50 | # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom 51 | # ones. 52 | extensions = [ 53 | "sphinx.ext.autodoc", 54 | "sphinx.ext.autosectionlabel", 55 | "sphinx.ext.napoleon", 56 | "sphinx.ext.viewcode", 57 | "sphinx_autodoc_typehints", 58 | "sphinx_copybutton", 59 | "nbsphinx", 60 | "sphinx_tabs.tabs", 61 | ] 62 | 63 | # Add any paths that contain templates here, relative to this directory. 64 | templates_path = ["_templates"] 65 | autosummary_generate = True 66 | 67 | # List of patterns, relative to source directory, that match files and 68 | # directories to ignore when looking for source files. 69 | # This pattern also affects html_static_path and html_extra_path. 70 | exclude_patterns = ["_build", "Thumbs.db", ".DS_Store"] 71 | 72 | 73 | # -- Options for HTML output ------------------------------------------------- 74 | 75 | # The theme to use for HTML and HTML Help pages. See the documentation for 76 | # a list of builtin themes. 77 | # 78 | # html_theme = "sphinx_rtd_theme" 79 | html_permalinks_icon = "#" 80 | html_theme = "sphinxawesome_theme" 81 | 82 | # If not None, a 'Last updated on:' timestamp is inserted at every page 83 | # bottom, using the given strftime format. 84 | # The empty string is equivalent to '%b %d, %Y'. 85 | html_last_updated_fmt = "%B %d, %Y at %H:%M" 86 | today_fmt = "%B %d, %Y at %H:%M" 87 | 88 | # Add any paths that contain custom static files (such as style sheets) here, 89 | # relative to this directory. They are copied after the builtin static files, 90 | # so a file named "default.css" will overwrite the builtin "default.css". 
91 | # html_static_path = ["_static"] 92 | html_title = "ARFS Documentation" 93 | html_show_sourcelink = True 94 | html_logo = "logo.png" 95 | 96 | # -- Napoleon settings (for numpydoc parsing) -------------------------------- 97 | # https://www.sphinx-doc.org/en/master/usage/extensions/napoleon.html#configuration 98 | napoleon_google_docstring = False 99 | napoleon_numpy_docstring = True 100 | napoleon_include_init_with_doc = True 101 | napoleon_include_private_with_doc = True 102 | napoleon_include_special_with_doc = True 103 | napoleon_use_admonition_for_examples = False 104 | napoleon_use_admonition_for_notes = False 105 | napoleon_use_admonition_for_references = False 106 | napoleon_use_ivar = True 107 | napoleon_use_param = True 108 | napoleon_use_rtype = False 109 | napoleon_preprocess_types = True 110 | napoleon_type_aliases = None 111 | napoleon_attr_annotations = True 112 | -------------------------------------------------------------------------------- /docs/grootcv.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ThomasBury/arfs/03f67d0a54b69fac5ddbb83e306c8e8e72e2d3a2/docs/grootcv.png -------------------------------------------------------------------------------- /docs/index.rst: -------------------------------------------------------------------------------- 1 | Welcome to arfs's documentation! 2 | ================================ 3 | 4 | A package for performing All Relevant Feature Selection but not only that. 5 | 6 | Documentation last change: |today| 7 | 8 | .. toctree:: 9 | :maxdepth: 2 10 | :caption: User's guide 11 | 12 | Introduction 13 | Methods overview 14 | modules 15 | 16 | 17 | .. toctree:: 18 | :maxdepth: 4 19 | :glob: 20 | :caption: Tutorials 21 | 22 | notebooks/preprocessing.ipynb 23 | notebooks/basic_feature_selection.ipynb 24 | notebooks/association_and_feature_selection.ipynb 25 | notebooks/arfs_classification.ipynb 26 | notebooks/arfs_regression.ipynb 27 | notebooks/arfs_timeseries.ipynb 28 | notebooks/arfs_large_data_sampling.ipynb 29 | notebooks/arfs_on_GPU.ipynb 30 | notebooks/arfs_shap_vs_fastshap.ipynb 31 | notebooks/arfs_grootcv_custom_params.ipynb 32 | notebooks/arfs_boruta_borutaShap_comparison.ipynb 33 | notebooks/arfs_non_normal_loss_and_sample_weight.ipynb 34 | notebooks/mrmr_feature_selection.ipynb 35 | notebooks/mrmr_fs_VS_arfs.ipynb 36 | notebooks/lasso_feature_selection.ipynb 37 | notebooks/issue_categoricals.ipynb 38 | notebooks/issue_collinearity.ipynb 39 | 40 | Indices and tables 41 | ================== 42 | 43 | * :ref:`genindex` 44 | * :ref:`modindex` 45 | * :ref:`search` -------------------------------------------------------------------------------- /docs/logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ThomasBury/arfs/03f67d0a54b69fac5ddbb83e306c8e8e72e2d3a2/docs/logo.png -------------------------------------------------------------------------------- /docs/make.bat: -------------------------------------------------------------------------------- 1 | @ECHO OFF 2 | 3 | pushd %~dp0 4 | 5 | REM Command file for Sphinx documentation 6 | 7 | if "%SPHINXBUILD%" == "" ( 8 | set SPHINXBUILD=sphinx-build 9 | ) 10 | set SOURCEDIR=source 11 | set BUILDDIR=build 12 | 13 | %SPHINXBUILD% >NUL 2>NUL 14 | if errorlevel 9009 ( 15 | echo. 16 | echo.The 'sphinx-build' command was not found. 
Make sure you have Sphinx 17 | echo.installed, then set the SPHINXBUILD environment variable to point 18 | echo.to the full path of the 'sphinx-build' executable. Alternatively you 19 | echo.may add the Sphinx directory to PATH. 20 | echo. 21 | echo.If you don't have Sphinx installed, grab it from 22 | echo.https://www.sphinx-doc.org/ 23 | exit /b 1 24 | ) 25 | 26 | if "%1" == "" goto help 27 | 28 | %SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% 29 | goto end 30 | 31 | :help 32 | %SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% 33 | 34 | :end 35 | popd 36 | -------------------------------------------------------------------------------- /docs/modules.rst: -------------------------------------------------------------------------------- 1 | src 2 | === 3 | 4 | .. toctree:: 5 | :maxdepth: 4 6 | 7 | arfs 8 | -------------------------------------------------------------------------------- /docs/notebooks/arfs_on_GPU.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# ARFS - Using GPU\n", 8 | "\n", 9 | "You can leverage the GPU implementation of lightGBM (or other GBM flavours) but this often requires to compile or install some libraries or kit (such as CUDA)" 10 | ] 11 | }, 12 | { 13 | "cell_type": "code", 14 | "execution_count": 1, 15 | "metadata": {}, 16 | "outputs": [], 17 | "source": [ 18 | "# from IPython.core.display import display, HTML\n", 19 | "# display(HTML(\"\"))\n", 20 | "import time\n", 21 | "import numpy as np\n", 22 | "import pandas as pd\n", 23 | "import matplotlib as mpl\n", 24 | "import matplotlib.pyplot as plt\n", 25 | "from lightgbm import LGBMRegressor\n", 26 | "\n", 27 | "import arfs\n", 28 | "from arfs.feature_selection import GrootCV, Leshy\n", 29 | "from arfs.utils import load_data\n", 30 | "from arfs.benchmark import highlight_tick\n", 31 | "\n", 32 | "rng = np.random.RandomState(seed=42)\n", 33 | "\n", 34 | "# import warnings\n", 35 | "# warnings.filterwarnings('ignore')" 36 | ] 37 | }, 38 | { 39 | "cell_type": "markdown", 40 | "metadata": {}, 41 | "source": [ 42 | "## GrootCV on GPU\n", 43 | "\n", 44 | "If the data is small, using a GPU mught not be the most efficient." 45 | ] 46 | }, 47 | { 48 | "cell_type": "code", 49 | "execution_count": null, 50 | "metadata": {}, 51 | "outputs": [], 52 | "source": [ 53 | "from sklearn.datasets import make_regression\n", 54 | "from sklearn.model_selection import train_test_split\n", 55 | "\n", 56 | "# Generate synthetic data with Poisson-distributed target variable\n", 57 | "bias = 1\n", 58 | "\n", 59 | "n_samples = 100_00 # 1_000_000\n", 60 | "n_features = 100\n", 61 | "n_informative = 20\n", 62 | "\n", 63 | "X, y, true_coef = make_regression(\n", 64 | " n_samples=n_samples,\n", 65 | " n_features=n_features,\n", 66 | " n_informative=n_informative,\n", 67 | " noise=1,\n", 68 | " random_state=8,\n", 69 | " bias=bias,\n", 70 | " coef=True,\n", 71 | ")\n", 72 | "y = (y - y.mean()) / y.std()\n", 73 | "y = np.exp(y) # Transform to positive values for Poisson distribution\n", 74 | "y = np.random.poisson(y) # Add Poisson noise to the target variable\n", 75 | "# dummy sample weight (e.g. 
exposure), smallest being 30 days\n", 76 | "w = np.random.uniform(30 / 365, 1, size=len(y))\n", 77 | "# make the count a Poisson rate (frequency)\n", 78 | "y = y / w\n", 79 | "\n", 80 | "X = pd.DataFrame(X)\n", 81 | "X.columns = [f\"pred_{i}\" for i in range(X.shape[1])]\n", 82 | "\n", 83 | "# Split the data into training and testing sets\n", 84 | "X_train, X_test, y_train, y_test, w_train, w_test = train_test_split(\n", 85 | " X, y, w, test_size=0.5, random_state=42\n", 86 | ")\n", 87 | "\n", 88 | "true_coef = pd.Series(true_coef)\n", 89 | "true_coef.index = X.columns\n", 90 | "true_coef = pd.Series({**{\"intercept\": bias}, **true_coef})\n", 91 | "true_coef\n", 92 | "\n", 93 | "genuine_predictors = true_coef[true_coef > 0.0]\n", 94 | "\n", 95 | "print(f\"The true coefficient of the linear data generating process are:\\n {true_coef}\")" 96 | ] 97 | }, 98 | { 99 | "cell_type": "markdown", 100 | "metadata": {}, 101 | "source": [ 102 | "GPU" 103 | ] 104 | }, 105 | { 106 | "cell_type": "code", 107 | "execution_count": null, 108 | "metadata": {}, 109 | "outputs": [], 110 | "source": [ 111 | "%%time\n", 112 | "feat_selector = GrootCV(\n", 113 | " objective=\"rmse\",\n", 114 | " cutoff=1,\n", 115 | " n_folds=3,\n", 116 | " n_iter=3,\n", 117 | " silent=True,\n", 118 | " fastshap=True,\n", 119 | " n_jobs=0,\n", 120 | " lgbm_params={\"device\": \"gpu\", \"gpu_device_id\": 1},\n", 121 | ")\n", 122 | "feat_selector.fit(X_train, y_train, sample_weight=None)\n", 123 | "print(f\"The selected features: {feat_selector.get_feature_names_out()}\")\n", 124 | "print(f\"The agnostic ranking: {feat_selector.ranking_}\")\n", 125 | "print(f\"The naive ranking: {feat_selector.ranking_absolutes_}\")\n", 126 | "fig = feat_selector.plot_importance(n_feat_per_inch=5)\n", 127 | "\n", 128 | "# highlight synthetic random variable\n", 129 | "for name in true_coef.index:\n", 130 | " if name in genuine_predictors.index:\n", 131 | " fig = highlight_tick(figure=fig, str_match=name, color=\"green\")\n", 132 | " else:\n", 133 | " fig = highlight_tick(figure=fig, str_match=name)\n", 134 | "\n", 135 | "plt.show()" 136 | ] 137 | }, 138 | { 139 | "cell_type": "markdown", 140 | "metadata": {}, 141 | "source": [ 142 | "CPU" 143 | ] 144 | }, 145 | { 146 | "cell_type": "code", 147 | "execution_count": null, 148 | "metadata": {}, 149 | "outputs": [], 150 | "source": [ 151 | "%%time\n", 152 | "feat_selector = GrootCV(\n", 153 | " objective=\"rmse\",\n", 154 | " cutoff=1,\n", 155 | " n_folds=3,\n", 156 | " n_iter=3,\n", 157 | " silent=True,\n", 158 | " fastshap=True,\n", 159 | " n_jobs=0,\n", 160 | " lgbm_params={\"device\": \"cpu\"},\n", 161 | ")\n", 162 | "feat_selector.fit(X_train, y_train, sample_weight=None)\n", 163 | "print(f\"The selected features: {feat_selector.get_feature_names_out()}\")\n", 164 | "print(f\"The agnostic ranking: {feat_selector.ranking_}\")\n", 165 | "print(f\"The naive ranking: {feat_selector.ranking_absolutes_}\")\n", 166 | "fig = feat_selector.plot_importance(n_feat_per_inch=5)\n", 167 | "\n", 168 | "# highlight synthetic random variable\n", 169 | "for name in true_coef.index:\n", 170 | " if name in genuine_predictors.index:\n", 171 | " fig = highlight_tick(figure=fig, str_match=name, color=\"green\")\n", 172 | " else:\n", 173 | " fig = highlight_tick(figure=fig, str_match=name)\n", 174 | "\n", 175 | "plt.show()" 176 | ] 177 | }, 178 | { 179 | "cell_type": "markdown", 180 | "metadata": {}, 181 | "source": [ 182 | "On a smaller data set, for illustrative purposes." 
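Since GPU training usually needs a dedicated lightGBM build (e.g. compiled with CUDA support), a quick way to find out whether the installed build supports it is to fit a tiny throw-away model with `device="gpu"` and fall back to CPU on failure. A minimal sketch, using only the public `LGBMRegressor` API imported above; the helper name `gpu_build_available` and the probe sizes are illustrative assumptions, not part of ARFS:

```python
# Probe whether this lightGBM build was compiled with GPU support by fitting
# a tiny model with device="gpu"; fall back to CPU otherwise.
import numpy as np
from lightgbm import LGBMRegressor


def gpu_build_available() -> bool:
    """Return True if lightGBM accepts device='gpu' on this machine."""
    X_probe = np.random.rand(64, 2)
    y_probe = np.random.rand(64)
    try:
        LGBMRegressor(device="gpu", n_estimators=1, verbose=-1).fit(X_probe, y_probe)
        return True
    except Exception:
        # CPU-only builds raise a LightGBMError stating that the GPU tree
        # learner was not enabled when the library was compiled.
        return False


device = "gpu" if gpu_build_available() else "cpu"
print(f"Running the selectors with lgbm_params={{'device': '{device}'}}")
```

The resulting string can then be passed through `lgbm_params={"device": device}` for `GrootCV`, or as `LGBMRegressor(device=device)` for the `Leshy` examples later in this notebook.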
183 | ] 184 | }, 185 | { 186 | "cell_type": "code", 187 | "execution_count": 5, 188 | "metadata": {}, 189 | "outputs": [], 190 | "source": [ 191 | "boston = load_data(name=\"Boston\")\n", 192 | "X, y = boston.data, boston.target" 193 | ] 194 | }, 195 | { 196 | "cell_type": "code", 197 | "execution_count": null, 198 | "metadata": {}, 199 | "outputs": [], 200 | "source": [ 201 | "%%time\n", 202 | "feat_selector = GrootCV(\n", 203 | " objective=\"rmse\",\n", 204 | " cutoff=1,\n", 205 | " n_folds=5,\n", 206 | " n_iter=5,\n", 207 | " silent=True,\n", 208 | " fastshap=True,\n", 209 | " n_jobs=0,\n", 210 | " lgbm_params={\"device\": \"cpu\"},\n", 211 | ")\n", 212 | "feat_selector.fit(X, y, sample_weight=None)\n", 213 | "print(f\"The selected features: {feat_selector.get_feature_names_out()}\")\n", 214 | "print(f\"The agnostic ranking: {feat_selector.ranking_}\")\n", 215 | "print(f\"The naive ranking: {feat_selector.ranking_absolutes_}\")\n", 216 | "fig = feat_selector.plot_importance(n_feat_per_inch=5)\n", 217 | "\n", 218 | "# highlight synthetic random variable\n", 219 | "fig = highlight_tick(figure=fig, str_match=\"random\")\n", 220 | "fig = highlight_tick(figure=fig, str_match=\"genuine\", color=\"green\")\n", 221 | "plt.show()" 222 | ] 223 | }, 224 | { 225 | "cell_type": "code", 226 | "execution_count": null, 227 | "metadata": {}, 228 | "outputs": [], 229 | "source": [ 230 | "%%time\n", 231 | "feat_selector = GrootCV(\n", 232 | " objective=\"rmse\",\n", 233 | " cutoff=1,\n", 234 | " n_folds=5,\n", 235 | " n_iter=5,\n", 236 | " silent=True,\n", 237 | " fastshap=True,\n", 238 | " n_jobs=0,\n", 239 | " lgbm_params={\"device\": \"gpu\"},\n", 240 | ")\n", 241 | "feat_selector.fit(X, y, sample_weight=None)\n", 242 | "print(f\"The selected features: {feat_selector.get_feature_names_out()}\")\n", 243 | "print(f\"The agnostic ranking: {feat_selector.ranking_}\")\n", 244 | "print(f\"The naive ranking: {feat_selector.ranking_absolutes_}\")\n", 245 | "fig = feat_selector.plot_importance(n_feat_per_inch=5)\n", 246 | "\n", 247 | "# highlight synthetic random variable\n", 248 | "fig = highlight_tick(figure=fig, str_match=\"random\")\n", 249 | "fig = highlight_tick(figure=fig, str_match=\"genuine\", color=\"green\")\n", 250 | "plt.show()" 251 | ] 252 | }, 253 | { 254 | "cell_type": "code", 255 | "execution_count": null, 256 | "metadata": {}, 257 | "outputs": [], 258 | "source": [ 259 | "%%time\n", 260 | "feat_selector = GrootCV(\n", 261 | " objective=\"rmse\",\n", 262 | " cutoff=1,\n", 263 | " n_folds=5,\n", 264 | " n_iter=5,\n", 265 | " silent=True,\n", 266 | " fastshap=True,\n", 267 | " n_jobs=0,\n", 268 | " lgbm_params={\"device\": \"cuda\"},\n", 269 | ")\n", 270 | "feat_selector.fit(X, y, sample_weight=None)\n", 271 | "print(f\"The selected features: {feat_selector.get_feature_names_out()}\")\n", 272 | "print(f\"The agnostic ranking: {feat_selector.ranking_}\")\n", 273 | "print(f\"The naive ranking: {feat_selector.ranking_absolutes_}\")\n", 274 | "fig = feat_selector.plot_importance(n_feat_per_inch=5)\n", 275 | "\n", 276 | "# highlight synthetic random variable\n", 277 | "fig = highlight_tick(figure=fig, str_match=\"random\")\n", 278 | "fig = highlight_tick(figure=fig, str_match=\"genuine\", color=\"green\")\n", 279 | "plt.show()" 280 | ] 281 | }, 282 | { 283 | "cell_type": "markdown", 284 | "metadata": {}, 285 | "source": [ 286 | "## Leshy on GPU" 287 | ] 288 | }, 289 | { 290 | "cell_type": "code", 291 | "execution_count": 9, 292 | "metadata": {}, 293 | "outputs": [], 294 | "source": [ 295 | "model = 
LGBMRegressor(random_state=42, verbose=-1, device=\"gpu\")" 296 | ] 297 | }, 298 | { 299 | "cell_type": "code", 300 | "execution_count": null, 301 | "metadata": {}, 302 | "outputs": [], 303 | "source": [ 304 | "%%time\n", 305 | "# Leshy\n", 306 | "feat_selector = Leshy(\n", 307 | " model, n_estimators=20, verbose=1, max_iter=10, random_state=42, importance=\"native\"\n", 308 | ")\n", 309 | "feat_selector.fit(X, y, sample_weight=None)\n", 310 | "print(f\"The selected features: {feat_selector.get_feature_names_out()}\")\n", 311 | "print(f\"The agnostic ranking: {feat_selector.ranking_}\")\n", 312 | "print(f\"The naive ranking: {feat_selector.ranking_absolutes_}\")\n", 313 | "fig = feat_selector.plot_importance(n_feat_per_inch=5)\n", 314 | "\n", 315 | "# highlight synthetic random variable\n", 316 | "fig = highlight_tick(figure=fig, str_match=\"random\")\n", 317 | "fig = highlight_tick(figure=fig, str_match=\"genuine\", color=\"green\")\n", 318 | "plt.show()" 319 | ] 320 | }, 321 | { 322 | "cell_type": "code", 323 | "execution_count": 11, 324 | "metadata": {}, 325 | "outputs": [], 326 | "source": [ 327 | "model = LGBMRegressor(random_state=42, verbose=-1, device=\"cpu\")" 328 | ] 329 | }, 330 | { 331 | "cell_type": "code", 332 | "execution_count": null, 333 | "metadata": {}, 334 | "outputs": [], 335 | "source": [ 336 | "%%time\n", 337 | "# Leshy\n", 338 | "feat_selector = Leshy(\n", 339 | " model, n_estimators=20, verbose=1, max_iter=10, random_state=42, importance=\"native\"\n", 340 | ")\n", 341 | "feat_selector.fit(X, y, sample_weight=None)\n", 342 | "print(f\"The selected features: {feat_selector.get_feature_names_out()}\")\n", 343 | "print(f\"The agnostic ranking: {feat_selector.ranking_}\")\n", 344 | "print(f\"The naive ranking: {feat_selector.ranking_absolutes_}\")\n", 345 | "fig = feat_selector.plot_importance(n_feat_per_inch=5)\n", 346 | "\n", 347 | "# highlight synthetic random variable\n", 348 | "fig = highlight_tick(figure=fig, str_match=\"random\")\n", 349 | "fig = highlight_tick(figure=fig, str_match=\"genuine\", color=\"green\")\n", 350 | "plt.show()" 351 | ] 352 | } 353 | ], 354 | "metadata": { 355 | "kernelspec": { 356 | "display_name": "arfs", 357 | "language": "python", 358 | "name": "python3" 359 | }, 360 | "language_info": { 361 | "codemirror_mode": { 362 | "name": "ipython", 363 | "version": 3 364 | }, 365 | "file_extension": ".py", 366 | "mimetype": "text/x-python", 367 | "name": "python", 368 | "nbconvert_exporter": "python", 369 | "pygments_lexer": "ipython3", 370 | "version": "3.10.14" 371 | }, 372 | "orig_nbformat": 4 373 | }, 374 | "nbformat": 4, 375 | "nbformat_minor": 2 376 | } 377 | -------------------------------------------------------------------------------- /docs/notebooks/arfs_shap_vs_fastshap.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# ARFS - fasttreeshap vs shap\n", 8 | "\n", 9 | "Leshy, BoostAGroota, and GrootCV are tree-based algorithms. They benefit from a [faster implementation of the Shapley values by LinkedIn](https://engineering.linkedin.com/blog/2022/fasttreeshap--accelerating-shap-value-computation-for-trees), which is claimed to outperform both the treeExplainer in the SHAP package and the native C++ implementation of lightgbm/xgboost/catboost. The improvement in speed will vary depending on the size of the task and your hardware resources (including virtualization for VMs). 
On older machine, the `fasttreeshap` implementation might actually be slower.\n", 10 | "\n", 11 | "However, it currently does not work with xgboost (not a deal breaker because lightgbm is the preferred default)." 12 | ] 13 | }, 14 | { 15 | "cell_type": "code", 16 | "execution_count": 1, 17 | "metadata": {}, 18 | "outputs": [ 19 | { 20 | "name": "stderr", 21 | "output_type": "stream", 22 | "text": [ 23 | "Using `tqdm.autonotebook.tqdm` in notebook mode. Use `tqdm.tqdm` instead to force console mode (e.g. in jupyter console)\n" 24 | ] 25 | } 26 | ], 27 | "source": [ 28 | "import numpy as np\n", 29 | "import pandas as pd\n", 30 | "\n", 31 | "from sklearn.datasets import make_regression\n", 32 | "from sklearn.model_selection import train_test_split\n", 33 | "\n", 34 | "import arfs\n", 35 | "from arfs.feature_selection import GrootCV, Leshy\n", 36 | "from arfs.utils import load_data\n", 37 | "from arfs.benchmark import highlight_tick\n", 38 | "\n", 39 | "rng = np.random.RandomState(seed=42)" 40 | ] 41 | }, 42 | { 43 | "cell_type": "code", 44 | "execution_count": 2, 45 | "metadata": {}, 46 | "outputs": [ 47 | { 48 | "name": "stdout", 49 | "output_type": "stream", 50 | "text": [ 51 | "The true coefficient of the linear data generating process are:\n", 52 | " intercept 1.000000\n", 53 | "pred_0 0.000000\n", 54 | "pred_1 0.000000\n", 55 | "pred_2 0.000000\n", 56 | "pred_3 0.000000\n", 57 | " ... \n", 58 | "pred_95 0.000000\n", 59 | "pred_96 10.576299\n", 60 | "pred_97 0.000000\n", 61 | "pred_98 0.000000\n", 62 | "pred_99 62.472033\n", 63 | "Length: 101, dtype: float64\n" 64 | ] 65 | } 66 | ], 67 | "source": [ 68 | "# Generate synthetic data with Poisson-distributed target variable\n", 69 | "bias = 1\n", 70 | "\n", 71 | "n_samples = 100_000\n", 72 | "n_features = 100\n", 73 | "n_informative = 20\n", 74 | "\n", 75 | "X, y, true_coef = make_regression(\n", 76 | " n_samples=n_samples,\n", 77 | " n_features=n_features,\n", 78 | " n_informative=n_informative,\n", 79 | " noise=1,\n", 80 | " random_state=8,\n", 81 | " bias=bias,\n", 82 | " coef=True,\n", 83 | ")\n", 84 | "y = (y - y.mean()) / y.std()\n", 85 | "y = np.exp(y) # Transform to positive values for Poisson distribution\n", 86 | "y = np.random.poisson(y) # Add Poisson noise to the target variable\n", 87 | "# dummy sample weight (e.g. 
exposure), smallest being 30 days\n", 88 | "w = np.random.uniform(30 / 365, 1, size=len(y))\n", 89 | "# make the count a Poisson rate (frequency)\n", 90 | "y = y / w\n", 91 | "\n", 92 | "X = pd.DataFrame(X)\n", 93 | "X.columns = [f\"pred_{i}\" for i in range(X.shape[1])]\n", 94 | "\n", 95 | "# Split the data into training and testing sets\n", 96 | "X_train, X_test, y_train, y_test, w_train, w_test = train_test_split(\n", 97 | " X, y, w, test_size=0.5, random_state=42\n", 98 | ")\n", 99 | "\n", 100 | "true_coef = pd.Series(true_coef)\n", 101 | "true_coef.index = X.columns\n", 102 | "true_coef = pd.Series({**{\"intercept\": bias}, **true_coef})\n", 103 | "true_coef\n", 104 | "\n", 105 | "genuine_predictors = true_coef[true_coef > 0.0]\n", 106 | "\n", 107 | "print(f\"The true coefficient of the linear data generating process are:\\n {true_coef}\")" 108 | ] 109 | }, 110 | { 111 | "cell_type": "markdown", 112 | "metadata": {}, 113 | "source": [ 114 | "## GrootCV - fastshap vs shap \n", 115 | "\n", 116 | "### Fastshap enable" 117 | ] 118 | }, 119 | { 120 | "cell_type": "code", 121 | "execution_count": 3, 122 | "metadata": {}, 123 | "outputs": [ 124 | { 125 | "data": { 126 | "application/vnd.jupyter.widget-view+json": { 127 | "model_id": "b4a9fbb99730414786a1cc452df59ca8", 128 | "version_major": 2, 129 | "version_minor": 0 130 | }, 131 | "text/plain": [ 132 | "Repeated k-fold: 0%| | 0/9 [00:00#sk-container-id-1 {color: black;}#sk-container-id-1 pre{padding: 0;}#sk-container-id-1 div.sk-toggleable {background-color: white;}#sk-container-id-1 label.sk-toggleable__label {cursor: pointer;display: block;width: 100%;margin-bottom: 0;padding: 0.3em;box-sizing: border-box;text-align: center;}#sk-container-id-1 label.sk-toggleable__label-arrow:before {content: \"▸\";float: left;margin-right: 0.25em;color: #696969;}#sk-container-id-1 label.sk-toggleable__label-arrow:hover:before {color: black;}#sk-container-id-1 div.sk-estimator:hover label.sk-toggleable__label-arrow:before {color: black;}#sk-container-id-1 div.sk-toggleable__content {max-height: 0;max-width: 0;overflow: hidden;text-align: left;background-color: #f0f8ff;}#sk-container-id-1 div.sk-toggleable__content pre {margin: 0.2em;color: black;border-radius: 0.25em;background-color: #f0f8ff;}#sk-container-id-1 input.sk-toggleable__control:checked~div.sk-toggleable__content {max-height: 200px;max-width: 100%;overflow: auto;}#sk-container-id-1 input.sk-toggleable__control:checked~label.sk-toggleable__label-arrow:before {content: \"▾\";}#sk-container-id-1 div.sk-estimator input.sk-toggleable__control:checked~label.sk-toggleable__label {background-color: #d4ebff;}#sk-container-id-1 div.sk-label input.sk-toggleable__control:checked~label.sk-toggleable__label {background-color: #d4ebff;}#sk-container-id-1 input.sk-hidden--visually {border: 0;clip: rect(1px 1px 1px 1px);clip: rect(1px, 1px, 1px, 1px);height: 1px;margin: -1px;overflow: hidden;padding: 0;position: absolute;width: 1px;}#sk-container-id-1 div.sk-estimator {font-family: monospace;background-color: #f0f8ff;border: 1px dotted black;border-radius: 0.25em;box-sizing: border-box;margin-bottom: 0.5em;}#sk-container-id-1 div.sk-estimator:hover {background-color: #d4ebff;}#sk-container-id-1 div.sk-parallel-item::after {content: \"\";width: 100%;border-bottom: 1px solid gray;flex-grow: 1;}#sk-container-id-1 div.sk-label:hover label.sk-toggleable__label {background-color: #d4ebff;}#sk-container-id-1 div.sk-serial::before {content: \"\";position: absolute;border-left: 1px solid gray;box-sizing: 
" 156 | ], 157 | "text/plain": [ 158 | "GrootCV(fastshap=True,\n", 159 | " lgbm_params={'device': 'cpu', 'num_threads': 0, 'objective': 'rmse',\n", 160 | " 'verbosity': -1},\n", 161 | " n_folds=3, n_iter=3, objective='rmse')" 162 | ] 163 | }, 164 | "execution_count": 3, 165 | "metadata": {}, 166 | "output_type": "execute_result" 167 | } 168 | ], 169 | "source": [ 170 | "%%time\n", 171 | "feat_selector = GrootCV(\n", 172 | " objective=\"rmse\",\n", 173 | " cutoff=1,\n", 174 | " n_folds=3,\n", 175 | " n_iter=3,\n", 176 | " silent=True,\n", 177 | " fastshap=True,\n", 178 | " n_jobs=0,\n", 179 | " lgbm_params={\"device\": \"cpu\"},\n", 180 | ")\n", 181 | "feat_selector.fit(X_train, y_train, sample_weight=None)" 182 | ] 183 | }, 184 | { 185 | "cell_type": "code", 186 | "execution_count": 4, 187 | "metadata": {}, 188 | "outputs": [ 189 | { 190 | "name": "stdout", 191 | "output_type": "stream", 192 | "text": [ 193 | "The selected features: ['pred_7' 'pred_9' 'pred_15' 'pred_23' 'pred_27' 'pred_31' 'pred_35'\n", 194 | " 'pred_39' 'pred_41' 'pred_46' 'pred_48' 'pred_49' 'pred_52' 'pred_66'\n", 195 | " 'pred_71' 'pred_79' 'pred_85' 'pred_96' 'pred_99']\n", 196 | "The agnostic ranking: [1 1 1 1 1 1 1 2 1 2 1 1 1 1 1 2 1 1 1 1 1 1 1 2 1 1 1 2 1 1 1 2 1 1 1 2 1\n", 197 | " 1 1 2 1 2 1 1 1 1 2 1 2 2 1 1 2 1 1 1 1 1 1 1 1 1 1 1 1 1 2 1 1 1 1 2 1 1\n", 198 | " 1 1 1 1 1 2 1 1 1 1 1 2 1 1 1 1 1 1 1 1 1 1 2 1 1 2]\n", 199 | "The naive ranking: ['pred_7', 'pred_9', 'pred_31', 'pred_49', 'pred_41', 'pred_52', 'pred_71', 'pred_66', 'pred_27', 'pred_99', 'pred_23', 'pred_79', 'pred_39', 'pred_35', 'pred_85', 'pred_48', 'pred_46', 'pred_96', 'pred_15', 'pred_89', 'pred_21', 'pred_38', 'pred_32', 'pred_16', 'pred_69', 'pred_47', 'pred_50', 'pred_28', 'pred_60', 'pred_44', 'pred_67', 'pred_61', 'pred_34', 'pred_84', 'pred_17', 'pred_37', 'pred_29', 'pred_70', 'pred_5', 'pred_62', 'pred_19', 'pred_78', 'pred_59', 'pred_82', 'pred_64', 'pred_24', 'pred_92', 'pred_22', 'pred_80', 'pred_97', 'pred_95', 'pred_68', 'pred_58', 'pred_81', 'pred_91', 'pred_77', 'pred_53', 'pred_36', 'pred_10', 'pred_74', 'pred_45', 'pred_93', 'pred_30', 'pred_4', 'pred_65', 'pred_63', 'pred_76', 'pred_54', 'pred_43', 'pred_8', 'pred_56', 'pred_72', 'pred_0', 'pred_20', 'pred_11', 'pred_75', 'pred_83', 'pred_73', 'pred_18', 'pred_57', 'pred_14', 'pred_55', 'pred_12', 'pred_98', 'pred_88', 'pred_87', 'pred_26', 'pred_90', 'pred_42', 'pred_1', 'pred_33', 'pred_25', 'pred_94', 'pred_51', 'pred_2', 'pred_6', 'pred_40', 'pred_3', 'pred_13', 'pred_86']\n" 200 | ] 201 | } 202 | ], 203 | "source": [ 204 | "print(f\"The selected features: {feat_selector.get_feature_names_out()}\")\n", 205 | "print(f\"The agnostic ranking: {feat_selector.ranking_}\")\n", 206 | "print(f\"The naive ranking: {feat_selector.ranking_absolutes_}\")\n", 207 | "\n", 208 | "\n", 209 | "# fig = feat_selector.plot_importance(n_feat_per_inch=5)\n", 210 | "# # highlight synthetic random variable\n", 211 | "# for name in true_coef.index:\n", 212 | "# if name in genuine_predictors.index:\n", 213 | "# fig = highlight_tick(figure=fig, str_match=name, color=\"green\")\n", 214 | "# else:\n", 215 | "# fig = highlight_tick(figure=fig, str_match=name)\n", 216 | "\n", 217 | "# plt.show()" 218 | ] 219 | }, 220 | { 221 | "cell_type": "markdown", 222 | "metadata": {}, 223 | "source": [ 224 | "### Fastshap disable" 225 | ] 226 | }, 227 | { 228 | "cell_type": "code", 229 | "execution_count": 5, 230 | "metadata": {}, 231 | "outputs": [ 232 | { 233 | "data": { 234 | 
"application/vnd.jupyter.widget-view+json": { 235 | "model_id": "ecc5744cca034da7bd6a5a58e6f0dc34", 236 | "version_major": 2, 237 | "version_minor": 0 238 | }, 239 | "text/plain": [ 240 | "Repeated k-fold: 0%| | 0/9 [00:00#sk-container-id-2 {color: black;}#sk-container-id-2 pre{padding: 0;}#sk-container-id-2 div.sk-toggleable {background-color: white;}#sk-container-id-2 label.sk-toggleable__label {cursor: pointer;display: block;width: 100%;margin-bottom: 0;padding: 0.3em;box-sizing: border-box;text-align: center;}#sk-container-id-2 label.sk-toggleable__label-arrow:before {content: \"▸\";float: left;margin-right: 0.25em;color: #696969;}#sk-container-id-2 label.sk-toggleable__label-arrow:hover:before {color: black;}#sk-container-id-2 div.sk-estimator:hover label.sk-toggleable__label-arrow:before {color: black;}#sk-container-id-2 div.sk-toggleable__content {max-height: 0;max-width: 0;overflow: hidden;text-align: left;background-color: #f0f8ff;}#sk-container-id-2 div.sk-toggleable__content pre {margin: 0.2em;color: black;border-radius: 0.25em;background-color: #f0f8ff;}#sk-container-id-2 input.sk-toggleable__control:checked~div.sk-toggleable__content {max-height: 200px;max-width: 100%;overflow: auto;}#sk-container-id-2 input.sk-toggleable__control:checked~label.sk-toggleable__label-arrow:before {content: \"▾\";}#sk-container-id-2 div.sk-estimator input.sk-toggleable__control:checked~label.sk-toggleable__label {background-color: #d4ebff;}#sk-container-id-2 div.sk-label input.sk-toggleable__control:checked~label.sk-toggleable__label {background-color: #d4ebff;}#sk-container-id-2 input.sk-hidden--visually {border: 0;clip: rect(1px 1px 1px 1px);clip: rect(1px, 1px, 1px, 1px);height: 1px;margin: -1px;overflow: hidden;padding: 0;position: absolute;width: 1px;}#sk-container-id-2 div.sk-estimator {font-family: monospace;background-color: #f0f8ff;border: 1px dotted black;border-radius: 0.25em;box-sizing: border-box;margin-bottom: 0.5em;}#sk-container-id-2 div.sk-estimator:hover {background-color: #d4ebff;}#sk-container-id-2 div.sk-parallel-item::after {content: \"\";width: 100%;border-bottom: 1px solid gray;flex-grow: 1;}#sk-container-id-2 div.sk-label:hover label.sk-toggleable__label {background-color: #d4ebff;}#sk-container-id-2 div.sk-serial::before {content: \"\";position: absolute;border-left: 1px solid gray;box-sizing: border-box;top: 0;bottom: 0;left: 50%;z-index: 0;}#sk-container-id-2 div.sk-serial {display: flex;flex-direction: column;align-items: center;background-color: white;padding-right: 0.2em;padding-left: 0.2em;position: relative;}#sk-container-id-2 div.sk-item {position: relative;z-index: 1;}#sk-container-id-2 div.sk-parallel {display: flex;align-items: stretch;justify-content: center;background-color: white;position: relative;}#sk-container-id-2 div.sk-item::before, #sk-container-id-2 div.sk-parallel-item::before {content: \"\";position: absolute;border-left: 1px solid gray;box-sizing: border-box;top: 0;bottom: 0;left: 50%;z-index: -1;}#sk-container-id-2 div.sk-parallel-item {display: flex;flex-direction: column;z-index: 1;position: relative;background-color: white;}#sk-container-id-2 div.sk-parallel-item:first-child::after {align-self: flex-end;width: 50%;}#sk-container-id-2 div.sk-parallel-item:last-child::after {align-self: flex-start;width: 50%;}#sk-container-id-2 div.sk-parallel-item:only-child::after {width: 0;}#sk-container-id-2 div.sk-dashed-wrapped {border: 1px dashed gray;margin: 0 0.4em 0.5em 0.4em;box-sizing: border-box;padding-bottom: 0.4em;background-color: 
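The two timed `GrootCV` runs in this notebook differ only in the `fastshap` flag, so the comparison can be reproduced compactly by looping over both settings and recording wall-clock time. A minimal sketch, assuming `X_train` and `y_train` from the data-generation cell and reusing the exact `GrootCV` parameters shown here; the `timings` dictionary is an illustrative addition:

```python
# Time GrootCV with the fasttreeshap backend (fastshap=True) versus the
# regular shap backend (fastshap=False), all other parameters identical.
import time
from arfs.feature_selection import GrootCV

timings = {}
for fast in (True, False):
    selector = GrootCV(
        objective="rmse",
        cutoff=1,
        n_folds=3,
        n_iter=3,
        silent=True,
        fastshap=fast,
        n_jobs=0,
        lgbm_params={"device": "cpu"},
    )
    start = time.perf_counter()
    selector.fit(X_train, y_train, sample_weight=None)
    timings["fasttreeshap" if fast else "shap"] = time.perf_counter() - start

print(timings)
```

As stated in the introduction, the size of the gap depends on the data and the hardware (including virtualization), so the ratio observed here will not necessarily transfer to other machines.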
" 262 | ], 263 | "text/plain": [ 264 | "GrootCV(lgbm_params={'device': 'cpu', 'num_threads': 0, 'objective': 'rmse',\n", 265 | " 'verbosity': -1},\n", 266 | " n_folds=3, n_iter=3, objective='rmse')" 267 | ] 268 | }, 269 | "execution_count": 5, 270 | "metadata": {}, 271 | "output_type": "execute_result" 272 | } 273 | ], 274 | "source": [ 275 | "%%time\n", 276 | "feat_selector = GrootCV(\n", 277 | " objective=\"rmse\",\n", 278 | " cutoff=1,\n", 279 | " n_folds=3,\n", 280 | " n_iter=3,\n", 281 | " silent=True,\n", 282 | " fastshap=False,\n", 283 | " n_jobs=0,\n", 284 | " lgbm_params={\"device\": \"cpu\"},\n", 285 | ")\n", 286 | "feat_selector.fit(X_train, y_train, sample_weight=None)" 287 | ] 288 | }, 289 | { 290 | "cell_type": "code", 291 | "execution_count": 6, 292 | "metadata": {}, 293 | "outputs": [ 294 | { 295 | "name": "stdout", 296 | "output_type": "stream", 297 | "text": [ 298 | "The selected features: ['pred_7' 'pred_9' 'pred_15' 'pred_23' 'pred_27' 'pred_31' 'pred_35'\n", 299 | " 'pred_39' 'pred_41' 'pred_46' 'pred_48' 'pred_49' 'pred_52' 'pred_66'\n", 300 | " 'pred_71' 'pred_79' 'pred_85' 'pred_96' 'pred_99']\n", 301 | "The agnostic ranking: [1 1 1 1 1 1 1 2 1 2 1 1 1 1 1 2 1 1 1 1 1 1 1 2 1 1 1 2 1 1 1 2 1 1 1 2 1\n", 302 | " 1 1 2 1 2 1 1 1 1 2 1 2 2 1 1 2 1 1 1 1 1 1 1 1 1 1 1 1 1 2 1 1 1 1 2 1 1\n", 303 | " 1 1 1 1 1 2 1 1 1 1 1 2 1 1 1 1 1 1 1 1 1 1 2 1 1 2]\n", 304 | "The naive ranking: ['pred_7', 'pred_9', 'pred_31', 'pred_49', 'pred_41', 'pred_52', 'pred_71', 'pred_66', 'pred_27', 'pred_99', 'pred_23', 'pred_79', 'pred_39', 'pred_35', 'pred_85', 'pred_48', 'pred_46', 'pred_96', 'pred_15', 'pred_38', 'pred_32', 'pred_21', 'pred_89', 'pred_50', 'pred_5', 'pred_17', 'pred_29', 'pred_28', 'pred_69', 'pred_61', 'pred_84', 'pred_58', 'pred_67', 'pred_59', 'pred_68', 'pred_34', 'pred_97', 'pred_47', 'pred_60', 'pred_91', 'pred_75', 'pred_22', 'pred_10', 'pred_82', 'pred_16', 'pred_78', 'pred_42', 'pred_95', 'pred_80', 'pred_37', 'pred_2', 'pred_62', 'pred_76', 'pred_92', 'pred_20', 'pred_77', 'pred_19', 'pred_24', 'pred_63', 'pred_93', 'pred_44', 'pred_11', 'pred_53', 'pred_65', 'pred_33', 'pred_45', 'pred_14', 'pred_98', 'pred_57', 'pred_64', 'pred_30', 'pred_81', 'pred_83', 'pred_87', 'pred_25', 'pred_51', 'pred_70', 'pred_8', 'pred_36', 'pred_55', 'pred_0', 'pred_88', 'pred_43', 'pred_12', 'pred_4', 'pred_74', 'pred_72', 'pred_54', 'pred_1', 'pred_13', 'pred_73', 'pred_40', 'pred_56', 'pred_3', 'pred_26', 'pred_18', 'pred_94', 'pred_6', 'pred_86', 'pred_90']\n" 305 | ] 306 | } 307 | ], 308 | "source": [ 309 | "print(f\"The selected features: {feat_selector.get_feature_names_out()}\")\n", 310 | "print(f\"The agnostic ranking: {feat_selector.ranking_}\")\n", 311 | "print(f\"The naive ranking: {feat_selector.ranking_absolutes_}\")" 312 | ] 313 | } 314 | ], 315 | "metadata": { 316 | "kernelspec": { 317 | "display_name": "arfs", 318 | "language": "python", 319 | "name": "python3" 320 | }, 321 | "language_info": { 322 | "codemirror_mode": { 323 | "name": "ipython", 324 | "version": 3 325 | }, 326 | "file_extension": ".py", 327 | "mimetype": "text/x-python", 328 | "name": "python", 329 | "nbconvert_exporter": "python", 330 | "pygments_lexer": "ipython3", 331 | "version": "3.10.12" 332 | }, 333 | "orig_nbformat": 4 334 | }, 335 | "nbformat": 4, 336 | "nbformat_minor": 2 337 | } 338 | -------------------------------------------------------------------------------- /docs/notebooks/bender_hex_mini.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/ThomasBury/arfs/03f67d0a54b69fac5ddbb83e306c8e8e72e2d3a2/docs/notebooks/bender_hex_mini.png -------------------------------------------------------------------------------- /docs/requirements.txt: -------------------------------------------------------------------------------- 1 | lightgbm>=3.3.1 2 | matplotlib>=3.5 3 | numpy>=1.21 4 | pandas>=1.4 5 | scikit_learn>=1.0 6 | scipy>=1.8.0 7 | seaborn>=0.11.2 8 | shap>=0.40.0 9 | tqdm>=4.62.3 10 | statsmodels>=0.14.0 11 | ipykernel 12 | ipython_genutils 13 | pandoc 14 | sphinx 15 | sphinxawesome-theme==5.0.0b5 16 | nbsphinx==0.9.2 17 | sphinx-autodoc-typehints<1.24.0 18 | sphinx-copybutton==0.5.2 19 | sphinx-tabs==3.4.1 20 | fasttreeshap -------------------------------------------------------------------------------- /images/boostagroota-boston-lgb.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ThomasBury/arfs/03f67d0a54b69fac5ddbb83e306c8e8e72e2d3a2/images/boostagroota-boston-lgb.png -------------------------------------------------------------------------------- /images/grootcv-boston.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ThomasBury/arfs/03f67d0a54b69fac5ddbb83e306c8e8e72e2d3a2/images/grootcv-boston.png -------------------------------------------------------------------------------- /images/leshy-boston.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ThomasBury/arfs/03f67d0a54b69fac5ddbb83e306c8e8e72e2d3a2/images/leshy-boston.png -------------------------------------------------------------------------------- /images/leshy-titanic-catboost-shap.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ThomasBury/arfs/03f67d0a54b69fac5ddbb83e306c8e8e72e2d3a2/images/leshy-titanic-catboost-shap.png -------------------------------------------------------------------------------- /images/leshy-titanic-lgbm-shap.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ThomasBury/arfs/03f67d0a54b69fac5ddbb83e306c8e8e72e2d3a2/images/leshy-titanic-lgbm-shap.png -------------------------------------------------------------------------------- /images/leshy-titanic-rndforest-shap.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ThomasBury/arfs/03f67d0a54b69fac5ddbb83e306c8e8e72e2d3a2/images/leshy-titanic-rndforest-shap.png -------------------------------------------------------------------------------- /logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ThomasBury/arfs/03f67d0a54b69fac5ddbb83e306c8e8e72e2d3a2/logo.png -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [build-system] 2 | requires = ["setuptools>=64", "wheel"] 3 | build-backend = "setuptools.build_meta" 4 | 5 | [project] 6 | name = "arfs" 7 | description = "All Relevant Feature Selection and Maximal Relevant minimal redundancy FS" 8 | readme = "README.md" 9 | license = { text = "MIT" } 10 | authors = [ 11 | { name = "Thomas Bury", email = "bury.thomas@gmail.com" }, 12 | ] 13 | requires-python = ">=3.9, <3.13" 14 | dynamic = ["version"] 15 
| keywords = ["feature-selection", "all-relevant", "selection", "MRmr"] 16 | 17 | classifiers = [ 18 | "Programming Language :: Python :: 3", 19 | ] 20 | 21 | dependencies = [ 22 | "lightgbm>=4.6.0", 23 | "matplotlib>=3.9.4", 24 | "numpy>=2.0.2", 25 | "pandas>=2.2.3", 26 | "scikit-learn>=1.6.1", 27 | "scipy>=1.13.1", 28 | "seaborn>=0.13.2", 29 | "shap>=0.47.0", 30 | "statsmodels>=0.14.4", 31 | "tqdm>=4.67.1", 32 | ] 33 | 34 | [project.optional-dependencies] 35 | docs = [ 36 | "ipykernel", 37 | "ipython_genutils", 38 | "pandoc", 39 | "sphinx", 40 | "sphinxawesome-theme==5.0.0b5", 41 | "nbsphinx==0.9.2", 42 | "sphinx-autodoc-typehints<1.24.0", 43 | "sphinx-copybutton==0.5.2", 44 | "sphinx-tabs==3.4.1", 45 | # "fasttreeshap" 46 | ] 47 | 48 | test = [ 49 | "pytest", 50 | "pytest-cov" 51 | ] 52 | 53 | [project.urls] 54 | Documentation = "https://github.com/ThomasBury/arfs" 55 | Source = "https://github.com/ThomasBury/arfs" 56 | Tracker = "https://github.com/ThomasBury/arfs/issues" 57 | Download = "https://pypi.org/project/arfs/" 58 | 59 | [tool.setuptools] 60 | package-dir = { "" = "src" } 61 | zip-safe = false 62 | 63 | [tool.setuptools.packages.find] 64 | where = ["src"] 65 | 66 | [tool.setuptools.dynamic] 67 | version = { attr = "arfs.__version__" } 68 | 69 | [tool.setuptools.package-data] 70 | "arfs.dataset.data" = ["*.joblib", "*.zip"] 71 | "arfs.dataset.description" = ["*.rst"] 72 | -------------------------------------------------------------------------------- /src/arfs/.gitignore: -------------------------------------------------------------------------------- 1 | ### Python template 2 | # Byte-compiled / optimized / DLL files 3 | __pycache__/ 4 | *.py[cod] 5 | *$py.class 6 | 7 | # C extensions 8 | *.so 9 | 10 | # Distribution / packaging 11 | .Python 12 | build/ 13 | develop-eggs/ 14 | dist/ 15 | downloads/ 16 | eggs/ 17 | .eggs/ 18 | lib/ 19 | lib64/ 20 | parts/ 21 | sdist/ 22 | var/ 23 | wheels/ 24 | *.egg-info/ 25 | .installed.cfg 26 | *.egg 27 | MANIFEST 28 | 29 | # PyInstaller 30 | # Usually these files are written by a python script from a template 31 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
32 | *.manifest 33 | *.spec 34 | 35 | # Installer logs 36 | pip-log.txt 37 | pip-delete-this-directory.txt 38 | 39 | # Unit test / coverage reports 40 | htmlcov/ 41 | .tox/ 42 | .coverage 43 | .coverage.* 44 | .cache 45 | nosetests.xml 46 | coverage.xml 47 | *.cover 48 | .hypothesis/ 49 | .pytest_cache/ 50 | 51 | # Translations 52 | *.mo 53 | *.pot 54 | 55 | # Django stuff: 56 | *.log 57 | local_settings.py 58 | db.sqlite3 59 | 60 | # Flask stuff: 61 | instance/ 62 | .webassets-cache 63 | 64 | # Scrapy stuff: 65 | .scrapy 66 | 67 | # Sphinx documentation 68 | docs/_build/ 69 | 70 | # PyBuilder 71 | target/ 72 | 73 | # Jupyter Notebook 74 | .ipynb_checkpoints 75 | 76 | # pyenv 77 | .python-version 78 | 79 | # celery beat schedule file 80 | celerybeat-schedule 81 | 82 | # SageMath parsed files 83 | *.sage.py 84 | 85 | # Environments 86 | .env 87 | .venv 88 | env/ 89 | venv/ 90 | ENV/ 91 | env.bak/ 92 | venv.bak/ 93 | 94 | # Spyder project settings 95 | .spyderproject 96 | .spyproject 97 | 98 | # Rope project settings 99 | .ropeproject 100 | 101 | # mkdocs documentation 102 | /site 103 | 104 | # mypy 105 | .mypy_cache/ 106 | 107 | # Covers JetBrains IDEs: IntelliJ, RubyMine, PhpStorm, AppCode, PyCharm, CLion, Android Studio and WebStorm 108 | # Reference: https://intellij-support.jetbrains.com/hc/en-us/articles/206544839 109 | 110 | # User-specific stuff 111 | .idea/**/workspace.xml 112 | .idea/**/tasks.xml 113 | .idea/**/dictionaries 114 | .idea/**/shelf 115 | 116 | # Sensitive or high-churn files 117 | .idea/**/dataSources/ 118 | .idea/**/dataSources.ids 119 | .idea/**/dataSources.local.xml 120 | .idea/**/sqlDataSources.xml 121 | .idea/**/dynamic.xml 122 | .idea/**/uiDesigner.xml 123 | .idea/**/dbnavigator.xml 124 | 125 | # Gradle 126 | .idea/**/gradle.xml 127 | .idea/**/libraries 128 | 129 | # CMake 130 | cmake-build-debug/ 131 | cmake-build-release/ 132 | 133 | # Mongo Explorer plugin 134 | .idea/**/mongoSettings.xml 135 | 136 | # File-based project format 137 | *.iws 138 | 139 | # IntelliJ 140 | out/ 141 | 142 | # mpeltonen/sbt-idea plugin 143 | .idea_modules/ 144 | 145 | # JIRA plugin 146 | atlassian-ide-plugin.xml 147 | 148 | # Cursive Clojure plugin 149 | .idea/replstate.xml 150 | 151 | # Crashlytics plugin (for Android Studio and IntelliJ) 152 | com_crashlytics_export_strings.xml 153 | crashlytics.properties 154 | crashlytics-build.properties 155 | fabric.properties 156 | 157 | # Editor-based Rest Client 158 | .idea/httpRequests 159 | 160 | # catboost 161 | docs/notebooks/catboost_info -------------------------------------------------------------------------------- /src/arfs/__init__.py: -------------------------------------------------------------------------------- 1 | """init module, providing information about the arfs package""" 2 | 3 | __version__ = "3.0.0" 4 | -------------------------------------------------------------------------------- /src/arfs/benchmark.py: -------------------------------------------------------------------------------- 1 | """Benchmark Feature Selection 2 | 3 | This module provides utilities for comparing and benchmarking feature selection methods 4 | 5 | Module Structure: 6 | ----------------- 7 | - ``sklearn_pimp_bench``: function for comparing using the sklearn permutation importance 8 | - ``compare_varimp``: function for comparing using possible 4 kinds of variable importance 9 | - ``highlight_tick``: function for highlighting specific (genuine or noise for instance) predictors in the importance chart 10 | """ 11 | 12 | from __future__ import 
print_function, division 13 | 14 | import itertools 15 | from matplotlib import pyplot as plt 16 | from sklearn.model_selection import train_test_split 17 | from sklearn.inspection import permutation_importance 18 | 19 | from sklearn.base import clone 20 | 21 | from .preprocessing import OrdinalEncoderPandas 22 | 23 | 24 | def sklearn_pimp_bench(model, X, y, task="regression", sample_weight=None): 25 | """Benchmark using sklearn permutation importance, works for regression and classification. 26 | 27 | Parameters 28 | ---------- 29 | model: object 30 | An estimator that has not been fitted, sklearn compatible. 31 | X : ndarray or DataFrame, shape (n_samples, n_features) 32 | Data on which permutation importance will be computed. 33 | y : array-like or None, shape (n_samples, ) or (n_samples, n_classes) 34 | Targets for supervised or None for unsupervised. 35 | task : str, optional 36 | kind of task, either 'regression' or 'classification', by default 'regression' 37 | sample_weight : array-like of shape (n_samples,), optional 38 | Sample weights, by default None 39 | 40 | Returns 41 | ------- 42 | plt.figure 43 | the figure corresponding to the feature selection 44 | 45 | Raises 46 | ------ 47 | ValueError 48 | if task is not 'regression' or 'classification' 49 | """ 50 | 51 | # for lightGBM cat feat as contiguous int 52 | # https://lightgbm.readthedocs.io/en/latest/Advanced-Topics.html 53 | # same for Random Forest and XGBoost (OHE leads to deep and sparse trees). 54 | # For illustrations, see 55 | # https://towardsdatascience.com/one-hot-encoding-is-making- 56 | # your-tree-based-ensembles-worse-heres-why-d64b282b5769 57 | 58 | # X, cat_var_df, inv_mapper, mapper = cat_var(X) 59 | X = OrdinalEncoderPandas().fit_transform(X) 60 | 61 | if task == "regression": 62 | stratify = None 63 | elif task == "classification": 64 | stratify = y 65 | else: 66 | raise ValueError("`task` should be either 'regression' or 'classification' ") 67 | 68 | if sample_weight is not None: 69 | X_train, X_test, y_train, y_test, w_train, w_test = train_test_split( 70 | X, y, sample_weight, stratify=stratify, random_state=42 71 | ) 72 | else: 73 | X_train, X_test, y_train, y_test = train_test_split( 74 | X, y, stratify=stratify, random_state=42 75 | ) 76 | w_train, w_test = None, None 77 | 78 | # lightgbm faster and better than RF 79 | 80 | model.fit(X_train, y_train, sample_weight=w_train) 81 | result = permutation_importance( 82 | model, 83 | X_test, 84 | y_test, 85 | n_repeats=10, 86 | random_state=42, 87 | n_jobs=2, 88 | sample_weight=w_test, 89 | ) 90 | 91 | sorted_idx = result.importances_mean.argsort() 92 | # Plot (5 predictors per inch) 93 | fig, ax = plt.subplots(figsize=(16, X.shape[1] / 5)) 94 | ax.boxplot( 95 | result.importances[sorted_idx].T, vert=False, labels=X_test.columns[sorted_idx] 96 | ) 97 | ax.set_title("Permutation Importances (test set)") 98 | ax.tick_params(axis="both", which="major", labelsize=9) 99 | fig.tight_layout() 100 | indices = [i for i, s in enumerate(X_test.columns[sorted_idx]) if "random" in s] 101 | [fig.gca().get_yticklabels()[idx].set_color("red") for idx in indices] 102 | indices = [i for i, s in enumerate(X_test.columns[sorted_idx]) if "genuine" in s] 103 | [fig.gca().get_yticklabels()[idx].set_color("green") for idx in indices] 104 | plt.show() 105 | return fig 106 | 107 | 108 | def compare_varimp(feat_selector, models, X, y, sample_weight=None): 109 | """Utility function to compare the results for the three possible kind of feature importance 110 | 111 | Parameters 112 | 
---------- 113 | feat_selector : object 114 | an instance of either Leshy, BoostaGRoota or GrootCV 115 | models : list of objects 116 | list of tree based scikit-learn estimators 117 | X : pd.DataFrame, shape (n_samples, n_features) 118 | the predictors frame 119 | y : pd.Series 120 | the target (same length as X) 121 | sample_weight : None or pd.Series, optional 122 | sample weights if any, by default None 123 | """ 124 | 125 | varimp_list = ["shap", "pimp", "native"] 126 | for model, varimp in itertools.product(models, varimp_list): 127 | print( 128 | "=" * 20 129 | + " " 130 | + str(feat_selector.__class__.__name__) 131 | + " - testing: {mod:>25} for var.imp: {vimp:<15} ".format( 132 | mod=str(model.__class__.__name__), vimp=varimp 133 | ) 134 | + "=" * 20 135 | ) 136 | # change the varimp 137 | feat_selector.importance = varimp 138 | # change model 139 | mod_clone = clone(model, safe=True) 140 | feat_selector.estimator = mod_clone 141 | # fit the feature selector 142 | feat_selector.fit(X=X, y=y, sample_weight=sample_weight) 143 | # print the results 144 | print(feat_selector.selected_features_) 145 | fig = feat_selector.plot_importance(n_feat_per_inch=5) 146 | 147 | if fig is not None: 148 | # highlight synthetic random variable 149 | fig = highlight_tick(figure=fig, str_match="random") 150 | fig = highlight_tick(figure=fig, str_match="genuine", color="green") 151 | plt.show() 152 | 153 | 154 | def highlight_tick(str_match, figure, color="red", axis="y"): 155 | """Highlight the x/y tick-labels if they contain a given string 156 | 157 | Parameters 158 | ---------- 159 | str_match : str 160 | the substring to match 161 | figure : object 162 | the matplotlib figure 163 | color : str, optional 164 | the matplotlib color for highlighting tick-labels, by default 'red' 165 | axis : str, optional 166 | axis to use for highlighting, by default 'y' 167 | 168 | Returns 169 | ------- 170 | plt.figure 171 | the modified matplotlib figure 172 | 173 | Raises 174 | ------ 175 | ValueError 176 | if axis is not 'x' or 'y' 177 | """ 178 | 179 | if axis == "y": 180 | labels = [item.get_text() for item in figure.gca().get_yticklabels()] 181 | indices = [i for i, s in enumerate(labels) if str_match in s] 182 | [figure.gca().get_yticklabels()[idx].set_color(color) for idx in indices] 183 | elif axis == "x": 184 | labels = [item.get_text() for item in figure.gca().get_xticklabels()] 185 | indices = [i for i, s in enumerate(labels) if str_match in s] 186 | [figure.gca().get_xticklabels()[idx].set_color(color) for idx in indices] 187 | else: 188 | raise ValueError("`axis` should be a string, either 'y' or 'x'") 189 | 190 | return figure 191 | -------------------------------------------------------------------------------- /src/arfs/dataset/data/boston_bunch.joblib: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ThomasBury/arfs/03f67d0a54b69fac5ddbb83e306c8e8e72e2d3a2/src/arfs/dataset/data/boston_bunch.joblib -------------------------------------------------------------------------------- /src/arfs/dataset/data/housing.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ThomasBury/arfs/03f67d0a54b69fac5ddbb83e306c8e8e72e2d3a2/src/arfs/dataset/data/housing.zip -------------------------------------------------------------------------------- /src/arfs/dataset/descr/housing.rst: -------------------------------------------------------------------------------- 1 | description: 2 | .. 
_california_housing_dataset: 3 | 4 | California Housing dataset 5 | -------------------------- 6 | 7 | **Data Set Characteristics:** 8 | 9 | :Number of Instances: 20640 10 | 11 | :Number of Attributes: 8 numeric, predictive attributes and the target 12 | 13 | :Attribute Information: 14 | - MedInc median income in block 15 | - HouseAge median house age in block 16 | - AveRooms average number of rooms 17 | - AveBedrms average number of bedrooms 18 | - Population block population 19 | - AveOccup average house occupancy 20 | - Latitude house block latitude 21 | - Longitude house block longitude 22 | 23 | :Missing Attribute Values: None 24 | 25 | This dataset was obtained from the StatLib repository. 26 | http://lib.stat.cmu.edu/datasets/ 27 | 28 | The target variable is the median house value for California districts. 29 | 30 | This dataset was derived from the 1990 U.S. census, using one row per census 31 | block group. A block group is the smallest geographical unit for which the U.S. 32 | Census Bureau publishes sample data (a block group typically has a population 33 | of 600 to 3,000 people). 34 | 35 | It can be downloaded/loaded using the 36 | :func:`sklearn.datasets.fetch_california_housing` function. 37 | 38 | .. topic:: References 39 | 40 | - Pace, R. Kelley and Ronald Barry, Sparse Spatial Autoregressions, 41 | Statistics and Probability Letters, 33 (1997) 291-297 -------------------------------------------------------------------------------- /src/arfs/feature_selection/__init__.py: -------------------------------------------------------------------------------- 1 | from .allrelevant import Leshy, BoostAGroota, GrootCV 2 | from .unsupervised import ( 3 | MissingValueThreshold, 4 | UniqueValuesThreshold, 5 | CardinalityThreshold, 6 | CollinearityThreshold, 7 | ) 8 | 9 | from .lasso import LassoFeatureSelection 10 | from .variable_importance import VariableImportance 11 | from .summary import make_fs_summary 12 | from .mrmr import MinRedundancyMaxRelevance 13 | 14 | __all__ = [ 15 | "BaseThresholdSelector", 16 | "MissingValueThreshold", 17 | "UniqueValuesThreshold", 18 | "CardinalityThreshold", 19 | "CollinearityThreshold", 20 | "VariableImportance", 21 | "make_fs_summary", 22 | "Leshy", 23 | "BoostAGroota", 24 | "GrootCV", 25 | "MinRedundancyMaxRelevance", 26 | "LassoFeatureSelection", 27 | ] 28 | -------------------------------------------------------------------------------- /src/arfs/feature_selection/base.py: -------------------------------------------------------------------------------- 1 | """Base Submodule 2 | 3 | This module provides a base class for selector using a statistic and a threshold 4 | 5 | Module Structure: 6 | ----------------- 7 | - ``BaseThresholdSelector``: parent class for the "treshold-based" selectors 8 | 9 | """ 10 | 11 | # Settings and libraries 12 | from __future__ import print_function 13 | 14 | # pandas 15 | import pandas as pd 16 | 17 | # numpy 18 | import numpy as np 19 | 20 | # sklearn 21 | 22 | from sklearn.utils.validation import check_is_fitted 23 | from sklearn.base import BaseEstimator 24 | from sklearn.feature_selection._base import SelectorMixin 25 | 26 | 27 | # fix random seed for reproducibility 28 | np.random.seed(7) 29 | 30 | 31 | class BaseThresholdSelector(SelectorMixin, BaseEstimator): 32 | """Base class for threshold-based feature selection 33 | 34 | Parameters 35 | ---------- 36 | threshold : float, .05 37 | Features with a training-set missing greater/lower (geq/leq) than this threshold will be removed 38 | statistic_fn : callable, 
optional 39 | The function for computing the statistic series. The index should be the column names and the 40 | the values the computed statistic 41 | greater_than_threshold : bool, False 42 | Whether or not to reject the features if lower or greater than threshold 43 | 44 | Returns 45 | ------- 46 | selected_features: list of str 47 | List of selected features. 48 | 49 | Attributes 50 | ---------- 51 | n_features_in_ : int 52 | number of input predictors 53 | support_ : list of bool 54 | the list of the selected X-columns 55 | selected_features_ : list of str 56 | the list of names of selected features 57 | not_selected_features_ : list of str 58 | the list of names of rejected features 59 | 60 | """ 61 | 62 | def __init__( 63 | self, 64 | threshold=0.05, 65 | statistic_fn=None, 66 | greater_than_threshold=False, 67 | ): 68 | self.threshold = threshold 69 | self.statistic_fn = statistic_fn 70 | self.greater_than_threshold = greater_than_threshold 71 | 72 | def fit(self, X, y=None, sample_weight=None): 73 | """Learn empirical statistics from X. 74 | 75 | Parameters 76 | ---------- 77 | X : pd.DataFrame, shape (n_samples, n_features) 78 | Data from which to compute variances, where `n_samples` is 79 | the number of samples and `n_features` is the number of features. 80 | y : any, default=None 81 | Ignored. This parameter exists only for compatibility with 82 | sklearn.pipeline.Pipeline. 83 | sample_weight : pd.Series, optional, shape (n_samples,) 84 | weights for computing the statistics (e.g. weighted average) 85 | 86 | Returns 87 | ------- 88 | self : object 89 | Returns the instance itself. 90 | """ 91 | 92 | # Calculate the fraction of missing in each column 93 | 94 | if isinstance(X, pd.DataFrame): 95 | self.feature_names_in_ = X.columns.to_numpy() 96 | else: 97 | raise TypeError("X is not a dataframe") 98 | 99 | self.statistic_series_ = self.statistic_fn(X) 100 | self.statistic_df_ = pd.DataFrame(self.statistic_series_).rename( 101 | columns={"index": "feature", 0: "statistic"} 102 | ) 103 | 104 | # Sort with highest number of missing values on top 105 | self.statistic_df_ = self.statistic_df_.sort_values( 106 | "statistic", ascending=False 107 | ) 108 | if self.greater_than_threshold: 109 | self.support_ = self.statistic_series_.values > self.threshold 110 | else: 111 | self.support_ = self.statistic_series_.values < self.threshold 112 | 113 | self.selected_features_ = self.feature_names_in_[self.support_] 114 | self.not_selected_features_ = self.feature_names_in_[~self.support_] 115 | 116 | return self 117 | 118 | def _get_support_mask(self): 119 | check_is_fitted(self) 120 | 121 | return self.support_ 122 | 123 | def transform(self, X): 124 | """ 125 | Transform the data, returns a transformed version of `X`. 126 | 127 | Parameters 128 | ---------- 129 | X : array-like of shape (n_samples, n_features) 130 | Input samples. 131 | 132 | Returns 133 | ------- 134 | X_new : ndarray array of shape (n_samples, n_features_new) 135 | Transformed array. 136 | """ 137 | if not isinstance(X, pd.DataFrame): 138 | raise TypeError("X is not a dataframe") 139 | return X[self.selected_features_] 140 | 141 | def fit_transform(self, X, y=None, sample_weight=None, **fit_params): 142 | """ 143 | Fit to data, then transform it. 144 | Fits transformer to `X` and `y` with optional parameters `fit_params` 145 | and returns a transformed version of `X`. 146 | Parameters 147 | ---------- 148 | X : array-like of shape (n_samples, n_features) 149 | Input samples. 
150 | y : array-like of shape (n_samples,) or (n_samples, n_outputs), \ 151 | default=None 152 | Target values (None for unsupervised transformations). 153 | sample_weight : array-like of shape (n_samples,) or (n_samples, n_outputs), \ 154 | default=None 155 | sample weight values. 156 | **fit_params : dict 157 | Additional fit parameters. 158 | Returns 159 | ------- 160 | X_new : ndarray array of shape (n_samples, n_features_new) 161 | Transformed array. 162 | """ 163 | return self.fit(X=X, y=y, sample_weight=sample_weight, **fit_params).transform( 164 | X 165 | ) 166 | 167 | def _more_tags(self): 168 | return {"allow_nan": True} 169 | -------------------------------------------------------------------------------- /src/arfs/feature_selection/mrmr.py: -------------------------------------------------------------------------------- 1 | """MRMR Feature Selection Module 2 | 3 | This module provides MinRedundancyMaxRelevance (MRMR) feature selection for classification or regression tasks. 4 | In a classification task, the target should be of object or pandas category dtype, while in a regression task, 5 | the target should be numeric. The predictors can be categorical or numerical without requiring encoding, 6 | as the appropriate method (correlation, correlation ratio, or Theil's U) will be automatically selected based on the data type. 7 | 8 | Module Structure: 9 | ----------------- 10 | - ``MinRedundancyMaxRelevance``: MRMR feature selection class for classification or regression tasks. 11 | """ 12 | 13 | import functools 14 | import numpy as np 15 | import pandas as pd 16 | from sklearn.base import BaseEstimator 17 | from sklearn.utils.validation import check_is_fitted 18 | from tqdm.auto import tqdm 19 | from sklearn.feature_selection._base import SelectorMixin 20 | from ..association import ( 21 | f_stat_classification_parallel, 22 | f_stat_regression_parallel, 23 | association_series, 24 | ) 25 | 26 | FLOOR = 0.001 27 | 28 | 29 | class MinRedundancyMaxRelevance(SelectorMixin, BaseEstimator): 30 | """MRMR feature selection for a classification or a regression task 31 | For a classification task, the target should be of object or pandas category 32 | dtype. For a regression task, the target should be of numeric dtype. 33 | The predictors can be categorical or numerical; no encoding is required. 34 | The dtype will be automatically detected and the right method applied (either 35 | correlation, correlation ratio or Theil's U) 36 | 37 | 38 | Parameters 39 | ---------- 40 | n_features_to_select: int 41 | Number of features to select. 42 | relevance_func: callable, optional 43 | relevance function having arguments "X", "y", "sample_weight" and returning a pd.Series 44 | containing a score of relevance for each feature 45 | redundancy_func: callable, optional 46 | Redundancy method. 47 | If callable, it should take "X", "sample_weight" as input and return a pandas.Series 48 | containing a score of redundancy for each feature. 49 | denominator_func: str or callable (optional, default='mean') 50 | Synthesis function to apply to the denominator of MRMR score. 51 | If string, name of method. Supported: 'max', 'mean'. 52 | If callable, it should take an iterable as input and return a scalar. 53 | task: str 54 | either "regression" or "classification" 55 | only_same_domain: bool (optional, default=False) 56 | If False, all the necessary correlation coefficients are computed. 57 | If True, only features belonging to the same domain are compared.
58 | Domain is defined by the string preceding the first underscore: 59 | for instance "cusinfo_age" and "cusinfo_income" belong to the same domain, whereas "age" and "income" don't. 60 | return_scores: bool (optional, default=False) 61 | If False, only the list of selected features is returned. 62 | If True, a tuple containing (list of selected features, relevance, redundancy) is returned. 63 | n_jobs: int (optional, default=1) 64 | Maximum number of workers to use. Only used when relevance = "f" or redundancy = "corr". 65 | If -1, use as many workers as min(cpu count, number of features). 66 | show_progress: bool (optional, default=True) 67 | If False, no progress bar is displayed. 68 | If True, a TQDM progress bar shows the number of features processed. 69 | 70 | Returns 71 | ------- 72 | selected_features: list of str 73 | List of selected features. 74 | 75 | Attributes 76 | ---------- 77 | n_features_in_ : int 78 | number of input predictors 79 | ranking_ : pd.DataFrame 80 | name and scores for the selected features 81 | support_ : list of bool 82 | the list of the selected X-columns 83 | Example 84 | ------- 85 | >>> from sklearn.datasets import make_classification, make_regression 86 | >>> X, y = make_regression(n_samples = 1000, n_features = 50, n_informative = 5, shuffle=False) # , n_redundant = 5 87 | >>> X = pd.DataFrame(X) 88 | >>> y = pd.Series(y) 89 | >>> pred_name = [f"pred_{i}" for i in range(X.shape[1])] 90 | >>> X.columns = pred_name 91 | >>> y.name = "target" 92 | >>> fs_mrmr = MinRedundancyMaxRelevance( 93 | >>> n_features_to_select=5, 94 | >>> relevance_func=None, 95 | >>> redundancy_func=None, 96 | >>> task="regression", #"classification", 97 | >>> denominator_func=np.mean, 98 | >>> only_same_domain=False, 99 | >>> return_scores=False, 100 | >>> show_progress=True) 101 | >>> #fs_mrmr.fit(X=X, y=y.astype(str), sample_weight=None) 102 | >>> fs_mrmr.fit(X=X, y=y, sample_weight=None) 103 | """ 104 | 105 | def __init__( 106 | self, 107 | n_features_to_select, 108 | relevance_func=None, 109 | redundancy_func=None, 110 | task="regression", 111 | denominator_func=np.mean, 112 | only_same_domain=False, 113 | return_scores=False, 114 | n_jobs=1, 115 | show_progress=True, 116 | ): 117 | self.n_features_to_select = n_features_to_select 118 | self.relevance_func = relevance_func 119 | self.redundancy_func = redundancy_func 120 | self.denominator_func = denominator_func 121 | self.only_same_domain = only_same_domain 122 | self.return_scores = return_scores 123 | self.show_progress = show_progress 124 | self.n_jobs = n_jobs 125 | self.task = task 126 | 127 | if self.relevance_func is None: 128 | if self.task == "regression": 129 | self.relevance_func = functools.partial( 130 | f_stat_regression_parallel, n_jobs=self.n_jobs 131 | ) 132 | else: 133 | self.relevance_func = functools.partial( 134 | f_stat_classification_parallel, n_jobs=self.n_jobs 135 | ) 136 | 137 | if self.redundancy_func is None: 138 | self.redundancy_func = functools.partial( 139 | association_series, n_jobs=self.n_jobs, normalize=True 140 | ) 141 | 142 | def fit(self, X, y, sample_weight=None): 143 | """fit the MRmr selector by learning the associations 144 | 145 | Parameters 146 | ---------- 147 | X : pd.DataFrame, shape (n_samples, n_features) 148 | Data from which to compute variances, where `n_samples` is 149 | the number of samples and `n_features` is the number of features. 150 | y : array-like or pd.Series of shape (n_samples,) 151 | Target vector. 
Must be numeric for regression or categorical for classification. 152 | sample_weight : pd.Series, optional, shape (n_samples,) 153 | weights for computing the statistics (e.g. weighted average) 154 | 155 | Returns 156 | ------- 157 | self : object 158 | If `return_scores=False`, returns self. 159 | If `return_scores=True`, returns (selected_features, relevance_scores). 160 | """ 161 | 162 | if isinstance(X, pd.DataFrame): 163 | self.feature_names_in_ = X.columns.to_numpy() 164 | else: 165 | raise TypeError("X is not a pd.DataFrame") 166 | 167 | if not isinstance(y, pd.Series): 168 | y = pd.Series(y) 169 | 170 | y.name = "target" 171 | 172 | target = y.copy() 173 | if self.task == "classification": 174 | target = target.astype("category") 175 | 176 | self.relevance_args = {"X": X, "y": target, "sample_weight": sample_weight} 177 | self.redundancy_args = {"X": X, "sample_weight": sample_weight} 178 | 179 | self.relevance = self.relevance_func(**self.relevance_args) 180 | self.features = self.relevance[~self.relevance.isna()].index.to_list() 181 | self.relevance = self.relevance.loc[self.features] 182 | self.redundancy = pd.DataFrame( 183 | FLOOR, index=self.features, columns=self.features 184 | ) 185 | self.n_features_to_select = min(self.n_features_to_select, len(self.features)) 186 | 187 | if isinstance(X, pd.DataFrame): 188 | self.feature_names_in_ = X.columns.to_numpy() 189 | 190 | self.n_features_in_ = len(self.features) 191 | 192 | self.selected_features = [] 193 | self.not_selected_features = self.features.copy() 194 | self.ranking_ = pd.Series( 195 | dtype="float64" 196 | ) # pd.DataFrame(columns=['var_name', 'mrmr', 'relevancy', 'redundancy']) 197 | self.redundancy_ = pd.Series(dtype="float64") 198 | self.run_feature_selection() 199 | 200 | # store the output in the sklearn flavour 201 | self.relevance_ = self.relevance 202 | self.ranking_ = pd.concat( 203 | [self.ranking_, self.relevance_, self.redundancy_], axis=1 204 | ) 205 | self.ranking_.columns = ["mrmr", "relevance", "redundancy"] 206 | self.ranking_ = self.ranking_.iloc[: self.n_features_to_select, :] 207 | 208 | # Set back the mrmr score to Inf for the first selected feature to avoid dividing by zero 209 | self.ranking_.iloc[0, 0] = float("Inf") 210 | 211 | self.selected_features_ = self.selected_features 212 | self.support_ = np.asarray( 213 | [x in self.selected_features for x in self.feature_names_in_] 214 | ) 215 | self.not_selected_features_ = self.not_selected_features 216 | 217 | if self.return_scores: 218 | return self.selected_features_, self.relevance_, self.redundancy_ 219 | return self 220 | 221 | def transform(self, X): 222 | """ 223 | Transform the data, returns a transformed version of `X`. 224 | 225 | Parameters 226 | ---------- 227 | X : array-like of shape (n_samples, n_features) 228 | Input samples. 229 | 230 | Returns 231 | ------- 232 | X_new : ndarray array of shape (n_samples, n_features_new) 233 | Transformed array. 234 | """ 235 | if not isinstance(X, pd.DataFrame): 236 | raise TypeError("X is not a dataframe") 237 | return X[self.selected_features_] 238 | 239 | def fit_transform(self, X, y, sample_weight=None, **fit_params): 240 | """ 241 | Fit to data, then transform it. 242 | Fits transformer to `X` and `y` and optionally sample_weight 243 | with optional parameters `fit_params` 244 | and returns a transformed version of `X`. 245 | 246 | Parameters 247 | ---------- 248 | X : array-like of shape (n_samples, n_features) 249 | Input samples. 
250 | y : array-like of shape (n_samples,) or (n_samples, n_outputs), \ 251 | default=None 252 | Target values (None for unsupervised transformations). 253 | sample_weight : array-like of shape (n_samples,) or (n_samples, n_outputs), \ 254 | default=None 255 | sample weight values. 256 | **fit_params : dict 257 | Additional fit parameters. 258 | 259 | Returns 260 | ------- 261 | X_new : ndarray array of shape (n_samples, n_features_new) 262 | Transformed array. 263 | """ 264 | return self.fit(X=X, y=y, sample_weight=sample_weight).transform(X) 265 | 266 | def _get_support_mask(self): 267 | check_is_fitted(self) 268 | 269 | return self.support_ 270 | 271 | def _more_tags(self): 272 | return {"allow_nan": True} 273 | 274 | def select_next_feature( 275 | self, not_selected_features, selected_features, relevance, redundancy 276 | ): 277 | score_numerator = relevance.loc[not_selected_features] 278 | 279 | if len(selected_features) > 0: 280 | last_selected_feature = selected_features[-1] 281 | 282 | if self.only_same_domain: 283 | not_selected_features_sub = [ 284 | c 285 | for c in not_selected_features 286 | if c.split("_")[0] == last_selected_feature.split("_")[0] 287 | ] 288 | else: 289 | not_selected_features_sub = not_selected_features 290 | 291 | if not_selected_features_sub: 292 | redundancy.loc[not_selected_features_sub, last_selected_feature] = ( 293 | self.redundancy_func( 294 | target=last_selected_feature, 295 | features=not_selected_features_sub, 296 | **self.redundancy_args, 297 | ) 298 | .fillna(FLOOR) 299 | .abs() 300 | .clip(FLOOR) 301 | ) 302 | score_denominator = ( 303 | redundancy.loc[not_selected_features, selected_features] 304 | .apply(self.denominator_func, axis=1) 305 | .replace(1.0, float("Inf")) 306 | ) 307 | 308 | else: 309 | score_denominator = pd.Series(1, index=self.features) 310 | 311 | else: 312 | score_denominator = pd.Series(1, index=self.features) 313 | 314 | score = score_numerator / score_denominator 315 | score = score.sort_values(ascending=False) 316 | best_feature = score.index[score.argmax()] 317 | 318 | return best_feature, score, score_denominator 319 | 320 | def update_ranks(self, best_feature, score, score_denominator): 321 | self.ranking_ = pd.concat( 322 | [ 323 | self.ranking_, 324 | pd.Series({best_feature: score.loc[best_feature]}, dtype="float64"), 325 | ] 326 | ) 327 | self.redundancy_ = pd.concat( 328 | [ 329 | self.redundancy_, 330 | pd.Series( 331 | {best_feature: score_denominator.loc[best_feature]}, 332 | dtype="float64", 333 | ), 334 | ] 335 | ) 336 | # the first selected feature has a default denominator (redundancy) = 1 to avoid dividing by zero 337 | # I set it back to zero 338 | self.redundancy_ = self.redundancy_.replace(1.0, 0.0) 339 | self.selected_features.append(best_feature) 340 | self.not_selected_features.remove(best_feature) 341 | 342 | def run_feature_selection(self): 343 | for i in tqdm(range(self.n_features_to_select), disable=not self.show_progress): 344 | best_feature, score, score_denominator = self.select_next_feature( 345 | self.not_selected_features, 346 | self.selected_features, 347 | self.relevance, 348 | self.redundancy, 349 | ) 350 | self.update_ranks(best_feature, score, score_denominator) 351 | -------------------------------------------------------------------------------- /src/arfs/feature_selection/summary.py: -------------------------------------------------------------------------------- 1 | """Feature Selection Summary Module 2 | 3 | This module provides a function for creating the summary report of a 
FS pipeline 4 | 5 | Module Structure: 6 | ----------------- 7 | - ``make_fs_summary`` main function for creating the summary 8 | - ``highlight_discarded`` function for creating style for the pd.DataFrame 9 | """ 10 | 11 | import pandas as pd 12 | import numpy as np 13 | 14 | 15 | def highlight_discarded(s): 16 | """highlight X in red and V in green. 17 | 18 | Parameters 19 | ---------- 20 | s : array-like of shape (n_features,) 21 | the boolean array for defining the style 22 | 23 | 24 | """ 25 | is_X = s == 0 26 | return [ 27 | "background-color: #ba0202" if v else "background-color: #0c8a30" for v in is_X 28 | ] 29 | 30 | 31 | def make_fs_summary(selector_pipe): 32 | """make_fs_summary makes a summary dataframe highlighting at which step a 33 | given predictor has been rejected (if any). 34 | 35 | Parameters 36 | ---------- 37 | selector_pipe : sklearn.pipeline.Pipeline 38 | the feature selector pipeline. 39 | 40 | Examples 41 | -------- 42 | >>> groot_pipeline = Pipeline([ 43 | ... ('missing', MissingValueThreshold()), 44 | ... ('unique', UniqueValuesThreshold()), 45 | ... ('cardinality', CardinalityThreshold()), 46 | ... ('collinearity', CollinearityThreshold(threshold=0.5)), 47 | ... ('lowimp', VariableImportance(eval_metric='poisson', objective='poisson', verbose=2)), 48 | ... ('grootcv', GrootCV(objective='poisson', cutoff=1, n_folds=3, n_iter=5))]) 49 | >>> groot_pipeline.fit_transform( 50 | X=df[predictors], 51 | y=df[target], 52 | lowimp__sample_weight=df[weight], 53 | grootcv__sample_weight=df[weight]) 54 | >>> fs_summary_df = make_fs_summary(groot_pipeline) 55 | """ 56 | tag_df = pd.DataFrame({"predictor": selector_pipe[0].feature_names_in_}) 57 | for selector_name, selector in selector_pipe.named_steps.items(): 58 | if hasattr(selector, "support_"): 59 | feature_in = selector.feature_names_in_ 60 | to_drop = list(set(feature_in) - set(selector.get_feature_names_out())) 61 | tag_df[selector_name] = np.where( 62 | tag_df["predictor"].isin(to_drop), 0, 1 63 | ) * np.where(tag_df["predictor"].isin(feature_in), 1, np.nan) 64 | else: 65 | tag_df[selector_name] = np.nan 66 | 67 | style = ( 68 | tag_df.style.apply(highlight_discarded, subset=tag_df.columns[1:]) 69 | .applymap(lambda x: "" if x == x else "background-color: #f57505") 70 | .format(precision=0) 71 | ) 72 | 73 | return style 74 | -------------------------------------------------------------------------------- /src/arfs/feature_selection/unsupervised.py: -------------------------------------------------------------------------------- 1 | """Unsupervised Feature Selection 2 | 3 | This module provides selectors using unsupervised statistics and a threshold 4 | 5 | Module Structure: 6 | ----------------- 7 | - ``MissingValueThreshold``: child class of the ``BaseThresholdSelector``, filter out columns with too many missing values 8 | - ``UniqueValuesThreshold`` child of the ``BaseThresholdSelector``, filter out columns with zero variance 9 | - ``CardinalityThreshold`` child of the ``BaseThresholdSelector``, filter out categorical columns with too many levels 10 | - ``CollinearityThreshold`` child of the ``BaseThresholdSelector``, filter out collinear columns 11 | """ 12 | 13 | from __future__ import print_function 14 | from tqdm.auto import trange 15 | 16 | # pandas 17 | import pandas as pd 18 | 19 | # numpy 20 | import numpy as np 21 | 22 | # sklearn 23 | from sklearn.utils.validation import check_is_fitted 24 | from sklearn.base import BaseEstimator 25 | from sklearn.feature_selection._base import SelectorMixin 26 | 27 | # 
ARFS 28 | from .base import BaseThresholdSelector 29 | from ..utils import create_dtype_dict 30 | from ..association import ( 31 | association_matrix, 32 | xy_to_matrix, 33 | plot_association_matrix, 34 | weighted_theils_u, 35 | weighted_corr, 36 | correlation_ratio, 37 | ) 38 | from ..preprocessing import OrdinalEncoderPandas 39 | 40 | 41 | # fix random seed for reproducibility 42 | np.random.seed(7) 43 | 44 | 45 | def _missing_ratio(df): 46 | if not isinstance(df, pd.DataFrame): 47 | raise TypeError("df should be a pandas DataFrame") 48 | numeric_columns = df.select_dtypes(np.number).columns 49 | n_samples = len(df) 50 | 51 | missing_counts = {} 52 | for column in df.columns: 53 | if column in numeric_columns: 54 | missing_counts[column] = ( 55 | df[column].isnull().sum() + np.isinf(df[column]).sum() 56 | ) / n_samples 57 | else: 58 | missing_counts[column] = df[column].isnull().sum() / n_samples 59 | return pd.Series(missing_counts) 60 | 61 | 62 | class MissingValueThreshold(BaseThresholdSelector): 63 | """Feature selector that removes all high missing percentage features. 64 | This feature selection algorithm looks only at the features (X), 65 | not the desired outputs (y), and can thus be used for unsupervised learning. 66 | 67 | 68 | Parameters 69 | ---------- 70 | threshold: float, default = .05 71 | Features with a training-set missing larger than this threshold will be removed. 72 | 73 | Returns 74 | ------- 75 | selected_features: list of str 76 | List of selected features. 77 | 78 | Attributes 79 | ---------- 80 | n_features_in_ : int 81 | number of input predictors 82 | support_ : list of bool 83 | the list of the selected X-columns 84 | selected_features_ : list of str 85 | the list of names of selected features 86 | not_selected_features_ : list of str 87 | the list of names of rejected features 88 | 89 | Example 90 | ------- 91 | >>> from sklearn.datasets import make_classification, make_regression 92 | >>> X, y = make_regression(n_samples = 1000, n_features = 50, n_informative = 5, shuffle=False) # , n_redundant = 5 93 | >>> X = pd.DataFrame(X) 94 | >>> y = pd.Series(y) 95 | >>> pred_name = [f"pred_{i}" for i in range(X.shape[1])] 96 | >>> X.columns = pred_name 97 | >>> selector = MissingValueThreshold(0.05) 98 | >>> selector.fit_transform(X) 99 | """ 100 | 101 | def __init__(self, threshold=0.05): 102 | super().__init__( 103 | threshold=threshold, 104 | statistic_fn=_missing_ratio, 105 | greater_than_threshold=False, 106 | ) 107 | 108 | 109 | def _pandas_count_unique_values(X): 110 | if not isinstance(X, pd.DataFrame): 111 | raise TypeError("X should be a pandas DataFrame") 112 | return X.nunique() 113 | 114 | 115 | class UniqueValuesThreshold(BaseThresholdSelector): 116 | """Feature selector that removes all features with zero variance (single unique values) 117 | or remove columns with less unique values than threshold 118 | This feature selection algorithm looks only at the features (X), 119 | not the desired outputs (y), and can thus be used for unsupervised learning. 120 | 121 | Parameters 122 | ---------- 123 | threshold: int, default = 1 124 | Features with a training-set missing larger than this threshold will be removed. 125 | The thresold should be >= 1 126 | 127 | Returns 128 | ------- 129 | selected_features: list of str 130 | List of selected features. 
131 | 132 | Attributes 133 | ---------- 134 | n_features_in_ : int 135 | number of input predictors 136 | support_ : list of bool 137 | the list of the selected X-columns 138 | selected_features_ : list of str 139 | the list of names of selected features 140 | not_selected_features_ : list of str 141 | the list of names of rejected features 142 | 143 | Example 144 | ------- 145 | >>> from sklearn.datasets import make_classification, make_regression 146 | >>> X, y = make_regression(n_samples = 1000, n_features = 50, n_informative = 5, shuffle=False) # , n_redundant = 5 147 | >>> X = pd.DataFrame(X) 148 | >>> y = pd.Series(y) 149 | >>> pred_name = [f"pred_{i}" for i in range(X.shape[1])] 150 | >>> X.columns = pred_name 151 | >>> selector = UniqueValuesThreshold(1) 152 | >>> selector.fit_transform(X) 153 | """ 154 | 155 | def __init__(self, threshold=1): 156 | super().__init__( 157 | threshold=threshold, 158 | statistic_fn=_pandas_count_unique_values, 159 | greater_than_threshold=True, 160 | ) 161 | 162 | 163 | def _pandas_count_unique_values_cat_features(X): 164 | """ 165 | Counts the number of unique values in categorical features of a pandas DataFrame. 166 | 167 | Parameters 168 | ---------- 169 | X : pandas DataFrame 170 | The input data. 171 | 172 | Returns 173 | ------- 174 | pandas Series 175 | The number of unique values in each categorical feature. 176 | 177 | Raises 178 | ------ 179 | TypeError 180 | If the input data is not a pandas DataFrame. 181 | """ 182 | if not isinstance(X, pd.DataFrame): 183 | raise TypeError("X should be a pandas DataFrame") 184 | count_series = pd.Series(data=0, index=X.columns) 185 | dtype_dic = create_dtype_dict(X, dic_keys="dtypes") 186 | for c in dtype_dic["cat"]: 187 | count_series[c] = X[c].nunique() 188 | return count_series 189 | 190 | 191 | class CardinalityThreshold(BaseThresholdSelector): 192 | """Feature selector that removes all categorical features with more unique values than threshold 193 | This feature selection algorithm looks only at the features (X), 194 | not the desired outputs (y), and can thus be used for unsupervised learning. 195 | 196 | Parameters 197 | ---------- 198 | threshold: int, default = 1000 199 | Features with a training-set missing larger than this threshold will be removed. 200 | The thresold should be >= 1 201 | 202 | Returns 203 | ------- 204 | selected_features: list of str 205 | List of selected features. 
206 | 207 | Attributes 208 | ---------- 209 | n_features_in_ : int 210 | number of input predictors 211 | support_ : list of bool 212 | the list of the selected X-columns 213 | selected_features_ : list of str 214 | the list of names of selected features 215 | not_selected_features_ : list of str 216 | the list of names of rejected features 217 | 218 | Example 219 | ------- 220 | >>> from sklearn.datasets import make_classification, make_regression 221 | >>> X, y = make_regression(n_samples = 1000, n_features = 50, n_informative = 5, shuffle=False) # , n_redundant = 5 222 | >>> X = pd.DataFrame(X) 223 | >>> y = pd.Series(y) 224 | >>> pred_name = [f"pred_{i}" for i in range(X.shape[1])] 225 | >>> X.columns = pred_name 226 | >>> selector = CardinalityThreshold(100) 227 | >>> selector.fit_transform(X) 228 | """ 229 | 230 | def __init__(self, threshold=1000): 231 | super().__init__( 232 | threshold=threshold, 233 | statistic_fn=_pandas_count_unique_values_cat_features, 234 | greater_than_threshold=False, 235 | ) 236 | 237 | 238 | class CollinearityThreshold(SelectorMixin, BaseEstimator): 239 | """Feature selector that removes collinear features. 240 | This feature selection algorithm looks only at the features (X), 241 | not the desired outputs (y), and can thus be used for unsupervised learning. 242 | It computes the association between features (continuous or categorical), 243 | store the pairs of collinear features and remove one of them for all pairs having 244 | an association value above the threshold. 245 | 246 | The association measures are the Spearman correlation coefficient, correlation ratio 247 | and Theil's U. The association matrix is not necessarily symmetrical. 248 | 249 | By changing the method to "correlation", data will be encoded as integer 250 | and the Spearman correlation coefficient will be used instead. Faster but not 251 | a best practice because the categorical variables are considered as numeric. 252 | 253 | Parameters 254 | ---------- 255 | threshold : float, default = .8 256 | Features with a training-set missing larger than this threshold will be removed 257 | The thresold should be > 0 and =< 1 258 | method : str, default = "association" 259 | method for computing the association matrix. Either "association" or "correlation". 260 | Correlation leads to encoding of categorical variables as numeric 261 | n_jobs : int, default = -1 262 | the number of threads, -1 uses all the threads for computating the association matrix 263 | nom_nom_assoc : str or callable, default = "theil" 264 | the categorical-categorical association measure, by default Theil's U, not symmetrical! 265 | num_num_assoc : str or callable, default = "spearman" 266 | the numeric-numeric association measure 267 | nom_num_assoc : str or callable, default = "correlation_ratio" 268 | the numeric-categorical association measure 269 | 270 | Returns 271 | ------- 272 | selected_features: list of str 273 | List of selected features. 
274 | 275 | Attributes 276 | ---------- 277 | n_features_in_ : int 278 | number of input predictors 279 | assoc_matrix_ : pd.DataFrame 280 | the square association matrix 281 | collinearity_summary_ : pd.DataFrame 282 | the pairs of collinear features and the association values 283 | support_ : list of bool 284 | the list of the selected X-columns 285 | selected_features_ : list of str 286 | the list of names of selected features 287 | not_selected_features_ : list of str 288 | the list of names of rejected features 289 | 290 | Example 291 | ------- 292 | >>> from sklearn.datasets import make_classification, make_regression 293 | >>> X, y = make_regression(n_samples = 1000, n_features = 50, n_informative = 5, shuffle=False) # , n_redundant = 5 294 | >>> X = pd.DataFrame(X) 295 | >>> y = pd.Series(y) 296 | >>> pred_name = [f"pred_{i}" for i in range(X.shape[1])] 297 | >>> X.columns = pred_name 298 | >>> selector = CollinearityThreshold(threshold=0.75) 299 | >>> selector.fit_transform(X) 300 | """ 301 | 302 | def __init__( 303 | self, 304 | threshold=0.80, 305 | method="association", 306 | n_jobs=1, 307 | nom_nom_assoc=weighted_theils_u, 308 | num_num_assoc=weighted_corr, 309 | nom_num_assoc=correlation_ratio, 310 | ): 311 | self.threshold = threshold 312 | self.method = method 313 | self.n_jobs = n_jobs 314 | self.nom_nom_assoc = nom_nom_assoc 315 | self.num_num_assoc = num_num_assoc 316 | self.nom_num_assoc = nom_num_assoc 317 | 318 | if self.method not in ["association", "correlation"]: 319 | raise ValueError("``method`` should be 'association' or 'correlation'") 320 | 321 | if (self.threshold > 1.0) or (self.threshold < 0.0): 322 | raise ValueError("``threshold`` should be larger than 0 and smaller than 1") 323 | 324 | def fit(self, X, y=None, sample_weight=None): 325 | """Learn empirical associtions from X. 326 | 327 | Parameters 328 | ---------- 329 | X : pd.DataFrame, shape (n_samples, n_features) 330 | Data from which to compute variances, where `n_samples` is 331 | the number of samples and `n_features` is the number of features. 332 | y : any, default=None 333 | Ignored. This parameter exists only for compatibility with 334 | sklearn.pipeline.Pipeline. 335 | sample_weight : pd.Series, optional, shape (n_samples,) 336 | weights for computing the statistics (e.g. weighted average) 337 | 338 | Returns 339 | ------- 340 | self : object 341 | Returns the instance itself. 
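        Example
        -------
        A minimal usage sketch (illustrative only; ``X`` stands for any
        pandas DataFrame with mixed numeric/categorical columns and is not
        defined in this module):

        >>> selector = CollinearityThreshold(threshold=0.8, method="association")
        >>> selector.fit(X)
        >>> selector.assoc_matrix_             # square association matrix
        >>> fig = selector.plot_association()  # heatmap of the associations
        >>> X_reduced = selector.transform(X)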
342 | """ 343 | 344 | if isinstance(X, pd.DataFrame): 345 | self.feature_names_in_ = X.columns.to_numpy() 346 | else: 347 | raise TypeError("X is not a dataframe") 348 | 349 | self.suffix_dic = create_dtype_dict(X) 350 | 351 | if self.method == "correlation": 352 | encoder = OrdinalEncoderPandas() 353 | X = encoder.fit_transform(X) 354 | del encoder 355 | 356 | assoc_matrix = association_matrix( 357 | X=X, 358 | sample_weight=sample_weight, 359 | n_jobs=self.n_jobs, 360 | nom_nom_assoc=self.nom_nom_assoc, 361 | num_num_assoc=self.num_num_assoc, 362 | nom_num_assoc=self.nom_num_assoc, 363 | ) 364 | self.assoc_matrix_ = xy_to_matrix(assoc_matrix) 365 | 366 | to_drop = _recursive_collinear_elimination(self.assoc_matrix_, self.threshold) 367 | 368 | self.support_ = np.asarray( 369 | [True if c not in to_drop else False for c in X.columns] 370 | ) 371 | self.selected_features_ = self.feature_names_in_[self.support_] 372 | self.not_selected_features_ = self.feature_names_in_[~self.support_] 373 | 374 | return self 375 | 376 | def _get_support_mask(self): 377 | check_is_fitted(self) 378 | 379 | return self.support_ 380 | 381 | def transform(self, X): 382 | if not isinstance(X, pd.DataFrame): 383 | raise TypeError("X is not a dataframe") 384 | return X[self.selected_features_] 385 | 386 | def _more_tags(self): 387 | return {"allow_nan": True} 388 | 389 | def plot_association( 390 | self, ax=None, cmap="PuOr", figsize=None, cbar_kw=None, imgshow_kw=None 391 | ): 392 | """plot_association plots the association matrix 393 | 394 | Parameters 395 | ---------- 396 | ax : matplotlib.axes.Axes, optional 397 | the mpl axes if the figure object exists already, by default None 398 | cmap : str, optional 399 | colormap name, by default "PuOr" 400 | figsize : tuple of float, optional 401 | figure size, by default None 402 | cbar_kw : dict, optional 403 | colorbar kwargs, by default None 404 | imgshow_kw : dict, optional 405 | imgshow kwargs, by default None 406 | """ 407 | 408 | if figsize is None: 409 | figsize = (self.assoc_matrix_.shape[0] / 3, self.assoc_matrix_.shape[0] / 3) 410 | 411 | f, ax = plot_association_matrix( 412 | assoc_mat=self.assoc_matrix_, 413 | suffix_dic=self.suffix_dic, 414 | ax=ax, 415 | cmap=cmap, 416 | cbarlabel="association value", 417 | figsize=figsize, 418 | show=True, 419 | cbar_kw=cbar_kw, 420 | imgshow_kw=imgshow_kw, 421 | ) 422 | 423 | return f 424 | 425 | 426 | def _most_collinear(association_matrix_abs, threshold): 427 | cols_to_drop = association_matrix_abs.loc[ 428 | :, (association_matrix_abs > threshold).any(axis=0) 429 | ].columns.values 430 | rows_to_drop = association_matrix_abs.loc[ 431 | (association_matrix_abs > threshold).any(axis=1), : 432 | ].index.values 433 | to_drop = list(set(cols_to_drop).union(set(rows_to_drop))) 434 | if not to_drop: 435 | return None, None 436 | # for features in `to_drop` sum up their column and row values to find 437 | # the most collinear feature 438 | most_collinear_series = association_matrix_abs.loc[:, to_drop].sum(axis=0) 439 | most_collinear_series += association_matrix_abs.loc[to_drop, :].sum(axis=1) 440 | # not necessarily but avoids exceeding 1 441 | most_collinear_series /= 2 442 | return most_collinear_series.sort_values(ascending=False).index[0], to_drop 443 | 444 | 445 | def _recursive_collinear_elimination(association_matrix, threshold): 446 | dum = association_matrix.abs() 447 | most_collinear_features = [] 448 | 449 | while True: 450 | most_collinear_feature, to_drop = _most_collinear(dum, threshold) 451 | 452 | # Break 
if no more features to drop 453 | if not to_drop: 454 | break 455 | # the if statement below can probably also be removed since we can only 456 | # remove features we have left in dum 457 | if most_collinear_feature not in most_collinear_features: 458 | most_collinear_features.append(most_collinear_feature) 459 | dum = dum.drop(columns=most_collinear_feature, index=most_collinear_feature) 460 | 461 | return most_collinear_features 462 | -------------------------------------------------------------------------------- /src/arfs/feature_selection/variable_importance.py: -------------------------------------------------------------------------------- 1 | """Supervised Feature Selection 2 | 3 | This module provides selectors using supervised statistics and a threshold, using SHAP, permutation importance or impurity (Gini) importance. 4 | 5 | Module Structure: 6 | ----------------- 7 | - ``VariableImportance`` main class for identifying non-important features 8 | """ 9 | 10 | from __future__ import print_function 11 | from tqdm.auto import trange 12 | 13 | # pandas 14 | import pandas as pd 15 | 16 | # numpy 17 | import numpy as np 18 | 19 | # matplotlib 20 | import matplotlib.pyplot as plt 21 | import matplotlib.gridspec as gridspec 22 | 23 | # sklearn 24 | from sklearn.utils.validation import check_is_fitted 25 | from sklearn.base import BaseEstimator 26 | from sklearn.feature_selection._base import SelectorMixin 27 | 28 | # ARFS 29 | from ..utils import reset_plot 30 | from ..gbm import GradientBoosting 31 | from ..preprocessing import OrdinalEncoderPandas 32 | 33 | 34 | class VariableImportance(SelectorMixin, BaseEstimator): 35 | """Feature selector that removes predictors with zero or low variable importance. 36 | 37 | Identify the features with zero/low importance according to SHAP values of a lightgbm. 38 | The gbm can be trained with early stopping using a utils set to prevent overfitting. 39 | The feature importances are averaged over `n_iterations` to reduce the variance. 40 | The predictors are then ranked from the most important to the least important and the 41 | cumulative variable importance is computed. All the predictors not contributing (VI=0) or 42 | contributing to less than the threshold to the cumulative importance are removed. 43 | 44 | Parameters 45 | ---------- 46 | task : string 47 | The machine learning task, either 'classification' or 'regression' or 'multiclass', 48 | be sure to use a consistent objective function 49 | encode : boolean, default = True 50 | Whether or not to encode the predictors 51 | n_iterations : int, default = 10 52 | Number of iterations, the more iterations, the smaller the variance 53 | threshold : float, default = .99 54 | The selector computes the cumulative feature importance and ranks 55 | the predictors from the most important to the least important. 56 | All the predictors contributing to less than this value are rejected. 57 | lgb_kwargs : dictionary of keyword arguments 58 | dictionary of lightgbm estimators parameters with at least the objective function {'objective':'rmse'} 59 | encoder_kwargs : dictionary of keyword arguments, optional 60 | dictionary of the :class:`OrdinalEncoderPandas` parameters 61 | 62 | 63 | Returns 64 | ------- 65 | selected_features: list of str 66 | List of selected features. 
67 | 68 | Attributes 69 | ---------- 70 | n_features_in_ : int 71 | number of input predictors 72 | assoc_matrix_ : pd.DataFrame 73 | the square association matrix 74 | collinearity_summary_ : pd.DataFrame 75 | the pairs of collinear features and the association values 76 | support_ : list of bool 77 | the list of the selected X-columns 78 | selected_features_ : list of str 79 | the list of names of selected features 80 | not_selected_features_ : list of str 81 | the list of names of rejected features 82 | fastshap : boolean 83 | enable or not the fasttreeshap implementation 84 | verbose : int, default = -1 85 | controls the progress bar, > 1 print out progress 86 | 87 | Example 88 | ------- 89 | >>> from sklearn.datasets import make_classification, make_regression 90 | >>> X, y = make_regression(n_samples = 1000, n_features = 50, n_informative = 5, shuffle=False) # , n_redundant = 5 91 | >>> X = pd.DataFrame(X) 92 | >>> y = pd.Series(y) 93 | >>> pred_name = [f"pred_{i}" for i in range(X.shape[1])] 94 | >>> X.columns = pred_name 95 | >>> selector = VariableImportance(threshold=0.75) 96 | >>> selector.fit_transform(X, y) 97 | """ 98 | 99 | def __init__( 100 | self, 101 | task="regression", 102 | encode=True, 103 | n_iterations=10, 104 | threshold=0.99, 105 | lgb_kwargs={"objective": "rmse", "zero_as_missing": False}, 106 | encoder_kwargs=None, 107 | fastshap=False, 108 | verbose=-1, 109 | ): 110 | self.task = task 111 | self.encode = encode 112 | self.n_iterations = n_iterations 113 | self.threshold = threshold 114 | self.lgb_kwargs = lgb_kwargs 115 | self.encoder_kwargs = encoder_kwargs 116 | self.verbose = verbose 117 | self.fastshap = fastshap 118 | 119 | if (self.threshold > 1.0) or (self.threshold < 0.0): 120 | raise ValueError("``threshold`` should be larger than 0 and smaller than 1") 121 | 122 | def fit(self, X, y, sample_weight=None): 123 | """Learn variable importance from X and y, supervised learning. 124 | 125 | Parameters 126 | ---------- 127 | X : pd.DataFrame, shape (n_samples, n_features) 128 | Data from which to compute variances, where `n_samples` is 129 | the number of samples and `n_features` is the number of features. 130 | y : any, default=None 131 | Ignored. This parameter exists only for compatibility with 132 | sklearn.pipeline.Pipeline. 133 | sample_weight : pd.Series, optional, shape (n_samples,) 134 | weights for computing the statistics (e.g. weighted average) 135 | 136 | Returns 137 | ------- 138 | self : object 139 | Returns the instance itself. 
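        Example
        -------
        An illustrative sketch for a regression target (the parameter values
        are arbitrary choices, not recommendations):

        >>> selector = VariableImportance(
        ...     task="regression",
        ...     threshold=0.99,
        ...     lgb_kwargs={"objective": "rmse"},
        ... )
        >>> selector.fit(X, y, sample_weight=None)
        >>> selector.feature_importances_summary_.head()
        >>> fig = selector.plot_importance(plot_n=25)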
140 | """ 141 | 142 | if isinstance(X, pd.DataFrame): 143 | self.feature_names_in_ = X.columns.to_numpy() 144 | else: 145 | raise TypeError("X is not a dataframe") 146 | 147 | feature_importances = _compute_varimp_lgb( 148 | X=X, 149 | y=y, 150 | sample_weight=sample_weight, 151 | encode=self.encode, 152 | task=self.task, 153 | n_iterations=self.n_iterations, 154 | verbose=self.verbose, 155 | encoder_kwargs=self.encoder_kwargs, 156 | lgb_kwargs=self.lgb_kwargs, 157 | fastshap=self.fastshap, 158 | ) 159 | 160 | self.feature_importances_summary_ = feature_importances 161 | 162 | support_ordered = ( 163 | self.feature_importances_summary_["cumulative_importance"] >= self.threshold 164 | ) 165 | to_drop = list( 166 | self.feature_importances_summary_.loc[support_ordered, "feature"] 167 | ) 168 | 169 | self.support_ = np.asarray( 170 | [False if c in to_drop else True for c in self.feature_names_in_] 171 | ) 172 | self.selected_features_ = self.feature_names_in_[self.support_] 173 | self.not_selected_features_ = self.feature_names_in_[~self.support_] 174 | 175 | return self 176 | 177 | def _get_support_mask(self): 178 | check_is_fitted(self) 179 | 180 | return self.support_ 181 | 182 | def transform(self, X): 183 | """ 184 | Transform the data, returns a transformed version of `X`. 185 | 186 | Parameters 187 | ---------- 188 | X : array-like of shape (n_samples, n_features) 189 | Input samples. 190 | 191 | Returns 192 | ------- 193 | X : ndarray array of shape (n_samples, n_features_new) 194 | Transformed array. 195 | 196 | Raises 197 | ------ 198 | TypeError 199 | if the input is not a pd.DataFrame 200 | """ 201 | 202 | if not isinstance(X, pd.DataFrame): 203 | raise TypeError("X is not a dataframe") 204 | return X[self.selected_features_] 205 | 206 | def fit_transform(self, X, y=None, sample_weight=None): 207 | """ 208 | Fit to data, then transform it. 209 | Fits transformer to `X` and `y` with optional parameters `fit_params` 210 | and returns a transformed version of `X`. 211 | Parameters 212 | ---------- 213 | X : array-like of shape (n_samples, n_features) 214 | Input samples. 215 | y : array-like of shape (n_samples,) or (n_samples, n_outputs), \ 216 | default=None 217 | Target values (None for unsupervised transformations). 218 | **fit_params : dict 219 | Additional fit parameters. 220 | Returns 221 | ------- 222 | X_new : ndarray array of shape (n_samples, n_features_new) 223 | Transformed array. 224 | """ 225 | return self.fit(X=X, y=y, sample_weight=sample_weight).transform(X) 226 | 227 | def _more_tags(self): 228 | return {"allow_nan": True} 229 | 230 | def plot_importance( 231 | self, figsize=None, plot_n=50, n_feat_per_inch=3, log=True, style=None 232 | ): 233 | """Plots `plot_n` most important features and the cumulative importance of features. 234 | If `threshold` is provided, prints the number of features needed to reach `threshold` 235 | cumulative importance. 236 | 237 | Parameters 238 | ---------- 239 | plot_n : int, default = 50 240 | Number of most important features to plot. 
Defaults to 15 or the maximum 241 | number of features whichever is smaller 242 | n_feat_per_inch : int 243 | number of features per inch, the larger the less space between labels 244 | figsize : tuple of float, optional 245 | The rendered size as a percentage size 246 | log : bool, default=True 247 | Whether or not render variable importance on a log scale 248 | style : bool, default=False 249 | set arfs style or not 250 | 251 | Returns 252 | ------- 253 | hv.plot 254 | the feature importances holoviews object 255 | 256 | """ 257 | if style: 258 | plt.style.use(style) 259 | else: 260 | reset_plot() 261 | 262 | if plot_n > self.feature_importances_summary_.shape[0]: 263 | plot_n = self.feature_importances_summary_.shape[0] - 1 264 | 265 | df = self.feature_importances_summary_ 266 | importance_index = np.min( 267 | np.where(df["cumulative_importance"] > self.threshold) 268 | ) 269 | non_cum_threshold = df.iloc[importance_index, 2] 270 | max_norm_importance = 0.99 * df.normalized_importance.max() 271 | 272 | if plot_n > df.shape[0]: 273 | plot_n = df.shape[0] - 1 274 | 275 | if figsize is None: 276 | figsize = (8, plot_n / n_feat_per_inch) 277 | fig = plt.figure(tight_layout=True, figsize=figsize) 278 | gs = gridspec.GridSpec(3, 3) 279 | ax1 = fig.add_subplot(gs[:, 0]) 280 | ax1.scatter(df.normalized_importance, df.feature) 281 | # ax.set_ylabel('YLabel0') 282 | ax1.set_xlabel("normalized importance") 283 | ax1.xaxis.set_label_position("top") 284 | ax1.invert_yaxis() 285 | ax1.axvline(x=non_cum_threshold, linestyle="dashed", color="r") 286 | if log: 287 | ax1.set_xscale("log") 288 | ax1.grid() 289 | ax1.set(frame_on=False) 290 | 291 | ax2 = fig.add_subplot(gs[:, 1:]) 292 | ax2.scatter(df.feature, df.cumulative_importance) 293 | # ax.set_ylabel('YLabel0') 294 | ax2.set_ylabel("cumulative importance") 295 | ax2.tick_params(axis="x", labelrotation=90) 296 | 297 | importance_min_value_on_axis = max_norm_importance if log else 0 298 | x_vert, y_vert = ( 299 | [importance_index, importance_index], 300 | [ 301 | importance_min_value_on_axis, 302 | self.threshold, 303 | ], 304 | ) 305 | x_horiz, y_horiz = ( 306 | [importance_min_value_on_axis, importance_index], 307 | [ 308 | self.threshold, 309 | self.threshold, 310 | ], 311 | ) 312 | 313 | ax2.plot(x_vert, y_vert, linestyle="dashed", color="r") 314 | ax2.plot(x_horiz, y_horiz, linestyle="dashed", color="r") 315 | ax2.set_ylim(max_norm_importance, 1.0) 316 | if log: 317 | ax2.set_xscale("log") 318 | ax2.grid() 319 | ax2.set(frame_on=False) 320 | 321 | fig.align_labels() 322 | return fig 323 | 324 | 325 | def _compute_varimp_lgb( 326 | X, 327 | y, 328 | sample_weight=None, 329 | encode=False, 330 | task="regression", 331 | n_iterations=10, 332 | verbose=-1, 333 | fastshap=True, 334 | encoder_kwargs=None, 335 | lgb_kwargs={"objective": "rmse", "zero_as_missing": False}, 336 | ): 337 | if task not in ["regression", "classification", "multiclass"]: 338 | raise ValueError('Task must be either "classification" or "regression"') 339 | 340 | if y is None: 341 | raise ValueError("No training labels provided.") 342 | 343 | if encode: 344 | encoder = ( 345 | OrdinalEncoderPandas(**encoder_kwargs) 346 | if encoder_kwargs is not None 347 | else OrdinalEncoderPandas() 348 | ) 349 | X = encoder.fit(X).transform(X) 350 | del encoder 351 | # Extract feature names 352 | feature_names = list(X.columns) 353 | # Empty array for feature importances 354 | feature_importance_values = np.zeros(len(feature_names)) 355 | progress_bar = trange(n_iterations) if verbose > 1 
else range(n_iterations) 356 | 357 | # Iterate through each fold 358 | for _ in progress_bar: 359 | if verbose > 1: 360 | progress_bar.set_description("Iteration nb: {0:<3}".format(_)) 361 | 362 | # lgb_kwargs['verbose'] = -1 363 | gbm_model = GradientBoosting( 364 | cat_feat="auto", 365 | stratified=False, 366 | params=lgb_kwargs, 367 | show_learning_curve=False, 368 | return_valid_features=True, 369 | verbose_eval=0, 370 | ) 371 | 372 | gbm_model.fit(X=X, y=y, sample_weight=sample_weight) 373 | 374 | # pimp cool but too slow 375 | # perm_imp = permutation_importance( 376 | # model, valid_features, valid_labels, n_repeats=10, random_state=42, n_jobs=-1 377 | # ) 378 | # perm_imp = perm_imp.importances_mean 379 | if fastshap: 380 | try: 381 | from fasttreeshap import TreeExplainer as FastTreeExplainer 382 | except ImportError: 383 | ImportError("fasttreeshap is not installed") 384 | 385 | explainer = FastTreeExplainer( 386 | gbm_model.model, 387 | algorithm="auto", 388 | shortcut=False, 389 | feature_perturbation="tree_path_dependent", 390 | ) 391 | shap_matrix = explainer.shap_values(gbm_model.valid_features) 392 | if isinstance(shap_matrix, list): 393 | # For LightGBM classifier, RF, in sklearn API, SHAP returns a list of arrays 394 | # https://github.com/slundberg/shap/issues/526 395 | shap_imp = np.mean([np.abs(sv).mean(0) for sv in shap_matrix], axis=0) 396 | else: 397 | shap_imp = np.abs(shap_matrix).mean(0) 398 | else: 399 | shap_matrix = gbm_model.model.predict( 400 | gbm_model.valid_features, pred_contrib=True 401 | ) 402 | # the dim changed in lightGBM >= 3.0.0 403 | if task == "multiclass": 404 | # X_SHAP_values (array-like of shape = [n_samples, n_features + 1] 405 | # or shape = [n_samples, (n_features + 1) * n_classes]) 406 | # index starts from 0 407 | n_features_plus_bias = gbm_model.valid_features.shape[1] + 1 408 | n_samples = gbm_model.valid_features.shape[0] 409 | y_freq_table = pd.Series(y.fillna(0)).value_counts(normalize=True) 410 | n_classes = y_freq_table.size 411 | 412 | # Reshape the array to [n_samples, n_features + 1, n_classes] 413 | reshaped_values = shap_matrix.reshape( 414 | n_samples, n_classes, n_features_plus_bias 415 | ) 416 | 417 | # Since we need (n_samples, n_features + 1, n_classes), transpose the second and third dimensions 418 | reshaped_values = reshaped_values.transpose(0, 2, 1) 419 | reshaped_values = reshaped_values[:, :-1, :] 420 | reshaped_values.shape 421 | # Sum the contributions for each class ignoring the bias term 422 | # average on all the samples 423 | shap_imp = np.abs(reshaped_values).sum(axis=-1).mean(axis=0) 424 | else: 425 | # for binary, only one class is returned, for regression a single column added as well 426 | shap_imp = np.mean(np.abs(shap_matrix[:, :-1]), axis=0) 427 | 428 | # Record the feature importances 429 | feature_importance_values += ( 430 | shap_imp / n_iterations 431 | ) # model.feature_importances_ / n_iterations 432 | feature_importances = pd.DataFrame( 433 | {"feature": feature_names, "importance": feature_importance_values} 434 | ) 435 | # Sort features according to importance 436 | feature_importances = feature_importances.sort_values( 437 | "importance", ascending=False 438 | ).reset_index(drop=True) 439 | # Normalize the feature importances to add up to one 440 | feature_importances["normalized_importance"] = ( 441 | feature_importances["importance"] / feature_importances["importance"].sum() 442 | ) 443 | feature_importances["cumulative_importance"] = np.cumsum( 444 | 
feature_importances["normalized_importance"] 445 | ) 446 | # Extract the features with zero importance 447 | # record_zero_importance = feature_importances[ 448 | # feature_importances["importance"] == 0.0 449 | # ] 450 | return feature_importances 451 | -------------------------------------------------------------------------------- /src/arfs/parallel.py: -------------------------------------------------------------------------------- 1 | """Parallelize Pandas 2 | 3 | This module provides utilities for parallelizing operations on pd.DataFrame 4 | 5 | Module Structure: 6 | ----------------- 7 | - ``parallel_matrix_entries`` for parallelizing operations returning a matrix (2D) (apply on pairs of columns) 8 | - ``parallel_df`` for parallelizing operations returning a series (1D) (apply on a single column at a time) 9 | """ 10 | 11 | import numpy as np 12 | import pandas as pd 13 | from joblib import Parallel, delayed 14 | from multiprocessing import cpu_count 15 | from itertools import chain 16 | 17 | 18 | def parallel_matrix_entries(func, df, comb_list, sample_weight=None, n_jobs=-1): 19 | """parallel_matrix_entries applies a function to each chunk of 20 | combination of columns of the dataframe, distributed by cores. 21 | This is similar to https://github.com/smazzanti/mrmr/mrmr/pandas.py 22 | 23 | 24 | Parameters 25 | ---------- 26 | func : callable 27 | function to be applied to each pair of columns in comb_list 28 | df : pd.DataFrame 29 | the dataframe on which to apply the function 30 | comb_list : list of tuples of str 31 | Pairs of column names corresponding to the entries 32 | sample_weight : pd.Series or np.array, optional 33 | The weight vector, if any, of shape (n_samples,), by default None 34 | n_jobs : int, optional 35 | the number of cores to use for the computation, by default -1 36 | 37 | Returns 38 | ------- 39 | pd.DataFrame 40 | concatenated results into a single pandas DF 41 | """ 42 | # Determining the number of jobs 43 | n_jobs = cpu_count() if n_jobs == -1 else min(cpu_count(), n_jobs) 44 | 45 | if n_jobs == 1: 46 | lst = func(X=df, sample_weight=sample_weight, comb_list=comb_list) 47 | return pd.concat(lst, ignore_index=True).sort_values("val", ascending=False) 48 | 49 | comb_chunks = np.array_split(comb_list, n_jobs) 50 | lst = Parallel(n_jobs=n_jobs)( 51 | delayed(func)(X=df, sample_weight=sample_weight, comb_list=comb_chunk) 52 | for comb_chunk in comb_chunks 53 | ) 54 | # Directly return the single DataFrame if lst contains only one element 55 | if len(lst) == 1: 56 | return lst[0] 57 | else: 58 | return pd.concat(list(chain(*lst)), ignore_index=True) 59 | 60 | 61 | def parallel_df(func, df, series, sample_weight=None, n_jobs=-1): 62 | """parallel_df apply a function to each column of the dataframe, distributed by cores. 
63 | This is similar to https://github.com/smazzanti/mrmr/mrmr/pandas.py 64 | 65 | Parameters 66 | ---------- 67 | func : callable 68 | function to be applied to each column 69 | df : pd.DataFrame 70 | the dataframe on which to apply the function 71 | series : pd.Series 72 | series (target) used by the function 73 | sample_weight : pd.Series or np.array, optional 74 | The weight vector, if any, of shape (n_samples,), by default None 75 | n_jobs : int, optional 76 | the number of cores to use for the computation, by default -1 77 | 78 | Returns 79 | ------- 80 | pd.DataFrame 81 | concatenated results into a single pandas DF 82 | """ 83 | # Determining the number of jobs 84 | n_jobs = cpu_count() if n_jobs == -1 else min(cpu_count(), n_jobs) 85 | 86 | if n_jobs == 1: 87 | lst = func(df, series, sample_weight).sort_values(ascending=False) 88 | 89 | return ( 90 | pd.concat(lst, ignore_index=True).sort_values("val", ascending=False) 91 | if isinstance(lst, list) 92 | else lst 93 | ) 94 | else: 95 | col_chunks = np.array_split(range(len(df.columns)), n_jobs) 96 | lst = Parallel(n_jobs=n_jobs)( 97 | delayed(func)(df.iloc[:, col_chunk], series, sample_weight) 98 | for col_chunk in col_chunks 99 | ) 100 | 101 | return pd.concat(lst).sort_values(ascending=False) 102 | 103 | 104 | def _compute_series( 105 | X, 106 | y, 107 | sample_weight=None, 108 | func_xyw=None, 109 | ): 110 | """_compute_series is a utility function for computing the series 111 | resulting of the ``apply`` 112 | 113 | Parameters 114 | ---------- 115 | X : pd.DataFrame, of shape (n_samples, n_features) 116 | The set of regressors that will be tested sequentially 117 | y : pd.Series or np.array, of shape (n_samples,) 118 | The target vector 119 | sample_weight : pd.Series or np.array, of shape (n_samples,), optional 120 | The weight vector, if any, by default None 121 | func_xyw : callable, optional 122 | callable (function) for computing the individual elements of the series 123 | takes two mandatory inputs (x and y) and an optional input w, sample_weights 124 | """ 125 | 126 | def _closure_compute_series(x, y, sample_weight): 127 | x_not_na = ~x.isna() 128 | if x_not_na.sum() == 0: 129 | return 0 130 | return func_xyw( 131 | x=x[x_not_na], 132 | y=y[x_not_na], 133 | sample_weight=sample_weight[x_not_na], 134 | as_frame=False, 135 | ) 136 | 137 | return X.apply( 138 | lambda col: _closure_compute_series(x=col, y=y, sample_weight=sample_weight) 139 | ).fillna(0.0) 140 | 141 | 142 | def _compute_matrix_entries( 143 | X, 144 | comb_list, 145 | sample_weight=None, 146 | func_xyw=None, 147 | ): 148 | """base closure for computing matrix entries applying a function to each chunk of 149 | combination of columns of the dataframe, distributed by cores. 
150 | This is similar to https://github.com/smazzanti/mrmr/mrmr/pandas.py 151 | 152 | Parameters 153 | ---------- 154 | X : pd.DataFrame, of shape (n_samples, n_features) 155 | The set of regressors that will be tested sequentially 156 | sample_weight : pd.Series or np.array, of shape (n_samples,), optional 157 | The weight vector, if any, by default None 158 | func_xyw : callable, optional 159 | callable (function) for computing the individual elements of the matrix 160 | takes two mandatory inputs (x and y) and an optional input w, sample_weights 161 | comb_list : list of 2-tuple of str 162 | Pairs of column names corresponding to the entries 163 | 164 | Returns 165 | ------- 166 | List[pd.DataFrame] 167 | a list of partial dfs to be concatenated 168 | """ 169 | v_df_list = [ 170 | func_xyw(x=X[comb[0]], y=X[comb[1]], sample_weight=sample_weight, as_frame=True) 171 | for comb in comb_list 172 | ] 173 | 174 | return v_df_list 175 | -------------------------------------------------------------------------------- /src/arfs/sampling.py: -------------------------------------------------------------------------------- 1 | """This module provide methods for sampling large datasets for reducing the running time""" 2 | 3 | import numpy as np 4 | import pandas as pd 5 | from scipy.sparse import issparse 6 | from collections import Counter 7 | from sklearn.cluster import AgglomerativeClustering 8 | from sklearn.ensemble import IsolationForest 9 | from scipy.stats import ks_2samp 10 | from .utils import is_list_of_str, is_list_of_bool, is_list_of_int 11 | 12 | 13 | def sample(df, n=1000, sample_weight=None, method="gower"): 14 | """Sampling rows from a dataframe when random sampling is not 15 | enough for reducing the number of rows. 16 | The strategies are either using hierarchical clustering 17 | based on the Gower distance or using isolation forest for identifying 18 | the most similar elements. 19 | For the clustering algorithm, clusters are determined using the Gower distance 20 | (mixed type data) and the dataset is shrunk from n_samples to n_clusters. 21 | 22 | For the isolation forest algorithm, samples are added till a sufficient 2-samples 23 | KS statistics is reached or if the number iteration reached the max number (20) 24 | 25 | Parameters 26 | ---------- 27 | df : pd.DataFrame 28 | the dataframe to sample, with or without the target 29 | n : int, optional 30 | the number of clusters if method is ``"gower"``, by default 100 31 | sample_weight : pd.Series or np.array, optional 32 | sample weights, by default None 33 | method : str, optional 34 | the strategy to use for sampling the rows. 
Either ``"gower"`` or ``"isoforest"``, by default 'gower' 35 | 36 | Returns 37 | ------- 38 | pd.DataFrame 39 | the sampled dataframe 40 | 41 | """ 42 | assert isinstance(df, pd.DataFrame), "X should be a DataFrame" 43 | X = df.copy() 44 | num_cols = list(X.select_dtypes(include=[np.number])) 45 | non_num_cols = list(set(list(X.columns)) - set(num_cols)) 46 | 47 | if method == "gower": 48 | # basic imputation 49 | if non_num_cols: 50 | X[non_num_cols] = X[non_num_cols].fillna(X[non_num_cols].mode().iloc[0]) 51 | if num_cols: 52 | X[num_cols] = X[num_cols].fillna(X[num_cols].mean().iloc[0]) 53 | 54 | # no need for scaling, it is built-in the computation of the Gower distance 55 | gd = gower_matrix(X, cat_features=non_num_cols, weight=sample_weight) 56 | 57 | labels = AgglomerativeClustering( 58 | n_clusters=n, metric="precomputed", linkage="complete" 59 | ).fit_predict(gd) 60 | X["label"] = labels 61 | X["label"] = "clus_" + X["label"].astype(str) 62 | X_num = X.groupby("label")[num_cols].agg("mean") 63 | if non_num_cols: 64 | X_nonnum = X.groupby("label")[non_num_cols].agg(get_most_common) 65 | X_sampled = X_num.join(X_nonnum) 66 | else: 67 | X_sampled = X_num 68 | X_sampled = X_sampled.reindex(X.columns, axis=1) 69 | return X_sampled 70 | elif method == "isoforest": 71 | X[non_num_cols] = X[non_num_cols].astype("str").astype("category") 72 | for col in non_num_cols: 73 | X[col] = X[col].astype("category").cat.codes 74 | idx = isof_find_sample(X, sample_weight=None) 75 | return X.iloc[idx, :] 76 | else: 77 | NotImplementedError(f"{method} not implemented") 78 | 79 | 80 | def get_most_common(srs): 81 | x = list(srs) 82 | my_counter = Counter(x) 83 | return my_counter.most_common(1)[0][0] 84 | 85 | 86 | def gower_matrix( 87 | data_x, 88 | data_y=None, 89 | weight=None, 90 | cat_features="auto", 91 | ): 92 | """Computes the gower distances between X and Y 93 | 94 | Gower is a similarity measure for categorical, boolean and numerical mixed 95 | data. 96 | 97 | Parameters 98 | ---------- 99 | data_x : np.array or pd.DataFrame 100 | The data for computing the Gower distance 101 | data_y : np.array or pd.DataFrame or pd.Series, optional 102 | The reference matrix or vector to compare with, optional 103 | weight : np.array or pd.Series, optional 104 | sample weight, optional 105 | cat_features : list of str or bool or int, optional 106 | auto-detect cat features or a list of cat features, by default 'auto' 107 | 108 | Returns 109 | ------- 110 | np.array 111 | The Gower distance matrix, shape (n_samples, n_samples) 112 | 113 | Notes 114 | ----- 115 | The non-numeric features, and numeric feature ranges are determined from X and not Y. 
116 | 117 | Raises 118 | ------ 119 | TypeError 120 | If two dataframes are passed but have different number of columns 121 | TypeError 122 | If two arrays are passed but have different number of columns 123 | TypeError 124 | Sparse matrices are not supported 125 | TypeError 126 | if a list of categorical columns is passed, it should be a list of strings or integers or boolean values 127 | """ 128 | # function checks 129 | X = data_x 130 | if data_y is None: 131 | Y = data_x 132 | else: 133 | Y = data_y 134 | if not isinstance(X, np.ndarray): 135 | y_col = Y.columns if isinstance(Y, pd.DataFrame) else Y.index 136 | if not np.array_equal(X.columns, y_col): 137 | raise TypeError("X and Y must have same columns!") 138 | else: 139 | if not X.shape[1] == Y.shape[1]: 140 | raise TypeError("X and Y must have same y-dim!") 141 | if issparse(X) or issparse(Y): 142 | raise TypeError("Sparse matrices are not supported!") 143 | 144 | x_n_rows, x_n_cols = X.shape 145 | y_n_rows, y_n_cols = Y.shape 146 | 147 | if cat_features == "auto": 148 | if not isinstance(X, np.ndarray): 149 | is_number = np.vectorize(lambda x: not np.issubdtype(x, np.number)) 150 | cat_features = is_number(X.dtypes) 151 | else: 152 | cat_features = np.zeros(x_n_cols, dtype=bool) 153 | for col in range(x_n_cols): 154 | if not np.issubdtype(type(X[0, col]), np.number): 155 | cat_features[col] = True 156 | else: 157 | # force categorical columns (if integer encoded for instance) 158 | if is_list_of_str(cat_features): 159 | cat_feat = [True if c in cat_features else False for c in X.columns] 160 | cat_features = np.array(cat_feat) 161 | elif is_list_of_bool(cat_features): 162 | cat_features = np.array(cat_features) 163 | elif is_list_of_int(cat_features): 164 | cat_feat = [ 165 | True if c in cat_features else False for c in range(len(X.columns)) 166 | ] 167 | cat_features = np.array(cat_feat) 168 | else: 169 | raise TypeError( 170 | "If not 'auto' cat_features should be a list of strings, integers or Booleans" 171 | ) 172 | 173 | # print(cat_features) 174 | 175 | if not isinstance(X, np.ndarray): 176 | X = np.asarray(X) 177 | if not isinstance(Y, np.ndarray): 178 | Y = np.asarray(Y) 179 | 180 | Z = np.concatenate((X, Y)) 181 | 182 | x_index = range(0, x_n_rows) 183 | y_index = range(x_n_rows, x_n_rows + y_n_rows) 184 | 185 | Z_num = Z[:, np.logical_not(cat_features)] 186 | 187 | num_cols = Z_num.shape[1] 188 | num_ranges = np.zeros(num_cols) 189 | num_max = np.zeros(num_cols) 190 | 191 | for col in range(num_cols): 192 | col_array = Z_num[:, col].astype(np.float32) 193 | max_ = np.nanmax(col_array) 194 | min_ = np.nanmin(col_array) 195 | 196 | if np.isnan(max_): 197 | max_ = 0.0 198 | if np.isnan(min_): 199 | min_ = 0.0 200 | num_max[col] = max_ 201 | num_ranges[col] = (1 - min_ / max_) if (max_ != 0) else 0.0 202 | 203 | # This is to normalize the numeric values between 0 and 1. 
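    # Each numeric column is divided by its own maximum (the ``where=num_max != 0``
    # mask guards against division by zero), so for non-negative data the scaled
    # values lie in [min/max, 1]. The corresponding scaled range, 1 - min/max,
    # was stored in ``num_ranges`` above and is reused in ``_gower_distance_row``
    # to normalise the absolute differences between rows.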
204 | Z_num = np.divide( 205 | Z_num.astype(float), 206 | num_max.astype(float), 207 | out=np.zeros_like(Z_num).astype(float), 208 | where=num_max != 0, 209 | ) 210 | Z_cat = Z[:, cat_features] 211 | 212 | if weight is None: 213 | weight = np.ones(Z.shape[1]) 214 | 215 | # print(weight) 216 | 217 | weight_cat = weight[cat_features] 218 | weight_num = weight[np.logical_not(cat_features)] 219 | 220 | out = np.zeros((x_n_rows, y_n_rows), dtype=np.float32) 221 | 222 | weight_sum = weight.sum() 223 | 224 | X_cat = Z_cat[x_index,] 225 | X_num = Z_num[x_index,] 226 | Y_cat = Z_cat[y_index,] 227 | Y_num = Z_num[y_index,] 228 | 229 | # print(X_cat,X_num,Y_cat,Y_num) 230 | 231 | for i in range(x_n_rows): 232 | j_start = i 233 | if x_n_rows != y_n_rows: 234 | j_start = 0 235 | # call the main function 236 | res = _gower_distance_row( 237 | X_cat[i, :], 238 | X_num[i, :], 239 | Y_cat[j_start:y_n_rows, :], 240 | Y_num[j_start:y_n_rows, :], 241 | weight_cat, 242 | weight_num, 243 | weight_sum, 244 | num_ranges, 245 | ) 246 | # print(res) 247 | out[i, j_start:] = res 248 | if x_n_rows == y_n_rows: 249 | out[i:, j_start] = res 250 | 251 | return out 252 | 253 | 254 | def _gower_distance_row( 255 | xi_cat, 256 | xi_num, 257 | xj_cat, 258 | xj_num, 259 | feature_weight_cat, 260 | feature_weight_num, 261 | feature_weight_sum, 262 | ranges_of_numeric, 263 | ): 264 | """Compute a row of the Gower matrix 265 | 266 | Parameters 267 | ---------- 268 | xi_cat : np.array 269 | categorical row of the X matrix 270 | xi_num : np.array 271 | numerical row of the X matrix 272 | xj_cat : np.array 273 | categorical row of the X matrix 274 | xj_num : np.array 275 | numerical row of the X matrix 276 | feature_weight_cat : np.array 277 | weight vector for the categorical features 278 | feature_weight_num : np.array 279 | weight vector for the numerical features 280 | feature_weight_sum : float 281 | The sum of the weights 282 | ranges_of_numeric : np.array 283 | range of the scaled numerical features (between 0 and 1) 284 | 285 | Returns 286 | ------- 287 | np.array : array 288 | a row vector of the Gower distance 289 | """ 290 | # categorical columns 291 | sij_cat = np.where(xi_cat == xj_cat, np.zeros_like(xi_cat), np.ones_like(xi_cat)) 292 | sum_cat = np.multiply(feature_weight_cat, sij_cat).sum(axis=1) 293 | 294 | # numerical columns 295 | abs_delta = np.absolute(xi_num - xj_num) 296 | sij_num = np.divide( 297 | abs_delta, 298 | ranges_of_numeric, 299 | out=np.zeros_like(abs_delta), 300 | where=ranges_of_numeric != 0, 301 | ) 302 | 303 | sum_num = np.multiply(feature_weight_num, sij_num).sum(axis=1) 304 | sums = np.add(sum_cat, sum_num) 305 | sum_sij = np.divide(sums, feature_weight_sum) 306 | 307 | return sum_sij 308 | 309 | 310 | def smallest_indices(ary, n): 311 | """Returns the n largest indices from a numpy array. 
312 | 313 | Parameters 314 | ---------- 315 | ary : np.array 316 | the array for which to return largest indices 317 | n : int 318 | the number of indices to return 319 | 320 | Returns 321 | ------- 322 | dict 323 | the dictionary of indices and values of the largest elements 324 | """ 325 | # n += 1 326 | flat = np.nan_to_num(ary.flatten(), nan=999) 327 | indices = np.argpartition(-flat, -n)[-n:] 328 | indices = indices[np.argsort(flat[indices])] 329 | # indices = np.delete(indices,0,0) 330 | values = flat[indices] 331 | return {"index": indices, "values": values} 332 | 333 | 334 | def gower_topn( 335 | data_x, 336 | data_y=None, 337 | weight=None, 338 | cat_features="auto", 339 | n=5, 340 | key=None, 341 | ): 342 | """Get the n most similar elements 343 | 344 | Parameters 345 | ---------- 346 | data_x : np.array or pd.DataFrame 347 | The data for the look up 348 | data_y : np.array or pd.DataFrame or pd.Series, optional 349 | elements for which to return the most similar elements, should be a single row 350 | weight : np.array or pd.Series, optional 351 | sample weight, by default None 352 | cat_features : list of str or bool or int, optional 353 | auto detection of cat features or a list of strings, booleans or integers, by default 'auto' 354 | n : int, optional 355 | the number of neighbors/similar rows to find, by default 5 356 | key : str, optional 357 | identifier key. If several rows refer to the same id, this column 358 | will be used for finding the nearest neighbors with a 359 | different id, by default None 360 | 361 | Returns 362 | ------- 363 | dict 364 | the dictionary of indices and values of the closest elements 365 | 366 | Raises 367 | ------ 368 | TypeError 369 | if the reference element is not a single row 370 | """ 371 | 372 | if data_y.shape[0] >= 2: 373 | raise TypeError("Only support `data_y` of 1 row. 
") 374 | if key is None: 375 | dm = gower_matrix(data_y, data_x, weight, cat_features) 376 | else: 377 | X = data_x.drop(key, axis=1) 378 | Y = data_x.drop(key, axis=1) 379 | dm = gower_matrix(Y, X, weight, cat_features) 380 | 381 | if key is not None: 382 | idx = smallest_indices(np.nan_to_num(dm[0], nan=1), n)["index"] 383 | val = smallest_indices(np.nan_to_num(dm[0], nan=1), n)["values"] 384 | unique_id = data_x.iloc[idx, :] 385 | unique_id = unique_id[key] 386 | nunique_id = unique_id.nunique() 387 | mul = 1 388 | # continue looking for the closest n unique records with a different id 389 | while nunique_id < n: 390 | idx = smallest_indices(np.nan_to_num(dm[0], nan=1), mul * n)["index"] 391 | val = smallest_indices(np.nan_to_num(dm[0], nan=1), mul * n)["values"] 392 | unique_id = data_x.iloc[idx, :].reset_index() 393 | unique_id = unique_id[key] 394 | nunique_id = unique_id.nunique() 395 | mul += 1 396 | 397 | # find the indices of the unique id 398 | _, idx_n = np.unique(unique_id, return_index=True) 399 | # select only the rows corresponding to unique id 400 | val = val[idx_n] 401 | idx = idx[idx_n] 402 | # sort them from the closest to the farthest, according to the Gower metrics 403 | idx_n = np.argsort(val) 404 | # return the n closest records, with a different id 405 | return {"index": idx[idx_n[:n]], "values": val[idx_n[:n]]} 406 | else: 407 | return smallest_indices(np.nan_to_num(dm[0], nan=1), n) 408 | 409 | 410 | def get_5_percent_splits(length): 411 | """splits dataframe into 5% intervals 412 | 413 | Parameters 414 | ---------- 415 | length : int 416 | array length 417 | 418 | Returns 419 | ------- 420 | array 421 | vector of sizes 422 | """ 423 | 424 | five_percent = round(5 / 100 * length) 425 | return np.arange(five_percent, length, five_percent) 426 | 427 | 428 | def isolation_forest(X, sample_weight=None): 429 | """fits isolation forest to the dataset and gives an anomaly score to every sample 430 | 431 | Parameters 432 | ---------- 433 | X : pd.DataFrame or np.array 434 | the predictors matrix 435 | sample_weight : pd.Series or np.array, optional 436 | the sample weights, if any, by default None 437 | """ 438 | clf = IsolationForest().fit(X, sample_weight=sample_weight) 439 | return clf.score_samples(X) 440 | 441 | 442 | def isof_find_sample(X, sample_weight=None): 443 | """Finds a sample by comparing the distributions of the anomaly scores between the sample and the original 444 | distribution using the KS-test. Starts of a 5% however will increase to 10% and then 15% etc. 
if a significant sample cannot be found.
445 | 
446 |     References
447 |     ----------
448 |     Sampling method taken from boruta_shap, author: https://github.com/Ekeany
449 | 
450 |     Parameters
451 |     ----------
452 |     X : pd.DataFrame
453 |         the predictors matrix
454 |     sample_weight : pd.Series or np.array, optional
455 |         the sample weights, if any, by default None
456 | 
457 |     Returns
458 |     -------
459 |     array
460 |         the indices for reducing the shadow predictors matrix
461 |     """
462 |     loop = True
463 |     iteration = 0
464 |     size = get_5_percent_splits(length=X.shape[0])
465 |     element = 1
466 |     preds = isolation_forest(X, sample_weight)
467 |     while loop:
468 |         sample_indices = np.random.choice(
469 |             np.arange(preds.size), size=size[element], replace=False
470 |         )
471 |         sample = np.take(preds, sample_indices)
472 |         if ks_2samp(preds, sample).pvalue > 0.95:
473 |             break
474 |         iteration += 1
475 |         if iteration == 20:
476 |             element += 1
477 |             iteration = 0
478 |     return sample_indices
479 | 
--------------------------------------------------------------------------------
/tests/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ThomasBury/arfs/03f67d0a54b69fac5ddbb83e306c8e8e72e2d3a2/tests/__init__.py
--------------------------------------------------------------------------------
/tests/test_allrelevant.py:
--------------------------------------------------------------------------------
1 | import pytest
2 | import numpy as np
3 | import lightgbm as lgb
4 | from arfs.feature_selection.allrelevant import Leshy, BoostAGroota, GrootCV
5 | from arfs.utils import (
6 |     _make_corr_dataset_regression,
7 |     _make_corr_dataset_classification,
8 | )
9 | from arfs.utils import LightForestClassifier, LightForestRegressor
10 | 
11 | 
12 | class TestLeshy:
13 |     """
14 |     Test suite for all-relevant FS boruta-like method: Leshy
15 |     """
16 | 
17 |     def test_borutaPy_vs_leshy_with_rfc_and_native_feature_importance(self):
18 |         # too slow for circleci to run them in a reasonable time
19 |         # takes 2 min on laptop, 1h or more on circleci
20 |         # sklearn random forest implementation
21 |         # X, y, w = _make_corr_dataset_classification()
22 |         # rfc = RandomForestClassifier(max_features='sqrt', max_samples=0.632, n_estimators=100)
23 |         # bt = BorutaPy(rfc)
24 |         # bt.fit(X.values, y)
25 |         # borutapy_rfc_list = sorted(list(X.columns[bt.support_]))
26 | 
27 |         # lightGBM random forest implementation
28 |         baseline_list = ["var0", "var1", "var2", "var3", "var4"]
29 |         X, y, w = _make_corr_dataset_classification(size=100)
30 |         n_feat = X.shape[1]
31 |         rfc = LightForestClassifier(n_feat)
32 |         # RandomForestClassifier(max_features='sqrt', max_samples=0.632, n_estimators=100) # --> too slow
33 |         arfs = Leshy(rfc, verbose=0, max_iter=10, random_state=42, importance="native")
34 |         arfs.fit(X, y)
35 |         leshy_rfc_list = sorted(arfs.feature_names_in_[arfs.support_])
36 | 
37 |         # assert borutapy_rfc_list == leshy_rfc_list, "same selected features are expected"
38 |         assert bool(set(baseline_list) & set(leshy_rfc_list)), (
39 |             "expect non-empty intersection"
40 |         )
41 | 
42 |     def test_borutaPy_vs_leshy_with_rfr_and_native_feature_importance(self):
43 |         # too slow for circleci to run them in a reasonable time
44 |         # takes 2 min on laptop, 1h or more on circleci
45 |         # # sklearn random forest implementation
46 |         # X, y, w = _generated_corr_dataset_regr()
47 |         # rfr = RandomForestRegressor(max_features=0.3, max_samples=0.632, n_estimators=100)
48 |         # bt = BorutaPy(rfr)
49 |         # bt.fit(X.values, y)
50 |         # 
borutapy_rfc_list = sorted(list(X.columns[bt.support_])) 51 | 52 | # lightGBM random forest implementation 53 | baseline_list = ["var0", "var1", "var2", "var3", "var4"] 54 | X, y, w = _make_corr_dataset_regression(size=100) 55 | n_feat = X.shape[1] 56 | rfr = LightForestRegressor(n_feat) 57 | # rfr = RandomForestRegressor(max_features=0.3, max_samples=0.632, n_estimators=10) 58 | arfs = Leshy(rfr, verbose=0, max_iter=10, random_state=42, importance="native") 59 | arfs.fit(X, y) 60 | leshy_rfc_list = sorted(arfs.feature_names_in_[arfs.support_]) 61 | 62 | # assert borutapy_rfc_list == leshy_rfc_list, "same selected features are expected" 63 | assert bool(set(baseline_list) & set(leshy_rfc_list)), ( 64 | "expect non-empty intersection" 65 | ) 66 | 67 | def test_borutaPy_vs_leshy_with_rfc_and_shap_feature_importance(self): 68 | # too slow for circleci to run them in a reasonable time 69 | # takes 2 min on laptop, 1h or more on circleci 70 | # # sklearn random forest implementation 71 | # X, y, w = _make_corr_dataset_classification() 72 | # rfc = RandomForestClassifier(max_features='sqrt', max_samples=0.632, n_estimators=100) 73 | # bt = BorutaPy(rfc) 74 | # bt.fit(X.values, y) 75 | # borutapy_rfc_list = sorted(list(X.columns[bt.support_])) 76 | 77 | # lightGBM random forest implementation 78 | baseline_list = ["var0", "var1", "var2", "var3", "var4"] 79 | X, y, w = _make_corr_dataset_classification(size=100) 80 | n_feat = X.shape[1] 81 | model = LightForestClassifier(n_feat) 82 | arfs = Leshy(model, verbose=0, max_iter=10, random_state=42, importance="shap") 83 | arfs.fit(X, y) 84 | leshy_rfc_list = sorted(arfs.feature_names_in_[arfs.support_]) 85 | 86 | # assert borutapy_rfc_list == leshy_rfc_list, "same selected features are expected" 87 | assert bool(set(baseline_list) & set(leshy_rfc_list)), ( 88 | "expect non-empty intersection" 89 | ) 90 | 91 | def test_borutaPy_vs_leshy_with_rfr_and_shap_feature_importance(self): 92 | # too slow for circleci to run them in a reasonable time 93 | # takes 2 min on laptop, 1h or more on circleci 94 | # # sklearn random forest implementation 95 | # X, y, w = _generated_corr_dataset_regr() 96 | # rfr = RandomForestRegressor(max_features=0.3, max_samples=0.632, n_estimators=100) 97 | # bt = BorutaPy(rfr) 98 | # bt.fit(X.values, y) 99 | # borutapy_rfc_list = sorted(list(X.columns[bt.support_])) 100 | 101 | # lightGBM random forest implementation 102 | baseline_list = ["var0", "var1", "var2", "var3", "var4"] 103 | X, y, w = _make_corr_dataset_regression(size=500) 104 | n_feat = X.shape[1] 105 | model = LightForestRegressor(n_feat) 106 | arfs = Leshy(model, verbose=0, max_iter=10, random_state=42, importance="shap") 107 | arfs.fit(X, y) 108 | leshy_rfc_list = sorted(arfs.feature_names_in_[arfs.support_]) 109 | 110 | # assert borutapy_rfc_list == leshy_rfc_list, "same selected features are expected" 111 | assert bool(set(baseline_list) & set(leshy_rfc_list)), ( 112 | "expect non-empty intersection" 113 | ) 114 | 115 | def test_leshy_clf_with_lgb_and_shap_feature_importance_and_sample_weight(self): 116 | baseline_list = ["var0", "var1", "var2", "var3", "var4"] 117 | 118 | X, y, w = _make_corr_dataset_classification(size=500) 119 | model = lgb.LGBMClassifier(verbose=-1, force_col_wise=True, n_estimators=10) 120 | arfs = Leshy(model, verbose=0, max_iter=10, random_state=42, importance="shap") 121 | arfs.fit(X, y, w) 122 | leshy_list = sorted(arfs.feature_names_in_[arfs.support_]) 123 | 124 | assert bool(set(baseline_list) & set(leshy_list)), ( 125 | "expect 
non-empty intersection" 126 | ) 127 | 128 | def test_leshy_regr_with_lgb_and_shap_feature_importance_and_sample_weight(self): 129 | baseline_list = ["var0", "var1", "var2", "var3", "var4", "var5"] 130 | 131 | X, y, w = _make_corr_dataset_classification(size=500) 132 | model = lgb.LGBMRegressor(verbose=-1, force_col_wise=True, n_estimators=10) 133 | arfs = Leshy(model, verbose=0, max_iter=10, random_state=42, importance="shap") 134 | arfs.fit(X, y, w) 135 | leshy_list = sorted(arfs.feature_names_in_[arfs.support_]) 136 | 137 | assert bool(set(baseline_list) & set(leshy_list)), ( 138 | "expect non-empty intersection" 139 | ) 140 | 141 | 142 | class TestBoostAGroota: 143 | """ 144 | Test suite for all-relevant FS boruta-like method: Leshy 145 | """ 146 | 147 | def test_boostagroota_clf_with_lgb_and_shap_feature_importance_and_sample_weight( 148 | self, 149 | ): 150 | baseline_list = ["var0", "var1", "var2", "var3", "var4"] 151 | 152 | X, y, w = _make_corr_dataset_classification(size=500) 153 | model = lgb.LGBMClassifier(verbose=-1, force_col_wise=True, n_estimators=10) 154 | arfs = BoostAGroota( 155 | estimator=model, 156 | cutoff=1, 157 | iters=3, 158 | max_rounds=3, 159 | delta=0.1, 160 | silent=False, 161 | importance="shap", 162 | ) 163 | arfs.fit(X, y, w) 164 | leshy_list = sorted(arfs.feature_names_in_[arfs.support_]) 165 | 166 | assert bool(set(baseline_list) & set(leshy_list)), ( 167 | "expect non-empty intersection" 168 | ) 169 | 170 | def test_boostagroota_clf_with_lgb_and_pimp_feature_importance_and_sample_weight( 171 | self, 172 | ): 173 | baseline_list = ["var0", "var1", "var2", "var3", "var4"] 174 | 175 | X, y, w = _make_corr_dataset_classification(size=500) 176 | model = lgb.LGBMClassifier(verbose=-1, force_col_wise=True, n_estimators=10) 177 | arfs = BoostAGroota( 178 | estimator=model, 179 | cutoff=1, 180 | iters=3, 181 | max_rounds=3, 182 | delta=0.1, 183 | silent=False, 184 | importance="pimp", 185 | ) 186 | arfs.fit(X, y, w) 187 | leshy_list = sorted(arfs.feature_names_in_[arfs.support_]) 188 | 189 | assert bool(set(baseline_list) & set(leshy_list)), ( 190 | "expect non-empty intersection" 191 | ) 192 | 193 | def test_boostagroota_rgr_with_lgb_and_shap_feature_importance_and_sample_weight( 194 | self, 195 | ): 196 | baseline_list = ["var0", "var1", "var2", "var3", "var4", "var5"] 197 | 198 | X, y, w = _make_corr_dataset_regression(size=500) 199 | model = lgb.LGBMRegressor(verbose=-1, force_col_wise=True, n_estimators=10) 200 | arfs = BoostAGroota( 201 | estimator=model, 202 | cutoff=1, 203 | iters=3, 204 | max_rounds=3, 205 | delta=0.1, 206 | silent=False, 207 | importance="shap", 208 | ) 209 | arfs.fit(X, y, w) 210 | leshy_list = sorted(arfs.feature_names_in_[arfs.support_]) 211 | 212 | assert bool(set(baseline_list) & set(leshy_list)), ( 213 | "expect non-empty intersection" 214 | ) 215 | 216 | def test_boostagroota_regr_with_lgb_and_pimp_feature_importance_and_sample_weight( 217 | self, 218 | ): 219 | baseline_list = ["var0", "var1", "var2", "var3", "var4", "var5"] 220 | 221 | X, y, w = _make_corr_dataset_regression(size=500) 222 | model = lgb.LGBMRegressor(verbose=-1, force_col_wise=True, n_estimators=10) 223 | arfs = BoostAGroota( 224 | estimator=model, 225 | cutoff=1, 226 | iters=3, 227 | max_rounds=3, 228 | delta=0.1, 229 | silent=False, 230 | importance="pimp", 231 | ) 232 | arfs.fit(X, y, w) 233 | leshy_list = sorted(arfs.feature_names_in_[arfs.support_]) 234 | 235 | assert bool(set(baseline_list) & set(leshy_list)), ( 236 | "expect non-empty intersection" 237 | ) 
238 | 239 | 240 | class TestGrootCV: 241 | """ 242 | Test suite for all-relevant FS boruta-like method: Leshy 243 | """ 244 | 245 | def test_grootcv_classification_with_and_sample_weight(self): 246 | baseline_list = ["var0", "var1", "var2", "var3", "var4"] 247 | 248 | X, y, w = _make_corr_dataset_classification(size=100) 249 | arfs = GrootCV(objective="binary", cutoff=1, n_folds=3, n_iter=3, silent=False) 250 | arfs.fit(X, y, w) 251 | grootcv_list = sorted(arfs.feature_names_in_[arfs.support_]) 252 | 253 | assert bool(set(baseline_list) & set(grootcv_list)), ( 254 | "expect non-empty intersection" 255 | ) 256 | 257 | def test_grootcv_regression_with_and_sample_weight(self): 258 | baseline_list = ["var0", "var1", "var2", "var3", "var4", "var5"] 259 | 260 | X, y, w = _make_corr_dataset_regression(size=100) 261 | arfs = GrootCV(objective="l2", cutoff=1, n_folds=3, n_iter=3, silent=False) 262 | arfs.fit(X, y, w) 263 | grootcv_list = sorted(arfs.feature_names_in_[arfs.support_]) 264 | 265 | assert bool(set(baseline_list) & set(grootcv_list)), ( 266 | "expect non-empty intersection" 267 | ) 268 | -------------------------------------------------------------------------------- /tests/test_featselect.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | import numpy as np 3 | import pandas as pd 4 | from arfs.feature_selection import ( 5 | MissingValueThreshold, 6 | UniqueValuesThreshold, 7 | CardinalityThreshold, 8 | CollinearityThreshold, 9 | ) 10 | from arfs.utils import ( 11 | _make_corr_dataset_regression, 12 | _make_corr_dataset_classification, 13 | ) 14 | 15 | 16 | class TestFeatSelectMissing: 17 | """ 18 | Test suite for FeatureSelector, missing values 19 | """ 20 | 21 | def test_identify_missing_for_classification(self): 22 | # not task dependent (same for clf and regr) 23 | X, y, w = _make_corr_dataset_classification(size=10) 24 | fs = MissingValueThreshold(threshold=0.01) 25 | fs.fit(X) 26 | message = "Expected: {0}, Actual: {1}".format( 27 | "var12", fs.not_selected_features_ 28 | ) 29 | assert fs.not_selected_features_ == ["var12"], message 30 | 31 | 32 | class TestFeatSelectZeroVariance: 33 | """ 34 | Test suite for FeatureSelector, missing values 35 | """ 36 | 37 | def test_identify_single_unique_classification(self): 38 | # not task dependent (same for clf and regr) 39 | X, y, w = _make_corr_dataset_classification(size=10) 40 | fs = UniqueValuesThreshold(threshold=2) 41 | fs.fit(X) 42 | message = "Expected: {0}, Actual: {1}".format( 43 | "var10", fs.not_selected_features_ 44 | ) 45 | assert fs.not_selected_features_ == ["var10"], message 46 | 47 | 48 | class TestFeatSelectHighCardinality: 49 | """ 50 | Test suite for FeatureSelector, high cardinality 51 | """ 52 | 53 | def test_identify_high_cardinality_classification(self): 54 | # not task dependent (same for clf and regr) 55 | X, y, w = _make_corr_dataset_classification(size=100) 56 | fs = CardinalityThreshold(threshold=5) 57 | fs.fit(X) 58 | expected = sorted(["dummy", "nice_guys"]) 59 | actual = sorted(list(fs.not_selected_features_)) 60 | message = "Expected: {0}, Actual: {1}".format(expected, actual) 61 | assert actual == expected, message 62 | 63 | 64 | # class TestFeatSelectCollinearity: 65 | # """ 66 | # test suite for FeatureSelector, high cardinality 67 | # """ 68 | 69 | # def test_identify_collinear_spearman_no_encoding(self): 70 | # X, y, w = _generated_corr_dataset_regr(size=100) 71 | # fs = FeatureSelector(X=X, y=y, sample_weight=w) 72 | # 
fs.identify_collinear(correlation_threshold=0.5, encode=False, method='spearman') 73 | # message = "Expected: {0}, Actual: {1}".format(['var2', 'var3', 'var4', 'var12'], fs.ops['collinear']) 74 | # assert fs.ops['collinear'] == ['var2', 'var3', 'var4', 'var12'], message 75 | 76 | # def test_identify_collinear_pearson_no_encoding(self): 77 | # X, y, w = _generated_corr_dataset_regr(size=100) 78 | # fs = FeatureSelector(X=X, y=y, sample_weight=w) 79 | # fs.identify_collinear(correlation_threshold=0.5, encode=False, method='pearson') 80 | # message = "Expected: {0}, Actual: {1}".format(['var2', 'var3', 'var12'], fs.ops['collinear']) 81 | # assert fs.ops['collinear'] == ['var2', 'var3', 'var12'], message 82 | 83 | # def test_identify_collinear_spearman_with_encoding(self): 84 | # X, y, w = _generated_corr_dataset_regr(size=100) 85 | # fs = FeatureSelector(X=X, y=y, sample_weight=w) 86 | # fs.identify_collinear(correlation_threshold=0.5, encode=True, method='spearman') 87 | # message = "Expected: {0}, Actual: {1}".format(['var2', 'var3', 'var4', 'var12'], fs.ops['collinear']) 88 | # assert fs.ops['collinear'] == ['var2', 'var3', 'var4', 'var12'], message 89 | 90 | # def test_identify_collinear_pearson_with_encoding(self): 91 | # X, y, w = _generated_corr_dataset_regr(size=100) 92 | # fs = FeatureSelector(X=X, y=y, sample_weight=w) 93 | # fs.identify_collinear(correlation_threshold=0.5, encode=True, method='pearson') 94 | # message = "Expected: {0}, Actual: {1}".format(['var2', 'var3', 'var12'], fs.ops['collinear']) 95 | # assert fs.ops['collinear'] == ['var2', 'var3', 'var12'], message 96 | 97 | 98 | # class TestFeatSelectZeroImportance: 99 | # """ 100 | # test suite for FeatureSelector, high cardinality 101 | # """ 102 | 103 | # def test_identify_zero_importance_for_regression_with_early_stopping(self): 104 | # X, y, w = _generated_corr_dataset_regr(size=100) 105 | # fs = FeatureSelector(X=X, y=y, sample_weight=w) 106 | # fs.identify_zero_importance(task='regression', eval_metric='l2', objective='l2', n_iterations=2, 107 | # early_stopping=True) 108 | # message = "Expected: {0}, Actual: {1}".format(['var10'], fs.ops['zero_importance']) 109 | # assert 'var10' in fs.ops['zero_importance'], message 110 | 111 | # @pytest.mark.xfail 112 | # def test_identify_zero_importance_for_regression_with_early_stopping_no_eval_metric(self): 113 | # X, y, w = _generated_corr_dataset_regr(size=100) 114 | # fs = FeatureSelector(X=X, y=y, sample_weight=w) 115 | # # Xfail: expected to fail because the eval metric is not provided 116 | # fs.identify_zero_importance(task='regression', eval_metric=None, objective='l2', n_iterations=2, 117 | # early_stopping=True) 118 | # message = "Expected: {0}, Actual: {1}".format(['var10'], fs.ops['zero_importance']) 119 | # assert 'var10' in fs.ops['zero_importance'], message 120 | 121 | # @pytest.mark.xfail 122 | # def test_identify_zero_importance_for_regression_with_early_stopping_no_eval_metric_no_objective(self): 123 | # X, y, w = _generated_corr_dataset_regr(size=100) 124 | # fs = FeatureSelector(X=X, y=y, sample_weight=w) 125 | # # Xfail: expected to fail because the eval metric is not provided 126 | # fs.identify_zero_importance(task='regression', eval_metric=None, objective=None, n_iterations=2, 127 | # early_stopping=True) 128 | # message = "Expected: {0}, Actual: {1}".format(['var10'], fs.ops['zero_importance']) 129 | # assert 'var10' in fs.ops['zero_importance'], message 130 | 131 | # @pytest.mark.xfail 132 | # def 
test_identify_zero_importance_for_regression_with_early_stopping_wrong_task(self): 133 | # X, y, w = _generated_corr_dataset_regr(size=10) 134 | # fs = FeatureSelector(X=X, y=y, sample_weight=w) 135 | # # Xfail: expected to fail because the eval metric is not provided 136 | # fs.identify_zero_importance(task='classification', eval_metric='l2', objective='l2', n_iterations=2, 137 | # early_stopping=True) 138 | # message = "Expected: {0}, Actual: {1}".format(['var10'], fs.ops['zero_importance']) 139 | # assert 'var10' in fs.ops['zero_importance'], message 140 | 141 | # def test_identify_zero_importance_for_regression_without_early_stopping(self): 142 | # X, y, w = _generated_corr_dataset_regr(size=100) 143 | # fs = FeatureSelector(X=X, y=y, sample_weight=w) 144 | # fs.identify_zero_importance(task='regression', eval_metric='l2', objective='l2', n_iterations=2, 145 | # early_stopping=False) 146 | # message = "Expected: {0}, Actual: {1}".format(['var10'], fs.ops['zero_importance']) 147 | # assert 'var10' in fs.ops['zero_importance'], message 148 | 149 | # def test_identify_zero_importance_for_regression_without_early_stopping_no_objective(self): 150 | # X, y, w = _generated_corr_dataset_regr(size=100) 151 | # fs = FeatureSelector(X=X, y=y, sample_weight=w) 152 | # fs.identify_zero_importance(task='regression', n_iterations=2, 153 | # early_stopping=False) 154 | # message = "Expected: {0}, Actual: {1}".format(['var10'], fs.ops['zero_importance']) 155 | # assert 'var10' in fs.ops['zero_importance'], message 156 | 157 | # def test_identify_zero_importance_for_classification_with_early_stopping(self): 158 | # X, y, w = _make_corr_dataset_classification(size=100) 159 | # fs = FeatureSelector(X=X, y=y, sample_weight=w) 160 | # fs.identify_zero_importance(task='classification', eval_metric='auc', n_iterations=2, 161 | # early_stopping=True) 162 | # message = "Expected: {0}, Actual: {1}".format(['var10'], fs.ops['zero_importance']) 163 | # assert 'var10' in fs.ops['zero_importance'], message 164 | 165 | # @pytest.mark.xfail 166 | # def test_identify_zero_importance_for_classification_with_early_stopping_no_eval_metric(self): 167 | # X, y, w = _make_corr_dataset_classification(size=10) 168 | # fs = FeatureSelector(X=X, y=y, sample_weight=w) 169 | # # Xfail: expected to fail because the eval metric is not provided 170 | # fs.identify_zero_importance(task='classification', eval_metric=None, n_iterations=2, 171 | # early_stopping=True) 172 | # message = "Expected: {0}, Actual: {1}".format(['var10'], fs.ops['zero_importance']) 173 | # assert 'var10' in fs.ops['zero_importance'], message 174 | 175 | # @pytest.mark.xfail 176 | # def test_identify_zero_importance_for_classification_with_early_stopping_no_eval_metric_no_objective(self): 177 | # X, y, w = _make_corr_dataset_classification(size=10) 178 | # fs = FeatureSelector(X=X, y=y, sample_weight=w) 179 | # # Xfail: expected to fail because the eval metric is not provided 180 | # fs.identify_zero_importance(task='classification', eval_metric=None, objective=None, n_iterations=2, 181 | # early_stopping=True) 182 | # message = "Expected: {0}, Actual: {1}".format(['var10'], fs.ops['zero_importance']) 183 | # assert 'var10' in fs.ops['zero_importance'], message 184 | 185 | # @pytest.mark.xfail 186 | # def test_identify_zero_importance_for_classification_with_early_stopping_wrong_task(self): 187 | # X, y, w = _make_corr_dataset_classification(size=10) 188 | # fs = FeatureSelector(X=X, y=y, sample_weight=w) 189 | # # Xfail: expected to fail because the eval metric 
is not provided 190 | # fs.identify_zero_importance(task='regression', eval_metric='auc', objective='cross-entropy', n_iterations=2, 191 | # early_stopping=True) 192 | # message = "Expected: {0}, Actual: {1}".format(['var10'], fs.ops['zero_importance']) 193 | # assert 'var10' in fs.ops['zero_importance'], message 194 | 195 | # def test_identify_zero_importance_for_classification_without_early_stopping(self): 196 | # X, y, w = _make_corr_dataset_classification(size=100) 197 | # fs = FeatureSelector(X=X, y=y, sample_weight=w) 198 | # fs.identify_zero_importance(task='classification', objective='binary', n_iterations=2, 199 | # early_stopping=False) 200 | # message = "Expected: {0}, Actual: {1}".format(['var10'], fs.ops['zero_importance']) 201 | # assert 'var10' in fs.ops['zero_importance'], message 202 | 203 | # def test_identify_zero_importance_for_classification_without_early_stopping_no_objective(self): 204 | # X, y, w = _make_corr_dataset_classification(size=100) 205 | # fs = FeatureSelector(X=X, y=y, sample_weight=w) 206 | # fs.identify_zero_importance(task='classification', n_iterations=2, 207 | # early_stopping=False) 208 | # message = "Expected: {0}, Actual: {1}".format(['var10'], fs.ops['zero_importance']) 209 | # assert 'var10' in fs.ops['zero_importance'], message 210 | 211 | 212 | # class TestFeatSelectLowImportance: 213 | # """ 214 | # test suite for FeatureSelector, high cardinality 215 | # """ 216 | 217 | # def test_identify_low_importance_for_regression_with_early_stopping(self): 218 | # X, y, w = _generated_corr_dataset_regr(size=100) 219 | # fs = FeatureSelector(X=X, y=y, sample_weight=w) 220 | # fs.identify_zero_importance(task='regression', eval_metric='l2', objective='l2', n_iterations=2, 221 | # early_stopping=True) 222 | # cum_imp_threshold = 0.95 223 | # fs.identify_low_importance(cumulative_importance=cum_imp_threshold) 224 | # expected = 1 225 | # message = "Expected at least one predictor ruled out, Actual: {0}".format(sorted(fs.ops['low_importance'])) 226 | # assert len(fs.ops['low_importance']) >= expected, message 227 | 228 | # @pytest.mark.xfail 229 | # def test_identify_low_importance_for_regression_with_early_stopping_no_eval_metric(self): 230 | # X, y, w = _generated_corr_dataset_regr(size=100) 231 | # fs = FeatureSelector(X=X, y=y, sample_weight=w) 232 | # # Xfail: expected to fail because the eval metric is not provided 233 | # fs.identify_zero_importance(task='regression', eval_metric=None, objective='l2', n_iterations=2, 234 | # early_stopping=True) 235 | # cum_imp_threshold = 0.95 236 | # fs.identify_low_importance(cumulative_importance=cum_imp_threshold) 237 | # expected = 1 238 | # message = "Expected at least one predictor ruled out, Actual: {0}".format(sorted(fs.ops['low_importance'])) 239 | # assert len(fs.ops['low_importance']) >= expected, message 240 | 241 | # @pytest.mark.xfail 242 | # def test_identify_low_importance_for_regression_with_early_stopping_no_eval_metric_no_objective(self): 243 | # X, y, w = _generated_corr_dataset_regr(size=100) 244 | # fs = FeatureSelector(X=X, y=y, sample_weight=w) 245 | # # Xfail: expected to fail because the eval metric is not provided 246 | # fs.identify_zero_importance(task='regression', eval_metric=None, objective=None, n_iterations=2, 247 | # early_stopping=True) 248 | # cum_imp_threshold = 0.95 249 | # fs.identify_low_importance(cumulative_importance=cum_imp_threshold) 250 | # expected = 1 251 | # message = "Expected at least one predictor ruled out, Actual: {0}".format(sorted(fs.ops['low_importance'])) 
252 | # assert len(fs.ops['low_importance']) >= expected, message 253 | 254 | # @pytest.mark.xfail 255 | # def test_identify_low_importance_for_regression_with_early_stopping_wrong_task(self): 256 | # X, y, w = _generated_corr_dataset_regr(size=100) 257 | # fs = FeatureSelector(X=X, y=y, sample_weight=w) 258 | # # Xfail: expected to fail because the eval metric is not provided 259 | # fs.identify_zero_importance(task='classification', eval_metric='l2', objective='l2', n_iterations=2, 260 | # early_stopping=True) 261 | # cum_imp_threshold = 0.95 262 | # fs.identify_low_importance(cumulative_importance=cum_imp_threshold) 263 | # expected = 1 264 | # message = "Expected at least one predictor ruled out, Actual: {0}".format(sorted(fs.ops['low_importance'])) 265 | # assert len(fs.ops['low_importance']) >= expected, message 266 | 267 | # def test_identify_low_importance_for_regression_without_early_stopping(self): 268 | # X, y, w = _generated_corr_dataset_regr(size=100) 269 | # fs = FeatureSelector(X=X, y=y, sample_weight=w) 270 | # fs.identify_zero_importance(task='regression', objective='l2', n_iterations=2, early_stopping=False) 271 | # cum_imp_threshold = 0.95 272 | # fs.identify_low_importance(cumulative_importance=cum_imp_threshold) 273 | # expected = 1 274 | # message = "Expected at least one predictor ruled out, Actual: {0}".format(sorted(fs.ops['low_importance'])) 275 | # assert len(fs.ops['low_importance']) >= expected, message 276 | 277 | # def test_identify_low_importance_for_regression_without_early_stopping_no_objective(self): 278 | # X, y, w = _generated_corr_dataset_regr(size=100) 279 | # fs = FeatureSelector(X=X, y=y, sample_weight=w) 280 | # fs.identify_zero_importance(task='regression', n_iterations=2, early_stopping=False) 281 | # cum_imp_threshold = 0.95 282 | # fs.identify_low_importance(cumulative_importance=cum_imp_threshold) 283 | # expected = 1 284 | # message = "Expected at least one predictor ruled out, Actual: {0}".format(sorted(fs.ops['low_importance'])) 285 | # assert len(fs.ops['low_importance']) >= expected, message 286 | 287 | # def test_identify_low_importance_for_classification_with_early_stopping(self): 288 | # X, y, w = _make_corr_dataset_classification(size=100) 289 | # fs = FeatureSelector(X=X, y=y, sample_weight=w) 290 | # fs.identify_zero_importance(task='classification', eval_metric='auc', n_iterations=2, early_stopping=True) 291 | # cum_imp_threshold = 0.95 292 | # fs.identify_low_importance(cumulative_importance=cum_imp_threshold) 293 | # expected = 1 294 | # message = "Expected at least one predictor ruled out, Actual: {0}".format(sorted(fs.ops['low_importance'])) 295 | # assert len(fs.ops['low_importance']) >= expected, message 296 | 297 | # @pytest.mark.xfail 298 | # def test_identify_low_importance_for_classification_with_early_stopping_no_eval_metric(self): 299 | # X, y, w = _make_corr_dataset_classification(size=100) 300 | # fs = FeatureSelector(X=X, y=y, sample_weight=w) 301 | # # Xfail: expected to fail because the eval metric is not provided 302 | # fs.identify_zero_importance(task='classification', eval_metric=None, n_iterations=2, 303 | # early_stopping=True) 304 | # cum_imp_threshold = 0.95 305 | # fs.identify_low_importance(cumulative_importance=cum_imp_threshold) 306 | # expected = 1 307 | # message = "Expected at least one predictor ruled out, Actual: {0}".format(sorted(fs.ops['low_importance'])) 308 | # assert len(fs.ops['low_importance']) >= expected, message 309 | 310 | # @pytest.mark.xfail 311 | # def 
test_identify_low_importance_for_classification_with_early_stopping_no_eval_metric_no_objective(self): 312 | # X, y, w = _make_corr_dataset_classification(size=100) 313 | # fs = FeatureSelector(X=X, y=y, sample_weight=w) 314 | # # Xfail: expected to fail because the eval metric is not provided 315 | # fs.identify_zero_importance(task='classification', eval_metric=None, objective=None, n_iterations=2, 316 | # early_stopping=True) 317 | # cum_imp_threshold = 0.95 318 | # fs.identify_low_importance(cumulative_importance=cum_imp_threshold) 319 | # expected = 1 320 | # message = "Expected at least one predictor ruled out, Actual: {0}".format(sorted(fs.ops['low_importance'])) 321 | # assert len(fs.ops['low_importance']) >= expected, message 322 | 323 | # @pytest.mark.xfail 324 | # def test_identify_low_importance_for_classification_with_early_stopping_wrong_task(self): 325 | # X, y, w = _make_corr_dataset_classification(size=100) 326 | # fs = FeatureSelector(X=X, y=y, sample_weight=w) 327 | # # Xfail: expected to fail because the eval metric is not provided 328 | # fs.identify_zero_importance(task='regression', eval_metric='auc', objective='cross-entropy', n_iterations=2, 329 | # early_stopping=True) 330 | # cum_imp_threshold = 0.95 331 | # fs.identify_low_importance(cumulative_importance=cum_imp_threshold) 332 | # expected = 1 333 | # message = "Expected at least one predictor ruled out, Actual: {0}".format(sorted(fs.ops['low_importance'])) 334 | # assert len(fs.ops['low_importance']) >= expected, message 335 | 336 | # def test_identify_low_importance_for_classification_without_early_stopping(self): 337 | # X, y, w = _make_corr_dataset_classification(size=100) 338 | # fs = FeatureSelector(X=X, y=y, sample_weight=w) 339 | # fs.identify_zero_importance(task='classification', n_iterations=2, early_stopping=False) 340 | # cum_imp_threshold = 0.95 341 | # fs.identify_low_importance(cumulative_importance=cum_imp_threshold) 342 | # expected = 1 343 | # message = "Expected at least one predictor ruled out, Actual: {0}".format(sorted(fs.ops['low_importance'])) 344 | # assert len(fs.ops['low_importance']) >= expected, message 345 | --------------------------------------------------------------------------------
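A minimal usage sketch of the Gower utilities from src/arfs/sampling.py shown above. The import
path arfs.sampling is inferred from the src layout, and the toy DataFrame and column names are
illustrative only; treat this as a sketch under those assumptions rather than documented API usage.

import pandas as pd

from arfs.sampling import gower_matrix, gower_topn  # assumed import path (src/arfs/sampling.py)

# a small mixed-type frame: two numeric columns and one categorical (object) column
df = pd.DataFrame(
    {
        "age": [25, 32, 47, 51, 38, 29],
        "income": [32_000, 54_000, 61_000, 75_000, 48_000, 39_000],
        "city": ["lyon", "paris", "paris", "lille", "lyon", "paris"],
    }
)

# pairwise Gower distances; categorical columns are auto-detected from the dtypes
dist = gower_matrix(df, cat_features="auto")
print(dist.shape)  # (6, 6), with zeros on the diagonal

# the 3 rows most similar to the first row; the query must be a single-row DataFrame
query = df.iloc[[0]]
top = gower_topn(df, query, n=3)
print(top["index"], top["values"])  # indices and Gower distances of the 3 closest rows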