├── .github ├── FUNDING.yml ├── dependabot.yml └── workflows │ └── ruff.yml ├── .gitignore ├── .readthedocs.yml ├── .vs ├── VSWorkspaceState.json ├── allrelevantfs │ └── v17 │ │ └── .suo └── slnx.sqlite ├── CHANGELOG.md ├── CITATION.cff ├── LICENSE.md ├── README.md ├── docs ├── Introduction.rst ├── Makefile ├── Methods overview.rst ├── arfs.feature_selection.rst ├── arfs.rst ├── boostaroota.png ├── boruta.png ├── conf.py ├── grootcv.png ├── index.rst ├── logo.png ├── make.bat ├── modules.rst ├── notebooks │ ├── arfs_boruta_borutaShap_comparison.ipynb │ ├── arfs_classification.ipynb │ ├── arfs_grootcv_custom_params.ipynb │ ├── arfs_large_data_sampling.ipynb │ ├── arfs_non_normal_loss_and_sample_weight.ipynb │ ├── arfs_on_GPU.ipynb │ ├── arfs_regression.ipynb │ ├── arfs_shap_vs_fastshap.ipynb │ ├── arfs_timeseries.ipynb │ ├── association_and_feature_selection.ipynb │ ├── basic_feature_selection.ipynb │ ├── bender_hex_mini.png │ ├── issue_categoricals.ipynb │ ├── issue_collinearity.ipynb │ ├── lasso_feature_selection.ipynb │ ├── mrmr_feature_selection.ipynb │ ├── mrmr_fs_VS_arfs.ipynb │ └── preprocessing.ipynb └── requirements.txt ├── images ├── boostagroota-boston-lgb.png ├── grootcv-boston.png ├── leshy-boston.png ├── leshy-titanic-catboost-shap.png ├── leshy-titanic-lgbm-shap.png └── leshy-titanic-rndforest-shap.png ├── logo.png ├── pyproject.toml ├── src └── arfs │ ├── .gitignore │ ├── __init__.py │ ├── association.py │ ├── benchmark.py │ ├── dataset │ ├── data │ │ ├── boston_bunch.joblib │ │ └── housing.zip │ └── descr │ │ └── housing.rst │ ├── feature_selection │ ├── __init__.py │ ├── allrelevant.py │ ├── base.py │ ├── lasso.py │ ├── mrmr.py │ ├── summary.py │ ├── unsupervised.py │ └── variable_importance.py │ ├── gbm.py │ ├── parallel.py │ ├── preprocessing.py │ ├── sampling.py │ └── utils.py ├── tests ├── __init__.py ├── test_allrelevant.py └── test_featselect.py └── uv.lock /.github/FUNDING.yml: -------------------------------------------------------------------------------- 1 | # These are supported funding model platforms 2 | 3 | github: # Replace with up to 4 GitHub Sponsors-enabled usernames e.g., [user1, user2] 4 | patreon: # Replace with a single Patreon username 5 | open_collective: # Replace with a single Open Collective username 6 | ko_fi: V7V72SOHX 7 | tidelift: # Replace with a single Tidelift platform-name/package-name e.g., npm/babel 8 | community_bridge: # Replace with a single Community Bridge project-name e.g., cloud-foundry 9 | liberapay: # Replace with a single Liberapay username 10 | issuehunt: # Replace with a single IssueHunt username 11 | otechie: # Replace with a single Otechie username 12 | custom: # Replace with up to 4 custom sponsorship URLs e.g., ['link1', 'link2'] 13 | -------------------------------------------------------------------------------- /.github/dependabot.yml: -------------------------------------------------------------------------------- 1 | # To get started with Dependabot version updates, you'll need to specify which 2 | # package ecosystems to update and where the package manifests are located. 
3 | # Please see the documentation for all configuration options: 4 | # https://help.github.com/github/administering-a-repository/configuration-options-for-dependency-updates 5 | 6 | version: 2 7 | updates: 8 | - package-ecosystem: "pip" 9 | directory: "/" 10 | schedule: 11 | interval: "weekly" 12 | allow: 13 | # Allow only direct updates for 14 | # Django and any packages starting "django" 15 | - dependency-name: "django*" 16 | dependency-type: "direct" 17 | # Allow only production updates for Sphinx 18 | - dependency-name: "sphinx" 19 | dependency-type: "production" 20 | -------------------------------------------------------------------------------- /.github/workflows/ruff.yml: -------------------------------------------------------------------------------- 1 | name: Ruff 2 | on: [ push, pull_request ] 3 | jobs: 4 | ruff: 5 | runs-on: ubuntu-latest 6 | steps: 7 | - uses: actions/checkout@v4 8 | - uses: astral-sh/ruff-action@v3 -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | ### Python template 2 | # example NB 3 | examples/catboost_info/ 4 | examples/.ipynb_checkpoints/ 5 | examples/cb_model.json 6 | 7 | # Byte-compiled / optimized / DLL files 8 | __pycache__/ 9 | *.py[cod] 10 | *$py.class 11 | 12 | # C extensions 13 | *.so 14 | 15 | # ide 16 | .idea/ 17 | .vscode/ 18 | 19 | # Distribution / packaging 20 | .Python 21 | build/ 22 | develop-eggs/ 23 | dist/ 24 | downloads/ 25 | eggs/ 26 | .eggs/ 27 | lib/ 28 | lib64/ 29 | parts/ 30 | sdist/ 31 | var/ 32 | wheels/ 33 | *.egg-info/ 34 | .installed.cfg 35 | *.egg 36 | MANIFEST 37 | 38 | # PyInstaller 39 | # Usually these files are written by a python script from a template 40 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
41 | *.manifest 42 | *.spec 43 | 44 | # Installer logs 45 | pip-log.txt 46 | pip-delete-this-directory.txt 47 | 48 | # Unit test / coverage reports 49 | htmlcov/ 50 | .tox/ 51 | .coverage 52 | .coverage.* 53 | .cache 54 | nosetests.xml 55 | coverage.xml 56 | *.cover 57 | .hypothesis/ 58 | .pytest_cache/ 59 | 60 | # Translations 61 | *.mo 62 | *.pot 63 | 64 | # Django stuff: 65 | *.log 66 | local_settings.py 67 | db.sqlite3 68 | 69 | # Flask stuff: 70 | instance/ 71 | .webassets-cache 72 | 73 | # Scrapy stuff: 74 | .scrapy 75 | 76 | # Sphinx documentation 77 | docs/_build/ 78 | 79 | # PyBuilder 80 | target/ 81 | 82 | # Jupyter Notebook 83 | .ipynb_checkpoints 84 | 85 | # pyenv 86 | .python-version 87 | 88 | # celery beat schedule file 89 | celerybeat-schedule 90 | 91 | # SageMath parsed files 92 | *.sage.py 93 | 94 | # Environments 95 | .env 96 | .venv 97 | env/ 98 | venv/ 99 | ENV/ 100 | env.bak/ 101 | venv.bak/ 102 | 103 | # Spyder project settings 104 | .spyderproject 105 | .spyproject 106 | 107 | # Rope project settings 108 | .ropeproject 109 | 110 | # mkdocs documentation 111 | /site 112 | 113 | # mypy 114 | .mypy_cache/ 115 | 116 | # Covers JetBrains IDEs: IntelliJ, RubyMine, PhpStorm, AppCode, PyCharm, CLion, Android Studio and WebStorm 117 | # Reference: https://intellij-support.jetbrains.com/hc/en-us/articles/206544839 118 | 119 | # User-specific stuff 120 | .idea/**/workspace.xml 121 | .idea/**/tasks.xml 122 | .idea/**/dictionaries 123 | .idea/**/shelf 124 | 125 | # Sensitive or high-churn files 126 | .idea/**/dataSources/ 127 | .idea/**/dataSources.ids 128 | .idea/**/dataSources.local.xml 129 | .idea/**/sqlDataSources.xml 130 | .idea/**/dynamic.xml 131 | .idea/**/uiDesigner.xml 132 | .idea/**/dbnavigator.xml 133 | 134 | # Gradle 135 | .idea/**/gradle.xml 136 | .idea/**/libraries 137 | 138 | # CMake 139 | cmake-build-debug/ 140 | cmake-build-release/ 141 | 142 | # Mongo Explorer plugin 143 | .idea/**/mongoSettings.xml 144 | 145 | # File-based project format 146 | *.iws 147 | 148 | # IntelliJ 149 | out/ 150 | 151 | # mpeltonen/sbt-idea plugin 152 | .idea_modules/ 153 | 154 | # JIRA plugin 155 | atlassian-ide-plugin.xml 156 | 157 | # Cursive Clojure plugin 158 | .idea/replstate.xml 159 | 160 | # Crashlytics plugin (for Android Studio and IntelliJ) 161 | com_crashlytics_export_strings.xml 162 | crashlytics.properties 163 | crashlytics-build.properties 164 | fabric.properties 165 | 166 | # Editor-based Rest Client 167 | .idea/httpRequests 168 | -------------------------------------------------------------------------------- /.readthedocs.yml: -------------------------------------------------------------------------------- 1 | # Read the Docs configuration file for Sphinx projects 2 | # See https://docs.readthedocs.io/en/stable/config-file/v2.html for details 3 | 4 | # Required 5 | version: 2 6 | 7 | # Set the OS, Python version and other tools you might need 8 | build: 9 | os: "ubuntu-22.04" 10 | tools: 11 | python: "3.10" 12 | jobs: 13 | post_install: 14 | - pip uninstall -y sphinx-rtd-theme 15 | 16 | # Build documentation in the "docs/" directory with Sphinx 17 | sphinx: 18 | configuration: docs/conf.py 19 | 20 | # We recommend specifying your dependencies to enable reproducible builds: 21 | # https://docs.readthedocs.io/en/stable/guides/reproducible-builds.html 22 | python: 23 | install: 24 | - requirements: docs/requirements.txt 25 | - method: pip 26 | path: . 
27 | extra_requirements: 28 | - docs 29 | -------------------------------------------------------------------------------- /.vs/VSWorkspaceState.json: -------------------------------------------------------------------------------- 1 | { 2 | "ExpandedNodes": [ 3 | "", 4 | "\\arfs" 5 | ], 6 | "SelectedNode": "\\arfs\\allrelevant.py", 7 | "PreviewInSolutionExplorer": false 8 | } -------------------------------------------------------------------------------- /.vs/allrelevantfs/v17/.suo: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ThomasBury/arfs/03f67d0a54b69fac5ddbb83e306c8e8e72e2d3a2/.vs/allrelevantfs/v17/.suo -------------------------------------------------------------------------------- /.vs/slnx.sqlite: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ThomasBury/arfs/03f67d0a54b69fac5ddbb83e306c8e8e72e2d3a2/.vs/slnx.sqlite -------------------------------------------------------------------------------- /CHANGELOG.md: -------------------------------------------------------------------------------- 1 | # Changes 2 | 3 | # 3.0.0 4 | 5 | - [ENHANCEMENT] Upgrade to newer SHAP and lightgbm version 6 | - [ENHANCEMENT] Migrate project management to `uv` 7 | 8 | # 2.4.0 9 | - [BUG] Add a safety for the array size in the weighted correlation ratio 10 | - [DOC] Contribution for better documentation, typos and fixing docstrings 11 | 12 | # 2.3.3 13 | - [BUG] Fix range, which should run from 1 to `max_iter` 14 | 15 | # 2.3.2 16 | - [BUG] Fix errors generated when updating dependencies with different naming for arguments 17 | 18 | # 2.3.1 19 | - [BUG] replace np.Inf by np.inf for compatibility purpose 20 | 21 | # 2.3.0 22 | - [BUG] corrected the column names for the GrootCV scheme, setting the shadow var in last position to guarantee the real names are used 23 | - [ENHANCEMENT] support user defined cross-validation scheme for time series applications for GrootCV 24 | 25 | # 2.2.6 26 | - [BUG] fix the calculation of the SHAP feature importance for multi-class 27 | - [ENHANCEMENT] Update pandas aggregation to get rid of the future deprecation warnings 28 | 29 | # 2.2.5 30 | - [BUG] fix the calculation of the SHAP feature importance for multi-class 31 | - [ENHANCEMENT] return the feature for the importance 32 | 33 | # 2.2.4 34 | - [BUG] add axis=1 to compute the max on the right dimension in _reduce_vars_sklearn 35 | - [BUG] remove merge causing duplication of the feature importance in _reduce_vars_sklearn 36 | 37 | # 2.2.3 38 | - [BUG] change the default of the weighted correlation for consistency with existing doc 39 | - [ENHANCEMENTS] speedup the correlation feature selector 40 | # 2.2.1 41 | - [BUG] add copy() to prevent modifying the input pandas DF in the mrmr when fitting the mrmr selector 42 | # 2.2.0 43 | - [BUG] fix the collinearity feature elimination 44 | - [BUG] fix the feature importance if fasttreeshap not installed 45 | - [REFACTORING] refactor the association module for removing redundancy and faster computation 46 | # 2.1.3 47 | - [BUG] fix the hardcoded threshold in collinearity elimination, closes #33 48 | # 2.1.2 49 | - [BUG] fix a bug in computing the association matrix when a single column of a specific dtype is passed in the sub_matrix (nom-nom, num-num) calculators. 
50 | # 2.1.1 51 | - Refactor TreeDiscretizer 52 | # 2.1.0 53 | - Add a mechanism to the TreeDiscretizer that restricts the length of combined strings for categorical columns, preventing excessively lengthy entries. 54 | # 2.0.7 55 | - implement link for the lasso feature selection, e.g. log for ensuring positivity 56 | # 2.0.6 57 | - downgrade the lightgbm version to 3.3.1 for compatibility reasons (with optuna for instance) 58 | ## 2.0.5 59 | - Fix: strictly greater than threshold rather than geq in the base threshold transformer 60 | - Update: due to a change in the lightgbm train API (v4), update the code for GBM 61 | ## 2.0.4 62 | - Documentation: fix the format of some docstrings and remove old sphinx generated files 63 | ## 2.0.3 64 | - Fix: remove unnecessary `__all__` in the preprocessing module and improve the consistency of the module docstrings 65 | ## 2.0.2 66 | - Fix: when the L1 == 0 in fit_regularized, statsmodels returns the regularized wrapper without refit, which breaks the class (statistics not available) 67 | ## 2.0.1 68 | - Build: remove explicit dependencies on holoviews and panel 69 | ## 2.0.0 70 | - Add fasttreeshap implementation as an option to compute shap importance (fasttreeshap does not work with XGBoost though) 71 | - New feature: lasso feature selection, especially useful for models without interactions (LM, GLM, GAM) 72 | - New feature: pass lightgbm parameters to GrootCV 73 | - Bug: fix sample weight shape in mrMR 74 | - Documentation: update and upgrade tuto NB 75 | ## 1.1.4 76 | - update the required python version >= 3.9 77 | ## 1.1.3 78 | - Change tqdm to auto for better rendering in NB for variable importance selector 79 | - User defined n_jobs for association matrix computation 80 | ## 1.1 81 | 82 | - Corrected an issue in Leshy that occurred when using categorical variables. The use of NumPy functions and methods instead of Pandas ones resulted in the modification of original data types. 83 | 84 | ## 1.0.7 85 | 86 | - Patch preventing zero division in the conditional entropy calculation 87 | 88 | ## 1.0.6 89 | 90 | - Return self in mrmr, fixing error when in scikit-learn pipeline 91 | 92 | ## 1.0.5 93 | 94 | - Patching classes where old unused argument was causing an error 95 | 96 | ## 1.0.2 97 | 98 | - Distribute a toy dataset for regression by modifying the Boston dataset adding noise and made up columns 99 | 100 | ## 1.0.1 101 | 102 | - Fix pkg data distribution 103 | 104 | ## 1.0.0 105 | 106 | - Parallelization of functions applied on pandas data frame 107 | - Faster and more modular association measures 108 | - Removing dependencies (e.g. 
dython) 109 | - Better static and interactive visualization 110 | - Sklearn selectors rather than a big class 111 | - Discretization of continuous and categorical predictors 112 | - Minimal redundancy maximal relevance feature selection added (a subset of all relevant predictors), based on Uber's MRmr flavor 113 | - architecture closer to the scikit-learn one 114 | 115 | ## 0.3.8 116 | 117 | - Fix bug when compute shap importance for classifier in GrootCV 118 | 119 | ## 0.3.7 120 | 121 | - Add defensive check if no categorical found in the subsampling of the dataset 122 | - Re-run the notebooks with the new version 123 | ## 0.3.6 124 | 125 | - Fix clustering when plotting only strongly correlated predictors 126 | - Remove palettable dependencies for plotting 127 | - Add default colormap but implement the user defined option 128 | ## 0.3.5 129 | 130 | - Enable clustering before plotting the correlation/association matrix, optional 131 | - Decrease fontsize for the lables of the correlation matrix 132 | 133 | ## 0.3.4 134 | 135 | - Update requirements 136 | 137 | ## 0.3.3 138 | 139 | - Upgrade documentation 140 | 141 | ## 0.3.2 142 | 143 | - Fix typo for distributing the dataset and pinned the dependencies 144 | ## 0.3.1 145 | 146 | - Update the syntax for computing associations using the latest version of dython 147 | 148 | ## 0.3.0 149 | 150 | - Fix the Boruta_py feature counts, now adds up to n_features 151 | - Fix the boxplot colours, when only rejected and accepted (no tentative) the background color was the tentative color 152 | - Numpy docstring style 153 | - Implement the new lightGBM callbacks. The new lgbm version (>3.3.0) implements the early stopping using a callback rather than an argument 154 | - Fix a bug for computing the shap importance when the estimator is lightGBM and the task is classification 155 | - Add ranking and absolute ranking attributes for all the classes 156 | - Fix future pandas TypeError when computing numerical values on a dataframe containing non-numerical columns 157 | - Add housing data to the distribution 158 | - Add "extreme" sampling methods 159 | - Re-run the NBs 160 | - reindex to keep the original columns order 161 | 162 | ## 0.2.3 163 | 164 | - Update syntax to stick to the new argument names in Dython 165 | 166 | ## 0.2.2 167 | 168 | - Check if no feature selected, warn rather than throw error 169 | 170 | ## 0.2.1 171 | 172 | - Fix a bug when removing collinear columns 173 | 174 | ## 0.2.0 175 | 176 | - Prefilters now support the filtering of continuous and nominal (categorical) collinear variables 177 | 178 | ## 0.1.6 179 | 180 | - improve the plot_y_vs_X function 181 | - remove gc.collect() 182 | 183 | ## 0.1.5 184 | 185 | - fix readme (typos) 186 | - move utilities in utils sub-package 187 | - make unit tests lighter 188 | 189 | ## 0.1.4 190 | 191 | - fix bug when using catboost, clone estimator (avoid error and be sure to use a non-fitted estimator) 192 | 193 | ## 0.1.3 194 | 195 | - change the defaut for categorical encoding in pre-filters (pd.cat to integers as default) 196 | - fix the unit tests with new defaults and names 197 | 198 | ## 0.1.2 199 | 200 | - change arguments name in pre-filters 201 | 202 | ## 0.1.1 203 | 204 | - remove old attribute names in unit-tests 205 | 206 | ## 0.1.0 207 | 208 | - Fix lightGBM warnings 209 | - Typo in repr 210 | - Provide load_data utility 211 | - Enhance jupyter NB examples 212 | - highlighting synthetic random predictors 213 | - Benchmark using sklearn permutation importance 214 | - Harmonization of 
the attributes and parameters 215 | - Fix categoricals handling 216 | 217 | ## 0.0.4 218 | 219 | - setting optimal number of features (according to "Elements of statistical learning") when using lightGBM random forest boosting. 220 | - Providing random forest, lightgbm implementation, estimators 221 | 222 | ## 0.0.3 223 | 224 | - Adding examples and expanding documentation 225 | 226 | ## 0.0.2 227 | 228 | - fix bug: relative import removed -------------------------------------------------------------------------------- /CITATION.cff: -------------------------------------------------------------------------------- 1 | cff-version: 1.2.0 2 | abstract: >- 3 | All relevant feature selection means trying to find all features carrying information usable for prediction, rather than finding a possibly compact subset of features on which some particular model has a minimal error. This might include redundant predictors. 4 | title: All relevant feature selection 5 | message: >- 6 | If you use this software, please cite it using the metadata from this file. 7 | type: software 8 | authors: 9 | - given-names: Thomas 10 | family-names: Bury 11 | orcid: 'https://orcid.org/0000-0003-1421-4184' 12 | keywords: 13 | - "Feature Selection" 14 | - "All Relevant Feature Selection" 15 | - "Machine Learning" 16 | license: MIT License 17 | url: 'https://github.com/ThomasBury/arfs' 18 | version: 3.0.0 19 | date-released: 2021-12-18 20 | -------------------------------------------------------------------------------- /LICENSE.md: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) [2020] [Thomas Bury] 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
-------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | drawing 2 | 3 | [buy me caffeine](https://ko-fi.com/V7V72SOHX) 4 | 5 | [![PyPI version](https://badge.fury.io/py/arfs.svg)](https://badge.fury.io/py/arfs) [![Downloads](https://static.pepy.tech/personalized-badge/arfs?period=total&units=international_system&left_color=grey&right_color=yellow&left_text=Downloads)](https://pepy.tech/project/arfs) [![Documentation Status](https://readthedocs.org/projects/arfs/badge/?version=latest)](https://arfs.readthedocs.io/en/latest/?badge=latest) [![Code Style](https://img.shields.io/badge/code%20style-black-black)](https://img.shields.io/badge/code%20style-black-black) 6 | 7 | 8 | [ARFS readthedocs](https://arfs.readthedocs.io/en/latest/#) 9 | 10 | # All relevant feature selection 11 | 12 | All relevant feature selection means trying to find all features carrying information usable for prediction, rather than finding a possibly compact subset of features on which some particular model has a minimal error. This might include redundant predictors. All relevant feature selection is model agnostic in the sense that it doesn't optimize a scoring function for a *specific* model but rather tries to select all the predictors which are related to the response. 13 | 14 | This package implements 3 different methods (Leshy is an evolution of Boruta, BoostAGroota is an evolution of BoostARoota, and GrootCV is a new method). They are sklearn compatible. See below for details about these methods. You can use any sklearn-compatible estimator with Leshy and BoostAGroota, but I recommend lightGBM: it is fast, accurate, and has built-in SHAP values. The package also provides a module for preprocessing and basic feature selection (autobinning, removing columns with too many missing values, zero variance, high cardinality, high correlation, etc.). Examples and detailed methods are provided below. 15 | 16 | Moreover, as an alternative to the all-relevant problem, the ARFS package provides an MRmr feature selection which, theoretically, returns a subset of the predictors selected by an ARFS method. ARFS also provides a `LASSO` feature selection which works especially well for (G)LMs and GAMs. You can combine Lasso with the `TreeDiscretizer` to introduce non-linearities into linear models and perform feature selection. 17 | 18 | Please note that one limitation of the lasso is that it treats the levels of a categorical predictor individually. However, this issue can be addressed by utilizing the `TreeDiscretizer`, which automatically bins numerical variables and groups the levels of categorical variables. 19 | 20 | ## Installation 21 | 22 | `$ pip install arfs` 23 | 24 | REM: If you're interested in using the `fastshap` option, you'll need to install [fasttreeshap](https://github.com/linkedin/FastTreeSHAP) first. For a smooth installation process, I suggest using `conda install -c conda-forge fasttreeshap`, since the C++ source code requires compilation. Using pip may involve additional dependencies, such as requiring Visual Studio (VS) to compile the C++ code.
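A minimal quick-start, adapted from the bundled tutorial notebooks (e.g. `docs/notebooks/arfs_on_GPU.ipynb`); the toy dataset loader and the parameter values mirror those notebooks, see the API reference for the full list of options:

```python
import matplotlib.pyplot as plt
from arfs.feature_selection import GrootCV
from arfs.utils import load_data

# Toy regression data shipped with the package (a modified Boston housing set
# with synthetic noisy and genuine columns added)
boston = load_data(name="Boston")
X, y = boston.data, boston.target

# Cross-validated, SHAP-based all-relevant feature selection
feat_selector = GrootCV(
    objective="rmse",
    cutoff=1,
    n_folds=5,
    n_iter=5,
    silent=True,
    fastshap=False,  # set to True only if fasttreeshap is installed
)
feat_selector.fit(X, y, sample_weight=None)

print(f"Selected features: {feat_selector.get_feature_names_out()}")
fig = feat_selector.plot_importance(n_feat_per_inch=5)
plt.show()
```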
25 | 26 | ## Example 27 | 28 | Working examples for: 29 | 30 | - [Preprocessing](./docs/notebooks/preprocessing.ipynb) 31 | - [Basic FS (best before ARFS)](./docs/notebooks/basic_feature_selection.ipynb) 32 | - [Regression](./docs/notebooks/arfs_regression.ipynb) 33 | - [Classification](./docs/notebooks/arfs_classification.ipynb) 34 | - [LASSO and (G)LM feature selection](./docs/notebooks/lasso_feature_selection.ipynb) 35 | - [Passing custom params](./docs/notebooks/arfs_grootcv_custom_params.ipynb) 36 | - [Non-normal loss and sample weights](./docs/notebooks/arfs_non_normal_loss_and_sample_weight.ipynb) 37 | - [ARFS on GPU](./docs/notebooks/arfs_on_GPU.ipynb) 38 | - [Fast Shap](./docs/notebooks/arfs_shap_vs_fastshap.ipynb) 39 | - [Categoricals](./docs/notebooks/issue_categoricals.ipynb) 40 | - [Collinearity](./docs/notebooks/issue_collinearity.ipynb) 41 | - [Reducing run time for large data](./docs/notebooks/arfs_large_data_sampling.ipynb) 42 | - [Comparison to Boruta and BorutaShap](./docs/notebooks/arfs_boruta_borutaShap_comparison.ipynb) 43 | - [MRmr alternative](./docs/notebooks/mrmr_feature_selection.ipynb) 44 | - [MRmr vs ARFS](./docs/notebooks/mrmr_fs_VS_arfs.ipynb) 45 | 46 | For imbalanced classification: 47 | - GrootCV will automatically detect imbalanced data and set the lightGBM `'is_unbalance' = True` 48 | - For Leshy and BoostAGroota, you can pass the estimator with the relevant parameter (e.g. `class_weight = 'balanced'`) 49 | 50 | 51 | 52 | ## Boruta 53 | 54 | The Boruta algorithm tries to capture all the important features you might have in your dataset with respect to an outcome variable. The procedure is as follows: 55 | 56 | * Create duplicate copies of all independent variables. When the number of independent variables in the original data is less than 5, create at least 5 copies using existing variables. 57 | * Shuffle the values of the added duplicate copies to remove their correlations with the target variable. These are called shadow features or permuted copies. 58 | * Combine the original variables with their shuffled copies. 59 | * Run a random forest classifier on the combined dataset and compute a variable importance measure (the default is Mean Decrease Accuracy) to evaluate the importance of each variable, where higher means more important. 60 | * A Z score is then computed: the mean of the accuracy loss divided by its standard deviation. 61 | * Find the maximum Z score among shadow attributes (MZSA). 62 | * Tag the variables as 'unimportant' when they have importance significantly lower than MZSA. Then we permanently remove them from the process. 63 | * Tag the variables as 'important' when they have importance significantly higher than MZSA. 64 | * Repeat the above steps for a predefined number of iterations (random forest runs), or until all attributes are either tagged 'unimportant' or 'important', whichever comes first. 65 | 66 | At every iteration, the algorithm compares the Z-scores of the shuffled copies of the features and the original features to see if the latter performed better than the former. If so, the algorithm marks the feature as important. In essence, the algorithm validates the importance of a feature by comparing it with randomly shuffled copies, which increases the robustness. This is done by simply comparing, using a binomial distribution, the number of times a feature did better than the shadow features.
Since the whole process is done on the same train-test split, the variance of the variable importance comes only from the different re-fits of the model over the iterations. 67 | 68 | 69 | drawing 70 | 71 | ## BoostARoota 72 | 73 | BoostARoota follows the Boruta method closely but modifies a few things: 74 | 75 | * One-hot-encode the feature set 76 | * Double the width of the data set, making a copy of all features in the original dataset 77 | * Randomly shuffle the new features created in (2). These duplicated and shuffled features are referred to as "shadow features" 78 | * Run an XGBoost classifier on the entire data set ten times. Running it ten times allows random noise to be smoothed out, resulting in more robust estimates of importance. The number of repeats is a parameter that can be changed. 79 | * Obtain importance values for each feature. This is a simple importance metric that sums up how many times the particular feature was split on in the XGBoost algorithm. 80 | * Compute the "cutoff": the average feature importance value of all shadow features, divided by four. Shadow importance values are divided by four (the divisor is a parameter that can be changed) to make it more difficult for the variables to be removed; with a lower cutoff, features are removed at too high a rate. 81 | * Remove features whose average importance across the ten iterations is less than the cutoff specified in (6) 82 | * Go back to (2) until the number of features removed is less than ten per cent of the total. 83 | * The method returns the features remaining once completed. 84 | 85 | In spirit, this is the same heuristic as Boruta but using boosting (originally Boruta supported only random forest). The importance is validated by comparing it to the maximum of the median variable importance of the shadow predictors (in Boruta, a statistical test is performed using the Z-score). Since the whole process is done on the same train-test split, the variance of the variable importance comes only from the different re-fits of the model over the iterations. 86 | 87 | drawing 88 | 89 | ## Modifications to Boruta and BoostARoota 90 | 91 | I forked both Boruta and BoostARoota and made the following changes (under PR): 92 | 93 | **Boruta --> Leshy**: 94 | 95 | - Categorical features are detected and encoded (tree-based models work better with integer encoding than with OHE, which leads to deep and unstable trees). If CatBoost is used, the categorical predictors (if any) are set up accordingly 96 | - Using lightGBM as the default speeds up the running time by an order of magnitude 97 | - Works with CatBoost, sklearn API 98 | - Allows using sample_weight, for applications like Poisson regression or any requiring weights 99 | - Supports 3 different feature importances: native, SHAP and permutation. Native is the least consistent (because the importance is biased towards numerical and large-cardinality categorical features) but the fastest of the 3. Indeed, the impurity-based variable importance is biased and sensitive to large cardinality (see [scikit demo](https://scikit-learn.org/stable/auto_examples/inspection/plot_permutation_importance.html#sphx-glr-auto-examples-inspection-plot-permutation-importance-py)) 100 | 101 | **BoostARoota --> BoostAGroota**: 102 | 103 | - Replace XGBoost with LightGBM; you can still use tree-based scikit-learn models 104 | - Replace the native variable importance by SHAP importance. Indeed, the impurity-based variable importance is biased and sensitive to large cardinality (see [scikit demo](https://scikit-learn.org/stable/auto_examples/inspection/plot_permutation_importance.html#sphx-glr-auto-examples-inspection-plot-permutation-importance-py)). Moreover, the native variable importance is computed on the train set; here the data are split (internally) into train and test sets, and the variable importance is computed on the test set. 105 | - Handling of categorical predictors. Categorical predictors should NOT be one-hot encoded, as it leads to deep, unstable trees. Instead, it is better to use the native method of lightGBM or CatBoost. A preprocessing step is needed to encode them (lightGBM and CatBoost use integer encoding and a reference to the categorical columns; the splitting strategies are then different, see the official docs). 106 | - Works with sample_weight, for Poisson or any application requiring a weighting. 107 | 108 | ## GrootCV, a new method 109 | 110 | **New: GrootCV**: 111 | 112 | - Cross-validated feature importance to smooth out the noise, based on lightGBM only (which is, most of the time, the fastest and most accurate boosting implementation). 113 | - The feature importance is derived using SHAP importance 114 | - Takes the max of the median of the shadow variable importance over folds; otherwise the threshold is not conservative enough, and this also improves convergence (fewer evaluations are needed to find a threshold) 115 | - Not based on a given percentage of columns that need to be deleted 116 | - Plot method for the variable importance 117 | 118 | drawing 119 | 120 | ## References 121 | 122 | **Theory** 123 | 124 | - [Consistent feature selection for pattern recognition in polynomial time](https://www.jmlr.org/papers/volume8/nilsson07a/nilsson07a.pdf) 125 | - [Maximum Relevance and Minimum Redundancy Feature Selection Methods for a Marketing Machine Learning Platform](https://eng.uber.com/research/maximum-relevance-and-minimum-redundancy-feature-selection-methods-for-a-marketing-machine-learning-platform/) 126 | 127 | **Applications** 128 | 129 | - [The Boruta paper](https://www.jstatsoft.org/article/view/v036i11/v36i11.pdf) 130 | - [The python implementation](https://github.com/scikit-learn-contrib/boruta_py) 131 | - [BoostARoota](https://github.com/chasedehan/BoostARoota) 132 | 133 | 134 | 135 | -------------------------------------------------------------------------------- /docs/Introduction.rst: -------------------------------------------------------------------------------- 1 | Introduction 2 | ============ 3 | 4 | All relevant feature selection means trying to find all features carrying information usable for prediction, 5 | rather than finding a possibly compact subset of features on which some particular model has a minimal error. 6 | This might include redundant predictors. All relevant feature selection is model agnostic in the sense that it 7 | doesn't optimize a scoring function for a *specific* model but rather tries to select all the predictors which are related to the response. 8 | This package implements 3 different methods (Leshy is an evolution of Boruta, BoostAGroota is an evolution of BoostARoota and GrootCV is a new one). 9 | They are sklearn compatible. See below for details about these methods. You can use any sklearn-compatible estimator 10 | with Leshy and BoostAGroota, but I recommend lightGBM: it is fast, accurate and has built-in SHAP values. 11 | 12 | The package also provides a module for preprocessing and basic feature selection 13 | (autobinning, removing columns with too many missing values, zero variance, high cardinality, high correlation, etc.).
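For illustration, here is a minimal sketch of the sklearn-style API using Leshy with a lightGBM estimator.
The toy dataset loader comes from the tutorial notebooks; the constructor arguments shown (e.g. ``importance``, ``max_iter``)
are indicative and may differ between versions, so please check the API reference for the authoritative signatures.

.. code-block:: python

    from lightgbm import LGBMRegressor
    from arfs.feature_selection import Leshy
    from arfs.utils import load_data

    # toy regression data shipped with the package
    boston = load_data(name="Boston")
    X, y = boston.data, boston.target

    # any sklearn-compatible estimator works; lightGBM is recommended
    model = LGBMRegressor(n_estimators=100)

    # SHAP importance is the most consistent of the supported importances
    # (argument names are indicative, see the API reference)
    selector = Leshy(model, max_iter=10, importance="shap", verbose=0)
    selector.fit(X, y, sample_weight=None)

    print(selector.get_feature_names_out())  # the all-relevant features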
14 | 15 | Moreover, as an alternative to the all relevant problem, the ARFS package provides a MRmr feature selection which, 16 | theoretically, returns a subset of the predictors selected by an arfs method. ARFS also provides a `LASSO` feature 17 | selection which works especially well for (G)LMs and GAMs. You can combine Lasso with the `TreeDiscretizer` for introducing 18 | non-linearities into linear models and perform feature selection. 19 | Please note that one limitation of the lasso is that it treats the levels of a categorical predictor individually. 20 | However, this issue can be addressed by utilizing the `TreeDiscretizer`, which automatically bins numerical variables and 21 | groups the levels of categorical variables. 22 | 23 | -------------------------------------------------------------------------------- /docs/Makefile: -------------------------------------------------------------------------------- 1 | # Minimal makefile for Sphinx documentation 2 | # 3 | 4 | # You can set these variables from the command line, and also 5 | # from the environment for the first two. 6 | SPHINXOPTS ?= 7 | SPHINXBUILD ?= sphinx-build 8 | SOURCEDIR = source 9 | BUILDDIR = build 10 | 11 | # Put it first so that "make" without argument is like "make help". 12 | help: 13 | @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 14 | 15 | .PHONY: help Makefile 16 | 17 | # Catch-all target: route all unknown targets to Sphinx using the new 18 | # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). 19 | %: Makefile 20 | @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 21 | -------------------------------------------------------------------------------- /docs/Methods overview.rst: -------------------------------------------------------------------------------- 1 | Methods overview 2 | ================ 3 | 4 | Boruta 5 | ------ 6 | 7 | The Boruta algorithm tries to capture all the important features you might have in your dataset with respect to an outcome variable. The procedure is as follows: 8 | 9 | * Create duplicate copies of all independent variables. When the number of independent variables in the original data is less than 5, create at least 5 copies using existing variables. 10 | * Shuffle the values of added duplicate copies to remove their correlations with the target variable. It is called shadow features or permuted copies. 11 | * Combine the original ones with shuffled copies. 12 | * Run a random forest classifier on the combined dataset and perform a variable importance measure (the default is Mean Decrease Accuracy) to evaluate the importance of each variable where higher means more important. 13 | * Then Z score is computed. It means the mean of accuracy loss divided by the standard deviation of accuracy loss. 14 | * Find the maximum Z score among shadow attributes (MZSA). 15 | * Tag the variables as 'unimportant' when they have importance significantly lower than MZSA. Then we permanently remove them from the process. 16 | * Tag the variables as 'important' when they have importance significantly higher than MZSA. 17 | * Repeat the above steps for a predefined number of iterations (random forest runs), or until all attributes are either tagged 'unimportant' or 'important', whichever comes first. 18 | 19 | At every iteration, the algorithm compares the Z-scores of the shuffled copies of the features and the original features to see if the latter performed better than the former. 
If it does, the algorithm will mark the feature as important. In essence, the algorithm is trying to validate the importance of the feature by comparing with randomly shuffled copies, which increases the robustness. This is done by simply comparing the number of times a feature did better with the shadow features using a binomial distribution. Since the whole process is done on the same train-test split, the variance of the variable importance comes only from the different re-fit of the model over the different iterations. 20 | 21 | 22 | BoostARoota 23 | ----------- 24 | 25 | BoostARoota follows closely the Boruta method but modifies a few things: 26 | 27 | * One-Hot-Encode the feature set. 28 | * Double width of the data set, making a copy of all features in the original dataset. 29 | * Randomly shuffle the new features created in (2). These duplicated and shuffled features are referred to as "shadow features." 30 | * Run XGBoost classifier on the entire data set ten times. Running it ten times allows for random noise to be smoothed, resulting in more robust estimates of importance. The number of repeats is a parameter that can be changed. 31 | * Obtain importance values for each feature. This is a simple importance metric that sums up how many times the particular feature was split in the XGBoost algorithm. 32 | * Compute "cutoff": the average feature importance value for all shadow features and divide by four. Shadow importance values are divided by four (parameters can be changed) to make it more difficult for the variables to be removed. With values lower than this, features are removed at too high of a rate. 33 | * Remove features with average importance across the ten iterations that are less than the cutoff specified in (6). 34 | * Go back to (2) until the number of features removed is less than ten percent of the total. 35 | * The method returns the features remaining once completed. 36 | 37 | Modifications to Boruta 38 | ----------------------- 39 | 40 | Boruta --> Leshy: 41 | 42 | For chronological development, see https://github.com/scikit-learn-contrib/boruta_py/pull/77 and https://github.com/scikit-learn-contrib/boruta_py/pull/100 43 | 44 | Leshy vs. BorutaPy: 45 | To summarize, this PR solves/enhances: 46 | * The categorical features (they are detected, encoded. The tree-based models are working better with integer encoding rather than with OHE, which leads to deep and unstable trees). If Catboost is used, then the cat.pred (if any) are set up. 47 | * Work with Catboost sklearn API. 48 | * Allow using sample_weight, for applications like Poisson regression or any requiring weights. 49 | * 3 different feature importances: native, SHAP, and permutation. Native being the least consistent (because of the imp. biased towards numerical and large cardinality categorical) but the fastest of the 3. 50 | * Using LightGBM as default speed up by an order of magnitude the running time. 51 | * Visualization like in the R package. 52 | 53 | BorutaPy vs. Boruta R: 54 | The improvements of this implementation include: 55 | * Faster run times: Thanks to scikit-learn's fast implementation of the ensemble methods. 56 | * Scikit-learn like interface: Use BorutaPy just like any other scikit-learn: fit, fit_transform, and transform are all implemented in a similar fashion. 57 | * Modularity: Any ensemble method could be used: random forest, extra trees classifier, even gradient boosted trees. 
58 | * Two-step correction: The original Boruta code corrects for multiple testing in an overly conservative way. In this implementation, the Benjamini-Hochberg FDR is used to correct in each iteration across active features. This means that only features still in the selection process are included in the correction. Following this, each feature that passed goes through a regular Bonferroni correction to check for the repeated testing over the iterations. 59 | * Percentile: Instead of using the max values of the shadow features, the user can specify which percentile to use. This gives finer control over this crucial parameter. For more info, please read about the perc parameter. 60 | * Automatic tree number: Setting n_estimators to 'auto' will calculate the number of trees in each iteration based on the number of features under investigation. This way more trees are used when the training data has many features and fewer when most of the features have been rejected. 61 | * Ranking of features: After fitting BorutaPy, it provides the user with a ranking of features. Confirmed ones are 1, tentatives are 2, and the rejected are ranked starting from 3, based on their feature importance history through the iterations. 62 | * Using either the native variable importance, scikit-learn permutation importance, or SHAP importance. 63 | 64 | We highly recommend using pruned trees with a depth between 3-7. For more, see the docs of these functions, and the examples below. Original code and method by: Miron B Kursa, https://m2.icm.edu.pl/boruta/ 65 | 66 | GrootCV, a new method 67 | --------------------- 68 | 69 | New: GrootCV: 70 | - Cross-validated feature importance to smooth out the noise, based on lightGBM only (which is, most of the time, the fastest and most accurate boosting implementation). 71 | - The feature importance is derived using SHAP importance. 72 | - Takes the max of the median of the shadow variable importance over folds; otherwise the threshold is not conservative enough, and this also improves convergence (fewer evaluations are needed to find a threshold). 73 | - Not based on a given percentage of columns that need to be deleted. 74 | - Plot method for the variable importance. 75 | 76 | MRmr 77 | ---- 78 | 79 | Re-implementing the Uber MRmr scheme using associations for handling continuous and categorical predictors: 80 | - Theil's U statistic for the categorical-categorical association (correlation). 81 | - Variance ratio for the continuous-categorical association. 82 | - Pearson or Spearman correlation for the continuous-continuous association. 83 | 84 | Lasso 85 | ----- 86 | 87 | Perform a simple grid search with an enforced lasso regularization. 88 | The best model is chosen based on the minimum BIC or deviance score, and all predictors with non-zero coefficients are selected. 89 | The loss function can belong to the exponential family, as in the statsmodels GLM documentation. 90 | Using the BIC metric is faster since it is evaluated on the training data (no held-out data is involved), whereas the deviance score is cross-validated. 91 | 92 | This approach can be combined with the TreeDiscretizer transformer to introduce univariate non-linearities (tree-GAM) before feature selection. 93 | This serves as a workaround to compensate for the absence of fused and grouped lasso regularization.
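To make the selection rule above concrete, here is a conceptual sketch of the BIC-driven grid search written directly
against statsmodels rather than against the ARFS implementation (the selector in ``arfs.feature_selection.lasso`` may
use different defaults, and the exact behaviour of ``fit_regularized(..., refit=True)``, e.g. the shape of the returned
coefficient vector, should be checked in the statsmodels documentation):

.. code-block:: python

    import numpy as np
    import pandas as pd
    import statsmodels.api as sm

    def lasso_bic_selection(X: pd.DataFrame, y, family=sm.families.Gaussian(),
                            alphas=np.logspace(-3, 1, 20)):
        """Grid search over the L1 penalty; keep the fit with the lowest BIC
        and return the predictors with non-zero coefficients."""
        X_ = sm.add_constant(X)
        best_bic, best_params = np.inf, None
        for alpha in alphas:
            # pure lasso penalty (L1_wt=1.0); refit=True re-estimates the model
            # on the active set so that the log-likelihood is available
            res = sm.GLM(y, X_, family=family).fit_regularized(
                alpha=alpha, L1_wt=1.0, refit=True
            )
            k = int(np.sum(np.abs(res.params.to_numpy()) > 1e-8))
            bic = -2.0 * res.llf + k * np.log(len(y))  # BIC = -2 log L + k log(n)
            if bic < best_bic:
                best_bic, best_params = bic, res.params
        mask = (np.abs(best_params.to_numpy()) > 1e-8) & (best_params.index != "const")
        return best_params.index[mask].tolist()

Any exponential-family loss can be plugged in through ``family`` (e.g. ``sm.families.Poisson()``); the cross-validated
deviance criterion mentioned above replaces this in-sample BIC with an out-of-fold deviance.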
94 | 95 | References 96 | ---------- 97 | 98 | **Theory** 99 | - [Consistent feature selection for pattern recognition in polynomial time](http://compmed.se/files/6914/2107/3475/pub_2007_5.pdf) 100 | - [Maximum Relevance and Minimum Redundancy Feature Selection Methods for a Marketing Machine Learning Platform](https://www.uber.com/blog/research/maximum-relevance-and-minimum-redundancy-feature-selection-methods-for-a-marketing-machine-learning-platform) 101 | 102 | **Applications** 103 | - [The Boruta paper](https://www.jstatsoft.org/article/view/v036i11/v36i11.pdf) 104 | - [The python implementation](https://github.com/scikit-learn-contrib/boruta_py) 105 | - [BoostARoota](https://github.com/chasedehan/BoostARoota) -------------------------------------------------------------------------------- /docs/arfs.feature_selection.rst: -------------------------------------------------------------------------------- 1 | arfs.feature\_selection package 2 | =============================== 3 | 4 | Submodules 5 | ---------- 6 | 7 | arfs.feature\_selection.allrelevant module 8 | ------------------------------------------ 9 | 10 | .. automodule:: arfs.feature_selection.allrelevant 11 | :members: 12 | :undoc-members: 13 | :show-inheritance: 14 | 15 | arfs.feature\_selection.base module 16 | ----------------------------------- 17 | 18 | .. automodule:: arfs.feature_selection.base 19 | :members: 20 | :undoc-members: 21 | :show-inheritance: 22 | 23 | arfs.feature\_selection.lasso module 24 | ------------------------------------ 25 | 26 | .. automodule:: arfs.feature_selection.lasso 27 | :members: 28 | :undoc-members: 29 | :show-inheritance: 30 | 31 | arfs.feature\_selection.mrmr module 32 | ----------------------------------- 33 | 34 | .. automodule:: arfs.feature_selection.mrmr 35 | :members: 36 | :undoc-members: 37 | :show-inheritance: 38 | 39 | arfs.feature\_selection.summary module 40 | -------------------------------------- 41 | 42 | .. automodule:: arfs.feature_selection.summary 43 | :members: 44 | :undoc-members: 45 | :show-inheritance: 46 | 47 | arfs.feature\_selection.unsupervised module 48 | ------------------------------------------- 49 | 50 | .. automodule:: arfs.feature_selection.unsupervised 51 | :members: 52 | :undoc-members: 53 | :show-inheritance: 54 | 55 | arfs.feature\_selection.variable\_importance module 56 | --------------------------------------------------- 57 | 58 | .. automodule:: arfs.feature_selection.variable_importance 59 | :members: 60 | :undoc-members: 61 | :show-inheritance: 62 | 63 | Module contents 64 | --------------- 65 | 66 | .. automodule:: arfs.feature_selection 67 | :members: 68 | :undoc-members: 69 | :show-inheritance: 70 | -------------------------------------------------------------------------------- /docs/arfs.rst: -------------------------------------------------------------------------------- 1 | arfs package 2 | ============ 3 | 4 | Subpackages 5 | ----------- 6 | 7 | .. toctree:: 8 | :maxdepth: 4 9 | 10 | arfs.feature_selection 11 | 12 | Submodules 13 | ---------- 14 | 15 | arfs.association module 16 | ----------------------- 17 | 18 | .. automodule:: arfs.association 19 | :members: 20 | :undoc-members: 21 | :show-inheritance: 22 | 23 | arfs.benchmark module 24 | --------------------- 25 | 26 | .. automodule:: arfs.benchmark 27 | :members: 28 | :undoc-members: 29 | :show-inheritance: 30 | 31 | arfs.gbm module 32 | --------------- 33 | 34 | .. 
automodule:: arfs.gbm 35 | :members: 36 | :undoc-members: 37 | :show-inheritance: 38 | 39 | arfs.parallel module 40 | -------------------- 41 | 42 | .. automodule:: arfs.parallel 43 | :members: 44 | :undoc-members: 45 | :show-inheritance: 46 | 47 | arfs.preprocessing module 48 | ------------------------- 49 | 50 | .. automodule:: arfs.preprocessing 51 | :members: 52 | :undoc-members: 53 | :show-inheritance: 54 | 55 | arfs.sampling module 56 | -------------------- 57 | 58 | .. automodule:: arfs.sampling 59 | :members: 60 | :undoc-members: 61 | :show-inheritance: 62 | 63 | arfs.utils module 64 | ----------------- 65 | 66 | .. automodule:: arfs.utils 67 | :members: 68 | :undoc-members: 69 | :show-inheritance: 70 | 71 | Module contents 72 | --------------- 73 | 74 | .. automodule:: arfs 75 | :members: 76 | :undoc-members: 77 | :show-inheritance: 78 | -------------------------------------------------------------------------------- /docs/boostaroota.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ThomasBury/arfs/03f67d0a54b69fac5ddbb83e306c8e8e72e2d3a2/docs/boostaroota.png -------------------------------------------------------------------------------- /docs/boruta.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ThomasBury/arfs/03f67d0a54b69fac5ddbb83e306c8e8e72e2d3a2/docs/boruta.png -------------------------------------------------------------------------------- /docs/conf.py: -------------------------------------------------------------------------------- 1 | # Configuration file for the Sphinx documentation builder. 2 | # 3 | # This file only contains a selection of the most common options. For a full 4 | # list see the documentation: 5 | # https://www.sphinx-doc.org/en/master/usage/configuration.html 6 | 7 | # -- Path setup -------------------------------------------------------------- 8 | 9 | # If extensions (or modules to document with autodoc) are in another directory, 10 | # add these directories to sys.path here. If the directory is relative to the 11 | # documentation root, use os.path.abspath to make it absolute, like shown here. 12 | # 13 | # import os 14 | # import sys 15 | # sys.path.insert(0, os.path.abspath('.')) 16 | import sys 17 | import os 18 | import datetime 19 | 20 | sys.path.insert(0, os.path.abspath("../../arfs")) 21 | # -- Project information ----------------------------------------------------- 22 | 23 | project = "arfs" 24 | copyright = "2024, Thomas Bury" 25 | author = "Thomas Bury" 26 | 27 | # The full version, including alpha/beta/rc tags 28 | release = "3.0.0" 29 | 30 | # If extensions (or modules to document with autodoc) are in another 31 | # directory, add these directories to sys.path here. If the directory is 32 | # relative to the documentation root, use os.path.abspath to make it 33 | # absolute, like shown here. 
34 | # sys.path.append(os.path.join(os.path.abspath(os.pardir))) 35 | 36 | # Don't add the same path again, remove the following line: 37 | # sys.path.insert(0, os.path.abspath("..")) 38 | 39 | sys.path.append(os.path.abspath(os.path.join(__file__, "../../src"))) 40 | autodoc_mock_imports = ["_tkinter", "sphinx_tabs.tabs"] 41 | 42 | # Get the project root dir, which is the parent dir of this 43 | cwd = os.getcwd() 44 | project_root = os.path.dirname(cwd) 45 | 46 | 47 | # -- General configuration --------------------------------------------------- 48 | 49 | # Add any Sphinx extension module names here, as strings. They can be 50 | # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom 51 | # ones. 52 | extensions = [ 53 | "sphinx.ext.autodoc", 54 | "sphinx.ext.autosectionlabel", 55 | "sphinx.ext.napoleon", 56 | "sphinx.ext.viewcode", 57 | "sphinx_autodoc_typehints", 58 | "sphinx_copybutton", 59 | "nbsphinx", 60 | "sphinx_tabs.tabs", 61 | ] 62 | 63 | # Add any paths that contain templates here, relative to this directory. 64 | templates_path = ["_templates"] 65 | autosummary_generate = True 66 | 67 | # List of patterns, relative to source directory, that match files and 68 | # directories to ignore when looking for source files. 69 | # This pattern also affects html_static_path and html_extra_path. 70 | exclude_patterns = ["_build", "Thumbs.db", ".DS_Store"] 71 | 72 | 73 | # -- Options for HTML output ------------------------------------------------- 74 | 75 | # The theme to use for HTML and HTML Help pages. See the documentation for 76 | # a list of builtin themes. 77 | # 78 | # html_theme = "sphinx_rtd_theme" 79 | html_permalinks_icon = "#" 80 | html_theme = "sphinxawesome_theme" 81 | 82 | # If not None, a 'Last updated on:' timestamp is inserted at every page 83 | # bottom, using the given strftime format. 84 | # The empty string is equivalent to '%b %d, %Y'. 85 | html_last_updated_fmt = "%B %d, %Y at %H:%M" 86 | today_fmt = "%B %d, %Y at %H:%M" 87 | 88 | # Add any paths that contain custom static files (such as style sheets) here, 89 | # relative to this directory. They are copied after the builtin static files, 90 | # so a file named "default.css" will overwrite the builtin "default.css". 
91 | # html_static_path = ["_static"] 92 | html_title = "ARFS Documentation" 93 | html_show_sourcelink = True 94 | html_logo = "logo.png" 95 | 96 | # -- Napoleon settings (for numpydoc parsing) -------------------------------- 97 | # https://www.sphinx-doc.org/en/master/usage/extensions/napoleon.html#configuration 98 | napoleon_google_docstring = False 99 | napoleon_numpy_docstring = True 100 | napoleon_include_init_with_doc = True 101 | napoleon_include_private_with_doc = True 102 | napoleon_include_special_with_doc = True 103 | napoleon_use_admonition_for_examples = False 104 | napoleon_use_admonition_for_notes = False 105 | napoleon_use_admonition_for_references = False 106 | napoleon_use_ivar = True 107 | napoleon_use_param = True 108 | napoleon_use_rtype = False 109 | napoleon_preprocess_types = True 110 | napoleon_type_aliases = None 111 | napoleon_attr_annotations = True 112 | -------------------------------------------------------------------------------- /docs/grootcv.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ThomasBury/arfs/03f67d0a54b69fac5ddbb83e306c8e8e72e2d3a2/docs/grootcv.png -------------------------------------------------------------------------------- /docs/index.rst: -------------------------------------------------------------------------------- 1 | Welcome to arfs's documentation! 2 | ================================ 3 | 4 | A package for performing All Relevant Feature Selection but not only that. 5 | 6 | Documentation last change: |today| 7 | 8 | .. toctree:: 9 | :maxdepth: 2 10 | :caption: User's guide 11 | 12 | Introduction 13 | Methods overview 14 | modules 15 | 16 | 17 | .. toctree:: 18 | :maxdepth: 4 19 | :glob: 20 | :caption: Tutorials 21 | 22 | notebooks/preprocessing.ipynb 23 | notebooks/basic_feature_selection.ipynb 24 | notebooks/association_and_feature_selection.ipynb 25 | notebooks/arfs_classification.ipynb 26 | notebooks/arfs_regression.ipynb 27 | notebooks/arfs_timeseries.ipynb 28 | notebooks/arfs_large_data_sampling.ipynb 29 | notebooks/arfs_on_GPU.ipynb 30 | notebooks/arfs_shap_vs_fastshap.ipynb 31 | notebooks/arfs_grootcv_custom_params.ipynb 32 | notebooks/arfs_boruta_borutaShap_comparison.ipynb 33 | notebooks/arfs_non_normal_loss_and_sample_weight.ipynb 34 | notebooks/mrmr_feature_selection.ipynb 35 | notebooks/mrmr_fs_VS_arfs.ipynb 36 | notebooks/lasso_feature_selection.ipynb 37 | notebooks/issue_categoricals.ipynb 38 | notebooks/issue_collinearity.ipynb 39 | 40 | Indices and tables 41 | ================== 42 | 43 | * :ref:`genindex` 44 | * :ref:`modindex` 45 | * :ref:`search` -------------------------------------------------------------------------------- /docs/logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ThomasBury/arfs/03f67d0a54b69fac5ddbb83e306c8e8e72e2d3a2/docs/logo.png -------------------------------------------------------------------------------- /docs/make.bat: -------------------------------------------------------------------------------- 1 | @ECHO OFF 2 | 3 | pushd %~dp0 4 | 5 | REM Command file for Sphinx documentation 6 | 7 | if "%SPHINXBUILD%" == "" ( 8 | set SPHINXBUILD=sphinx-build 9 | ) 10 | set SOURCEDIR=source 11 | set BUILDDIR=build 12 | 13 | %SPHINXBUILD% >NUL 2>NUL 14 | if errorlevel 9009 ( 15 | echo. 16 | echo.The 'sphinx-build' command was not found. 
Make sure you have Sphinx 17 | echo.installed, then set the SPHINXBUILD environment variable to point 18 | echo.to the full path of the 'sphinx-build' executable. Alternatively you 19 | echo.may add the Sphinx directory to PATH. 20 | echo. 21 | echo.If you don't have Sphinx installed, grab it from 22 | echo.https://www.sphinx-doc.org/ 23 | exit /b 1 24 | ) 25 | 26 | if "%1" == "" goto help 27 | 28 | %SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% 29 | goto end 30 | 31 | :help 32 | %SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% 33 | 34 | :end 35 | popd 36 | -------------------------------------------------------------------------------- /docs/modules.rst: -------------------------------------------------------------------------------- 1 | src 2 | === 3 | 4 | .. toctree:: 5 | :maxdepth: 4 6 | 7 | arfs 8 | -------------------------------------------------------------------------------- /docs/notebooks/arfs_on_GPU.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# ARFS - Using GPU\n", 8 | "\n", 9 | "You can leverage the GPU implementation of lightGBM (or other GBM flavours) but this often requires to compile or install some libraries or kit (such as CUDA)" 10 | ] 11 | }, 12 | { 13 | "cell_type": "code", 14 | "execution_count": 1, 15 | "metadata": {}, 16 | "outputs": [], 17 | "source": [ 18 | "# from IPython.core.display import display, HTML\n", 19 | "# display(HTML(\"\"))\n", 20 | "import time\n", 21 | "import numpy as np\n", 22 | "import pandas as pd\n", 23 | "import matplotlib as mpl\n", 24 | "import matplotlib.pyplot as plt\n", 25 | "from lightgbm import LGBMRegressor\n", 26 | "\n", 27 | "import arfs\n", 28 | "from arfs.feature_selection import GrootCV, Leshy\n", 29 | "from arfs.utils import load_data\n", 30 | "from arfs.benchmark import highlight_tick\n", 31 | "\n", 32 | "rng = np.random.RandomState(seed=42)\n", 33 | "\n", 34 | "# import warnings\n", 35 | "# warnings.filterwarnings('ignore')" 36 | ] 37 | }, 38 | { 39 | "cell_type": "markdown", 40 | "metadata": {}, 41 | "source": [ 42 | "## GrootCV on GPU\n", 43 | "\n", 44 | "If the data is small, using a GPU mught not be the most efficient." 45 | ] 46 | }, 47 | { 48 | "cell_type": "code", 49 | "execution_count": null, 50 | "metadata": {}, 51 | "outputs": [], 52 | "source": [ 53 | "from sklearn.datasets import make_regression\n", 54 | "from sklearn.model_selection import train_test_split\n", 55 | "\n", 56 | "# Generate synthetic data with Poisson-distributed target variable\n", 57 | "bias = 1\n", 58 | "\n", 59 | "n_samples = 100_00 # 1_000_000\n", 60 | "n_features = 100\n", 61 | "n_informative = 20\n", 62 | "\n", 63 | "X, y, true_coef = make_regression(\n", 64 | " n_samples=n_samples,\n", 65 | " n_features=n_features,\n", 66 | " n_informative=n_informative,\n", 67 | " noise=1,\n", 68 | " random_state=8,\n", 69 | " bias=bias,\n", 70 | " coef=True,\n", 71 | ")\n", 72 | "y = (y - y.mean()) / y.std()\n", 73 | "y = np.exp(y) # Transform to positive values for Poisson distribution\n", 74 | "y = np.random.poisson(y) # Add Poisson noise to the target variable\n", 75 | "# dummy sample weight (e.g. 
exposure), smallest being 30 days\n", 76 | "w = np.random.uniform(30 / 365, 1, size=len(y))\n", 77 | "# make the count a Poisson rate (frequency)\n", 78 | "y = y / w\n", 79 | "\n", 80 | "X = pd.DataFrame(X)\n", 81 | "X.columns = [f\"pred_{i}\" for i in range(X.shape[1])]\n", 82 | "\n", 83 | "# Split the data into training and testing sets\n", 84 | "X_train, X_test, y_train, y_test, w_train, w_test = train_test_split(\n", 85 | " X, y, w, test_size=0.5, random_state=42\n", 86 | ")\n", 87 | "\n", 88 | "true_coef = pd.Series(true_coef)\n", 89 | "true_coef.index = X.columns\n", 90 | "true_coef = pd.Series({**{\"intercept\": bias}, **true_coef})\n", 91 | "true_coef\n", 92 | "\n", 93 | "genuine_predictors = true_coef[true_coef > 0.0]\n", 94 | "\n", 95 | "print(f\"The true coefficient of the linear data generating process are:\\n {true_coef}\")" 96 | ] 97 | }, 98 | { 99 | "cell_type": "markdown", 100 | "metadata": {}, 101 | "source": [ 102 | "GPU" 103 | ] 104 | }, 105 | { 106 | "cell_type": "code", 107 | "execution_count": null, 108 | "metadata": {}, 109 | "outputs": [], 110 | "source": [ 111 | "%%time\n", 112 | "feat_selector = GrootCV(\n", 113 | " objective=\"rmse\",\n", 114 | " cutoff=1,\n", 115 | " n_folds=3,\n", 116 | " n_iter=3,\n", 117 | " silent=True,\n", 118 | " fastshap=True,\n", 119 | " n_jobs=0,\n", 120 | " lgbm_params={\"device\": \"gpu\", \"gpu_device_id\": 1},\n", 121 | ")\n", 122 | "feat_selector.fit(X_train, y_train, sample_weight=None)\n", 123 | "print(f\"The selected features: {feat_selector.get_feature_names_out()}\")\n", 124 | "print(f\"The agnostic ranking: {feat_selector.ranking_}\")\n", 125 | "print(f\"The naive ranking: {feat_selector.ranking_absolutes_}\")\n", 126 | "fig = feat_selector.plot_importance(n_feat_per_inch=5)\n", 127 | "\n", 128 | "# highlight synthetic random variable\n", 129 | "for name in true_coef.index:\n", 130 | " if name in genuine_predictors.index:\n", 131 | " fig = highlight_tick(figure=fig, str_match=name, color=\"green\")\n", 132 | " else:\n", 133 | " fig = highlight_tick(figure=fig, str_match=name)\n", 134 | "\n", 135 | "plt.show()" 136 | ] 137 | }, 138 | { 139 | "cell_type": "markdown", 140 | "metadata": {}, 141 | "source": [ 142 | "CPU" 143 | ] 144 | }, 145 | { 146 | "cell_type": "code", 147 | "execution_count": null, 148 | "metadata": {}, 149 | "outputs": [], 150 | "source": [ 151 | "%%time\n", 152 | "feat_selector = GrootCV(\n", 153 | " objective=\"rmse\",\n", 154 | " cutoff=1,\n", 155 | " n_folds=3,\n", 156 | " n_iter=3,\n", 157 | " silent=True,\n", 158 | " fastshap=True,\n", 159 | " n_jobs=0,\n", 160 | " lgbm_params={\"device\": \"cpu\"},\n", 161 | ")\n", 162 | "feat_selector.fit(X_train, y_train, sample_weight=None)\n", 163 | "print(f\"The selected features: {feat_selector.get_feature_names_out()}\")\n", 164 | "print(f\"The agnostic ranking: {feat_selector.ranking_}\")\n", 165 | "print(f\"The naive ranking: {feat_selector.ranking_absolutes_}\")\n", 166 | "fig = feat_selector.plot_importance(n_feat_per_inch=5)\n", 167 | "\n", 168 | "# highlight synthetic random variable\n", 169 | "for name in true_coef.index:\n", 170 | " if name in genuine_predictors.index:\n", 171 | " fig = highlight_tick(figure=fig, str_match=name, color=\"green\")\n", 172 | " else:\n", 173 | " fig = highlight_tick(figure=fig, str_match=name)\n", 174 | "\n", 175 | "plt.show()" 176 | ] 177 | }, 178 | { 179 | "cell_type": "markdown", 180 | "metadata": {}, 181 | "source": [ 182 | "On a smaller data set, for illustrative purposes." 
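Since GPU training usually needs a dedicated lightGBM build (e.g. compiled with CUDA support), a quick way to find out whether the installed build supports it is to fit a tiny throw-away model with `device="gpu"` and fall back to CPU on failure. A minimal sketch, using only the public `LGBMRegressor` API imported above; the helper name `gpu_build_available` and the probe sizes are illustrative assumptions, not part of ARFS:

```python
# Probe whether this lightGBM build was compiled with GPU support by fitting
# a tiny model with device="gpu"; fall back to CPU otherwise.
import numpy as np
from lightgbm import LGBMRegressor


def gpu_build_available() -> bool:
    """Return True if lightGBM accepts device='gpu' on this machine."""
    X_probe = np.random.rand(64, 2)
    y_probe = np.random.rand(64)
    try:
        LGBMRegressor(device="gpu", n_estimators=1, verbose=-1).fit(X_probe, y_probe)
        return True
    except Exception:
        # CPU-only builds raise a LightGBMError stating that the GPU tree
        # learner was not enabled when the library was compiled.
        return False


device = "gpu" if gpu_build_available() else "cpu"
print(f"Running the selectors with lgbm_params={{'device': '{device}'}}")
```

The resulting string can then be passed through `lgbm_params={"device": device}` for `GrootCV`, or as `LGBMRegressor(device=device)` for the `Leshy` examples later in this notebook.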
183 | ] 184 | }, 185 | { 186 | "cell_type": "code", 187 | "execution_count": 5, 188 | "metadata": {}, 189 | "outputs": [], 190 | "source": [ 191 | "boston = load_data(name=\"Boston\")\n", 192 | "X, y = boston.data, boston.target" 193 | ] 194 | }, 195 | { 196 | "cell_type": "code", 197 | "execution_count": null, 198 | "metadata": {}, 199 | "outputs": [], 200 | "source": [ 201 | "%%time\n", 202 | "feat_selector = GrootCV(\n", 203 | " objective=\"rmse\",\n", 204 | " cutoff=1,\n", 205 | " n_folds=5,\n", 206 | " n_iter=5,\n", 207 | " silent=True,\n", 208 | " fastshap=True,\n", 209 | " n_jobs=0,\n", 210 | " lgbm_params={\"device\": \"cpu\"},\n", 211 | ")\n", 212 | "feat_selector.fit(X, y, sample_weight=None)\n", 213 | "print(f\"The selected features: {feat_selector.get_feature_names_out()}\")\n", 214 | "print(f\"The agnostic ranking: {feat_selector.ranking_}\")\n", 215 | "print(f\"The naive ranking: {feat_selector.ranking_absolutes_}\")\n", 216 | "fig = feat_selector.plot_importance(n_feat_per_inch=5)\n", 217 | "\n", 218 | "# highlight synthetic random variable\n", 219 | "fig = highlight_tick(figure=fig, str_match=\"random\")\n", 220 | "fig = highlight_tick(figure=fig, str_match=\"genuine\", color=\"green\")\n", 221 | "plt.show()" 222 | ] 223 | }, 224 | { 225 | "cell_type": "code", 226 | "execution_count": null, 227 | "metadata": {}, 228 | "outputs": [], 229 | "source": [ 230 | "%%time\n", 231 | "feat_selector = GrootCV(\n", 232 | " objective=\"rmse\",\n", 233 | " cutoff=1,\n", 234 | " n_folds=5,\n", 235 | " n_iter=5,\n", 236 | " silent=True,\n", 237 | " fastshap=True,\n", 238 | " n_jobs=0,\n", 239 | " lgbm_params={\"device\": \"gpu\"},\n", 240 | ")\n", 241 | "feat_selector.fit(X, y, sample_weight=None)\n", 242 | "print(f\"The selected features: {feat_selector.get_feature_names_out()}\")\n", 243 | "print(f\"The agnostic ranking: {feat_selector.ranking_}\")\n", 244 | "print(f\"The naive ranking: {feat_selector.ranking_absolutes_}\")\n", 245 | "fig = feat_selector.plot_importance(n_feat_per_inch=5)\n", 246 | "\n", 247 | "# highlight synthetic random variable\n", 248 | "fig = highlight_tick(figure=fig, str_match=\"random\")\n", 249 | "fig = highlight_tick(figure=fig, str_match=\"genuine\", color=\"green\")\n", 250 | "plt.show()" 251 | ] 252 | }, 253 | { 254 | "cell_type": "code", 255 | "execution_count": null, 256 | "metadata": {}, 257 | "outputs": [], 258 | "source": [ 259 | "%%time\n", 260 | "feat_selector = GrootCV(\n", 261 | " objective=\"rmse\",\n", 262 | " cutoff=1,\n", 263 | " n_folds=5,\n", 264 | " n_iter=5,\n", 265 | " silent=True,\n", 266 | " fastshap=True,\n", 267 | " n_jobs=0,\n", 268 | " lgbm_params={\"device\": \"cuda\"},\n", 269 | ")\n", 270 | "feat_selector.fit(X, y, sample_weight=None)\n", 271 | "print(f\"The selected features: {feat_selector.get_feature_names_out()}\")\n", 272 | "print(f\"The agnostic ranking: {feat_selector.ranking_}\")\n", 273 | "print(f\"The naive ranking: {feat_selector.ranking_absolutes_}\")\n", 274 | "fig = feat_selector.plot_importance(n_feat_per_inch=5)\n", 275 | "\n", 276 | "# highlight synthetic random variable\n", 277 | "fig = highlight_tick(figure=fig, str_match=\"random\")\n", 278 | "fig = highlight_tick(figure=fig, str_match=\"genuine\", color=\"green\")\n", 279 | "plt.show()" 280 | ] 281 | }, 282 | { 283 | "cell_type": "markdown", 284 | "metadata": {}, 285 | "source": [ 286 | "## Leshy on GPU" 287 | ] 288 | }, 289 | { 290 | "cell_type": "code", 291 | "execution_count": 9, 292 | "metadata": {}, 293 | "outputs": [], 294 | "source": [ 295 | "model = 
LGBMRegressor(random_state=42, verbose=-1, device=\"gpu\")" 296 | ] 297 | }, 298 | { 299 | "cell_type": "code", 300 | "execution_count": null, 301 | "metadata": {}, 302 | "outputs": [], 303 | "source": [ 304 | "%%time\n", 305 | "# Leshy\n", 306 | "feat_selector = Leshy(\n", 307 | " model, n_estimators=20, verbose=1, max_iter=10, random_state=42, importance=\"native\"\n", 308 | ")\n", 309 | "feat_selector.fit(X, y, sample_weight=None)\n", 310 | "print(f\"The selected features: {feat_selector.get_feature_names_out()}\")\n", 311 | "print(f\"The agnostic ranking: {feat_selector.ranking_}\")\n", 312 | "print(f\"The naive ranking: {feat_selector.ranking_absolutes_}\")\n", 313 | "fig = feat_selector.plot_importance(n_feat_per_inch=5)\n", 314 | "\n", 315 | "# highlight synthetic random variable\n", 316 | "fig = highlight_tick(figure=fig, str_match=\"random\")\n", 317 | "fig = highlight_tick(figure=fig, str_match=\"genuine\", color=\"green\")\n", 318 | "plt.show()" 319 | ] 320 | }, 321 | { 322 | "cell_type": "code", 323 | "execution_count": 11, 324 | "metadata": {}, 325 | "outputs": [], 326 | "source": [ 327 | "model = LGBMRegressor(random_state=42, verbose=-1, device=\"cpu\")" 328 | ] 329 | }, 330 | { 331 | "cell_type": "code", 332 | "execution_count": null, 333 | "metadata": {}, 334 | "outputs": [], 335 | "source": [ 336 | "%%time\n", 337 | "# Leshy\n", 338 | "feat_selector = Leshy(\n", 339 | " model, n_estimators=20, verbose=1, max_iter=10, random_state=42, importance=\"native\"\n", 340 | ")\n", 341 | "feat_selector.fit(X, y, sample_weight=None)\n", 342 | "print(f\"The selected features: {feat_selector.get_feature_names_out()}\")\n", 343 | "print(f\"The agnostic ranking: {feat_selector.ranking_}\")\n", 344 | "print(f\"The naive ranking: {feat_selector.ranking_absolutes_}\")\n", 345 | "fig = feat_selector.plot_importance(n_feat_per_inch=5)\n", 346 | "\n", 347 | "# highlight synthetic random variable\n", 348 | "fig = highlight_tick(figure=fig, str_match=\"random\")\n", 349 | "fig = highlight_tick(figure=fig, str_match=\"genuine\", color=\"green\")\n", 350 | "plt.show()" 351 | ] 352 | } 353 | ], 354 | "metadata": { 355 | "kernelspec": { 356 | "display_name": "arfs", 357 | "language": "python", 358 | "name": "python3" 359 | }, 360 | "language_info": { 361 | "codemirror_mode": { 362 | "name": "ipython", 363 | "version": 3 364 | }, 365 | "file_extension": ".py", 366 | "mimetype": "text/x-python", 367 | "name": "python", 368 | "nbconvert_exporter": "python", 369 | "pygments_lexer": "ipython3", 370 | "version": "3.10.14" 371 | }, 372 | "orig_nbformat": 4 373 | }, 374 | "nbformat": 4, 375 | "nbformat_minor": 2 376 | } 377 | -------------------------------------------------------------------------------- /docs/notebooks/arfs_shap_vs_fastshap.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# ARFS - fasttreeshap vs shap\n", 8 | "\n", 9 | "Leshy, BoostAGroota, and GrootCV are tree-based algorithms. They benefit from a [faster implementation of the Shapley values by LinkedIn](https://engineering.linkedin.com/blog/2022/fasttreeshap--accelerating-shap-value-computation-for-trees), which is claimed to outperform both the treeExplainer in the SHAP package and the native C++ implementation of lightgbm/xgboost/catboost. The improvement in speed will vary depending on the size of the task and your hardware resources (including virtualization for VMs). 
On older machine, the `fasttreeshap` implementation might actually be slower.\n", 10 | "\n", 11 | "However, it currently does not work with xgboost (not a deal breaker because lightgbm is the preferred default)." 12 | ] 13 | }, 14 | { 15 | "cell_type": "code", 16 | "execution_count": 1, 17 | "metadata": {}, 18 | "outputs": [ 19 | { 20 | "name": "stderr", 21 | "output_type": "stream", 22 | "text": [ 23 | "Using `tqdm.autonotebook.tqdm` in notebook mode. Use `tqdm.tqdm` instead to force console mode (e.g. in jupyter console)\n" 24 | ] 25 | } 26 | ], 27 | "source": [ 28 | "import numpy as np\n", 29 | "import pandas as pd\n", 30 | "\n", 31 | "from sklearn.datasets import make_regression\n", 32 | "from sklearn.model_selection import train_test_split\n", 33 | "\n", 34 | "import arfs\n", 35 | "from arfs.feature_selection import GrootCV, Leshy\n", 36 | "from arfs.utils import load_data\n", 37 | "from arfs.benchmark import highlight_tick\n", 38 | "\n", 39 | "rng = np.random.RandomState(seed=42)" 40 | ] 41 | }, 42 | { 43 | "cell_type": "code", 44 | "execution_count": 2, 45 | "metadata": {}, 46 | "outputs": [ 47 | { 48 | "name": "stdout", 49 | "output_type": "stream", 50 | "text": [ 51 | "The true coefficient of the linear data generating process are:\n", 52 | " intercept 1.000000\n", 53 | "pred_0 0.000000\n", 54 | "pred_1 0.000000\n", 55 | "pred_2 0.000000\n", 56 | "pred_3 0.000000\n", 57 | " ... \n", 58 | "pred_95 0.000000\n", 59 | "pred_96 10.576299\n", 60 | "pred_97 0.000000\n", 61 | "pred_98 0.000000\n", 62 | "pred_99 62.472033\n", 63 | "Length: 101, dtype: float64\n" 64 | ] 65 | } 66 | ], 67 | "source": [ 68 | "# Generate synthetic data with Poisson-distributed target variable\n", 69 | "bias = 1\n", 70 | "\n", 71 | "n_samples = 100_000\n", 72 | "n_features = 100\n", 73 | "n_informative = 20\n", 74 | "\n", 75 | "X, y, true_coef = make_regression(\n", 76 | " n_samples=n_samples,\n", 77 | " n_features=n_features,\n", 78 | " n_informative=n_informative,\n", 79 | " noise=1,\n", 80 | " random_state=8,\n", 81 | " bias=bias,\n", 82 | " coef=True,\n", 83 | ")\n", 84 | "y = (y - y.mean()) / y.std()\n", 85 | "y = np.exp(y) # Transform to positive values for Poisson distribution\n", 86 | "y = np.random.poisson(y) # Add Poisson noise to the target variable\n", 87 | "# dummy sample weight (e.g. 
exposure), smallest being 30 days\n", 88 | "w = np.random.uniform(30 / 365, 1, size=len(y))\n", 89 | "# make the count a Poisson rate (frequency)\n", 90 | "y = y / w\n", 91 | "\n", 92 | "X = pd.DataFrame(X)\n", 93 | "X.columns = [f\"pred_{i}\" for i in range(X.shape[1])]\n", 94 | "\n", 95 | "# Split the data into training and testing sets\n", 96 | "X_train, X_test, y_train, y_test, w_train, w_test = train_test_split(\n", 97 | " X, y, w, test_size=0.5, random_state=42\n", 98 | ")\n", 99 | "\n", 100 | "true_coef = pd.Series(true_coef)\n", 101 | "true_coef.index = X.columns\n", 102 | "true_coef = pd.Series({**{\"intercept\": bias}, **true_coef})\n", 103 | "true_coef\n", 104 | "\n", 105 | "genuine_predictors = true_coef[true_coef > 0.0]\n", 106 | "\n", 107 | "print(f\"The true coefficient of the linear data generating process are:\\n {true_coef}\")" 108 | ] 109 | }, 110 | { 111 | "cell_type": "markdown", 112 | "metadata": {}, 113 | "source": [ 114 | "## GrootCV - fastshap vs shap \n", 115 | "\n", 116 | "### Fastshap enable" 117 | ] 118 | }, 119 | { 120 | "cell_type": "code", 121 | "execution_count": 3, 122 | "metadata": {}, 123 | "outputs": [ 124 | { 125 | "data": { 126 | "application/vnd.jupyter.widget-view+json": { 127 | "model_id": "b4a9fbb99730414786a1cc452df59ca8", 128 | "version_major": 2, 129 | "version_minor": 0 130 | }, 131 | "text/plain": [ 132 | "Repeated k-fold: 0%| | 0/9 [00:00#sk-container-id-1 {color: black;}#sk-container-id-1 pre{padding: 0;}#sk-container-id-1 div.sk-toggleable {background-color: white;}#sk-container-id-1 label.sk-toggleable__label {cursor: pointer;display: block;width: 100%;margin-bottom: 0;padding: 0.3em;box-sizing: border-box;text-align: center;}#sk-container-id-1 label.sk-toggleable__label-arrow:before {content: \"▸\";float: left;margin-right: 0.25em;color: #696969;}#sk-container-id-1 label.sk-toggleable__label-arrow:hover:before {color: black;}#sk-container-id-1 div.sk-estimator:hover label.sk-toggleable__label-arrow:before {color: black;}#sk-container-id-1 div.sk-toggleable__content {max-height: 0;max-width: 0;overflow: hidden;text-align: left;background-color: #f0f8ff;}#sk-container-id-1 div.sk-toggleable__content pre {margin: 0.2em;color: black;border-radius: 0.25em;background-color: #f0f8ff;}#sk-container-id-1 input.sk-toggleable__control:checked~div.sk-toggleable__content {max-height: 200px;max-width: 100%;overflow: auto;}#sk-container-id-1 input.sk-toggleable__control:checked~label.sk-toggleable__label-arrow:before {content: \"▾\";}#sk-container-id-1 div.sk-estimator input.sk-toggleable__control:checked~label.sk-toggleable__label {background-color: #d4ebff;}#sk-container-id-1 div.sk-label input.sk-toggleable__control:checked~label.sk-toggleable__label {background-color: #d4ebff;}#sk-container-id-1 input.sk-hidden--visually {border: 0;clip: rect(1px 1px 1px 1px);clip: rect(1px, 1px, 1px, 1px);height: 1px;margin: -1px;overflow: hidden;padding: 0;position: absolute;width: 1px;}#sk-container-id-1 div.sk-estimator {font-family: monospace;background-color: #f0f8ff;border: 1px dotted black;border-radius: 0.25em;box-sizing: border-box;margin-bottom: 0.5em;}#sk-container-id-1 div.sk-estimator:hover {background-color: #d4ebff;}#sk-container-id-1 div.sk-parallel-item::after {content: \"\";width: 100%;border-bottom: 1px solid gray;flex-grow: 1;}#sk-container-id-1 div.sk-label:hover label.sk-toggleable__label {background-color: #d4ebff;}#sk-container-id-1 div.sk-serial::before {content: \"\";position: absolute;border-left: 1px solid gray;box-sizing: 
" 156 | ], 157 | "text/plain": [ 158 | "GrootCV(fastshap=True,\n", 159 | " lgbm_params={'device': 'cpu', 'num_threads': 0, 'objective': 'rmse',\n", 160 | " 'verbosity': -1},\n", 161 | " n_folds=3, n_iter=3, objective='rmse')" 162 | ] 163 | }, 164 | "execution_count": 3, 165 | "metadata": {}, 166 | "output_type": "execute_result" 167 | } 168 | ], 169 | "source": [ 170 | "%%time\n", 171 | "feat_selector = GrootCV(\n", 172 | " objective=\"rmse\",\n", 173 | " cutoff=1,\n", 174 | " n_folds=3,\n", 175 | " n_iter=3,\n", 176 | " silent=True,\n", 177 | " fastshap=True,\n", 178 | " n_jobs=0,\n", 179 | " lgbm_params={\"device\": \"cpu\"},\n", 180 | ")\n", 181 | "feat_selector.fit(X_train, y_train, sample_weight=None)" 182 | ] 183 | }, 184 | { 185 | "cell_type": "code", 186 | "execution_count": 4, 187 | "metadata": {}, 188 | "outputs": [ 189 | { 190 | "name": "stdout", 191 | "output_type": "stream", 192 | "text": [ 193 | "The selected features: ['pred_7' 'pred_9' 'pred_15' 'pred_23' 'pred_27' 'pred_31' 'pred_35'\n", 194 | " 'pred_39' 'pred_41' 'pred_46' 'pred_48' 'pred_49' 'pred_52' 'pred_66'\n", 195 | " 'pred_71' 'pred_79' 'pred_85' 'pred_96' 'pred_99']\n", 196 | "The agnostic ranking: [1 1 1 1 1 1 1 2 1 2 1 1 1 1 1 2 1 1 1 1 1 1 1 2 1 1 1 2 1 1 1 2 1 1 1 2 1\n", 197 | " 1 1 2 1 2 1 1 1 1 2 1 2 2 1 1 2 1 1 1 1 1 1 1 1 1 1 1 1 1 2 1 1 1 1 2 1 1\n", 198 | " 1 1 1 1 1 2 1 1 1 1 1 2 1 1 1 1 1 1 1 1 1 1 2 1 1 2]\n", 199 | "The naive ranking: ['pred_7', 'pred_9', 'pred_31', 'pred_49', 'pred_41', 'pred_52', 'pred_71', 'pred_66', 'pred_27', 'pred_99', 'pred_23', 'pred_79', 'pred_39', 'pred_35', 'pred_85', 'pred_48', 'pred_46', 'pred_96', 'pred_15', 'pred_89', 'pred_21', 'pred_38', 'pred_32', 'pred_16', 'pred_69', 'pred_47', 'pred_50', 'pred_28', 'pred_60', 'pred_44', 'pred_67', 'pred_61', 'pred_34', 'pred_84', 'pred_17', 'pred_37', 'pred_29', 'pred_70', 'pred_5', 'pred_62', 'pred_19', 'pred_78', 'pred_59', 'pred_82', 'pred_64', 'pred_24', 'pred_92', 'pred_22', 'pred_80', 'pred_97', 'pred_95', 'pred_68', 'pred_58', 'pred_81', 'pred_91', 'pred_77', 'pred_53', 'pred_36', 'pred_10', 'pred_74', 'pred_45', 'pred_93', 'pred_30', 'pred_4', 'pred_65', 'pred_63', 'pred_76', 'pred_54', 'pred_43', 'pred_8', 'pred_56', 'pred_72', 'pred_0', 'pred_20', 'pred_11', 'pred_75', 'pred_83', 'pred_73', 'pred_18', 'pred_57', 'pred_14', 'pred_55', 'pred_12', 'pred_98', 'pred_88', 'pred_87', 'pred_26', 'pred_90', 'pred_42', 'pred_1', 'pred_33', 'pred_25', 'pred_94', 'pred_51', 'pred_2', 'pred_6', 'pred_40', 'pred_3', 'pred_13', 'pred_86']\n" 200 | ] 201 | } 202 | ], 203 | "source": [ 204 | "print(f\"The selected features: {feat_selector.get_feature_names_out()}\")\n", 205 | "print(f\"The agnostic ranking: {feat_selector.ranking_}\")\n", 206 | "print(f\"The naive ranking: {feat_selector.ranking_absolutes_}\")\n", 207 | "\n", 208 | "\n", 209 | "# fig = feat_selector.plot_importance(n_feat_per_inch=5)\n", 210 | "# # highlight synthetic random variable\n", 211 | "# for name in true_coef.index:\n", 212 | "# if name in genuine_predictors.index:\n", 213 | "# fig = highlight_tick(figure=fig, str_match=name, color=\"green\")\n", 214 | "# else:\n", 215 | "# fig = highlight_tick(figure=fig, str_match=name)\n", 216 | "\n", 217 | "# plt.show()" 218 | ] 219 | }, 220 | { 221 | "cell_type": "markdown", 222 | "metadata": {}, 223 | "source": [ 224 | "### Fastshap disable" 225 | ] 226 | }, 227 | { 228 | "cell_type": "code", 229 | "execution_count": 5, 230 | "metadata": {}, 231 | "outputs": [ 232 | { 233 | "data": { 234 | 
"application/vnd.jupyter.widget-view+json": { 235 | "model_id": "ecc5744cca034da7bd6a5a58e6f0dc34", 236 | "version_major": 2, 237 | "version_minor": 0 238 | }, 239 | "text/plain": [ 240 | "Repeated k-fold: 0%| | 0/9 [00:00#sk-container-id-2 {color: black;}#sk-container-id-2 pre{padding: 0;}#sk-container-id-2 div.sk-toggleable {background-color: white;}#sk-container-id-2 label.sk-toggleable__label {cursor: pointer;display: block;width: 100%;margin-bottom: 0;padding: 0.3em;box-sizing: border-box;text-align: center;}#sk-container-id-2 label.sk-toggleable__label-arrow:before {content: \"▸\";float: left;margin-right: 0.25em;color: #696969;}#sk-container-id-2 label.sk-toggleable__label-arrow:hover:before {color: black;}#sk-container-id-2 div.sk-estimator:hover label.sk-toggleable__label-arrow:before {color: black;}#sk-container-id-2 div.sk-toggleable__content {max-height: 0;max-width: 0;overflow: hidden;text-align: left;background-color: #f0f8ff;}#sk-container-id-2 div.sk-toggleable__content pre {margin: 0.2em;color: black;border-radius: 0.25em;background-color: #f0f8ff;}#sk-container-id-2 input.sk-toggleable__control:checked~div.sk-toggleable__content {max-height: 200px;max-width: 100%;overflow: auto;}#sk-container-id-2 input.sk-toggleable__control:checked~label.sk-toggleable__label-arrow:before {content: \"▾\";}#sk-container-id-2 div.sk-estimator input.sk-toggleable__control:checked~label.sk-toggleable__label {background-color: #d4ebff;}#sk-container-id-2 div.sk-label input.sk-toggleable__control:checked~label.sk-toggleable__label {background-color: #d4ebff;}#sk-container-id-2 input.sk-hidden--visually {border: 0;clip: rect(1px 1px 1px 1px);clip: rect(1px, 1px, 1px, 1px);height: 1px;margin: -1px;overflow: hidden;padding: 0;position: absolute;width: 1px;}#sk-container-id-2 div.sk-estimator {font-family: monospace;background-color: #f0f8ff;border: 1px dotted black;border-radius: 0.25em;box-sizing: border-box;margin-bottom: 0.5em;}#sk-container-id-2 div.sk-estimator:hover {background-color: #d4ebff;}#sk-container-id-2 div.sk-parallel-item::after {content: \"\";width: 100%;border-bottom: 1px solid gray;flex-grow: 1;}#sk-container-id-2 div.sk-label:hover label.sk-toggleable__label {background-color: #d4ebff;}#sk-container-id-2 div.sk-serial::before {content: \"\";position: absolute;border-left: 1px solid gray;box-sizing: border-box;top: 0;bottom: 0;left: 50%;z-index: 0;}#sk-container-id-2 div.sk-serial {display: flex;flex-direction: column;align-items: center;background-color: white;padding-right: 0.2em;padding-left: 0.2em;position: relative;}#sk-container-id-2 div.sk-item {position: relative;z-index: 1;}#sk-container-id-2 div.sk-parallel {display: flex;align-items: stretch;justify-content: center;background-color: white;position: relative;}#sk-container-id-2 div.sk-item::before, #sk-container-id-2 div.sk-parallel-item::before {content: \"\";position: absolute;border-left: 1px solid gray;box-sizing: border-box;top: 0;bottom: 0;left: 50%;z-index: -1;}#sk-container-id-2 div.sk-parallel-item {display: flex;flex-direction: column;z-index: 1;position: relative;background-color: white;}#sk-container-id-2 div.sk-parallel-item:first-child::after {align-self: flex-end;width: 50%;}#sk-container-id-2 div.sk-parallel-item:last-child::after {align-self: flex-start;width: 50%;}#sk-container-id-2 div.sk-parallel-item:only-child::after {width: 0;}#sk-container-id-2 div.sk-dashed-wrapped {border: 1px dashed gray;margin: 0 0.4em 0.5em 0.4em;box-sizing: border-box;padding-bottom: 0.4em;background-color: 
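The two timed `GrootCV` runs in this notebook differ only in the `fastshap` flag, so the comparison can be reproduced compactly by looping over both settings and recording wall-clock time. A minimal sketch, assuming `X_train` and `y_train` from the data-generation cell and reusing the exact `GrootCV` parameters shown here; the `timings` dictionary is an illustrative addition:

```python
# Time GrootCV with the fasttreeshap backend (fastshap=True) versus the
# regular shap backend (fastshap=False), all other parameters identical.
import time
from arfs.feature_selection import GrootCV

timings = {}
for fast in (True, False):
    selector = GrootCV(
        objective="rmse",
        cutoff=1,
        n_folds=3,
        n_iter=3,
        silent=True,
        fastshap=fast,
        n_jobs=0,
        lgbm_params={"device": "cpu"},
    )
    start = time.perf_counter()
    selector.fit(X_train, y_train, sample_weight=None)
    timings["fasttreeshap" if fast else "shap"] = time.perf_counter() - start

print(timings)
```

As stated in the introduction, the size of the gap depends on the data and the hardware (including virtualization), so the ratio observed here will not necessarily transfer to other machines.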
" 262 | ], 263 | "text/plain": [ 264 | "GrootCV(lgbm_params={'device': 'cpu', 'num_threads': 0, 'objective': 'rmse',\n", 265 | " 'verbosity': -1},\n", 266 | " n_folds=3, n_iter=3, objective='rmse')" 267 | ] 268 | }, 269 | "execution_count": 5, 270 | "metadata": {}, 271 | "output_type": "execute_result" 272 | } 273 | ], 274 | "source": [ 275 | "%%time\n", 276 | "feat_selector = GrootCV(\n", 277 | " objective=\"rmse\",\n", 278 | " cutoff=1,\n", 279 | " n_folds=3,\n", 280 | " n_iter=3,\n", 281 | " silent=True,\n", 282 | " fastshap=False,\n", 283 | " n_jobs=0,\n", 284 | " lgbm_params={\"device\": \"cpu\"},\n", 285 | ")\n", 286 | "feat_selector.fit(X_train, y_train, sample_weight=None)" 287 | ] 288 | }, 289 | { 290 | "cell_type": "code", 291 | "execution_count": 6, 292 | "metadata": {}, 293 | "outputs": [ 294 | { 295 | "name": "stdout", 296 | "output_type": "stream", 297 | "text": [ 298 | "The selected features: ['pred_7' 'pred_9' 'pred_15' 'pred_23' 'pred_27' 'pred_31' 'pred_35'\n", 299 | " 'pred_39' 'pred_41' 'pred_46' 'pred_48' 'pred_49' 'pred_52' 'pred_66'\n", 300 | " 'pred_71' 'pred_79' 'pred_85' 'pred_96' 'pred_99']\n", 301 | "The agnostic ranking: [1 1 1 1 1 1 1 2 1 2 1 1 1 1 1 2 1 1 1 1 1 1 1 2 1 1 1 2 1 1 1 2 1 1 1 2 1\n", 302 | " 1 1 2 1 2 1 1 1 1 2 1 2 2 1 1 2 1 1 1 1 1 1 1 1 1 1 1 1 1 2 1 1 1 1 2 1 1\n", 303 | " 1 1 1 1 1 2 1 1 1 1 1 2 1 1 1 1 1 1 1 1 1 1 2 1 1 2]\n", 304 | "The naive ranking: ['pred_7', 'pred_9', 'pred_31', 'pred_49', 'pred_41', 'pred_52', 'pred_71', 'pred_66', 'pred_27', 'pred_99', 'pred_23', 'pred_79', 'pred_39', 'pred_35', 'pred_85', 'pred_48', 'pred_46', 'pred_96', 'pred_15', 'pred_38', 'pred_32', 'pred_21', 'pred_89', 'pred_50', 'pred_5', 'pred_17', 'pred_29', 'pred_28', 'pred_69', 'pred_61', 'pred_84', 'pred_58', 'pred_67', 'pred_59', 'pred_68', 'pred_34', 'pred_97', 'pred_47', 'pred_60', 'pred_91', 'pred_75', 'pred_22', 'pred_10', 'pred_82', 'pred_16', 'pred_78', 'pred_42', 'pred_95', 'pred_80', 'pred_37', 'pred_2', 'pred_62', 'pred_76', 'pred_92', 'pred_20', 'pred_77', 'pred_19', 'pred_24', 'pred_63', 'pred_93', 'pred_44', 'pred_11', 'pred_53', 'pred_65', 'pred_33', 'pred_45', 'pred_14', 'pred_98', 'pred_57', 'pred_64', 'pred_30', 'pred_81', 'pred_83', 'pred_87', 'pred_25', 'pred_51', 'pred_70', 'pred_8', 'pred_36', 'pred_55', 'pred_0', 'pred_88', 'pred_43', 'pred_12', 'pred_4', 'pred_74', 'pred_72', 'pred_54', 'pred_1', 'pred_13', 'pred_73', 'pred_40', 'pred_56', 'pred_3', 'pred_26', 'pred_18', 'pred_94', 'pred_6', 'pred_86', 'pred_90']\n" 305 | ] 306 | } 307 | ], 308 | "source": [ 309 | "print(f\"The selected features: {feat_selector.get_feature_names_out()}\")\n", 310 | "print(f\"The agnostic ranking: {feat_selector.ranking_}\")\n", 311 | "print(f\"The naive ranking: {feat_selector.ranking_absolutes_}\")" 312 | ] 313 | } 314 | ], 315 | "metadata": { 316 | "kernelspec": { 317 | "display_name": "arfs", 318 | "language": "python", 319 | "name": "python3" 320 | }, 321 | "language_info": { 322 | "codemirror_mode": { 323 | "name": "ipython", 324 | "version": 3 325 | }, 326 | "file_extension": ".py", 327 | "mimetype": "text/x-python", 328 | "name": "python", 329 | "nbconvert_exporter": "python", 330 | "pygments_lexer": "ipython3", 331 | "version": "3.10.12" 332 | }, 333 | "orig_nbformat": 4 334 | }, 335 | "nbformat": 4, 336 | "nbformat_minor": 2 337 | } 338 | -------------------------------------------------------------------------------- /docs/notebooks/bender_hex_mini.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/ThomasBury/arfs/03f67d0a54b69fac5ddbb83e306c8e8e72e2d3a2/docs/notebooks/bender_hex_mini.png -------------------------------------------------------------------------------- /docs/requirements.txt: -------------------------------------------------------------------------------- 1 | lightgbm>=3.3.1 2 | matplotlib>=3.5 3 | numpy>=1.21 4 | pandas>=1.4 5 | scikit_learn>=1.0 6 | scipy>=1.8.0 7 | seaborn>=0.11.2 8 | shap>=0.40.0 9 | tqdm>=4.62.3 10 | statsmodels>=0.14.0 11 | ipykernel 12 | ipython_genutils 13 | pandoc 14 | sphinx 15 | sphinxawesome-theme==5.0.0b5 16 | nbsphinx==0.9.2 17 | sphinx-autodoc-typehints<1.24.0 18 | sphinx-copybutton==0.5.2 19 | sphinx-tabs==3.4.1 20 | fasttreeshap -------------------------------------------------------------------------------- /images/boostagroota-boston-lgb.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ThomasBury/arfs/03f67d0a54b69fac5ddbb83e306c8e8e72e2d3a2/images/boostagroota-boston-lgb.png -------------------------------------------------------------------------------- /images/grootcv-boston.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ThomasBury/arfs/03f67d0a54b69fac5ddbb83e306c8e8e72e2d3a2/images/grootcv-boston.png -------------------------------------------------------------------------------- /images/leshy-boston.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ThomasBury/arfs/03f67d0a54b69fac5ddbb83e306c8e8e72e2d3a2/images/leshy-boston.png -------------------------------------------------------------------------------- /images/leshy-titanic-catboost-shap.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ThomasBury/arfs/03f67d0a54b69fac5ddbb83e306c8e8e72e2d3a2/images/leshy-titanic-catboost-shap.png -------------------------------------------------------------------------------- /images/leshy-titanic-lgbm-shap.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ThomasBury/arfs/03f67d0a54b69fac5ddbb83e306c8e8e72e2d3a2/images/leshy-titanic-lgbm-shap.png -------------------------------------------------------------------------------- /images/leshy-titanic-rndforest-shap.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ThomasBury/arfs/03f67d0a54b69fac5ddbb83e306c8e8e72e2d3a2/images/leshy-titanic-rndforest-shap.png -------------------------------------------------------------------------------- /logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ThomasBury/arfs/03f67d0a54b69fac5ddbb83e306c8e8e72e2d3a2/logo.png -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [build-system] 2 | requires = ["setuptools>=64", "wheel"] 3 | build-backend = "setuptools.build_meta" 4 | 5 | [project] 6 | name = "arfs" 7 | description = "All Relevant Feature Selection and Maximal Relevant minimal redundancy FS" 8 | readme = "README.md" 9 | license = { text = "MIT" } 10 | authors = [ 11 | { name = "Thomas Bury", email = "bury.thomas@gmail.com" }, 12 | ] 13 | requires-python = ">=3.9, <3.13" 14 | dynamic = ["version"] 15 
| keywords = ["feature-selection", "all-relevant", "selection", "MRmr"] 16 | 17 | classifiers = [ 18 | "Programming Language :: Python :: 3", 19 | ] 20 | 21 | dependencies = [ 22 | "lightgbm>=4.6.0", 23 | "matplotlib>=3.9.4", 24 | "numpy>=2.0.2", 25 | "pandas>=2.2.3", 26 | "scikit-learn>=1.6.1", 27 | "scipy>=1.13.1", 28 | "seaborn>=0.13.2", 29 | "shap>=0.47.0", 30 | "statsmodels>=0.14.4", 31 | "tqdm>=4.67.1", 32 | ] 33 | 34 | [project.optional-dependencies] 35 | docs = [ 36 | "ipykernel", 37 | "ipython_genutils", 38 | "pandoc", 39 | "sphinx", 40 | "sphinxawesome-theme==5.0.0b5", 41 | "nbsphinx==0.9.2", 42 | "sphinx-autodoc-typehints<1.24.0", 43 | "sphinx-copybutton==0.5.2", 44 | "sphinx-tabs==3.4.1", 45 | # "fasttreeshap" 46 | ] 47 | 48 | test = [ 49 | "pytest", 50 | "pytest-cov" 51 | ] 52 | 53 | [project.urls] 54 | Documentation = "https://github.com/ThomasBury/arfs" 55 | Source = "https://github.com/ThomasBury/arfs" 56 | Tracker = "https://github.com/ThomasBury/arfs/issues" 57 | Download = "https://pypi.org/project/arfs/" 58 | 59 | [tool.setuptools] 60 | package-dir = { "" = "src" } 61 | zip-safe = false 62 | 63 | [tool.setuptools.packages.find] 64 | where = ["src"] 65 | 66 | [tool.setuptools.dynamic] 67 | version = { attr = "arfs.__version__" } 68 | 69 | [tool.setuptools.package-data] 70 | "arfs.dataset.data" = ["*.joblib", "*.zip"] 71 | "arfs.dataset.description" = ["*.rst"] 72 | -------------------------------------------------------------------------------- /src/arfs/.gitignore: -------------------------------------------------------------------------------- 1 | ### Python template 2 | # Byte-compiled / optimized / DLL files 3 | __pycache__/ 4 | *.py[cod] 5 | *$py.class 6 | 7 | # C extensions 8 | *.so 9 | 10 | # Distribution / packaging 11 | .Python 12 | build/ 13 | develop-eggs/ 14 | dist/ 15 | downloads/ 16 | eggs/ 17 | .eggs/ 18 | lib/ 19 | lib64/ 20 | parts/ 21 | sdist/ 22 | var/ 23 | wheels/ 24 | *.egg-info/ 25 | .installed.cfg 26 | *.egg 27 | MANIFEST 28 | 29 | # PyInstaller 30 | # Usually these files are written by a python script from a template 31 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
32 | *.manifest 33 | *.spec 34 | 35 | # Installer logs 36 | pip-log.txt 37 | pip-delete-this-directory.txt 38 | 39 | # Unit test / coverage reports 40 | htmlcov/ 41 | .tox/ 42 | .coverage 43 | .coverage.* 44 | .cache 45 | nosetests.xml 46 | coverage.xml 47 | *.cover 48 | .hypothesis/ 49 | .pytest_cache/ 50 | 51 | # Translations 52 | *.mo 53 | *.pot 54 | 55 | # Django stuff: 56 | *.log 57 | local_settings.py 58 | db.sqlite3 59 | 60 | # Flask stuff: 61 | instance/ 62 | .webassets-cache 63 | 64 | # Scrapy stuff: 65 | .scrapy 66 | 67 | # Sphinx documentation 68 | docs/_build/ 69 | 70 | # PyBuilder 71 | target/ 72 | 73 | # Jupyter Notebook 74 | .ipynb_checkpoints 75 | 76 | # pyenv 77 | .python-version 78 | 79 | # celery beat schedule file 80 | celerybeat-schedule 81 | 82 | # SageMath parsed files 83 | *.sage.py 84 | 85 | # Environments 86 | .env 87 | .venv 88 | env/ 89 | venv/ 90 | ENV/ 91 | env.bak/ 92 | venv.bak/ 93 | 94 | # Spyder project settings 95 | .spyderproject 96 | .spyproject 97 | 98 | # Rope project settings 99 | .ropeproject 100 | 101 | # mkdocs documentation 102 | /site 103 | 104 | # mypy 105 | .mypy_cache/ 106 | 107 | # Covers JetBrains IDEs: IntelliJ, RubyMine, PhpStorm, AppCode, PyCharm, CLion, Android Studio and WebStorm 108 | # Reference: https://intellij-support.jetbrains.com/hc/en-us/articles/206544839 109 | 110 | # User-specific stuff 111 | .idea/**/workspace.xml 112 | .idea/**/tasks.xml 113 | .idea/**/dictionaries 114 | .idea/**/shelf 115 | 116 | # Sensitive or high-churn files 117 | .idea/**/dataSources/ 118 | .idea/**/dataSources.ids 119 | .idea/**/dataSources.local.xml 120 | .idea/**/sqlDataSources.xml 121 | .idea/**/dynamic.xml 122 | .idea/**/uiDesigner.xml 123 | .idea/**/dbnavigator.xml 124 | 125 | # Gradle 126 | .idea/**/gradle.xml 127 | .idea/**/libraries 128 | 129 | # CMake 130 | cmake-build-debug/ 131 | cmake-build-release/ 132 | 133 | # Mongo Explorer plugin 134 | .idea/**/mongoSettings.xml 135 | 136 | # File-based project format 137 | *.iws 138 | 139 | # IntelliJ 140 | out/ 141 | 142 | # mpeltonen/sbt-idea plugin 143 | .idea_modules/ 144 | 145 | # JIRA plugin 146 | atlassian-ide-plugin.xml 147 | 148 | # Cursive Clojure plugin 149 | .idea/replstate.xml 150 | 151 | # Crashlytics plugin (for Android Studio and IntelliJ) 152 | com_crashlytics_export_strings.xml 153 | crashlytics.properties 154 | crashlytics-build.properties 155 | fabric.properties 156 | 157 | # Editor-based Rest Client 158 | .idea/httpRequests 159 | 160 | # catboost 161 | docs/notebooks/catboost_info -------------------------------------------------------------------------------- /src/arfs/__init__.py: -------------------------------------------------------------------------------- 1 | """init module, providing information about the arfs package""" 2 | 3 | __version__ = "3.0.0" 4 | -------------------------------------------------------------------------------- /src/arfs/benchmark.py: -------------------------------------------------------------------------------- 1 | """Benchmark Feature Selection 2 | 3 | This module provides utilities for comparing and benchmarking feature selection methods 4 | 5 | Module Structure: 6 | ----------------- 7 | - ``sklearn_pimp_bench``: function for comparing using the sklearn permutation importance 8 | - ``compare_varimp``: function for comparing using possible 4 kinds of variable importance 9 | - ``highlight_tick``: function for highlighting specific (genuine or noise for instance) predictors in the importance chart 10 | """ 11 | 12 | from __future__ import 
print_function, division 13 | 14 | import itertools 15 | from matplotlib import pyplot as plt 16 | from sklearn.model_selection import train_test_split 17 | from sklearn.inspection import permutation_importance 18 | 19 | from sklearn.base import clone 20 | 21 | from .preprocessing import OrdinalEncoderPandas 22 | 23 | 24 | def sklearn_pimp_bench(model, X, y, task="regression", sample_weight=None): 25 | """Benchmark using sklearn permutation importance, works for regression and classification. 26 | 27 | Parameters 28 | ---------- 29 | model: object 30 | An estimator that has not been fitted, sklearn compatible. 31 | X : ndarray or DataFrame, shape (n_samples, n_features) 32 | Data on which permutation importance will be computed. 33 | y : array-like or None, shape (n_samples, ) or (n_samples, n_classes) 34 | Targets for supervised or None for unsupervised. 35 | task : str, optional 36 | kind of task, either 'regression' or 'classification', by default 'regression' 37 | sample_weight : array-like of shape (n_samples,), optional 38 | Sample weights, by default None 39 | 40 | Returns 41 | ------- 42 | plt.figure 43 | the figure corresponding to the feature selection 44 | 45 | Raises 46 | ------ 47 | ValueError 48 | if task is not 'regression' or 'classification' 49 | """ 50 | 51 | # for lightGBM cat feat as contiguous int 52 | # https://lightgbm.readthedocs.io/en/latest/Advanced-Topics.html 53 | # same for Random Forest and XGBoost (OHE leads to deep and sparse trees). 54 | # For illustrations, see 55 | # https://towardsdatascience.com/one-hot-encoding-is-making- 56 | # your-tree-based-ensembles-worse-heres-why-d64b282b5769 57 | 58 | # X, cat_var_df, inv_mapper, mapper = cat_var(X) 59 | X = OrdinalEncoderPandas().fit_transform(X) 60 | 61 | if task == "regression": 62 | stratify = None 63 | elif task == "classification": 64 | stratify = y 65 | else: 66 | raise ValueError("`task` should be either 'regression' or 'classification' ") 67 | 68 | if sample_weight is not None: 69 | X_train, X_test, y_train, y_test, w_train, w_test = train_test_split( 70 | X, y, sample_weight, stratify=stratify, random_state=42 71 | ) 72 | else: 73 | X_train, X_test, y_train, y_test = train_test_split( 74 | X, y, stratify=stratify, random_state=42 75 | ) 76 | w_train, w_test = None, None 77 | 78 | # lightgbm faster and better than RF 79 | 80 | model.fit(X_train, y_train, sample_weight=w_train) 81 | result = permutation_importance( 82 | model, 83 | X_test, 84 | y_test, 85 | n_repeats=10, 86 | random_state=42, 87 | n_jobs=2, 88 | sample_weight=w_test, 89 | ) 90 | 91 | sorted_idx = result.importances_mean.argsort() 92 | # Plot (5 predictors per inch) 93 | fig, ax = plt.subplots(figsize=(16, X.shape[1] / 5)) 94 | ax.boxplot( 95 | result.importances[sorted_idx].T, vert=False, labels=X_test.columns[sorted_idx] 96 | ) 97 | ax.set_title("Permutation Importances (test set)") 98 | ax.tick_params(axis="both", which="major", labelsize=9) 99 | fig.tight_layout() 100 | indices = [i for i, s in enumerate(X_test.columns[sorted_idx]) if "random" in s] 101 | [fig.gca().get_yticklabels()[idx].set_color("red") for idx in indices] 102 | indices = [i for i, s in enumerate(X_test.columns[sorted_idx]) if "genuine" in s] 103 | [fig.gca().get_yticklabels()[idx].set_color("green") for idx in indices] 104 | plt.show() 105 | return fig 106 | 107 | 108 | def compare_varimp(feat_selector, models, X, y, sample_weight=None): 109 | """Utility function to compare the results for the three possible kind of feature importance 110 | 111 | Parameters 112 | 
---------- 113 | feat_selector : object 114 | an instance of either Leshy, BoostaGRoota or GrootCV 115 | models : list of objects 116 | list of tree based scikit-learn estimators 117 | X : pd.DataFrame, shape (n_samples, n_features) 118 | the predictors frame 119 | y : pd.Series 120 | the target (same length as X) 121 | sample_weight : None or pd.Series, optional 122 | sample weights if any, by default None 123 | """ 124 | 125 | varimp_list = ["shap", "pimp", "native"] 126 | for model, varimp in itertools.product(models, varimp_list): 127 | print( 128 | "=" * 20 129 | + " " 130 | + str(feat_selector.__class__.__name__) 131 | + " - testing: {mod:>25} for var.imp: {vimp:<15} ".format( 132 | mod=str(model.__class__.__name__), vimp=varimp 133 | ) 134 | + "=" * 20 135 | ) 136 | # change the varimp 137 | feat_selector.importance = varimp 138 | # change model 139 | mod_clone = clone(model, safe=True) 140 | feat_selector.estimator = mod_clone 141 | # fit the feature selector 142 | feat_selector.fit(X=X, y=y, sample_weight=sample_weight) 143 | # print the results 144 | print(feat_selector.selected_features_) 145 | fig = feat_selector.plot_importance(n_feat_per_inch=5) 146 | 147 | if fig is not None: 148 | # highlight synthetic random variable 149 | fig = highlight_tick(figure=fig, str_match="random") 150 | fig = highlight_tick(figure=fig, str_match="genuine", color="green") 151 | plt.show() 152 | 153 | 154 | def highlight_tick(str_match, figure, color="red", axis="y"): 155 | """Highlight the x/y tick-labels if they contain a given string 156 | 157 | Parameters 158 | ---------- 159 | str_match : str 160 | the substring to match 161 | figure : object 162 | the matplotlib figure 163 | color : str, optional 164 | the matplotlib color for highlighting tick-labels, by default 'red' 165 | axis : str, optional 166 | axis to use for highlighting, by default 'y' 167 | 168 | Returns 169 | ------- 170 | plt.figure 171 | the modified matplotlib figure 172 | 173 | Raises 174 | ------ 175 | ValueError 176 | if axis is not 'x' or 'y' 177 | """ 178 | 179 | if axis == "y": 180 | labels = [item.get_text() for item in figure.gca().get_yticklabels()] 181 | indices = [i for i, s in enumerate(labels) if str_match in s] 182 | [figure.gca().get_yticklabels()[idx].set_color(color) for idx in indices] 183 | elif axis == "x": 184 | labels = [item.get_text() for item in figure.gca().get_xticklabels()] 185 | indices = [i for i, s in enumerate(labels) if str_match in s] 186 | [figure.gca().get_xticklabels()[idx].set_color(color) for idx in indices] 187 | else: 188 | raise ValueError("`axis` should be a string, either 'y' or 'x'") 189 | 190 | return figure 191 | -------------------------------------------------------------------------------- /src/arfs/dataset/data/boston_bunch.joblib: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ThomasBury/arfs/03f67d0a54b69fac5ddbb83e306c8e8e72e2d3a2/src/arfs/dataset/data/boston_bunch.joblib -------------------------------------------------------------------------------- /src/arfs/dataset/data/housing.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ThomasBury/arfs/03f67d0a54b69fac5ddbb83e306c8e8e72e2d3a2/src/arfs/dataset/data/housing.zip -------------------------------------------------------------------------------- /src/arfs/dataset/descr/housing.rst: -------------------------------------------------------------------------------- 1 | description: 2 | .. 
_california_housing_dataset: 3 | 4 | California Housing dataset 5 | -------------------------- 6 | 7 | **Data Set Characteristics:** 8 | 9 | :Number of Instances: 20640 10 | 11 | :Number of Attributes: 8 numeric, predictive attributes and the target 12 | 13 | :Attribute Information: 14 | - MedInc median income in block 15 | - HouseAge median house age in block 16 | - AveRooms average number of rooms 17 | - AveBedrms average number of bedrooms 18 | - Population block population 19 | - AveOccup average house occupancy 20 | - Latitude house block latitude 21 | - Longitude house block longitude 22 | 23 | :Missing Attribute Values: None 24 | 25 | This dataset was obtained from the StatLib repository. 26 | http://lib.stat.cmu.edu/datasets/ 27 | 28 | The target variable is the median house value for California districts. 29 | 30 | This dataset was derived from the 1990 U.S. census, using one row per census 31 | block group. A block group is the smallest geographical unit for which the U.S. 32 | Census Bureau publishes sample data (a block group typically has a population 33 | of 600 to 3,000 people). 34 | 35 | It can be downloaded/loaded using the 36 | :func:`sklearn.datasets.fetch_california_housing` function. 37 | 38 | .. topic:: References 39 | 40 | - Pace, R. Kelley and Ronald Barry, Sparse Spatial Autoregressions, 41 | Statistics and Probability Letters, 33 (1997) 291-297 -------------------------------------------------------------------------------- /src/arfs/feature_selection/__init__.py: -------------------------------------------------------------------------------- 1 | from .allrelevant import Leshy, BoostAGroota, GrootCV 2 | from .unsupervised import ( 3 | MissingValueThreshold, 4 | UniqueValuesThreshold, 5 | CardinalityThreshold, 6 | CollinearityThreshold, 7 | ) 8 | 9 | from .lasso import LassoFeatureSelection 10 | from .variable_importance import VariableImportance 11 | from .summary import make_fs_summary 12 | from .mrmr import MinRedundancyMaxRelevance 13 | 14 | __all__ = [ 15 | "BaseThresholdSelector", 16 | "MissingValueThreshold", 17 | "UniqueValuesThreshold", 18 | "CardinalityThreshold", 19 | "CollinearityThreshold", 20 | "VariableImportance", 21 | "make_fs_summary", 22 | "Leshy", 23 | "BoostAGroota", 24 | "GrootCV", 25 | "MinRedundancyMaxRelevance", 26 | "LassoFeatureSelection", 27 | ] 28 | -------------------------------------------------------------------------------- /src/arfs/feature_selection/base.py: -------------------------------------------------------------------------------- 1 | """Base Submodule 2 | 3 | This module provides a base class for selector using a statistic and a threshold 4 | 5 | Module Structure: 6 | ----------------- 7 | - ``BaseThresholdSelector``: parent class for the "treshold-based" selectors 8 | 9 | """ 10 | 11 | # Settings and libraries 12 | from __future__ import print_function 13 | 14 | # pandas 15 | import pandas as pd 16 | 17 | # numpy 18 | import numpy as np 19 | 20 | # sklearn 21 | 22 | from sklearn.utils.validation import check_is_fitted 23 | from sklearn.base import BaseEstimator 24 | from sklearn.feature_selection._base import SelectorMixin 25 | 26 | 27 | # fix random seed for reproducibility 28 | np.random.seed(7) 29 | 30 | 31 | class BaseThresholdSelector(SelectorMixin, BaseEstimator): 32 | """Base class for threshold-based feature selection 33 | 34 | Parameters 35 | ---------- 36 | threshold : float, .05 37 | Features with a training-set missing greater/lower (geq/leq) than this threshold will be removed 38 | statistic_fn : callable, 
optional 39 | The function for computing the statistic series. The index should be the column names and the 40 | the values the computed statistic 41 | greater_than_threshold : bool, False 42 | Whether or not to reject the features if lower or greater than threshold 43 | 44 | Returns 45 | ------- 46 | selected_features: list of str 47 | List of selected features. 48 | 49 | Attributes 50 | ---------- 51 | n_features_in_ : int 52 | number of input predictors 53 | support_ : list of bool 54 | the list of the selected X-columns 55 | selected_features_ : list of str 56 | the list of names of selected features 57 | not_selected_features_ : list of str 58 | the list of names of rejected features 59 | 60 | """ 61 | 62 | def __init__( 63 | self, 64 | threshold=0.05, 65 | statistic_fn=None, 66 | greater_than_threshold=False, 67 | ): 68 | self.threshold = threshold 69 | self.statistic_fn = statistic_fn 70 | self.greater_than_threshold = greater_than_threshold 71 | 72 | def fit(self, X, y=None, sample_weight=None): 73 | """Learn empirical statistics from X. 74 | 75 | Parameters 76 | ---------- 77 | X : pd.DataFrame, shape (n_samples, n_features) 78 | Data from which to compute variances, where `n_samples` is 79 | the number of samples and `n_features` is the number of features. 80 | y : any, default=None 81 | Ignored. This parameter exists only for compatibility with 82 | sklearn.pipeline.Pipeline. 83 | sample_weight : pd.Series, optional, shape (n_samples,) 84 | weights for computing the statistics (e.g. weighted average) 85 | 86 | Returns 87 | ------- 88 | self : object 89 | Returns the instance itself. 90 | """ 91 | 92 | # Calculate the fraction of missing in each column 93 | 94 | if isinstance(X, pd.DataFrame): 95 | self.feature_names_in_ = X.columns.to_numpy() 96 | else: 97 | raise TypeError("X is not a dataframe") 98 | 99 | self.statistic_series_ = self.statistic_fn(X) 100 | self.statistic_df_ = pd.DataFrame(self.statistic_series_).rename( 101 | columns={"index": "feature", 0: "statistic"} 102 | ) 103 | 104 | # Sort with highest number of missing values on top 105 | self.statistic_df_ = self.statistic_df_.sort_values( 106 | "statistic", ascending=False 107 | ) 108 | if self.greater_than_threshold: 109 | self.support_ = self.statistic_series_.values > self.threshold 110 | else: 111 | self.support_ = self.statistic_series_.values < self.threshold 112 | 113 | self.selected_features_ = self.feature_names_in_[self.support_] 114 | self.not_selected_features_ = self.feature_names_in_[~self.support_] 115 | 116 | return self 117 | 118 | def _get_support_mask(self): 119 | check_is_fitted(self) 120 | 121 | return self.support_ 122 | 123 | def transform(self, X): 124 | """ 125 | Transform the data, returns a transformed version of `X`. 126 | 127 | Parameters 128 | ---------- 129 | X : array-like of shape (n_samples, n_features) 130 | Input samples. 131 | 132 | Returns 133 | ------- 134 | X_new : ndarray array of shape (n_samples, n_features_new) 135 | Transformed array. 136 | """ 137 | if not isinstance(X, pd.DataFrame): 138 | raise TypeError("X is not a dataframe") 139 | return X[self.selected_features_] 140 | 141 | def fit_transform(self, X, y=None, sample_weight=None, **fit_params): 142 | """ 143 | Fit to data, then transform it. 144 | Fits transformer to `X` and `y` with optional parameters `fit_params` 145 | and returns a transformed version of `X`. 146 | Parameters 147 | ---------- 148 | X : array-like of shape (n_samples, n_features) 149 | Input samples. 
150 | y : array-like of shape (n_samples,) or (n_samples, n_outputs), \ 151 | default=None 152 | Target values (None for unsupervised transformations). 153 | sample_weight : array-like of shape (n_samples,) or (n_samples, n_outputs), \ 154 | default=None 155 | sample weight values. 156 | **fit_params : dict 157 | Additional fit parameters. 158 | Returns 159 | ------- 160 | X_new : ndarray array of shape (n_samples, n_features_new) 161 | Transformed array. 162 | """ 163 | return self.fit(X=X, y=y, sample_weight=sample_weight, **fit_params).transform( 164 | X 165 | ) 166 | 167 | def _more_tags(self): 168 | return {"allow_nan": True} 169 | -------------------------------------------------------------------------------- /src/arfs/feature_selection/mrmr.py: -------------------------------------------------------------------------------- 1 | """MRMR Feature Selection Module 2 | 3 | This module provides MinRedundancyMaxRelevance (MRMR) feature selection for classification or regression tasks. 4 | In a classification task, the target should be of object or pandas category dtype, while in a regression task, 5 | the target should be numeric. The predictors can be categorical or numerical without requiring encoding, 6 | as the appropriate method (correlation, correlation ratio, or Theil's U) will be automatically selected based on the data type. 7 | 8 | Module Structure: 9 | ----------------- 10 | - ``MinRedundancyMaxRelevance``: MRMR feature selection class for classification or regression tasks. 11 | """ 12 | 13 | import functools 14 | import numpy as np 15 | import pandas as pd 16 | from sklearn.base import BaseEstimator 17 | from sklearn.utils.validation import check_is_fitted 18 | from tqdm.auto import tqdm 19 | from sklearn.feature_selection._base import SelectorMixin 20 | from ..association import ( 21 | f_stat_classification_parallel, 22 | f_stat_regression_parallel, 23 | association_series, 24 | ) 25 | 26 | FLOOR = 0.001 27 | 28 | 29 | class MinRedundancyMaxRelevance(SelectorMixin, BaseEstimator): 30 | """MRMR feature selection for a classification or a regression task 31 | For a classification task, the target should be of object or pandas category 32 | dtype. For a regression task, the target should be of numeric dtype. 33 | The predictors can be categorical or numerical; no encoding is required. 34 | The dtype will be automatically detected and the right method applied (either 35 | correlation, correlation ratio or Theil's U) 36 | 37 | 38 | Parameters 39 | ---------- 40 | n_features_to_select: int 41 | Number of features to select. 42 | relevance_func: callable, optional 43 | relevance function having arguments "X", "y", "sample_weight" and returning a pd.Series 44 | containing a score of relevance for each feature 45 | redundancy_func: callable, optional 46 | Redundancy method. 47 | If callable, it should take "X", "sample_weight" as input and return a pandas.Series 48 | containing a score of redundancy for each feature. 49 | denominator_func: str or callable (optional, default='mean') 50 | Synthesis function to apply to the denominator of MRMR score. 51 | If string, name of method. Supported: 'max', 'mean'. 52 | If callable, it should take an iterable as input and return a scalar. 53 | task: str 54 | either "regression" or "classification" 55 | only_same_domain: bool (optional, default=False) 56 | If False, all the necessary correlation coefficients are computed. 57 | If True, only features belonging to the same domain are compared.
58 | Domain is defined by the string preceding the first underscore: 59 | for instance "cusinfo_age" and "cusinfo_income" belong to the same domain, whereas "age" and "income" don't. 60 | return_scores: bool (optional, default=False) 61 | If False, only the list of selected features is returned. 62 | If True, a tuple containing (list of selected features, relevance, redundancy) is returned. 63 | n_jobs: int (optional, default=1) 64 | Maximum number of workers to use. Only used when relevance = "f" or redundancy = "corr". 65 | If -1, use as many workers as min(cpu count, number of features). 66 | show_progress: bool (optional, default=True) 67 | If False, no progress bar is displayed. 68 | If True, a TQDM progress bar shows the number of features processed. 69 | 70 | Returns 71 | ------- 72 | selected_features: list of str 73 | List of selected features. 74 | 75 | Attributes 76 | ---------- 77 | n_features_in_ : int 78 | number of input predictors 79 | ranking_ : pd.DataFrame 80 | name and scores for the selected features 81 | support_ : list of bool 82 | the list of the selected X-columns 83 | Example 84 | ------- 85 | >>> from sklearn.datasets import make_classification, make_regression 86 | >>> X, y = make_regression(n_samples = 1000, n_features = 50, n_informative = 5, shuffle=False) # , n_redundant = 5 87 | >>> X = pd.DataFrame(X) 88 | >>> y = pd.Series(y) 89 | >>> pred_name = [f"pred_{i}" for i in range(X.shape[1])] 90 | >>> X.columns = pred_name 91 | >>> y.name = "target" 92 | >>> fs_mrmr = MinRedundancyMaxRelevance( 93 | >>> n_features_to_select=5, 94 | >>> relevance_func=None, 95 | >>> redundancy_func=None, 96 | >>> task="regression", #"classification", 97 | >>> denominator_func=np.mean, 98 | >>> only_same_domain=False, 99 | >>> return_scores=False, 100 | >>> show_progress=True) 101 | >>> #fs_mrmr.fit(X=X, y=y.astype(str), sample_weight=None) 102 | >>> fs_mrmr.fit(X=X, y=y, sample_weight=None) 103 | """ 104 | 105 | def __init__( 106 | self, 107 | n_features_to_select, 108 | relevance_func=None, 109 | redundancy_func=None, 110 | task="regression", 111 | denominator_func=np.mean, 112 | only_same_domain=False, 113 | return_scores=False, 114 | n_jobs=1, 115 | show_progress=True, 116 | ): 117 | self.n_features_to_select = n_features_to_select 118 | self.relevance_func = relevance_func 119 | self.redundancy_func = redundancy_func 120 | self.denominator_func = denominator_func 121 | self.only_same_domain = only_same_domain 122 | self.return_scores = return_scores 123 | self.show_progress = show_progress 124 | self.n_jobs = n_jobs 125 | self.task = task 126 | 127 | if self.relevance_func is None: 128 | if self.task == "regression": 129 | self.relevance_func = functools.partial( 130 | f_stat_regression_parallel, n_jobs=self.n_jobs 131 | ) 132 | else: 133 | self.relevance_func = functools.partial( 134 | f_stat_classification_parallel, n_jobs=self.n_jobs 135 | ) 136 | 137 | if self.redundancy_func is None: 138 | self.redundancy_func = functools.partial( 139 | association_series, n_jobs=self.n_jobs, normalize=True 140 | ) 141 | 142 | def fit(self, X, y, sample_weight=None): 143 | """fit the MRmr selector by learning the associations 144 | 145 | Parameters 146 | ---------- 147 | X : pd.DataFrame, shape (n_samples, n_features) 148 | Data from which to compute variances, where `n_samples` is 149 | the number of samples and `n_features` is the number of features. 150 | y : array-like or pd.Series of shape (n_samples,) 151 | Target vector. 
Must be numeric for regression or categorical for classification. 152 | sample_weight : pd.Series, optional, shape (n_samples,) 153 | weights for computing the statistics (e.g. weighted average) 154 | 155 | Returns 156 | ------- 157 | self : object 158 | If `return_scores=False`, returns self. 159 | If `return_scores=True`, returns (selected_features, relevance_scores). 160 | """ 161 | 162 | if isinstance(X, pd.DataFrame): 163 | self.feature_names_in_ = X.columns.to_numpy() 164 | else: 165 | raise TypeError("X is not a pd.DataFrame") 166 | 167 | if not isinstance(y, pd.Series): 168 | y = pd.Series(y) 169 | 170 | y.name = "target" 171 | 172 | target = y.copy() 173 | if self.task == "classification": 174 | target = target.astype("category") 175 | 176 | self.relevance_args = {"X": X, "y": target, "sample_weight": sample_weight} 177 | self.redundancy_args = {"X": X, "sample_weight": sample_weight} 178 | 179 | self.relevance = self.relevance_func(**self.relevance_args) 180 | self.features = self.relevance[~self.relevance.isna()].index.to_list() 181 | self.relevance = self.relevance.loc[self.features] 182 | self.redundancy = pd.DataFrame( 183 | FLOOR, index=self.features, columns=self.features 184 | ) 185 | self.n_features_to_select = min(self.n_features_to_select, len(self.features)) 186 | 187 | if isinstance(X, pd.DataFrame): 188 | self.feature_names_in_ = X.columns.to_numpy() 189 | 190 | self.n_features_in_ = len(self.features) 191 | 192 | self.selected_features = [] 193 | self.not_selected_features = self.features.copy() 194 | self.ranking_ = pd.Series( 195 | dtype="float64" 196 | ) # pd.DataFrame(columns=['var_name', 'mrmr', 'relevancy', 'redundancy']) 197 | self.redundancy_ = pd.Series(dtype="float64") 198 | self.run_feature_selection() 199 | 200 | # store the output in the sklearn flavour 201 | self.relevance_ = self.relevance 202 | self.ranking_ = pd.concat( 203 | [self.ranking_, self.relevance_, self.redundancy_], axis=1 204 | ) 205 | self.ranking_.columns = ["mrmr", "relevance", "redundancy"] 206 | self.ranking_ = self.ranking_.iloc[: self.n_features_to_select, :] 207 | 208 | # Set back the mrmr score to Inf for the first selected feature to avoid dividing by zero 209 | self.ranking_.iloc[0, 0] = float("Inf") 210 | 211 | self.selected_features_ = self.selected_features 212 | self.support_ = np.asarray( 213 | [x in self.selected_features for x in self.feature_names_in_] 214 | ) 215 | self.not_selected_features_ = self.not_selected_features 216 | 217 | if self.return_scores: 218 | return self.selected_features_, self.relevance_, self.redundancy_ 219 | return self 220 | 221 | def transform(self, X): 222 | """ 223 | Transform the data, returns a transformed version of `X`. 224 | 225 | Parameters 226 | ---------- 227 | X : array-like of shape (n_samples, n_features) 228 | Input samples. 229 | 230 | Returns 231 | ------- 232 | X_new : ndarray array of shape (n_samples, n_features_new) 233 | Transformed array. 234 | """ 235 | if not isinstance(X, pd.DataFrame): 236 | raise TypeError("X is not a dataframe") 237 | return X[self.selected_features_] 238 | 239 | def fit_transform(self, X, y, sample_weight=None, **fit_params): 240 | """ 241 | Fit to data, then transform it. 242 | Fits transformer to `X` and `y` and optionally sample_weight 243 | with optional parameters `fit_params` 244 | and returns a transformed version of `X`. 245 | 246 | Parameters 247 | ---------- 248 | X : array-like of shape (n_samples, n_features) 249 | Input samples. 
250 | y : array-like of shape (n_samples,) or (n_samples, n_outputs), \ 251 | default=None 252 | Target values (None for unsupervised transformations). 253 | sample_weight : array-like of shape (n_samples,) or (n_samples, n_outputs), \ 254 | default=None 255 | sample weight values. 256 | **fit_params : dict 257 | Additional fit parameters. 258 | 259 | Returns 260 | ------- 261 | X_new : ndarray array of shape (n_samples, n_features_new) 262 | Transformed array. 263 | """ 264 | return self.fit(X=X, y=y, sample_weight=sample_weight).transform(X) 265 | 266 | def _get_support_mask(self): 267 | check_is_fitted(self) 268 | 269 | return self.support_ 270 | 271 | def _more_tags(self): 272 | return {"allow_nan": True} 273 | 274 | def select_next_feature( 275 | self, not_selected_features, selected_features, relevance, redundancy 276 | ): 277 | score_numerator = relevance.loc[not_selected_features] 278 | 279 | if len(selected_features) > 0: 280 | last_selected_feature = selected_features[-1] 281 | 282 | if self.only_same_domain: 283 | not_selected_features_sub = [ 284 | c 285 | for c in not_selected_features 286 | if c.split("_")[0] == last_selected_feature.split("_")[0] 287 | ] 288 | else: 289 | not_selected_features_sub = not_selected_features 290 | 291 | if not_selected_features_sub: 292 | redundancy.loc[not_selected_features_sub, last_selected_feature] = ( 293 | self.redundancy_func( 294 | target=last_selected_feature, 295 | features=not_selected_features_sub, 296 | **self.redundancy_args, 297 | ) 298 | .fillna(FLOOR) 299 | .abs() 300 | .clip(FLOOR) 301 | ) 302 | score_denominator = ( 303 | redundancy.loc[not_selected_features, selected_features] 304 | .apply(self.denominator_func, axis=1) 305 | .replace(1.0, float("Inf")) 306 | ) 307 | 308 | else: 309 | score_denominator = pd.Series(1, index=self.features) 310 | 311 | else: 312 | score_denominator = pd.Series(1, index=self.features) 313 | 314 | score = score_numerator / score_denominator 315 | score = score.sort_values(ascending=False) 316 | best_feature = score.index[score.argmax()] 317 | 318 | return best_feature, score, score_denominator 319 | 320 | def update_ranks(self, best_feature, score, score_denominator): 321 | self.ranking_ = pd.concat( 322 | [ 323 | self.ranking_, 324 | pd.Series({best_feature: score.loc[best_feature]}, dtype="float64"), 325 | ] 326 | ) 327 | self.redundancy_ = pd.concat( 328 | [ 329 | self.redundancy_, 330 | pd.Series( 331 | {best_feature: score_denominator.loc[best_feature]}, 332 | dtype="float64", 333 | ), 334 | ] 335 | ) 336 | # the first selected feature has a default denominator (redundancy) = 1 to avoid dividing by zero 337 | # I set it back to zero 338 | self.redundancy_ = self.redundancy_.replace(1.0, 0.0) 339 | self.selected_features.append(best_feature) 340 | self.not_selected_features.remove(best_feature) 341 | 342 | def run_feature_selection(self): 343 | for i in tqdm(range(self.n_features_to_select), disable=not self.show_progress): 344 | best_feature, score, score_denominator = self.select_next_feature( 345 | self.not_selected_features, 346 | self.selected_features, 347 | self.relevance, 348 | self.redundancy, 349 | ) 350 | self.update_ranks(best_feature, score, score_denominator) 351 | -------------------------------------------------------------------------------- /src/arfs/feature_selection/summary.py: -------------------------------------------------------------------------------- 1 | """Feature Selection Summary Module 2 | 3 | This module provides a function for creating the summary report of a 
FS pipeline 4 | 5 | Module Structure: 6 | ----------------- 7 | - ``make_fs_summary`` main function for creating the summary 8 | - ``highlight_discarded`` function for creating style for the pd.DataFrame 9 | """ 10 | 11 | import pandas as pd 12 | import numpy as np 13 | 14 | 15 | def highlight_discarded(s): 16 | """highlight X in red and V in green. 17 | 18 | Parameters 19 | ---------- 20 | s : array-like of shape (n_features,) 21 | the boolean array for defining the style 22 | 23 | 24 | """ 25 | is_X = s == 0 26 | return [ 27 | "background-color: #ba0202" if v else "background-color: #0c8a30" for v in is_X 28 | ] 29 | 30 | 31 | def make_fs_summary(selector_pipe): 32 | """make_fs_summary makes a summary dataframe highlighting at which step a 33 | given predictor has been rejected (if any). 34 | 35 | Parameters 36 | ---------- 37 | selector_pipe : sklearn.pipeline.Pipeline 38 | the feature selector pipeline. 39 | 40 | Examples 41 | -------- 42 | >>> groot_pipeline = Pipeline([ 43 | ... ('missing', MissingValueThreshold()), 44 | ... ('unique', UniqueValuesThreshold()), 45 | ... ('cardinality', CardinalityThreshold()), 46 | ... ('collinearity', CollinearityThreshold(threshold=0.5)), 47 | ... ('lowimp', VariableImportance(eval_metric='poisson', objective='poisson', verbose=2)), 48 | ... ('grootcv', GrootCV(objective='poisson', cutoff=1, n_folds=3, n_iter=5))]) 49 | >>> groot_pipeline.fit_transform( 50 | X=df[predictors], 51 | y=df[target], 52 | lowimp__sample_weight=df[weight], 53 | grootcv__sample_weight=df[weight]) 54 | >>> fs_summary_df = make_fs_summary(groot_pipeline) 55 | """ 56 | tag_df = pd.DataFrame({"predictor": selector_pipe[0].feature_names_in_}) 57 | for selector_name, selector in selector_pipe.named_steps.items(): 58 | if hasattr(selector, "support_"): 59 | feature_in = selector.feature_names_in_ 60 | to_drop = list(set(feature_in) - set(selector.get_feature_names_out())) 61 | tag_df[selector_name] = np.where( 62 | tag_df["predictor"].isin(to_drop), 0, 1 63 | ) * np.where(tag_df["predictor"].isin(feature_in), 1, np.nan) 64 | else: 65 | tag_df[selector_name] = np.nan 66 | 67 | style = ( 68 | tag_df.style.apply(highlight_discarded, subset=tag_df.columns[1:]) 69 | .applymap(lambda x: "" if x == x else "background-color: #f57505") 70 | .format(precision=0) 71 | ) 72 | 73 | return style 74 | -------------------------------------------------------------------------------- /src/arfs/feature_selection/unsupervised.py: -------------------------------------------------------------------------------- 1 | """Unsupervised Feature Selection 2 | 3 | This module provides selectors using unsupervised statistics and a threshold 4 | 5 | Module Structure: 6 | ----------------- 7 | - ``MissingValueThreshold``: child class of the ``BaseThresholdSelector``, filter out columns with too many missing values 8 | - ``UniqueValuesThreshold`` child of the ``BaseThresholdSelector``, filter out columns with zero variance 9 | - ``CardinalityThreshold`` child of the ``BaseThresholdSelector``, filter out categorical columns with too many levels 10 | - ``CollinearityThreshold`` child of the ``BaseThresholdSelector``, filter out collinear columns 11 | """ 12 | 13 | from __future__ import print_function 14 | from tqdm.auto import trange 15 | 16 | # pandas 17 | import pandas as pd 18 | 19 | # numpy 20 | import numpy as np 21 | 22 | # sklearn 23 | from sklearn.utils.validation import check_is_fitted 24 | from sklearn.base import BaseEstimator 25 | from sklearn.feature_selection._base import SelectorMixin 26 | 27 | # 
ARFS 28 | from .base import BaseThresholdSelector 29 | from ..utils import create_dtype_dict 30 | from ..association import ( 31 | association_matrix, 32 | xy_to_matrix, 33 | plot_association_matrix, 34 | weighted_theils_u, 35 | weighted_corr, 36 | correlation_ratio, 37 | ) 38 | from ..preprocessing import OrdinalEncoderPandas 39 | 40 | 41 | # fix random seed for reproducibility 42 | np.random.seed(7) 43 | 44 | 45 | def _missing_ratio(df): 46 | if not isinstance(df, pd.DataFrame): 47 | raise TypeError("df should be a pandas DataFrame") 48 | numeric_columns = df.select_dtypes(np.number).columns 49 | n_samples = len(df) 50 | 51 | missing_counts = {} 52 | for column in df.columns: 53 | if column in numeric_columns: 54 | missing_counts[column] = ( 55 | df[column].isnull().sum() + np.isinf(df[column]).sum() 56 | ) / n_samples 57 | else: 58 | missing_counts[column] = df[column].isnull().sum() / n_samples 59 | return pd.Series(missing_counts) 60 | 61 | 62 | class MissingValueThreshold(BaseThresholdSelector): 63 | """Feature selector that removes all high missing percentage features. 64 | This feature selection algorithm looks only at the features (X), 65 | not the desired outputs (y), and can thus be used for unsupervised learning. 66 | 67 | 68 | Parameters 69 | ---------- 70 | threshold: float, default = .05 71 | Features with a training-set missing larger than this threshold will be removed. 72 | 73 | Returns 74 | ------- 75 | selected_features: list of str 76 | List of selected features. 77 | 78 | Attributes 79 | ---------- 80 | n_features_in_ : int 81 | number of input predictors 82 | support_ : list of bool 83 | the list of the selected X-columns 84 | selected_features_ : list of str 85 | the list of names of selected features 86 | not_selected_features_ : list of str 87 | the list of names of rejected features 88 | 89 | Example 90 | ------- 91 | >>> from sklearn.datasets import make_classification, make_regression 92 | >>> X, y = make_regression(n_samples = 1000, n_features = 50, n_informative = 5, shuffle=False) # , n_redundant = 5 93 | >>> X = pd.DataFrame(X) 94 | >>> y = pd.Series(y) 95 | >>> pred_name = [f"pred_{i}" for i in range(X.shape[1])] 96 | >>> X.columns = pred_name 97 | >>> selector = MissingValueThreshold(0.05) 98 | >>> selector.fit_transform(X) 99 | """ 100 | 101 | def __init__(self, threshold=0.05): 102 | super().__init__( 103 | threshold=threshold, 104 | statistic_fn=_missing_ratio, 105 | greater_than_threshold=False, 106 | ) 107 | 108 | 109 | def _pandas_count_unique_values(X): 110 | if not isinstance(X, pd.DataFrame): 111 | raise TypeError("X should be a pandas DataFrame") 112 | return X.nunique() 113 | 114 | 115 | class UniqueValuesThreshold(BaseThresholdSelector): 116 | """Feature selector that removes all features with zero variance (single unique values) 117 | or remove columns with less unique values than threshold 118 | This feature selection algorithm looks only at the features (X), 119 | not the desired outputs (y), and can thus be used for unsupervised learning. 120 | 121 | Parameters 122 | ---------- 123 | threshold: int, default = 1 124 | Features with a training-set missing larger than this threshold will be removed. 125 | The thresold should be >= 1 126 | 127 | Returns 128 | ------- 129 | selected_features: list of str 130 | List of selected features. 
131 | 132 | Attributes 133 | ---------- 134 | n_features_in_ : int 135 | number of input predictors 136 | support_ : list of bool 137 | the list of the selected X-columns 138 | selected_features_ : list of str 139 | the list of names of selected features 140 | not_selected_features_ : list of str 141 | the list of names of rejected features 142 | 143 | Example 144 | ------- 145 | >>> from sklearn.datasets import make_classification, make_regression 146 | >>> X, y = make_regression(n_samples = 1000, n_features = 50, n_informative = 5, shuffle=False) # , n_redundant = 5 147 | >>> X = pd.DataFrame(X) 148 | >>> y = pd.Series(y) 149 | >>> pred_name = [f"pred_{i}" for i in range(X.shape[1])] 150 | >>> X.columns = pred_name 151 | >>> selector = UniqueValuesThreshold(1) 152 | >>> selector.fit_transform(X) 153 | """ 154 | 155 | def __init__(self, threshold=1): 156 | super().__init__( 157 | threshold=threshold, 158 | statistic_fn=_pandas_count_unique_values, 159 | greater_than_threshold=True, 160 | ) 161 | 162 | 163 | def _pandas_count_unique_values_cat_features(X): 164 | """ 165 | Counts the number of unique values in categorical features of a pandas DataFrame. 166 | 167 | Parameters 168 | ---------- 169 | X : pandas DataFrame 170 | The input data. 171 | 172 | Returns 173 | ------- 174 | pandas Series 175 | The number of unique values in each categorical feature. 176 | 177 | Raises 178 | ------ 179 | TypeError 180 | If the input data is not a pandas DataFrame. 181 | """ 182 | if not isinstance(X, pd.DataFrame): 183 | raise TypeError("X should be a pandas DataFrame") 184 | count_series = pd.Series(data=0, index=X.columns) 185 | dtype_dic = create_dtype_dict(X, dic_keys="dtypes") 186 | for c in dtype_dic["cat"]: 187 | count_series[c] = X[c].nunique() 188 | return count_series 189 | 190 | 191 | class CardinalityThreshold(BaseThresholdSelector): 192 | """Feature selector that removes all categorical features with more unique values than threshold 193 | This feature selection algorithm looks only at the features (X), 194 | not the desired outputs (y), and can thus be used for unsupervised learning. 195 | 196 | Parameters 197 | ---------- 198 | threshold: int, default = 1000 199 | Features with a training-set missing larger than this threshold will be removed. 200 | The thresold should be >= 1 201 | 202 | Returns 203 | ------- 204 | selected_features: list of str 205 | List of selected features. 
206 | 207 | Attributes 208 | ---------- 209 | n_features_in_ : int 210 | number of input predictors 211 | support_ : list of bool 212 | the list of the selected X-columns 213 | selected_features_ : list of str 214 | the list of names of selected features 215 | not_selected_features_ : list of str 216 | the list of names of rejected features 217 | 218 | Example 219 | ------- 220 | >>> from sklearn.datasets import make_classification, make_regression 221 | >>> X, y = make_regression(n_samples = 1000, n_features = 50, n_informative = 5, shuffle=False) # , n_redundant = 5 222 | >>> X = pd.DataFrame(X) 223 | >>> y = pd.Series(y) 224 | >>> pred_name = [f"pred_{i}" for i in range(X.shape[1])] 225 | >>> X.columns = pred_name 226 | >>> selector = CardinalityThreshold(100) 227 | >>> selector.fit_transform(X) 228 | """ 229 | 230 | def __init__(self, threshold=1000): 231 | super().__init__( 232 | threshold=threshold, 233 | statistic_fn=_pandas_count_unique_values_cat_features, 234 | greater_than_threshold=False, 235 | ) 236 | 237 | 238 | class CollinearityThreshold(SelectorMixin, BaseEstimator): 239 | """Feature selector that removes collinear features. 240 | This feature selection algorithm looks only at the features (X), 241 | not the desired outputs (y), and can thus be used for unsupervised learning. 242 | It computes the association between features (continuous or categorical), 243 | store the pairs of collinear features and remove one of them for all pairs having 244 | an association value above the threshold. 245 | 246 | The association measures are the Spearman correlation coefficient, correlation ratio 247 | and Theil's U. The association matrix is not necessarily symmetrical. 248 | 249 | By changing the method to "correlation", data will be encoded as integer 250 | and the Spearman correlation coefficient will be used instead. Faster but not 251 | a best practice because the categorical variables are considered as numeric. 252 | 253 | Parameters 254 | ---------- 255 | threshold : float, default = .8 256 | Features with a training-set missing larger than this threshold will be removed 257 | The thresold should be > 0 and =< 1 258 | method : str, default = "association" 259 | method for computing the association matrix. Either "association" or "correlation". 260 | Correlation leads to encoding of categorical variables as numeric 261 | n_jobs : int, default = -1 262 | the number of threads, -1 uses all the threads for computating the association matrix 263 | nom_nom_assoc : str or callable, default = "theil" 264 | the categorical-categorical association measure, by default Theil's U, not symmetrical! 265 | num_num_assoc : str or callable, default = "spearman" 266 | the numeric-numeric association measure 267 | nom_num_assoc : str or callable, default = "correlation_ratio" 268 | the numeric-categorical association measure 269 | 270 | Returns 271 | ------- 272 | selected_features: list of str 273 | List of selected features. 
274 | 275 | Attributes 276 | ---------- 277 | n_features_in_ : int 278 | number of input predictors 279 | assoc_matrix_ : pd.DataFrame 280 | the square association matrix 281 | collinearity_summary_ : pd.DataFrame 282 | the pairs of collinear features and the association values 283 | support_ : list of bool 284 | the list of the selected X-columns 285 | selected_features_ : list of str 286 | the list of names of selected features 287 | not_selected_features_ : list of str 288 | the list of names of rejected features 289 | 290 | Example 291 | ------- 292 | >>> from sklearn.datasets import make_classification, make_regression 293 | >>> X, y = make_regression(n_samples = 1000, n_features = 50, n_informative = 5, shuffle=False) # , n_redundant = 5 294 | >>> X = pd.DataFrame(X) 295 | >>> y = pd.Series(y) 296 | >>> pred_name = [f"pred_{i}" for i in range(X.shape[1])] 297 | >>> X.columns = pred_name 298 | >>> selector = CollinearityThreshold(threshold=0.75) 299 | >>> selector.fit_transform(X) 300 | """ 301 | 302 | def __init__( 303 | self, 304 | threshold=0.80, 305 | method="association", 306 | n_jobs=1, 307 | nom_nom_assoc=weighted_theils_u, 308 | num_num_assoc=weighted_corr, 309 | nom_num_assoc=correlation_ratio, 310 | ): 311 | self.threshold = threshold 312 | self.method = method 313 | self.n_jobs = n_jobs 314 | self.nom_nom_assoc = nom_nom_assoc 315 | self.num_num_assoc = num_num_assoc 316 | self.nom_num_assoc = nom_num_assoc 317 | 318 | if self.method not in ["association", "correlation"]: 319 | raise ValueError("``method`` should be 'association' or 'correlation'") 320 | 321 | if (self.threshold > 1.0) or (self.threshold < 0.0): 322 | raise ValueError("``threshold`` should be larger than 0 and smaller than 1") 323 | 324 | def fit(self, X, y=None, sample_weight=None): 325 | """Learn empirical associtions from X. 326 | 327 | Parameters 328 | ---------- 329 | X : pd.DataFrame, shape (n_samples, n_features) 330 | Data from which to compute variances, where `n_samples` is 331 | the number of samples and `n_features` is the number of features. 332 | y : any, default=None 333 | Ignored. This parameter exists only for compatibility with 334 | sklearn.pipeline.Pipeline. 335 | sample_weight : pd.Series, optional, shape (n_samples,) 336 | weights for computing the statistics (e.g. weighted average) 337 | 338 | Returns 339 | ------- 340 | self : object 341 | Returns the instance itself. 
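        Example
        -------
        A minimal usage sketch (illustrative only; ``X`` stands for any
        pandas DataFrame with mixed numeric/categorical columns and is not
        defined in this module):

        >>> selector = CollinearityThreshold(threshold=0.8, method="association")
        >>> selector.fit(X)
        >>> selector.assoc_matrix_             # square association matrix
        >>> fig = selector.plot_association()  # heatmap of the associations
        >>> X_reduced = selector.transform(X)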
342 | """ 343 | 344 | if isinstance(X, pd.DataFrame): 345 | self.feature_names_in_ = X.columns.to_numpy() 346 | else: 347 | raise TypeError("X is not a dataframe") 348 | 349 | self.suffix_dic = create_dtype_dict(X) 350 | 351 | if self.method == "correlation": 352 | encoder = OrdinalEncoderPandas() 353 | X = encoder.fit_transform(X) 354 | del encoder 355 | 356 | assoc_matrix = association_matrix( 357 | X=X, 358 | sample_weight=sample_weight, 359 | n_jobs=self.n_jobs, 360 | nom_nom_assoc=self.nom_nom_assoc, 361 | num_num_assoc=self.num_num_assoc, 362 | nom_num_assoc=self.nom_num_assoc, 363 | ) 364 | self.assoc_matrix_ = xy_to_matrix(assoc_matrix) 365 | 366 | to_drop = _recursive_collinear_elimination(self.assoc_matrix_, self.threshold) 367 | 368 | self.support_ = np.asarray( 369 | [True if c not in to_drop else False for c in X.columns] 370 | ) 371 | self.selected_features_ = self.feature_names_in_[self.support_] 372 | self.not_selected_features_ = self.feature_names_in_[~self.support_] 373 | 374 | return self 375 | 376 | def _get_support_mask(self): 377 | check_is_fitted(self) 378 | 379 | return self.support_ 380 | 381 | def transform(self, X): 382 | if not isinstance(X, pd.DataFrame): 383 | raise TypeError("X is not a dataframe") 384 | return X[self.selected_features_] 385 | 386 | def _more_tags(self): 387 | return {"allow_nan": True} 388 | 389 | def plot_association( 390 | self, ax=None, cmap="PuOr", figsize=None, cbar_kw=None, imgshow_kw=None 391 | ): 392 | """plot_association plots the association matrix 393 | 394 | Parameters 395 | ---------- 396 | ax : matplotlib.axes.Axes, optional 397 | the mpl axes if the figure object exists already, by default None 398 | cmap : str, optional 399 | colormap name, by default "PuOr" 400 | figsize : tuple of float, optional 401 | figure size, by default None 402 | cbar_kw : dict, optional 403 | colorbar kwargs, by default None 404 | imgshow_kw : dict, optional 405 | imgshow kwargs, by default None 406 | """ 407 | 408 | if figsize is None: 409 | figsize = (self.assoc_matrix_.shape[0] / 3, self.assoc_matrix_.shape[0] / 3) 410 | 411 | f, ax = plot_association_matrix( 412 | assoc_mat=self.assoc_matrix_, 413 | suffix_dic=self.suffix_dic, 414 | ax=ax, 415 | cmap=cmap, 416 | cbarlabel="association value", 417 | figsize=figsize, 418 | show=True, 419 | cbar_kw=cbar_kw, 420 | imgshow_kw=imgshow_kw, 421 | ) 422 | 423 | return f 424 | 425 | 426 | def _most_collinear(association_matrix_abs, threshold): 427 | cols_to_drop = association_matrix_abs.loc[ 428 | :, (association_matrix_abs > threshold).any(axis=0) 429 | ].columns.values 430 | rows_to_drop = association_matrix_abs.loc[ 431 | (association_matrix_abs > threshold).any(axis=1), : 432 | ].index.values 433 | to_drop = list(set(cols_to_drop).union(set(rows_to_drop))) 434 | if not to_drop: 435 | return None, None 436 | # for features in `to_drop` sum up their column and row values to find 437 | # the most collinear feature 438 | most_collinear_series = association_matrix_abs.loc[:, to_drop].sum(axis=0) 439 | most_collinear_series += association_matrix_abs.loc[to_drop, :].sum(axis=1) 440 | # not necessarily but avoids exceeding 1 441 | most_collinear_series /= 2 442 | return most_collinear_series.sort_values(ascending=False).index[0], to_drop 443 | 444 | 445 | def _recursive_collinear_elimination(association_matrix, threshold): 446 | dum = association_matrix.abs() 447 | most_collinear_features = [] 448 | 449 | while True: 450 | most_collinear_feature, to_drop = _most_collinear(dum, threshold) 451 | 452 | # Break 
if no more features to drop 453 | if not to_drop: 454 | break 455 | # the if statement below can probably also be removed since we can only 456 | # remove features we have left in dum 457 | if most_collinear_feature not in most_collinear_features: 458 | most_collinear_features.append(most_collinear_feature) 459 | dum = dum.drop(columns=most_collinear_feature, index=most_collinear_feature) 460 | 461 | return most_collinear_features 462 | -------------------------------------------------------------------------------- /src/arfs/feature_selection/variable_importance.py: -------------------------------------------------------------------------------- 1 | """Supervised Feature Selection 2 | 3 | This module provides selectors using supervised statistics and a threshold, using SHAP, permutation importance or impurity (Gini) importance. 4 | 5 | Module Structure: 6 | ----------------- 7 | - ``VariableImportance`` main class for identifying non-important features 8 | """ 9 | 10 | from __future__ import print_function 11 | from tqdm.auto import trange 12 | 13 | # pandas 14 | import pandas as pd 15 | 16 | # numpy 17 | import numpy as np 18 | 19 | # matplotlib 20 | import matplotlib.pyplot as plt 21 | import matplotlib.gridspec as gridspec 22 | 23 | # sklearn 24 | from sklearn.utils.validation import check_is_fitted 25 | from sklearn.base import BaseEstimator 26 | from sklearn.feature_selection._base import SelectorMixin 27 | 28 | # ARFS 29 | from ..utils import reset_plot 30 | from ..gbm import GradientBoosting 31 | from ..preprocessing import OrdinalEncoderPandas 32 | 33 | 34 | class VariableImportance(SelectorMixin, BaseEstimator): 35 | """Feature selector that removes predictors with zero or low variable importance. 36 | 37 | Identify the features with zero/low importance according to SHAP values of a lightgbm. 38 | The gbm can be trained with early stopping using a utils set to prevent overfitting. 39 | The feature importances are averaged over `n_iterations` to reduce the variance. 40 | The predictors are then ranked from the most important to the least important and the 41 | cumulative variable importance is computed. All the predictors not contributing (VI=0) or 42 | contributing to less than the threshold to the cumulative importance are removed. 43 | 44 | Parameters 45 | ---------- 46 | task : string 47 | The machine learning task, either 'classification' or 'regression' or 'multiclass', 48 | be sure to use a consistent objective function 49 | encode : boolean, default = True 50 | Whether or not to encode the predictors 51 | n_iterations : int, default = 10 52 | Number of iterations, the more iterations, the smaller the variance 53 | threshold : float, default = .99 54 | The selector computes the cumulative feature importance and ranks 55 | the predictors from the most important to the least important. 56 | All the predictors contributing to less than this value are rejected. 57 | lgb_kwargs : dictionary of keyword arguments 58 | dictionary of lightgbm estimators parameters with at least the objective function {'objective':'rmse'} 59 | encoder_kwargs : dictionary of keyword arguments, optional 60 | dictionary of the :class:`OrdinalEncoderPandas` parameters 61 | 62 | 63 | Returns 64 | ------- 65 | selected_features: list of str 66 | List of selected features. 
67 | 68 | Attributes 69 | ---------- 70 | n_features_in_ : int 71 | number of input predictors 72 | assoc_matrix_ : pd.DataFrame 73 | the square association matrix 74 | collinearity_summary_ : pd.DataFrame 75 | the pairs of collinear features and the association values 76 | support_ : list of bool 77 | the list of the selected X-columns 78 | selected_features_ : list of str 79 | the list of names of selected features 80 | not_selected_features_ : list of str 81 | the list of names of rejected features 82 | fastshap : boolean 83 | enable or not the fasttreeshap implementation 84 | verbose : int, default = -1 85 | controls the progress bar, > 1 print out progress 86 | 87 | Example 88 | ------- 89 | >>> from sklearn.datasets import make_classification, make_regression 90 | >>> X, y = make_regression(n_samples = 1000, n_features = 50, n_informative = 5, shuffle=False) # , n_redundant = 5 91 | >>> X = pd.DataFrame(X) 92 | >>> y = pd.Series(y) 93 | >>> pred_name = [f"pred_{i}" for i in range(X.shape[1])] 94 | >>> X.columns = pred_name 95 | >>> selector = VariableImportance(threshold=0.75) 96 | >>> selector.fit_transform(X, y) 97 | """ 98 | 99 | def __init__( 100 | self, 101 | task="regression", 102 | encode=True, 103 | n_iterations=10, 104 | threshold=0.99, 105 | lgb_kwargs={"objective": "rmse", "zero_as_missing": False}, 106 | encoder_kwargs=None, 107 | fastshap=False, 108 | verbose=-1, 109 | ): 110 | self.task = task 111 | self.encode = encode 112 | self.n_iterations = n_iterations 113 | self.threshold = threshold 114 | self.lgb_kwargs = lgb_kwargs 115 | self.encoder_kwargs = encoder_kwargs 116 | self.verbose = verbose 117 | self.fastshap = fastshap 118 | 119 | if (self.threshold > 1.0) or (self.threshold < 0.0): 120 | raise ValueError("``threshold`` should be larger than 0 and smaller than 1") 121 | 122 | def fit(self, X, y, sample_weight=None): 123 | """Learn variable importance from X and y, supervised learning. 124 | 125 | Parameters 126 | ---------- 127 | X : pd.DataFrame, shape (n_samples, n_features) 128 | Data from which to compute variances, where `n_samples` is 129 | the number of samples and `n_features` is the number of features. 130 | y : any, default=None 131 | Ignored. This parameter exists only for compatibility with 132 | sklearn.pipeline.Pipeline. 133 | sample_weight : pd.Series, optional, shape (n_samples,) 134 | weights for computing the statistics (e.g. weighted average) 135 | 136 | Returns 137 | ------- 138 | self : object 139 | Returns the instance itself. 
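        Example
        -------
        An illustrative sketch for a regression target (the parameter values
        are arbitrary choices, not recommendations):

        >>> selector = VariableImportance(
        ...     task="regression",
        ...     threshold=0.99,
        ...     lgb_kwargs={"objective": "rmse"},
        ... )
        >>> selector.fit(X, y, sample_weight=None)
        >>> selector.feature_importances_summary_.head()
        >>> fig = selector.plot_importance(plot_n=25)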
140 | """ 141 | 142 | if isinstance(X, pd.DataFrame): 143 | self.feature_names_in_ = X.columns.to_numpy() 144 | else: 145 | raise TypeError("X is not a dataframe") 146 | 147 | feature_importances = _compute_varimp_lgb( 148 | X=X, 149 | y=y, 150 | sample_weight=sample_weight, 151 | encode=self.encode, 152 | task=self.task, 153 | n_iterations=self.n_iterations, 154 | verbose=self.verbose, 155 | encoder_kwargs=self.encoder_kwargs, 156 | lgb_kwargs=self.lgb_kwargs, 157 | fastshap=self.fastshap, 158 | ) 159 | 160 | self.feature_importances_summary_ = feature_importances 161 | 162 | support_ordered = ( 163 | self.feature_importances_summary_["cumulative_importance"] >= self.threshold 164 | ) 165 | to_drop = list( 166 | self.feature_importances_summary_.loc[support_ordered, "feature"] 167 | ) 168 | 169 | self.support_ = np.asarray( 170 | [False if c in to_drop else True for c in self.feature_names_in_] 171 | ) 172 | self.selected_features_ = self.feature_names_in_[self.support_] 173 | self.not_selected_features_ = self.feature_names_in_[~self.support_] 174 | 175 | return self 176 | 177 | def _get_support_mask(self): 178 | check_is_fitted(self) 179 | 180 | return self.support_ 181 | 182 | def transform(self, X): 183 | """ 184 | Transform the data, returns a transformed version of `X`. 185 | 186 | Parameters 187 | ---------- 188 | X : array-like of shape (n_samples, n_features) 189 | Input samples. 190 | 191 | Returns 192 | ------- 193 | X : ndarray array of shape (n_samples, n_features_new) 194 | Transformed array. 195 | 196 | Raises 197 | ------ 198 | TypeError 199 | if the input is not a pd.DataFrame 200 | """ 201 | 202 | if not isinstance(X, pd.DataFrame): 203 | raise TypeError("X is not a dataframe") 204 | return X[self.selected_features_] 205 | 206 | def fit_transform(self, X, y=None, sample_weight=None): 207 | """ 208 | Fit to data, then transform it. 209 | Fits transformer to `X` and `y` with optional parameters `fit_params` 210 | and returns a transformed version of `X`. 211 | Parameters 212 | ---------- 213 | X : array-like of shape (n_samples, n_features) 214 | Input samples. 215 | y : array-like of shape (n_samples,) or (n_samples, n_outputs), \ 216 | default=None 217 | Target values (None for unsupervised transformations). 218 | **fit_params : dict 219 | Additional fit parameters. 220 | Returns 221 | ------- 222 | X_new : ndarray array of shape (n_samples, n_features_new) 223 | Transformed array. 224 | """ 225 | return self.fit(X=X, y=y, sample_weight=sample_weight).transform(X) 226 | 227 | def _more_tags(self): 228 | return {"allow_nan": True} 229 | 230 | def plot_importance( 231 | self, figsize=None, plot_n=50, n_feat_per_inch=3, log=True, style=None 232 | ): 233 | """Plots `plot_n` most important features and the cumulative importance of features. 234 | If `threshold` is provided, prints the number of features needed to reach `threshold` 235 | cumulative importance. 236 | 237 | Parameters 238 | ---------- 239 | plot_n : int, default = 50 240 | Number of most important features to plot. 
Defaults to 15 or the maximum 241 | number of features whichever is smaller 242 | n_feat_per_inch : int 243 | number of features per inch, the larger the less space between labels 244 | figsize : tuple of float, optional 245 | The rendered size as a percentage size 246 | log : bool, default=True 247 | Whether or not render variable importance on a log scale 248 | style : bool, default=False 249 | set arfs style or not 250 | 251 | Returns 252 | ------- 253 | hv.plot 254 | the feature importances holoviews object 255 | 256 | """ 257 | if style: 258 | plt.style.use(style) 259 | else: 260 | reset_plot() 261 | 262 | if plot_n > self.feature_importances_summary_.shape[0]: 263 | plot_n = self.feature_importances_summary_.shape[0] - 1 264 | 265 | df = self.feature_importances_summary_ 266 | importance_index = np.min( 267 | np.where(df["cumulative_importance"] > self.threshold) 268 | ) 269 | non_cum_threshold = df.iloc[importance_index, 2] 270 | max_norm_importance = 0.99 * df.normalized_importance.max() 271 | 272 | if plot_n > df.shape[0]: 273 | plot_n = df.shape[0] - 1 274 | 275 | if figsize is None: 276 | figsize = (8, plot_n / n_feat_per_inch) 277 | fig = plt.figure(tight_layout=True, figsize=figsize) 278 | gs = gridspec.GridSpec(3, 3) 279 | ax1 = fig.add_subplot(gs[:, 0]) 280 | ax1.scatter(df.normalized_importance, df.feature) 281 | # ax.set_ylabel('YLabel0') 282 | ax1.set_xlabel("normalized importance") 283 | ax1.xaxis.set_label_position("top") 284 | ax1.invert_yaxis() 285 | ax1.axvline(x=non_cum_threshold, linestyle="dashed", color="r") 286 | if log: 287 | ax1.set_xscale("log") 288 | ax1.grid() 289 | ax1.set(frame_on=False) 290 | 291 | ax2 = fig.add_subplot(gs[:, 1:]) 292 | ax2.scatter(df.feature, df.cumulative_importance) 293 | # ax.set_ylabel('YLabel0') 294 | ax2.set_ylabel("cumulative importance") 295 | ax2.tick_params(axis="x", labelrotation=90) 296 | 297 | importance_min_value_on_axis = max_norm_importance if log else 0 298 | x_vert, y_vert = ( 299 | [importance_index, importance_index], 300 | [ 301 | importance_min_value_on_axis, 302 | self.threshold, 303 | ], 304 | ) 305 | x_horiz, y_horiz = ( 306 | [importance_min_value_on_axis, importance_index], 307 | [ 308 | self.threshold, 309 | self.threshold, 310 | ], 311 | ) 312 | 313 | ax2.plot(x_vert, y_vert, linestyle="dashed", color="r") 314 | ax2.plot(x_horiz, y_horiz, linestyle="dashed", color="r") 315 | ax2.set_ylim(max_norm_importance, 1.0) 316 | if log: 317 | ax2.set_xscale("log") 318 | ax2.grid() 319 | ax2.set(frame_on=False) 320 | 321 | fig.align_labels() 322 | return fig 323 | 324 | 325 | def _compute_varimp_lgb( 326 | X, 327 | y, 328 | sample_weight=None, 329 | encode=False, 330 | task="regression", 331 | n_iterations=10, 332 | verbose=-1, 333 | fastshap=True, 334 | encoder_kwargs=None, 335 | lgb_kwargs={"objective": "rmse", "zero_as_missing": False}, 336 | ): 337 | if task not in ["regression", "classification", "multiclass"]: 338 | raise ValueError('Task must be either "classification" or "regression"') 339 | 340 | if y is None: 341 | raise ValueError("No training labels provided.") 342 | 343 | if encode: 344 | encoder = ( 345 | OrdinalEncoderPandas(**encoder_kwargs) 346 | if encoder_kwargs is not None 347 | else OrdinalEncoderPandas() 348 | ) 349 | X = encoder.fit(X).transform(X) 350 | del encoder 351 | # Extract feature names 352 | feature_names = list(X.columns) 353 | # Empty array for feature importances 354 | feature_importance_values = np.zeros(len(feature_names)) 355 | progress_bar = trange(n_iterations) if verbose > 1 
else range(n_iterations) 356 | 357 | # Iterate through each fold 358 | for _ in progress_bar: 359 | if verbose > 1: 360 | progress_bar.set_description("Iteration nb: {0:<3}".format(_)) 361 | 362 | # lgb_kwargs['verbose'] = -1 363 | gbm_model = GradientBoosting( 364 | cat_feat="auto", 365 | stratified=False, 366 | params=lgb_kwargs, 367 | show_learning_curve=False, 368 | return_valid_features=True, 369 | verbose_eval=0, 370 | ) 371 | 372 | gbm_model.fit(X=X, y=y, sample_weight=sample_weight) 373 | 374 | # pimp cool but too slow 375 | # perm_imp = permutation_importance( 376 | # model, valid_features, valid_labels, n_repeats=10, random_state=42, n_jobs=-1 377 | # ) 378 | # perm_imp = perm_imp.importances_mean 379 | if fastshap: 380 | try: 381 | from fasttreeshap import TreeExplainer as FastTreeExplainer 382 | except ImportError: 383 | ImportError("fasttreeshap is not installed") 384 | 385 | explainer = FastTreeExplainer( 386 | gbm_model.model, 387 | algorithm="auto", 388 | shortcut=False, 389 | feature_perturbation="tree_path_dependent", 390 | ) 391 | shap_matrix = explainer.shap_values(gbm_model.valid_features) 392 | if isinstance(shap_matrix, list): 393 | # For LightGBM classifier, RF, in sklearn API, SHAP returns a list of arrays 394 | # https://github.com/slundberg/shap/issues/526 395 | shap_imp = np.mean([np.abs(sv).mean(0) for sv in shap_matrix], axis=0) 396 | else: 397 | shap_imp = np.abs(shap_matrix).mean(0) 398 | else: 399 | shap_matrix = gbm_model.model.predict( 400 | gbm_model.valid_features, pred_contrib=True 401 | ) 402 | # the dim changed in lightGBM >= 3.0.0 403 | if task == "multiclass": 404 | # X_SHAP_values (array-like of shape = [n_samples, n_features + 1] 405 | # or shape = [n_samples, (n_features + 1) * n_classes]) 406 | # index starts from 0 407 | n_features_plus_bias = gbm_model.valid_features.shape[1] + 1 408 | n_samples = gbm_model.valid_features.shape[0] 409 | y_freq_table = pd.Series(y.fillna(0)).value_counts(normalize=True) 410 | n_classes = y_freq_table.size 411 | 412 | # Reshape the array to [n_samples, n_features + 1, n_classes] 413 | reshaped_values = shap_matrix.reshape( 414 | n_samples, n_classes, n_features_plus_bias 415 | ) 416 | 417 | # Since we need (n_samples, n_features + 1, n_classes), transpose the second and third dimensions 418 | reshaped_values = reshaped_values.transpose(0, 2, 1) 419 | reshaped_values = reshaped_values[:, :-1, :] 420 | reshaped_values.shape 421 | # Sum the contributions for each class ignoring the bias term 422 | # average on all the samples 423 | shap_imp = np.abs(reshaped_values).sum(axis=-1).mean(axis=0) 424 | else: 425 | # for binary, only one class is returned, for regression a single column added as well 426 | shap_imp = np.mean(np.abs(shap_matrix[:, :-1]), axis=0) 427 | 428 | # Record the feature importances 429 | feature_importance_values += ( 430 | shap_imp / n_iterations 431 | ) # model.feature_importances_ / n_iterations 432 | feature_importances = pd.DataFrame( 433 | {"feature": feature_names, "importance": feature_importance_values} 434 | ) 435 | # Sort features according to importance 436 | feature_importances = feature_importances.sort_values( 437 | "importance", ascending=False 438 | ).reset_index(drop=True) 439 | # Normalize the feature importances to add up to one 440 | feature_importances["normalized_importance"] = ( 441 | feature_importances["importance"] / feature_importances["importance"].sum() 442 | ) 443 | feature_importances["cumulative_importance"] = np.cumsum( 444 | 
feature_importances["normalized_importance"] 445 | ) 446 | # Extract the features with zero importance 447 | # record_zero_importance = feature_importances[ 448 | # feature_importances["importance"] == 0.0 449 | # ] 450 | return feature_importances 451 | -------------------------------------------------------------------------------- /src/arfs/parallel.py: -------------------------------------------------------------------------------- 1 | """Parallelize Pandas 2 | 3 | This module provides utilities for parallelizing operations on pd.DataFrame 4 | 5 | Module Structure: 6 | ----------------- 7 | - ``parallel_matrix_entries`` for parallelizing operations returning a matrix (2D) (apply on pairs of columns) 8 | - ``parallel_df`` for parallelizing operations returning a series (1D) (apply on a single column at a time) 9 | """ 10 | 11 | import numpy as np 12 | import pandas as pd 13 | from joblib import Parallel, delayed 14 | from multiprocessing import cpu_count 15 | from itertools import chain 16 | 17 | 18 | def parallel_matrix_entries(func, df, comb_list, sample_weight=None, n_jobs=-1): 19 | """parallel_matrix_entries applies a function to each chunk of 20 | combination of columns of the dataframe, distributed by cores. 21 | This is similar to https://github.com/smazzanti/mrmr/mrmr/pandas.py 22 | 23 | 24 | Parameters 25 | ---------- 26 | func : callable 27 | function to be applied to each pair of columns in comb_list 28 | df : pd.DataFrame 29 | the dataframe on which to apply the function 30 | comb_list : list of tuples of str 31 | Pairs of column names corresponding to the entries 32 | sample_weight : pd.Series or np.array, optional 33 | The weight vector, if any, of shape (n_samples,), by default None 34 | n_jobs : int, optional 35 | the number of cores to use for the computation, by default -1 36 | 37 | Returns 38 | ------- 39 | pd.DataFrame 40 | concatenated results into a single pandas DF 41 | """ 42 | # Determining the number of jobs 43 | n_jobs = cpu_count() if n_jobs == -1 else min(cpu_count(), n_jobs) 44 | 45 | if n_jobs == 1: 46 | lst = func(X=df, sample_weight=sample_weight, comb_list=comb_list) 47 | return pd.concat(lst, ignore_index=True).sort_values("val", ascending=False) 48 | 49 | comb_chunks = np.array_split(comb_list, n_jobs) 50 | lst = Parallel(n_jobs=n_jobs)( 51 | delayed(func)(X=df, sample_weight=sample_weight, comb_list=comb_chunk) 52 | for comb_chunk in comb_chunks 53 | ) 54 | # Directly return the single DataFrame if lst contains only one element 55 | if len(lst) == 1: 56 | return lst[0] 57 | else: 58 | return pd.concat(list(chain(*lst)), ignore_index=True) 59 | 60 | 61 | def parallel_df(func, df, series, sample_weight=None, n_jobs=-1): 62 | """parallel_df apply a function to each column of the dataframe, distributed by cores. 
63 | This is similar to https://github.com/smazzanti/mrmr/mrmr/pandas.py 64 | 65 | Parameters 66 | ---------- 67 | func : callable 68 | function to be applied to each column 69 | df : pd.DataFrame 70 | the dataframe on which to apply the function 71 | series : pd.Series 72 | series (target) used by the function 73 | sample_weight : pd.Series or np.array, optional 74 | The weight vector, if any, of shape (n_samples,), by default None 75 | n_jobs : int, optional 76 | the number of cores to use for the computation, by default -1 77 | 78 | Returns 79 | ------- 80 | pd.DataFrame 81 | concatenated results into a single pandas DF 82 | """ 83 | # Determining the number of jobs 84 | n_jobs = cpu_count() if n_jobs == -1 else min(cpu_count(), n_jobs) 85 | 86 | if n_jobs == 1: 87 | lst = func(df, series, sample_weight).sort_values(ascending=False) 88 | 89 | return ( 90 | pd.concat(lst, ignore_index=True).sort_values("val", ascending=False) 91 | if isinstance(lst, list) 92 | else lst 93 | ) 94 | else: 95 | col_chunks = np.array_split(range(len(df.columns)), n_jobs) 96 | lst = Parallel(n_jobs=n_jobs)( 97 | delayed(func)(df.iloc[:, col_chunk], series, sample_weight) 98 | for col_chunk in col_chunks 99 | ) 100 | 101 | return pd.concat(lst).sort_values(ascending=False) 102 | 103 | 104 | def _compute_series( 105 | X, 106 | y, 107 | sample_weight=None, 108 | func_xyw=None, 109 | ): 110 | """_compute_series is a utility function for computing the series 111 | resulting of the ``apply`` 112 | 113 | Parameters 114 | ---------- 115 | X : pd.DataFrame, of shape (n_samples, n_features) 116 | The set of regressors that will be tested sequentially 117 | y : pd.Series or np.array, of shape (n_samples,) 118 | The target vector 119 | sample_weight : pd.Series or np.array, of shape (n_samples,), optional 120 | The weight vector, if any, by default None 121 | func_xyw : callable, optional 122 | callable (function) for computing the individual elements of the series 123 | takes two mandatory inputs (x and y) and an optional input w, sample_weights 124 | """ 125 | 126 | def _closure_compute_series(x, y, sample_weight): 127 | x_not_na = ~x.isna() 128 | if x_not_na.sum() == 0: 129 | return 0 130 | return func_xyw( 131 | x=x[x_not_na], 132 | y=y[x_not_na], 133 | sample_weight=sample_weight[x_not_na], 134 | as_frame=False, 135 | ) 136 | 137 | return X.apply( 138 | lambda col: _closure_compute_series(x=col, y=y, sample_weight=sample_weight) 139 | ).fillna(0.0) 140 | 141 | 142 | def _compute_matrix_entries( 143 | X, 144 | comb_list, 145 | sample_weight=None, 146 | func_xyw=None, 147 | ): 148 | """base closure for computing matrix entries applying a function to each chunk of 149 | combination of columns of the dataframe, distributed by cores. 
150 | This is similar to https://github.com/smazzanti/mrmr/mrmr/pandas.py 151 | 152 | Parameters 153 | ---------- 154 | X : pd.DataFrame, of shape (n_samples, n_features) 155 | The set of regressors that will be tested sequentially 156 | sample_weight : pd.Series or np.array, of shape (n_samples,), optional 157 | The weight vector, if any, by default None 158 | func_xyw : callable, optional 159 | callable (function) for computing the individual elements of the matrix 160 | takes two mandatory inputs (x and y) and an optional input w, sample_weights 161 | comb_list : list of 2-tuple of str 162 | Pairs of column names corresponding to the entries 163 | 164 | Returns 165 | ------- 166 | List[pd.DataFrame] 167 | a list of partial dfs to be concatenated 168 | """ 169 | v_df_list = [ 170 | func_xyw(x=X[comb[0]], y=X[comb[1]], sample_weight=sample_weight, as_frame=True) 171 | for comb in comb_list 172 | ] 173 | 174 | return v_df_list 175 | -------------------------------------------------------------------------------- /src/arfs/sampling.py: -------------------------------------------------------------------------------- 1 | """This module provide methods for sampling large datasets for reducing the running time""" 2 | 3 | import numpy as np 4 | import pandas as pd 5 | from scipy.sparse import issparse 6 | from collections import Counter 7 | from sklearn.cluster import AgglomerativeClustering 8 | from sklearn.ensemble import IsolationForest 9 | from scipy.stats import ks_2samp 10 | from .utils import is_list_of_str, is_list_of_bool, is_list_of_int 11 | 12 | 13 | def sample(df, n=1000, sample_weight=None, method="gower"): 14 | """Sampling rows from a dataframe when random sampling is not 15 | enough for reducing the number of rows. 16 | The strategies are either using hierarchical clustering 17 | based on the Gower distance or using isolation forest for identifying 18 | the most similar elements. 19 | For the clustering algorithm, clusters are determined using the Gower distance 20 | (mixed type data) and the dataset is shrunk from n_samples to n_clusters. 21 | 22 | For the isolation forest algorithm, samples are added till a sufficient 2-samples 23 | KS statistics is reached or if the number iteration reached the max number (20) 24 | 25 | Parameters 26 | ---------- 27 | df : pd.DataFrame 28 | the dataframe to sample, with or without the target 29 | n : int, optional 30 | the number of clusters if method is ``"gower"``, by default 100 31 | sample_weight : pd.Series or np.array, optional 32 | sample weights, by default None 33 | method : str, optional 34 | the strategy to use for sampling the rows. 
Either ``"gower"`` or ``"isoforest"``, by default 'gower' 35 | 36 | Returns 37 | ------- 38 | pd.DataFrame 39 | the sampled dataframe 40 | 41 | """ 42 | assert isinstance(df, pd.DataFrame), "X should be a DataFrame" 43 | X = df.copy() 44 | num_cols = list(X.select_dtypes(include=[np.number])) 45 | non_num_cols = list(set(list(X.columns)) - set(num_cols)) 46 | 47 | if method == "gower": 48 | # basic imputation 49 | if non_num_cols: 50 | X[non_num_cols] = X[non_num_cols].fillna(X[non_num_cols].mode().iloc[0]) 51 | if num_cols: 52 | X[num_cols] = X[num_cols].fillna(X[num_cols].mean().iloc[0]) 53 | 54 | # no need for scaling, it is built-in the computation of the Gower distance 55 | gd = gower_matrix(X, cat_features=non_num_cols, weight=sample_weight) 56 | 57 | labels = AgglomerativeClustering( 58 | n_clusters=n, metric="precomputed", linkage="complete" 59 | ).fit_predict(gd) 60 | X["label"] = labels 61 | X["label"] = "clus_" + X["label"].astype(str) 62 | X_num = X.groupby("label")[num_cols].agg("mean") 63 | if non_num_cols: 64 | X_nonnum = X.groupby("label")[non_num_cols].agg(get_most_common) 65 | X_sampled = X_num.join(X_nonnum) 66 | else: 67 | X_sampled = X_num 68 | X_sampled = X_sampled.reindex(X.columns, axis=1) 69 | return X_sampled 70 | elif method == "isoforest": 71 | X[non_num_cols] = X[non_num_cols].astype("str").astype("category") 72 | for col in non_num_cols: 73 | X[col] = X[col].astype("category").cat.codes 74 | idx = isof_find_sample(X, sample_weight=None) 75 | return X.iloc[idx, :] 76 | else: 77 | NotImplementedError(f"{method} not implemented") 78 | 79 | 80 | def get_most_common(srs): 81 | x = list(srs) 82 | my_counter = Counter(x) 83 | return my_counter.most_common(1)[0][0] 84 | 85 | 86 | def gower_matrix( 87 | data_x, 88 | data_y=None, 89 | weight=None, 90 | cat_features="auto", 91 | ): 92 | """Computes the gower distances between X and Y 93 | 94 | Gower is a similarity measure for categorical, boolean and numerical mixed 95 | data. 96 | 97 | Parameters 98 | ---------- 99 | data_x : np.array or pd.DataFrame 100 | The data for computing the Gower distance 101 | data_y : np.array or pd.DataFrame or pd.Series, optional 102 | The reference matrix or vector to compare with, optional 103 | weight : np.array or pd.Series, optional 104 | sample weight, optional 105 | cat_features : list of str or bool or int, optional 106 | auto-detect cat features or a list of cat features, by default 'auto' 107 | 108 | Returns 109 | ------- 110 | np.array 111 | The Gower distance matrix, shape (n_samples, n_samples) 112 | 113 | Notes 114 | ----- 115 | The non-numeric features, and numeric feature ranges are determined from X and not Y. 
116 | 117 | Raises 118 | ------ 119 | TypeError 120 | If two dataframes are passed but have different number of columns 121 | TypeError 122 | If two arrays are passed but have different number of columns 123 | TypeError 124 | Sparse matrices are not supported 125 | TypeError 126 | if a list of categorical columns is passed, it should be a list of strings or integers or boolean values 127 | """ 128 | # function checks 129 | X = data_x 130 | if data_y is None: 131 | Y = data_x 132 | else: 133 | Y = data_y 134 | if not isinstance(X, np.ndarray): 135 | y_col = Y.columns if isinstance(Y, pd.DataFrame) else Y.index 136 | if not np.array_equal(X.columns, y_col): 137 | raise TypeError("X and Y must have same columns!") 138 | else: 139 | if not X.shape[1] == Y.shape[1]: 140 | raise TypeError("X and Y must have same y-dim!") 141 | if issparse(X) or issparse(Y): 142 | raise TypeError("Sparse matrices are not supported!") 143 | 144 | x_n_rows, x_n_cols = X.shape 145 | y_n_rows, y_n_cols = Y.shape 146 | 147 | if cat_features == "auto": 148 | if not isinstance(X, np.ndarray): 149 | is_number = np.vectorize(lambda x: not np.issubdtype(x, np.number)) 150 | cat_features = is_number(X.dtypes) 151 | else: 152 | cat_features = np.zeros(x_n_cols, dtype=bool) 153 | for col in range(x_n_cols): 154 | if not np.issubdtype(type(X[0, col]), np.number): 155 | cat_features[col] = True 156 | else: 157 | # force categorical columns (if integer encoded for instance) 158 | if is_list_of_str(cat_features): 159 | cat_feat = [True if c in cat_features else False for c in X.columns] 160 | cat_features = np.array(cat_feat) 161 | elif is_list_of_bool(cat_features): 162 | cat_features = np.array(cat_features) 163 | elif is_list_of_int(cat_features): 164 | cat_feat = [ 165 | True if c in cat_features else False for c in range(len(X.columns)) 166 | ] 167 | cat_features = np.array(cat_feat) 168 | else: 169 | raise TypeError( 170 | "If not 'auto' cat_features should be a list of strings, integers or Booleans" 171 | ) 172 | 173 | # print(cat_features) 174 | 175 | if not isinstance(X, np.ndarray): 176 | X = np.asarray(X) 177 | if not isinstance(Y, np.ndarray): 178 | Y = np.asarray(Y) 179 | 180 | Z = np.concatenate((X, Y)) 181 | 182 | x_index = range(0, x_n_rows) 183 | y_index = range(x_n_rows, x_n_rows + y_n_rows) 184 | 185 | Z_num = Z[:, np.logical_not(cat_features)] 186 | 187 | num_cols = Z_num.shape[1] 188 | num_ranges = np.zeros(num_cols) 189 | num_max = np.zeros(num_cols) 190 | 191 | for col in range(num_cols): 192 | col_array = Z_num[:, col].astype(np.float32) 193 | max_ = np.nanmax(col_array) 194 | min_ = np.nanmin(col_array) 195 | 196 | if np.isnan(max_): 197 | max_ = 0.0 198 | if np.isnan(min_): 199 | min_ = 0.0 200 | num_max[col] = max_ 201 | num_ranges[col] = (1 - min_ / max_) if (max_ != 0) else 0.0 202 | 203 | # This is to normalize the numeric values between 0 and 1. 
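    # Each numeric column is divided by its own maximum (the ``where=num_max != 0``
    # mask guards against division by zero), so for non-negative data the scaled
    # values lie in [min/max, 1]. The corresponding scaled range, 1 - min/max,
    # was stored in ``num_ranges`` above and is reused in ``_gower_distance_row``
    # to normalise the absolute differences between rows.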
204 | Z_num = np.divide( 205 | Z_num.astype(float), 206 | num_max.astype(float), 207 | out=np.zeros_like(Z_num).astype(float), 208 | where=num_max != 0, 209 | ) 210 | Z_cat = Z[:, cat_features] 211 | 212 | if weight is None: 213 | weight = np.ones(Z.shape[1]) 214 | 215 | # print(weight) 216 | 217 | weight_cat = weight[cat_features] 218 | weight_num = weight[np.logical_not(cat_features)] 219 | 220 | out = np.zeros((x_n_rows, y_n_rows), dtype=np.float32) 221 | 222 | weight_sum = weight.sum() 223 | 224 | X_cat = Z_cat[x_index,] 225 | X_num = Z_num[x_index,] 226 | Y_cat = Z_cat[y_index,] 227 | Y_num = Z_num[y_index,] 228 | 229 | # print(X_cat,X_num,Y_cat,Y_num) 230 | 231 | for i in range(x_n_rows): 232 | j_start = i 233 | if x_n_rows != y_n_rows: 234 | j_start = 0 235 | # call the main function 236 | res = _gower_distance_row( 237 | X_cat[i, :], 238 | X_num[i, :], 239 | Y_cat[j_start:y_n_rows, :], 240 | Y_num[j_start:y_n_rows, :], 241 | weight_cat, 242 | weight_num, 243 | weight_sum, 244 | num_ranges, 245 | ) 246 | # print(res) 247 | out[i, j_start:] = res 248 | if x_n_rows == y_n_rows: 249 | out[i:, j_start] = res 250 | 251 | return out 252 | 253 | 254 | def _gower_distance_row( 255 | xi_cat, 256 | xi_num, 257 | xj_cat, 258 | xj_num, 259 | feature_weight_cat, 260 | feature_weight_num, 261 | feature_weight_sum, 262 | ranges_of_numeric, 263 | ): 264 | """Compute a row of the Gower matrix 265 | 266 | Parameters 267 | ---------- 268 | xi_cat : np.array 269 | categorical row of the X matrix 270 | xi_num : np.array 271 | numerical row of the X matrix 272 | xj_cat : np.array 273 | categorical row of the X matrix 274 | xj_num : np.array 275 | numerical row of the X matrix 276 | feature_weight_cat : np.array 277 | weight vector for the categorical features 278 | feature_weight_num : np.array 279 | weight vector for the numerical features 280 | feature_weight_sum : float 281 | The sum of the weights 282 | ranges_of_numeric : np.array 283 | range of the scaled numerical features (between 0 and 1) 284 | 285 | Returns 286 | ------- 287 | np.array : array 288 | a row vector of the Gower distance 289 | """ 290 | # categorical columns 291 | sij_cat = np.where(xi_cat == xj_cat, np.zeros_like(xi_cat), np.ones_like(xi_cat)) 292 | sum_cat = np.multiply(feature_weight_cat, sij_cat).sum(axis=1) 293 | 294 | # numerical columns 295 | abs_delta = np.absolute(xi_num - xj_num) 296 | sij_num = np.divide( 297 | abs_delta, 298 | ranges_of_numeric, 299 | out=np.zeros_like(abs_delta), 300 | where=ranges_of_numeric != 0, 301 | ) 302 | 303 | sum_num = np.multiply(feature_weight_num, sij_num).sum(axis=1) 304 | sums = np.add(sum_cat, sum_num) 305 | sum_sij = np.divide(sums, feature_weight_sum) 306 | 307 | return sum_sij 308 | 309 | 310 | def smallest_indices(ary, n): 311 | """Returns the n largest indices from a numpy array. 
312 | 313 | Parameters 314 | ---------- 315 | ary : np.array 316 | the array for which to return largest indices 317 | n : int 318 | the number of indices to return 319 | 320 | Returns 321 | ------- 322 | dict 323 | the dictionary of indices and values of the largest elements 324 | """ 325 | # n += 1 326 | flat = np.nan_to_num(ary.flatten(), nan=999) 327 | indices = np.argpartition(-flat, -n)[-n:] 328 | indices = indices[np.argsort(flat[indices])] 329 | # indices = np.delete(indices,0,0) 330 | values = flat[indices] 331 | return {"index": indices, "values": values} 332 | 333 | 334 | def gower_topn( 335 | data_x, 336 | data_y=None, 337 | weight=None, 338 | cat_features="auto", 339 | n=5, 340 | key=None, 341 | ): 342 | """Get the n most similar elements 343 | 344 | Parameters 345 | ---------- 346 | data_x : np.array or pd.DataFrame 347 | The data for the look up 348 | data_y : np.array or pd.DataFrame or pd.Series, optional 349 | elements for which to return the most similar elements, should be a single row 350 | weight : np.array or pd.Series, optional 351 | sample weight, by default None 352 | cat_features : list of str or bool or int, optional 353 | auto detection of cat features or a list of strings, booleans or integers, by default 'auto' 354 | n : int, optional 355 | the number of neighbors/similar rows to find, by default 5 356 | key : str, optional 357 | identifier key. If several rows refer to the same id, this column 358 | will be used for finding the nearest neighbors with a 359 | different id, by default None 360 | 361 | Returns 362 | ------- 363 | dict 364 | the dictionary of indices and values of the closest elements 365 | 366 | Raises 367 | ------ 368 | TypeError 369 | if the reference element is not a single row 370 | """ 371 | 372 | if data_y.shape[0] >= 2: 373 | raise TypeError("Only support `data_y` of 1 row. 
") 374 | if key is None: 375 | dm = gower_matrix(data_y, data_x, weight, cat_features) 376 | else: 377 | X = data_x.drop(key, axis=1) 378 | Y = data_x.drop(key, axis=1) 379 | dm = gower_matrix(Y, X, weight, cat_features) 380 | 381 | if key is not None: 382 | idx = smallest_indices(np.nan_to_num(dm[0], nan=1), n)["index"] 383 | val = smallest_indices(np.nan_to_num(dm[0], nan=1), n)["values"] 384 | unique_id = data_x.iloc[idx, :] 385 | unique_id = unique_id[key] 386 | nunique_id = unique_id.nunique() 387 | mul = 1 388 | # continue looking for the closest n unique records with a different id 389 | while nunique_id < n: 390 | idx = smallest_indices(np.nan_to_num(dm[0], nan=1), mul * n)["index"] 391 | val = smallest_indices(np.nan_to_num(dm[0], nan=1), mul * n)["values"] 392 | unique_id = data_x.iloc[idx, :].reset_index() 393 | unique_id = unique_id[key] 394 | nunique_id = unique_id.nunique() 395 | mul += 1 396 | 397 | # find the indices of the unique id 398 | _, idx_n = np.unique(unique_id, return_index=True) 399 | # select only the rows corresponding to unique id 400 | val = val[idx_n] 401 | idx = idx[idx_n] 402 | # sort them from the closest to the farthest, according to the Gower metrics 403 | idx_n = np.argsort(val) 404 | # return the n closest records, with a different id 405 | return {"index": idx[idx_n[:n]], "values": val[idx_n[:n]]} 406 | else: 407 | return smallest_indices(np.nan_to_num(dm[0], nan=1), n) 408 | 409 | 410 | def get_5_percent_splits(length): 411 | """splits dataframe into 5% intervals 412 | 413 | Parameters 414 | ---------- 415 | length : int 416 | array length 417 | 418 | Returns 419 | ------- 420 | array 421 | vector of sizes 422 | """ 423 | 424 | five_percent = round(5 / 100 * length) 425 | return np.arange(five_percent, length, five_percent) 426 | 427 | 428 | def isolation_forest(X, sample_weight=None): 429 | """fits isolation forest to the dataset and gives an anomaly score to every sample 430 | 431 | Parameters 432 | ---------- 433 | X : pd.DataFrame or np.array 434 | the predictors matrix 435 | sample_weight : pd.Series or np.array, optional 436 | the sample weights, if any, by default None 437 | """ 438 | clf = IsolationForest().fit(X, sample_weight=sample_weight) 439 | return clf.score_samples(X) 440 | 441 | 442 | def isof_find_sample(X, sample_weight=None): 443 | """Finds a sample by comparing the distributions of the anomaly scores between the sample and the original 444 | distribution using the KS-test. Starts of a 5% however will increase to 10% and then 15% etc. 
if a significant sample cannot be found.
445 | 
446 |     References
447 |     ----------
448 |     Sampling method taken from boruta_shap, author: https://github.com/Ekeany
449 | 
450 |     Parameters
451 |     ----------
452 |     X : pd.DataFrame
453 |         the predictors matrix
454 |     sample_weight : pd.Series or np.array, optional
455 |         the sample weights, if any, by default None
456 | 
457 |     Returns
458 |     -------
459 |     array
460 |         the indices for reducing the shadow predictors matrix
461 |     """
462 |     loop = True
463 |     iteration = 0
464 |     size = get_5_percent_splits(length=X.shape[0])
465 |     element = 1
466 |     preds = isolation_forest(X, sample_weight)
467 |     while loop:
468 |         sample_indices = np.random.choice(
469 |             np.arange(preds.size), size=size[element], replace=False
470 |         )
471 |         sample = np.take(preds, sample_indices)
472 |         if ks_2samp(preds, sample).pvalue > 0.95:
473 |             break
474 |         iteration += 1
475 |         if iteration == 20:
476 |             element += 1
477 |             iteration = 0
478 |     return sample_indices
479 | 
--------------------------------------------------------------------------------
/tests/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ThomasBury/arfs/03f67d0a54b69fac5ddbb83e306c8e8e72e2d3a2/tests/__init__.py
--------------------------------------------------------------------------------
/tests/test_allrelevant.py:
--------------------------------------------------------------------------------
1 | import pytest
2 | import numpy as np
3 | import lightgbm as lgb
4 | from arfs.feature_selection.allrelevant import Leshy, BoostAGroota, GrootCV
5 | from arfs.utils import (
6 |     _make_corr_dataset_regression,
7 |     _make_corr_dataset_classification,
8 | )
9 | from arfs.utils import LightForestClassifier, LightForestRegressor
10 | 
11 | 
12 | class TestLeshy:
13 |     """
14 |     Test suite for all-relevant FS boruta-like method: Leshy
15 |     """
16 | 
17 |     def test_borutaPy_vs_leshy_with_rfc_and_native_feature_importance(self):
18 |         # too slow for circleci to run them in a reasonable time
19 |         # takes 2 min on laptop, 1h or more on circleci
20 |         # sklearn random forest implementation
21 |         # X, y, w = _make_corr_dataset_classification()
22 |         # rfc = RandomForestClassifier(max_features='sqrt', max_samples=0.632, n_estimators=100)
23 |         # bt = BorutaPy(rfc)
24 |         # bt.fit(X.values, y)
25 |         # borutapy_rfc_list = sorted(list(X.columns[bt.support_]))
26 | 
27 |         # lightGBM random forest implementation
28 |         baseline_list = ["var0", "var1", "var2", "var3", "var4"]
29 |         X, y, w = _make_corr_dataset_classification(size=100)
30 |         n_feat = X.shape[1]
31 |         rfc = LightForestClassifier(n_feat)
32 |         # RandomForestClassifier(max_features='sqrt', max_samples=0.632, n_estimators=100) # --> too slow
33 |         arfs = Leshy(rfc, verbose=0, max_iter=10, random_state=42, importance="native")
34 |         arfs.fit(X, y)
35 |         leshy_rfc_list = sorted(arfs.feature_names_in_[arfs.support_])
36 | 
37 |         # assert borutapy_rfc_list == leshy_rfc_list, "same selected features are expected"
38 |         assert bool(set(baseline_list) & set(leshy_rfc_list)), (
39 |             "expect non-empty intersection"
40 |         )
41 | 
42 |     def test_borutaPy_vs_leshy_with_rfr_and_native_feature_importance(self):
43 |         # too slow for circleci to run them in a reasonable time
44 |         # takes 2 min on laptop, 1h or more on circleci
45 |         # # sklearn random forest implementation
46 |         # X, y, w = _generated_corr_dataset_regr()
47 |         # rfr = RandomForestRegressor(max_features=0.3, max_samples=0.632, n_estimators=100)
48 |         # bt = BorutaPy(rfr)
49 |         # bt.fit(X.values, y)
50 |         # 
borutapy_rfc_list = sorted(list(X.columns[bt.support_])) 51 | 52 | # lightGBM random forest implementation 53 | baseline_list = ["var0", "var1", "var2", "var3", "var4"] 54 | X, y, w = _make_corr_dataset_regression(size=100) 55 | n_feat = X.shape[1] 56 | rfr = LightForestRegressor(n_feat) 57 | # rfr = RandomForestRegressor(max_features=0.3, max_samples=0.632, n_estimators=10) 58 | arfs = Leshy(rfr, verbose=0, max_iter=10, random_state=42, importance="native") 59 | arfs.fit(X, y) 60 | leshy_rfc_list = sorted(arfs.feature_names_in_[arfs.support_]) 61 | 62 | # assert borutapy_rfc_list == leshy_rfc_list, "same selected features are expected" 63 | assert bool(set(baseline_list) & set(leshy_rfc_list)), ( 64 | "expect non-empty intersection" 65 | ) 66 | 67 | def test_borutaPy_vs_leshy_with_rfc_and_shap_feature_importance(self): 68 | # too slow for circleci to run them in a reasonable time 69 | # takes 2 min on laptop, 1h or more on circleci 70 | # # sklearn random forest implementation 71 | # X, y, w = _make_corr_dataset_classification() 72 | # rfc = RandomForestClassifier(max_features='sqrt', max_samples=0.632, n_estimators=100) 73 | # bt = BorutaPy(rfc) 74 | # bt.fit(X.values, y) 75 | # borutapy_rfc_list = sorted(list(X.columns[bt.support_])) 76 | 77 | # lightGBM random forest implementation 78 | baseline_list = ["var0", "var1", "var2", "var3", "var4"] 79 | X, y, w = _make_corr_dataset_classification(size=100) 80 | n_feat = X.shape[1] 81 | model = LightForestClassifier(n_feat) 82 | arfs = Leshy(model, verbose=0, max_iter=10, random_state=42, importance="shap") 83 | arfs.fit(X, y) 84 | leshy_rfc_list = sorted(arfs.feature_names_in_[arfs.support_]) 85 | 86 | # assert borutapy_rfc_list == leshy_rfc_list, "same selected features are expected" 87 | assert bool(set(baseline_list) & set(leshy_rfc_list)), ( 88 | "expect non-empty intersection" 89 | ) 90 | 91 | def test_borutaPy_vs_leshy_with_rfr_and_shap_feature_importance(self): 92 | # too slow for circleci to run them in a reasonable time 93 | # takes 2 min on laptop, 1h or more on circleci 94 | # # sklearn random forest implementation 95 | # X, y, w = _generated_corr_dataset_regr() 96 | # rfr = RandomForestRegressor(max_features=0.3, max_samples=0.632, n_estimators=100) 97 | # bt = BorutaPy(rfr) 98 | # bt.fit(X.values, y) 99 | # borutapy_rfc_list = sorted(list(X.columns[bt.support_])) 100 | 101 | # lightGBM random forest implementation 102 | baseline_list = ["var0", "var1", "var2", "var3", "var4"] 103 | X, y, w = _make_corr_dataset_regression(size=500) 104 | n_feat = X.shape[1] 105 | model = LightForestRegressor(n_feat) 106 | arfs = Leshy(model, verbose=0, max_iter=10, random_state=42, importance="shap") 107 | arfs.fit(X, y) 108 | leshy_rfc_list = sorted(arfs.feature_names_in_[arfs.support_]) 109 | 110 | # assert borutapy_rfc_list == leshy_rfc_list, "same selected features are expected" 111 | assert bool(set(baseline_list) & set(leshy_rfc_list)), ( 112 | "expect non-empty intersection" 113 | ) 114 | 115 | def test_leshy_clf_with_lgb_and_shap_feature_importance_and_sample_weight(self): 116 | baseline_list = ["var0", "var1", "var2", "var3", "var4"] 117 | 118 | X, y, w = _make_corr_dataset_classification(size=500) 119 | model = lgb.LGBMClassifier(verbose=-1, force_col_wise=True, n_estimators=10) 120 | arfs = Leshy(model, verbose=0, max_iter=10, random_state=42, importance="shap") 121 | arfs.fit(X, y, w) 122 | leshy_list = sorted(arfs.feature_names_in_[arfs.support_]) 123 | 124 | assert bool(set(baseline_list) & set(leshy_list)), ( 125 | "expect 
non-empty intersection" 126 | ) 127 | 128 | def test_leshy_regr_with_lgb_and_shap_feature_importance_and_sample_weight(self): 129 | baseline_list = ["var0", "var1", "var2", "var3", "var4", "var5"] 130 | 131 | X, y, w = _make_corr_dataset_classification(size=500) 132 | model = lgb.LGBMRegressor(verbose=-1, force_col_wise=True, n_estimators=10) 133 | arfs = Leshy(model, verbose=0, max_iter=10, random_state=42, importance="shap") 134 | arfs.fit(X, y, w) 135 | leshy_list = sorted(arfs.feature_names_in_[arfs.support_]) 136 | 137 | assert bool(set(baseline_list) & set(leshy_list)), ( 138 | "expect non-empty intersection" 139 | ) 140 | 141 | 142 | class TestBoostAGroota: 143 | """ 144 | Test suite for all-relevant FS boruta-like method: Leshy 145 | """ 146 | 147 | def test_boostagroota_clf_with_lgb_and_shap_feature_importance_and_sample_weight( 148 | self, 149 | ): 150 | baseline_list = ["var0", "var1", "var2", "var3", "var4"] 151 | 152 | X, y, w = _make_corr_dataset_classification(size=500) 153 | model = lgb.LGBMClassifier(verbose=-1, force_col_wise=True, n_estimators=10) 154 | arfs = BoostAGroota( 155 | estimator=model, 156 | cutoff=1, 157 | iters=3, 158 | max_rounds=3, 159 | delta=0.1, 160 | silent=False, 161 | importance="shap", 162 | ) 163 | arfs.fit(X, y, w) 164 | leshy_list = sorted(arfs.feature_names_in_[arfs.support_]) 165 | 166 | assert bool(set(baseline_list) & set(leshy_list)), ( 167 | "expect non-empty intersection" 168 | ) 169 | 170 | def test_boostagroota_clf_with_lgb_and_pimp_feature_importance_and_sample_weight( 171 | self, 172 | ): 173 | baseline_list = ["var0", "var1", "var2", "var3", "var4"] 174 | 175 | X, y, w = _make_corr_dataset_classification(size=500) 176 | model = lgb.LGBMClassifier(verbose=-1, force_col_wise=True, n_estimators=10) 177 | arfs = BoostAGroota( 178 | estimator=model, 179 | cutoff=1, 180 | iters=3, 181 | max_rounds=3, 182 | delta=0.1, 183 | silent=False, 184 | importance="pimp", 185 | ) 186 | arfs.fit(X, y, w) 187 | leshy_list = sorted(arfs.feature_names_in_[arfs.support_]) 188 | 189 | assert bool(set(baseline_list) & set(leshy_list)), ( 190 | "expect non-empty intersection" 191 | ) 192 | 193 | def test_boostagroota_rgr_with_lgb_and_shap_feature_importance_and_sample_weight( 194 | self, 195 | ): 196 | baseline_list = ["var0", "var1", "var2", "var3", "var4", "var5"] 197 | 198 | X, y, w = _make_corr_dataset_regression(size=500) 199 | model = lgb.LGBMRegressor(verbose=-1, force_col_wise=True, n_estimators=10) 200 | arfs = BoostAGroota( 201 | estimator=model, 202 | cutoff=1, 203 | iters=3, 204 | max_rounds=3, 205 | delta=0.1, 206 | silent=False, 207 | importance="shap", 208 | ) 209 | arfs.fit(X, y, w) 210 | leshy_list = sorted(arfs.feature_names_in_[arfs.support_]) 211 | 212 | assert bool(set(baseline_list) & set(leshy_list)), ( 213 | "expect non-empty intersection" 214 | ) 215 | 216 | def test_boostagroota_regr_with_lgb_and_pimp_feature_importance_and_sample_weight( 217 | self, 218 | ): 219 | baseline_list = ["var0", "var1", "var2", "var3", "var4", "var5"] 220 | 221 | X, y, w = _make_corr_dataset_regression(size=500) 222 | model = lgb.LGBMRegressor(verbose=-1, force_col_wise=True, n_estimators=10) 223 | arfs = BoostAGroota( 224 | estimator=model, 225 | cutoff=1, 226 | iters=3, 227 | max_rounds=3, 228 | delta=0.1, 229 | silent=False, 230 | importance="pimp", 231 | ) 232 | arfs.fit(X, y, w) 233 | leshy_list = sorted(arfs.feature_names_in_[arfs.support_]) 234 | 235 | assert bool(set(baseline_list) & set(leshy_list)), ( 236 | "expect non-empty intersection" 237 | ) 
238 | 239 | 240 | class TestGrootCV: 241 | """ 242 | Test suite for all-relevant FS boruta-like method: Leshy 243 | """ 244 | 245 | def test_grootcv_classification_with_and_sample_weight(self): 246 | baseline_list = ["var0", "var1", "var2", "var3", "var4"] 247 | 248 | X, y, w = _make_corr_dataset_classification(size=100) 249 | arfs = GrootCV(objective="binary", cutoff=1, n_folds=3, n_iter=3, silent=False) 250 | arfs.fit(X, y, w) 251 | grootcv_list = sorted(arfs.feature_names_in_[arfs.support_]) 252 | 253 | assert bool(set(baseline_list) & set(grootcv_list)), ( 254 | "expect non-empty intersection" 255 | ) 256 | 257 | def test_grootcv_regression_with_and_sample_weight(self): 258 | baseline_list = ["var0", "var1", "var2", "var3", "var4", "var5"] 259 | 260 | X, y, w = _make_corr_dataset_regression(size=100) 261 | arfs = GrootCV(objective="l2", cutoff=1, n_folds=3, n_iter=3, silent=False) 262 | arfs.fit(X, y, w) 263 | grootcv_list = sorted(arfs.feature_names_in_[arfs.support_]) 264 | 265 | assert bool(set(baseline_list) & set(grootcv_list)), ( 266 | "expect non-empty intersection" 267 | ) 268 | -------------------------------------------------------------------------------- /tests/test_featselect.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | import numpy as np 3 | import pandas as pd 4 | from arfs.feature_selection import ( 5 | MissingValueThreshold, 6 | UniqueValuesThreshold, 7 | CardinalityThreshold, 8 | CollinearityThreshold, 9 | ) 10 | from arfs.utils import ( 11 | _make_corr_dataset_regression, 12 | _make_corr_dataset_classification, 13 | ) 14 | 15 | 16 | class TestFeatSelectMissing: 17 | """ 18 | Test suite for FeatureSelector, missing values 19 | """ 20 | 21 | def test_identify_missing_for_classification(self): 22 | # not task dependent (same for clf and regr) 23 | X, y, w = _make_corr_dataset_classification(size=10) 24 | fs = MissingValueThreshold(threshold=0.01) 25 | fs.fit(X) 26 | message = "Expected: {0}, Actual: {1}".format( 27 | "var12", fs.not_selected_features_ 28 | ) 29 | assert fs.not_selected_features_ == ["var12"], message 30 | 31 | 32 | class TestFeatSelectZeroVariance: 33 | """ 34 | Test suite for FeatureSelector, missing values 35 | """ 36 | 37 | def test_identify_single_unique_classification(self): 38 | # not task dependent (same for clf and regr) 39 | X, y, w = _make_corr_dataset_classification(size=10) 40 | fs = UniqueValuesThreshold(threshold=2) 41 | fs.fit(X) 42 | message = "Expected: {0}, Actual: {1}".format( 43 | "var10", fs.not_selected_features_ 44 | ) 45 | assert fs.not_selected_features_ == ["var10"], message 46 | 47 | 48 | class TestFeatSelectHighCardinality: 49 | """ 50 | Test suite for FeatureSelector, high cardinality 51 | """ 52 | 53 | def test_identify_high_cardinality_classification(self): 54 | # not task dependent (same for clf and regr) 55 | X, y, w = _make_corr_dataset_classification(size=100) 56 | fs = CardinalityThreshold(threshold=5) 57 | fs.fit(X) 58 | expected = sorted(["dummy", "nice_guys"]) 59 | actual = sorted(list(fs.not_selected_features_)) 60 | message = "Expected: {0}, Actual: {1}".format(expected, actual) 61 | assert actual == expected, message 62 | 63 | 64 | # class TestFeatSelectCollinearity: 65 | # """ 66 | # test suite for FeatureSelector, high cardinality 67 | # """ 68 | 69 | # def test_identify_collinear_spearman_no_encoding(self): 70 | # X, y, w = _generated_corr_dataset_regr(size=100) 71 | # fs = FeatureSelector(X=X, y=y, sample_weight=w) 72 | # 
fs.identify_collinear(correlation_threshold=0.5, encode=False, method='spearman') 73 | # message = "Expected: {0}, Actual: {1}".format(['var2', 'var3', 'var4', 'var12'], fs.ops['collinear']) 74 | # assert fs.ops['collinear'] == ['var2', 'var3', 'var4', 'var12'], message 75 | 76 | # def test_identify_collinear_pearson_no_encoding(self): 77 | # X, y, w = _generated_corr_dataset_regr(size=100) 78 | # fs = FeatureSelector(X=X, y=y, sample_weight=w) 79 | # fs.identify_collinear(correlation_threshold=0.5, encode=False, method='pearson') 80 | # message = "Expected: {0}, Actual: {1}".format(['var2', 'var3', 'var12'], fs.ops['collinear']) 81 | # assert fs.ops['collinear'] == ['var2', 'var3', 'var12'], message 82 | 83 | # def test_identify_collinear_spearman_with_encoding(self): 84 | # X, y, w = _generated_corr_dataset_regr(size=100) 85 | # fs = FeatureSelector(X=X, y=y, sample_weight=w) 86 | # fs.identify_collinear(correlation_threshold=0.5, encode=True, method='spearman') 87 | # message = "Expected: {0}, Actual: {1}".format(['var2', 'var3', 'var4', 'var12'], fs.ops['collinear']) 88 | # assert fs.ops['collinear'] == ['var2', 'var3', 'var4', 'var12'], message 89 | 90 | # def test_identify_collinear_pearson_with_encoding(self): 91 | # X, y, w = _generated_corr_dataset_regr(size=100) 92 | # fs = FeatureSelector(X=X, y=y, sample_weight=w) 93 | # fs.identify_collinear(correlation_threshold=0.5, encode=True, method='pearson') 94 | # message = "Expected: {0}, Actual: {1}".format(['var2', 'var3', 'var12'], fs.ops['collinear']) 95 | # assert fs.ops['collinear'] == ['var2', 'var3', 'var12'], message 96 | 97 | 98 | # class TestFeatSelectZeroImportance: 99 | # """ 100 | # test suite for FeatureSelector, high cardinality 101 | # """ 102 | 103 | # def test_identify_zero_importance_for_regression_with_early_stopping(self): 104 | # X, y, w = _generated_corr_dataset_regr(size=100) 105 | # fs = FeatureSelector(X=X, y=y, sample_weight=w) 106 | # fs.identify_zero_importance(task='regression', eval_metric='l2', objective='l2', n_iterations=2, 107 | # early_stopping=True) 108 | # message = "Expected: {0}, Actual: {1}".format(['var10'], fs.ops['zero_importance']) 109 | # assert 'var10' in fs.ops['zero_importance'], message 110 | 111 | # @pytest.mark.xfail 112 | # def test_identify_zero_importance_for_regression_with_early_stopping_no_eval_metric(self): 113 | # X, y, w = _generated_corr_dataset_regr(size=100) 114 | # fs = FeatureSelector(X=X, y=y, sample_weight=w) 115 | # # Xfail: expected to fail because the eval metric is not provided 116 | # fs.identify_zero_importance(task='regression', eval_metric=None, objective='l2', n_iterations=2, 117 | # early_stopping=True) 118 | # message = "Expected: {0}, Actual: {1}".format(['var10'], fs.ops['zero_importance']) 119 | # assert 'var10' in fs.ops['zero_importance'], message 120 | 121 | # @pytest.mark.xfail 122 | # def test_identify_zero_importance_for_regression_with_early_stopping_no_eval_metric_no_objective(self): 123 | # X, y, w = _generated_corr_dataset_regr(size=100) 124 | # fs = FeatureSelector(X=X, y=y, sample_weight=w) 125 | # # Xfail: expected to fail because the eval metric is not provided 126 | # fs.identify_zero_importance(task='regression', eval_metric=None, objective=None, n_iterations=2, 127 | # early_stopping=True) 128 | # message = "Expected: {0}, Actual: {1}".format(['var10'], fs.ops['zero_importance']) 129 | # assert 'var10' in fs.ops['zero_importance'], message 130 | 131 | # @pytest.mark.xfail 132 | # def 
test_identify_zero_importance_for_regression_with_early_stopping_wrong_task(self): 133 | # X, y, w = _generated_corr_dataset_regr(size=10) 134 | # fs = FeatureSelector(X=X, y=y, sample_weight=w) 135 | # # Xfail: expected to fail because the eval metric is not provided 136 | # fs.identify_zero_importance(task='classification', eval_metric='l2', objective='l2', n_iterations=2, 137 | # early_stopping=True) 138 | # message = "Expected: {0}, Actual: {1}".format(['var10'], fs.ops['zero_importance']) 139 | # assert 'var10' in fs.ops['zero_importance'], message 140 | 141 | # def test_identify_zero_importance_for_regression_without_early_stopping(self): 142 | # X, y, w = _generated_corr_dataset_regr(size=100) 143 | # fs = FeatureSelector(X=X, y=y, sample_weight=w) 144 | # fs.identify_zero_importance(task='regression', eval_metric='l2', objective='l2', n_iterations=2, 145 | # early_stopping=False) 146 | # message = "Expected: {0}, Actual: {1}".format(['var10'], fs.ops['zero_importance']) 147 | # assert 'var10' in fs.ops['zero_importance'], message 148 | 149 | # def test_identify_zero_importance_for_regression_without_early_stopping_no_objective(self): 150 | # X, y, w = _generated_corr_dataset_regr(size=100) 151 | # fs = FeatureSelector(X=X, y=y, sample_weight=w) 152 | # fs.identify_zero_importance(task='regression', n_iterations=2, 153 | # early_stopping=False) 154 | # message = "Expected: {0}, Actual: {1}".format(['var10'], fs.ops['zero_importance']) 155 | # assert 'var10' in fs.ops['zero_importance'], message 156 | 157 | # def test_identify_zero_importance_for_classification_with_early_stopping(self): 158 | # X, y, w = _make_corr_dataset_classification(size=100) 159 | # fs = FeatureSelector(X=X, y=y, sample_weight=w) 160 | # fs.identify_zero_importance(task='classification', eval_metric='auc', n_iterations=2, 161 | # early_stopping=True) 162 | # message = "Expected: {0}, Actual: {1}".format(['var10'], fs.ops['zero_importance']) 163 | # assert 'var10' in fs.ops['zero_importance'], message 164 | 165 | # @pytest.mark.xfail 166 | # def test_identify_zero_importance_for_classification_with_early_stopping_no_eval_metric(self): 167 | # X, y, w = _make_corr_dataset_classification(size=10) 168 | # fs = FeatureSelector(X=X, y=y, sample_weight=w) 169 | # # Xfail: expected to fail because the eval metric is not provided 170 | # fs.identify_zero_importance(task='classification', eval_metric=None, n_iterations=2, 171 | # early_stopping=True) 172 | # message = "Expected: {0}, Actual: {1}".format(['var10'], fs.ops['zero_importance']) 173 | # assert 'var10' in fs.ops['zero_importance'], message 174 | 175 | # @pytest.mark.xfail 176 | # def test_identify_zero_importance_for_classification_with_early_stopping_no_eval_metric_no_objective(self): 177 | # X, y, w = _make_corr_dataset_classification(size=10) 178 | # fs = FeatureSelector(X=X, y=y, sample_weight=w) 179 | # # Xfail: expected to fail because the eval metric is not provided 180 | # fs.identify_zero_importance(task='classification', eval_metric=None, objective=None, n_iterations=2, 181 | # early_stopping=True) 182 | # message = "Expected: {0}, Actual: {1}".format(['var10'], fs.ops['zero_importance']) 183 | # assert 'var10' in fs.ops['zero_importance'], message 184 | 185 | # @pytest.mark.xfail 186 | # def test_identify_zero_importance_for_classification_with_early_stopping_wrong_task(self): 187 | # X, y, w = _make_corr_dataset_classification(size=10) 188 | # fs = FeatureSelector(X=X, y=y, sample_weight=w) 189 | # # Xfail: expected to fail because the eval metric 
is not provided 190 | # fs.identify_zero_importance(task='regression', eval_metric='auc', objective='cross-entropy', n_iterations=2, 191 | # early_stopping=True) 192 | # message = "Expected: {0}, Actual: {1}".format(['var10'], fs.ops['zero_importance']) 193 | # assert 'var10' in fs.ops['zero_importance'], message 194 | 195 | # def test_identify_zero_importance_for_classification_without_early_stopping(self): 196 | # X, y, w = _make_corr_dataset_classification(size=100) 197 | # fs = FeatureSelector(X=X, y=y, sample_weight=w) 198 | # fs.identify_zero_importance(task='classification', objective='binary', n_iterations=2, 199 | # early_stopping=False) 200 | # message = "Expected: {0}, Actual: {1}".format(['var10'], fs.ops['zero_importance']) 201 | # assert 'var10' in fs.ops['zero_importance'], message 202 | 203 | # def test_identify_zero_importance_for_classification_without_early_stopping_no_objective(self): 204 | # X, y, w = _make_corr_dataset_classification(size=100) 205 | # fs = FeatureSelector(X=X, y=y, sample_weight=w) 206 | # fs.identify_zero_importance(task='classification', n_iterations=2, 207 | # early_stopping=False) 208 | # message = "Expected: {0}, Actual: {1}".format(['var10'], fs.ops['zero_importance']) 209 | # assert 'var10' in fs.ops['zero_importance'], message 210 | 211 | 212 | # class TestFeatSelectLowImportance: 213 | # """ 214 | # test suite for FeatureSelector, high cardinality 215 | # """ 216 | 217 | # def test_identify_low_importance_for_regression_with_early_stopping(self): 218 | # X, y, w = _generated_corr_dataset_regr(size=100) 219 | # fs = FeatureSelector(X=X, y=y, sample_weight=w) 220 | # fs.identify_zero_importance(task='regression', eval_metric='l2', objective='l2', n_iterations=2, 221 | # early_stopping=True) 222 | # cum_imp_threshold = 0.95 223 | # fs.identify_low_importance(cumulative_importance=cum_imp_threshold) 224 | # expected = 1 225 | # message = "Expected at least one predictor ruled out, Actual: {0}".format(sorted(fs.ops['low_importance'])) 226 | # assert len(fs.ops['low_importance']) >= expected, message 227 | 228 | # @pytest.mark.xfail 229 | # def test_identify_low_importance_for_regression_with_early_stopping_no_eval_metric(self): 230 | # X, y, w = _generated_corr_dataset_regr(size=100) 231 | # fs = FeatureSelector(X=X, y=y, sample_weight=w) 232 | # # Xfail: expected to fail because the eval metric is not provided 233 | # fs.identify_zero_importance(task='regression', eval_metric=None, objective='l2', n_iterations=2, 234 | # early_stopping=True) 235 | # cum_imp_threshold = 0.95 236 | # fs.identify_low_importance(cumulative_importance=cum_imp_threshold) 237 | # expected = 1 238 | # message = "Expected at least one predictor ruled out, Actual: {0}".format(sorted(fs.ops['low_importance'])) 239 | # assert len(fs.ops['low_importance']) >= expected, message 240 | 241 | # @pytest.mark.xfail 242 | # def test_identify_low_importance_for_regression_with_early_stopping_no_eval_metric_no_objective(self): 243 | # X, y, w = _generated_corr_dataset_regr(size=100) 244 | # fs = FeatureSelector(X=X, y=y, sample_weight=w) 245 | # # Xfail: expected to fail because the eval metric is not provided 246 | # fs.identify_zero_importance(task='regression', eval_metric=None, objective=None, n_iterations=2, 247 | # early_stopping=True) 248 | # cum_imp_threshold = 0.95 249 | # fs.identify_low_importance(cumulative_importance=cum_imp_threshold) 250 | # expected = 1 251 | # message = "Expected at least one predictor ruled out, Actual: {0}".format(sorted(fs.ops['low_importance'])) 
252 | # assert len(fs.ops['low_importance']) >= expected, message 253 | 254 | # @pytest.mark.xfail 255 | # def test_identify_low_importance_for_regression_with_early_stopping_wrong_task(self): 256 | # X, y, w = _generated_corr_dataset_regr(size=100) 257 | # fs = FeatureSelector(X=X, y=y, sample_weight=w) 258 | # # Xfail: expected to fail because the eval metric is not provided 259 | # fs.identify_zero_importance(task='classification', eval_metric='l2', objective='l2', n_iterations=2, 260 | # early_stopping=True) 261 | # cum_imp_threshold = 0.95 262 | # fs.identify_low_importance(cumulative_importance=cum_imp_threshold) 263 | # expected = 1 264 | # message = "Expected at least one predictor ruled out, Actual: {0}".format(sorted(fs.ops['low_importance'])) 265 | # assert len(fs.ops['low_importance']) >= expected, message 266 | 267 | # def test_identify_low_importance_for_regression_without_early_stopping(self): 268 | # X, y, w = _generated_corr_dataset_regr(size=100) 269 | # fs = FeatureSelector(X=X, y=y, sample_weight=w) 270 | # fs.identify_zero_importance(task='regression', objective='l2', n_iterations=2, early_stopping=False) 271 | # cum_imp_threshold = 0.95 272 | # fs.identify_low_importance(cumulative_importance=cum_imp_threshold) 273 | # expected = 1 274 | # message = "Expected at least one predictor ruled out, Actual: {0}".format(sorted(fs.ops['low_importance'])) 275 | # assert len(fs.ops['low_importance']) >= expected, message 276 | 277 | # def test_identify_low_importance_for_regression_without_early_stopping_no_objective(self): 278 | # X, y, w = _generated_corr_dataset_regr(size=100) 279 | # fs = FeatureSelector(X=X, y=y, sample_weight=w) 280 | # fs.identify_zero_importance(task='regression', n_iterations=2, early_stopping=False) 281 | # cum_imp_threshold = 0.95 282 | # fs.identify_low_importance(cumulative_importance=cum_imp_threshold) 283 | # expected = 1 284 | # message = "Expected at least one predictor ruled out, Actual: {0}".format(sorted(fs.ops['low_importance'])) 285 | # assert len(fs.ops['low_importance']) >= expected, message 286 | 287 | # def test_identify_low_importance_for_classification_with_early_stopping(self): 288 | # X, y, w = _make_corr_dataset_classification(size=100) 289 | # fs = FeatureSelector(X=X, y=y, sample_weight=w) 290 | # fs.identify_zero_importance(task='classification', eval_metric='auc', n_iterations=2, early_stopping=True) 291 | # cum_imp_threshold = 0.95 292 | # fs.identify_low_importance(cumulative_importance=cum_imp_threshold) 293 | # expected = 1 294 | # message = "Expected at least one predictor ruled out, Actual: {0}".format(sorted(fs.ops['low_importance'])) 295 | # assert len(fs.ops['low_importance']) >= expected, message 296 | 297 | # @pytest.mark.xfail 298 | # def test_identify_low_importance_for_classification_with_early_stopping_no_eval_metric(self): 299 | # X, y, w = _make_corr_dataset_classification(size=100) 300 | # fs = FeatureSelector(X=X, y=y, sample_weight=w) 301 | # # Xfail: expected to fail because the eval metric is not provided 302 | # fs.identify_zero_importance(task='classification', eval_metric=None, n_iterations=2, 303 | # early_stopping=True) 304 | # cum_imp_threshold = 0.95 305 | # fs.identify_low_importance(cumulative_importance=cum_imp_threshold) 306 | # expected = 1 307 | # message = "Expected at least one predictor ruled out, Actual: {0}".format(sorted(fs.ops['low_importance'])) 308 | # assert len(fs.ops['low_importance']) >= expected, message 309 | 310 | # @pytest.mark.xfail 311 | # def 
test_identify_low_importance_for_classification_with_early_stopping_no_eval_metric_no_objective(self): 312 | # X, y, w = _make_corr_dataset_classification(size=100) 313 | # fs = FeatureSelector(X=X, y=y, sample_weight=w) 314 | # # Xfail: expected to fail because the eval metric is not provided 315 | # fs.identify_zero_importance(task='classification', eval_metric=None, objective=None, n_iterations=2, 316 | # early_stopping=True) 317 | # cum_imp_threshold = 0.95 318 | # fs.identify_low_importance(cumulative_importance=cum_imp_threshold) 319 | # expected = 1 320 | # message = "Expected at least one predictor ruled out, Actual: {0}".format(sorted(fs.ops['low_importance'])) 321 | # assert len(fs.ops['low_importance']) >= expected, message 322 | 323 | # @pytest.mark.xfail 324 | # def test_identify_low_importance_for_classification_with_early_stopping_wrong_task(self): 325 | # X, y, w = _make_corr_dataset_classification(size=100) 326 | # fs = FeatureSelector(X=X, y=y, sample_weight=w) 327 | # # Xfail: expected to fail because the eval metric is not provided 328 | # fs.identify_zero_importance(task='regression', eval_metric='auc', objective='cross-entropy', n_iterations=2, 329 | # early_stopping=True) 330 | # cum_imp_threshold = 0.95 331 | # fs.identify_low_importance(cumulative_importance=cum_imp_threshold) 332 | # expected = 1 333 | # message = "Expected at least one predictor ruled out, Actual: {0}".format(sorted(fs.ops['low_importance'])) 334 | # assert len(fs.ops['low_importance']) >= expected, message 335 | 336 | # def test_identify_low_importance_for_classification_without_early_stopping(self): 337 | # X, y, w = _make_corr_dataset_classification(size=100) 338 | # fs = FeatureSelector(X=X, y=y, sample_weight=w) 339 | # fs.identify_zero_importance(task='classification', n_iterations=2, early_stopping=False) 340 | # cum_imp_threshold = 0.95 341 | # fs.identify_low_importance(cumulative_importance=cum_imp_threshold) 342 | # expected = 1 343 | # message = "Expected at least one predictor ruled out, Actual: {0}".format(sorted(fs.ops['low_importance'])) 344 | # assert len(fs.ops['low_importance']) >= expected, message 345 | --------------------------------------------------------------------------------
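A minimal usage sketch of the Gower utilities from src/arfs/sampling.py shown above. The import
path arfs.sampling is inferred from the src layout, and the toy DataFrame and column names are
illustrative only; treat this as a sketch under those assumptions rather than documented API usage.

import pandas as pd

from arfs.sampling import gower_matrix, gower_topn  # assumed import path (src/arfs/sampling.py)

# a small mixed-type frame: two numeric columns and one categorical (object) column
df = pd.DataFrame(
    {
        "age": [25, 32, 47, 51, 38, 29],
        "income": [32_000, 54_000, 61_000, 75_000, 48_000, 39_000],
        "city": ["lyon", "paris", "paris", "lille", "lyon", "paris"],
    }
)

# pairwise Gower distances; categorical columns are auto-detected from the dtypes
dist = gower_matrix(df, cat_features="auto")
print(dist.shape)  # (6, 6), with zeros on the diagonal

# the 3 rows most similar to the first row; the query must be a single-row DataFrame
query = df.iloc[[0]]
top = gower_topn(df, query, n=3)
print(top["index"], top["values"])  # indices and Gower distances of the 3 closest rows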