├── .github
│   ├── FUNDING.yml
│   ├── dependabot.yml
│   └── workflows
│       └── ruff.yml
├── .gitignore
├── .readthedocs.yml
├── .vs
│   ├── VSWorkspaceState.json
│   ├── allrelevantfs
│   │   └── v17
│   │       └── .suo
│   └── slnx.sqlite
├── CHANGELOG.md
├── CITATION.cff
├── LICENSE.md
├── README.md
├── docs
│   ├── Introduction.rst
│   ├── Makefile
│   ├── Methods overview.rst
│   ├── arfs.feature_selection.rst
│   ├── arfs.rst
│   ├── boostaroota.png
│   ├── boruta.png
│   ├── conf.py
│   ├── grootcv.png
│   ├── index.rst
│   ├── logo.png
│   ├── make.bat
│   ├── modules.rst
│   ├── notebooks
│   │   ├── arfs_boruta_borutaShap_comparison.ipynb
│   │   ├── arfs_classification.ipynb
│   │   ├── arfs_grootcv_custom_params.ipynb
│   │   ├── arfs_large_data_sampling.ipynb
│   │   ├── arfs_non_normal_loss_and_sample_weight.ipynb
│   │   ├── arfs_on_GPU.ipynb
│   │   ├── arfs_regression.ipynb
│   │   ├── arfs_shap_vs_fastshap.ipynb
│   │   ├── arfs_timeseries.ipynb
│   │   ├── association_and_feature_selection.ipynb
│   │   ├── basic_feature_selection.ipynb
│   │   ├── bender_hex_mini.png
│   │   ├── issue_categoricals.ipynb
│   │   ├── issue_collinearity.ipynb
│   │   ├── lasso_feature_selection.ipynb
│   │   ├── mrmr_feature_selection.ipynb
│   │   ├── mrmr_fs_VS_arfs.ipynb
│   │   └── preprocessing.ipynb
│   └── requirements.txt
├── images
│   ├── boostagroota-boston-lgb.png
│   ├── grootcv-boston.png
│   ├── leshy-boston.png
│   ├── leshy-titanic-catboost-shap.png
│   ├── leshy-titanic-lgbm-shap.png
│   └── leshy-titanic-rndforest-shap.png
├── logo.png
├── pyproject.toml
├── src
│   └── arfs
│       ├── .gitignore
│       ├── __init__.py
│       ├── association.py
│       ├── benchmark.py
│       ├── dataset
│       │   ├── data
│       │   │   ├── boston_bunch.joblib
│       │   │   └── housing.zip
│       │   └── descr
│       │       └── housing.rst
│       ├── feature_selection
│       │   ├── __init__.py
│       │   ├── allrelevant.py
│       │   ├── base.py
│       │   ├── lasso.py
│       │   ├── mrmr.py
│       │   ├── summary.py
│       │   ├── unsupervised.py
│       │   └── variable_importance.py
│       ├── gbm.py
│       ├── parallel.py
│       ├── preprocessing.py
│       ├── sampling.py
│       └── utils.py
├── tests
│   ├── __init__.py
│   ├── test_allrelevant.py
│   └── test_featselect.py
└── uv.lock
/.github/FUNDING.yml:
--------------------------------------------------------------------------------
1 | # These are supported funding model platforms
2 |
3 | github: # Replace with up to 4 GitHub Sponsors-enabled usernames e.g., [user1, user2]
4 | patreon: # Replace with a single Patreon username
5 | open_collective: # Replace with a single Open Collective username
6 | ko_fi: V7V72SOHX
7 | tidelift: # Replace with a single Tidelift platform-name/package-name e.g., npm/babel
8 | community_bridge: # Replace with a single Community Bridge project-name e.g., cloud-foundry
9 | liberapay: # Replace with a single Liberapay username
10 | issuehunt: # Replace with a single IssueHunt username
11 | otechie: # Replace with a single Otechie username
12 | custom: # Replace with up to 4 custom sponsorship URLs e.g., ['link1', 'link2']
13 |
--------------------------------------------------------------------------------
/.github/dependabot.yml:
--------------------------------------------------------------------------------
1 | # To get started with Dependabot version updates, you'll need to specify which
2 | # package ecosystems to update and where the package manifests are located.
3 | # Please see the documentation for all configuration options:
4 | # https://help.github.com/github/administering-a-repository/configuration-options-for-dependency-updates
5 |
6 | version: 2
7 | updates:
8 | - package-ecosystem: "pip"
9 | directory: "/"
10 | schedule:
11 | interval: "weekly"
12 | allow:
13 | # Allow only direct updates for
14 | # Django and any packages starting "django"
15 | - dependency-name: "django*"
16 | dependency-type: "direct"
17 | # Allow only production updates for Sphinx
18 | - dependency-name: "sphinx"
19 | dependency-type: "production"
20 |
--------------------------------------------------------------------------------
/.github/workflows/ruff.yml:
--------------------------------------------------------------------------------
1 | name: Ruff
2 | on: [ push, pull_request ]
3 | jobs:
4 | ruff:
5 | runs-on: ubuntu-latest
6 | steps:
7 | - uses: actions/checkout@v4
8 | - uses: astral-sh/ruff-action@v3
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | ### Python template
2 | # example NB
3 | examples/catboost_info/
4 | examples/.ipynb_checkpoints/
5 | examples/cb_model.json
6 |
7 | # Byte-compiled / optimized / DLL files
8 | __pycache__/
9 | *.py[cod]
10 | *$py.class
11 |
12 | # C extensions
13 | *.so
14 |
15 | # ide
16 | .idea/
17 | .vscode/
18 |
19 | # Distribution / packaging
20 | .Python
21 | build/
22 | develop-eggs/
23 | dist/
24 | downloads/
25 | eggs/
26 | .eggs/
27 | lib/
28 | lib64/
29 | parts/
30 | sdist/
31 | var/
32 | wheels/
33 | *.egg-info/
34 | .installed.cfg
35 | *.egg
36 | MANIFEST
37 |
38 | # PyInstaller
39 | # Usually these files are written by a python script from a template
40 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
41 | *.manifest
42 | *.spec
43 |
44 | # Installer logs
45 | pip-log.txt
46 | pip-delete-this-directory.txt
47 |
48 | # Unit test / coverage reports
49 | htmlcov/
50 | .tox/
51 | .coverage
52 | .coverage.*
53 | .cache
54 | nosetests.xml
55 | coverage.xml
56 | *.cover
57 | .hypothesis/
58 | .pytest_cache/
59 |
60 | # Translations
61 | *.mo
62 | *.pot
63 |
64 | # Django stuff:
65 | *.log
66 | local_settings.py
67 | db.sqlite3
68 |
69 | # Flask stuff:
70 | instance/
71 | .webassets-cache
72 |
73 | # Scrapy stuff:
74 | .scrapy
75 |
76 | # Sphinx documentation
77 | docs/_build/
78 |
79 | # PyBuilder
80 | target/
81 |
82 | # Jupyter Notebook
83 | .ipynb_checkpoints
84 |
85 | # pyenv
86 | .python-version
87 |
88 | # celery beat schedule file
89 | celerybeat-schedule
90 |
91 | # SageMath parsed files
92 | *.sage.py
93 |
94 | # Environments
95 | .env
96 | .venv
97 | env/
98 | venv/
99 | ENV/
100 | env.bak/
101 | venv.bak/
102 |
103 | # Spyder project settings
104 | .spyderproject
105 | .spyproject
106 |
107 | # Rope project settings
108 | .ropeproject
109 |
110 | # mkdocs documentation
111 | /site
112 |
113 | # mypy
114 | .mypy_cache/
115 |
116 | # Covers JetBrains IDEs: IntelliJ, RubyMine, PhpStorm, AppCode, PyCharm, CLion, Android Studio and WebStorm
117 | # Reference: https://intellij-support.jetbrains.com/hc/en-us/articles/206544839
118 |
119 | # User-specific stuff
120 | .idea/**/workspace.xml
121 | .idea/**/tasks.xml
122 | .idea/**/dictionaries
123 | .idea/**/shelf
124 |
125 | # Sensitive or high-churn files
126 | .idea/**/dataSources/
127 | .idea/**/dataSources.ids
128 | .idea/**/dataSources.local.xml
129 | .idea/**/sqlDataSources.xml
130 | .idea/**/dynamic.xml
131 | .idea/**/uiDesigner.xml
132 | .idea/**/dbnavigator.xml
133 |
134 | # Gradle
135 | .idea/**/gradle.xml
136 | .idea/**/libraries
137 |
138 | # CMake
139 | cmake-build-debug/
140 | cmake-build-release/
141 |
142 | # Mongo Explorer plugin
143 | .idea/**/mongoSettings.xml
144 |
145 | # File-based project format
146 | *.iws
147 |
148 | # IntelliJ
149 | out/
150 |
151 | # mpeltonen/sbt-idea plugin
152 | .idea_modules/
153 |
154 | # JIRA plugin
155 | atlassian-ide-plugin.xml
156 |
157 | # Cursive Clojure plugin
158 | .idea/replstate.xml
159 |
160 | # Crashlytics plugin (for Android Studio and IntelliJ)
161 | com_crashlytics_export_strings.xml
162 | crashlytics.properties
163 | crashlytics-build.properties
164 | fabric.properties
165 |
166 | # Editor-based Rest Client
167 | .idea/httpRequests
168 |
--------------------------------------------------------------------------------
/.readthedocs.yml:
--------------------------------------------------------------------------------
1 | # Read the Docs configuration file for Sphinx projects
2 | # See https://docs.readthedocs.io/en/stable/config-file/v2.html for details
3 |
4 | # Required
5 | version: 2
6 |
7 | # Set the OS, Python version and other tools you might need
8 | build:
9 | os: "ubuntu-22.04"
10 | tools:
11 | python: "3.10"
12 | jobs:
13 | post_install:
14 | - pip uninstall -y sphinx-rtd-theme
15 |
16 | # Build documentation in the "docs/" directory with Sphinx
17 | sphinx:
18 | configuration: docs/conf.py
19 |
20 | # We recommend specifying your dependencies to enable reproducible builds:
21 | # https://docs.readthedocs.io/en/stable/guides/reproducible-builds.html
22 | python:
23 | install:
24 | - requirements: docs/requirements.txt
25 | - method: pip
26 | path: .
27 | extra_requirements:
28 | - docs
29 |
--------------------------------------------------------------------------------
/.vs/VSWorkspaceState.json:
--------------------------------------------------------------------------------
1 | {
2 | "ExpandedNodes": [
3 | "",
4 | "\\arfs"
5 | ],
6 | "SelectedNode": "\\arfs\\allrelevant.py",
7 | "PreviewInSolutionExplorer": false
8 | }
--------------------------------------------------------------------------------
/.vs/allrelevantfs/v17/.suo:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ThomasBury/arfs/03f67d0a54b69fac5ddbb83e306c8e8e72e2d3a2/.vs/allrelevantfs/v17/.suo
--------------------------------------------------------------------------------
/.vs/slnx.sqlite:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ThomasBury/arfs/03f67d0a54b69fac5ddbb83e306c8e8e72e2d3a2/.vs/slnx.sqlite
--------------------------------------------------------------------------------
/CHANGELOG.md:
--------------------------------------------------------------------------------
1 | # Changes
2 |
3 | ## 3.0.0
4 |
5 | - [ENHANCEMENT] Upgrade to newer SHAP and lightgbm version
6 | - [ENHANCEMENT] Migrate project management to `uv`
7 |
8 | ## 2.4.0
9 | - [BUG] Add a safety for the array size in the weighted correlation ratio
10 | - [DOC] Contribution for better documentation, typos and fixing docstrings
11 |
12 | ## 2.3.3
13 | - [BUG] Fix range, which should run from 1 to `max_iter`
14 |
15 | ## 2.3.2
16 | - [BUG] Fix errors generated when updating dependencies with different naming for arguments
17 |
18 | ## 2.3.1
19 | - [BUG] replace np.Inf by np.inf for compatibility purposes
20 |
21 | ## 2.3.0
22 | - [BUG] corrected the column names for the GrootCV scheme, setting the shadow var in last position to guarantee the real names are used
23 | - [ENHANCEMENT] support user defined cross-validation scheme for time series applications for GrootCV
24 |
25 | ## 2.2.6
26 | - [BUG] fix the calculation of the SHAP feature importance for multi-class
27 | - [ENHANCEMENT] Update pandas aggregation to get rid of the future deprecation warnings
28 |
29 | ## 2.2.5
30 | - [BUG] fix the calculation of the SHAP feature importance for multi-class
31 | - [ENHANCEMENT] return the feature for the importance
32 |
33 | ## 2.2.4
34 | - [BUG] add axis=1 to compute the max on the right dimension in _reduce_vars_sklearn
35 | - [BUG] remove merge causing duplication of the feature importance in _reduce_vars_sklearn
36 |
37 | ## 2.2.3
38 | - [BUG] change the default of the weighted correlation for consistency with existing doc
39 | - [ENHANCEMENT] speed up the correlation feature selector
40 | ## 2.2.1
41 | - [BUG] add copy() to prevent modifying the input pandas DF in the mrmr when fitting the mrmr selector
42 | ## 2.2.0
43 | - [BUG] fix the collinearity feature elimination
44 | - [BUG] fix the feature importance if fasttreeshap not installed
45 | - [REFACTORING] refactor the association module for removing redundancy and faster computation
46 | ## 2.1.3
47 | - [BUG] fix the hardcoded threshold in collinearity elimination, closes #33
48 | ## 2.1.2
49 | - [BUG] fix a bug in computing the association matrix when a single column of a specific dtype is passed in the sub_matrix (nom-nom, num-num) calculators.
50 | ## 2.1.1
51 | - Refactor TreeDiscretizer
52 | ## 2.1.0
53 | - Add a mechanism to the TreeDiscretizer that restricts the length of combined strings for categorical columns, preventing excessively lengthy entries.
54 | ## 2.0.7
55 | - implement link for the lasso feature selection, e.g. log for ensuring positivity
56 | ## 2.0.6
57 | - downgrade the lightgbm version to 3.3.1 for compatibility reasons (with optuna for instance)
58 | ## 2.0.5
59 | - Fix: strictly greater than threshold rather than geq in the base threshold transformer
60 | - Update: due to a change in the lightgbm train API (v4), update the code for GBM
61 | ## 2.0.4
62 | - Documentation: fix the format of some docstrings and remove old sphinx generated files
63 | ## 2.0.3
64 | - Fix: remove unnecessary `__all__` in the preprocessing module and improve the consistency of the module docstrings
65 | ## 2.0.2
66 | - Fix: when the L1 == 0 in fit_regularized, statsmodels returns the regularized wrapper without refit, which breaks the class (statistics not available)
67 | ## 2.0.1
68 | - Build: remove explicit dependencies on holoviews and panel
69 | ## 2.0.0
70 | - Add fasttreeshap implementation as an option to compute shap importance (fasttreeshap does not work with XGBoost though)
71 | - New feature: lasso feature selection, especially useful for models without interactions (LM, GLM, GAM)
72 | - New feature: pass lightgbm parameters to GrootCV
73 | - Bug: fix sample weight shape in mrMR
74 | - Documentation: update and upgrade tuto NB
75 | ## 1.1.4
76 | - update the required python version >= 3.9
77 | ## 1.1.3
78 | - Change tqdm to auto for better rendering in NB for variable importance selector
79 | - User defined n_jobs for association matrix computation
80 | ## 1.1
81 |
82 | - Corrected an issue in Leshy that occurred when using categorical variables. The use of NumPy functions and methods instead of Pandas ones resulted in the modification of original data types.
83 |
84 | ## 1.0.7
85 |
86 | - Patch preventing zero division in the conditional entropy calculation
87 |
88 | ## 1.0.6
89 |
90 | - Return self in mrmr, fixing error when in scikit-learn pipeline
91 |
92 | ## 1.0.5
93 |
94 | - Patching classes where old unused argument was causing an error
95 |
96 | ## 1.0.2
97 |
98 | - Distribute a toy dataset for regression by modifying the Boston dataset adding noise and made up columns
99 |
100 | ## 1.0.1
101 |
102 | - Fix pkg data distribution
103 |
104 | ## 1.0.0
105 |
106 | - Parallelization of functions applied on pandas data frame
107 | - Faster and more modular association measures
108 | - Removing dependencies (e.g. dython)
109 | - Better static and interactive visualization
110 | - Sklearn selectors rather than a big class
111 | - Discretization of continuous and categorical predictors
112 | - Minimal redundancy maximal relevance feature selection added (a subset of all relevant predictors), based on Uber's MRmr flavor
113 | - architecture closer to the scikit-learn one
114 |
115 | ## 0.3.8
116 |
117 | - Fix bug when compute shap importance for classifier in GrootCV
118 |
119 | ## 0.3.7
120 |
121 | - Add defensive check if no categorical found in the subsampling of the dataset
122 | - Re-run the notebooks with the new version
123 | ## 0.3.6
124 |
125 | - Fix clustering when plotting only strongly correlated predictors
126 | - Remove palettable dependencies for plotting
127 | - Add default colormap but implement the user defined option
128 | ## 0.3.5
129 |
130 | - Enable clustering before plotting the correlation/association matrix, optional
131 | - Decrease fontsize for the labels of the correlation matrix
132 |
133 | ## 0.3.4
134 |
135 | - Update requirements
136 |
137 | ## 0.3.3
138 |
139 | - Upgrade documentation
140 |
141 | ## 0.3.2
142 |
143 | - Fix typo for distributing the dataset and pinned the dependencies
144 | ## 0.3.1
145 |
146 | - Update the syntax for computing associations using the latest version of dython
147 |
148 | ## 0.3.0
149 |
150 | - Fix the Boruta_py feature counts, now adds up to n_features
151 | - Fix the boxplot colours, when only rejected and accepted (no tentative) the background color was the tentative color
152 | - Numpy docstring style
153 | - Implement the new lightGBM callbacks. The new lgbm version (>3.3.0) implements the early stopping using a callback rather than an argument
154 | - Fix a bug for computing the shap importance when the estimator is lightGBM and the task is classification
155 | - Add ranking and absolute ranking attributes for all the classes
156 | - Fix future pandas TypeError when computing numerical values on a dataframe containing non-numerical columns
157 | - Add housing data to the distribution
158 | - Add "extreme" sampling methods
159 | - Re-run the NBs
160 | - reindex to keep the original columns order
161 |
162 | ## 0.2.3
163 |
164 | - Update syntax to stick to the new argument names in Dython
165 |
166 | ## 0.2.2
167 |
168 | - Check if no feature selected, warn rather than throw error
169 |
170 | ## 0.2.1
171 |
172 | - Fix a bug when removing collinear columns
173 |
174 | ## 0.2.0
175 |
176 | - Prefilters now support the filtering of continuous and nominal (categorical) collinear variables
177 |
178 | ## 0.1.6
179 |
180 | - improve the plot_y_vs_X function
181 | - remove gc.collect()
182 |
183 | ## 0.1.5
184 |
185 | - fix readme (typos)
186 | - move utilities in utils sub-package
187 | - make unit tests lighter
188 |
189 | ## 0.1.4
190 |
191 | - fix bug when using catboost, clone estimator (avoid error and be sure to use a non-fitted estimator)
192 |
193 | ## 0.1.3
194 |
195 | - change the default for categorical encoding in pre-filters (pd.cat to integers as default)
196 | - fix the unit tests with new defaults and names
197 |
198 | ## 0.1.2
199 |
200 | - change arguments name in pre-filters
201 |
202 | ## 0.1.1
203 |
204 | - remove old attribute names in unit-tests
205 |
206 | ## 0.1.0
207 |
208 | - Fix lightGBM warnings
209 | - Typo in repr
210 | - Provide load_data utility
211 | - Enhance jupyter NB examples
212 | - highlighting synthetic random predictors
213 | - Benchmark using sklearn permutation importance
214 | - Harmonization of the attributes and parameters
215 | - Fix categoricals handling
216 |
217 | ## 0.0.4
218 |
219 | - setting optimal number of features (according to "Elements of statistical learning") when using lightGBM random forest boosting.
220 | - Providing random forest, lightgbm implementation, estimators
221 |
222 | ## 0.0.3
223 |
224 | - Adding examples and expanding documentation
225 |
226 | ## 0.0.2
227 |
228 | - fix bug: relative import removed
--------------------------------------------------------------------------------
/CITATION.cff:
--------------------------------------------------------------------------------
1 | cff-version: 1.2.0
2 | abstract: >-
3 | All relevant feature selection means trying to find all features carrying information usable for prediction, rather than finding a possibly compact subset of features on which some particular model has a minimal error. This might include redundant predictors.
4 | title: All relevant feature selection
5 | message: >-
6 | If you use this software, please cite it using the metadata from this file.
7 | type: software
8 | authors:
9 | - given-names: Thomas
10 | family-names: Bury
11 | orcid: 'https://orcid.org/0000-0003-1421-4184'
12 | keywords:
13 | - "Feature Selection"
14 | - "All Relevant Feature Selection"
15 | - "Machine Learning"
16 | license: MIT
17 | url: 'https://github.com/ThomasBury/arfs'
18 | version: 3.0.0
19 | date-released: 2021-12-18
20 |
--------------------------------------------------------------------------------
/LICENSE.md:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) [2020] [Thomas Bury]
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 |
2 |
3 | [buy me caffeine](https://ko-fi.com/V7V72SOHX)
4 |
5 | [](https://badge.fury.io/py/arfs) [](https://pepy.tech/project/arfs) [](https://arfs.readthedocs.io/en/latest/?badge=latest) [](https://img.shields.io/badge/code%20style-black-black)
6 |
7 |
8 | [ARFS readthedocs](https://arfs.readthedocs.io/en/latest/#)
9 |
10 | # All relevant feature selection
11 |
12 | All relevant feature selection means trying to find all features carrying information usable for prediction, rather than finding a possibly compact subset of features on which some particular model has a minimal error. This might include redundant predictors. All relevant feature selection is model agnostic in the sense that it doesn't optimize a scoring function for a *specific* model but rather tries to select all the predictors which are related to the response.
13 |
14 | This package implements 3 different methods (Leshy is an evolution of Boruta, BoostAGroota is an evolution of BoostARoota and GrootCV is a new one). They are sklearn compatible. See hereunder for details about those methods. You can use any sklearn compatible estimator with Leshy and BoostAGroota, but I recommend lightGBM: it's fast, accurate and has SHAP values built in. The package also provides a module for preprocessing and basic feature selection (autobinning, removing columns with too many missing values, zero variance, high cardinality, high correlation, etc.). Examples and detailed methods hereunder.
15 |
16 | Moreover, as an alternative to the all relevant problem, the ARFS package provides a MRmr feature selection which, theoretically, returns a subset of the predictors selected by an arfs method. ARFS also provides a `LASSO` feature selection which works especially well for (G)LMs and GAMs. You can combine Lasso with the `TreeDiscretizer` for introducing non-linearities into linear models and perform feature selection.
17 |
18 | Please note that one limitation of the lasso is that it treats the levels of a categorical predictor individually. However, this issue can be addressed by utilizing the `TreeDiscretizer`, which automatically bins numerical variables and groups the levels of categorical variables.
19 |
20 | ## Installation
21 |
22 | `$ pip install arfs`
23 |
24 | Note: If you're interested in using the `fastshap` option, you'll need to install [fasttreeshap](https://github.com/linkedin/FastTreeSHAP) first. For a smooth installation process, I suggest using `conda install -c conda-forge fasttreeshap` since the C++ source code requires compilation. Using pip may involve additional dependencies, such as Visual Studio for compiling the C++ code.
25 |
26 | ## Example
27 |
28 | Working examples for:
29 |
30 | - [Preprocessing](./docs/notebooks/preprocessing.ipynb)
31 | - [Basic FS (best before ARFS)](./docs/notebooks/basic_feature_selection.ipynb)
32 | - [Regression](./docs/notebooks/arfs_regression.ipynb)
33 | - [Classification](./docs/notebooks/arfs_classification.ipynb)
34 | - [LASSO and (G)LM feature selection](./docs/notebooks/lasso_feature_selection.ipynb)
35 | - [Passing custom params](./docs/notebooks/arfs_grootcv_custom_params.ipynb)
36 | - [Non-normal loss and sample weights](./docs/notebooks/arfs_non_normal_loss_and_sample_weight.ipynb)
37 | - [ARFS on GPU](./docs/notebooks/arfs_on_GPU.ipynb)
38 | - [Fast Shap](./docs/notebooks/arfs_shap_vs_fastshap.ipynb)
39 | - [Categoricals](./docs/notebooks/issue_categoricals.ipynb)
40 | - [Collinearity](./docs/notebooks/issue_collinearity.ipynb)
41 | - [Reducing run time for large data](./docs/notebooks/arfs_large_data_sampling.ipynb)
42 | - [Comparison to Boruta and BorutaShap](./docs/notebooks/arfs_boruta_borutaShap_comparison.ipynb)
43 | - [MRmr alternative](./docs/notebooks/mrmr_feature_selection.ipynb)
44 | - [MRmr vs ARFS](./docs/notebooks/mrmr_fs_VS_arfs.ipynb)
45 |
46 | For imbalanced classification:
47 | - GrootCV will automatically detect imbalanced data and set the lightGBM `'is_unbalance' = True`
48 | - For Leshy and BoostAGroota, you can pass the estimator with the relevant parameter (e.g. `class_weight = 'balanced'`)
49 |
50 |
51 |
52 | ## Boruta
53 |
54 | The Boruta algorithm tries to capture all the important features you might have in your dataset with respect to an outcome variable. The procedure is the following:
55 |
56 | * Create duplicate copies of all independent variables. When the number of independent variables in the original data is less than 5, create at least 5 copies using existing variables.
57 | * Shuffle the values of added duplicate copies to remove their correlations with the target variable. It is called shadow features or permuted copies.
58 | * Combine the original ones with shuffled copies
59 | * Run a random forest classifier on the combined dataset and perform a variable importance measure (the default is Mean Decrease Accuracy) to evaluate the importance of each variable, where higher means more important.
60 | * Then a Z score is computed: the mean of accuracy loss divided by the standard deviation of accuracy loss.
61 | * Find the maximum Z score among shadow attributes (MZSA)
62 | * Tag the variables as 'unimportant' when they have importance significantly lower than MZSA. Then we permanently remove them from the process.
63 | * Tag the variables as 'important' when they have importance significantly higher than MZSA.
64 | * Repeat the above steps for a predefined number of iterations (random forest runs), or until all attributes are either tagged 'unimportant' or 'important', whichever comes first.
65 |
66 | At every iteration, the algorithm compares the Z-scores of the shuffled copies of the features and the original features to see if the latter performed better than the former. If it does, the algorithm will mark the feature as important. In essence, the algorithm is trying to validate the importance of the feature by comparing with randomly shuffled copies, which increases the robustness. This is done by simply comparing the number of times a feature did better with the shadow features using a binomial distribution. Since the whole process is done on the same train-test split, the variance of the variable importance comes only from the different re-fit of the model over the different iterations.
67 |
68 |
69 |
70 |
71 | ## BoostARoota
72 |
73 | BoostARoota follows closely the Boruta method but modifies a few things:
74 |
75 | * One-Hot-Encode the feature set
76 | * Double width of the data set, making a copy of all features in the original dataset
77 | * Randomly shuffle the new features created in (2). These duplicated and shuffled features are referred to as "shadow features"
78 | * Run XGBoost classifier on the entire data set ten times. Running it ten times allows for random noise to be smoothed, resulting in more robust estimates of importance. The number of repeats is a parameter that can be changed.
79 | * Obtain importance values for each feature. This is a simple importance metric that sums up how many times the particular feature was split on in the XGBoost algorithm.
80 | * Compute the "cutoff": the average feature importance value of all shadow features, divided by four. Shadow importance values are divided by four (the divisor is a parameter that can be changed) to make it more difficult for variables to be removed; with lower values, features are removed at too high a rate.
81 | * Remove features with average importance across the ten iterations that is less than the cutoff specified in (6)
82 | * Go back to (2) until the number of features removed is less than ten per cent of the total.
83 | * Method returns the features remaining once completed.
84 |
85 | In spirit, this is the same heuristic as Boruta but using boosting (originally Boruta supported only random forest). The validation of the importance is done by comparing to the maximum of the median variable importance of the shadow predictors (in Boruta, a statistical test is performed using the Z-score). Since the whole process is done on the same train-test split, the variance of the variable importance comes only from the different re-fits of the model over the iterations.
86 |
87 |
88 |
89 | ## Modifications to Boruta and BoostARoota
90 |
91 | I forked both Boruta and BoostARoota and made the following changes (under PR):
92 |
93 | **Boruta --> Leshy**:
94 |
95 | - Categorical features are detected and encoded (tree-based models work better with integer encoding than with OHE, which leads to deep and unstable trees). If CatBoost is used, the categorical predictors (if any) are set up accordingly
96 | - Using lightGBM as the default estimator speeds up the running time by an order of magnitude
97 | - Works with CatBoost, via the sklearn API
98 | - Allows using sample_weight, for applications like Poisson regression or any requiring weights
99 | - Supports 3 different feature importances: native, SHAP and permutation. Native is the least consistent (impurity importance is biased towards numerical and large-cardinality categorical features) but the fastest of the 3 (see the [scikit-learn demo](https://scikit-learn.org/stable/auto_examples/inspection/plot_permutation_importance.html#sphx-glr-auto-examples-inspection-plot-permutation-importance-py))
100 |
101 | **BoostARoota --> BoostAGroota**:
102 |
103 | - Replaces XGBoost with LightGBM; you can still use tree-based scikit-learn models
104 | - Replaces the native variable importance with SHAP importance. Indeed, impurity-based importances are biased and sensitive to large cardinality (see the [scikit-learn demo](https://scikit-learn.org/stable/auto_examples/inspection/plot_permutation_importance.html#sphx-glr-auto-examples-inspection-plot-permutation-importance-py)). Moreover, the native importance is computed on the train set; here the data are split (internally) into train and test sets, and the importance is computed on the test set.
105 | - Handles categorical predictors. Categorical predictors should NOT be one-hot encoded, as it leads to deep, unstable trees. Instead, it's better to use the native method of lightGBM or CatBoost. A preprocessing step is needed to encode them (lightGBM and CatBoost use integer encoding and a reference to the categorical columns; the splitting strategies are then different, see the official docs).
106 | - Works with sample_weight, for Poisson regression or any application requiring weighting.
107 |
108 | ## GrootCV, a new method
109 |
110 | **New: GrootCV**:
111 |
112 | - Cross-validated feature importance to smooth out the noise, based on lightGBM only (which is, most of the time, the fastest and most accurate boosting implementation).
113 | - The feature importance is derived using SHAP importance
114 | - Takes the max of the median of the shadow variable importance over folds; otherwise the threshold is not conservative enough. This also improves convergence (fewer evaluations are needed to find a threshold)
115 | - Not based on a given percentage of columns that need to be deleted
116 | - Plot method for the variable importance
117 |
118 |
119 |
120 | ## References
121 |
122 | **Theory**
123 |
124 | - [Consistent feature selection for pattern recognition in polynomial time](https://www.jmlr.org/papers/volume8/nilsson07a/nilsson07a.pdf)
125 | - [Maximum Relevance and Minimum Redundancy Feature Selection Methods for a Marketing Machine Learning Platform](https://eng.uber.com/research/maximum-relevance-and-minimum-redundancy-feature-selection-methods-for-a-marketing-machine-learning-platform/)
126 |
127 | **Applications**
128 |
129 | - [The Boruta paper](https://www.jstatsoft.org/article/view/v036i11/v36i11.pdf)
130 | - [The python implementation](https://github.com/scikit-learn-contrib/boruta_py)
131 | - [BoostARoota](https://github.com/chasedehan/BoostARoota)
132 |
133 |
134 |
135 |
--------------------------------------------------------------------------------
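The README above describes the selectors as sklearn compatible and recommends lightGBM; the `arfs_on_GPU` notebook later in this dump confirms the `from arfs.feature_selection import GrootCV, Leshy` import. The snippet below is a minimal usage sketch of that workflow: the constructor arguments (`objective`, `n_folds`, `n_iter`, `importance`) and the `get_feature_names_out` call are assumptions based on the README's description of an sklearn-style selector API, not a verified reference; see the notebooks listed in the Example section for the exact signatures.

```python
# Minimal usage sketch (assumed keyword arguments, sklearn-style selector API).
import pandas as pd
from sklearn.datasets import make_regression
from lightgbm import LGBMRegressor

from arfs.feature_selection import GrootCV, Leshy  # import confirmed by the GPU notebook

X, y = make_regression(n_samples=1_000, n_features=20, n_informative=5, random_state=0)
X = pd.DataFrame(X, columns=[f"pred_{i}" for i in range(X.shape[1])])

# GrootCV: lightGBM-only, cross-validated SHAP importance (no estimator to pass).
# The keyword names below are assumptions; check the notebooks for the exact signature.
groot = GrootCV(objective="rmse", n_folds=5, n_iter=5)
groot.fit(X, y)
print(groot.get_feature_names_out())

# Leshy: works with any sklearn-compatible estimator; for imbalanced classification
# the README suggests passing e.g. LGBMClassifier(class_weight="balanced") instead.
leshy = Leshy(LGBMRegressor(), importance="shap")
leshy.fit(X, y)
print(leshy.get_feature_names_out())
```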
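The Boruta, BoostARoota, and GrootCV descriptions above all rest on the same shadow-feature idea: permute copies of the predictors, fit a model on the augmented data, and keep only features that beat the best shadow. The toy sketch below illustrates that principle with a plain random forest and impurity importances; it is not the package's implementation, which uses SHAP or permutation importance, repeated fits, and statistical testing.

```python
# Toy illustration of the shadow-feature principle (not the arfs implementation).
import numpy as np
import pandas as pd
from sklearn.datasets import make_regression
from sklearn.ensemble import RandomForestRegressor

rng = np.random.RandomState(42)
X, y = make_regression(n_samples=500, n_features=10, n_informative=3, random_state=42)
X = pd.DataFrame(X, columns=[f"x{i}" for i in range(X.shape[1])])

# Shadow features: same marginal distributions, but any link to y is destroyed.
shadows = X.apply(lambda col: rng.permutation(col.values))
shadows.columns = [f"shadow_{c}" for c in X.columns]
X_aug = pd.concat([X, shadows], axis=1)

model = RandomForestRegressor(n_estimators=200, random_state=42).fit(X_aug, y)
imp = pd.Series(model.feature_importances_, index=X_aug.columns)

# Keep the real features whose importance beats the best shadow feature
# (the analogue of the "max Z score among shadow attributes" step above).
threshold = imp.filter(like="shadow_").max()
selected = [c for c in X.columns if imp[c] > threshold]
print(selected)
```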
/docs/Introduction.rst:
--------------------------------------------------------------------------------
1 | Introduction
2 | ============
3 |
4 | All relevant feature selection means trying to find all features carrying information usable for prediction,
5 | rather than finding a possibly compact subset of features on which some particular model has a minimal error.
6 | This might include redundant predictors. All relevant feature selection is model agnostic in the sense that it
7 | doesn't optimize a scoring function for a *specific* model but rather tries to select all the predictors which are related to the response.
8 | This package implements 3 different methods (Leshy is an evolution of Boruta, BoostAGroota is an evolution of BoostARoota and GrootCV is a new one).
9 | They are sklearn compatible. See hereunder for details about those methods. You can use any sklearn compatible estimator
10 | with Leshy and BoostAGroota but I recommend lightGBM. It's fast, accurate and has SHAP values builtin.
11 |
12 | It also provides a module for performing preprocessing and basic feature selection
13 | (autobinning, remove columns with too many missing values, zero variance, high-cardinality, highly correlated, etc.).
14 |
15 | Moreover, as an alternative to the all relevant problem, the ARFS package provides a MRmr feature selection which,
16 | theoretically, returns a subset of the predictors selected by an arfs method. ARFS also provides a `LASSO` feature
17 | selection which works especially well for (G)LMs and GAMs. You can combine Lasso with the `TreeDiscretizer` for introducing
18 | non-linearities into linear models and perform feature selection.
19 | Please note that one limitation of the lasso is that it treats the levels of a categorical predictor individually.
20 | However, this issue can be addressed by utilizing the `TreeDiscretizer`, which automatically bins numerical variables and
21 | groups the levels of categorical variables.
22 |
23 |
--------------------------------------------------------------------------------
/docs/Makefile:
--------------------------------------------------------------------------------
1 | # Minimal makefile for Sphinx documentation
2 | #
3 |
4 | # You can set these variables from the command line, and also
5 | # from the environment for the first two.
6 | SPHINXOPTS ?=
7 | SPHINXBUILD ?= sphinx-build
8 | SOURCEDIR = source
9 | BUILDDIR = build
10 |
11 | # Put it first so that "make" without argument is like "make help".
12 | help:
13 | @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
14 |
15 | .PHONY: help Makefile
16 |
17 | # Catch-all target: route all unknown targets to Sphinx using the new
18 | # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS).
19 | %: Makefile
20 | @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
21 |
--------------------------------------------------------------------------------
/docs/Methods overview.rst:
--------------------------------------------------------------------------------
1 | Methods overview
2 | ================
3 |
4 | Boruta
5 | ------
6 |
7 | The Boruta algorithm tries to capture all the important features you might have in your dataset with respect to an outcome variable. The procedure is as follows:
8 |
9 | * Create duplicate copies of all independent variables. When the number of independent variables in the original data is less than 5, create at least 5 copies using existing variables.
10 | * Shuffle the values of added duplicate copies to remove their correlations with the target variable. It is called shadow features or permuted copies.
11 | * Combine the original ones with shuffled copies.
12 | * Run a random forest classifier on the combined dataset and perform a variable importance measure (the default is Mean Decrease Accuracy) to evaluate the importance of each variable where higher means more important.
13 | * Then a Z score is computed: the mean of accuracy loss divided by the standard deviation of accuracy loss.
14 | * Find the maximum Z score among shadow attributes (MZSA).
15 | * Tag the variables as 'unimportant' when they have importance significantly lower than MZSA. Then we permanently remove them from the process.
16 | * Tag the variables as 'important' when they have importance significantly higher than MZSA.
17 | * Repeat the above steps for a predefined number of iterations (random forest runs), or until all attributes are either tagged 'unimportant' or 'important', whichever comes first.
18 |
19 | At every iteration, the algorithm compares the Z-scores of the shuffled copies of the features and the original features to see if the latter performed better than the former. If it does, the algorithm will mark the feature as important. In essence, the algorithm is trying to validate the importance of the feature by comparing with randomly shuffled copies, which increases the robustness. This is done by simply comparing the number of times a feature did better with the shadow features using a binomial distribution. Since the whole process is done on the same train-test split, the variance of the variable importance comes only from the different re-fit of the model over the different iterations.
20 |
21 |
22 | BoostARoota
23 | -----------
24 |
25 | BoostARoota follows closely the Boruta method but modifies a few things:
26 |
27 | * One-Hot-Encode the feature set.
28 | * Double width of the data set, making a copy of all features in the original dataset.
29 | * Randomly shuffle the new features created in (2). These duplicated and shuffled features are referred to as "shadow features."
30 | * Run XGBoost classifier on the entire data set ten times. Running it ten times allows for random noise to be smoothed, resulting in more robust estimates of importance. The number of repeats is a parameter that can be changed.
31 | * Obtain importance values for each feature. This is a simple importance metric that sums up how many times the particular feature was split in the XGBoost algorithm.
32 | * Compute the "cutoff": the average feature importance value of all shadow features, divided by four. Shadow importance values are divided by four (the divisor is a parameter that can be changed) to make it more difficult for the variables to be removed. With lower values, features are removed at too high a rate.
33 | * Remove features with average importance across the ten iterations that are less than the cutoff specified in (6).
34 | * Go back to (2) until the number of features removed is less than ten percent of the total.
35 | * The method returns the features remaining once completed.
36 |
37 | Modifications to Boruta
38 | -----------------------
39 |
40 | Boruta --> Leshy:
41 |
42 | For chronological development, see https://github.com/scikit-learn-contrib/boruta_py/pull/77 and https://github.com/scikit-learn-contrib/boruta_py/pull/100
43 |
44 | Leshy vs. BorutaPy:
45 | To summarize, this PR solves/enhances:
46 | * Categorical features are detected and encoded (tree-based models work better with integer encoding than with OHE, which leads to deep and unstable trees). If CatBoost is used, the categorical predictors (if any) are set up accordingly.
47 | * Work with Catboost sklearn API.
48 | * Allow using sample_weight, for applications like Poisson regression or any requiring weights.
49 | * 3 different feature importances: native, SHAP, and permutation. Native being the least consistent (because of the imp. biased towards numerical and large cardinality categorical) but the fastest of the 3.
50 | * Using LightGBM as the default estimator speeds up the running time by an order of magnitude.
51 | * Visualization like in the R package.
52 |
53 | BorutaPy vs. Boruta R:
54 | The improvements of this implementation include:
55 | * Faster run times: Thanks to scikit-learn's fast implementation of the ensemble methods.
56 | * Scikit-learn like interface: Use BorutaPy just like any other scikit-learn: fit, fit_transform, and transform are all implemented in a similar fashion.
57 | * Modularity: Any ensemble method could be used: random forest, extra trees classifier, even gradient boosted trees.
58 | * Two-step correction: The original Boruta code corrects for multiple testing in an overly conservative way. In this implementation, the Benjamini Hochberg FDR is used to correct in each iteration across active features. This means only those features are included in the correction which are still in the selection process. Following this, each feature that passed goes through a regular Bonferroni correction to check for repeated testing over the iterations.
59 | * Percentile: Instead of using the max values of the shadow features, the user can specify which percentile to use. This gives a finer control over this crucial parameter. For more info, please read about the perc parameter.
60 | * Automatic tree number: Setting n_estimators to 'auto' will calculate the number of trees in each iteration based on the number of features under investigation. This way more trees are used when the training data has many features and fewer when most of the features have been rejected.
61 | * Ranking of features: After fitting BorutaPy, it provides the user with ranking of features. Confirmed ones are 1, Tentatives are 2, and the rejected are ranked starting from 3, based on their feature importance history through the iterations.
62 | * Using either the native variable importance, scikit permutation importance, SHAP importance.
63 |
64 | We highly recommend using pruned trees with a depth between 3-7. For more, see the docs of these functions, and the examples below. Original code and method by: Miron B Kursa, https://m2.icm.edu.pl/boruta/
65 |
66 | GrootCV, a new method
67 | ---------------------
68 |
69 | New: GrootCV:
70 | - Cross-validated feature importance to smooth out the noise, based on lightGBM only (which is, most of the time, the fastest and most accurate boosting implementation).
71 | - The feature importance is derived using SHAP importance.
72 | - Takes the max of the median of the shadow variable importance over folds; otherwise the threshold is not conservative enough. This also improves convergence (fewer evaluations are needed to find a threshold).
73 | - Not based on a given percentage of columns that need to be deleted.
74 | - Plot method for the variable importance.
75 |
76 | MRmr
77 | ----
78 |
79 | Re-implementing the Uber MRmr scheme using associations for handling continuous and categorical predictors.
80 | - Theil's U statistics for the categorical-categorical association (correlation).
81 | - Variance ratio for continuous-categorical association.
82 | - Pearson or Spearman correlation for continuous-continuous association.
83 |
84 | Lasso
85 | -----
86 |
87 | Performing a simple grid search with enforced lasso regularization.
88 | The best model is chosen based on the minimum BIC or deviance score, and all non-zero coefficients are selected.
89 | The loss function can belong to the exponential family, as seen in the statsmodels GLM documentation.
90 | Using the BIC metric is faster since it is evaluated on the training data (it is not meant for held-out data), whereas the deviance is cross-validated.
91 |
92 | This approach can be combined with the TreeDiscretizer transformer to introduce univariate non-linearities (tree-GAM) before feature selection.
93 | This serves as a workaround to compensate for the absence of fused and grouped lasso regularization.
94 |
95 | References
96 | ----------
97 |
98 | **Theory**
99 | - [Consistent feature selection for pattern recognition in polynomial time](http://compmed.se/files/6914/2107/3475/pub_2007_5.pdf)
100 | - [Maximum Relevance and Minimum Redundancy Feature Selection Methods for a Marketing Machine Learning Platform](https://www.uber.com/blog/research/maximum-relevance-and-minimum-redundancy-feature-selection-methods-for-a-marketing-machine-learning-platform)
101 |
102 | **Applications**
103 | - [The Boruta paper](https://www.jstatsoft.org/article/view/v036i11/v36i11.pdf)
104 | - [The python implementation](https://github.com/scikit-learn-contrib/boruta_py)
105 | - [BoostARoota](https://github.com/chasedehan/BoostARoota)
--------------------------------------------------------------------------------
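The MRmr section in the file above describes scoring candidates by relevance to the target while penalizing redundancy with the already-selected set, using different association measures per dtype pair. The sketch below shows the greedy loop on numeric features only, with plain Pearson correlations and a relevance/redundancy ratio; the exact measures and scheme used by `arfs.feature_selection.mrmr` may differ, so treat this as an illustration of the idea, not the package's algorithm.

```python
# Greedy mRMR-style loop on numeric features only (didactic sketch, not arfs.mrmr).
# relevance  = |corr(feature, target)|
# redundancy = mean |corr(feature, already-selected features)|
# score      = relevance / redundancy
import pandas as pd
from sklearn.datasets import make_regression

X, y = make_regression(n_samples=500, n_features=15, n_informative=4, random_state=0)
X = pd.DataFrame(X, columns=[f"x{i}" for i in range(X.shape[1])])
y = pd.Series(y, name="target")

relevance = X.corrwith(y).abs()
selected, candidates = [], list(X.columns)
n_keep = 5

for _ in range(n_keep):
    def score(col):
        if not selected:
            return relevance[col]
        redundancy = X[selected].corrwith(X[col]).abs().mean()
        return relevance[col] / max(redundancy, 1e-6)  # guard against division by zero
    best = max(candidates, key=score)
    selected.append(best)
    candidates.remove(best)

print(selected)
```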
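The Lasso section boils down to: run an L1-regularized grid search, pick the model with the best BIC or cross-validated deviance, and keep every predictor with a non-zero coefficient. The package does this through statsmodels GLMs; the sketch below swaps in scikit-learn's `LassoCV` (Gaussian loss, cross-validated alpha grid) purely to show the selection rule, so it is a stand-in rather than the `arfs.feature_selection.lasso` implementation. Combining it with a discretizer (as the section suggests with the `TreeDiscretizer`) would simply mean transforming `X` before the fit.

```python
# "Keep the non-zero coefficients" after an L1 grid search, shown with LassoCV
# as a stand-in for the statsmodels GLM machinery described above.
import numpy as np
import pandas as pd
from sklearn.datasets import make_regression
from sklearn.linear_model import LassoCV
from sklearn.preprocessing import StandardScaler

X, y = make_regression(n_samples=500, n_features=20, n_informative=5, noise=5.0, random_state=1)
X = pd.DataFrame(X, columns=[f"x{i}" for i in range(X.shape[1])])

# Standardize so the penalty treats all predictors on the same scale.
X_std = StandardScaler().fit_transform(X)
lasso = LassoCV(cv=5, random_state=1).fit(X_std, y)

selected = X.columns[np.abs(lasso.coef_) > 1e-8].tolist()
print(f"alpha={lasso.alpha_:.4f}, selected={selected}")
```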
/docs/arfs.feature_selection.rst:
--------------------------------------------------------------------------------
1 | arfs.feature\_selection package
2 | ===============================
3 |
4 | Submodules
5 | ----------
6 |
7 | arfs.feature\_selection.allrelevant module
8 | ------------------------------------------
9 |
10 | .. automodule:: arfs.feature_selection.allrelevant
11 | :members:
12 | :undoc-members:
13 | :show-inheritance:
14 |
15 | arfs.feature\_selection.base module
16 | -----------------------------------
17 |
18 | .. automodule:: arfs.feature_selection.base
19 | :members:
20 | :undoc-members:
21 | :show-inheritance:
22 |
23 | arfs.feature\_selection.lasso module
24 | ------------------------------------
25 |
26 | .. automodule:: arfs.feature_selection.lasso
27 | :members:
28 | :undoc-members:
29 | :show-inheritance:
30 |
31 | arfs.feature\_selection.mrmr module
32 | -----------------------------------
33 |
34 | .. automodule:: arfs.feature_selection.mrmr
35 | :members:
36 | :undoc-members:
37 | :show-inheritance:
38 |
39 | arfs.feature\_selection.summary module
40 | --------------------------------------
41 |
42 | .. automodule:: arfs.feature_selection.summary
43 | :members:
44 | :undoc-members:
45 | :show-inheritance:
46 |
47 | arfs.feature\_selection.unsupervised module
48 | -------------------------------------------
49 |
50 | .. automodule:: arfs.feature_selection.unsupervised
51 | :members:
52 | :undoc-members:
53 | :show-inheritance:
54 |
55 | arfs.feature\_selection.variable\_importance module
56 | ---------------------------------------------------
57 |
58 | .. automodule:: arfs.feature_selection.variable_importance
59 | :members:
60 | :undoc-members:
61 | :show-inheritance:
62 |
63 | Module contents
64 | ---------------
65 |
66 | .. automodule:: arfs.feature_selection
67 | :members:
68 | :undoc-members:
69 | :show-inheritance:
70 |
--------------------------------------------------------------------------------
/docs/arfs.rst:
--------------------------------------------------------------------------------
1 | arfs package
2 | ============
3 |
4 | Subpackages
5 | -----------
6 |
7 | .. toctree::
8 | :maxdepth: 4
9 |
10 | arfs.feature_selection
11 |
12 | Submodules
13 | ----------
14 |
15 | arfs.association module
16 | -----------------------
17 |
18 | .. automodule:: arfs.association
19 | :members:
20 | :undoc-members:
21 | :show-inheritance:
22 |
23 | arfs.benchmark module
24 | ---------------------
25 |
26 | .. automodule:: arfs.benchmark
27 | :members:
28 | :undoc-members:
29 | :show-inheritance:
30 |
31 | arfs.gbm module
32 | ---------------
33 |
34 | .. automodule:: arfs.gbm
35 | :members:
36 | :undoc-members:
37 | :show-inheritance:
38 |
39 | arfs.parallel module
40 | --------------------
41 |
42 | .. automodule:: arfs.parallel
43 | :members:
44 | :undoc-members:
45 | :show-inheritance:
46 |
47 | arfs.preprocessing module
48 | -------------------------
49 |
50 | .. automodule:: arfs.preprocessing
51 | :members:
52 | :undoc-members:
53 | :show-inheritance:
54 |
55 | arfs.sampling module
56 | --------------------
57 |
58 | .. automodule:: arfs.sampling
59 | :members:
60 | :undoc-members:
61 | :show-inheritance:
62 |
63 | arfs.utils module
64 | -----------------
65 |
66 | .. automodule:: arfs.utils
67 | :members:
68 | :undoc-members:
69 | :show-inheritance:
70 |
71 | Module contents
72 | ---------------
73 |
74 | .. automodule:: arfs
75 | :members:
76 | :undoc-members:
77 | :show-inheritance:
78 |
--------------------------------------------------------------------------------
/docs/boostaroota.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ThomasBury/arfs/03f67d0a54b69fac5ddbb83e306c8e8e72e2d3a2/docs/boostaroota.png
--------------------------------------------------------------------------------
/docs/boruta.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ThomasBury/arfs/03f67d0a54b69fac5ddbb83e306c8e8e72e2d3a2/docs/boruta.png
--------------------------------------------------------------------------------
/docs/conf.py:
--------------------------------------------------------------------------------
1 | # Configuration file for the Sphinx documentation builder.
2 | #
3 | # This file only contains a selection of the most common options. For a full
4 | # list see the documentation:
5 | # https://www.sphinx-doc.org/en/master/usage/configuration.html
6 |
7 | # -- Path setup --------------------------------------------------------------
8 |
9 | # If extensions (or modules to document with autodoc) are in another directory,
10 | # add these directories to sys.path here. If the directory is relative to the
11 | # documentation root, use os.path.abspath to make it absolute, like shown here.
12 | #
13 | # import os
14 | # import sys
15 | # sys.path.insert(0, os.path.abspath('.'))
16 | import sys
17 | import os
18 | import datetime
19 |
20 | sys.path.insert(0, os.path.abspath("../../arfs"))
21 | # -- Project information -----------------------------------------------------
22 |
23 | project = "arfs"
24 | copyright = "2024, Thomas Bury"
25 | author = "Thomas Bury"
26 |
27 | # The full version, including alpha/beta/rc tags
28 | release = "3.0.0"
29 |
30 | # If extensions (or modules to document with autodoc) are in another
31 | # directory, add these directories to sys.path here. If the directory is
32 | # relative to the documentation root, use os.path.abspath to make it
33 | # absolute, like shown here.
34 | # sys.path.append(os.path.join(os.path.abspath(os.pardir)))
35 |
36 | # Don't add the same path again, remove the following line:
37 | # sys.path.insert(0, os.path.abspath(".."))
38 |
39 | sys.path.append(os.path.abspath(os.path.join(__file__, "../../src")))
40 | autodoc_mock_imports = ["_tkinter", "sphinx_tabs.tabs"]
41 |
42 | # Get the project root dir, which is the parent dir of this
43 | cwd = os.getcwd()
44 | project_root = os.path.dirname(cwd)
45 |
46 |
47 | # -- General configuration ---------------------------------------------------
48 |
49 | # Add any Sphinx extension module names here, as strings. They can be
50 | # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom
51 | # ones.
52 | extensions = [
53 | "sphinx.ext.autodoc",
54 | "sphinx.ext.autosectionlabel",
55 | "sphinx.ext.napoleon",
56 | "sphinx.ext.viewcode",
57 | "sphinx_autodoc_typehints",
58 | "sphinx_copybutton",
59 | "nbsphinx",
60 | "sphinx_tabs.tabs",
61 | ]
62 |
63 | # Add any paths that contain templates here, relative to this directory.
64 | templates_path = ["_templates"]
65 | autosummary_generate = True
66 |
67 | # List of patterns, relative to source directory, that match files and
68 | # directories to ignore when looking for source files.
69 | # This pattern also affects html_static_path and html_extra_path.
70 | exclude_patterns = ["_build", "Thumbs.db", ".DS_Store"]
71 |
72 |
73 | # -- Options for HTML output -------------------------------------------------
74 |
75 | # The theme to use for HTML and HTML Help pages. See the documentation for
76 | # a list of builtin themes.
77 | #
78 | # html_theme = "sphinx_rtd_theme"
79 | html_permalinks_icon = "#"
80 | html_theme = "sphinxawesome_theme"
81 |
82 | # If not None, a 'Last updated on:' timestamp is inserted at every page
83 | # bottom, using the given strftime format.
84 | # The empty string is equivalent to '%b %d, %Y'.
85 | html_last_updated_fmt = "%B %d, %Y at %H:%M"
86 | today_fmt = "%B %d, %Y at %H:%M"
87 |
88 | # Add any paths that contain custom static files (such as style sheets) here,
89 | # relative to this directory. They are copied after the builtin static files,
90 | # so a file named "default.css" will overwrite the builtin "default.css".
91 | # html_static_path = ["_static"]
92 | html_title = "ARFS Documentation"
93 | html_show_sourcelink = True
94 | html_logo = "logo.png"
95 |
96 | # -- Napoleon settings (for numpydoc parsing) --------------------------------
97 | # https://www.sphinx-doc.org/en/master/usage/extensions/napoleon.html#configuration
98 | napoleon_google_docstring = False
99 | napoleon_numpy_docstring = True
100 | napoleon_include_init_with_doc = True
101 | napoleon_include_private_with_doc = True
102 | napoleon_include_special_with_doc = True
103 | napoleon_use_admonition_for_examples = False
104 | napoleon_use_admonition_for_notes = False
105 | napoleon_use_admonition_for_references = False
106 | napoleon_use_ivar = True
107 | napoleon_use_param = True
108 | napoleon_use_rtype = False
109 | napoleon_preprocess_types = True
110 | napoleon_type_aliases = None
111 | napoleon_attr_annotations = True
112 |
--------------------------------------------------------------------------------
/docs/grootcv.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ThomasBury/arfs/03f67d0a54b69fac5ddbb83e306c8e8e72e2d3a2/docs/grootcv.png
--------------------------------------------------------------------------------
/docs/index.rst:
--------------------------------------------------------------------------------
1 | Welcome to arfs's documentation!
2 | ================================
3 |
4 | A package for performing All Relevant Feature Selection but not only that.
5 |
6 | Documentation last change: |today|
7 |
8 | .. toctree::
9 | :maxdepth: 2
10 | :caption: User's guide
11 |
12 | Introduction
13 | Methods overview
14 | modules
15 |
16 |
17 | .. toctree::
18 | :maxdepth: 4
19 | :glob:
20 | :caption: Tutorials
21 |
22 | notebooks/preprocessing.ipynb
23 | notebooks/basic_feature_selection.ipynb
24 | notebooks/association_and_feature_selection.ipynb
25 | notebooks/arfs_classification.ipynb
26 | notebooks/arfs_regression.ipynb
27 | notebooks/arfs_timeseries.ipynb
28 | notebooks/arfs_large_data_sampling.ipynb
29 | notebooks/arfs_on_GPU.ipynb
30 | notebooks/arfs_shap_vs_fastshap.ipynb
31 | notebooks/arfs_grootcv_custom_params.ipynb
32 | notebooks/arfs_boruta_borutaShap_comparison.ipynb
33 | notebooks/arfs_non_normal_loss_and_sample_weight.ipynb
34 | notebooks/mrmr_feature_selection.ipynb
35 | notebooks/mrmr_fs_VS_arfs.ipynb
36 | notebooks/lasso_feature_selection.ipynb
37 | notebooks/issue_categoricals.ipynb
38 | notebooks/issue_collinearity.ipynb
39 |
40 | Indices and tables
41 | ==================
42 |
43 | * :ref:`genindex`
44 | * :ref:`modindex`
45 | * :ref:`search`
--------------------------------------------------------------------------------
/docs/logo.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ThomasBury/arfs/03f67d0a54b69fac5ddbb83e306c8e8e72e2d3a2/docs/logo.png
--------------------------------------------------------------------------------
/docs/make.bat:
--------------------------------------------------------------------------------
1 | @ECHO OFF
2 |
3 | pushd %~dp0
4 |
5 | REM Command file for Sphinx documentation
6 |
7 | if "%SPHINXBUILD%" == "" (
8 | set SPHINXBUILD=sphinx-build
9 | )
10 | set SOURCEDIR=source
11 | set BUILDDIR=build
12 |
13 | %SPHINXBUILD% >NUL 2>NUL
14 | if errorlevel 9009 (
15 | echo.
16 | echo.The 'sphinx-build' command was not found. Make sure you have Sphinx
17 | echo.installed, then set the SPHINXBUILD environment variable to point
18 | echo.to the full path of the 'sphinx-build' executable. Alternatively you
19 | echo.may add the Sphinx directory to PATH.
20 | echo.
21 | echo.If you don't have Sphinx installed, grab it from
22 | echo.https://www.sphinx-doc.org/
23 | exit /b 1
24 | )
25 |
26 | if "%1" == "" goto help
27 |
28 | %SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O%
29 | goto end
30 |
31 | :help
32 | %SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O%
33 |
34 | :end
35 | popd
36 |
--------------------------------------------------------------------------------
/docs/modules.rst:
--------------------------------------------------------------------------------
1 | src
2 | ===
3 |
4 | .. toctree::
5 | :maxdepth: 4
6 |
7 | arfs
8 |
--------------------------------------------------------------------------------
/docs/notebooks/arfs_on_GPU.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "# ARFS - Using GPU\n",
8 | "\n",
9 | "You can leverage the GPU implementation of lightGBM (or other GBM flavours) but this often requires to compile or install some libraries or kit (such as CUDA)"
10 | ]
11 | },
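{
"cell_type": "markdown",
"metadata": {},
"source": [
"A quick, optional sanity check: the sketch below is one way to verify that the installed LightGBM build actually supports GPU training before running the timed cells. It is only illustrative (the helper name `gpu_build_available` is not part of ARFS; only `numpy` and `lightgbm` are assumed).\n",
"\n",
"```python\n",
"import numpy as np\n",
"import lightgbm as lgb\n",
"\n",
"\n",
"def gpu_build_available() -> bool:\n",
"    \"\"\"Return True if the installed LightGBM build can train with device='gpu'.\"\"\"\n",
"    X_tiny = np.random.rand(200, 5)\n",
"    y_tiny = np.random.rand(200)\n",
"    try:\n",
"        lgb.LGBMRegressor(n_estimators=1, device=\"gpu\", verbose=-1).fit(X_tiny, y_tiny)\n",
"        return True\n",
"    except Exception:\n",
"        # a CPU-only build typically raises lightgbm.basic.LightGBMError here\n",
"        return False\n",
"\n",
"\n",
"device = \"gpu\" if gpu_build_available() else \"cpu\"\n",
"print(f\"Using device: {device}\")\n",
"```"
]
},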
12 | {
13 | "cell_type": "code",
14 | "execution_count": 1,
15 | "metadata": {},
16 | "outputs": [],
17 | "source": [
18 | "# from IPython.core.display import display, HTML\n",
19 | "# display(HTML(\"\"))\n",
20 | "import time\n",
21 | "import numpy as np\n",
22 | "import pandas as pd\n",
23 | "import matplotlib as mpl\n",
24 | "import matplotlib.pyplot as plt\n",
25 | "from lightgbm import LGBMRegressor\n",
26 | "\n",
27 | "import arfs\n",
28 | "from arfs.feature_selection import GrootCV, Leshy\n",
29 | "from arfs.utils import load_data\n",
30 | "from arfs.benchmark import highlight_tick\n",
31 | "\n",
32 | "rng = np.random.RandomState(seed=42)\n",
33 | "\n",
34 | "# import warnings\n",
35 | "# warnings.filterwarnings('ignore')"
36 | ]
37 | },
38 | {
39 | "cell_type": "markdown",
40 | "metadata": {},
41 | "source": [
42 | "## GrootCV on GPU\n",
43 | "\n",
44 | "If the data is small, using a GPU mught not be the most efficient."
45 | ]
46 | },
47 | {
48 | "cell_type": "code",
49 | "execution_count": null,
50 | "metadata": {},
51 | "outputs": [],
52 | "source": [
53 | "from sklearn.datasets import make_regression\n",
54 | "from sklearn.model_selection import train_test_split\n",
55 | "\n",
56 | "# Generate synthetic data with Poisson-distributed target variable\n",
57 | "bias = 1\n",
58 | "\n",
59 | "n_samples = 100_00 # 1_000_000\n",
60 | "n_features = 100\n",
61 | "n_informative = 20\n",
62 | "\n",
63 | "X, y, true_coef = make_regression(\n",
64 | " n_samples=n_samples,\n",
65 | " n_features=n_features,\n",
66 | " n_informative=n_informative,\n",
67 | " noise=1,\n",
68 | " random_state=8,\n",
69 | " bias=bias,\n",
70 | " coef=True,\n",
71 | ")\n",
72 | "y = (y - y.mean()) / y.std()\n",
73 | "y = np.exp(y) # Transform to positive values for Poisson distribution\n",
74 | "y = np.random.poisson(y) # Add Poisson noise to the target variable\n",
75 | "# dummy sample weight (e.g. exposure), smallest being 30 days\n",
76 | "w = np.random.uniform(30 / 365, 1, size=len(y))\n",
77 | "# make the count a Poisson rate (frequency)\n",
78 | "y = y / w\n",
79 | "\n",
80 | "X = pd.DataFrame(X)\n",
81 | "X.columns = [f\"pred_{i}\" for i in range(X.shape[1])]\n",
82 | "\n",
83 | "# Split the data into training and testing sets\n",
84 | "X_train, X_test, y_train, y_test, w_train, w_test = train_test_split(\n",
85 | " X, y, w, test_size=0.5, random_state=42\n",
86 | ")\n",
87 | "\n",
88 | "true_coef = pd.Series(true_coef)\n",
89 | "true_coef.index = X.columns\n",
90 | "true_coef = pd.Series({**{\"intercept\": bias}, **true_coef})\n",
91 | "true_coef\n",
92 | "\n",
93 | "genuine_predictors = true_coef[true_coef > 0.0]\n",
94 | "\n",
95 | "print(f\"The true coefficient of the linear data generating process are:\\n {true_coef}\")"
96 | ]
97 | },
98 | {
99 | "cell_type": "markdown",
100 | "metadata": {},
101 | "source": [
102 | "GPU"
103 | ]
104 | },
105 | {
106 | "cell_type": "code",
107 | "execution_count": null,
108 | "metadata": {},
109 | "outputs": [],
110 | "source": [
111 | "%%time\n",
112 | "feat_selector = GrootCV(\n",
113 | " objective=\"rmse\",\n",
114 | " cutoff=1,\n",
115 | " n_folds=3,\n",
116 | " n_iter=3,\n",
117 | " silent=True,\n",
118 | " fastshap=True,\n",
119 | " n_jobs=0,\n",
120 | " lgbm_params={\"device\": \"gpu\", \"gpu_device_id\": 1},\n",
121 | ")\n",
122 | "feat_selector.fit(X_train, y_train, sample_weight=None)\n",
123 | "print(f\"The selected features: {feat_selector.get_feature_names_out()}\")\n",
124 | "print(f\"The agnostic ranking: {feat_selector.ranking_}\")\n",
125 | "print(f\"The naive ranking: {feat_selector.ranking_absolutes_}\")\n",
126 | "fig = feat_selector.plot_importance(n_feat_per_inch=5)\n",
127 | "\n",
128 | "# highlight synthetic random variable\n",
129 | "for name in true_coef.index:\n",
130 | " if name in genuine_predictors.index:\n",
131 | " fig = highlight_tick(figure=fig, str_match=name, color=\"green\")\n",
132 | " else:\n",
133 | " fig = highlight_tick(figure=fig, str_match=name)\n",
134 | "\n",
135 | "plt.show()"
136 | ]
137 | },
138 | {
139 | "cell_type": "markdown",
140 | "metadata": {},
141 | "source": [
142 | "CPU"
143 | ]
144 | },
145 | {
146 | "cell_type": "code",
147 | "execution_count": null,
148 | "metadata": {},
149 | "outputs": [],
150 | "source": [
151 | "%%time\n",
152 | "feat_selector = GrootCV(\n",
153 | " objective=\"rmse\",\n",
154 | " cutoff=1,\n",
155 | " n_folds=3,\n",
156 | " n_iter=3,\n",
157 | " silent=True,\n",
158 | " fastshap=True,\n",
159 | " n_jobs=0,\n",
160 | " lgbm_params={\"device\": \"cpu\"},\n",
161 | ")\n",
162 | "feat_selector.fit(X_train, y_train, sample_weight=None)\n",
163 | "print(f\"The selected features: {feat_selector.get_feature_names_out()}\")\n",
164 | "print(f\"The agnostic ranking: {feat_selector.ranking_}\")\n",
165 | "print(f\"The naive ranking: {feat_selector.ranking_absolutes_}\")\n",
166 | "fig = feat_selector.plot_importance(n_feat_per_inch=5)\n",
167 | "\n",
168 | "# highlight synthetic random variable\n",
169 | "for name in true_coef.index:\n",
170 | " if name in genuine_predictors.index:\n",
171 | " fig = highlight_tick(figure=fig, str_match=name, color=\"green\")\n",
172 | " else:\n",
173 | " fig = highlight_tick(figure=fig, str_match=name)\n",
174 | "\n",
175 | "plt.show()"
176 | ]
177 | },
178 | {
179 | "cell_type": "markdown",
180 | "metadata": {},
181 | "source": [
182 | "On a smaller data set, for illustrative purposes."
183 | ]
184 | },
185 | {
186 | "cell_type": "code",
187 | "execution_count": 5,
188 | "metadata": {},
189 | "outputs": [],
190 | "source": [
191 | "boston = load_data(name=\"Boston\")\n",
192 | "X, y = boston.data, boston.target"
193 | ]
194 | },
195 | {
196 | "cell_type": "code",
197 | "execution_count": null,
198 | "metadata": {},
199 | "outputs": [],
200 | "source": [
201 | "%%time\n",
202 | "feat_selector = GrootCV(\n",
203 | " objective=\"rmse\",\n",
204 | " cutoff=1,\n",
205 | " n_folds=5,\n",
206 | " n_iter=5,\n",
207 | " silent=True,\n",
208 | " fastshap=True,\n",
209 | " n_jobs=0,\n",
210 | " lgbm_params={\"device\": \"cpu\"},\n",
211 | ")\n",
212 | "feat_selector.fit(X, y, sample_weight=None)\n",
213 | "print(f\"The selected features: {feat_selector.get_feature_names_out()}\")\n",
214 | "print(f\"The agnostic ranking: {feat_selector.ranking_}\")\n",
215 | "print(f\"The naive ranking: {feat_selector.ranking_absolutes_}\")\n",
216 | "fig = feat_selector.plot_importance(n_feat_per_inch=5)\n",
217 | "\n",
218 | "# highlight synthetic random variable\n",
219 | "fig = highlight_tick(figure=fig, str_match=\"random\")\n",
220 | "fig = highlight_tick(figure=fig, str_match=\"genuine\", color=\"green\")\n",
221 | "plt.show()"
222 | ]
223 | },
224 | {
225 | "cell_type": "code",
226 | "execution_count": null,
227 | "metadata": {},
228 | "outputs": [],
229 | "source": [
230 | "%%time\n",
231 | "feat_selector = GrootCV(\n",
232 | " objective=\"rmse\",\n",
233 | " cutoff=1,\n",
234 | " n_folds=5,\n",
235 | " n_iter=5,\n",
236 | " silent=True,\n",
237 | " fastshap=True,\n",
238 | " n_jobs=0,\n",
239 | " lgbm_params={\"device\": \"gpu\"},\n",
240 | ")\n",
241 | "feat_selector.fit(X, y, sample_weight=None)\n",
242 | "print(f\"The selected features: {feat_selector.get_feature_names_out()}\")\n",
243 | "print(f\"The agnostic ranking: {feat_selector.ranking_}\")\n",
244 | "print(f\"The naive ranking: {feat_selector.ranking_absolutes_}\")\n",
245 | "fig = feat_selector.plot_importance(n_feat_per_inch=5)\n",
246 | "\n",
247 | "# highlight synthetic random variable\n",
248 | "fig = highlight_tick(figure=fig, str_match=\"random\")\n",
249 | "fig = highlight_tick(figure=fig, str_match=\"genuine\", color=\"green\")\n",
250 | "plt.show()"
251 | ]
252 | },
253 | {
254 | "cell_type": "code",
255 | "execution_count": null,
256 | "metadata": {},
257 | "outputs": [],
258 | "source": [
259 | "%%time\n",
260 | "feat_selector = GrootCV(\n",
261 | " objective=\"rmse\",\n",
262 | " cutoff=1,\n",
263 | " n_folds=5,\n",
264 | " n_iter=5,\n",
265 | " silent=True,\n",
266 | " fastshap=True,\n",
267 | " n_jobs=0,\n",
268 | " lgbm_params={\"device\": \"cuda\"},\n",
269 | ")\n",
270 | "feat_selector.fit(X, y, sample_weight=None)\n",
271 | "print(f\"The selected features: {feat_selector.get_feature_names_out()}\")\n",
272 | "print(f\"The agnostic ranking: {feat_selector.ranking_}\")\n",
273 | "print(f\"The naive ranking: {feat_selector.ranking_absolutes_}\")\n",
274 | "fig = feat_selector.plot_importance(n_feat_per_inch=5)\n",
275 | "\n",
276 | "# highlight synthetic random variable\n",
277 | "fig = highlight_tick(figure=fig, str_match=\"random\")\n",
278 | "fig = highlight_tick(figure=fig, str_match=\"genuine\", color=\"green\")\n",
279 | "plt.show()"
280 | ]
281 | },
282 | {
283 | "cell_type": "markdown",
284 | "metadata": {},
285 | "source": [
286 | "## Leshy on GPU"
287 | ]
288 | },
289 | {
290 | "cell_type": "code",
291 | "execution_count": 9,
292 | "metadata": {},
293 | "outputs": [],
294 | "source": [
295 | "model = LGBMRegressor(random_state=42, verbose=-1, device=\"gpu\")"
296 | ]
297 | },
298 | {
299 | "cell_type": "code",
300 | "execution_count": null,
301 | "metadata": {},
302 | "outputs": [],
303 | "source": [
304 | "%%time\n",
305 | "# Leshy\n",
306 | "feat_selector = Leshy(\n",
307 | " model, n_estimators=20, verbose=1, max_iter=10, random_state=42, importance=\"native\"\n",
308 | ")\n",
309 | "feat_selector.fit(X, y, sample_weight=None)\n",
310 | "print(f\"The selected features: {feat_selector.get_feature_names_out()}\")\n",
311 | "print(f\"The agnostic ranking: {feat_selector.ranking_}\")\n",
312 | "print(f\"The naive ranking: {feat_selector.ranking_absolutes_}\")\n",
313 | "fig = feat_selector.plot_importance(n_feat_per_inch=5)\n",
314 | "\n",
315 | "# highlight synthetic random variable\n",
316 | "fig = highlight_tick(figure=fig, str_match=\"random\")\n",
317 | "fig = highlight_tick(figure=fig, str_match=\"genuine\", color=\"green\")\n",
318 | "plt.show()"
319 | ]
320 | },
321 | {
322 | "cell_type": "code",
323 | "execution_count": 11,
324 | "metadata": {},
325 | "outputs": [],
326 | "source": [
327 | "model = LGBMRegressor(random_state=42, verbose=-1, device=\"cpu\")"
328 | ]
329 | },
330 | {
331 | "cell_type": "code",
332 | "execution_count": null,
333 | "metadata": {},
334 | "outputs": [],
335 | "source": [
336 | "%%time\n",
337 | "# Leshy\n",
338 | "feat_selector = Leshy(\n",
339 | " model, n_estimators=20, verbose=1, max_iter=10, random_state=42, importance=\"native\"\n",
340 | ")\n",
341 | "feat_selector.fit(X, y, sample_weight=None)\n",
342 | "print(f\"The selected features: {feat_selector.get_feature_names_out()}\")\n",
343 | "print(f\"The agnostic ranking: {feat_selector.ranking_}\")\n",
344 | "print(f\"The naive ranking: {feat_selector.ranking_absolutes_}\")\n",
345 | "fig = feat_selector.plot_importance(n_feat_per_inch=5)\n",
346 | "\n",
347 | "# highlight synthetic random variable\n",
348 | "fig = highlight_tick(figure=fig, str_match=\"random\")\n",
349 | "fig = highlight_tick(figure=fig, str_match=\"genuine\", color=\"green\")\n",
350 | "plt.show()"
351 | ]
352 | }
353 | ],
354 | "metadata": {
355 | "kernelspec": {
356 | "display_name": "arfs",
357 | "language": "python",
358 | "name": "python3"
359 | },
360 | "language_info": {
361 | "codemirror_mode": {
362 | "name": "ipython",
363 | "version": 3
364 | },
365 | "file_extension": ".py",
366 | "mimetype": "text/x-python",
367 | "name": "python",
368 | "nbconvert_exporter": "python",
369 | "pygments_lexer": "ipython3",
370 | "version": "3.10.14"
371 | },
372 | "orig_nbformat": 4
373 | },
374 | "nbformat": 4,
375 | "nbformat_minor": 2
376 | }
377 |
--------------------------------------------------------------------------------
/docs/notebooks/arfs_shap_vs_fastshap.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "# ARFS - fasttreeshap vs shap\n",
8 | "\n",
9 | "Leshy, BoostAGroota, and GrootCV are tree-based algorithms. They benefit from a [faster implementation of the Shapley values by LinkedIn](https://engineering.linkedin.com/blog/2022/fasttreeshap--accelerating-shap-value-computation-for-trees), which is claimed to outperform both the treeExplainer in the SHAP package and the native C++ implementation of lightgbm/xgboost/catboost. The improvement in speed will vary depending on the size of the task and your hardware resources (including virtualization for VMs). On older machine, the `fasttreeshap` implementation might actually be slower.\n",
10 | "\n",
11 | "However, it currently does not work with xgboost (not a deal breaker because lightgbm is the preferred default)."
12 | ]
13 | },
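{
"cell_type": "markdown",
"metadata": {},
"source": [
"As a standalone illustration (outside of ARFS), the two explainers can be timed directly on a fitted lightgbm model. This is only a sketch: it assumes the `fasttreeshap` package is installed, and the `algorithm=\"auto\"` / `n_jobs=-1` arguments follow its README and may differ across versions.\n",
"\n",
"```python\n",
"import time\n",
"\n",
"import numpy as np\n",
"import pandas as pd\n",
"import shap\n",
"import fasttreeshap  # assumed installed: pip install fasttreeshap\n",
"from lightgbm import LGBMRegressor\n",
"from sklearn.datasets import make_regression\n",
"\n",
"# small synthetic regression problem and a fitted GBM\n",
"X, y = make_regression(n_samples=20_000, n_features=50, n_informative=10, random_state=0)\n",
"X = pd.DataFrame(X, columns=[f\"pred_{i}\" for i in range(X.shape[1])])\n",
"model = LGBMRegressor(n_estimators=200, random_state=42, verbose=-1).fit(X, y)\n",
"\n",
"t0 = time.perf_counter()\n",
"sv_shap = shap.TreeExplainer(model).shap_values(X)\n",
"t1 = time.perf_counter()\n",
"# fasttreeshap mirrors the shap TreeExplainer API\n",
"sv_fast = fasttreeshap.TreeExplainer(model, algorithm=\"auto\", n_jobs=-1).shap_values(X)\n",
"t2 = time.perf_counter()\n",
"\n",
"print(f\"shap         : {t1 - t0:.1f} s\")\n",
"print(f\"fasttreeshap : {t2 - t1:.1f} s\")\n",
"print(\"max |difference| between the two sets of SHAP values:\", np.abs(np.asarray(sv_shap) - np.asarray(sv_fast)).max())\n",
"```"
]
},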
14 | {
15 | "cell_type": "code",
16 | "execution_count": 1,
17 | "metadata": {},
18 | "outputs": [
19 | {
20 | "name": "stderr",
21 | "output_type": "stream",
22 | "text": [
23 | "Using `tqdm.autonotebook.tqdm` in notebook mode. Use `tqdm.tqdm` instead to force console mode (e.g. in jupyter console)\n"
24 | ]
25 | }
26 | ],
27 | "source": [
28 | "import numpy as np\n",
29 | "import pandas as pd\n",
30 | "\n",
31 | "from sklearn.datasets import make_regression\n",
32 | "from sklearn.model_selection import train_test_split\n",
33 | "\n",
34 | "import arfs\n",
35 | "from arfs.feature_selection import GrootCV, Leshy\n",
36 | "from arfs.utils import load_data\n",
37 | "from arfs.benchmark import highlight_tick\n",
38 | "\n",
39 | "rng = np.random.RandomState(seed=42)"
40 | ]
41 | },
42 | {
43 | "cell_type": "code",
44 | "execution_count": 2,
45 | "metadata": {},
46 | "outputs": [
47 | {
48 | "name": "stdout",
49 | "output_type": "stream",
50 | "text": [
51 | "The true coefficient of the linear data generating process are:\n",
52 | " intercept 1.000000\n",
53 | "pred_0 0.000000\n",
54 | "pred_1 0.000000\n",
55 | "pred_2 0.000000\n",
56 | "pred_3 0.000000\n",
57 | " ... \n",
58 | "pred_95 0.000000\n",
59 | "pred_96 10.576299\n",
60 | "pred_97 0.000000\n",
61 | "pred_98 0.000000\n",
62 | "pred_99 62.472033\n",
63 | "Length: 101, dtype: float64\n"
64 | ]
65 | }
66 | ],
67 | "source": [
68 | "# Generate synthetic data with Poisson-distributed target variable\n",
69 | "bias = 1\n",
70 | "\n",
71 | "n_samples = 100_000\n",
72 | "n_features = 100\n",
73 | "n_informative = 20\n",
74 | "\n",
75 | "X, y, true_coef = make_regression(\n",
76 | " n_samples=n_samples,\n",
77 | " n_features=n_features,\n",
78 | " n_informative=n_informative,\n",
79 | " noise=1,\n",
80 | " random_state=8,\n",
81 | " bias=bias,\n",
82 | " coef=True,\n",
83 | ")\n",
84 | "y = (y - y.mean()) / y.std()\n",
85 | "y = np.exp(y) # Transform to positive values for Poisson distribution\n",
86 | "y = np.random.poisson(y) # Add Poisson noise to the target variable\n",
87 | "# dummy sample weight (e.g. exposure), smallest being 30 days\n",
88 | "w = np.random.uniform(30 / 365, 1, size=len(y))\n",
89 | "# make the count a Poisson rate (frequency)\n",
90 | "y = y / w\n",
91 | "\n",
92 | "X = pd.DataFrame(X)\n",
93 | "X.columns = [f\"pred_{i}\" for i in range(X.shape[1])]\n",
94 | "\n",
95 | "# Split the data into training and testing sets\n",
96 | "X_train, X_test, y_train, y_test, w_train, w_test = train_test_split(\n",
97 | " X, y, w, test_size=0.5, random_state=42\n",
98 | ")\n",
99 | "\n",
100 | "true_coef = pd.Series(true_coef)\n",
101 | "true_coef.index = X.columns\n",
102 | "true_coef = pd.Series({**{\"intercept\": bias}, **true_coef})\n",
103 | "true_coef\n",
104 | "\n",
105 | "genuine_predictors = true_coef[true_coef > 0.0]\n",
106 | "\n",
107 | "print(f\"The true coefficient of the linear data generating process are:\\n {true_coef}\")"
108 | ]
109 | },
110 | {
111 | "cell_type": "markdown",
112 | "metadata": {},
113 | "source": [
114 | "## GrootCV - fastshap vs shap \n",
115 | "\n",
116 | "### Fastshap enable"
117 | ]
118 | },
119 | {
120 | "cell_type": "code",
121 | "execution_count": 3,
122 | "metadata": {},
123 | "outputs": [
124 | {
125 | "data": {
126 | "application/vnd.jupyter.widget-view+json": {
127 | "model_id": "b4a9fbb99730414786a1cc452df59ca8",
128 | "version_major": 2,
129 | "version_minor": 0
130 | },
131 | "text/plain": [
132 | "Repeated k-fold: 0%| | 0/9 [00:00, ?it/s]"
133 | ]
134 | },
135 | "metadata": {},
136 | "output_type": "display_data"
137 | },
138 | {
139 | "name": "stdout",
140 | "output_type": "stream",
141 | "text": [
142 | "CPU times: user 10min 34s, sys: 4.55 s, total: 10min 39s\n",
143 | "Wall time: 3min 11s\n"
144 | ]
145 | },
146 | {
147 | "data": {
148 | "text/html": [
149 | "
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook. On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook. On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.