├── .gitignore
├── LICENSE
├── README.md
├── imgs
│   └── shap-hypetune-diagram.png
├── notebooks
│   ├── LGBM_usage.ipynb
│   └── XGBoost_usage.ipynb
├── requirements.txt
├── setup.py
└── shaphypetune
    ├── __init__.py
    ├── _classes.py
    ├── shaphypetune.py
    └── utils.py

/.gitignore:
--------------------------------------------------------------------------------
.DS_Store

# Created by https://www.gitignore.io/api/python

### Python ###
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class

# C extensions
*.so

# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
*.egg-info/
.installed.cfg
*.egg

# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
.hypothesis/

# Translations
*.mo
*.pot

# Django stuff:
*.log
local_settings.py

# Flask stuff:
instance/
.webassets-cache

# Scrapy stuff:
.scrapy

# Sphinx documentation
docs/_build/

# PyBuilder
target/

# Jupyter Notebook
.ipynb_checkpoints

# pyenv
.python-version

# celery beat schedule file
celerybeat-schedule

# SageMath parsed files
*.sage.py

# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/

# Spyder project settings
.spyderproject
.spyproject

# Rope project settings
.ropeproject

# mkdocs documentation
/site

# mypy
.mypy_cache/

# End of https://www.gitignore.io/api/python

--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
MIT License

Copyright (c) 2021 Marco Cerliani

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.

--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# shap-hypetune
A Python package for simultaneous hyperparameter tuning and feature selection for gradient boosting models.

![shap-hypetune diagram](https://raw.githubusercontent.com/cerlymarco/shap-hypetune/master/imgs/shap-hypetune-diagram.png#center)

## Overview
Hyperparameter tuning and feature selection are two common steps in every machine learning pipeline. Most of the time they are performed separately and independently, which can lead to suboptimal performance and a more time-consuming process.

shap-hypetune combines hyperparameter tuning and feature selection in a single pipeline, optimizing the number of selected features while searching for the best parameter configuration. Hyperparameter tuning or feature selection can also be carried out as a standalone operation.

**shap-hypetune main features:**

- designed for gradient boosting models, such as LGBMModel or XGBModel;
- developed to integrate with the scikit-learn ecosystem;
- effective in both classification and regression tasks;
- customizable training process, supporting early stopping and all the other fitting options available in the standard algorithms' APIs;
- ranking feature selection algorithms: Recursive Feature Elimination (RFE), Recursive Feature Addition (RFA), or Boruta;
- classical boosting-based feature importances or SHAP feature importances (the latter can also be computed on the eval_set);
- grid search, random search, or Bayesian search (via hyperopt);
- parallelized computations with joblib.

## Installation
```shell
pip install --upgrade shap-hypetune
```
lightgbm and xgboost are not hard requirements: the module depends only on NumPy, shap, scikit-learn and hyperopt. Python 3.6 or above is supported.
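
Once installed, the sketch below shows a minimal end-to-end run, combining tuning and SHAP-based feature selection in one pass. It is adapted from the bundled notebooks; the synthetic dataset, the small parameter grid, and the LGBMClassifier estimator are illustrative choices, not package requirements.

```python
from lightgbm import LGBMClassifier
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split

from shaphypetune import BoostRFE

# Illustrative synthetic data: any binary classification task works here.
X, y = make_classification(n_samples=6000, n_features=20, n_informative=4,
                           n_redundant=6, random_state=0)
X_train, X_valid, y_train, y_valid = train_test_split(
    X, y, test_size=0.3, shuffle=False)

# Tune hyperparameters and select features in a single pipeline:
# RFE ranked by SHAP importances computed on the eval_set.
model = BoostRFE(
    LGBMClassifier(n_estimators=150, random_state=0),
    param_grid={'learning_rate': [0.2, 0.1], 'max_depth': [10, 12]},
    min_features_to_select=1, step=1,
    importance_type='shap_importances', train_importance=False,
)
model.fit(X_train, y_train, eval_set=[(X_valid, y_valid)],
          early_stopping_rounds=6, verbose=0)

print(model.best_params_, model.n_features_)  # best config, selected features
X_valid_sel = model.transform(X_valid)        # keep only the selected columns
```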

## Media
- [SHAP for Feature Selection and HyperParameter Tuning](https://towardsdatascience.com/shap-for-feature-selection-and-hyperparameter-tuning-a330ec0ea104)
- [Boruta and SHAP for better Feature Selection](https://towardsdatascience.com/boruta-and-shap-for-better-feature-selection-20ea97595f4a)
- [Recursive Feature Selection: Addition or Elimination?](https://towardsdatascience.com/recursive-feature-selection-addition-or-elimination-755e5d86a791)
- [Boruta SHAP for Temporal Feature Selection](https://towardsdatascience.com/boruta-shap-for-temporal-feature-selection-96a7840c7713)

## Usage
```python
from shaphypetune import BoostSearch, BoostRFE, BoostRFA, BoostBoruta
```
#### Hyperparameter Tuning
```python
BoostSearch(
    estimator,                    # LGBMModel or XGBModel
    param_grid=None,              # parameters to be optimized
    greater_is_better=False,      # minimize or maximize the monitored score
    n_iter=None,                  # number of sampled parameter configurations
    sampling_seed=None,           # the seed used for parameter sampling
    verbose=1,                    # verbosity mode
    n_jobs=None                   # number of jobs to run in parallel
)
```
#### Feature Selection (RFE)
```python
BoostRFE(
    estimator,                                # LGBMModel or XGBModel
    min_features_to_select=None,              # the minimum number of features to be selected
    step=1,                                   # number of features to remove at each iteration
    param_grid=None,                          # parameters to be optimized
    greater_is_better=False,                  # minimize or maximize the monitored score
    importance_type='feature_importances',    # 'feature_importances' or 'shap_importances'
    train_importance=True,                    # compute SHAP importances on the train set (True) or on the eval_set (False)
    n_iter=None,                              # number of sampled parameter configurations
    sampling_seed=None,                       # the seed used for parameter sampling
    verbose=1,                                # verbosity mode
    n_jobs=None                               # number of jobs to run in parallel
)
```
#### Feature Selection (BORUTA)
```python
BoostBoruta(
    estimator,                                # LGBMModel or XGBModel
    perc=100,                                 # threshold used to compare shadow and real features
    alpha=0.05,                               # p-value level for feature rejection
    max_iter=100,                             # maximum Boruta iterations to perform
    early_stopping_boruta_rounds=None,        # maximum iterations without confirming a feature
    param_grid=None,                          # parameters to be optimized
    greater_is_better=False,                  # minimize or maximize the monitored score
    importance_type='feature_importances',    # 'feature_importances' or 'shap_importances'
    train_importance=True,                    # compute SHAP importances on the train set (True) or on the eval_set (False)
    n_iter=None,                              # number of sampled parameter configurations
    sampling_seed=None,                       # the seed used for parameter sampling
    verbose=1,                                # verbosity mode
    n_jobs=None                               # number of jobs to run in parallel
)
```
#### Feature Selection (RFA)
```python
BoostRFA(
    estimator,                                # LGBMModel or XGBModel
    min_features_to_select=None,              # the minimum number of features to be selected
    step=1,                                   # number of features to add at each iteration
    param_grid=None,                          # parameters to be optimized
    greater_is_better=False,                  # minimize or maximize the monitored score
    importance_type='feature_importances',    # 'feature_importances' or 'shap_importances'
    train_importance=True,                    # compute SHAP importances on the train set (True) or on the eval_set (False)
    n_iter=None,                              # number of sampled parameter configurations
    sampling_seed=None,                       # the seed used for parameter sampling
    verbose=1,                                # verbosity mode
    n_jobs=None                               # number of jobs to run in parallel
)
```

Full examples in the [notebooks folder](https://github.com/cerlymarco/shap-hypetune/tree/main/notebooks).

--------------------------------------------------------------------------------
/imgs/shap-hypetune-diagram.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/cerlymarco/shap-hypetune/8f46e161d27a1e413d5d1e360e8a5bf51ebfa0e1/imgs/shap-hypetune-diagram.png
--------------------------------------------------------------------------------
/notebooks/LGBM_usage.ipynb:
--------------------------------------------------------------------------------
1 | {"metadata":{"kernelspec":{"language":"python","display_name":"Python 3","name":"python3"},"language_info":{"name":"python","version":"3.7.12","mimetype":"text/x-python","codemirror_mode":{"name":"ipython","version":3},"pygments_lexer":"ipython3","nbconvert_exporter":"python","file_extension":".py"}},"nbformat_minor":4,"nbformat":4,"cells":[{"cell_type":"code","source":"import numpy as np\nimport pandas as pd\nfrom scipy import stats\n\nfrom sklearn.model_selection import train_test_split\nfrom sklearn.datasets import make_classification, make_regression\n\nfrom hyperopt import hp\nfrom hyperopt import Trials\n\nfrom lightgbm import *\n\ntry:\n from shaphypetune import BoostSearch, BoostBoruta, BoostRFE, BoostRFA\nexcept:\n !pip install --upgrade shap-hypetune\n from shaphypetune import BoostSearch, BoostBoruta, BoostRFE, BoostRFA\n\nimport warnings\nwarnings.simplefilter('ignore')","metadata":{"execution":{"iopub.status.busy":"2022-01-01T11:46:43.363945Z","iopub.execute_input":"2022-01-01T11:46:43.364356Z","iopub.status.idle":"2022-01-01T11:46:45.084134Z","shell.execute_reply.started":"2022-01-01T11:46:43.364243Z","shell.execute_reply":"2022-01-01T11:46:45.083177Z"},"trusted":true},"execution_count":1,"outputs":[{"output_type":"display_data","data":{"text/plain":"","text/html":"\n"},"metadata":{}}]},{"cell_type":"code","source":"X_clf, y_clf = make_classification(n_samples=6000, n_features=20, n_classes=2, \n n_informative=4, n_redundant=6, random_state=0)\n\nX_clf_train, X_clf_valid, y_clf_train, y_clf_valid = train_test_split(\n X_clf, y_clf, test_size=0.3, shuffle=False)\n\nX_regr, y_regr = make_classification(n_samples=6000, n_features=20,\n n_informative=7, random_state=0)\n\nX_regr_train, X_regr_valid, y_regr_train, y_regr_valid = train_test_split(\n X_regr, y_regr, test_size=0.3, shuffle=False)","metadata":{"execution":{"iopub.status.busy":"2022-01-01T11:46:45.086875Z","iopub.execute_input":"2022-01-01T11:46:45.087123Z","iopub.status.idle":"2022-01-01T11:46:45.118700Z","shell.execute_reply.started":"2022-01-01T11:46:45.087094Z","shell.execute_reply":"2022-01-01T11:46:45.117983Z"},"trusted":true},"execution_count":2,"outputs":[]},{"cell_type":"code","source":"param_grid = {\n 'learning_rate': [0.2, 0.1],\n 'num_leaves': [25, 35],\n 'max_depth': [10, 12]\n}\n\nparam_dist = {\n 'learning_rate': stats.uniform(0.09, 0.25),\n 'num_leaves': stats.randint(20,40),\n 'max_depth': [10, 12]\n}\n\nparam_dist_hyperopt = {\n 'max_depth': 15 + hp.randint('num_leaves', 5), \n 'learning_rate': hp.loguniform('learning_rate', np.log(0.01), np.log(0.2)),\n 'colsample_bytree': hp.uniform('colsample_by_tree', 0.6, 1.0)\n}\n\n\nregr_lgbm = LGBMRegressor(n_estimators=150, random_state=0, n_jobs=-1)\nclf_lgbm = LGBMClassifier(n_estimators=150, random_state=0, 
n_jobs=-1)","metadata":{"execution":{"iopub.status.busy":"2022-01-01T11:46:45.120073Z","iopub.execute_input":"2022-01-01T11:46:45.120376Z","iopub.status.idle":"2022-01-01T11:46:45.132838Z","shell.execute_reply.started":"2022-01-01T11:46:45.120336Z","shell.execute_reply":"2022-01-01T11:46:45.131615Z"},"trusted":true},"execution_count":3,"outputs":[]},{"cell_type":"markdown","source":"# Hyperparameters Tuning","metadata":{}},{"cell_type":"code","source":"### HYPERPARAM TUNING WITH GRID-SEARCH ###\n\nmodel = BoostSearch(clf_lgbm, param_grid=param_grid)\nmodel.fit(X_clf_train, y_clf_train, eval_set=[(X_clf_valid, y_clf_valid)], early_stopping_rounds=6, verbose=0)","metadata":{"execution":{"iopub.status.busy":"2022-01-01T11:46:45.134450Z","iopub.execute_input":"2022-01-01T11:46:45.135435Z","iopub.status.idle":"2022-01-01T11:46:46.383589Z","shell.execute_reply.started":"2022-01-01T11:46:45.135389Z","shell.execute_reply":"2022-01-01T11:46:46.382860Z"},"trusted":true},"execution_count":4,"outputs":[{"name":"stdout","text":"\n8 trials detected for ('learning_rate', 'num_leaves', 'max_depth')\n\ntrial: 0001 ### iterations: 00023 ### eval_score: 0.2085\ntrial: 0002 ### iterations: 00019 ### eval_score: 0.21112\ntrial: 0003 ### iterations: 00026 ### eval_score: 0.21162\ntrial: 0004 ### iterations: 00032 ### eval_score: 0.20747\ntrial: 0005 ### iterations: 00054 ### eval_score: 0.20244\ntrial: 0006 ### iterations: 00071 ### eval_score: 0.20052\ntrial: 0007 ### iterations: 00047 ### eval_score: 0.20306\ntrial: 0008 ### iterations: 00050 ### eval_score: 0.20506\n","output_type":"stream"},{"execution_count":4,"output_type":"execute_result","data":{"text/plain":"BoostSearch(estimator=LGBMClassifier(n_estimators=150, random_state=0),\n param_grid={'learning_rate': [0.2, 0.1], 'max_depth': [10, 12],\n 'num_leaves': [25, 35]})"},"metadata":{}}]},{"cell_type":"code","source":"model.estimator_, model.best_params_, model.best_score_","metadata":{"execution":{"iopub.status.busy":"2022-01-01T11:46:46.388010Z","iopub.execute_input":"2022-01-01T11:46:46.389926Z","iopub.status.idle":"2022-01-01T11:46:46.397550Z","shell.execute_reply.started":"2022-01-01T11:46:46.389888Z","shell.execute_reply":"2022-01-01T11:46:46.396658Z"},"trusted":true},"execution_count":5,"outputs":[{"execution_count":5,"output_type":"execute_result","data":{"text/plain":"(LGBMClassifier(max_depth=12, n_estimators=150, num_leaves=25, random_state=0),\n {'learning_rate': 0.1, 'num_leaves': 25, 'max_depth': 12},\n 0.20051586840398297)"},"metadata":{}}]},{"cell_type":"code","source":"(model.score(X_clf_valid, y_clf_valid), \n model.predict(X_clf_valid).shape, \n model.predict_proba(X_clf_valid).shape)","metadata":{"execution":{"iopub.status.busy":"2022-01-01T11:46:46.398765Z","iopub.execute_input":"2022-01-01T11:46:46.399534Z","iopub.status.idle":"2022-01-01T11:46:46.436761Z","shell.execute_reply.started":"2022-01-01T11:46:46.399498Z","shell.execute_reply":"2022-01-01T11:46:46.431623Z"},"trusted":true},"execution_count":6,"outputs":[{"execution_count":6,"output_type":"execute_result","data":{"text/plain":"(0.9183333333333333, (1800,), (1800, 2))"},"metadata":{}}]},{"cell_type":"code","source":"### HYPERPARAM TUNING WITH RANDOM-SEARCH ###\n\nmodel = BoostSearch(\n regr_lgbm, param_grid=param_dist,\n n_iter=8, sampling_seed=0\n)\nmodel.fit(X_regr_train, y_regr_train, eval_set=[(X_regr_valid, y_regr_valid)], early_stopping_rounds=6, 
verbose=0)","metadata":{"execution":{"iopub.status.busy":"2022-01-01T11:46:46.438241Z","iopub.execute_input":"2022-01-01T11:46:46.438923Z","iopub.status.idle":"2022-01-01T11:46:47.128794Z","shell.execute_reply.started":"2022-01-01T11:46:46.438892Z","shell.execute_reply":"2022-01-01T11:46:47.128107Z"},"trusted":true},"execution_count":7,"outputs":[{"name":"stdout","text":"\n8 trials detected for ('learning_rate', 'num_leaves', 'max_depth')\n\ntrial: 0001 ### iterations: 00023 ### eval_score: 0.07643\ntrial: 0002 ### iterations: 00052 ### eval_score: 0.06818\ntrial: 0003 ### iterations: 00062 ### eval_score: 0.07042\ntrial: 0004 ### iterations: 00033 ### eval_score: 0.07035\ntrial: 0005 ### iterations: 00032 ### eval_score: 0.07153\ntrial: 0006 ### iterations: 00012 ### eval_score: 0.07547\ntrial: 0007 ### iterations: 00041 ### eval_score: 0.07355\ntrial: 0008 ### iterations: 00025 ### eval_score: 0.07805\n","output_type":"stream"},{"execution_count":7,"output_type":"execute_result","data":{"text/plain":"BoostSearch(estimator=LGBMRegressor(n_estimators=150, random_state=0), n_iter=8,\n param_grid={'learning_rate': ,\n 'max_depth': [10, 12],\n 'num_leaves': },\n sampling_seed=0)"},"metadata":{}}]},{"cell_type":"code","source":"model.estimator_, model.best_params_, model.best_score_","metadata":{"execution":{"iopub.status.busy":"2022-01-01T11:46:47.132071Z","iopub.execute_input":"2022-01-01T11:46:47.132611Z","iopub.status.idle":"2022-01-01T11:46:47.142185Z","shell.execute_reply.started":"2022-01-01T11:46:47.132575Z","shell.execute_reply":"2022-01-01T11:46:47.141271Z"},"trusted":true},"execution_count":8,"outputs":[{"execution_count":8,"output_type":"execute_result","data":{"text/plain":"(LGBMRegressor(learning_rate=0.1350674222191923, max_depth=10, n_estimators=150,\n num_leaves=38, random_state=0),\n {'learning_rate': 0.1350674222191923, 'num_leaves': 38, 'max_depth': 10},\n 0.06817737242646997)"},"metadata":{}}]},{"cell_type":"code","source":"(model.score(X_regr_valid, y_regr_valid), \n model.predict(X_regr_valid).shape, \n model.predict(X_regr_valid, pred_contrib=True).shape)","metadata":{"execution":{"iopub.status.busy":"2022-01-01T11:46:47.143613Z","iopub.execute_input":"2022-01-01T11:46:47.143856Z","iopub.status.idle":"2022-01-01T11:46:47.611056Z","shell.execute_reply.started":"2022-01-01T11:46:47.143827Z","shell.execute_reply":"2022-01-01T11:46:47.610379Z"},"trusted":true},"execution_count":9,"outputs":[{"execution_count":9,"output_type":"execute_result","data":{"text/plain":"(0.7272820930747703, (1800,), (1800, 21))"},"metadata":{}}]},{"cell_type":"code","source":"### HYPERPARAM TUNING WITH HYPEROPT ###\n\nmodel = BoostSearch(\n regr_lgbm, param_grid=param_dist_hyperopt,\n n_iter=8, sampling_seed=0\n)\nmodel.fit(\n X_regr_train, y_regr_train, trials=Trials(), \n eval_set=[(X_regr_valid, y_regr_valid)], early_stopping_rounds=6, verbose=0\n)","metadata":{"execution":{"iopub.status.busy":"2022-01-01T11:46:47.614530Z","iopub.execute_input":"2022-01-01T11:46:47.616779Z","iopub.status.idle":"2022-01-01T11:46:49.268236Z","shell.execute_reply.started":"2022-01-01T11:46:47.616738Z","shell.execute_reply":"2022-01-01T11:46:49.267608Z"},"trusted":true},"execution_count":10,"outputs":[{"name":"stdout","text":"\n8 trials detected for ('max_depth', 'learning_rate', 'colsample_bytree')\n\ntrial: 0001 ### iterations: 00149 ### eval_score: 0.06979\ntrial: 0002 ### iterations: 00055 ### eval_score: 0.07039\ntrial: 0003 ### iterations: 00056 ### eval_score: 0.0716\ntrial: 0004 ### iterations: 00150 ### 
eval_score: 0.07352\ntrial: 0005 ### iterations: 00150 ### eval_score: 0.07936\ntrial: 0006 ### iterations: 00147 ### eval_score: 0.06833\ntrial: 0007 ### iterations: 00032 ### eval_score: 0.07261\ntrial: 0008 ### iterations: 00096 ### eval_score: 0.07074\n","output_type":"stream"},{"execution_count":10,"output_type":"execute_result","data":{"text/plain":"BoostSearch(estimator=LGBMRegressor(n_estimators=150, random_state=0), n_iter=8,\n param_grid={'colsample_bytree': ,\n 'learning_rate': ,\n 'max_depth': },\n sampling_seed=0)"},"metadata":{}}]},{"cell_type":"code","source":"model.estimator_, model.best_params_, model.best_score_","metadata":{"execution":{"iopub.status.busy":"2022-01-01T11:46:49.271739Z","iopub.execute_input":"2022-01-01T11:46:49.272301Z","iopub.status.idle":"2022-01-01T11:46:49.279337Z","shell.execute_reply.started":"2022-01-01T11:46:49.272264Z","shell.execute_reply":"2022-01-01T11:46:49.278727Z"},"trusted":true},"execution_count":11,"outputs":[{"execution_count":11,"output_type":"execute_result","data":{"text/plain":"(LGBMRegressor(colsample_bytree=0.7597292534356749,\n learning_rate=0.059836658149176665, max_depth=16,\n n_estimators=150, random_state=0),\n {'colsample_bytree': 0.7597292534356749,\n 'learning_rate': 0.059836658149176665,\n 'max_depth': 16},\n 0.06832542425080958)"},"metadata":{}}]},{"cell_type":"code","source":"(model.score(X_regr_valid, y_regr_valid), \n model.predict(X_regr_valid).shape, \n model.predict(X_regr_valid, pred_contrib=True).shape)","metadata":{"execution":{"iopub.status.busy":"2022-01-01T11:46:49.280499Z","iopub.execute_input":"2022-01-01T11:46:49.280735Z","iopub.status.idle":"2022-01-01T11:46:50.260345Z","shell.execute_reply.started":"2022-01-01T11:46:49.280700Z","shell.execute_reply":"2022-01-01T11:46:50.259694Z"},"trusted":true},"execution_count":12,"outputs":[{"execution_count":12,"output_type":"execute_result","data":{"text/plain":"(0.7266898674988451, (1800,), (1800, 21))"},"metadata":{}}]},{"cell_type":"markdown","source":"# Features Selection","metadata":{}},{"cell_type":"code","source":"### BORUTA ###\n\nmodel = BoostBoruta(clf_lgbm, max_iter=200, perc=100)\nmodel.fit(X_clf_train, y_clf_train, eval_set=[(X_clf_valid, y_clf_valid)], early_stopping_rounds=6, verbose=0)","metadata":{"execution":{"iopub.status.busy":"2022-01-01T11:46:50.263726Z","iopub.execute_input":"2022-01-01T11:46:50.265917Z","iopub.status.idle":"2022-01-01T11:46:56.714012Z","shell.execute_reply.started":"2022-01-01T11:46:50.265869Z","shell.execute_reply":"2022-01-01T11:46:56.713278Z"},"trusted":true},"execution_count":13,"outputs":[{"execution_count":13,"output_type":"execute_result","data":{"text/plain":"BoostBoruta(estimator=LGBMClassifier(n_estimators=150, random_state=0),\n max_iter=200)"},"metadata":{}}]},{"cell_type":"code","source":"model.estimator_, model.n_features_","metadata":{"execution":{"iopub.status.busy":"2022-01-01T11:46:56.720017Z","iopub.execute_input":"2022-01-01T11:46:56.720486Z","iopub.status.idle":"2022-01-01T11:46:56.727782Z","shell.execute_reply.started":"2022-01-01T11:46:56.720450Z","shell.execute_reply":"2022-01-01T11:46:56.726815Z"},"trusted":true},"execution_count":14,"outputs":[{"execution_count":14,"output_type":"execute_result","data":{"text/plain":"(LGBMClassifier(n_estimators=150, random_state=0), 10)"},"metadata":{}}]},{"cell_type":"code","source":"(model.score(X_clf_valid, y_clf_valid), \n model.predict(X_clf_valid).shape, \n model.transform(X_clf_valid).shape,\n 
model.predict_proba(X_clf_valid).shape)","metadata":{"execution":{"iopub.status.busy":"2022-01-01T11:46:56.730004Z","iopub.execute_input":"2022-01-01T11:46:56.730326Z","iopub.status.idle":"2022-01-01T11:46:56.765852Z","shell.execute_reply.started":"2022-01-01T11:46:56.730286Z","shell.execute_reply":"2022-01-01T11:46:56.760625Z"},"trusted":true},"execution_count":15,"outputs":[{"execution_count":15,"output_type":"execute_result","data":{"text/plain":"(0.91, (1800,), (1800, 10), (1800, 2))"},"metadata":{}}]},{"cell_type":"code","source":"### RECURSIVE FEATURE ELIMINATION (RFE) ###\n\nmodel = BoostRFE(regr_lgbm, min_features_to_select=1, step=1)\nmodel.fit(X_regr_train, y_regr_train, eval_set=[(X_regr_valid, y_regr_valid)], early_stopping_rounds=6, verbose=0)","metadata":{"execution":{"iopub.status.busy":"2022-01-01T11:46:56.767160Z","iopub.execute_input":"2022-01-01T11:46:56.767432Z","iopub.status.idle":"2022-01-01T11:46:59.411924Z","shell.execute_reply.started":"2022-01-01T11:46:56.767401Z","shell.execute_reply":"2022-01-01T11:46:59.411240Z"},"trusted":true},"execution_count":16,"outputs":[{"execution_count":16,"output_type":"execute_result","data":{"text/plain":"BoostRFE(estimator=LGBMRegressor(n_estimators=150, random_state=0),\n min_features_to_select=1)"},"metadata":{}}]},{"cell_type":"code","source":"model.estimator_, model.n_features_","metadata":{"execution":{"iopub.status.busy":"2022-01-01T11:46:59.415300Z","iopub.execute_input":"2022-01-01T11:46:59.417330Z","iopub.status.idle":"2022-01-01T11:46:59.424201Z","shell.execute_reply.started":"2022-01-01T11:46:59.417288Z","shell.execute_reply":"2022-01-01T11:46:59.423561Z"},"trusted":true},"execution_count":17,"outputs":[{"execution_count":17,"output_type":"execute_result","data":{"text/plain":"(LGBMRegressor(n_estimators=150, random_state=0), 7)"},"metadata":{}}]},{"cell_type":"code","source":"(model.score(X_regr_valid, y_regr_valid), \n model.predict(X_regr_valid).shape, \n model.transform(X_regr_valid).shape,\n model.predict(X_regr_valid, pred_contrib=True).shape)","metadata":{"execution":{"iopub.status.busy":"2022-01-01T11:46:59.425449Z","iopub.execute_input":"2022-01-01T11:46:59.425674Z","iopub.status.idle":"2022-01-01T11:47:00.248420Z","shell.execute_reply.started":"2022-01-01T11:46:59.425645Z","shell.execute_reply":"2022-01-01T11:47:00.247703Z"},"trusted":true},"execution_count":18,"outputs":[{"execution_count":18,"output_type":"execute_result","data":{"text/plain":"(0.7766363424352807, (1800,), (1800, 7), (1800, 8))"},"metadata":{}}]},{"cell_type":"code","source":"### RECURSIVE FEATURE ADDITION (RFA) ###\n\nmodel = BoostRFA(regr_lgbm, min_features_to_select=1, step=1)\nmodel.fit(X_regr_train, y_regr_train, eval_set=[(X_regr_valid, y_regr_valid)], early_stopping_rounds=6, verbose=0)","metadata":{"execution":{"iopub.status.busy":"2022-01-01T11:47:00.251993Z","iopub.execute_input":"2022-01-01T11:47:00.252510Z","iopub.status.idle":"2022-01-01T11:47:03.954790Z","shell.execute_reply.started":"2022-01-01T11:47:00.252473Z","shell.execute_reply":"2022-01-01T11:47:03.954052Z"},"trusted":true},"execution_count":19,"outputs":[{"execution_count":19,"output_type":"execute_result","data":{"text/plain":"BoostRFA(estimator=LGBMRegressor(n_estimators=150, random_state=0),\n min_features_to_select=1)"},"metadata":{}}]},{"cell_type":"code","source":"model.estimator_, 
model.n_features_","metadata":{"execution":{"iopub.status.busy":"2022-01-01T11:47:03.958397Z","iopub.execute_input":"2022-01-01T11:47:03.958982Z","iopub.status.idle":"2022-01-01T11:47:03.967715Z","shell.execute_reply.started":"2022-01-01T11:47:03.958931Z","shell.execute_reply":"2022-01-01T11:47:03.966909Z"},"trusted":true},"execution_count":20,"outputs":[{"execution_count":20,"output_type":"execute_result","data":{"text/plain":"(LGBMRegressor(n_estimators=150, random_state=0), 8)"},"metadata":{}}]},{"cell_type":"code","source":"(model.score(X_regr_valid, y_regr_valid), \n model.predict(X_regr_valid).shape, \n model.transform(X_regr_valid).shape,\n model.predict(X_regr_valid, pred_contrib=True).shape)","metadata":{"execution":{"iopub.status.busy":"2022-01-01T11:47:03.969215Z","iopub.execute_input":"2022-01-01T11:47:03.969612Z","iopub.status.idle":"2022-01-01T11:47:04.838820Z","shell.execute_reply.started":"2022-01-01T11:47:03.969569Z","shell.execute_reply":"2022-01-01T11:47:04.838192Z"},"trusted":true},"execution_count":21,"outputs":[{"execution_count":21,"output_type":"execute_result","data":{"text/plain":"(0.7723191919698336, (1800,), (1800, 8), (1800, 9))"},"metadata":{}}]},{"cell_type":"markdown","source":"# Features Selection with SHAP","metadata":{}},{"cell_type":"code","source":"### BORUTA SHAP ###\n\nmodel = BoostBoruta(\n clf_lgbm, max_iter=200, perc=100,\n importance_type='shap_importances', train_importance=False\n)\nmodel.fit(X_clf_train, y_clf_train, eval_set=[(X_clf_valid, y_clf_valid)], early_stopping_rounds=6, verbose=0)","metadata":{"execution":{"iopub.status.busy":"2022-01-01T11:47:04.842289Z","iopub.execute_input":"2022-01-01T11:47:04.844564Z","iopub.status.idle":"2022-01-01T11:47:17.780389Z","shell.execute_reply.started":"2022-01-01T11:47:04.844522Z","shell.execute_reply":"2022-01-01T11:47:17.779726Z"},"trusted":true},"execution_count":22,"outputs":[{"execution_count":22,"output_type":"execute_result","data":{"text/plain":"BoostBoruta(estimator=LGBMClassifier(n_estimators=150, random_state=0),\n importance_type='shap_importances', max_iter=200,\n train_importance=False)"},"metadata":{}}]},{"cell_type":"code","source":"model.estimator_, model.n_features_","metadata":{"execution":{"iopub.status.busy":"2022-01-01T11:47:17.781535Z","iopub.execute_input":"2022-01-01T11:47:17.784569Z","iopub.status.idle":"2022-01-01T11:47:17.791371Z","shell.execute_reply.started":"2022-01-01T11:47:17.784530Z","shell.execute_reply":"2022-01-01T11:47:17.790591Z"},"trusted":true},"execution_count":23,"outputs":[{"execution_count":23,"output_type":"execute_result","data":{"text/plain":"(LGBMClassifier(n_estimators=150, random_state=0), 9)"},"metadata":{}}]},{"cell_type":"code","source":"(model.score(X_clf_valid, y_clf_valid), \n model.predict(X_clf_valid).shape, \n model.transform(X_clf_valid).shape,\n model.predict_proba(X_clf_valid).shape)","metadata":{"execution":{"iopub.status.busy":"2022-01-01T11:47:17.794450Z","iopub.execute_input":"2022-01-01T11:47:17.794986Z","iopub.status.idle":"2022-01-01T11:47:17.813842Z","shell.execute_reply.started":"2022-01-01T11:47:17.794933Z","shell.execute_reply":"2022-01-01T11:47:17.813126Z"},"trusted":true},"execution_count":24,"outputs":[{"execution_count":24,"output_type":"execute_result","data":{"text/plain":"(0.9111111111111111, (1800,), (1800, 9), (1800, 2))"},"metadata":{}}]},{"cell_type":"code","source":"### RECURSIVE FEATURE ELIMINATION (RFE) SHAP ###\n\nmodel = BoostRFE(\n regr_lgbm, min_features_to_select=1, step=1,\n 
importance_type='shap_importances', train_importance=False\n)\nmodel.fit(X_regr_train, y_regr_train, eval_set=[(X_regr_valid, y_regr_valid)], early_stopping_rounds=6, verbose=0)","metadata":{"execution":{"iopub.status.busy":"2022-01-01T11:47:17.817477Z","iopub.execute_input":"2022-01-01T11:47:17.819641Z","iopub.status.idle":"2022-01-01T11:47:32.735329Z","shell.execute_reply.started":"2022-01-01T11:47:17.819595Z","shell.execute_reply":"2022-01-01T11:47:32.734687Z"},"trusted":true},"execution_count":25,"outputs":[{"execution_count":25,"output_type":"execute_result","data":{"text/plain":"BoostRFE(estimator=LGBMRegressor(n_estimators=150, random_state=0),\n importance_type='shap_importances', min_features_to_select=1,\n train_importance=False)"},"metadata":{}}]},{"cell_type":"code","source":"model.estimator_, model.n_features_","metadata":{"execution":{"iopub.status.busy":"2022-01-01T11:47:32.736646Z","iopub.execute_input":"2022-01-01T11:47:32.737109Z","iopub.status.idle":"2022-01-01T11:47:32.743398Z","shell.execute_reply.started":"2022-01-01T11:47:32.737074Z","shell.execute_reply":"2022-01-01T11:47:32.742747Z"},"trusted":true},"execution_count":26,"outputs":[{"execution_count":26,"output_type":"execute_result","data":{"text/plain":"(LGBMRegressor(n_estimators=150, random_state=0), 7)"},"metadata":{}}]},{"cell_type":"code","source":"(model.score(X_regr_valid, y_regr_valid), \n model.predict(X_regr_valid).shape, \n model.transform(X_regr_valid).shape,\n model.predict(X_regr_valid, pred_contrib=True).shape)","metadata":{"execution":{"iopub.status.busy":"2022-01-01T11:47:32.744765Z","iopub.execute_input":"2022-01-01T11:47:32.747374Z","iopub.status.idle":"2022-01-01T11:47:33.570515Z","shell.execute_reply.started":"2022-01-01T11:47:32.747336Z","shell.execute_reply":"2022-01-01T11:47:33.569899Z"},"trusted":true},"execution_count":27,"outputs":[{"execution_count":27,"output_type":"execute_result","data":{"text/plain":"(0.7766363424352807, (1800,), (1800, 7), (1800, 8))"},"metadata":{}}]},{"cell_type":"code","source":"### RECURSIVE FEATURE ADDITION (RFA) SHAP ###\n\nmodel = BoostRFA(\n regr_lgbm, min_features_to_select=1, step=1,\n importance_type='shap_importances', train_importance=False\n)\nmodel.fit(X_regr_train, y_regr_train, eval_set=[(X_regr_valid, y_regr_valid)], early_stopping_rounds=6, verbose=0)","metadata":{"execution":{"iopub.status.busy":"2022-01-01T11:47:33.571778Z","iopub.execute_input":"2022-01-01T11:47:33.572261Z","iopub.status.idle":"2022-01-01T11:47:39.941084Z","shell.execute_reply.started":"2022-01-01T11:47:33.572226Z","shell.execute_reply":"2022-01-01T11:47:39.940356Z"},"trusted":true},"execution_count":28,"outputs":[{"execution_count":28,"output_type":"execute_result","data":{"text/plain":"BoostRFA(estimator=LGBMRegressor(n_estimators=150, random_state=0),\n importance_type='shap_importances', min_features_to_select=1,\n train_importance=False)"},"metadata":{}}]},{"cell_type":"code","source":"model.estimator_, model.n_features_","metadata":{"execution":{"iopub.status.busy":"2022-01-01T11:47:39.944497Z","iopub.execute_input":"2022-01-01T11:47:39.946592Z","iopub.status.idle":"2022-01-01T11:47:39.953717Z","shell.execute_reply.started":"2022-01-01T11:47:39.946550Z","shell.execute_reply":"2022-01-01T11:47:39.952924Z"},"trusted":true},"execution_count":29,"outputs":[{"execution_count":29,"output_type":"execute_result","data":{"text/plain":"(LGBMRegressor(n_estimators=150, random_state=0), 9)"},"metadata":{}}]},{"cell_type":"code","source":"(model.score(X_regr_valid, y_regr_valid), \n 
model.predict(X_regr_valid).shape, \n model.transform(X_regr_valid).shape,\n model.predict(X_regr_valid, pred_contrib=True).shape)","metadata":{"execution":{"iopub.status.busy":"2022-01-01T11:47:39.954955Z","iopub.execute_input":"2022-01-01T11:47:39.955713Z","iopub.status.idle":"2022-01-01T11:47:40.853749Z","shell.execute_reply.started":"2022-01-01T11:47:39.955669Z","shell.execute_reply":"2022-01-01T11:47:40.853100Z"},"trusted":true},"execution_count":30,"outputs":[{"execution_count":30,"output_type":"execute_result","data":{"text/plain":"(0.7699366468805918, (1800,), (1800, 9), (1800, 10))"},"metadata":{}}]},{"cell_type":"markdown","source":"# Hyperparameters Tuning + Features Selection","metadata":{}},{"cell_type":"code","source":"### HYPERPARAM TUNING WITH GRID-SEARCH + BORUTA ###\n\nmodel = BoostBoruta(clf_lgbm, param_grid=param_grid, max_iter=200, perc=100)\nmodel.fit(X_clf_train, y_clf_train, eval_set=[(X_clf_valid, y_clf_valid)], early_stopping_rounds=6, verbose=0)","metadata":{"execution":{"iopub.status.busy":"2022-01-01T11:47:40.857000Z","iopub.execute_input":"2022-01-01T11:47:40.859123Z","iopub.status.idle":"2022-01-01T11:48:08.045782Z","shell.execute_reply.started":"2022-01-01T11:47:40.859074Z","shell.execute_reply":"2022-01-01T11:48:08.043191Z"},"trusted":true},"execution_count":31,"outputs":[{"name":"stdout","text":"\n8 trials detected for ('learning_rate', 'num_leaves', 'max_depth')\n\ntrial: 0001 ### iterations: 00023 ### eval_score: 0.19868\ntrial: 0002 ### iterations: 00030 ### eval_score: 0.19844\ntrial: 0003 ### iterations: 00023 ### eval_score: 0.19695\ntrial: 0004 ### iterations: 00026 ### eval_score: 0.19949\ntrial: 0005 ### iterations: 00067 ### eval_score: 0.19583\ntrial: 0006 ### iterations: 00051 ### eval_score: 0.1949\ntrial: 0007 ### iterations: 00045 ### eval_score: 0.19675\ntrial: 0008 ### iterations: 00055 ### eval_score: 0.19906\n","output_type":"stream"},{"execution_count":31,"output_type":"execute_result","data":{"text/plain":"BoostBoruta(estimator=LGBMClassifier(n_estimators=150, random_state=0),\n max_iter=200,\n param_grid={'learning_rate': [0.2, 0.1], 'max_depth': [10, 12],\n 'num_leaves': [25, 35]})"},"metadata":{}}]},{"cell_type":"code","source":"model.estimator_, model.best_params_, model.best_score_, model.n_features_","metadata":{"execution":{"iopub.status.busy":"2022-01-01T11:48:08.047190Z","iopub.execute_input":"2022-01-01T11:48:08.048047Z","iopub.status.idle":"2022-01-01T11:48:08.056353Z","shell.execute_reply.started":"2022-01-01T11:48:08.048000Z","shell.execute_reply":"2022-01-01T11:48:08.055615Z"},"trusted":true},"execution_count":32,"outputs":[{"execution_count":32,"output_type":"execute_result","data":{"text/plain":"(LGBMClassifier(max_depth=12, n_estimators=150, num_leaves=25, random_state=0),\n {'learning_rate': 0.1, 'num_leaves': 25, 'max_depth': 12},\n 0.19489866976777023,\n 9)"},"metadata":{}}]},{"cell_type":"code","source":"(model.score(X_clf_valid, y_clf_valid), \n model.predict(X_clf_valid).shape, \n model.transform(X_clf_valid).shape,\n model.predict_proba(X_clf_valid).shape)","metadata":{"execution":{"iopub.status.busy":"2022-01-01T11:48:08.058015Z","iopub.execute_input":"2022-01-01T11:48:08.058593Z","iopub.status.idle":"2022-01-01T11:48:08.109632Z","shell.execute_reply.started":"2022-01-01T11:48:08.058410Z","shell.execute_reply":"2022-01-01T11:48:08.108670Z"},"trusted":true},"execution_count":33,"outputs":[{"execution_count":33,"output_type":"execute_result","data":{"text/plain":"(0.915, (1800,), (1800, 9), (1800, 
2))"},"metadata":{}}]},{"cell_type":"code","source":"### HYPERPARAM TUNING WITH RANDOM-SEARCH + RECURSIVE FEATURE ELIMINATION (RFE) ###\n\nmodel = BoostRFE(\n regr_lgbm, param_grid=param_dist, min_features_to_select=1, step=1,\n n_iter=8, sampling_seed=0\n)\nmodel.fit(X_regr_train, y_regr_train, eval_set=[(X_regr_valid, y_regr_valid)], early_stopping_rounds=6, verbose=0)","metadata":{"execution":{"iopub.status.busy":"2022-01-01T11:48:08.114460Z","iopub.execute_input":"2022-01-01T11:48:08.116626Z","iopub.status.idle":"2022-01-01T11:48:20.506235Z","shell.execute_reply.started":"2022-01-01T11:48:08.116579Z","shell.execute_reply":"2022-01-01T11:48:20.505511Z"},"trusted":true},"execution_count":34,"outputs":[{"name":"stdout","text":"\n8 trials detected for ('learning_rate', 'num_leaves', 'max_depth')\n\ntrial: 0001 ### iterations: 00107 ### eval_score: 0.06016\ntrial: 0002 ### iterations: 00095 ### eval_score: 0.05711\ntrial: 0003 ### iterations: 00121 ### eval_score: 0.05926\ntrial: 0004 ### iterations: 00103 ### eval_score: 0.05688\ntrial: 0005 ### iterations: 00119 ### eval_score: 0.05618\ntrial: 0006 ### iterations: 00049 ### eval_score: 0.06188\ntrial: 0007 ### iterations: 00150 ### eval_score: 0.05538\ntrial: 0008 ### iterations: 00083 ### eval_score: 0.06084\n","output_type":"stream"},{"execution_count":34,"output_type":"execute_result","data":{"text/plain":"BoostRFE(estimator=LGBMRegressor(n_estimators=150, random_state=0),\n min_features_to_select=1, n_iter=8,\n param_grid={'learning_rate': ,\n 'max_depth': [10, 12],\n 'num_leaves': },\n sampling_seed=0)"},"metadata":{}}]},{"cell_type":"code","source":"model.estimator_, model.best_params_, model.best_score_, model.n_features_","metadata":{"execution":{"iopub.status.busy":"2022-01-01T11:48:20.509788Z","iopub.execute_input":"2022-01-01T11:48:20.511633Z","iopub.status.idle":"2022-01-01T11:48:20.521139Z","shell.execute_reply.started":"2022-01-01T11:48:20.511592Z","shell.execute_reply":"2022-01-01T11:48:20.520293Z"},"trusted":true},"execution_count":35,"outputs":[{"execution_count":35,"output_type":"execute_result","data":{"text/plain":"(LGBMRegressor(learning_rate=0.13639381870463482, max_depth=12, n_estimators=150,\n num_leaves=25, random_state=0),\n {'learning_rate': 0.13639381870463482, 'num_leaves': 25, 'max_depth': 12},\n 0.0553821617278472,\n 7)"},"metadata":{}}]},{"cell_type":"code","source":"(model.score(X_regr_valid, y_regr_valid), \n model.predict(X_regr_valid).shape, \n model.transform(X_regr_valid).shape,\n model.predict(X_regr_valid, pred_contrib=True).shape)","metadata":{"execution":{"iopub.status.busy":"2022-01-01T11:48:20.522443Z","iopub.execute_input":"2022-01-01T11:48:20.522817Z","iopub.status.idle":"2022-01-01T11:48:21.145683Z","shell.execute_reply.started":"2022-01-01T11:48:20.522785Z","shell.execute_reply":"2022-01-01T11:48:21.145033Z"},"trusted":true},"execution_count":36,"outputs":[{"execution_count":36,"output_type":"execute_result","data":{"text/plain":"(0.7784645155736596, (1800,), (1800, 7), (1800, 8))"},"metadata":{}}]},{"cell_type":"code","source":"### HYPERPARAM TUNING WITH HYPEROPT + RECURSIVE FEATURE ADDITION (RFA) ###\n\nmodel = BoostRFA(\n regr_lgbm, param_grid=param_dist_hyperopt, min_features_to_select=1, step=1,\n n_iter=8, sampling_seed=0\n)\nmodel.fit(\n X_regr_train, y_regr_train, trials=Trials(), \n eval_set=[(X_regr_valid, y_regr_valid)], early_stopping_rounds=6, 
verbose=0\n)","metadata":{"execution":{"iopub.status.busy":"2022-01-01T11:48:21.149492Z","iopub.execute_input":"2022-01-01T11:48:21.151302Z","iopub.status.idle":"2022-01-01T11:48:56.679453Z","shell.execute_reply.started":"2022-01-01T11:48:21.151261Z","shell.execute_reply":"2022-01-01T11:48:56.678720Z"},"trusted":true},"execution_count":37,"outputs":[{"name":"stdout","text":"\n8 trials detected for ('max_depth', 'learning_rate', 'colsample_bytree')\n\ntrial: 0001 ### iterations: 00150 ### eval_score: 0.06507\ntrial: 0002 ### iterations: 00075 ### eval_score: 0.05784\ntrial: 0003 ### iterations: 00095 ### eval_score: 0.06088\ntrial: 0004 ### iterations: 00150 ### eval_score: 0.06976\ntrial: 0005 ### iterations: 00150 ### eval_score: 0.07593\ntrial: 0006 ### iterations: 00149 ### eval_score: 0.05995\ntrial: 0007 ### iterations: 00058 ### eval_score: 0.05916\ntrial: 0008 ### iterations: 00150 ### eval_score: 0.06366\n","output_type":"stream"},{"execution_count":37,"output_type":"execute_result","data":{"text/plain":"BoostRFA(estimator=LGBMRegressor(n_estimators=150, random_state=0),\n min_features_to_select=1, n_iter=8,\n param_grid={'colsample_bytree': ,\n 'learning_rate': ,\n 'max_depth': },\n sampling_seed=0)"},"metadata":{}}]},{"cell_type":"code","source":"model.estimator_, model.best_params_, model.best_score_, model.n_features_","metadata":{"execution":{"iopub.status.busy":"2022-01-01T11:48:56.682847Z","iopub.execute_input":"2022-01-01T11:48:56.684405Z","iopub.status.idle":"2022-01-01T11:48:56.691812Z","shell.execute_reply.started":"2022-01-01T11:48:56.684368Z","shell.execute_reply":"2022-01-01T11:48:56.690932Z"},"trusted":true},"execution_count":38,"outputs":[{"execution_count":38,"output_type":"execute_result","data":{"text/plain":"(LGBMRegressor(colsample_bytree=0.8515260655364685,\n learning_rate=0.13520045129619862, max_depth=18, n_estimators=150,\n random_state=0),\n {'colsample_bytree': 0.8515260655364685,\n 'learning_rate': 0.13520045129619862,\n 'max_depth': 18},\n 0.0578369356489881,\n 8)"},"metadata":{}}]},{"cell_type":"code","source":"(model.score(X_regr_valid, y_regr_valid), \n model.predict(X_regr_valid).shape, \n model.transform(X_regr_valid).shape,\n model.predict(X_regr_valid, pred_contrib=True).shape)","metadata":{"execution":{"iopub.status.busy":"2022-01-01T11:48:56.693078Z","iopub.execute_input":"2022-01-01T11:48:56.693305Z","iopub.status.idle":"2022-01-01T11:48:57.115924Z","shell.execute_reply.started":"2022-01-01T11:48:56.693277Z","shell.execute_reply":"2022-01-01T11:48:57.115308Z"},"trusted":true},"execution_count":39,"outputs":[{"execution_count":39,"output_type":"execute_result","data":{"text/plain":"(0.7686451168212334, (1800,), (1800, 8), (1800, 9))"},"metadata":{}}]},{"cell_type":"markdown","source":"# Hyperparameters Tuning + Features Selection with SHAP","metadata":{}},{"cell_type":"code","source":"### HYPERPARAM TUNING WITH GRID-SEARCH + BORUTA SHAP ###\n\nmodel = BoostBoruta(\n clf_lgbm, param_grid=param_grid, max_iter=200, perc=100,\n importance_type='shap_importances', train_importance=False\n)\nmodel.fit(X_clf_train, y_clf_train, eval_set=[(X_clf_valid, y_clf_valid)], early_stopping_rounds=6, 
verbose=0)","metadata":{"scrolled":true,"execution":{"iopub.status.busy":"2022-01-01T11:48:57.119397Z","iopub.execute_input":"2022-01-01T11:48:57.120009Z","iopub.status.idle":"2022-01-01T11:50:15.982498Z","shell.execute_reply.started":"2022-01-01T11:48:57.119958Z","shell.execute_reply":"2022-01-01T11:50:15.981774Z"},"trusted":true},"execution_count":40,"outputs":[{"name":"stdout","text":"\n8 trials detected for ('learning_rate', 'num_leaves', 'max_depth')\n\ntrial: 0001 ### iterations: 00036 ### eval_score: 0.19716\ntrial: 0002 ### iterations: 00030 ### eval_score: 0.19818\ntrial: 0003 ### iterations: 00031 ### eval_score: 0.19881\ntrial: 0004 ### iterations: 00026 ### eval_score: 0.19949\ntrial: 0005 ### iterations: 00067 ### eval_score: 0.19583\ntrial: 0006 ### iterations: 00051 ### eval_score: 0.1949\ntrial: 0007 ### iterations: 00045 ### eval_score: 0.19675\ntrial: 0008 ### iterations: 00057 ### eval_score: 0.19284\n","output_type":"stream"},{"execution_count":40,"output_type":"execute_result","data":{"text/plain":"BoostBoruta(estimator=LGBMClassifier(n_estimators=150, random_state=0),\n importance_type='shap_importances', max_iter=200,\n param_grid={'learning_rate': [0.2, 0.1], 'max_depth': [10, 12],\n 'num_leaves': [25, 35]},\n train_importance=False)"},"metadata":{}}]},{"cell_type":"code","source":"model.estimator_, model.best_params_, model.best_score_, model.n_features_","metadata":{"execution":{"iopub.status.busy":"2022-01-01T11:50:15.988196Z","iopub.execute_input":"2022-01-01T11:50:15.988729Z","iopub.status.idle":"2022-01-01T11:50:15.996898Z","shell.execute_reply.started":"2022-01-01T11:50:15.988685Z","shell.execute_reply":"2022-01-01T11:50:15.996175Z"},"trusted":true},"execution_count":41,"outputs":[{"execution_count":41,"output_type":"execute_result","data":{"text/plain":"(LGBMClassifier(max_depth=12, n_estimators=150, num_leaves=35, random_state=0),\n {'learning_rate': 0.1, 'num_leaves': 35, 'max_depth': 12},\n 0.1928371931511303,\n 10)"},"metadata":{}}]},{"cell_type":"code","source":"(model.score(X_clf_valid, y_clf_valid), \n model.predict(X_clf_valid).shape, \n model.transform(X_clf_valid).shape,\n model.predict_proba(X_clf_valid).shape)","metadata":{"execution":{"iopub.status.busy":"2022-01-01T11:50:15.998631Z","iopub.execute_input":"2022-01-01T11:50:15.999269Z","iopub.status.idle":"2022-01-01T11:50:16.029050Z","shell.execute_reply.started":"2022-01-01T11:50:15.999228Z","shell.execute_reply":"2022-01-01T11:50:16.028270Z"},"trusted":true},"execution_count":42,"outputs":[{"execution_count":42,"output_type":"execute_result","data":{"text/plain":"(0.9111111111111111, (1800,), (1800, 10), (1800, 2))"},"metadata":{}}]},{"cell_type":"code","source":"### HYPERPARAM TUNING WITH RANDOM-SEARCH + RECURSIVE FEATURE ELIMINATION (RFE) SHAP ###\n\nmodel = BoostRFE(\n regr_lgbm, param_grid=param_dist, min_features_to_select=1, step=1,\n n_iter=8, sampling_seed=0,\n importance_type='shap_importances', train_importance=False\n)\nmodel.fit(X_regr_train, y_regr_train, eval_set=[(X_regr_valid, y_regr_valid)], early_stopping_rounds=6, verbose=0)","metadata":{"execution":{"iopub.status.busy":"2022-01-01T11:50:16.030261Z","iopub.execute_input":"2022-01-01T11:50:16.030658Z","iopub.status.idle":"2022-01-01T11:51:19.095150Z","shell.execute_reply.started":"2022-01-01T11:50:16.030625Z","shell.execute_reply":"2022-01-01T11:51:19.094483Z"},"trusted":true},"execution_count":43,"outputs":[{"name":"stdout","text":"\n8 trials detected for ('learning_rate', 'num_leaves', 'max_depth')\n\ntrial: 0001 ### 
iterations: 00107 ### eval_score: 0.06016\ntrial: 0002 ### iterations: 00102 ### eval_score: 0.05525\ntrial: 0003 ### iterations: 00150 ### eval_score: 0.05869\ntrial: 0004 ### iterations: 00149 ### eval_score: 0.05863\ntrial: 0005 ### iterations: 00119 ### eval_score: 0.05618\ntrial: 0006 ### iterations: 00049 ### eval_score: 0.06188\ntrial: 0007 ### iterations: 00150 ### eval_score: 0.05538\ntrial: 0008 ### iterations: 00083 ### eval_score: 0.06084\n","output_type":"stream"},{"execution_count":43,"output_type":"execute_result","data":{"text/plain":"BoostRFE(estimator=LGBMRegressor(n_estimators=150, random_state=0),\n importance_type='shap_importances', min_features_to_select=1, n_iter=8,\n param_grid={'learning_rate': ,\n 'max_depth': [10, 12],\n 'num_leaves': },\n sampling_seed=0, train_importance=False)"},"metadata":{}}]},{"cell_type":"code","source":"model.estimator_, model.best_params_, model.best_score_, model.n_features_","metadata":{"execution":{"iopub.status.busy":"2022-01-01T11:51:19.098487Z","iopub.execute_input":"2022-01-01T11:51:19.099062Z","iopub.status.idle":"2022-01-01T11:51:19.108772Z","shell.execute_reply.started":"2022-01-01T11:51:19.099027Z","shell.execute_reply":"2022-01-01T11:51:19.107939Z"},"trusted":true},"execution_count":44,"outputs":[{"execution_count":44,"output_type":"execute_result","data":{"text/plain":"(LGBMRegressor(learning_rate=0.1350674222191923, max_depth=10, n_estimators=150,\n num_leaves=38, random_state=0),\n {'learning_rate': 0.1350674222191923, 'num_leaves': 38, 'max_depth': 10},\n 0.05524518772497125,\n 9)"},"metadata":{}}]},{"cell_type":"code","source":"(model.score(X_regr_valid, y_regr_valid), \n model.predict(X_regr_valid).shape, \n model.transform(X_regr_valid).shape,\n model.predict(X_regr_valid, pred_contrib=True).shape)","metadata":{"execution":{"iopub.status.busy":"2022-01-01T11:51:19.110141Z","iopub.execute_input":"2022-01-01T11:51:19.110358Z","iopub.status.idle":"2022-01-01T11:51:19.840667Z","shell.execute_reply.started":"2022-01-01T11:51:19.110333Z","shell.execute_reply":"2022-01-01T11:51:19.840035Z"},"trusted":true},"execution_count":45,"outputs":[{"execution_count":45,"output_type":"execute_result","data":{"text/plain":"(0.779012428496056, (1800,), (1800, 9), (1800, 10))"},"metadata":{}}]},{"cell_type":"code","source":"### HYPERPARAM TUNING WITH HYPEROPT + RECURSIVE FEATURE ADDITION (RFA) SHAP ###\n\nmodel = BoostRFA(\n regr_lgbm, param_grid=param_dist_hyperopt, min_features_to_select=1, step=1,\n n_iter=8, sampling_seed=0,\n importance_type='shap_importances', train_importance=False\n)\nmodel.fit(\n X_regr_train, y_regr_train, trials=Trials(), \n eval_set=[(X_regr_valid, y_regr_valid)], early_stopping_rounds=6, verbose=0\n)","metadata":{"execution":{"iopub.status.busy":"2022-01-01T11:51:19.844245Z","iopub.execute_input":"2022-01-01T11:51:19.844839Z","iopub.status.idle":"2022-01-01T11:52:27.830673Z","shell.execute_reply.started":"2022-01-01T11:51:19.844800Z","shell.execute_reply":"2022-01-01T11:52:27.829915Z"},"trusted":true},"execution_count":46,"outputs":[{"name":"stdout","text":"\n8 trials detected for ('max_depth', 'learning_rate', 'colsample_bytree')\n\ntrial: 0001 ### iterations: 00150 ### eval_score: 0.06508\ntrial: 0002 ### iterations: 00091 ### eval_score: 0.05997\ntrial: 0003 ### iterations: 00094 ### eval_score: 0.06078\ntrial: 0004 ### iterations: 00150 ### eval_score: 0.06773\ntrial: 0005 ### iterations: 00150 ### eval_score: 0.07565\ntrial: 0006 ### iterations: 00150 ### eval_score: 0.05935\ntrial: 0007 ### iterations: 
00083 ### eval_score: 0.06047\ntrial: 0008 ### iterations: 00150 ### eval_score: 0.05966\n","output_type":"stream"},{"execution_count":46,"output_type":"execute_result","data":{"text/plain":"BoostRFA(estimator=LGBMRegressor(n_estimators=150, random_state=0),\n importance_type='shap_importances', min_features_to_select=1, n_iter=8,\n param_grid={'colsample_bytree': ,\n 'learning_rate': ,\n 'max_depth': },\n sampling_seed=0, train_importance=False)"},"metadata":{}}]},{"cell_type":"code","source":"model.estimator_, model.best_params_, model.best_score_, model.n_features_","metadata":{"execution":{"iopub.status.busy":"2022-01-01T11:52:27.834402Z","iopub.execute_input":"2022-01-01T11:52:27.835864Z","iopub.status.idle":"2022-01-01T11:52:27.842813Z","shell.execute_reply.started":"2022-01-01T11:52:27.835812Z","shell.execute_reply":"2022-01-01T11:52:27.842095Z"},"trusted":true},"execution_count":47,"outputs":[{"execution_count":47,"output_type":"execute_result","data":{"text/plain":"(LGBMRegressor(colsample_bytree=0.7597292534356749,\n learning_rate=0.059836658149176665, max_depth=16,\n n_estimators=150, random_state=0),\n {'colsample_bytree': 0.7597292534356749,\n 'learning_rate': 0.059836658149176665,\n 'max_depth': 16},\n 0.059352961644604275,\n 9)"},"metadata":{}}]},{"cell_type":"code","source":"(model.score(X_regr_valid, y_regr_valid), \n model.predict(X_regr_valid).shape, \n model.transform(X_regr_valid).shape,\n model.predict(X_regr_valid, pred_contrib=True).shape)","metadata":{"execution":{"iopub.status.busy":"2022-01-01T11:52:27.844085Z","iopub.execute_input":"2022-01-01T11:52:27.844320Z","iopub.status.idle":"2022-01-01T11:52:28.690931Z","shell.execute_reply.started":"2022-01-01T11:52:27.844291Z","shell.execute_reply":"2022-01-01T11:52:28.690302Z"},"trusted":true},"execution_count":48,"outputs":[{"execution_count":48,"output_type":"execute_result","data":{"text/plain":"(0.7625808256692885, (1800,), (1800, 9), (1800, 10))"},"metadata":{}}]},{"cell_type":"markdown","source":"# CUSTOM EVAL METRIC SUPPORT","metadata":{}},{"cell_type":"code","source":"from sklearn.metrics import roc_auc_score\n\ndef AUC(y_true, y_hat):\n return 'auc', roc_auc_score(y_true, y_hat), True","metadata":{"execution":{"iopub.status.busy":"2022-01-01T11:52:28.691909Z","iopub.execute_input":"2022-01-01T11:52:28.692560Z","iopub.status.idle":"2022-01-01T11:52:28.696813Z","shell.execute_reply.started":"2022-01-01T11:52:28.692526Z","shell.execute_reply":"2022-01-01T11:52:28.696058Z"},"trusted":true},"execution_count":49,"outputs":[]},{"cell_type":"code","source":"model = BoostRFE(\n LGBMClassifier(n_estimators=150, random_state=0, metric=\"custom\"), \n param_grid=param_grid, min_features_to_select=1, step=1,\n greater_is_better=True\n)\nmodel.fit(\n X_clf_train, y_clf_train, \n eval_set=[(X_clf_valid, y_clf_valid)], early_stopping_rounds=6, verbose=0, \n eval_metric=AUC\n)","metadata":{"execution":{"iopub.status.busy":"2022-01-01T11:52:28.700234Z","iopub.execute_input":"2022-01-01T11:52:28.700461Z","iopub.status.idle":"2022-01-01T11:52:49.577997Z","shell.execute_reply.started":"2022-01-01T11:52:28.700433Z","shell.execute_reply":"2022-01-01T11:52:49.577317Z"},"trusted":true},"execution_count":50,"outputs":[{"name":"stdout","text":"\n8 trials detected for ('learning_rate', 'num_leaves', 'max_depth')\n\ntrial: 0001 ### iterations: 00028 ### eval_score: 0.97581\ntrial: 0002 ### iterations: 00016 ### eval_score: 0.97514\ntrial: 0003 ### iterations: 00015 ### eval_score: 0.97574\ntrial: 0004 ### iterations: 00032 ### eval_score: 
0.97549\ntrial: 0005 ### iterations: 00075 ### eval_score: 0.97551\ntrial: 0006 ### iterations: 00041 ### eval_score: 0.97597\ntrial: 0007 ### iterations: 00076 ### eval_score: 0.97592\ntrial: 0008 ### iterations: 00060 ### eval_score: 0.97539\n","output_type":"stream"},{"execution_count":50,"output_type":"execute_result","data":{"text/plain":"BoostRFE(estimator=LGBMClassifier(metric='custom', n_estimators=150,\n random_state=0),\n greater_is_better=True, min_features_to_select=1,\n param_grid={'learning_rate': [0.2, 0.1], 'max_depth': [10, 12],\n 'num_leaves': [25, 35]})"},"metadata":{}}]},{"cell_type":"markdown","source":"# CATEGORICAL FEATURE SUPPORT","metadata":{}},{"cell_type":"code","source":"categorical_feature = [0,1,2]\n\nX_clf_train[:,categorical_feature] = (X_clf_train[:,categorical_feature]+100).clip(0).astype(int)\nX_clf_valid[:,categorical_feature] = (X_clf_valid[:,categorical_feature]+100).clip(0).astype(int)","metadata":{"execution":{"iopub.status.busy":"2022-01-01T11:52:49.581409Z","iopub.execute_input":"2022-01-01T11:52:49.581982Z","iopub.status.idle":"2022-01-01T11:52:49.589315Z","shell.execute_reply.started":"2022-01-01T11:52:49.581931Z","shell.execute_reply":"2022-01-01T11:52:49.588511Z"},"trusted":true},"execution_count":51,"outputs":[]},{"cell_type":"code","source":"### MANUAL PASS categorical_feature WITH NUMPY ARRAYS ###\n\nmodel = BoostRFE(clf_lgbm, param_grid=param_grid, min_features_to_select=1, step=1)\nmodel.fit(\n X_clf_train, y_clf_train, \n eval_set=[(X_clf_valid, y_clf_valid)], early_stopping_rounds=6, verbose=0,\n categorical_feature=categorical_feature\n)","metadata":{"execution":{"iopub.status.busy":"2022-01-01T11:52:49.590366Z","iopub.execute_input":"2022-01-01T11:52:49.590604Z","iopub.status.idle":"2022-01-01T11:53:00.495917Z","shell.execute_reply.started":"2022-01-01T11:52:49.590576Z","shell.execute_reply":"2022-01-01T11:53:00.495224Z"},"trusted":true},"execution_count":52,"outputs":[{"name":"stdout","text":"\n8 trials detected for ('learning_rate', 'num_leaves', 'max_depth')\n\ntrial: 0001 ### iterations: 00029 ### eval_score: 0.2036\ntrial: 0002 ### iterations: 00030 ### eval_score: 0.2034\ntrial: 0003 ### iterations: 00027 ### eval_score: 0.20617\ntrial: 0004 ### iterations: 00024 ### eval_score: 0.20003\ntrial: 0005 ### iterations: 00060 ### eval_score: 0.20332\ntrial: 0006 ### iterations: 00063 ### eval_score: 0.20329\ntrial: 0007 ### iterations: 00054 ### eval_score: 0.20136\ntrial: 0008 ### iterations: 00052 ### eval_score: 0.19959\n","output_type":"stream"},{"execution_count":52,"output_type":"execute_result","data":{"text/plain":"BoostRFE(estimator=LGBMClassifier(n_estimators=150, random_state=0),\n min_features_to_select=1,\n param_grid={'learning_rate': [0.2, 0.1], 'max_depth': [10, 12],\n 'num_leaves': [25, 35]})"},"metadata":{}}]},{"cell_type":"code","source":"X_clf_train = pd.DataFrame(X_clf_train)\nX_clf_train[categorical_feature] = X_clf_train[categorical_feature].astype('category')\n\nX_clf_valid = pd.DataFrame(X_clf_valid)\nX_clf_valid[categorical_feature] = X_clf_valid[categorical_feature].astype('category')","metadata":{"execution":{"iopub.status.busy":"2022-01-01T11:53:00.499198Z","iopub.execute_input":"2022-01-01T11:53:00.499858Z","iopub.status.idle":"2022-01-01T11:53:00.527402Z","shell.execute_reply.started":"2022-01-01T11:53:00.499814Z","shell.execute_reply":"2022-01-01T11:53:00.526779Z"},"trusted":true},"execution_count":53,"outputs":[]},{"cell_type":"code","source":"### PASS category COLUMNS IN PANDAS DF ###\n\nmodel = 
BoostRFE(clf_lgbm, param_grid=param_grid, min_features_to_select=1, step=1)\nmodel.fit(X_clf_train, y_clf_train, eval_set=[(X_clf_valid, y_clf_valid)], early_stopping_rounds=6, verbose=0)","metadata":{"execution":{"iopub.status.busy":"2022-01-01T11:53:00.529027Z","iopub.execute_input":"2022-01-01T11:53:00.529320Z","iopub.status.idle":"2022-01-01T11:53:12.422092Z","shell.execute_reply.started":"2022-01-01T11:53:00.529281Z","shell.execute_reply":"2022-01-01T11:53:12.421368Z"},"trusted":true},"execution_count":54,"outputs":[{"name":"stdout","text":"\n8 trials detected for ('learning_rate', 'num_leaves', 'max_depth')\n\ntrial: 0001 ### iterations: 00029 ### eval_score: 0.2036\ntrial: 0002 ### iterations: 00030 ### eval_score: 0.2034\ntrial: 0003 ### iterations: 00027 ### eval_score: 0.20617\ntrial: 0004 ### iterations: 00024 ### eval_score: 0.20003\ntrial: 0005 ### iterations: 00060 ### eval_score: 0.20332\ntrial: 0006 ### iterations: 00063 ### eval_score: 0.20329\ntrial: 0007 ### iterations: 00054 ### eval_score: 0.20136\ntrial: 0008 ### iterations: 00052 ### eval_score: 0.19959\n","output_type":"stream"},{"execution_count":54,"output_type":"execute_result","data":{"text/plain":"BoostRFE(estimator=LGBMClassifier(n_estimators=150, random_state=0),\n min_features_to_select=1,\n param_grid={'learning_rate': [0.2, 0.1], 'max_depth': [10, 12],\n 'num_leaves': [25, 35]})"},"metadata":{}}]}]} -------------------------------------------------------------------------------- /notebooks/XGBoost_usage.ipynb: -------------------------------------------------------------------------------- 1 | {"metadata":{"kernelspec":{"language":"python","display_name":"Python 3","name":"python3"},"language_info":{"name":"python","version":"3.7.12","mimetype":"text/x-python","codemirror_mode":{"name":"ipython","version":3},"pygments_lexer":"ipython3","nbconvert_exporter":"python","file_extension":".py"}},"nbformat_minor":4,"nbformat":4,"cells":[{"cell_type":"code","source":"import numpy as np\nimport pandas as pd\nfrom scipy import stats\n\nfrom sklearn.model_selection import train_test_split\nfrom sklearn.datasets import make_classification, make_regression\n\nfrom hyperopt import hp\nfrom hyperopt import Trials\n\nfrom xgboost import *\n\ntry:\n from shaphypetune import BoostSearch, BoostBoruta, BoostRFE, BoostRFA\nexcept:\n !pip install --upgrade shap-hypetune\n from shaphypetune import BoostSearch, BoostBoruta, BoostRFE, BoostRFA\n\nimport warnings\nwarnings.simplefilter('ignore')","metadata":{"execution":{"iopub.status.busy":"2022-01-01T11:49:44.031173Z","iopub.execute_input":"2022-01-01T11:49:44.031497Z","iopub.status.idle":"2022-01-01T11:49:45.071830Z","shell.execute_reply.started":"2022-01-01T11:49:44.031410Z","shell.execute_reply":"2022-01-01T11:49:45.070928Z"},"trusted":true},"execution_count":1,"outputs":[]},{"cell_type":"code","source":"X_clf, y_clf = make_classification(n_samples=6000, n_features=20, n_classes=2, \n n_informative=4, n_redundant=6, random_state=0)\n\nX_clf_train, X_clf_valid, y_clf_train, y_clf_valid = train_test_split(\n X_clf, y_clf, test_size=0.3, shuffle=False)\n\nX_regr, y_regr = make_classification(n_samples=6000, n_features=20,\n n_informative=7, random_state=0)\n\nX_regr_train, X_regr_valid, y_regr_train, y_regr_valid = train_test_split(\n X_regr, y_regr, test_size=0.3, 
shuffle=False)","metadata":{"execution":{"iopub.status.busy":"2022-01-01T11:49:45.073832Z","iopub.execute_input":"2022-01-01T11:49:45.074046Z","iopub.status.idle":"2022-01-01T11:49:45.098178Z","shell.execute_reply.started":"2022-01-01T11:49:45.074004Z","shell.execute_reply":"2022-01-01T11:49:45.097461Z"},"trusted":true},"execution_count":2,"outputs":[]},{"cell_type":"code","source":"param_grid = {\n 'learning_rate': [0.2, 0.1],\n 'num_leaves': [25, 35],\n 'max_depth': [10, 12]\n}\n\nparam_dist = {\n 'learning_rate': stats.uniform(0.09, 0.25),\n 'num_leaves': stats.randint(20,40),\n 'max_depth': [10, 12]\n}\n\nparam_dist_hyperopt = {\n 'max_depth': 15 + hp.randint('num_leaves', 5), \n 'learning_rate': hp.loguniform('learning_rate', np.log(0.01), np.log(0.2)),\n 'colsample_bytree': hp.uniform('colsample_by_tree', 0.6, 1.0)\n}\n\n\nregr_xgb = XGBRegressor(n_estimators=150, random_state=0, verbosity=0, n_jobs=-1)\nclf_xgb = XGBClassifier(n_estimators=150, random_state=0, verbosity=0, n_jobs=-1)","metadata":{"execution":{"iopub.status.busy":"2022-01-01T11:49:45.099715Z","iopub.execute_input":"2022-01-01T11:49:45.099916Z","iopub.status.idle":"2022-01-01T11:49:45.108765Z","shell.execute_reply.started":"2022-01-01T11:49:45.099890Z","shell.execute_reply":"2022-01-01T11:49:45.107996Z"},"trusted":true},"execution_count":3,"outputs":[]},{"cell_type":"markdown","source":"# Hyperparameters Tuning","metadata":{}},{"cell_type":"code","source":"### HYPERPARAM TUNING WITH GRID-SEARCH ###\n\nmodel = BoostSearch(clf_xgb, param_grid=param_grid)\nmodel.fit(X_clf_train, y_clf_train, eval_set=[(X_clf_valid, y_clf_valid)], early_stopping_rounds=6, verbose=0)","metadata":{"execution":{"iopub.status.busy":"2022-01-01T11:49:45.109686Z","iopub.execute_input":"2022-01-01T11:49:45.109871Z","iopub.status.idle":"2022-01-01T11:49:52.490942Z","shell.execute_reply.started":"2022-01-01T11:49:45.109848Z","shell.execute_reply":"2022-01-01T11:49:52.490078Z"},"trusted":true},"execution_count":4,"outputs":[{"name":"stdout","text":"\n8 trials detected for ('learning_rate', 'num_leaves', 'max_depth')\n\ntrial: 0001 ### iterations: 00021 ### eval_score: 0.2045\ntrial: 0002 ### iterations: 00026 ### eval_score: 0.19472\ntrial: 0003 ### iterations: 00021 ### eval_score: 0.2045\ntrial: 0004 ### iterations: 00026 ### eval_score: 0.19472\ntrial: 0005 ### iterations: 00045 ### eval_score: 0.19964\ntrial: 0006 ### iterations: 00050 ### eval_score: 0.20157\ntrial: 0007 ### iterations: 00045 ### eval_score: 0.19964\ntrial: 0008 ### iterations: 00050 ### eval_score: 0.20157\n","output_type":"stream"},{"execution_count":4,"output_type":"execute_result","data":{"text/plain":"BoostSearch(estimator=XGBClassifier(base_score=None, booster=None,\n colsample_bylevel=None,\n colsample_bynode=None,\n colsample_bytree=None,\n enable_categorical=False, gamma=None,\n gpu_id=None, importance_type=None,\n interaction_constraints=None,\n learning_rate=None, max_delta_step=None,\n max_depth=None, min_child_weight=None,\n missing=nan, monotone_constraints=None,\n n_estimators=150, n_jobs=-1,\n num_parallel_tree=None, predictor=None,\n random_state=0, reg_alpha=None,\n reg_lambda=None, scale_pos_weight=None,\n subsample=None, tree_method=None,\n validate_parameters=None, verbosity=0),\n param_grid={'learning_rate': [0.2, 0.1], 'max_depth': [10, 12],\n 'num_leaves': [25, 35]})"},"metadata":{}}]},{"cell_type":"code","source":"model.estimator_, model.best_params_, 
model.best_score_","metadata":{"execution":{"iopub.status.busy":"2022-01-01T11:49:52.493607Z","iopub.execute_input":"2022-01-01T11:49:52.494126Z","iopub.status.idle":"2022-01-01T11:49:52.504649Z","shell.execute_reply.started":"2022-01-01T11:49:52.494081Z","shell.execute_reply":"2022-01-01T11:49:52.503849Z"},"trusted":true},"execution_count":5,"outputs":[{"execution_count":5,"output_type":"execute_result","data":{"text/plain":"(XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,\n colsample_bynode=1, colsample_bytree=1, enable_categorical=False,\n gamma=0, gpu_id=-1, importance_type=None,\n interaction_constraints='', learning_rate=0.2, max_delta_step=0,\n max_depth=12, min_child_weight=1, missing=nan,\n monotone_constraints='()', n_estimators=150, n_jobs=-1,\n num_leaves=25, num_parallel_tree=1, predictor='auto',\n random_state=0, reg_alpha=0, reg_lambda=1, scale_pos_weight=1,\n subsample=1, tree_method='exact', validate_parameters=1,\n verbosity=0),\n {'learning_rate': 0.2, 'num_leaves': 25, 'max_depth': 12},\n 0.194719)"},"metadata":{}}]},{"cell_type":"code","source":"(model.score(X_clf_valid, y_clf_valid), \n model.predict(X_clf_valid).shape, \n model.predict_proba(X_clf_valid).shape)","metadata":{"execution":{"iopub.status.busy":"2022-01-01T11:49:52.506201Z","iopub.execute_input":"2022-01-01T11:49:52.506365Z","iopub.status.idle":"2022-01-01T11:49:52.528604Z","shell.execute_reply.started":"2022-01-01T11:49:52.506344Z","shell.execute_reply":"2022-01-01T11:49:52.528078Z"},"trusted":true},"execution_count":6,"outputs":[{"execution_count":6,"output_type":"execute_result","data":{"text/plain":"(0.9138888888888889, (1800,), (1800, 2))"},"metadata":{}}]},{"cell_type":"code","source":"### HYPERPARAM TUNING WITH RANDOM-SEARCH ###\n\nmodel = BoostSearch(\n regr_xgb, param_grid=param_dist,\n n_iter=8, sampling_seed=0\n)\nmodel.fit(X_regr_train, y_regr_train, eval_set=[(X_regr_valid, y_regr_valid)], early_stopping_rounds=6, verbose=0)","metadata":{"execution":{"iopub.status.busy":"2022-01-01T11:49:52.529476Z","iopub.execute_input":"2022-01-01T11:49:52.530097Z","iopub.status.idle":"2022-01-01T11:50:03.018637Z","shell.execute_reply.started":"2022-01-01T11:49:52.530066Z","shell.execute_reply":"2022-01-01T11:50:03.017927Z"},"trusted":true},"execution_count":7,"outputs":[{"name":"stdout","text":"\n8 trials detected for ('learning_rate', 'num_leaves', 'max_depth')\n\ntrial: 0001 ### iterations: 00012 ### eval_score: 0.27616\ntrial: 0002 ### iterations: 00056 ### eval_score: 0.26211\ntrial: 0003 ### iterations: 00078 ### eval_score: 0.27603\ntrial: 0004 ### iterations: 00045 ### eval_score: 0.26117\ntrial: 0005 ### iterations: 00046 ### eval_score: 0.27868\ntrial: 0006 ### iterations: 00035 ### eval_score: 0.27815\ntrial: 0007 ### iterations: 00039 ### eval_score: 0.2753\ntrial: 0008 ### iterations: 00016 ### eval_score: 0.28116\n","output_type":"stream"},{"execution_count":7,"output_type":"execute_result","data":{"text/plain":"BoostSearch(estimator=XGBRegressor(base_score=None, booster=None,\n colsample_bylevel=None,\n colsample_bynode=None, colsample_bytree=None,\n enable_categorical=False, gamma=None,\n gpu_id=None, importance_type=None,\n interaction_constraints=None,\n learning_rate=None, max_delta_step=None,\n max_depth=None, min_child_weight=None,\n missing=nan, monotone_constraints=None,\n n_estim...\n random_state=0, reg_alpha=None,\n reg_lambda=None, scale_pos_weight=None,\n subsample=None, tree_method=None,\n validate_parameters=None, verbosity=0),\n n_iter=8,\n 
param_grid={'learning_rate': ,\n 'max_depth': [10, 12],\n 'num_leaves': },\n sampling_seed=0)"},"metadata":{}}]},{"cell_type":"code","source":"model.estimator_, model.best_params_, model.best_score_","metadata":{"execution":{"iopub.status.busy":"2022-01-01T11:50:03.019747Z","iopub.execute_input":"2022-01-01T11:50:03.020416Z","iopub.status.idle":"2022-01-01T11:50:03.030730Z","shell.execute_reply.started":"2022-01-01T11:50:03.020379Z","shell.execute_reply":"2022-01-01T11:50:03.030065Z"},"trusted":true},"execution_count":8,"outputs":[{"execution_count":8,"output_type":"execute_result","data":{"text/plain":"(XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,\n colsample_bynode=1, colsample_bytree=1, enable_categorical=False,\n gamma=0, gpu_id=-1, importance_type=None,\n interaction_constraints='', learning_rate=0.1669837381562427,\n max_delta_step=0, max_depth=10, min_child_weight=1, missing=nan,\n monotone_constraints='()', n_estimators=150, n_jobs=-1,\n num_leaves=25, num_parallel_tree=1, predictor='auto',\n random_state=0, reg_alpha=0, reg_lambda=1, scale_pos_weight=1,\n subsample=1, tree_method='exact', validate_parameters=1,\n verbosity=0),\n {'learning_rate': 0.1669837381562427, 'num_leaves': 25, 'max_depth': 10},\n 0.26117)"},"metadata":{}}]},{"cell_type":"code","source":"(model.score(X_regr_valid, y_regr_valid), \n model.predict(X_regr_valid).shape)","metadata":{"execution":{"iopub.status.busy":"2022-01-01T11:50:03.032146Z","iopub.execute_input":"2022-01-01T11:50:03.032612Z","iopub.status.idle":"2022-01-01T11:50:03.058721Z","shell.execute_reply.started":"2022-01-01T11:50:03.032572Z","shell.execute_reply":"2022-01-01T11:50:03.058084Z"},"trusted":true},"execution_count":9,"outputs":[{"execution_count":9,"output_type":"execute_result","data":{"text/plain":"(0.7271524639165458, (1800,))"},"metadata":{}}]},{"cell_type":"code","source":"### HYPERPARAM TUNING WITH HYPEROPT ###\n\nmodel = BoostSearch(\n regr_xgb, param_grid=param_dist_hyperopt,\n n_iter=8, sampling_seed=0\n)\nmodel.fit(\n X_regr_train, y_regr_train, trials=Trials(), \n eval_set=[(X_regr_valid, y_regr_valid)], early_stopping_rounds=6, verbose=0\n)","metadata":{"execution":{"iopub.status.busy":"2022-01-01T11:50:03.059800Z","iopub.execute_input":"2022-01-01T11:50:03.062204Z","iopub.status.idle":"2022-01-01T11:50:32.323625Z","shell.execute_reply.started":"2022-01-01T11:50:03.062158Z","shell.execute_reply":"2022-01-01T11:50:32.322789Z"},"trusted":true},"execution_count":10,"outputs":[{"name":"stdout","text":"\n8 trials detected for ('max_depth', 'learning_rate', 'colsample_bytree')\n\ntrial: 0001 ### iterations: 00149 ### eval_score: 0.27498\ntrial: 0002 ### iterations: 00074 ### eval_score: 0.27186\ntrial: 0003 ### iterations: 00038 ### eval_score: 0.28326\ntrial: 0004 ### iterations: 00149 ### eval_score: 0.29455\ntrial: 0005 ### iterations: 00149 ### eval_score: 0.28037\ntrial: 0006 ### iterations: 00149 ### eval_score: 0.26421\ntrial: 0007 ### iterations: 00052 ### eval_score: 0.27191\ntrial: 0008 ### iterations: 00133 ### eval_score: 0.29251\n","output_type":"stream"},{"execution_count":10,"output_type":"execute_result","data":{"text/plain":"BoostSearch(estimator=XGBRegressor(base_score=None, booster=None,\n colsample_bylevel=None,\n colsample_bynode=None, colsample_bytree=None,\n enable_categorical=False, gamma=None,\n gpu_id=None, importance_type=None,\n interaction_constraints=None,\n learning_rate=None, max_delta_step=None,\n max_depth=None, min_child_weight=None,\n missing=nan, 
monotone_constraints=None,\n n_estim...\n random_state=0, reg_alpha=None,\n reg_lambda=None, scale_pos_weight=None,\n subsample=None, tree_method=None,\n validate_parameters=None, verbosity=0),\n n_iter=8,\n param_grid={'colsample_bytree': ,\n 'learning_rate': ,\n 'max_depth': },\n sampling_seed=0)"},"metadata":{}}]},{"cell_type":"code","source":"model.estimator_, model.best_params_, model.best_score_","metadata":{"execution":{"iopub.status.busy":"2022-01-01T11:50:32.324994Z","iopub.execute_input":"2022-01-01T11:50:32.325480Z","iopub.status.idle":"2022-01-01T11:50:32.335828Z","shell.execute_reply.started":"2022-01-01T11:50:32.325441Z","shell.execute_reply":"2022-01-01T11:50:32.334970Z"},"trusted":true},"execution_count":11,"outputs":[{"execution_count":11,"output_type":"execute_result","data":{"text/plain":"(XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,\n colsample_bynode=1, colsample_bytree=0.7597292534356749,\n enable_categorical=False, gamma=0, gpu_id=-1, importance_type=None,\n interaction_constraints='', learning_rate=0.059836658149176665,\n max_delta_step=0, max_depth=16, min_child_weight=1, missing=nan,\n monotone_constraints='()', n_estimators=150, n_jobs=-1,\n num_parallel_tree=1, predictor='auto', random_state=0, reg_alpha=0,\n reg_lambda=1, scale_pos_weight=1, subsample=1, tree_method='exact',\n validate_parameters=1, verbosity=0),\n {'colsample_bytree': 0.7597292534356749,\n 'learning_rate': 0.059836658149176665,\n 'max_depth': 16},\n 0.264211)"},"metadata":{}}]},{"cell_type":"code","source":"(model.score(X_regr_valid, y_regr_valid), \n model.predict(X_regr_valid).shape)","metadata":{"execution":{"iopub.status.busy":"2022-01-01T11:50:32.337011Z","iopub.execute_input":"2022-01-01T11:50:32.337395Z","iopub.status.idle":"2022-01-01T11:50:32.370381Z","shell.execute_reply.started":"2022-01-01T11:50:32.337369Z","shell.execute_reply":"2022-01-01T11:50:32.369816Z"},"trusted":true},"execution_count":12,"outputs":[{"execution_count":12,"output_type":"execute_result","data":{"text/plain":"(0.7207605727361562, (1800,))"},"metadata":{}}]},{"cell_type":"markdown","source":"# Features Selection","metadata":{}},{"cell_type":"code","source":"### BORUTA ###\n\nmodel = BoostBoruta(clf_xgb, max_iter=200, perc=100)\nmodel.fit(X_clf_train, y_clf_train, eval_set=[(X_clf_valid, y_clf_valid)], early_stopping_rounds=6, verbose=0)","metadata":{"execution":{"iopub.status.busy":"2022-01-01T11:50:32.371634Z","iopub.execute_input":"2022-01-01T11:50:32.372109Z","iopub.status.idle":"2022-01-01T11:50:50.797541Z","shell.execute_reply.started":"2022-01-01T11:50:32.372066Z","shell.execute_reply":"2022-01-01T11:50:50.797059Z"},"trusted":true},"execution_count":13,"outputs":[{"execution_count":13,"output_type":"execute_result","data":{"text/plain":"BoostBoruta(estimator=XGBClassifier(base_score=None, booster=None,\n colsample_bylevel=None,\n colsample_bynode=None,\n colsample_bytree=None,\n enable_categorical=False, gamma=None,\n gpu_id=None, importance_type=None,\n interaction_constraints=None,\n learning_rate=None, max_delta_step=None,\n max_depth=None, min_child_weight=None,\n missing=nan, monotone_constraints=None,\n n_estimators=150, n_jobs=-1,\n num_parallel_tree=None, predictor=None,\n random_state=0, reg_alpha=None,\n reg_lambda=None, scale_pos_weight=None,\n subsample=None, tree_method=None,\n validate_parameters=None, verbosity=0),\n max_iter=200)"},"metadata":{}}]},{"cell_type":"code","source":"model.estimator_, 
model.n_features_","metadata":{"execution":{"iopub.status.busy":"2022-01-01T11:50:50.800394Z","iopub.execute_input":"2022-01-01T11:50:50.800795Z","iopub.status.idle":"2022-01-01T11:50:50.809566Z","shell.execute_reply.started":"2022-01-01T11:50:50.800767Z","shell.execute_reply":"2022-01-01T11:50:50.808911Z"},"trusted":true},"execution_count":14,"outputs":[{"execution_count":14,"output_type":"execute_result","data":{"text/plain":"(XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,\n colsample_bynode=1, colsample_bytree=1, enable_categorical=False,\n gamma=0, gpu_id=-1, importance_type=None,\n interaction_constraints='', learning_rate=0.300000012,\n max_delta_step=0, max_depth=6, min_child_weight=1, missing=nan,\n monotone_constraints='()', n_estimators=150, n_jobs=-1,\n num_parallel_tree=1, predictor='auto', random_state=0,\n reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=1,\n tree_method='exact', validate_parameters=1, verbosity=0),\n 11)"},"metadata":{}}]},{"cell_type":"code","source":"(model.score(X_clf_valid, y_clf_valid), \n model.predict(X_clf_valid).shape, \n model.transform(X_clf_valid).shape,\n model.predict_proba(X_clf_valid).shape)","metadata":{"execution":{"iopub.status.busy":"2022-01-01T11:50:50.810633Z","iopub.execute_input":"2022-01-01T11:50:50.811078Z","iopub.status.idle":"2022-01-01T11:50:50.834426Z","shell.execute_reply.started":"2022-01-01T11:50:50.811040Z","shell.execute_reply":"2022-01-01T11:50:50.833776Z"},"trusted":true},"execution_count":15,"outputs":[{"execution_count":15,"output_type":"execute_result","data":{"text/plain":"(0.9161111111111111, (1800,), (1800, 11), (1800, 2))"},"metadata":{}}]},{"cell_type":"code","source":"### RECURSIVE FEATURE ELIMINATION (RFE) ###\n\nmodel = BoostRFE(regr_xgb, min_features_to_select=1, step=1)\nmodel.fit(X_regr_train, y_regr_train, eval_set=[(X_regr_valid, y_regr_valid)], early_stopping_rounds=6, verbose=0)","metadata":{"execution":{"iopub.status.busy":"2022-01-01T11:50:50.835608Z","iopub.execute_input":"2022-01-01T11:50:50.836142Z","iopub.status.idle":"2022-01-01T11:50:58.558180Z","shell.execute_reply.started":"2022-01-01T11:50:50.836100Z","shell.execute_reply":"2022-01-01T11:50:58.557365Z"},"trusted":true},"execution_count":16,"outputs":[{"execution_count":16,"output_type":"execute_result","data":{"text/plain":"BoostRFE(estimator=XGBRegressor(base_score=None, booster=None,\n colsample_bylevel=None, colsample_bynode=None,\n colsample_bytree=None, enable_categorical=False,\n gamma=None, gpu_id=None, importance_type=None,\n interaction_constraints=None,\n learning_rate=None, max_delta_step=None,\n max_depth=None, min_child_weight=None,\n missing=nan, monotone_constraints=None,\n n_estimators=150, n_jobs=-1,\n num_parallel_tree=None, predictor=None,\n random_state=0, reg_alpha=None, reg_lambda=None,\n scale_pos_weight=None, subsample=None,\n tree_method=None, validate_parameters=None,\n verbosity=0),\n min_features_to_select=1)"},"metadata":{}}]},{"cell_type":"code","source":"model.estimator_, model.n_features_","metadata":{"execution":{"iopub.status.busy":"2022-01-01T11:50:58.559585Z","iopub.execute_input":"2022-01-01T11:50:58.560110Z","iopub.status.idle":"2022-01-01T11:50:58.569301Z","shell.execute_reply.started":"2022-01-01T11:50:58.560048Z","shell.execute_reply":"2022-01-01T11:50:58.568542Z"},"trusted":true},"execution_count":17,"outputs":[{"execution_count":17,"output_type":"execute_result","data":{"text/plain":"(XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,\n 
colsample_bynode=1, colsample_bytree=1, enable_categorical=False,\n gamma=0, gpu_id=-1, importance_type=None,\n interaction_constraints='', learning_rate=0.300000012,\n max_delta_step=0, max_depth=6, min_child_weight=1, missing=nan,\n monotone_constraints='()', n_estimators=150, n_jobs=-1,\n num_parallel_tree=1, predictor='auto', random_state=0, reg_alpha=0,\n reg_lambda=1, scale_pos_weight=1, subsample=1, tree_method='exact',\n validate_parameters=1, verbosity=0),\n 7)"},"metadata":{}}]},{"cell_type":"code","source":"(model.score(X_regr_valid, y_regr_valid), \n model.predict(X_regr_valid).shape, \n model.transform(X_regr_valid).shape)","metadata":{"execution":{"iopub.status.busy":"2022-01-01T11:50:58.570558Z","iopub.execute_input":"2022-01-01T11:50:58.570828Z","iopub.status.idle":"2022-01-01T11:50:58.584624Z","shell.execute_reply.started":"2022-01-01T11:50:58.570792Z","shell.execute_reply":"2022-01-01T11:50:58.584081Z"},"trusted":true},"execution_count":18,"outputs":[{"execution_count":18,"output_type":"execute_result","data":{"text/plain":"(0.7317444492376407, (1800,), (1800, 7))"},"metadata":{}}]},{"cell_type":"code","source":"### RECURSIVE FEATURE ADDITION (RFA) ###\n\nmodel = BoostRFA(regr_xgb, min_features_to_select=1, step=1)\nmodel.fit(X_regr_train, y_regr_train, eval_set=[(X_regr_valid, y_regr_valid)], early_stopping_rounds=6, verbose=0)","metadata":{"execution":{"iopub.status.busy":"2022-01-01T11:50:58.585749Z","iopub.execute_input":"2022-01-01T11:50:58.586163Z","iopub.status.idle":"2022-01-01T11:51:09.404587Z","shell.execute_reply.started":"2022-01-01T11:50:58.586126Z","shell.execute_reply":"2022-01-01T11:51:09.403781Z"},"trusted":true},"execution_count":19,"outputs":[{"execution_count":19,"output_type":"execute_result","data":{"text/plain":"BoostRFA(estimator=XGBRegressor(base_score=None, booster=None,\n colsample_bylevel=None, colsample_bynode=None,\n colsample_bytree=None, enable_categorical=False,\n gamma=None, gpu_id=None, importance_type=None,\n interaction_constraints=None,\n learning_rate=None, max_delta_step=None,\n max_depth=None, min_child_weight=None,\n missing=nan, monotone_constraints=None,\n n_estimators=150, n_jobs=-1,\n num_parallel_tree=None, predictor=None,\n random_state=0, reg_alpha=None, reg_lambda=None,\n scale_pos_weight=None, subsample=None,\n tree_method=None, validate_parameters=None,\n verbosity=0),\n min_features_to_select=1)"},"metadata":{}}]},{"cell_type":"code","source":"model.estimator_, model.n_features_","metadata":{"execution":{"iopub.status.busy":"2022-01-01T11:51:09.406057Z","iopub.execute_input":"2022-01-01T11:51:09.406434Z","iopub.status.idle":"2022-01-01T11:51:09.416068Z","shell.execute_reply.started":"2022-01-01T11:51:09.406399Z","shell.execute_reply":"2022-01-01T11:51:09.415411Z"},"trusted":true},"execution_count":20,"outputs":[{"execution_count":20,"output_type":"execute_result","data":{"text/plain":"(XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,\n colsample_bynode=1, colsample_bytree=1, enable_categorical=False,\n gamma=0, gpu_id=-1, importance_type=None,\n interaction_constraints='', learning_rate=0.300000012,\n max_delta_step=0, max_depth=6, min_child_weight=1, missing=nan,\n monotone_constraints='()', n_estimators=150, n_jobs=-1,\n num_parallel_tree=1, predictor='auto', random_state=0, reg_alpha=0,\n reg_lambda=1, scale_pos_weight=1, subsample=1, tree_method='exact',\n validate_parameters=1, verbosity=0),\n 8)"},"metadata":{}}]},{"cell_type":"code","source":"(model.score(X_regr_valid, y_regr_valid), \n 
model.predict(X_regr_valid).shape, \n model.transform(X_regr_valid).shape)","metadata":{"execution":{"iopub.status.busy":"2022-01-01T11:51:09.417248Z","iopub.execute_input":"2022-01-01T11:51:09.417698Z","iopub.status.idle":"2022-01-01T11:51:09.450280Z","shell.execute_reply.started":"2022-01-01T11:51:09.417657Z","shell.execute_reply":"2022-01-01T11:51:09.449664Z"},"trusted":true},"execution_count":21,"outputs":[{"execution_count":21,"output_type":"execute_result","data":{"text/plain":"(0.7274037362877257, (1800,), (1800, 8))"},"metadata":{}}]},{"cell_type":"markdown","source":"# Features Selection with SHAP","metadata":{}},{"cell_type":"code","source":"### BORUTA SHAP ###\n\nmodel = BoostBoruta(\n clf_xgb, max_iter=200, perc=100,\n importance_type='shap_importances', train_importance=False\n)\nmodel.fit(X_clf_train, y_clf_train, eval_set=[(X_clf_valid, y_clf_valid)], early_stopping_rounds=6, verbose=0)","metadata":{"execution":{"iopub.status.busy":"2022-01-01T11:51:09.451169Z","iopub.execute_input":"2022-01-01T11:51:09.451507Z","iopub.status.idle":"2022-01-01T11:51:33.925757Z","shell.execute_reply.started":"2022-01-01T11:51:09.451482Z","shell.execute_reply":"2022-01-01T11:51:33.925076Z"},"trusted":true},"execution_count":22,"outputs":[{"execution_count":22,"output_type":"execute_result","data":{"text/plain":"BoostBoruta(estimator=XGBClassifier(base_score=None, booster=None,\n colsample_bylevel=None,\n colsample_bynode=None,\n colsample_bytree=None,\n enable_categorical=False, gamma=None,\n gpu_id=None, importance_type=None,\n interaction_constraints=None,\n learning_rate=None, max_delta_step=None,\n max_depth=None, min_child_weight=None,\n missing=nan, monotone_constraints=None,\n n_estimators=150, n_jobs=-1,\n num_parallel_tree=None, predictor=None,\n random_state=0, reg_alpha=None,\n reg_lambda=None, scale_pos_weight=None,\n subsample=None, tree_method=None,\n validate_parameters=None, verbosity=0),\n importance_type='shap_importances', max_iter=200,\n train_importance=False)"},"metadata":{}}]},{"cell_type":"code","source":"model.estimator_, model.n_features_","metadata":{"execution":{"iopub.status.busy":"2022-01-01T11:51:33.926762Z","iopub.execute_input":"2022-01-01T11:51:33.926940Z","iopub.status.idle":"2022-01-01T11:51:33.934907Z","shell.execute_reply.started":"2022-01-01T11:51:33.926918Z","shell.execute_reply":"2022-01-01T11:51:33.934315Z"},"trusted":true},"execution_count":23,"outputs":[{"execution_count":23,"output_type":"execute_result","data":{"text/plain":"(XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,\n colsample_bynode=1, colsample_bytree=1, enable_categorical=False,\n gamma=0, gpu_id=-1, importance_type=None,\n interaction_constraints='', learning_rate=0.300000012,\n max_delta_step=0, max_depth=6, min_child_weight=1, missing=nan,\n monotone_constraints='()', n_estimators=150, n_jobs=-1,\n num_parallel_tree=1, predictor='auto', random_state=0,\n reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=1,\n tree_method='exact', validate_parameters=1, verbosity=0),\n 10)"},"metadata":{}}]},{"cell_type":"code","source":"(model.score(X_clf_valid, y_clf_valid), \n model.predict(X_clf_valid).shape, \n model.transform(X_clf_valid).shape,\n 
model.predict_proba(X_clf_valid).shape)","metadata":{"execution":{"iopub.status.busy":"2022-01-01T11:51:33.935950Z","iopub.execute_input":"2022-01-01T11:51:33.936419Z","iopub.status.idle":"2022-01-01T11:51:33.961319Z","shell.execute_reply.started":"2022-01-01T11:51:33.936381Z","shell.execute_reply":"2022-01-01T11:51:33.960533Z"},"trusted":true},"execution_count":24,"outputs":[{"execution_count":24,"output_type":"execute_result","data":{"text/plain":"(0.91, (1800,), (1800, 10), (1800, 2))"},"metadata":{}}]},{"cell_type":"code","source":"### RECURSIVE FEATURE ELIMINATION (RFE) SHAP ###\n\nmodel = BoostRFE(\n regr_xgb, min_features_to_select=1, step=1,\n importance_type='shap_importances', train_importance=False\n)\nmodel.fit(X_regr_train, y_regr_train, eval_set=[(X_regr_valid, y_regr_valid)], early_stopping_rounds=6, verbose=0)","metadata":{"execution":{"iopub.status.busy":"2022-01-01T11:51:33.962369Z","iopub.execute_input":"2022-01-01T11:51:33.962555Z","iopub.status.idle":"2022-01-01T11:51:47.059712Z","shell.execute_reply.started":"2022-01-01T11:51:33.962532Z","shell.execute_reply":"2022-01-01T11:51:47.058892Z"},"trusted":true},"execution_count":25,"outputs":[{"execution_count":25,"output_type":"execute_result","data":{"text/plain":"BoostRFE(estimator=XGBRegressor(base_score=None, booster=None,\n colsample_bylevel=None, colsample_bynode=None,\n colsample_bytree=None, enable_categorical=False,\n gamma=None, gpu_id=None, importance_type=None,\n interaction_constraints=None,\n learning_rate=None, max_delta_step=None,\n max_depth=None, min_child_weight=None,\n missing=nan, monotone_constraints=None,\n n_estimators=150, n_jobs=-1,\n num_parallel_tree=None, predictor=None,\n random_state=0, reg_alpha=None, reg_lambda=None,\n scale_pos_weight=None, subsample=None,\n tree_method=None, validate_parameters=None,\n verbosity=0),\n importance_type='shap_importances', min_features_to_select=1,\n train_importance=False)"},"metadata":{}}]},{"cell_type":"code","source":"model.estimator_, model.n_features_","metadata":{"execution":{"iopub.status.busy":"2022-01-01T11:51:47.060847Z","iopub.execute_input":"2022-01-01T11:51:47.061090Z","iopub.status.idle":"2022-01-01T11:51:47.069229Z","shell.execute_reply.started":"2022-01-01T11:51:47.061061Z","shell.execute_reply":"2022-01-01T11:51:47.068462Z"},"trusted":true},"execution_count":26,"outputs":[{"execution_count":26,"output_type":"execute_result","data":{"text/plain":"(XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,\n colsample_bynode=1, colsample_bytree=1, enable_categorical=False,\n gamma=0, gpu_id=-1, importance_type=None,\n interaction_constraints='', learning_rate=0.300000012,\n max_delta_step=0, max_depth=6, min_child_weight=1, missing=nan,\n monotone_constraints='()', n_estimators=150, n_jobs=-1,\n num_parallel_tree=1, predictor='auto', random_state=0, reg_alpha=0,\n reg_lambda=1, scale_pos_weight=1, subsample=1, tree_method='exact',\n validate_parameters=1, verbosity=0),\n 7)"},"metadata":{}}]},{"cell_type":"code","source":"(model.score(X_regr_valid, y_regr_valid), \n model.predict(X_regr_valid).shape, \n 
model.transform(X_regr_valid).shape)","metadata":{"execution":{"iopub.status.busy":"2022-01-01T11:51:47.070353Z","iopub.execute_input":"2022-01-01T11:51:47.071217Z","iopub.status.idle":"2022-01-01T11:51:47.087333Z","shell.execute_reply.started":"2022-01-01T11:51:47.071168Z","shell.execute_reply":"2022-01-01T11:51:47.086754Z"},"trusted":true},"execution_count":27,"outputs":[{"execution_count":27,"output_type":"execute_result","data":{"text/plain":"(0.7317444492376407, (1800,), (1800, 7))"},"metadata":{}}]},{"cell_type":"code","source":"### RECURSIVE FEATURE ADDITION (RFA) SHAP ###\n\nmodel = BoostRFA(\n regr_xgb, min_features_to_select=1, step=1,\n importance_type='shap_importances', train_importance=False\n)\nmodel.fit(\n X_regr_train, y_regr_train, trials=Trials(), \n eval_set=[(X_regr_valid, y_regr_valid)], early_stopping_rounds=6, verbose=0\n)","metadata":{"execution":{"iopub.status.busy":"2022-01-01T11:51:47.088455Z","iopub.execute_input":"2022-01-01T11:51:47.088921Z","iopub.status.idle":"2022-01-01T11:51:59.186202Z","shell.execute_reply.started":"2022-01-01T11:51:47.088885Z","shell.execute_reply":"2022-01-01T11:51:59.185431Z"},"trusted":true},"execution_count":28,"outputs":[{"execution_count":28,"output_type":"execute_result","data":{"text/plain":"BoostRFA(estimator=XGBRegressor(base_score=None, booster=None,\n colsample_bylevel=None, colsample_bynode=None,\n colsample_bytree=None, enable_categorical=False,\n gamma=None, gpu_id=None, importance_type=None,\n interaction_constraints=None,\n learning_rate=None, max_delta_step=None,\n max_depth=None, min_child_weight=None,\n missing=nan, monotone_constraints=None,\n n_estimators=150, n_jobs=-1,\n num_parallel_tree=None, predictor=None,\n random_state=0, reg_alpha=None, reg_lambda=None,\n scale_pos_weight=None, subsample=None,\n tree_method=None, validate_parameters=None,\n verbosity=0),\n importance_type='shap_importances', min_features_to_select=1,\n train_importance=False)"},"metadata":{}}]},{"cell_type":"code","source":"model.estimator_, model.n_features_","metadata":{"execution":{"iopub.status.busy":"2022-01-01T11:51:59.187276Z","iopub.execute_input":"2022-01-01T11:51:59.188081Z","iopub.status.idle":"2022-01-01T11:51:59.199276Z","shell.execute_reply.started":"2022-01-01T11:51:59.188004Z","shell.execute_reply":"2022-01-01T11:51:59.198325Z"},"trusted":true},"execution_count":29,"outputs":[{"execution_count":29,"output_type":"execute_result","data":{"text/plain":"(XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,\n colsample_bynode=1, colsample_bytree=1, enable_categorical=False,\n gamma=0, gpu_id=-1, importance_type=None,\n interaction_constraints='', learning_rate=0.300000012,\n max_delta_step=0, max_depth=6, min_child_weight=1, missing=nan,\n monotone_constraints='()', n_estimators=150, n_jobs=-1,\n num_parallel_tree=1, predictor='auto', random_state=0, reg_alpha=0,\n reg_lambda=1, scale_pos_weight=1, subsample=1, tree_method='exact',\n validate_parameters=1, verbosity=0),\n 9)"},"metadata":{}}]},{"cell_type":"code","source":"(model.score(X_regr_valid, y_regr_valid), \n model.predict(X_regr_valid).shape, \n 
model.transform(X_regr_valid).shape)","metadata":{"execution":{"iopub.status.busy":"2022-01-01T11:51:59.200366Z","iopub.execute_input":"2022-01-01T11:51:59.200640Z","iopub.status.idle":"2022-01-01T11:51:59.222774Z","shell.execute_reply.started":"2022-01-01T11:51:59.200592Z","shell.execute_reply":"2022-01-01T11:51:59.222078Z"},"trusted":true},"execution_count":30,"outputs":[{"execution_count":30,"output_type":"execute_result","data":{"text/plain":"(0.7249664284333042, (1800,), (1800, 9))"},"metadata":{}}]},{"cell_type":"markdown","source":"# Hyperparameters Tuning + Features Selection","metadata":{}},{"cell_type":"code","source":"### HYPERPARAM TUNING WITH GRID-SEARCH + BORUTA ###\n\nmodel = BoostBoruta(clf_xgb, param_grid=param_grid, max_iter=200, perc=100)\nmodel.fit(X_clf_train, y_clf_train, eval_set=[(X_clf_valid, y_clf_valid)], early_stopping_rounds=6, verbose=0)","metadata":{"execution":{"iopub.status.busy":"2022-01-01T11:51:59.224176Z","iopub.execute_input":"2022-01-01T11:51:59.224707Z","iopub.status.idle":"2022-01-01T12:14:09.045290Z","shell.execute_reply.started":"2022-01-01T11:51:59.224667Z","shell.execute_reply":"2022-01-01T12:14:09.044649Z"},"trusted":true},"execution_count":31,"outputs":[{"name":"stdout","text":"\n8 trials detected for ('learning_rate', 'num_leaves', 'max_depth')\n\ntrial: 0001 ### iterations: 00026 ### eval_score: 0.20001\ntrial: 0002 ### iterations: 00022 ### eval_score: 0.20348\ntrial: 0003 ### iterations: 00026 ### eval_score: 0.20001\ntrial: 0004 ### iterations: 00022 ### eval_score: 0.20348\ntrial: 0005 ### iterations: 00048 ### eval_score: 0.19925\ntrial: 0006 ### iterations: 00052 ### eval_score: 0.20307\ntrial: 0007 ### iterations: 00048 ### eval_score: 0.19925\ntrial: 0008 ### iterations: 00052 ### eval_score: 0.20307\n","output_type":"stream"},{"execution_count":31,"output_type":"execute_result","data":{"text/plain":"BoostBoruta(estimator=XGBClassifier(base_score=None, booster=None,\n colsample_bylevel=None,\n colsample_bynode=None,\n colsample_bytree=None,\n enable_categorical=False, gamma=None,\n gpu_id=None, importance_type=None,\n interaction_constraints=None,\n learning_rate=None, max_delta_step=None,\n max_depth=None, min_child_weight=None,\n missing=nan, monotone_constraints=None,\n n_estimators=150, n_jobs=-1,\n num_parallel_tree=None, predictor=None,\n random_state=0, reg_alpha=None,\n reg_lambda=None, scale_pos_weight=None,\n subsample=None, tree_method=None,\n validate_parameters=None, verbosity=0),\n max_iter=200,\n param_grid={'learning_rate': [0.2, 0.1], 'max_depth': [10, 12],\n 'num_leaves': [25, 35]})"},"metadata":{}}]},{"cell_type":"code","source":"model.estimator_, model.best_params_, model.best_score_, model.n_features_","metadata":{"execution":{"iopub.status.busy":"2022-01-01T12:14:09.046490Z","iopub.execute_input":"2022-01-01T12:14:09.047104Z","iopub.status.idle":"2022-01-01T12:14:09.056559Z","shell.execute_reply.started":"2022-01-01T12:14:09.047070Z","shell.execute_reply":"2022-01-01T12:14:09.056076Z"},"trusted":true},"execution_count":32,"outputs":[{"execution_count":32,"output_type":"execute_result","data":{"text/plain":"(XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,\n colsample_bynode=1, colsample_bytree=1, enable_categorical=False,\n gamma=0, gpu_id=-1, importance_type=None,\n interaction_constraints='', learning_rate=0.1, max_delta_step=0,\n max_depth=10, min_child_weight=1, missing=nan,\n monotone_constraints='()', n_estimators=150, n_jobs=-1,\n num_leaves=25, num_parallel_tree=1, predictor='auto',\n 
random_state=0, reg_alpha=0, reg_lambda=1, scale_pos_weight=1,\n subsample=1, tree_method='exact', validate_parameters=1,\n verbosity=0),\n {'learning_rate': 0.1, 'num_leaves': 25, 'max_depth': 10},\n 0.199248,\n 11)"},"metadata":{}}]},{"cell_type":"code","source":"(model.score(X_clf_valid, y_clf_valid), \n model.predict(X_clf_valid).shape, \n model.transform(X_clf_valid).shape,\n model.predict_proba(X_clf_valid).shape)","metadata":{"execution":{"iopub.status.busy":"2022-01-01T12:14:09.057462Z","iopub.execute_input":"2022-01-01T12:14:09.057740Z","iopub.status.idle":"2022-01-01T12:14:09.086612Z","shell.execute_reply.started":"2022-01-01T12:14:09.057716Z","shell.execute_reply":"2022-01-01T12:14:09.085920Z"},"trusted":true},"execution_count":33,"outputs":[{"execution_count":33,"output_type":"execute_result","data":{"text/plain":"(0.9144444444444444, (1800,), (1800, 11), (1800, 2))"},"metadata":{}}]},{"cell_type":"code","source":"### HYPERPARAM TUNING WITH RANDOM-SEARCH + RECURSIVE FEATURE ELIMINATION (RFE) ###\n\nmodel = BoostRFE(\n regr_xgb, param_grid=param_dist, min_features_to_select=1, step=1,\n n_iter=8, sampling_seed=0\n)\nmodel.fit(X_regr_train, y_regr_train, eval_set=[(X_regr_valid, y_regr_valid)], early_stopping_rounds=6, verbose=0)","metadata":{"execution":{"iopub.status.busy":"2022-01-01T12:14:09.087595Z","iopub.execute_input":"2022-01-01T12:14:09.087798Z","iopub.status.idle":"2022-01-01T12:16:42.203604Z","shell.execute_reply.started":"2022-01-01T12:14:09.087772Z","shell.execute_reply":"2022-01-01T12:16:42.202743Z"},"trusted":true},"execution_count":34,"outputs":[{"name":"stdout","text":"\n8 trials detected for ('learning_rate', 'num_leaves', 'max_depth')\n\ntrial: 0001 ### iterations: 00021 ### eval_score: 0.25941\ntrial: 0002 ### iterations: 00077 ### eval_score: 0.25055\ntrial: 0003 ### iterations: 00086 ### eval_score: 0.25676\ntrial: 0004 ### iterations: 00098 ### eval_score: 0.25383\ntrial: 0005 ### iterations: 00050 ### eval_score: 0.25751\ntrial: 0006 ### iterations: 00028 ### eval_score: 0.26007\ntrial: 0007 ### iterations: 00084 ### eval_score: 0.2603\ntrial: 0008 ### iterations: 00024 ### eval_score: 0.26278\n","output_type":"stream"},{"execution_count":34,"output_type":"execute_result","data":{"text/plain":"BoostRFE(estimator=XGBRegressor(base_score=None, booster=None,\n colsample_bylevel=None, colsample_bynode=None,\n colsample_bytree=None, enable_categorical=False,\n gamma=None, gpu_id=None, importance_type=None,\n interaction_constraints=None,\n learning_rate=None, max_delta_step=None,\n max_depth=None, min_child_weight=None,\n missing=nan, monotone_constraints=None,\n n_estimato...\n random_state=0, reg_alpha=None, reg_lambda=None,\n scale_pos_weight=None, subsample=None,\n tree_method=None, validate_parameters=None,\n verbosity=0),\n min_features_to_select=1, n_iter=8,\n param_grid={'learning_rate': ,\n 'max_depth': [10, 12],\n 'num_leaves': },\n sampling_seed=0)"},"metadata":{}}]},{"cell_type":"code","source":"model.estimator_, model.best_params_, model.best_score_, model.n_features_","metadata":{"execution":{"iopub.status.busy":"2022-01-01T12:16:42.205176Z","iopub.execute_input":"2022-01-01T12:16:42.205439Z","iopub.status.idle":"2022-01-01T12:16:42.215355Z","shell.execute_reply.started":"2022-01-01T12:16:42.205404Z","shell.execute_reply":"2022-01-01T12:16:42.214732Z"},"trusted":true},"execution_count":35,"outputs":[{"execution_count":35,"output_type":"execute_result","data":{"text/plain":"(XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,\n 
colsample_bynode=1, colsample_bytree=1, enable_categorical=False,\n gamma=0, gpu_id=-1, importance_type=None,\n interaction_constraints='', learning_rate=0.1350674222191923,\n max_delta_step=0, max_depth=10, min_child_weight=1, missing=nan,\n monotone_constraints='()', n_estimators=150, n_jobs=-1,\n num_leaves=38, num_parallel_tree=1, predictor='auto',\n random_state=0, reg_alpha=0, reg_lambda=1, scale_pos_weight=1,\n subsample=1, tree_method='exact', validate_parameters=1,\n verbosity=0),\n {'learning_rate': 0.1350674222191923, 'num_leaves': 38, 'max_depth': 10},\n 0.250552,\n 10)"},"metadata":{}}]},{"cell_type":"code","source":"(model.score(X_regr_valid, y_regr_valid), \n model.predict(X_regr_valid).shape, \n model.transform(X_regr_valid).shape)","metadata":{"execution":{"iopub.status.busy":"2022-01-01T12:16:42.216398Z","iopub.execute_input":"2022-01-01T12:16:42.216642Z","iopub.status.idle":"2022-01-01T12:16:42.242381Z","shell.execute_reply.started":"2022-01-01T12:16:42.216606Z","shell.execute_reply":"2022-01-01T12:16:42.241879Z"},"trusted":true},"execution_count":36,"outputs":[{"execution_count":36,"output_type":"execute_result","data":{"text/plain":"(0.7488873349293266, (1800,), (1800, 10))"},"metadata":{}}]},{"cell_type":"code","source":"### HYPERPARAM TUNING WITH HYPEROPT + RECURSIVE FEATURE ADDITION (RFA) ###\n\nmodel = BoostRFA(\n regr_xgb, param_grid=param_dist_hyperopt, min_features_to_select=1, step=1,\n n_iter=8, sampling_seed=0\n)\nmodel.fit(\n X_regr_train, y_regr_train, trials=Trials(), \n eval_set=[(X_regr_valid, y_regr_valid)], early_stopping_rounds=6, verbose=0\n)","metadata":{"execution":{"iopub.status.busy":"2022-01-01T12:16:42.245655Z","iopub.execute_input":"2022-01-01T12:16:42.247219Z","iopub.status.idle":"2022-01-01T12:26:08.685124Z","shell.execute_reply.started":"2022-01-01T12:16:42.247188Z","shell.execute_reply":"2022-01-01T12:26:08.684364Z"},"trusted":true},"execution_count":37,"outputs":[{"name":"stdout","text":"\n8 trials detected for ('max_depth', 'learning_rate', 'colsample_bytree')\n\ntrial: 0001 ### iterations: 00149 ### eval_score: 0.26412\ntrial: 0002 ### iterations: 00080 ### eval_score: 0.25357\ntrial: 0003 ### iterations: 00054 ### eval_score: 0.26123\ntrial: 0004 ### iterations: 00149 ### eval_score: 0.2801\ntrial: 0005 ### iterations: 00149 ### eval_score: 0.27046\ntrial: 0006 ### iterations: 00149 ### eval_score: 0.24789\ntrial: 0007 ### iterations: 00054 ### eval_score: 0.25928\ntrial: 0008 ### iterations: 00140 ### eval_score: 0.27284\n","output_type":"stream"},{"execution_count":37,"output_type":"execute_result","data":{"text/plain":"BoostRFA(estimator=XGBRegressor(base_score=None, booster=None,\n colsample_bylevel=None, colsample_bynode=None,\n colsample_bytree=None, enable_categorical=False,\n gamma=None, gpu_id=None, importance_type=None,\n interaction_constraints=None,\n learning_rate=None, max_delta_step=None,\n max_depth=None, min_child_weight=None,\n missing=nan, monotone_constraints=None,\n n_estimato...\n random_state=0, reg_alpha=None, reg_lambda=None,\n scale_pos_weight=None, subsample=None,\n tree_method=None, validate_parameters=None,\n verbosity=0),\n min_features_to_select=1, n_iter=8,\n param_grid={'colsample_bytree': ,\n 'learning_rate': ,\n 'max_depth': },\n sampling_seed=0)"},"metadata":{}}]},{"cell_type":"code","source":"model.estimator_, model.best_params_, model.best_score_, 
model.n_features_","metadata":{"execution":{"iopub.status.busy":"2022-01-01T12:26:08.686184Z","iopub.execute_input":"2022-01-01T12:26:08.686931Z","iopub.status.idle":"2022-01-01T12:26:08.696854Z","shell.execute_reply.started":"2022-01-01T12:26:08.686898Z","shell.execute_reply":"2022-01-01T12:26:08.696004Z"},"trusted":true},"execution_count":38,"outputs":[{"execution_count":38,"output_type":"execute_result","data":{"text/plain":"(XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,\n colsample_bynode=1, colsample_bytree=0.7597292534356749,\n enable_categorical=False, gamma=0, gpu_id=-1, importance_type=None,\n interaction_constraints='', learning_rate=0.059836658149176665,\n max_delta_step=0, max_depth=16, min_child_weight=1, missing=nan,\n monotone_constraints='()', n_estimators=150, n_jobs=-1,\n num_parallel_tree=1, predictor='auto', random_state=0, reg_alpha=0,\n reg_lambda=1, scale_pos_weight=1, subsample=1, tree_method='exact',\n validate_parameters=1, verbosity=0),\n {'colsample_bytree': 0.7597292534356749,\n 'learning_rate': 0.059836658149176665,\n 'max_depth': 16},\n 0.247887,\n 8)"},"metadata":{}}]},{"cell_type":"code","source":"(model.score(X_regr_valid, y_regr_valid), \n model.predict(X_regr_valid).shape, \n model.transform(X_regr_valid).shape)","metadata":{"execution":{"iopub.status.busy":"2022-01-01T12:26:08.697934Z","iopub.execute_input":"2022-01-01T12:26:08.698155Z","iopub.status.idle":"2022-01-01T12:26:08.736781Z","shell.execute_reply.started":"2022-01-01T12:26:08.698128Z","shell.execute_reply":"2022-01-01T12:26:08.736145Z"},"trusted":true},"execution_count":39,"outputs":[{"execution_count":39,"output_type":"execute_result","data":{"text/plain":"(0.7542006308661441, (1800,), (1800, 8))"},"metadata":{}}]},{"cell_type":"markdown","source":"# Hyperparameters Tuning + Features Selection with SHAP","metadata":{}},{"cell_type":"code","source":"### HYPERPARAM TUNING WITH GRID-SEARCH + BORUTA SHAP ###\n\nmodel = BoostBoruta(\n clf_xgb, param_grid=param_grid, max_iter=200, perc=100,\n importance_type='shap_importances', train_importance=False\n)\nmodel.fit(X_clf_train, y_clf_train, eval_set=[(X_clf_valid, y_clf_valid)], early_stopping_rounds=6, verbose=0)","metadata":{"scrolled":true,"execution":{"iopub.status.busy":"2022-01-01T12:26:08.740222Z","iopub.execute_input":"2022-01-01T12:26:08.741848Z","iopub.status.idle":"2022-01-01T12:56:13.612807Z","shell.execute_reply.started":"2022-01-01T12:26:08.741813Z","shell.execute_reply":"2022-01-01T12:56:13.611991Z"},"trusted":true},"execution_count":40,"outputs":[{"name":"stdout","text":"\n8 trials detected for ('learning_rate', 'num_leaves', 'max_depth')\n\ntrial: 0001 ### iterations: 00024 ### eval_score: 0.20151\ntrial: 0002 ### iterations: 00020 ### eval_score: 0.20877\ntrial: 0003 ### iterations: 00024 ### eval_score: 0.20151\ntrial: 0004 ### iterations: 00020 ### eval_score: 0.20877\ntrial: 0005 ### iterations: 00048 ### eval_score: 0.20401\ntrial: 0006 ### iterations: 00048 ### eval_score: 0.20575\ntrial: 0007 ### iterations: 00048 ### eval_score: 0.20401\ntrial: 0008 ### iterations: 00048 ### eval_score: 0.20575\n","output_type":"stream"},{"execution_count":40,"output_type":"execute_result","data":{"text/plain":"BoostBoruta(estimator=XGBClassifier(base_score=None, booster=None,\n colsample_bylevel=None,\n colsample_bynode=None,\n colsample_bytree=None,\n enable_categorical=False, gamma=None,\n gpu_id=None, importance_type=None,\n interaction_constraints=None,\n learning_rate=None, max_delta_step=None,\n max_depth=None, 
min_child_weight=None,\n missing=nan, monotone_constraints=None,\n n_estimators=150, n_jobs=-1,\n num_parallel_tree=None, predictor=None,\n random_state=0, reg_alpha=None,\n reg_lambda=None, scale_pos_weight=None,\n subsample=None, tree_method=None,\n validate_parameters=None, verbosity=0),\n importance_type='shap_importances', max_iter=200,\n param_grid={'learning_rate': [0.2, 0.1], 'max_depth': [10, 12],\n 'num_leaves': [25, 35]},\n train_importance=False)"},"metadata":{}}]},{"cell_type":"code","source":"model.estimator_, model.best_params_, model.best_score_, model.n_features_","metadata":{"execution":{"iopub.status.busy":"2022-01-01T12:56:13.617168Z","iopub.execute_input":"2022-01-01T12:56:13.617372Z","iopub.status.idle":"2022-01-01T12:56:13.626563Z","shell.execute_reply.started":"2022-01-01T12:56:13.617349Z","shell.execute_reply":"2022-01-01T12:56:13.626036Z"},"trusted":true},"execution_count":41,"outputs":[{"execution_count":41,"output_type":"execute_result","data":{"text/plain":"(XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,\n colsample_bynode=1, colsample_bytree=1, enable_categorical=False,\n gamma=0, gpu_id=-1, importance_type=None,\n interaction_constraints='', learning_rate=0.2, max_delta_step=0,\n max_depth=10, min_child_weight=1, missing=nan,\n monotone_constraints='()', n_estimators=150, n_jobs=-1,\n num_leaves=25, num_parallel_tree=1, predictor='auto',\n random_state=0, reg_alpha=0, reg_lambda=1, scale_pos_weight=1,\n subsample=1, tree_method='exact', validate_parameters=1,\n verbosity=0),\n {'learning_rate': 0.2, 'num_leaves': 25, 'max_depth': 10},\n 0.201509,\n 10)"},"metadata":{}}]},{"cell_type":"code","source":"(model.score(X_clf_valid, y_clf_valid), \n model.predict(X_clf_valid).shape, \n model.transform(X_clf_valid).shape,\n model.predict_proba(X_clf_valid).shape)","metadata":{"execution":{"iopub.status.busy":"2022-01-01T12:56:13.627454Z","iopub.execute_input":"2022-01-01T12:56:13.627825Z","iopub.status.idle":"2022-01-01T12:56:13.665907Z","shell.execute_reply.started":"2022-01-01T12:56:13.627797Z","shell.execute_reply":"2022-01-01T12:56:13.664686Z"},"trusted":true},"execution_count":42,"outputs":[{"execution_count":42,"output_type":"execute_result","data":{"text/plain":"(0.9144444444444444, (1800,), (1800, 10), (1800, 2))"},"metadata":{}}]},{"cell_type":"code","source":"### HYPERPARAM TUNING WITH RANDOM-SEARCH + RECURSIVE FEATURE ELIMINATION (RFE) SHAP ###\n\nmodel = BoostRFE(\n regr_xgb, param_grid=param_dist, min_features_to_select=1, step=1,\n n_iter=8, sampling_seed=0,\n importance_type='shap_importances', train_importance=False\n)\nmodel.fit(X_regr_train, y_regr_train, eval_set=[(X_regr_valid, y_regr_valid)], early_stopping_rounds=6, verbose=0)","metadata":{"execution":{"iopub.status.busy":"2022-01-01T12:56:13.667149Z","iopub.execute_input":"2022-01-01T12:56:13.667539Z","iopub.status.idle":"2022-01-01T13:08:38.854835Z","shell.execute_reply.started":"2022-01-01T12:56:13.667509Z","shell.execute_reply":"2022-01-01T13:08:38.854142Z"},"trusted":true},"execution_count":43,"outputs":[{"name":"stdout","text":"\n8 trials detected for ('learning_rate', 'num_leaves', 'max_depth')\n\ntrial: 0001 ### iterations: 00021 ### eval_score: 0.25941\ntrial: 0002 ### iterations: 00064 ### eval_score: 0.25075\ntrial: 0003 ### iterations: 00075 ### eval_score: 0.25493\ntrial: 0004 ### iterations: 00084 ### eval_score: 0.25002\ntrial: 0005 ### iterations: 00093 ### eval_score: 0.25609\ntrial: 0006 ### iterations: 00039 ### eval_score: 0.2573\ntrial: 0007 ### 
iterations: 00074 ### eval_score: 0.25348\ntrial: 0008 ### iterations: 00032 ### eval_score: 0.2583\n","output_type":"stream"},{"execution_count":43,"output_type":"execute_result","data":{"text/plain":"BoostRFE(estimator=XGBRegressor(base_score=None, booster=None,\n colsample_bylevel=None, colsample_bynode=None,\n colsample_bytree=None, enable_categorical=False,\n gamma=None, gpu_id=None, importance_type=None,\n interaction_constraints=None,\n learning_rate=None, max_delta_step=None,\n max_depth=None, min_child_weight=None,\n missing=nan, monotone_constraints=None,\n n_estimato...\n tree_method=None, validate_parameters=None,\n verbosity=0),\n importance_type='shap_importances', min_features_to_select=1, n_iter=8,\n param_grid={'learning_rate': ,\n 'max_depth': [10, 12],\n 'num_leaves': },\n sampling_seed=0, train_importance=False)"},"metadata":{}}]},{"cell_type":"code","source":"model.estimator_, model.best_params_, model.best_score_, model.n_features_","metadata":{"execution":{"iopub.status.busy":"2022-01-01T13:08:38.855807Z","iopub.execute_input":"2022-01-01T13:08:38.856007Z","iopub.status.idle":"2022-01-01T13:08:38.866421Z","shell.execute_reply.started":"2022-01-01T13:08:38.855982Z","shell.execute_reply":"2022-01-01T13:08:38.865771Z"},"trusted":true},"execution_count":44,"outputs":[{"execution_count":44,"output_type":"execute_result","data":{"text/plain":"(XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,\n colsample_bynode=1, colsample_bytree=1, enable_categorical=False,\n gamma=0, gpu_id=-1, importance_type=None,\n interaction_constraints='', learning_rate=0.1669837381562427,\n max_delta_step=0, max_depth=10, min_child_weight=1, missing=nan,\n monotone_constraints='()', n_estimators=150, n_jobs=-1,\n num_leaves=25, num_parallel_tree=1, predictor='auto',\n random_state=0, reg_alpha=0, reg_lambda=1, scale_pos_weight=1,\n subsample=1, tree_method='exact', validate_parameters=1,\n verbosity=0),\n {'learning_rate': 0.1669837381562427, 'num_leaves': 25, 'max_depth': 10},\n 0.250021,\n 11)"},"metadata":{}}]},{"cell_type":"code","source":"(model.score(X_regr_valid, y_regr_valid), \n model.predict(X_regr_valid).shape, \n model.transform(X_regr_valid).shape)","metadata":{"execution":{"iopub.status.busy":"2022-01-01T13:08:38.867249Z","iopub.execute_input":"2022-01-01T13:08:38.867888Z","iopub.status.idle":"2022-01-01T13:08:38.887178Z","shell.execute_reply.started":"2022-01-01T13:08:38.867860Z","shell.execute_reply":"2022-01-01T13:08:38.886666Z"},"trusted":true},"execution_count":45,"outputs":[{"execution_count":45,"output_type":"execute_result","data":{"text/plain":"(0.7499501426259738, (1800,), (1800, 11))"},"metadata":{}}]},{"cell_type":"code","source":"### HYPERPARAM TUNING WITH HYPEROPT + RECURSIVE FEATURE ADDITION (RFA) SHAP ###\n\nmodel = BoostRFA(\n regr_xgb, param_grid=param_dist_hyperopt, min_features_to_select=1, step=1,\n n_iter=8, sampling_seed=0,\n importance_type='shap_importances', train_importance=False\n)\nmodel.fit(\n X_regr_train, y_regr_train, trials=Trials(), \n eval_set=[(X_regr_valid, y_regr_valid)], early_stopping_rounds=6, verbose=0\n)","metadata":{"execution":{"iopub.status.busy":"2022-01-01T13:08:38.890197Z","iopub.execute_input":"2022-01-01T13:08:38.891876Z","iopub.status.idle":"2022-01-01T13:41:32.886109Z","shell.execute_reply.started":"2022-01-01T13:08:38.891845Z","shell.execute_reply":"2022-01-01T13:41:32.885257Z"},"trusted":true},"execution_count":46,"outputs":[{"name":"stdout","text":"\n8 trials detected for ('max_depth', 'learning_rate', 
'colsample_bytree')\n\ntrial: 0001 ### iterations: 00149 ### eval_score: 0.25811\ntrial: 0002 ### iterations: 00078 ### eval_score: 0.25554\ntrial: 0003 ### iterations: 00059 ### eval_score: 0.26658\ntrial: 0004 ### iterations: 00149 ### eval_score: 0.27356\ntrial: 0005 ### iterations: 00149 ### eval_score: 0.26426\ntrial: 0006 ### iterations: 00149 ### eval_score: 0.25537\ntrial: 0007 ### iterations: 00052 ### eval_score: 0.26107\ntrial: 0008 ### iterations: 00137 ### eval_score: 0.27787\n","output_type":"stream"},{"execution_count":46,"output_type":"execute_result","data":{"text/plain":"BoostRFA(estimator=XGBRegressor(base_score=None, booster=None,\n colsample_bylevel=None, colsample_bynode=None,\n colsample_bytree=None, enable_categorical=False,\n gamma=None, gpu_id=None, importance_type=None,\n interaction_constraints=None,\n learning_rate=None, max_delta_step=None,\n max_depth=None, min_child_weight=None,\n missing=nan, monotone_constraints=None,\n n_estimato...\n tree_method=None, validate_parameters=None,\n verbosity=0),\n importance_type='shap_importances', min_features_to_select=1, n_iter=8,\n param_grid={'colsample_bytree': ,\n 'learning_rate': ,\n 'max_depth': },\n sampling_seed=0, train_importance=False)"},"metadata":{}}]},{"cell_type":"code","source":"model.estimator_, model.best_params_, model.best_score_, model.n_features_","metadata":{"execution":{"iopub.status.busy":"2022-01-01T13:41:32.887300Z","iopub.execute_input":"2022-01-01T13:41:32.887495Z","iopub.status.idle":"2022-01-01T13:41:32.897203Z","shell.execute_reply.started":"2022-01-01T13:41:32.887472Z","shell.execute_reply":"2022-01-01T13:41:32.896455Z"},"trusted":true},"execution_count":47,"outputs":[{"execution_count":47,"output_type":"execute_result","data":{"text/plain":"(XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,\n colsample_bynode=1, colsample_bytree=0.7597292534356749,\n enable_categorical=False, gamma=0, gpu_id=-1, importance_type=None,\n interaction_constraints='', learning_rate=0.059836658149176665,\n max_delta_step=0, max_depth=16, min_child_weight=1, missing=nan,\n monotone_constraints='()', n_estimators=150, n_jobs=-1,\n num_parallel_tree=1, predictor='auto', random_state=0, reg_alpha=0,\n reg_lambda=1, scale_pos_weight=1, subsample=1, tree_method='exact',\n validate_parameters=1, verbosity=0),\n {'colsample_bytree': 0.7597292534356749,\n 'learning_rate': 0.059836658149176665,\n 'max_depth': 16},\n 0.255374,\n 11)"},"metadata":{}}]},{"cell_type":"code","source":"(model.score(X_regr_valid, y_regr_valid), \n model.predict(X_regr_valid).shape, \n model.transform(X_regr_valid).shape)","metadata":{"execution":{"iopub.status.busy":"2022-01-01T13:41:32.898201Z","iopub.execute_input":"2022-01-01T13:41:32.898493Z","iopub.status.idle":"2022-01-01T13:41:32.931801Z","shell.execute_reply.started":"2022-01-01T13:41:32.898469Z","shell.execute_reply":"2022-01-01T13:41:32.931131Z"},"trusted":true},"execution_count":48,"outputs":[{"execution_count":48,"output_type":"execute_result","data":{"text/plain":"(0.7391290836488575, (1800,), (1800, 11))"},"metadata":{}}]},{"cell_type":"markdown","source":"# CUSTOM EVAL METRIC SUPPORT","metadata":{}},{"cell_type":"code","source":"from sklearn.metrics import roc_auc_score\n\ndef AUC(y_hat, dtrain):\n y_true = dtrain.get_label()\n return 'auc', roc_auc_score(y_true, 
y_hat)","metadata":{"execution":{"iopub.status.busy":"2022-01-01T13:41:32.932773Z","iopub.execute_input":"2022-01-01T13:41:32.932979Z","iopub.status.idle":"2022-01-01T13:41:32.940277Z","shell.execute_reply.started":"2022-01-01T13:41:32.932952Z","shell.execute_reply":"2022-01-01T13:41:32.939659Z"},"trusted":true},"execution_count":49,"outputs":[]},{"cell_type":"code","source":"model = BoostRFE(\n clf_xgb, \n param_grid=param_grid, min_features_to_select=1, step=1,\n greater_is_better=True\n)\nmodel.fit(\n X_clf_train, y_clf_train, \n eval_set=[(X_clf_valid, y_clf_valid)], early_stopping_rounds=6, verbose=0,\n eval_metric=AUC\n)","metadata":{"execution":{"iopub.status.busy":"2022-01-01T13:41:32.943194Z","iopub.execute_input":"2022-01-01T13:41:32.944797Z","iopub.status.idle":"2022-01-01T13:43:50.574377Z","shell.execute_reply.started":"2022-01-01T13:41:32.944765Z","shell.execute_reply":"2022-01-01T13:43:50.573628Z"},"trusted":true},"execution_count":50,"outputs":[{"name":"stdout","text":"\n8 trials detected for ('learning_rate', 'num_leaves', 'max_depth')\n\ntrial: 0001 ### iterations: 00017 ### eval_score: 0.9757\ntrial: 0002 ### iterations: 00026 ### eval_score: 0.97632\ntrial: 0003 ### iterations: 00017 ### eval_score: 0.9757\ntrial: 0004 ### iterations: 00026 ### eval_score: 0.97632\ntrial: 0005 ### iterations: 00033 ### eval_score: 0.97594\ntrial: 0006 ### iterations: 00034 ### eval_score: 0.97577\ntrial: 0007 ### iterations: 00033 ### eval_score: 0.97594\ntrial: 0008 ### iterations: 00034 ### eval_score: 0.97577\n","output_type":"stream"},{"execution_count":50,"output_type":"execute_result","data":{"text/plain":"BoostRFE(estimator=XGBClassifier(base_score=None, booster=None,\n colsample_bylevel=None, colsample_bynode=None,\n colsample_bytree=None,\n enable_categorical=False, gamma=None,\n gpu_id=None, importance_type=None,\n interaction_constraints=None,\n learning_rate=None, max_delta_step=None,\n max_depth=None, min_child_weight=None,\n missing=nan, monotone_constraints=None,\n n_estimators=150, n_jobs=-1,\n num_parallel_tree=None, predictor=None,\n random_state=0, reg_alpha=None,\n reg_lambda=None, scale_pos_weight=None,\n subsample=None, tree_method=None,\n validate_parameters=None, verbosity=0),\n greater_is_better=True, min_features_to_select=1,\n param_grid={'learning_rate': [0.2, 0.1], 'max_depth': [10, 12],\n 'num_leaves': [25, 35]})"},"metadata":{}}]}]} -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | numpy 2 | scipy 3 | scikit-learn>=0.24.1 4 | shap>=0.39.0 5 | hyperopt==0.2.5 -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | import pathlib 2 | from setuptools import setup, find_packages 3 | 4 | HERE = pathlib.Path(__file__).parent 5 | 6 | VERSION = '0.2.7' 7 | PACKAGE_NAME = 'shap-hypetune' 8 | AUTHOR = 'Marco Cerliani' 9 | AUTHOR_EMAIL = 'cerlymarco@gmail.com' 10 | URL = 'https://github.com/cerlymarco/shap-hypetune' 11 | 12 | LICENSE = 'MIT' 13 | DESCRIPTION = 'A python package for simultaneous Hyperparameters Tuning and Features Selection for Gradient Boosting Models.' 
14 | LONG_DESCRIPTION = (HERE / "README.md").read_text()
15 | LONG_DESC_TYPE = "text/markdown"
16 |
17 | INSTALL_REQUIRES = [
18 | 'numpy',
19 | 'scipy',
20 | 'scikit-learn>=0.24.1',
21 | 'shap>=0.39.0',
22 | 'hyperopt==0.2.5'
23 | ]
24 |
25 | setup(name=PACKAGE_NAME,
26 | version=VERSION,
27 | description=DESCRIPTION,
28 | long_description=LONG_DESCRIPTION,
29 | long_description_content_type=LONG_DESC_TYPE,
30 | author=AUTHOR,
31 | license=LICENSE,
32 | author_email=AUTHOR_EMAIL,
33 | url=URL,
34 | install_requires=INSTALL_REQUIRES,
35 | python_requires='>=3.6',
36 | packages=find_packages()
37 | ) -------------------------------------------------------------------------------- /shaphypetune/__init__.py: --------------------------------------------------------------------------------
1 | from .utils import *
2 | from ._classes import *
3 | from .shaphypetune import * -------------------------------------------------------------------------------- /shaphypetune/_classes.py: --------------------------------------------------------------------------------
1 | import io
2 | import contextlib
3 | import warnings
4 | import numpy as np
5 | import scipy as sp
6 | from copy import deepcopy
7 |
8 | from sklearn.base import clone
9 | from sklearn.utils.validation import check_is_fitted
10 | from sklearn.base import BaseEstimator, TransformerMixin
11 |
12 | from joblib import Parallel, delayed
13 | from hyperopt import fmin, tpe
14 |
15 | from .utils import ParameterSampler, _check_param, _check_boosting
16 | from .utils import _set_categorical_indexes, _get_categorical_support
17 | from .utils import _feature_importances, _shap_importances
18 |
19 |
20 | class _BoostSearch(BaseEstimator):
21 | """Base class for BoostSearch meta-estimator.
22 |
23 | Warning: This class should not be used directly. Use derived classes
24 | instead.
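Derived classes provide a `_build_model(params)` hook that returns, for each parameter combination, the estimator to fit (optionally wrapped in a feature selector).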
25 | """ 26 | 27 | def __init__(self): 28 | pass 29 | 30 | def _validate_param_grid(self, fit_params): 31 | """Private method to validate fitting parameters.""" 32 | 33 | if not isinstance(self.param_grid, dict): 34 | raise ValueError("Pass param_grid in dict format.") 35 | self._param_grid = self.param_grid.copy() 36 | 37 | for p_k, p_v in self._param_grid.items(): 38 | self._param_grid[p_k] = _check_param(p_v) 39 | 40 | if 'eval_set' not in fit_params: 41 | raise ValueError( 42 | "When tuning parameters, at least " 43 | "a evaluation set is required.") 44 | 45 | self._eval_score = np.argmax if self.greater_is_better else np.argmin 46 | self._score_sign = -1 if self.greater_is_better else 1 47 | 48 | rs = ParameterSampler( 49 | n_iter=self.n_iter, 50 | param_distributions=self._param_grid, 51 | random_state=self.sampling_seed 52 | ) 53 | self._param_combi, self._tuning_type = rs.sample() 54 | self._trial_id = 1 55 | 56 | if self.verbose > 0: 57 | n_trials = self.n_iter if self._tuning_type is 'hyperopt' \ 58 | else len(self._param_combi) 59 | print("\n{} trials detected for {}\n".format( 60 | n_trials, tuple(self.param_grid.keys()))) 61 | 62 | def _fit(self, X, y, fit_params, params=None): 63 | """Private method to fit a single boosting model and extract results.""" 64 | 65 | model = self._build_model(params) 66 | if isinstance(model, _BoostSelector): 67 | model.fit(X=X, y=y, **fit_params) 68 | else: 69 | with contextlib.redirect_stdout(io.StringIO()): 70 | model.fit(X=X, y=y, **fit_params) 71 | 72 | results = {'params': params, 'status': 'ok'} 73 | 74 | if isinstance(model, _BoostSelector): 75 | results['booster'] = model.estimator_ 76 | results['model'] = model 77 | else: 78 | results['booster'] = model 79 | results['model'] = None 80 | 81 | if 'eval_set' not in fit_params: 82 | return results 83 | 84 | if self.boost_type_ == 'XGB': 85 | # w/ eval_set and w/ early_stopping_rounds 86 | if hasattr(results['booster'], 'best_score'): 87 | results['iterations'] = results['booster'].best_iteration 88 | # w/ eval_set and w/o early_stopping_rounds 89 | else: 90 | valid_id = list(results['booster'].evals_result_.keys())[-1] 91 | eval_metric = list(results['booster'].evals_result_[valid_id])[-1] 92 | results['iterations'] = \ 93 | len(results['booster'].evals_result_[valid_id][eval_metric]) 94 | else: 95 | # w/ eval_set and w/ early_stopping_rounds 96 | if results['booster'].best_iteration_ is not None: 97 | results['iterations'] = results['booster'].best_iteration_ 98 | # w/ eval_set and w/o early_stopping_rounds 99 | else: 100 | valid_id = list(results['booster'].evals_result_.keys())[-1] 101 | eval_metric = list(results['booster'].evals_result_[valid_id])[-1] 102 | results['iterations'] = \ 103 | len(results['booster'].evals_result_[valid_id][eval_metric]) 104 | 105 | if self.boost_type_ == 'XGB': 106 | # w/ eval_set and w/ early_stopping_rounds 107 | if hasattr(results['booster'], 'best_score'): 108 | results['loss'] = results['booster'].best_score 109 | # w/ eval_set and w/o early_stopping_rounds 110 | else: 111 | valid_id = list(results['booster'].evals_result_.keys())[-1] 112 | eval_metric = list(results['booster'].evals_result_[valid_id])[-1] 113 | results['loss'] = \ 114 | results['booster'].evals_result_[valid_id][eval_metric][-1] 115 | else: 116 | valid_id = list(results['booster'].best_score_.keys())[-1] 117 | eval_metric = list(results['booster'].best_score_[valid_id])[-1] 118 | results['loss'] = results['booster'].best_score_[valid_id][eval_metric] 119 | 120 | if params is not None: 
121 | if self.verbose > 0: 122 | msg = "trial: {} ### iterations: {} ### eval_score: {}".format( 123 | str(self._trial_id).zfill(4), 124 | str(results['iterations']).zfill(5), 125 | round(results['loss'], 5) 126 | ) 127 | print(msg) 128 | 129 | self._trial_id += 1 130 | results['loss'] *= self._score_sign 131 | 132 | return results 133 | 134 | def fit(self, X, y, trials=None, **fit_params): 135 | """Fit the provided boosting algorithm while searching the best subset 136 | of features (according to the selected strategy) and choosing the best 137 | parameters configuration (if provided). 138 | 139 | It takes the same arguments available in the estimator fit. 140 | 141 | Parameters 142 | ---------- 143 | X : array-like of shape (n_samples, n_features) 144 | The training input samples. 145 | 146 | y : array-like of shape (n_samples,) 147 | Target values. 148 | 149 | trials : hyperopt.Trials() object, default=None 150 | A hyperopt trials object, used to store intermediate results for all 151 | optimization runs. Effective (and required) only when hyperopt 152 | parameter searching is computed. 153 | 154 | **fit_params : Additional fitting arguments. 155 | 156 | Returns 157 | ------- 158 | self : object 159 | """ 160 | 161 | self.boost_type_ = _check_boosting(self.estimator) 162 | 163 | if self.param_grid is None: 164 | results = self._fit(X, y, fit_params) 165 | 166 | for v in vars(results['model']): 167 | if v.endswith("_") and not v.startswith("__"): 168 | setattr(self, str(v), getattr(results['model'], str(v))) 169 | 170 | else: 171 | self._validate_param_grid(fit_params) 172 | 173 | if self._tuning_type == 'hyperopt': 174 | if trials is None: 175 | raise ValueError( 176 | "trials must be not None when using hyperopt." 177 | ) 178 | 179 | search = fmin( 180 | fn=lambda p: self._fit( 181 | params=p, X=X, y=y, fit_params=fit_params 182 | ), 183 | space=self._param_combi, algo=tpe.suggest, 184 | max_evals=self.n_iter, trials=trials, 185 | rstate=np.random.RandomState(self.sampling_seed), 186 | show_progressbar=False, verbose=0 187 | ) 188 | all_results = trials.results 189 | 190 | else: 191 | all_results = Parallel( 192 | n_jobs=self.n_jobs, verbose=self.verbose * int(bool(self.n_jobs)) 193 | )(delayed(self._fit)(X, y, fit_params, params) 194 | for params in self._param_combi) 195 | 196 | # extract results from parallel loops 197 | self.trials_, self.iterations_, self.scores_, models = [], [], [], [] 198 | for job_res in all_results: 199 | self.trials_.append(job_res['params']) 200 | self.iterations_.append(job_res['iterations']) 201 | self.scores_.append(self._score_sign * job_res['loss']) 202 | if isinstance(job_res['model'], _BoostSelector): 203 | models.append(job_res['model']) 204 | else: 205 | models.append(job_res['booster']) 206 | 207 | # get the best 208 | id_best = self._eval_score(self.scores_) 209 | self.best_params_ = self.trials_[id_best] 210 | self.best_iter_ = self.iterations_[id_best] 211 | self.best_score_ = self.scores_[id_best] 212 | self.estimator_ = models[id_best] 213 | 214 | for v in vars(models[id_best]): 215 | if v.endswith("_") and not v.startswith("__"): 216 | setattr(self, str(v), getattr(models[id_best], str(v))) 217 | 218 | return self 219 | 220 | def predict(self, X, **predict_params): 221 | """Predict X. 222 | 223 | Parameters 224 | ---------- 225 | X : array-like of shape (n_samples, n_features) 226 | Samples. 227 | 228 | **predict_params : Additional predict arguments. 
229 | 230 | Returns 231 | ------- 232 | pred : ndarray of shape (n_samples,) 233 | The predicted values. 234 | """ 235 | 236 | check_is_fitted(self) 237 | 238 | if hasattr(self, 'transform'): 239 | X = self.transform(X) 240 | 241 | return self.estimator_.predict(X, **predict_params) 242 | 243 | def predict_proba(self, X, **predict_params): 244 | """Predict X probabilities. 245 | 246 | Parameters 247 | ---------- 248 | X : array-like of shape (n_samples, n_features) 249 | Samples. 250 | 251 | **predict_params : Additional predict arguments. 252 | 253 | Returns 254 | ------- 255 | pred : ndarray of shape (n_samples, n_classes) 256 | The predicted values. 257 | """ 258 | 259 | check_is_fitted(self) 260 | 261 | # raise original AttributeError 262 | getattr(self.estimator_, 'predict_proba') 263 | 264 | if hasattr(self, 'transform'): 265 | X = self.transform(X) 266 | 267 | return self.estimator_.predict_proba(X, **predict_params) 268 | 269 | def score(self, X, y, sample_weight=None): 270 | """Return the score on the given test data and labels. 271 | 272 | Parameters 273 | ---------- 274 | X : array-like of shape (n_samples, n_features) 275 | Test samples. 276 | 277 | y : array-like of shape (n_samples,) 278 | True values for X. 279 | 280 | sample_weight : array-like of shape (n_samples,), default=None 281 | Sample weights. 282 | 283 | Returns 284 | ------- 285 | score : float 286 | Accuracy for classification, R2 for regression. 287 | """ 288 | 289 | check_is_fitted(self) 290 | 291 | if hasattr(self, 'transform'): 292 | X = self.transform(X) 293 | 294 | return self.estimator_.score(X, y, sample_weight=sample_weight) 295 | 296 | 297 | class _BoostSelector(BaseEstimator, TransformerMixin): 298 | """Base class for feature selection meta-estimator. 299 | 300 | Warning: This class should not be used directly. Use derived classes 301 | instead. 302 | """ 303 | 304 | def __init__(self): 305 | pass 306 | 307 | def transform(self, X): 308 | """Reduces the input X to the features selected by Boruta. 309 | 310 | Parameters 311 | ---------- 312 | X : array-like of shape (n_samples, n_features) 313 | Samples. 314 | 315 | Returns 316 | ------- 317 | X : array-like of shape (n_samples, n_features_) 318 | The input samples with only the selected features by Boruta. 319 | """ 320 | 321 | check_is_fitted(self) 322 | 323 | shapes = np.shape(X) 324 | if len(shapes) != 2: 325 | raise ValueError("X must be 2D.") 326 | 327 | if shapes[1] != self.support_.shape[0]: 328 | raise ValueError( 329 | "Expected {} features, received {}.".format( 330 | self.support_.shape[0], shapes[1])) 331 | 332 | if isinstance(X, np.ndarray): 333 | return X[:, self.support_] 334 | elif hasattr(X, 'loc'): 335 | return X.loc[:, self.support_] 336 | else: 337 | raise ValueError("Data type not understood.") 338 | 339 | def get_support(self, indices=False): 340 | """Get a mask, or integer index, of the features selected. 341 | 342 | Parameters 343 | ---------- 344 | indices : bool, default=False 345 | If True, the return value will be an array of integers, rather 346 | than a boolean mask. 347 | 348 | Returns 349 | ------- 350 | support : array 351 | An index that selects the retained features from a feature vector. 352 | If `indices` is False, this is a boolean array of shape 353 | [# input features], in which an element is True iff its 354 | corresponding feature is selected for retention. If `indices` is 355 | True, this is an integer array of shape [# output features] whose 356 | values are indices into the input feature vector. 
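Examples
--------
An illustrative sketch for a fitted selector whose boolean mask
``support_`` equals ``array([ True, False,  True])``:

>>> selector.get_support()
array([ True, False,  True])
>>> selector.get_support(indices=True)
array([0, 2])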
357 | """ 358 | 359 | check_is_fitted(self) 360 | 361 | mask = self.support_ 362 | return mask if not indices else np.where(mask)[0] 363 | 364 | 365 | class _Boruta(_BoostSelector): 366 | """Base class for BoostBoruta meta-estimator. 367 | 368 | Warning: This class should not be used directly. Use derived classes 369 | instead. 370 | 371 | Notes 372 | ----- 373 | The code for the Boruta algorithm is inspired and improved from: 374 | https://github.com/scikit-learn-contrib/boruta_py 375 | """ 376 | 377 | def __init__(self, 378 | estimator, *, 379 | perc=100, 380 | alpha=0.05, 381 | max_iter=100, 382 | early_stopping_boruta_rounds=None, 383 | importance_type='feature_importances', 384 | train_importance=True, 385 | verbose=0): 386 | 387 | self.estimator = estimator 388 | self.perc = perc 389 | self.alpha = alpha 390 | self.max_iter = max_iter 391 | self.early_stopping_boruta_rounds = early_stopping_boruta_rounds 392 | self.importance_type = importance_type 393 | self.train_importance = train_importance 394 | self.verbose = verbose 395 | 396 | def _create_X(self, X, feat_id_real): 397 | """Private method to add shadow features to the original ones. """ 398 | 399 | if isinstance(X, np.ndarray): 400 | X_real = X[:, feat_id_real].copy() 401 | X_sha = X_real.copy() 402 | X_sha = np.apply_along_axis(self._random_state.permutation, 0, X_sha) 403 | 404 | X = np.hstack((X_real, X_sha)) 405 | 406 | elif hasattr(X, 'iloc'): 407 | X_real = X.iloc[:, feat_id_real].copy() 408 | X_sha = X_real.copy() 409 | X_sha = X_sha.apply(self._random_state.permutation) 410 | X_sha = X_sha.astype(X_real.dtypes) 411 | 412 | X = X_real.join(X_sha, rsuffix='_SHA') 413 | 414 | else: 415 | raise ValueError("Data type not understood.") 416 | 417 | return X 418 | 419 | def _check_fit_params(self, fit_params, feat_id_real=None): 420 | """Private method to validate and check fit_params.""" 421 | 422 | _fit_params = deepcopy(fit_params) 423 | estimator = clone(self.estimator) 424 | # add here possible estimator checks in each iteration 425 | 426 | _fit_params = _set_categorical_indexes( 427 | self.support_, self._cat_support, _fit_params, duplicate=True) 428 | 429 | if feat_id_real is None: # final model fit 430 | if 'eval_set' in _fit_params: 431 | _fit_params['eval_set'] = list(map(lambda x: ( 432 | self.transform(x[0]), x[1] 433 | ), _fit_params['eval_set'])) 434 | else: 435 | if 'eval_set' in _fit_params: # iterative model fit 436 | _fit_params['eval_set'] = list(map(lambda x: ( 437 | self._create_X(x[0], feat_id_real), x[1] 438 | ), _fit_params['eval_set'])) 439 | 440 | if 'feature_name' in _fit_params: # LGB 441 | _fit_params['feature_name'] = 'auto' 442 | 443 | if 'feature_weights' in _fit_params: # XGB import warnings 444 | warnings.warn( 445 | "feature_weights is not supported when selecting features. 
" 446 | "It's automatically set to None.") 447 | _fit_params['feature_weights'] = None 448 | 449 | return _fit_params, estimator 450 | 451 | def _do_tests(self, dec_reg, hit_reg, iter_id): 452 | """Private method to operate Bonferroni corrections on the feature 453 | selections.""" 454 | 455 | active_features = np.where(dec_reg >= 0)[0] 456 | hits = hit_reg[active_features] 457 | # get uncorrected p values based on hit_reg 458 | to_accept_ps = sp.stats.binom.sf(hits - 1, iter_id, .5).flatten() 459 | to_reject_ps = sp.stats.binom.cdf(hits, iter_id, .5).flatten() 460 | 461 | # Bonferroni correction with the total n_features in each iteration 462 | to_accept = to_accept_ps <= self.alpha / float(len(dec_reg)) 463 | to_reject = to_reject_ps <= self.alpha / float(len(dec_reg)) 464 | 465 | # find features which are 0 and have been rejected or accepted 466 | to_accept = np.where((dec_reg[active_features] == 0) * to_accept)[0] 467 | to_reject = np.where((dec_reg[active_features] == 0) * to_reject)[0] 468 | 469 | # updating dec_reg 470 | dec_reg[active_features[to_accept]] = 1 471 | dec_reg[active_features[to_reject]] = -1 472 | 473 | return dec_reg 474 | 475 | def fit(self, X, y, **fit_params): 476 | """Fit the Boruta algorithm to automatically tune 477 | the number of selected features.""" 478 | 479 | self.boost_type_ = _check_boosting(self.estimator) 480 | 481 | if self.max_iter < 1: 482 | raise ValueError('max_iter should be an integer >0.') 483 | 484 | if self.perc <= 0 or self.perc > 100: 485 | raise ValueError('The percentile should be between 0 and 100.') 486 | 487 | if self.alpha <= 0 or self.alpha > 1: 488 | raise ValueError('alpha should be between 0 and 1.') 489 | 490 | if self.early_stopping_boruta_rounds is None: 491 | es_boruta_rounds = self.max_iter 492 | else: 493 | if self.early_stopping_boruta_rounds < 1: 494 | raise ValueError( 495 | 'early_stopping_boruta_rounds should be an integer >0.') 496 | es_boruta_rounds = self.early_stopping_boruta_rounds 497 | 498 | importances = ['feature_importances', 'shap_importances'] 499 | if self.importance_type not in importances: 500 | raise ValueError( 501 | "importance_type must be one of {}. Get '{}'".format( 502 | importances, self.importance_type)) 503 | 504 | if self.importance_type == 'shap_importances': 505 | if not self.train_importance and not 'eval_set' in fit_params: 506 | raise ValueError( 507 | "When train_importance is set to False, using " 508 | "shap_importances, pass at least a eval_set.") 509 | eval_importance = not self.train_importance and 'eval_set' in fit_params 510 | 511 | shapes = np.shape(X) 512 | if len(shapes) != 2: 513 | raise ValueError("X must be 2D.") 514 | n_features = shapes[1] 515 | 516 | # create mask for user-defined categorical features 517 | self._cat_support = _get_categorical_support(n_features, fit_params) 518 | 519 | # holds the decision about each feature: 520 | # default (0); accepted (1); rejected (-1) 521 | dec_reg = np.zeros(n_features, dtype=int) 522 | dec_history = np.zeros((self.max_iter, n_features), dtype=int) 523 | # counts how many times a given feature was more important than 524 | # the best of the shadow features 525 | hit_reg = np.zeros(n_features, dtype=int) 526 | # record the history of the iterations 527 | imp_history = np.zeros(n_features, dtype=float) 528 | sha_max_history = [] 529 | 530 | for i in range(self.max_iter): 531 | if (dec_reg != 0).all(): 532 | if self.verbose > 1: 533 | print("All Features analyzed. 
Boruta stop!") 534 | break 535 | 536 | if self.verbose > 1: 537 | print('Iteration: {} / {}'.format(i + 1, self.max_iter)) 538 | 539 | self._random_state = np.random.RandomState(i + 1000) 540 | 541 | # add shadow attributes, shuffle and train estimator 542 | self.support_ = dec_reg >= 0 543 | feat_id_real = np.where(self.support_)[0] 544 | n_real = feat_id_real.shape[0] 545 | _fit_params, estimator = self._check_fit_params(fit_params, feat_id_real) 546 | estimator.set_params(random_state=i + 1000) 547 | _X = self._create_X(X, feat_id_real) 548 | with contextlib.redirect_stdout(io.StringIO()): 549 | estimator.fit(_X, y, **_fit_params) 550 | 551 | # get coefs 552 | if self.importance_type == 'feature_importances': 553 | coefs = _feature_importances(estimator) 554 | else: 555 | if eval_importance: 556 | coefs = _shap_importances( 557 | estimator, _fit_params['eval_set'][-1][0]) 558 | else: 559 | coefs = _shap_importances(estimator, _X) 560 | 561 | # separate importances of real and shadow features 562 | imp_sha = coefs[n_real:] 563 | imp_real = np.zeros(n_features) * np.nan 564 | imp_real[feat_id_real] = coefs[:n_real] 565 | 566 | # get the threshold of shadow importances used for rejection 567 | imp_sha_max = np.percentile(imp_sha, self.perc) 568 | 569 | # record importance history 570 | sha_max_history.append(imp_sha_max) 571 | imp_history = np.vstack((imp_history, imp_real)) 572 | 573 | # register which feature is more imp than the max of shadows 574 | hit_reg[np.where(imp_real[~np.isnan(imp_real)] > imp_sha_max)[0]] += 1 575 | 576 | # check if a feature is doing better than expected by chance 577 | dec_reg = self._do_tests(dec_reg, hit_reg, i + 1) 578 | dec_history[i] = dec_reg 579 | 580 | es_id = i - es_boruta_rounds 581 | if es_id >= 0: 582 | if np.equal(dec_history[es_id:(i + 1)], dec_reg).all(): 583 | if self.verbose > 0: 584 | print("Boruta early stopping at iteration {}".format(i + 1)) 585 | break 586 | 587 | confirmed = np.where(dec_reg == 1)[0] 588 | tentative = np.where(dec_reg == 0)[0] 589 | 590 | self.support_ = np.zeros(n_features, dtype=bool) 591 | self.ranking_ = np.ones(n_features, dtype=int) * 4 592 | self.n_features_ = confirmed.shape[0] 593 | self.importance_history_ = imp_history[1:] 594 | 595 | if tentative.shape[0] > 0: 596 | tentative_median = np.nanmedian(imp_history[1:, tentative], axis=0) 597 | tentative_low = tentative[ 598 | np.where(tentative_median <= np.median(sha_max_history))[0]] 599 | tentative_up = np.setdiff1d(tentative, tentative_low) 600 | 601 | self.ranking_[tentative_low] = 3 602 | if tentative_up.shape[0] > 0: 603 | self.ranking_[tentative_up] = 2 604 | 605 | if confirmed.shape[0] > 0: 606 | self.support_[confirmed] = True 607 | self.ranking_[confirmed] = 1 608 | 609 | if (~self.support_).all(): 610 | raise RuntimeError( 611 | "Boruta didn't select any feature. Try to increase max_iter or " 612 | "increase (if not None) early_stopping_boruta_rounds or " 613 | "decrese perc.") 614 | 615 | _fit_params, self.estimator_ = self._check_fit_params(fit_params) 616 | with contextlib.redirect_stdout(io.StringIO()): 617 | self.estimator_.fit(self.transform(X), y, **_fit_params) 618 | 619 | return self 620 | 621 | 622 | class _RFE(_BoostSelector): 623 | """Base class for BoostRFE meta-estimator. 624 | 625 | Warning: This class should not be used directly. Use derived classes 626 | instead. 
627 | """ 628 | 629 | def __init__(self, 630 | estimator, *, 631 | min_features_to_select=None, 632 | step=1, 633 | greater_is_better=False, 634 | importance_type='feature_importances', 635 | train_importance=True, 636 | verbose=0): 637 | 638 | self.estimator = estimator 639 | self.min_features_to_select = min_features_to_select 640 | self.step = step 641 | self.greater_is_better = greater_is_better 642 | self.importance_type = importance_type 643 | self.train_importance = train_importance 644 | self.verbose = verbose 645 | 646 | def _check_fit_params(self, fit_params): 647 | """Private method to validate and check fit_params.""" 648 | 649 | _fit_params = deepcopy(fit_params) 650 | estimator = clone(self.estimator) 651 | # add here possible estimator checks in each iteration 652 | 653 | _fit_params = _set_categorical_indexes( 654 | self.support_, self._cat_support, _fit_params) 655 | 656 | if 'eval_set' in _fit_params: 657 | _fit_params['eval_set'] = list(map(lambda x: ( 658 | self.transform(x[0]), x[1] 659 | ), _fit_params['eval_set'])) 660 | 661 | if 'feature_name' in _fit_params: # LGB 662 | _fit_params['feature_name'] = 'auto' 663 | 664 | if 'feature_weights' in _fit_params: # XGB import warnings 665 | warnings.warn( 666 | "feature_weights is not supported when selecting features. " 667 | "It's automatically set to None.") 668 | _fit_params['feature_weights'] = None 669 | 670 | return _fit_params, estimator 671 | 672 | def _step_score(self, estimator): 673 | """Return the score for a fit on eval_set.""" 674 | 675 | if self.boost_type_ == 'LGB': 676 | valid_id = list(estimator.best_score_.keys())[-1] 677 | eval_metric = list(estimator.best_score_[valid_id])[-1] 678 | score = estimator.best_score_[valid_id][eval_metric] 679 | else: 680 | # w/ eval_set and w/ early_stopping_rounds 681 | if hasattr(estimator, 'best_score'): 682 | score = estimator.best_score 683 | # w/ eval_set and w/o early_stopping_rounds 684 | else: 685 | valid_id = list(estimator.evals_result_.keys())[-1] 686 | eval_metric = list(estimator.evals_result_[valid_id])[-1] 687 | score = estimator.evals_result_[valid_id][eval_metric][-1] 688 | 689 | return score 690 | 691 | def fit(self, X, y, **fit_params): 692 | """Fit the RFE algorithm to automatically tune 693 | the number of selected features.""" 694 | 695 | self.boost_type_ = _check_boosting(self.estimator) 696 | 697 | importances = ['feature_importances', 'shap_importances'] 698 | if self.importance_type not in importances: 699 | raise ValueError( 700 | "importance_type must be one of {}. 
Get '{}'".format( 701 | importances, self.importance_type)) 702 | 703 | # scoring controls the calculation of self.score_history_ 704 | # scoring is used automatically when 'eval_set' is in fit_params 705 | scoring = 'eval_set' in fit_params 706 | if self.importance_type == 'shap_importances': 707 | if not self.train_importance and not scoring: 708 | raise ValueError( 709 | "When train_importance is set to False, using " 710 | "shap_importances, pass at least a eval_set.") 711 | eval_importance = not self.train_importance and scoring 712 | 713 | shapes = np.shape(X) 714 | if len(shapes) != 2: 715 | raise ValueError("X must be 2D.") 716 | n_features = shapes[1] 717 | 718 | # create mask for user-defined categorical features 719 | self._cat_support = _get_categorical_support(n_features, fit_params) 720 | 721 | if self.min_features_to_select is None: 722 | if scoring: 723 | min_features_to_select = 1 724 | else: 725 | min_features_to_select = n_features // 2 726 | else: 727 | min_features_to_select = self.min_features_to_select 728 | 729 | if 0.0 < self.step < 1.0: 730 | step = int(max(1, self.step * n_features)) 731 | else: 732 | step = int(self.step) 733 | if step <= 0: 734 | raise ValueError("Step must be >0.") 735 | 736 | self.support_ = np.ones(n_features, dtype=bool) 737 | self.ranking_ = np.ones(n_features, dtype=int) 738 | if scoring: 739 | self.score_history_ = [] 740 | eval_score = np.max if self.greater_is_better else np.min 741 | best_score = -np.inf if self.greater_is_better else np.inf 742 | 743 | while np.sum(self.support_) > min_features_to_select: 744 | # remaining features 745 | features = np.arange(n_features)[self.support_] 746 | _fit_params, estimator = self._check_fit_params(fit_params) 747 | 748 | if self.verbose > 1: 749 | print("Fitting estimator with {} features".format( 750 | self.support_.sum())) 751 | with contextlib.redirect_stdout(io.StringIO()): 752 | estimator.fit(self.transform(X), y, **_fit_params) 753 | 754 | # get coefs 755 | if self.importance_type == 'feature_importances': 756 | coefs = _feature_importances(estimator) 757 | else: 758 | if eval_importance: 759 | coefs = _shap_importances( 760 | estimator, _fit_params['eval_set'][-1][0]) 761 | else: 762 | coefs = _shap_importances( 763 | estimator, self.transform(X)) 764 | ranks = np.argsort(coefs) 765 | 766 | # eliminate the worse features 767 | threshold = min(step, np.sum(self.support_) - min_features_to_select) 768 | 769 | # compute step score on the previous selection iteration 770 | # because 'estimator' must use features 771 | # that have not been eliminated yet 772 | if scoring: 773 | score = self._step_score(estimator) 774 | self.score_history_.append(score) 775 | if best_score != eval_score([score, best_score]): 776 | best_score = score 777 | best_support = self.support_.copy() 778 | best_ranking = self.ranking_.copy() 779 | best_estimator = estimator 780 | 781 | self.support_[features[ranks][:threshold]] = False 782 | self.ranking_[np.logical_not(self.support_)] += 1 783 | 784 | # set final attributes 785 | _fit_params, self.estimator_ = self._check_fit_params(fit_params) 786 | if self.verbose > 1: 787 | print("Fitting estimator with {} features".format(self.support_.sum())) 788 | with contextlib.redirect_stdout(io.StringIO()): 789 | self.estimator_.fit(self.transform(X), y, **_fit_params) 790 | 791 | # compute step score when only min_features_to_select features left 792 | if scoring: 793 | score = self._step_score(self.estimator_) 794 | self.score_history_.append(score) 795 | if best_score == 
eval_score([score, best_score]): 796 | self.support_ = best_support 797 | self.ranking_ = best_ranking 798 | self.estimator_ = best_estimator 799 | self.n_features_ = self.support_.sum() 800 | 801 | return self 802 | 803 | 804 | class _RFA(_BoostSelector): 805 | """Base class for BoostRFA meta-estimator. 806 | 807 | Warning: This class should not be used directly. Use derived classes 808 | instead. 809 | """ 810 | 811 | def __init__(self, 812 | estimator, *, 813 | min_features_to_select=None, 814 | step=1, 815 | greater_is_better=False, 816 | importance_type='feature_importances', 817 | train_importance=True, 818 | verbose=0): 819 | 820 | self.estimator = estimator 821 | self.min_features_to_select = min_features_to_select 822 | self.step = step 823 | self.greater_is_better = greater_is_better 824 | self.importance_type = importance_type 825 | self.train_importance = train_importance 826 | self.verbose = verbose 827 | 828 | def _check_fit_params(self, fit_params, inverse=False): 829 | """Private method to validate and check fit_params.""" 830 | 831 | _fit_params = deepcopy(fit_params) 832 | estimator = clone(self.estimator) 833 | # add here possible estimator checks in each iteration 834 | 835 | _fit_params = _set_categorical_indexes( 836 | self.support_, self._cat_support, _fit_params) 837 | 838 | if 'eval_set' in _fit_params: 839 | _fit_params['eval_set'] = list(map(lambda x: ( 840 | self._transform(x[0], inverse), x[1] 841 | ), _fit_params['eval_set'])) 842 | 843 | if 'feature_name' in _fit_params: # LGB 844 | _fit_params['feature_name'] = 'auto' 845 | 846 | if 'feature_weights' in _fit_params: # XGB import warnings 847 | warnings.warn( 848 | "feature_weights is not supported when selecting features. " 849 | "It's automatically set to None.") 850 | _fit_params['feature_weights'] = None 851 | 852 | return _fit_params, estimator 853 | 854 | def _step_score(self, estimator): 855 | """Return the score for a fit on eval_set.""" 856 | 857 | if self.boost_type_ == 'LGB': 858 | valid_id = list(estimator.best_score_.keys())[-1] 859 | eval_metric = list(estimator.best_score_[valid_id])[-1] 860 | score = estimator.best_score_[valid_id][eval_metric] 861 | else: 862 | # w/ eval_set and w/ early_stopping_rounds 863 | if hasattr(estimator, 'best_score'): 864 | score = estimator.best_score 865 | # w/ eval_set and w/o early_stopping_rounds 866 | else: 867 | valid_id = list(estimator.evals_result_.keys())[-1] 868 | eval_metric = list(estimator.evals_result_[valid_id])[-1] 869 | score = estimator.evals_result_[valid_id][eval_metric][-1] 870 | 871 | return score 872 | 873 | def fit(self, X, y, **fit_params): 874 | """Fit the RFA algorithm to automatically tune 875 | the number of selected features.""" 876 | 877 | self.boost_type_ = _check_boosting(self.estimator) 878 | 879 | importances = ['feature_importances', 'shap_importances'] 880 | if self.importance_type not in importances: 881 | raise ValueError( 882 | "importance_type must be one of {}. 
Get '{}'".format( 883 | importances, self.importance_type)) 884 | 885 | # scoring controls the calculation of self.score_history_ 886 | # scoring is used automatically when 'eval_set' is in fit_params 887 | scoring = 'eval_set' in fit_params 888 | if self.importance_type == 'shap_importances': 889 | if not self.train_importance and not scoring: 890 | raise ValueError( 891 | "When train_importance is set to False, using " 892 | "shap_importances, pass at least a eval_set.") 893 | eval_importance = not self.train_importance and scoring 894 | 895 | shapes = np.shape(X) 896 | if len(shapes) != 2: 897 | raise ValueError("X must be 2D.") 898 | n_features = shapes[1] 899 | 900 | # create mask for user-defined categorical features 901 | self._cat_support = _get_categorical_support(n_features, fit_params) 902 | 903 | if self.min_features_to_select is None: 904 | if scoring: 905 | min_features_to_select = 1 906 | else: 907 | min_features_to_select = n_features // 2 908 | else: 909 | if scoring: 910 | min_features_to_select = self.min_features_to_select 911 | else: 912 | min_features_to_select = n_features - self.min_features_to_select 913 | 914 | if 0.0 < self.step < 1.0: 915 | step = int(max(1, self.step * n_features)) 916 | else: 917 | step = int(self.step) 918 | if step <= 0: 919 | raise ValueError("Step must be >0.") 920 | 921 | self.support_ = np.zeros(n_features, dtype=bool) 922 | self._support = np.ones(n_features, dtype=bool) 923 | self.ranking_ = np.ones(n_features, dtype=int) 924 | self._ranking = np.ones(n_features, dtype=int) 925 | if scoring: 926 | self.score_history_ = [] 927 | eval_score = np.max if self.greater_is_better else np.min 928 | best_score = -np.inf if self.greater_is_better else np.inf 929 | 930 | while np.sum(self._support) > min_features_to_select: 931 | # remaining features 932 | features = np.arange(n_features)[self._support] 933 | 934 | # scoring the previous added features 935 | if scoring and np.sum(self.support_) > 0: 936 | _fit_params, estimator = self._check_fit_params(fit_params) 937 | with contextlib.redirect_stdout(io.StringIO()): 938 | estimator.fit(self._transform(X, inverse=False), y, **_fit_params) 939 | score = self._step_score(estimator) 940 | self.score_history_.append(score) 941 | if best_score != eval_score([score, best_score]): 942 | best_score = score 943 | best_support = self.support_.copy() 944 | best_ranking = self.ranking_.copy() 945 | best_estimator = estimator 946 | 947 | # evaluate the remaining features 948 | _fit_params, _estimator = self._check_fit_params(fit_params, inverse=True) 949 | if self.verbose > 1: 950 | print("Fitting estimator with {} features".format(self._support.sum())) 951 | with contextlib.redirect_stdout(io.StringIO()): 952 | _estimator.fit(self._transform(X, inverse=True), y, **_fit_params) 953 | if self._support.sum() == n_features: 954 | all_features_estimator = _estimator 955 | 956 | # get coefs 957 | if self.importance_type == 'feature_importances': 958 | coefs = _feature_importances(_estimator) 959 | else: 960 | if eval_importance: 961 | coefs = _shap_importances( 962 | _estimator, _fit_params['eval_set'][-1][0]) 963 | else: 964 | coefs = _shap_importances( 965 | _estimator, self._transform(X, inverse=True)) 966 | ranks = np.argsort(-coefs) # the rank is inverted 967 | 968 | # add the best features 969 | threshold = min(step, np.sum(self._support) - min_features_to_select) 970 | 971 | # remaining features to test 972 | self._support[features[ranks][:threshold]] = False 973 | 
self._ranking[np.logical_not(self._support)] += 1
974 | # features tested
975 | self.support_[features[ranks][:threshold]] = True
976 | self.ranking_[np.logical_not(self.support_)] += 1
977 |
978 | # set final attributes
979 | _fit_params, self.estimator_ = self._check_fit_params(fit_params)
980 | if self.verbose > 1:
981 | print("Fitting estimator with {} features".format(self._support.sum()))
982 | with contextlib.redirect_stdout(io.StringIO()):
983 | self.estimator_.fit(self._transform(X, inverse=False), y, **_fit_params)
984 |
985 | # compute step score when only min_features_to_select features left
986 | if scoring:
987 | score = self._step_score(self.estimator_)
988 | self.score_history_.append(score)
989 | if best_score == eval_score([score, best_score]):
990 | self.support_ = best_support
991 | self.ranking_ = best_ranking
992 | self.estimator_ = best_estimator
993 |
994 | if len(set(self.score_history_)) == 1:
995 | self.support_ = np.ones(n_features, dtype=bool)
996 | self.ranking_ = np.ones(n_features, dtype=int)
997 | self.estimator_ = all_features_estimator
998 | self.n_features_ = self.support_.sum()
999 |
1000 | return self
1001 |
1002 | def _transform(self, X, inverse=False):
1003 | """Private method to reduce the input X to the features selected."""
1004 |
1005 | shapes = np.shape(X)
1006 | if len(shapes) != 2:
1007 | raise ValueError("X must be 2D.")
1008 |
1009 | if shapes[1] != self.support_.shape[0]:
1010 | raise ValueError(
1011 | "Expected {} features, received {}.".format(
1012 | self.support_.shape[0], shapes[1]))
1013 |
1014 | if inverse:
1015 | if isinstance(X, np.ndarray):
1016 | return X[:, self._support]
1017 | elif hasattr(X, 'loc'):
1018 | return X.loc[:, self._support]
1019 | elif sp.sparse.issparse(X):
1020 | return X[:, self._support]
1021 | else:
1022 | raise ValueError("Data type not understood.")
1023 | else:
1024 | if isinstance(X, np.ndarray):
1025 | return X[:, self.support_]
1026 | elif hasattr(X, 'loc'):
1027 | return X.loc[:, self.support_]
1028 | elif sp.sparse.issparse(X):
1029 | return X[:, self.support_]
1030 | else:
1031 | raise ValueError("Data type not understood.")
1032 |
1033 | def transform(self, X):
1034 | """Reduces the input X to the features selected with RFA.
1035 |
1036 | Parameters
1037 | ----------
1038 | X : array-like of shape (n_samples, n_features)
1039 | Samples.
1040 |
1041 | Returns
1042 | -------
1043 | X : array-like of shape (n_samples, n_features_)
1044 | The input samples with only the features selected by RFA.
1045 | """
1046 |
1047 | check_is_fitted(self)
1048 |
1049 | return self._transform(X, inverse=False)
1050 | -------------------------------------------------------------------------------- /shaphypetune/shaphypetune.py: --------------------------------------------------------------------------------
1 | from sklearn.base import clone
2 |
3 | from ._classes import _BoostSearch, _Boruta, _RFA, _RFE
4 |
5 |
6 | class BoostSearch(_BoostSearch):
7 | """Hyperparameter searching and optimization on a given validation set
8 | for LGBModel or XGBModel.
9 |
10 | Pass an LGBModel or XGBModel, and a dictionary with the parameter boundaries
11 | for grid, random or bayesian search.
12 | To operate random search pass distributions in the param_grid with an rvs
13 | method for sampling (such as those from scipy.stats.distributions).
14 | To operate bayesian search pass hyperopt distributions.
15 | The specification of n_iter or sampling_seed is effective only with random
16 | or hyperopt searches.
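For example, the three search modes can be requested as follows (an
illustrative sketch; the distributions shown are just common choices):

>>> from scipy import stats
>>> from hyperopt import hp
>>> grid = {'max_depth': [4, 6, 8]}                          # grid-search
>>> rand = {'max_depth': [4, 6, 8],
...         'learning_rate': stats.uniform(0.01, 0.3)}       # random-search
>>> bayes = {'max_depth': hp.choice('max_depth', [4, 6, 8]),
...          'learning_rate': hp.uniform('learning_rate', 0.01, 0.3)}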
17 | The best parameter combination is the one which obtains the best score
18 | (as returned by eval_metric) on the provided eval_set.
19 |
20 | If all parameters are presented as lists/floats/integers, grid-search
21 | is performed. If at least one parameter is given as a distribution (such as
22 | those from scipy.stats.distributions), random-search is performed, sampling
23 | with replacement. Bayesian search is effective only when all the
24 | parameters to tune are in the form of hyperopt distributions.
25 | It is highly recommended to use continuous distributions for continuous
26 | parameters.
27 |
28 | Parameters
29 | ----------
30 | estimator : object
31 | A supervised learning estimator of LGBModel or XGBModel type.
32 |
33 | param_grid : dict
34 | Dictionary with parameters names (`str`) as keys and distributions
35 | or lists of parameters to try.
36 |
37 | greater_is_better : bool, default=False
38 | Whether the quantity to monitor is a score function,
39 | meaning high is good, or a loss function, meaning low is good.
40 |
41 | n_iter : int, default=None
42 | Effective only for random or hyperopt search.
43 | Number of parameter settings that are sampled.
44 | n_iter trades off runtime vs quality of the solution.
45 |
46 | sampling_seed : int, default=None
47 | Effective only for random or hyperopt search.
48 | The seed used to sample from the hyperparameter distributions.
49 |
50 | n_jobs : int, default=None
51 | Effective only with grid and random search.
52 | The number of jobs to run in parallel for model fitting.
53 | ``None`` means 1, using one processor. ``-1`` means using all
54 | processors.
55 |
56 | verbose : int, default=1
57 | Verbosity mode. <=0 silent all; >0 print trial logs with the
58 | corresponding score.
59 |
60 | Attributes
61 | ----------
62 | estimator_ : estimator
63 | Estimator that was chosen by the search, i.e. estimator
64 | which gave the best score on the eval_set.
65 |
66 | best_params_ : dict
67 | Parameter setting that gave the best results on the eval_set.
68 |
69 | trials_ : list
70 | A list of dicts. The dicts are all the parameter combinations tried
71 | and derived from the param_grid.
72 |
73 | best_score_ : float
74 | The best score achieved among all the combinations tried.
75 |
76 | scores_ : list
77 | The scores achieved on the eval_set by all the models tried.
78 |
79 | best_iter_ : int
80 | The boosting iterations achieved by the best parameter combination.
81 |
82 | iterations_ : list
83 | The boosting iterations of all the models tried.
84 |
85 | boost_type_ : str
86 | The type of the boosting estimator (LGB or XGB).
87 | """
88 |
89 | def __init__(self,
90 | estimator, *,
91 | param_grid,
92 | greater_is_better=False,
93 | n_iter=None,
94 | sampling_seed=None,
95 | verbose=1,
96 | n_jobs=None):
97 | self.estimator = estimator
98 | self.param_grid = param_grid
99 | self.greater_is_better = greater_is_better
100 | self.n_iter = n_iter
101 | self.sampling_seed = sampling_seed
102 | self.verbose = verbose
103 | self.n_jobs = n_jobs
104 |
105 | def _build_model(self, params):
106 | """Private method to build model."""
107 |
108 | model = clone(self.estimator)
109 | model.set_params(**params)
110 |
111 | return model
112 |
113 |
114 | class BoostBoruta(_BoostSearch, _Boruta):
115 | """Simultaneous features selection with Boruta algorithm and hyperparameter
116 | searching on a given validation set for LGBModel or XGBModel.
117 |
118 | Pass an LGBModel or XGBModel to compute features selection with Boruta
119 | algorithm.
The best features are used to train a new gradient boosting
120 | instance. When an eval_set is provided, shadow features are built on it as well.
121 |
122 | If param_grid is a dictionary with parameter boundaries, a hyperparameter
123 | tuning is computed simultaneously. The parameter combinations are scored on
124 | the provided eval_set.
125 | To operate random search pass distributions in the param_grid with an rvs
126 | method for sampling (such as those from scipy.stats.distributions).
127 | To operate bayesian search pass hyperopt distributions.
128 | The specification of n_iter or sampling_seed is effective only with random
129 | or hyperopt searches.
130 | The best parameter combination is the one which obtains the best score
131 | (as returned by eval_metric) on the provided eval_set.
132 |
133 | If all parameters are presented as lists/floats/integers, grid-search
134 | is performed. If at least one parameter is given as a distribution (such as
135 | those from scipy.stats.distributions), random-search is performed, sampling
136 | with replacement. Bayesian search is effective only when all the
137 | parameters to tune are in the form of hyperopt distributions.
138 | It is highly recommended to use continuous distributions for continuous
139 | parameters.
140 |
141 | Parameters
142 | ----------
143 | estimator : object
144 | A supervised learning estimator of LGBModel or XGBModel type.
145 |
146 | perc : int, default=100
147 | Threshold for comparison between shadow and real features.
148 | The lower perc is, the more false positives will be picked as relevant,
149 | but also the fewer relevant features will be left out.
150 | 100 corresponds to the max.
151 |
152 | alpha : float, default=0.05
153 | Level at which the corrected p-values will get rejected in the
154 | correction steps.
155 |
156 | max_iter : int, default=100
157 | The number of maximum Boruta iterations to perform.
158 |
159 | early_stopping_boruta_rounds : int, default=None
160 | The maximum amount of iterations without confirming a tentative
161 | feature. Use early stopping to terminate the selection process
162 | before reaching `max_iter` iterations if the algorithm cannot
163 | confirm a tentative feature after N iterations.
164 | None means no early stopping search.
165 |
166 | importance_type : str, default='feature_importances'
167 | Which importance measure to use. It can be 'feature_importances'
168 | (the default feature importance of the gradient boosting estimator)
169 | or 'shap_importances'.
170 |
171 | train_importance : bool, default=True
172 | Effective only when importance_type='shap_importances'.
173 | Where to compute the shap feature importance: on train (True)
174 | or on eval_set (False).
175 |
176 | param_grid : dict, default=None
177 | Dictionary with parameters names (`str`) as keys and distributions
178 | or lists of parameters to try.
179 | None means no hyperparameters search.
180 |
181 | greater_is_better : bool, default=False
182 | Effective only when hyperparameters searching.
183 | Whether the quantity to monitor is a score function,
184 | meaning high is good, or a loss function, meaning low is good.
185 |
186 | n_iter : int, default=None
187 | Effective only when hyperparameters searching.
188 | Effective only for random or hyperopt searches.
189 | Number of parameter settings that are sampled.
190 | n_iter trades off runtime vs quality of the solution.
191 |
192 | sampling_seed : int, default=None
193 | Effective only when hyperparameters searching.
194 | Effective only for random or hyperopt search.
195 | The seed used to sample from the hyperparameter distributions.
196 |
197 | n_jobs : int, default=None
198 | Effective only when hyperparameters searching without hyperopt.
199 | The number of jobs to run in parallel for model fitting.
200 | ``None`` means 1, using one processor. ``-1`` means using all
201 | processors.
202 |
203 | verbose : int, default=1
204 | Verbosity mode. <=0 silent all; ==1 print trial logs (when
205 | hyperparameters searching); >1 print feature selection logs plus
206 | trial logs (when hyperparameters searching).
207 |
208 | Attributes
209 | ----------
210 | estimator_ : estimator
211 | The fitted estimator with the selected features and the optimal
212 | parameter combination (when hyperparameters searching).
213 |
214 | n_features_ : int
215 | The number of selected features (from the best param config
216 | when hyperparameters searching).
217 |
218 | ranking_ : ndarray of shape (n_features,)
219 | The feature ranking, such that ``ranking_[i]`` corresponds to the
220 | ranking position of the i-th feature (from the best param config
221 | when hyperparameters searching). Selected features are assigned
222 | rank 1 (2: tentative upper bound, 3: tentative lower bound, 4:
223 | rejected).
224 |
225 | support_ : ndarray of shape (n_features,)
226 | The mask of selected features (from the best param config
227 | when hyperparameters searching).
228 |
229 | importance_history_ : ndarray of shape (n_iters, n_features)
230 | The importance values for each feature across all iterations.
231 |
232 | best_params_ : dict
233 | Available only when hyperparameters searching.
234 | Parameter setting that gave the best results on the eval_set.
235 |
236 | trials_ : list
237 | Available only when hyperparameters searching.
238 | A list of dicts. The dicts are all the parameter combinations tried
239 | and derived from the param_grid.
240 |
241 | best_score_ : float
242 | Available only when hyperparameters searching.
243 | The best score achieved among all the combinations tried.
244 |
245 | scores_ : list
246 | Available only when hyperparameters searching.
247 | The scores achieved on the eval_set by all the models tried.
248 |
249 | best_iter_ : int
250 | Available only when hyperparameters searching.
251 | The boosting iterations achieved by the best parameter combination.
252 |
253 | iterations_ : list
254 | Available only when hyperparameters searching.
255 | The boosting iterations of all the models tried.
256 |
257 | boost_type_ : str
258 | The type of the boosting estimator (LGB or XGB).
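Examples
--------
A minimal usage sketch (data and settings are illustrative only):

>>> from sklearn.datasets import make_classification
>>> from xgboost import XGBClassifier
>>> from shaphypetune import BoostBoruta
>>> X, y = make_classification(n_samples=500, n_features=20,
...                            n_informative=6, random_state=0)
>>> selector = BoostBoruta(XGBClassifier(random_state=0), max_iter=50)
>>> selector = selector.fit(X, y)
>>> X_selected = selector.transform(X)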
259 |
260 | Notes
261 | -----
262 | The code for the Boruta algorithm is inspired and improved from:
263 | https://github.com/scikit-learn-contrib/boruta_py
264 | """
265 |
266 | def __init__(self,
267 | estimator, *,
268 | perc=100,
269 | alpha=0.05,
270 | max_iter=100,
271 | early_stopping_boruta_rounds=None,
272 | param_grid=None,
273 | greater_is_better=False,
274 | importance_type='feature_importances',
275 | train_importance=True,
276 | n_iter=None,
277 | sampling_seed=None,
278 | verbose=1,
279 | n_jobs=None):
280 |
281 | self.estimator = estimator
282 | self.perc = perc
283 | self.alpha = alpha
284 | self.max_iter = max_iter
285 | self.early_stopping_boruta_rounds = early_stopping_boruta_rounds
286 | self.param_grid = param_grid
287 | self.greater_is_better = greater_is_better
288 | self.importance_type = importance_type
289 | self.train_importance = train_importance
290 | self.n_iter = n_iter
291 | self.sampling_seed = sampling_seed
292 | self.verbose = verbose
293 | self.n_jobs = n_jobs
294 |
295 | def _build_model(self, params=None):
296 | """Private method to build model."""
297 |
298 | estimator = clone(self.estimator)
299 |
300 | if params is None:
301 | model = _Boruta(
302 | estimator=estimator,
303 | perc=self.perc,
304 | alpha=self.alpha,
305 | max_iter=self.max_iter,
306 | early_stopping_boruta_rounds=self.early_stopping_boruta_rounds,
307 | importance_type=self.importance_type,
308 | train_importance=self.train_importance,
309 | verbose=self.verbose
310 | )
311 |
312 | else:
313 | estimator.set_params(**params)
314 | model = _Boruta(
315 | estimator=estimator,
316 | perc=self.perc,
317 | alpha=self.alpha,
318 | max_iter=self.max_iter,
319 | early_stopping_boruta_rounds=self.early_stopping_boruta_rounds,
320 | importance_type=self.importance_type,
321 | train_importance=self.train_importance,
322 | verbose=self.verbose
323 | )
324 |
325 | return model
326 |
327 |
328 | class BoostRFE(_BoostSearch, _RFE):
329 | """Simultaneous features selection with RFE and hyperparameter searching
330 | on a given validation set for LGBModel or XGBModel.
331 |
332 | Pass an LGBModel or XGBModel to compute features selection with RFE.
333 | The gradient boosting instance with the best features is selected.
334 | When an eval_set is provided, the best gradient boosting and the best
335 | features are obtained evaluating the score with eval_metric.
336 | Otherwise, the best combination is obtained looking only at feature
337 | importance.
338 |
339 | If param_grid is a dictionary with parameter boundaries, a hyperparameter
340 | tuning is computed simultaneously. The parameter combinations are scored on
341 | the provided eval_set.
342 | To operate random search pass distributions in the param_grid with an rvs
343 | method for sampling (such as those from scipy.stats.distributions).
344 | To operate bayesian search pass hyperopt distributions.
345 | The specification of n_iter or sampling_seed is effective only with random
346 | or hyperopt searches.
347 | The best parameter combination is the one which obtains the best score
348 | (as returned by eval_metric) on the provided eval_set.
349 |
350 | If all parameters are presented as lists/floats/integers, grid-search
351 | is performed. If at least one parameter is given as a distribution (such as
352 | those from scipy.stats.distributions), random-search is performed, sampling
353 | with replacement. Bayesian search is effective only when all the
354 | parameters to tune are in the form of hyperopt distributions.
355 | It is highly recommended to use continuous distributions for continuous
356 | parameters.
357 |
358 | Parameters
359 | ----------
360 | estimator : object
361 | A supervised learning estimator of LGBModel or XGBModel type.
362 |
363 | step : int or float, default=1
364 | If greater than or equal to 1, then `step` corresponds to the
365 | (integer) number of features to remove at each iteration.
366 | If within (0.0, 1.0), then `step` corresponds to the percentage
367 | (rounded down) of features to remove at each iteration.
368 | Note that the last iteration may remove fewer than `step` features in
369 | order to reach `min_features_to_select`.
370 |
371 | min_features_to_select : int, default=None
372 | The minimum number of features to be selected. This number of features
373 | will always be scored, even if the difference between the original
374 | feature count and `min_features_to_select` isn't divisible by
375 | `step`. The default value for min_features_to_select is set to 1 when an
376 | eval_set is provided, otherwise it always corresponds to n_features // 2.
377 |
378 | importance_type : str, default='feature_importances'
379 | Which importance measure to use. It can be 'feature_importances'
380 | (the default feature importance of the gradient boosting estimator)
381 | or 'shap_importances'.
382 |
383 | train_importance : bool, default=True
384 | Effective only when importance_type='shap_importances'.
385 | Where to compute the shap feature importance: on train (True)
386 | or on eval_set (False).
387 |
388 | param_grid : dict, default=None
389 | Dictionary with parameters names (`str`) as keys and distributions
390 | or lists of parameters to try.
391 | None means no hyperparameters search.
392 |
393 | greater_is_better : bool, default=False
394 | Effective only when hyperparameters searching.
395 | Whether the quantity to monitor is a score function,
396 | meaning high is good, or a loss function, meaning low is good.
397 |
398 | n_iter : int, default=None
399 | Effective only when hyperparameters searching.
400 | Effective only for random or hyperopt search.
401 | Number of parameter settings that are sampled.
402 | n_iter trades off runtime vs quality of the solution.
403 |
404 | sampling_seed : int, default=None
405 | Effective only when hyperparameters searching.
406 | Effective only for random or hyperopt search.
407 | The seed used to sample from the hyperparameter distributions.
408 |
409 | n_jobs : int, default=None
410 | Effective only when hyperparameters searching without hyperopt.
411 | The number of jobs to run in parallel for model fitting.
412 | ``None`` means 1, using one processor. ``-1`` means using all
413 | processors.
414 |
415 | verbose : int, default=1
416 | Verbosity mode. <=0 silent all; ==1 print trial logs (when
417 | hyperparameters searching); >1 print feature selection logs plus
418 | trial logs (when hyperparameters searching).
419 |
420 | Attributes
421 | ----------
422 | estimator_ : estimator
423 | The fitted estimator with the selected features and the optimal
424 | parameter combination (when hyperparameters searching).
425 |
426 | n_features_ : int
427 | The number of selected features (from the best param config
428 | when hyperparameters searching).
429 |
430 | ranking_ : ndarray of shape (n_features,)
431 | The feature ranking, such that ``ranking_[i]`` corresponds to the
432 | ranking position of the i-th feature (from the best param config
433 | when hyperparameters searching). Selected features are assigned
434 | rank 1.
435 | 
436 |     support_ : ndarray of shape (n_features,)
437 |         The mask of selected features (from the best param config
438 |         when searching hyperparameters).
439 | 
440 |     score_history_ : list
441 |         Available only when an eval_set is provided.
442 |         Scores obtained reducing the features (from the best param config
443 |         when searching hyperparameters).
444 | 
445 |     best_params_ : dict
446 |         Available only when searching hyperparameters.
447 |         Parameter setting that gave the best results on the eval_set.
448 | 
449 |     trials_ : list
450 |         Available only when searching hyperparameters.
451 |         A list of dicts. The dicts are all the parameter combinations tried
452 |         and derived from the param_grid.
453 | 
454 |     best_score_ : float
455 |         Available only when searching hyperparameters.
456 |         The best score achieved among all the parameter combinations tried.
457 | 
458 |     scores_ : list
459 |         Available only when searching hyperparameters.
460 |         The scores achieved on the eval_set by all the models tried.
461 | 
462 |     best_iter_ : int
463 |         Available only when searching hyperparameters.
464 |         The boosting iterations achieved by the best parameter combination.
465 | 
466 |     iterations_ : list
467 |         Available only when searching hyperparameters.
468 |         The boosting iterations of all the models tried.
469 | 
470 |     boost_type_ : str
471 |         The type of the boosting estimator (LGB or XGB).
472 |     """
473 | 
474 |     def __init__(self,
475 |                  estimator, *,
476 |                  min_features_to_select=None,
477 |                  step=1,
478 |                  param_grid=None,
479 |                  greater_is_better=False,
480 |                  importance_type='feature_importances',
481 |                  train_importance=True,
482 |                  n_iter=None,
483 |                  sampling_seed=None,
484 |                  verbose=1,
485 |                  n_jobs=None):
486 | 
487 |         self.estimator = estimator
488 |         self.min_features_to_select = min_features_to_select
489 |         self.step = step
490 |         self.param_grid = param_grid
491 |         self.greater_is_better = greater_is_better
492 |         self.importance_type = importance_type
493 |         self.train_importance = train_importance
494 |         self.n_iter = n_iter
495 |         self.sampling_seed = sampling_seed
496 |         self.verbose = verbose
497 |         self.n_jobs = n_jobs
498 | 
499 |     def _build_model(self, params=None):
500 |         """Private method to build model."""
501 | 
502 |         estimator = clone(self.estimator)
503 | 
504 |         if params is None:
505 |             model = _RFE(
506 |                 estimator=estimator,
507 |                 min_features_to_select=self.min_features_to_select,
508 |                 step=self.step,
509 |                 greater_is_better=self.greater_is_better,
510 |                 importance_type=self.importance_type,
511 |                 train_importance=self.train_importance,
512 |                 verbose=self.verbose
513 |             )
514 | 
515 |         else:
516 |             estimator.set_params(**params)
517 |             model = _RFE(
518 |                 estimator=estimator,
519 |                 min_features_to_select=self.min_features_to_select,
520 |                 step=self.step,
521 |                 greater_is_better=self.greater_is_better,
522 |                 importance_type=self.importance_type,
523 |                 train_importance=self.train_importance,
524 |                 verbose=self.verbose
525 |             )
526 | 
527 |         return model
528 | 
529 | 
530 | class BoostRFA(_BoostSearch, _RFA):
531 |     """Simultaneous feature selection with RFA and hyperparameter searching
532 |     on a given validation set for LGBModel or XGBModel.
533 | 
534 |     Pass an LGBModel or XGBModel to perform feature selection with RFA.
535 |     The gradient boosting instance with the best features is selected.
536 |     When an eval_set is provided, the best gradient boosting and the best
537 |     features are obtained by evaluating the score with eval_metric.
538 |     Otherwise, the best combination is obtained looking only at feature
539 |     importance.
540 | 
541 |     If param_grid is a dictionary with parameter boundaries, hyperparameter
542 |     tuning is performed simultaneously. The parameter combinations are scored on
543 |     the provided eval_set.
544 |     To operate random search, pass distributions in the param_grid with a rvs
545 |     method for sampling (such as those from scipy.stats.distributions).
546 |     To operate bayesian search, pass hyperopt distributions.
547 |     The specification of n_iter or sampling_seed is effective only with random
548 |     or hyperopt searches.
549 |     The best parameter combination is the one which obtains the best score
550 |     (as returned by eval_metric) on the provided eval_set.
551 | 
552 |     If all parameters are presented as lists, floats, or integers, grid-search
553 |     is performed. If at least one parameter is given as a distribution (such as
554 |     those from scipy.stats.distributions), random-search is performed,
555 |     sampling with replacement. Bayesian search is effective only when all the
556 |     parameters to tune are in the form of hyperopt distributions.
557 |     It is highly recommended to use continuous distributions for continuous
558 |     parameters.
559 | 
560 |     Parameters
561 |     ----------
562 |     estimator : object
563 |         A supervised learning estimator of LGBModel or XGBModel type.
564 | 
565 |     step : int or float, default=1
566 |         If greater than or equal to 1, then `step` corresponds to the
567 |         (integer) number of features to remove at each iteration.
568 |         If within (0.0, 1.0), then `step` corresponds to the percentage
569 |         (rounded down) of features to remove at each iteration.
570 |         Note that the last iteration may remove fewer than `step` features in
571 |         order to reach `min_features_to_select`.
572 | 
573 |     min_features_to_select : int, default=None
574 |         The minimum number of features to be selected. This number of features
575 |         will always be scored, even if the difference between the original
576 |         feature count and `min_features_to_select` isn't divisible by
577 |         `step`. The default value for min_features_to_select is set to 1 when an
578 |         eval_set is provided, otherwise it always corresponds to n_features // 2.
579 | 
580 |     importance_type : str, default='feature_importances'
581 |         Which importance measure to use. It can be 'feature_importances'
582 |         (the default feature importance of the gradient boosting estimator)
583 |         or 'shap_importances'.
584 | 
585 |     train_importance : bool, default=True
586 |         Effective only when importance_type='shap_importances'.
587 |         Where to compute the shap feature importance: on train (True)
588 |         or on eval_set (False).
589 | 
590 |     param_grid : dict, default=None
591 |         Dictionary with parameter names (`str`) as keys and distributions
592 |         or lists of parameters to try.
593 |         None means no hyperparameter search.
594 | 
595 |     greater_is_better : bool, default=False
596 |         Effective only when searching hyperparameters.
597 |         Whether the quantity to monitor is a score function,
598 |         meaning high is good, or a loss function, meaning low is good.
599 | 
600 |     n_iter : int, default=None
601 |         Effective only when searching hyperparameters.
602 |         Effective only for random or hyperopt search.
603 |         Number of parameter settings that are sampled.
604 |         n_iter trades off runtime vs quality of the solution.
605 | 
606 |     sampling_seed : int, default=None
607 |         Effective only when searching hyperparameters.
608 |         Effective only for random or hyperopt search.
609 |         The seed used to sample from the hyperparameter distributions.
610 | 
611 |     n_jobs : int, default=None
612 |         Effective only when searching hyperparameters without hyperopt.
613 |         The number of jobs to run in parallel for model fitting.
614 |         ``None`` means 1 (use one processor). ``-1`` means using all
615 |         processors.
616 | 
617 |     verbose : int, default=1
618 |         Verbosity mode. <=0 silences all output; ==1 prints trial logs (when
619 |         searching hyperparameters); >1 prints feature selection logs plus
620 |         trial logs (when searching hyperparameters).
621 | 
622 |     Attributes
623 |     ----------
624 |     estimator_ : estimator
625 |         The fitted estimator with the selected features and the optimal
626 |         parameter combination (when searching hyperparameters).
627 | 
628 |     n_features_ : int
629 |         The number of selected features (from the best param config
630 |         when searching hyperparameters).
631 | 
632 |     ranking_ : ndarray of shape (n_features,)
633 |         The feature ranking, such that ``ranking_[i]`` corresponds to the
634 |         ranking position of the i-th feature (from the best param config
635 |         when searching hyperparameters). Selected features are assigned
636 |         rank 1.
637 | 
638 |     support_ : ndarray of shape (n_features,)
639 |         The mask of selected features (from the best param config
640 |         when searching hyperparameters).
641 | 
642 |     score_history_ : list
643 |         Available only when an eval_set is provided.
644 |         Scores obtained adding the features (from the best param config
645 |         when searching hyperparameters).
646 | 
647 |     best_params_ : dict
648 |         Available only when searching hyperparameters.
649 |         Parameter setting that gave the best results on the eval_set.
650 | 
651 |     trials_ : list
652 |         Available only when searching hyperparameters.
653 |         A list of dicts. The dicts are all the parameter combinations tried
654 |         and derived from the param_grid.
655 | 
656 |     best_score_ : float
657 |         Available only when searching hyperparameters.
658 |         The best score achieved among all the parameter combinations tried.
659 | 
660 |     scores_ : list
661 |         Available only when searching hyperparameters.
662 |         The scores achieved on the eval_set by all the models tried.
663 | 
664 |     best_iter_ : int
665 |         Available only when searching hyperparameters.
666 |         The boosting iterations achieved by the best parameter combination.
667 | 
668 |     iterations_ : list
669 |         Available only when searching hyperparameters.
670 |         The boosting iterations of all the models tried.
671 | 
672 |     boost_type_ : str
673 |         The type of the boosting estimator (LGB or XGB).
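
    Examples
    --------
    A minimal usage sketch. The LGBMClassifier, the synthetic data, and the
    small grid below are illustrative choices only, and the call assumes the
    usual scikit-learn-style ``fit`` that forwards ``eval_set`` to the
    underlying booster:

    >>> from lightgbm import LGBMClassifier
    >>> from sklearn.datasets import make_classification
    >>> from shaphypetune.shaphypetune import BoostRFA
    >>> X, y = make_classification(n_samples=300, n_features=20, random_state=0)
    >>> selector = BoostRFA(
    ...     LGBMClassifier(n_estimators=50, random_state=0),
    ...     param_grid={'learning_rate': [0.05, 0.1]},  # lists => grid-search
    ...     min_features_to_select=5,
    ...     step=2,
    ...     verbose=0)
    >>> selector = selector.fit(X[:200], y[:200],
    ...                         eval_set=[(X[200:], y[200:])])  # doctest: +SKIP
    >>> selector.n_features_  # number of selected features  # doctest: +SKIP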
674 | 675 | Notes 676 | ----- 677 | The code for the RFA algorithm is inspired and improved from: 678 | https://github.com/heberleh/recursive-feature-addition 679 | """ 680 | 681 | def __init__(self, 682 | estimator, *, 683 | min_features_to_select=None, 684 | step=1, 685 | param_grid=None, 686 | greater_is_better=False, 687 | importance_type='feature_importances', 688 | train_importance=True, 689 | n_iter=None, 690 | sampling_seed=None, 691 | verbose=1, 692 | n_jobs=None): 693 | 694 | self.estimator = estimator 695 | self.min_features_to_select = min_features_to_select 696 | self.step = step 697 | self.param_grid = param_grid 698 | self.greater_is_better = greater_is_better 699 | self.importance_type = importance_type 700 | self.train_importance = train_importance 701 | self.n_iter = n_iter 702 | self.sampling_seed = sampling_seed 703 | self.verbose = verbose 704 | self.n_jobs = n_jobs 705 | 706 | def _build_model(self, params=None): 707 | """Private method to build model.""" 708 | 709 | estimator = clone(self.estimator) 710 | 711 | if params is None: 712 | model = _RFA( 713 | estimator=estimator, 714 | min_features_to_select=self.min_features_to_select, 715 | step=self.step, 716 | greater_is_better=self.greater_is_better, 717 | importance_type=self.importance_type, 718 | train_importance=self.train_importance, 719 | verbose=self.verbose 720 | ) 721 | 722 | else: 723 | estimator.set_params(**params) 724 | model = _RFA( 725 | estimator=estimator, 726 | min_features_to_select=self.min_features_to_select, 727 | step=self.step, 728 | greater_is_better=self.greater_is_better, 729 | importance_type=self.importance_type, 730 | train_importance=self.train_importance, 731 | verbose=self.verbose 732 | ) 733 | 734 | return model -------------------------------------------------------------------------------- /shaphypetune/utils.py: -------------------------------------------------------------------------------- 1 | import random 2 | import numpy as np 3 | from itertools import product 4 | 5 | from shap import TreeExplainer 6 | 7 | 8 | def _check_boosting(model): 9 | """Check if the estimator is a LGBModel or XGBModel. 10 | 11 | Returns 12 | ------- 13 | Model type in string format. 14 | """ 15 | 16 | estimator_type = str(type(model)).lower() 17 | 18 | boost_type = ('LGB' if 'lightgbm' in estimator_type else '') + \ 19 | ('XGB' if 'xgboost' in estimator_type else '') 20 | 21 | if len(boost_type) != 3: 22 | raise ValueError("Pass a LGBModel or XGBModel.") 23 | 24 | return boost_type 25 | 26 | 27 | def _shap_importances(model, X): 28 | """Extract feature importances from fitted boosting models 29 | using TreeExplainer from shap. 30 | 31 | Returns 32 | ------- 33 | array of feature importances. 34 | """ 35 | 36 | explainer = TreeExplainer( 37 | model, feature_perturbation="tree_path_dependent") 38 | coefs = explainer.shap_values(X) 39 | 40 | if isinstance(coefs, list): 41 | coefs = list(map(lambda x: np.abs(x).mean(0), coefs)) 42 | coefs = np.sum(coefs, axis=0) 43 | else: 44 | coefs = np.abs(coefs).mean(0) 45 | 46 | return coefs 47 | 48 | 49 | def _feature_importances(model): 50 | """Extract feature importances from fitted boosting models. 51 | 52 | Returns 53 | ------- 54 | array of feature importances. 
55 |     """
56 | 
57 |     if hasattr(model, 'coef_'):  # booster='gblinear' (xgb)
58 |         coefs = np.square(model.coef_).sum(axis=0)
59 |     else:
60 |         coefs = model.feature_importances_
61 | 
62 |     return coefs
63 | 
64 | 
65 | def _get_categorical_support(n_features, fit_params):
66 |     """Obtain boolean mask for categorical features."""
67 | 
68 |     cat_support = np.zeros(n_features, dtype=bool)
69 |     cat_ids = []
70 | 
71 |     msg = "When manually setting categorical features, " \
72 |           "pass a 1D array-like of categorical column indices " \
73 |           "(specified as integers)."
74 | 
75 |     if 'categorical_feature' in fit_params:  # LGB
76 |         cat_ids = fit_params['categorical_feature']
77 |         if len(np.shape(cat_ids)) != 1:
78 |             raise ValueError(msg)
79 |         if not all([isinstance(c, int) for c in cat_ids]):
80 |             raise ValueError(msg)
81 | 
82 |     cat_support[cat_ids] = True
83 | 
84 |     return cat_support
85 | 
86 | 
87 | def _set_categorical_indexes(support, cat_support, _fit_params,
88 |                              duplicate=False):
89 |     """Map categorical features in each data repartition."""
90 | 
91 |     if cat_support.any():
92 | 
93 |         n_features = support.sum()
94 |         support_id = np.zeros_like(support, dtype='int32')
95 |         support_id[support] = np.arange(n_features, dtype='int32')
96 |         cat_feat = support_id[np.where(support & cat_support)[0]]
97 |         # empty if support and cat_support are not aligned
98 | 
99 |         if duplicate:  # is Boruta
100 |             cat_feat = cat_feat.tolist() + (n_features + cat_feat).tolist()
101 |         else:
102 |             cat_feat = cat_feat.tolist()
103 | 
104 |         _fit_params['categorical_feature'] = cat_feat
105 | 
106 |     return _fit_params
107 | 
108 | 
109 | def _check_param(values):
110 |     """Check the parameter boundaries passed in dict values.
111 | 
112 |     Returns
113 |     -------
114 |     list of checked parameters.
115 |     """
116 | 
117 |     if isinstance(values, (list, tuple, np.ndarray)):
118 |         return list(set(values))
119 |     elif 'scipy' in str(type(values)).lower():
120 |         return values
121 |     elif 'hyperopt' in str(type(values)).lower():
122 |         return values
123 |     else:
124 |         return [values]
125 | 
126 | 
127 | class ParameterSampler(object):
128 |     """Generator of parameter configurations sampled from the given
129 |     distributions. If all parameters are presented as lists, the full grid
130 |     of their combinations is produced. If at least one parameter is given as
131 |     a scipy distribution, sampling with replacement is used. If all parameters
132 |     are given as hyperopt distributions, a Tree of Parzen Estimators search
133 |     from hyperopt is computed. It is highly recommended to use continuous
134 |     distributions for continuous parameters.
135 | 
136 |     Parameters
137 |     ----------
138 |     param_distributions : dict
139 |         Dictionary with parameter names (`str`) as keys and distributions
140 |         or lists of parameters to try. Distributions must provide a ``rvs``
141 |         method for random sampling (such as those from scipy.stats.distributions)
142 |         or be hyperopt distributions for bayesian searching.
143 |         If a list is given, it is sampled uniformly.
144 | 
145 |     n_iter : integer, default=None
146 |         Number of parameter configurations that are produced.
147 | 
148 |     random_state : int, default=None
149 |         Pass an int for reproducible output across multiple
150 |         function calls.
151 | 
152 |     Returns
153 |     -------
154 |     param_combi : list of dicts or dict of hyperopt distributions
155 |         Parameter combinations.
156 | 
157 |     searching_type : str
158 |         The searching algorithm used.
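
    Examples
    --------
    A minimal sketch of the random-search mode; the parameter names below are
    arbitrary placeholders, not values required by the package:

    >>> from scipy import stats
    >>> from shaphypetune.utils import ParameterSampler
    >>> param_distributions = {
    ...     'learning_rate': stats.uniform(0.01, 0.3),  # scipy => random-search
    ...     'num_leaves': [15, 31, 63]  # lists are sampled uniformly
    ... }
    >>> sampler = ParameterSampler(param_distributions, n_iter=5, random_state=42)
    >>> param_combi, searching_type = sampler.sample()
    >>> searching_type
    'random'
    >>> len(param_combi)
    5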
159 |     """
160 | 
161 |     def __init__(self, param_distributions, n_iter=None, random_state=None):
162 | 
163 |         self.n_iter = n_iter
164 |         self.random_state = random_state
165 |         self.param_distributions = param_distributions
166 | 
167 |     def sample(self):
168 |         """Generate parameter combinations from the given distributions."""
169 | 
170 |         param_distributions = self.param_distributions.copy()
171 | 
172 |         is_grid = all(isinstance(p, list)
173 |                       for p in param_distributions.values())
174 |         is_random = all(isinstance(p, list) or 'scipy' in str(type(p)).lower()
175 |                         for p in param_distributions.values())
176 |         is_hyperopt = all('hyperopt' in str(type(p)).lower()
177 |                           or (len(p) < 2 if isinstance(p, list) else False)
178 |                           for p in param_distributions.values())
179 | 
180 |         if is_grid:
181 |             param_combi = list(product(*param_distributions.values()))
182 |             param_combi = [
183 |                 dict(zip(param_distributions.keys(), combi))
184 |                 for combi in param_combi
185 |             ]
186 |             return param_combi, 'grid'
187 | 
188 |         elif is_random:
189 |             if self.n_iter is None:
190 |                 raise ValueError(
191 |                     "n_iter must be an integer >0 when scipy parameter "
192 |                     "distributions are provided. Got None."
193 |                 )
194 | 
195 |             seed = (random.randint(1, 100) if self.random_state is None
196 |                     else self.random_state + 1)
197 |             random.seed(seed)
198 | 
199 |             param_combi = []
200 |             k = self.n_iter
201 |             for i in range(self.n_iter):
202 |                 dist = param_distributions.copy()
203 |                 combi = []
204 |                 for j, v in enumerate(dist.values()):
205 |                     if 'scipy' in str(type(v)).lower():
206 |                         combi.append(v.rvs(random_state=seed * (k + j)))
207 |                     else:
208 |                         combi.append(v[random.randint(0, len(v) - 1)])
209 |                     k += i + j
210 |                 param_combi.append(
211 |                     dict(zip(param_distributions.keys(), combi))
212 |                 )
213 | 
214 |             return param_combi, 'random'
215 | 
216 |         elif is_hyperopt:
217 |             if self.n_iter is None:
218 |                 raise ValueError(
219 |                     "n_iter must be an integer >0 when hyperopt "
220 |                     "search spaces are provided. Got None."
221 |                 )
222 |             param_distributions = {
223 |                 k: p[0] if isinstance(p, list) else p
224 |                 for k, p in param_distributions.items()
225 |             }
226 | 
227 |             return param_distributions, 'hyperopt'
228 | 
229 |         else:
230 |             raise ValueError(
231 |                 "Parameters not recognized. "
232 |                 "Pass lists, scipy distributions (also in conjunction "
233 |                 "with lists), or hyperopt search spaces."
234 |             )
--------------------------------------------------------------------------------