├── .gitignore
├── Example Notebook - NestedCV.ipynb
├── LICENSE
├── README.md
├── nested_cv
│   ├── __init__.py
│   ├── __pycache__
│   │   └── __init__.cpython-37.pyc
│   └── nested_cv.py
└── setup.py

--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
.DS_Store
nested_cv/__pycache__/nested_cv.cpython-37.pyc
.ipynb_checkpoints/Example Notebook - NestedCV-checkpoint.ipynb

--------------------------------------------------------------------------------
/Example Notebook - NestedCV.ipynb:
--------------------------------------------------------------------------------
{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "from nested_cv import NestedCV\n",
    "\n",
    "import pandas as pd\n",
    "import numpy as np\n",
    "from sklearn.datasets import load_boston, load_iris, load_breast_cancer\n",
    "from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier\n",
    "from sklearn.model_selection import KFold\n",
    "\n",
    "# When using Random Search, we get a user warning with this small number of hyperparameters\n",
    "# Suppress it\n",
    "import warnings\n",
    "warnings.simplefilter(action='ignore', category=UserWarning)\n",
    "warnings.simplefilter(action='ignore', category=FutureWarning)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Regression Example"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[4.2847157727008085,\n",
       " 3.4072642252185354,\n",
       " 2.8051399371930317,\n",
       " 2.2417498868498766,\n",
       " 3.4797203328262443]"
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "boston = load_boston()\n",
    "X = boston.data\n",
    "y = boston.target\n",
    "\n",
    "# Define a parameter grid\n",
    "param_grid = {\n",
    "    'max_depth': [3, 7, 10, None],\n",
    "    'n_estimators': [100, 200],\n",
    "    'min_samples_split': [2, 3, 5, 7, 10]\n",
    "}\n",
    "\n",
    "# Either specify a strategy or a number\n",
    "# Here we choose a strategy\n",
    "outer_cv = KFold(n_splits=5,\n",
    "                 shuffle=True,\n",
    "                 random_state=123)\n",
    "inner_cv = KFold(n_splits=5,\n",
    "                 shuffle=True,\n",
    "                 random_state=123)\n",
    "\n",
    "NCV = NestedCV(model=RandomForestRegressor(), params_grid=param_grid,\n",
    "               outer_cv=outer_cv, inner_cv=inner_cv, n_jobs=-1,\n",
    "               cv_options={'sqrt_of_score': True,\n",
    "                           'recursive_feature_elimination': False,\n",
    "                           'rfe_n_features': 2})\n",
    "NCV.fit(X=X, y=y)\n",
    "\n",
    "NCV.outer_scores"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Classification Example"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Breast Cancer (2 Classes)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[0.9912337662337662,\n",
       " 0.974025974025974,\n",
       " 0.9803240740740741,\n",
       " 0.9963247577681257,\n",
       " 0.9969551282051282]"
      ]
     },
     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "from sklearn.metrics import roc_auc_score\n",
    "\n",
    "# Binary classification\n",
    "cancer = load_breast_cancer()\n",
    "X = cancer.data\n",
    "y = cancer.target\n",
    "\n",
    "# Define a parameter grid\n",
    "param_grid = {\n",
    "    'max_depth': [3, None],\n",
    "    'n_estimators': [10, 20]\n",
    "}\n",
    "\n",
    "NCV = NestedCV(model=RandomForestClassifier(), params_grid=param_grid,\n",
    "               outer_cv=5, inner_cv=5,\n",
    "               cv_options={'metric': roc_auc_score,\n",
    "                           'metric_score_indicator_lower': False,\n",
    "                           'randomized_search_iter': 30,\n",
    "                           'predict_proba': True})\n",
    "NCV.fit(X=X, y=y)\n",
    "\n",
    "NCV.outer_scores"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Iris (3 Classes)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[0.9333333333333333,\n",
       " 0.9333333333333333,\n",
       " 0.9333333333333333,\n",
       " 0.9333333333333333,\n",
       " 0.9666666666666667]"
      ]
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "from sklearn.metrics import f1_score\n",
    "\n",
    "# Multiclass classification\n",
    "iris = load_iris()\n",
    "X = iris.data\n",
    "y = iris.target\n",
    "\n",
    "# Define a parameter grid\n",
    "param_grid = {\n",
    "    'max_depth': [3, None],\n",
    "    'n_estimators': [10, 20]\n",
    "}\n",
    "\n",
    "NCV = NestedCV(model=RandomForestClassifier(), params_grid=param_grid,\n",
    "               outer_cv=5, inner_cv=5,\n",
    "               cv_options={'metric': f1_score,\n",
    "                           'metric_score_indicator_lower': False,\n",
    "                           'randomized_search_iter': 30,\n",
    "                           'predict_proba': False,\n",
    "                           'multiclass_average': 'micro'})\n",
    "NCV.fit(X=X, y=y)\n",
    "\n",
    "NCV.outer_scores"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.3"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}

--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
MIT License

Copyright (c) 2019 Casper Bøgeskov Hansen

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.

--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# Nested-Cross-Validation
This repository implements a general nested cross-validation function, ready to use with any estimator that implements the scikit-learn estimator interface.
## Installing the package
You can find the package on [pypi](https://pypi.org/project/nested-cv/)* and install it via pip by using the following command:
```bash
pip install nested-cv
```
You can also install it from the wheel file on the [Releases](https://github.com/casperbh96/Nested-Cross-Validation/releases) page.

\* We push updates gradually; pull the master branch from GitHub if you want the absolute latest changes.

## Usage
Be mindful of the options that are available for NestedCV. Some cross-validation options are defined in a dictionary `cv_options`.
This package works with any estimator that implements a scikit-learn wrapper, e.g. XGBoost, LightGBM, KerasRegressor, KerasClassifier, etc.

-->**[See notebook for more examples](https://github.com/casperbh96/Nested-Cross-Validation/blob/master/Example%20Notebook%20-%20NestedCV.ipynb)**

### Simple
Here is a single example using Random Forest; check out the example notebook for more.
```python
from nested_cv import NestedCV
from sklearn.ensemble import RandomForestRegressor

# Define a parameter grid
param_grid = {
    'max_depth': [3, None],
    'n_estimators': [10]
}

NCV = NestedCV(model=RandomForestRegressor(), params_grid=param_grid,
               outer_cv=5, inner_cv=5, n_jobs=-1,
               cv_options={'sqrt_of_score': True,
                           'recursive_feature_elimination': True,
                           'rfe_n_features': 2})
NCV.fit(X=X, y=y)
NCV.outer_scores
```

### NestedCV Parameters
| Name | type | description |
| :------------- |:-------------| :-----|
| model | estimator | An estimator that implements the scikit-learn estimator interface. |
| params_grid | dictionary "dict" | A dict of hyperparameters for the model. |
| outer_cv | int or cv splitter class | Outer splitting strategy. If an int, KFold is used by default. |
| inner_cv | int or cv splitter class | Inner splitting strategy. If an int, KFold is used by default. |
| cv_options | dictionary "dict" | [Next section](#cv_options-value-options) |
| n_jobs | int | Number of jobs for joblib to run in parallel (multiprocessing). |

### `cv_options` value options
**`metric` :** Callable from sklearn.metrics, default = mean_squared_error

&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;A scoring metric used to score each model

**`metric_score_indicator_lower` :** boolean, default = True

&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;Whether a lower score is better for the metric calculation; set to False if a higher score is better

**`sqrt_of_score` :** boolean, default = False

&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;Whether or not the square root should be taken of the score

**`randomized_search` :** boolean, default = False

&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;Whether to use randomized search instead of grid search from sklearn

**`randomized_search_iter` :** int, default = 10

&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;Number of iterations for randomized search

**`recursive_feature_elimination` :** boolean, default = False

&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;Whether to do recursive feature elimination

**`rfe_n_features` :** int, default = 1

&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;If recursive_feature_elimination is enabled, the number of features to select

**`predict_proba` :** boolean, default = False

&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;If true, predict class probabilities instead of class labels

**`multiclass_average` :** string, default = 'binary'

&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;For some classification metrics with a multiclass prediction, you need to specify an average other than 'binary'

### Returns
**`variance` :** Model variance by numpy.var()

**`outer_scores` :** A list of the outer scores, from the outer cross-validation

**`best_inner_score_list` :** A list of best inner scores for each outer loop

**`best_params` :** All best params from each inner loop accumulated in a dict

**`best_inner_params_list` :** Best inner params for each outer loop as an array of dictionaries

## How to use the output?
We suggest looking at the best hyperparameters together with the score for each outer loop. Look at how stable the model appears to be in a nested cross-validation setting. If the outer score changes a lot, that might indicate an unstable model. In that case, start over with making a new model.

### After Nested Cross-Validation?
If the results from nested cross-validation are stable: run a normal cross-validation with the same procedure as in nested cross-validation, i.e. if you used feature selection in nested cross-validation, you should also do that in normal cross-validation. Use the best parameters as input to your normal cross-validation (see the sketch at the end of this README).

## Limitations
- [XGBoost](https://xgboost.readthedocs.io/en/latest/) implements an `early_stopping_rounds` parameter, which cannot be used in this implementation. Other similar parameters might not work in combination with this implementation either. The function would have to be adapted to use special parameters like that.

## What did we learn?
- Using [Scikit-Learn](https://github.com/scikit-learn/scikit-learn) leads to a faster implementation, since the Scikit-Learn community has implemented many functions that do much of the work.
- We have learned and applied this package in our main project about [House Price Prediction](https://github.com/casperbh96/house-price-prediction).

## Why use Nested Cross-Validation?
Controlling the bias-variance tradeoff is an essential and important task in machine learning, as indicated by [[Cawley and Talbot, 2010]](http://jmlr.csail.mit.edu/papers/volume11/cawley10a/cawley10a.pdf). Many articles indicate that this is possible by the use of nested cross-validation, among them [Varma and Simon, 2006](https://www.ncbi.nlm.nih.gov/pmc/articles/PMC1397873/pdf/1471-2105-7-91.pdf). Other interesting literature on nested cross-validation includes [[Varoquaux et al., 2017]](https://arxiv.org/pdf/1606.05201.pdf) and [[Krstajic et al., 2014]](https://jcheminf.biomedcentral.com/track/pdf/10.1186/1758-2946-6-10).
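
## Follow-up sketch: reusing the best parameters
As mentioned in "After Nested Cross-Validation?", the collected best parameters can feed a normal cross-validated search. The snippet below is a minimal, illustrative sketch (not part of the package); it assumes the `X`, `y` and fitted `NCV` object from the Simple example above.
```python
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV

# NCV.best_params accumulates the best inner-loop values into a dict of
# lists, so it can be passed directly as a parameter grid.
final_search = GridSearchCV(estimator=RandomForestRegressor(),
                            param_grid=NCV.best_params,
                            cv=5)
final_search.fit(X, y)

print(final_search.best_params_)
print(final_search.best_score_)
```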

--------------------------------------------------------------------------------
/nested_cv/__init__.py:
--------------------------------------------------------------------------------
from nested_cv.nested_cv import NestedCV

--------------------------------------------------------------------------------
/nested_cv/__pycache__/__init__.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/casper-hansen/Nested-Cross-Validation/ebb942addc01cf53d240ac19c81bbb3875c74cd3/nested_cv/__pycache__/__init__.cpython-37.pyc

--------------------------------------------------------------------------------
/nested_cv/nested_cv.py:
--------------------------------------------------------------------------------
import logging as log
import pandas as pd
import numpy as np
import numbers
from matplotlib import pyplot as plt
from sklearn.model_selection import KFold, ParameterGrid, ParameterSampler
from sklearn.metrics import mean_squared_error
from sklearn.feature_selection import RFECV
from sklearn.utils.multiclass import type_of_target
from joblib import Parallel, delayed


class NestedCV():
    '''A general class to handle nested cross-validation for any estimator that
    implements the scikit-learn estimator interface.

    Parameters
    ----------
    model : estimator
        An estimator that implements the scikit-learn estimator interface.

    params_grid : dict
        A dict that contains the hyperparameters for the model.

    outer_cv : int or cv splitter class (e.g. KFold, StratifiedKFold etc.)
        Outer splitting strategy. If an int, KFold is used by default. For more
        information, visit https://scikit-learn.org/stable/modules/classes.html#splitter-classes.

    inner_cv : int or cv splitter class (e.g. KFold, StratifiedKFold etc.)
        Inner splitting strategy. If an int, KFold is used by default. For more
        information, visit https://scikit-learn.org/stable/modules/classes.html#splitter-classes.

    n_jobs : int
        Number of jobs to run in parallel.

    cv_options : dict, default = {}
        Nested cross-validation options, check docs for details.

        metric : callable from sklearn.metrics, default = mean_squared_error
            A scoring metric used to score each model.

        metric_score_indicator_lower : boolean, default = True
            Choose whether a lower score or a higher score is better for the
            metric calculation; `True` means a lower score is better.

        sqrt_of_score : boolean, default = False
            Whether or not the square root should be taken of the score.

        randomized_search : boolean, default = False
            Whether to use randomized search instead of grid search from sklearn.

        randomized_search_iter : int, default = 10
            Number of iterations for randomized search.

        recursive_feature_elimination : boolean, default = False
            Whether to do recursive feature elimination (RFE) for each set of
            different hyperparameters in the innermost loop of the fit function.

        rfe_n_features : int, default = 1
            If recursive_feature_elimination is enabled, the number of features to select.

        predict_proba : boolean, default = False
            If true, predict class probabilities instead of class labels.

        multiclass_average : string, default = 'binary'
            For some classification metrics with a multiclass prediction, you need
            to specify an average other than 'binary'.
    '''
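
    # Illustrative usage (mirrors the example notebook; `X` and `y` are your
    # feature matrix and target):
    #
    #     from sklearn.ensemble import RandomForestClassifier
    #     from sklearn.metrics import roc_auc_score
    #
    #     ncv = NestedCV(model=RandomForestClassifier(),
    #                    params_grid={'max_depth': [3, None]},
    #                    outer_cv=5, inner_cv=5,
    #                    cv_options={'metric': roc_auc_score,
    #                                'metric_score_indicator_lower': False,
    #                                'predict_proba': True})
    #     ncv.fit(X=X, y=y)
    #     ncv.outer_scores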

    def __init__(self, model, params_grid, outer_cv=None, inner_cv=None,
                 n_jobs=1, cv_options={}, outer_kfolds=None, inner_kfolds=None):
        self.model = model
        self.params_grid = params_grid
        self.outer_cv = outer_cv
        self.inner_cv = inner_cv
        self.n_jobs = n_jobs
        self.metric = cv_options.get('metric', mean_squared_error)
        self.metric_score_indicator_lower = cv_options.get(
            'metric_score_indicator_lower', True)
        self.sqrt_of_score = cv_options.get('sqrt_of_score', False)
        self.randomized_search = cv_options.get('randomized_search', False)
        self.randomized_search_iter = cv_options.get(
            'randomized_search_iter', 10)
        self.recursive_feature_elimination = cv_options.get(
            'recursive_feature_elimination', False)
        # Default of 1 matches the docstring and RFECV's minimum
        self.rfe_n_features = cv_options.get(
            'rfe_n_features', 1)
        self.predict_proba = cv_options.get(
            'predict_proba', False)
        self.multiclass_average = cv_options.get(
            'multiclass_average', 'binary')
        self.outer_scores = []
        self.best_params = {}
        self.best_inner_score_list = []
        self.variance = []

        if outer_kfolds is not None or inner_kfolds is not None:
            raise NameError('outer_kfolds and inner_kfolds have been renamed to '
                            'outer_cv and inner_cv, please replace the variables '
                            'in your code. They will be removed in a future release')

    # Take the square root of the score if sqrt_of_score is enabled,
    # otherwise return the score unchanged
    def _transform_score_format(self, scoreValue):
        if self.sqrt_of_score:
            return np.sqrt(scoreValue)
        return scoreValue

    # Convert a list of parameter dicts to a dict with list values, so it can
    # be used as a parameter grid for further parameter tuning
    def _score_to_best_params(self, best_inner_params_list):
        params_dict = {}
        for best_inner_params in best_inner_params_list:
            for key, value in best_inner_params.items():
                if key in params_dict:
                    if value not in params_dict[key]:
                        params_dict[key].append(value)
                else:
                    params_dict[key] = [value]
        return params_dict
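
    # Illustrative example: given the best inner params from three outer folds,
    #     [{'max_depth': 3, 'n_estimators': 100},
    #      {'max_depth': None, 'n_estimators': 100},
    #      {'max_depth': 3, 'n_estimators': 200}]
    # _score_to_best_params returns
    #     {'max_depth': [3, None], 'n_estimators': [100, 200]},
    # which can be passed directly as a parameter grid to, e.g., GridSearchCV.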

    # Handle recursive feature elimination on the outer training split, then
    # apply the selected features to both splits
    def _fit_recursive_feature_elimination(self, X_train_outer, y_train_outer, X_test_outer):
        rfe = RFECV(estimator=self.model,
                    min_features_to_select=self.rfe_n_features,
                    cv=self.inner_cv, n_jobs=self.n_jobs)
        rfe.fit(X_train_outer, y_train_outer)

        log.info('Best number of features was: {0}'.format(rfe.n_features_))

        # Assign selected features to data
        return rfe.transform(X_train_outer), rfe.transform(X_test_outer)

    def _predict_and_score(self, X_test, y_test):
        # Use type_of_target to decide how predicted probabilities are handled
        if self.predict_proba:
            y_type = type_of_target(y_test)
            if y_type == 'binary':
                pred = self.model.predict_proba(X_test)[:, 1]
            else:
                pred = self.model.predict_proba(X_test)
        else:
            pred = self.model.predict(X_test)

        if self.multiclass_average == 'binary':
            return self.metric(y_test, pred), pred
        else:
            return self.metric(y_test, pred, average=self.multiclass_average), pred

    # Pick the best (score, parameters) pair from the inner search results,
    # respecting whether lower or higher scores are better
    def _best_of_results(self, results):
        best_score = None
        best_parameters = {}

        for score_parameter in results:
            if self.metric_score_indicator_lower:
                if best_score is None or score_parameter[0] < best_score:
                    best_score = score_parameter[0]
                    best_parameters = score_parameter[1]
            else:
                if best_score is None or score_parameter[0] > best_score:
                    best_score = score_parameter[0]
                    best_parameters = score_parameter[1]

        return best_score, best_parameters

    def fit(self, X, y):
        '''A method to fit nested cross-validation

        Parameters
        ----------
        X : pandas dataframe (rows, columns)
            Training dataframe, where rows is the total number of observations
            and columns is the total number of features

        y : pandas dataframe
            Output dataframe, also called the output variable. y is what you
            want to predict.

        Returns
        -------
        The values are not returned directly; instead, they are accessible from
        the class object itself. You should be able to access:

        variance
            Model variance by numpy.var()

        outer_scores
            A list of the outer scores, from the outer cross-validation

        best_inner_score_list
            Best inner scores for each outer loop

        best_params
            All best params from each inner loop accumulated in a dict

        best_inner_params_list
            Best inner params for each outer loop as an array of dictionaries
        '''

        log.debug(
            '\n{0} <-- Running this model now'.format(type(self.model).__name__))

        self.X = X
        self.y = y

        # If a pandas DataFrame or Series, convert to a numpy array
        if isinstance(X, pd.DataFrame) or isinstance(X, pd.Series):
            X = X.to_numpy()
        if isinstance(y, pd.DataFrame) or isinstance(y, pd.Series):
            y = y.to_numpy()

        if self.randomized_search:
            param_func = ParameterSampler(param_distributions=self.params_grid,
                                          n_iter=self.randomized_search_iter)
        else:
            param_func = ParameterGrid(param_grid=self.params_grid)

        if (isinstance(self.outer_cv, numbers.Number) and
                isinstance(self.inner_cv, numbers.Number)):
            outer_cv = KFold(n_splits=self.outer_cv, shuffle=True)
            inner_cv = KFold(n_splits=self.inner_cv, shuffle=True)
        else:
            outer_cv = self.outer_cv
            inner_cv = self.inner_cv

        outer_scores = []
        variance = []
        # TODO: combine these two lists into a single list of (params, score) pairs
        best_inner_params_list = []
        best_inner_score_list = []

        # Split X and y into K partitions for the outer CV
        for (i, (train_index, test_index)) in enumerate(outer_cv.split(X, y)):
            log.debug(
                '\n{0}/{1} <-- Current outer fold'.format(i + 1, self.outer_cv))
            X_train_outer, X_test_outer = X[train_index], X[test_index]
            y_train_outer, y_test_outer = y[train_index], y[test_index]
            best_inner_params = {}
            best_inner_score = None
            search_scores = []

            # Split X_train_outer and y_train_outer into K partitions for the inner CV
            for (j, (train_index_inner, test_index_inner)) in enumerate(inner_cv.split(X_train_outer, y_train_outer)):
                log.debug(
                    '\n\t{0}/{1} <-- Current inner fold'.format(j + 1, self.inner_cv))
                X_train_inner, X_test_inner = X_train_outer[train_index_inner], X_train_outer[test_index_inner]
                y_train_inner, y_test_inner = y_train_outer[train_index_inner], y_train_outer[test_index_inner]

                if self.recursive_feature_elimination:
                    X_train_inner, X_test_inner = self._fit_recursive_feature_elimination(
                        X_train_inner, y_train_inner, X_test_inner)

                def _parallel_fitting(X_train_inner, X_test_inner, y_train_inner, y_test_inner, param_dict):
                    log.debug(
                        '\n\tFitting these parameters:\n\t{0}'.format(param_dict))
                    # Set hyperparameters, train model on inner split, predict results.
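                    # Note (assumption about joblib's default process-based
                    # backend): each parallel task receives a pickled copy of
                    # this closure, including self.model, so the concurrent
                    # set_params/fit calls below do not clash; with n_jobs=1
                    # the tasks simply run sequentially on the same object.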
                    self.model.set_params(**param_dict)

                    # Fit model with current hyperparameters and score it
                    self.model.fit(X_train_inner, y_train_inner)

                    # Predict and score model
                    inner_grid_score, inner_pred = self._predict_and_score(X_test_inner, y_test_inner)

                    # Cleanup for Keras
                    if type(self.model).__name__ in ('KerasRegressor', 'KerasClassifier'):
                        from keras import backend as K
                        K.clear_session()

                    return self._transform_score_format(inner_grid_score), param_dict

                results = Parallel(n_jobs=self.n_jobs)(delayed(_parallel_fitting)(
                    X_train_inner, X_test_inner,
                    y_train_inner, y_test_inner,
                    param_dict=parameters)
                    for parameters in param_func)
                search_scores.extend(results)

            best_inner_score, best_inner_params = self._best_of_results(search_scores)

            best_inner_params_list.append(best_inner_params)
            best_inner_score_list.append(best_inner_score)

            # Fit the best hyperparameters from one of the K inner loops
            self.model.set_params(**best_inner_params)
            self.model.fit(X_train_outer, y_train_outer)

            # Get score and prediction
            score, pred = self._predict_and_score(X_test_outer, y_test_outer)
            outer_scores.append(self._transform_score_format(score))

            # Append variance
            variance.append(np.var(pred, ddof=1))

            log.debug('\nResults for outer fold:\nBest inner parameters were: {0}'.format(
                best_inner_params_list[i]))
            log.debug('Outer score: {0}'.format(outer_scores[i]))
            log.debug('Inner score: {0}'.format(best_inner_score_list[i]))

        self.variance = variance
        self.outer_scores = outer_scores
        self.best_inner_score_list = best_inner_score_list
        self.best_params = self._score_to_best_params(best_inner_params_list)
        self.best_inner_params_list = best_inner_params_list

    # Method to show the score vs. variance chart. You can run it only after
    # fitting the model.
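    # Illustrative usage (assumes an instance named ncv):
    #     ncv.fit(X, y)
    #     ncv.score_vs_variance_plot()
    #     plt.show()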
    def score_vs_variance_plot(self):
        # Plot score vs variance
        plt.figure()
        plt.subplot(211)

        variance_plot, = plt.plot(self.variance, color='b')
        score_plot, = plt.plot(self.outer_scores, color='r')

        plt.legend([variance_plot, score_plot],
                   ["Variance", "Score"],
                   bbox_to_anchor=(0, .4, .5, 0))

        plt.title("{0}: Score VS Variance".format(type(self.model).__name__),
                  x=.5, y=1.1, fontsize="15")

--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
from setuptools import setup
from os import path

this_directory = path.abspath(path.dirname(__file__))
with open(path.join(this_directory, 'README.md'), encoding='utf-8') as f:
    long_description = f.read()

setup(
    name='nested_cv',
    packages=['nested_cv'],
    version='0.916',
    license='MIT',
    description='A general package to handle nested cross-validation for any estimator that implements the scikit-learn estimator interface.',
    author_email='ahmedmagdi@outlook.com',
    maintainer_email='casperbh.96@gmail.com',
    long_description=long_description,
    long_description_content_type='text/markdown',
    url='https://github.com/casperbh96/Nested-Cross-Validation',
    download_url='https://github.com/user/reponame/archive/v_01.tar.gz',
    keywords=['ml', 'xgboost', 'numpy', 'scikit-learn', 'pandas'],
    install_requires=[
        'pandas',
        'matplotlib',
        'scikit-learn',
        'numpy',
    ],
    classifiers=[
        'Development Status :: 4 - Beta',
        'Intended Audience :: Developers',
        'Topic :: Software Development :: Libraries',
        'License :: OSI Approved :: MIT License',
        'Programming Language :: Python :: 3',
        'Programming Language :: Python :: 3.4',
        'Programming Language :: Python :: 3.5',
        'Programming Language :: Python :: 3.6',
        'Programming Language :: Python :: 3.7',
    ],
)

--------------------------------------------------------------------------------