├── .gitignore
├── Example Notebook - NestedCV.ipynb
├── LICENSE
├── README.md
├── nested_cv
│   ├── __init__.py
│   ├── __pycache__
│   │   └── __init__.cpython-37.pyc
│   └── nested_cv.py
└── setup.py

--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
.DS_Store
nested_cv/__pycache__/nested_cv.cpython-37.pyc
.ipynb_checkpoints/Example Notebook - NestedCV-checkpoint.ipynb

--------------------------------------------------------------------------------
/Example Notebook - NestedCV.ipynb:
--------------------------------------------------------------------------------
{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "from nested_cv import NestedCV\n",
    "\n",
    "import pandas as pd\n",
    "import numpy as np\n",
    "from sklearn.datasets import load_boston, load_iris, load_breast_cancer\n",
    "from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier\n",
    "from sklearn.model_selection import KFold\n",
    "\n",
    "# When using Random Search, we get a user warning with this small number of hyperparameters\n",
    "# Suppress it\n",
    "import warnings\n",
    "warnings.simplefilter(action='ignore', category=UserWarning)\n",
    "warnings.simplefilter(action='ignore', category=FutureWarning)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Regression Example"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[4.2847157727008085,\n",
       " 3.4072642252185354,\n",
       " 2.8051399371930317,\n",
       " 2.2417498868498766,\n",
       " 3.4797203328262443]"
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "boston = load_boston()\n",
    "X = boston.data\n",
    "y = boston.target\n",
    "\n",
    "# Define a parameter grid\n",
    "param_grid = {\n",
    "    'max_depth': [3, 7, 10, None],\n",
    "    'n_estimators': [100, 200],\n",
    "    'min_samples_split': [2, 3, 5, 7, 10]\n",
    "}\n",
    "\n",
    "# Either specify a strategy or a number\n",
    "# Here we choose a strategy\n",
    "outer_cv = KFold(n_splits=5,\n",
    "                 shuffle=True,\n",
    "                 random_state=123)\n",
    "inner_cv = KFold(n_splits=5,\n",
    "                 shuffle=True,\n",
    "                 random_state=123)\n",
    "\n",
    "NCV = NestedCV(model=RandomForestRegressor(), params_grid=param_grid,\n",
    "               outer_cv=outer_cv, inner_cv=inner_cv, n_jobs=-1,\n",
    "               cv_options={'sqrt_of_score': True,\n",
    "                           'recursive_feature_elimination': False,\n",
    "                           'rfe_n_features': 2})\n",
    "NCV.fit(X=X, y=y)\n",
    "\n",
    "NCV.outer_scores"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Classification Example"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Breast Cancer (2 Classes)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[0.9912337662337662,\n",
       " 0.974025974025974,\n",
       " 0.9803240740740741,\n",
       " 0.9963247577681257,\n",
       " 0.9969551282051282]"
      ]
     },
     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "from sklearn.metrics import roc_auc_score\n",
    "\n",
    "# Binary classification\n",
    "cancer = load_breast_cancer()\n",
    "X = cancer.data\n",
    "y = cancer.target\n",
    "\n",
    "# Define a parameter grid\n",
    "param_grid = {\n",
    "    'max_depth': [3, None],\n",
    "    'n_estimators': [10, 20]\n",
    "}\n",
    "\n",
    "NCV = NestedCV(model=RandomForestClassifier(), params_grid=param_grid,\n",
    "               outer_cv=5, inner_cv=5,\n",
    "               cv_options={'metric': roc_auc_score,\n",
    "                           'metric_score_indicator_lower': False,\n",
    "                           'randomized_search_iter': 30,\n",
    "                           'predict_proba': True})\n",
    "NCV.fit(X=X, y=y)\n",
    "\n",
    "NCV.outer_scores"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Iris (3 Classes)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[0.9333333333333333,\n",
       " 0.9333333333333333,\n",
       " 0.9333333333333333,\n",
       " 0.9333333333333333,\n",
       " 0.9666666666666667]"
      ]
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "from sklearn.metrics import f1_score\n",
    "\n",
    "# Multiclass classification\n",
    "iris = load_iris()\n",
    "X = iris.data\n",
    "y = iris.target\n",
    "\n",
    "# Define a parameter grid\n",
    "param_grid = {\n",
    "    'max_depth': [3, None],\n",
    "    'n_estimators': [10, 20]\n",
    "}\n",
    "\n",
    "NCV = NestedCV(model=RandomForestClassifier(), params_grid=param_grid,\n",
    "               outer_cv=5, inner_cv=5,\n",
    "               cv_options={'metric': f1_score,\n",
    "                           'metric_score_indicator_lower': False,\n",
    "                           'randomized_search_iter': 30,\n",
    "                           'predict_proba': False,\n",
    "                           'multiclass_average': 'micro'})\n",
    "NCV.fit(X=X, y=y)\n",
    "\n",
    "NCV.outer_scores"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.3"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}

--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
MIT License

Copyright (c) 2019 Casper Bøgeskov Hansen

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.

--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# Nested-Cross-Validation
This repository implements a general nested cross-validation function, ready to use with any estimator that implements the scikit-learn estimator interface.
## Installing the package
You can find the package on [pypi](https://pypi.org/project/nested-cv/)* and install it via pip by using the following command:
```bash
pip install nested-cv
```
You can also install it from the wheel file on the [Releases](https://github.com/casperbh96/Nested-Cross-Validation/releases) page.

\* We push updates gradually; pull the master branch from GitHub if you want the absolute latest changes.

## Usage
Be mindful of the options that are available for NestedCV. Some cross-validation options are defined in a dictionary `cv_options`.
This package works with any estimator that implements a scikit-learn wrapper, e.g. XGBoost, LightGBM, KerasRegressor, KerasClassifier, etc.

-->**[See notebook for more examples](https://github.com/casperbh96/Nested-Cross-Validation/blob/master/Example%20Notebook%20-%20NestedCV.ipynb)**

### Simple
Here is a single example using Random Forest; check out the example notebook for more.
```python
from nested_cv import NestedCV
from sklearn.ensemble import RandomForestRegressor

# Define a parameter grid
param_grid = {
    'max_depth': [3, None],
    'n_estimators': [10]
}

NCV = NestedCV(model=RandomForestRegressor(), params_grid=param_grid,
               outer_cv=5, inner_cv=5, n_jobs=-1,
               cv_options={'sqrt_of_score': True,
                           'recursive_feature_elimination': True,
                           'rfe_n_features': 2})
NCV.fit(X=X, y=y)
NCV.outer_scores
```

### NestedCV Parameters
| Name | type | description |
| :------------- |:-------------| :-----|
| model | estimator | An estimator that implements the scikit-learn estimator interface. |
| params_grid | dictionary "dict" | A dict of hyperparameters for the model. |
| outer_cv | int or cv splitter class | Outer splitting strategy. If an int, KFold is used by default. |
| inner_cv | int or cv splitter class | Inner splitting strategy. If an int, KFold is used by default. |
| cv_options | dictionary "dict" | [Next section](#cv_options-value-options) |
| n_jobs | int | Number of jobs for joblib to run in parallel (multiprocessing). |

### `cv_options` value options
**`metric` :** Callable from sklearn.metrics, default = mean_squared_error

&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;A scoring metric used to score each model

**`metric_score_indicator_lower` :** boolean, default = True

&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;Whether a lower score is better for the metric calculation; set to False if a higher score is better

**`sqrt_of_score` :** boolean, default = False

&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;Whether or not the square root should be taken of the score

**`randomized_search` :** boolean, default = False

&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;Whether to use randomized search instead of grid search from sklearn

**`randomized_search_iter` :** int, default = 10

&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;Number of iterations for randomized search

**`recursive_feature_elimination` :** boolean, default = False

&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;Whether to do recursive feature elimination

**`rfe_n_features` :** int, default = 1

&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;If recursive_feature_elimination is enabled, the number of features to select

**`predict_proba` :** boolean, default = False

&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;If true, predict class probabilities instead of class labels

**`multiclass_average` :** string, default = 'binary'

&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;For some classification metrics with a multiclass prediction, you need to specify an average other than 'binary'

### Returns
**`variance` :** Model variance by numpy.var()

**`outer_scores` :** A list of the outer scores, from the outer cross-validation

**`best_inner_score_list` :** A list of best inner scores for each outer loop

**`best_params` :** All best params from each inner loop accumulated in a dict

**`best_inner_params_list` :** Best inner params for each outer loop as an array of dictionaries

## How to use the output?
We suggest looking at the best hyperparameters together with the score for each outer loop. Look at how stable the model appears to be in a nested cross-validation setting. If the outer score changes a lot, that might indicate an unstable model. In that case, start over with making a new model.

### After Nested Cross-Validation?
If the results from nested cross-validation are stable: run a normal cross-validation with the same procedure as in nested cross-validation, i.e. if you used feature selection in nested cross-validation, you should also do that in normal cross-validation. Use the best parameters as input to your normal cross-validation (see the sketch at the end of this README).

## Limitations
- [XGBoost](https://xgboost.readthedocs.io/en/latest/) implements an `early_stopping_rounds` parameter, which cannot be used in this implementation. Other similar parameters might not work in combination with this implementation either. The function would have to be adapted to use special parameters like that.

## What did we learn?
- Using [Scikit-Learn](https://github.com/scikit-learn/scikit-learn) leads to a faster implementation, since the Scikit-Learn community has implemented many functions that do much of the work.
- We have learned and applied this package in our main project about [House Price Prediction](https://github.com/casperbh96/house-price-prediction).

## Why use Nested Cross-Validation?
Controlling the bias-variance tradeoff is an essential and important task in machine learning, as indicated by [[Cawley and Talbot, 2010]](http://jmlr.csail.mit.edu/papers/volume11/cawley10a/cawley10a.pdf). Many articles indicate that this is possible by the use of nested cross-validation, among them [Varma and Simon, 2006](https://www.ncbi.nlm.nih.gov/pmc/articles/PMC1397873/pdf/1471-2105-7-91.pdf). Other interesting literature on nested cross-validation includes [[Varoquaux et al., 2017]](https://arxiv.org/pdf/1606.05201.pdf) and [[Krstajic et al., 2014]](https://jcheminf.biomedcentral.com/track/pdf/10.1186/1758-2946-6-10).
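
## Follow-up sketch: reusing the best parameters
As mentioned in "After Nested Cross-Validation?", the collected best parameters can feed a normal cross-validated search. The snippet below is a minimal, illustrative sketch (not part of the package); it assumes the `X`, `y` and fitted `NCV` object from the Simple example above.
```python
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV

# NCV.best_params accumulates the best inner-loop values into a dict of
# lists, so it can be passed directly as a parameter grid.
final_search = GridSearchCV(estimator=RandomForestRegressor(),
                            param_grid=NCV.best_params,
                            cv=5)
final_search.fit(X, y)

print(final_search.best_params_)
print(final_search.best_score_)
```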

--------------------------------------------------------------------------------
/nested_cv/__init__.py:
--------------------------------------------------------------------------------
from nested_cv.nested_cv import NestedCV

--------------------------------------------------------------------------------
/nested_cv/__pycache__/__init__.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/casper-hansen/Nested-Cross-Validation/ebb942addc01cf53d240ac19c81bbb3875c74cd3/nested_cv/__pycache__/__init__.cpython-37.pyc

--------------------------------------------------------------------------------
/nested_cv/nested_cv.py:
--------------------------------------------------------------------------------
import logging as log
import pandas as pd
import numpy as np
import numbers
from matplotlib import pyplot as plt
from sklearn.model_selection import KFold, ParameterGrid, ParameterSampler
from sklearn.metrics import mean_squared_error
from sklearn.feature_selection import RFECV
from sklearn.utils.multiclass import type_of_target
from joblib import Parallel, delayed


class NestedCV():
    '''A general class to handle nested cross-validation for any estimator that
    implements the scikit-learn estimator interface.

    Parameters
    ----------
    model : estimator
        An estimator that implements the scikit-learn estimator interface.

    params_grid : dict
        A dict that contains the hyperparameters for the model.

    outer_cv : int or cv splitter class (e.g. KFold, StratifiedKFold etc.)
        Outer splitting strategy. If an int, KFold is used by default. For more
        information, visit https://scikit-learn.org/stable/modules/classes.html#splitter-classes.

    inner_cv : int or cv splitter class (e.g. KFold, StratifiedKFold etc.)
        Inner splitting strategy. If an int, KFold is used by default. For more
        information, visit https://scikit-learn.org/stable/modules/classes.html#splitter-classes.

    n_jobs : int
        Number of jobs to run in parallel.

    cv_options : dict, default = {}
        Nested cross-validation options, check docs for details.

        metric : callable from sklearn.metrics, default = mean_squared_error
            A scoring metric used to score each model.

        metric_score_indicator_lower : boolean, default = True
            Choose whether a lower score or a higher score is better for the
            metric calculation; `True` means a lower score is better.

        sqrt_of_score : boolean, default = False
            Whether or not the square root should be taken of the score.

        randomized_search : boolean, default = False
            Whether to use randomized search instead of grid search from sklearn.

        randomized_search_iter : int, default = 10
            Number of iterations for randomized search.

        recursive_feature_elimination : boolean, default = False
            Whether to do recursive feature elimination (RFE) for each set of
            different hyperparameters in the innermost loop of the fit function.

        rfe_n_features : int, default = 1
            If recursive_feature_elimination is enabled, the number of features to select.

        predict_proba : boolean, default = False
            If true, predict class probabilities instead of class labels.

        multiclass_average : string, default = 'binary'
            For some classification metrics with a multiclass prediction, you need
            to specify an average other than 'binary'.
    '''
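
    # Illustrative usage (mirrors the example notebook; `X` and `y` are your
    # feature matrix and target):
    #
    #     from sklearn.ensemble import RandomForestClassifier
    #     from sklearn.metrics import roc_auc_score
    #
    #     ncv = NestedCV(model=RandomForestClassifier(),
    #                    params_grid={'max_depth': [3, None]},
    #                    outer_cv=5, inner_cv=5,
    #                    cv_options={'metric': roc_auc_score,
    #                                'metric_score_indicator_lower': False,
    #                                'predict_proba': True})
    #     ncv.fit(X=X, y=y)
    #     ncv.outer_scores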

    def __init__(self, model, params_grid, outer_cv=None, inner_cv=None,
                 n_jobs=1, cv_options={}, outer_kfolds=None, inner_kfolds=None):
        self.model = model
        self.params_grid = params_grid
        self.outer_cv = outer_cv
        self.inner_cv = inner_cv
        self.n_jobs = n_jobs
        self.metric = cv_options.get('metric', mean_squared_error)
        self.metric_score_indicator_lower = cv_options.get(
            'metric_score_indicator_lower', True)
        self.sqrt_of_score = cv_options.get('sqrt_of_score', False)
        self.randomized_search = cv_options.get('randomized_search', False)
        self.randomized_search_iter = cv_options.get(
            'randomized_search_iter', 10)
        self.recursive_feature_elimination = cv_options.get(
            'recursive_feature_elimination', False)
        # Default of 1 matches the docstring and RFECV's minimum
        self.rfe_n_features = cv_options.get(
            'rfe_n_features', 1)
        self.predict_proba = cv_options.get(
            'predict_proba', False)
        self.multiclass_average = cv_options.get(
            'multiclass_average', 'binary')
        self.outer_scores = []
        self.best_params = {}
        self.best_inner_score_list = []
        self.variance = []

        if outer_kfolds is not None or inner_kfolds is not None:
            raise NameError('outer_kfolds and inner_kfolds have been renamed to '
                            'outer_cv and inner_cv, please replace the variables '
                            'in your code. They will be removed in a future release')

    # Take the square root of the score if sqrt_of_score is enabled,
    # otherwise return the score unchanged
    def _transform_score_format(self, scoreValue):
        if self.sqrt_of_score:
            return np.sqrt(scoreValue)
        return scoreValue

    # Convert a list of parameter dicts to a dict with list values, so it can
    # be used as a parameter grid for further parameter tuning
    def _score_to_best_params(self, best_inner_params_list):
        params_dict = {}
        for best_inner_params in best_inner_params_list:
            for key, value in best_inner_params.items():
                if key in params_dict:
                    if value not in params_dict[key]:
                        params_dict[key].append(value)
                else:
                    params_dict[key] = [value]
        return params_dict
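
    # Illustrative example: given the best inner params from three outer folds,
    #     [{'max_depth': 3, 'n_estimators': 100},
    #      {'max_depth': None, 'n_estimators': 100},
    #      {'max_depth': 3, 'n_estimators': 200}]
    # _score_to_best_params returns
    #     {'max_depth': [3, None], 'n_estimators': [100, 200]},
    # which can be passed directly as a parameter grid to, e.g., GridSearchCV.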

    # Handle recursive feature elimination on the outer training split, then
    # apply the selected features to both splits
    def _fit_recursive_feature_elimination(self, X_train_outer, y_train_outer, X_test_outer):
        rfe = RFECV(estimator=self.model,
                    min_features_to_select=self.rfe_n_features,
                    cv=self.inner_cv, n_jobs=self.n_jobs)
        rfe.fit(X_train_outer, y_train_outer)

        log.info('Best number of features was: {0}'.format(rfe.n_features_))

        # Assign selected features to data
        return rfe.transform(X_train_outer), rfe.transform(X_test_outer)

    def _predict_and_score(self, X_test, y_test):
        # Use type_of_target to decide how predicted probabilities are handled
        if self.predict_proba:
            y_type = type_of_target(y_test)
            if y_type == 'binary':
                pred = self.model.predict_proba(X_test)[:, 1]
            else:
                pred = self.model.predict_proba(X_test)
        else:
            pred = self.model.predict(X_test)

        if self.multiclass_average == 'binary':
            return self.metric(y_test, pred), pred
        else:
            return self.metric(y_test, pred, average=self.multiclass_average), pred

    # Pick the best (score, parameters) pair from the inner search results,
    # respecting whether lower or higher scores are better
    def _best_of_results(self, results):
        best_score = None
        best_parameters = {}

        for score_parameter in results:
            if self.metric_score_indicator_lower:
                if best_score is None or score_parameter[0] < best_score:
                    best_score = score_parameter[0]
                    best_parameters = score_parameter[1]
            else:
                if best_score is None or score_parameter[0] > best_score:
                    best_score = score_parameter[0]
                    best_parameters = score_parameter[1]

        return best_score, best_parameters

    def fit(self, X, y):
        '''A method to fit nested cross-validation

        Parameters
        ----------
        X : pandas dataframe (rows, columns)
            Training dataframe, where rows is the total number of observations
            and columns is the total number of features

        y : pandas dataframe
            Output dataframe, also called the output variable. y is what you
            want to predict.

        Returns
        -------
        The values are not returned directly; instead, they are accessible from
        the class object itself. You should be able to access:

        variance
            Model variance by numpy.var()

        outer_scores
            A list of the outer scores, from the outer cross-validation

        best_inner_score_list
            Best inner scores for each outer loop

        best_params
            All best params from each inner loop accumulated in a dict

        best_inner_params_list
            Best inner params for each outer loop as an array of dictionaries
        '''

        log.debug(
            '\n{0} <-- Running this model now'.format(type(self.model).__name__))

        self.X = X
        self.y = y

        # If a pandas DataFrame or Series, convert to a numpy array
        if isinstance(X, pd.DataFrame) or isinstance(X, pd.Series):
            X = X.to_numpy()
        if isinstance(y, pd.DataFrame) or isinstance(y, pd.Series):
            y = y.to_numpy()

        if self.randomized_search:
            param_func = ParameterSampler(param_distributions=self.params_grid,
                                          n_iter=self.randomized_search_iter)
        else:
            param_func = ParameterGrid(param_grid=self.params_grid)

        if (isinstance(self.outer_cv, numbers.Number) and
                isinstance(self.inner_cv, numbers.Number)):
            outer_cv = KFold(n_splits=self.outer_cv, shuffle=True)
            inner_cv = KFold(n_splits=self.inner_cv, shuffle=True)
        else:
            outer_cv = self.outer_cv
            inner_cv = self.inner_cv

        outer_scores = []
        variance = []
        # TODO: combine these two lists into a single list of (params, score) pairs
        best_inner_params_list = []
        best_inner_score_list = []

        # Split X and y into K partitions for the outer CV
        for (i, (train_index, test_index)) in enumerate(outer_cv.split(X, y)):
            log.debug(
                '\n{0}/{1} <-- Current outer fold'.format(i + 1, self.outer_cv))
            X_train_outer, X_test_outer = X[train_index], X[test_index]
            y_train_outer, y_test_outer = y[train_index], y[test_index]
            best_inner_params = {}
            best_inner_score = None
            search_scores = []

            # Split X_train_outer and y_train_outer into K partitions for the inner CV
            for (j, (train_index_inner, test_index_inner)) in enumerate(inner_cv.split(X_train_outer, y_train_outer)):
                log.debug(
                    '\n\t{0}/{1} <-- Current inner fold'.format(j + 1, self.inner_cv))
                X_train_inner, X_test_inner = X_train_outer[train_index_inner], X_train_outer[test_index_inner]
                y_train_inner, y_test_inner = y_train_outer[train_index_inner], y_train_outer[test_index_inner]

                if self.recursive_feature_elimination:
                    X_train_inner, X_test_inner = self._fit_recursive_feature_elimination(
                        X_train_inner, y_train_inner, X_test_inner)

                def _parallel_fitting(X_train_inner, X_test_inner, y_train_inner, y_test_inner, param_dict):
                    log.debug(
                        '\n\tFitting these parameters:\n\t{0}'.format(param_dict))
                    # Set hyperparameters, train model on inner split, predict results.
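                    # Note (assumption about joblib's default process-based
                    # backend): each parallel task receives a pickled copy of
                    # this closure, including self.model, so the concurrent
                    # set_params/fit calls below do not clash; with n_jobs=1
                    # the tasks simply run sequentially on the same object.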
                    self.model.set_params(**param_dict)

                    # Fit model with current hyperparameters and score it
                    self.model.fit(X_train_inner, y_train_inner)

                    # Predict and score model
                    inner_grid_score, inner_pred = self._predict_and_score(X_test_inner, y_test_inner)

                    # Cleanup for Keras
                    if type(self.model).__name__ in ('KerasRegressor', 'KerasClassifier'):
                        from keras import backend as K
                        K.clear_session()

                    return self._transform_score_format(inner_grid_score), param_dict

                results = Parallel(n_jobs=self.n_jobs)(delayed(_parallel_fitting)(
                    X_train_inner, X_test_inner,
                    y_train_inner, y_test_inner,
                    param_dict=parameters)
                    for parameters in param_func)
                search_scores.extend(results)

            best_inner_score, best_inner_params = self._best_of_results(search_scores)

            best_inner_params_list.append(best_inner_params)
            best_inner_score_list.append(best_inner_score)

            # Fit the best hyperparameters from one of the K inner loops
            self.model.set_params(**best_inner_params)
            self.model.fit(X_train_outer, y_train_outer)

            # Get score and prediction
            score, pred = self._predict_and_score(X_test_outer, y_test_outer)
            outer_scores.append(self._transform_score_format(score))

            # Append variance
            variance.append(np.var(pred, ddof=1))

            log.debug('\nResults for outer fold:\nBest inner parameters were: {0}'.format(
                best_inner_params_list[i]))
            log.debug('Outer score: {0}'.format(outer_scores[i]))
            log.debug('Inner score: {0}'.format(best_inner_score_list[i]))

        self.variance = variance
        self.outer_scores = outer_scores
        self.best_inner_score_list = best_inner_score_list
        self.best_params = self._score_to_best_params(best_inner_params_list)
        self.best_inner_params_list = best_inner_params_list

    # Method to show the score vs. variance chart. You can run it only after
    # fitting the model.
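    # Illustrative usage (assumes an instance named ncv):
    #     ncv.fit(X, y)
    #     ncv.score_vs_variance_plot()
    #     plt.show()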
    def score_vs_variance_plot(self):
        # Plot score vs variance
        plt.figure()
        plt.subplot(211)

        variance_plot, = plt.plot(self.variance, color='b')
        score_plot, = plt.plot(self.outer_scores, color='r')

        plt.legend([variance_plot, score_plot],
                   ["Variance", "Score"],
                   bbox_to_anchor=(0, .4, .5, 0))

        plt.title("{0}: Score VS Variance".format(type(self.model).__name__),
                  x=.5, y=1.1, fontsize="15")

--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
from setuptools import setup
from os import path

this_directory = path.abspath(path.dirname(__file__))
with open(path.join(this_directory, 'README.md'), encoding='utf-8') as f:
    long_description = f.read()

setup(
    name='nested_cv',
    packages=['nested_cv'],
    version='0.916',
    license='MIT',
    description='A general package to handle nested cross-validation for any estimator that implements the scikit-learn estimator interface.',
    author_email='ahmedmagdi@outlook.com',
    maintainer_email='casperbh.96@gmail.com',
    long_description=long_description,
    long_description_content_type='text/markdown',
    url='https://github.com/casperbh96/Nested-Cross-Validation',
    download_url='https://github.com/user/reponame/archive/v_01.tar.gz',
    keywords=['ml', 'xgboost', 'numpy', 'scikit-learn', 'pandas'],
    install_requires=[
        'pandas',
        'matplotlib',
        'scikit-learn',
        'numpy',
    ],
    classifiers=[
        'Development Status :: 4 - Beta',
        'Intended Audience :: Developers',
        'Topic :: Software Development :: Libraries',
        'License :: OSI Approved :: MIT License',
        'Programming Language :: Python :: 3',
        'Programming Language :: Python :: 3.4',
        'Programming Language :: Python :: 3.5',
        'Programming Language :: Python :: 3.6',
        'Programming Language :: Python :: 3.7',
    ],
)

--------------------------------------------------------------------------------