# ├── test.py
# ├── README.md
# └── stepwiseSelection.py
#
# /test.py:
# --------------------------------------------------------------------------------

# -*- coding: utf-8 -*-
"""
Created on Tue Apr 22 08:07:22 2019

@author: Sinan Talha Hascelik
"""

# Import libraries
import stepwiseSelection as ss
import pandas as pd

# Read data
df = pd.read_csv("test_data.csv")

# Dependent and independent variables
X = df.drop(columns="Exited")
y = df.Exited

# Run automated backward selection (returns the surviving columns and a
# human-readable log of every elimination iteration).
final_vars, iterations_logs = ss.backwardSelection(X, y, model_type="logistic")

# Write logs to .txt.
# BUG FIX: the original opened the file with open()/close(); a context manager
# guarantees the handle is closed even if write() raises.
with open("Iterations_logs.txt", "w+") as iterations_file:
    iterations_file.write(iterations_logs)

# --------------------------------------------------------------------------------
# /README.md:
# --------------------------------------------------------------------------------
#
# # Automated Stepwise Backward and Forward Selection
#
# This script performs automated stepwise backward and forward feature selection.
# You can easily apply it to DataFrames.
# The functions return not only the final features but also the elimination
# iterations, so you can track exactly what happened at each iteration.
#
# You can apply it to both linear and logistic problems. Eliminations can be
# applied with the Akaike information criterion (AIC), the Bayesian information
# criterion (BIC), R-squared (linear only), or adjusted R-squared (linear only).
# Also, you don't have to worry about varchar variables; the code will handle
# them for you.
#
# Enjoy the code!
# Required Libraries: pandas, numpy, statsmodels
#
# See more about stepwise regression : https://en.wikipedia.org/wiki/Stepwise_regression
#
# --------------------------------------------------------------------------------
# /stepwiseSelection.py:
# --------------------------------------------------------------------------------

#Copyright 2019 Sinan Talha Hascelik
#
#Licensed under the Apache License, Version 2.0 (the "License");
#you may not use this file except in compliance with the License.
#You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
#Unless required by applicable law or agreed to in writing, software
#distributed under the License is distributed on an "AS IS" BASIS,
#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
#See the License for the specific language governing permissions and
#limitations under the License.

import numpy as np
import pandas as pd
# BUG FIX: OLS and Logit are classes of statsmodels.api. Current statsmodels
# removed the capitalized aliases from statsmodels.formula.api (it only exposes
# the lowercase formula variants), so the original import breaks at call time.
import statsmodels.api as sm


def forwardSelection(X, y, model_type="linear", elimination_criteria="aic", varchar_process="dummy_dropfirst", sl=0.05):
    """
    Forward Selection is a function, based on regression models, that returns significant features and selection iterations.\n
    Required Libraries: pandas, numpy, statsmodels

    Parameters
    ----------
    X : Independent variables (Pandas Dataframe)\n
    y : Dependent variable (Pandas Series, Pandas Dataframe)\n
    model_type : 'linear' or 'logistic'\n
    elimination_criteria : 'aic', 'bic', 'r2', 'adjr2' or None\n
        'aic' refers Akaike information criterion\n
        'bic' refers Bayesian information criterion\n
        'r2' refers R-squared (Only works on linear model type)\n
        'adjr2' refers Adjusted R-squared (Only works on linear model type)\n
    varchar_process : 'drop', 'dummy' or 'dummy_dropfirst'\n
        'drop' drops varchar features\n
        'dummy' creates dummies for all levels of all varchars\n
        'dummy_dropfirst' creates dummies for all levels of all varchars, and drops first levels\n
    sl : Significance Level (default: 0.05)\n

    Returns
    -------
    columns(list), iteration_logs(str)\n\n
    Not Returns a Model

    See Also
    --------
    https://en.wikipedia.org/wiki/Stepwise_regression
    """
    X = __varcharProcessing__(X, varchar_process=varchar_process)
    return __forwardSelectionRaw__(X, y, model_type=model_type, elimination_criteria=elimination_criteria, sl=sl)


def backwardSelection(X, y, model_type="linear", elimination_criteria="aic", varchar_process="dummy_dropfirst", sl=0.05):
    """
    Backward Selection is a function, based on regression models, that returns significant features and selection iterations.\n
    Required Libraries: pandas, numpy, statsmodels

    Parameters
    ----------
    X : Independent variables (Pandas Dataframe)\n
    y : Dependent variable (Pandas Series, Pandas Dataframe)\n
    model_type : 'linear' or 'logistic'\n
    elimination_criteria : 'aic', 'bic', 'r2', 'adjr2' or None\n
        'aic' refers Akaike information criterion\n
        'bic' refers Bayesian information criterion\n
        'r2' refers R-squared (Only works on linear model type)\n
        'adjr2' refers Adjusted R-squared (Only works on linear model type)\n
    varchar_process : 'drop', 'dummy' or 'dummy_dropfirst'\n
        'drop' drops varchar features\n
        'dummy' creates dummies for all levels of all varchars\n
        'dummy_dropfirst' creates dummies for all levels of all varchars, and drops first levels\n
    sl : Significance Level (default: 0.05)\n

    Returns
    -------
    columns(list), iteration_logs(str)\n\n
    Not Returns a Model

    See Also
    --------
    https://en.wikipedia.org/wiki/Stepwise_regression
    """
    X = __varcharProcessing__(X, varchar_process=varchar_process)
    return __backwardSelectionRaw__(X, y, model_type=model_type, elimination_criteria=elimination_criteria, sl=sl)


def __varcharProcessing__(X, varchar_process="dummy_dropfirst"):
    """
    Process character (object-dtype) columns of X and prepend an intercept.

    Parameters
    ----------
    X : pandas DataFrame of independent variables.
    varchar_process : 'drop', 'dummy' or 'dummy_dropfirst'.
        Any unrecognized value falls back to 'dummy_dropfirst' (same as the
        original behavior).

    Returns
    -------
    DataFrame whose first column is 'intercept' (constant 1) with character
    columns dropped or dummy-encoded.
    """
    dtypes = X.dtypes
    # BUG FIX: np.object was removed in NumPy 1.24; compare against the builtin.
    varchar_cols = dtypes[dtypes == object].index.tolist()

    if varchar_process == "drop":
        X = X.drop(columns=varchar_cols)
        print("Character Variables (Dropped):", varchar_cols)
    elif varchar_process == "dummy":
        X = pd.get_dummies(X, drop_first=False)
        print("Character Variables (Dummies Generated):", varchar_cols)
    else:
        # 'dummy_dropfirst' and any unknown value share the same behavior.
        X = pd.get_dummies(X, drop_first=True)
        print("Character Variables (Dummies Generated, First Dummies Dropped):", varchar_cols)

    # Add the intercept and move it to the front so it is always included.
    X["intercept"] = 1
    cols = X.columns.tolist()
    X = X[cols[-1:] + cols[:-1]]

    return X


def __regressor__(y, X, model_type):
    """Fit an OLS (linear) or Logit (logistic) model; unknown types fall back to linear."""
    if model_type == "linear":
        return sm.OLS(y, X).fit()
    elif model_type == "logistic":
        return sm.Logit(y, X).fit()
    else:
        print("\nWrong Model Type : " + model_type + "\nLinear model type is selected.")
        return sm.OLS(y, X).fit()


def __criteriaValue__(model, elimination_criteria, model_type):
    """Return the fitted model's value for the chosen criterion, or None when it does not apply."""
    if elimination_criteria == "aic":
        return model.aic
    if elimination_criteria == "bic":
        return model.bic
    if elimination_criteria == "r2" and model_type == "linear":
        return model.rsquared
    if elimination_criteria == "adjr2" and model_type == "linear":
        return model.rsquared_adj
    return None


def __forwardSelectionRaw__(X, y, model_type="linear", elimination_criteria="aic", sl=0.05):
    """
    Core forward-selection loop.

    Expects X to already contain an 'intercept' column (added by
    __varcharProcessing__). Returns (selected_cols, iterations_log).
    """
    iterations_log = ""
    cols = X.columns.tolist()

    selected_cols = ["intercept"]
    other_cols = cols.copy()
    other_cols.remove("intercept")

    # Start from the intercept-only model.
    model = __regressor__(y, X[selected_cols], model_type)
    criteria = __criteriaValue__(model, elimination_criteria, model_type)

    for i in range(X.shape[1]):
        # P-value of each remaining candidate when added to the current model.
        # BUG FIX: pandas.DataFrame.append was removed in pandas 2.0 -- build
        # the frame in one shot instead of row-by-row appends.
        pvals = pd.DataFrame(
            [[j, __regressor__(y, X[selected_cols + [j]], model_type).pvalues[j]] for j in other_cols],
            columns=["Cols", "Pval"],
        )
        pvals = pvals.sort_values(by=["Pval"]).reset_index(drop=True)
        pvals = pvals[pvals.Pval <= sl]
        if pvals.shape[0] > 0:
            best = pvals["Cols"][0]
            model = __regressor__(y, X[selected_cols + [best]], model_type)
            iterations_log += str("\nEntered : " + best + "\n")
            iterations_log += "\n\n" + str(model.summary()) + "\nAIC: " + str(model.aic) + "\nBIC: " + str(model.bic) + "\n\n"

            # Accept the candidate only if the chosen criterion improves
            # (AIC/BIC must shrink; R2/AdjR2 must grow). With no applicable
            # criterion, every significant candidate is accepted.
            if elimination_criteria == "aic":
                new_criteria = model.aic
                if new_criteria < criteria:
                    print("Entered :", best, "\tAIC :", model.aic)
                    selected_cols.append(best)
                    other_cols.remove(best)
                    criteria = new_criteria
                else:
                    print("break : Criteria")
                    break
            elif elimination_criteria == "bic":
                new_criteria = model.bic
                if new_criteria < criteria:
                    print("Entered :", best, "\tBIC :", model.bic)
                    selected_cols.append(best)
                    other_cols.remove(best)
                    criteria = new_criteria
                else:
                    print("break : Criteria")
                    break
            elif elimination_criteria == "r2" and model_type == "linear":
                new_criteria = model.rsquared
                if new_criteria > criteria:
                    print("Entered :", best, "\tR2 :", model.rsquared)
                    selected_cols.append(best)
                    other_cols.remove(best)
                    criteria = new_criteria
                else:
                    print("break : Criteria")
                    break
            elif elimination_criteria == "adjr2" and model_type == "linear":
                new_criteria = model.rsquared_adj
                if new_criteria > criteria:
                    print("Entered :", best, "\tAdjR2 :", model.rsquared_adj)
                    selected_cols.append(best)
                    other_cols.remove(best)
                    criteria = new_criteria
                else:
                    print("Break : Criteria")
                    break
            else:
                print("Entered :", best)
                selected_cols.append(best)
                other_cols.remove(best)
        else:
            # No remaining candidate is significant at level sl.
            print("Break : Significance Level")
            break

    model = __regressor__(y, X[selected_cols], model_type)
    print(model.summary())
    print("AIC: " + str(model.aic))
    print("BIC: " + str(model.bic))
    print("Final Variables:", selected_cols)

    return selected_cols, iterations_log


def __backwardSelectionRaw__(X, y, model_type="linear", elimination_criteria="aic", sl=0.05):
    """
    Core backward-elimination loop.

    Mutates X in place (eliminated columns are deleted). Returns
    (cols, iterations_log); on a "Regained" stop, cols still includes the
    last-eliminated variable, matching the original behavior.
    """
    iterations_log = ""
    last_eliminated = ""
    cols = X.columns.tolist()

    # Direction of improvement: information criteria shrink, R-squared grows.
    higher_is_better = elimination_criteria in ("r2", "adjr2")

    model = None
    for i in range(X.shape[1]):
        new_model = __regressor__(y, X, model_type)
        if i != 0:
            criteria = __criteriaValue__(model, elimination_criteria, model_type)
            if criteria is not None:
                new_criteria = __criteriaValue__(new_model, elimination_criteria, model_type)
                worsened = new_criteria < criteria if higher_is_better else new_criteria > criteria
                if worsened:
                    # The last elimination hurt the criterion: report it and
                    # stop. `cols` was captured before that deletion, so the
                    # regained variable stays in the returned column list.
                    print("Regained : ", last_eliminated)
                    iterations_log += "\n" + str(new_model.summary()) + "\nAIC: " + str(new_model.aic) + "\nBIC: " + str(new_model.bic) + "\n"
                    iterations_log += str("\n\nRegained : " + last_eliminated + "\n\n")
                    break
        # BUG FIX: the original only advanced `model` when no criterion was
        # selected; with 'aic'/'bic'/'r2'/'adjr2' every iteration after the
        # first compared against -- and searched p-values of -- a stale model,
        # so elimination stalled after the first removed variable.
        model = new_model
        iterations_log += "\n" + str(model.summary()) + "\nAIC: " + str(model.aic) + "\nBIC: " + str(model.bic) + "\n"

        maxPval = max(model.pvalues)
        cols = X.columns.tolist()
        if maxPval > sl:
            for j in cols:
                if model.pvalues[j] == maxPval:
                    print("Eliminated :", j)
                    iterations_log += str("\n\nEliminated : " + j + "\n\n")
                    del X[j]
                    last_eliminated = j
                    # BUG FIX: eliminate one variable per pass; without this
                    # break, p-value ties removed several columns at once.
                    break
        else:
            # All remaining variables are significant at level sl.
            break

    print(str(model.summary()) + "\nAIC: " + str(model.aic) + "\nBIC: " + str(model.bic))
    print("Final Variables:", cols)
    iterations_log += "\n" + str(model.summary()) + "\nAIC: " + str(model.aic) + "\nBIC: " + str(model.bic) + "\n"
    return cols, iterations_log