# ├── test.py
# ├── README.md
# └── stepwiseSelection.py
#
# /test.py:
# --------------------------------------------------------------------------------

# -*- coding: utf-8 -*-
"""
Created on Tue Apr 22 08:07:22 2019

@author: Sinan Talha Hascelik
"""

# Import libraries
import stepwiseSelection as ss
import pandas as pd

# Read data
df = pd.read_csv("test_data.csv")

# Dependent and independent variables
X = df.drop(columns="Exited")
y = df.Exited

# Run automated backward selection (returns the surviving columns and a
# human-readable log of every elimination iteration).
final_vars, iterations_logs = ss.backwardSelection(X, y, model_type="logistic")

# Write logs to .txt.
# BUG FIX: the original opened the file with open()/close(); a context manager
# guarantees the handle is closed even if write() raises.
with open("Iterations_logs.txt", "w+") as iterations_file:
    iterations_file.write(iterations_logs)

# --------------------------------------------------------------------------------
# /README.md:
# --------------------------------------------------------------------------------
#
# # Automated Stepwise Backward and Forward Selection
#
# This script performs automated stepwise backward and forward feature selection.
# You can easily apply it to DataFrames.
# The functions return not only the final features but also the elimination
# iterations, so you can track exactly what happened at each iteration.
#
# You can apply it to both linear and logistic problems. Eliminations can be
# applied with the Akaike information criterion (AIC), the Bayesian information
# criterion (BIC), R-squared (linear only), or adjusted R-squared (linear only).
# Also, you don't have to worry about varchar variables; the code will handle
# them for you.
#
# Enjoy the code!
# Required Libraries: pandas, numpy, statsmodels
#
# See more about stepwise regression : https://en.wikipedia.org/wiki/Stepwise_regression
#
# --------------------------------------------------------------------------------
# /stepwiseSelection.py:
# --------------------------------------------------------------------------------

#Copyright 2019 Sinan Talha Hascelik
#
#Licensed under the Apache License, Version 2.0 (the "License");
#you may not use this file except in compliance with the License.
#You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
#Unless required by applicable law or agreed to in writing, software
#distributed under the License is distributed on an "AS IS" BASIS,
#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
#See the License for the specific language governing permissions and
#limitations under the License.

import numpy as np
import pandas as pd
# BUG FIX: OLS and Logit are classes of statsmodels.api. Current statsmodels
# removed the capitalized aliases from statsmodels.formula.api (it only exposes
# the lowercase formula variants), so the original import breaks at call time.
import statsmodels.api as sm


def forwardSelection(X, y, model_type="linear", elimination_criteria="aic", varchar_process="dummy_dropfirst", sl=0.05):
    """
    Forward Selection is a function, based on regression models, that returns significant features and selection iterations.\n
    Required Libraries: pandas, numpy, statsmodels

    Parameters
    ----------
    X : Independent variables (Pandas Dataframe)\n
    y : Dependent variable (Pandas Series, Pandas Dataframe)\n
    model_type : 'linear' or 'logistic'\n
    elimination_criteria : 'aic', 'bic', 'r2', 'adjr2' or None\n
        'aic' refers Akaike information criterion\n
        'bic' refers Bayesian information criterion\n
        'r2' refers R-squared (Only works on linear model type)\n
        'adjr2' refers Adjusted R-squared (Only works on linear model type)\n
    varchar_process : 'drop', 'dummy' or 'dummy_dropfirst'\n
        'drop' drops varchar features\n
        'dummy' creates dummies for all levels of all varchars\n
        'dummy_dropfirst' creates dummies for all levels of all varchars, and drops first levels\n
    sl : Significance Level (default: 0.05)\n

    Returns
    -------
    columns(list), iteration_logs(str)\n\n
    Not Returns a Model

    See Also
    --------
    https://en.wikipedia.org/wiki/Stepwise_regression
    """
    X = __varcharProcessing__(X, varchar_process=varchar_process)
    return __forwardSelectionRaw__(X, y, model_type=model_type, elimination_criteria=elimination_criteria, sl=sl)


def backwardSelection(X, y, model_type="linear", elimination_criteria="aic", varchar_process="dummy_dropfirst", sl=0.05):
    """
    Backward Selection is a function, based on regression models, that returns significant features and selection iterations.\n
    Required Libraries: pandas, numpy, statsmodels

    Parameters
    ----------
    X : Independent variables (Pandas Dataframe)\n
    y : Dependent variable (Pandas Series, Pandas Dataframe)\n
    model_type : 'linear' or 'logistic'\n
    elimination_criteria : 'aic', 'bic', 'r2', 'adjr2' or None\n
        'aic' refers Akaike information criterion\n
        'bic' refers Bayesian information criterion\n
        'r2' refers R-squared (Only works on linear model type)\n
        'adjr2' refers Adjusted R-squared (Only works on linear model type)\n
    varchar_process : 'drop', 'dummy' or 'dummy_dropfirst'\n
        'drop' drops varchar features\n
        'dummy' creates dummies for all levels of all varchars\n
        'dummy_dropfirst' creates dummies for all levels of all varchars, and drops first levels\n
    sl : Significance Level (default: 0.05)\n

    Returns
    -------
    columns(list), iteration_logs(str)\n\n
    Not Returns a Model

    See Also
    --------
    https://en.wikipedia.org/wiki/Stepwise_regression
    """
    X = __varcharProcessing__(X, varchar_process=varchar_process)
    return __backwardSelectionRaw__(X, y, model_type=model_type, elimination_criteria=elimination_criteria, sl=sl)


def __varcharProcessing__(X, varchar_process="dummy_dropfirst"):
    """
    Process character (object-dtype) columns of X and prepend an intercept.

    Parameters
    ----------
    X : pandas DataFrame of independent variables.
    varchar_process : 'drop', 'dummy' or 'dummy_dropfirst'.
        Any unrecognized value falls back to 'dummy_dropfirst' (same as the
        original behavior).

    Returns
    -------
    DataFrame whose first column is 'intercept' (constant 1) with character
    columns dropped or dummy-encoded.
    """
    dtypes = X.dtypes
    # BUG FIX: np.object was removed in NumPy 1.24; compare against the builtin.
    varchar_cols = dtypes[dtypes == object].index.tolist()

    if varchar_process == "drop":
        X = X.drop(columns=varchar_cols)
        print("Character Variables (Dropped):", varchar_cols)
    elif varchar_process == "dummy":
        X = pd.get_dummies(X, drop_first=False)
        print("Character Variables (Dummies Generated):", varchar_cols)
    else:
        # 'dummy_dropfirst' and any unknown value share the same behavior.
        X = pd.get_dummies(X, drop_first=True)
        print("Character Variables (Dummies Generated, First Dummies Dropped):", varchar_cols)

    # Add the intercept and move it to the front so it is always included.
    X["intercept"] = 1
    cols = X.columns.tolist()
    X = X[cols[-1:] + cols[:-1]]

    return X


def __regressor__(y, X, model_type):
    """Fit an OLS (linear) or Logit (logistic) model; unknown types fall back to linear."""
    if model_type == "linear":
        return sm.OLS(y, X).fit()
    elif model_type == "logistic":
        return sm.Logit(y, X).fit()
    else:
        print("\nWrong Model Type : " + model_type + "\nLinear model type is selected.")
        return sm.OLS(y, X).fit()


def __criteriaValue__(model, elimination_criteria, model_type):
    """Return the fitted model's value for the chosen criterion, or None when it does not apply."""
    if elimination_criteria == "aic":
        return model.aic
    if elimination_criteria == "bic":
        return model.bic
    if elimination_criteria == "r2" and model_type == "linear":
        return model.rsquared
    if elimination_criteria == "adjr2" and model_type == "linear":
        return model.rsquared_adj
    return None


def __forwardSelectionRaw__(X, y, model_type="linear", elimination_criteria="aic", sl=0.05):
    """
    Core forward-selection loop.

    Expects X to already contain an 'intercept' column (added by
    __varcharProcessing__). Returns (selected_cols, iterations_log).
    """
    iterations_log = ""
    cols = X.columns.tolist()

    selected_cols = ["intercept"]
    other_cols = cols.copy()
    other_cols.remove("intercept")

    # Start from the intercept-only model.
    model = __regressor__(y, X[selected_cols], model_type)
    criteria = __criteriaValue__(model, elimination_criteria, model_type)

    for i in range(X.shape[1]):
        # P-value of each remaining candidate when added to the current model.
        # BUG FIX: pandas.DataFrame.append was removed in pandas 2.0 -- build
        # the frame in one shot instead of row-by-row appends.
        pvals = pd.DataFrame(
            [[j, __regressor__(y, X[selected_cols + [j]], model_type).pvalues[j]] for j in other_cols],
            columns=["Cols", "Pval"],
        )
        pvals = pvals.sort_values(by=["Pval"]).reset_index(drop=True)
        pvals = pvals[pvals.Pval <= sl]
        if pvals.shape[0] > 0:
            best = pvals["Cols"][0]
            model = __regressor__(y, X[selected_cols + [best]], model_type)
            iterations_log += str("\nEntered : " + best + "\n")
            iterations_log += "\n\n" + str(model.summary()) + "\nAIC: " + str(model.aic) + "\nBIC: " + str(model.bic) + "\n\n"

            # Accept the candidate only if the chosen criterion improves
            # (AIC/BIC must shrink; R2/AdjR2 must grow). With no applicable
            # criterion, every significant candidate is accepted.
            if elimination_criteria == "aic":
                new_criteria = model.aic
                if new_criteria < criteria:
                    print("Entered :", best, "\tAIC :", model.aic)
                    selected_cols.append(best)
                    other_cols.remove(best)
                    criteria = new_criteria
                else:
                    print("break : Criteria")
                    break
            elif elimination_criteria == "bic":
                new_criteria = model.bic
                if new_criteria < criteria:
                    print("Entered :", best, "\tBIC :", model.bic)
                    selected_cols.append(best)
                    other_cols.remove(best)
                    criteria = new_criteria
                else:
                    print("break : Criteria")
                    break
            elif elimination_criteria == "r2" and model_type == "linear":
                new_criteria = model.rsquared
                if new_criteria > criteria:
                    print("Entered :", best, "\tR2 :", model.rsquared)
                    selected_cols.append(best)
                    other_cols.remove(best)
                    criteria = new_criteria
                else:
                    print("break : Criteria")
                    break
            elif elimination_criteria == "adjr2" and model_type == "linear":
                new_criteria = model.rsquared_adj
                if new_criteria > criteria:
                    print("Entered :", best, "\tAdjR2 :", model.rsquared_adj)
                    selected_cols.append(best)
                    other_cols.remove(best)
                    criteria = new_criteria
                else:
                    print("Break : Criteria")
                    break
            else:
                print("Entered :", best)
                selected_cols.append(best)
                other_cols.remove(best)
        else:
            # No remaining candidate is significant at level sl.
            print("Break : Significance Level")
            break

    model = __regressor__(y, X[selected_cols], model_type)
    print(model.summary())
    print("AIC: " + str(model.aic))
    print("BIC: " + str(model.bic))
    print("Final Variables:", selected_cols)

    return selected_cols, iterations_log


def __backwardSelectionRaw__(X, y, model_type="linear", elimination_criteria="aic", sl=0.05):
    """
    Core backward-elimination loop.

    Mutates X in place (eliminated columns are deleted). Returns
    (cols, iterations_log); on a "Regained" stop, cols still includes the
    last-eliminated variable, matching the original behavior.
    """
    iterations_log = ""
    last_eliminated = ""
    cols = X.columns.tolist()

    # Direction of improvement: information criteria shrink, R-squared grows.
    higher_is_better = elimination_criteria in ("r2", "adjr2")

    model = None
    for i in range(X.shape[1]):
        new_model = __regressor__(y, X, model_type)
        if i != 0:
            criteria = __criteriaValue__(model, elimination_criteria, model_type)
            if criteria is not None:
                new_criteria = __criteriaValue__(new_model, elimination_criteria, model_type)
                worsened = new_criteria < criteria if higher_is_better else new_criteria > criteria
                if worsened:
                    # The last elimination hurt the criterion: report it and
                    # stop. `cols` was captured before that deletion, so the
                    # regained variable stays in the returned column list.
                    print("Regained : ", last_eliminated)
                    iterations_log += "\n" + str(new_model.summary()) + "\nAIC: " + str(new_model.aic) + "\nBIC: " + str(new_model.bic) + "\n"
                    iterations_log += str("\n\nRegained : " + last_eliminated + "\n\n")
                    break
        # BUG FIX: the original only advanced `model` when no criterion was
        # selected; with 'aic'/'bic'/'r2'/'adjr2' every iteration after the
        # first compared against -- and searched p-values of -- a stale model,
        # so elimination stalled after the first removed variable.
        model = new_model
        iterations_log += "\n" + str(model.summary()) + "\nAIC: " + str(model.aic) + "\nBIC: " + str(model.bic) + "\n"

        maxPval = max(model.pvalues)
        cols = X.columns.tolist()
        if maxPval > sl:
            for j in cols:
                if model.pvalues[j] == maxPval:
                    print("Eliminated :", j)
                    iterations_log += str("\n\nEliminated : " + j + "\n\n")
                    del X[j]
                    last_eliminated = j
                    # BUG FIX: eliminate one variable per pass; without this
                    # break, p-value ties removed several columns at once.
                    break
        else:
            # All remaining variables are significant at level sl.
            break

    print(str(model.summary()) + "\nAIC: " + str(model.aic) + "\nBIC: " + str(model.bic))
    print("Final Variables:", cols)
    iterations_log += "\n" + str(model.summary()) + "\nAIC: " + str(model.aic) + "\nBIC: " + str(model.bic) + "\n"
    return cols, iterations_log