├── README.md
├── magicloop.py
├── mlfunctions.py
├── simpleloop.py
├── temporal_validate.py
└── todo.txt


/README.md:
--------------------------------------------------------------------------------
 1 | # magicloops
 2 | This is an example for loop that takes a number of machine learning classifiers implemented in sklearn, goes over a range of hyper-parameters for each classifier, and stores a set of evaluation metrics in a dataframe (or csv). 
 3 | 
 4 | There are three grid sizes:
 5 | * test: to test if things are working
 6 | * small: if you've got less than an hour
 7 | * large: if you've got time or cores
 8 | 
 9 | You can add more classifiers to it, add more hyperparameters, metrics, adapt it for regression, or clustering. You can also add another level of for loops to loop over different features, to see the effect of leaving a feature out or just using that feature.en
10 | 
11 | There are three files here that may be of interest:
12 | 1. mlfunctions.py contains several helpfer functions you might want to use for doing machine learning
13 | 2. simpleloops.py simple for loop over models
14 | 3. magicloop.py uses mlfunctions and buids a simple machine learning pipelie that loops over
15 |   * various models
16 |   * hyperparameters for each model
17 |   * different outcomes
18 |   * different dates for validation
19 |   * different predictor/feature subsets
20 |  
21 | and then stores results with several evaluation metrics including
22 |   * precision at k%
23 |   * recall at k%
24 |   * area under ROC curve
25 |   * baselines
26 |   
27 | 


--------------------------------------------------------------------------------
/magicloop.py:
--------------------------------------------------------------------------------
  1 | 
  2 | # Import Statements
  3 | import pandas as pd
  4 | import numpy as np
  5 | import os
  6 | import matplotlib
  7 | import matplotlib.pyplot as plt
  8 | import pylab as pl
  9 | from datetime import timedelta
 10 | from datetime import datetime
 11 | from __future__ import division
 12 | import random
 13 | from scipy import optimize
 14 | import time
 15 | import seaborn as sns
 16 | import csv
 17 | 
 18 | from mlfunctions import *
 19 | 
 20 | 
 21 | def main():
 22 | 
 23 |     print 'Number of arguments:', len(sys.argv), 'arguments.'
 24 |     print 'Argument List:', str(sys.argv)
 25 | 
 26 |     # parse input parameters
 27 | 
 28 |     # csv data file to be used as input
 29 |     infile = sys.argv[1]
 30 | 
 31 |     # the filename we want to write results to
 32 |     outfile = sys.argv[2]
 33 | 
 34 |     # which model(s) to run
 35 |     model = sys.argv[3]
 36 | 
 37 |     # which parameter grid do we want to use (test, small, large)
 38 |     grid_size = sys.argv[4]
 39 | 
 40 |     #read the csv data
 41 |     data = pd.read_csv(infile)
 42 | 
 43 |     # which variable to use for prediction_time
 44 |     prediction_time = 'dis_date'
 45 | 
 46 |     # outcome variables we want to loop over
 47 |     outcomes = ['30_day_readmits', '60_day_readmits','180_day_readmits']
 48 |     
 49 |     # validation dates we want to loop over
 50 |     validation_dates = ['2012-04-01', '2012-10-01', '2013-04-01']
 51 | 
 52 |     # define feature groups
 53 |     demographic_predictors = ['age', 'gender', 'race']
 54 |     admission_predictors = ['num_visits_so_far','avg_los_so_far','min_los_so_far','max_los_so_far','std_los_so_far']
 55 |     sensor_predictors = ['reading1', 'reading2', 'reading3']
 56 |     survey_predictors=['response1', 'response2', 'response3']
 57 | 
 58 |    
 59 |     # models_to_run=['RF','DT','KNN', 'ET', 'AB', 'GB', 'LR', 'NB']
 60 |     if (model == 'all'):
 61 |         models_to_run=['RF','LR','DT','ET','AB']
 62 |     else:
 63 |         models_to_run = []
 64 |         models_to_run.append(model)
 65 | 
 66 |     clfs, grid = define_clfs_params(grid_size)
 67 | 
 68 |     # which feature/predictor sets do we want to use in our analysis
 69 |     predictor_sets = [demographic_predictors, admission_predictors,sensor_predictors,survey_predictors]
 70 |     
 71 |     # generate all possible subsets of the feature/predictor groups
 72 |     predictor_subsets = get_subsets(predictor_sets)
 73 | 
 74 |     all_predictors=[]
 75 |     for p in predictor_subsets:
 76 |         merged = list(itertools.chain.from_iterable(p))
 77 |         all_predictors.append(merged)
 78 | 
 79 |     # write header for the csv
 80 |     with open(outfile, "w") as myfile:
 81 |         myfile.write("model_type ,clf, parameters, outcome, validation_date, group,train_set_size, validation_set_size,predictors,baseline,precision_at_5,precision_at_10,precision_at_20,precision_at_30,precision_at_40,precision_at_50,recall_at_5,recall_at_10,recall_at_20,recall_at_30,recall_at_40, ecall_at_50,auc-roc")
 82 | 
 83 |     # define dataframe to write results to
 84 |     results_df =  pd.DataFrame(columns=('model_type','clf', 'parameters', 'outcome', 'validation_date', 'group',
 85 |                                         'train_set_size', 'validation_set_size','predictors',
 86 |                                         'baseline','precision_at_5','precision_at_10','precision_at_20','precision_at_30','precision_at_40',
 87 |                                         'precision_at_50','recall_at_5','recall_at_10','recall_at_20','recall_at_30','recall_at_40',
 88 |                                         'recall_at_50','auc-roc'))
 89 | 
 90 |     # the magic loop starts here
 91 |     # we will loop over models, parameters, outcomes, validation_Dates
 92 |     # and store several evaluation metrics
 93 | 
 94 |     for index,clf in enumerate([clfs[x] for x in models_to_run]):
 95 |         parameter_values = grid[models_to_run[index]]
 96 |         for p in ParameterGrid(parameter_values):
 97 |             for current_outcome in outcomes:
 98 |                 for predictor in all_predictors:
 99 |                     for validation_date in validation_dates:
100 |                         try:
101 |                             print models_to_run[index]
102 |                             clf.set_params(**p)
103 |                             if (outcome == '30_day_readmits'):
104 |                                 delta = 30
105 |                             elif (outcome == '60_day_readmits'):
106 |                                 delta = 60
107 |                             elif (outcome == '180_day_readmits'):
108 |                                 delta = 180
109 |                             else:
110 |                                 raise ValueError('value of outcome is unknown')                 
111 |                         
112 |                             train_set = data[data[prediction_time] <= datetime.strptime(validation_date, '%Y-%m-%d') - timedelta(days=delta)]
113 |                             # fill in missing values for train set using just the train set
114 |                             # we'll do it a very naive way here but you should think more carefully about this first
115 |                             train_set.fillna(train_set.mean(), inplace=True)
116 |                             train_set.dropna(axis=1, how='any', inplace=True)
117 |                             
118 |                             validation_set = data[data[prediction_time] > datetime.strptime(validation_date, '%Y-%m-%d') - timedelta(days=0)]
119 |                             # fill in missing values for validation set using all the data
120 |                             # we'll do it a very naive way here but you should think more carefully about this first
121 |                             validation_set.fillna(data.mean(), inplace=True)
122 |                             validation_set.dropna(axis=1, how='any', inplace=True)
123 | 
124 |                             print predictor
125 |                             # get predictors by removing those dropped by dropna
126 |                             predictors_to_use = list(set(predictor).intersection(train_set.columns))
127 | 
128 |                             model = clf.fit(train_set[predictor], train_set[current_outcome]) 
129 |                             pred_probs = clf.predict_proba(validation_set[predictor])[::,1]
130 |                             print len(train_set)
131 |                             print len(validation_set)
132 |                             #pred_probs_sorted, true_outcome_sorted = zip(*sorted(zip(pred_probs, validation_set[current_outcome]), reverse=True))
133 |                             results_df.loc[len(results_df)] = [models_to_run[index],clf, p, current_outcome, validation_date, group,
134 |                                                                len(train_set),len(validation_set), 
135 |                                                                predictor, 
136 |                                                                 precision_at_k(validation_set[current_outcome],pred_probs, 100),
137 |                                                                 precision_at_k(validation_set[current_outcome],pred_probs, 5),
138 |                                                                 precision_at_k(validation_set[current_outcome],pred_probs, 10),
139 |                                                                 precision_at_k(validation_set[current_outcome],pred_probs, 20),
140 |                                                                 precision_at_k(validation_set[current_outcome],pred_probs, 30),
141 |                                                                 precision_at_k(validation_set[current_outcome],pred_probs, 40),
142 |                                                                 precision_at_k(validation_set[current_outcome],pred_probs, 50),
143 |                                                                 recall_at_k(validation_set[current_outcome],pred_probs, 5),
144 |                                                                 recall_at_k(validation_set[current_outcome],pred_probs, 10),
145 |                                                                 recall_at_k(validation_set[current_outcome],pred_probs, 20),
146 |                                                                 recall_at_k(validation_set[current_outcome],pred_probs, 30),
147 |                                                                 recall_at_k(validation_set[current_outcome],pred_probs, 40),
148 |                                                                 recall_at_k(validation_set[current_outcome],pred_probs, 50),
149 |                                                                 roc_auc_score(validation_set[current_outcome], pred_probs)]
150 | 
151 |                             # plot precision recall graph
152 |                             # we'll show them here but you can also save them to disk
153 |                             plot_precision_recall_n(validation_set[current_outcome], pred_probs, clf, 'show')
154 |                             # write results to csv as they come in so we always have something to see even if models runs for days
155 |                             with open(outfile, "a") as myfile:
156 |                                 csvwriter = csv.writer(myfile, dialect='excel', quoting=csv.QUOTE_ALL)
157 |                                 strp = str(p)
158 |                                 strp.replace('\n', '')
159 |                                 strclf = str(clf)
160 |                                 strclf.replace('\n', '')
161 |                                 csvwriter.writerow([models_to_run[index],strclf, strp, current_outcome, validation_date, group,len(train_set),len(validation_set), predictor,  precision_at_k(validation_set[current_outcome],pred_probs, 100), precision_at_k(validation_set[current_outcome],pred_probs, 5), precision_at_k(validation_set[current_outcome],pred_probs, 10), precision_at_k(validation_set[current_outcome],pred_probs, 20), precision_at_k(validation_set[current_outcome],pred_probs, 30), precision_at_k(validation_set[current_outcome],pred_probs, 40), precision_at_k(validation_set[current_outcome],pred_probs, 50), recall_at_k(validation_set[current_outcome],pred_probs, 5), recall_at_k(validation_set[current_outcome],pred_probs, 10), recall_at_k(validation_set[current_outcome],pred_probs, 20), recall_at_k(validation_set[current_outcome],pred_probs, 30), recall_at_k(validation_set[current_outcome],pred_probs, 40), recall_at_k(validation_set[current_outcome],pred_probs, 50),roc_auc_score(validation_set[current_outcome], pred_probs)])
162 |                         except IndexError, e:
163 |                             print 'Error:',e
164 |                             continue
165 |     
166 |     # write final dataframe to csv
167 |     dfoutfile = 'df_' + outfile
168 |     results_df.to_csv(dfoutfile, index=False)
169 | 
170 | 
171 | if __name__ == '__main__':
172 |     main()
173 | 
174 | 


--------------------------------------------------------------------------------
/mlfunctions.py:
--------------------------------------------------------------------------------
  1 | """mlfunctions.py
  2 | This contains helper functions for doing machine learning.
  3 | You can import this in your code
  4 | """
  5 | 
  6 | 
  7 | 
  8 | 
  9 | # Import Statements
 10 | from __future__ import division
 11 | import pandas as pd
 12 | import numpy as np
 13 | import os
 14 | import matplotlib.pyplot as plt
 15 | import pylab as pl
 16 | from datetime import timedelta
 17 | import random
 18 | from scipy import optimize
 19 | import time
 20 | import seaborn as sns
 21 | from mlfunctions import *
 22 | from sklearn.metrics import roc_curve, auc, classification_report, confusion_matrix
 23 | from sklearn import preprocessing, cross_validation, svm, metrics, tree, decomposition, svm
 24 | from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, GradientBoostingClassifier, AdaBoostClassifier
 25 | from sklearn.linear_model import LogisticRegression, Perceptron, SGDClassifier, OrthogonalMatchingPursuit, RandomizedLogisticRegression
 26 | from sklearn.neighbors.nearest_centroid import NearestCentroid
 27 | from sklearn.naive_bayes import GaussianNB, MultinomialNB, BernoulliNB
 28 | from sklearn.tree import DecisionTreeClassifier
 29 | from sklearn.neighbors import KNeighborsClassifier
 30 | from sklearn.cross_validation import train_test_split
 31 | from sklearn.grid_search import ParameterGrid
 32 | from sklearn.metrics import *
 33 | from sklearn.preprocessing import StandardScaler
 34 | import itertools
 35 | 
 36 | 
 37 | # modeling helper functions
 38 | 
 39 | def define_clfs_params(grid_size):
 40 | 
 41 |     """
 42 | This functions defines parameter grid for all the classifiers
 43 | 
 44 | Args:
 45 |     grid_size: how big of a grid do you want. it can be test, small, or large
 46 | 
 47 | Returns:
 48 |     a set of model and parameters
 49 | 
 50 | Raises:
 51 |     KeyError: Raises an exception.
 52 | """
 53 | 
 54 |     clfs = {'RF': RandomForestClassifier(n_estimators=50, n_jobs=-1),
 55 |         'ET': ExtraTreesClassifier(n_estimators=10, n_jobs=-1, criterion='entropy'),
 56 |         'AB': AdaBoostClassifier(DecisionTreeClassifier(max_depth=1), algorithm="SAMME", n_estimators=200),
 57 |         'LR': LogisticRegression(penalty='l1', C=1e5),
 58 |         'SVM': svm.SVC(kernel='linear', probability=True, random_state=0),
 59 |         'GB': GradientBoostingClassifier(learning_rate=0.05, subsample=0.5, max_depth=6, n_estimators=10),
 60 |         'NB': GaussianNB(),
 61 |         'DT': DecisionTreeClassifier(),
 62 |         'SGD': SGDClassifier(loss="hinge", penalty="l2"),
 63 |         'KNN': KNeighborsClassifier(n_neighbors=3) 
 64 |             }
 65 | 
 66 |     large_grid = { 
 67 |     'RF':{'n_estimators': [1,10,100,1000,10000], 'max_depth': [1,5,10,20,50,100], 'max_features': ['sqrt','log2'],'min_samples_split': [2,5,10], 'n_jobs': [-1]},
 68 |     'LR': { 'penalty': ['l1','l2'], 'C': [0.00001,0.0001,0.001,0.01,0.1,1,10]},
 69 |     'SGD': { 'loss': ['hinge','log','perceptron'], 'penalty': ['l2','l1','elasticnet']},
 70 |     'ET': { 'n_estimators': [1,10,100,1000,10000], 'criterion' : ['gini', 'entropy'] ,'max_depth': [1,5,10,20,50,100], 'max_features': ['sqrt','log2'],'min_samples_split': [2,5,10], 'n_jobs': [-1]},
 71 |     'AB': { 'algorithm': ['SAMME', 'SAMME.R'], 'n_estimators': [1,10,100,1000,10000]},
 72 |     'GB': {'n_estimators': [1,10,100,1000,10000], 'learning_rate' : [0.001,0.01,0.05,0.1,0.5],'subsample' : [0.1,0.5,1.0], 'max_depth': [1,3,5,10,20,50,100]},
 73 |     'NB' : {},
 74 |     'DT': {'criterion': ['gini', 'entropy'], 'max_depth': [1,5,10,20,50,100], 'max_features': [None],'min_samples_split': [2,5,10]},
 75 |     'SVM' :{'C' :[0.00001,0.0001,0.001,0.01,0.1,1,10],'kernel':['linear']},
 76 |     'KNN' :{'n_neighbors': [1,5,10,25,50,100],'weights': ['uniform','distance'],'algorithm': ['auto','ball_tree','kd_tree']}
 77 |            }
 78 |     
 79 |     small_grid = { 
 80 |     'RF':{'n_estimators': [100, 10000], 'max_depth': [5,50], 'max_features': ['sqrt','log2'],'min_samples_split': [2,10], 'n_jobs':[-1]},
 81 |     'LR': { 'penalty': ['l1','l2'], 'C': [0.00001,0.001,0.1,1,10]},
 82 |     'SGD': { 'loss': ['hinge','log','perceptron'], 'penalty': ['l2','l1','elasticnet']},
 83 |     'ET': { 'n_estimators': [100, 10000], 'criterion' : ['gini', 'entropy'] ,'max_depth': [5,50], 'max_features': ['sqrt','log2'],'min_samples_split': [2,10], 'n_jobs':[-1]},
 84 |     'AB': { 'algorithm': ['SAMME', 'SAMME.R'], 'n_estimators': [1,10,100,1000,10000]},
 85 |     'GB': {'n_estimators': [100, 10000], 'learning_rate' : [0.001,0.1,0.5],'subsample' : [0.1,0.5,1.0], 'max_depth': [5,50]},
 86 |     'NB' : {},
 87 |     'DT': {'criterion': ['gini', 'entropy'], 'max_depth': [1,5,10,20,50,100], 'max_features': [None],'min_samples_split': [2,5,10]},
 88 |     'SVM' :{'C' :[0.00001,0.0001,0.001,0.01,0.1,1,10],'kernel':['linear']},
 89 |     'KNN' :{'n_neighbors': [1,5,10,25,50,100],'weights': ['uniform','distance'],'algorithm': ['auto','ball_tree','kd_tree']}
 90 |            }
 91 |     
 92 |     test_grid = { 
 93 |     'RF':{'n_estimators': [1], 'max_depth': [1], 'max_features': ['sqrt'],'min_samples_split': [10], 'n_jobs': [-1]},
 94 |     'LR': { 'penalty': ['l1'], 'C': [0.01]},
 95 |     'SGD': { 'loss': ['perceptron'], 'penalty': ['l2']},
 96 |     'ET': { 'n_estimators': [1], 'criterion' : ['gini'] ,'max_depth': [1], 'max_features': ['sqrt'],'min_samples_split': [10], 'n_jobs': [-1]},
 97 |     'AB': { 'algorithm': ['SAMME'], 'n_estimators': [1]},
 98 |     'GB': {'n_estimators': [1], 'learning_rate' : [0.1],'subsample' : [0.5], 'max_depth': [1]},
 99 |     'NB' : {},
100 |     'DT': {'criterion': ['gini'], 'max_depth': [1], 'max_features': [None],'min_samples_split': [10]},
101 |     'SVM' :{'C' :[0.01],'kernel':['linear']},
102 |     'KNN' :{'n_neighbors': [5],'weights': ['uniform'],'algorithm': ['auto']}
103 |            }
104 |     
105 |     if (grid_size == 'large'):
106 |         return clfs, large_grid
107 |     elif (grid_size == 'small'):
108 |         return clfs, small_grid
109 |     elif (grid_size == 'test'):
110 |         return clfs, test_grid
111 |     else:
112 |         return 0, 0
113 | 
114 | 
115 | # Evaluation functions
116 | # calculate precision, recall and auc metrics
117 | 
118 | def plot_roc(name, probs, true, output_type):
119 |     fpr, tpr, thresholds = roc_curve(true, probs)
120 |     roc_auc = auc(fpr, tpr)
121 |     pl.clf()
122 |     pl.plot(fpr, tpr, label='ROC curve (area = %0.2f)' % roc_auc)
123 |     pl.plot([0, 1], [0, 1], 'k--')
124 |     pl.xlim([0.0, 1.05])
125 |     pl.ylim([0.0, 1.05])
126 |     pl.xlabel('False Positive Rate')
127 |     pl.ylabel('True Positive Rate')
128 |     pl.title(name)
129 |     pl.legend(loc="lower right")
130 |     if (output_type == 'save'):
131 |         plt.savefig(name)
132 |     elif (output_type == 'show'):
133 |         plt.show()
134 |     else:
135 |         plt.show()
136 | 
137 | def generate_binary_at_k(y_scores, k):
138 |     cutoff_index = int(len(y_scores) * (k / 100.0))
139 |     predictions_binary = [1 if x < cutoff_index else 0 for x in range(len(y_scores))]
140 |     return predictions_binary
141 | 
142 | def precision_at_k(y_true, y_scores, k):
143 |     #y_scores_sorted, y_true_sorted = zip(*sorted(zip(y_scores, y_true), reverse=True))
144 |     y_scores_sorted, y_true_sorted = joint_sort_descending(np.array(y_scores), np.array(y_true))
145 |     preds_at_k = generate_binary_at_k(y_scores_sorted, k)
146 |     #precision, _, _, _ = metrics.precision_recall_fscore_support(y_true, preds_at_k)
147 |     #precision = precision[1]  # only interested in precision for label 1
148 |     precision = precision_score(y_true_sorted, preds_at_k)
149 |     return precision
150 | 
151 | def recall_at_k(y_true, y_scores, k):
152 |     #y_scores_sorted, y_true_sorted = zip(*sorted(zip(y_scores, y_true), reverse=True))
153 |     y_scores_sorted, y_true_sorted = joint_sort_descending(np.array(y_scores), np.array(y_true))
154 |     preds_at_k = generate_binary_at_k(y_scores_sorted, k)
155 |     #precision, _, _, _ = metrics.precision_recall_fscore_support(y_true, preds_at_k)
156 |     #precision = precision[1]  # only interested in precision for label 1
157 |     recall = recall_score(y_true_sorted, preds_at_k)
158 |     return recall
159 | 
160 | def plot_precision_recall_n(y_true, y_prob, model_name, output_type):
161 |     from sklearn.metrics import precision_recall_curve
162 |     y_score = y_prob
163 |     precision_curve, recall_curve, pr_thresholds = precision_recall_curve(y_true, y_score)
164 |     precision_curve = precision_curve[:-1]
165 |     recall_curve = recall_curve[:-1]
166 |     pct_above_per_thresh = []
167 |     number_scored = len(y_score)
168 |     for value in pr_thresholds:
169 |         num_above_thresh = len(y_score[y_score>=value])
170 |         pct_above_thresh = num_above_thresh / float(number_scored)
171 |         pct_above_per_thresh.append(pct_above_thresh)
172 |     pct_above_per_thresh = np.array(pct_above_per_thresh)
173 |     
174 |     plt.clf()
175 |     fig, ax1 = plt.subplots()
176 |     ax1.plot(pct_above_per_thresh, precision_curve, 'b')
177 |     ax1.set_xlabel('percent of population')
178 |     ax1.set_ylabel('precision', color='b')
179 |     ax2 = ax1.twinx()
180 |     ax2.plot(pct_above_per_thresh, recall_curve, 'r')
181 |     ax2.set_ylabel('recall', color='r')
182 |     ax1.set_ylim([0,1])
183 |     ax1.set_ylim([0,1])
184 |     ax2.set_xlim([0,1])
185 |     
186 |     name = model_name
187 |     plt.title(name)
188 |     if (output_type == 'save'):
189 |         plt.savefig(name)
190 |     elif (output_type == 'show'):
191 |         plt.show()
192 |     else:
193 |         plt.show()
194 | 
195 | 
196 | # Other helper functions
197 | 
198 | def get_subsets(l):
199 |     subsets = []
200 |     for i in range(1, len(l) + 1):
201 |         for combo in itertools.combinations(l, i):
202 |             subsets.append(list(combo))
203 |     return subsets
204 | 
205 | def joint_sort_descending(l1, l2):
206 |     # l1 and l2 have to be numpy arrays
207 |     idx = np.argsort(l1)[::-1]
208 |     return l1[idx], l2[idx]
209 | 


--------------------------------------------------------------------------------
/simpleloop.py:
--------------------------------------------------------------------------------
  1 | from __future__ import division
  2 | import pandas as pd
  3 | import numpy as np
  4 | from sklearn import preprocessing, cross_validation, svm, metrics, tree, decomposition, svm
  5 | from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, GradientBoostingClassifier, AdaBoostClassifier
  6 | from sklearn.linear_model import LogisticRegression, Perceptron, SGDClassifier, OrthogonalMatchingPursuit, RandomizedLogisticRegression
  7 | from sklearn.neighbors.nearest_centroid import NearestCentroid
  8 | from sklearn.naive_bayes import GaussianNB, MultinomialNB, BernoulliNB
  9 | from sklearn.tree import DecisionTreeClassifier
 10 | from sklearn.neighbors import KNeighborsClassifier
 11 | from sklearn.cross_validation import train_test_split
 12 | from sklearn.grid_search import ParameterGrid
 13 | from sklearn.metrics import *
 14 | from sklearn.preprocessing import StandardScaler
 15 | import random
 16 | import matplotlib.pyplot as plt
 17 | from scipy import optimize
 18 | import time
 19 | import seaborn as sns
 20 | 
 21 | # for jupyter notebooks
 22 | #%matplotlib inline
 23 | 
 24 | # if you're running this in a jupyter notebook, print out the graphs
 25 | NOTEBOOK = 0
 26 | 
 27 | def define_clfs_params(grid_size):
 28 |     """Define defaults for different classifiers.
 29 |     Define three types of grids:
 30 |     Test: for testing your code
 31 |     Small: small grid
 32 |     Large: Larger grid that has a lot more parameter sweeps
 33 |     """
 34 | 
 35 |     clfs = {'RF': RandomForestClassifier(n_estimators=50, n_jobs=-1),
 36 |         'ET': ExtraTreesClassifier(n_estimators=10, n_jobs=-1, criterion='entropy'),
 37 |         'AB': AdaBoostClassifier(DecisionTreeClassifier(max_depth=1), algorithm="SAMME", n_estimators=200),
 38 |         'LR': LogisticRegression(penalty='l1', C=1e5),
 39 |         'SVM': svm.SVC(kernel='linear', probability=True, random_state=0),
 40 |         'GB': GradientBoostingClassifier(learning_rate=0.05, subsample=0.5, max_depth=6, n_estimators=10),
 41 |         'NB': GaussianNB(),
 42 |         'DT': DecisionTreeClassifier(),
 43 |         'SGD': SGDClassifier(loss="hinge", penalty="l2"),
 44 |         'KNN': KNeighborsClassifier(n_neighbors=3) 
 45 |             }
 46 | 
 47 |     large_grid = { 
 48 |     'RF':{'n_estimators': [1,10,100,1000,10000], 'max_depth': [1,5,10,20,50,100], 'max_features': ['sqrt','log2'],'min_samples_split': [2,5,10], 'n_jobs': [-1]},
 49 |     'LR': { 'penalty': ['l1','l2'], 'C': [0.00001,0.0001,0.001,0.01,0.1,1,10]},
 50 |     'SGD': { 'loss': ['hinge','log','perceptron'], 'penalty': ['l2','l1','elasticnet']},
 51 |     'ET': { 'n_estimators': [1,10,100,1000,10000], 'criterion' : ['gini', 'entropy'] ,'max_depth': [1,5,10,20,50,100], 'max_features': ['sqrt','log2'],'min_samples_split': [2,5,10], 'n_jobs': [-1]},
 52 |     'AB': { 'algorithm': ['SAMME', 'SAMME.R'], 'n_estimators': [1,10,100,1000,10000]},
 53 |     'GB': {'n_estimators': [1,10,100,1000,10000], 'learning_rate' : [0.001,0.01,0.05,0.1,0.5],'subsample' : [0.1,0.5,1.0], 'max_depth': [1,3,5,10,20,50,100]},
 54 |     'NB' : {},
 55 |     'DT': {'criterion': ['gini', 'entropy'], 'max_depth': [1,5,10,20,50,100],'min_samples_split': [2,5,10]},
 56 |     'SVM' :{'C' :[0.00001,0.0001,0.001,0.01,0.1,1,10],'kernel':['linear']},
 57 |     'KNN' :{'n_neighbors': [1,5,10,25,50,100],'weights': ['uniform','distance'],'algorithm': ['auto','ball_tree','kd_tree']}
 58 |            }
 59 |     
 60 |     small_grid = { 
 61 |     'RF':{'n_estimators': [10,100], 'max_depth': [5,50], 'max_features': ['sqrt','log2'],'min_samples_split': [2,10], 'n_jobs': [-1]},
 62 |     'LR': { 'penalty': ['l1','l2'], 'C': [0.00001,0.001,0.1,1,10]},
 63 |     'SGD': { 'loss': ['hinge','log','perceptron'], 'penalty': ['l2','l1','elasticnet']},
 64 |     'ET': { 'n_estimators': [10,100], 'criterion' : ['gini', 'entropy'] ,'max_depth': [5,50], 'max_features': ['sqrt','log2'],'min_samples_split': [2,10], 'n_jobs': [-1]},
 65 |     'AB': { 'algorithm': ['SAMME', 'SAMME.R'], 'n_estimators': [1,10,100,1000,10000]},
 66 |     'GB': {'n_estimators': [10,100], 'learning_rate' : [0.001,0.1,0.5],'subsample' : [0.1,0.5,1.0], 'max_depth': [5,50]},
 67 |     'NB' : {},
 68 |     'DT': {'criterion': ['gini', 'entropy'], 'max_depth': [1,5,10,20,50,100],'min_samples_split': [2,5,10]},
 69 |     'SVM' :{'C' :[0.00001,0.0001,0.001,0.01,0.1,1,10],'kernel':['linear']},
 70 |     'KNN' :{'n_neighbors': [1,5,10,25,50,100],'weights': ['uniform','distance'],'algorithm': ['auto','ball_tree','kd_tree']}
 71 |            }
 72 |     
 73 |     test_grid = { 
 74 |     'RF':{'n_estimators': [1], 'max_depth': [1], 'max_features': ['sqrt'],'min_samples_split': [10]},
 75 |     'LR': { 'penalty': ['l1'], 'C': [0.01]},
 76 |     'SGD': { 'loss': ['perceptron'], 'penalty': ['l2']},
 77 |     'ET': { 'n_estimators': [1], 'criterion' : ['gini'] ,'max_depth': [1], 'max_features': ['sqrt'],'min_samples_split': [10]},
 78 |     'AB': { 'algorithm': ['SAMME'], 'n_estimators': [1]},
 79 |     'GB': {'n_estimators': [1], 'learning_rate' : [0.1],'subsample' : [0.5], 'max_depth': [1]},
 80 |     'NB' : {},
 81 |     'DT': {'criterion': ['gini'], 'max_depth': [1],'min_samples_split': [10]},
 82 |     'SVM' :{'C' :[0.01],'kernel':['linear']},
 83 |     'KNN' :{'n_neighbors': [5],'weights': ['uniform'],'algorithm': ['auto']}
 84 |            }
 85 |     
 86 |     if (grid_size == 'large'):
 87 |         return clfs, large_grid
 88 |     elif (grid_size == 'small'):
 89 |         return clfs, small_grid
 90 |     elif (grid_size == 'test'):
 91 |         return clfs, test_grid
 92 |     else:
 93 |         return 0, 0
 94 | 
 95 | # a set of helper function to do machine learning evalaution
 96 | 
 97 | def joint_sort_descending(l1, l2):
 98 |     # l1 and l2 have to be numpy arrays
 99 |     idx = np.argsort(l1)[::-1]
100 |     return l1[idx], l2[idx]
101 | 
102 | def generate_binary_at_k(y_scores, k):
103 |     cutoff_index = int(len(y_scores) * (k / 100.0))
104 |     test_predictions_binary = [1 if x < cutoff_index else 0 for x in range(len(y_scores))]
105 |     return test_predictions_binary
106 | 
107 | def precision_at_k(y_true, y_scores, k):
108 |     y_scores, y_true = joint_sort_descending(np.array(y_scores), np.array(y_true))
109 |     preds_at_k = generate_binary_at_k(y_scores, k)
110 |     #precision, _, _, _ = metrics.precision_recall_fscore_support(y_true, preds_at_k)
111 |     #precision = precision[1]  # only interested in precision for label 1
112 |     precision = precision_score(y_true, preds_at_k)
113 |     return precision
114 | 
115 | def plot_precision_recall_n(y_true, y_prob, model_name):
116 |     from sklearn.metrics import precision_recall_curve
117 |     y_score = y_prob
118 |     precision_curve, recall_curve, pr_thresholds = precision_recall_curve(y_true, y_score)
119 |     precision_curve = precision_curve[:-1]
120 |     recall_curve = recall_curve[:-1]
121 |     pct_above_per_thresh = []
122 |     number_scored = len(y_score)
123 |     for value in pr_thresholds:
124 |         num_above_thresh = len(y_score[y_score>=value])
125 |         pct_above_thresh = num_above_thresh / float(number_scored)
126 |         pct_above_per_thresh.append(pct_above_thresh)
127 |     pct_above_per_thresh = np.array(pct_above_per_thresh)
128 |     
129 |     plt.clf()
130 |     fig, ax1 = plt.subplots()
131 |     ax1.plot(pct_above_per_thresh, precision_curve, 'b')
132 |     ax1.set_xlabel('percent of population')
133 |     ax1.set_ylabel('precision', color='b')
134 |     ax2 = ax1.twinx()
135 |     ax2.plot(pct_above_per_thresh, recall_curve, 'r')
136 |     ax2.set_ylabel('recall', color='r')
137 |     ax1.set_ylim([0,1])
138 |     ax1.set_ylim([0,1])
139 |     ax2.set_xlim([0,1])
140 |     
141 |     name = model_name
142 |     plt.title(name)
143 |     #plt.savefig(name)
144 |     plt.show()
145 |     
146 | 
147 | 
148 | def clf_loop(models_to_run, clfs, grid, X, y):
149 |     """Runs the loop using models_to_run, clfs, gridm and the data
150 |     """
151 |     results_df =  pd.DataFrame(columns=('model_type','clf', 'parameters', 'auc-roc','p_at_5', 'p_at_10', 'p_at_20'))
152 |     for n in range(1, 2):
153 |         # create training and valdation sets
154 |         X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=0)
155 |         for index,clf in enumerate([clfs[x] for x in models_to_run]):
156 |             print(models_to_run[index])
157 |             parameter_values = grid[models_to_run[index]]
158 |             for p in ParameterGrid(parameter_values):
159 |                 try:
160 |                     clf.set_params(**p)
161 |                     y_pred_probs = clf.fit(X_train, y_train).predict_proba(X_test)[:,1]
162 |                     # you can also store the model, feature importances, and prediction scores
163 |                     # we're only storing the metrics for now
164 |                     y_pred_probs_sorted, y_test_sorted = zip(*sorted(zip(y_pred_probs, y_test), reverse=True))
165 |                     results_df.loc[len(results_df)] = [models_to_run[index],clf, p,
166 |                                                        roc_auc_score(y_test, y_pred_probs),
167 |                                                        precision_at_k(y_test_sorted,y_pred_probs_sorted,5.0),
168 |                                                        precision_at_k(y_test_sorted,y_pred_probs_sorted,10.0),
169 |                                                        precision_at_k(y_test_sorted,y_pred_probs_sorted,20.0)]
170 |                     if NOTEBOOK == 1:
171 |                         plot_precision_recall_n(y_test,y_pred_probs,clf)
172 |                 except IndexError as e:
173 |                     print('Error:',e)
174 |                     continue
175 |     return results_df
176 | 
177 | 
178 | 
179 | def main():
180 | 
181 |     # define grid to use: test, small, large
182 |     grid_size = 'test'
183 |     clfs, grid = define_clfs_params(grid_size)
184 | 
185 |     # define models to run
186 |     models_to_run=['RF','DT','KNN', 'ET', 'AB', 'GB', 'LR', 'NB']
187 | 
188 |     # load data from csv
189 |     df = pd.read_csv("/Users/rayid/Projects/uchicago/Teaching/MLPP-2017/Homeworks/Assignment 2/credit-data.csv")
190 | 
191 |     # select features to use
192 |     features  =  ['RevolvingUtilizationOfUnsecuredLines', 'DebtRatio', 'age', 'NumberOfTimes90DaysLate']
193 |     X = df[features]
194 |     
195 |     # define label
196 |     y = df.SeriousDlqin2yrs
197 | 
198 |     # call clf_loop and store results in results_df
199 |     results_df = clf_loop(models_to_run, clfs,grid, X,y)
200 |     if NOTEBOOK == 1:
201 |         results_df
202 | 
203 |     # save to csv
204 |     results_df.to_csv('results.csv', index=False)
205 | 
206 | 
207 | if __name__ == '__main__':
208 |     main()
209 | 


--------------------------------------------------------------------------------
/temporal_validate.py:
--------------------------------------------------------------------------------
 1 | # sample config file to run temporal validation
 2 | 
 3 | # start time of our data
 4 | start_time = '2009-01-01'
 5 | 
 6 | #last date of data including labels and outcomes that we have
 7 | end_time = '2016-01-01'
 8 | 
 9 | #how far out do we want to predict (let's say in months for now)
10 | prediction_windows = [6, 12]
11 | 
12 | #how often is this prediction being made? every day? every month? once a year?
13 | update_window = 12
14 | 
15 | from datetime import date, datetime, timedelta
16 | from dateutil.relativedelta import relativedelta
17 | 
18 | start_time_date = datetime.strptime(start_time, '%Y-%m-%d')
19 | end_time_date = datetime.strptime(end_time, '%Y-%m-%d')
20 | 
21 | for prediction_window in prediction_windows:
22 |     test_end_time = end_time_date
23 |     while (test_end_time >= start_time_date + 2 * relativedelta(months=+prediction_window)):
24 |         test_start_time = test_end_time - relativedelta(months=+prediction_window)
25 |         train_end_time = test_start_time  - relativedelta(days=+1) # minus 1 day
26 |         train_start_time = train_end_time - relativedelta(months=+prediction_window)
27 |         while (train_start_time >= start_time_date ):
28 |             print train_start_time,train_end_time,test_start_time,test_end_time, prediction_window
29 |             train_start_time -= relativedelta(months=+prediction_window)
30 |             # call function to get data
31 |             train_set, test_set = extract_train_test_sets (train_start_time, train_end_time, test_start_time, test_end_time)
32 |             # fit on train data
33 |             # predict on test data
34 |         test_end_time -= relativedelta(months=+update_window)
35 | 


--------------------------------------------------------------------------------
/todo.txt:
--------------------------------------------------------------------------------
 1 | if 'feature_importances_' in dir(model):
 2 |                             feature_importance = model.feature_importances_
 3 |                         else:
 4 |                             feature_importance = None
 5 |                         # Here we get the predicted results from the model
 6 |                         # SVC does not have 'predict_proba', so we need to use 'decision_function'
 7 |                         
 8 |                         if hasattr(clf, 'predict_proba'):
 9 |                             yscores = model.predict_proba(X_test_sel)[:,1]
10 |                         else:
11 |                             yscores = model.decision_function(X_test_sel)
12 | 


--------------------------------------------------------------------------------