├── README.md ├── magicloop.py ├── mlfunctions.py ├── simpleloop.py ├── temporal_validate.py └── todo.txt /README.md: -------------------------------------------------------------------------------- 1 | # magicloops 2 | This is an example for loop that takes a number of machine learning classifiers implemented in sklearn, goes over a range of hyper-parameters for each classifier, and stores a set of evaluation metrics in a dataframe (or csv). 3 | 4 | There are three grid sizes: 5 | * test: to test if things are working 6 | * small: if you've got less than an hour 7 | * large: if you've got time or cores 8 | 9 | You can add more classifiers to it, add more hyperparameters, metrics, adapt it for regression, or clustering. You can also add another level of for loops to loop over different features, to see the effect of leaving a feature out or just using that feature.en 10 | 11 | There are three files here that may be of interest: 12 | 1. mlfunctions.py contains several helpfer functions you might want to use for doing machine learning 13 | 2. simpleloops.py simple for loop over models 14 | 3. magicloop.py uses mlfunctions and buids a simple machine learning pipelie that loops over 15 | * various models 16 | * hyperparameters for each model 17 | * different outcomes 18 | * different dates for validation 19 | * different predictor/feature subsets 20 | 21 | and then stores results with several evaluation metrics including 22 | * precision at k% 23 | * recall at k% 24 | * area under ROC curve 25 | * baselines 26 | 27 | -------------------------------------------------------------------------------- /magicloop.py: -------------------------------------------------------------------------------- 1 | 2 | # Import Statements 3 | import pandas as pd 4 | import numpy as np 5 | import os 6 | import matplotlib 7 | import matplotlib.pyplot as plt 8 | import pylab as pl 9 | from datetime import timedelta 10 | from datetime import datetime 11 | from __future__ import division 12 | import random 13 | from scipy import optimize 14 | import time 15 | import seaborn as sns 16 | import csv 17 | 18 | from mlfunctions import * 19 | 20 | 21 | def main(): 22 | 23 | print 'Number of arguments:', len(sys.argv), 'arguments.' 24 | print 'Argument List:', str(sys.argv) 25 | 26 | # parse input parameters 27 | 28 | # csv data file to be used as input 29 | infile = sys.argv[1] 30 | 31 | # the filename we want to write results to 32 | outfile = sys.argv[2] 33 | 34 | # which model(s) to run 35 | model = sys.argv[3] 36 | 37 | # which parameter grid do we want to use (test, small, large) 38 | grid_size = sys.argv[4] 39 | 40 | #read the csv data 41 | data = pd.read_csv(infile) 42 | 43 | # which variable to use for prediction_time 44 | prediction_time = 'dis_date' 45 | 46 | # outcome variables we want to loop over 47 | outcomes = ['30_day_readmits', '60_day_readmits','180_day_readmits'] 48 | 49 | # validation dates we want to loop over 50 | validation_dates = ['2012-04-01', '2012-10-01', '2013-04-01'] 51 | 52 | # define feature groups 53 | demographic_predictors = ['age', 'gender', 'race'] 54 | admission_predictors = ['num_visits_so_far','avg_los_so_far','min_los_so_far','max_los_so_far','std_los_so_far'] 55 | sensor_predictors = ['reading1', 'reading2', 'reading3'] 56 | survey_predictors=['response1', 'response2', 'response3'] 57 | 58 | 59 | # models_to_run=['RF','DT','KNN', 'ET', 'AB', 'GB', 'LR', 'NB'] 60 | if (model == 'all'): 61 | models_to_run=['RF','LR','DT','ET','AB'] 62 | else: 63 | models_to_run = [] 64 | models_to_run.append(model) 65 | 66 | clfs, grid = define_clfs_params(grid_size) 67 | 68 | # which feature/predictor sets do we want to use in our analysis 69 | predictor_sets = [demographic_predictors, admission_predictors,sensor_predictors,survey_predictors] 70 | 71 | # generate all possible subsets of the feature/predictor groups 72 | predictor_subsets = get_subsets(predictor_sets) 73 | 74 | all_predictors=[] 75 | for p in predictor_subsets: 76 | merged = list(itertools.chain.from_iterable(p)) 77 | all_predictors.append(merged) 78 | 79 | # write header for the csv 80 | with open(outfile, "w") as myfile: 81 | myfile.write("model_type ,clf, parameters, outcome, validation_date, group,train_set_size, validation_set_size,predictors,baseline,precision_at_5,precision_at_10,precision_at_20,precision_at_30,precision_at_40,precision_at_50,recall_at_5,recall_at_10,recall_at_20,recall_at_30,recall_at_40, ecall_at_50,auc-roc") 82 | 83 | # define dataframe to write results to 84 | results_df = pd.DataFrame(columns=('model_type','clf', 'parameters', 'outcome', 'validation_date', 'group', 85 | 'train_set_size', 'validation_set_size','predictors', 86 | 'baseline','precision_at_5','precision_at_10','precision_at_20','precision_at_30','precision_at_40', 87 | 'precision_at_50','recall_at_5','recall_at_10','recall_at_20','recall_at_30','recall_at_40', 88 | 'recall_at_50','auc-roc')) 89 | 90 | # the magic loop starts here 91 | # we will loop over models, parameters, outcomes, validation_Dates 92 | # and store several evaluation metrics 93 | 94 | for index,clf in enumerate([clfs[x] for x in models_to_run]): 95 | parameter_values = grid[models_to_run[index]] 96 | for p in ParameterGrid(parameter_values): 97 | for current_outcome in outcomes: 98 | for predictor in all_predictors: 99 | for validation_date in validation_dates: 100 | try: 101 | print models_to_run[index] 102 | clf.set_params(**p) 103 | if (outcome == '30_day_readmits'): 104 | delta = 30 105 | elif (outcome == '60_day_readmits'): 106 | delta = 60 107 | elif (outcome == '180_day_readmits'): 108 | delta = 180 109 | else: 110 | raise ValueError('value of outcome is unknown') 111 | 112 | train_set = data[data[prediction_time] <= datetime.strptime(validation_date, '%Y-%m-%d') - timedelta(days=delta)] 113 | # fill in missing values for train set using just the train set 114 | # we'll do it a very naive way here but you should think more carefully about this first 115 | train_set.fillna(train_set.mean(), inplace=True) 116 | train_set.dropna(axis=1, how='any', inplace=True) 117 | 118 | validation_set = data[data[prediction_time] > datetime.strptime(validation_date, '%Y-%m-%d') - timedelta(days=0)] 119 | # fill in missing values for validation set using all the data 120 | # we'll do it a very naive way here but you should think more carefully about this first 121 | validation_set.fillna(data.mean(), inplace=True) 122 | validation_set.dropna(axis=1, how='any', inplace=True) 123 | 124 | print predictor 125 | # get predictors by removing those dropped by dropna 126 | predictors_to_use = list(set(predictor).intersection(train_set.columns)) 127 | 128 | model = clf.fit(train_set[predictor], train_set[current_outcome]) 129 | pred_probs = clf.predict_proba(validation_set[predictor])[::,1] 130 | print len(train_set) 131 | print len(validation_set) 132 | #pred_probs_sorted, true_outcome_sorted = zip(*sorted(zip(pred_probs, validation_set[current_outcome]), reverse=True)) 133 | results_df.loc[len(results_df)] = [models_to_run[index],clf, p, current_outcome, validation_date, group, 134 | len(train_set),len(validation_set), 135 | predictor, 136 | precision_at_k(validation_set[current_outcome],pred_probs, 100), 137 | precision_at_k(validation_set[current_outcome],pred_probs, 5), 138 | precision_at_k(validation_set[current_outcome],pred_probs, 10), 139 | precision_at_k(validation_set[current_outcome],pred_probs, 20), 140 | precision_at_k(validation_set[current_outcome],pred_probs, 30), 141 | precision_at_k(validation_set[current_outcome],pred_probs, 40), 142 | precision_at_k(validation_set[current_outcome],pred_probs, 50), 143 | recall_at_k(validation_set[current_outcome],pred_probs, 5), 144 | recall_at_k(validation_set[current_outcome],pred_probs, 10), 145 | recall_at_k(validation_set[current_outcome],pred_probs, 20), 146 | recall_at_k(validation_set[current_outcome],pred_probs, 30), 147 | recall_at_k(validation_set[current_outcome],pred_probs, 40), 148 | recall_at_k(validation_set[current_outcome],pred_probs, 50), 149 | roc_auc_score(validation_set[current_outcome], pred_probs)] 150 | 151 | # plot precision recall graph 152 | # we'll show them here but you can also save them to disk 153 | plot_precision_recall_n(validation_set[current_outcome], pred_probs, clf, 'show') 154 | # write results to csv as they come in so we always have something to see even if models runs for days 155 | with open(outfile, "a") as myfile: 156 | csvwriter = csv.writer(myfile, dialect='excel', quoting=csv.QUOTE_ALL) 157 | strp = str(p) 158 | strp.replace('\n', '') 159 | strclf = str(clf) 160 | strclf.replace('\n', '') 161 | csvwriter.writerow([models_to_run[index],strclf, strp, current_outcome, validation_date, group,len(train_set),len(validation_set), predictor, precision_at_k(validation_set[current_outcome],pred_probs, 100), precision_at_k(validation_set[current_outcome],pred_probs, 5), precision_at_k(validation_set[current_outcome],pred_probs, 10), precision_at_k(validation_set[current_outcome],pred_probs, 20), precision_at_k(validation_set[current_outcome],pred_probs, 30), precision_at_k(validation_set[current_outcome],pred_probs, 40), precision_at_k(validation_set[current_outcome],pred_probs, 50), recall_at_k(validation_set[current_outcome],pred_probs, 5), recall_at_k(validation_set[current_outcome],pred_probs, 10), recall_at_k(validation_set[current_outcome],pred_probs, 20), recall_at_k(validation_set[current_outcome],pred_probs, 30), recall_at_k(validation_set[current_outcome],pred_probs, 40), recall_at_k(validation_set[current_outcome],pred_probs, 50),roc_auc_score(validation_set[current_outcome], pred_probs)]) 162 | except IndexError, e: 163 | print 'Error:',e 164 | continue 165 | 166 | # write final dataframe to csv 167 | dfoutfile = 'df_' + outfile 168 | results_df.to_csv(dfoutfile, index=False) 169 | 170 | 171 | if __name__ == '__main__': 172 | main() 173 | 174 | -------------------------------------------------------------------------------- /mlfunctions.py: -------------------------------------------------------------------------------- 1 | """mlfunctions.py 2 | This contains helper functions for doing machine learning. 3 | You can import this in your code 4 | """ 5 | 6 | 7 | 8 | 9 | # Import Statements 10 | from __future__ import division 11 | import pandas as pd 12 | import numpy as np 13 | import os 14 | import matplotlib.pyplot as plt 15 | import pylab as pl 16 | from datetime import timedelta 17 | import random 18 | from scipy import optimize 19 | import time 20 | import seaborn as sns 21 | from mlfunctions import * 22 | from sklearn.metrics import roc_curve, auc, classification_report, confusion_matrix 23 | from sklearn import preprocessing, cross_validation, svm, metrics, tree, decomposition, svm 24 | from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, GradientBoostingClassifier, AdaBoostClassifier 25 | from sklearn.linear_model import LogisticRegression, Perceptron, SGDClassifier, OrthogonalMatchingPursuit, RandomizedLogisticRegression 26 | from sklearn.neighbors.nearest_centroid import NearestCentroid 27 | from sklearn.naive_bayes import GaussianNB, MultinomialNB, BernoulliNB 28 | from sklearn.tree import DecisionTreeClassifier 29 | from sklearn.neighbors import KNeighborsClassifier 30 | from sklearn.cross_validation import train_test_split 31 | from sklearn.grid_search import ParameterGrid 32 | from sklearn.metrics import * 33 | from sklearn.preprocessing import StandardScaler 34 | import itertools 35 | 36 | 37 | # modeling helper functions 38 | 39 | def define_clfs_params(grid_size): 40 | 41 | """ 42 | This functions defines parameter grid for all the classifiers 43 | 44 | Args: 45 | grid_size: how big of a grid do you want. it can be test, small, or large 46 | 47 | Returns: 48 | a set of model and parameters 49 | 50 | Raises: 51 | KeyError: Raises an exception. 52 | """ 53 | 54 | clfs = {'RF': RandomForestClassifier(n_estimators=50, n_jobs=-1), 55 | 'ET': ExtraTreesClassifier(n_estimators=10, n_jobs=-1, criterion='entropy'), 56 | 'AB': AdaBoostClassifier(DecisionTreeClassifier(max_depth=1), algorithm="SAMME", n_estimators=200), 57 | 'LR': LogisticRegression(penalty='l1', C=1e5), 58 | 'SVM': svm.SVC(kernel='linear', probability=True, random_state=0), 59 | 'GB': GradientBoostingClassifier(learning_rate=0.05, subsample=0.5, max_depth=6, n_estimators=10), 60 | 'NB': GaussianNB(), 61 | 'DT': DecisionTreeClassifier(), 62 | 'SGD': SGDClassifier(loss="hinge", penalty="l2"), 63 | 'KNN': KNeighborsClassifier(n_neighbors=3) 64 | } 65 | 66 | large_grid = { 67 | 'RF':{'n_estimators': [1,10,100,1000,10000], 'max_depth': [1,5,10,20,50,100], 'max_features': ['sqrt','log2'],'min_samples_split': [2,5,10], 'n_jobs': [-1]}, 68 | 'LR': { 'penalty': ['l1','l2'], 'C': [0.00001,0.0001,0.001,0.01,0.1,1,10]}, 69 | 'SGD': { 'loss': ['hinge','log','perceptron'], 'penalty': ['l2','l1','elasticnet']}, 70 | 'ET': { 'n_estimators': [1,10,100,1000,10000], 'criterion' : ['gini', 'entropy'] ,'max_depth': [1,5,10,20,50,100], 'max_features': ['sqrt','log2'],'min_samples_split': [2,5,10], 'n_jobs': [-1]}, 71 | 'AB': { 'algorithm': ['SAMME', 'SAMME.R'], 'n_estimators': [1,10,100,1000,10000]}, 72 | 'GB': {'n_estimators': [1,10,100,1000,10000], 'learning_rate' : [0.001,0.01,0.05,0.1,0.5],'subsample' : [0.1,0.5,1.0], 'max_depth': [1,3,5,10,20,50,100]}, 73 | 'NB' : {}, 74 | 'DT': {'criterion': ['gini', 'entropy'], 'max_depth': [1,5,10,20,50,100], 'max_features': [None],'min_samples_split': [2,5,10]}, 75 | 'SVM' :{'C' :[0.00001,0.0001,0.001,0.01,0.1,1,10],'kernel':['linear']}, 76 | 'KNN' :{'n_neighbors': [1,5,10,25,50,100],'weights': ['uniform','distance'],'algorithm': ['auto','ball_tree','kd_tree']} 77 | } 78 | 79 | small_grid = { 80 | 'RF':{'n_estimators': [100, 10000], 'max_depth': [5,50], 'max_features': ['sqrt','log2'],'min_samples_split': [2,10], 'n_jobs':[-1]}, 81 | 'LR': { 'penalty': ['l1','l2'], 'C': [0.00001,0.001,0.1,1,10]}, 82 | 'SGD': { 'loss': ['hinge','log','perceptron'], 'penalty': ['l2','l1','elasticnet']}, 83 | 'ET': { 'n_estimators': [100, 10000], 'criterion' : ['gini', 'entropy'] ,'max_depth': [5,50], 'max_features': ['sqrt','log2'],'min_samples_split': [2,10], 'n_jobs':[-1]}, 84 | 'AB': { 'algorithm': ['SAMME', 'SAMME.R'], 'n_estimators': [1,10,100,1000,10000]}, 85 | 'GB': {'n_estimators': [100, 10000], 'learning_rate' : [0.001,0.1,0.5],'subsample' : [0.1,0.5,1.0], 'max_depth': [5,50]}, 86 | 'NB' : {}, 87 | 'DT': {'criterion': ['gini', 'entropy'], 'max_depth': [1,5,10,20,50,100], 'max_features': [None],'min_samples_split': [2,5,10]}, 88 | 'SVM' :{'C' :[0.00001,0.0001,0.001,0.01,0.1,1,10],'kernel':['linear']}, 89 | 'KNN' :{'n_neighbors': [1,5,10,25,50,100],'weights': ['uniform','distance'],'algorithm': ['auto','ball_tree','kd_tree']} 90 | } 91 | 92 | test_grid = { 93 | 'RF':{'n_estimators': [1], 'max_depth': [1], 'max_features': ['sqrt'],'min_samples_split': [10], 'n_jobs': [-1]}, 94 | 'LR': { 'penalty': ['l1'], 'C': [0.01]}, 95 | 'SGD': { 'loss': ['perceptron'], 'penalty': ['l2']}, 96 | 'ET': { 'n_estimators': [1], 'criterion' : ['gini'] ,'max_depth': [1], 'max_features': ['sqrt'],'min_samples_split': [10], 'n_jobs': [-1]}, 97 | 'AB': { 'algorithm': ['SAMME'], 'n_estimators': [1]}, 98 | 'GB': {'n_estimators': [1], 'learning_rate' : [0.1],'subsample' : [0.5], 'max_depth': [1]}, 99 | 'NB' : {}, 100 | 'DT': {'criterion': ['gini'], 'max_depth': [1], 'max_features': [None],'min_samples_split': [10]}, 101 | 'SVM' :{'C' :[0.01],'kernel':['linear']}, 102 | 'KNN' :{'n_neighbors': [5],'weights': ['uniform'],'algorithm': ['auto']} 103 | } 104 | 105 | if (grid_size == 'large'): 106 | return clfs, large_grid 107 | elif (grid_size == 'small'): 108 | return clfs, small_grid 109 | elif (grid_size == 'test'): 110 | return clfs, test_grid 111 | else: 112 | return 0, 0 113 | 114 | 115 | # Evaluation functions 116 | # calculate precision, recall and auc metrics 117 | 118 | def plot_roc(name, probs, true, output_type): 119 | fpr, tpr, thresholds = roc_curve(true, probs) 120 | roc_auc = auc(fpr, tpr) 121 | pl.clf() 122 | pl.plot(fpr, tpr, label='ROC curve (area = %0.2f)' % roc_auc) 123 | pl.plot([0, 1], [0, 1], 'k--') 124 | pl.xlim([0.0, 1.05]) 125 | pl.ylim([0.0, 1.05]) 126 | pl.xlabel('False Positive Rate') 127 | pl.ylabel('True Positive Rate') 128 | pl.title(name) 129 | pl.legend(loc="lower right") 130 | if (output_type == 'save'): 131 | plt.savefig(name) 132 | elif (output_type == 'show'): 133 | plt.show() 134 | else: 135 | plt.show() 136 | 137 | def generate_binary_at_k(y_scores, k): 138 | cutoff_index = int(len(y_scores) * (k / 100.0)) 139 | predictions_binary = [1 if x < cutoff_index else 0 for x in range(len(y_scores))] 140 | return predictions_binary 141 | 142 | def precision_at_k(y_true, y_scores, k): 143 | #y_scores_sorted, y_true_sorted = zip(*sorted(zip(y_scores, y_true), reverse=True)) 144 | y_scores_sorted, y_true_sorted = joint_sort_descending(np.array(y_scores), np.array(y_true)) 145 | preds_at_k = generate_binary_at_k(y_scores_sorted, k) 146 | #precision, _, _, _ = metrics.precision_recall_fscore_support(y_true, preds_at_k) 147 | #precision = precision[1] # only interested in precision for label 1 148 | precision = precision_score(y_true_sorted, preds_at_k) 149 | return precision 150 | 151 | def recall_at_k(y_true, y_scores, k): 152 | #y_scores_sorted, y_true_sorted = zip(*sorted(zip(y_scores, y_true), reverse=True)) 153 | y_scores_sorted, y_true_sorted = joint_sort_descending(np.array(y_scores), np.array(y_true)) 154 | preds_at_k = generate_binary_at_k(y_scores_sorted, k) 155 | #precision, _, _, _ = metrics.precision_recall_fscore_support(y_true, preds_at_k) 156 | #precision = precision[1] # only interested in precision for label 1 157 | recall = recall_score(y_true_sorted, preds_at_k) 158 | return recall 159 | 160 | def plot_precision_recall_n(y_true, y_prob, model_name, output_type): 161 | from sklearn.metrics import precision_recall_curve 162 | y_score = y_prob 163 | precision_curve, recall_curve, pr_thresholds = precision_recall_curve(y_true, y_score) 164 | precision_curve = precision_curve[:-1] 165 | recall_curve = recall_curve[:-1] 166 | pct_above_per_thresh = [] 167 | number_scored = len(y_score) 168 | for value in pr_thresholds: 169 | num_above_thresh = len(y_score[y_score>=value]) 170 | pct_above_thresh = num_above_thresh / float(number_scored) 171 | pct_above_per_thresh.append(pct_above_thresh) 172 | pct_above_per_thresh = np.array(pct_above_per_thresh) 173 | 174 | plt.clf() 175 | fig, ax1 = plt.subplots() 176 | ax1.plot(pct_above_per_thresh, precision_curve, 'b') 177 | ax1.set_xlabel('percent of population') 178 | ax1.set_ylabel('precision', color='b') 179 | ax2 = ax1.twinx() 180 | ax2.plot(pct_above_per_thresh, recall_curve, 'r') 181 | ax2.set_ylabel('recall', color='r') 182 | ax1.set_ylim([0,1]) 183 | ax1.set_ylim([0,1]) 184 | ax2.set_xlim([0,1]) 185 | 186 | name = model_name 187 | plt.title(name) 188 | if (output_type == 'save'): 189 | plt.savefig(name) 190 | elif (output_type == 'show'): 191 | plt.show() 192 | else: 193 | plt.show() 194 | 195 | 196 | # Other helper functions 197 | 198 | def get_subsets(l): 199 | subsets = [] 200 | for i in range(1, len(l) + 1): 201 | for combo in itertools.combinations(l, i): 202 | subsets.append(list(combo)) 203 | return subsets 204 | 205 | def joint_sort_descending(l1, l2): 206 | # l1 and l2 have to be numpy arrays 207 | idx = np.argsort(l1)[::-1] 208 | return l1[idx], l2[idx] 209 | -------------------------------------------------------------------------------- /simpleloop.py: -------------------------------------------------------------------------------- 1 | from __future__ import division 2 | import pandas as pd 3 | import numpy as np 4 | from sklearn import preprocessing, cross_validation, svm, metrics, tree, decomposition, svm 5 | from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, GradientBoostingClassifier, AdaBoostClassifier 6 | from sklearn.linear_model import LogisticRegression, Perceptron, SGDClassifier, OrthogonalMatchingPursuit, RandomizedLogisticRegression 7 | from sklearn.neighbors.nearest_centroid import NearestCentroid 8 | from sklearn.naive_bayes import GaussianNB, MultinomialNB, BernoulliNB 9 | from sklearn.tree import DecisionTreeClassifier 10 | from sklearn.neighbors import KNeighborsClassifier 11 | from sklearn.cross_validation import train_test_split 12 | from sklearn.grid_search import ParameterGrid 13 | from sklearn.metrics import * 14 | from sklearn.preprocessing import StandardScaler 15 | import random 16 | import matplotlib.pyplot as plt 17 | from scipy import optimize 18 | import time 19 | import seaborn as sns 20 | 21 | # for jupyter notebooks 22 | #%matplotlib inline 23 | 24 | # if you're running this in a jupyter notebook, print out the graphs 25 | NOTEBOOK = 0 26 | 27 | def define_clfs_params(grid_size): 28 | """Define defaults for different classifiers. 29 | Define three types of grids: 30 | Test: for testing your code 31 | Small: small grid 32 | Large: Larger grid that has a lot more parameter sweeps 33 | """ 34 | 35 | clfs = {'RF': RandomForestClassifier(n_estimators=50, n_jobs=-1), 36 | 'ET': ExtraTreesClassifier(n_estimators=10, n_jobs=-1, criterion='entropy'), 37 | 'AB': AdaBoostClassifier(DecisionTreeClassifier(max_depth=1), algorithm="SAMME", n_estimators=200), 38 | 'LR': LogisticRegression(penalty='l1', C=1e5), 39 | 'SVM': svm.SVC(kernel='linear', probability=True, random_state=0), 40 | 'GB': GradientBoostingClassifier(learning_rate=0.05, subsample=0.5, max_depth=6, n_estimators=10), 41 | 'NB': GaussianNB(), 42 | 'DT': DecisionTreeClassifier(), 43 | 'SGD': SGDClassifier(loss="hinge", penalty="l2"), 44 | 'KNN': KNeighborsClassifier(n_neighbors=3) 45 | } 46 | 47 | large_grid = { 48 | 'RF':{'n_estimators': [1,10,100,1000,10000], 'max_depth': [1,5,10,20,50,100], 'max_features': ['sqrt','log2'],'min_samples_split': [2,5,10], 'n_jobs': [-1]}, 49 | 'LR': { 'penalty': ['l1','l2'], 'C': [0.00001,0.0001,0.001,0.01,0.1,1,10]}, 50 | 'SGD': { 'loss': ['hinge','log','perceptron'], 'penalty': ['l2','l1','elasticnet']}, 51 | 'ET': { 'n_estimators': [1,10,100,1000,10000], 'criterion' : ['gini', 'entropy'] ,'max_depth': [1,5,10,20,50,100], 'max_features': ['sqrt','log2'],'min_samples_split': [2,5,10], 'n_jobs': [-1]}, 52 | 'AB': { 'algorithm': ['SAMME', 'SAMME.R'], 'n_estimators': [1,10,100,1000,10000]}, 53 | 'GB': {'n_estimators': [1,10,100,1000,10000], 'learning_rate' : [0.001,0.01,0.05,0.1,0.5],'subsample' : [0.1,0.5,1.0], 'max_depth': [1,3,5,10,20,50,100]}, 54 | 'NB' : {}, 55 | 'DT': {'criterion': ['gini', 'entropy'], 'max_depth': [1,5,10,20,50,100],'min_samples_split': [2,5,10]}, 56 | 'SVM' :{'C' :[0.00001,0.0001,0.001,0.01,0.1,1,10],'kernel':['linear']}, 57 | 'KNN' :{'n_neighbors': [1,5,10,25,50,100],'weights': ['uniform','distance'],'algorithm': ['auto','ball_tree','kd_tree']} 58 | } 59 | 60 | small_grid = { 61 | 'RF':{'n_estimators': [10,100], 'max_depth': [5,50], 'max_features': ['sqrt','log2'],'min_samples_split': [2,10], 'n_jobs': [-1]}, 62 | 'LR': { 'penalty': ['l1','l2'], 'C': [0.00001,0.001,0.1,1,10]}, 63 | 'SGD': { 'loss': ['hinge','log','perceptron'], 'penalty': ['l2','l1','elasticnet']}, 64 | 'ET': { 'n_estimators': [10,100], 'criterion' : ['gini', 'entropy'] ,'max_depth': [5,50], 'max_features': ['sqrt','log2'],'min_samples_split': [2,10], 'n_jobs': [-1]}, 65 | 'AB': { 'algorithm': ['SAMME', 'SAMME.R'], 'n_estimators': [1,10,100,1000,10000]}, 66 | 'GB': {'n_estimators': [10,100], 'learning_rate' : [0.001,0.1,0.5],'subsample' : [0.1,0.5,1.0], 'max_depth': [5,50]}, 67 | 'NB' : {}, 68 | 'DT': {'criterion': ['gini', 'entropy'], 'max_depth': [1,5,10,20,50,100],'min_samples_split': [2,5,10]}, 69 | 'SVM' :{'C' :[0.00001,0.0001,0.001,0.01,0.1,1,10],'kernel':['linear']}, 70 | 'KNN' :{'n_neighbors': [1,5,10,25,50,100],'weights': ['uniform','distance'],'algorithm': ['auto','ball_tree','kd_tree']} 71 | } 72 | 73 | test_grid = { 74 | 'RF':{'n_estimators': [1], 'max_depth': [1], 'max_features': ['sqrt'],'min_samples_split': [10]}, 75 | 'LR': { 'penalty': ['l1'], 'C': [0.01]}, 76 | 'SGD': { 'loss': ['perceptron'], 'penalty': ['l2']}, 77 | 'ET': { 'n_estimators': [1], 'criterion' : ['gini'] ,'max_depth': [1], 'max_features': ['sqrt'],'min_samples_split': [10]}, 78 | 'AB': { 'algorithm': ['SAMME'], 'n_estimators': [1]}, 79 | 'GB': {'n_estimators': [1], 'learning_rate' : [0.1],'subsample' : [0.5], 'max_depth': [1]}, 80 | 'NB' : {}, 81 | 'DT': {'criterion': ['gini'], 'max_depth': [1],'min_samples_split': [10]}, 82 | 'SVM' :{'C' :[0.01],'kernel':['linear']}, 83 | 'KNN' :{'n_neighbors': [5],'weights': ['uniform'],'algorithm': ['auto']} 84 | } 85 | 86 | if (grid_size == 'large'): 87 | return clfs, large_grid 88 | elif (grid_size == 'small'): 89 | return clfs, small_grid 90 | elif (grid_size == 'test'): 91 | return clfs, test_grid 92 | else: 93 | return 0, 0 94 | 95 | # a set of helper function to do machine learning evalaution 96 | 97 | def joint_sort_descending(l1, l2): 98 | # l1 and l2 have to be numpy arrays 99 | idx = np.argsort(l1)[::-1] 100 | return l1[idx], l2[idx] 101 | 102 | def generate_binary_at_k(y_scores, k): 103 | cutoff_index = int(len(y_scores) * (k / 100.0)) 104 | test_predictions_binary = [1 if x < cutoff_index else 0 for x in range(len(y_scores))] 105 | return test_predictions_binary 106 | 107 | def precision_at_k(y_true, y_scores, k): 108 | y_scores, y_true = joint_sort_descending(np.array(y_scores), np.array(y_true)) 109 | preds_at_k = generate_binary_at_k(y_scores, k) 110 | #precision, _, _, _ = metrics.precision_recall_fscore_support(y_true, preds_at_k) 111 | #precision = precision[1] # only interested in precision for label 1 112 | precision = precision_score(y_true, preds_at_k) 113 | return precision 114 | 115 | def plot_precision_recall_n(y_true, y_prob, model_name): 116 | from sklearn.metrics import precision_recall_curve 117 | y_score = y_prob 118 | precision_curve, recall_curve, pr_thresholds = precision_recall_curve(y_true, y_score) 119 | precision_curve = precision_curve[:-1] 120 | recall_curve = recall_curve[:-1] 121 | pct_above_per_thresh = [] 122 | number_scored = len(y_score) 123 | for value in pr_thresholds: 124 | num_above_thresh = len(y_score[y_score>=value]) 125 | pct_above_thresh = num_above_thresh / float(number_scored) 126 | pct_above_per_thresh.append(pct_above_thresh) 127 | pct_above_per_thresh = np.array(pct_above_per_thresh) 128 | 129 | plt.clf() 130 | fig, ax1 = plt.subplots() 131 | ax1.plot(pct_above_per_thresh, precision_curve, 'b') 132 | ax1.set_xlabel('percent of population') 133 | ax1.set_ylabel('precision', color='b') 134 | ax2 = ax1.twinx() 135 | ax2.plot(pct_above_per_thresh, recall_curve, 'r') 136 | ax2.set_ylabel('recall', color='r') 137 | ax1.set_ylim([0,1]) 138 | ax1.set_ylim([0,1]) 139 | ax2.set_xlim([0,1]) 140 | 141 | name = model_name 142 | plt.title(name) 143 | #plt.savefig(name) 144 | plt.show() 145 | 146 | 147 | 148 | def clf_loop(models_to_run, clfs, grid, X, y): 149 | """Runs the loop using models_to_run, clfs, gridm and the data 150 | """ 151 | results_df = pd.DataFrame(columns=('model_type','clf', 'parameters', 'auc-roc','p_at_5', 'p_at_10', 'p_at_20')) 152 | for n in range(1, 2): 153 | # create training and valdation sets 154 | X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=0) 155 | for index,clf in enumerate([clfs[x] for x in models_to_run]): 156 | print(models_to_run[index]) 157 | parameter_values = grid[models_to_run[index]] 158 | for p in ParameterGrid(parameter_values): 159 | try: 160 | clf.set_params(**p) 161 | y_pred_probs = clf.fit(X_train, y_train).predict_proba(X_test)[:,1] 162 | # you can also store the model, feature importances, and prediction scores 163 | # we're only storing the metrics for now 164 | y_pred_probs_sorted, y_test_sorted = zip(*sorted(zip(y_pred_probs, y_test), reverse=True)) 165 | results_df.loc[len(results_df)] = [models_to_run[index],clf, p, 166 | roc_auc_score(y_test, y_pred_probs), 167 | precision_at_k(y_test_sorted,y_pred_probs_sorted,5.0), 168 | precision_at_k(y_test_sorted,y_pred_probs_sorted,10.0), 169 | precision_at_k(y_test_sorted,y_pred_probs_sorted,20.0)] 170 | if NOTEBOOK == 1: 171 | plot_precision_recall_n(y_test,y_pred_probs,clf) 172 | except IndexError as e: 173 | print('Error:',e) 174 | continue 175 | return results_df 176 | 177 | 178 | 179 | def main(): 180 | 181 | # define grid to use: test, small, large 182 | grid_size = 'test' 183 | clfs, grid = define_clfs_params(grid_size) 184 | 185 | # define models to run 186 | models_to_run=['RF','DT','KNN', 'ET', 'AB', 'GB', 'LR', 'NB'] 187 | 188 | # load data from csv 189 | df = pd.read_csv("/Users/rayid/Projects/uchicago/Teaching/MLPP-2017/Homeworks/Assignment 2/credit-data.csv") 190 | 191 | # select features to use 192 | features = ['RevolvingUtilizationOfUnsecuredLines', 'DebtRatio', 'age', 'NumberOfTimes90DaysLate'] 193 | X = df[features] 194 | 195 | # define label 196 | y = df.SeriousDlqin2yrs 197 | 198 | # call clf_loop and store results in results_df 199 | results_df = clf_loop(models_to_run, clfs,grid, X,y) 200 | if NOTEBOOK == 1: 201 | results_df 202 | 203 | # save to csv 204 | results_df.to_csv('results.csv', index=False) 205 | 206 | 207 | if __name__ == '__main__': 208 | main() 209 | -------------------------------------------------------------------------------- /temporal_validate.py: -------------------------------------------------------------------------------- 1 | # sample config file to run temporal validation 2 | 3 | # start time of our data 4 | start_time = '2009-01-01' 5 | 6 | #last date of data including labels and outcomes that we have 7 | end_time = '2016-01-01' 8 | 9 | #how far out do we want to predict (let's say in months for now) 10 | prediction_windows = [6, 12] 11 | 12 | #how often is this prediction being made? every day? every month? once a year? 13 | update_window = 12 14 | 15 | from datetime import date, datetime, timedelta 16 | from dateutil.relativedelta import relativedelta 17 | 18 | start_time_date = datetime.strptime(start_time, '%Y-%m-%d') 19 | end_time_date = datetime.strptime(end_time, '%Y-%m-%d') 20 | 21 | for prediction_window in prediction_windows: 22 | test_end_time = end_time_date 23 | while (test_end_time >= start_time_date + 2 * relativedelta(months=+prediction_window)): 24 | test_start_time = test_end_time - relativedelta(months=+prediction_window) 25 | train_end_time = test_start_time - relativedelta(days=+1) # minus 1 day 26 | train_start_time = train_end_time - relativedelta(months=+prediction_window) 27 | while (train_start_time >= start_time_date ): 28 | print train_start_time,train_end_time,test_start_time,test_end_time, prediction_window 29 | train_start_time -= relativedelta(months=+prediction_window) 30 | # call function to get data 31 | train_set, test_set = extract_train_test_sets (train_start_time, train_end_time, test_start_time, test_end_time) 32 | # fit on train data 33 | # predict on test data 34 | test_end_time -= relativedelta(months=+update_window) 35 | -------------------------------------------------------------------------------- /todo.txt: -------------------------------------------------------------------------------- 1 | if 'feature_importances_' in dir(model): 2 | feature_importance = model.feature_importances_ 3 | else: 4 | feature_importance = None 5 | # Here we get the predicted results from the model 6 | # SVC does not have 'predict_proba', so we need to use 'decision_function' 7 | 8 | if hasattr(clf, 'predict_proba'): 9 | yscores = model.predict_proba(X_test_sel)[:,1] 10 | else: 11 | yscores = model.decision_function(X_test_sel) 12 | --------------------------------------------------------------------------------